add index

2025-10-23 13:27:37 -05:00 · 2025-10-23 13:27:37 -05:00 · ca4d7fb8ee
commit ca4d7fb8ee
parent 6d73be14cc
12 changed files with 1698 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,21 @@
 *.html
 *.css
 *.out
 index.pdf
 _minted/
 *.aux
 *.log
 *.4ct
 *.4tc
 *.dvi
 *.idv
 *.lg
 *.tmp
 *.toc
 *.xref
 *~
 verbatim/*.tex
 verbatim/*.svg
 verbatim/*.pdf
 verbatim/output
 images/*.data
--- a/build.sh
+++ b/build.sh
@ -0,0 +1,28 @@
 # Build the HTML edition of index.tex with make4ht, then patch the generated
 # index.css / index.html and post-process the SVG figures.
 #
 # Shell options: -e stop at the first failing command, -u fail on unset
 # variables, -x echo every command as it runs.
 set -eux

 # Remove SVGs left by an earlier run (presumably so stale figures are not
 # post-processed again by the loop at the end -- TODO confirm).
 rm -f verbatim/output/*.svg

 # index.tex loads the minted package, hence --shell-escape. The quoted list
 # is the option string handed to make4ht.
 make4ht --shell-escape index.tex "pic-m,pic-equation,svg"

 # Stylesheet and markup patches. The order is significant: rules are appended
 # to index.css, and some sed passes delete or rewrite lines that are already
 # in it, so every step stays in its original position.
 echo 'img[alt="PIC"] { width: 100%; }' >> index.css
 echo '.cmtt-10 { font-size: 0.9em; }' >> index.css
 echo 'img[src="index3x.svg"] { height: 2.5em; }' >> index.css
 # Delete every stylesheet line that mentions prefers-color-scheme.
 sed -i '/prefers-color-scheme/d' index.css
 # Move a space that sits just before a closing </span> to just after it.
 sed -i 's| </span>|</span> |g' index.html
 # Drop the existing figure.figure lines, then append the replacement rule.
 sed -i '/figure.figure/d' index.css
 echo 'figure.figure { margin-left: 20px; margin-right: 20px;  }' >> index.css
 echo 'pre.verbatim { font-size: 0.9em; }' >> index.css
 # Remove the "color-scheme: light dark;" declaration wherever it appears.
 sed -i 's|color-scheme: light dark;||g' index.css
 echo 'figcaption.caption { margin-bottom: 1.3em; margin-top: 0.3em; }' >> index.css

 # Rename the stylesheet to index2.css and point index.html at the new name.
 sed -i 's/index.css/index2.css/g' index.html
 mv index.css index2.css

 # Project-local helper; its effect on index.html is not visible from here.
 python replace_video.py index.html

 for file in verbatim/output/*.svg; do
     # When nothing matches, the shell leaves the pattern itself in $file;
     # skip it so sed does not abort the build on a non-existent path.
     [ -e "$file" ] || continue
     # Strip every "rgb(0%, 0%, 100%)" colour value, then hand the file to
     # the project-local resize_svg.py helper.
     sed -i 's|rgb(0%, 0%, 100%)||g' "$file"
     python resize_svg.py "$file"
 done
--- a/deploy.sh
+++ b/deploy.sh
@ -0,0 +1 @@
 # Upload the contents of the current directory to the web root on the server.
 # `./*` instead of a bare `*` keeps a file whose name starts with `-` from
 # being parsed by rsync as an option; the same files are transferred. The
 # glob does not match dotfiles. --delete removes files on the receiving side
 # that no longer exist in the directories being transferred.
 rsync --delete -arv ./* root@az1.idk.st:/var/www/r500/
--- a/diagrams/fragment_inputs.dot
+++ b/diagrams/fragment_inputs.dot
@ -0,0 +1,61 @@
 // Three boxed rows connected top to bottom:
 //   temp / const / float  ->  src0 / src1 / src2  ->  a / b / c
 // with srcp joining at the middle row. Every node in one row is wired to
 // every node in the next.
 // NOTE(review): going by the file name this depicts fragment shader operand
 // routing -- confirm against the accompanying text.
 digraph D {
  // ranksep: vertical gap between rows; splines=line: straight edges;
  // ordering="in": incoming edges keep the left-to-right order they are
  // listed in below, so the edge order in this file affects the layout.
  graph [ranksep="1" splines=line ordering="in"];
  node [shape=box];
  // Edges are drawn without arrowheads.
  edge [arrowhead=none];
  // Top row.
  subgraph cluster_W {
    // shape=none: rendered as bare text with no box; addr has no edges.
    addr [shape=none]
    temp
    const
    float
  }
  // Middle row; rank=same keeps all members on one horizontal line.
  subgraph cluster_Z {
    {rank=same
    // Bare-text node with no edges (presumably the row label -- confirm).
    src [shape=none]
    src0 [label="src0"]
    src1 [label="src1"]
    src2 [label="src2"]
    srcp
    }
  }
  // Bottom row.
  subgraph cluster_R {
    {rank=same
    // Bare-text node with no edges (presumably the row label -- confirm).
    opcode [shape=none];
    a [label = "a"];
    b [label = "b"];
    c [label = "c"];
    }
  }
  // Top row -> middle row. The :s / :n compass ports attach each edge to the
  // bottom of its tail node and the top of its head node.
  temp:s -> src0:n
  temp:s -> src1:n
  temp:s -> src2:n
  const:s -> src0:n
  const:s -> src1:n
  const:s -> src2:n
  float:s -> src0:n
  float:s -> src1:n
  float:s -> src2:n
  // Middle row -> bottom row; srcp has no incoming edge from the top row.
  src0:s -> a:n
  src1:s -> a:n
  src2:s -> a:n
  srcp:s -> a:n
  src0:s -> b:n
  src1:s -> b:n
  src2:s -> b:n
  srcp:s -> b:n
  src0:s -> c:n
  src1:s -> c:n
  src2:s -> c:n
  srcp:s -> c:n
 }
--- a/diagrams/fragment_inputs.svg
+++ b/diagrams/fragment_inputs.svg
@ -0,0 +1,205 @@
 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
 <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
 "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
 <!-- Generated by graphviz version 12.2.1 (20241206.2353)
 -->
 <!-- Title: D Pages: 1 -->
 <svg width="382pt" height="292pt"
 viewBox="0.00 0.00 382.00 292.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
 <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 288)">
 <title>D</title>
 <polygon fill="white" stroke="none" points="-4,4 -4,-288 378,-288 378,4 -4,4"/>
 <g id="clust1" class="cluster">
 <title>cluster_W</title>
 <polygon fill="none" stroke="black" points="8,-224 8,-276 294,-276 294,-224 8,-224"/>
 </g>
 <g id="clust2" class="cluster">
 <title>cluster_Z</title>
 <polygon fill="none" stroke="black" points="8,-116 8,-168 366,-168 366,-116 8,-116"/>
 </g>
 <g id="clust4" class="cluster">
 <title>cluster_R</title>
 <polygon fill="none" stroke="black" points="35,-8 35,-60 333,-60 333,-8 35,-8"/>
 </g>
 <!-- addr -->
 <g id="node1" class="node">
 <title>addr</title>
 <text text-anchor="middle" x="43" y="-245.32" font-family="Times,serif" font-size="14.00">addr</text>
 </g>
 <!-- temp -->
 <g id="node2" class="node">
 <title>temp</title>
 <polygon fill="none" stroke="black" points="142,-268 88,-268 88,-232 142,-232 142,-268"/>
 <text text-anchor="middle" x="115" y="-245.32" font-family="Times,serif" font-size="14.00">temp</text>
 </g>
 <!-- src0 -->
 <g id="node6" class="node">
 <title>src0</title>
 <polygon fill="none" stroke="black" points="214,-160 160,-160 160,-124 214,-124 214,-160"/>
 <text text-anchor="middle" x="187" y="-137.32" font-family="Times,serif" font-size="14.00">src0</text>
 </g>
 <!-- temp&#45;&gt;src0 -->
 <g id="edge1" class="edge">
 <title>temp:s&#45;&gt;src0:n</title>
 <path fill="none" stroke="black" d="M115,-231C115,-231 187,-161 187,-161"/>
 </g>
 <!-- src1 -->
 <g id="node7" class="node">
 <title>src1</title>
 <polygon fill="none" stroke="black" points="142,-160 88,-160 88,-124 142,-124 142,-160"/>
 <text text-anchor="middle" x="115" y="-137.32" font-family="Times,serif" font-size="14.00">src1</text>
 </g>
 <!-- temp&#45;&gt;src1 -->
 <g id="edge2" class="edge">
 <title>temp:s&#45;&gt;src1:n</title>
 <path fill="none" stroke="black" d="M115,-231C115,-231 115,-161 115,-161"/>
 </g>
 <!-- src2 -->
 <g id="node8" class="node">
 <title>src2</title>
 <polygon fill="none" stroke="black" points="286,-160 232,-160 232,-124 286,-124 286,-160"/>
 <text text-anchor="middle" x="259" y="-137.32" font-family="Times,serif" font-size="14.00">src2</text>
 </g>
 <!-- temp&#45;&gt;src2 -->
 <g id="edge3" class="edge">
 <title>temp:s&#45;&gt;src2:n</title>
 <path fill="none" stroke="black" d="M115,-231C115,-231 259,-161 259,-161"/>
 </g>
 <!-- const -->
 <g id="node3" class="node">
 <title>const</title>
 <polygon fill="none" stroke="black" points="214.12,-268 159.88,-268 159.88,-232 214.12,-232 214.12,-268"/>
 <text text-anchor="middle" x="187" y="-245.32" font-family="Times,serif" font-size="14.00">const</text>
 </g>
 <!-- const&#45;&gt;src0 -->
 <g id="edge4" class="edge">
 <title>const:s&#45;&gt;src0:n</title>
 <path fill="none" stroke="black" d="M187,-231C187,-231 187,-161 187,-161"/>
 </g>
 <!-- const&#45;&gt;src1 -->
 <g id="edge5" class="edge">
 <title>const:s&#45;&gt;src1:n</title>
 <path fill="none" stroke="black" d="M187,-231C187,-231 115,-161 115,-161"/>
 </g>
 <!-- const&#45;&gt;src2 -->
 <g id="edge6" class="edge">
 <title>const:s&#45;&gt;src2:n</title>
 <path fill="none" stroke="black" d="M187,-231C187,-231 259,-161 259,-161"/>
 </g>
 <!-- float -->
 <g id="node4" class="node">
 <title>float</title>
 <polygon fill="none" stroke="black" points="286,-268 232,-268 232,-232 286,-232 286,-268"/>
 <text text-anchor="middle" x="259" y="-245.32" font-family="Times,serif" font-size="14.00">float</text>
 </g>
 <!-- float&#45;&gt;src0 -->
 <g id="edge7" class="edge">
 <title>float:s&#45;&gt;src0:n</title>
 <path fill="none" stroke="black" d="M259,-231C259,-231 187,-161 187,-161"/>
 </g>
 <!-- float&#45;&gt;src1 -->
 <g id="edge8" class="edge">
 <title>float:s&#45;&gt;src1:n</title>
 <path fill="none" stroke="black" d="M259,-231C259,-231 115,-161 115,-161"/>
 </g>
 <!-- float&#45;&gt;src2 -->
 <g id="edge9" class="edge">
 <title>float:s&#45;&gt;src2:n</title>
 <path fill="none" stroke="black" d="M259,-231C259,-231 259,-161 259,-161"/>
 </g>
 <!-- src -->
 <g id="node5" class="node">
 <title>src</title>
 <text text-anchor="middle" x="43" y="-137.32" font-family="Times,serif" font-size="14.00">src</text>
 </g>
 <!-- a -->
 <g id="node11" class="node">
 <title>a</title>
 <polygon fill="none" stroke="black" points="181,-52 127,-52 127,-16 181,-16 181,-52"/>
 <text text-anchor="middle" x="154" y="-29.32" font-family="Times,serif" font-size="14.00">a</text>
 </g>
 <!-- src0&#45;&gt;a -->
 <g id="edge10" class="edge">
 <title>src0:s&#45;&gt;a:n</title>
 <path fill="none" stroke="black" d="M187,-123C187,-123 154,-53 154,-53"/>
 </g>
 <!-- b -->
 <g id="node12" class="node">
 <title>b</title>
 <polygon fill="none" stroke="black" points="253,-52 199,-52 199,-16 253,-16 253,-52"/>
 <text text-anchor="middle" x="226" y="-29.32" font-family="Times,serif" font-size="14.00">b</text>
 </g>
 <!-- src0&#45;&gt;b -->
 <g id="edge14" class="edge">
 <title>src0:s&#45;&gt;b:n</title>
 <path fill="none" stroke="black" d="M187,-123C187,-123 226,-53 226,-53"/>
 </g>
 <!-- c -->
 <g id="node13" class="node">
 <title>c</title>
 <polygon fill="none" stroke="black" points="325,-52 271,-52 271,-16 325,-16 325,-52"/>
 <text text-anchor="middle" x="298" y="-29.32" font-family="Times,serif" font-size="14.00">c</text>
 </g>
 <!-- src0&#45;&gt;c -->
 <g id="edge18" class="edge">
 <title>src0:s&#45;&gt;c:n</title>
 <path fill="none" stroke="black" d="M187,-123C187,-123 298,-53 298,-53"/>
 </g>
 <!-- src1&#45;&gt;a -->
 <g id="edge11" class="edge">
 <title>src1:s&#45;&gt;a:n</title>
 <path fill="none" stroke="black" d="M115,-123C115,-123 154,-53 154,-53"/>
 </g>
 <!-- src1&#45;&gt;b -->
 <g id="edge15" class="edge">
 <title>src1:s&#45;&gt;b:n</title>
 <path fill="none" stroke="black" d="M115,-123C115,-123 226,-53 226,-53"/>
 </g>
 <!-- src1&#45;&gt;c -->
 <g id="edge19" class="edge">
 <title>src1:s&#45;&gt;c:n</title>
 <path fill="none" stroke="black" d="M115,-123C115,-123 298,-53 298,-53"/>
 </g>
 <!-- src2&#45;&gt;a -->
 <g id="edge12" class="edge">
 <title>src2:s&#45;&gt;a:n</title>
 <path fill="none" stroke="black" d="M259,-123C259,-123 154,-53 154,-53"/>
 </g>
 <!-- src2&#45;&gt;b -->
 <g id="edge16" class="edge">
 <title>src2:s&#45;&gt;b:n</title>
 <path fill="none" stroke="black" d="M259,-123C259,-123 226,-53 226,-53"/>
 </g>
 <!-- src2&#45;&gt;c -->
 <g id="edge20" class="edge">
 <title>src2:s&#45;&gt;c:n</title>
 <path fill="none" stroke="black" d="M259,-123C259,-123 298,-53 298,-53"/>
 </g>
 <!-- srcp -->
 <g id="node9" class="node">
 <title>srcp</title>
 <polygon fill="none" stroke="black" points="358,-160 304,-160 304,-124 358,-124 358,-160"/>
 <text text-anchor="middle" x="331" y="-137.32" font-family="Times,serif" font-size="14.00">srcp</text>
 </g>
 <!-- srcp&#45;&gt;a -->
 <g id="edge13" class="edge">
 <title>srcp:s&#45;&gt;a:n</title>
 <path fill="none" stroke="black" d="M331,-123C331,-123 154,-53 154,-53"/>
 </g>
 <!-- srcp&#45;&gt;b -->
 <g id="edge17" class="edge">
 <title>srcp:s&#45;&gt;b:n</title>
 <path fill="none" stroke="black" d="M331,-123C331,-123 226,-53 226,-53"/>
 </g>
 <!-- srcp&#45;&gt;c -->
 <g id="edge21" class="edge">
 <title>srcp:s&#45;&gt;c:n</title>
 <path fill="none" stroke="black" d="M331,-123C331,-123 298,-53 298,-53"/>
 </g>
 <!-- opcode -->
 <g id="node10" class="node">
 <title>opcode</title>
 <text text-anchor="middle" x="76" y="-29.32" font-family="Times,serif" font-size="14.00">opcode</text>
 </g>
 </g>
 </svg>
--- a/diagrams/vertex_inputs.dot
+++ b/diagrams/vertex_inputs.dot
@ -0,0 +1,36 @@
 // Two rows: input / const / temp / alt_temp on top, each wired to every one
 // of a / b / c in the boxed bottom row.
 // NOTE(review): going by the file name this depicts vertex shader operand
 // routing -- confirm against the accompanying text.
 digraph D {
  // ranksep: vertical gap between rows; splines=line: straight edges.
  graph [ranksep="1" splines=line];
  node [shape=box];
  // Edges are drawn without arrowheads.
  edge [arrowhead=none];
  // Top-row nodes; these are not placed in any cluster.
  input
  const
  temp
  alt_temp
  // shape=none: rendered as bare text with no box; opcode has no edges.
  opcode [shape=none];
  a [label = "a"];
  b [label = "b"];
  c [label = "c"];
  // Bottom row; rank=same keeps all four members on one horizontal line.
  subgraph cluster_R {
    {rank=same opcode a b c}
  }
  // The :s / :n compass ports attach each edge to the bottom of its tail
  // node and the top of its head node.
  input:s -> a:n
  input:s -> b:n
  input:s -> c:n
  const:s -> a:n
  const:s -> b:n
  const:s -> c:n
  temp:s -> a:n
  temp:s -> b:n
  temp:s -> c:n
  alt_temp:s -> a:n
  alt_temp:s -> b:n
  alt_temp:s -> c:n
 }
--- a/diagrams/vertex_inputs.svg
+++ b/diagrams/vertex_inputs.svg
@ -0,0 +1,124 @@
 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
 <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
 "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
 <!-- Generated by graphviz version 12.2.1 (20241206.2353)
 -->
 <!-- Title: D Pages: 1 -->
 <svg width="366pt" height="168pt"
 viewBox="0.00 0.00 366.12 168.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
 <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 164)">
 <title>D</title>
 <polygon fill="white" stroke="none" points="-4,4 -4,-164 362.12,-164 362.12,4 -4,4"/>
 <g id="clust1" class="cluster">
 <title>cluster_R</title>
 <polygon fill="none" stroke="black" points="8,-8 8,-60 306,-60 306,-8 8,-8"/>
 </g>
 <!-- input -->
 <g id="node1" class="node">
 <title>input</title>
 <polygon fill="none" stroke="black" points="118,-160 64,-160 64,-124 118,-124 118,-160"/>
 <text text-anchor="middle" x="91" y="-137.32" font-family="Times,serif" font-size="14.00">input</text>
 </g>
 <!-- a -->
 <g id="node6" class="node">
 <title>a</title>
 <polygon fill="none" stroke="black" points="154,-52 100,-52 100,-16 154,-16 154,-52"/>
 <text text-anchor="middle" x="127" y="-29.32" font-family="Times,serif" font-size="14.00">a</text>
 </g>
 <!-- input&#45;&gt;a -->
 <g id="edge1" class="edge">
 <title>input:s&#45;&gt;a:n</title>
 <path fill="none" stroke="black" d="M91,-124C91,-124 127,-53 127,-53"/>
 </g>
 <!-- b -->
 <g id="node7" class="node">
 <title>b</title>
 <polygon fill="none" stroke="black" points="226,-52 172,-52 172,-16 226,-16 226,-52"/>
 <text text-anchor="middle" x="199" y="-29.32" font-family="Times,serif" font-size="14.00">b</text>
 </g>
 <!-- input&#45;&gt;b -->
 <g id="edge2" class="edge">
 <title>input:s&#45;&gt;b:n</title>
 <path fill="none" stroke="black" d="M91,-124C91,-124 199,-53 199,-53"/>
 </g>
 <!-- c -->
 <g id="node8" class="node">
 <title>c</title>
 <polygon fill="none" stroke="black" points="298,-52 244,-52 244,-16 298,-16 298,-52"/>
 <text text-anchor="middle" x="271" y="-29.32" font-family="Times,serif" font-size="14.00">c</text>
 </g>
 <!-- input&#45;&gt;c -->
 <g id="edge3" class="edge">
 <title>input:s&#45;&gt;c:n</title>
 <path fill="none" stroke="black" d="M91,-124C91,-124 271,-53 271,-53"/>
 </g>
 <!-- const -->
 <g id="node2" class="node">
 <title>const</title>
 <polygon fill="none" stroke="black" points="190.12,-160 135.88,-160 135.88,-124 190.12,-124 190.12,-160"/>
 <text text-anchor="middle" x="163" y="-137.32" font-family="Times,serif" font-size="14.00">const</text>
 </g>
 <!-- const&#45;&gt;a -->
 <g id="edge4" class="edge">
 <title>const:s&#45;&gt;a:n</title>
 <path fill="none" stroke="black" d="M163,-124C163,-124 127,-53 127,-53"/>
 </g>
 <!-- const&#45;&gt;b -->
 <g id="edge5" class="edge">
 <title>const:s&#45;&gt;b:n</title>
 <path fill="none" stroke="black" d="M163,-124C163,-124 199,-53 199,-53"/>
 </g>
 <!-- const&#45;&gt;c -->
 <g id="edge6" class="edge">
 <title>const:s&#45;&gt;c:n</title>
 <path fill="none" stroke="black" d="M163,-124C163,-124 271,-53 271,-53"/>
 </g>
 <!-- temp -->
 <g id="node3" class="node">
 <title>temp</title>
 <polygon fill="none" stroke="black" points="262,-160 208,-160 208,-124 262,-124 262,-160"/>
 <text text-anchor="middle" x="235" y="-137.32" font-family="Times,serif" font-size="14.00">temp</text>
 </g>
 <!-- temp&#45;&gt;a -->
 <g id="edge7" class="edge">
 <title>temp:s&#45;&gt;a:n</title>
 <path fill="none" stroke="black" d="M235,-124C235,-124 127,-53 127,-53"/>
 </g>
 <!-- temp&#45;&gt;b -->
 <g id="edge8" class="edge">
 <title>temp:s&#45;&gt;b:n</title>
 <path fill="none" stroke="black" d="M235,-124C235,-124 199,-53 199,-53"/>
 </g>
 <!-- temp&#45;&gt;c -->
 <g id="edge9" class="edge">
 <title>temp:s&#45;&gt;c:n</title>
 <path fill="none" stroke="black" d="M235,-124C235,-124 271,-53 271,-53"/>
 </g>
 <!-- alt_temp -->
 <g id="node4" class="node">
 <title>alt_temp</title>
 <polygon fill="none" stroke="black" points="358.12,-160 279.88,-160 279.88,-124 358.12,-124 358.12,-160"/>
 <text text-anchor="middle" x="319" y="-137.32" font-family="Times,serif" font-size="14.00">alt_temp</text>
 </g>
 <!-- alt_temp&#45;&gt;a -->
 <g id="edge10" class="edge">
 <title>alt_temp:s&#45;&gt;a:n</title>
 <path fill="none" stroke="black" d="M319,-124C319,-124 127,-53 127,-53"/>
 </g>
 <!-- alt_temp&#45;&gt;b -->
 <g id="edge11" class="edge">
 <title>alt_temp:s&#45;&gt;b:n</title>
 <path fill="none" stroke="black" d="M319,-124C319,-124 199,-53 199,-53"/>
 </g>
 <!-- alt_temp&#45;&gt;c -->
 <g id="edge12" class="edge">
 <title>alt_temp:s&#45;&gt;c:n</title>
 <path fill="none" stroke="black" d="M319,-124C319,-124 271,-53 271,-53"/>
 </g>
 <!-- opcode -->
 <g id="node5" class="node">
 <title>opcode</title>
 <text text-anchor="middle" x="49" y="-29.32" font-family="Times,serif" font-size="14.00">opcode</text>
 </g>
 </g>
 </svg>
--- a/index.tex
+++ b/index.tex
@ -0,0 +1,905 @@
 \documentclass[20pt]{article}
 \usepackage[font=small,labelfont=bf]{caption}
 \usepackage{hyperref}
 \hypersetup{
    colorlinks=true,
    linkcolor=blue,
    filecolor=magenta,
    urlcolor=cyan,
    pdftitle={Radeon R500},
    pdfpagemode=FullScreen,
    }
 \usepackage{graphicx}
 \graphicspath{ {./images/} }
 \usepackage{minted}
 \title{Radeon R500}
 \date{}
 \begin{document}
 \maketitle
 \href{images/x1950xt.jpg}{\includegraphics{images/x1950xt.jpg}}
 \tableofcontents
 \section{Introduction}
 The primary/minimal project goal is ``draw a triangle on a Radeon R500 via direct
 memory-mapped hardware register and texture memory accesses''. This means no
 \href{https://mesa3d.org/}{Mesa}, no
 \href{https://github.com/torvalds/linux/tree/v6.12/drivers/gpu/drm/radeon}{radeon}
 kernel module, and certainly no OpenGL or Direct3D.
 I have worked directly with several other graphics units in the past
 (\href{https://github.com/buhman/saturn-examples}{Saturn VDP1},
 \href{https://github.com/buhman/dreamcast}{Dreamcast Holly},
 \href{https://github.com/buhman/voodoo}{Voodoo 2}). In all of these projects,
 my strategy is generally:
 \begin{itemize}
 \item read the entire \href{doc/R5xx_Acceleration_v1.5.pdf}{reference
  documentation} at least once, front-to-back
 \item copy all hardware register definitions from the documentation to a
  spreadsheet or text file (sometimes typing everything by hand if I am in such
  a chill mood)
 \item progressively build increasingly-complex example programs that exercise
  the hardware
 \end{itemize}
 The rabbit hole for R500 seems significantly deeper, considering this is the
 first graphics unit I've worked with that has programmable vertex and pixel
 shader engines.
 \subsection{Hardware}
 For testing, I currently have this hardware configuration:
 \begin{itemize}
 \item ASUS P4B-LX (Intel 845) motherboard
 \item Intel Pentium 4 2.6GHz SL6PP (Northwood)
 \item 1024 MB RAM
 \item 32GB PATA SSD
 \item ATI Radeon X1650 PRO 512MB AGP
 \end{itemize}
 I also have the X1950 XT PCIe shown in the photo, which amazingly has never been
 used: until it was unpacked for the photo, it had remained sealed in its
 antistatic bag since manufacture.
 \subsection{Test setup}
 While in my other (video game console) projects I typically insist on
 ``bare-metal'' development with no operating system or third-party library
 running on the target hardware, my experience with x86 is much more limited.
 While it is something I am interested in doing, I believe creating a
 zero-dependency ``code upload'' mechanism for an x86-pc that does not depend on
 an operating system would severely delay my progress on R500-specific work.
 For my initial exploration of R500, I will instead be manipulating the hardware
 primarily from Linux kernel space. This Linux kernel code does not actually
 meaningfully depend on Linux APIs beyond calling \texttt{ioremap} to get usable
 memory mappings for R500 PCI resources (texture/framebuffer memory and
 registers).
 \section{Progress: 07 Oct 2025}
 From 01 Oct 2025 to 07 Oct 2025, I achieved the following:
 \begin{itemize}
 \item I wrote a reasonably complete AtomBIOS disassembler
 \item I can disable (IBM PC) VGA mode and manipulate the native framebuffer
 \item I can upload microcode to the ``command processor'', and I can write to
  scratch registers via command processor packets (this is uncoincidentally the
  same command processor test that the radeon kernel module does).
 \item I stepped through Mesa functions as invoked by a simple OpenGL
  application, and created \href{mesa/glDrawArrays.txt}{a list of R500
    registers/values} that are written by Mesa during \texttt{glDrawArrays}.
 \end{itemize}
 I did not achieve the following:
 \begin{itemize}
 \item I attempted to manipulate the R500 register state and command processor
  into drawing a triangle, but I have not been successful yet
 \end{itemize}
 \subsection{Documentation}
 In general, I note that the R500 documentation is significantly weaker than I
 hoped, and does not contain enough information to draw a triangle on the R500
 from the documentation alone (with no prior knowledge about previous Radeon
 graphics units).
 In addition to the lack of prose, in several cases I've noticed both Mesa and
 Linux reference R500 registers that are
 \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/pci/undocumented_3d_registers.h}{not
  present at all} in the documentation.
 \subsection{AtomBIOS}
 AtomBIOS physically exists as a section inside the ROM on R500 graphics units.
 AtomBIOS is notably used for setting PLL/pixel clock frequencies and display
 resolutions, among several other functions.
 The Radeon graphics hardware itself does not execute AtomBIOS code---instead, it
 is expected that the host (e.g., x86) CPU evaluate the instructions in the
 AtomBIOS command tables. Generally the outcome of evaluating AtomBIOS code is
 that several ``register write'' instructions will be executed, changing the
 state of the graphics unit.
 My original goal in studying AtomBIOS was that I thought I would need it to set
 up the R500 display controller to a reasonable state (as a prerequisite for
 drawing 3D graphics). However, after actually experimenting with ``disable VGA
 mode'', I currently believe that I don't actually need to implement
 resolution/mode changes, and can proceed without it.
 \subsection{PIO mode}
 The Linux kernel exclusively communicates with R500 via ``PCI bus mastering''.
 A ``ring buffer'' is allocated in ``GTT'' space, which from the graphics unit's
 perspective exists in the same address space as framebuffer memory, but is an
 address that is outside the framebuffer memory that physically exists.
 I also observed via debugfs that the GTT apparently involves some sort of sparse
 page mapping, but I don't understand how this works from an x86 perspective.
 In the absence of an understanding of how to make my own ``GTT'' address space,
 I attempted to operate the R500 in ``PIO'' mode. This has the advantage of being
 able to simply write to registers via (simple) PCI memory-mapped accesses, but
 it has the disadvantage that Linux doesn't use R500 this way, so I have no
 reference implementation for how PIO mode should be used.
 \subsection{Triangle drawing attempt \#1}
 I translated my \href{mesa/glDrawArrays.txt}{glDrawArrays notes} to
 \href{https://git.idk.st/bilbo/r500/src/commit/b6472e4c16946f44e02d82f31adaa411df009c67/pci/triangle.c}{equivalent
  register writes}.
 This does not work, and I don't yet understand why. The main issue is that most
 of the time when I execute that code, Linux appears to ``hang'' completely, and
 my ``printk'' messages are never sent over ssh. On the rare occasion when the
 ``hang'' does not occur, a triangle is nevertheless not drawn on the
 framebuffer.
 I have a few ideas for how to proceed:
 \begin{itemize}
 \item Move the ``triangle.c'' register accesses to userspace via
  \texttt{/sys/bus/pci}, which might improve debuggability
 \item Abandon the ``write a kernel module'' idea completely, and instead
  interact with the R500 via \href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/radeon_drv.c#L565-L577}{radeon DRM ioctls}
 \end{itemize}
 The latter is perhaps both the most attractive, and the most work. I currently
 don't have any understanding of GEM buffers, radeon buffer objects, etc., so
 I'd need to study these in more detail.
 \section{Progress: 14 Oct 2025}
 From 08 Oct 2025 to 14 Oct 2025, I achieved the following:
 \begin{itemize}
 \item I studied how Mesa interacts with the \texttt{radeon} kernel module via
  \texttt{DRM\_RADEON\_} ioctls.
 \item I wrote simple R500 \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs/pvs_disassemble.py}{vertex shader} and \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs/us_disassemble.py}{pixel shader} disassemblers.
 \item I wrote a \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs/parse_packets.py}{tool} to print R500 ``PM4'' packets in human-readable form.
 \item I laboriously \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs/bits}{copied and reformatted} all bit definitions from \href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf}
 \item I wrote \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs}{several other miscellaneous tools} related to register and bit parsing and manipulation.
 \item I wrote two \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/single_color.c}{humble} \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/vertex_color.c}{demos} to draw a triangle on R500.
 \end{itemize}
 \subsection{Radeon DRM}
 As implied in the last update, primarily due to my lack of experience with
 bare-metal x86, I decided it would be a better approach to interact with R500
 Command Processor via the \texttt{radeon} kernel module, which provides a
 partially reasonable interface for this via the \texttt{DRM\_RADEON\_CS} ioctl.
 All \texttt{DRM\_RADEON\_} ioctls are mostly or entirely undocumented. Instead,
 I built debugging symbols for Mesa and other supporting libraries so that I
 could set breakpoints in GDB to observe what sequences of \texttt{DRM\_RADEON\_}
 ioctls Mesa uses.
 In my earlier \href{mesa/glDrawArrays.txt}{glDrawArrays notes},
 I had noticed this strange sequence:
 \begin{verbatim}
 0x0000138a  // type 0 packet, count=0, starting offset = RB3D_COLOROFFSET0
 0x00000000  // RB3D_COLOROFFSET0 = 0
 0xc0001000  // type 3 packet, count=0, opcode=NOP
 0x00000000  // zero (meaningless data)
 \end{verbatim}
 At first, it seemed Mesa was deliberately setting the colorbuffer write address
 to (VRAM address) zero, which seemed like a strange choice considering I am
 debugging an X11/GLX OpenGL application---surely the colorbuffer address would be
 some non-zero value several megabytes after the beginning of VRAM.
 I later attempted to send my own PM4 packet via \texttt{DRM\_RADEON\_CS}. This
 initial attempt returned \texttt{Invalid argument}, with the following
 message in dmesg:
 \begin{verbatim}
 [ 1205.978993] [drm:radeon_cs_packet_next_reloc [radeon]] *ERROR* No packet3 for relocation for packet at 14.
 [ 1205.979427] [drm] ib[14]=0x0000138E
 [ 1205.979433] [drm] ib[15]=0x00C00640
 [ 1205.979437] [drm:r300_packet0_check [radeon]] *ERROR* No reloc for ib[13]=0x4E28
 [ 1205.979545] [drm] ib[12]=0x0000138A
 [ 1205.979548] [drm] ib[13]=0x00000000
 [ 1205.979553] [drm:radeon_cs_ioctl [radeon]] *ERROR* Invalid command stream !
 \end{verbatim}
 This error message comes from
 \href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r300.c#L664-L669}{drm/radeon/r300.c}.
 The meaningless data following the type-3 NOP packet is used by the kernel to
 \href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/radeon_cs.c#L875-L889}{index}
 the \texttt{DRM\_RADEON\_CS} ``relocs'' array (an array of GEM buffer handles).
 It seems perhaps the design goal was to never expose the VRAM address of GEM
 buffers to userspace (indeed there seems to be no way to retrieve that via any
 GEM ioctls). This restriction is slightly disappointing, as I would have
 preferred to be able to send unmodified packet data to the R500.
 However, at the moment this does not appear to be a significant issue, as a
 relatively small number of registers are modified by the Linux kernel's packet
 parser prior to creating the indirect buffer that is actually sent to the R500
 hardware.
 \subsection{Indirect buffers}
 There appears to be a lot of memory-to-memory copying in the
 Linux/Mesa/DRM/GEM/radeon graphics stack:
 \begin{itemize}
 \item Mesa writes the OpenGL state to various internal structures
 \item Mesa \href{https://gitlab.freedesktop.org/mesa/mesa/-/blob/25.0/src/gallium/drivers/r300/r300_emit.c?ref_type=heads}{copies} OpenGL state to packet commands in a userspace buffer
 \item Mesa
  \href{https://gitlab.freedesktop.org/mesa/mesa/-/blob/25.0/src/gallium/winsys/radeon/drm/radeon_drm_cs.c?ref_type=heads#L486-487}{passes
    the address} of the userspace buffer to the kernel via
  \texttt{DRM\_RADEON\_CS}
 \item Linux
  \href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/radeon_cs.c#L340-L358}{copies
    the entire userspace buffer} to kernel space (calling kvmalloc/kvfree on
  each ioctl)
 \item The \texttt{radeon\_cs\_parser} parses and modifies the buffer originally
  generated by Mesa
 \item \href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/radeon_cs.c#L613}{radeon\_cs\_ib\_fill} copies the parser result to gpu address space.
 \end{itemize}
 Eventually,
 \href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r100.c#L3709-L3722}{r100\_ring\_ib\_execute}
 is called, which writes the indirect buffer address (now in GPU address space)
 to the ring.
 It would be interesting to experiment with writing a packet buffer directly in
 GPU/GTT address space (from Linux userspace), with zero copies. This would
 require an entirely new set of ioctls.
 \subsection{Triangle drawing attempt \#2}
 These images were never drawn on-screen. I extracted them from VRAM via
 \texttt{/sys/kernel/debug/radeon\_vram}.
 \begin{figure}
  \href{images/single_color_macrotiled.png}{\includegraphics{images/single_color_macrotiled.png}}
  \caption*{R500 framebuffer capture, \texttt{single\_color.c}}
 \end{figure}
 Though I was not aware of it yet, the above image was indeed my triangle, and
 \texttt{COLORPITCH0} was merely in ``macrotiled'' mode. Once I realized this, I
 produced this image (still in off-screen VRAM):
 \begin{figure}
  \href{images/single_color.png}{\includegraphics{images/single_color.png}}
  \caption*{R500 framebuffer capture, \texttt{single\_color.c}}
 \end{figure}
 This \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/single_color.c}{``single color''} demo deliberately uses very simple vertex and fragment
 shaders:
 \begin{figure}
 \begin{verbatim}
 instruction[0]:
  0x00f00203  dst: VE_ADD out[0].xyzw
  0x00d10001  src0: input[0].xyzw
  0x01248001  src1: input[0].0000
  0x01248001  src2: input[0].0000
 \end{verbatim}
 \caption*{R500 vertex shader (1 instruction, 128-bit control word)}
 \end{figure}
 This vertex shader is doing the equivalent of:
 \begin{figure}
  \href{verbatim/vertex_shader_equivalent_single_color.glsl}{\includegraphics{verbatim/output/vertex_shader_equivalent_single_color.glsl.pdf}}
 \end{figure}
 The W component \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/single_color.c#L339}{comes from}
 \texttt{VAP\_PROG\_STREAM\_CNTL\_EXT\_\_SWIZZLE\_SELECT\_W\_0(5)}, which
 swizzles W to a constant \texttt{1.0}, despite W not being present in the vertex
 data.
 \begin{figure}
 \begin{verbatim}
 instruction[0]:
  0x00078005  OUT RGBA
  0x08020080  RGB ADDR0=0.0 ADDR1=0.0 ADDR2=0.0
  0x08020080  ALPHA ADDR0=0.0 ADDR1=0.0 ADDR2=0.0
  0x1c9b04d8  RGB_SEL_A=src0.110 RGB_SEL_B=src0.110 TARGET=A
  0x1c810003  ALPHA_OP=OP_MAX ALPHA_SEL_A=src0.0 ALPHA_SEL_B=src0.0 TARGET=A
  0x00000005  RGB_OP=OP_MAX
 \end{verbatim}
 \caption*{R500 fragment shader (1 instruction, 192-bit control word)}
 \end{figure}
 This fragment shader is doing the equivalent of:
 \begin{figure}
  \href{verbatim/fragment_shader_equivalent_single_color.glsl}{\includegraphics{verbatim/output/fragment_shader_equivalent_single_color.glsl.pdf}}
 \end{figure}
 via the src swizzles. I think it is interesting that there are so many options
 for producing inline constants within the fragment shader.
 The ``target'' fragment shader field also seems interesting. I am excited to
 write shaders that use multiple output buffers.
 \subsection{DRM/KMS/GBM}
 These renders were not displayed on-screen, so I looked for ways to correct
 this.
 Perhaps the most obvious method would be to write to the display controller
 registers (\texttt{D1GRPH\_PRIMARY\_SURFACE\_ADDRESS}) via
 \texttt{DRM\_RADEON\_CS}. However, this does not work due to the command parser
 anti-fun implemented in
 \href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r300.c#L643}{r300\_packet0\_check}:
 any register not present in that case statement is considered invalid, and the
 packet buffer is not submitted.
 I attempted to do this the ``right way'' via the DRM/KMS/GBM APIs. I then
 learned that this does not behave correctly on my R500 because demos that wait
 for the flag returned by \texttt{DRM\_IOCTL\_MODE\_PAGE\_FLIP} hang forever.
 I noticed this earlier on Xorg/GLX as well, as I have been using the
 \texttt{vblank\_mode=0} environment variable to avoid hanging forever in
 \texttt{glXSwapBuffers}. This appears to be a Linux kernel bug, but I didn't
 investigate this further.
 \subsection{On-screen drawing}
 I noticed in \texttt{/sys/kernel/debug/radeon\_vram\_mm} that the Linux console
 is only using a single framebuffer (and does not double-buffer).
 This is fortunate, because this means I can simply
 \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/pci_user/main.c#L48}{mmap
  the register address space} and write
 \texttt{D1GRPH\_PRIMARY\_SURFACE\_ADDRESS} myself without worrying about the
 Linux console overwriting my change. I observed the \texttt{0x813000} value from
 \texttt{/sys/kernel/debug/radeon\_vram\_mm}--there appears to be no other way to
 get the vram address of a GEM buffer.
 This is ``good enough'' for now, though at some point I'll want to learn how to
 do proper vblank-synchronized double buffering.
 \subsection{Triangle drawing attempt \#3}
 I felt the next logical step was to learn how attributes and constants are
 passed through the shader pipeline, so I then \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/vertex_color.c}{created a demo} that produced this image (this time also displayed on-screen):
 \begin{figure}
  \href{images/vertex_color.png}{\includegraphics{images/vertex_color.png}}
  \caption*{R500 framebuffer capture, \texttt{vertex\_color.c}}
 \end{figure}
 \begin{figure}
 \begin{verbatim}
 instruction[0]:
  0x00702203  dst: VE_ADD out[1].xyz_
  0x01d10021  src0: input[1].xyz_
  0x01248021  src1: input[1].0000
  0x01248021  src2: input[1].0000
 instruction[1]:
  0x00f00203  dst: VE_ADD out[0].xyzw
  0x01510001  src0: input[0].xyz1
  0x01248001  src1: input[0].0000
  0x01248001  src2: input[0].0000
 \end{verbatim}
 \caption*{R500 vertex shader (2 instructions, 128-bit control words)}
 \end{figure}
 This vertex shader is doing the equivalent of:
 \begin{figure}
  \href{verbatim/vertex_shader_equivalent_vertex_color.glsl}{\includegraphics{verbatim/output/vertex_shader_equivalent_vertex_color.glsl.pdf}}
 \end{figure}
 The extra vertex input is fed to the vertex shader via changes to
 \texttt{VAP\_PROG\_STREAM\_CNTL\_0},
 \texttt{VAP\_PROG\_STREAM\_CNTL\_EXT\_0}. Based on my currently limited
 understanding, it seems that arranging the vertex data like this:
 \begin{figure}
  \href{verbatim/vap_prog_stream_vertices.c}{\includegraphics{verbatim/output/vap_prog_stream_vertices.c.pdf}}
 \end{figure}
 is easier to deal with in \texttt{VAP\_PROG\_STREAM\_CNTL} than:
 \begin{figure}
  \href{verbatim/vap_prog_stream_vertices2.c}{\includegraphics{verbatim/output/vap_prog_stream_vertices2.c.pdf}}
 \end{figure}
 \begin{figure}
 \begin{verbatim}
 instruction[0]:
  0x00078005  OUT RGBA
  0x08020000  RGB ADDR0=temp[0] ADDR1=0.0 ADDR2=0.0
  0x08020080  ALPHA ADDR0=0.0 ADDR1=0.0 ADDR2=0.0
  0x1c440220  RGB_SEL_A=src0.rgb RGB_SEL_B=src0.rgb TARGET=A
  0x1cc18003  ALPHA_OP=OP_MAX ALPHA_SEL_A=src0.1 ALPHA_SEL_B=src0.1 TARGET=A
  0x00000005  RGB_OP=OP_MAX
 \end{verbatim}
 \caption*{R500 fragment shader (1 instruction, 192-bit control word)}
 \end{figure}
 This fragment shader is doing the equivalent of:
 \begin{figure}
  \href{verbatim/fragment_shader_equivalent_vertex_color.glsl}{\includegraphics{verbatim/output/fragment_shader_equivalent_vertex_color.glsl.pdf}}
 \end{figure}
 The \texttt{temp} input appears to be written by
 \texttt{VAP\_OUT\_VTX\_FMT\_0\_\_VTX\_COLOR\_0\_PRESENT} and read due to the
 changes to \texttt{RS\_COUNT} and \texttt{RS\_INST\_0}.
 \section{Progress: 21 Oct 2025}
 From 15 Oct 2025 to 21 Oct 2025, I achieved the following (roughly in chronological order):
 \begin{itemize}
 \item I learned how the vertex fetcher is \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/vertex_color_aos.c#L387-L401}{configured}
 \item I learned how the ``point list'' drawing primitive can be used to \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear.c#L504}{clear the screen}
 \item I invented a new syntax for R500 vertex shader assembly (ATI never specified one themselves)
 \item I modified my R500 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/pvs_disassemble.py}{vertex shader disassembler} to emit this new vertex shader syntax
 \item I wrote a R500 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/assembler/vs}{vertex shader assembler} that can process my vertex shader assembly syntax
 \item I created several animated demos with \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear_translate_vblank.c#L849-L859}{vblank-synchronized double buffering}
 \item I learned how to configure and draw (multi-)textured triangles
 \item I learned how to configure, clear, and use Z-buffers
 \item I made a \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/texture_cube_clear_zwrite_vertex_shader.c}{textured rotating cube demo} that uses my first non-trivial \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/cube_rotate.vs.asm}{handwritten vertex shader assembly program}
 \item I invented a new syntax for R500 fragment shader assembly (ATI never specified one themselves)
 \item I wrote a new R500 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/us_disassemble2.py}{fragment shader disassembler} that emits this new fragment shader syntax
 \item I wrote a R500 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/assembler/fs}{fragment shader assembler} that can process my fragment shader assembly syntax
 \item I wrote a ``shadertoy''-style demo that uses my first non-trivial \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/shadertoy_palette.fs.asm}{handwritten fragment shader assembly program}
 \end{itemize}
 \subsection{DRM\_RADEON\_CS state tracking}
 While attempting to refactor one of my R500 demos to send fewer registers per
 \texttt{DRM\_RADEON\_CS} ioctl, I found that there is a ``state tracker'' within
 the \texttt{drm/radeon/r100} code. Even if you don't use or depend on a
 Z-buffer, \texttt{DRM\_RADEON\_CS} will still reject your packet buffer
 depending on its own (imagined) concept of what the GPU state is. For example:
 \begin{verbatim}
 [ 1614.729278] [drm:r100_cs_track_check [radeon]] *ERROR* [drm] No buffer for z buffer !
 [ 1614.729626] [drm:radeon_cs_ioctl [radeon]] *ERROR* Invalid command stream !
 \end{verbatim}
 This happens because \texttt{track->z\_enabled} is
 \href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r100.c#L2435}{initially
  true} at the start of a \texttt{DRM\_RADEON\_CS} ioctl, and does not become
 false unless the packet buffer
 \href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r300.c#L836-L843}{contains
  a write} to \texttt{ZB\_CNTL}.
 This seems a bit heavy-handed. Even if the model were ``multiple applications
 may be using the GPU, so a single application can't depend on previously-set
 register state'', it would still be better if the kernel didn't try to enforce
 this by restricting permissible content of a packet buffer.
 \subsection{Vertex transform bypass}
 Mesa uses a ``point'' 3D primitive to implement \texttt{glClear} on R500. It
 does this by first uploading this vertex shader:
 \begin{figure}
  \href{verbatim/mesa_glclear.vs.asm}{\includegraphics{verbatim/output/mesa_glclear.vs.asm.pdf}}
  \caption*{\texttt{mesa\_glclear.vs.asm}}
 \end{figure}
 This shader does nothing to the input other than copy it to the output, where
 \texttt{out[0]} is the position vector, and \texttt{out[1]} is sent to the
 fragment shader as a ``texture coordinate''. That fragment shader, in turn, does
 not use the texture coordinate:
 \begin{figure}
  \href{verbatim/mesa_glclear.fs.asm}{\includegraphics{verbatim/output/mesa_glclear.fs.asm.pdf}}
  \caption*{\texttt{mesa\_glclear.fs.asm}}
 \end{figure}
 In my ``clear''
 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear_rotate_vblank.c#L539}{implementation},
 I instead set \texttt{PVS\_BYPASS}, which ``bypasses'' the vertex shader
 completely, sending the vertices directly to the rasterizer. This is convenient
 because it obviates the need to upload/change vertex shaders just to clear the
 color and Z-buffers.
 \subsection{Animation attempt \#1}
 With a working colorbuffer clear, I wrote the
 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear_translate.c#L786}{single\_color\_clear\_translate.c}
 demo to translate my triangle position coordinates in a loop that waits for
 \texttt{DRM\_RADEON\_GEM\_WAIT\_IDLE} between each frame. This attempt
 produced the following images:
 \begin{figure}
  \includegraphics{videos/single_color_clear_translate.png}
  \caption*{R500 DVI capture, \texttt{single\_color\_clear\_translate.c}}
 \end{figure}
 This was intended to be a smooth animation, yet it is not. It also seems several
 frames are never being displayed--the translation step is much smaller than what
 is shown in the video.
 This, interestingly, is exactly identical to how OpenGL/GLX applications behave
 on R500 with \texttt{vblank\_mode=0}.
 \subsection{Animation attempt \#2}
 I read the R500 display controller \href{doc/RRG-216M56-03oOEM.pdf}{register reference guide} again.
 It appears to suggest the \texttt{D1CRTC\_UPDATE\_INSTANTLY} bit, when unset,
 might cause changes to \texttt{D1GRPH\_PRIMARY\_SURFACE\_ADDRESS} to be delayed in
 hardware until the next vertical blanking interval begins.
 This can be combined with polling \texttt{D1GRPH\_SURFACE\_UPDATE\_PENDING} to
 later determine when the vblank-synchronized frame change actually occurred.
 This is precisely what I implemented in
 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear_translate_vblank.c#L854-L855}{single\_color\_clear\_translate\_vblank.c}:
 \begin{figure}
  \includegraphics{videos/single_color_clear_translate_vblank.png}
  \caption*{R500 DVI capture, \texttt{single\_color\_clear\_translate\_vblank.c}}
 \end{figure}
 This is much closer to what I intended. The
 \texttt{D1GRPH\_SURFACE\_UPDATE\_PENDING} part is certainly working as I
 expected. Setting/unsetting \texttt{D1CRTC\_UPDATE\_INSTANTLY} appears to have
 no effect on \texttt{D1GRPH\_PRIMARY\_SURFACE\_ADDRESS} behavior, so I feel my
 understanding of R500 double-buffering is still incomplete.
 \subsection{Multiple-texture sampling}
 I am amazed and delighted by how simple multiple-texture sampling is on R500.
 As a counter-example, while Sega Dreamcast does have a fairly capable
 fixed-function blending unit, to use the blending unit with multiple-texture
 sampled polygons one needs to render the polygon multiple times (at least once
 per texture) to an accumulation buffer. Blending is then performed between the
 currently-sampled texture and the previously-accumulated result, and the blend
 result is written to the accumulation buffer. From a vertex transformation
 perspective, it can be inconvenient/inefficient to be required to buffer entire
 triangle strips so that they can be submitted more than once per frame without
 duplicating the clip/transform computations.
 This is the fragment shader for
 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/texture_dual.c}{texture\_dual.c}
 (disassembly of code originally generated by Mesa):
 \begin{figure}
  \href{verbatim/texture_dual.fs.asm}{\includegraphics{verbatim/output/texture_dual.fs.asm.pdf}}
  \caption*{\texttt{texture\_dual.fs.asm}}
 \end{figure}
 This pre-subtract multiply-add is an algebraic rearrangement of this GLSL code:
 \begin{figure}
  \href{verbatim/texture_dual.fs.glsl}{\includegraphics{verbatim/output/texture_dual.fs.glsl.pdf}}
  \caption*{\texttt{texture\_dual.fs.glsl}}
 \end{figure}
 Which produces this image:
 \begin{figure}
  \href{images/texture_dual.png}{\includegraphics{images/texture_dual.png}}
  \caption*{R500 framebuffer capture, \texttt{texture\_dual.c}}
 \end{figure}
 Being able to manipulate the texture samples as fragment shader unit temporaries
 rather than as a sequence of accumulation buffer operations has me feeling excited
 to do more with this.
 \subsection{Z-buffer clear}
 I've never worked with traditional Z-buffers before--Sega Saturn uses
 \href{https://en.wikipedia.org/wiki/Painter\%27s_algorithm}{painter's algorithm}
 exclusively, and Sega Dreamcast uses a ``depth accumulation buffer''
 that isn't directly readable/writable.
 It is slightly obvious in retrospect, but it took me several minutes to realize
 that a ``depth clear'' can be implemented by covering the entire screen with a
 ``point'' primitive with the desired initial depth while \texttt{ZFUNC} is set to
 \texttt{ALWAYS}.
 \subsection{Drawing a 3D cube}
 With working double-buffering, Z-buffering, and the ability to clear each of
 these every frame, I felt I was finally ready to draw something ``3D''.
 I thought it would be fun to first start with a cube that is transformed in
 ``software'' on the x86 CPU (not using a vertex shader). This sequence of videos
 shows my progression on implementing this:
 \begin{figure}
  \includegraphics{videos/texture_cube.png}
  \caption*{R500 DVI capture, \texttt{texture\_cube.c}}
 \end{figure}
 \begin{figure}
  \includegraphics{videos/texture_cube_clear.png}
  \caption*{R500 DVI capture, \texttt{texture\_cube\_clear.c}}
 \end{figure}
 \begin{figure}
  \includegraphics{videos/texture_cube_clear_zwrite.png}
  \caption*{R500 DVI capture, \texttt{texture\_cube\_clear\_zwrite.c}}
 \end{figure}
 \subsection{Drawing a 3D cube with vertex shaders}
 I then decided it would be fun to hand-write a ``3D rotation'' vertex shader
 from scratch. I first implemented the rotation in GLSL:
 \begin{figure}
  \href{verbatim/cube_rotate.vs.glsl}{\includegraphics{verbatim/output/cube_rotate.vs.glsl.pdf}}
  \caption*{\texttt{cube\_rotate.vs.glsl}}
 \end{figure}
 I verified that the GLSL version worked as expected in OpenGL, then I translated
 the GLSL to R500 vertex shader assembly, as:
 \begin{figure}
  \href{verbatim/cube_rotate.vs.asm}{\includegraphics{verbatim/output/cube_rotate.vs.asm.pdf}}
  \caption*{\texttt{cube\_rotate.vs.asm}}
 \end{figure}
 However, when I first executed the vertex shader cube rotation demo, I found
 it did not work as expected:
 \begin{figure}
  \includegraphics{videos/texture_cube_clear_zwrite_vertex_shader_incorrect.png}
  \caption*{R500 DVI capture, \texttt{texture\_cube\_clear\_zwrite\_vertex\_shader.c}\\(incorrect vertex shader assembler output)}
 \end{figure}
 After hours of debugging, I eventually found the issue was in this instruction:
 \begin{figure}
  \href{verbatim/cube_rotate_3_temp.vs.asm}{\includegraphics{verbatim/output/cube_rotate_3_temp.vs.asm.pdf}}
 \end{figure}
 \href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf} briefly mentions this on pages 98 and 99:
 \begin{quote}
 The PVS\_DST\_MACRO\_INST bit was meant to be used for MACROS such as a
 vector-matrix multiply, but currently is only set for the following cases:
 A VE\_MULTIPLY\_ADD or VE\_MULTIPLYX2\_ADD instruction with all 3 source
 operands using unique PVS\_REG\_TEMPORARY vector addresses.  Since R300 only has
 two read ports on the temporary memory, this special case of these instructions
 is broken up (by the HW) into 2 operations.
 \end{quote}
 I read this paragraph much earlier, but I didn't fully understand it until
 now. Indeed, this multiply-add has three unique \texttt{temp} addresses, and
 must be encoded as a ``macro'' instruction.
 I fixed this in my vertex shader assembler by
 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/assembler/vs/validator.py}{counting the number of unique temp addresses}
 referenced by each instruction, promoting \texttt{VE\_MULTIPLY\_ADD} to
 \texttt{PVS\_MACRO\_OP\_2CLK\_MADD} if more than two unique \texttt{temp}
 addresses are referenced.
 With this change, reassembling the same vertex shader source code now produces a
 correct vertex shader cube rotation:
 \begin{figure}
  \includegraphics{videos/texture_cube_clear_zwrite_vertex_shader.png}
  \caption*{R500 DVI capture, \texttt{texture\_cube\_clear\_zwrite\_vertex\_shader.c}\\(correct vertex shader assembler output)}
 \end{figure}
 \subsection{Comparison with Mesa's R500 vertex shader compiler}
 My ``cube rotation'' vertex shader,
 \href{https://git.idk.st/bilbo/r500/src/commit/50244c7c95/drm/cube_rotate.vs.asm}{cube\_rotate.vs.asm},
 is 15 instructions.
 Mesa's R500 vertex shader compiler generated a
 \href{https://git.idk.st/bilbo/r500/src/commit/50244c7c95/shader_examples/mesa/texture_cube_depth_vertex_shader.vs.txt}{27-instruction vertex shader}
 from \href{https://r500.idk.st/verbatim/cube_rotate.vs.glsl}{semantically equivalent GLSL code}. Disassembly:
 \begin{figure}
  \href{verbatim/mesa_cube_rotate.vs.asm}{\includegraphics{verbatim/output/mesa_cube_rotate.vs.asm.pdf}}
  \caption*{\texttt{mesa\_cube\_rotate.vs.asm}}
 \end{figure}
 I was not particularly trying to write concise code, but I find this difference
 in instruction count to be surprising. In general it seems Mesa's R500 vertex
 shader compiler failed to vectorize several operations, and does significantly
 more scalar multiplies and scalar multiply-adds than my implementation.
 Ignoring algorithmic improvements (such as lifting the sin/cos calculation to
 x86 code and instead sending a 4x4 matrix to the vertex shader), there is still
 more opportunity for optimization beyond my 15-instruction implementation.
 Particularly, the vertex shader unit has a ``dual math'' instruction mode, where
 ``vector engine'' (VE\_) and ``math engine'' (ME\_) operations can be executed
 simultaneously in the same instruction. \texttt{cube\_rotate.vs.asm} would
 indeed benefit from such an optimization--most of the \texttt{ME\_SIN} and
 \texttt{ME\_COS} instructions could be interleaved with the \texttt{VE\_MUL} and
 \texttt{VE\_MAD} operations that follow (at significant expense to
 human-readability).
 I am curious to see more examples of the difference between Mesa's R500 vertex
 shader compiler output and my own vertex shader assembly.
 \subsection{Fragment shader instruction expressiveness}
 Compared to the R500 vertex shader instructions, the R500 fragment shader
 instructions are significantly more featureful. This makes inventing a syntax
 that can fully express the range of operations that a R500 fragment shader
 instruction can do more complex.
 A significant difference is where R500 vertex shaders have a single tier of
 operand argument decoding, as in:
 \begin{figure}
  \includegraphics{diagrams/vertex_inputs.svg}
  \caption*{R500 vertex shader instruction operand inputs (simplified)}
 \end{figure}
 While R500 fragment shaders have multiple tiers of operand argument decoding, as
 in:
 \begin{figure}
  \includegraphics{diagrams/fragment_inputs.svg}
  \caption*{R500 fragment shader instruction operand inputs (simplified)}
 \end{figure}
 I've written several \href{https://github.com/buhman/scu-dsp-asm}{nice assemblers}
 for other architectures in the past, but I've never seen any instruction set
 as expressive as R500 fragment shaders.
 I attempted to directly reflect this ``multiple tiers of operand argument
 decoding'' in the syntax I invented for fragment shader ALU instructions.
 These instructions are also vector instructions: a total of 24 floating point
 input operands and 8 floating point results could be evaluated per instruction.
 With this abundance of expressiveness and a relatively high skill ceiling, I'm
 amazed R500 fragment shader assembly isn't more popular in programming
 competitions, general everyday conversation, etc...
 \subsection{Fragment shader assembler bugs}
 There were two ``I spent a lot of time debugging this'' issues I encountered
 with my fragment shader assembler.
 The first was in this code I wrote to draw a fragment shaded circle, as in:
 \begin{figure}
  \href{images/shadertoy_circle.png}{\includegraphics{images/shadertoy_circle.png}}
  \caption*{R500 framebuffer capture, \texttt{shadertoy\_circle.fs.asm}}
 \end{figure}
 However, in an earlier version of my fragment shader assembler, I produced this
 image instead:
 \begin{figure}
  \href{images/shadertoy_circle_incorrect.png}{\includegraphics{images/shadertoy_circle_incorrect.png}}
  \caption*{R500 framebuffer capture, \texttt{shadertoy\_circle.fs.asm}\\(incorrect assembler output)}
 \end{figure}
 In this handwritten fragment shader code:
 \begin{figure}
  \href{verbatim/shadertoy_circle.fs.asm}{\includegraphics{verbatim/output/shadertoy_circle.fs.asm.pdf}}
  \caption*{\texttt{shadertoy\_circle.fs.asm}}
 \end{figure}
 \href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf} says briefly on page 241:
 \begin{quote}
 Specifies whether to insert a NOP instruction after this.  This would get
 specified in order to meet dependency requirements for the pre-subtract inputs,
 and dependency requirements for src0 of an MDH/MDV instruction.
 \end{quote}
 The issue is the pre-subtract input for the \texttt{MAD |srcp.a| src0.1 -src2.a}
 instruction depends on the write to \texttt{temp[0].a} from the immediately
 preceding \texttt{RCP src0.a} instruction--a pipeline hazard.
 To fix this, I added support for
 \href{https://git.idk.st/bilbo/r500/commit/fe0684ca5e58ed3be026410812c042e883bdce71}{generating the \texttt{NOP} bit}
 in my fragment shader assembler.
 \subsection{More fragment shader assembler bugs}
 While trying to produce this image:
 \begin{figure}
  \href{images/shadertoy_palette.png}{\includegraphics{images/shadertoy_palette.png}}
  \caption*{R500 framebuffer capture, \texttt{shadertoy\_palette.fs.asm}}
 \end{figure}
 My fragment shader code instead produced this image:
 \begin{figure}
  \href{images/shadertoy_palette_incorrect.png}{\includegraphics{images/shadertoy_palette_incorrect.png}}
  \caption*{R500 framebuffer capture, \texttt{shadertoy\_palette.fs.asm}\\(incorrect assembler output)}
 \end{figure}
 The issue was simply that in the chaos of all of the other features I was
 implementing for my fragment shader assembler, I
 \href{https://git.idk.st/bilbo/r500/commit/f6a0fc4fab5dee3085dcf4b9a984244bba05d5ca}{forgot to emit the \texttt{ADDRD} bits}.
 This meant that while fragment shader code that exclusively uses zero-address
 destinations, such as \texttt{shadertoy\_circle.fs.asm}, appeared to work
 completely correctly, I encountered this bug as soon as I started using non-zero
 addresses such as \texttt{temp[1]} in my fragment shader code.
 \subsection{Comparison to Direct3D ``asm''}
 Prior to Direct3D 10, Microsoft defined a specification for both
 \href{https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx9-graphics-reference-asm-vs-3-0}{vertex shader assembly} and
 \href{https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx9-graphics-reference-asm-ps-3-0}{fragment shader assembly}.
 The Direct3D ``asm'' name is slightly deceptive, however, as the
 \texttt{vs\_3\_0} and \texttt{ps\_3\_0} instruction syntax does not map 1-to-1
 with any hardware that exists.
 It would perhaps be more accurate to think of Direct3D's ``asm''
 language and compiler as more analogous to a
 \href{https://en.wikipedia.org/wiki/BASIC}{shader BASIC} than as a true assembly
 language on the same level as ``6502 assembly'', ``Z80 assembly'' and similar.
 In contrast, my R500 assembly syntaxes are deliberately/explicitly mapped 1-to-1
 with R500 instructions.
 \subsection{Fragment shader animated demo}
 \begin{figure}
  \includegraphics{videos/shadertoy_palette.png}
  \caption*{R500 DVI capture, \texttt{shadertoy\_palette.fs.asm}}
 \end{figure}
 The R500 fragment shader code that I handwrote for this is:
 \begin{figure}
  \href{verbatim/shadertoy_palette.fs.asm}{\includegraphics{verbatim/output/shadertoy_palette.fs.asm.pdf}}
  \caption*{\texttt{shadertoy\_palette.fs.asm}}
 \end{figure}
 The \texttt{float} constants are interesting--they are decoded almost
 identically to the
 \href{https://en.wikipedia.org/wiki/Minifloat#8-bit_(1.4.3)}{8-bit (1.4.3) (bias 7) format shown on Wikipedia},
 except:
 \begin{itemize}
 \item There is no sign bit (the value is always positive--positive values
  can be swizzled to produce negative operands)
 \item There is no ``zero'' value (zero can also be instead obtained via
  swizzles); the ``all zeros'' bit pattern instead has a value of
  \texttt{0.0009765625}.
 \item There are no infinite or not-a-number values: a ``15'' exponent is treated
  as 15.
 \end{itemize}
 The exponent/mantissa table that shows example 7-bit float values on page 106 of
 \href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf} is incorrect.
 \end{document}
--- a/mesa/glDrawArrays.txt
+++ b/mesa/glDrawArrays.txt
@ -0,0 +1,197 @@
 // _mesa_draw_arrays
 // r300_draw_vbo
 // r300_draw_arrays_immediate
  vertex_size = 3
  dwords = 13
  // r300_prepare_for_rendering
    // r300_emit_states
      // r300_reserve_cs_dwords
        389
      // r300_emit_dirty_state
        // r300_emit_gpu_flush
          SC_SCISSOR0 = 0
          SC_SCISSOR1 = (width - 1), (height - 1) // 600, 600
          // cb_flush_clean
          RB3D_DSTCACHE_CTLSTAT = 0xa
          ZB_ZCACHE_CTLSTAT = 0x3
          WAIT_UNTIL [0x1720] =  RADEON_WAIT_3D_IDLECLEAN
        // r300_emit_aa_state
          GB_AA_CONFIG = 0
          RB3D_AARESOLVE_CTL = 0
        // r300_emit_fb_state
          RB3D_CCTL = 16384
          RB3D_COLOROFFSET0 = 0
          //OUT_CS_RELOC
            OUT_CS(0xc0001000); /* PKT3_NOP */ \
            OUT_CS(0);
          RB3D_COLORPITCH0 = 0xc10640
          //OUT_CS_RELOC
            OUT_CS(0xc0001000); /* PKT3_NOP */ \
            OUT_CS(0);
          ZB_FORMAT = 2
          ZB_DEPTHOFFSET = 0
          //OUT_CS_RELOC
            OUT_CS(0xc0001000); /* PKT3_NOP */ \
            OUT_CS(4);
          ZB_DEPTHPITCH = 0x30640
          //OUT_CS_RELOC
            OUT_CS(0xc0001000); /* PKT3_NOP */ \
            OUT_CS(4);
        // r300_emit_hyperz_state
          ZB_BW_CNTL = 0
          ZB_DEPTHCLEARVALUE = 0
          SC_HYPERZ_EN = 0x1c
          GB_Z_PEQ_CONFIG = 0
        // r300_emit_ztop_state
          ZB_ZTOP = 1
        // r300_emit_dsa_state
          FG_ALPHA_FUNC = 0
          ZB_CNTL = 0
          ZB_ZSTENCILCNTL = 0
          ZB_STENCILREFMASK = 0
          ZB_STENCILREFMASK_BF = 0
          FG_ALPHA_VALUE = 0
        // r300_emit_blend_state
          RB3D_ROPCNTL = 0
          RB3D_BLENDCNTL = 0
          RB3D_ABLENDCNTL = 0
          RB3D_COLOR_CHANNEL_MASK = 15
          RB3D_DITHER_CTL = 0
        // r300_emit_blend_color_state
          RB3D_CONSTANT_COLOR_AR = 0
          RB3D_CONSTANT_COLOR_GB = 0
        // r300_emit_scissor_state
          SC_CLIP_0_A = 0, 0
          SC_CLIP_0_B = 0 - 1, 0 - 1
        // r300_emit_sample_mask
          SC_SCREENDOOR = 63 | (63 << 6) | (63 << 12) | (63 << 18)
        // r300_emit_invariant_state
          GB_SELECT = 0
          FG_FOG_BLEND = 0
          GA_OFFSET = 0
          SU_TEX_WRAP = 0
          SU_DEPTH_SCALE = 16777215.0f (0x4b7fffff)
          SU_DEPTH_OFFSET = 0
          SC_EDGERULE = 0x2da49525
          RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD = 0x1010101
          RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD = 0xfefefefe
          GA_COLOR_CONTROL_PS3 = 0
          SU_TEX_WRAP_PS3 = 0
        // r300_emit_viewport_state
          VAP_VPORT_XSCALE = 300
          VAP_VPORT_XOFFSET = 300
          VAP_VPORT_YSCALE = -300
          VAP_VPORT_YOFFSET = 300
          VAP_VPORT_ZSCALE = 0.5
          VAP_VPORT_ZOFFSET = 0.5
          VAP_VTE_CNTL = 0x43f
        // r300_emit_pvs_flush
          VAP_PVS_STATE_FLUSH_REG = 0
        // r300_emit_vap_invariant_state
          VAP_PVS_VTX_TIMEOUT_REG = 0xffff
          VAP_GB_VERT_CLIP_ADJ = 1.0f (0x3f800000)
          VAP_GB_VERT_DISC_ADJ = 1.0f (0x3f800000)
          VAP_GB_HORZ_CLIP_ADJ = 1.0f (0x3f800000)
          VAP_GB_HORZ_DISC_ADJ = 1.0f (0x3f800000)
          VAP_PSC_SGN_NORM_CNTL = 0xaaaaaaaa
          VAP_TEX_TO_COLOR_CNTL = 0
        // r300_emit_vertex_stream_state
          VAP_PROG_STREAM_CNTL_0 = 0x2002
          VAP_PROG_STREAM_CNTL_EXT_0 = 0xfa88
        // r300_emit_vs_state
          VAP_PVS_CODE_CNTL_0 = 0
          VAP_PVS_CODE_CNTL_1 = 0
          VAP_PVS_VECTOR_INDX_REG = 0
          VAP_PVS_VECTOR_DATA_REG_128 = (ONE_REG_WR:)
            {0xf00203, 0xd10001, 0x1248001, 0x1248001}
          VAP_CNTL = 0xb0055a
          VAP_PVS_FLOW_CNTL_OPC = 0
          VAP_PVS_FLOW_CNTL_ADDRS_LW_[0-15] = 0
          VAP_PVS_FLOW_CNTL_ADDRS_UW_[0-15] = 0
          VAP_PVS_FLOW_CNTL_LOOP_INDEX_[0-15] = 0
        // r300_emit_clip_state
          VAP_PVS_VECTOR_INDX_REG = 0x600
          VAP_PVS_VECTOR_DATA_REG_128 =
            {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} (24)
        // r300_emit_rs_block_state
          VAP_VTX_STATE_CNTL = 0x5555
          VAP_VSM_VTX_ASSM [0x2184] = 0x1
          VAP_OUTPUT_VTX_FMT_0 = 1
          VAP_OUTPUT_VTX_FMT_1 = 4
          GB_ENABLE = 0
          RS_IP_0 = 0x30000000
          RS_COUNT = 0x40080
          RS_INST_COUNT = 0
          RS_INST_0 = 0
        // r300_emit_rs_state
          VAP_CNTL_STATUS = 0
          VAP_CLIP_CNTL = 0xc000
          GA_POINT_SIZE = 0x60006
          GA_POINT_MINMAX = 0x60006
          GA_LINE_CNTL = 0x20006
          SU_POLY_OFFSET_ENABLE = 0
          SU_CULL_MODE = 0
          GA_LINE_STIPPLE_CONFIG = 0
          GA_LINE_STIPPLE_VALUE = 0
          GA_POLY_MODE = 0
          GA_ROUND_MODE = 0x31
          SC_CLIP_RULE = 0xffff
          GA_POINT_S0 = 0
          GA_POINT_T0 = 1.0f (0x3f800000)
          GA_POINT_S1 = 1.0f (0x3f800000)
          GA_POINT_T1 = 0
        // r300_emit_fb_state_pipelined
          US_OUT_FMT_0 = 0x1b00
          US_OUT_FMT_1 = 0xf
          US_OUT_FMT_2 = 0xf
          US_OUT_FMT_3 = 0xf
          GB_MSPOS0 = 0x66666666
          GB_MSPOS1 = 0x6666666
        // r500_emit_fs
          US_CONFIG = 2
          US_PIXSIZE = 1
          US_FC_CTRL = 0
          US_CODE_RANGE = 0
          US_CODE_OFFSET = 0
          US_CODE_ADDR = 0
          GA_US_VECTOR_INDEX = 0
          GA_US_VECTOR_DATA = (ONE_REG_WR:)
            {0x78005, 0x8020080, 0x8020080, 0x1c9b04d8, 0x1c810003, 0x5}
          FG_DEPTH_SRC = 0
          US_W_FMT = 0
        // r500_emit_fs_rc_constant_state
          [nothing]
        // r500_emit_fs_constants
          [nothing]
        // r300_emit_vs_constants
          VAP_PVS_CONST_CNTL = 0
        // r300_emit_texture_cache_inval
          TX_INVALTAGS = 0
        // r300_emit_textures_state
          TX_ENABLE = 0
        // r300_emit_query_start
          [nothing]
        // r500_emit_index_bias
          VAP_INDEX_OFFSET = 0
        // r300_emit_draw_init
          GA_COLOR_CONTROL = 0x3aaaa
          VAP_VF_MAX_VTX_INDX = 2
          VAP_VF_MIN_VTX_INDX = 0
        // r300_draw_arrays_immediate
          VAP_VTX_SIZE = 3
          [
            PACKET3_3D_DRAW_IMMD_2 (3 * 3)
            0x30034 // VAP_VF_CNTL
            {0.5, -0.5, 0}
            {-0.5, -0.5, 0}
            {0, 0.5, 0}
          ]
--- a/replace_video.py
+++ b/replace_video.py
@ -0,0 +1,26 @@
 import sys
 # Post-processes the HTML file named by sys.argv[1] in place: on every line
 # containing an <img alt='PIC' src='videos/....png' ... /> placeholder, that
 # tag is replaced by a <video> element pointing at the matching .mp4 file.
 def transform():
    """Yield the rewritten text of the file at sys.argv[1], piece by piece.

    Lines without the video placeholder are passed through unchanged. On a
    matching line only the first "<img" tag is rewritten; its src must end
    in ".png", otherwise an AssertionError carrying the src is raised.
    """
    with open(sys.argv[1]) as f:
        for line in f:
            if "<img alt='PIC' src='videos/" not in line:
                yield line
                continue
            before, tail = line.split("<img", maxsplit=1)
            img, after = tail.split("/>", maxsplit=1)
            src = img.split("src='")[1].split("'")[0]
            assert src.endswith(".png"), src
            src = src.removesuffix(".png") + ".mp4"
            # Emit the video exactly where the image tag was, so markup that
            # followed the image on this line (and the line's newline) stays
            # after the video instead of ending up in front of it.
            yield before
            yield "<video style='width: 100%;' controls=''>"
            yield f"<source src='{src}' type='video/mp4'>"
            yield "</video>"
            yield after
 # Collect the complete result before reopening the same path for writing:
 # opening with 'w' truncates the file.
 lines = list(transform())
 with open(sys.argv[1], 'w') as f:
    f.write(''.join(lines))
--- a/resize_svg.py
+++ b/resize_svg.py
@ -0,0 +1,22 @@
 import sys
 # Factor applied to the width and height attributes of the root <svg> tag.
 scale = 1.5
 # Rewrites the SVG file named by sys.argv[1] in place, enlarging the
 # declared width/height of its root element while keeping the viewBox.
 def transform():
    """Yield the rewritten lines of the file at sys.argv[1].

    A line whose stripped text starts with "<svg xmlns" is rebuilt from a
    fixed template: width and height are multiplied by `scale`, viewBox is
    copied verbatim, and any other attributes on that line are not carried
    over. width and height must parse with float(), so values carrying a
    unit suffix raise ValueError. All other lines pass through unchanged.
    """
    with open(sys.argv[1]) as f:
        for line in f:
            if line.strip().startswith("<svg xmlns"):
                width = line.split('width="')[1].split('"')[0]
                height = line.split('height="')[1].split('"')[0]
                viewbox = line.split('viewBox="')[1].split('"')[0]
                width = float(width) * scale
                height = float(height) * scale
                template = f'<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="{width}" height="{height}" viewBox="{viewbox}">'
                # Lines read from the file keep their trailing newline, so the
                # rebuilt tag needs one of its own.
                yield template + "\n"
            else:
                yield line
 # Collect the complete result before reopening the same path for writing:
 # opening with 'w' truncates the file.
 lines = list(transform())
 with open(sys.argv[1], 'w') as f:
    # Every piece already ends in a newline; joining with '\n' would insert
    # an extra blank line after each source line.
    f.write(''.join(lines))
--- a/verbatim.sh
+++ b/verbatim.sh
@ -0,0 +1,72 @@
 set -eux
 cd verbatim/
 mkdir -p output
 # Wrap every *.<ext> file of the current directory in a standalone minted
 # document named <file>.tex and compile it with pdflatex into output/.
 #   $1 - file extension to glob for, without the leading dot
 #   $2 - minted lexer name used to highlight those files
 # NOTE(review): the preamble scopes its options with \setminted[python], while
 # none of the lexers used below is python -- confirm whether those options are
 # meant to apply to these listings.
 render_listings() {
    for src in *."$1"; do
        # An unmatched glob is left as the literal pattern; skip it rather
        # than aborting on a file that does not exist.
        [ -e "$src" ] || continue
        {
            # Quoted delimiter: the LaTeX preamble is emitted literally.
            cat <<'EOF'
 \documentclass[varwidth=13.1cm, border={0.0cm 0.0cm 0.0cm 0.0cm}]{standalone}
 \usepackage{minted}
 \setminted[python]{breaklines, linenos, frame=lines, framesep=2mm, fontsize=\huge, numbersep=5pt}
 \standaloneenv{minted}
 \begin{document}
 EOF
            printf '\\begin{minted}{%s}\n' "$2"
            cat -- "$src"
            cat <<'EOF'
 \end{minted}
 \end{document}
 EOF
        } > "$src.tex"
        # pdflatex is run twice on each document, as in the original script.
        # NOTE(review): presumably the second pass picks up data written by
        # the first -- confirm it is still required.
        pdflatex -shell-escape -output-directory=output "$src.tex"
        pdflatex -shell-escape -output-directory=output "$src.tex"
    done
 }
 # .asm sources are highlighted with the haskell lexer, matching the
 # original script.
 render_listings asm haskell
 render_listings glsl glsl
 render_listings c c
		`@ -0,0 +1 @@`
							`rsync --delete -arv * root@az1.idk.st:/var/www/r500/`