diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5e7a2b1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,21 @@
+*.html
+*.css
+*.out
+index.pdf
+_minted/
+*.aux
+*.log
+*.4ct
+*.4tc
+*.dvi
+*.idv
+*.lg
+*.tmp
+*.toc
+*.xref
+*~
+verbatim/*.tex
+verbatim/*.svg
+verbatim/*.pdf
+verbatim/output
+images/*.data
diff --git a/build.sh b/build.sh
new file mode 100644
index 0000000..8492761
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,28 @@
+set -eux
+
+rm -f verbatim/output/*.svg
+
+make4ht --shell-escape index.tex "pic-m,pic-equation,svg"
+
+echo 'img[alt="PIC"] { width: 100%; }' >> index.css
+echo '.cmtt-10 { font-size: 0.9em; }' >> index.css
+echo 'img[src="index3x.svg"] { height: 2.5em; }' >> index.css
+
+sed -i '/prefers-color-scheme/d' index.css
+sed -i 's| </span>|</span> |g' index.html
+sed -i '/figure.figure/d' index.css
+echo 'figure.figure { margin-left: 20px; margin-right: 20px;  }' >> index.css
+echo 'pre.verbatim { font-size: 0.9em; }' >> index.css
+sed -i 's|color-scheme: light dark;||g' index.css
+echo 'figcaption.caption { margin-bottom: 1.3em; margin-top: 0.3em; }' >> index.css
+
+sed -i 's/index.css/index2.css/g' index.html
+
+mv index.css index2.css
+
+python replace_video.py index.html
+
+for file in verbatim/output/*.svg; do
+    sed -i 's|rgb(0%, 0%, 100%)||g' "$file"
+    python resize_svg.py "$file"
+done
diff --git a/deploy.sh b/deploy.sh
new file mode 100644
index 0000000..bdbfe13
--- /dev/null
+++ b/deploy.sh
@@ -0,0 +1 @@
+rsync --delete -arv ./* root@az1.idk.st:/var/www/r500/  # ./* so a file name starting with '-' is never parsed as an rsync option
diff --git a/diagrams/fragment_inputs.dot b/diagrams/fragment_inputs.dot
new file mode 100644
index 0000000..4b2365c
--- /dev/null
+++ b/diagrams/fragment_inputs.dot
@@ -0,0 +1,61 @@
+digraph D {
+  graph [ranksep="1" splines=line ordering="in"];
+  node [shape=box];
+  edge [arrowhead=none];
+  // fragment_inputs: temp/const/float connect to src0..src2; src0..src2 and srcp connect to a, b and c.
+  subgraph cluster_W {
+    addr [shape=none]
+    // addr (above) is drawn without a box; presumably it captions this cluster -- TODO confirm.
+    temp
+    const
+    float
+  }
+
+  subgraph cluster_Z {
+    {rank=same
+    src [shape=none]
+    src0 [label="src0"]
+    src1 [label="src1"]
+    src2 [label="src2"]
+    srcp
+    }
+
+  }
+
+  subgraph cluster_R {
+    // Keep the box-less "opcode" node and the a/b/c boxes on one rank.
+    {rank=same
+    opcode [shape=none];
+    a [label = "a"];
+    b [label = "b"];
+    c [label = "c"];
+    }
+  }
+  // Each of temp, const and float connects (bottom port to top port) to src0, src1 and src2; nothing connects into srcp.
+  temp:s -> src0:n
+  temp:s -> src1:n
+  temp:s -> src2:n
+
+  const:s -> src0:n
+  const:s -> src1:n
+  const:s -> src2:n
+
+  float:s -> src0:n
+  float:s -> src1:n
+  float:s -> src2:n
+  // src0, src1, src2 and srcp each connect to a, b and c.
+  src0:s -> a:n
+  src1:s -> a:n
+  src2:s -> a:n
+  srcp:s -> a:n
+
+  src0:s -> b:n
+  src1:s -> b:n
+  src2:s -> b:n
+  srcp:s -> b:n
+
+  src0:s -> c:n
+  src1:s -> c:n
+  src2:s -> c:n
+  srcp:s -> c:n
+}
diff --git a/diagrams/fragment_inputs.svg b/diagrams/fragment_inputs.svg
new file mode 100644
index 0000000..c459c8c
--- /dev/null
+++ b/diagrams/fragment_inputs.svg
@@ -0,0 +1,205 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 12.2.1 (20241206.2353)
+ -->
+<!-- Title: D Pages: 1 -->
+<svg width="382pt" height="292pt"
+ viewBox="0.00 0.00 382.00 292.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 288)">
+<title>D</title>
+<polygon fill="white" stroke="none" points="-4,4 -4,-288 378,-288 378,4 -4,4"/>
+<g id="clust1" class="cluster">
+<title>cluster_W</title>
+<polygon fill="none" stroke="black" points="8,-224 8,-276 294,-276 294,-224 8,-224"/>
+</g>
+<g id="clust2" class="cluster">
+<title>cluster_Z</title>
+<polygon fill="none" stroke="black" points="8,-116 8,-168 366,-168 366,-116 8,-116"/>
+</g>
+<g id="clust4" class="cluster">
+<title>cluster_R</title>
+<polygon fill="none" stroke="black" points="35,-8 35,-60 333,-60 333,-8 35,-8"/>
+</g>
+<!-- addr -->
+<g id="node1" class="node">
+<title>addr</title>
+<text text-anchor="middle" x="43" y="-245.32" font-family="Times,serif" font-size="14.00">addr</text>
+</g>
+<!-- temp -->
+<g id="node2" class="node">
+<title>temp</title>
+<polygon fill="none" stroke="black" points="142,-268 88,-268 88,-232 142,-232 142,-268"/>
+<text text-anchor="middle" x="115" y="-245.32" font-family="Times,serif" font-size="14.00">temp</text>
+</g>
+<!-- src0 -->
+<g id="node6" class="node">
+<title>src0</title>
+<polygon fill="none" stroke="black" points="214,-160 160,-160 160,-124 214,-124 214,-160"/>
+<text text-anchor="middle" x="187" y="-137.32" font-family="Times,serif" font-size="14.00">src0</text>
+</g>
+<!-- temp&#45;&gt;src0 -->
+<g id="edge1" class="edge">
+<title>temp:s&#45;&gt;src0:n</title>
+<path fill="none" stroke="black" d="M115,-231C115,-231 187,-161 187,-161"/>
+</g>
+<!-- src1 -->
+<g id="node7" class="node">
+<title>src1</title>
+<polygon fill="none" stroke="black" points="142,-160 88,-160 88,-124 142,-124 142,-160"/>
+<text text-anchor="middle" x="115" y="-137.32" font-family="Times,serif" font-size="14.00">src1</text>
+</g>
+<!-- temp&#45;&gt;src1 -->
+<g id="edge2" class="edge">
+<title>temp:s&#45;&gt;src1:n</title>
+<path fill="none" stroke="black" d="M115,-231C115,-231 115,-161 115,-161"/>
+</g>
+<!-- src2 -->
+<g id="node8" class="node">
+<title>src2</title>
+<polygon fill="none" stroke="black" points="286,-160 232,-160 232,-124 286,-124 286,-160"/>
+<text text-anchor="middle" x="259" y="-137.32" font-family="Times,serif" font-size="14.00">src2</text>
+</g>
+<!-- temp&#45;&gt;src2 -->
+<g id="edge3" class="edge">
+<title>temp:s&#45;&gt;src2:n</title>
+<path fill="none" stroke="black" d="M115,-231C115,-231 259,-161 259,-161"/>
+</g>
+<!-- const -->
+<g id="node3" class="node">
+<title>const</title>
+<polygon fill="none" stroke="black" points="214.12,-268 159.88,-268 159.88,-232 214.12,-232 214.12,-268"/>
+<text text-anchor="middle" x="187" y="-245.32" font-family="Times,serif" font-size="14.00">const</text>
+</g>
+<!-- const&#45;&gt;src0 -->
+<g id="edge4" class="edge">
+<title>const:s&#45;&gt;src0:n</title>
+<path fill="none" stroke="black" d="M187,-231C187,-231 187,-161 187,-161"/>
+</g>
+<!-- const&#45;&gt;src1 -->
+<g id="edge5" class="edge">
+<title>const:s&#45;&gt;src1:n</title>
+<path fill="none" stroke="black" d="M187,-231C187,-231 115,-161 115,-161"/>
+</g>
+<!-- const&#45;&gt;src2 -->
+<g id="edge6" class="edge">
+<title>const:s&#45;&gt;src2:n</title>
+<path fill="none" stroke="black" d="M187,-231C187,-231 259,-161 259,-161"/>
+</g>
+<!-- float -->
+<g id="node4" class="node">
+<title>float</title>
+<polygon fill="none" stroke="black" points="286,-268 232,-268 232,-232 286,-232 286,-268"/>
+<text text-anchor="middle" x="259" y="-245.32" font-family="Times,serif" font-size="14.00">float</text>
+</g>
+<!-- float&#45;&gt;src0 -->
+<g id="edge7" class="edge">
+<title>float:s&#45;&gt;src0:n</title>
+<path fill="none" stroke="black" d="M259,-231C259,-231 187,-161 187,-161"/>
+</g>
+<!-- float&#45;&gt;src1 -->
+<g id="edge8" class="edge">
+<title>float:s&#45;&gt;src1:n</title>
+<path fill="none" stroke="black" d="M259,-231C259,-231 115,-161 115,-161"/>
+</g>
+<!-- float&#45;&gt;src2 -->
+<g id="edge9" class="edge">
+<title>float:s&#45;&gt;src2:n</title>
+<path fill="none" stroke="black" d="M259,-231C259,-231 259,-161 259,-161"/>
+</g>
+<!-- src -->
+<g id="node5" class="node">
+<title>src</title>
+<text text-anchor="middle" x="43" y="-137.32" font-family="Times,serif" font-size="14.00">src</text>
+</g>
+<!-- a -->
+<g id="node11" class="node">
+<title>a</title>
+<polygon fill="none" stroke="black" points="181,-52 127,-52 127,-16 181,-16 181,-52"/>
+<text text-anchor="middle" x="154" y="-29.32" font-family="Times,serif" font-size="14.00">a</text>
+</g>
+<!-- src0&#45;&gt;a -->
+<g id="edge10" class="edge">
+<title>src0:s&#45;&gt;a:n</title>
+<path fill="none" stroke="black" d="M187,-123C187,-123 154,-53 154,-53"/>
+</g>
+<!-- b -->
+<g id="node12" class="node">
+<title>b</title>
+<polygon fill="none" stroke="black" points="253,-52 199,-52 199,-16 253,-16 253,-52"/>
+<text text-anchor="middle" x="226" y="-29.32" font-family="Times,serif" font-size="14.00">b</text>
+</g>
+<!-- src0&#45;&gt;b -->
+<g id="edge14" class="edge">
+<title>src0:s&#45;&gt;b:n</title>
+<path fill="none" stroke="black" d="M187,-123C187,-123 226,-53 226,-53"/>
+</g>
+<!-- c -->
+<g id="node13" class="node">
+<title>c</title>
+<polygon fill="none" stroke="black" points="325,-52 271,-52 271,-16 325,-16 325,-52"/>
+<text text-anchor="middle" x="298" y="-29.32" font-family="Times,serif" font-size="14.00">c</text>
+</g>
+<!-- src0&#45;&gt;c -->
+<g id="edge18" class="edge">
+<title>src0:s&#45;&gt;c:n</title>
+<path fill="none" stroke="black" d="M187,-123C187,-123 298,-53 298,-53"/>
+</g>
+<!-- src1&#45;&gt;a -->
+<g id="edge11" class="edge">
+<title>src1:s&#45;&gt;a:n</title>
+<path fill="none" stroke="black" d="M115,-123C115,-123 154,-53 154,-53"/>
+</g>
+<!-- src1&#45;&gt;b -->
+<g id="edge15" class="edge">
+<title>src1:s&#45;&gt;b:n</title>
+<path fill="none" stroke="black" d="M115,-123C115,-123 226,-53 226,-53"/>
+</g>
+<!-- src1&#45;&gt;c -->
+<g id="edge19" class="edge">
+<title>src1:s&#45;&gt;c:n</title>
+<path fill="none" stroke="black" d="M115,-123C115,-123 298,-53 298,-53"/>
+</g>
+<!-- src2&#45;&gt;a -->
+<g id="edge12" class="edge">
+<title>src2:s&#45;&gt;a:n</title>
+<path fill="none" stroke="black" d="M259,-123C259,-123 154,-53 154,-53"/>
+</g>
+<!-- src2&#45;&gt;b -->
+<g id="edge16" class="edge">
+<title>src2:s&#45;&gt;b:n</title>
+<path fill="none" stroke="black" d="M259,-123C259,-123 226,-53 226,-53"/>
+</g>
+<!-- src2&#45;&gt;c -->
+<g id="edge20" class="edge">
+<title>src2:s&#45;&gt;c:n</title>
+<path fill="none" stroke="black" d="M259,-123C259,-123 298,-53 298,-53"/>
+</g>
+<!-- srcp -->
+<g id="node9" class="node">
+<title>srcp</title>
+<polygon fill="none" stroke="black" points="358,-160 304,-160 304,-124 358,-124 358,-160"/>
+<text text-anchor="middle" x="331" y="-137.32" font-family="Times,serif" font-size="14.00">srcp</text>
+</g>
+<!-- srcp&#45;&gt;a -->
+<g id="edge13" class="edge">
+<title>srcp:s&#45;&gt;a:n</title>
+<path fill="none" stroke="black" d="M331,-123C331,-123 154,-53 154,-53"/>
+</g>
+<!-- srcp&#45;&gt;b -->
+<g id="edge17" class="edge">
+<title>srcp:s&#45;&gt;b:n</title>
+<path fill="none" stroke="black" d="M331,-123C331,-123 226,-53 226,-53"/>
+</g>
+<!-- srcp&#45;&gt;c -->
+<g id="edge21" class="edge">
+<title>srcp:s&#45;&gt;c:n</title>
+<path fill="none" stroke="black" d="M331,-123C331,-123 298,-53 298,-53"/>
+</g>
+<!-- opcode -->
+<g id="node10" class="node">
+<title>opcode</title>
+<text text-anchor="middle" x="76" y="-29.32" font-family="Times,serif" font-size="14.00">opcode</text>
+</g>
+</g>
+</svg>
diff --git a/diagrams/vertex_inputs.dot b/diagrams/vertex_inputs.dot
new file mode 100644
index 0000000..08564e3
--- /dev/null
+++ b/diagrams/vertex_inputs.dot
@@ -0,0 +1,36 @@
+digraph D {
+  graph [ranksep="1" splines=line];
+  node [shape=box];
+  edge [arrowhead=none];
+  // vertex_inputs: input, const, temp and alt_temp each connect to a, b and c.
+  input
+  const
+  temp
+  alt_temp
+  // "opcode" is drawn without a box (shape=none); a, b and c keep the default box shape.
+  opcode [shape=none];
+  a [label = "a"];
+  b [label = "b"];
+  c [label = "c"];
+
+  subgraph cluster_R {
+    // Place opcode, a, b and c on one rank inside this cluster.
+    {rank=same opcode a b c}
+  }
+  // Edges leave the bottom port (:s) of each source and enter the top port (:n) of each target; arrowheads are disabled above.
+  input:s -> a:n
+  input:s -> b:n
+  input:s -> c:n
+
+  const:s -> a:n
+  const:s -> b:n
+  const:s -> c:n
+
+  temp:s -> a:n
+  temp:s -> b:n
+  temp:s -> c:n
+
+  alt_temp:s -> a:n
+  alt_temp:s -> b:n
+  alt_temp:s -> c:n
+}
diff --git a/diagrams/vertex_inputs.svg b/diagrams/vertex_inputs.svg
new file mode 100644
index 0000000..726ce47
--- /dev/null
+++ b/diagrams/vertex_inputs.svg
@@ -0,0 +1,124 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 12.2.1 (20241206.2353)
+ -->
+<!-- Title: D Pages: 1 -->
+<svg width="366pt" height="168pt"
+ viewBox="0.00 0.00 366.12 168.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 164)">
+<title>D</title>
+<polygon fill="white" stroke="none" points="-4,4 -4,-164 362.12,-164 362.12,4 -4,4"/>
+<g id="clust1" class="cluster">
+<title>cluster_R</title>
+<polygon fill="none" stroke="black" points="8,-8 8,-60 306,-60 306,-8 8,-8"/>
+</g>
+<!-- input -->
+<g id="node1" class="node">
+<title>input</title>
+<polygon fill="none" stroke="black" points="118,-160 64,-160 64,-124 118,-124 118,-160"/>
+<text text-anchor="middle" x="91" y="-137.32" font-family="Times,serif" font-size="14.00">input</text>
+</g>
+<!-- a -->
+<g id="node6" class="node">
+<title>a</title>
+<polygon fill="none" stroke="black" points="154,-52 100,-52 100,-16 154,-16 154,-52"/>
+<text text-anchor="middle" x="127" y="-29.32" font-family="Times,serif" font-size="14.00">a</text>
+</g>
+<!-- input&#45;&gt;a -->
+<g id="edge1" class="edge">
+<title>input:s&#45;&gt;a:n</title>
+<path fill="none" stroke="black" d="M91,-124C91,-124 127,-53 127,-53"/>
+</g>
+<!-- b -->
+<g id="node7" class="node">
+<title>b</title>
+<polygon fill="none" stroke="black" points="226,-52 172,-52 172,-16 226,-16 226,-52"/>
+<text text-anchor="middle" x="199" y="-29.32" font-family="Times,serif" font-size="14.00">b</text>
+</g>
+<!-- input&#45;&gt;b -->
+<g id="edge2" class="edge">
+<title>input:s&#45;&gt;b:n</title>
+<path fill="none" stroke="black" d="M91,-124C91,-124 199,-53 199,-53"/>
+</g>
+<!-- c -->
+<g id="node8" class="node">
+<title>c</title>
+<polygon fill="none" stroke="black" points="298,-52 244,-52 244,-16 298,-16 298,-52"/>
+<text text-anchor="middle" x="271" y="-29.32" font-family="Times,serif" font-size="14.00">c</text>
+</g>
+<!-- input&#45;&gt;c -->
+<g id="edge3" class="edge">
+<title>input:s&#45;&gt;c:n</title>
+<path fill="none" stroke="black" d="M91,-124C91,-124 271,-53 271,-53"/>
+</g>
+<!-- const -->
+<g id="node2" class="node">
+<title>const</title>
+<polygon fill="none" stroke="black" points="190.12,-160 135.88,-160 135.88,-124 190.12,-124 190.12,-160"/>
+<text text-anchor="middle" x="163" y="-137.32" font-family="Times,serif" font-size="14.00">const</text>
+</g>
+<!-- const&#45;&gt;a -->
+<g id="edge4" class="edge">
+<title>const:s&#45;&gt;a:n</title>
+<path fill="none" stroke="black" d="M163,-124C163,-124 127,-53 127,-53"/>
+</g>
+<!-- const&#45;&gt;b -->
+<g id="edge5" class="edge">
+<title>const:s&#45;&gt;b:n</title>
+<path fill="none" stroke="black" d="M163,-124C163,-124 199,-53 199,-53"/>
+</g>
+<!-- const&#45;&gt;c -->
+<g id="edge6" class="edge">
+<title>const:s&#45;&gt;c:n</title>
+<path fill="none" stroke="black" d="M163,-124C163,-124 271,-53 271,-53"/>
+</g>
+<!-- temp -->
+<g id="node3" class="node">
+<title>temp</title>
+<polygon fill="none" stroke="black" points="262,-160 208,-160 208,-124 262,-124 262,-160"/>
+<text text-anchor="middle" x="235" y="-137.32" font-family="Times,serif" font-size="14.00">temp</text>
+</g>
+<!-- temp&#45;&gt;a -->
+<g id="edge7" class="edge">
+<title>temp:s&#45;&gt;a:n</title>
+<path fill="none" stroke="black" d="M235,-124C235,-124 127,-53 127,-53"/>
+</g>
+<!-- temp&#45;&gt;b -->
+<g id="edge8" class="edge">
+<title>temp:s&#45;&gt;b:n</title>
+<path fill="none" stroke="black" d="M235,-124C235,-124 199,-53 199,-53"/>
+</g>
+<!-- temp&#45;&gt;c -->
+<g id="edge9" class="edge">
+<title>temp:s&#45;&gt;c:n</title>
+<path fill="none" stroke="black" d="M235,-124C235,-124 271,-53 271,-53"/>
+</g>
+<!-- alt_temp -->
+<g id="node4" class="node">
+<title>alt_temp</title>
+<polygon fill="none" stroke="black" points="358.12,-160 279.88,-160 279.88,-124 358.12,-124 358.12,-160"/>
+<text text-anchor="middle" x="319" y="-137.32" font-family="Times,serif" font-size="14.00">alt_temp</text>
+</g>
+<!-- alt_temp&#45;&gt;a -->
+<g id="edge10" class="edge">
+<title>alt_temp:s&#45;&gt;a:n</title>
+<path fill="none" stroke="black" d="M319,-124C319,-124 127,-53 127,-53"/>
+</g>
+<!-- alt_temp&#45;&gt;b -->
+<g id="edge11" class="edge">
+<title>alt_temp:s&#45;&gt;b:n</title>
+<path fill="none" stroke="black" d="M319,-124C319,-124 199,-53 199,-53"/>
+</g>
+<!-- alt_temp&#45;&gt;c -->
+<g id="edge12" class="edge">
+<title>alt_temp:s&#45;&gt;c:n</title>
+<path fill="none" stroke="black" d="M319,-124C319,-124 271,-53 271,-53"/>
+</g>
+<!-- opcode -->
+<g id="node5" class="node">
+<title>opcode</title>
+<text text-anchor="middle" x="49" y="-29.32" font-family="Times,serif" font-size="14.00">opcode</text>
+</g>
+</g>
+</svg>
diff --git a/index.tex b/index.tex
new file mode 100644
index 0000000..ded3477
--- /dev/null
+++ b/index.tex
@@ -0,0 +1,905 @@
+\documentclass[20pt]{article}
+
+\usepackage[font=small,labelfont=bf]{caption}
+\usepackage{hyperref}
+\hypersetup{
+    colorlinks=true,
+    linkcolor=blue,
+    filecolor=magenta,
+    urlcolor=cyan,
+    pdftitle={Radeon R500},
+    pdfpagemode=FullScreen,
+    }
+
+\usepackage{graphicx}
+\graphicspath{ {./images/} }
+
+\usepackage{minted}
+
+\title{Radeon R500}
+\date{}
+
+\begin{document}
+
+\maketitle
+\href{images/x1950xt.jpg}{\includegraphics{images/x1950xt.jpg}}
+
+\tableofcontents
+
+\section{Introduction}
+
+The primary/minimal project goal is ``draw a triangle on a Radeon R500 via direct
+memory-mapped hardware register and texture memory accesses''. This means no
+\href{https://mesa3d.org/}{Mesa}, no
+\href{https://github.com/torvalds/linux/tree/v6.12/drivers/gpu/drm/radeon}{radeon}
+kernel module, and certainly no OpenGL or Direct3D.
+
+I have worked directly with several other graphics units in the past
+(\href{https://github.com/buhman/saturn-examples}{Saturn VDP1},
+\href{https://github.com/buhman/dreamcast}{Dreamcast Holly},
+\href{https://github.com/buhman/voodoo}{Voodoo 2}). In all of these projects,
+my strategy is generally:
+
+\begin{itemize}
+\item read the entire \href{doc/R5xx_Acceleration_v1.5.pdf}{reference
+  documentation} at least once, front-to-back
+\item copy all hardware register definitions from the documentation to a
+  spreadsheet or text file (sometimes typing everything by hand if I am in such
+  a chill mood)
+\item progressively build increasingly-complex example programs that exercise
+  the hardware
+\end{itemize}
+
+The rabbit hole for R500 seems significantly deeper, considering this is the
+first graphics unit I've worked with that has programmable vertex and pixel
+shader engines.
+
+\subsection{Hardware}
+
+For testing, I currently have this hardware configuration:
+
+\begin{itemize}
+\item ASUS P4B-LX (Intel 845) motherboard
+\item Intel Pentium 4 2.6GHz SL6PP (Northwood)
+\item 1024 MB RAM
+\item 32GB PATA SSD
+\item ATI Radeon X1650 PRO 512MB AGP
+\end{itemize}
+
+I also have the X1950 XT PCIe shown in the photo, which amazingly has never been
+used: until I took the photo, it had remained sealed in an antistatic bag since
+manufacture.
+
+\subsection{Test setup}
+
+While in my other (video game console) projects I typically insist on
+``bare-metal'' development with no operating system or third-party library
+running on the target hardware, my experience with x86 is much more limited.
+
+While it is something I am interested in doing, I believe creating a
+zero-dependency ``code upload'' mechanism for an x86-pc that does not depend on
+an operating system would severely delay my progress on R500-specific work.
+
+For my initial exploration of R500, I will instead be manipulating the hardware
+primarily from Linux kernel space. This Linux kernel code does not actually
+meaningfully depend on Linux APIs beyond calling \texttt{ioremap} to get usable
+memory mappings for R500 PCI resources (texture/framebuffer memory and
+registers).
+
+\section{Progress: 07 Oct 2025}
+
+From 01 Oct 2025 to 07 Oct 2025, I achieved the following:
+
+\begin{itemize}
+\item I wrote a reasonably complete AtomBIOS disassembler
+\item I can disable (IBM PC) VGA mode and manipulate the native framebuffer
+\item I can upload microcode to the ``command processor'', and I can write to
+  scratch registers via command processor packets (this is uncoincidentally the
+  same command processor test that the radeon kernel module does).
+\item I stepped through Mesa functions as invoked by a simple OpenGL
+  application, and created \href{mesa/glDrawArrays.txt}{a list of R500
+    registers/values} that are written by Mesa during \texttt{glDrawArrays}.
+\end{itemize}
+
+I did not achieve the following:
+
+\begin{itemize}
+\item I attempted to manipulate the R500 register state and command processor
+  into drawing a triangle, but I have not been successful yet
+\end{itemize}
+
+\subsection{Documentation}
+
+In general, I note that the R500 documentation is significantly weaker than I
+hoped, and does not contain enough information to draw a triangle on the R500
+from the documentation alone (with no prior knowledge about previous Radeon
+graphics units).
+
+In addition to the lack of prose, in several cases I've noticed both Mesa and
+Linux reference R500 registers that are
+\href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/pci/undocumented_3d_registers.h}{not
+  present at all} in the documentation.
+
+\subsection{AtomBIOS}
+
+AtomBIOS physically exists as a section inside the ROM on R500 graphics units.
+AtomBIOS is notably used for setting PLL/pixel clock frequencies and display
+resolutions, among several other functions.
+
+The Radeon graphics hardware itself does not execute AtomBIOS code---instead, it
+is expected that the host (e.g., x86) CPU evaluate the instructions in the
+AtomBIOS command tables. Generally the outcome of evaluating AtomBIOS code is
+that several ``register write'' instructions will be executed, changing the
+state of the graphics unit.
+
+My original goal in studying AtomBIOS was that I thought I would need it to set
+up the R500 display controller to a reasonable state (as a prerequisite for
+drawing 3D graphics). However, after actually experimenting with ``disable VGA
+mode'', I currently believe that I don't actually need to implement
+resolution/mode changes, and can proceed without it.
+
+\subsection{PIO mode}
+
+The Linux kernel exclusively communicates with R500 via ``PCI bus mastering''.
+A ``ring buffer'' is allocated in ``GTT'' space, which from the graphics unit's
+perspective exists in the same address space as framebuffer memory, but is an
+address that is outside the framebuffer memory that physically exists.
+
+I also observed via debugfs that the GTT apparently involves some sort of sparse
+page mapping, but I don't understand how this works from an x86 perspective.
+
+In the absence of an understanding of how to make my own ``GTT'' address space,
+I attempted to operate the R500 in ``PIO'' mode. This has the advantage of being
+able to simply write to registers via (simple) PCI memory-mapped accesses, but
+it has the disadvantage that Linux doesn't use R500 this way, so I have no
+reference implementation for how PIO mode should be used.
+
+\subsection{Triangle drawing attempt \#1}
+
+I translated my \href{mesa/glDrawArrays.txt}{glDrawArrays notes} to
+\href{https://git.idk.st/bilbo/r500/src/commit/b6472e4c16946f44e02d82f31adaa411df009c67/pci/triangle.c}{equivalent
+  register writes}.
+
+This does not work, and I don't yet understand why. The main issue is that most
+of the time when I execute that code, Linux appears to ``hang'' completely, and
+my ``printk'' messages are never sent over ssh. On the rare occasion when the
+``hang'' does not occur, a triangle is nevertheless not drawn on the
+framebuffer.
+
+I have a few ideas for how to proceed:
+
+\begin{itemize}
+\item Move the ``triangle.c'' register accesses to userspace via
+  \texttt{/sys/bus/pci}, which might improve debuggability
+\item Abandon the ``write a kernel module'' idea completely, and instead
+  interact with the R500 via \href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/radeon_drv.c#L565-L577}{radeon DRM ioctls}
+\end{itemize}
+
+The latter is perhaps both the most attractive, and the most work. I currently
+don't have any understanding of GEM buffers, radeon buffer objects, etc., so
+I'd need to study these in more detail.
+
+\section{Progress: 14 Oct 2025}
+
+From 08 Oct 2025 to 14 Oct 2025, I achieved the following:
+
+\begin{itemize}
+\item I studied how Mesa interacts with the \texttt{radeon} kernel module via
+  \texttt{DRM\_RADEON\_} ioctls.
+\item I wrote simple R500 \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs/pvs_disassemble.py}{vertex shader} and \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs/us_disassemble.py}{pixel shader} disassemblers.
+\item I wrote a \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs/parse_packets.py}{tool} to print R500 ``PM4'' packets in human-readable form.
+\item I laboriously \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs/bits}{copied and reformatted} all bit definitions from \href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf}
+\item I wrote \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs}{several other miscellaneous tools} related to register and bit parsing and manipulation.
+\item I wrote two \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/single_color.c}{humble} \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/vertex_color.c}{demos} to draw a triangle on R500.
+\end{itemize}
+
+\subsection{Radeon DRM}
+
+As implied in the last update, primarily due to my lack of experience with
+bare-metal x86, I decided it would be a better approach to interact with R500
+Command Processor via the \texttt{radeon} kernel module, which provides a
+partially reasonable interface for this via the \texttt{DRM\_RADEON\_CS} ioctl.
+
+All \texttt{DRM\_RADEON\_} ioctls are mostly or entirely undocumented. Instead,
+I built debugging symbols for Mesa and other supporting libraries so that I
+could set breakpoints in GDB to observe what sequences of \texttt{DRM\_RADEON\_}
+ioctls Mesa uses.
+
+From my previous \href{mesa/glDrawArrays.txt}{glDrawArrays notes} observations,
+I noticed this strange sequence:
+
+\begin{verbatim}
+0x0000138a  // type 0 packet, count=0, starting offset = RB3D_COLOROFFSET0
+0x00000000  // RB3D_COLOROFFSET0 = 0
+0xc0001000  // type 3 packet, count=0, opcode=NOP
+0x00000000  // zero (meaningless data)
+\end{verbatim}
+
+At first, it seemed Mesa was deliberately setting the colorbuffer write address
+to (VRAM address) zero, which seemed like a strange choice considering I am
+debugging an X11/GLX OpenGL application---surely the colorbuffer address would be
+some non-zero value several megabytes after the beginning of VRAM.
+
+I later attempted to send my own PM4 packet via \texttt{DRM\_RADEON\_CS}. This
+initial attempt returned \texttt{Invalid argument}, with the following
+message in dmesg:
+
+\begin{verbatim}
+[ 1205.978993] [drm:radeon_cs_packet_next_reloc [radeon]] *ERROR* No packet3 for relocation for packet at 14.
+[ 1205.979427] [drm] ib[14]=0x0000138E
+[ 1205.979433] [drm] ib[15]=0x00C00640
+[ 1205.979437] [drm:r300_packet0_check [radeon]] *ERROR* No reloc for ib[13]=0x4E28
+[ 1205.979545] [drm] ib[12]=0x0000138A
+[ 1205.979548] [drm] ib[13]=0x00000000
+[ 1205.979553] [drm:radeon_cs_ioctl [radeon]] *ERROR* Invalid command stream !
+\end{verbatim}
+
+This error message comes from
+\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r300.c#L664-L669}{drm/radeon/r300.c}.
+
+The meaningless data following the type-3 NOP packet is used by the kernel to
+\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/radeon_cs.c#L875-L889}{index}
+the \texttt{DRM\_RADEON\_CS} ``relocs'' array (an array of GEM buffer handles).
+
+It seems perhaps the design goal was to never expose the VRAM address of GEM
+buffers to userspace (indeed there seems to be no way to retrieve that via any
+GEM ioctls). This restriction is slightly disappointing, as I would have
+preferred to be able to send unmodified packet data to the R500.
+
+However, at the moment this does not appear to be a significant issue, as a
+relatively small number of registers are modified by the Linux kernel's packet
+parser prior to creating the indirect buffer that is actually sent to the R500
+hardware.
+
+\subsection{Indirect buffers}
+
+There appears to be a lot of memory-to-memory copying in the
+Linux/Mesa/DRM/GEM/radeon graphics stack:
+
+\begin{itemize}
+\item Mesa writes the OpenGL state to various internal structures
+\item Mesa \href{https://gitlab.freedesktop.org/mesa/mesa/-/blob/25.0/src/gallium/drivers/r300/r300_emit.c?ref_type=heads}{copies} OpenGL state to packet commands in a userspace buffer
+\item Mesa
+  \href{https://gitlab.freedesktop.org/mesa/mesa/-/blob/25.0/src/gallium/winsys/radeon/drm/radeon_drm_cs.c?ref_type=heads#L486-487}{passes
+    the address} of the userspace buffer to the kernel via
+  \texttt{DRM\_RADEON\_CS}
+\item Linux
+  \href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/radeon_cs.c#L340-L358}{copies
+    the entire userspace buffer} to kernel space (calling kvmalloc/kvfree on
+  each ioctl)
+\item The \texttt{radeon\_cs\_parser} parses and modifies the buffer originally
+  generated by Mesa
+\item \href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/radeon_cs.c#L613}{radeon\_cs\_ib\_fill} copies the parser result to gpu address space.
+\end{itemize}
+
+Eventually,
+\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r100.c#L3709-L3722}{r100\_ring\_ib\_execute}
+is called, which writes the indirect buffer address (now in GPU address space)
+to the ring.
+
+It would be interesting to experiment with writing a packet buffer directly in
+GPU/GTT address space (from Linux userspace), with zero copies. This would
+require an entirely new set of ioctls.
+
+\subsection{Triangle drawing attempt \#2}
+
+These images were never drawn on-screen. I extracted them from VRAM via
+\texttt{/sys/kernel/debug/radeon\_vram}.
+
+\begin{figure}
+  \href{images/single_color_macrotiled.png}{\includegraphics{images/single_color_macrotiled.png}}
+  \caption*{R500 framebuffer capture, \texttt{single\_color.c}}
+\end{figure}
+
+Though I was not aware of it yet, the above image was indeed my triangle, and
+\texttt{COLORPITCH0} was merely in ``macrotiled'' mode. Once I realized this, I
+produced this image (still in off-screen VRAM):
+
+\begin{figure}
+  \href{images/single_color.png}{\includegraphics{images/single_color.png}}
+  \caption*{R500 framebuffer capture, \texttt{single\_color.c}}
+\end{figure}
+
+This \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/single_color.c}{``single color''} demo deliberately uses very simple vertex and fragment
+shaders:
+
+\begin{figure}
+\begin{verbatim}
+instruction[0]:
+  0x00f00203  dst: VE_ADD out[0].xyzw
+  0x00d10001  src0: input[0].xyzw
+  0x01248001  src1: input[0].0000
+  0x01248001  src2: input[0].0000
+\end{verbatim}
+\caption*{R500 vertex shader (1 instruction, 128-bit control word)}
+\end{figure}
+
+This vertex shader is doing the equivalent of:
+
+\begin{figure}
+  \href{verbatim/vertex_shader_equivalent_single_color.glsl}{\includegraphics{verbatim/output/vertex_shader_equivalent_single_color.glsl.pdf}}
+\end{figure}
+
+The W component \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/single_color.c#L339}{comes from}
+\texttt{VAP\_PROG\_STREAM\_CNTL\_EXT\_\_SWIZZLE\_SELECT\_W\_0(5)}, which
+swizzles W to a constant \texttt{1.0}, despite W not being present in the vertex
+data.
+
+\begin{figure}
+\begin{verbatim}
+instruction[0]:
+  0x00078005  OUT RGBA
+  0x08020080  RGB ADDR0=0.0 ADDR1=0.0 ADDR2=0.0
+  0x08020080  ALPHA ADDR0=0.0 ADDR1=0.0 ADDR2=0.0
+  0x1c9b04d8  RGB_SEL_A=src0.110 RGB_SEL_B=src0.110 TARGET=A
+  0x1c810003  ALPHA_OP=OP_MAX ALPHA_SEL_A=src0.0 ALPHA_SEL_B=src0.0 TARGET=A
+  0x00000005  RGB_OP=OP_MAX
+\end{verbatim}
+\caption*{R500 fragment shader (1 instruction, 192-bit control word)}
+\end{figure}
+
+This fragment shader is doing the equivalent of:
+
+\begin{figure}
+  \href{verbatim/fragment_shader_equivalent_single_color.glsl}{\includegraphics{verbatim/output/fragment_shader_equivalent_single_color.glsl.pdf}}
+\end{figure}
+
+via the src swizzles. I think it is interesting that there are so many options
+for producing inline constants within the fragment shader.
+
+The ``target'' fragment shader field also seems interesting. I am excited to
+write shaders that use multiple output buffers.
+
+\subsection{DRM/KMS/GBM}
+
+These renders were not displayed on-screen, so I looked for ways to correct
+this.
+
+Perhaps the most obvious method would be to write to the display controller
+registers (\texttt{D1GRPH\_PRIMARY\_SURFACE\_ADDRESS}) via
+\texttt{RADEON\_DRM\_CS}. However, this does not work due to the command parser
+anti-fun implemented in
+\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r300.c#L643}{r300\_packet0\_check}:
+any register not present in that case statement is considered invalid, and the
+packet buffer is not submitted.
+
+I attempted to do this the ``right way'' via the DRM/KMS/GBM APIs. I then
+learned that this does not behave correctly on my R500 because demos that wait
+for the flag returned by \texttt{DRM\_IOCTL\_MODE\_PAGE\_FLIP} hang forever.
+
+I noticed this earlier on Xorg/GLX as well, as I have been using the
+\texttt{vblank\_mode=0} environment variable to avoid hanging forever in
+\texttt{glXSwapBuffers}. This appears to be a Linux kernel bug, but I didn't
+investigate this further.
+
+\subsection{On-screen drawing}
+
+I noticed in \texttt{/sys/kernel/debug/radeon\_vram\_mm} that the Linux console
+is only using a single framebuffer (and does not double-buffer).
+
+This is fortunate, because this means I can simply
+\href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/pci_user/main.c#L48}{mmap
+  the register address space} and write
+\texttt{D1GRPH\_PRIMARY\_SURFACE\_ADDRESS} myself without worrying about the
+Linux console overwriting my change. I observed the \texttt{0x813000} value from
+\texttt{/sys/kernel/debug/radeon\_vram\_mm}--there appears to be no other way to
+get the vram address of a GEM buffer.
+
+This is ``good enough'' for now, though at some point I'll want to learn how to
+do proper vblank-synchronized double buffering.
+
+\subsection{Triangle drawing attempt \#3}
+
+I felt the next logical step was to learn how attributes and constants are
+passed through the shader pipeline, so I then \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/vertex_color.c}{created a demo} that produced this image (this time also displayed on-screen):
+
+\begin{figure}
+  \href{images/vertex_color.png}{\includegraphics{images/vertex_color.png}}
+  \caption*{R500 framebuffer capture, \texttt{vertex\_color.c}}
+\end{figure}
+
+\begin{figure}
+\begin{verbatim}
+instruction[0]:
+  0x00702203  dst: VE_ADD out[1].xyz_
+  0x01d10021  src0: input[1].xyz_
+  0x01248021  src1: input[1].0000
+  0x01248021  src2: input[1].0000
+instruction[1]:
+  0x00f00203  dst: VE_ADD out[0].xyzw
+  0x01510001  src0: input[0].xyz1
+  0x01248001  src1: input[0].0000
+  0x01248001  src2: input[0].0000
+\end{verbatim}
+\caption*{R500 vertex shader (2 instructions, 128-bit control words)}
+\end{figure}
+
+This vertex shader is doing the equivalent of:
+
+\begin{figure}
+  \href{verbatim/vertex_shader_equivalent_vertex_color.glsl}{\includegraphics{verbatim/output/vertex_shader_equivalent_vertex_color.glsl.pdf}}
+\end{figure}
+
+The extra vertex input is fed to the vertex shader via changes to
+\texttt{VAP\_PROG\_STREAM\_CNTL\_0},
+\texttt{VAP\_PROG\_STREAM\_CNTL\_EXT\_0}. Based on my currently limited
+understanding, it seems that arranging the vertex data like this:
+
+\begin{figure}
+  \href{verbatim/vap_prog_stream_vertices.c}{\includegraphics{verbatim/output/vap_prog_stream_vertices.c.pdf}}
+\end{figure}
+
+Is easier to deal with in \texttt{VAP\_PROG\_STREAM\_CNTL} than:
+
+\begin{figure}
+  \href{verbatim/vap_prog_stream_vertices2.c}{\includegraphics{verbatim/output/vap_prog_stream_vertices2.c.pdf}}
+\end{figure}
+
+\begin{figure}
+\begin{verbatim}
+instruction[0]:
+  0x00078005  OUT RGBA
+  0x08020000  RGB ADDR0=temp[0] ADDR1=0.0 ADDR2=0.0
+  0x08020080  ALPHA ADDR0=0.0 ADDR1=0.0 ADDR2=0.0
+  0x1c440220  RGB_SEL_A=src0.rgb RGB_SEL_B=src0.rgb TARGET=A
+  0x1cc18003  ALPHA_OP=OP_MAX ALPHA_SEL_A=src0.1 ALPHA_SEL_B=src0.1 TARGET=A
+  0x00000005  RGB_OP=OP_MAX
+\end{verbatim}
+\caption*{R500 fragment shader (1 instruction, 192-bit control word)}
+\end{figure}
+
+This fragment shader is doing the equivalent of:
+
+\begin{figure}
+  \href{verbatim/fragment_shader_equivalent_vertex_color.glsl}{\includegraphics{verbatim/output/fragment_shader_equivalent_vertex_color.glsl.pdf}}
+\end{figure}
+
+The \texttt{temp} input appears to be written by
+\texttt{VAP\_OUT\_VTX\_FMT\_0\_\_VTX\_COLOR\_0\_PRESENT} and read due to the
+changes to \texttt{RS\_COUNT} and \texttt{RS\_INST\_0}.
+
+\section{Progress: 21 Oct 2025}
+
+From 15 Oct 2025 to 21 Oct 2025, I achieved the following (roughly in chronological order):
+
+\begin{itemize}
+\item I learned how the vertex fetcher is \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/vertex_color_aos.c#L387-L401}{configured}
+\item I learned how the ``point list'' drawing primitive can be used to \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear.c#L504}{clear the screen}
+\item I invented a new syntax for R500 vertex shader assembly (ATI never specified one themselves)
+\item I modified my R500 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/pvs_disassemble.py}{vertex shader disassembler} to emit this new vertex shader syntax
+\item I wrote a R500 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/assembler/vs}{vertex shader assembler} that can process my vertex shader assembly syntax
+\item I created several animated demos with \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear_translate_vblank.c#L849-L859}{vblank-synchronized double buffering}
+\item I learned how to configure and draw (multi-)textured triangles
+\item I learned how to configure, clear, and use Z-buffers
+\item I made a \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/texture_cube_clear_zwrite_vertex_shader.c}{textured rotating cube demo} that uses my first non-trivial \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/cube_rotate.vs.asm}{handwritten vertex shader assembly program}
+\item I invented a new syntax for R500 fragment shader assembly (ATI never specified one themselves)
+\item I wrote a new R500 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/us_disassemble2.py}{fragment shader disassembler} that emits this new fragment shader syntax
+\item I wrote a R500 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/assembler/fs}{fragment shader assembler} that can process my fragment shader assembly syntax
+\item I wrote a ``shadertoy''-style demo that uses my first non-trivial \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/shadertoy_palette.fs.asm}{handwritten fragment shader assembly program}
+\end{itemize}
+
+\subsection{DRM\_RADEON\_CS state tracking}
+
+While attempting to refactor one of my R500 demos to send fewer registers per
+\texttt{DRM\_RADEON\_CS} ioctl, I found that there is a ``state tracker'' within
+the \texttt{drm/radeon/r100}. For example, even if you don't use or depend on a
+Z-buffer, \texttt{DRM\_RADEON\_CS} will still reject your packet buffer
+depending on its own (imagined) concept of what the GPU state is. For example:
+
+\begin{verbatim}
+[ 1614.729278] [drm:r100_cs_track_check [radeon]] *ERROR* [drm] No buffer for z buffer !
+[ 1614.729626] [drm:radeon_cs_ioctl [radeon]] *ERROR* Invalid command stream !
+\end{verbatim}
+
+This happens because \texttt{track->z\_enabled} is
+\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r100.c#L2435}{initially
+  true} at the start of a \texttt{DRM\_RADEON\_CS} ioctl, and does not become
+false unless the packet buffer
+\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r300.c#L836-L843}{contains
+  a write} to \texttt{ZB\_CNTL}.
+
+This seems a bit heavy-handed. Even if the model were ``multiple applications
+may be using the GPU, so a single application can't depend on previously-set
+register state'', it would still be better if the kernel didn't try to enforce
+this by restricting permissible content of a packet buffer.
+
+\subsection{Vertex transform bypass}
+
+Mesa uses a ``point'' 3D primitive to implement \texttt{glClear} on R500. It
+does this by first uploading this vertex shader:
+
+\begin{figure}
+  \href{verbatim/mesa_glclear.vs.asm}{\includegraphics{verbatim/output/mesa_glclear.vs.asm.pdf}}
+  \caption*{\texttt{mesa\_glclear.vs.asm}}
+\end{figure}
+
+This shader does nothing to the input other than copy it to the output, where
+\texttt{out[0]} is the position vector, and \texttt{out[1]} is sent to the
+fragment shader as a ``texture coordinate''. That fragment shader, in turn, does
+not use the texture coordinate:
+
+\begin{figure}
+  \href{verbatim/mesa_glclear.fs.asm}{\includegraphics{verbatim/output/mesa_glclear.fs.asm.pdf}}
+  \caption*{\texttt{mesa\_glclear.fs.asm}}
+\end{figure}
+
+In my ``clear''
+\href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear_rotate_vblank.c#L539}{implementation},
+I instead set \texttt{PVS\_BYPASS}, which ``bypasses'' the vertex shader
+completely, sending the vertices directly to the rasterizer. This is convenient
+because it obviates the need to upload/change vertex shaders just to clear the
+color and Z-buffers.
+
+\subsection{Animation attempt \#1}
+
+With a working colorbuffer clear, I wrote the
+\href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear_translate.c#L786}{single\_color\_clear\_translate.c}
+demo to translate my triangle position coordinates in a loop that waits for
+\texttt{DRM\_RADEON\_GEM\_WAIT\_IDLE} between each frame. This attempt
+produced the following images:
+
+\begin{figure}
+  \includegraphics{videos/single_color_clear_translate.png}
+  \caption*{R500 DVI capture, \texttt{single\_color\_clear\_translate.c}}
+\end{figure}
+
+This was intended to be a smooth animation, yet it is not. It also seems several
+frames are never being displayed--the translation step is much smaller than what
+is shown in the video.
+
+This, interestingly, is exactly identical to how OpenGL/GLX applications behave
+on R500 with \texttt{vblank\_mode=0}.
+
+\subsection{Animation attempt \#2}
+
+I read the R500 display controller \href{doc/RRG-216M56-03oOEM.pdf}{register reference guide} again.
+It appears to suggest the \texttt{D1CRTC\_UPDATE\_INSTANTLY} bit, when unset,
+might cause changes to \texttt{D1GRPH\_PRIMARY\_SURFACE\_ADDRESS} to be delayed in
+hardware until the next vertical blanking interval begins.
+
+This can be combined with polling \texttt{D1GRPH\_SURFACE\_UPDATE\_PENDING} to
+later determine when the vblank-synchronized frame change actually occurred.
+
+This is precisely what I implemented in
+\href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear_translate_vblank.c#L854-L855}{single\_color\_clear\_translate\_vblank.c}:
+
+\begin{figure}
+  \includegraphics{videos/single_color_clear_translate_vblank.png}
+  \caption*{R500 DVI capture, \texttt{single\_color\_clear\_translate\_vblank.c}}
+\end{figure}
+
+This is much closer to what I intended. The
+\texttt{D1GRPH\_SURFACE\_UPDATE\_PENDING} part is certainly working as I
+expected. Setting/unsetting \texttt{D1CRTC\_UPDATE\_INSTANTLY} appears to have
+no effect on \texttt{D1GRPH\_PRIMARY\_SURFACE\_ADDRESS} behavior, so I feel my
+understanding of R500 double-buffering is still incomplete.
+
+\subsection{Multiple-texture sampling}
+
+I am amazed and delighted how simple multiple-texture sampling is on R500.
+
+As a counter-example, while Sega Dreamcast does have a fairly capable
+fixed-function blending unit, to use the blending unit with multiple-texture
+sampled polygons one needs to render the polygon multiple times (at least once
+per texture) to an accumulation buffer. Blending is then performed between the
+currently-sampled texture and the previously-accumulated result, and the blend
+result is written to the accumulation buffer. From a vertex transformation
+perspective, it can be inconvenient/inefficient to be required to buffer entire
+triangle strips so that they can be submitted more than once per frame without
+duplicating the clip/transform computations.
+
+This is the fragment shader for
+\href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/texture_dual.c}{texture\_dual.c}
+(disassembly of code originally generated by Mesa):
+
+\begin{figure}
+  \href{verbatim/texture_dual.fs.asm}{\includegraphics{verbatim/output/texture_dual.fs.asm.pdf}}
+  \caption*{\texttt{texture\_dual.fs.asm}}
+\end{figure}
+
+This pre-subtract multiply-add is an algebraic rearrangement of this GLSL code:
+
+\begin{figure}
+  \href{verbatim/texture_dual.fs.glsl}{\includegraphics{verbatim/output/texture_dual.fs.glsl.pdf}}
+  \caption*{\texttt{texture\_dual.fs.glsl}}
+\end{figure}
+
+Which produces this image:
+
+\begin{figure}
+  \href{images/texture_dual.png}{\includegraphics{images/texture_dual.png}}
+  \caption*{R500 framebuffer capture, \texttt{texture\_dual.c}}
+\end{figure}
+
+Being able to manipulate the texture samples as fragment shader unit temporaries
+rather than as a sequence of accumulation buffer operations has me feeling excited
+to do more with this.
+
+\subsection{Z-buffer clear}
+
+I've never worked with traditional Z-buffers before--Sega Saturn uses
+\href{https://en.wikipedia.org/wiki/Painter\%27s_algorithm}{painter's algorithm}
+exclusively, and Sega Dreamcast uses a ``depth accumulation buffer''
+that isn't directly readable/writable.
+
+It is slightly obvious in retrospect, but it took me several minutes to realize
+that a ``depth clear'' can be implemented by covering the entire screen with a
+``point'' primitive with the desired initial depth while \texttt{ZFUNC} is set to
+\texttt{ALWAYS}.
+
+\subsection{Drawing a 3D cube}
+
+With working double-buffering, Z-buffering, and the ability to clear each of
+these every frame, I felt I was finally ready to draw something ``3D''.
+
+I thought it would be fun to first start with a cube that is transformed in
+``software'' on the x86 CPU (not using a vertex shader). This sequence of videos
+shows my progression on implementing this:
+
+\begin{figure}
+  \includegraphics{videos/texture_cube.png}
+  \caption*{R500 DVI capture, \texttt{texture\_cube.c}}
+\end{figure}
+
+\begin{figure}
+  \includegraphics{videos/texture_cube_clear.png}
+  \caption*{R500 DVI capture, \texttt{texture\_cube\_clear.c}}
+\end{figure}
+
+\begin{figure}
+  \includegraphics{videos/texture_cube_clear_zwrite.png}
+  \caption*{R500 DVI capture, \texttt{texture\_cube\_clear\_zwrite.c}}
+\end{figure}
+
+\subsection{Drawing a 3D cube with vertex shaders}
+
+I then decided it would be fun to hand-write a ``3D rotation'' vertex shader
+from scratch. I first implemented the rotation in GLSL:
+
+\begin{figure}
+  \href{verbatim/cube_rotate.vs.glsl}{\includegraphics{verbatim/output/cube_rotate.vs.glsl.pdf}}
+  \caption*{\texttt{cube\_rotate.vs.glsl}}
+\end{figure}
+
+I verified that the GLSL version worked as expected in OpenGL, then I translated
+the GLSL to R500 vertex shader assembly, as:
+
+\begin{figure}
+  \href{verbatim/cube_rotate.vs.asm}{\includegraphics{verbatim/output/cube_rotate.vs.asm.pdf}}
+  \caption*{\texttt{cube\_rotate.vs.asm}}
+\end{figure}
+
+However, when I first executed the vertex shader cube rotation demo, I found
+it did not work as expected:
+
+\begin{figure}
+  \includegraphics{videos/texture_cube_clear_zwrite_vertex_shader_incorrect.png}
+  \caption*{R500 DVI capture, \texttt{texture\_cube\_clear\_zwrite\_vertex\_shader.c}\\(incorrect vertex shader assembler output)}
+\end{figure}
+
+After hours of debugging, I eventually found the issue was in this instruction:
+
+\begin{figure}
+  \href{verbatim/cube_rotate_3_temp.vs.asm}{\includegraphics{verbatim/output/cube_rotate_3_temp.vs.asm.pdf}}
+\end{figure}
+
+\href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf} briefly mentions this on pages 98 and 99:
+
+\begin{quote}
+The PVS\_DST\_MACRO\_INST bit was meant to be used for MACROS such as a
+vector-matrix multiply, but currently is only set for the following cases:
+
+A VE\_MULTIPLY\_ADD or VE\_MULTIPLYX2\_ADD instruction with all 3 source
+operands using unique PVS\_REG\_TEMPORARY vector addresses.  Since R300 only has
+two read ports on the temporary memory, this special case of these instructions
+is broken up (by the HW) into 2 operations.
+\end{quote}
+
+I read this paragraph much earlier, but I didn't fully understand it until
+now. Indeed, this multiply-add has three unique \texttt{temp} addresses, and
+must be encoded as a ``macro'' instruction.
+
+I fixed this in my vertex shader assembler by
+\href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/assembler/vs/validator.py}{counting the number of unique temp addresses}
+referenced by each instruction, promoting \texttt{VE\_MULTIPLY\_ADD} to
+\texttt{PVS\_MACRO\_OP\_2CLK\_MADD} if more than two unique \texttt{temp}
+addresses are referenced.
+
+With this change, reassembling the same vertex shader source code now produces a
+correct vertex shader cube rotation:
+
+\begin{figure}
+  \includegraphics{videos/texture_cube_clear_zwrite_vertex_shader.png}
+  \caption*{R500 DVI capture, \texttt{texture\_cube\_clear\_zwrite\_vertex\_shader.c}\\(correct vertex shader assembler output)}
+\end{figure}
+
+\subsection{Comparison with Mesa's R500 vertex shader compiler}
+
+My ``cube rotation'' vertex shader,
+\href{https://git.idk.st/bilbo/r500/src/commit/50244c7c95/drm/cube_rotate.vs.asm}{cube\_rotate.vs.asm}
+is 15 instructions.
+
+Mesa's R500 vertex shader compiler generated a
+\href{https://git.idk.st/bilbo/r500/src/commit/50244c7c95/shader_examples/mesa/texture_cube_depth_vertex_shader.vs.txt}{27-instruction vertex shader}
+from \href{https://r500.idk.st/verbatim/cube_rotate.vs.glsl}{semantically equivalent GLSL code}. Disassembly:
+
+\begin{figure}
+  \href{verbatim/mesa_cube_rotate.vs.asm}{\includegraphics{verbatim/output/mesa_cube_rotate.vs.asm.pdf}}
+  \caption*{\texttt{mesa\_cube\_rotate.vs.asm}}
+\end{figure}
+
+I was not particularly trying to write concise code, but I find this difference
+in instruction count to be surprising. In general it seems Mesa's R500 vertex
+shader compiler failed to vectorize several operations, and does significantly
+more scalar multiplies and scalar multiply-adds than my implementation.
+
+Ignoring algorithmic improvements (such as lifting the sin/cos calculation to
+x86 code and instead sending a 4x4 matrix to the vertex shader), there is still
+more opportunity for optimization beyond my 15-instruction implementation.
+
+Particularly, the vertex shader unit has a ``dual math'' instruction mode, where
+``vector engine'' (VE\_) and ``math engine'' (ME\_) operations can be executed
+simultaneously in the same instruction. \texttt{cube\_rotate.vs.asm} would
+indeed benefit from such an optimization--most of the \texttt{ME\_SIN} and
+\texttt{ME\_COS} instructions could be interleaved with the \texttt{VE\_MUL} and
+\texttt{VE\_MAD} operations that follow (at significant expense to
+human-readability).
+
+I am curious to see more examples of the difference between Mesa's R500 vertex
+shader compiler output and my own vertex shader assembly.
+
+\subsection{Fragment shader instruction expressiveness}
+
+Compared to the R500 vertex shader instructions, the R500 fragment shader
+instructions are significantly more featureful. This makes inventing a syntax
+that can fully express the range of operations that a R500 fragment shader
+instruction can do more complex.
+
+A significant difference is where R500 vertex shaders have a single tier of
+operand argument decoding, as in:
+
+\begin{figure}
+  \includegraphics{diagrams/vertex_inputs.svg}
+  \caption*{R500 vertex shader instruction operand inputs (simplified)}
+\end{figure}
+
+While R500 fragment shaders have multiple tiers of operand argument decoding, as
+in:
+
+\begin{figure}
+  \includegraphics{diagrams/fragment_inputs.svg}
+  \caption*{R500 fragment shader instruction operand inputs (simplified)}
+\end{figure}
+
+I've written several \href{https://github.com/buhman/scu-dsp-asm}{nice assemblers}
+for other architectures in the past, but I've never seen any instruction set
+as expressive as R500 fragment shaders.
+
+I attempted to directly reflect this ``multiple tiers of operand argument
+decoding'' in the syntax I invented for fragment shader ALU instructions.
+
+These instructions are also vector instructions: a total of 24 floating point
+input operands and 8 floating point results could be evaluated per instruction.
+
+With this abundance of expressiveness and a relatively high skill ceiling, I'm
+amazed R500 fragment shader assembly isn't more popular in programming
+competitions, general everyday conversation, etc...
+
+\subsection{Fragment shader assembler bugs}
+
+There were two ``I spent a lot of time debugging this'' issues I encountered
+with my fragment shader assembler.
+
+The first was in this code I wrote to draw a fragment shaded circle, as in:
+
+\begin{figure}
+  \href{images/shadertoy_circle.png}{\includegraphics{images/shadertoy_circle.png}}
+  \caption*{R500 framebuffer capture, \texttt{shadertoy\_circle.fs.asm}}
+\end{figure}
+
+However, in an earlier version of my fragment shader assembler, I produced this
+image instead:
+
+\begin{figure}
+  \href{images/shadertoy_circle_incorrect.png}{\includegraphics{images/shadertoy_circle_incorrect.png}}
+  \caption*{R500 framebuffer capture, \texttt{shadertoy\_circle.fs.asm}\\(incorrect assembler output)}
+\end{figure}
+
+In this handwritten fragment shader code:
+
+\begin{figure}
+  \href{verbatim/shadertoy_circle.fs.asm}{\includegraphics{verbatim/output/shadertoy_circle.fs.asm.pdf}}
+  \caption*{\texttt{shadertoy\_circle.fs.asm}}
+\end{figure}
+
+\href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf} says briefly on page 241:
+
+\begin{quote}
+Specifies whether to insert a NOP instruction after this.  This would get
+specified in order to meet dependency requirements for the pre-subtract inputs,
+and dependency requirements for src0 of an MDH/MDV instruction.
+\end{quote}
+
+The issue is the pre-subtract input for the \texttt{MAD |srcp.a| src0.1 -src2.a}
+instruction depends on the write to \texttt{temp[0].a} from the immediately
+preceding \texttt{RCP src0.a} instruction--a pipeline hazard.
+
+To fix this, I added support for
+\href{https://git.idk.st/bilbo/r500/commit/fe0684ca5e58ed3be026410812c042e883bdce71}{generating the \texttt{NOP} bit}
+in my fragment shader assembler.
+
+\subsection{More fragment shader assembler bugs}
+
+While trying to produce this image:
+
+\begin{figure}
+  \href{images/shadertoy_palette.png}{\includegraphics{images/shadertoy_palette.png}}
+  \caption*{R500 framebuffer capture, \texttt{shadertoy\_palette.fs.asm}}
+\end{figure}
+
+My fragment shader code instead produced this image:
+
+\begin{figure}
+  \href{images/shadertoy_palette_incorrect.png}{\includegraphics{images/shadertoy_palette_incorrect.png}}
+  \caption*{R500 framebuffer capture, \texttt{shadertoy\_palette.fs.asm}\\(incorrect assembler output)}
+\end{figure}
+
+The issue was simply that in the chaos of all of the other features I was
+implementing for my fragment shader assembler, I
+\href{https://git.idk.st/bilbo/r500/commit/f6a0fc4fab5dee3085dcf4b9a984244bba05d5ca}{forgot to emit the \texttt{ADDRD} bits}.
+
+This meant that while fragment shader code that exclusively uses zero-address
+destinations, such as \texttt{shadertoy\_circle.fs.asm}, appeared to work
+completely correctly, I encountered this bug as soon as I started using non-zero
+addresses such as \texttt{temp[1]} in my fragment shader code.
+
+\subsection{Comparison to Direct3D ``asm''}
+
+Prior to Direct3D 10, Microsoft defined a specification for both
+\href{https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx9-graphics-reference-asm-vs-3-0}{vertex shader assembly} and
+\href{https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx9-graphics-reference-asm-ps-3-0}{fragment shader assembly}.
+
+The Direct3D ``asm'' name is slightly deceptive, however, as the
+\texttt{vs\_3\_0} and \texttt{ps\_3\_0} instruction syntax does not map 1-to-1
+with any hardware that exists.
+
+It would perhaps be more accurate to think of Direct3D's ``asm''
+language and compiler as more analogous to a
+\href{https://en.wikipedia.org/wiki/BASIC}{shader BASIC} than as a true assembly
+language on the same level as ``6502 assembly'', ``Z80 assembly'' and similar.
+
+In contrast, my R500 assembly syntaxes are deliberately/explicitly mapped 1-to-1
+with R500 instructions.
+
+\subsection{Fragment shader animated demo}
+
+\begin{figure}
+  \includegraphics{videos/shadertoy_palette.png}
+  \caption*{R500 DVI capture, \texttt{shadertoy\_palette.fs.asm}}
+\end{figure}
+
+The R500 fragment shader code that I handwrote for this is:
+
+\begin{figure}
+  \href{verbatim/shadertoy_palette.fs.asm}{\includegraphics{verbatim/output/shadertoy_palette.fs.asm.pdf}}
+  \caption*{\texttt{shadertoy\_palette.fs.asm}}
+\end{figure}
+
+The \texttt{float} constants are interesting--they are decoded almost
+identically to the
+\href{https://en.wikipedia.org/wiki/Minifloat#8-bit_(1.4.3)}{8-bit (1.4.3) (bias 7) format shown on Wikipedia},
+except:
+\begin{itemize}
+\item There is no sign bit (the value is always positive--positive values
+  can be swizzled to produce negative operands)
+\item There is no ``zero'' value (zero can also be instead obtained via
+  swizzles); the ``all zeros'' bit pattern instead has a value of
+  \texttt{0.0009765625}.
+\item There are no infinite or not-a-number values: a ``15'' exponent is treated
+  as 15.
+\end{itemize}
+
+The exponent/mantissa table that shows example 7-bit float values on page 106 of
+\href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf} is incorrect.
+
+\end{document}
diff --git a/mesa/glDrawArrays.txt b/mesa/glDrawArrays.txt
new file mode 100644
index 0000000..a9439ad
--- /dev/null
+++ b/mesa/glDrawArrays.txt
@@ -0,0 +1,197 @@
+// _mesa_draw_arrays
+
+// r300_draw_vbo
+
+// r300_draw_arrays_immediate
+
+  vertex_size = 3
+  dwords = 13
+
+  // r300_prepare_for_rendering
+    // r300_emit_states
+      // r300_reserve_cs_dwords
+        389
+      // r300_emit_dirty_state
+        // r300_emit_gpu_flush
+          SC_SCISSOR0 = 0
+          SC_SCISSOR1 = (width - 1), (height - 1) // 600, 600
+          // cb_flush_clean
+          RB3D_DSTCACHE_CTLSTAT = 0xa
+          ZB_ZCACHE_CTLSTAT = 0x3
+          WAIT_UNTIL [0x1720] =  RADEON_WAIT_3D_IDLECLEAN
+        // r300_emit_aa_state
+          GB_AA_CONFIG = 0
+          RB3D_AARESOLVE_CTL = 0
+        // r300_emit_fb_state
+          RB3D_CCTL = 16384
+          RB3D_COLOROFFSET0 = 0
+          //OUT_CS_RELOC
+            OUT_CS(0xc0001000); /* PKT3_NOP */ \
+            OUT_CS(0);
+
+          RB3D_COLORPITCH0 = 0xc10640
+          //OUT_CS_RELOC
+            OUT_CS(0xc0001000); /* PKT3_NOP */ \
+            OUT_CS(0);
+
+          ZB_FORMAT = 2
+          ZB_DEPTHOFFSET = 0
+          //OUT_CS_RELOC
+            OUT_CS(0xc0001000); /* PKT3_NOP */ \
+            OUT_CS(4);
+
+          ZB_DEPTHPITCH = 0x30640
+          //OUT_CS_RELOC
+            OUT_CS(0xc0001000); /* PKT3_NOP */ \
+            OUT_CS(4);
+        // r300_emit_hyperz_state
+          ZB_BW_CNTL = 0
+          ZB_DEPTHCLEARVALUE = 0
+          SC_HYPERZ_EN = 0x1c
+          GB_Z_PEQ_CONFIG = 0
+        // r300_emit_ztop_state
+          ZB_ZTOP = 1
+        // r300_emit_dsa_state
+          FG_ALPHA_FUNC = 0
+          ZB_CNTL = 0
+          ZB_ZSTENCILCNTL = 0
+          ZB_STENCILREFMASK = 0
+          ZB_STENCILREFMASK_BF = 0
+          FG_ALPHA_VALUE = 0
+        // r300_emit_blend_state
+          RB3D_ROPCNTL = 0
+          RB3D_BLENDCNTL = 0
+          RB3D_ABLENDCNTL = 0
+          RB3D_COLOR_CHANNEL_MASK = 15
+          RB3D_DITHER_CTL = 0
+        // r300_emit_blend_color_state
+          RB3D_CONSTANT_COLOR_AR = 0
+          RB3D_CONSTANT_COLOR_GB = 0
+        // r300_emit_scissor_state
+          SC_CLIP_0_A = 0, 0
+          SC_CLIP_0_B = 0 - 1, 0 - 1
+        // r300_emit_sample_mask
+          SC_SCREENDOOR = 63 | (63 << 6) | (63 << 12) | (63 << 18)
+        // r300_emit_invariant_state
+          GB_SELECT = 0
+          FG_FOG_BLEND = 0
+          GA_OFFSET = 0
+          SU_TEX_WRAP = 0
+          SU_DEPTH_SCALE = 16777215.0f (0x4b7fffff)
+          SU_DEPTH_OFFSET = 0
+          SC_EDGERULE = 0x2da49525
+          RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD = 0x1010101
+          RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD = 0xfefefefe
+          GA_COLOR_CONTROL_PS3 = 0
+          SU_TEX_WRAP_PS3 = 0
+        // r300_emit_viewport_state
+          VAP_VPORT_XSCALE = 300
+          VAP_VPORT_XOFFSET = 300
+          VAP_VPORT_YSCALE = -300
+          VAP_VPORT_YOFFSET = 300
+          VAP_VPORT_ZSCALE = 0.5
+          VAP_VPORT_ZOFFSET = 0.5
+          VAP_VTE_CNTL = 0x43f
+        // r300_emit_pvs_flush
+          VAP_PVS_STATE_FLUSH_REG = 0
+        // r300_emit_vap_invariant_state
+          VAP_PVS_VTX_TIMEOUT_REG = 0xffff
+          VAP_GB_VERT_CLIP_ADJ = 1.0f (0x3f800000)
+          VAP_GB_VERT_DISC_ADJ = 1.0f (0x3f800000)
+          VAP_GB_HORZ_CLIP_ADJ = 1.0f (0x3f800000)
+          VAP_GB_HORZ_DISC_ADJ = 1.0f (0x3f800000)
+          VAP_PSC_SGN_NORM_CNTL = 0xaaaaaaaa
+          VAP_TEX_TO_COLOR_CNTL = 0
+        // r300_emit_vertex_stream_state
+          VAP_PROG_STREAM_CNTL_0 = 0x2002
+          VAP_PROG_STREAM_CNTL_EXT_0 = 0xfa88
+        // r300_emit_vs_state
+          VAP_PVS_CODE_CNTL_0 = 0
+          VAP_PVS_CODE_CNTL_1 = 0
+          VAP_PVS_VECTOR_INDX_REG = 0
+          VAP_PVS_VECTOR_DATA_REG_128 = (ONE_REG_WR:)
+            {0xf00203, 0xd10001, 0x1248001, 0x1248001}
+          VAP_CNTL = 0xb0055a
+          VAP_PVS_FLOW_CNTL_OPC = 0
+          VAP_PVS_FLOW_CNTL_ADDRS_LW_[0-15] = 0
+          VAP_PVS_FLOW_CNTL_ADDRS_UW_[0-15] = 0
+          VAP_PVS_FLOW_CNTL_LOOP_INDEX_[0-15] = 0
+        // r300_emit_clip_state
+          VAP_PVS_VECTOR_INDX_REG = 0x600
+          VAP_PVS_VECTOR_DATA_REG_128 =
+            {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} (24)
+        // r300_emit_rs_block_state
+          VAP_VTX_STATE_CNTL = 0x5555
+          VAP_VSM_VTX_ASSM [0x2184] = 0x1
+          VAP_OUTPUT_VTX_FMT_0 = 1
+          VAP_OUTPUT_VTX_FMT_1 = 4
+          GB_ENABLE = 0
+          RS_IP_0 = 0x30000000
+          RS_COUNT = 0x40080
+          RS_INST_COUNT = 0
+          RS_INST_0 = 0
+        // r300_emit_rs_state
+          VAP_CNTL_STATUS = 0
+          VAP_CLIP_CNTL = 0xc000
+          GA_POINT_SIZE = 0x60006
+          GA_POINT_MINMAX = 0x60006
+          GA_LINE_CNTL = 0x20006
+          SU_POLY_OFFSET_ENABLE = 0
+          SU_CULL_MODE = 0
+          GA_LINE_STIPPLE_CONFIG = 0
+          GA_LINE_STIPPLE_VALUE = 0
+          GA_POLY_MODE = 0
+          GA_ROUND_MODE = 0x31
+          SC_CLIP_RULE = 0xffff
+          GA_POINT_S0 = 0
+          GA_POINT_T0 = 1.0f (0x3f800000)
+          GA_POINT_S1 = 1.0f (0x3f800000)
+          GA_POINT_T1 = 0
+        // r300_emit_fb_state_pipelined
+          US_OUT_FMT_0 = 0x1b00
+          US_OUT_FMT_1 = 0xf
+          US_OUT_FMT_2 = 0xf
+          US_OUT_FMT_3 = 0xf
+          GB_MSPOS0 = 0x66666666
+          GB_MSPOS1 = 0x6666666
+        // r500_emit_fs
+          US_CONFIG = 2
+          US_PIXSIZE = 1
+          US_FC_CTRL = 0
+          US_CODE_RANGE = 0
+          US_CODE_OFFSET = 0
+          US_CODE_ADDR = 0
+          GA_US_VECTOR_INDEX = 0
+          GA_US_VECTOR_DATA = (ONE_REG_WR:)
+            {0x78005, 0x8020080, 0x8020080, 0x1c9b04d8, 0x1c810003, 0x5}
+          FG_DEPTH_SRC = 0
+          US_W_FMT = 0
+        // r500_emit_fs_rc_constant_state
+          [nothing]
+        // r500_emit_fs_constants
+          [nothing]
+        // r300_emit_vs_constants
+          VAP_PVS_CONST_CNTL = 0
+        // r300_emit_texture_cache_inval
+          TX_INVALTAGS = 0
+        // r300_emit_textures_state
+          TX_ENABLE = 0
+        // r300_emit_query_start
+          [nothing]
+        // r500_emit_index_bias
+          VAP_INDEX_OFFSET = 0
+        // r300_emit_draw_init
+          GA_COLOR_CONTROL = 0x3aaaa
+          VAP_VF_MAX_VTX_INDX = 2
+          VAP_VF_MIN_VTX_INDX = 0
+
+        // r300_draw_arrays_immediate
+          VAP_VTX_SIZE = 3
+
+          [
+            PACKET3_3D_DRAW_IMMD_2 (3 * 3)
+            0x30034 // VAP_VF_CNTL
+            {0.5, -0.5, 0}
+            {-0.5, -0.5, 0}
+            {0, 0.5, 0}
+          ]
diff --git a/replace_video.py b/replace_video.py
new file mode 100644
index 0000000..37617b1
--- /dev/null
+++ b/replace_video.py
@@ -0,0 +1,46 @@
+"""Replace video placeholder images in an HTML file with <video> elements.
+
+Usage: python replace_video.py FILE
+
+FILE is rewritten in place; lines without a placeholder are copied through
+unchanged.
+"""
+import sys
+
+# Not referenced anywhere in this script.
+scale = 1.5
+
+
+def transform():
+    """Yield the pieces of the rewritten document in output order.
+
+    For a line holding a placeholder, the text before the first "<img" comes
+    first, then whatever follows that tag's closing "/>", and only then the
+    <video> markup -- the video ends up after the remainder of the line.
+    """
+    with open(sys.argv[1]) as html:
+        for row in html:
+            if "<img alt='PIC' src='videos/" not in row:
+                yield row
+                continue
+
+            before, _, after = row.partition("<img")
+            tag, trailing = after.split("/>", maxsplit=1)
+
+            poster = tag.split("src='")[1].partition("'")[0]
+            assert poster.endswith(".png"), poster
+            video = poster.removesuffix(".png") + ".mp4"
+
+            yield "".join((
+                before,
+                trailing,
+                "<video style='width: 100%;' controls=''>",
+                f"<source src='{video}' type='video/mp4'>",
+                "</video>",
+            ))
+
+
+# Collect everything first: the input is reopened for writing below.
+pieces = list(transform())
+with open(sys.argv[1], 'w') as out:
+    out.write(''.join(pieces))
diff --git a/resize_svg.py b/resize_svg.py
new file mode 100644
index 0000000..62b9ab2
--- /dev/null
+++ b/resize_svg.py
@@ -0,0 +1,39 @@
+"""Scale the width and height of an SVG file's root element, in place.
+
+Usage: python resize_svg.py FILE
+"""
+import sys
+
+# Factor applied to both the width and the height attribute.
+scale = 1.5
+
+def transform():
+    """Yield the lines of the rewritten SVG, line endings included.
+
+    A line whose stripped text starts with "<svg xmlns" is replaced by a
+    fresh opening tag carrying the scaled width/height and the original
+    viewBox; any other attribute on that line is dropped.  Every other
+    line passes through unchanged.
+    """
+    with open(sys.argv[1]) as f:
+        for line in f:
+            if line.strip().startswith("<svg xmlns"):
+                width = line.split('width="')[1].split('"')[0]
+                height = line.split('height="')[1].split('"')[0]
+                viewbox = line.split('viewBox="')[1].split('"')[0]
+                # float() rejects values that carry a unit suffix.
+                width = float(width) * scale
+                height = float(height) * scale
+
+                template = f'<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="{width}" height="{height}" viewBox="{viewbox}">'
+                # Lines read from the file keep their newline, so the
+                # replacement tag needs one of its own.
+                yield template + '\n'
+            else:
+                yield line
+
+lines = list(transform())
+with open(sys.argv[1], 'w') as f:
+    # Each yielded line already carries its newline; joining with '\n'
+    # would put an extra blank line after every one of them.
+    f.write(''.join(lines))
diff --git a/verbatim.sh b/verbatim.sh
new file mode 100644
index 0000000..4a53c54
--- /dev/null
+++ b/verbatim.sh
@@ -0,0 +1,56 @@
+# Render every listing in verbatim/ to a standalone PDF using minted.
+#
+# For each *.asm, *.glsl and *.c file a wrapper FILE.tex is written next to
+# it and compiled twice with pdflatex into verbatim/output/.
+set -eux
+
+cd verbatim/
+
+mkdir -p output
+
+# render_listing LEXER FILE
+#   Wrap FILE in a standalone minted document (FILE.tex) highlighted with
+#   LEXER, then run pdflatex on it twice.
+render_listing() {
+    # The first here-document is unquoted so that $1 expands. A backslash
+    # is only special there before $, `, \ or a newline, so the LaTeX
+    # macros stay literal.
+    # NOTE(review): \setminted[python] names the python lexer, which no
+    # caller below uses -- confirm these options are meant to take effect.
+    cat <<EOF > "${2}.tex"
+\documentclass[varwidth=13.1cm, border={0.0cm 0.0cm 0.0cm 0.0cm}]{standalone}
+\usepackage{minted}
+\setminted[python]{breaklines, linenos, frame=lines, framesep=2mm, fontsize=\huge, numbersep=5pt}
+\standaloneenv{minted}
+\begin{document}
+
+\begin{minted}{$1}
+EOF
+    cat "$2" >> "${2}.tex"
+
+    cat <<'EOF' >> "${2}.tex"
+\end{minted}
+
+\end{document}
+EOF
+
+    pdflatex -shell-escape -output-directory=output "${2}.tex"
+    pdflatex -shell-escape -output-directory=output "${2}.tex"
+}
+
+# The .asm listings are highlighted with the haskell lexer.
+for i in *.asm; do
+    # An unmatched glob stays as the literal pattern; skip it.
+    [ -e "$i" ] || continue
+    render_listing haskell "$i"
+done
+
+for i in *.glsl; do
+    [ -e "$i" ] || continue
+    render_listing glsl "$i"
+done
+
+for i in *.c; do
+    [ -e "$i" ] || continue
+    render_listing c "$i"
+done