diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5e7a2b1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,21 @@
+*.html
+*.css
+*.out
+index.pdf
+_minted/
+*.aux
+*.log
+*.4ct
+*.4tc
+*.dvi
+*.idv
+*.lg
+*.tmp
+*.toc
+*.xref
+*~
+verbatim/*.tex
+verbatim/*.svg
+verbatim/*.pdf
+verbatim/output
+images/*.data
diff --git a/build.sh b/build.sh
new file mode 100644
index 0000000..8492761
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,28 @@
+set -eux
+
+rm -f verbatim/output/*.svg
+
+make4ht --shell-escape index.tex "pic-m,pic-equation,svg"
+
+echo 'img[alt="PIC"] { width: 100%; }' >> index.css
+echo '.cmtt-10 { font-size: 0.9em; }' >> index.css
+echo 'img[src="index3x.svg"] { height: 2.5em; }' >> index.css
+
+sed -i '/prefers-color-scheme/d' index.css
+sed -i 's| | |g' index.html
+sed -i '/figure.figure/d' index.css
+echo 'figure.figure { margin-left: 20px; margin-right: 20px; }' >> index.css
+echo 'pre.verbatim { font-size: 0.9em; }' >> index.css
+sed -i 's|color-scheme: light dark;||g' index.css
+echo 'figcaption.caption { margin-bottom: 1.3em; margin-top: 0.3em; }' >> index.css
+
+sed -i 's/index.css/index2.css/g' index.html
+
+mv index.css index2.css
+
+python replace_video.py index.html
+
+for file in verbatim/output/*.svg; do
+ sed -i 's|rgb(0%, 0%, 100%)||g' "$file"
+ python resize_svg.py "$file"
+done
diff --git a/deploy.sh b/deploy.sh
new file mode 100644
index 0000000..bdbfe13
--- /dev/null
+++ b/deploy.sh
@@ -0,0 +1 @@
+rsync --delete -arv * root@az1.idk.st:/var/www/r500/
diff --git a/diagrams/fragment_inputs.dot b/diagrams/fragment_inputs.dot
new file mode 100644
index 0000000..4b2365c
--- /dev/null
+++ b/diagrams/fragment_inputs.dot
@@ -0,0 +1,61 @@
+digraph D {
+ graph [ranksep="1" splines=line ordering="in"];
+ node [shape=box];
+ edge [arrowhead=none];
+
+ subgraph cluster_W {
+ addr [shape=none]
+
+ temp
+ const
+ float
+ }
+
+ subgraph cluster_Z {
+ {rank=same
+ src [shape=none]
+ src0 [label="src0"]
+ src1 [label="src1"]
+ src2 [label="src2"]
+ srcp
+ }
+
+ }
+
+ subgraph cluster_R {
+
+ {rank=same
+ opcode [shape=none];
+ a [label = "a"];
+ b [label = "b"];
+ c [label = "c"];
+ }
+ }
+
+ temp:s -> src0:n
+ temp:s -> src1:n
+ temp:s -> src2:n
+
+ const:s -> src0:n
+ const:s -> src1:n
+ const:s -> src2:n
+
+ float:s -> src0:n
+ float:s -> src1:n
+ float:s -> src2:n
+
+ src0:s -> a:n
+ src1:s -> a:n
+ src2:s -> a:n
+ srcp:s -> a:n
+
+ src0:s -> b:n
+ src1:s -> b:n
+ src2:s -> b:n
+ srcp:s -> b:n
+
+ src0:s -> c:n
+ src1:s -> c:n
+ src2:s -> c:n
+ srcp:s -> c:n
+}
diff --git a/diagrams/fragment_inputs.svg b/diagrams/fragment_inputs.svg
new file mode 100644
index 0000000..c459c8c
--- /dev/null
+++ b/diagrams/fragment_inputs.svg
@@ -0,0 +1,205 @@
+
+
+
+
+
diff --git a/diagrams/vertex_inputs.dot b/diagrams/vertex_inputs.dot
new file mode 100644
index 0000000..08564e3
--- /dev/null
+++ b/diagrams/vertex_inputs.dot
@@ -0,0 +1,36 @@
+digraph D {
+ graph [ranksep="1" splines=line];
+ node [shape=box];
+ edge [arrowhead=none];
+
+ input
+ const
+ temp
+ alt_temp
+
+ opcode [shape=none];
+ a [label = "a"];
+ b [label = "b"];
+ c [label = "c"];
+
+ subgraph cluster_R {
+
+ {rank=same opcode a b c}
+ }
+
+ input:s -> a:n
+ input:s -> b:n
+ input:s -> c:n
+
+ const:s -> a:n
+ const:s -> b:n
+ const:s -> c:n
+
+ temp:s -> a:n
+ temp:s -> b:n
+ temp:s -> c:n
+
+ alt_temp:s -> a:n
+ alt_temp:s -> b:n
+ alt_temp:s -> c:n
+}
diff --git a/diagrams/vertex_inputs.svg b/diagrams/vertex_inputs.svg
new file mode 100644
index 0000000..726ce47
--- /dev/null
+++ b/diagrams/vertex_inputs.svg
@@ -0,0 +1,124 @@
+
+
+
+
+
diff --git a/index.tex b/index.tex
new file mode 100644
index 0000000..ded3477
--- /dev/null
+++ b/index.tex
@@ -0,0 +1,905 @@
+\documentclass[20pt]{article}
+
+\usepackage[font=small,labelfont=bf]{caption}
+\usepackage{hyperref}
+\hypersetup{
+ colorlinks=true,
+ linkcolor=blue,
+ filecolor=magenta,
+ urlcolor=cyan,
+ pdftitle={Radeon R500},
+ pdfpagemode=FullScreen,
+ }
+
+\usepackage{graphicx}
+\graphicspath{ {./images/} }
+
+\usepackage{minted}
+
+\title{Radeon R500}
+\date{}
+
+\begin{document}
+
+\maketitle
+\href{images/x1950xt.jpg}{\includegraphics{images/x1950xt.jpg}}
+
+\tableofcontents
+
+\section{Introduction}
+
+The primary/minimal project goal is ``draw a triangle on a Radeon R500 via direct
+memory-mapped hardware register and texture memory accesses''. This means no
+\href{https://mesa3d.org/}{Mesa}, no
+\href{https://github.com/torvalds/linux/tree/v6.12/drivers/gpu/drm/radeon}{radeon}
+kernel module, and certainly no OpenGL or Direct3D.
+
+I have worked directly with several other graphics units in the past
+(\href{https://github.com/buhman/saturn-examples}{Saturn VDP1},
+\href{https://github.com/buhman/dreamcast}{Dreamcast Holly},
+\href{https://github.com/buhman/voodoo}{Voodoo 2}). In all of these projects,
+my strategy is generally:
+
+\begin{itemize}
+\item read the entire \href{doc/R5xx_Acceleration_v1.5.pdf}{reference
+ documentation} at least once, front-to-back
+\item copy all hardware register definitions from the documentation to a
+ spreadsheet or text file (sometimes typing everything by hand if I am in such
+ a chill mood)
+\item progressively build increasingly-complex example programs that exercise
+ the hardware
+\end{itemize}
+
+The rabbit hole for R500 seems significantly deeper, considering this is the
+first graphics unit I've worked with that has programmable vertex and pixel
+shader engines.
+
+\subsection{Hardware}
+
+For testing, I currently have this hardware configuration:
+
+\begin{itemize}
+\item ASUS P4B-LX (Intel 845) motherboard
+\item Intel Pentium 4 2.6GHz SL6PP (Northwood)
+\item 1024 MB RAM
+\item 32GB PATA SSD
+\item ATI Radeon X1650 PRO 512MB AGP
+\end{itemize}
+
+I also have the X1950 XT PCIe shown in the photo, which amazingly has never been
+used, and prior to the photo was sealed in an antistatic bag from manufacture to
+now.
+
+\subsection{Test setup}
+
+While in my other (video game console) projects I typically insist on
+``bare-metal'' development with no operating system or third-party library
+running on the target hardware, my experience with x86 is much more limited.
+
+While it is something I am interested in doing, I believe creating a
+zero-dependency ``code upload'' mechanism for an x86-pc that does not depend on
+an operating system would severely delay my progress on R500-specific work.
+
+For my initial exploration of R500, I will instead be manipulating the hardware
+primarily from Linux kernel space. This Linux kernel code does not actually
+meaningfully depend on Linux APIs beyond calling \texttt{ioremap} to get usable
+memory mappings for R500 PCI resources (texture/framebuffer memory and
+registers).
+
+\section{Progress: 07 Oct 2025}
+
+From 01 Oct 2025 to 07 Oct 2025, I achieved the following:
+
+\begin{itemize}
+\item I wrote a reasonably complete AtomBIOS disassembler
+\item I can disable (IBM PC) VGA mode and manipulate the native framebuffer
+\item I can upload microcode to the ``command processor'', and I can write to
+ scratch registers via command processor packets (this is uncoincidentally the
+ same command processor test that the radeon kernel module does).
+\item I stepped through Mesa functions as invoked by a simple OpenGL
+ application, and created \href{mesa/glDrawArrays.txt}{a list of R500
+ registers/values} that are written by Mesa during \texttt{glDrawArrays}.
+\end{itemize}
+
+I did not achieve the following:
+
+\begin{itemize}
+\item I attempted to manipulate the R500 register state and command processor
+ into drawing a triangle, but I have not been successful yet
+\end{itemize}
+
+\subsection{Documentation}
+
+In general, I note that the R500 documentation is significantly weaker than I
+hoped, and does not contain enough information to draw a triangle on the R500
+from the documentation alone (with no prior knowledge about previous Radeon
+graphics units).
+
+In addition to the lack of prose, in several cases I've noticed both Mesa and
+Linux reference R500 registers that are
+\href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/pci/undocumented_3d_registers.h}{not
+ present at all} in the documentation.
+
+\subsection{AtomBIOS}
+
+AtomBIOS physically exists as a section inside the ROM on R500 graphics units.
+AtomBIOS is notably used for setting PLL/pixel clock frequencies and display
+resolutions, among several other functions.
+
+The Radeon graphics hardware itself does not execute AtomBIOS code--instead, it
+is expected that the host (e.g: x86) CPU evaluate the instructions in the
+AtomBIOS command tables. Generally the outcome of evaluating AtomBIOS code is
+that several ``register write'' instructions will be executed, changing the
+state of the graphics unit.
+
+My original goal in studying AtomBIOS was that I thought I would need it to set
+up the R500 display controller to a reasonable state (as a prerequisite for
+drawing 3D graphics). However, after actually experimenting with ``disable VGA
+mode'', I currently believe that I don't actually need to implement
+resolution/mode changes, and can proceed without it.
+
+\subsection{PIO mode}
+
+The Linux kernel exclusively communicates with R500 via ``PCI bus mastering''.
+A ``ring buffer'' is allocated in ``GTT'' space, which from the graphics unit's
+perspective exists in the same address space as framebuffer memory, but is an
+address that is outside the framebuffer memory that physically exists.
+
+I also observed via debugfs that the GTT apparently involves some sort of sparse
+page mapping, but I don't understand how this works from an x86 perspective.
+
+In the absence of an understanding of how to make my own ``GTT'' address space,
+I attempted to operate the R500 in ``PIO'' mode. This has the advantage of being
+able to simply write to registers via (simple) PCI memory-mapped accesses, but
+it has the disadvantage that Linux doesn't use R500 this way, so I have no
+reference implementation for how PIO mode should be used.
+
+\subsection{Triangle drawing attempt \#1}
+
+I translated my \href{mesa/glDrawArrays.txt}{glDrawArrays notes} to
+\href{https://git.idk.st/bilbo/r500/src/commit/b6472e4c16946f44e02d82f31adaa411df009c67/pci/triangle.c}{equivalent
+ register writes}.
+
+This does not work, and I don't yet understand why. The main issue is that most
+of the time when I execute that code, Linux appears to ``hang'' completely, and
+my ``printk'' messages are never sent over ssh. On the rare occasion when the
+``hang'' does not occur, a triangle is nevertheless not drawn on the
+framebuffer.
+
+I have a few ideas for how to proceed:
+
+\begin{itemize}
+\item Move the ``triangle.c'' register accesses to userspace via
+ \texttt{/sys/bus/pci}, which might improve debuggability
+\item Abandon the ``write a kernel module'' idea completely, and instead
+ interact with the R500 via \href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/radeon_drv.c#L565-L577}{radeon DRM ioctls}
+\end{itemize}
+
+The latter is perhaps both the most attractive, and the most work. I currently
+don't have any understanding of GEM buffers, radeon buffer objects, etc., so
+I'd need to study these in more detail.
+
+\section{Progress: 14 Oct 2025}
+
+From 08 Oct 2025 to 14 Oct 2025, I achieved the following:
+
+\begin{itemize}
+\item I studied how Mesa interacts with the \texttt{radeon} kernel module via
+ \texttt{DRM\_RADEON\_} ioctls.
+\item I wrote simple R500 \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs/pvs_disassemble.py}{vertex shader} and \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs/us_disassemble.py}{pixel shader} disassemblers.
+\item I wrote a \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs/parse_packets.py}{tool} to print R500 ``PM4'' packets in human-readable form.
+\item I laboriously \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs/bits}{copied and reformatted} all bit definitions from \href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf}
+\item I wrote \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs}{several other miscellaneous tools} related to register and bit parsing and manipulation.
+\item I wrote two \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/single_color.c}{humble} \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/vertex_color.c}{demos} to draw a triangle on R500.
+\end{itemize}
+
+\subsection{Radeon DRM}
+
+As implied in the last update, primarily due to my lack of experience with
+bare-metal x86, I decided it would be a better approach to interact with R500
+Command Processor via the \texttt{radeon} kernel module, which provides a
+partially reasonable interface for this via the \texttt{DRM\_RADEON\_CS} ioctl.
+
+All \texttt{DRM\_RADEON\_} ioctls are mostly or entirely undocumented. Instead,
+I built debugging symbols for Mesa and other supporting libraries so that I
+could set breakpoints in GDB to observe what sequences of \texttt{DRM\_RADEON\_}
+ioctls Mesa uses.
+
+From my previous \href{mesa/glDrawArrays.txt}{glDrawArrays notes} observations,
+I noticed this strange sequence:
+
+\begin{verbatim}
+0x0000138a // type 0 packet, count=0, starting offset = RB3D_COLOROFFSET0
+0x00000000 // RB3D_COLOROFFSET0 = 0
+0xc0001000 // type 3 packet, count=0, opcode=NOP
+0x00000000 // zero (meaningless data)
+\end{verbatim}
+
+At first, it seemed Mesa was deliberately setting the colorbuffer write address
+to (VRAM address) zero, which seemed like a strange choice considering I am
+debugging an X11/GLX OpenGL application--surely the colorbuffer address would be
+some non-zero value several megabytes after the beginning of VRAM.
+
+I later attempted to send my own PM4 packet via \texttt{DRM\_RADEON\_CS}. This
+initial attempt returned \texttt{Invalid argument}, with the following
+message in dmesg:
+
+\begin{verbatim}
+[ 1205.978993] [drm:radeon_cs_packet_next_reloc [radeon]] *ERROR* No packet3 for relocation for packet at 14.
+[ 1205.979427] [drm] ib[14]=0x0000138E
+[ 1205.979433] [drm] ib[15]=0x00C00640
+[ 1205.979437] [drm:r300_packet0_check [radeon]] *ERROR* No reloc for ib[13]=0x4E28
+[ 1205.979545] [drm] ib[12]=0x0000138A
+[ 1205.979548] [drm] ib[13]=0x00000000
+[ 1205.979553] [drm:radeon_cs_ioctl [radeon]] *ERROR* Invalid command stream !
+\end{verbatim}
+
+This error message comes from
+\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r300.c#L664-L669}{drm/radeon/r300.c}.
+
+The meaningless data following the type-3 NOP packet is used by the kernel to
+\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/radeon_cs.c#L875-L889}{index}
+the \texttt{DRM\_RADEON\_CS} ``relocs'' array (an array of GEM buffer handles).
+
+It seems perhaps the design goal was to never expose the VRAM address of GEM
+buffers to userspace (indeed there seems to be no way to retrieve that via any
+GEM ioctls). This restriction is slightly disappointing, as I would have
+preferred to be able to send unmodified packet data to the R500.
+
+However, at the moment this does not appear to be a significant issue, as a
+relatively small number of registers are modified by the Linux kernel's packet
+parser prior to creating the indirect buffer that is actually sent to the R500
+hardware.
+
+\subsection{Indirect buffers}
+
+There appears to be a lot of memory-to-memory copying in the
+Linux/Mesa/DRM/GEM/radeon graphics stack:
+
+\begin{itemize}
+\item Mesa writes the OpenGL state to various internal structures
+\item Mesa \href{https://gitlab.freedesktop.org/mesa/mesa/-/blob/25.0/src/gallium/drivers/r300/r300_emit.c?ref_type=heads}{copies} OpenGL state to packet commands in a userspace buffer
+\item Mesa
+ \href{https://gitlab.freedesktop.org/mesa/mesa/-/blob/25.0/src/gallium/winsys/radeon/drm/radeon_drm_cs.c?ref_type=heads#L486-487}{passes
+ the address} of the userspace buffer to the kernel via
+ \texttt{DRM\_RADEON\_CS}
+\item Linux
+ \href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/radeon_cs.c#L340-L358}{copies
+ the entire userspace buffer} to kernel space (calling kvmalloc/kvfree on
+ each ioctl)
+\item The \texttt{radeon\_cs\_parser} parses and modifies the buffer originally
+ generated by Mesa
+\item \href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/radeon_cs.c#L613}{radeon\_cs\_ib\_fill} copies the parser result to gpu address space.
+\end{itemize}
+
+Eventually,
+\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r100.c#L3709-L3722}{r100\_ring\_ib\_execute}
+is called, which writes the indirect buffer address (now in GPU address space)
+to the ring.
+
+It would be interesting to experiment with writing a packet buffer directly in
+GPU/GTT address space (from Linux userspace), with zero copies. This would
+require an entirely new set of ioctls.
+
+\subsection{Triangle drawing attempt \#2}
+
+These images were never drawn on-screen. I extracted them from VRAM via
+\texttt{/sys/kernel/debug/radeon\_vram}.
+
+\begin{figure}
+ \href{images/single_color_macrotiled.png}{\includegraphics{images/single_color_macrotiled.png}}
+ \caption*{R500 framebuffer capture, \texttt{single\_color.c}}
+\end{figure}
+
+Though I was not aware of it yet, the above image was indeed my triangle, and
+\texttt{COLORPITCH0} was merely in ``macrotiled'' mode. Once I realized this, I
+produced this image (still in off-screen VRAM):
+
+\begin{figure}
+ \href{images/single_color.png}{\includegraphics{images/single_color.png}}
+ \caption*{R500 framebuffer capture, \texttt{single\_color.c}}
+\end{figure}
+
+This \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/single_color.c}{``single color''} demo deliberately uses the very simple vertex and fragment
+shaders:
+
+\begin{figure}
+\begin{verbatim}
+instruction[0]:
+ 0x00f00203 dst: VE_ADD out[0].xyzw
+ 0x00d10001 src0: input[0].xyzw
+ 0x01248001 src1: input[0].0000
+ 0x01248001 src2: input[0].0000
+\end{verbatim}
+\caption*{R500 vertex shader (1 instruction, 128-bit control word)}
+\end{figure}
+
+This vertex shader is doing the equivalent of:
+
+\begin{figure}
+ \href{verbatim/vertex_shader_equivalent_single_color.glsl}{\includegraphics{verbatim/output/vertex_shader_equivalent_single_color.glsl.pdf}}
+\end{figure}
+
+The W component \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/single_color.c#L339}{comes from}
+\texttt{VAP\_PROG\_STREAM\_CNTL\_EXT\_\_SWIZZLE\_SELECT\_W\_0(5)}, which
+swizzles W to a constant \texttt{1.0}, despite W not being present in the vertex
+data.
+
+\begin{figure}
+\begin{verbatim}
+instruction[0]:
+ 0x00078005 OUT RGBA
+ 0x08020080 RGB ADDR0=0.0 ADDR1=0.0 ADDR2=0.0
+ 0x08020080 ALPHA ADDR0=0.0 ADDR1=0.0 ADDR2=0.0
+ 0x1c9b04d8 RGB_SEL_A=src0.110 RGB_SEL_B=src0.110 TARGET=A
+ 0x1c810003 ALPHA_OP=OP_MAX ALPHA_SEL_A=src0.0 ALPHA_SEL_B=src0.0 TARGET=A
+ 0x00000005 RGB_OP=OP_MAX
+\end{verbatim}
+\caption*{R500 fragment shader (1 instruction, 192-bit control word)}
+\end{figure}
+
+This fragment shader is doing the equivalent of:
+
+\begin{figure}
+ \href{verbatim/fragment_shader_equivalent_single_color.glsl}{\includegraphics{verbatim/output/fragment_shader_equivalent_single_color.glsl.pdf}}
+\end{figure}
+
+via the src swizzles. I think it is interesting that there are so many options
+for producing inline constants within the fragment shader.
+
+The ``target'' fragment shader field also seems interesting. I am excited to
+write shaders that use multiple output buffers.
+
+\subsection{DRM/KMS/GBM}
+
+These renders were not displayed on-screen, so I looked for ways to correct
+this.
+
+Perhaps the most obvious method would be to write to the display controller
+registers (\texttt{D1GRPH\_PRIMARY\_SURFACE\_ADDRESS}) via
+\texttt{DRM\_RADEON\_CS}. However, this does not work due to the command parser
+anti-fun implemented in
+\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r300.c#L643}{r300\_packet0\_check}:
+any register not present in that case statement is considered invalid, and the
+packet buffer is not submitted.
+
+I attempted to do this the ``right way'' via the DRM/KMS/GBM APIs. I then
+learned that this does not behave correctly on my R500 because demos that wait
+for the flag returned by \texttt{DRM\_IOCTL\_MODE\_PAGE\_FLIP} hang forever.
+
+I noticed this earlier on Xorg/GLX as well, as I have been using the
+\texttt{vblank\_mode=0} environment variable to avoid hanging forever in
+\texttt{glXSwapBuffers}. This appears to be a Linux kernel bug, but I didn't
+investigate this further.
+
+\subsection{On-screen drawing}
+
+I noticed in \texttt{/sys/kernel/debug/radeon\_vram\_mm} that the Linux console
+is only using a single framebuffer (and does not double-buffer).
+
+This is fortunate, because this means I can simply
+\href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/pci_user/main.c#L48}{mmap
+ the register address space} and write
+\texttt{D1GRPH\_PRIMARY\_SURFACE\_ADDRESS} myself without worrying about the
+Linux console overwriting my change. I observed the \texttt{0x813000} value from
+\texttt{/sys/kernel/debug/radeon\_vram\_mm}--there appears to be no other way to
+get the vram address of a GEM buffer.
+
+This is ``good enough'' for now, though at some point I'll want to learn how to
+do proper vblank-synchronized double buffering.
+
+\subsection{Triangle drawing attempt \#3}
+
+I felt the next logical step was to learn how attributes and constants are
+passed through the shader pipeline, so I then \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/vertex_color.c}{created a demo} that produced this image (this time also displayed on-screen):
+
+\begin{figure}
+ \href{images/vertex_color.png}{\includegraphics{images/vertex_color.png}}
+ \caption*{R500 framebuffer capture, \texttt{vertex\_color.c}}
+\end{figure}
+
+\begin{figure}
+\begin{verbatim}
+instruction[0]:
+ 0x00702203 dst: VE_ADD out[1].xyz_
+ 0x01d10021 src0: input[1].xyz_
+ 0x01248021 src1: input[1].0000
+ 0x01248021 src2: input[1].0000
+instruction[1]:
+ 0x00f00203 dst: VE_ADD out[0].xyzw
+ 0x01510001 src0: input[0].xyz1
+ 0x01248001 src1: input[0].0000
+ 0x01248001 src2: input[0].0000
+\end{verbatim}
+\caption*{R500 vertex shader (2 instructions, 128-bit control words)}
+\end{figure}
+
+This vertex shader is doing the equivalent of:
+
+\begin{figure}
+ \href{verbatim/vertex_shader_equivalent_vertex_color.glsl}{\includegraphics{verbatim/output/vertex_shader_equivalent_vertex_color.glsl.pdf}}
+\end{figure}
+
+The extra vertex input is fed to the vertex shader via changes to
+\texttt{VAP\_PROG\_STREAM\_CNTL\_0},
+\texttt{VAP\_PROG\_STREAM\_CNTL\_EXT\_0}. Based on my currently limited
+understanding, it seems that arranging the vertex data like this:
+
+\begin{figure}
+ \href{verbatim/vap_prog_stream_vertices.c}{\includegraphics{verbatim/output/vap_prog_stream_vertices.c.pdf}}
+\end{figure}
+
+Is easier to deal with in \texttt{VAP\_PROG\_STREAM\_CNTL} than:
+
+\begin{figure}
+ \href{verbatim/vap_prog_stream_vertices2.c}{\includegraphics{verbatim/output/vap_prog_stream_vertices2.c.pdf}}
+\end{figure}
+
+\begin{figure}
+\begin{verbatim}
+instruction[0]:
+ 0x00078005 OUT RGBA
+ 0x08020000 RGB ADDR0=temp[0] ADDR1=0.0 ADDR2=0.0
+ 0x08020080 ALPHA ADDR0=0.0 ADDR1=0.0 ADDR2=0.0
+ 0x1c440220 RGB_SEL_A=src0.rgb RGB_SEL_B=src0.rgb TARGET=A
+ 0x1cc18003 ALPHA_OP=OP_MAX ALPHA_SEL_A=src0.1 ALPHA_SEL_B=src0.1 TARGET=A
+ 0x00000005 RGB_OP=OP_MAX
+\end{verbatim}
+\caption*{R500 fragment shader (1 instruction, 192-bit control word)}
+\end{figure}
+
+This fragment shader is doing the equivalent of:
+
+\begin{figure}
+ \href{verbatim/fragment_shader_equivalent_vertex_color.glsl}{\includegraphics{verbatim/output/fragment_shader_equivalent_vertex_color.glsl.pdf}}
+\end{figure}
+
+The \texttt{temp} input appears to be written by
+\texttt{VAP\_OUT\_VTX\_FMT\_0\_\_VTX\_COLOR\_0\_PRESENT} and read due to the
+changes to \texttt{RS\_COUNT} and \texttt{RS\_INST\_0}.
+
+\section{Progress: 21 Oct 2025}
+
+From 15 Oct 2025 to 21 Oct 2025, I achieved the following (roughly in chronological order):
+
+\begin{itemize}
+\item I learned how the vertex fetcher is \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/vertex_color_aos.c#L387-L401}{configured}
+\item I learned how the ``point list'' drawing primitive can be used to \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear.c#L504}{clear the screen}
+\item I invented a new syntax for R500 vertex shader assembly (ATI never specified one themselves)
+\item I modified my R500 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/pvs_disassemble.py}{vertex shader disassembler} to emit this new vertex shader syntax
+\item I wrote a R500 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/assembler/vs}{vertex shader assembler} that can process my vertex shader assembly syntax
+\item I created several animated demos with \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear_translate_vblank.c#L849-L859}{vblank-synchronized double buffering}
+\item I learned how to configure and draw (multi-)textured triangles
+\item I learned how to configure, clear, and use Z-buffers
+\item I made a \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/texture_cube_clear_zwrite_vertex_shader.c}{textured rotating cube demo} that uses my first non-trivial \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/cube_rotate.vs.asm}{handwritten vertex shader assembly program}
+\item I invented a new syntax for R500 fragment shader assembly (ATI never specified one themselves)
+\item I wrote a new R500 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/us_disassemble2.py}{fragment shader disassembler} that emits this new fragment shader syntax
+\item I wrote a R500 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/assembler/fs}{fragment shader assembler} that can process my fragment shader assembly syntax
+\item I wrote a ``shadertoy''-style demo that uses my first non-trivial \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/shadertoy_palette.fs.asm}{handwritten fragment shader assembly program}
+\end{itemize}
+
+\subsection{DRM\_RADEON\_CS state tracking}
+
+While attempting to refactor one of my R500 demos to send fewer registers per
+\texttt{DRM\_RADEON\_CS} ioctl, I found that there is a ``state tracker'' within
+the \texttt{drm/radeon/r100}. For example, even if you don't use or depend on a
+Z-buffer, \texttt{DRM\_RADEON\_CS} will still reject your packet buffer
+depending on its own (imagined) concept of what the GPU state is. For example:
+
+\begin{verbatim}
+[ 1614.729278] [drm:r100_cs_track_check [radeon]] *ERROR* [drm] No buffer for z buffer !
+[ 1614.729626] [drm:radeon_cs_ioctl [radeon]] *ERROR* Invalid command stream !
+\end{verbatim}
+
+This happens because \texttt{track->z\_enabled} is
+\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r100.c#L2435}{initially
+ true} at the start of a \texttt{DRM\_RADEON\_CS} ioctl, and does not become
+false unless the packet buffer
+\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r300.c#L836-L843}{contains
+ a write} to \texttt{ZB\_CNTL}.
+
+This seems a bit heavy-handed. Even if the model were ``multiple applications
+may be using the GPU, so a single application can't depend on previously-set
+register state'', it would still be better if the kernel didn't try to enforce
+this by restricting permissible content of a packet buffer.
+
+\subsection{Vertex transform bypass}
+
+Mesa uses a ``point'' 3D primitive to implement \texttt{glClear} on R500. It
+does this by first uploading this vertex shader:
+
+\begin{figure}
+ \href{verbatim/mesa_glclear.vs.asm}{\includegraphics{verbatim/output/mesa_glclear.vs.asm.pdf}}
+ \caption*{\texttt{mesa\_glclear.vs.asm}}
+\end{figure}
+
+This shader does nothing to the input other than copy it to the output, where
+\texttt{out[0]} is the position vector, and \texttt{out[1]} is sent to the
+fragment shader as a ``texture coordinate''. That fragment shader, in turn, does
+not use the texture coordinate:
+
+\begin{figure}
+ \href{verbatim/mesa_glclear.fs.asm}{\includegraphics{verbatim/output/mesa_glclear.fs.asm.pdf}}
+ \caption*{\texttt{mesa\_glclear.fs.asm}}
+\end{figure}
+
+In my ``clear''
+\href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear_rotate_vblank.c#L539}{implementation},
+I instead set \texttt{PVS\_BYPASS}, which ``bypasses'' the vertex shader
+completely, sending the vertices directly to the rasterizer. This is convenient
+because it obviates the need to upload/change vertex shaders just to clear the
+color and Z-buffers.
+
+\subsection{Animation attempt \#1}
+
+With a working colorbuffer clear, I wrote the
+\href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear_translate.c#L786}{single\_color\_clear\_translate.c}
+demo to translate my triangle position coordinates in a loop that waits for
+\texttt{DRM\_RADEON\_GEM\_WAIT\_IDLE} between each frame. This attempt
+produced the following images:
+
+\begin{figure}
+ \includegraphics{videos/single_color_clear_translate.png}
+ \caption*{R500 DVI capture, \texttt{single\_color\_clear\_translate.c}}
+\end{figure}
+
+This was intended to be a smooth animation, yet it is not. It also seems several
+frames are never being displayed--the translation step is much smaller than what
+is shown in the video.
+
+This, interestingly, is exactly identical to how OpenGL/GLX applications behave
+on R500 with \texttt{vblank\_mode=0}.
+
+\subsection{Animation attempt \#2}
+
+I read the R500 display controller \href{doc/RRG-216M56-03oOEM.pdf}{register reference guide} again.
+It appears to suggest the \texttt{D1CRTC\_UPDATE\_INSTANTLY} bit, when unset,
+might cause changes to \texttt{D1GRPH\_PRIMARY\_SURFACE\_ADDRESS} to be delayed in
+hardware until the next vertical blanking interval begins.
+
+This can be combined with polling \texttt{D1GRPH\_SURFACE\_UPDATE\_PENDING} to
+later determine when the vblank-synchronized frame change actually occurred.
+
+This is precisely what I implemented in
+\href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear_translate_vblank.c#L854-L855}{single\_color\_clear\_translate\_vblank.c}:
+
+\begin{figure}
+ \includegraphics{videos/single_color_clear_translate_vblank.png}
+ \caption*{R500 DVI capture, \texttt{single\_color\_clear\_translate\_vblank.c}}
+\end{figure}
+
+This is much closer to what I intended. The
+\texttt{D1GRPH\_SURFACE\_UPDATE\_PENDING} part is certainly working as I
+expected. Setting/unsetting \texttt{D1CRTC\_UPDATE\_INSTANTLY} appears to have
+no effect on \texttt{D1GRPH\_PRIMARY\_SURFACE\_ADDRESS} behavior, so I feel my
+understanding of R500 double-buffering is still incomplete.
+
+\subsection{Multiple-texture sampling}
+
+I am amazed and delighted at how simple multiple-texture sampling is on R500.
+
+As a counter-example, while Sega Dreamcast does have a fairly capable
+fixed-function blending unit, to use the blending unit with multiple-texture
+sampled polygons one needs to render the polygon multiple times (at least once
+per texture) to an accumulation buffer. Blending is then performed between the
+currently-sampled texture and the previously-accumulated result, and the blend
+result is written to the accumulation buffer. From a vertex transformation
+perspective, it can be inconvenient/inefficient to be required to buffer entire
+triangle strips so that they can be submitted more than once per frame without
+duplicating the clip/transform computations.
+
+This is the fragment shader for
+\href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/texture_dual.c}{texture\_dual.c}
+(disassembly of code originally generated by Mesa):
+
+\begin{figure}
+ \href{verbatim/texture_dual.fs.asm}{\includegraphics{verbatim/output/texture_dual.fs.asm.pdf}}
+ \caption*{\texttt{texture\_dual.fs.asm}}
+\end{figure}
+
+This pre-subtract multiply-add is an algebraic rearrangement of this GLSL code:
+
+\begin{figure}
+ \href{verbatim/texture_dual.fs.glsl}{\includegraphics{verbatim/output/texture_dual.fs.glsl.pdf}}
+ \caption*{\texttt{texture\_dual.fs.glsl}}
+\end{figure}
+
+Which produces this image:
+
+\begin{figure}
+ \href{images/texture_dual.png}{\includegraphics{images/texture_dual.png}}
+ \caption*{R500 framebuffer capture, \texttt{texture\_dual.c}}
+\end{figure}
+
+Being able to manipulate the texture samples as fragment shader unit temporaries
+rather than as a sequence of accumulation buffer operations has me feeling excited
+to do more with this.
+
+\subsection{Z-buffer clear}
+
+I've never worked with traditional Z-buffers before--Sega Saturn uses
+\href{https://en.wikipedia.org/wiki/Painter\%27s_algorithm}{painter's algorithm}
+exclusively, and Sega Dreamcast uses a ``depth accumulation buffer''
+that isn't directly readable/writable.
+
+It is slightly obvious in retrospect, but it took me several minutes to realize
+that a ``depth clear'' can be implemented by covering the entire screen with a
+``point'' primitive with the desired initial depth while \texttt{ZFUNC} is set to
+\texttt{ALWAYS}.
+
+\subsection{Drawing a 3D cube}
+
+With working double-buffering, Z-buffering, and the ability to clear each of
+these every frame, I felt I was finally ready to draw something ``3D''.
+
+I thought it would be fun to first start with a cube that is transformed in
+``software'' on the x86 CPU (not using a vertex shader). This sequence of videos
+shows my progression on implementing this:
+
+\begin{figure}
+ \includegraphics{videos/texture_cube.png}
+ \caption*{R500 DVI capture, \texttt{texture\_cube.c}}
+\end{figure}
+
+\begin{figure}
+ \includegraphics{videos/texture_cube_clear.png}
+ \caption*{R500 DVI capture, \texttt{texture\_cube\_clear.c}}
+\end{figure}
+
+\begin{figure}
+ \includegraphics{videos/texture_cube_clear_zwrite.png}
+ \caption*{R500 DVI capture, \texttt{texture\_cube\_clear\_zwrite.c}}
+\end{figure}
+
+\subsection{Drawing a 3D cube with vertex shaders}
+
+I then decided it would be fun to hand-write a ``3D rotation'' vertex shader
+from scratch. I first implemented the rotation in GLSL:
+
+\begin{figure}
+ \href{verbatim/cube_rotate.vs.glsl}{\includegraphics{verbatim/output/cube_rotate.vs.glsl.pdf}}
+ \caption*{\texttt{cube\_rotate.vs.glsl}}
+\end{figure}
+
+I verified that the GLSL version worked as expected in OpenGL, then I translated
+the GLSL to R500 vertex shader assembly, as:
+
+\begin{figure}
+ \href{verbatim/cube_rotate.vs.asm}{\includegraphics{verbatim/output/cube_rotate.vs.asm.pdf}}
+ \caption*{\texttt{cube\_rotate.vs.asm}}
+\end{figure}
+
+However, when I first executed the vertex shader cube rotation demo, I found
+it did not work as expected:
+
+\begin{figure}
+ \includegraphics{videos/texture_cube_clear_zwrite_vertex_shader_incorrect.png}
+ \caption*{R500 DVI capture, \texttt{texture\_cube\_clear\_zwrite\_vertex\_shader.c}\\(incorrect vertex shader assembler output)}
+\end{figure}
+
+After hours of debugging, I eventually found the issue was in this instruction:
+
+\begin{figure}
+ \href{verbatim/cube_rotate_3_temp.vs.asm}{\includegraphics{verbatim/output/cube_rotate_3_temp.vs.asm.pdf}}
+\end{figure}
+
+\href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf} briefly mentions this on pages 98 and 99:
+
+\begin{quote}
+The PVS\_DST\_MACRO\_INST bit was meant to be used for MACROS such as a
+vector-matrix multiply, but currently is only set for the following cases:
+
+A VE\_MULTIPLY\_ADD or VE\_MULTIPLYX2\_ADD instruction with all 3 source
+operands using unique PVS\_REG\_TEMPORARY vector addresses. Since R300 only has
+two read ports on the temporary memory, this special case of these instructions
+is broken up (by the HW) into 2 operations.
+\end{quote}
+
+I read this paragraph much earlier, but I didn't fully understand it until
+now. Indeed, this multiply-add has three unique \texttt{temp} addresses, and
+must be encoded as a ``macro'' instruction.
+
+I fixed this in my vertex shader assembler by
+\href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/assembler/vs/validator.py}{counting the number of unique temp addresses}
+referenced by each instruction, promoting \texttt{VE\_MULTIPLY\_ADD} to
+\texttt{PVS\_MACRO\_OP\_2CLK\_MADD} if more than two unique \texttt{temp}
+addresses are referenced.
+
+With this change, reassembling the same vertex shader source code now produces a
+correct vertex shader cube rotation:
+
+\begin{figure}
+ \includegraphics{videos/texture_cube_clear_zwrite_vertex_shader.png}
+ \caption*{R500 DVI capture, \texttt{texture\_cube\_clear\_zwrite\_vertex\_shader.c}\\(correct vertex shader assembler output)}
+\end{figure}
+
+\subsection{Comparison with Mesa's R500 vertex shader compiler}
+
+My ``cube rotation'' vertex shader,
+\href{https://git.idk.st/bilbo/r500/src/commit/50244c7c95/drm/cube_rotate.vs.asm}{cube\_rotate.vs.asm},
+is 15 instructions.
+
+Mesa's R500 vertex shader compiler generated a
+\href{https://git.idk.st/bilbo/r500/src/commit/50244c7c95/shader_examples/mesa/texture_cube_depth_vertex_shader.vs.txt}{27-instruction vertex shader}
+from \href{https://r500.idk.st/verbatim/cube_rotate.vs.glsl}{semantically equivalent GLSL code}. Disassembly:
+
+\begin{figure}
+ \href{verbatim/mesa_cube_rotate.vs.asm}{\includegraphics{verbatim/output/mesa_cube_rotate.vs.asm.pdf}}
+ \caption*{\texttt{mesa\_cube\_rotate.vs.asm}}
+\end{figure}
+
+I was not particularly trying to write concise code, but I find this difference
+in instruction count to be surprising. In general it seems Mesa's R500 vertex
+shader compiler failed to vectorize several operations, and does significantly
+more scalar multiplies and scalar multiply-adds than my implementation.
+
+Ignoring algorithmic improvements (such as lifting the sin/cos calculation to
+x86 code and instead sending a 4x4 matrix to the vertex shader), there is still
+more opportunity for optimization beyond my 15-instruction implementation.
+
+Particularly, the vertex shader unit has a ``dual math'' instruction mode, where
+``vector engine'' (VE\_) and ``math engine'' (ME\_) operations can be executed
+simultaneously in the same instruction. \texttt{cube\_rotate.vs.asm} would
+indeed benefit from such an optimization--most of the \texttt{ME\_SIN} and
+\texttt{ME\_COS} instructions could be interleaved with the \texttt{VE\_MUL} and
+\texttt{VE\_MAD} operations that follow (at significant expense to
+human-readability).
+
+I am curious to see more examples of the difference between Mesa's R500 vertex
+shader compiler output and my own vertex shader assembly.
+
+\subsection{Fragment shader instruction expressiveness}
+
+Compared to the R500 vertex shader instructions, the R500 fragment shader
+instructions are significantly more featureful. This makes it more complex to
+invent a syntax that can fully express the range of operations that an R500
+fragment shader instruction can perform.
+
+A significant difference is where R500 vertex shaders have a single tier of
+operand argument decoding, as in:
+
+\begin{figure}
+ \includegraphics{diagrams/vertex_inputs.svg}
+ \caption*{R500 vertex shader instruction operand inputs (simplified)}
+\end{figure}
+
+While R500 fragment shaders have multiple tiers of operand argument decoding, as
+in:
+
+\begin{figure}
+ \includegraphics{diagrams/fragment_inputs.svg}
+ \caption*{R500 fragment shader instruction operand inputs (simplified)}
+\end{figure}
+
+I've written several \href{https://github.com/buhman/scu-dsp-asm}{nice assemblers}
+for other architectures in the past, but I've never seen any instruction set
+as expressive as R500 fragment shaders.
+
+I attempted to directly reflect this ``multiple tiers of operand argument
+decoding'' in the syntax I invented for fragment shader ALU instructions.
+
+These instructions are also vector instructions: a total of 24 floating point
+input operands and 8 floating point results could be evaluated per instruction.
+
+With this abundance of expressiveness and a relatively high skill ceiling, I'm
+amazed R500 fragment shader assembly isn't more popular in programming
+competitions, general everyday conversation, etc...
+
+\subsection{Fragment shader assembler bugs}
+
+There were two ``I spent a lot of time debugging this'' issues I encountered
+with my fragment shader assembler.
+
+The first was in this code I wrote to draw a fragment shaded circle, as in:
+
+\begin{figure}
+ \href{images/shadertoy_circle.png}{\includegraphics{images/shadertoy_circle.png}}
+ \caption*{R500 framebuffer capture, \texttt{shadertoy\_circle.fs.asm}}
+\end{figure}
+
+However, in an earlier version of my fragment shader assembler, I produced this
+image instead:
+
+\begin{figure}
+ \href{images/shadertoy_circle_incorrect.png}{\includegraphics{images/shadertoy_circle_incorrect.png}}
+ \caption*{R500 framebuffer capture, \texttt{shadertoy\_circle.fs.asm}\\(incorrect assembler output)}
+\end{figure}
+
+In this handwritten fragment shader code:
+
+\begin{figure}
+ \href{verbatim/shadertoy_circle.fs.asm}{\includegraphics{verbatim/output/shadertoy_circle.fs.asm.pdf}}
+ \caption*{\texttt{shadertoy\_circle.fs.asm}}
+\end{figure}
+
+\href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf} says briefly on page 241:
+
+\begin{quote}
+Specifies whether to insert a NOP instruction after this. This would get
+specified in order to meet dependency requirements for the pre-subtract inputs,
+and dependency requirements for src0 of an MDH/MDV instruction.
+\end{quote}
+
+The issue is the pre-subtract input for the \texttt{MAD |srcp.a| src0.1 -src2.a}
+instruction depends on the write to \texttt{temp[0].a} from the immediately
+preceding \texttt{RCP src0.a} instruction--a pipeline hazard.
+
+To fix this, I added support for
+\href{https://git.idk.st/bilbo/r500/commit/fe0684ca5e58ed3be026410812c042e883bdce71}{generating the \texttt{NOP} bit}
+in my fragment shader assembler.
+
+\subsection{More fragment shader assembler bugs}
+
+While trying to produce this image:
+
+\begin{figure}
+ \href{images/shadertoy_palette.png}{\includegraphics{images/shadertoy_palette.png}}
+ \caption*{R500 framebuffer capture, \texttt{shadertoy\_palette.fs.asm}}
+\end{figure}
+
+My fragment shader code instead produced this image:
+
+\begin{figure}
+ \href{images/shadertoy_palette_incorrect.png}{\includegraphics{images/shadertoy_palette_incorrect.png}}
+ \caption*{R500 framebuffer capture, \texttt{shadertoy\_palette.fs.asm}\\(incorrect assembler output)}
+\end{figure}
+
+The issue was simply that in the chaos of all of the other features I was
+implementing for my fragment shader assembler, I
+\href{https://git.idk.st/bilbo/r500/commit/f6a0fc4fab5dee3085dcf4b9a984244bba05d5ca}{forgot to emit the \texttt{ADDRD} bits}.
+
+This meant that while fragment shader code that exclusively uses zero-address
+destinations, such as \texttt{shadertoy\_circle.fs.asm}, appeared to work
+completely correctly, I encountered this bug as soon as I started using non-zero
+addresses such as \texttt{temp[1]} in my fragment shader code.
+
+\subsection{Comparison to Direct3D ``asm''}
+
+Prior to Direct3D 10, Microsoft defined a specification for both
+\href{https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx9-graphics-reference-asm-vs-3-0}{vertex shader assembly} and
+\href{https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx9-graphics-reference-asm-ps-3-0}{fragment shader assembly}.
+
+The Direct3D ``asm'' name is slightly deceptive, however, as the
+\texttt{vs\_3\_0} and \texttt{ps\_3\_0} instruction syntax does not map 1-to-1
+with any hardware that exists.
+
+It would perhaps be more accurate to think of Direct3D's ``asm''
+language and compiler as more analogous to a
+\href{https://en.wikipedia.org/wiki/BASIC}{shader BASIC} than as a true assembly
+language on the same level as ``6502 assembly'', ``Z80 assembly'' and similar.
+
+In contrast, my R500 assembly syntaxes are deliberately/explicitly mapped 1-to-1
+with R500 instructions.
+
+\subsection{Fragment shader animated demo}
+
+\begin{figure}
+ \includegraphics{videos/shadertoy_palette.png}
+ \caption*{R500 DVI capture, \texttt{shadertoy\_palette.fs.asm}}
+\end{figure}
+
+The R500 fragment shader code that I handwrote for this is:
+
+\begin{figure}
+ \href{verbatim/shadertoy_palette.fs.asm}{\includegraphics{verbatim/output/shadertoy_palette.fs.asm.pdf}}
+ \caption*{\texttt{shadertoy\_palette.fs.asm}}
+\end{figure}
+
+The \texttt{float} constants are interesting--they are decoded almost
+identically to the
+\href{https://en.wikipedia.org/wiki/Minifloat#8-bit_(1.4.3)}{8-bit (1.4.3) (bias 7) format shown on Wikipedia},
+except:
+\begin{itemize}
+\item There is no sign bit (the value is always positive--positive values
+ can be swizzled to produce negative operands)
+\item There is no ``zero'' value (zero can instead be obtained via
+ swizzles); the ``all zeros'' bit pattern instead has a value of
+ \texttt{0.0009765625}.
+\item There are no infinite or not-a-number values: a ``15'' exponent is treated
+ as 15.
+\end{itemize}
+
+The exponent/mantissa table that shows example 7-bit float values on page 106 of
+\href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf} is incorrect.
+
+\end{document}
diff --git a/mesa/glDrawArrays.txt b/mesa/glDrawArrays.txt
new file mode 100644
index 0000000..a9439ad
--- /dev/null
+++ b/mesa/glDrawArrays.txt
@@ -0,0 +1,197 @@
+// _mesa_draw_arrays
+
+// r300_draw_vbo
+
+// r300_draw_arrays_immediate
+
+ vertex_size = 3
+ dwords = 13
+
+ // r300_prepare_for_rendering
+ // r300_emit_states
+ // r300_reserve_cs_dwords
+ 389
+ // r300_emit_dirty_state
+ // r300_emit_gpu_flush
+ SC_SCISSOR0 = 0
+ SC_SCISSOR1 = (width - 1), (height - 1) // 600, 600
+ // cb_flush_clean
+ RB3D_DSTCACHE_CTLSTAT = 0xa
+ ZB_ZCACHE_CTLSTAT = 0x3
+ WAIT_UNTIL [0x1720] = RADEON_WAIT_3D_IDLECLEAN
+ // r300_emit_aa_state
+ GB_AA_CONFIG = 0
+ RB3D_AARESOLVE_CTL = 0
+ // r300_emit_fb_state
+ RB3D_CCTL = 16384
+ RB3D_COLOROFFSET0 = 0
+ //OUT_CS_RELOC
+ OUT_CS(0xc0001000); /* PKT3_NOP */ \
+ OUT_CS(0);
+
+ RB3D_COLORPITCH0 = 0xc10640
+ //OUT_CS_RELOC
+ OUT_CS(0xc0001000); /* PKT3_NOP */ \
+ OUT_CS(0);
+
+ ZB_FORMAT = 2
+ ZB_DEPTHOFFSET = 0
+ //OUT_CS_RELOC
+ OUT_CS(0xc0001000); /* PKT3_NOP */ \
+ OUT_CS(4);
+
+ ZB_DEPTHPITCH = 0x30640
+ //OUT_CS_RELOC
+ OUT_CS(0xc0001000); /* PKT3_NOP */ \
+ OUT_CS(4);
+ // r300_emit_hyperz_state
+ ZB_BW_CNTL = 0
+ ZB_DEPTHCLEARVALUE = 0
+ SC_HYPERZ_EN = 0x1c
+ GB_Z_PEQ_CONFIG = 0
+ // r300_emit_ztop_state
+ ZB_ZTOP = 1
+ // r300_emit_dsa_state
+ FG_ALPHA_FUNC = 0
+ ZB_CNTL = 0
+ ZB_ZSTENCILCNTL = 0
+ ZB_STENCILREFMASK = 0
+ ZB_STENCILREFMASK_BF = 0
+ FG_ALPHA_VALUE = 0
+ // r300_emit_blend_state
+ RB3D_ROPCNTL = 0
+ RB3D_BLENDCNTL = 0
+ RB3D_ABLENDCNTL = 0
+ RB3D_COLOR_CHANNEL_MASK = 15
+ RB3D_DITHER_CTL = 0
+ // r300_emit_blend_color_state
+ RB3D_CONSTANT_COLOR_AR = 0
+ RB3D_CONSTANT_COLOR_GB = 0
+ // r300_emit_scissor_state
+ SC_CLIP_0_A = 0, 0
+ SC_CLIP_0_B = 0 - 1, 0 - 1
+ // r300_emit_sample_mask
+ SC_SCREENDOOR = 63 | (63 << 6) | (63 << 12) | (63 << 18)
+ // r300_emit_invariant_state
+ GB_SELECT = 0
+ FG_FOG_BLEND = 0
+ GA_OFFSET = 0
+ SU_TEX_WRAP = 0
+ SU_DEPTH_SCALE = 16777215.0f (0x4b7fffff)
+ SU_DEPTH_OFFSET = 0
+ SC_EDGERULE = 0x2da49525
+ RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD = 0x1010101
+ RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD = 0xfefefefe
+ GA_COLOR_CONTROL_PS3 = 0
+ SU_TEX_WRAP_PS3 = 0
+ // r300_emit_viewport_state
+ VAP_VPORT_XSCALE = 300
+ VAP_VPORT_XOFFSET = 300
+ VAP_VPORT_YSCALE = -300
+ VAP_VPORT_YOFFSET = 300
+ VAP_VPORT_ZSCALE = 0.5
+ VAP_VPORT_ZOFFSET = 0.5
+ VAP_VTE_CNTL = 0x43f
+ // r300_emit_pvs_flush
+ VAP_PVS_STATE_FLUSH_REG = 0
+ // r300_emit_vap_invariant_state
+ VAP_PVS_VTX_TIMEOUT_REG = 0xffff
+ VAP_GB_VERT_CLIP_ADJ = 1.0f (0x3f800000)
+ VAP_GB_VERT_DISC_ADJ = 1.0f (0x3f800000)
+ VAP_GB_HORZ_CLIP_ADJ = 1.0f (0x3f800000)
+ VAP_GB_HORZ_DISC_ADJ = 1.0f (0x3f800000)
+ VAP_PSC_SGN_NORM_CNTL = 0xaaaaaaaa
+ VAP_TEX_TO_COLOR_CNTL = 0
+ // r300_emit_vertex_stream_state
+ VAP_PROG_STREAM_CNTL_0 = 0x2002
+ VAP_PROG_STREAM_CNTL_EXT_0 = 0xfa88
+ // r300_emit_vs_state
+ VAP_PVS_CODE_CNTL_0 = 0
+ VAP_PVS_CODE_CNTL_1 = 0
+ VAP_PVS_VECTOR_INDX_REG = 0
+ VAP_PVS_VECTOR_DATA_REG_128 = (ONE_REG_WR:)
+ {0xf00203, 0xd10001, 0x1248001, 0x1248001}
+ VAP_CNTL = 0xb0055a
+ VAP_PVS_FLOW_CNTL_OPC = 0
+ VAP_PVS_FLOW_CNTL_ADDRS_LW_[0-15] = 0
+ VAP_PVS_FLOW_CNTL_ADDRS_UW_[0-15] = 0
+ VAP_PVS_FLOW_CNTL_LOOP_INDEX_[0-15] = 0
+ // r300_emit_clip_state
+ VAP_PVS_VECTOR_INDX_REG = 0x600
+ VAP_PVS_VECTOR_DATA_REG_128 =
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} (24)
+ // r300_emit_rs_block_state
+ VAP_VTX_STATE_CNTL = 0x5555
+ VAP_VSM_VTX_ASSM [0x2184] = 0x1
+ VAP_OUTPUT_VTX_FMT_0 = 1
+ VAP_OUTPUT_VTX_FMT_1 = 4
+ GB_ENABLE = 0
+ RS_IP_0 = 0x30000000
+ RS_COUNT = 0x40080
+ RS_INST_COUNT = 0
+ RS_INST_0 = 0
+ // r300_emit_rs_state
+ VAP_CNTL_STATUS = 0
+ VAP_CLIP_CNTL = 0xc000
+ GA_POINT_SIZE = 0x60006
+ GA_POINT_MINMAX = 0x60006
+ GA_LINE_CNTL = 0x20006
+ SU_POLY_OFFSET_ENABLE = 0
+ SU_CULL_MODE = 0
+ GA_LINE_STIPPLE_CONFIG = 0
+ GA_LINE_STIPPLE_VALUE = 0
+ GA_POLY_MODE = 0
+ GA_ROUND_MODE = 0x31
+ SC_CLIP_RULE = 0xffff
+ GA_POINT_S0 = 0
+ GA_POINT_T0 = 1.0f (0x3f800000)
+ GA_POINT_S1 = 1.0f (0x3f800000)
+ GA_POINT_T1 = 0
+ // r300_emit_fb_state_pipelined
+ US_OUT_FMT_0 = 0x1b00
+ US_OUT_FMT_1 = 0xf
+ US_OUT_FMT_2 = 0xf
+ US_OUT_FMT_3 = 0xf
+ GB_MSPOS0 = 0x66666666
+ GB_MSPOS1 = 0x6666666
+ // r500_emit_fs
+ US_CONFIG = 2
+ US_PIXSIZE = 1
+ US_FC_CTRL = 0
+ US_CODE_RANGE = 0
+ US_CODE_OFFSET = 0
+ US_CODE_ADDR = 0
+ GA_US_VECTOR_INDEX = 0
+ GA_US_VECTOR_DATA = (ONE_REG_WR:)
+ {0x78005, 0x8020080, 0x8020080, 0x1c9b04d8, 0x1c810003, 0x5}
+ FG_DEPTH_SRC = 0
+ US_W_FMT = 0
+ // r500_emit_fs_rc_constant_state
+ [nothing]
+ // r500_emit_fs_constants
+ [nothing]
+ // r300_emit_vs_constants
+ VAP_PVS_CONST_CNTL = 0
+ // r300_emit_texture_cache_inval
+ TX_INVALTAGS = 0
+ // r300_emit_textures_state
+ TX_ENABLE = 0
+ // r300_emit_query_start
+ [nothing]
+ // r500_emit_index_bias
+ VAP_INDEX_OFFSET = 0
+ // r300_emit_draw_init
+ GA_COLOR_CONTROL = 0x3aaaa
+ VAP_VF_MAX_VTX_INDX = 2
+ VAP_VF_MIN_VTX_INDX = 0
+
+ // r300_draw_arrays_immediate
+ VAP_VTX_SIZE = 3
+
+ [
+ PACKET3_3D_DRAW_IMMD_2 (3 * 3)
+ 0x30034 // VAP_VF_CNTL
+ {0.5, -0.5, 0}
+ {-0.5, -0.5, 0}
+ {0, 0.5, 0}
+ ]
diff --git a/replace_video.py b/replace_video.py
new file mode 100644
index 0000000..37617b1
--- /dev/null
+++ b/replace_video.py
@@ -0,0 +1,26 @@
+import sys
+
+scale = 1.5
+
+def transform():
+ with open(sys.argv[1]) as f:
+ for line in f.readlines():
+ if ""
+ yield f""
+ yield ""
+ else:
+ yield line
+
+lines = list(transform())
+with open(sys.argv[1], 'w') as f:
+ f.write(''.join(lines))
diff --git a/resize_svg.py b/resize_svg.py
new file mode 100644
index 0000000..62b9ab2
--- /dev/null
+++ b/resize_svg.py
@@ -0,0 +1,22 @@
+import sys
+
+scale = 1.5
+
+def transform():
+ with open(sys.argv[1]) as f:
+ for line in f.readlines():
+ if line.strip().startswith("