diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5e7a2b1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,21 @@ +*.html +*.css +*.out +index.pdf +_minted/ +*.aux +*.log +*.4ct +*.4tc +*.dvi +*.idv +*.lg +*.tmp +*.toc +*.xref +*~ +verbatim/*.tex +verbatim/*.svg +verbatim/*.pdf +verbatim/output +images/*.data diff --git a/build.sh b/build.sh new file mode 100644 index 0000000..8492761 --- /dev/null +++ b/build.sh @@ -0,0 +1,28 @@ +set -eux + +rm -f verbatim/output/*.svg + +make4ht --shell-escape index.tex "pic-m,pic-equation,svg" + +echo 'img[alt="PIC"] { width: 100%; }' >> index.css +echo '.cmtt-10 { font-size: 0.9em; }' >> index.css +echo 'img[src="index3x.svg"] { height: 2.5em; }' >> index.css + +sed -i '/prefers-color-scheme/d' index.css +sed -i 's| | |g' index.html +sed -i '/figure.figure/d' index.css +echo 'figure.figure { margin-left: 20px; margin-right: 20px; }' >> index.css +echo 'pre.verbatim { font-size: 0.9em; }' >> index.css +sed -i 's|color-scheme: light dark;||g' index.css +echo 'figcaption.caption { margin-bottom: 1.3em; margin-top: 0.3em; }' >> index.css + +sed -i 's/index.css/index2.css/g' index.html + +mv index.css index2.css + +python replace_video.py index.html + +for file in verbatim/output/*.svg; do + sed -i 's|rgb(0%, 0%, 100%)||g' "$file" + python resize_svg.py "$file" +done diff --git a/deploy.sh b/deploy.sh new file mode 100644 index 0000000..bdbfe13 --- /dev/null +++ b/deploy.sh @@ -0,0 +1 @@ +rsync --delete -arv * root@az1.idk.st:/var/www/r500/ diff --git a/diagrams/fragment_inputs.dot b/diagrams/fragment_inputs.dot new file mode 100644 index 0000000..4b2365c --- /dev/null +++ b/diagrams/fragment_inputs.dot @@ -0,0 +1,61 @@ +digraph D { + graph [ranksep="1" splines=line ordering="in"]; + node [shape=box]; + edge [arrowhead=none]; + + subgraph cluster_W { + addr [shape=none] + + temp + const + float + } + + subgraph cluster_Z { + {rank=same + src [shape=none] + src0 [label="src0"] + src1 [label="src1"] + src2 
[label="src2"] + srcp + } + + } + + subgraph cluster_R { + + {rank=same + opcode [shape=none]; + a [label = "a"]; + b [label = "b"]; + c [label = "c"]; + } + } + + temp:s -> src0:n + temp:s -> src1:n + temp:s -> src2:n + + const:s -> src0:n + const:s -> src1:n + const:s -> src2:n + + float:s -> src0:n + float:s -> src1:n + float:s -> src2:n + + src0:s -> a:n + src1:s -> a:n + src2:s -> a:n + srcp:s -> a:n + + src0:s -> b:n + src1:s -> b:n + src2:s -> b:n + srcp:s -> b:n + + src0:s -> c:n + src1:s -> c:n + src2:s -> c:n + srcp:s -> c:n +} diff --git a/diagrams/fragment_inputs.svg b/diagrams/fragment_inputs.svg new file mode 100644 index 0000000..c459c8c --- /dev/null +++ b/diagrams/fragment_inputs.svg @@ -0,0 +1,205 @@ + + + + + + +D + + +cluster_W + + + +cluster_Z + + + +cluster_R + + + + +addr +addr + + + +temp + +temp + + + +src0 + +src0 + + + +temp:s->src0:n + + + + +src1 + +src1 + + + +temp:s->src1:n + + + + +src2 + +src2 + + + +temp:s->src2:n + + + + +const + +const + + + +const:s->src0:n + + + + +const:s->src1:n + + + + +const:s->src2:n + + + + +float + +float + + + +float:s->src0:n + + + + +float:s->src1:n + + + + +float:s->src2:n + + + + +src +src + + + +a + +a + + + +src0:s->a:n + + + + +b + +b + + + +src0:s->b:n + + + + +c + +c + + + +src0:s->c:n + + + + +src1:s->a:n + + + + +src1:s->b:n + + + + +src1:s->c:n + + + + +src2:s->a:n + + + + +src2:s->b:n + + + + +src2:s->c:n + + + + +srcp + +srcp + + + +srcp:s->a:n + + + + +srcp:s->b:n + + + + +srcp:s->c:n + + + + +opcode +opcode + + + diff --git a/diagrams/vertex_inputs.dot b/diagrams/vertex_inputs.dot new file mode 100644 index 0000000..08564e3 --- /dev/null +++ b/diagrams/vertex_inputs.dot @@ -0,0 +1,36 @@ +digraph D { + graph [ranksep="1" splines=line]; + node [shape=box]; + edge [arrowhead=none]; + + input + const + temp + alt_temp + + opcode [shape=none]; + a [label = "a"]; + b [label = "b"]; + c [label = "c"]; + + subgraph cluster_R { + + {rank=same opcode a b c} + } + + input:s -> a:n + input:s -> b:n 
+ input:s -> c:n + + const:s -> a:n + const:s -> b:n + const:s -> c:n + + temp:s -> a:n + temp:s -> b:n + temp:s -> c:n + + alt_temp:s -> a:n + alt_temp:s -> b:n + alt_temp:s -> c:n +} diff --git a/diagrams/vertex_inputs.svg b/diagrams/vertex_inputs.svg new file mode 100644 index 0000000..726ce47 --- /dev/null +++ b/diagrams/vertex_inputs.svg @@ -0,0 +1,124 @@ + + + + + + +D + + +cluster_R + + + + +input + +input + + + +a + +a + + + +input:s->a:n + + + + +b + +b + + + +input:s->b:n + + + + +c + +c + + + +input:s->c:n + + + + +const + +const + + + +const:s->a:n + + + + +const:s->b:n + + + + +const:s->c:n + + + + +temp + +temp + + + +temp:s->a:n + + + + +temp:s->b:n + + + + +temp:s->c:n + + + + +alt_temp + +alt_temp + + + +alt_temp:s->a:n + + + + +alt_temp:s->b:n + + + + +alt_temp:s->c:n + + + + +opcode +opcode + + + diff --git a/index.tex b/index.tex new file mode 100644 index 0000000..ded3477 --- /dev/null +++ b/index.tex @@ -0,0 +1,905 @@ +\documentclass[20pt]{article} + +\usepackage[font=small,labelfont=bf]{caption} +\usepackage{hyperref} +\hypersetup{ + colorlinks=true, + linkcolor=blue, + filecolor=magenta, + urlcolor=cyan, + pdftitle={Dreamcast}, + pdfpagemode=FullScreen, + } + +\usepackage{graphicx} +\graphicspath{ {./images/} } + +\usepackage{minted} + +\title{Radeon R500} +\date{} + +\begin{document} + +\maketitle +\href{images/x1950xt.jpg}{\includegraphics{images/x1950xt.jpg}} + +\tableofcontents + +\section{Introduction} + +The primary/minimal project goal is "draw a triangle on a Radeon R500 via direct +memory-mapped hardware register and texture memory accesses". This means no +\href{https://mesa3d.org/}{Mesa}, no +\href{https://github.com/torvalds/linux/tree/v6.12/drivers/gpu/drm/radeon}{radeon} +kernel module, and certainly no OpenGL or Direct3D. 
+ +I have worked directly with several other graphics units in the past +(\href{https://github.com/buhman/saturn-examples}{Saturn VDP1}, +\href{https://github.com/buhman/dreamcast}{Dreamcast Holly}, +\href{https://github.com/buhman/voodoo}{Voodoo 2}). In all of these projects, +my strategy is generally: + +\begin{itemize} +\item read the entire \href{doc/R5xx_Acceleration_v1.5.pdf}{reference + documentation} at least once, front-to-back +\item copy all hardware register definitions from the documentation to a + spreadsheet or text file (sometimes typing everything by hand if I am in such + a chill mood) +\item progressively build increasingly-complex example programs that exercise + the hardware +\end{itemize} + +The rabbit hole for R500 seems significantly deeper, considering this is the +first graphics unit I've worked with that has programmable vertex and pixel +shader engines. + +\subsection{Hardware} + +For testing, I currently have this hardware configuration: + +\begin{itemize} +\item ASUS P4B-LX (Intel 845) motherboard +\item Intel Pentium 4 2.6GHz SL6PP (Northwood) +\item 1024 MB RAM +\item 32GB PATA SSD +\item ATI Radeon X1650 PRO 512MB AGP +\end{itemize} + +I also have the X1950 XT PCIe shown in the photo, which amazingly has never been +used, and prior to the photo was sealed in an antistatic bag from manufacture to +now. + +\subsection{Test setup} + +While in my other (video game console) projects I typically insist on +``bare-metal'' development with no operating system or third-party library +running on the target hardware, my experience with x86 is much more limited. + +While it is something I am interested in doing, I believe creating a +zero-dependency ``code upload'' mechanism for an x86-pc that does not depend on +an operating system would severely delay my progress on R500-specific work. + +For my initial exploration of R500, I will instead be manipulating the hardware +primarily from Linux kernel space. 
This Linux kernel code does not actually +meaningfully depend on Linux APIs beyond calling \texttt{ioremap} to get usable +memory mappings for R500 PCI resources (texture/framebuffer memory and +registers). + +\section{Progress: 07 Oct 2025} + +From 01 Oct 2025 to 07 Oct 2025, I achieved the following: + +\begin{itemize} +\item I wrote a reasonably complete AtomBIOS disassembler +\item I can disable (IBM PC) VGA mode and manipulate the native framebuffer +\item I can upload microcode to the ``command processor'', and I can write to + scratch registers via command processor packets (this is uncoincidentally the + same command processor test that the radeon kernel module does). +\item I stepped through Mesa functions as invoked by a simple OpenGL + application, and created \href{mesa/glDrawArrays.txt}{a list of R500 + registers/values} that are written by Mesa during \texttt{glDrawArrays}. +\end{itemize} + +I did not achieve the following: + +\begin{itemize} +\item I attempted to manipulate the R500 register state and command processor + into drawing a triangle, but I have not been successful yet +\end{itemize} + +\subsection{Documentation} + +In general, I note that the R500 documentation is significantly weaker than I +hoped, and does not contain enough information to draw a triangle on the R500 +from the documentation alone (with no prior knowledge about previous Radeon +graphics units). + +In addition to the lack of prose, in several cases I've noticed both Mesa and +Linux reference R500 registers that are +\href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/pci/undocumented_3d_registers.h}{not + present at all} in the documentation. + +\subsection{AtomBIOS} + +AtomBIOS physically exists as a section inside the ROM on R500 graphics units. +AtomBIOS is notably used for setting PLL/pixel clock frequencies and display +resolutions, among several other functions. 
+ +The Radeon graphics hardware itself does not execute AtomBIOS code--instead, it +is expected that the host (e.g: x86) CPU evaluate the instructions in the +AtomBIOS command tables. Generally the outcome of evaluating AtomBIOS code is +that several ``register write'' instructions will be executed, changing the +state of the graphics unit. + +My original goal in studying AtomBIOS was that I thought I would need it to set +up the R500 display controller to a reasonable state (as a prerequisite for +drawing 3D graphics). However, after actually experimenting with ``disable VGA +mode'', I currently believe that I don't actually need to implement +resolution/mode changes, and can proceed without it. + +\subsection{PIO mode} + +The Linux kernel exclusively communicates with R500 via ``PCI bus mastering''. +A ``ring buffer'' is allocated in ``GTT'' space, which from the graphics unit's +perspective exists in the same address space as framebuffer memory, but is an +address that is outside the framebuffer memory that physically exists. + +I also observed via debugfs that the GTT apparently involves some sort of sparse +page mapping, but I don't understand how this works from an x86 perspective. + +In the absence of an understanding of how to make my own ``GTT'' address space, +I attempted to operate the R500 in ``PIO'' mode. This has the advantage of being +able to simply write to registers via (simple) PCI memory-mapped accesses, but +it has the disadvantage that Linux doesn't use R500 this way, so I have no +reference implementation for how PIO mode should be used. + +\subsection{Triangle drawing attempt \#1} + +I translated my \href{mesa/glDrawArrays.txt}{glDrawArrays notes} to +\href{https://git.idk.st/bilbo/r500/src/commit/b6472e4c16946f44e02d82f31adaa411df009c67/pci/triangle.c}{equivalent + register writes}. + +This does not work, and I don't yet understand why. 
The main issue is that most +of the time when I execute that code, Linux appears to ``hang'' completely, and +my ``printk'' messages are never sent over ssh. On the rare occasion when the +``hang'' does not occur, a triangle is nevertheless not drawn on the +framebuffer. + +I have a few ideas for how to proceed: + +\begin{itemize} +\item Move the ``triangle.c'' register accesses to userspace via + \texttt{/sys/bus/pci}, which might improve debuggability +\item Abandon the ``write a kernel module'' idea completely, and instead + interact with the R500 via \href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/radeon_drv.c#L565-L577}{radeon DRM ioctls} +\end{itemize} + +The latter is perhaps both the most attractive, and the most work. I currently +don't have any understanding of GEM buffers, radeon buffer objects, etc., so +I'd need to study these in more detail. + +\section{Progress: 14 Oct 2025} + +From 08 Oct 2025 to 14 Oct 2025, I achieved the following: + +\begin{itemize} +\item I studied how Mesa interacts with the \texttt{radeon} kernel module via + \texttt{DRM\_RADEON\_} ioctls. +\item I wrote simple R500 \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs/pvs_disassemble.py}{vertex shader} and \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs/us_disassemble.py}{pixel shader} disassemblers. +\item I wrote a \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs/parse_packets.py}{tool} to print R500 ``PM4'' packets in human-readable form. +\item I laboriously \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs/bits}{copied and reformatted} all bit definitions from \href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf} +\item I wrote \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs}{several other miscellaneous tools} related to register and bit parsing and manipulation.
+\item I wrote two \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/single_color.c}{humble} \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/vertex_color.c}{demos} to draw a triangle on R500. +\end{itemize} + +\subsection{Radeon DRM} + +As implied in the last update, primarily due to my lack of experience with +bare-metal x86, I decided it would be a better approach to interact with R500 +Command Processor via the \texttt{radeon} kernel module, which provides a +partially reasonable interface for this via the \texttt{DRM\_RADEON\_CS} ioctl. + +All \texttt{DRM\_RADEON\_} ioctls are mostly or entirely undocumented. Instead, +I built debugging symbols for Mesa and other supporting libraries so that I +could set breakpoints in GDB to observe what sequences of \texttt{DRM\_RADEON\_} +ioctls Mesa uses. + +From my previous \href{mesa/glDrawArrays.txt}{glDrawArrays notes} observations, +I noticed this strange sequence: + +\begin{verbatim} +0x0000138a // type 0 packet, count=0, starting offset = RB3D_COLOROFFSET0 +0x00000000 // RB3D_COLOROFFSET0 = 0 +0xc0001000 // type 3 packet, count=0, opcode=NOP +0x00000000 // zero (meaningless data) +\end{verbatim} + +At first, it seemed Mesa was deliberately setting the colorbuffer write address +to (VRAM address) zero, which seemed like a strange choice considering I am +debugging an X11/GLX OpenGL application--surely the colorbuffer address would be +some non-zero value several megabytes after the beginning of VRAM. + +I later attempted to send my own PM4 packet via \texttt{DRM\_RADEON\_CS}. This +initial attempt returned \texttt{Invalid argument}, with the following +message in dmesg: + +\begin{verbatim} +[ 1205.978993] [drm:radeon_cs_packet_next_reloc [radeon]] *ERROR* No packet3 for relocation for packet at 14. 
+[ 1205.979427] [drm] ib[14]=0x0000138E +[ 1205.979433] [drm] ib[15]=0x00C00640 +[ 1205.979437] [drm:r300_packet0_check [radeon]] *ERROR* No reloc for ib[13]=0x4E28 +[ 1205.979545] [drm] ib[12]=0x0000138A +[ 1205.979548] [drm] ib[13]=0x00000000 +[ 1205.979553] [drm:radeon_cs_ioctl [radeon]] *ERROR* Invalid command stream ! +\end{verbatim} + +This error message comes from +\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r300.c#L664-L669}{drm/radeon/r300.c}. + +The meaningless data following the type-3 NOP packet is used by the kernel to +\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/radeon_cs.c#L875-L889}{index} +the \texttt{DRM\_RADEON\_CS} ``relocs'' array (an array of GEM buffer handles). + +It seems perhaps the design goal was to never expose the VRAM address of GEM +buffers to userspace (indeed there seems to be no way to retrieve that via any +GEM ioctls). This restriction is slightly disappointing, as I would have +preferred to be able to send unmodified packet data to the R500. + +However, at the moment this does not appear to be a significant issue, as a +relatively small number of registers are modified by the Linux kernel's packet +parser prior to creating the indirect buffer that is actually sent to the R500 +hardware.
+ +\subsection{Indirect buffers} + +There appears to be a lot of memory-to-memory copying in the +Linux/Mesa/DRM/GEM/radeon graphics stack: + +\begin{itemize} +\item Mesa writes the OpenGL state to various internal structures +\item Mesa \href{https://gitlab.freedesktop.org/mesa/mesa/-/blob/25.0/src/gallium/drivers/r300/r300_emit.c?ref_type=heads}{copies} OpenGL state to packet commands in a userspace buffer +\item Mesa + \href{https://gitlab.freedesktop.org/mesa/mesa/-/blob/25.0/src/gallium/winsys/radeon/drm/radeon_drm_cs.c?ref_type=heads#L486-487}{passes + the address} of the userspace buffer to the kernel via + \texttt{DRM\_RADEON\_CS} +\item Linux + \href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/radeon_cs.c#L340-L358}{copies + the entire userspace buffer} to kernel space (calling kvmalloc/kvfree on + each ioctl) +\item The \texttt{radeon\_cs\_parser} parses and modifies the buffer originally + generated by Mesa +\item \href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/radeon_cs.c#L613}{radeon\_cs\_ib\_fill} copies the parser result to gpu address space. +\end{itemize} + +Eventually, +\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r100.c#L3709-L3722}{r100\_ring\_ib\_execute} +is called, which writes the indirect buffer address (now in GPU address space) +to the ring. + +It would be interesting to experiment with writing a packet buffer directly in +GPU/GTT address space (from Linux userspace), with zero copies. This would +require an entirely new set of ioctls. + +\subsection{Triangle drawing attempt \#2} + +These images were never drawn on-screen. I extracted them from VRAM via +\texttt{/sys/kernel/debug/radeon\_vram}. 
+ +\begin{figure} + \href{images/single_color_macrotiled.png}{\includegraphics{images/single_color_macrotiled.png}} + \caption*{R500 framebuffer capture, \texttt{single\_color.c}} +\end{figure} + +Though I was not aware of it yet, the above image was indeed my triangle, and +\texttt{COLORPITCH0} was merely in ``macrotiled'' mode. Once I realized this, I +produced this image (still in off-screen VRAM): + +\begin{figure} + \href{images/single_color.png}{\includegraphics{images/single_color.png}} + \caption*{R500 framebuffer capture, \texttt{single\_color.c}} +\end{figure} + +This \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/single_color.c}{``single color''} demo deliberately uses the very simple vertex and fragment +shaders: + +\begin{figure} +\begin{verbatim} +instruction[0]: + 0x00f00203 dst: VE_ADD out[0].xyzw + 0x00d10001 src0: input[0].xyzw + 0x01248001 src1: input[0].0000 + 0x01248001 src2: input[0].0000 +\end{verbatim} +\caption*{R500 vertex shader (1 instruction, 128-bit control word)} +\end{figure} + +This vertex shader is doing the equivalent of: + +\begin{figure} + \href{verbatim/vertex_shader_equivalent_single_color.glsl}{\includegraphics{verbatim/output/vertex_shader_equivalent_single_color.glsl.pdf}} +\end{figure} + +The W component \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae//drm/single_color.c#L339}{comes from} +\texttt{VAP\_PROG\_STREAM\_CNTL\_EXT\_\_SWIZZLE\_SELECT\_W\_0(5)}, which +swizzles W to a constant \texttt{1.0}, despite W not being present in the vertex +data. 
+ +\begin{figure} +\begin{verbatim} +instruction[0]: + 0x00078005 OUT RGBA + 0x08020080 RGB ADDR0=0.0 ADDR1=0.0 ADDR2=0.0 + 0x08020080 ALPHA ADDR0=0.0 ADDR1=0.0 ADDR2=0.0 + 0x1c9b04d8 RGB_SEL_A=src0.110 RGB_SEL_B=src0.110 TARGET=A + 0x1c810003 ALPHA_OP=OP_MAX ALPHA_SEL_A=src0.0 ALPHA_SEL_B=src0.0 TARGET=A + 0x00000005 RGB_OP=OP_MAX +\end{verbatim} +\caption*{R500 fragment shader (1 instruction, 192-bit control word)} +\end{figure} + +This fragment shader is doing the equivalent of: + +\begin{figure} + \href{verbatim/fragment_shader_equivalent_single_color.glsl}{\includegraphics{verbatim/output/fragment_shader_equivalent_single_color.glsl.pdf}} +\end{figure} + +via the src swizzles. I think it is interesting that there are so many options +for producing inline constants within the fragment shader. + +The ``target'' fragment shader field also seems interesting. I am excited to +write shaders that use multiple output buffers. + +\subsection{DRM/KMS/GBM} + +These renders were not displayed on-screen, so I looked for ways to correct +this. + +Perhaps the most obvious method would be to write to the display controller +registers (\texttt{D1GRPH\_PRIMARY\_SURFACE\_ADDRESS}) via +\texttt{DRM\_RADEON\_CS}. However, this does not work due to the command parser +anti-fun implemented in +\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r300.c#L643}{r300\_packet0\_check}: +any register not present in that case statement is considered invalid, and the +packet buffer is not submitted. + +I attempted to do this the ``right way'' via the DRM/KMS/GBM APIs. I then +learned that this does not behave correctly on my R500 because demos that wait +for the flag returned by \texttt{DRM\_IOCTL\_MODE\_PAGE\_FLIP} hang forever. + +I noticed this earlier on Xorg/GLX as well, as I have been using the +\texttt{vblank\_mode=0} environment variable to avoid hanging forever in +\texttt{glXSwapBuffers}.
This appears to be a Linux kernel bug, but I didn't +investigate this further. + +\subsection{On-screen drawing} + +I noticed in \texttt{/sys/kernel/debug/radeon\_vram\_mm} that the Linux console +is only using a single framebuffer (and does not double-buffer). + +This is fortunate, because this means I can simply +\href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/pci_user/main.c#L48}{mmap + the register address space} and write +\texttt{D1GRPH\_PRIMARY\_SURFACE\_ADDRESS} myself without worrying about the +Linux console overwriting my change. I observed the \texttt{0x813000} value from +\texttt{/sys/kernel/debug/radeon\_vram\_mm}--there appears to be no other way to +get the vram address of a GEM buffer. + +This is ``good enough'' for now, though at some point I'll want to learn how to +do proper vblank-synchronized double buffering. + +\subsection{Triangle drawing attempt \#3} + +I felt the next logical step was to learn how attributes and constants are +passed through the shader pipeline, so I then \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/vertex_color.c}{created a demo} that produced this image (this time also displayed on-screen): + +\begin{figure} + \href{images/vertex_color.png}{\includegraphics{images/vertex_color.png}} + \caption*{R500 framebuffer capture, \texttt{vertex\_color.c}} +\end{figure} + +\begin{figure} +\begin{verbatim} +instruction[0]: + 0x00702203 dst: VE_ADD out[1].xyz_ + 0x01d10021 src0: input[1].xyz_ + 0x01248021 src1: input[1].0000 + 0x01248021 src2: input[1].0000 +instruction[1]: + 0x00f00203 dst: VE_ADD out[0].xyzw + 0x01510001 src0: input[0].xyz1 + 0x01248001 src1: input[0].0000 + 0x01248001 src2: input[0].0000 +\end{verbatim} +\caption*{R500 vertex shader (2 instructions, 128-bit control words)} +\end{figure} + +This vertex shader is doing the equivalent of + +\begin{figure} + 
\href{verbatim/vertex_shader_equivalent_vertex_color.glsl}{\includegraphics{verbatim/output/vertex_shader_equivalent_vertex_color.glsl.pdf}} +\end{figure} + +The extra vertex input is fed to the vertex shader via changes to +\texttt{VAP\_PROG\_STREAM\_CNTL\_0} and +\texttt{VAP\_PROG\_STREAM\_CNTL\_EXT\_0}. Based on my currently limited +understanding, it seems that arranging the vertex data like this: + +\begin{figure} + \href{verbatim/vap_prog_stream_vertices.c}{\includegraphics{verbatim/output/vap_prog_stream_vertices.c.pdf}} +\end{figure} + +Is easier to deal with in \texttt{VAP\_PROG\_STREAM\_CNTL} than: + +\begin{figure} + \href{verbatim/vap_prog_stream_vertices2.c}{\includegraphics{verbatim/output/vap_prog_stream_vertices2.c.pdf}} +\end{figure} + +\begin{figure} +\begin{verbatim} +instruction[0]: + 0x00078005 OUT RGBA + 0x08020000 RGB ADDR0=temp[0] ADDR1=0.0 ADDR2=0.0 + 0x08020080 ALPHA ADDR0=0.0 ADDR1=0.0 ADDR2=0.0 + 0x1c440220 RGB_SEL_A=src0.rgb RGB_SEL_B=src0.rgb TARGET=A + 0x1cc18003 ALPHA_OP=OP_MAX ALPHA_SEL_A=src0.1 ALPHA_SEL_B=src0.1 TARGET=A + 0x00000005 RGB_OP=OP_MAX +\end{verbatim} +\caption*{R500 fragment shader (1 instruction, 192-bit control word)} +\end{figure} + +This fragment shader is doing the equivalent of: + +\begin{figure} + \href{verbatim/fragment_shader_equivalent_vertex_color.glsl}{\includegraphics{verbatim/output/fragment_shader_equivalent_vertex_color.glsl.pdf}} +\end{figure} + +The \texttt{temp} input appears to be written by +\texttt{VAP\_OUT\_VTX\_FMT\_0\_\_VTX\_COLOR\_0\_PRESENT} and read due to the +changes to \texttt{RS\_COUNT} and \texttt{RS\_INST\_0}.
+ +\section{Progress: 21 Oct 2025} + +From 15 Oct 2025 to 21 Oct 2025, I achieved the following (roughly in chronological order): + +\begin{itemize} +\item I learned how the vertex fetcher is \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/vertex_color_aos.c#L387-L401}{configured} +\item I learned how the ``point list'' drawing primitive can be used to \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear.c#L504}{clear the screen} +\item I invented a new syntax for R500 vertex shader assembly (ATI never specified one themselves) +\item I modified my R500 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/pvs_disassemble.py}{vertex shader disassembler} to emit this new vertex shader syntax +\item I wrote a R500 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/assembler/vs}{vertex shader assembler} that can process my vertex shader assembly syntax +\item I created several animated demos with \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear_translate_vblank.c#L849-L859}{vblank-synchronized double buffering} +\item I learned how to configure and draw (multi-)textured triangles +\item I learned how to configure, clear, and use Z-buffers +\item I made a \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/texture_cube_clear_zwrite_vertex_shader.c}{textured rotating cube demo} that uses my first non-trivial \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/cube_rotate.vs.asm}{handwritten vertex shader assembly program} +\item I invented a new syntax for R500 fragment shader assembly (ATI never specified one themselves) +\item I wrote a new R500 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/us_disassemble2.py}{fragment shader disassembler} that emits this new fragment shader syntax +\item I wrote a R500 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/assembler/fs}{fragment shader assembler} that can process my fragment
shader assembly syntax +\item I wrote a ``shadertoy''-style demo that uses my first non-trivial \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/shadertoy_palette.fs.asm}{handwritten fragment shader assembly program} +\end{itemize} + +\subsection{DRM\_RADEON\_CS state tracking} + +While attempting to refactor one of my R500 demos to send fewer registers per +\texttt{DRM\_RADEON\_CS} ioctl, I found that there is a ``state tracker'' within +the \texttt{drm/radeon/r100} code. For example, even if you don't use or depend on a +Z-buffer, \texttt{DRM\_RADEON\_CS} will still reject your packet buffer +depending on its own (imagined) concept of what the GPU state is. For example: + +\begin{verbatim} +[ 1614.729278] [drm:r100_cs_track_check [radeon]] *ERROR* [drm] No buffer for z buffer ! +[ 1614.729626] [drm:radeon_cs_ioctl [radeon]] *ERROR* Invalid command stream ! +\end{verbatim} + +This happens because \texttt{track->z\_enabled} is +\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r100.c#L2435}{initially + true} at the start of a \texttt{DRM\_RADEON\_CS} ioctl, and does not become +false unless the packet buffer +\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r300.c#L836-L843}{contains + a write} to \texttt{ZB\_CNTL}. + +This seems a bit heavy-handed. Even if the model were ``multiple applications +may be using the GPU, so a single application can't depend on previously-set +register state'', it would still be better if the kernel didn't try to enforce +this by restricting permissible content of a packet buffer. + +\subsection{Vertex transform bypass} + +Mesa uses a ``point'' 3D primitive to implement \texttt{glClear} on R500.
It +does this by first uploading this vertex shader: + +\begin{figure} + \href{verbatim/mesa_glclear.vs.asm}{\includegraphics{verbatim/output/mesa_glclear.vs.asm.pdf}} + \caption*{\texttt{mesa\_glclear.vs.asm}} +\end{figure} + +This shader does nothing to the input other than copy it to the output, where +\texttt{out[0]} is the position vector, and \texttt{out[1]} is sent to the +fragment shader as a ``texture coordinate''. That fragment shader, in turn, does +not use the texture coordinate: + +\begin{figure} + \href{verbatim/mesa_glclear.fs.asm}{\includegraphics{verbatim/output/mesa_glclear.fs.asm.pdf}} + \caption*{\texttt{mesa\_glclear.fs.asm}} +\end{figure} + +In my ``clear'' +\href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear_rotate_vblank.c#L539}{implementation}, +I instead set \texttt{PVS\_BYPASS}, which ``bypasses'' the vertex shader +completely, sending the vertices directly to the rasterizer. This is convenient +because it obviates the need to upload/change vertex shaders just to clear the +color and Z -buffers. + +\subsection{Animation attempt \#1} + +With a working colorbuffer clear, I wrote the +\href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear_translate.c#L786}{single\_color\_clear\_translate.c} +demo to translate my triangle position coordinates in a loop that waits for +\texttt{DRM\_RADEON\_GEM\_WAIT\_IDLE} between each frame. This attempt +produced the following images: + +\begin{figure} + \includegraphics{videos/single_color_clear_translate.png} + \caption*{R500 DVI capture, \texttt{single\_color\_clear\_translate.c}} +\end{figure} + +This was intended to be a smooth animation, yet it is not. It also seems several +frames are never being displayed--the translation step is much smaller than what +is shown in the video. + +This, interestingly, is exactly identical to how OpenGL/GLX applications behave +on R500 with \texttt{vblank\_mode=0}. 
+ +\subsection{Animation attempt \#2} + +I read the R500 display controller \href{doc/RRG-216M56-03oOEM.pdf}{register reference guide} again. +It appears to suggest the \texttt{D1CRTC\_UPDATE\_INSTANTLY} bit, when unset, +might cause changes to \texttt{D1GRPH\_PRIMARY\_SURFACE\_ADDRESS} to be delayed in +hardware until the next vertical blanking interval begins. + +This can be combined with polling \texttt{D1GRPH\_SURFACE\_UPDATE\_PENDING} to +later determine when the vblank-synchronized frame change actually occurred. + +This is precisely what I implemented in +\href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear_translate_vblank.c#L854-L855}{single\_color\_clear\_translate\_vblank.c}: + +\begin{figure} + \includegraphics{videos/single_color_clear_translate_vblank.png} + \caption*{R500 DVI capture, \texttt{single\_color\_clear\_translate\_vblank.c}} +\end{figure} + +This is much closer to what I intended. The +\texttt{D1GRPH\_SURFACE\_UPDATE\_PENDING} part is certainly working as I +expected. Setting/unsetting \texttt{D1CRTC\_UPDATE\_INSTANTLY} appears to have +no effect on \texttt{D1GRPH\_PRIMARY\_SURFACE\_ADDRESS} behavior, so I feel my +understanding of R500 double-buffering is still incomplete. + +\subsection{Multiple-texture sampling} + +I am amazed and delighted how simple multiple-texture sampling is on R500. + +As a counter-example, while Sega Dreamcast does have a fairly capable +fixed-function blending unit, to use the blending unit with multiple-texture +sampled polygons one needs to render the polygon multiple times (at least once +per texture) to an accumulation buffer. Blending is then performed between the +currently-sampled texture and the previously-accumulated result, and the blend +result is written to the accumulation buffer.
From a vertex transformation +perspective, it can be inconvenient/inefficient to be required to buffer entire +triangle strips so that they can be submitted more than once per frame without +duplicating the clip/transform computations. + +This is the fragment shader for +\href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/texture_dual.c}{texture\_dual.c} +(disassembly of code originally generated by Mesa): + +\begin{figure} + \href{verbatim/texture_dual.fs.asm}{\includegraphics{verbatim/output/texture_dual.fs.asm.pdf}} + \caption*{\texttt{texture\_dual.fs.asm}} +\end{figure} + +This pre-subtract multiply-add is an algebraic rearrangement of this GLSL code: + +\begin{figure} + \href{verbatim/texture_dual.fs.glsl}{\includegraphics{verbatim/output/texture_dual.fs.glsl.pdf}} + \caption*{\texttt{texture\_dual.fs.glsl}} +\end{figure} + +Which produces this image: + +\begin{figure} + \href{images/texture_dual.png}{\includegraphics{images/texture_dual.png}} + \caption*{R500 framebuffer capture, \texttt{texture\_dual.c}} +\end{figure} + +Being able to manipulate the texture samples as fragment shader unit temporaries +rather than as a sequence of accumulation buffer operations has me feeling excited +to do more with this. + +\subsection{Z-buffer clear} + +I've never worked with traditional Z-buffers before--Sega Saturn uses +\href{https://en.wikipedia.org/wiki/Painter\%27s_algorithm}{painter's algorithm} +exclusively, and Sega Dreamcast uses a ``depth accumulation buffer'' +that isn't directly readable/writable. + +It is slightly obvious in retrospect, but it took me several minutes to realize +that a ``depth clear'' can be implemented by covering the entire screen with a +``point'' primitive with the desired initial depth while \texttt{ZFUNC} is set to +\texttt{ALWAYS}. + +\subsection{Drawing a 3D cube} + +With working double-buffering, Z-buffering, and the ability to clear each of +these every frame, I felt I was finally ready to draw something ``3D''.
+ +I thought it would be fun to first start with a cube that is transformed in +``software'' on the x86 CPU (not using a vertex shader). This sequence of videos +shows my progression on implementing this: + +\begin{figure} + \includegraphics{videos/texture_cube.png} + \caption*{R500 DVI capture, \texttt{texture\_cube.c}} +\end{figure} + +\begin{figure} + \includegraphics{videos/texture_cube_clear.png} + \caption*{R500 DVI capture, \texttt{texture\_cube\_clear.c}} +\end{figure} + +\begin{figure} + \includegraphics{videos/texture_cube_clear_zwrite.png} + \caption*{R500 DVI capture, \texttt{texture\_cube\_clear\_zwrite.c}} +\end{figure} + +\subsection{Drawing a 3D cube with vertex shaders} + +I then decided it would be fun to hand-write a ``3D rotation'' vertex shader +from scratch. I first implemented the rotation in GLSL: + +\begin{figure} + \href{verbatim/cube_rotate.vs.glsl}{\includegraphics{verbatim/output/cube_rotate.vs.glsl.pdf}} + \caption*{\texttt{cube\_rotate.vs.glsl}} +\end{figure} + +I verified that the GLSL version worked as expected in OpenGL, then I translated +the GLSL to R500 vertex shader assembly, as: + +\begin{figure} + \href{verbatim/cube_rotate.vs.asm}{\includegraphics{verbatim/output/cube_rotate.vs.asm.pdf}} + \caption*{\texttt{cube\_rotate.vs.asm}} +\end{figure} + +However, when I first executed the vertex shader cube rotation demo, I found +it did not work as expected: + +\begin{figure} + \includegraphics{videos/texture_cube_clear_zwrite_vertex_shader_incorrect.png} + \caption*{R500 DVI capture, \texttt{texture\_cube\_clear\_zwrite\_vertex\_shader.c}\\(incorrect vertex shader assembler output)} +\end{figure} + +After hours of debugging, I eventually found the issue was in this instruction: + +\begin{figure} + \href{verbatim/cube_rotate_3_temp.vs.asm}{\includegraphics{verbatim/output/cube_rotate_3_temp.vs.asm.pdf}} +\end{figure} + +\href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf} briefly mentions this on pages 98 and 99: + 
+\begin{quote} +The PVS\_DST\_MACRO\_INST bit was meant to be used for MACROS such as a +vector-matrix multiply, but currently is only set for the following cases: + +A VE\_MULTIPLY\_ADD or VE\_MULTIPLYX2\_ADD instruction with all 3 source +operands using unique PVS\_REG\_TEMPORARY vector addresses. Since R300 only has +two read ports on the temporary memory, this special case of these instructions +is broken up (by the HW) into 2 operations. +\end{quote} + +I read this paragraph much earlier, but I didn't fully understand it until +now. Indeed, this multiply-add has three unique \texttt{temp} addresses, and +must be encoded as a ``macro'' instruction. + +I fixed this in my vertex shader assembler by +\href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/assembler/vs/validator.py}{counting the number of unique temp addresses} +referenced by each instruction, promoting \texttt{VE\_MULTIPLY\_ADD} to +\texttt{PVS\_MACRO\_OP\_2CLK\_MADD} if more than two unique \texttt{temp} +addresses are referenced. + +With this change, reassembling the same vertex shader source code now produces a +correct vertex shader cube rotation: + +\begin{figure} + \includegraphics{videos/texture_cube_clear_zwrite_vertex_shader.png} + \caption*{R500 DVI capture, \texttt{texture\_cube\_clear\_zwrite\_vertex\_shader.c}\\(correct vertex shader assembler output)} +\end{figure} + +\subsection{Comparison with Mesa's R500 vertex shader compiler} + +My ``cube rotation'' vertex shader, +\href{https://git.idk.st/bilbo/r500/src/commit/50244c7c95/drm/cube_rotate.vs.asm}{cube\_rotate.vs.asm} +is 15 instructions. + +Mesa's R500 vertex shader compiler generated a +\href{https://git.idk.st/bilbo/r500/src/commit/50244c7c95/shader_examples/mesa/texture_cube_depth_vertex_shader.vs.txt}{27-instruction vertex shader} +from \href{https://r500.idk.st/verbatim/cube_rotate.vs.glsl}{semantically equivalent GLSL code}. 
Disassembly: + +\begin{figure} + \href{verbatim/mesa_cube_rotate.vs.asm}{\includegraphics{verbatim/output/mesa_cube_rotate.vs.asm.pdf}} + \caption*{\texttt{mesa\_cube\_rotate.vs.asm}} +\end{figure} + +I was not particularly trying to write concise code, but I find this difference +in instruction count to be surprising. In general it seems Mesa's R500 vertex +shader compiler failed to vectorize several operations, and does significantly +more scalar multiplies and scalar multiply-adds than my implementation. + +Ignoring algorithmic improvements (such as lifting the sin/cos calculation to +x86 code and instead sending a 4x4 matrix to the vertex shader), there is still +more opportunity for optimization beyond my 15-instruction implementation. + +Particularly, the vertex shader unit has a ``dual math'' instruction mode, where +``vector engine'' (VE\_) and ``math engine'' (ME\_) operations can be executed +simultaneously in the same instruction. \texttt{cube\_rotate.vs.asm} would +indeed benefit from such an optimization--most of the \texttt{ME\_SIN} and +\texttt{ME\_COS} instructions could be interleaved with the \texttt{VE\_MUL} and +\texttt{VE\_MAD} operations that follow (at significant expense to +human-readability). + +I am curious to see more examples of the difference between Mesa's R500 vertex +shader compiler output and my own vertex shader assembly. + +\subsection{Fragment shader instruction expressiveness} + +Compared to the R500 vertex shader instructions, the R500 fragment shader +instructions are significantly more featureful. This makes inventing a syntax +that can fully express the range of operations that a R500 fragment shader +instruction can do more complex. 
+ +A significant difference is where R500 vertex shaders have a single tier of +operand argument decoding, as in: + +\begin{figure} + \includegraphics{diagrams/vertex_inputs.svg} + \caption*{R500 vertex shader instruction operand inputs (simplified)} +\end{figure} + +While R500 fragment shaders have multiple tiers of operand argument decoding, as +in: + +\begin{figure} + \includegraphics{diagrams/fragment_inputs.svg} + \caption*{R500 fragment shader instruction operand inputs (simplified)} +\end{figure} + +I've written several \href{https://github.com/buhman/scu-dsp-asm}{nice assemblers} +for other architectures in the past, but I've never seen any instruction set +as expressive as R500 fragment shaders. + +I attempted to directly reflect this ``multiple tiers of operand argument +decoding'' in the syntax I invented for fragment shader ALU instructions. + +These instructions are also vector instructions: a total of 24 floating point +input operands and 8 floating point results could be evaluated per instruction. + +With this abundance of expressiveness and a relatively high skill ceiling, I'm +amazed R500 fragment shader assembly isn't more popular in programming +competitions, general everyday conversation, etc... + +\subsection{Fragment shader assembler bugs} + +There were two ``I spent a lot of time debugging this'' issues I encountered +with my fragment shader assembler.
+ +The first was in this code I wrote to draw a fragment shaded circle, as in: + +\begin{figure} + \href{images/shadertoy_circle.png}{\includegraphics{images/shadertoy_circle.png}} + \caption*{R500 framebuffer capture, \texttt{shadertoy\_circle.fs.asm}} +\end{figure} + +However, in an earlier version of my fragment shader assembler, I produced this +image instead: + +\begin{figure} + \href{images/shadertoy_circle_incorrect.png}{\includegraphics{images/shadertoy_circle_incorrect.png}} + \caption*{R500 framebuffer capture, \texttt{shadertoy\_circle.fs.asm}\\(incorrect assembler output)} +\end{figure} + +In this handwritten fragment shader code: + +\begin{figure} + \href{verbatim/shadertoy_circle.fs.asm}{\includegraphics{verbatim/output/shadertoy_circle.fs.asm.pdf}} + \caption*{\texttt{shadertoy\_circle.fs.asm}} +\end{figure} + +\href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf} says briefly on page 241: + +\begin{quote} +Specifies whether to insert a NOP instruction after this. This would get +specified in order to meet dependency requirements for the pre-subtract inputs, +and dependency requirements for src0 of an MDH/MDV instruction. +\end{quote} + +The issue is the pre-subtract input for the \texttt{MAD |srcp.a| src0.1 -src2.a} +instruction depends on the write to \texttt{temp[0].a} from the immediately +preceding \texttt{RCP src0.a} instruction--a pipeline hazard. + +To fix this, I added support for +\href{https://git.idk.st/bilbo/r500/commit/fe0684ca5e58ed3be026410812c042e883bdce71}{generating the \texttt{NOP} bit} +in my fragment shader assembler. 
+ +\subsection{More fragment shader assembler bugs} + +While trying to produce this image: + +\begin{figure} + \href{images/shadertoy_palette.png}{\includegraphics{images/shadertoy_palette.png}} + \caption*{R500 framebuffer capture, \texttt{shadertoy\_palette.fs.asm}} +\end{figure} + +My fragment shader code instead produced this image: + +\begin{figure} + \href{images/shadertoy_palette_incorrect.png}{\includegraphics{images/shadertoy_palette_incorrect.png}} + \caption*{R500 framebuffer capture, \texttt{shadertoy\_palette.fs.asm}\\(incorrect assembler output)} +\end{figure} + +The issue was simply that in the chaos of all of the other features I was +implementing for my fragment shader assembler, I +\href{https://git.idk.st/bilbo/r500/commit/f6a0fc4fab5dee3085dcf4b9a984244bba05d5ca}{forgot to emit the \texttt{ADDRD} bits}. + +This meant that while fragment shader code that exclusively uses zero-address +destinations, such as \texttt{shadertoy\_circle.fs.asm}, appeared to work +completely correctly, I encountered this bug as soon as I started using non-zero +addresses such as \texttt{temp[1]} in my fragment shader code. + +\subsection{Comparison to Direct3D ``asm''} + +Prior to Direct3D 10, Microsoft previously defined a specification for both +\href{https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx9-graphics-reference-asm-vs-3-0}{vertex shader assembly} and +\href{https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx9-graphics-reference-asm-ps-3-0}{fragment shader assembly}. + +The Direct3D ``asm'' name is slightly deceptive, however, as the +\texttt{vs\_3\_0} and \texttt{ps\_3\_0} instruction syntax does not map 1-to-1 +with any hardware that exists. + +It would perhaps be more accurate to think of Direct3D's ``asm'' +language and compiler as more analogous to a +\href{https://en.wikipedia.org/wiki/BASIC}{shader BASIC} than as a true assembly +language on the same level as ``6502 assembly'', ``Z80 assembly'' and similar. 
+ +In contrast, my R500 assembly syntaxes are deliberately/explicitly mapped 1-to-1 +with R500 instructions. + +\subsection{Fragment shader animated demo} + +\begin{figure} + \includegraphics{videos/shadertoy_palette.png} + \caption*{R500 DVI capture, \texttt{shadertoy\_palette.fs.asm}} +\end{figure} + +The R500 fragment shader code that I handwrote for this is: + +\begin{figure} + \href{verbatim/shadertoy_palette.fs.asm}{\includegraphics{verbatim/output/shadertoy_palette.fs.asm.pdf}} + \caption*{\texttt{shadertoy\_palette.fs.asm}} +\end{figure} + +The \texttt{float} constants are interesting--they are decoded almost +identically to the +\href{https://en.wikipedia.org/wiki/Minifloat#8-bit_(1.4.3)}{8-bit (1.4.3) (bias 7) format shown on Wikipedia}, +except: +\begin{itemize} +\item There is no sign bit (the value is always positive--positive values + can be swizzled to produce negative operands) +\item There is no ``zero'' value (zero can also be instead obtained via + swizzles); the ``all zeros'' bit pattern instead has a value of + \texttt{0.0009765625}. +\item There are no infinite or not-a-number values: a ``15'' exponent is treated + as 15. +\end{itemize} + +The exponent/mantissa table that shows example 7-bit float values on page 106 of +\href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf} is incorrect. 
+ +\end{document} diff --git a/mesa/glDrawArrays.txt b/mesa/glDrawArrays.txt new file mode 100644 index 0000000..a9439ad --- /dev/null +++ b/mesa/glDrawArrays.txt @@ -0,0 +1,197 @@ +// _mesa_draw_arrays + +// r300_draw_vbo + +// r300_draw_arrays_immediate + + vertex_size = 3 + dwords = 13 + + // r300_prepare_for_rendering + // r300_emit_states + // r300_reserve_cs_dwords + 389 + // r300_emit_dirty_state + // r300_emit_gpu_flush + SC_SCISSOR0 = 0 + SC_SCISSOR1 = (width - 1), (height - 1) // 600, 600 + // cb_flush_clean + RB3D_DSTCACHE_CTLSTAT = 0xa + ZB_ZCACHE_CTLSTAT = 0x3 + WAIT_UNTIL [0x1720] = RADEON_WAIT_3D_IDLECLEAN + // r300_emit_aa_state + GB_AA_CONFIG = 0 + RB3D_AARESOLVE_CTL = 0 + // r300_emit_fb_state + RB3D_CCTL = 16384 + RB3D_COLOROFFSET0 = 0 + //OUT_CS_RELOC + OUT_CS(0xc0001000); /* PKT3_NOP */ \ + OUT_CS(0); + + RB3D_COLORPITCH0 = 0xc10640 + //OUT_CS_RELOC + OUT_CS(0xc0001000); /* PKT3_NOP */ \ + OUT_CS(0); + + ZB_FORMAT = 2 + ZB_DEPTHOFFSET = 0 + //OUT_CS_RELOC + OUT_CS(0xc0001000); /* PKT3_NOP */ \ + OUT_CS(4); + + ZB_DEPTHPITCH = 0x30640 + //OUT_CS_RELOC + OUT_CS(0xc0001000); /* PKT3_NOP */ \ + OUT_CS(4); + // r300_emit_hyperz_state + ZB_BW_CNTL = 0 + ZB_DEPTHCLEARVALUE = 0 + SC_HYPERZ_EN = 0x1c + GB_Z_PEQ_CONFIG = 0 + // r300_emit_ztop_state + ZB_ZTOP = 1 + // r300_emit_dsa_state + FG_ALPHA_FUNC = 0 + ZB_CNTL = 0 + ZB_ZSTENCILCNTL = 0 + ZB_STENCILREFMASK = 0 + ZB_STENCILREFMASK_BF = 0 + FG_ALPHA_VALUE = 0 + // r300_emit_blend_state + RB3D_ROPCNTL = 0 + RB3D_BLENDCNTL = 0 + RB3D_ABLENDCNTL = 0 + RB3D_COLOR_CHANNEL_MASK = 15 + RB3D_DITHER_CTL = 0 + // r300_emit_blend_color_state + RB3D_CONSTANT_COLOR_AR = 0 + RB3D_CONSTANT_COLOR_GB = 0 + // r300_emit_scissor_state + SC_CLIP_0_A = 0, 0 + SC_CLIP_0_B = 0 - 1, 0 - 1 + // r300_emit_sample_mask + SC_SCREENDOOR = 63 | (63 << 6) | (63 << 12) | (63 << 18) + // r300_emit_invariant_state + GB_SELECT = 0 + FG_FOG_BLEND = 0 + GA_OFFSET = 0 + SU_TEX_WRAP = 0 + SU_DEPTH_SCALE = 16777215.0f (0x4b7fffff) + 
SU_DEPTH_OFFSET = 0 + SC_EDGERULE = 0x2da49525 + RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD = 0x1010101 + RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD = 0xfefefefe + GA_COLOR_CONTROL_PS3 = 0 + SU_TEX_WRAP_PS3 = 0 + // r300_emit_viewport_state + VAP_VPORT_XSCALE = 300 + VAP_VPORT_XOFFSET = 300 + VAP_VPORT_YSCALE = -300 + VAP_VPORT_YOFFSET = 300 + VAP_VPORT_ZSCALE = 0.5 + VAP_VPORT_ZOFFSET = 0.5 + VAP_VTE_CNTL = 0x43f + // r300_emit_pvs_flush + VAP_PVS_STATE_FLUSH_REG = 0 + // r300_emit_vap_invariant_state + VAP_PVS_VTX_TIMEOUT_REG = 0xffff + VAP_GB_VERT_CLIP_ADJ = 1.0f (0x3f800000) + VAP_GB_VERT_DISC_ADJ = 1.0f (0x3f800000) + VAP_GB_HORZ_CLIP_ADJ = 1.0f (0x3f800000) + VAP_GB_HORZ_DISC_ADJ = 1.0f (0x3f800000) + VAP_PSC_SGN_NORM_CNTL = 0xaaaaaaaa + VAP_TEX_TO_COLOR_CNTL = 0 + // r300_emit_vertex_stream_state + VAP_PROG_STREAM_CNTL_0 = 0x2002 + VAP_PROG_STREAM_CNTL_EXT_0 = 0xfa88 + // r300_emit_vs_state + VAP_PVS_CODE_CNTL_0 = 0 + VAP_PVS_CODE_CNTL_1 = 0 + VAP_PVS_VECTOR_INDX_REG = 0 + VAP_PVS_VECTOR_DATA_REG_128 = (ONE_REG_WR:) + {0xf00203, 0xd10001, 0x1248001, 0x1248001} + VAP_CNTL = 0xb0055a + VAP_PVS_FLOW_CNTL_OPC = 0 + VAP_PVS_FLOW_CNTL_ADDRS_LW_[0-15] = 0 + VAP_PVS_FLOW_CNTL_ADDRS_UW_[0-15] = 0 + VAP_PVS_FLOW_CNTL_LOOP_INDEX_[0-15] = 0 + // r300_emit_clip_state + VAP_PVS_VECTOR_INDX_REG = 0x600 + VAP_PVS_VECTOR_DATA_REG_128 = + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} (24) + // r300_emit_rs_block_state + VAP_VTX_STATE_CNTL = 0x5555 + VAP_VSM_VTX_ASSM [0x2184] = 0x1 + VAP_OUTPUT_VTX_FMT_0 = 1 + VAP_OUTPUT_VTX_FMT_1 = 4 + GB_ENABLE = 0 + RS_IP_0 = 0x30000000 + RS_COUNT = 0x40080 + RS_INST_COUNT = 0 + RS_INST_0 = 0 + // r300_emit_rs_state + VAP_CNTL_STATUS = 0 + VAP_CLIP_CNTL = 0xc000 + GA_POINT_SIZE = 0x60006 + GA_POINT_MINMAX = 0x60006 + GA_LINE_CNTL = 0x20006 + SU_POLY_OFFSET_ENABLE = 0 + SU_CULL_MODE = 0 + GA_LINE_STIPPLE_CONFIG = 0 + GA_LINE_STIPPLE_VALUE = 0 + GA_POLY_MODE = 0 + GA_ROUND_MODE = 0x31 + SC_CLIP_RULE = 0xffff + GA_POINT_S0 = 0 + 
GA_POINT_T0 = 1.0f (0x3f800000) + GA_POINT_S1 = 1.0f (0x3f800000) + GA_POINT_T1 = 0 + // r300_emit_fb_state_pipelined + US_OUT_FMT_0 = 0x1b00 + US_OUT_FMT_1 = 0xf + US_OUT_FMT_2 = 0xf + US_OUT_FMT_3 = 0xf + GB_MSPOS0 = 0x66666666 + GB_MSPOS1 = 0x6666666 + // r500_emit_fs + US_CONFIG = 2 + US_PIXSIZE = 1 + US_FC_CTRL = 0 + US_CODE_RANGE = 0 + US_CODE_OFFSET = 0 + US_CODE_ADDR = 0 + GA_US_VECTOR_INDEX = 0 + GA_US_VECTOR_DATA = (ONE_REG_WR:) + {0x78005, 0x8020080, 0x8020080, 0x1c9b04d8, 0x1c810003, 0x5} + FG_DEPTH_SRC = 0 + US_W_FMT = 0 + // r500_emit_fs_rc_constant_state + [nothing] + // r500_emit_fs_constants + [nothing] + // r300_emit_vs_constants + VAP_PVS_CONST_CNTL = 0 + // r300_emit_texture_cache_inval + TX_INVALTAGS = 0 + // r300_emit_textures_state + TX_ENABLE = 0 + // r300_emit_query_start + [nothing] + // r500_emit_index_bias + VAP_INDEX_OFFSET = 0 + // r300_emit_draw_init + GA_COLOR_CONTROL = 0x3aaaa + VAP_VF_MAX_VTX_INDX = 2 + VAP_VF_MIN_VTX_INDX = 0 + + // r300_draw_arrays_immediate + VAP_VTX_SIZE = 3 + + [ + PACKET3_3D_DRAW_IMMD_2 (3 * 3) + 0x30034 // VAP_VF_CNTL + {0.5, -0.5, 0} + {-0.5, -0.5, 0} + {0, 0.5, 0} + ] diff --git a/replace_video.py b/replace_video.py new file mode 100644 index 0000000..37617b1 --- /dev/null +++ b/replace_video.py @@ -0,0 +1,26 @@ +import sys + +scale = 1.5 + +def transform(): + with open(sys.argv[1]) as f: + for line in f.readlines(): + if "PIC" + yield f"" + yield "" + else: + yield line + +lines = list(transform()) +with open(sys.argv[1], 'w') as f: + f.write(''.join(lines)) diff --git a/resize_svg.py b/resize_svg.py new file mode 100644 index 0000000..62b9ab2 --- /dev/null +++ b/resize_svg.py @@ -0,0 +1,22 @@ +import sys + +scale = 1.5 + +def transform(): + with open(sys.argv[1]) as f: + for line in f.readlines(): + if line.strip().startswith("' + yield template + else: + yield line + +lines = list(transform()) +with open(sys.argv[1], 'w') as f: + f.write('\n'.join(lines)) diff --git a/verbatim.sh b/verbatim.sh new file 
mode 100644 index 0000000..4a53c54 --- /dev/null +++ b/verbatim.sh @@ -0,0 +1,72 @@ +set -eux + +cd verbatim/ + +mkdir -p output + +for i in *.asm; do + cat < $i.tex +\documentclass[varwidth=13.1cm, border={0.0cm 0.0cm 0.0cm 0.0cm}]{standalone} +\usepackage{minted} +\setminted[python]{breaklines, linenos, frame=lines, framesep=2mm, fontsize=\huge, numbersep=5pt} +\standaloneenv{minted} +\begin{document} + +\begin{minted}{haskell} +EOF + cat $i >> $i.tex + + cat <> $i.tex +\end{minted} + +\end{document} +EOF + + pdflatex -shell-escape -output-directory=output $i.tex + pdflatex -shell-escape -output-directory=output $i.tex +done + + +for i in *.glsl; do + cat < $i.tex +\documentclass[varwidth=13.1cm, border={0.0cm 0.0cm 0.0cm 0.0cm}]{standalone} +\usepackage{minted} +\setminted[python]{breaklines, linenos, frame=lines, framesep=2mm, fontsize=\huge, numbersep=5pt} +\standaloneenv{minted} +\begin{document} + +\begin{minted}{glsl} +EOF + cat $i >> $i.tex + + cat <> $i.tex +\end{minted} + +\end{document} +EOF + + pdflatex -shell-escape -output-directory=output $i.tex + pdflatex -shell-escape -output-directory=output $i.tex +done + +for i in *.c; do + cat < $i.tex +\documentclass[varwidth=13.1cm, border={0.0cm 0.0cm 0.0cm 0.0cm}]{standalone} +\usepackage{minted} +\setminted[python]{breaklines, linenos, frame=lines, framesep=2mm, fontsize=\huge, numbersep=5pt} +\standaloneenv{minted} +\begin{document} + +\begin{minted}{c} +EOF + cat $i >> $i.tex + + cat <> $i.tex +\end{minted} + +\end{document} +EOF + + pdflatex -shell-escape -output-directory=output $i.tex + pdflatex -shell-escape -output-directory=output $i.tex +done