diff --git a/.gitignore b/.gitignore index 5e7a2b1..37473f1 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,5 @@ verbatim/*.svg verbatim/*.pdf verbatim/output images/*.data +/index*.svg +diagrams/*-.svg \ No newline at end of file diff --git a/build.sh b/build.sh index 8492761..427b17b 100644 --- a/build.sh +++ b/build.sh @@ -15,10 +15,17 @@ echo 'figure.figure { margin-left: 20px; margin-right: 20px; }' >> index.css echo 'pre.verbatim { font-size: 0.9em; }' >> index.css sed -i 's|color-scheme: light dark;||g' index.css echo 'figcaption.caption { margin-bottom: 1.3em; margin-top: 0.3em; }' >> index.css +echo '.cmti-10 { font-style: italic; }' >> index.css -sed -i 's/index.css/index2.css/g' index.html +sed -i 's/˜/~/g' index.html -mv index.css index2.css +sed -i "s|

|

|g" index.html + +sed -i '/height: 2.5em;/d' index.css + +sed -i 's/index.css/index3.css/g' index.html + +mv index.css index3.css python replace_video.py index.html diff --git a/diagrams/build.sh b/diagrams/build.sh new file mode 100644 index 0000000..ff7f228 --- /dev/null +++ b/diagrams/build.sh @@ -0,0 +1,3 @@ +dot -Tsvg z_operations.dot > z_operations.svg + +#sed -i 's/scale(1 1)/scale(0.75 0.75)/g' z_operations.svg diff --git a/diagrams/resize_dot_svg.py b/diagrams/resize_dot_svg.py new file mode 100644 index 0000000..edf8eaf --- /dev/null +++ b/diagrams/resize_dot_svg.py @@ -0,0 +1,31 @@ +import sys + +scale = 0.75 + +def scale_svg(lines): + svg = "".join(lines) + head, viewbox = svg.split("viewBox=\"", maxsplit=1) + viewbox, tail = viewbox.split('"', maxsplit=1) + x, y, width, height = map(float, viewbox.split()) + yield head + yield f'viewBox="{x} {y} {width * scale} {height * scale}"' + yield tail + +def transform(): + with open(sys.argv[1]) as f: + svg_lines = [] + + for line in f.readlines(): + if line.strip().startswith(""): + yield from scale_svg(svg_lines) + svg_lines = [] + else: + yield line + +lines = list(transform()) +with open(sys.argv[1], 'w') as f: + f.write(''.join(lines)) diff --git a/diagrams/sin_clamp.pdf b/diagrams/sin_clamp.pdf new file mode 100644 index 0000000..57006be Binary files /dev/null and b/diagrams/sin_clamp.pdf differ diff --git a/diagrams/sin_clamp.tex b/diagrams/sin_clamp.tex new file mode 100644 index 0000000..6ad7efa --- /dev/null +++ b/diagrams/sin_clamp.tex @@ -0,0 +1,34 @@ +\documentclass[varwidth=13.1cm, border={0.0cm 0.0cm 0.0cm 0.0cm}]{standalone} +\usepackage{tikz} +\usepackage[dvipsnames]{xcolor} +\usepackage{pgfplots} +\pgfplotsset{compat=1.18} +\usepackage{amsmath} +\newcommand{\Clamp}[1]{\operatorname{clamp}#1} + +\begin{document} + +\begin{tikzpicture}[scale=0.5] + + \draw[very thin,color=gray] (-pi * 3,-pi * 1.2) grid (pi * 3, pi * 1.2); + \draw[->] (-3.2*pi,0) -- (3.2*pi,0) node[right] {$x$}; + \draw[->] 
(0,-pi * 1.4) -- (0,pi * 1.5) node[above] {$f(x)$}; + +\draw[thick, color=NavyBlue] plot [domain=-pi * 3:pi * 3, samples=100] (\x, {min(max(\x, -pi), pi)} ); +\draw[thick, color=OrangeRed] plot [domain=-pi * 3:pi * 3, samples=1000] (\x, {sin(min(max(\x, -pi), pi) r)} ); + +\node[NavyBlue] at (0, -5.4) {$f(x) = \Clamp(x, -\pi, +\pi) $}; +\node[OrangeRed] at (0, -6.6) {$f(x) = \sin(\Clamp(x, -\pi, +\pi)) $}; + +\draw [dashed, color=ForestGreen] (-2 * pi,-3.8) -- (-2 * pi,3.8) node[above] {$x=-2\pi$}; +\draw [dashed, color=Brown] (2 * pi,-3.8) -- (2 * pi,3.8) node[above] {$x=2\pi$}; + +\draw [dashed, color=Fuchsia] (-3.0,pi) -- (3.0,pi) ; +\draw [color=Fuchsia] (0, pi + 0.5) node {$y=\pi$}; + +\draw [dashed, color=Peach] (-3.0,-pi) -- (3.0,-pi) ; +\draw [color=Peach] (0, -pi + 0.5) node {$y=-\pi$}; + +\end{tikzpicture} + +\end{document} diff --git a/diagrams/sin_frac.pdf b/diagrams/sin_frac.pdf new file mode 100644 index 0000000..46528b3 Binary files /dev/null and b/diagrams/sin_frac.pdf differ diff --git a/diagrams/sin_frac.tex b/diagrams/sin_frac.tex new file mode 100644 index 0000000..64101a8 --- /dev/null +++ b/diagrams/sin_frac.tex @@ -0,0 +1,38 @@ +\documentclass[varwidth=13.1cm, border={0.0cm 0.0cm 0.0cm 0.0cm}]{standalone} +\usepackage{tikz} +\usepackage[dvipsnames]{xcolor} +\usepackage{pgfplots} +\pgfplotsset{compat=1.18} +\usepackage{amsmath} +\newcommand{\Frac}[1]{\operatorname{frac}#1} + +\begin{document} + +\begin{tikzpicture}[scale=0.5] + + \draw[very thin,color=gray] (-pi * 3,-pi * 1.2) grid (pi * 3, pi * 1.2); + \draw[->] (-3.2*pi,0) -- (3.2*pi,0) node[right] {$x$}; + \draw[->] (0,-pi * 1.4) -- (0,pi * 1.5) node[above] {$f(x)$}; + +\foreach \i in {-2, 0, 2}{ + \pgfmathsetmacro{\start}{(\i - 1) * pi} + \pgfmathsetmacro{\end} {(\i + 1) * pi} + \draw[thick, color=NavyBlue] plot [domain=\start:\end, samples=100] (\x, {((\x * 1/(2 * pi) + 0.5) - floor(\x * 1/(2 * pi) + 0.5)) * 2 * pi - pi} ); +} +\draw[thick, color=OrangeRed] plot [domain=-pi * 3:pi * 3, 
samples=1000] (\x, {sin((((\x * 1/(2 * pi) + 0.5) - floor(\x * 1/(2 * pi) + 0.5)) * 2 * pi - pi) r)} ); + +\node[NavyBlue] at (0, -5.4) {$f(x) = \Frac(x \cdot \frac{1}{2\pi}+0.5) \cdot 2\pi - \pi $}; +\node[OrangeRed] at (0, -6.6) {$f(x) = \sin( \Frac(x \cdot \frac{1}{2\pi}+0.5) \cdot 2\pi - \pi ) $}; + +\draw [dashed, color=ForestGreen] (-2 * pi,-3.8) -- (-2 * pi,3.8) node[above] {$x=-2\pi$}; +\draw [dashed, color=Brown] (2 * pi,-3.8) -- (2 * pi,3.8) node[above] {$x=2\pi$}; + +\draw [dashed, color=Fuchsia] (-3.0,pi) -- (3.0,pi) ; +\draw [color=Fuchsia] (0, pi + 0.5) node {$y=\pi$}; + +\draw [dashed, color=Peach] (-3.0,-pi) -- (3.0,-pi) ; +\draw [color=Peach] (0, -pi + 0.5) node {$y=-\pi$}; + +\end{tikzpicture} + +\end{document} diff --git a/diagrams/z_operations.dot b/diagrams/z_operations.dot new file mode 100644 index 0000000..f8d1e99 --- /dev/null +++ b/diagrams/z_operations.dot @@ -0,0 +1,60 @@ +digraph G { + + vertex_shader [label="(from the vertex shader)"] + + subgraph cluster_clipping { + label = "clipping" + DX_CLIP_SPACE_DEF [label="DX_CLIP_SPACE_DEF +possibly clip the polygon"] + } + + subgraph cluster_perspective { + label = "perspective division" + + VTX_Z_FMT [nojustify=true label="VTX_Z_FMT +(if enabled) divide Z by W"] + } + + subgraph cluster_viewport_transformation { + label = "viewport transformation" + + VPORT_Z_SCALE + VPORT_Z_OFFSET + } + + subgraph cluster_geometry_assembly { + } + + subgraph cluster_setup_unit { + label = "setup unit" + + SU_DEPTH_SCALE + SU_DEPTH_OFFSET + } + + subgraph cluster_zfunc { + label = "ZFUNC" + { rank=same + depth_test [shape=box label="depth test"] + depth_pass [shape=box label="depth pass"] + } + depth_test -> depth_pass + } + + Z_BUFFER [shape=invhouse label="(write the new Z +value to the Z-buffer)"] + + fragment_shader [label="(to the fragment shader)"] + + vertex_shader -> DX_CLIP_SPACE_DEF + DX_CLIP_SPACE_DEF -> VTX_Z_FMT + VTX_Z_FMT -> VPORT_Z_SCALE + VPORT_Z_SCALE -> VPORT_Z_OFFSET + VPORT_Z_OFFSET -> 
SU_DEPTH_SCALE + SU_DEPTH_SCALE -> SU_DEPTH_OFFSET + SU_DEPTH_OFFSET -> depth_test + depth_test -> Z_BUFFER + + VPORT_Z_OFFSET -> depth_pass + depth_pass -> fragment_shader +} \ No newline at end of file diff --git a/diagrams/z_operations.svg b/diagrams/z_operations.svg new file mode 100644 index 0000000..1da76d6 --- /dev/null +++ b/diagrams/z_operations.svg @@ -0,0 +1,173 @@ + + + + + + +G + + +cluster_clipping + +clipping + + +cluster_perspective + +perspective division + + +cluster_viewport_transformation + +viewport transformation + + +cluster_setup_unit + +setup unit + + +cluster_zfunc + +ZFUNC + + + +vertex_shader + +(from the vertex shader) + + + +DX_CLIP_SPACE_DEF + +DX_CLIP_SPACE_DEF +possibly clip the polygon + + + +vertex_shader->DX_CLIP_SPACE_DEF + + + + + +VTX_Z_FMT + +VTX_Z_FMT +(if enabled) divide Z by W + + + +DX_CLIP_SPACE_DEF->VTX_Z_FMT + + + + + +VPORT_Z_SCALE + +VPORT_Z_SCALE + + + +VTX_Z_FMT->VPORT_Z_SCALE + + + + + +VPORT_Z_OFFSET + +VPORT_Z_OFFSET + + + +VPORT_Z_SCALE->VPORT_Z_OFFSET + + + + + +SU_DEPTH_SCALE + +SU_DEPTH_SCALE + + + +VPORT_Z_OFFSET->SU_DEPTH_SCALE + + + + + +depth_pass + +depth pass + + + +VPORT_Z_OFFSET->depth_pass + + + + + +SU_DEPTH_OFFSET + +SU_DEPTH_OFFSET + + + +SU_DEPTH_SCALE->SU_DEPTH_OFFSET + + + + + +depth_test + +depth test + + + +SU_DEPTH_OFFSET->depth_test + + + + + +depth_test->depth_pass + + + + + +Z_BUFFER + +(write the new Z +value to the Z-buffer) + + + +depth_test->Z_BUFFER + + + + + +fragment_shader + +(to the fragment shader) + + + +depth_pass->fragment_shader + + + + + diff --git a/images/cube_scene.png b/images/cube_scene.png new file mode 100644 index 0000000..969039d Binary files /dev/null and b/images/cube_scene.png differ diff --git a/images/plane_scene.png b/images/plane_scene.png new file mode 100644 index 0000000..8bec1b4 Binary files /dev/null and b/images/plane_scene.png differ diff --git a/images/z_buffer_clipped.png b/images/z_buffer_clipped.png new file mode 100644 index 0000000..7b7a175 
Binary files /dev/null and b/images/z_buffer_clipped.png differ diff --git a/images/z_buffer_cube.png b/images/z_buffer_cube.png new file mode 100644 index 0000000..385bf90 Binary files /dev/null and b/images/z_buffer_cube.png differ diff --git a/images/z_buffer_cube_range.png b/images/z_buffer_cube_range.png new file mode 100644 index 0000000..8ddb642 Binary files /dev/null and b/images/z_buffer_cube_range.png differ diff --git a/images/z_buffer_cube_range_back.png b/images/z_buffer_cube_range_back.png new file mode 100644 index 0000000..327cded Binary files /dev/null and b/images/z_buffer_cube_range_back.png differ diff --git a/images/z_buffer_gradient.png b/images/z_buffer_gradient.png new file mode 100644 index 0000000..3fc177d Binary files /dev/null and b/images/z_buffer_gradient.png differ diff --git a/images/z_buffer_overflow.png b/images/z_buffer_overflow.png new file mode 100644 index 0000000..e238e1e Binary files /dev/null and b/images/z_buffer_overflow.png differ diff --git a/images/z_buffer_perspective.png b/images/z_buffer_perspective.png new file mode 100644 index 0000000..d5ba56e Binary files /dev/null and b/images/z_buffer_perspective.png differ diff --git a/images/z_buffer_perspective_scale.png b/images/z_buffer_perspective_scale.png new file mode 100644 index 0000000..93108bb Binary files /dev/null and b/images/z_buffer_perspective_scale.png differ diff --git a/index.tex b/index.tex index ded3477..07f79e1 100644 --- a/index.tex +++ b/index.tex @@ -1,5 +1,6 @@ \documentclass[20pt]{article} +\usepackage{amsmath} \usepackage[font=small,labelfont=bf]{caption} \usepackage{hyperref} \hypersetup{ @@ -15,6 +16,7 @@ \graphicspath{ {./images/} } \usepackage{minted} +\usepackage{nicefrac} \title{Radeon R500} \date{} @@ -28,9 +30,9 @@ \section{Introduction} -The primary/minimal project goal is "draw a triangle on a Radeon R500 via direct -memory-mapped hardware register and texture memory accesses". 
This means no -\href{https://mesa3d.org/}{Mesa}, no +The primary/minimal project goal is ``draw a triangle on a Radeon R500 via +direct memory-mapped hardware register and texture memory accesses''. This means +no \href{https://mesa3d.org/}{Mesa}, no \href{https://github.com/torvalds/linux/tree/v6.12/drivers/gpu/drm/radeon}{radeon} kernel module, and certainly no OpenGL or Direct3D. @@ -661,14 +663,45 @@ from scratch. I first implemented the rotation in GLSL: \caption*{\texttt{cube\_rotate.vs.glsl}} \end{figure} -I verified that the GLSL version worked as expected in OpenGL, then I translated -the GLSL to R500 vertex shader assembly, as: +\subsubsection{Remapping shader unit sin/cos operands} + +Because this shader program depends on being able to calculate sin and cos, this +meant I immediately needed to understand how to use the \texttt{ME\_SIN} and +\texttt{ME\_COS} operations. + +The R500 vertex shader ME unit clamps sin/cos operands to the range +$(-\pi,+\pi)$, as in: + +\begin{figure} + \href{diagrams/sin_clamp.pdf}{\includegraphics{diagrams/sin_clamp.pdf}} +\end{figure} + +``Remapping'' floating point values from $(-\infty,+\infty)$ to $(-\pi,+\pi)$ is not +obvious. 
I was not previously aware of this transformation: + +\begin{figure} + \href{diagrams/sin_frac.pdf}{\includegraphics{diagrams/sin_frac.pdf}} +\end{figure} + +Or, expressed as R500 vertex shader assembly: + +\begin{figure} + \href{verbatim/sin_operand_remap.vs.asm}{\includegraphics{verbatim/output/sin_operand_remap.vs.asm.pdf}} +\end{figure} + +\subsubsection{Translation of the GLSL vertex shader to R500 vertex shader assembly} + +Having verified that the GLSL version works as expected in OpenGL, and knowing +how to use the R500 vertex shader sin/cos operations, then I translated the GLSL +to R500 vertex shader assembly, as: \begin{figure} \href{verbatim/cube_rotate.vs.asm}{\includegraphics{verbatim/output/cube_rotate.vs.asm.pdf}} \caption*{\texttt{cube\_rotate.vs.asm}} \end{figure} +\subsubsection{Vertex shader assembler/code generator debugging} + However, when I first executed the vertex shader cube rotation demo, I found it did not work as expected: @@ -775,8 +808,8 @@ I've written several \href{https://github.com/buhman/scu-dsp-asm}{nice assembler for other architectures in the past, but I've never seen any instruction set as expressive as R500 fragment shaders. -I attempted to directly reflect this ``multiple tiers of operand argument -decoding'' in the syntax I invented for fragment shader ALU instructions. +I attempted to directly represent this ``multiple tiers of operand argument +decoding'' in my fragment shader ALU instructions syntax. These instructions are also vector instructions: a total of 24 floating point input operands and 8 floating results could be evaluated per instruction. @@ -902,4 +935,426 @@ except: The exponent/mantissa table that shows example 7-bit float values on page 106 of \href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf} is incorrect. 
+\section{Progress: 26 Oct 2025} + +From 21 Oct 2025 to 26 Oct 2025, I achieved the following (roughly in chronological order): + +\begin{itemize} +\item I \href{https://git.idk.st/bilbo/r500/commit/8594bc4a38f6fcab2ac6e437b46bcf1e0e6d32dd}{rewrote} most of the vertex shader assembler parser/validator, and implemented support for \href{https://git.idk.st/bilbo/r500/commit/f3f1969f4a9b336536f5fb23d246f7103c41e20d}{assembling/disassembling ``dual math'' operations} +\item I implemented support for \href{https://git.idk.st/bilbo/r500/commit/96d7286e7cd3270b9dca0924d3a046d585d6dc9d}{assembling} and \href{https://git.idk.st/bilbo/r500/commit/27227426eaac265bc3126edd7d017c791640e789}{disassembling} TEX fragment shader instructions +\item I presented this project (including live demos on real hardware) at + a \href{https://itch.io/jam/spoopy-jam-7-heckraiser}{local in-person game jam event} +\end{itemize} + +\subsection{Vertex shader optimization part 1: ``MOV'' elimination} + +After talking about it in-person, I decided to try to golf my original +15-instruction +\href{https://git.idk.st/bilbo/r500/src/commit/c8ae311e60/drm/cube_rotate.vs.asm}{cube\_rotate.vs.asm} vertex shader. 
+ +The first opportunity for optimization is in the first two instructions of: + +\begin{figure} + \href{verbatim/cube_rotate_const_move.vs.asm}{\includegraphics{verbatim/output/cube_rotate_const_move.vs.asm.pdf}} +\end{figure} + +The \texttt{VE\_ADD} (being used here as a ``MOV'' instruction) is needed +because there is only a single 128-bit read port into \texttt{const} memory, so +a multiply-add like this is illegal: + +\begin{figure} + \href{verbatim/cube_rotate_const_move_illegal.vs.asm}{\includegraphics{verbatim/output/cube_rotate_const_move_illegal.vs.asm.pdf}} +\end{figure} + +I observed that because I never need to reference the last two constants in the +same instruction that references the first two constants, if I rearrange the +ordering of the constants to: + +\begin{figure} + \href{verbatim/cube_rotate_const_move_rearrange.vs.asm}{\includegraphics{verbatim/output/cube_rotate_const_move_rearrange.vs.asm.pdf}} +\end{figure} + +I can then rewrite the multiply-add instructions as: + +\begin{figure} + \href{verbatim/cube_rotate_const_move_rearrange_mad.vs.asm}{\includegraphics{verbatim/output/cube_rotate_const_move_rearrange_mad.vs.asm.pdf}} +\end{figure} + +\subsection{Vertex shader optimization part 2: ``dual math'' instructions} + +I spent an entire day rewriting large portions of the vertex shader assembler to +add support for ``dual math'' instructions. + +The original +\href{https://git.idk.st/bilbo/r500/src/commit/c8ae311e60/drm/cube_rotate.vs.asm}{cube\_rotate.vs.asm} +contains this sequence of \texttt{ME\_SIN}/\texttt{ME\_COS} instructions: + +\begin{figure} + \href{verbatim/cube_rotate_sin_cos.vs.asm}{\includegraphics{verbatim/output/cube_rotate_sin_cos.vs.asm.pdf}} +\end{figure} + +The \texttt{temp[3].x} and \texttt{temp[3].y} results are needed immediately, +but \texttt{temp[3].z} and \texttt{temp[3].w} are not needed until after the +first pair of \texttt{VE\_MUL}/\texttt{VE\_MAD} operations.
+ +The dual math instruction mode replaces the 3rd \texttt{VE\_} instruction operand +with any \texttt{ME\_} operation, so it is only usable with 2-operand +\texttt{VE\_} instructions like \texttt{VE\_MUL}. + +The dual math encoding also has several restrictions (it only has \nicefrac{1}{4}th the +control word bits compared to a normal \texttt{ME\_} instruction). A notable +restriction is that it must write to \texttt{alt\_temp}. + +Unlike the fancy things that can be done with fragment shader +operands/sources/swizzles, a single vertex shader operand can also only read +from a single 128-bit register, so this means to be able to continue to access +\texttt{temp[3].zw} as a vector, both \texttt{z} and \texttt{w} must now be +stored in \texttt{alt\_temp}, even if only one of them was written by a ``dual +math'' instruction. + +The change (and my newly-implemented dual math syntax) is: + +\begin{figure} + \href{verbatim/cube_rotate_dual_math.vs.asm}{\includegraphics{verbatim/output/cube_rotate_dual_math.vs.asm.pdf}} +\end{figure} + +Where the dual math instruction: + +\begin{figure} + \href{verbatim/cube_rotate_dual_math_single_instruction.vs.asm}{\includegraphics{verbatim/output/cube_rotate_dual_math_single_instruction.vs.asm.pdf}} +\end{figure} + +Is encoded by the assembler as a single instruction and is executed by the vertex +shader unit in a single clock cycle. + +The final +\href{https://git.idk.st/bilbo/r500/src/commit/c8ae311e60/drm/cube_rotate_optimize.vs.asm}{cube\_rotate\_optimize.vs.asm} +was reduced from 15 instructions to 13 instructions (compared +to Mesa's R500 vertex shader compiler's 27 instructions).
+ +\section{Progress: 29 Oct 2025} + +From 27 Oct 2025 to 29 Oct 2025, I achieved the following (roughly in chronological order): + +\begin{itemize} +\item I implemented support for \href{https://git.idk.st/bilbo/r500/commit/9aecbbfc6f297ea71c72f4c4fba1b8107be95ca1}{``multiple render targets''} in the fragment shader assembler +\item I wrote a \href{https://git.idk.st/bilbo/r500/src/commit/18b7a593bd/drm/texture_blur_horizontal.fs.asm}{gaussian blur fragment shader} +\item I made a demo that draws \href{https://git.idk.st/bilbo/r500/src/commit/18b7a593bd/drm/pumpkin_man.c#L963}{multiple 3D ``objects''} where each object's UV coordinates sample a \href{https://git.idk.st/bilbo/r500/src/commit/18b7a593bd/drm/pumpkin_man.c#L1029-L1069}{different} \href{https://git.idk.st/bilbo/r500/src/commit/18b7a593bd/drm/pumpkin_man.c#L314}{texture} +\item I did several experiments related to R500's Z-buffer implementation +\end{itemize} + +\subsection{Z-buffer experiments} +\label{sec:z-buffer-experiments} +Though I produced a ``properly'' Z-buffered 3D cube demo previously, I felt I +did not fully understand the relationship between Z coordinates, W coordinates, +viewport transformations, and the actual values that are written to the +Z-buffer. At some point, I'd like to write fragment shaders that sample the +Z-buffer, so I feel I need to understand this more rigorously. + +For comparison, Sega Dreamcast stores 32-bit floating-point values in the +``depth accumulation buffer''. This effectively means that any Z coordinates can +be stored in the depth accumulation buffer without scaling or range +remapping. I've made several +\href{https://az1.idk.st/public/20kdm2-demo.mp4}{moderately fancy} Dreamcast +demos that happily store arbitrary ``view space'' Z values in the depth +accumulation buffer without any visible depth aliasing/artifacts. + +In contrast, the Radeon R500 does not have a 32-bit floating point Z-buffer +format.
Instead, R500 supports (\href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf}, page 283, +\texttt{ZB\_FORMAT}): + +\begin{itemize} +\item 16-bit integer Z +\item 16-bit floating point +\item 24-bit integer Z with 8-bit stencil +\end{itemize} + +The third option, with the most bits, clearly ought to give the most +precision--with the caveat that the Z values that are written to the Z-buffer +should be scaled to be uniformly distributed across the range of 24-bit integers. + +I performed several tests with variations of +\href{https://git.idk.st/bilbo/r500/src/branch/main/drm/zbuffer_test.c}{zbuffer\_test.c}. The +general strategy was: + +\begin{itemize} +\item Define some contrived/illustrative 3D scene +\item Manipulate the scale/range of Z and W values +\item Observe the state of the Z-buffer after rendering +\end{itemize} + +The first scene I chose was of a tilted plane that is non-coplanar with the view +space XY plane, as in: + +\begin{figure} + \href{images/plane_scene.png}{\includegraphics{images/plane_scene.png}} + \caption*{Blender screenshot, ``plane scene''} +\end{figure} + +Where the grey plane is the object that is to be rendered, the yellow lines +represent a ``camera'' from which the plane is to be viewed, and the blue line +represents the view/clip-space Z axis. + +To view the content of the Z buffer, I wrote a +\href{https://git.idk.st/bilbo/r500/src/commit/18b7a593bd/tools/zbuf_decode.py}{simple script} +to convert the 24-bit integer Z-buffer to 16-bit +\href{https://en.wikipedia.org/wiki/Netpbm}{PGM}, +so that it can be easily viewed in an image editor. This tool also shows the +minimum and maximum values found in the Z-buffer, intended to help verify that +the entire numeric range of the Z-buffer is being used. 
+ +While I expected to see the (orthographic, directly facing the camera) plane +drawn on the Z-buffer as a smooth gradient such as: + +\begin{figure} + \href{images/z_buffer_gradient.png}{\includegraphics{images/z_buffer_gradient.png}} + \caption*{R500 framebuffer capture, \texttt{z\_buffer\_gradient.png}} +\end{figure} + +Several of my tests displayed numeric aliasing, overflows, underflows, etc..: + +\begin{figure} + \href{images/z_buffer_overflow.png}{\includegraphics{images/z_buffer_overflow.png}} + \caption*{R500 framebuffer capture, \texttt{z\_buffer\_overflow.png}} +\end{figure} + +Of particular interest to me was to verify the behavior of the +\texttt{DX\_CLIP\_SPACE\_DEF} bit +(\href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf}, page +255--this is also the only place in the entire manual where ``non-user'' clip +planes are even defined), and to understand the order of pipeline operations. + +I played with moving the plane around, to observe clipping behavior (here the +lower half of the scene was clipped due to intersecting the Z=+1.0 clip plane): + +\begin{figure} + \href{images/z_buffer_clipped.png}{\includegraphics{images/z_buffer_clipped.png}} + \caption*{R500 framebuffer capture, \texttt{z\_buffer\_clipped.png}\\ + (also simultaneously showing overflow/underflow artifacts)} +\end{figure} + +Thinking at this point that I nearly understood most of the pieces, I then +re-enabled XY perspective division: + +\begin{figure} + \href{images/z_buffer_perspective.png}{\includegraphics{images/z_buffer_perspective.png}} + \caption*{R500 framebuffer capture, \texttt{z\_buffer\_perspective.png}} +\end{figure} + +The above image was not quite what I wanted: I noticed the range of the Z buffer +values were roughly between \texttt{0} and \texttt{8388607}, but what I really +wanted was \texttt{0} to \texttt{16777215}. 
Adjusting scale again produced this +Z-buffer: + +\begin{figure} + \href{images/z_buffer_perspective_scale.png}{\includegraphics{images/z_buffer_perspective_scale.png}} + \caption*{R500 framebuffer capture, \texttt{z\_buffer\_perspective\_scale.png}} +\end{figure} + +Up to this point, I was using \texttt{ZFUNC=GREATER} with a Z-buffer cleared +with an initial depth of zero, where all Z values are negative numbers. + +I decided it might be more intuitive to use a Z-buffer that is cleared with an +initial depth of one, using \texttt{ZFUNC=LESS} instead where all Z values are +positive numbers. + +With these adjustments, I captured a Z-buffer from the earlier cube demo: + +\begin{figure} + \href{images/z_buffer_cube.png}{\includegraphics{images/z_buffer_cube.png}} + \caption*{R500 framebuffer capture, \texttt{z\_buffer\_cube.png}} +\end{figure} + +This was still not quite ``correct'', because the minimum depth of the cube is +being drawn as \textasciitilde{}\texttt{2763306} (\textasciitilde{}0.16), but I expected +something closer to zero. + +Adjusting my range/scale arithmetic again produced this image: + +\begin{figure} + \href{images/z_buffer_cube_range.png}{\includegraphics{images/z_buffer_cube_range.png}} + \caption*{R500 framebuffer capture, \texttt{z\_buffer\_cube\_range.png}} +\end{figure} + +The minimum Z value now appears to be closer to zero, but the ``back'' faces of +the cube (and maximum Z values) are not visible. Without changing any +scale/range constants, inverting \texttt{ZFUNC} and using a zero-initialized +Z-buffer produced this image of the back faces of the cube: + +\begin{figure} + \href{images/z_buffer_cube_range_back.png}{\includegraphics{images/z_buffer_cube_range_back.png}} + \caption*{R500 framebuffer capture, \texttt{z\_buffer\_cube\_range\_back.png}} +\end{figure} + +Indeed, the maximum Z value is close to \textasciitilde{}\texttt{16777215} +(\textasciitilde{}1.0), as intended. 
I feel at this point I have a better intuition +for using integer Z-buffers. The pipeline (and relevant registers) appears to be +something like this: + +\begin{figure} + \includegraphics{diagrams/z_operations.svg} + \caption*{R500 Z transform pipeline (simplified)} +\end{figure} + +Prior to these experiments, I was not aware \texttt{SU\_DEPTH\_SCALE} is the +thing directly responsible for scaling floating point Z values to the integer Z +values stored in the depth buffer. + +In general, the hardware perspective divide, viewport transform, clipping, and +setup units are absolutely fascinating. + +\subsection{3D perspective} + +Despite making many 3D demos in the past, I feel that every time I want to +``draw something 3D'' on a new platform, I need to re-relearn 3D/perspective +transformations (perhaps because I never truly \textit{learned} anything). + +In many OpenGL articles/tutorials/books the +\href{https://learnopengl.com/Getting-started/Coordinate-Systems}{standard} +\href{https://ogldev.org/www/tutorial12/tutorial12.html}{formula} for +\href{https://songho.ca/opengl/gl_projectionmatrix.html}{explaining} +\href{https://www.scratchapixel.com/lessons/3d-basic-rendering/perspective-and-orthographic-projection-matrix/opengl-perspective-projection-matrix.html}{perspective} +\href{https://learnwebgl.brown37.net/08_projections/projections_perspective.html}{projection} +appears to be: + +\begin{itemize} +\item Begin with an overly-academic explanation of perspective in terms of camera optics and trigonometry +\item Do not implement or demonstrate any of the systems or mathematics + described in the preceding pages of explanations; instead abruptly hide all + magic behind \texttt{glm::perspective} +\item Refuse to explain or clarify further +\item Continue for the next 30 chapters/articles without ever revisiting focal + length, view frustums, depth of field, etc..
again +\end{itemize} + +It is sufficient to instead rationalize/implement ``perspective'' as: + +\begin{quote} + Perspective is the division of X and Y coordinates by Z, where the coordinate + $(0, 0, 0)$ is the view origin (and the center of the screen/projection). +\end{quote} + +Defining perspective this way also works for OpenGL, with some slight +adjustment, notably to deal with OpenGL's +\href{https://registry.khronos.org/OpenGL/specs/gl/glspec20.pdf}{definition} of +``normalized device coordinates''. + +I note that (unlike Dreamcast) one can't actually divide by Z on R500 (nor +OpenGL), both because the VTE doesn't support this, and because the texture +unit doesn't support this. Of course, I tried it anyway: + +\begin{figure} + \includegraphics{videos/cube_warped_textures.png} + \caption*{R500 DVI capture, \texttt{texture\_cube\_warping.c} \\ + (unrelated to this demo, R500 also interestingly has a dedicated ``disable perspective-correct texture mapping'' bit)} +\end{figure} + +Instead, in both cases, the R500 uses the W coordinate for division. This turns +out to be very convenient, because it means that the ``field of +view''/perspective scale (W) and the Z-buffer/depth test scale (Z) can be +adjusted independently.
+ +\subsection{3D clipping} + +Here are several examples of improperly scaled Z values, which are being clipped +by the setup unit: + +\begin{figure} + \includegraphics{videos/cube_clipped_far.png} + \caption*{R500 DVI capture, \texttt{texture\_cube\_clear\_zwrite\_vertex\_shader\_optimize\_zscale.c} \\ + (``far'' clip plane intersection)} +\end{figure} + +\begin{figure} + \includegraphics{videos/cube_clipped_near.png} + \caption*{R500 DVI capture, \texttt{texture\_cube\_clear\_zwrite\_vertex\_shader\_optimize\_zscale.c} \\ + (``near'' clip plane intersection)} +\end{figure} + +\begin{figure} + \includegraphics{videos/cube_clipped_near_opengl.png} + \caption*{R500 DVI capture, \texttt{texture\_cube\_clear\_zwrite\_vertex\_shader\_optimize\_zscale.c} \\ + (I am curious to learn under what circumstances the OpenGL designers thought\\ $-w_{c} < z_{c} < w_{c}$ was a good idea)} +\end{figure} + +\section{Progress: 31 Oct 2025} + +From 30 Oct 2025 to 31 Oct 2025, I achieved the following (non-chronological): + +\begin{itemize} +\item I implemented a \href{https://git.idk.st/bilbo/r500/src/branch/main/drm/matrix_cubesphere_specular.fs.asm}{diffuse/specular lighting fragment shader} in R500 fragment shader assembly +\item I made vertex shaders that represent coordinate space transformations + using matrix multiplications rather than ad-hoc arithmetic +\item While writing demos that pass multiple (interpolated) vectors from the + vertex shader to the fragment shader, I learned more about \href{https://git.idk.st/bilbo/r500/src/commit/f43ac599f9/drm/matrix_cubesphere_specular_suzanne.cpp#L444-L512}{``rasterizer instructions''} +\item I made a demo that uses more than one texture for the entire scene + (by \href{https://git.idk.st/bilbo/r500/src/commit/f43ac599f9/drm/pumpkin_man.c#L272-L317}{reconfiguring + the texture unit for each ``object''}) +\end{itemize} + +\subsection{Lighting demo} + +\begin{figure} + \includegraphics{videos/suzanne.png} + \caption*{R500 DVI capture, 
\texttt{matrix\_cubesphere\_specular\_suzanne.cpp} \\ + (subdivided Suzanne mesh, 15,744 triangles)} +\end{figure} + +Despite being a ``simple'' lighting demo, a surprising number of things need to +happen simultaneously before it becomes possible. + +Where vertex shaders from previous demos were passed at most a single scalar +variable for animation/timing, the vertex shader in this demo uses +\href{https://git.idk.st/bilbo/r500/src/commit/f43ac599f9/drm/matrix_cubesphere_specular_suzanne.cpp#L301-L326}{10 vectors} as +input: + +\begin{itemize} +\item 4 vectors for a ``local space to clip space'' transformation matrix +\item 4 vectors for a ``local space to world space'' transformation matrix (used for lighting) +\item 1 vector for a ``light position'' (in world space coordinates, used for lighting) +\item 1 vector for a ``view origin'' (in world space coordinates, used for lighting) +\end{itemize} + +Additionally, where previous demos passed at most a single vector from the +vertex shader to the fragment shader (vertex color or texture coordinates), this +demo passes +\href{https://git.idk.st/bilbo/r500/src/commit/f43ac599f9/drm/matrix_cubesphere_specular_suzanne.cpp#L444-L512}{5 vectors} +from the vertex shader to the fragment shader, all of which are used +by the lighting calculation: + +\begin{itemize} +\item world space position +\item world space normal +\item world space light position +\item world space view origin +\item uv space texture coordinates +\end{itemize} + +\subsection{Learn algebra by writing fragment shader assembly} + +Prior to today, I did not know about this transformation/equivalence: + +\begin{gather*} +x^{n} \iff 2^{\left( n\cdot\frac{\log(x)}{\log(2)} \right)} +\end{gather*} + +While the R500 fragment shader alpha unit does not have a \texttt{POW} operation, +it does have \href{https://git.idk.st/bilbo/r500/src/commit/f43ac599f9/drm/matrix_cubesphere_specular.fs.asm#L93-L99}{\texttt{EX2} and \texttt{LN2}} +operations. 
+ +For example, one could implement $a^{32}$ in R500 fragment shader assembly as: + +\begin{figure} + \href{verbatim/pow_fragment_shader.fs.asm}{\includegraphics{verbatim/output/pow_fragment_shader.fs.asm.pdf}} +\end{figure} + +This ``arbitrary exponents with arbitrary bases'' pattern is used in the +lighting demo fragment shader as part of the ``specular intensity'' calculation. + +This fragment shader unit feature is very cool, because a software +implementation of a generalized floating-point \texttt{pow} function is +extremely +\href{https://git.musl-libc.org/cgit/musl/tree/src/math/powf.c?id=cb5c057c87240a9534f8e0d9b7ff2560082f6218}{computationally expensive} +otherwise. + \end{document} diff --git a/resize_svg.py b/resize_svg.py index 62b9ab2..77e6d77 100644 --- a/resize_svg.py +++ b/resize_svg.py @@ -19,4 +19,4 @@ def transform(): lines = list(transform()) with open(sys.argv[1], 'w') as f: - f.write('\n'.join(lines)) + f.write(''.join(lines)) diff --git a/verbatim/cube_rotate_const_move.vs.asm b/verbatim/cube_rotate_const_move.vs.asm new file mode 100644 index 0000000..dec95bd --- /dev/null +++ b/verbatim/cube_rotate_const_move.vs.asm @@ -0,0 +1,8 @@ +-- CONST[0] = {0.159155, 0.5, 6.283185, -3.141593} +-- CONST[1] = {theta1, theta2, 0.2, 0.5} + +temp[0].xy = VE_ADD const[1].xy__ const[1].00__ ; + +temp[0].xy = VE_MAD temp[0].xy__ const[0].xx__ const[0].yy__ ; +temp[0].xy = VE_FRC temp[0].xy__ ; +temp[0].xy = VE_MAD temp[0].xy__ const[0].zz__ const[0].ww__ ; diff --git a/verbatim/cube_rotate_const_move_illegal.vs.asm b/verbatim/cube_rotate_const_move_illegal.vs.asm new file mode 100644 index 0000000..b4df9d4 --- /dev/null +++ b/verbatim/cube_rotate_const_move_illegal.vs.asm @@ -0,0 +1,3 @@ +-- this is an illegal instruction: +-- const[1] and const[0] can not be read simultaneously +temp[0].xy = VE_MAD const[1].xy__ const[0].xx__ const[0].yy__ ; diff --git a/verbatim/cube_rotate_const_move_rearrange.vs.asm b/verbatim/cube_rotate_const_move_rearrange.vs.asm new 
file mode 100644 index 0000000..4eb3de9 --- /dev/null +++ b/verbatim/cube_rotate_const_move_rearrange.vs.asm @@ -0,0 +1,2 @@ +-- CONST[0] = {theta1, theta2, 0.159155, 0.5} +-- CONST[1] = {6.283185, -3.141593, 0.2, 0.5} diff --git a/verbatim/cube_rotate_const_move_rearrange_mad.vs.asm b/verbatim/cube_rotate_const_move_rearrange_mad.vs.asm new file mode 100644 index 0000000..ea23274 --- /dev/null +++ b/verbatim/cube_rotate_const_move_rearrange_mad.vs.asm @@ -0,0 +1,7 @@ +-- the VE_ADD instruction is now not necessary/deleted: +-- temp[0].xy = VE_ADD const[1].xy__ const[1].00__ ; + +-- const addresses and swizzles changed: +temp[0].xy = VE_MAD const[0].xy__ const[0].zz__ const[0].ww__ ; +temp[0].xy = VE_FRC temp[0].xy__ ; +temp[0].xy = VE_MAD temp[0].xy__ const[1].xx__ const[1].yy__ ; diff --git a/verbatim/cube_rotate_dual_math.vs.asm b/verbatim/cube_rotate_dual_math.vs.asm new file mode 100644 index 0000000..5cc8c32 --- /dev/null +++ b/verbatim/cube_rotate_dual_math.vs.asm @@ -0,0 +1,14 @@ +temp[3].x = ME_SIN temp[0].___x ; +temp[3].y = ME_COS temp[0].___x ; +alt_temp[3].z = ME_SIN temp[0].___y ; + +-- first rotation +temp[1].yz = VE_MUL input[0]._-zz_ temp[3]._xy_ , +alt_temp[3].w = ME_COS temp[0].y_ ; + +temp[1].xyz = VE_MAD input[0].xyy_ temp[3].1yx_ temp[1].0yz_ ; + +-- second rotation +temp[2].xz = VE_MUL temp[1].-z_z_ alt_temp[3].z_w_ ; + +temp[2].xyz = VE_MAD temp[1].xyx_ alt_temp[3].w1z_ temp[2].x0z_ ; diff --git a/verbatim/cube_rotate_dual_math_single_instruction.vs.asm b/verbatim/cube_rotate_dual_math_single_instruction.vs.asm new file mode 100644 index 0000000..7dedb3c --- /dev/null +++ b/verbatim/cube_rotate_dual_math_single_instruction.vs.asm @@ -0,0 +1,2 @@ +temp[1].yz = VE_MUL input[0]._-zz_ temp[3]._xy_ , +alt_temp[3].w = ME_COS temp[0].y_ ; diff --git a/verbatim/cube_rotate_sin_cos.vs.asm b/verbatim/cube_rotate_sin_cos.vs.asm new file mode 100644 index 0000000..4cf230a --- /dev/null +++ b/verbatim/cube_rotate_sin_cos.vs.asm @@ -0,0 +1,14 @@ 
+temp[3].x = ME_SIN temp[0].___x ; +temp[3].y = ME_COS temp[0].___x ; +temp[3].z = ME_SIN temp[0].___y ; +temp[3].w = ME_COS temp[0].___y ; + +-- first rotation +temp[1].yz = VE_MUL input[0]._-zz_ temp[3]._xy_ ; + +temp[1].xyz = VE_MAD input[0].xyy_ temp[3].1yx_ temp[1].0yz_ ; + +-- second rotation +temp[2].xz = VE_MUL temp[1].-z_z_ temp[3].z_w_ ; + +temp[2].xyz = VE_MAD temp[1].xyx_ temp[3].w1z_ temp[2].x0z_ ; diff --git a/verbatim/pow_fragment_shader.fs.asm b/verbatim/pow_fragment_shader.fs.asm new file mode 100644 index 0000000..c587ea8 --- /dev/null +++ b/verbatim/pow_fragment_shader.fs.asm @@ -0,0 +1,12 @@ +-- a = log(a) / log(2) +src0.a = temp[0] : + temp[0].a = LN2 src0.a ; + +-- a = a * 32.0 + 0 +src0.a = temp[0] , +src1.a = float(96) : -- 32.0 (or any other constant) + temp[0].a = MAD src0.a src1.a src1.0 ; + +-- a = 2 ^ a +src0.a = temp[0] : + temp[0].a = EX2 src0.a ; diff --git a/verbatim/r500_view_clip.c b/verbatim/r500_view_clip.c new file mode 100644 index 0000000..0a77aae --- /dev/null +++ b/verbatim/r500_view_clip.c @@ -0,0 +1,6 @@ +VAP_VTE_CNTL__VPORT_Z_SCALE_ENA(0) +VAP_VTE_CNTL__VPORT_Z_OFFSET_ENA(0) +VAP_VTE_CNTL__VTX_XY_FMT(1) +VAP_VTE_CNTL__VTX_Z_FMT(0) +VAP_VTE_CNTL__VTX_W0_FMT(1) +VAP_CNTL__DX_CLIP_SPACE_DEF(1) diff --git a/verbatim/sin_operand_remap.vs.asm b/verbatim/sin_operand_remap.vs.asm new file mode 100644 index 0000000..8ae317c --- /dev/null +++ b/verbatim/sin_operand_remap.vs.asm @@ -0,0 +1,8 @@ +-- CONST[0] = {0.159155, 0.5, 6.283185, -3.141593} + +-- t = t * 0.159155 + 0.5 +temp[0].xy = VE_MAD temp[0].xy__ const[0].xx__ const[0].yy__ ; +-- t = frac(t) +temp[0].xy = VE_FRC temp[0].xy__ ; +-- t = t * 6.283185 + -3.141593 +temp[0].xy = VE_MAD temp[0].xy__ const[0].zz__ const[0].ww__ ; diff --git a/videos/cube_clipped_far.mp4 b/videos/cube_clipped_far.mp4 new file mode 100644 index 0000000..4eebc38 Binary files /dev/null and b/videos/cube_clipped_far.mp4 differ diff --git a/videos/cube_clipped_far.png b/videos/cube_clipped_far.png new 
file mode 100644 index 0000000..51ad3b0 Binary files /dev/null and b/videos/cube_clipped_far.png differ diff --git a/videos/cube_clipped_near.mp4 b/videos/cube_clipped_near.mp4 new file mode 100644 index 0000000..9abc332 Binary files /dev/null and b/videos/cube_clipped_near.mp4 differ diff --git a/videos/cube_clipped_near.png b/videos/cube_clipped_near.png new file mode 100644 index 0000000..91f1c8d Binary files /dev/null and b/videos/cube_clipped_near.png differ diff --git a/videos/cube_clipped_near_opengl.mp4 b/videos/cube_clipped_near_opengl.mp4 new file mode 100644 index 0000000..a415571 Binary files /dev/null and b/videos/cube_clipped_near_opengl.mp4 differ diff --git a/videos/cube_clipped_near_opengl.png b/videos/cube_clipped_near_opengl.png new file mode 100644 index 0000000..3f3320a Binary files /dev/null and b/videos/cube_clipped_near_opengl.png differ diff --git a/videos/cube_warped_textures.mp4 b/videos/cube_warped_textures.mp4 new file mode 100644 index 0000000..0ccc367 Binary files /dev/null and b/videos/cube_warped_textures.mp4 differ diff --git a/videos/cube_warped_textures.png b/videos/cube_warped_textures.png new file mode 100644 index 0000000..4b97ea1 Binary files /dev/null and b/videos/cube_warped_textures.png differ diff --git a/videos/suzanne.mp4 b/videos/suzanne.mp4 new file mode 100644 index 0000000..f97dd1a Binary files /dev/null and b/videos/suzanne.mp4 differ diff --git a/videos/suzanne.png b/videos/suzanne.png new file mode 100644 index 0000000..3184ef3 Binary files /dev/null and b/videos/suzanne.png differ