october 31 update

This commit is contained in:
Zack Buhman 2025-11-11 18:34:41 -06:00
parent ca4d7fb8ee
commit 2c6e735350
42 changed files with 889 additions and 10 deletions

2
.gitignore vendored
View File

@ -19,3 +19,5 @@ verbatim/*.svg
verbatim/*.pdf verbatim/*.pdf
verbatim/output verbatim/output
images/*.data images/*.data
/index*.svg
diagrams/*-.svg

View File

@ -15,10 +15,17 @@ echo 'figure.figure { margin-left: 20px; margin-right: 20px; }' >> index.css
echo 'pre.verbatim { font-size: 0.9em; }' >> index.css echo 'pre.verbatim { font-size: 0.9em; }' >> index.css
sed -i 's|color-scheme: light dark;||g' index.css sed -i 's|color-scheme: light dark;||g' index.css
echo 'figcaption.caption { margin-bottom: 1.3em; margin-top: 0.3em; }' >> index.css echo 'figcaption.caption { margin-bottom: 1.3em; margin-top: 0.3em; }' >> index.css
echo '.cmti-10 { font-style: italic; }' >> index.css
sed -i 's/index.css/index2.css/g' index.html sed -i 's/˜/~/g' index.html
mv index.css index2.css sed -i "s|<p class='noindent'><object class='graphics' data='diagrams/z_operations.svg' name='picture diagrams/z_operations' type='image/svg+xml'></object>|<p class='noindent' style='text-align: center;'><object class='graphics' style='width: 40em;' data='diagrams/z_operations.svg' name='picture diagrams/z_operations' type='image/svg+xml'></object>|g" index.html
sed -i '/height: 2.5em;/d' index.css
sed -i 's/index.css/index3.css/g' index.html
mv index.css index3.css
python replace_video.py index.html python replace_video.py index.html

3
diagrams/build.sh Normal file
View File

@ -0,0 +1,3 @@
# Render the Graphviz graph description z_operations.dot to z_operations.svg.
dot -Tsvg z_operations.dot > z_operations.svg
# Disabled: would rewrite every `scale(1 1)` transform in the generated SVG
# to `scale(0.75 0.75)`.
#sed -i 's/scale(1 1)/scale(0.75 0.75)/g' z_operations.svg

View File

@ -0,0 +1,31 @@
import sys

# Factor applied to the width and height components of every viewBox.
scale = 0.75


def scale_svg(lines):
    """Yield the text of one ``<svg ...>`` opening tag with its viewBox resized.

    ``lines`` are the consecutive source lines that make up the tag. They are
    joined, the first ``viewBox="x y width height"`` attribute is located, and
    the text is yielded back in three pieces: everything before the attribute,
    the rewritten attribute, and everything after it. ``x`` and ``y`` are kept;
    ``width`` and ``height`` are multiplied by the module-level ``scale``.

    Raises ValueError if the joined text has no ``viewBox="`` attribute or the
    attribute does not hold exactly four numbers.
    """
    svg = "".join(lines)
    head, viewbox = svg.split('viewBox="', maxsplit=1)
    viewbox, tail = viewbox.split('"', maxsplit=1)
    x, y, width, height = map(float, viewbox.split())
    yield head
    yield f'viewBox="{x} {y} {width * scale} {height * scale}"'
    yield tail


def transform(path=None):
    """Yield the contents of ``path`` with every ``<svg`` tag's viewBox scaled.

    ``path`` defaults to ``sys.argv[1]`` so existing ``transform()`` callers
    keep working. Lines outside an ``<svg ...>`` opening tag are yielded
    unchanged. A tag may span several lines: it starts at a line whose
    stripped text begins with ``<svg`` and ends at the first line (possibly
    the same one) whose stripped text ends with ``>``.
    """
    if path is None:
        path = sys.argv[1]
    # Read everything up front so the file is closed before any output is
    # produced; the caller is expected to overwrite the same path.
    with open(path) as f:
        source_lines = f.readlines()

    pending = []
    for line in source_lines:
        stripped = line.strip()
        if pending or stripped.startswith("<svg"):
            pending.append(line)
            # Also checked on the "<svg" line itself, so a one-line tag is
            # completed immediately instead of swallowing the next line.
            if stripped.endswith(">"):
                yield from scale_svg(pending)
                pending = []
        else:
            yield line
    # An unterminated tag at end of file is passed through untouched rather
    # than being silently dropped (which would truncate the rewritten file).
    yield from pending


if __name__ == "__main__":
    lines = list(transform())
    with open(sys.argv[1], 'w') as f:
        f.write(''.join(lines))

BIN
diagrams/sin_clamp.pdf Normal file

Binary file not shown.

34
diagrams/sin_clamp.tex Normal file
View File

@ -0,0 +1,34 @@
% Standalone figure: plots clamp(x, -pi, +pi) and sin(clamp(x, -pi, +pi))
% over x in [-3pi, 3pi].
\documentclass[varwidth=13.1cm, border={0.0cm 0.0cm 0.0cm 0.0cm}]{standalone}
\usepackage{tikz}
\usepackage[dvipsnames]{xcolor}
\usepackage{pgfplots}
\pgfplotsset{compat=1.18}
\usepackage{amsmath}
% Upright "clamp" operator name; the single argument is emitted right after it.
\newcommand{\Clamp}[1]{\operatorname{clamp}#1}
\begin{document}
\begin{tikzpicture}[scale=0.5]
% Background grid and the two axes.
\draw[very thin,color=gray] (-pi * 3,-pi * 1.2) grid (pi * 3, pi * 1.2);
\draw[->] (-3.2*pi,0) -- (3.2*pi,0) node[right] {$x$};
\draw[->] (0,-pi * 1.4) -- (0,pi * 1.5) node[above] {$f(x)$};
% Blue curve: x clamped to [-pi, pi].
\draw[thick, color=NavyBlue] plot [domain=-pi * 3:pi * 3, samples=100] (\x, {min(max(\x, -pi), pi)} );
% Red curve: sine of the clamped value; the trailing `r` tells pgfmath the
% argument is in radians (its trigonometric functions default to degrees).
\draw[thick, color=OrangeRed] plot [domain=-pi * 3:pi * 3, samples=1000] (\x, {sin(min(max(\x, -pi), pi) r)} );
% Legend, placed below the plot and colour-matched to the curves.
\node[NavyBlue] at (0, -5.4) {$f(x) = \Clamp(x, -\pi, +\pi) $};
\node[OrangeRed] at (0, -6.6) {$f(x) = \sin(\Clamp(x, -\pi, +\pi)) $};
% Dashed vertical markers at x = -2pi and x = +2pi.
\draw [dashed, color=ForestGreen] (-2 * pi,-3.8) -- (-2 * pi,3.8) node[above] {$x=-2\pi$};
\draw [dashed, color=Brown] (2 * pi,-3.8) -- (2 * pi,3.8) node[above] {$x=2\pi$};
% Dashed horizontal markers at y = +pi and y = -pi, each with a label.
\draw [dashed, color=Fuchsia] (-3.0,pi) -- (3.0,pi) ;
\draw [color=Fuchsia] (0, pi + 0.5) node {$y=\pi$};
\draw [dashed, color=Peach] (-3.0,-pi) -- (3.0,-pi) ;
\draw [color=Peach] (0, -pi + 0.5) node {$y=-\pi$};
\end{tikzpicture}
\end{document}

BIN
diagrams/sin_frac.pdf Normal file

Binary file not shown.

38
diagrams/sin_frac.tex Normal file
View File

@ -0,0 +1,38 @@
% Standalone figure: plots the range-remapping function
%   frac(x / (2 pi) + 0.5) * 2 pi - pi
% and its sine over x in [-3pi, 3pi].
\documentclass[varwidth=13.1cm, border={0.0cm 0.0cm 0.0cm 0.0cm}]{standalone}
\usepackage{tikz}
\usepackage[dvipsnames]{xcolor}
\usepackage{pgfplots}
\pgfplotsset{compat=1.18}
\usepackage{amsmath}
% Upright "frac" operator name; the single argument is emitted right after it.
\newcommand{\Frac}[1]{\operatorname{frac}#1}
\begin{document}
\begin{tikzpicture}[scale=0.5]
% Background grid and the two axes.
\draw[very thin,color=gray] (-pi * 3,-pi * 1.2) grid (pi * 3, pi * 1.2);
\draw[->] (-3.2*pi,0) -- (3.2*pi,0) node[right] {$x$};
\draw[->] (0,-pi * 1.4) -- (0,pi * 1.5) node[above] {$f(x)$};
% Blue sawtooth, drawn as three separate plots, one per interval
% [(i-1)pi, (i+1)pi] for i in {-2, 0, 2}.
% The interval bounds are named \segstart/\segend rather than \start/\end so
% that LaTeX's own \end (needed by \end{tikzpicture}) is never redefined.
\foreach \i in {-2, 0, 2}{
\pgfmathsetmacro{\segstart}{(\i - 1) * pi}
\pgfmathsetmacro{\segend}{(\i + 1) * pi}
\draw[thick, color=NavyBlue] plot [domain=\segstart:\segend, samples=100] (\x, {((\x * 1/(2 * pi) + 0.5) - floor(\x * 1/(2 * pi) + 0.5)) * 2 * pi - pi} );
}
% Red curve: sine of the remapped value; the trailing `r` tells pgfmath the
% argument is in radians (its trigonometric functions default to degrees).
\draw[thick, color=OrangeRed] plot [domain=-pi * 3:pi * 3, samples=1000] (\x, {sin((((\x * 1/(2 * pi) + 0.5) - floor(\x * 1/(2 * pi) + 0.5)) * 2 * pi - pi) r)} );
% Legend, placed below the plot and colour-matched to the curves.
\node[NavyBlue] at (0, -5.4) {$f(x) = \Frac(x \cdot \frac{1}{2\pi}+0.5) \cdot 2\pi - \pi $};
\node[OrangeRed] at (0, -6.6) {$f(x) = \sin( \Frac(x \cdot \frac{1}{2\pi}+0.5) \cdot 2\pi - \pi ) $};
% Dashed vertical markers at x = -2pi and x = +2pi.
\draw [dashed, color=ForestGreen] (-2 * pi,-3.8) -- (-2 * pi,3.8) node[above] {$x=-2\pi$};
\draw [dashed, color=Brown] (2 * pi,-3.8) -- (2 * pi,3.8) node[above] {$x=2\pi$};
% Dashed horizontal markers at y = +pi and y = -pi, each with a label.
\draw [dashed, color=Fuchsia] (-3.0,pi) -- (3.0,pi) ;
\draw [color=Fuchsia] (0, pi + 0.5) node {$y=\pi$};
\draw [dashed, color=Peach] (-3.0,-pi) -- (3.0,-pi) ;
\draw [color=Peach] (0, -pi + 0.5) node {$y=-\pi$};
\end{tikzpicture}
\end{document}

60
diagrams/z_operations.dot Normal file
View File

@ -0,0 +1,60 @@
// Directed graph of the Z-value path from the vertex shader to the Z-buffer
// and the fragment shader. Each cluster groups the node(s) shown for one
// stage; the edges at the bottom give the order.
digraph G {
// Entry point of the graph.
vertex_shader [label="(from the vertex shader)"]
subgraph cluster_clipping {
label = "clipping"
DX_CLIP_SPACE_DEF [label="DX_CLIP_SPACE_DEF
possibly clip the polygon"]
}
subgraph cluster_perspective {
label = "perspective division"
VTX_Z_FMT [nojustify=true label="VTX_Z_FMT
(if enabled) divide Z by W"]
}
subgraph cluster_viewport_transformation {
label = "viewport transformation"
VPORT_Z_SCALE
VPORT_Z_OFFSET
}
// Empty cluster: declares no label and no nodes.
subgraph cluster_geometry_assembly {
}
subgraph cluster_setup_unit {
label = "setup unit"
SU_DEPTH_SCALE
SU_DEPTH_OFFSET
}
subgraph cluster_zfunc {
label = "ZFUNC"
// rank=same keeps the two boxes on one row.
{ rank=same
depth_test [shape=box label="depth test"]
depth_pass [shape=box label="depth pass"]
}
depth_test -> depth_pass
}
// Exit points of the graph.
Z_BUFFER [shape=invhouse label="(write the new Z
value to the Z-buffer)"]
fragment_shader [label="(to the fragment shader)"]
// Main chain: clipping -> perspective division -> viewport -> setup unit -> depth test.
vertex_shader -> DX_CLIP_SPACE_DEF
DX_CLIP_SPACE_DEF -> VTX_Z_FMT
VTX_Z_FMT -> VPORT_Z_SCALE
VPORT_Z_SCALE -> VPORT_Z_OFFSET
VPORT_Z_OFFSET -> SU_DEPTH_SCALE
SU_DEPTH_SCALE -> SU_DEPTH_OFFSET
SU_DEPTH_OFFSET -> depth_test
depth_test -> Z_BUFFER
// Second path: the viewport-transformed value goes to "depth pass" without
// passing through the setup-unit nodes, and from there to the fragment shader.
VPORT_Z_OFFSET -> depth_pass
depth_pass -> fragment_shader
}

173
diagrams/z_operations.svg Normal file
View File

@ -0,0 +1,173 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<!-- Generated by graphviz version 12.2.1 (20241206.2353)
-->
<!-- Title: G Pages: 1 -->
<svg width="588pt" height="765pt"
viewBox="0.00 0.00 588.26 764.75" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 760.75)">
<title>G</title>
<polygon fill="white" stroke="none" points="-4,4 -4,-760.75 584.26,-760.75 584.26,4 -4,4"/>
<g id="clust1" class="cluster">
<title>cluster_clipping</title>
<polygon fill="none" stroke="black" points="94.4,-611.4 94.4,-712.75 382.4,-712.75 382.4,-611.4 94.4,-611.4"/>
<text text-anchor="middle" x="238.4" y="-695.45" font-family="Times,serif" font-size="14.00">clipping</text>
</g>
<g id="clust2" class="cluster">
<title>cluster_perspective</title>
<polygon fill="none" stroke="black" points="89.4,-502.04 89.4,-603.4 387.4,-603.4 387.4,-502.04 89.4,-502.04"/>
<text text-anchor="middle" x="238.4" y="-586.1" font-family="Times,serif" font-size="14.00">perspective division</text>
</g>
<g id="clust3" class="cluster">
<title>cluster_viewport_transformation</title>
<polygon fill="none" stroke="black" points="125.4,-344.79 125.4,-494.04 351.4,-494.04 351.4,-344.79 125.4,-344.79"/>
<text text-anchor="middle" x="238.4" y="-476.74" font-family="Times,serif" font-size="14.00">viewport transformation</text>
</g>
<g id="clust5" class="cluster">
<title>cluster_setup_unit</title>
<polygon fill="none" stroke="black" points="101.4,-187.54 101.4,-336.79 347.4,-336.79 347.4,-187.54 101.4,-187.54"/>
<text text-anchor="middle" x="224.4" y="-319.49" font-family="Times,serif" font-size="14.00">setup unit</text>
</g>
<g id="clust6" class="cluster">
<title>cluster_zfunc</title>
<polygon fill="none" stroke="black" points="187.4,-102.29 187.4,-179.54 405.4,-179.54 405.4,-102.29 187.4,-102.29"/>
<text text-anchor="middle" x="296.4" y="-162.24" font-family="Times,serif" font-size="14.00">ZFUNC</text>
</g>
<!-- vertex_shader -->
<g id="node1" class="node">
<title>vertex_shader</title>
<ellipse fill="none" stroke="black" cx="238.4" cy="-738.75" rx="134.33" ry="18"/>
<text text-anchor="middle" x="238.4" y="-734.08" font-family="Times,serif" font-size="14.00">(from the vertex shader)</text>
</g>
<!-- DX_CLIP_SPACE_DEF -->
<g id="node2" class="node">
<title>DX_CLIP_SPACE_DEF</title>
<ellipse fill="none" stroke="black" cx="238.4" cy="-649.45" rx="136.47" ry="30.05"/>
<text text-anchor="middle" x="238.4" y="-653.4" font-family="Times,serif" font-size="14.00">DX_CLIP_SPACE_DEF</text>
<text text-anchor="middle" x="238.4" y="-636.15" font-family="Times,serif" font-size="14.00">possibly clip the polygon</text>
</g>
<!-- vertex_shader&#45;&gt;DX_CLIP_SPACE_DEF -->
<g id="edge2" class="edge">
<title>vertex_shader&#45;&gt;DX_CLIP_SPACE_DEF</title>
<path fill="none" stroke="black" d="M238.4,-720.5C238.4,-712.05 238.4,-701.49 238.4,-691.15"/>
<polygon fill="black" stroke="black" points="241.9,-691.21 238.4,-681.21 234.9,-691.21 241.9,-691.21"/>
</g>
<!-- VTX_Z_FMT -->
<g id="node3" class="node">
<title>VTX_Z_FMT</title>
<ellipse fill="none" stroke="black" cx="238.4" cy="-540.09" rx="141.24" ry="30.05"/>
<text text-anchor="middle" x="238.4" y="-544.04" font-family="Times,serif" font-size="14.00">VTX_Z_FMT</text>
<text text-anchor="middle" x="238.4" y="-526.79" font-family="Times,serif" font-size="14.00">(if enabled) divide Z by W</text>
</g>
<!-- DX_CLIP_SPACE_DEF&#45;&gt;VTX_Z_FMT -->
<g id="edge3" class="edge">
<title>DX_CLIP_SPACE_DEF&#45;&gt;VTX_Z_FMT</title>
<path fill="none" stroke="black" d="M238.4,-619.11C238.4,-607.63 238.4,-594.27 238.4,-581.88"/>
<polygon fill="black" stroke="black" points="241.9,-581.91 238.4,-571.91 234.9,-581.91 241.9,-581.91"/>
</g>
<!-- VPORT_Z_SCALE -->
<g id="node4" class="node">
<title>VPORT_Z_SCALE</title>
<ellipse fill="none" stroke="black" cx="238.4" cy="-442.79" rx="97.51" ry="18"/>
<text text-anchor="middle" x="238.4" y="-438.12" font-family="Times,serif" font-size="14.00">VPORT_Z_SCALE</text>
</g>
<!-- VTX_Z_FMT&#45;&gt;VPORT_Z_SCALE -->
<g id="edge4" class="edge">
<title>VTX_Z_FMT&#45;&gt;VPORT_Z_SCALE</title>
<path fill="none" stroke="black" d="M238.4,-509.72C238.4,-497.95 238.4,-484.43 238.4,-472.7"/>
<polygon fill="black" stroke="black" points="241.9,-472.8 238.4,-462.8 234.9,-472.8 241.9,-472.8"/>
</g>
<!-- VPORT_Z_OFFSET -->
<g id="node5" class="node">
<title>VPORT_Z_OFFSET</title>
<ellipse fill="none" stroke="black" cx="238.4" cy="-370.79" rx="104.87" ry="18"/>
<text text-anchor="middle" x="238.4" y="-366.12" font-family="Times,serif" font-size="14.00">VPORT_Z_OFFSET</text>
</g>
<!-- VPORT_Z_SCALE&#45;&gt;VPORT_Z_OFFSET -->
<g id="edge5" class="edge">
<title>VPORT_Z_SCALE&#45;&gt;VPORT_Z_OFFSET</title>
<path fill="none" stroke="black" d="M238.4,-424.49C238.4,-417.2 238.4,-408.52 238.4,-400.33"/>
<polygon fill="black" stroke="black" points="241.9,-400.41 238.4,-390.41 234.9,-400.41 241.9,-400.41"/>
</g>
<!-- SU_DEPTH_SCALE -->
<g id="node6" class="node">
<title>SU_DEPTH_SCALE</title>
<ellipse fill="none" stroke="black" cx="227.4" cy="-285.54" rx="107.5" ry="18"/>
<text text-anchor="middle" x="227.4" y="-280.87" font-family="Times,serif" font-size="14.00">SU_DEPTH_SCALE</text>
</g>
<!-- VPORT_Z_OFFSET&#45;&gt;SU_DEPTH_SCALE -->
<g id="edge6" class="edge">
<title>VPORT_Z_OFFSET&#45;&gt;SU_DEPTH_SCALE</title>
<path fill="none" stroke="black" d="M236.12,-352.54C234.69,-341.72 232.81,-327.49 231.16,-315.02"/>
<polygon fill="black" stroke="black" points="234.65,-314.73 229.87,-305.27 227.72,-315.64 234.65,-314.73"/>
</g>
<!-- depth_pass -->
<g id="node9" class="node">
<title>depth_pass</title>
<polygon fill="none" stroke="black" points="397.4,-146.29 303.4,-146.29 303.4,-110.29 397.4,-110.29 397.4,-146.29"/>
<text text-anchor="middle" x="350.4" y="-123.62" font-family="Times,serif" font-size="14.00">depth pass</text>
</g>
<!-- VPORT_Z_OFFSET&#45;&gt;depth_pass -->
<g id="edge10" class="edge">
<title>VPORT_Z_OFFSET&#45;&gt;depth_pass</title>
<path fill="none" stroke="black" d="M321.07,-359.37C332.91,-354.32 343.68,-347.09 351.4,-336.79 390.89,-284.12 373.49,-200.91 360.3,-157.6"/>
<polygon fill="black" stroke="black" points="363.67,-156.65 357.29,-148.19 357,-158.78 363.67,-156.65"/>
</g>
<!-- SU_DEPTH_OFFSET -->
<g id="node7" class="node">
<title>SU_DEPTH_OFFSET</title>
<ellipse fill="none" stroke="black" cx="224.4" cy="-213.54" rx="114.87" ry="18"/>
<text text-anchor="middle" x="224.4" y="-208.87" font-family="Times,serif" font-size="14.00">SU_DEPTH_OFFSET</text>
</g>
<!-- SU_DEPTH_SCALE&#45;&gt;SU_DEPTH_OFFSET -->
<g id="edge7" class="edge">
<title>SU_DEPTH_SCALE&#45;&gt;SU_DEPTH_OFFSET</title>
<path fill="none" stroke="black" d="M226.66,-267.24C226.35,-259.95 225.97,-251.27 225.62,-243.08"/>
<polygon fill="black" stroke="black" points="229.12,-243 225.2,-233.16 222.13,-243.3 229.12,-243"/>
</g>
<!-- depth_test -->
<g id="node8" class="node">
<title>depth_test</title>
<polygon fill="none" stroke="black" points="285.15,-146.29 195.65,-146.29 195.65,-110.29 285.15,-110.29 285.15,-146.29"/>
<text text-anchor="middle" x="240.4" y="-123.62" font-family="Times,serif" font-size="14.00">depth test</text>
</g>
<!-- SU_DEPTH_OFFSET&#45;&gt;depth_test -->
<g id="edge8" class="edge">
<title>SU_DEPTH_OFFSET&#45;&gt;depth_test</title>
<path fill="none" stroke="black" d="M227.72,-195.29C229.8,-184.47 232.53,-170.24 234.93,-157.77"/>
<polygon fill="black" stroke="black" points="238.35,-158.49 236.8,-148.01 231.48,-157.17 238.35,-158.49"/>
</g>
<!-- depth_test&#45;&gt;depth_pass -->
<g id="edge1" class="edge">
<title>depth_test&#45;&gt;depth_pass</title>
<path fill="none" stroke="black" d="M285.52,-128.29C287.52,-128.29 289.53,-128.29 291.53,-128.29"/>
<polygon fill="black" stroke="black" points="291.49,-131.79 301.49,-128.29 291.49,-124.79 291.49,-131.79"/>
</g>
<!-- Z_BUFFER -->
<g id="node10" class="node">
<title>Z_BUFFER</title>
<polygon fill="none" stroke="black" points="0,-25.67 146.4,0 292.8,-25.67 292.66,-67.2 0.14,-67.2 0,-25.67"/>
<text text-anchor="middle" x="146.4" y="-41.1" font-family="Times,serif" font-size="14.00">(write the new Z</text>
<text text-anchor="middle" x="146.4" y="-23.85" font-family="Times,serif" font-size="14.00">value to the Z&#45;buffer)</text>
</g>
<!-- depth_test&#45;&gt;Z_BUFFER -->
<g id="edge9" class="edge">
<title>depth_test&#45;&gt;Z_BUFFER</title>
<path fill="none" stroke="black" d="M222.28,-110.1C211.86,-100.23 198.36,-87.42 185.63,-75.35"/>
<polygon fill="black" stroke="black" points="188.19,-72.95 178.52,-68.61 183.37,-78.03 188.19,-72.95"/>
</g>
<!-- fragment_shader -->
<g id="node11" class="node">
<title>fragment_shader</title>
<ellipse fill="none" stroke="black" cx="445.4" cy="-37.15" rx="134.86" ry="18"/>
<text text-anchor="middle" x="445.4" y="-32.47" font-family="Times,serif" font-size="14.00">(to the fragment shader)</text>
</g>
<!-- depth_pass&#45;&gt;fragment_shader -->
<g id="edge11" class="edge">
<title>depth_pass&#45;&gt;fragment_shader</title>
<path fill="none" stroke="black" d="M368.72,-110.1C383.04,-96.66 403.15,-77.79 419.18,-62.75"/>
<polygon fill="black" stroke="black" points="421.14,-65.71 426.04,-56.32 416.35,-60.61 421.14,-65.71"/>
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 9.1 KiB

BIN
images/cube_scene.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 79 KiB

BIN
images/plane_scene.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 338 KiB

BIN
images/z_buffer_clipped.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.8 KiB

BIN
images/z_buffer_cube.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 65 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 39 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 13 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 13 KiB

469
index.tex
View File

@ -1,5 +1,6 @@
\documentclass[20pt]{article} \documentclass[20pt]{article}
\usepackage{amsmath}
\usepackage[font=small,labelfont=bf]{caption} \usepackage[font=small,labelfont=bf]{caption}
\usepackage{hyperref} \usepackage{hyperref}
\hypersetup{ \hypersetup{
@ -15,6 +16,7 @@
\graphicspath{ {./images/} } \graphicspath{ {./images/} }
\usepackage{minted} \usepackage{minted}
\usepackage{nicefrac}
\title{Radeon R500} \title{Radeon R500}
\date{} \date{}
@ -28,9 +30,9 @@
\section{Introduction} \section{Introduction}
The primary/minimal project goal is "draw a triangle on a Radeon R500 via direct The primary/minimal project goal is ``draw a triangle on a Radeon R500 via
memory-mapped hardware register and texture memory accesses". This means no direct memory-mapped hardware register and texture memory accesses''. This means
\href{https://mesa3d.org/}{Mesa}, no no \href{https://mesa3d.org/}{Mesa}, no
\href{https://github.com/torvalds/linux/tree/v6.12/drivers/gpu/drm/radeon}{radeon} \href{https://github.com/torvalds/linux/tree/v6.12/drivers/gpu/drm/radeon}{radeon}
kernel module, and certainly no OpenGL or Direct3D. kernel module, and certainly no OpenGL or Direct3D.
@ -661,14 +663,45 @@ from scratch. I first implemented the rotation in GLSL:
\caption*{\texttt{cube\_rotate.vs.glsl}} \caption*{\texttt{cube\_rotate.vs.glsl}}
\end{figure} \end{figure}
I verified that the GLSL version worked as expected in OpenGL, then I translated \subsubsection{Remapping shader unit sin/cos operands}
the GLSL to R500 vertex shader assembly, as:
Because this shader program depends on being able to calculate sin and cos, this
meant I immediately needed to understand how to use the \texttt{ME\_SIN} and
\texttt{ME\_COS} operations.
The R500 vertex shader ME unit clamps sin/cos operands to the range
$(-\pi,+\pi)$, as in:
\begin{figure}
\href{diagrams/sin_clamp.pdf}{\includegraphics{diagrams/sin_clamp.pdf}}
\end{figure}
``Remapping'' floating point values from $(-\infty,+\infty)$ to $(-\pi,+\pi)$ is not
obvious. I was not previously aware of this transformation:
\begin{figure}
\href{diagrams/sin_frac.pdf}{\includegraphics{diagrams/sin_frac.pdf}}
\end{figure}
Or, expressed as R500 vertex shader assembly:
\begin{figure}
\href{verbatim/sin_operand_remap.vs.asm}{\includegraphics{verbatim/output/sin_operand_remap.vs.asm.pdf}}
\end{figure}
\subsubsection{Translation of the GLSL vertex shader to R500 vertex shader assembly}
Having verified that the GLSL version works as expected in OpenGL, and knowing
how to use the R500 vertex shader sin/cos operations, I then translated the GLSL
to R500 vertex shader assembly, as:
\begin{figure} \begin{figure}
\href{verbatim/cube_rotate.vs.asm}{\includegraphics{verbatim/output/cube_rotate.vs.asm.pdf}} \href{verbatim/cube_rotate.vs.asm}{\includegraphics{verbatim/output/cube_rotate.vs.asm.pdf}}
\caption*{\texttt{cube\_rotate.vs.asm}} \caption*{\texttt{cube\_rotate.vs.asm}}
\end{figure} \end{figure}
\subsubsection{Vertex shader assembler/code generator debugging}
However, when I first executed the vertex shader cube rotation demo, I found However, when I first executed the vertex shader cube rotation demo, I found
it did not work as expected: it did not work as expected:
@ -775,8 +808,8 @@ I've written several \href{https://github.com/buhman/scu-dsp-asm}{nice assembler
for other architectures in the past, but I've never seen any instruction set for other architectures in the past, but I've never seen any instruction set
as expressive as R500 fragment shaders. as expressive as R500 fragment shaders.
I attempted to directly reflect this ``multiple tiers of operand argument I attempted to directly represent this ``multiple tiers of operand argument
decoding'' in the syntax I invented for fragment shader ALU instructions. decoding'' in my fragment shader ALU instructions syntax.
These instructions are also vector instructions: a total of 24 floating point These instructions are also vector instructions: a total of 24 floating point
input operands and 8 floating results could be evaluated per instruction. input operands and 8 floating results could be evaluated per instruction.
@ -902,4 +935,426 @@ except:
The exponent/mantissa table that shows example 7-bit float values on page 106 of The exponent/mantissa table that shows example 7-bit float values on page 106 of
\href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf} is incorrect. \href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf} is incorrect.
\section{Progress: 26 Oct 2025}
From 21 Oct 2025 to 26 Oct 2025, I achieved the following (roughly in chronological order):
\begin{itemize}
\item I \href{https://git.idk.st/bilbo/r500/commit/8594bc4a38f6fcab2ac6e437b46bcf1e0e6d32dd}{rewrote} most of the vertex shader assembler parser/validator, and implemented support for \href{https://git.idk.st/bilbo/r500/commit/f3f1969f4a9b336536f5fb23d246f7103c41e20d}{assembling/disassembling ``dual math'' operations}
\item I implemented support for \href{https://git.idk.st/bilbo/r500/commit/96d7286e7cd3270b9dca0924d3a046d585d6dc9d}{assembling} and \href{https://git.idk.st/bilbo/r500/commit/27227426eaac265bc3126edd7d017c791640e789}{disassembling} TEX fragment shader instructions
\item I presented this project (including live demos on real hardware) at
a \href{https://itch.io/jam/spoopy-jam-7-heckraiser}{local in-person game jam event}
\end{itemize}
\subsection{Vertex shader optimization part 1: ``MOV'' elimination}
After talking about it in-person, I decided to try to golf my original
15-instruction
\href{https://git.idk.st/bilbo/r500/src/commit/c8ae311e60/drm/cube_rotate.vs.asm}{cube\_rotate.vs.asm} vertex shader.
The first opportunity for optimization is in the first two instructions of:
\begin{figure}
\href{verbatim/cube_rotate_const_move.vs.asm}{\includegraphics{verbatim/output/cube_rotate_const_move.vs.asm.pdf}}
\end{figure}
The \texttt{VE\_ADD} (being used here as a ``MOV'' instruction) is needed
because there is only a single 128-bit read port into \texttt{const} memory, so
a multiply-add like this is illegal:
\begin{figure}
\href{verbatim/cube_rotate_const_move_illegal.vs.asm}{\includegraphics{verbatim/output/cube_rotate_const_move_illegal.vs.asm.pdf}}
\end{figure}
I observed that because I never need to reference the last two constants in the
same instruction that references the first two constants, if I rearrange the
ordering of the constants to:
\begin{figure}
\href{verbatim/cube_rotate_const_move_rearrange.vs.asm}{\includegraphics{verbatim/output/cube_rotate_const_move_rearrange.vs.asm.pdf}}
\end{figure}
I can then rewrite the multiply-add instructions as:
\begin{figure}
\href{verbatim/cube_rotate_const_move_rearrange_mad.vs.asm}{\includegraphics{verbatim/output/cube_rotate_const_move_rearrange_mad.vs.asm.pdf}}
\end{figure}
\subsection{Vertex shader optimization part 2: ``dual math'' instructions}
I spent an entire day rewriting large portions of the vertex shader assembler to
add support for ``dual math'' instructions.
The original
\href{https://git.idk.st/bilbo/r500/src/commit/c8ae311e60/drm/cube_rotate.vs.asm}{cube\_rotate.vs.asm}
contains this sequence of \texttt{ME\_SIN}/\texttt{ME\_COS} instructions:
\begin{figure}
\href{verbatim/cube_rotate_sin_cos.vs.asm}{\includegraphics{verbatim/output/cube_rotate_sin_cos.vs.asm.pdf}}
\end{figure}
The \texttt{temp[3].x} and \texttt{temp[3].y} results are needed immediately,
but \texttt{temp[3].z} and \texttt{temp[3].w} are not needed until after the
first pair of \texttt{VE\_MUL}/\texttt{VE\_MAD} operations.
The dual math instruction mode replaces the 3rd \texttt{VE\_} instruction operand
with any \texttt{ME\_} operation, so it is only usable with 2-operand
\texttt{VE\_} instructions like \texttt{VE\_MUL}.
The dual math encoding also has several restrictions (it only has \nicefrac{1}{4} of the
control word bits compared to a normal \texttt{ME\_} instruction). A notable
restriction is that it must write to \texttt{alt\_temp}.
Unlike the fancy things that can be done with fragment shader
operands/sources/swizzles, a single vertex shader operand can also only read
from a single 128-bit register, so this means to be able to continue to access
\texttt{temp[3].zw} as a vector, both \texttt{z} and \texttt{w} must now be
stored in \texttt{alt\_temp}, even if only one of them was written by a ``dual
math'' instruction.
The change (and my newly-implemented dual math syntax) is:
\begin{figure}
\href{verbatim/cube_rotate_dual_math.vs.asm}{\includegraphics{verbatim/output/cube_rotate_dual_math.vs.asm.pdf}}
\end{figure}
Where the dual math instruction:
\begin{figure}
\href{verbatim/cube_rotate_dual_math_single_instruction.vs.asm}{\includegraphics{verbatim/output/cube_rotate_dual_math_single_instruction.vs.asm.pdf}}
\end{figure}
Is encoded by the assembler as a single instruction and is executed by the vertex
shader unit in a single clock cycle.
The final
\href{https://git.idk.st/bilbo/r500/src/commit/c8ae311e60/drm/cube_rotate_optimize.vs.asm}{cube\_rotate\_optimize.vs.asm}
was reduced from 15 instructions to 13 instructions (compared
to Mesa's R500 vertex shader compiler's 27 instructions).
\section{Progress: 29 Oct 2025}
From 27 Oct 2025 to 29 Oct 2025, I achieved the following (roughly in chronological order):
\begin{itemize}
\item I implemented support for \href{https://git.idk.st/bilbo/r500/commit/9aecbbfc6f297ea71c72f4c4fba1b8107be95ca1}{``multiple render targets''} in the fragment shader assembler
\item I wrote a \href{https://git.idk.st/bilbo/r500/src/commit/18b7a593bd/drm/texture_blur_horizontal.fs.asm}{gaussian blur fragment shader}
\item I made a demo that draws \href{https://git.idk.st/bilbo/r500/src/commit/18b7a593bd/drm/pumpkin_man.c#L963}{multiple 3D ``objects''} where each object's UV coordinates sample a \href{https://git.idk.st/bilbo/r500/src/commit/18b7a593bd/drm/pumpkin_man.c#L1029-L1069}{different} \href{https://git.idk.st/bilbo/r500/src/commit/18b7a593bd/drm/pumpkin_man.c#L314}{texture}
\item I did several experiments related to R500's Z-buffer implementation
\end{itemize}
\subsection{Z-buffer experiments}
\label{sec:z-buffer-experiments}
Though I produced a ``properly'' Z-buffered 3D cube demo previously, I felt I
did not fully understand the relationship between Z coordinates, W coordinates,
viewport transformations, and the actual values that are written to the
Z-buffer. At some point, I'd like to write fragment shaders that sample the
Z-buffer, so I feel I need to understand this more rigorously.
For comparison, Sega Dreamcast stores 32-bit floating-point values in the
``depth accumulation buffer''. This effectively means that any Z coordinates can
be stored in the depth accumulation buffer without scaling or range
remapping. I've made several
\href{https://az1.idk.st/public/20kdm2-demo.mp4}{moderately fancy} Dreamcast
demos that happily store arbitrary ``view space'' Z values in the depth
accumulation buffer without any visible depth aliasing/artifacts.
In contrast, the Radeon R500 does not have a 32-bit floating point Z-buffer
format. Instead, R500 supports (\href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf}, page 283,
\texttt{ZB\_FORMAT}):
\begin{itemize}
\item 16-bit integer Z
\item 16-bit floating point
\item 24-bit integer Z with 8-bit stencil
\end{itemize}
The third option, with the most bits, clearly ought to give the most
precision--with the caveat that the Z values that are written to the Z-buffer
should be scaled to be uniformly distributed across the range of 24-bit integers.
I performed several tests with variations of
\href{https://git.idk.st/bilbo/r500/src/branch/main/drm/zbuffer_test.c}{zbuffer\_test.c}. The
general strategy was:
\begin{itemize}
\item Define some contrived/illustrative 3D scene
\item Manipulate the scale/range of Z and W values
\item Observe the state of the Z-buffer after rendering
\end{itemize}
The first scene I chose was of a tilted plane that is non-coplanar with the view
space XY plane, as in:
\begin{figure}
\href{images/plane_scene.png}{\includegraphics{images/plane_scene.png}}
\caption*{Blender screenshot, ``plane scene''}
\end{figure}
Where the grey plane is the object that is to be rendered, the yellow lines
represent a ``camera'' from which the plane is to be viewed, and the blue line
represents the view/clip-space Z axis.
To view the content of the Z buffer, I wrote a
\href{https://git.idk.st/bilbo/r500/src/commit/18b7a593bd/tools/zbuf_decode.py}{simple script}
to convert the 24-bit integer Z-buffer to 16-bit
\href{https://en.wikipedia.org/wiki/Netpbm}{PGM},
so that it can be easily viewed in an image editor. This tool also shows the
minimum and maximum values found in the Z-buffer, intended to help verify that
the entire numeric range of the Z-buffer is being used.
While I expected to see the (orthographic, directly facing the camera) plane
drawn on the Z-buffer as a smooth gradient such as:
\begin{figure}
\href{images/z_buffer_gradient.png}{\includegraphics{images/z_buffer_gradient.png}}
\caption*{R500 framebuffer capture, \texttt{z\_buffer\_gradient.png}}
\end{figure}
Several of my tests displayed numeric aliasing, overflows, underflows, etc.:
\begin{figure}
\href{images/z_buffer_overflow.png}{\includegraphics{images/z_buffer_overflow.png}}
\caption*{R500 framebuffer capture, \texttt{z\_buffer\_overflow.png}}
\end{figure}
Of particular interest to me was to verify the behavior of the
\texttt{DX\_CLIP\_SPACE\_DEF} bit
(\href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf}, page
255--this is also the only place in the entire manual where ``non-user'' clip
planes are even defined), and to understand the order of pipeline operations.
I played with moving the plane around, to observe clipping behavior (here the
lower half of the scene was clipped due to intersecting the Z=+1.0 clip plane):
\begin{figure}
\href{images/z_buffer_clipped.png}{\includegraphics{images/z_buffer_clipped.png}}
\caption*{R500 framebuffer capture, \texttt{z\_buffer\_clipped.png}\\
(also simultaneously showing overflow/underflow artifacts)}
\end{figure}
Thinking at this point that I nearly understood most of the pieces, I then
re-enabled XY perspective division:
\begin{figure}
\href{images/z_buffer_perspective.png}{\includegraphics{images/z_buffer_perspective.png}}
\caption*{R500 framebuffer capture, \texttt{z\_buffer\_perspective.png}}
\end{figure}
The above image was not quite what I wanted: I noticed the range of the Z buffer
values was roughly between \texttt{0} and \texttt{8388607}, but what I really
wanted was \texttt{0} to \texttt{16777215}. Adjusting scale again produced this
Z-buffer:
\begin{figure}
\href{images/z_buffer_perspective_scale.png}{\includegraphics{images/z_buffer_perspective_scale.png}}
\caption*{R500 framebuffer capture, \texttt{z\_buffer\_perspective\_scale.png}}
\end{figure}
Up to this point, I was using \texttt{ZFUNC=GREATER} with a Z-buffer cleared
with an initial depth of zero, where all Z values are negative numbers.
I decided it might be more intuitive to use a Z-buffer that is cleared with an
initial depth of one, using \texttt{ZFUNC=LESS} instead where all Z values are
positive numbers.
With these adjustments, I captured a Z-buffer from the earlier cube demo:
\begin{figure}
\href{images/z_buffer_cube.png}{\includegraphics{images/z_buffer_cube.png}}
\caption*{R500 framebuffer capture, \texttt{z\_buffer\_cube.png}}
\end{figure}
This was still not quite ``correct'', because the minimum depth of the cube is
being drawn as \textasciitilde{}\texttt{2763306} (\textasciitilde{}0.16), but I expected
something closer to zero.
Adjusting my range/scale arithmetic again produced this image:
\begin{figure}
\href{images/z_buffer_cube_range.png}{\includegraphics{images/z_buffer_cube_range.png}}
\caption*{R500 framebuffer capture, \texttt{z\_buffer\_cube\_range.png}}
\end{figure}
The minimum Z value now appears to be closer to zero, but the ``back'' faces of
the cube (and maximum Z values) are not visible. Without changing any
scale/range constants, inverting \texttt{ZFUNC} and using a zero-initialized
Z-buffer produced this image of the back faces of the cube:
\begin{figure}
\href{images/z_buffer_cube_range_back.png}{\includegraphics{images/z_buffer_cube_range_back.png}}
\caption*{R500 framebuffer capture, \texttt{z\_buffer\_cube\_range\_back.png}}
\end{figure}
Indeed, the maximum Z value is close to \textasciitilde{}\texttt{16777215}
(\textasciitilde{}1.0), as intended. I feel at this point I have a better intuition
for using integer Z-buffers. The pipeline (and relevant registers) appears to be
something like this:
\begin{figure}
\includegraphics{diagrams/z_operations.svg}
\caption*{R500 Z transform pipeline (simplified)}
\end{figure}
Prior to these experiments, I was not aware \texttt{SU\_DEPTH\_SCALE} is the
thing directly responsible for scaling floating point Z values to the integer Z
values stored in the depth buffer.
In general, the hardware perspective divide, viewport transform, clipping, and
setup units are absolutely fascinating.
\subsection{3D perspective}
Despite making many 3D demos in the past, I feel that every time I want to
``draw something 3D'' on a new platform, I need to re-relearn 3D/perspective
transformations (perhaps because I never truly \textit{learned} anything).
In many OpenGL articles/tutorials/books the
\href{https://learnopengl.com/Getting-started/Coordinate-Systems}{standard}
\href{https://ogldev.org/www/tutorial12/tutorial12.html}{formula} for
\href{https://songho.ca/opengl/gl_projectionmatrix.html}{explaining}
\href{https://www.scratchapixel.com/lessons/3d-basic-rendering/perspective-and-orthographic-projection-matrix/opengl-perspective-projection-matrix.html}{perspective}
\href{https://learnwebgl.brown37.net/08_projections/projections_perspective.html}{projection}
appears to be:
\begin{itemize}
\item Begin with an overly-academic explanation of perspective in terms of camera optics and trigonometry
\item Do not implement or demonstrate any of the systems or mathematics
described in the preceding pages of explanations; instead abruptly hide all
magic behind \texttt{glm::perspective}
\item Refuse to explain or clarify further
\item Continue for the next 30 chapters/articles without ever revisiting focal
length, view frustums, depth of field, etc. again
\end{itemize}
It is sufficient to instead rationalize/implement ``perspective'' as:
\begin{quote}
Perspective is the division of X and Y coordinates by Z, where the coordinate
$(0, 0, 0)$ is the view origin (and the center of the screen/projection).
\end{quote}
Defining perspective this way also works for OpenGL, with some slight
adjustment, notably to deal with OpenGL's
\href{https://registry.khronos.org/OpenGL/specs/gl/glspec20.pdf}{definition} of
``normalized device coordinates''.
I note that (unlike Dreamcast) one can't actually divide by Z on R500 (nor
OpenGL), both because the VTE doesn't support this, and because the texture
unit doesn't support this. Of course, I tried it anyway:
\begin{figure}
\includegraphics{videos/cube_warped_textures.png}
\caption*{R500 DVI capture, \texttt{texture\_cube\_warping.c} \\
(unrelated to this demo, R500 also interestingly has a dedicated ``disable perspective-correct texture mapping'' bit)}
\end{figure}
Instead, in both cases, the R500 uses the W coordinate for division. This turns
out to be very convenient, because it means that the ``field of
view''/perspective scale (W) and the Z-buffer/depth test scale (Z) can be
adjusted independently.
\subsection{3D clipping}
Here are several examples of improperly scaled Z values, which are being clipped
by the setup unit:
\begin{figure}
\includegraphics{videos/cube_clipped_far.png}
\caption*{R500 DVI capture, \texttt{texture\_cube\_clear\_zwrite\_vertex\_shader\_optimize\_zscale.c} \\
(``far'' clip plane intersection)}
\end{figure}
\begin{figure}
\includegraphics{videos/cube_clipped_near.png}
\caption*{R500 DVI capture, \texttt{texture\_cube\_clear\_zwrite\_vertex\_shader\_optimize\_zscale.c} \\
(``near'' clip plane intersection)}
\end{figure}
\begin{figure}
\includegraphics{videos/cube_clipped_near_opengl.png}
\caption*{R500 DVI capture, \texttt{texture\_cube\_clear\_zwrite\_vertex\_shader\_optimize\_zscale.c} \\
(I am curious to learn under what circumstances the OpenGL designers thought\\ $-w_{c} < z_{c} < w_{c}$ was a good idea)}
\end{figure}
\section{Progress: 31 Oct 2025}
From 30 Oct 2025 to 31 Oct 2025, I achieved the following (non-chronological):
\begin{itemize}
\item I implemented a \href{https://git.idk.st/bilbo/r500/src/branch/main/drm/matrix_cubesphere_specular.fs.asm}{diffuse/specular lighting fragment shader} in R500 fragment shader assembly
\item I made vertex shaders that represent coordinate space transformations
using matrix multiplications rather than ad-hoc arithmetic
\item While writing demos that pass multiple (interpolated) vectors from the
vertex shader to the fragment shader, I learned more about \href{https://git.idk.st/bilbo/r500/src/commit/f43ac599f9/drm/matrix_cubesphere_specular_suzanne.cpp#L444-L512}{``rasterizer instructions''}
\item I made a demo that uses more than one texture for the entire scene
(by \href{https://git.idk.st/bilbo/r500/src/commit/f43ac599f9/drm/pumpkin_man.c#L272-L317}{reconfiguring
the texture unit for each ``object''})
\end{itemize}
\subsection{Lighting demo}
\begin{figure}
\includegraphics{videos/suzanne.png}
\caption*{R500 DVI capture, \texttt{matrix\_cubesphere\_specular\_suzanne.cpp} \\
(subdivided Suzanne mesh, 15,744 triangles)}
\end{figure}
Despite being a ``simple'' lighting demo, a surprising number of things need to
happen simultaneously before it becomes possible.
Where vertex shaders from previous demos were passed at most a single scalar
variable for animation/timing, the vertex shader in this demo uses
\href{https://git.idk.st/bilbo/r500/src/commit/f43ac599f9/drm/matrix_cubesphere_specular_suzanne.cpp#L301-L326}{10 vectors} as
input:
\begin{itemize}
\item 4 vectors for a ``local space to clip space'' transformation matrix
\item 4 vectors for a ``local space to world space'' transformation matrix (used for lighting)
\item 1 vector for a ``light position'' (in world space coordinates, used for lighting)
\item 1 vector for a ``view origin'' (in world space coordinates, used for lighting)
\end{itemize}
Additionally, where previous demos passed at most a single vector from the
vertex shader to the fragment shader (vertex color or texture coordinates), this
demo passes
\href{https://git.idk.st/bilbo/r500/src/commit/f43ac599f9/drm/matrix_cubesphere_specular_suzanne.cpp#L444-L512}{5 vectors}
from the vertex shader to the fragment shader, all of which are used
by the lighting calculation:
\begin{itemize}
\item world space position
\item world space normal
\item world space light position
\item world space view origin
\item uv space texture coordinates
\end{itemize}
\subsection{Learn algebra by writing fragment shader assembly}
Prior to today, I did not know about this transformation/equivalence:
\begin{gather*}
x^{n} = 2^{\left( n\cdot\frac{\log(x)}{\log(2)} \right)} \qquad (x > 0)
\end{gather*}
While the R500 fragment shader alpha unit does not have a \texttt{POW} operation,
it does have \href{https://git.idk.st/bilbo/r500/src/commit/f43ac599f9/drm/matrix_cubesphere_specular.fs.asm#L93-L99}{\texttt{EX2} and \texttt{LN2}}
operations.
For example, one could implement $a^{32}$ in R500 fragment shader assembly as:
\begin{figure}
\href{verbatim/pow_fragment_shader.fs.asm}{\includegraphics{verbatim/output/pow_fragment_shader.fs.asm.pdf}}
\end{figure}
This ``arbitrary exponents with arbitrary bases'' pattern is used in the
lighting demo fragment shader as part of the ``specular intensity'' calculation.
This fragment shader unit feature is very cool, because a software
implementation of a generalized floating-point \texttt{pow} function is
extremely
\href{https://git.musl-libc.org/cgit/musl/tree/src/math/powf.c?id=cb5c057c87240a9534f8e0d9b7ff2560082f6218}{computationally expensive}
otherwise.
\end{document} \end{document}

View File

@ -19,4 +19,4 @@ def transform():
lines = list(transform()) lines = list(transform())
with open(sys.argv[1], 'w') as f: with open(sys.argv[1], 'w') as f:
f.write('\n'.join(lines)) f.write(''.join(lines))

View File

@ -0,0 +1,8 @@
-- CONST[0] = {0.159155, 0.5, 6.283185, -3.141593}
-- CONST[1] = {theta1, theta2, 0.2, 0.5}
temp[0].xy = VE_ADD const[1].xy__ const[1].00__ ;
temp[0].xy = VE_MAD temp[0].xy__ const[0].xx__ const[0].yy__ ;
temp[0].xy = VE_FRC temp[0].xy__ ;
temp[0].xy = VE_MAD temp[0].xy__ const[0].zz__ const[0].ww__ ;

View File

@ -0,0 +1,3 @@
-- this is an illegal instruction:
-- const[1] and const[0] can not be read simultaneously
temp[0].xy = VE_MAD const[1].xy__ const[0].xx__ const[0].yy__ ;

View File

@ -0,0 +1,2 @@
-- CONST[0] = {theta1, theta2, 0.159155, 0.5}
-- CONST[1] = {6.283185, -3.141593, 0.2, 0.5}

View File

@ -0,0 +1,7 @@
-- the VE_ADD instruction is now not necessary/deleted:
-- temp[0].xy = VE_ADD const[1].xy__ const[1].00__ ;
-- const addresses and swizzles changed:
temp[0].xy = VE_MAD const[0].xy__ const[0].zz__ const[0].ww__ ;
temp[0].xy = VE_FRC temp[0].xy__ ;
temp[0].xy = VE_MAD temp[0].xy__ const[1].xx__ const[1].yy__ ;

View File

@ -0,0 +1,14 @@
temp[3].x = ME_SIN temp[0].___x ;
temp[3].y = ME_COS temp[0].___x ;
alt_temp[3].z = ME_SIN temp[0].___y ;
-- first rotation
temp[1].yz = VE_MUL input[0]._-zz_ temp[3]._xy_ ,
alt_temp[3].w = ME_COS temp[0].y_ ;
temp[1].xyz = VE_MAD input[0].xyy_ temp[3].1yx_ temp[1].0yz_ ;
-- second rotation
temp[2].xz = VE_MUL temp[1].-z_z_ alt_temp[3].z_w_ ;
temp[2].xyz = VE_MAD temp[1].xyx_ alt_temp[3].w1z_ temp[2].x0z_ ;

View File

@ -0,0 +1,2 @@
temp[1].yz = VE_MUL input[0]._-zz_ temp[3]._xy_ ,
alt_temp[3].w = ME_COS temp[0].y_ ;

View File

@ -0,0 +1,14 @@
temp[3].x = ME_SIN temp[0].___x ;
temp[3].y = ME_COS temp[0].___x ;
temp[3].z = ME_SIN temp[0].___y ;
temp[3].w = ME_COS temp[0].___y ;
-- first rotation
temp[1].yz = VE_MUL input[0]._-zz_ temp[3]._xy_ ;
temp[1].xyz = VE_MAD input[0].xyy_ temp[3].1yx_ temp[1].0yz_ ;
-- second rotation
temp[2].xz = VE_MUL temp[1].-z_z_ temp[3].z_w_ ;
temp[2].xyz = VE_MAD temp[1].xyx_ temp[3].w1z_ temp[2].x0z_ ;

View File

@ -0,0 +1,12 @@
-- a = log(a) / log(2)
src0.a = temp[0] :
temp[0].a = LN2 src0.a ;
-- a = a * 32.0 + 0
src0.a = temp[0] ,
src1.a = float(96) : -- 32.0 (or any other constant)
temp[0].a = MAD src0.a src1.a src1.0 ;
-- a = 2 ^ a
src0.a = temp[0] :
temp[0].a = EX2 src0.a ;

View File

@ -0,0 +1,6 @@
VAP_VTE_CNTL__VPORT_Z_SCALE_ENA(0)
VAP_VTE_CNTL__VPORT_Z_OFFSET_ENA(0)
VAP_VTE_CNTL__VTX_XY_FMT(1)
VAP_VTE_CNTL__VTX_Z_FMT(0)
VAP_VTE_CNTL__VTX_W0_FMT(1)
VAP_CNTL__DX_CLIP_SPACE_DEF(1)

View File

@ -0,0 +1,8 @@
-- CONST[0] = {0.159155, 0.5, 6.283185, -3.141593}
-- t = t * 0.159155 + 0.5
temp[0].xy = VE_MAD temp[0].xy__ const[0].xx__ const[0].yy__ ;
-- t = frac(t)
temp[0].xy = VE_FRC temp[0].xy__ ;
-- t = t * 6.283185 + -3.141593
temp[0].xy = VE_MAD temp[0].xy__ const[0].zz__ const[0].ww__ ;

BIN
videos/cube_clipped_far.mp4 Normal file

Binary file not shown.

BIN
videos/cube_clipped_far.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 127 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 122 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 142 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 212 KiB

BIN
videos/suzanne.mp4 Normal file

Binary file not shown.

BIN
videos/suzanne.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 77 KiB