add index
This commit is contained in:
parent
6d73be14cc
commit
ca4d7fb8ee
21
.gitignore
vendored
Normal file
21
.gitignore
vendored
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
*.html
|
||||||
|
*.css
|
||||||
|
*.out
|
||||||
|
index.pdf
|
||||||
|
_minted/
|
||||||
|
*.aux
|
||||||
|
*.log
|
||||||
|
*.4ct
|
||||||
|
*.4tc
|
||||||
|
*.dvi
|
||||||
|
*.idv
|
||||||
|
*.lg
|
||||||
|
*.tmp
|
||||||
|
*.toc
|
||||||
|
*.xref
|
||||||
|
*~
|
||||||
|
verbatim/*.tex
|
||||||
|
verbatim/*.svg
|
||||||
|
verbatim/*.pdf
|
||||||
|
verbatim/output
|
||||||
|
images/*.data
|
||||||
28
build.sh
Normal file
28
build.sh
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
#!/bin/sh
# Build the HTML rendering of index.tex with make4ht, then post-process the
# generated index.css, index.html and verbatim/output/*.svg files.
#
# Run from the directory that contains index.tex, replace_video.py and
# resize_svg.py. `sed -i` is used without a backup-suffix argument (GNU form).
set -eux

# Drop SVGs left over from an earlier run (presumably regenerated by the
# make4ht run below -- TODO confirm).
rm -f verbatim/output/*.svg

make4ht --shell-escape index.tex "pic-m,pic-equation,svg"

# Strip rules from the generated stylesheet: every line mentioning
# prefers-color-scheme, every line matching figure.figure (re-added with
# custom margins below), and the "color-scheme: light dark;" declaration.
sed -i \
  -e '/prefers-color-scheme/d' \
  -e '/figure.figure/d' \
  -e 's|color-scheme: light dark;||g' \
  index.css

# Append the local overrides. The quoted delimiter keeps the text literal.
cat >> index.css <<'EOF'
img[alt="PIC"] { width: 100%; }
.cmtt-10 { font-size: 0.9em; }
img[src="index3x.svg"] { height: 2.5em; }
figure.figure { margin-left: 20px; margin-right: 20px; }
pre.verbatim { font-size: 0.9em; }
figcaption.caption { margin-bottom: 1.3em; margin-top: 0.3em; }
EOF

# In the HTML: move a space that sits just inside a closing </span> to just
# after it, and point the page at the renamed stylesheet. The dot is escaped
# so that only the literal name "index.css" is rewritten.
sed -i \
  -e 's| </span>|</span> |g' \
  -e 's/index\.css/index2.css/g' \
  index.html

mv index.css index2.css

python replace_video.py index.html

for file in verbatim/output/*.svg; do
  # When nothing matches, the pattern is left as a literal string; skip it
  # rather than letting sed fail on a nonexistent file under `set -e`.
  [ -e "$file" ] || continue
  sed -i 's|rgb(0%, 0%, 100%)||g' "$file"
  python resize_svg.py "$file"
done
|
||||||
61
diagrams/fragment_inputs.dot
Normal file
61
diagrams/fragment_inputs.dot
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
// Graphviz source for diagrams/fragment_inputs.svg.
// Three clusters (W, Z, R) connected by straight, arrowless edges.
// Statement order is kept as-is because the graph sets ordering="in".
digraph D {
  graph [ranksep="1", splines=line, ordering="in"];
  node [shape=box];
  edge [arrowhead=none];

  // Cluster W: "addr" is drawn without a box and has no edges.
  subgraph cluster_W {
    addr [shape=none];

    temp;
    const;
    float;
  }

  // Cluster Z: "src" is drawn without a box; all members share one rank.
  subgraph cluster_Z {
    {
      rank=same;
      src [shape=none];
      src0 [label="src0"];
      src1 [label="src1"];
      src2 [label="src2"];
      srcp;
    }
  }

  // Cluster R: "opcode" is drawn without a box; all members share one rank.
  subgraph cluster_R {
    {
      rank=same;
      opcode [shape=none];
      a [label = "a"];
      b [label = "b"];
      c [label = "c"];
    }
  }

  // Cluster W -> cluster Z: temp, const and float each connect to
  // src0, src1 and src2 (srcp has no incoming edge).
  temp:s -> src0:n;
  temp:s -> src1:n;
  temp:s -> src2:n;

  const:s -> src0:n;
  const:s -> src1:n;
  const:s -> src2:n;

  float:s -> src0:n;
  float:s -> src1:n;
  float:s -> src2:n;

  // Cluster Z -> cluster R: src0, src1, src2 and srcp each connect to
  // a, b and c.
  src0:s -> a:n;
  src1:s -> a:n;
  src2:s -> a:n;
  srcp:s -> a:n;

  src0:s -> b:n;
  src1:s -> b:n;
  src2:s -> b:n;
  srcp:s -> b:n;

  src0:s -> c:n;
  src1:s -> c:n;
  src2:s -> c:n;
  srcp:s -> c:n;
}
|
||||||
205
diagrams/fragment_inputs.svg
Normal file
205
diagrams/fragment_inputs.svg
Normal file
@ -0,0 +1,205 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||||
|
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
|
||||||
|
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||||
|
<!-- Generated by graphviz version 12.2.1 (20241206.2353)
|
||||||
|
-->
|
||||||
|
<!-- Title: D Pages: 1 -->
|
||||||
|
<svg width="382pt" height="292pt"
|
||||||
|
viewBox="0.00 0.00 382.00 292.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
|
||||||
|
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 288)">
|
||||||
|
<title>D</title>
|
||||||
|
<polygon fill="white" stroke="none" points="-4,4 -4,-288 378,-288 378,4 -4,4"/>
|
||||||
|
<g id="clust1" class="cluster">
|
||||||
|
<title>cluster_W</title>
|
||||||
|
<polygon fill="none" stroke="black" points="8,-224 8,-276 294,-276 294,-224 8,-224"/>
|
||||||
|
</g>
|
||||||
|
<g id="clust2" class="cluster">
|
||||||
|
<title>cluster_Z</title>
|
||||||
|
<polygon fill="none" stroke="black" points="8,-116 8,-168 366,-168 366,-116 8,-116"/>
|
||||||
|
</g>
|
||||||
|
<g id="clust4" class="cluster">
|
||||||
|
<title>cluster_R</title>
|
||||||
|
<polygon fill="none" stroke="black" points="35,-8 35,-60 333,-60 333,-8 35,-8"/>
|
||||||
|
</g>
|
||||||
|
<!-- addr -->
|
||||||
|
<g id="node1" class="node">
|
||||||
|
<title>addr</title>
|
||||||
|
<text text-anchor="middle" x="43" y="-245.32" font-family="Times,serif" font-size="14.00">addr</text>
|
||||||
|
</g>
|
||||||
|
<!-- temp -->
|
||||||
|
<g id="node2" class="node">
|
||||||
|
<title>temp</title>
|
||||||
|
<polygon fill="none" stroke="black" points="142,-268 88,-268 88,-232 142,-232 142,-268"/>
|
||||||
|
<text text-anchor="middle" x="115" y="-245.32" font-family="Times,serif" font-size="14.00">temp</text>
|
||||||
|
</g>
|
||||||
|
<!-- src0 -->
|
||||||
|
<g id="node6" class="node">
|
||||||
|
<title>src0</title>
|
||||||
|
<polygon fill="none" stroke="black" points="214,-160 160,-160 160,-124 214,-124 214,-160"/>
|
||||||
|
<text text-anchor="middle" x="187" y="-137.32" font-family="Times,serif" font-size="14.00">src0</text>
|
||||||
|
</g>
|
||||||
|
<!-- temp->src0 -->
|
||||||
|
<g id="edge1" class="edge">
|
||||||
|
<title>temp:s->src0:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M115,-231C115,-231 187,-161 187,-161"/>
|
||||||
|
</g>
|
||||||
|
<!-- src1 -->
|
||||||
|
<g id="node7" class="node">
|
||||||
|
<title>src1</title>
|
||||||
|
<polygon fill="none" stroke="black" points="142,-160 88,-160 88,-124 142,-124 142,-160"/>
|
||||||
|
<text text-anchor="middle" x="115" y="-137.32" font-family="Times,serif" font-size="14.00">src1</text>
|
||||||
|
</g>
|
||||||
|
<!-- temp->src1 -->
|
||||||
|
<g id="edge2" class="edge">
|
||||||
|
<title>temp:s->src1:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M115,-231C115,-231 115,-161 115,-161"/>
|
||||||
|
</g>
|
||||||
|
<!-- src2 -->
|
||||||
|
<g id="node8" class="node">
|
||||||
|
<title>src2</title>
|
||||||
|
<polygon fill="none" stroke="black" points="286,-160 232,-160 232,-124 286,-124 286,-160"/>
|
||||||
|
<text text-anchor="middle" x="259" y="-137.32" font-family="Times,serif" font-size="14.00">src2</text>
|
||||||
|
</g>
|
||||||
|
<!-- temp->src2 -->
|
||||||
|
<g id="edge3" class="edge">
|
||||||
|
<title>temp:s->src2:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M115,-231C115,-231 259,-161 259,-161"/>
|
||||||
|
</g>
|
||||||
|
<!-- const -->
|
||||||
|
<g id="node3" class="node">
|
||||||
|
<title>const</title>
|
||||||
|
<polygon fill="none" stroke="black" points="214.12,-268 159.88,-268 159.88,-232 214.12,-232 214.12,-268"/>
|
||||||
|
<text text-anchor="middle" x="187" y="-245.32" font-family="Times,serif" font-size="14.00">const</text>
|
||||||
|
</g>
|
||||||
|
<!-- const->src0 -->
|
||||||
|
<g id="edge4" class="edge">
|
||||||
|
<title>const:s->src0:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M187,-231C187,-231 187,-161 187,-161"/>
|
||||||
|
</g>
|
||||||
|
<!-- const->src1 -->
|
||||||
|
<g id="edge5" class="edge">
|
||||||
|
<title>const:s->src1:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M187,-231C187,-231 115,-161 115,-161"/>
|
||||||
|
</g>
|
||||||
|
<!-- const->src2 -->
|
||||||
|
<g id="edge6" class="edge">
|
||||||
|
<title>const:s->src2:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M187,-231C187,-231 259,-161 259,-161"/>
|
||||||
|
</g>
|
||||||
|
<!-- float -->
|
||||||
|
<g id="node4" class="node">
|
||||||
|
<title>float</title>
|
||||||
|
<polygon fill="none" stroke="black" points="286,-268 232,-268 232,-232 286,-232 286,-268"/>
|
||||||
|
<text text-anchor="middle" x="259" y="-245.32" font-family="Times,serif" font-size="14.00">float</text>
|
||||||
|
</g>
|
||||||
|
<!-- float->src0 -->
|
||||||
|
<g id="edge7" class="edge">
|
||||||
|
<title>float:s->src0:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M259,-231C259,-231 187,-161 187,-161"/>
|
||||||
|
</g>
|
||||||
|
<!-- float->src1 -->
|
||||||
|
<g id="edge8" class="edge">
|
||||||
|
<title>float:s->src1:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M259,-231C259,-231 115,-161 115,-161"/>
|
||||||
|
</g>
|
||||||
|
<!-- float->src2 -->
|
||||||
|
<g id="edge9" class="edge">
|
||||||
|
<title>float:s->src2:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M259,-231C259,-231 259,-161 259,-161"/>
|
||||||
|
</g>
|
||||||
|
<!-- src -->
|
||||||
|
<g id="node5" class="node">
|
||||||
|
<title>src</title>
|
||||||
|
<text text-anchor="middle" x="43" y="-137.32" font-family="Times,serif" font-size="14.00">src</text>
|
||||||
|
</g>
|
||||||
|
<!-- a -->
|
||||||
|
<g id="node11" class="node">
|
||||||
|
<title>a</title>
|
||||||
|
<polygon fill="none" stroke="black" points="181,-52 127,-52 127,-16 181,-16 181,-52"/>
|
||||||
|
<text text-anchor="middle" x="154" y="-29.32" font-family="Times,serif" font-size="14.00">a</text>
|
||||||
|
</g>
|
||||||
|
<!-- src0->a -->
|
||||||
|
<g id="edge10" class="edge">
|
||||||
|
<title>src0:s->a:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M187,-123C187,-123 154,-53 154,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- b -->
|
||||||
|
<g id="node12" class="node">
|
||||||
|
<title>b</title>
|
||||||
|
<polygon fill="none" stroke="black" points="253,-52 199,-52 199,-16 253,-16 253,-52"/>
|
||||||
|
<text text-anchor="middle" x="226" y="-29.32" font-family="Times,serif" font-size="14.00">b</text>
|
||||||
|
</g>
|
||||||
|
<!-- src0->b -->
|
||||||
|
<g id="edge14" class="edge">
|
||||||
|
<title>src0:s->b:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M187,-123C187,-123 226,-53 226,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- c -->
|
||||||
|
<g id="node13" class="node">
|
||||||
|
<title>c</title>
|
||||||
|
<polygon fill="none" stroke="black" points="325,-52 271,-52 271,-16 325,-16 325,-52"/>
|
||||||
|
<text text-anchor="middle" x="298" y="-29.32" font-family="Times,serif" font-size="14.00">c</text>
|
||||||
|
</g>
|
||||||
|
<!-- src0->c -->
|
||||||
|
<g id="edge18" class="edge">
|
||||||
|
<title>src0:s->c:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M187,-123C187,-123 298,-53 298,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- src1->a -->
|
||||||
|
<g id="edge11" class="edge">
|
||||||
|
<title>src1:s->a:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M115,-123C115,-123 154,-53 154,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- src1->b -->
|
||||||
|
<g id="edge15" class="edge">
|
||||||
|
<title>src1:s->b:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M115,-123C115,-123 226,-53 226,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- src1->c -->
|
||||||
|
<g id="edge19" class="edge">
|
||||||
|
<title>src1:s->c:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M115,-123C115,-123 298,-53 298,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- src2->a -->
|
||||||
|
<g id="edge12" class="edge">
|
||||||
|
<title>src2:s->a:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M259,-123C259,-123 154,-53 154,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- src2->b -->
|
||||||
|
<g id="edge16" class="edge">
|
||||||
|
<title>src2:s->b:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M259,-123C259,-123 226,-53 226,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- src2->c -->
|
||||||
|
<g id="edge20" class="edge">
|
||||||
|
<title>src2:s->c:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M259,-123C259,-123 298,-53 298,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- srcp -->
|
||||||
|
<g id="node9" class="node">
|
||||||
|
<title>srcp</title>
|
||||||
|
<polygon fill="none" stroke="black" points="358,-160 304,-160 304,-124 358,-124 358,-160"/>
|
||||||
|
<text text-anchor="middle" x="331" y="-137.32" font-family="Times,serif" font-size="14.00">srcp</text>
|
||||||
|
</g>
|
||||||
|
<!-- srcp->a -->
|
||||||
|
<g id="edge13" class="edge">
|
||||||
|
<title>srcp:s->a:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M331,-123C331,-123 154,-53 154,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- srcp->b -->
|
||||||
|
<g id="edge17" class="edge">
|
||||||
|
<title>srcp:s->b:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M331,-123C331,-123 226,-53 226,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- srcp->c -->
|
||||||
|
<g id="edge21" class="edge">
|
||||||
|
<title>srcp:s->c:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M331,-123C331,-123 298,-53 298,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- opcode -->
|
||||||
|
<g id="node10" class="node">
|
||||||
|
<title>opcode</title>
|
||||||
|
<text text-anchor="middle" x="76" y="-29.32" font-family="Times,serif" font-size="14.00">opcode</text>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
</svg>
|
||||||
|
After Width: | Height: | Size: 7.5 KiB |
36
diagrams/vertex_inputs.dot
Normal file
36
diagrams/vertex_inputs.dot
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
// Graphviz source for diagrams/vertex_inputs.svg.
// Four unclustered nodes on top, connected by straight, arrowless edges to
// a, b and c in cluster R.
digraph D {
  graph [ranksep="1", splines=line];
  node [shape=box];
  edge [arrowhead=none];

  // Top row.
  input;
  const;
  temp;
  alt_temp;

  // Bottom row: "opcode" is drawn without a box and has no edges.
  opcode [shape=none];
  a [label = "a"];
  b [label = "b"];
  c [label = "c"];

  subgraph cluster_R {
    { rank=same; opcode; a; b; c; }
  }

  // Every top-row node connects to each of a, b and c.
  input:s -> a:n;
  input:s -> b:n;
  input:s -> c:n;

  const:s -> a:n;
  const:s -> b:n;
  const:s -> c:n;

  temp:s -> a:n;
  temp:s -> b:n;
  temp:s -> c:n;

  alt_temp:s -> a:n;
  alt_temp:s -> b:n;
  alt_temp:s -> c:n;
}
|
||||||
124
diagrams/vertex_inputs.svg
Normal file
124
diagrams/vertex_inputs.svg
Normal file
@ -0,0 +1,124 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||||
|
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
|
||||||
|
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||||
|
<!-- Generated by graphviz version 12.2.1 (20241206.2353)
|
||||||
|
-->
|
||||||
|
<!-- Title: D Pages: 1 -->
|
||||||
|
<svg width="366pt" height="168pt"
|
||||||
|
viewBox="0.00 0.00 366.12 168.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
|
||||||
|
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 164)">
|
||||||
|
<title>D</title>
|
||||||
|
<polygon fill="white" stroke="none" points="-4,4 -4,-164 362.12,-164 362.12,4 -4,4"/>
|
||||||
|
<g id="clust1" class="cluster">
|
||||||
|
<title>cluster_R</title>
|
||||||
|
<polygon fill="none" stroke="black" points="8,-8 8,-60 306,-60 306,-8 8,-8"/>
|
||||||
|
</g>
|
||||||
|
<!-- input -->
|
||||||
|
<g id="node1" class="node">
|
||||||
|
<title>input</title>
|
||||||
|
<polygon fill="none" stroke="black" points="118,-160 64,-160 64,-124 118,-124 118,-160"/>
|
||||||
|
<text text-anchor="middle" x="91" y="-137.32" font-family="Times,serif" font-size="14.00">input</text>
|
||||||
|
</g>
|
||||||
|
<!-- a -->
|
||||||
|
<g id="node6" class="node">
|
||||||
|
<title>a</title>
|
||||||
|
<polygon fill="none" stroke="black" points="154,-52 100,-52 100,-16 154,-16 154,-52"/>
|
||||||
|
<text text-anchor="middle" x="127" y="-29.32" font-family="Times,serif" font-size="14.00">a</text>
|
||||||
|
</g>
|
||||||
|
<!-- input->a -->
|
||||||
|
<g id="edge1" class="edge">
|
||||||
|
<title>input:s->a:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M91,-124C91,-124 127,-53 127,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- b -->
|
||||||
|
<g id="node7" class="node">
|
||||||
|
<title>b</title>
|
||||||
|
<polygon fill="none" stroke="black" points="226,-52 172,-52 172,-16 226,-16 226,-52"/>
|
||||||
|
<text text-anchor="middle" x="199" y="-29.32" font-family="Times,serif" font-size="14.00">b</text>
|
||||||
|
</g>
|
||||||
|
<!-- input->b -->
|
||||||
|
<g id="edge2" class="edge">
|
||||||
|
<title>input:s->b:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M91,-124C91,-124 199,-53 199,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- c -->
|
||||||
|
<g id="node8" class="node">
|
||||||
|
<title>c</title>
|
||||||
|
<polygon fill="none" stroke="black" points="298,-52 244,-52 244,-16 298,-16 298,-52"/>
|
||||||
|
<text text-anchor="middle" x="271" y="-29.32" font-family="Times,serif" font-size="14.00">c</text>
|
||||||
|
</g>
|
||||||
|
<!-- input->c -->
|
||||||
|
<g id="edge3" class="edge">
|
||||||
|
<title>input:s->c:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M91,-124C91,-124 271,-53 271,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- const -->
|
||||||
|
<g id="node2" class="node">
|
||||||
|
<title>const</title>
|
||||||
|
<polygon fill="none" stroke="black" points="190.12,-160 135.88,-160 135.88,-124 190.12,-124 190.12,-160"/>
|
||||||
|
<text text-anchor="middle" x="163" y="-137.32" font-family="Times,serif" font-size="14.00">const</text>
|
||||||
|
</g>
|
||||||
|
<!-- const->a -->
|
||||||
|
<g id="edge4" class="edge">
|
||||||
|
<title>const:s->a:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M163,-124C163,-124 127,-53 127,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- const->b -->
|
||||||
|
<g id="edge5" class="edge">
|
||||||
|
<title>const:s->b:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M163,-124C163,-124 199,-53 199,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- const->c -->
|
||||||
|
<g id="edge6" class="edge">
|
||||||
|
<title>const:s->c:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M163,-124C163,-124 271,-53 271,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- temp -->
|
||||||
|
<g id="node3" class="node">
|
||||||
|
<title>temp</title>
|
||||||
|
<polygon fill="none" stroke="black" points="262,-160 208,-160 208,-124 262,-124 262,-160"/>
|
||||||
|
<text text-anchor="middle" x="235" y="-137.32" font-family="Times,serif" font-size="14.00">temp</text>
|
||||||
|
</g>
|
||||||
|
<!-- temp->a -->
|
||||||
|
<g id="edge7" class="edge">
|
||||||
|
<title>temp:s->a:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M235,-124C235,-124 127,-53 127,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- temp->b -->
|
||||||
|
<g id="edge8" class="edge">
|
||||||
|
<title>temp:s->b:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M235,-124C235,-124 199,-53 199,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- temp->c -->
|
||||||
|
<g id="edge9" class="edge">
|
||||||
|
<title>temp:s->c:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M235,-124C235,-124 271,-53 271,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- alt_temp -->
|
||||||
|
<g id="node4" class="node">
|
||||||
|
<title>alt_temp</title>
|
||||||
|
<polygon fill="none" stroke="black" points="358.12,-160 279.88,-160 279.88,-124 358.12,-124 358.12,-160"/>
|
||||||
|
<text text-anchor="middle" x="319" y="-137.32" font-family="Times,serif" font-size="14.00">alt_temp</text>
|
||||||
|
</g>
|
||||||
|
<!-- alt_temp->a -->
|
||||||
|
<g id="edge10" class="edge">
|
||||||
|
<title>alt_temp:s->a:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M319,-124C319,-124 127,-53 127,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- alt_temp->b -->
|
||||||
|
<g id="edge11" class="edge">
|
||||||
|
<title>alt_temp:s->b:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M319,-124C319,-124 199,-53 199,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- alt_temp->c -->
|
||||||
|
<g id="edge12" class="edge">
|
||||||
|
<title>alt_temp:s->c:n</title>
|
||||||
|
<path fill="none" stroke="black" d="M319,-124C319,-124 271,-53 271,-53"/>
|
||||||
|
</g>
|
||||||
|
<!-- opcode -->
|
||||||
|
<g id="node5" class="node">
|
||||||
|
<title>opcode</title>
|
||||||
|
<text text-anchor="middle" x="49" y="-29.32" font-family="Times,serif" font-size="14.00">opcode</text>
|
||||||
|
</g>
|
||||||
|
</g>
|
||||||
|
</svg>
|
||||||
|
After Width: | Height: | Size: 4.6 KiB |
905
index.tex
Normal file
905
index.tex
Normal file
@ -0,0 +1,905 @@
|
|||||||
|
\documentclass[20pt]{article}
|
||||||
|
|
||||||
|
\usepackage[font=small,labelfont=bf]{caption}
|
||||||
|
\usepackage{hyperref}
|
||||||
|
\hypersetup{
|
||||||
|
colorlinks=true,
|
||||||
|
linkcolor=blue,
|
||||||
|
filecolor=magenta,
|
||||||
|
urlcolor=cyan,
|
||||||
|
pdftitle={Radeon R500},
|
||||||
|
pdfpagemode=FullScreen,
|
||||||
|
}
|
||||||
|
|
||||||
|
\usepackage{graphicx}
|
||||||
|
\graphicspath{ {./images/} }
|
||||||
|
|
||||||
|
\usepackage{minted}
|
||||||
|
|
||||||
|
\title{Radeon R500}
|
||||||
|
\date{}
|
||||||
|
|
||||||
|
\begin{document}
|
||||||
|
|
||||||
|
\maketitle
|
||||||
|
\href{images/x1950xt.jpg}{\includegraphics{images/x1950xt.jpg}}
|
||||||
|
|
||||||
|
\tableofcontents
|
||||||
|
|
||||||
|
\section{Introduction}
|
||||||
|
|
||||||
|
The primary/minimal project goal is ``draw a triangle on a Radeon R500 via direct
|
||||||
|
memory-mapped hardware register and texture memory accesses''. This means no
|
||||||
|
\href{https://mesa3d.org/}{Mesa}, no
|
||||||
|
\href{https://github.com/torvalds/linux/tree/v6.12/drivers/gpu/drm/radeon}{radeon}
|
||||||
|
kernel module, and certainly no OpenGL or Direct3D.
|
||||||
|
|
||||||
|
I have worked directly with several other graphics units in the past
|
||||||
|
(\href{https://github.com/buhman/saturn-examples}{Saturn VDP1},
|
||||||
|
\href{https://github.com/buhman/dreamcast}{Dreamcast Holly},
|
||||||
|
\href{https://github.com/buhman/voodoo}{Voodoo 2}). In all of these projects,
|
||||||
|
my strategy is generally:
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\item read the entire \href{doc/R5xx_Acceleration_v1.5.pdf}{reference
|
||||||
|
documentation} at least once, front-to-back
|
||||||
|
\item copy all hardware register definitions from the documentation to a
|
||||||
|
spreadsheet or text file (sometimes typing everything by hand if I am in such
|
||||||
|
a chill mood)
|
||||||
|
\item progressively build increasingly-complex example programs that exercise
|
||||||
|
the hardware
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
The rabbit hole for R500 seems significantly deeper, considering this is the
|
||||||
|
first graphics unit I've worked with that has programmable vertex and pixel
|
||||||
|
shader engines.
|
||||||
|
|
||||||
|
\subsection{Hardware}
|
||||||
|
|
||||||
|
For testing, I currently have this hardware configuration:
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\item ASUS P4B-LX (Intel 845) motherboard
|
||||||
|
\item Intel Pentium 4 2.6GHz SL6PP (Northwood)
|
||||||
|
\item 1024 MB RAM
|
||||||
|
\item 32GB PATA SSD
|
||||||
|
\item ATI Radeon X1650 PRO 512MB AGP
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
I also have the X1950 XT PCIe shown in the photo, which amazingly has never been
|
||||||
|
used, and prior to the photo had been sealed in an antistatic bag since it was
|
||||||
|
manufactured.
|
||||||
|
|
||||||
|
\subsection{Test setup}
|
||||||
|
|
||||||
|
While in my other (video game console) projects I typically insist on
|
||||||
|
``bare-metal'' development with no operating system or third-party library
|
||||||
|
running on the target hardware, my experience with x86 is much more limited.
|
||||||
|
|
||||||
|
While it is something I am interested in doing, I believe creating a
|
||||||
|
zero-dependency ``code upload'' mechanism for an x86 PC that does not depend on
|
||||||
|
an operating system would severely delay my progress on R500-specific work.
|
||||||
|
|
||||||
|
For my initial exploration of R500, I will instead be manipulating the hardware
|
||||||
|
primarily from Linux kernel space. This Linux kernel code does not actually
|
||||||
|
meaningfully depend on Linux APIs beyond calling \texttt{ioremap} to get usable
|
||||||
|
memory mappings for R500 PCI resources (texture/framebuffer memory and
|
||||||
|
registers).
|
||||||
|
|
||||||
|
\section{Progress: 07 Oct 2025}
|
||||||
|
|
||||||
|
From 01 Oct 2025 to 07 Oct 2025, I achieved the following:
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\item I wrote a reasonably complete AtomBIOS disassembler
|
||||||
|
\item I can disable (IBM PC) VGA mode and manipulate the native framebuffer
|
||||||
|
\item I can upload microcode to the ``command processor'', and I can write to
|
||||||
|
scratch registers via command processor packets (this is, not coincidentally, the
|
||||||
|
same command processor test that the radeon kernel module does).
|
||||||
|
\item I stepped through Mesa functions as invoked by a simple OpenGL
|
||||||
|
application, and created \href{mesa/glDrawArrays.txt}{a list of R500
|
||||||
|
registers/values} that are written by Mesa during \texttt{glDrawArrays}.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
I did not achieve the following:
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\item I attempted to manipulate the R500 register state and command processor
|
||||||
|
into drawing a triangle, but I have not been successful yet
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\subsection{Documentation}
|
||||||
|
|
||||||
|
In general, I note that the R500 documentation is significantly weaker than I
|
||||||
|
hoped, and does not contain enough information to draw a triangle on the R500
|
||||||
|
from the documentation alone (with no prior knowledge about previous Radeon
|
||||||
|
graphics units).
|
||||||
|
|
||||||
|
In addition to the lack of prose, in several cases I've noticed both Mesa and
|
||||||
|
Linux reference R500 registers that are
|
||||||
|
\href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/pci/undocumented_3d_registers.h}{not
|
||||||
|
present at all} in the documentation.
|
||||||
|
|
||||||
|
\subsection{AtomBIOS}
|
||||||
|
|
||||||
|
AtomBIOS physically exists as a section inside the ROM on R500 graphics units.
|
||||||
|
AtomBIOS is notably used for setting PLL/pixel clock frequencies and display
|
||||||
|
resolutions, among several other functions.
|
||||||
|
|
||||||
|
The Radeon graphics hardware itself does not execute AtomBIOS code--instead, it
|
||||||
|
is expected that the host (e.g., x86) CPU evaluate the instructions in the
|
||||||
|
AtomBIOS command tables. Generally the outcome of evaluating AtomBIOS code is
|
||||||
|
that several ``register write'' instructions will be executed, changing the
|
||||||
|
state of the graphics unit.
|
||||||
|
|
||||||
|
My original goal in studying AtomBIOS was that I thought I would need it to set
|
||||||
|
up the R500 display controller to a reasonable state (as a prerequisite for
|
||||||
|
drawing 3D graphics). However, after actually experimenting with ``disable VGA
|
||||||
|
mode'', I currently believe that I don't actually need to implement
|
||||||
|
resolution/mode changes, and can proceed without it.
|
||||||
|
|
||||||
|
\subsection{PIO mode}
|
||||||
|
|
||||||
|
The Linux kernel exclusively communicates with R500 via ``PCI bus mastering''.
|
||||||
|
A ``ring buffer'' is allocated in ``GTT'' space, which from the graphics unit's
|
||||||
|
perspective exists in the same address space as framebuffer memory, but is an
|
||||||
|
address that is outside the framebuffer memory that physically exists.
|
||||||
|
|
||||||
|
I also observed via debugfs that the GTT apparently involves some sort of sparse
|
||||||
|
page mapping, but I don't understand how this works from an x86 perspective.
|
||||||
|
|
||||||
|
In the absence of an understanding of how to make my own ``GTT'' address space,
|
||||||
|
I attempted to operate the R500 in ``PIO'' mode. This has the advantage of being
|
||||||
|
able to simply write to registers via (simple) PCI memory-mapped accesses, but
|
||||||
|
it has the disadvantage that Linux doesn't use R500 this way, so I have no
|
||||||
|
reference implementation for how PIO mode should be used.
|
||||||
|
|
||||||
|
\subsection{Triangle drawing attempt \#1}
|
||||||
|
|
||||||
|
I translated my \href{mesa/glDrawArrays.txt}{glDrawArrays notes} to
|
||||||
|
\href{https://git.idk.st/bilbo/r500/src/commit/b6472e4c16946f44e02d82f31adaa411df009c67/pci/triangle.c}{equivalent
|
||||||
|
register writes}.
|
||||||
|
|
||||||
|
This does not work, and I don't yet understand why. The main issue is that most
|
||||||
|
of the time when I execute that code, Linux appears to ``hang'' completely, and
|
||||||
|
my ``printk'' messages are never sent over ssh. On the rare occasion when the
|
||||||
|
``hang'' does not occur, a triangle is nevertheless not drawn on the
|
||||||
|
framebuffer.
|
||||||
|
|
||||||
|
I have a few ideas for how to proceed:
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\item Move the ``triangle.c'' register accesses to userspace via
|
||||||
|
\texttt{/sys/bus/pci}, which might improve debuggability
|
||||||
|
\item Abandon the ``write a kernel module'' idea completely, and instead
|
||||||
|
interact with the R500 via \href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/radeon_drv.c#L565-L577}{radeon DRM ioctls}
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
The latter is perhaps both the most attractive, and the most work. I currently
|
||||||
|
don't have any understanding of GEM buffers, radeon buffer objects, etc., so
|
||||||
|
I'd need to study these in more detail.
|
||||||
|
|
||||||
|
\section{Progress: 14 Oct 2025}
|
||||||
|
|
||||||
|
From 08 Oct 2025 to 14 Oct 2025, I achieved the following:
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\item I studied how Mesa interacts with the \texttt{radeon} kernel module via
|
||||||
|
\texttt{DRM\_RADEON\_} ioctls.
|
||||||
|
\item I wrote simple R500 \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs/pvs_disassemble.py}{vertex shader} and \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs/us_disassemble.py}{pixel shader} disassemblers.
|
||||||
|
\item I wrote a \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs/parse_packets.py}{tool} to print R500 ``PM4'' packets in human-readable form.
|
||||||
|
\item I laboriously \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs/bits}{copied and reformatted} all bit definitions from \href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf}
|
||||||
|
\item I wrote \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/regs}{several other miscellaneous tools} related to register and bit parsing and manipulation.
|
||||||
|
\item I wrote two \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/single_color.c}{humble} \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/vertex_color.c}{demos} to draw a triangle on R500.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\subsection{Radeon DRM}
|
||||||
|
|
||||||
|
As implied in the last update, primarily due to my lack of experience with
|
||||||
|
bare-metal x86, I decided it would be a better approach to interact with R500
|
||||||
|
Command Processor via the \texttt{radeon} kernel module, which provides a
|
||||||
|
partially reasonable interface for this via the \texttt{DRM\_RADEON\_CS} ioctl.
|
||||||
|
|
||||||
|
All \texttt{DRM\_RADEON\_} ioctls are mostly or entirely undocumented. Instead,
|
||||||
|
I built debugging symbols for Mesa and other supporting libraries so that I
|
||||||
|
could set breakpoints in GDB to observe what sequences of \texttt{DRM\_RADEON\_}
|
||||||
|
ioctls Mesa uses.
|
||||||
|
|
||||||
|
From my previous \href{mesa/glDrawArrays.txt}{glDrawArrays notes} observations,
|
||||||
|
I noticed this strange sequence:
|
||||||
|
|
||||||
|
\begin{verbatim}
|
||||||
|
0x0000138a // type 0 packet, count=0, starting offset = RB3D_COLOROFFSET0
|
||||||
|
0x00000000 // RB3D_COLOROFFSET0 = 0
|
||||||
|
0xc0001000 // type 3 packet, count=0, opcode=NOP
|
||||||
|
0x00000000 // zero (meaningless data)
|
||||||
|
\end{verbatim}
|
||||||
|
|
||||||
|
At first, it seemed Mesa was deliberately setting the colorbuffer write address
|
||||||
|
to (VRAM address) zero, which seemed like a strange choice considering I am
|
||||||
|
debugging an X11/GLX OpenGL application--surely the colorbuffer address would be
|
||||||
|
some non-zero value several megabytes after the beginning of VRAM.
|
||||||
|
|
||||||
|
I later attempted to send my own PM4 packet via \texttt{DRM\_RADEON\_CS}. This
|
||||||
|
initial attempt returned \texttt{Invalid argument}, with the following
|
||||||
|
message in dmesg:
|
||||||
|
|
||||||
|
\begin{verbatim}
|
||||||
|
[ 1205.978993] [drm:radeon_cs_packet_next_reloc [radeon]] *ERROR* No packet3 for relocation for packet at 14.
|
||||||
|
[ 1205.979427] [drm] ib[14]=0x0000138E
|
||||||
|
[ 1205.979433] [drm] ib[15]=0x00C00640
|
||||||
|
[ 1205.979437] [drm:r300_packet0_check [radeon]] *ERROR* No reloc for ib[13]=0x4E28
|
||||||
|
[ 1205.979545] [drm] ib[12]=0x0000138A
|
||||||
|
[ 1205.979548] [drm] ib[13]=0x00000000
|
||||||
|
[ 1205.979553] [drm:radeon_cs_ioctl [radeon]] *ERROR* Invalid command stream !
|
||||||
|
\end{verbatim}
|
||||||
|
|
||||||
|
This error message comes from
|
||||||
|
\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r300.c#L664-L669}{drm/radeon/r300.c}.
|
||||||
|
|
||||||
|
The meaningless data following the type-3 NOP packet is used by the kernel to
|
||||||
|
\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/radeon_cs.c#L875-L889}{index}
|
||||||
|
the \texttt{DRM\_RADEON\_CS} ``relocs'' array (an array of GEM buffer handles).
|
||||||
|
|
||||||
|
It seems perhaps the design goal was to never expose the VRAM address of GEM
|
||||||
|
buffers to userspace (indeed there seems to be no way to retrieve that via any
|
||||||
|
GEM ioctls). This restriction is slightly disappointing, as I would have
|
||||||
|
preferred to be able to send unmodified packet data to the R500.
|
||||||
|
|
||||||
|
However, at the moment this does not appear to be a significant issue, as a
|
||||||
|
relatively small number of registers are modified by the Linux kernel's packet
|
||||||
|
parser prior to creating the indirect buffer that is actually sent to the R500
|
||||||
|
hardware.
|
||||||
|
|
||||||
|
\subsection{Indirect buffers}
|
||||||
|
|
||||||
|
There appears to be a lot of memory-to-memory copying in the
|
||||||
|
Linux/Mesa/DRM/GEM/radeon graphics stack:
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\item Mesa writes the OpenGL state to various internal structures
|
||||||
|
\item Mesa \href{https://gitlab.freedesktop.org/mesa/mesa/-/blob/25.0/src/gallium/drivers/r300/r300_emit.c?ref_type=heads}{copies} OpenGL state to packet commands in a userspace buffer
|
||||||
|
\item Mesa
|
||||||
|
\href{https://gitlab.freedesktop.org/mesa/mesa/-/blob/25.0/src/gallium/winsys/radeon/drm/radeon_drm_cs.c?ref_type=heads#L486-487}{passes
|
||||||
|
the address} of the userspace buffer to the kernel via
|
||||||
|
\texttt{DRM\_RADEON\_CS}
|
||||||
|
\item Linux
|
||||||
|
\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/radeon_cs.c#L340-L358}{copies
|
||||||
|
the entire userspace buffer} to kernel space (calling kvmalloc/kvfree on
|
||||||
|
each ioctl)
|
||||||
|
\item The \texttt{radeon\_cs\_parser} parses and modifies the buffer originally
|
||||||
|
generated by Mesa
|
||||||
|
\item \href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/radeon_cs.c#L613}{radeon\_cs\_ib\_fill} copies the parser result to gpu address space.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
Eventually,
|
||||||
|
\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r100.c#L3709-L3722}{r100\_ring\_ib\_execute}
|
||||||
|
is called, which writes the indirect buffer address (now in GPU address space)
|
||||||
|
to the ring.
|
||||||
|
|
||||||
|
It would be interesting to experiment with writing a packet buffer directly in
|
||||||
|
GPU/GTT address space (from Linux userspace), with zero copies. This would
|
||||||
|
require an entirely new set of ioctls.
|
||||||
|
|
||||||
|
\subsection{Triangle drawing attempt \#2}
|
||||||
|
|
||||||
|
These images were never drawn on-screen. I extracted them from VRAM via
|
||||||
|
\texttt{/sys/kernel/debug/radeon\_vram}.
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{images/single_color_macrotiled.png}{\includegraphics{images/single_color_macrotiled.png}}
|
||||||
|
\caption*{R500 framebuffer capture, \texttt{single\_color.c}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
Though I was not aware of it yet, the above image was indeed my triangle, and
|
||||||
|
\texttt{COLORPITCH0} was merely in ``macrotiled'' mode. Once I realized this, I
|
||||||
|
produced this image (still in off-screen VRAM):
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{images/single_color.png}{\includegraphics{images/single_color.png}}
|
||||||
|
\caption*{R500 framebuffer capture, \texttt{single\_color.c}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
This \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/single_color.c}{``single color''} demo deliberately uses the very simple vertex and fragment
|
||||||
|
shaders:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\begin{verbatim}
|
||||||
|
instruction[0]:
|
||||||
|
0x00f00203 dst: VE_ADD out[0].xyzw
|
||||||
|
0x00d10001 src0: input[0].xyzw
|
||||||
|
0x01248001 src1: input[0].0000
|
||||||
|
0x01248001 src2: input[0].0000
|
||||||
|
\end{verbatim}
|
||||||
|
\caption*{R500 vertex shader (1 instruction, 128-bit control word)}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
This vertex shader is doing the equivalent of:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{verbatim/vertex_shader_equivalent_single_color.glsl}{\includegraphics{verbatim/output/vertex_shader_equivalent_single_color.glsl.pdf}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
The W component \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/single_color.c#L339}{comes from}
|
||||||
|
\texttt{VAP\_PROG\_STREAM\_CNTL\_EXT\_\_SWIZZLE\_SELECT\_W\_0(5)}, which
|
||||||
|
swizzles W to a constant \texttt{1.0}, despite W not being present in the vertex
|
||||||
|
data.
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\begin{verbatim}
|
||||||
|
instruction[0]:
|
||||||
|
0x00078005 OUT RGBA
|
||||||
|
0x08020080 RGB ADDR0=0.0 ADDR1=0.0 ADDR2=0.0
|
||||||
|
0x08020080 ALPHA ADDR0=0.0 ADDR1=0.0 ADDR2=0.0
|
||||||
|
0x1c9b04d8 RGB_SEL_A=src0.110 RGB_SEL_B=src0.110 TARGET=A
|
||||||
|
0x1c810003 ALPHA_OP=OP_MAX ALPHA_SEL_A=src0.0 ALPHA_SEL_B=src0.0 TARGET=A
|
||||||
|
0x00000005 RGB_OP=OP_MAX
|
||||||
|
\end{verbatim}
|
||||||
|
\caption*{R500 fragment shader (1 instruction, 192-bit control word)}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
This fragment shader is doing the equivalent of:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{verbatim/fragment_shader_equivalent_single_color.glsl}{\includegraphics{verbatim/output/fragment_shader_equivalent_single_color.glsl.pdf}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
via the src swizzles. I think it is interesting that there are so many options
|
||||||
|
for producing inline constants within the fragment shader.
|
||||||
|
|
||||||
|
The ``target'' fragment shader field also seems interesting. I am excited to
|
||||||
|
write shaders that use multiple output buffers.
|
||||||
|
|
||||||
|
\subsection{DRM/KMS/GBM}
|
||||||
|
|
||||||
|
These renders were not displayed on-screen, so I looked for ways to correct
|
||||||
|
this.
|
||||||
|
|
||||||
|
Perhaps the most obvious method would be to write to the display controller
|
||||||
|
registers (\texttt{D1GRPH\_PRIMARY\_SURFACE\_ADDRESS}) via
|
||||||
|
\texttt{DRM\_RADEON\_CS}. However, this does not work due to the command parser
|
||||||
|
anti-fun implemented in
|
||||||
|
\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r300.c#L643}{r300\_packet0\_check}:
|
||||||
|
any register not present in that case statement is considered invalid, and the
|
||||||
|
packet buffer is not submitted.
|
||||||
|
|
||||||
|
I attempted to do this the ``right way'' via the DRM/KMS/GBM APIs. I then
|
||||||
|
learned that this does not behave correctly on my R500 because demos that wait
|
||||||
|
for the flag returned by \texttt{DRM\_IOCTL\_MODE\_PAGE\_FLIP} hang forever.
|
||||||
|
|
||||||
|
I noticed this earlier on Xorg/GLX as well, as I have been using the
|
||||||
|
\texttt{vblank\_mode=0} environment variable to avoid hanging forever in
|
||||||
|
\texttt{glXSwapBuffers}. This appears to be a Linux kernel bug, but I didn't
|
||||||
|
investigate this further.
|
||||||
|
|
||||||
|
\subsection{On-screen drawing}
|
||||||
|
|
||||||
|
I noticed in \texttt{/sys/kernel/debug/radeon\_vram\_mm} that the Linux console
|
||||||
|
is only using a single framebuffer (and does not double-buffer).
|
||||||
|
|
||||||
|
This is fortunate, because this means I can simply
|
||||||
|
\href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/pci_user/main.c#L48}{mmap
|
||||||
|
the register address space} and write
|
||||||
|
\texttt{D1GRPH\_PRIMARY\_SURFACE\_ADDRESS} myself without worrying about the
|
||||||
|
Linux console overwriting my change. I observed the \texttt{0x813000} value from
|
||||||
|
\texttt{/sys/kernel/debug/radeon\_vram\_mm}--there appears to be no other way to
|
||||||
|
get the vram address of a GEM buffer.
|
||||||
|
|
||||||
|
This is ``good enough'' for now, though at some point I'll want to learn how to
|
||||||
|
do proper vblank-synchronized double buffering.
|
||||||
|
|
||||||
|
\subsection{Triangle drawing attempt \#3}
|
||||||
|
|
||||||
|
I felt the next logical step was to learn how attributes and constants are
|
||||||
|
passed through the shader pipeline, so I then \href{https://git.idk.st/bilbo/r500/src/commit/95e9ba85ae/drm/vertex_color.c}{created a demo} that produced this image (this time also displayed on-screen):
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{images/vertex_color.png}{\includegraphics{images/vertex_color.png}}
|
||||||
|
\caption*{R500 framebuffer capture, \texttt{vertex\_color.c}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\begin{verbatim}
|
||||||
|
instruction[0]:
|
||||||
|
0x00702203 dst: VE_ADD out[1].xyz_
|
||||||
|
0x01d10021 src0: input[1].xyz_
|
||||||
|
0x01248021 src1: input[1].0000
|
||||||
|
0x01248021 src2: input[1].0000
|
||||||
|
instruction[1]:
|
||||||
|
0x00f00203 dst: VE_ADD out[0].xyzw
|
||||||
|
0x01510001 src0: input[0].xyz1
|
||||||
|
0x01248001 src1: input[0].0000
|
||||||
|
0x01248001 src2: input[0].0000
|
||||||
|
\end{verbatim}
|
||||||
|
\caption*{R500 vertex shader (2 instructions, 128-bit control words)}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
This vertex shader is doing the equivalent of:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{verbatim/vertex_shader_equivalent_vertex_color.glsl}{\includegraphics{verbatim/output/vertex_shader_equivalent_vertex_color.glsl.pdf}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
The extra vertex input is fed to the vertex shader via changes to
|
||||||
|
\texttt{VAP\_PROG\_STREAM\_CNTL\_0},
|
||||||
|
\texttt{VAP\_PROG\_STREAM\_CNTL\_EXT\_0}. Based on my currently limited
|
||||||
|
understanding, it seems that arranging the vertex data like this:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{verbatim/vap_prog_stream_vertices.c}{\includegraphics{verbatim/output/vap_prog_stream_vertices.c.pdf}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
is easier to deal with in \texttt{VAP\_PROG\_STREAM\_CNTL} than:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{verbatim/vap_prog_stream_vertices2.c}{\includegraphics{verbatim/output/vap_prog_stream_vertices2.c.pdf}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\begin{verbatim}
|
||||||
|
instruction[0]:
|
||||||
|
0x00078005 OUT RGBA
|
||||||
|
0x08020000 RGB ADDR0=temp[0] ADDR1=0.0 ADDR2=0.0
|
||||||
|
0x08020080 ALPHA ADDR0=0.0 ADDR1=0.0 ADDR2=0.0
|
||||||
|
0x1c440220 RGB_SEL_A=src0.rgb RGB_SEL_B=src0.rgb TARGET=A
|
||||||
|
0x1cc18003 ALPHA_OP=OP_MAX ALPHA_SEL_A=src0.1 ALPHA_SEL_B=src0.1 TARGET=A
|
||||||
|
0x00000005 RGB_OP=OP_MAX
|
||||||
|
\end{verbatim}
|
||||||
|
\caption*{R500 fragment shader (1 instruction, 192-bit control word)}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
This fragment shader is doing the equivalent of:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{verbatim/fragment_shader_equivalent_vertex_color.glsl}{\includegraphics{verbatim/output/fragment_shader_equivalent_vertex_color.glsl.pdf}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
The \texttt{temp} input appears to be written by
|
||||||
|
\texttt{VAP\_OUT\_VTX\_FMT\_0\_\_VTX\_COLOR\_0\_PRESENT} and read due to the
|
||||||
|
changes to \texttt{RS\_COUNT} and \texttt{RS\_INST\_0}.
|
||||||
|
|
||||||
|
\section{Progress: 21 Oct 2025}
|
||||||
|
|
||||||
|
From 15 Oct 2025 to 21 Oct 2025, I achieved the following (roughly in chronological order):
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\item I learned how the vertex fetcher is \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/vertex_color_aos.c#L387-L401}{configured}
|
||||||
|
\item I learned how the ``point list'' drawing primitive can be used to \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear.c#L504}{clear the screen}
|
||||||
|
\item I invented a new syntax for R500 vertex shader assembly (ATI never specified one themselves)
|
||||||
|
\item I modified my R500 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/pvs_disassemble.py}{vertex shader disassembler} to emit this new vertex shader syntax
|
||||||
|
\item I wrote an R500 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/assembler/vs}{vertex shader assembler} that can process my vertex shader assembly syntax
|
||||||
|
\item I created several animated demos with \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear_translate_vblank.c#L849-L859}{vblank-synchronized double buffering}
|
||||||
|
\item I learned how to configure and draw (multi-)textured triangles
|
||||||
|
\item I learned how to configure, clear, and use Z-buffers
|
||||||
|
\item I made a \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/texture_cube_clear_zwrite_vertex_shader.c}{textured rotating cube demo} that uses my first non-trivial \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/cube_rotate.vs.asm}{handwritten vertex shader assembly program}
|
||||||
|
\item I invented a new syntax for R500 fragment shader assembly (ATI never specified one themselves)
|
||||||
|
\item I wrote a new R500 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/us_disassemble2.py}{fragment shader disassembler} that emits this new fragment shader syntax
|
||||||
|
\item I wrote an R500 \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/assembler/fs}{fragment shader assembler} that can process my fragment shader assembly syntax
|
||||||
|
\item I wrote a ``shadertoy''-style demo that uses my first non-trivial \href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/shadertoy_palette.fs.asm}{handwritten fragment shader assembly program}
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\subsection{DRM\_RADEON\_CS state tracking}
|
||||||
|
|
||||||
|
While attempting to refactor one of my R500 demos to send fewer registers per
|
||||||
|
\texttt{DRM\_RADEON\_CS} ioctl, I found that there is a ``state tracker'' within
|
||||||
|
\texttt{drm/radeon/r100.c}. For example, even if you don't use or depend on a
|
||||||
|
Z-buffer, \texttt{DRM\_RADEON\_CS} will still reject your packet buffer
|
||||||
|
depending on its own (imagined) concept of what the GPU state is. For example:
|
||||||
|
|
||||||
|
\begin{verbatim}
|
||||||
|
[ 1614.729278] [drm:r100_cs_track_check [radeon]] *ERROR* [drm] No buffer for z buffer !
|
||||||
|
[ 1614.729626] [drm:radeon_cs_ioctl [radeon]] *ERROR* Invalid command stream !
|
||||||
|
\end{verbatim}
|
||||||
|
|
||||||
|
This happens because \texttt{track->z\_enabled} is
|
||||||
|
\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r100.c#L2435}{initially
|
||||||
|
true} at the start of a \texttt{DRM\_RADEON\_CS} ioctl, and does not become
|
||||||
|
false unless the packet buffer
|
||||||
|
\href{https://github.com/torvalds/linux/blob/v6.12/drivers/gpu/drm/radeon/r300.c#L836-L843}{contains
|
||||||
|
a write} to \texttt{ZB\_CNTL}.
|
||||||
|
|
||||||
|
This seems a bit heavy-handed. Even if the model were ``multiple applications
|
||||||
|
may be using the GPU, so a single application can't depend on previously-set
|
||||||
|
register state'', it would still be better if the kernel didn't try to enforce
|
||||||
|
this by restricting permissible content of a packet buffer.
|
||||||
|
|
||||||
|
\subsection{Vertex transform bypass}
|
||||||
|
|
||||||
|
Mesa uses a ``point'' 3D primitive to implement \texttt{glClear} on R500. It
|
||||||
|
does this by first uploading this vertex shader:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{verbatim/mesa_glclear.vs.asm}{\includegraphics{verbatim/output/mesa_glclear.vs.asm.pdf}}
|
||||||
|
\caption*{\texttt{mesa\_glclear.vs.asm}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
This shader does nothing to the input other than copy it to the output, where
|
||||||
|
\texttt{out[0]} is the position vector, and \texttt{out[1]} is sent to the
|
||||||
|
fragment shader as a ``texture coordinate''. That fragment shader, in turn, does
|
||||||
|
not use the texture coordinate:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{verbatim/mesa_glclear.fs.asm}{\includegraphics{verbatim/output/mesa_glclear.fs.asm.pdf}}
|
||||||
|
\caption*{\texttt{mesa\_glclear.fs.asm}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
In my ``clear''
|
||||||
|
\href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear_rotate_vblank.c#L539}{implementation},
|
||||||
|
I instead set \texttt{PVS\_BYPASS}, which ``bypasses'' the vertex shader
|
||||||
|
completely, sending the vertices directly to the rasterizer. This is convenient
|
||||||
|
because it obviates the need to upload/change vertex shaders just to clear the
|
||||||
|
color and Z-buffers.
|
||||||
|
|
||||||
|
\subsection{Animation attempt \#1}
|
||||||
|
|
||||||
|
With a working colorbuffer clear, I wrote the
|
||||||
|
\href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear_translate.c#L786}{single\_color\_clear\_translate.c}
|
||||||
|
demo to translate my triangle position coordinates in a loop that waits for
|
||||||
|
\texttt{DRM\_RADEON\_GEM\_WAIT\_IDLE} between each frame. This attempt
|
||||||
|
produced the following images:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\includegraphics{videos/single_color_clear_translate.png}
|
||||||
|
\caption*{R500 DVI capture, \texttt{single\_color\_clear\_translate.c}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
This was intended to be a smooth animation, yet it is not. It also seems several
|
||||||
|
frames are never being displayed--the translation step is much smaller than what
|
||||||
|
is shown in the video.
|
||||||
|
|
||||||
|
This, interestingly, is exactly identical to how OpenGL/GLX applications behave
|
||||||
|
on R500 with \texttt{vblank\_mode=0}.
|
||||||
|
|
||||||
|
\subsection{Animation attempt \#2}
|
||||||
|
|
||||||
|
I read the R500 display controller \href{doc/RRG-216M56-03oOEM.pdf}{register reference guide} again.
|
||||||
|
It appears to suggest the \texttt{D1CRTC\_UPDATE\_INSTANTLY} bit, when unset,
|
||||||
|
might cause changes to \texttt{D1GRPH\_PRIMARY\_SURFACE\_ADDRESS} to be delayed in
|
||||||
|
hardware until the next vertical blanking interval begins.
|
||||||
|
|
||||||
|
This can be combined with polling \texttt{D1GRPH\_SURFACE\_UPDATE\_PENDING} to
|
||||||
|
later determine when the vblank-synchronized frame change actually occurred.
|
||||||
|
|
||||||
|
This is precisely what I implemented in
|
||||||
|
\href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/single_color_clear_translate_vblank.c#L854-L855}{single\_color\_clear\_translate\_vblank.c}:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\includegraphics{videos/single_color_clear_translate_vblank.png}
|
||||||
|
\caption*{R500 DVI capture, \texttt{single\_color\_clear\_translate\_vblank.c}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
This is much closer to what I intended. The
|
||||||
|
\texttt{D1GRPH\_SURFACE\_UPDATE\_PENDING} part is certainly working as I
|
||||||
|
expected. Setting/unsetting \texttt{D1CRTC\_UPDATE\_INSTANTLY} appears to have
|
||||||
|
no effect on \texttt{D1GRPH\_PRIMARY\_SURFACE\_ADDRESS} behavior, so I feel my
|
||||||
|
understanding of R500 double-buffering is still incomplete.
|
||||||
|
|
||||||
|
\subsection{Multiple-texture sampling}
|
||||||
|
|
||||||
|
I am amazed and delighted by how simple multiple-texture sampling is on R500.
|
||||||
|
|
||||||
|
As a counter-example, while Sega Dreamcast does have a fairly capable
|
||||||
|
fixed-function blending unit, to use the blending unit with multiple-texture
|
||||||
|
sampled polygons one needs to render the polygon multiple times (at least once
|
||||||
|
per texture) to an accumulation buffer. Blending is then performed between the
|
||||||
|
currently-sampled texture and the previously-accumulated result, and the blend
|
||||||
|
result is written to the accumulation buffer. From a vertex transformation
|
||||||
|
perspective, it can be inconvenient/inefficient to be required to buffer entire
|
||||||
|
triangle strips so that they can be submitted more than once per frame without
|
||||||
|
duplicating the clip/transform computations.
|
||||||
|
|
||||||
|
This is the fragment shader for
|
||||||
|
\href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/drm/texture_dual.c}{texture\_dual.c}
|
||||||
|
(disassembly of code originally generated by Mesa):
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{verbatim/texture_dual.fs.asm}{\includegraphics{verbatim/output/texture_dual.fs.asm.pdf}}
|
||||||
|
\caption*{\texttt{texture\_dual.fs.asm}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
This pre-subtract multiply-add is an algebraic rearrangement of this GLSL code:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{verbatim/texture_dual.fs.glsl}{\includegraphics{verbatim/output/texture_dual.fs.glsl.pdf}}
|
||||||
|
\caption*{\texttt{texture\_dual.fs.glsl}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
Which produces this image:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{images/texture_dual.png}{\includegraphics{images/texture_dual.png}}
|
||||||
|
\caption*{R500 framebuffer capture, \texttt{texture\_dual.c}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
Being able to manipulate the texture samples as fragment shader unit temporaries
|
||||||
|
rather than as a sequence of accumulation buffer operations has me feeling excited
|
||||||
|
to do more with this.
|
||||||
|
|
||||||
|
\subsection{Z-buffer clear}
|
||||||
|
|
||||||
|
I've never worked with traditional Z-buffers before--Sega Saturn uses
|
||||||
|
\href{https://en.wikipedia.org/wiki/Painter\%27s_algorithm}{painter's algorithm}
|
||||||
|
exclusively, and Sega Dreamcast uses a ``depth accumulation buffer''
|
||||||
|
that isn't directly readable/writable.
|
||||||
|
|
||||||
|
It is slightly obvious in retrospect, but it took me several minutes to realize
|
||||||
|
that a ``depth clear'' can be implemented by covering the entire screen with a
|
||||||
|
``point'' primitive with the desired initial depth while \texttt{ZFUNC} is set to
|
||||||
|
\texttt{ALWAYS}.
|
||||||
|
|
||||||
|
\subsection{Drawing a 3D cube}
|
||||||
|
|
||||||
|
With working double-buffering, Z-buffering, and the ability to clear each of
|
||||||
|
these every frame, I felt I was finally ready to draw something ``3D''.
|
||||||
|
|
||||||
|
I thought it would be fun to first start with a cube that is transformed in
|
||||||
|
``software'' on the x86 CPU (not using a vertex shader). This sequence of videos
|
||||||
|
shows my progression on implementing this:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\includegraphics{videos/texture_cube.png}
|
||||||
|
\caption*{R500 DVI capture, \texttt{texture\_cube.c}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\includegraphics{videos/texture_cube_clear.png}
|
||||||
|
\caption*{R500 DVI capture, \texttt{texture\_cube\_clear.c}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\includegraphics{videos/texture_cube_clear_zwrite.png}
|
||||||
|
\caption*{R500 DVI capture, \texttt{texture\_cube\_clear\_zwrite.c}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\subsection{Drawing a 3D cube with vertex shaders}
|
||||||
|
|
||||||
|
I then decided it would be fun to hand-write a ``3D rotation'' vertex shader
|
||||||
|
from scratch. I first implemented the rotation in GLSL:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{verbatim/cube_rotate.vs.glsl}{\includegraphics{verbatim/output/cube_rotate.vs.glsl.pdf}}
|
||||||
|
\caption*{\texttt{cube\_rotate.vs.glsl}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
I verified that the GLSL version worked as expected in OpenGL, then I translated
|
||||||
|
the GLSL to R500 vertex shader assembly, as:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{verbatim/cube_rotate.vs.asm}{\includegraphics{verbatim/output/cube_rotate.vs.asm.pdf}}
|
||||||
|
\caption*{\texttt{cube\_rotate.vs.asm}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
However, when I first executed the vertex shader cube rotation demo, I found
|
||||||
|
it did not work as expected:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\includegraphics{videos/texture_cube_clear_zwrite_vertex_shader_incorrect.png}
|
||||||
|
\caption*{R500 DVI capture, \texttt{texture\_cube\_clear\_zwrite\_vertex\_shader.c}\\(incorrect vertex shader assembler output)}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
After hours of debugging, I eventually found the issue was in this instruction:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{verbatim/cube_rotate_3_temp.vs.asm}{\includegraphics{verbatim/output/cube_rotate_3_temp.vs.asm.pdf}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf} briefly mentions this on pages 98 and 99:
|
||||||
|
|
||||||
|
\begin{quote}
|
||||||
|
The PVS\_DST\_MACRO\_INST bit was meant to be used for MACROS such as a
|
||||||
|
vector-matrix multiply, but currently is only set for the following cases:
|
||||||
|
|
||||||
|
A VE\_MULTIPLY\_ADD or VE\_MULTIPLYX2\_ADD instruction with all 3 source
|
||||||
|
operands using unique PVS\_REG\_TEMPORARY vector addresses. Since R300 only has
|
||||||
|
two read ports on the temporary memory, this special case of these instructions
|
||||||
|
is broken up (by the HW) into 2 operations.
|
||||||
|
\end{quote}
|
||||||
|
|
||||||
|
I read this paragraph much earlier, but I didn't fully understand it until
|
||||||
|
now. Indeed, this multiply-add has three unique \texttt{temp} addresses, and
|
||||||
|
must be encoded as a ``macro'' instruction.
|
||||||
|
|
||||||
|
I fixed this in my vertex shader assembler by
|
||||||
|
\href{https://git.idk.st/bilbo/r500/src/commit/91f83bdaa8/regs/assembler/vs/validator.py}{counting the number of unique temp addresses}
|
||||||
|
referenced by each instruction, promoting \texttt{VE\_MULTIPLY\_ADD} to
|
||||||
|
\texttt{PVS\_MACRO\_OP\_2CLK\_MADD} if more than two unique \texttt{temp}
|
||||||
|
addresses are referenced.
|
||||||
|
|
||||||
|
With this change, reassembling the same vertex shader source code now produces a
|
||||||
|
correct vertex shader cube rotation:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\includegraphics{videos/texture_cube_clear_zwrite_vertex_shader.png}
|
||||||
|
\caption*{R500 DVI capture, \texttt{texture\_cube\_clear\_zwrite\_vertex\_shader.c}\\(correct vertex shader assembler output)}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\subsection{Comparison with Mesa's R500 vertex shader compiler}
|
||||||
|
|
||||||
|
My ``cube rotation'' vertex shader,
|
||||||
|
\href{https://git.idk.st/bilbo/r500/src/commit/50244c7c95/drm/cube_rotate.vs.asm}{cube\_rotate.vs.asm}
|
||||||
|
is 15 instructions.
|
||||||
|
|
||||||
|
Mesa's R500 vertex shader compiler generated a
|
||||||
|
\href{https://git.idk.st/bilbo/r500/src/commit/50244c7c95/shader_examples/mesa/texture_cube_depth_vertex_shader.vs.txt}{27-instruction vertex shader}
|
||||||
|
from \href{https://r500.idk.st/verbatim/cube_rotate.vs.glsl}{semantically equivalent GLSL code}. Disassembly:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{verbatim/mesa_cube_rotate.vs.asm}{\includegraphics{verbatim/output/mesa_cube_rotate.vs.asm.pdf}}
|
||||||
|
\caption*{\texttt{mesa\_cube\_rotate.vs.asm}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
I was not particularly trying to write concise code, but I find this difference
|
||||||
|
in instruction count to be surprising. In general it seems Mesa's R500 vertex
|
||||||
|
shader compiler failed to vectorize several operations, and does significantly
|
||||||
|
more scalar multiplies and scalar multiply-adds than my implementation.
|
||||||
|
|
||||||
|
Ignoring algorithmic improvements (such as lifting the sin/cos calculation to
|
||||||
|
x86 code and instead sending a 4x4 matrix to the vertex shader), there is still
|
||||||
|
more opportunity for optimization beyond my 15-instruction implementation.
|
||||||
|
|
||||||
|
Particularly, the vertex shader unit has a ``dual math'' instruction mode, where
|
||||||
|
``vector engine'' (VE\_) and ``math engine'' (ME\_) operations can be executed
|
||||||
|
simultaneously in the same instruction. \texttt{cube\_rotate.vs.asm} would
|
||||||
|
indeed benefit from such an optimization--most of the \texttt{ME\_SIN} and
|
||||||
|
\texttt{ME\_COS} instructions could be interleaved with the \texttt{VE\_MUL} and
|
||||||
|
\texttt{VE\_MAD} operations that follow (at significant expense to
|
||||||
|
human-readability).
|
||||||
|
|
||||||
|
I am curious to see more examples of the difference between Mesa's R500 vertex
|
||||||
|
shader compiler output and my own vertex shader assembly.
|
||||||
|
|
||||||
|
\subsection{Fragment shader instruction expressiveness}
|
||||||
|
|
||||||
|
Compared to the R500 vertex shader instructions, the R500 fragment shader
|
||||||
|
instructions are significantly more featureful. This makes inventing a syntax
|
||||||
|
that can fully express the range of operations that an R500 fragment shader
|
||||||
|
instruction can do more complex.
|
||||||
|
|
||||||
|
A significant difference is where R500 vertex shaders have a single tier of
|
||||||
|
operand argument decoding, as in:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\includegraphics{diagrams/vertex_inputs.svg}
|
||||||
|
\caption*{R500 vertex shader instruction operand inputs (simplified)}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
While R500 fragment shaders have multiple tiers of operand argument decoding, as
|
||||||
|
in:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\includegraphics{diagrams/fragment_inputs.svg}
|
||||||
|
\caption*{R500 fragment shader instruction operand inputs (simplified)}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
I've written several \href{https://github.com/buhman/scu-dsp-asm}{nice assemblers}
|
||||||
|
for other architectures in the past, but I've never seen any instruction set
|
||||||
|
as expressive as R500 fragment shaders.
|
||||||
|
|
||||||
|
I attempted to directly reflect this ``multiple tiers of operand argument
|
||||||
|
decoding'' in the syntax I invented for fragment shader ALU instructions.
|
||||||
|
|
||||||
|
These instructions are also vector instructions: a total of 24 floating point
|
||||||
|
input operands and 8 floating point results could be evaluated per instruction.
|
||||||
|
|
||||||
|
With this abundance of expressiveness and a relatively high skill ceiling, I'm
|
||||||
|
amazed R500 fragment shader assembly isn't more popular in programming
|
||||||
|
competitions, general everyday conversation, etc...
|
||||||
|
|
||||||
|
\subsection{Fragment shader assembler bugs}
|
||||||
|
|
||||||
|
There were two ``I spent a lot of time debugging this'' issues I encountered
|
||||||
|
with my fragment shader assembler.
|
||||||
|
|
||||||
|
The first was in this code I wrote to draw a fragment shaded circle, as in:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{images/shadertoy_circle.png}{\includegraphics{images/shadertoy_circle.png}}
|
||||||
|
\caption*{R500 framebuffer capture, \texttt{shadertoy\_circle.fs.asm}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
However, in an earlier version of my fragment shader assembler, I produced this
|
||||||
|
image instead:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{images/shadertoy_circle_incorrect.png}{\includegraphics{images/shadertoy_circle_incorrect.png}}
|
||||||
|
\caption*{R500 framebuffer capture, \texttt{shadertoy\_circle.fs.asm}\\(incorrect assembler output)}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
In this handwritten fragment shader code:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{verbatim/shadertoy_circle.fs.asm}{\includegraphics{verbatim/output/shadertoy_circle.fs.asm.pdf}}
|
||||||
|
\caption*{\texttt{shadertoy\_circle.fs.asm}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf} says briefly on page 241:
|
||||||
|
|
||||||
|
\begin{quote}
|
||||||
|
Specifies whether to insert a NOP instruction after this. This would get
|
||||||
|
specified in order to meet dependency requirements for the pre-subtract inputs,
|
||||||
|
and dependency requirements for src0 of an MDH/MDV instruction.
|
||||||
|
\end{quote}
|
||||||
|
|
||||||
|
The issue is the pre-subtract input for the \texttt{MAD |srcp.a| src0.1 -src2.a}
|
||||||
|
instruction depends on the write to \texttt{temp[0].a} from the immediately
|
||||||
|
preceding \texttt{RCP src0.a} instruction--a pipeline hazard.
|
||||||
|
|
||||||
|
To fix this, I added support for
|
||||||
|
\href{https://git.idk.st/bilbo/r500/commit/fe0684ca5e58ed3be026410812c042e883bdce71}{generating the \texttt{NOP} bit}
|
||||||
|
in my fragment shader assembler.
|
||||||
|
|
||||||
|
\subsection{More fragment shader assembler bugs}
|
||||||
|
|
||||||
|
While trying to produce this image:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{images/shadertoy_palette.png}{\includegraphics{images/shadertoy_palette.png}}
|
||||||
|
\caption*{R500 framebuffer capture, \texttt{shadertoy\_palette.fs.asm}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
My fragment shader code instead produced this image:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{images/shadertoy_palette_incorrect.png}{\includegraphics{images/shadertoy_palette_incorrect.png}}
|
||||||
|
\caption*{R500 framebuffer capture, \texttt{shadertoy\_palette.fs.asm}\\(incorrect assembler output)}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
The issue was simply that in the chaos of all of the other features I was
|
||||||
|
implementing for my fragment shader assembler, I
|
||||||
|
\href{https://git.idk.st/bilbo/r500/commit/f6a0fc4fab5dee3085dcf4b9a984244bba05d5ca}{forgot to emit the \texttt{ADDRD} bits}.
|
||||||
|
|
||||||
|
This meant that while fragment shader code that exclusively uses zero-address
|
||||||
|
destinations, such as \texttt{shadertoy\_circle.fs.asm}, appeared to work
|
||||||
|
completely correctly, I encountered this bug as soon as I started using non-zero
|
||||||
|
addresses such as \texttt{temp[1]} in my fragment shader code.
|
||||||
|
|
||||||
|
\subsection{Comparison to Direct3D ``asm''}
|
||||||
|
|
||||||
|
Prior to Direct3D 10, Microsoft defined a specification for both
|
||||||
|
\href{https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx9-graphics-reference-asm-vs-3-0}{vertex shader assembly} and
|
||||||
|
\href{https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx9-graphics-reference-asm-ps-3-0}{fragment shader assembly}.
|
||||||
|
|
||||||
|
The Direct3D ``asm'' name is slightly deceptive, however, as the
|
||||||
|
\texttt{vs\_3\_0} and \texttt{ps\_3\_0} instruction syntax does not map 1-to-1
|
||||||
|
with any hardware that exists.
|
||||||
|
|
||||||
|
It would perhaps be more accurate to think of Direct3D's ``asm''
|
||||||
|
language and compiler as more analogous to a
|
||||||
|
\href{https://en.wikipedia.org/wiki/BASIC}{shader BASIC} than as a true assembly
|
||||||
|
language on the same level as ``6502 assembly'', ``Z80 assembly'' and similar.
|
||||||
|
|
||||||
|
In contrast, my R500 assembly syntaxes are deliberately/explicitly mapped 1-to-1
|
||||||
|
with R500 instructions.
|
||||||
|
|
||||||
|
\subsection{Fragment shader animated demo}
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\includegraphics{videos/shadertoy_palette.png}
|
||||||
|
\caption*{R500 DVI capture, \texttt{shadertoy\_palette.fs.asm}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
The R500 fragment shader code that I handwrote for this is:
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\href{verbatim/shadertoy_palette.fs.asm}{\includegraphics{verbatim/output/shadertoy_palette.fs.asm.pdf}}
|
||||||
|
\caption*{\texttt{shadertoy\_palette.fs.asm}}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
The \texttt{float} constants are interesting--they are decoded almost
|
||||||
|
identically to the
|
||||||
|
\href{https://en.wikipedia.org/wiki/Minifloat#8-bit_(1.4.3)}{8-bit (1.4.3) (bias 7) format shown on Wikipedia},
|
||||||
|
except:
|
||||||
|
\begin{itemize}
|
||||||
|
\item There is no sign bit (the value is always positive--positive values
|
||||||
|
can be swizzled to produce negative operands)
|
||||||
|
\item There is no ``zero'' value (zero can instead be obtained via
|
||||||
|
swizzles); the ``all zeros'' bit pattern instead has a value of
|
||||||
|
\texttt{0.0009765625}.
|
||||||
|
\item There are no infinite or not-a-number values: a ``15'' exponent is treated
|
||||||
|
as 15.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
The exponent/mantissa table that shows example 7-bit float values on page 106 of
|
||||||
|
\href{doc/R5xx_Acceleration_v1.5.pdf}{R5xx\_Acceleration\_v1.5.pdf} is incorrect.
|
||||||
|
|
||||||
|
\end{document}
|
||||||
197
mesa/glDrawArrays.txt
Normal file
197
mesa/glDrawArrays.txt
Normal file
@ -0,0 +1,197 @@
|
|||||||
|
// _mesa_draw_arrays
|
||||||
|
|
||||||
|
// r300_draw_vbo
|
||||||
|
|
||||||
|
// r300_draw_arrays_immediate
|
||||||
|
|
||||||
|
vertex_size = 3
|
||||||
|
dwords = 13
|
||||||
|
|
||||||
|
// r300_prepare_for_rendering
|
||||||
|
// r300_emit_states
|
||||||
|
// r300_reserve_cs_dwords
|
||||||
|
389
|
||||||
|
// r300_emit_dirty_state
|
||||||
|
// r300_emit_gpu_flush
|
||||||
|
SC_SCISSOR0 = 0
|
||||||
|
SC_SCISSOR1 = (width - 1), (height - 1) // 600, 600
|
||||||
|
// cb_flush_clean
|
||||||
|
RB3D_DSTCACHE_CTLSTAT = 0xa
|
||||||
|
ZB_ZCACHE_CTLSTAT = 0x3
|
||||||
|
WAIT_UNTIL [0x1720] = RADEON_WAIT_3D_IDLECLEAN
|
||||||
|
// r300_emit_aa_state
|
||||||
|
GB_AA_CONFIG = 0
|
||||||
|
RB3D_AARESOLVE_CTL = 0
|
||||||
|
// r300_emit_fb_state
|
||||||
|
RB3D_CCTL = 16384
|
||||||
|
RB3D_COLOROFFSET0 = 0
|
||||||
|
//OUT_CS_RELOC
|
||||||
|
OUT_CS(0xc0001000); /* PKT3_NOP */ \
|
||||||
|
OUT_CS(0);
|
||||||
|
|
||||||
|
RB3D_COLORPITCH0 = 0xc10640
|
||||||
|
//OUT_CS_RELOC
|
||||||
|
OUT_CS(0xc0001000); /* PKT3_NOP */ \
|
||||||
|
OUT_CS(0);
|
||||||
|
|
||||||
|
ZB_FORMAT = 2
|
||||||
|
ZB_DEPTHOFFSET = 0
|
||||||
|
//OUT_CS_RELOC
|
||||||
|
OUT_CS(0xc0001000); /* PKT3_NOP */ \
|
||||||
|
OUT_CS(4);
|
||||||
|
|
||||||
|
ZB_DEPTHPITCH = 0x30640
|
||||||
|
//OUT_CS_RELOC
|
||||||
|
OUT_CS(0xc0001000); /* PKT3_NOP */ \
|
||||||
|
OUT_CS(4);
|
||||||
|
// r300_emit_hyperz_state
|
||||||
|
ZB_BW_CNTL = 0
|
||||||
|
ZB_DEPTHCLEARVALUE = 0
|
||||||
|
SC_HYPERZ_EN = 0x1c
|
||||||
|
GB_Z_PEQ_CONFIG = 0
|
||||||
|
// r300_emit_ztop_state
|
||||||
|
ZB_ZTOP = 1
|
||||||
|
// r300_emit_dsa_state
|
||||||
|
FG_ALPHA_FUNC = 0
|
||||||
|
ZB_CNTL = 0
|
||||||
|
ZB_ZSTENCILCNTL = 0
|
||||||
|
ZB_STENCILREFMASK = 0
|
||||||
|
ZB_STENCILREFMASK_BF = 0
|
||||||
|
FG_ALPHA_VALUE = 0
|
||||||
|
// r300_emit_blend_state
|
||||||
|
RB3D_ROPCNTL = 0
|
||||||
|
RB3D_BLENDCNTL = 0
|
||||||
|
RB3D_ABLENDCNTL = 0
|
||||||
|
RB3D_COLOR_CHANNEL_MASK = 15
|
||||||
|
RB3D_DITHER_CTL = 0
|
||||||
|
// r300_emit_blend_color_state
|
||||||
|
RB3D_CONSTANT_COLOR_AR = 0
|
||||||
|
RB3D_CONSTANT_COLOR_GB = 0
|
||||||
|
// r300_emit_scissor_state
|
||||||
|
SC_CLIP_0_A = 0, 0
|
||||||
|
SC_CLIP_0_B = 0 - 1, 0 - 1
|
||||||
|
// r300_emit_sample_mask
|
||||||
|
SC_SCREENDOOR = 63 | (63 << 6) | (63 << 12) | (63 << 18)
|
||||||
|
// r300_emit_invariant_state
|
||||||
|
GB_SELECT = 0
|
||||||
|
FG_FOG_BLEND = 0
|
||||||
|
GA_OFFSET = 0
|
||||||
|
SU_TEX_WRAP = 0
|
||||||
|
SU_DEPTH_SCALE = 16777215.0f (0x4b7fffff)
|
||||||
|
SU_DEPTH_OFFSET = 0
|
||||||
|
SC_EDGERULE = 0x2da49525
|
||||||
|
RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD = 0x1010101
|
||||||
|
RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD = 0xfefefefe
|
||||||
|
GA_COLOR_CONTROL_PS3 = 0
|
||||||
|
SU_TEX_WRAP_PS3 = 0
|
||||||
|
// r300_emit_viewport_state
|
||||||
|
VAP_VPORT_XSCALE = 300
|
||||||
|
VAP_VPORT_XOFFSET = 300
|
||||||
|
VAP_VPORT_YSCALE = -300
|
||||||
|
VAP_VPORT_YOFFSET = 300
|
||||||
|
VAP_VPORT_ZSCALE = 0.5
|
||||||
|
VAP_VPORT_ZOFFSET = 0.5
|
||||||
|
VAP_VTE_CNTL = 0x43f
|
||||||
|
// r300_emit_pvs_flush
|
||||||
|
VAP_PVS_STATE_FLUSH_REG = 0
|
||||||
|
// r300_emit_vap_invariant_state
|
||||||
|
VAP_PVS_VTX_TIMEOUT_REG = 0xffff
|
||||||
|
VAP_GB_VERT_CLIP_ADJ = 1.0f (0x3f800000)
|
||||||
|
VAP_GB_VERT_DISC_ADJ = 1.0f (0x3f800000)
|
||||||
|
VAP_GB_HORZ_CLIP_ADJ = 1.0f (0x3f800000)
|
||||||
|
VAP_GB_HORZ_DISC_ADJ = 1.0f (0x3f800000)
|
||||||
|
VAP_PSC_SGN_NORM_CNTL = 0xaaaaaaaa
|
||||||
|
VAP_TEX_TO_COLOR_CNTL = 0
|
||||||
|
// r300_emit_vertex_stream_state
|
||||||
|
VAP_PROG_STREAM_CNTL_0 = 0x2002
|
||||||
|
VAP_PROG_STREAM_CNTL_EXT_0 = 0xfa88
|
||||||
|
// r300_emit_vs_state
|
||||||
|
VAP_PVS_CODE_CNTL_0 = 0
|
||||||
|
VAP_PVS_CODE_CNTL_1 = 0
|
||||||
|
VAP_PVS_VECTOR_INDX_REG = 0
|
||||||
|
VAP_PVS_VECTOR_DATA_REG_128 = (ONE_REG_WR:)
|
||||||
|
{0xf00203, 0xd10001, 0x1248001, 0x1248001}
|
||||||
|
VAP_CNTL = 0xb0055a
|
||||||
|
VAP_PVS_FLOW_CNTL_OPC = 0
|
||||||
|
VAP_PVS_FLOW_CNTL_ADDRS_LW_[0-15] = 0
|
||||||
|
VAP_PVS_FLOW_CNTL_ADDRS_UW_[0-15] = 0
|
||||||
|
VAP_PVS_FLOW_CNTL_LOOP_INDEX_[0-15] = 0
|
||||||
|
// r300_emit_clip_state
|
||||||
|
VAP_PVS_VECTOR_INDX_REG = 0x600
|
||||||
|
VAP_PVS_VECTOR_DATA_REG_128 =
|
||||||
|
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} (24)
|
||||||
|
// r300_emit_rs_block_state
|
||||||
|
VAP_VTX_STATE_CNTL = 0x5555
|
||||||
|
VAP_VSM_VTX_ASSM [0x2184] = 0x1
|
||||||
|
VAP_OUTPUT_VTX_FMT_0 = 1
|
||||||
|
VAP_OUTPUT_VTX_FMT_1 = 4
|
||||||
|
GB_ENABLE = 0
|
||||||
|
RS_IP_0 = 0x30000000
|
||||||
|
RS_COUNT = 0x40080
|
||||||
|
RS_INST_COUNT = 0
|
||||||
|
RS_INST_0 = 0
|
||||||
|
// r300_emit_rs_state
|
||||||
|
VAP_CNTL_STATUS = 0
|
||||||
|
VAP_CLIP_CNTL = 0xc000
|
||||||
|
GA_POINT_SIZE = 0x60006
|
||||||
|
GA_POINT_MINMAX = 0x60006
|
||||||
|
GA_LINE_CNTL = 0x20006
|
||||||
|
SU_POLY_OFFSET_ENABLE = 0
|
||||||
|
SU_CULL_MODE = 0
|
||||||
|
GA_LINE_STIPPLE_CONFIG = 0
|
||||||
|
GA_LINE_STIPPLE_VALUE = 0
|
||||||
|
GA_POLY_MODE = 0
|
||||||
|
GA_ROUND_MODE = 0x31
|
||||||
|
SC_CLIP_RULE = 0xffff
|
||||||
|
GA_POINT_S0 = 0
|
||||||
|
GA_POINT_T0 = 1.0f (0x3f800000)
|
||||||
|
GA_POINT_S1 = 1.0f (0x3f800000)
|
||||||
|
GA_POINT_T1 = 0
|
||||||
|
// r300_emit_fb_state_pipelined
|
||||||
|
US_OUT_FMT_0 = 0x1b00
|
||||||
|
US_OUT_FMT_1 = 0xf
|
||||||
|
US_OUT_FMT_2 = 0xf
|
||||||
|
US_OUT_FMT_3 = 0xf
|
||||||
|
GB_MSPOS0 = 0x66666666
|
||||||
|
GB_MSPOS1 = 0x6666666
|
||||||
|
// r500_emit_fs
|
||||||
|
US_CONFIG = 2
|
||||||
|
US_PIXSIZE = 1
|
||||||
|
US_FC_CTRL = 0
|
||||||
|
US_CODE_RANGE = 0
|
||||||
|
US_CODE_OFFSET = 0
|
||||||
|
US_CODE_ADDR = 0
|
||||||
|
GA_US_VECTOR_INDEX = 0
|
||||||
|
GA_US_VECTOR_DATA = (ONE_REG_WR:)
|
||||||
|
{0x78005, 0x8020080, 0x8020080, 0x1c9b04d8, 0x1c810003, 0x5}
|
||||||
|
FG_DEPTH_SRC = 0
|
||||||
|
US_W_FMT = 0
|
||||||
|
// r500_emit_fs_rc_constant_state
|
||||||
|
[nothing]
|
||||||
|
// r500_emit_fs_constants
|
||||||
|
[nothing]
|
||||||
|
// r300_emit_vs_constants
|
||||||
|
VAP_PVS_CONST_CNTL = 0
|
||||||
|
// r300_emit_texture_cache_inval
|
||||||
|
TX_INVALTAGS = 0
|
||||||
|
// r300_emit_textures_state
|
||||||
|
TX_ENABLE = 0
|
||||||
|
// r300_emit_query_start
|
||||||
|
[nothing]
|
||||||
|
// r500_emit_index_bias
|
||||||
|
VAP_INDEX_OFFSET = 0
|
||||||
|
// r300_emit_draw_init
|
||||||
|
GA_COLOR_CONTROL = 0x3aaaa
|
||||||
|
VAP_VF_MAX_VTX_INDX = 2
|
||||||
|
VAP_VF_MIN_VTX_INDX = 0
|
||||||
|
|
||||||
|
// r300_draw_arrays_immediate
|
||||||
|
VAP_VTX_SIZE = 3
|
||||||
|
|
||||||
|
[
|
||||||
|
PACKET3_3D_DRAW_IMMD_2 (3 * 3)
|
||||||
|
0x30034 // VAP_VF_CNTL
|
||||||
|
{0.5, -0.5, 0}
|
||||||
|
{-0.5, -0.5, 0}
|
||||||
|
{0, 0.5, 0}
|
||||||
|
]
|
||||||
26
replace_video.py
Normal file
26
replace_video.py
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
"""Replace video placeholder images in an HTML file with <video> elements.

Usage: python replace_video.py FILE.html

Any line containing an ``<img alt='PIC' src='videos/NAME.png' ... />`` tag has
that tag replaced by a ``<video>`` element whose source is
``videos/NAME.mp4``. The file is rewritten in place.
"""
import sys

# Substring that marks a line as containing a video placeholder image.
VIDEO_IMG_MARKER = "<img alt='PIC' src='videos/"


def transform(path=None):
    """Yield the fragments of the rewritten file, in output order.

    path -- HTML file to read; defaults to ``sys.argv[1]`` so that the
            original zero-argument call keeps working.

    Lines without the marker are yielded unchanged (newline included).
    For a marked line the first ``<img ... />`` tag on the line is assumed
    to be the video placeholder; an AssertionError is raised if its ``src``
    does not end in ``.png``.
    """
    if path is None:
        path = sys.argv[1]

    with open(path) as f:
        for line in f:
            if VIDEO_IMG_MARKER not in line:
                yield line
                continue

            begin, end = line.split("<img", maxsplit=1)
            img, rest = end.split("/>", maxsplit=1)

            src = img.split("src='")[1].split("'")[0]
            assert src.endswith(".png"), src
            src = src.removesuffix(".png") + ".mp4"

            yield begin
            yield "<video style='width: 100%;' controls=''>"
            yield f"<source src='{src}' type='video/mp4'>"
            yield "</video>"
            # The text that followed the <img> tag goes last, so the video
            # sits exactly where the image was instead of after that text.
            yield rest


def main():
    """Rewrite the file named by ``sys.argv[1]`` in place."""
    # Consume the generator fully before reopening the same file for
    # writing; opening with 'w' truncates it.
    lines = list(transform())
    with open(sys.argv[1], 'w') as f:
        f.write(''.join(lines))


if __name__ == "__main__":
    main()
|
||||||
22
resize_svg.py
Normal file
22
resize_svg.py
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
"""Scale the declared width and height of an SVG file, in place.

Usage: python resize_svg.py FILE.svg

The line that opens the root ``<svg xmlns...>`` element is regenerated with
its ``width`` and ``height`` multiplied by ``scale``; the ``viewBox`` is kept
as-is. All other lines are copied through unchanged.
"""
import sys

# Factor applied to the width and height attributes.
scale = 1.5


def transform(path=None):
    """Yield the lines of the rewritten SVG, each with its line terminator.

    path -- SVG file to read; defaults to ``sys.argv[1]`` so that the
            original zero-argument call keeps working.

    The ``<svg`` line must carry double-quoted ``width``, ``height`` and
    ``viewBox`` attributes, and width/height must parse with ``float()``
    (IndexError / ValueError otherwise). Any other attributes on that line
    are not carried over to the regenerated tag.
    """
    if path is None:
        path = sys.argv[1]

    with open(path) as f:
        for line in f:
            if line.strip().startswith("<svg xmlns"):
                width = line.split('width="')[1].split('"')[0]
                height = line.split('height="')[1].split('"')[0]
                viewbox = line.split('viewBox="')[1].split('"')[0]
                width = float(width) * scale
                height = float(height) * scale

                template = f'<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="{width}" height="{height}" viewBox="{viewbox}">'
                # Terminate the regenerated line ourselves; every other line
                # already carries the newline it was read with.
                yield template + "\n"
            else:
                yield line


def main():
    """Rewrite the file named by ``sys.argv[1]`` in place."""
    # Consume the generator fully before reopening the same file for
    # writing; opening with 'w' truncates it.
    lines = list(transform())
    with open(sys.argv[1], 'w') as f:
        # Lines keep their own terminators, so join with '' -- joining with
        # '\n' would insert a blank line after every line of the file.
        f.write(''.join(lines))


if __name__ == "__main__":
    main()
|
||||||
72
verbatim.sh
Normal file
72
verbatim.sh
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
# Render every source listing in verbatim/ to a standalone PDF in
# verbatim/output/ using pdflatex + minted.
#
# For each input FILE a wrapper FILE.tex is written next to it, containing
# the file's contents inside a minted environment, and pdflatex is run on
# that wrapper.
#
# The script has no shebang, so it sticks to POSIX sh constructs (no
# `local`, no `shopt`) to behave the same under sh and bash.
set -eux

cd verbatim/

mkdir -p output

# render_listing LEXER FILE
#   Write FILE.tex wrapping FILE in \begin{minted}{LEXER}...\end{minted},
#   then run pdflatex on it with output going to output/.
render_listing() {
  listing_lexer=$1
  listing_src=$2
  listing_tex="${listing_src}.tex"

  {
    # Unquoted delimiter: ${listing_lexer} must expand. The LaTeX
    # backslashes are left alone because none precedes $, `, \ or newline.
    # NOTE(review): the \setminted options are scoped to the python lexer,
    # but no listing below is rendered with that lexer -- confirm whether
    # they are meant to apply before changing this line.
    cat <<EOF
\documentclass[varwidth=13.1cm, border={0.0cm 0.0cm 0.0cm 0.0cm}]{standalone}
\usepackage{minted}
\setminted[python]{breaklines, linenos, frame=lines, framesep=2mm, fontsize=\huge, numbersep=5pt}
\standaloneenv{minted}
\begin{document}

\begin{minted}{${listing_lexer}}
EOF
    cat -- "$listing_src"
    cat <<'EOF'
\end{minted}

\end{document}
EOF
  } > "$listing_tex"

  # pdflatex is run twice, as in the original script.
  pdflatex -shell-escape -output-directory=output "$listing_tex"
  pdflatex -shell-escape -output-directory=output "$listing_tex"
}

# .asm listings are highlighted with the haskell lexer.
for f in *.asm; do
  # An unmatched glob stays literal; skip it instead of creating "*.asm.tex".
  [ -e "$f" ] || continue
  render_listing haskell "$f"
done

for f in *.glsl; do
  [ -e "$f" ] || continue
  render_listing glsl "$f"
done

for f in *.c; do
  [ -e "$f" ] || continue
  render_listing c "$f"
done
|
||||||
Loading…
x
Reference in New Issue
Block a user