From d2e81516a1f4b8192438dc14586d4789dd242c4a Mon Sep 17 00:00:00 2001
From: Zack Buhman <zack@buhman.org>
Date: Sat, 18 Oct 2025 21:54:41 -0500
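Subject: [PATCH] drm/texture_cube_clear_zwrite_vertex_shader: fixed

Clear pass: stop using PVS_BYPASS and instead run a pass-through vertex
shader (clear_nop.vs.asm), which is appended after the cube shader in the
vertex_shader array. The VAP_PVS_CODE_CNTL registers are now programmed
per pass (in _3d_clear and _3d_cube) rather than once in indirect_buffer.

Cube pass: use ZFUNC(5) (greater than) instead of ALWAYS.

cube_rotate.vs.asm: remap both angles with one xy-wide MAD/FRC/MAD
sequence, keep sin/cos of the second angle in temp[4], and make the scale
step read the rotated position in temp[2] (it previously read temp[1]).

Assembler: add a validator that promotes VE_MULTIPLY_ADD and
VE_MULTIPLYX2_ADD instructions with three distinct source offsets to the
MACRO_OP_2CLK_* opcodes, emit the MACRO_INST bit, and teach the
disassembler to decode macro opcodes.
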
---
 drm/clear_nop.vs.asm                          |  1 +
 drm/cube_rotate.vs.asm                        | 23 ++++-----
 drm/cube_rotate.vs.inc                        | 23 ++++-----
 drm/texture_cube_clear_zwrite_vertex_shader.c | 50 +++++++++++++------
 regs/assembler/__main__.py                    |  2 +
 regs/assembler/emitter.py                     |  7 ++-
 regs/assembler/keywords.py                    | 11 ++++
 regs/assembler/parser.py                      |  4 +-
 regs/assembler/validator.py                   | 25 ++++++++++
 ...vs_opcode_and_destination_operand_bits.txt |  3 ++
 regs/pvs_disassemble.py                       |  4 +-
 11 files changed, 106 insertions(+), 47 deletions(-)
 create mode 100644 drm/clear_nop.vs.asm
 create mode 100644 regs/assembler/validator.py

diff --git a/drm/clear_nop.vs.asm b/drm/clear_nop.vs.asm
new file mode 100644
index 0000000..3140b2a
--- /dev/null
+++ b/drm/clear_nop.vs.asm
@@ -0,0 +1 @@
+out[0].xyzw = VE_ADD input[0].xyzw input[0].0000 input[0].0000
diff --git a/drm/cube_rotate.vs.asm b/drm/cube_rotate.vs.asm
index 50629c8..8fed9ab 100644
--- a/drm/cube_rotate.vs.asm
+++ b/drm/cube_rotate.vs.asm
@@ -12,20 +12,15 @@ temp[0].xy   = VE_ADD  const[1].xy__ const[1].00__
 ; calculation.
 ;
 ; This 3-instruction sequence linearly remaps the range [-∞,+∞] to [-π,+π]
-temp[0].x    = VE_MAD   temp[0].x___   const[0].x___  const[0].y___
-temp[0].x    = VE_FRC   temp[0].x___
-temp[0].x    = VE_MAD   temp[0].x___   const[0].z___  const[0].w___
-
-; the same thing, but for temp[0].y
-temp[0].y    = VE_MAD   temp[0].y___   const[0].x___  const[0].y___
-temp[0].y    = VE_FRC   temp[0].y___
-temp[0].y    = VE_MAD   temp[0].y___   const[0].z___  const[0].w___
+temp[0].xy   = VE_MAD   temp[0].xy__   const[0].xx__  const[0].yy__
+temp[0].xy   = VE_FRC   temp[0].xy__
+temp[0].xy   = VE_MAD   temp[0].xy__   const[0].zz__  const[0].ww__
 
 ; sin and cos
 temp[3].x    = ME_SIN   temp[0].___x
 temp[3].y    = ME_COS   temp[0].___x
-temp[3].z    = ME_SIN   temp[0].___y
-temp[3].w    = ME_COS   temp[0].___y
+temp[4].x    = ME_SIN   temp[0].___y
+temp[4].y    = ME_COS   temp[0].___y
 
 ; temp[3] now contains:
 ; temp[3] = {sin(theta1), cos(theta1), sin(theta2), cos(theta2)}
@@ -49,22 +44,22 @@ temp[1].xyz   = VE_MAD   input[0].xyy_   temp[3].1yx_   temp[1].0yz_
 
 ; x_ = (-z1 * st2)
 ; z_ = ( z1 * ct2)
-temp[2].xz    = VE_MUL   temp[1].-z_z_   temp[3].z_w_
+temp[2].xz    = VE_MUL   temp[1].-z_z_   temp[4].x_y_
 
 ; x2 = (x1 * ct2 + nz1st2)
 ; y2 = (y1 *   1 +      0)
 ; z2 = (x1 * st2 +  z1ct2)
-temp[2].xyz   = VE_MAD   temp[1].xyx_    temp[3].w1z_   temp[2].x0z_
+temp[2].xyz   = VE_MAD   temp[1].xyx_    temp[4].y1x_   temp[2].x0z_
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; scale
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-temp[2].xyz   = VE_MAD   temp[1].xyz_    const[1].zzz_  const[1].00w_
+temp[3].xyz   = VE_MAD   temp[2].xyz_    const[1].zzz_  const[1].00w_
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; output
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-out[0].xyzw   = VE_MUL   temp[2].xyzz    temp[2].11-z1
+out[0].xyzw   = VE_MUL   temp[3].xyzz    temp[3].11-z1
 out[1].xyzw   = VE_ADD   input[1].xyzw   input[1].0000
diff --git a/drm/cube_rotate.vs.inc b/drm/cube_rotate.vs.inc
index a540730..7d20d6f 100644
--- a/drm/cube_rotate.vs.inc
+++ b/drm/cube_rotate.vs.inc
@@ -1,18 +1,15 @@
 0x00300003, 0x01f90022, 0x01fc8022, 0x01ffe022,
-0x00100004, 0x01ff0000, 0x01ff0002, 0x01ff2002,
-0x00100006, 0x01ff0000, 0x01ffe000, 0x01ffe000,
-0x00100004, 0x01ff0000, 0x01ff4002, 0x01ff6002,
-0x00200004, 0x01ff2000, 0x01ff0002, 0x01ff2002,
-0x00200006, 0x01ff2000, 0x01ffe000, 0x01ffe000,
-0x00200004, 0x01ff2000, 0x01ff4002, 0x01ff6002,
+0x00300004, 0x01f90000, 0x01f90002, 0x01f92002,
+0x00300006, 0x01f90000, 0x01ffe000, 0x01ffe000,
+0x00300004, 0x01f90000, 0x01fa4002, 0x01fb6002,
 0x00106050, 0x003fe000, 0x01ffe000, 0x01ffe000,
 0x00206051, 0x003fe000, 0x01ffe000, 0x01ffe000,
-0x00406050, 0x007fe000, 0x01ffe000, 0x01ffe000,
-0x00806051, 0x007fe000, 0x01ffe000, 0x01ffe000,
+0x00108050, 0x007fe000, 0x01ffe000, 0x01ffe000,
+0x00208051, 0x007fe000, 0x01ffe000, 0x01ffe000,
 0x00602002, 0x05d2e001, 0x01c8e060, 0x01ffe060,
-0x00702004, 0x01c90001, 0x01c1a060, 0x01d18020,
-0x00504002, 0x03d74020, 0x01df4060, 0x01ffe060,
-0x00704004, 0x01c10020, 0x01d56060, 0x01d40040,
-0x00704004, 0x01d10020, 0x01d24022, 0x09dc8022,
-0x00f00202, 0x01510040, 0x0955a040, 0x01ffe040,
+0x00702080, 0x01c90001, 0x01c1a060, 0x01d18020,
+0x00504002, 0x03d74020, 0x01cf0080, 0x01ffe080,
+0x00704080, 0x01c10020, 0x01c52080, 0x01d40040,
+0x00706004, 0x01d10040, 0x01d24022, 0x01dc8022,
+0x00f00202, 0x00910060, 0x0955a060, 0x01ffe060,
 0x00f02203, 0x00d10021, 0x01248021, 0x01ffe021,
diff --git a/drm/texture_cube_clear_zwrite_vertex_shader.c b/drm/texture_cube_clear_zwrite_vertex_shader.c
index f065ce6..896abda 100644
--- a/drm/texture_cube_clear_zwrite_vertex_shader.c
+++ b/drm/texture_cube_clear_zwrite_vertex_shader.c
@@ -159,9 +159,10 @@ static const int fragment_shader_instructions = (fragment_shader_length / 6) - 1
 static const uint32_t vertex_shader[] = {
   //#include "../shader_examples/mesa/texture_cube.vs.txt"
   #include "cube_rotate.vs.inc"
+  #include "clear_nop.vs.inc"
 };
 static const int vertex_shader_length = (sizeof (vertex_shader)) / (sizeof (vertex_shader[0]));
-static const int vertex_shader_instructions = vertex_shader_length / 4;
+static const int vertex_shader_instructions = (vertex_shader_length / 4) - 1;
 
 union u32_f32 {
   uint32_t u32;
@@ -228,6 +229,19 @@ int _3d_clear(int ix)
 
   T0V(TX_ENABLE, 0x00000000);
 
+  //////////////////////////////////////////////////////////////////////////////
+  // VAP_PVS
+  //////////////////////////////////////////////////////////////////////////////
+
+  T0V(VAP_PVS_CODE_CNTL_0
+      , VAP_PVS_CODE_CNTL_0__PVS_FIRST_INST(vertex_shader_instructions)
+      | VAP_PVS_CODE_CNTL_0__PVS_XYZW_VALID_INST(vertex_shader_instructions)
+      | VAP_PVS_CODE_CNTL_0__PVS_LAST_INST(vertex_shader_instructions)
+      );
+  T0V(VAP_PVS_CODE_CNTL_1
+      , VAP_PVS_CODE_CNTL_1__PVS_LAST_VTX_SRC_INST(vertex_shader_instructions)
+      );
+
   //////////////////////////////////////////////////////////////////////////////
   // VAP
   //////////////////////////////////////////////////////////////////////////////
@@ -241,7 +255,7 @@ int _3d_clear(int ix)
       | VAP_VTE_CNTL__VTX_Z_FMT(1)
       );
 
-  T0V(VAP_CNTL_STATUS, VAP_CNTL_STATUS__PVS_BYPASS(1));
+  T0V(VAP_CNTL_STATUS, VAP_CNTL_STATUS__PVS_BYPASS(0));
 
   T0V(VAP_PROG_STREAM_CNTL_0
       , VAP_PROG_STREAM_CNTL__DATA_TYPE_0__FLOAT_2
@@ -252,7 +266,7 @@ int _3d_clear(int ix)
   T0V(VAP_PROG_STREAM_CNTL_EXT_0
       , VAP_PROG_STREAM_CNTL_EXT__SWIZZLE_SELECT_X_0__SELECT_X
       | VAP_PROG_STREAM_CNTL_EXT__SWIZZLE_SELECT_Y_0__SELECT_Y
-      | VAP_PROG_STREAM_CNTL_EXT__SWIZZLE_SELECT_Z_0__SELECT_FP_ONE
+      | VAP_PROG_STREAM_CNTL_EXT__SWIZZLE_SELECT_Z_0__SELECT_FP_ZERO
       | VAP_PROG_STREAM_CNTL_EXT__SWIZZLE_SELECT_W_0__SELECT_FP_ONE
       | VAP_PROG_STREAM_CNTL_EXT__WRITE_ENA_0(0b1111)
       );
@@ -321,8 +335,7 @@ int _3d_cube(int ix, float theta)
       | ZB_CNTL__ZWRITEENABLE__ENABLE // 1
       );
   T0V(ZB_ZSTENCILCNTL
-      //, ZB_ZSTENCILCNTL__ZFUNC(5) // greater than
-      , ZB_ZSTENCILCNTL__ZFUNC__ALWAYS
+      , ZB_ZSTENCILCNTL__ZFUNC(5) // greater than
       );
 
   T0V(ZB_FORMAT
@@ -408,7 +421,7 @@ int _3d_cube(int ix, float theta)
   ib[ix++].u32 = 2 * 4; // index into relocs array
 
   //////////////////////////////////////////////////////////////////////////////
-  // VAP
+  // VAP_PVS
   //////////////////////////////////////////////////////////////////////////////
 
   T0V(VAP_PVS_CONST_CNTL
@@ -421,7 +434,8 @@ int _3d_cube(int ix, float theta)
       );
 
   float theta1 = theta;
-  float theta2 = 3.14 * theta;
+  //float theta2 = 3.14f * theta;
+  float theta2 = theta;
   float consts[] = {
     I_PI_2, 0.5f, PI_2, -PI,
     theta1, theta2, 0.2f, 0.5f,
@@ -431,6 +445,19 @@ int _3d_cube(int ix, float theta)
   for (int i = 0; i < consts_length; i++)
     ib[ix++].f32 = consts[i];
 
+  T0V(VAP_PVS_CODE_CNTL_0
+      , VAP_PVS_CODE_CNTL_0__PVS_FIRST_INST(0)
+      | VAP_PVS_CODE_CNTL_0__PVS_XYZW_VALID_INST((vertex_shader_instructions - 1))
+      | VAP_PVS_CODE_CNTL_0__PVS_LAST_INST((vertex_shader_instructions - 1))
+      );
+  T0V(VAP_PVS_CODE_CNTL_1
+      , VAP_PVS_CODE_CNTL_1__PVS_LAST_VTX_SRC_INST((vertex_shader_instructions - 1))
+      );
+
+  //////////////////////////////////////////////////////////////////////////////
+  // VAP
+  //////////////////////////////////////////////////////////////////////////////
+
   T0V(VAP_CLIP_CNTL
       , VAP_CLIP_CNTL__PS_UCP_MODE(3)
       );
@@ -825,15 +852,6 @@ int indirect_buffer(float theta)
   assert(vertex_shader_length % 4 == 0);
   printf("vs instructions %d\n", vertex_shader_instructions);
 
-  T0V(VAP_PVS_CODE_CNTL_0
-      , VAP_PVS_CODE_CNTL_0__PVS_FIRST_INST(0)
-      | VAP_PVS_CODE_CNTL_0__PVS_XYZW_VALID_INST((vertex_shader_instructions - 1))
-      | VAP_PVS_CODE_CNTL_0__PVS_LAST_INST((vertex_shader_instructions - 1))
-      );
-  T0V(VAP_PVS_CODE_CNTL_1
-      , VAP_PVS_CODE_CNTL_1__PVS_LAST_VTX_SRC_INST((vertex_shader_instructions - 1))
-      );
-
   T0V(VAP_PVS_VECTOR_INDX_REG
       , VAP_PVS_VECTOR_INDX_REG__OCTWORD_OFFSET(0)
       );
diff --git a/regs/assembler/__main__.py b/regs/assembler/__main__.py
index 4ed3963..9dd3337 100644
--- a/regs/assembler/__main__.py
+++ b/regs/assembler/__main__.py
@@ -3,6 +3,7 @@ import sys
 from assembler.lexer import Lexer, LexerError
 from assembler.parser import Parser, ParserError
 from assembler.emitter import emit_instruction
+from assembler.validator import validate_instruction
 
 sample = b"""
 temp[0].xyzw = VE_ADD    const[1].xyzw     const[1].0000     const[1].0000
@@ -22,6 +23,7 @@ def frontend_inner(buf):
     tokens = list(lexer.lex_tokens())
     parser = Parser(tokens)
     for ins, start_end in parser.instructions():
+        ins = validate_instruction(ins)
         yield list(emit_instruction(ins)), start_end
 
 def print_error(filename, buf, e):
diff --git a/regs/assembler/emitter.py b/regs/assembler/emitter.py
index 0e57c49..53bbb62 100644
--- a/regs/assembler/emitter.py
+++ b/regs/assembler/emitter.py
@@ -1,4 +1,4 @@
-from assembler.keywords import ME, VE, KW
+from assembler.keywords import ME, VE, MVE, KW
 from assembler.parser import Instruction, DestinationOp, Source
 import pvs_dst
 import pvs_src
@@ -34,8 +34,10 @@ def dst_reg_type(kw):
         assert not "Invalid PVS_DST_REG", kw
 
 def emit_destination_op(dst_op: DestinationOp):
-    assert type(dst_op.opcode) in {ME, VE}
+    assert type(dst_op.opcode) in {ME, VE, MVE}
     math_inst = int(type(dst_op.opcode) is ME)
+    if dst_op.macro:
+        assert dst_op.opcode.value in {0, 1}
     value = (
           pvs_dst.OPCODE_gen(dst_op.opcode.value)
         | pvs_dst.MATH_INST_gen(math_inst)
@@ -45,6 +47,7 @@ def emit_destination_op(dst_op: DestinationOp):
         | pvs_dst.WE_Y_gen(we_y(dst_op.write_enable))
         | pvs_dst.WE_Z_gen(we_z(dst_op.write_enable))
         | pvs_dst.WE_W_gen(we_w(dst_op.write_enable))
+        | pvs_dst.MACRO_INST_gen(int(dst_op.macro))
     )
     yield value
 
diff --git a/regs/assembler/keywords.py b/regs/assembler/keywords.py
index 273357c..f0e6a05 100644
--- a/regs/assembler/keywords.py
+++ b/regs/assembler/keywords.py
@@ -2,6 +2,12 @@ from dataclasses import dataclass
 from typing import Optional
 from enum import Enum, auto
 
+@dataclass
+class MVE:
+    name: str
+    synonym: Optional[str]
+    value: int
+
 @dataclass
 class VE:
     name: str
@@ -14,6 +20,11 @@ class ME:
     synonym: Optional[str]
     value: int
 
+macro_vector_operations = [
+    MVE(b"MACRO_OP_2CLK_MADD"       , None      , 0),
+    MVE(b"MACRO_OP_2CLK_M2X_ADD"    , None      , 1),
+]
+
 vector_operations = [
        # name                       synonym    value
     VE(b"VECTOR_NO_OP"              , b"VE_NOP" , 0),
diff --git a/regs/assembler/parser.py b/regs/assembler/parser.py
index c08a90c..40a6c13 100644
--- a/regs/assembler/parser.py
+++ b/regs/assembler/parser.py
@@ -25,6 +25,7 @@ class DestinationOp:
     offset: int
     write_enable: set[int]
     opcode: Union[VE, ME]
+    macro: bool
 
 @dataclass
 class SourceSwizzle:
@@ -172,7 +173,8 @@ class Parser:
         write_enable = parse_dest_write_enable(write_enable_token)
         self.consume(TT.equal, "expected equals")
         opcode = self.opcode()
-        return DestinationOp(destination_type, offset_value, write_enable, opcode)
+        macro = False
+        return DestinationOp(destination_type, offset_value, write_enable, opcode, macro)
 
     def source_type(self):
         token = self.consume(TT.keyword, "expected source type")
diff --git a/regs/assembler/validator.py b/regs/assembler/validator.py
new file mode 100644
index 0000000..971ba2c
--- /dev/null
+++ b/regs/assembler/validator.py
@@ -0,0 +1,45 @@
+from assembler.keywords import ME, VE, macro_vector_operations
+
+class ValidatorError(Exception):
+    """Raised by validate_instruction when too many source addresses are used."""
+
+# Macro opcode that replaces each VE multiply-add opcode, keyed by VE name.
+_MACRO_OPCODE_BY_VE_NAME = {
+    b"VE_MULTIPLY_ADD": macro_vector_operations[0],    # MACRO_OP_2CLK_MADD
+    b"VE_MULTIPLYX2_ADD": macro_vector_operations[1],  # MACRO_OP_2CLK_M2X_ADD
+}
+
+def validate_instruction(ins):
+    """Validate the source operands of `ins`, promoting it to a macro op if needed.
+
+    Counts the distinct `offset` values among the non-None sources. With at
+    most two, `ins` is returned untouched. With three, only VE_MULTIPLY_ADD
+    and VE_MULTIPLYX2_ADD are accepted: `ins.destination_op` is mutated in
+    place so that `macro` is True and `opcode` is the matching entry of
+    `macro_vector_operations`.
+
+    Returns `ins` (the same object). Raises ValidatorError when three
+    distinct offsets are used by any other opcode.
+    """
+    # NOTE(review): sources are compared by offset only, so operands with
+    # the same offset in different register files count as one address
+    # -- confirm this matches the hardware restriction.
+    offsets = {
+        source.offset
+        for source in (ins.source0, ins.source1, ins.source2)
+        if source is not None
+    }
+    if len(offsets) <= 2:
+        return ins
+
+    destination_op = ins.destination_op
+    if type(destination_op.opcode) is not VE:
+        raise ValidatorError("too many addresses for non-VE instruction", ins)
+    macro_opcode = _MACRO_OPCODE_BY_VE_NAME.get(destination_op.opcode.name)
+    if macro_opcode is None:
+        raise ValidatorError("too many addresses for VE non-multiply-add instruction", ins)
+    # An instruction must not be promoted twice.
+    assert not destination_op.macro, ins
+    destination_op.macro = True
+    destination_op.opcode = macro_opcode
+    return ins
diff --git a/regs/pvs_bits/pvs_opcode_and_destination_operand_bits.txt b/regs/pvs_bits/pvs_opcode_and_destination_operand_bits.txt
index 57eae2a..8e3ba9e 100644
--- a/regs/pvs_bits/pvs_opcode_and_destination_operand_bits.txt
+++ b/regs/pvs_bits/pvs_opcode_and_destination_operand_bits.txt
@@ -1,3 +1,6 @@
+MACRO_OPCODE:
+MACRO_OP_2CLK_MADD    = 0
+MACRO_OP_2CLK_M2X_ADD = 1
 VECTOR_OPCODE:
 VECTOR_NO_OP              = 0
 VE_DOT_PRODUCT            = 1
diff --git a/regs/pvs_disassemble.py b/regs/pvs_disassemble.py
index 9d18c87..c90dea6 100644
--- a/regs/pvs_disassemble.py
+++ b/regs/pvs_disassemble.py
@@ -88,7 +88,6 @@ def parse_dst_op(dst_op):
     addr_sel = pvs_dst.ADDR_SEL(dst_op)
 
     assert addr_mode == 0
-    assert macro_inst == 0
     assert pred_enable == 0
     assert pred_sense == 0
     assert dual_math_op == 0
@@ -102,7 +101,10 @@ def parse_dst_op(dst_op):
     parts.append(f"{reg_str}[{offset}].{we_swizzle}")
 
     if math_inst:
+        assert not macro_inst
         parts.append(op_substitutions(pvs_dst_bits.MATH_OPCODE[opcode]))
+    elif macro_inst:
+        parts.append(op_substitutions(pvs_dst_bits.MACRO_OPCODE[opcode]))
     else:
         parts.append(op_substitutions(pvs_dst_bits.VECTOR_OPCODE[opcode]))