From 2b90395b2de816a3e0ce730808f04452a6697add Mon Sep 17 00:00:00 2001
From: Zack Buhman <zack@buhman.org>
Date: Fri, 31 Oct 2025 20:15:11 -0500
Subject: [PATCH] matrix_cubesphere_specular: rewrite specular fragment shader

---
 drm/matrix_cubesphere_specular.cpp    |   4 +-
 drm/matrix_cubesphere_specular.fs.asm | 149 +++++++++++++-------------
 drm/matrix_cubesphere_specular.vs.asm |  12 ++-
 regs/assembler/vs/validator.py        |   2 +-
 4 files changed, 86 insertions(+), 81 deletions(-)

diff --git a/drm/matrix_cubesphere_specular.cpp b/drm/matrix_cubesphere_specular.cpp
index ab7bb7f..edaa360 100644
--- a/drm/matrix_cubesphere_specular.cpp
+++ b/drm/matrix_cubesphere_specular.cpp
@@ -365,8 +365,8 @@ int _3d_cube_inner(int ix, mat4x4 trans, mat4x4 world_trans, vec4 light_pos, vec
       vec2 t = model->texture[obj->triangle[i][j].texture];
       vec3 n = model->normal[obj->triangle[i][j].normal];
 
-      fprintf(stderr, "% 2.03f, % 2.03f, % 2.03f, % 2.03f, % 2.03f, % 2.03f,\n",
-              p.x, p.y, p.z, n.x, n.y, n.z);
+      //fprintf(stderr, "% 2.03f, % 2.03f, % 2.03f, % 2.03f, % 2.03f, % 2.03f,\n",
+      //p.x, p.y, p.z, n.x, n.y, n.z);
 
       ib[ix++].f32 = p.x;
       ib[ix++].f32 = p.y;
diff --git a/drm/matrix_cubesphere_specular.fs.asm b/drm/matrix_cubesphere_specular.fs.asm
index ebebd59..609cefd 100644
--- a/drm/matrix_cubesphere_specular.fs.asm
+++ b/drm/matrix_cubesphere_specular.fs.asm
@@ -9,94 +9,97 @@
 TEX TEX_SEM_WAIT TEX_SEM_ACQUIRE
   temp[4].rgba = LD tex[0].rgba temp[4].rgaa ;
 
+--
 -- normal = normalize(normal)
--- normal = (1.0 / sqrt(dot(normal, normal))) * normal
-src0.rgb = temp[1] :
+--
+src0.rgb = temp[1] : -- normal
               DP3 src0.rgb src0.rgb ,
   temp[1].a = DP ;
 src0.a = temp[1] :
-  temp[1].a   = RSQ |src0.a| ;
-src0.a = temp[1], src0.rgb = temp[1] :
+  temp[1].a = RSQ |src0.a| ;
+src0.a = temp[1] ,
+src0.rgb = temp[1] : -- normal
   temp[1].rgb = MAD src0.rgb src0.aaa src0.000 ;
 
--- light_dir = normalize((f_light_pos - f_world_pos))
-src1.rgb = temp[2] , -- f_light_pos
-src0.rgb = temp[0] , -- f_world_pos
-srcp.rgb = neg :     -- (f_light_pos - f_world_pos)
-              DP3 srcp.rgb srcp.rgb ,
+--
+-- light_dir = light_pos - world_pos
+--
+src0.rgb = temp[2] , -- light pos
+src1.rgb = temp[0] : -- world pos
+  temp[2].rgb = MAD src0.111 src0.rgb -src1.rgb ;
+
+--
+-- light_dir = normalize(light_dir)
+--
+src0.rgb = temp[2] : -- light_dir
+              DP3 src0.rgb src0.rgb ,
   temp[2].a = DP ;
 src0.a = temp[2] :
-  temp[2].a   = RSQ |src0.a| ;
-src0.a = temp[2], src0.rgb = temp[2] :
+  temp[2].a = RSQ |src0.a| ;
+src0.a = temp[2] ,
+src0.rgb = temp[2] : -- light_dir
   temp[2].rgb = MAD src0.rgb src0.aaa src0.000 ;
 
--- diff = dot(normal, light_dir)
-src0.rgb = temp[2] ,
-src1.rgb = temp[1] :
-                DP3 src0.rgb src1.rgb ,
-   temp[5].a  = DP ;
+--
+-- view_dir = view_pos - world_pos
+--
+src0.rgb = temp[3] , -- view pos
+src1.rgb = temp[0] : -- world pos
+  temp[3].rgb = MAD src0.111 src0.rgb -src1.rgb ;
 
--- diff = max(diff, 0)
+--
+-- view_dir = normalize(view_dir)
+--
+src0.rgb = temp[3] : -- view dir
+              DP3 src0.rgb src0.rgb ,
+  temp[3].a = DP ;
+src0.a = temp[3] :
+  temp[3].a = RSQ |src0.a| ;
+src0.a = temp[3] ,
+src0.rgb = temp[3] : -- view dir
+  temp[3].rgb = MAD src0.rgb src0.aaa src0.000 ;
+
+--
+-- reflect_dir = reflect(-light_dir, normal)
+--
+-- dotLN = dot(-light_dir, normal)
+src0.rgb = temp[2] , -- light dir
+src1.rgb = temp[1] : -- normal
+              DP3 -src0.rgb src1.rgb ,
+  temp[5].a = DP ;
+-- dotLN = 2.0 * dotLN
+src0.a = temp[5] ,   -- dotLN
+src1.a = float(64) : -- 2.0
+  temp[5].a = MAD src0.a src1.a src0.0 ;
+-- reflect_dir = -dotLN * normal + -light_dir
+src0.a = temp[5] , -- dotLN
+src1.rgb = temp[1] , -- normal
+src2.rgb = temp[2] : -- light dir
+  temp[5].rgb = MAD -src0.aaa src1.rgb -src2.rgb ;
+
+--
+-- spec = max(dot(view_dir, reflect_dir), 0.0)
+--
+src0.rgb = temp[3] ,
+src1.rgb = temp[5] :
+              DP3 src0.rgb src1.rgb ,
+  temp[5].a = DP ;
 src0.a = temp[5] :
   temp[5].a = MAX src0.a src0.0 ;
 
--- intensity = diff + 0.125
-src0.a = temp[5] ,
-src1.a = float(32) : -- 0.125
-  temp[5].a = MAD src0.a src0.1 src1.a ;
-
 --
--- specular
---
--- temp[3] -- view pos (world space)
--- view_dir = normalize(f_view_pos - f_world_pos)
-src1.rgb = temp[3] , -- f_light_pos
-src0.rgb = temp[0] , -- f_world_pos
-srcp.rgb = neg :     -- (f_light_pos - f_world_pos)
-              DP3 srcp.rgb srcp.rgb ,
-  temp[3].a = DP ;
-src0.a = temp[3] :
-  temp[3].a   = RSQ |src0.a| ;
-src0.a = temp[3], src0.rgb = temp[3] :
-  temp[3].rgb = MAD src0.rgb src0.aaa src0.000 ;
-
--- reflect(I, N)
--- I - 2.0 * dot(N, I) * N
--- reflect_dir = reflect(-light_dir, norm)
--- reflect_dir = reflect(-temp[2], temp[1])
--- I - 2.0 * dot(N, I) * N
--- - (2.0 * dot(N, I)) * temp[1] + -temp[2]
-src0.rgb = temp[1] , -- N=normal
-src1.rgb = temp[2] : -- I=light_dir   dot(N, -I)
-  temp[6].r = DP3 src0.rgb -src1.rgb ;
-src0.rgb = temp[6] ,
-src1.rgb = float(64) : -- 2.0
-  temp[6].r = MAD src0.r00 src1.r00 src0.000 ;
-src0.rgb = temp[6] ,
-src1.rgb = temp[1] , -- N
-src2.rgb = temp[2] : -- I
-  temp[6].rgb = MAD -src0.rrr src1.rgb -src2.rgb ;
-
--- spec = max(dot(view_dir, reflect_dir), 0.0)
-src0.rgb = temp[6] , -- reflect_dir
-src1.rgb = temp[3] : -- view_dir
-  temp[6].r = DP3 src0.rgb src1.rgb ;
-src0.rgb = temp[6] :
-  temp[6].a = MAX src0.r src0.0 ;
-
 -- spec = pow(spec, 32)
-src0.a = temp[6] :
-  temp[6].a = LN2 src0.a ;
-src0.a = temp[6] ,
-src1.a = float(72) : -- 32
-  temp[6].a = MAD src0.a src1.a src1.0 ;
-src0.a = temp[6] :
-  temp[6].a = EX2 src0.a ;
+--
+src0.a = temp[5] :    -- spec
+  temp[5].a = LN2 src0.a ;
+src0.a = temp[5] ,    -- spec
+src1.a = float(96) :  -- 32
+  temp[5].a = MAD src0.a src1.a src1.0 ;
+src0.a = temp[5] :    -- spec
+  temp[5].a = EX2 src0.a ;
 
 OUT TEX_SEM_WAIT
-src0.a = temp[4],
-src0.rgb = temp[4] ,
-src1.a = temp[6] ,
-src1.rgb = temp[6]   :
-  out[0].a    = MAX src0.a src0.a ,
-  out[0].rgb  = MAD src0.111 src1.aaa src1.000 ;
+src1.a = temp[5] ,
+src1.rgb = temp[5] :
+  out[0].a    = MAX src1.1 src0.1 ,
+  out[0].rgb  = MAD src1.111 src1.aaa src1.000 ;
diff --git a/drm/matrix_cubesphere_specular.vs.asm b/drm/matrix_cubesphere_specular.vs.asm
index 86a4d47..d5aba49 100644
--- a/drm/matrix_cubesphere_specular.vs.asm
+++ b/drm/matrix_cubesphere_specular.vs.asm
@@ -37,12 +37,14 @@ temp[3].z   = VE_DOT   const[6].xyz0    input[2].xyz0 ;
 -- position (clip space)
 out[0].xyzw = VE_ADD    temp[1].xyzw    const[0].0000 ;
 -- position (world space)
-out[1].xyzw = VE_ADD    temp[2].xyzw    const[0].0000 ;
+out[1].xyzw = VE_ADD    temp[2].xyzw    const[0].0000 ; -- 0 world pos
 -- normal
-out[2].xyzw = VE_ADD    temp[3].xyz0    const[0].0000 ;
+out[2].xyzw = VE_ADD    temp[3].xyz0    const[0].0000 ; -- 1 normal
 -- light pos (world space)
-out[3].xyzw = VE_ADD   const[8].xyzw    const[8].0000 ;
+out[3].xyzw = VE_ADD   const[8].xyzw    const[8].0000 ; -- 2 light pos / light dir
 -- view pos (world space)
-out[4].xyzw = VE_ADD   const[9].xyzw    const[9].0000 ;
+out[4].xyzw = VE_ADD   const[9].xyzw    const[9].0000 ; -- 3 view pos / view dir
 -- texture
-out[5].xyzw = VE_ADD   input[1].xy00    const[0].0000 ;
+out[5].xyzw = VE_ADD   input[1].xy00    const[0].0000 ; -- 4 texture
+
+                                                        -- 5 reflect dir
diff --git a/regs/assembler/vs/validator.py b/regs/assembler/vs/validator.py
index dd84fa7..c714379 100644
--- a/regs/assembler/vs/validator.py
+++ b/regs/assembler/vs/validator.py
@@ -382,7 +382,7 @@ def validate_instruction(ins):
     if len(opcodes) == 2:
         return validate_dual_math_instruction(ins.operations, opcodes)
     else:
-        assert len(opcodes) == 1
+        assert len(opcodes) == 1, (opcodes, ins)
         return validate_instruction_inner(ins.operations[0], opcodes[0])
 
 if __name__ == "__main__":