diff --git a/drm/matrix_cubesphere.cpp b/drm/matrix_cubesphere.cpp
index 4980b2f..31d6c62 100644
--- a/drm/matrix_cubesphere.cpp
+++ b/drm/matrix_cubesphere.cpp
@@ -279,7 +279,7 @@ mat4x4 perspective(float low1, float high1,
   return m2 * m1;
 }
 
-int _3d_cube_inner(int ix, mat4x4 trans, mat4x4 world_trans, mat3x3 normal_trans, vec4 light_pos)
+int _3d_cube_inner(int ix, mat4x4 trans, mat4x4 world_trans, vec4 light_pos)
 {
   T0V(VAP_PVS_STATE_FLUSH_REG, 0x00000000);
 
@@ -312,12 +312,6 @@ int _3d_cube_inner(int ix, mat4x4 trans, mat4x4 world_trans, mat3x3 normal_trans
     world_trans[3][0], world_trans[3][1], world_trans[3][2], world_trans[3][3],
 
     // 8
-    normal_trans[0][0], normal_trans[0][1], normal_trans[0][2], 0,
-    normal_trans[1][0], normal_trans[1][1], normal_trans[1][2], 0,
-    normal_trans[2][0], normal_trans[2][1], normal_trans[2][2], 0,
-    0, 0, 0, 0,
-
-    // 12
     light_pos.x, light_pos.y, light_pos.z, light_pos.w,
   };
   const int consts_length = (sizeof (consts)) / (sizeof (consts[0]));
@@ -373,9 +367,9 @@ int _3d_cube_inner(int ix, mat4x4 trans, mat4x4 world_trans, mat3x3 normal_trans
       ib[ix++].f32 = p.z;
       ib[ix++].f32 = t.x;
       ib[ix++].f32 = t.y;
-      ib[ix++].f32 = n.x;//n.x;//n.x;
-      ib[ix++].f32 = n.y;//0;//n.y;//n.y;
-      ib[ix++].f32 = n.z;//n.z;
+      ib[ix++].f32 = n.x;
+      ib[ix++].f32 = n.y;
+      ib[ix++].f32 = n.z;
     }
   }
 
@@ -445,7 +439,7 @@ int _3d_cube(int ix, float theta)
       | RS_IP__TEX_PTR_Q(11)
       | RS_IP__OFFSET_EN(0)
       );
-  T0V(RS_IP_2
+  T0V(RS_IP_3
       , RS_IP__TEX_PTR_S(12)
       | RS_IP__TEX_PTR_T(13)
       | RS_IP__TEX_PTR_R(14)
@@ -616,41 +610,40 @@ int _3d_cube(int ix, float theta)
                          0.5f, 2.0f);
 
   vec4 light_pos = vec4(0, 0, 0, 1.0f);
+
   // light
   if (1) {
     mat4x4 t = translate(vec3(0, 0, 3));
-    mat4x4 t1 = translate(vec3(1, 1, 1));
+    mat4x4 t1 = translate(vec3(1, 0, 0));
     mat4x4 s = scale(0.1f);
     mat4x4 rz = rotate_y(theta * 2.f);
 
     mat4x4 world_trans = rz * t1 * s;
 
-    mat3x3 normal_trans = transpose(inverse(submatrix(world_trans, 3, 3)));
+    // unused: the vertex shader now transforms normals with world_trans itself; was: normal_trans = transpose(inverse(submatrix(world_trans, 3, 3)))
 
     mat4x4 trans = aspect * p * t * world_trans;
 
     light_pos = world_trans * light_pos;
 
-    ix = _3d_cube_inner(ix, trans, world_trans, normal_trans, light_pos);
+    ix = _3d_cube_inner(ix, trans, world_trans, light_pos);
   }
 
+
   // object
   if (1) {
     mat4x4 t = translate(vec3(0, 0, 3));
-    mat4x4 rx = rotate_x(0 * theta1 * 0.5f);
+    mat4x4 rx = rotate_x(1 * theta1 * 0.5f);
     mat4x4 ry = rotate_y(0 * theta2 * 0.8f + 1.4f);
-    mat4x4 s = scale(0.7f);
+    mat4x4 s = scale(0.9f);
 
     mat4x4 world_trans = rx * ry * s;
 
-    mat3x3 normal_trans = transpose(inverse(submatrix(world_trans, 3, 3)));
+    // unused: the vertex shader now transforms normals with world_trans itself; was: normal_trans = transpose(inverse(submatrix(world_trans, 3, 3)))
 
     mat4x4 trans = aspect * p * t * world_trans;
 
-    printf("light_pos % 2.03f % 2.03f % 2.03f % 2.03f\n",
-           light_pos.x, light_pos.y, light_pos.z, light_pos.w);
-
-    ix = _3d_cube_inner(ix, trans, world_trans, normal_trans, light_pos);
+    ix = _3d_cube_inner(ix, trans, world_trans, light_pos);
   }
 
   return ix;
diff --git a/drm/matrix_cubesphere.fs.asm b/drm/matrix_cubesphere.fs.asm
index 7991f1a..7ed97b7 100644
--- a/drm/matrix_cubesphere.fs.asm
+++ b/drm/matrix_cubesphere.fs.asm
@@ -1,55 +1,50 @@
--- temp[0] -- texture
+-- temp[0] -- position (world space)
 -- temp[1] -- normal
--- temp[2] -- (world space) fragment position
--- temp[3] -- (world space) light position
+-- temp[2] -- light pos (world space)
+-- temp[3] -- texture
 
 -- PIXSIZE 4
 
 TEX TEX_SEM_WAIT TEX_SEM_ACQUIRE
-  temp[0].rgba = LD tex[0].rgba temp[0].rgaa ;
+  temp[3].rgba = LD tex[0].rgba temp[3].rgaa ;
 
--- normalize:
--- v * 1.0f / sqrt(dot(v, v))
-
--- norm = normalize(Normal)
+-- normal = normalize(normal)
+-- normal = (1.0 / sqrt(dot(normal, normal))) * normal
 src0.rgb = temp[1] :
-                DP3 src0.rgb src0.rgb ,
-  temp[1].a   = DP ;
-src0.a = temp[1] :
-  temp[1].a   = RSQ src0.a ;
-src0.rgb = temp[1] ,
+              DP3 src0.rgb src0.rgb ,
+  temp[1].a = DP ;
 src0.a = temp[1] :
+  temp[1].a   = RSQ |src0.a| ;
+src0.a = temp[1], src0.rgb = temp[1] :
   temp[1].rgb = MAD src0.rgb src0.aaa src0.000 ;
 
--- temp[2] -- (world space) fragment position
--- temp[3] -- (world space) light position
--- lightDir = normalize(lightPos - fragPos)
--- srcp.rgb = (src1.rgb - src0.rgb)
-src1.rgb = temp[3] ,
-src0.rgb = temp[2] ,
-srcp.rgb = neg :
+-- light_dir = normalize(f_light_pos - f_world_pos)
+src1.rgb = temp[2] , -- f_light_pos
+src0.rgb = temp[0] , -- f_world_pos
+srcp.rgb = neg :     -- (f_light_pos - f_world_pos)
               DP3 srcp.rgb srcp.rgb ,
-  temp[3].a = DP ;
-src0.a = temp[3] :
-  temp[3].a   = RSQ src0.a ;
-src0.rgb = temp[3] ,
-src0.a = temp[3] :
-  temp[3].rgb = MAD src0.rgb src0.aaa src0.000 ;
+  temp[2].a = DP ;
+src0.a = temp[2] :
+  temp[2].a   = RSQ |src0.a| ;
+src0.a = temp[2], src0.rgb = temp[2] :
+  temp[2].rgb = MAD src0.rgb src0.aaa src0.000 ;
 
--- diff = dot(norm, lightDir)
--- diff = dot(temp[1].rgb, temp[3].rgb)
-src0.rgb = temp[1] ,
-src1.rgb = temp[3] :
-  temp[4].r = DP3 src0.rgb src1.rgb ;
+-- dot(normal, light_dir)
+src0.rgb = temp[2] ,
+src1.rgb = temp[1] :
+                DP3 src0.rgb src1.rgb ,
+   temp[4].a  = DP ;
 
-src0.rgb = temp[4] :
-  temp[4].r = MAX src0.r00 src0.000 ;
+src0.a = temp[4] :
+  temp[4].a = MAX src0.a src0.0 ;
+
+src0.a = temp[4] ,
+src1.a = float(32) :
+  temp[4].a = MAD src0.a src0.1 src1.a ;
 
 OUT TEX_SEM_WAIT
-src0.a = temp[0], src0.rgb = temp[0] ,
-src1.rgb = temp[4] ,
-src2.rgb = temp[1] :
-  out[0].a    = MAD src0.a src1.1 src1.0 ,
-  out[0].rgb  = MAD src0.rgb src1.rrr src1.000 ;
-  --out[0].rgb  = MAD src2.rgb src2.100 src1.000 ;
-  --out[0].rgb  = MAD src2.r00 src1.rrr src1.000 ;
+src0.a = temp[3],
+src0.rgb = temp[3] ,
+src1.a = temp[4] :
+  out[0].a    = MAX src0.a src0.a ,
+  out[0].rgb  = MAD src0.rgb src1.aaa src2.000 ;
diff --git a/drm/matrix_cubesphere.vs.asm b/drm/matrix_cubesphere.vs.asm
index b03cd86..45d6fd4 100644
--- a/drm/matrix_cubesphere.vs.asm
+++ b/drm/matrix_cubesphere.vs.asm
@@ -28,13 +28,18 @@ temp[2].y   = VE_DOT   const[5].xyzw    input[0].xyzw ;
 temp[2].z   = VE_DOT   const[6].xyzw    input[0].xyzw ;
 temp[2].w   = VE_DOT   const[7].xyzw    input[0].xyzw ;
 
-temp[3].x   = VE_DOT   const[8].xyz0    input[2].xyz0 ;
-temp[3].y   = VE_DOT   const[9].xyz0    input[2].xyz0 ;
-temp[3].z   = VE_DOT  const[10].xyz0    input[2].xyz0 ;
---temp[3].xyzw = VE_MAX input[2].xyz0 input[2].xyz0 ;
+-- normal world space
+temp[3].x   = VE_DOT   const[4].xyz0    input[2].xyz0 ;
+temp[3].y   = VE_DOT   const[5].xyz0    input[2].xyz0 ;
+temp[3].z   = VE_DOT   const[6].xyz0    input[2].xyz0 ;
 
-out[0].xyzw = VE_MAX    temp[1].xyzw     temp[1].xyzw ; -- position clip space
-out[1].xyzw = VE_MAX   input[1].xyzw    input[1].xyzw ; -- texture
-out[2].xyzw = VE_MAX    temp[3].xyz0     temp[3].xyz0 ; -- normal
-out[3].xyzw = VE_MAX    temp[2].xyzw     temp[2].xyzw ; -- position world space
-out[4].xyzw = VE_MAX  const[12].xyzw   const[12].xyzw ; -- light position world space
+-- position (clip space)
+out[0].xyzw = VE_ADD    temp[1].xyzw    const[0].0000 ;
+-- position (world space)
+out[1].xyzw = VE_ADD    temp[2].xyzw    const[0].0000 ;
+-- normal
+out[2].xyzw = VE_ADD    temp[3].xyz0    const[0].0000 ;
+-- light pos (world space)
+out[3].xyzw = VE_ADD   const[8].xyzw    const[8].0000 ;
+-- texture
+out[4].xyzw = VE_ADD   input[1].xy00    const[0].0000 ;
diff --git a/drm/texture_cube_warping.vs.asm b/drm/texture_cube_warping.vs.asm
index f949ef0..db90c82 100644
--- a/drm/texture_cube_warping.vs.asm
+++ b/drm/texture_cube_warping.vs.asm
@@ -1,3 +1,3 @@
 temp[0].x       = VE_MUL       input[0].z___     input[0].1___     ;
 out[1].xy       = VE_ADD       input[1].xy__     input[1].0000     ;
-out[0].xyzw     = VE_ADD       temp[0].-0-0-x-0  input[0].xy01     ;
+out[0].xyzw     = VE_ADD       temp[0].-0-0-x-x  input[0].xy00     ;
diff --git a/regs/assembler/vs/validator.py b/regs/assembler/vs/validator.py
index 949a02d..dd84fa7 100644
--- a/regs/assembler/vs/validator.py
+++ b/regs/assembler/vs/validator.py
@@ -232,13 +232,13 @@ def validate_source_address_counts(sources_ast, sources, opcode):
 
     input_count = len(addresses_by_type(sources, SourceType.input))
     if input_count > 1:
-        source_ix = source_with_type_reversed(sources, SourceType.input)
+        source_ix = source_ix_with_type_reversed(sources, SourceType.input)
         raise ValidatorError(f"too many input addresses in operation(s); expected 1, have {input_count}",
                              sources_ast[source_ix].offset_identifier)
 
     alt_temporary_count = len(addresses_by_type(sources, SourceType.alt_temporary))
     if alt_temporary_count > 1:
-        source_ix = source_with_type_reversed(sources, SourceType.alt_temporary)
+        source_ix = source_ix_with_type_reversed(sources, SourceType.alt_temporary)
         raise ValidatorError(f"too many alt temporary addresses in operation(s); expected 1, have {alt_temporary_count}",
                              sources_ast[source_ix].offset_identifier)