wiffle_screen_space_store_queue2: add alpha blending

2025-01-28 02:05:14 -06:00 · 2025-01-28 02:05:14 -06:00 · b156e2d24e
commit b156e2d24e
parent 306294cfff
9 changed files with 316 additions and 111 deletions
--- a/2
+++ b/2
@ -4,7 +4,7 @@ include base.mk
 include common.mk
 include headers.mk

-OPT = -O2
+OPT = -Og
 MAKEFILE_PATH := $(patsubst %/,%,$(dir $(abspath $(firstword $(MAKEFILE_LIST)))))
 CFLAGS += -I$(MAKEFILE_PATH)
 LIB ?= $(MAKEFILE_PATH)
--- a/example/example.mk
+++ b/example/example.mk
@ -244,7 +244,8 @@ WIFFLE_SCREEN_SPACE_STORE_QUEUE2_OBJ = \
 	holly/background.o \
 	holly/ta_fifo_polygon_converter.o \
 	sh7091/serial.o \
-	sobel_fipr_store_queue2.o
+	sobel_fipr_store_queue2.o \
+	$(LIBGCC)

 example/wiffle_screen_space_store_queue2.elf: LDSCRIPT = $(LIB)/main.lds
 example/wiffle_screen_space_store_queue2.elf: $(START_OBJ) $(WIFFLE_SCREEN_SPACE_STORE_QUEUE2_OBJ)
--- a/example/wiffle_screen_space_store_queue2.cpp
+++ b/example/wiffle_screen_space_store_queue2.cpp
@ -143,6 +143,85 @@ void transfer_scene(float theta)
  sq_transfer_32byte(ta_fifo_polygon_converter);
 }

+struct quad_vertex { // one corner of the quad: position plus texture coordinate
+  float x; // position; x/y/z are passed to ta_vertex_parameter::sprite_type_1
+  float y;
+  float z;
+  float u; // texture coordinate; u/v are packed with uv_16bit() before submission
+  float v;
+};
+
+// screen space coordinates; the UV extents place a 640x480 image inside a 1024x512 texture
+constexpr float x_uv = 640.f / 1024.f; // U used at x = 640
+constexpr float y_uv = 480.f / 512.f; // V used at y = 480
+
+const struct quad_vertex quad_vertices[] = { // columns: x, y, z, u, v
+  { 0.f,   0.f,   0.1f, 0.0f, 0.0f },
+  { 640.f, 0.f,   0.1f, x_uv, 0.0f },
+  { 640.f, 480.f, 0.1f, x_uv, y_uv },
+  { 0.f,   480.f, 0.1f, 0.0f, y_uv },
+};
+// Writes one textured sprite (translucent list) spanning quad_vertices through the store queue; use_alpha selects the blend factors and the use_alpha TSP bit.
+void transfer_translucent_quad(uint32_t texture_address, bool use_alpha)
+{
+  const uint32_t parameter_control_word = para_control::para_type::sprite
+                                        | para_control::list_type::translucent
+                                        | obj_control::col_type::packed_color
+                                        | obj_control::texture
+                                        | obj_control::_16bit_uv; // UVs are packed with uv_16bit() below
+
+  const uint32_t isp_tsp_instruction_word = isp_tsp_instruction_word::depth_compare_mode::always
+                                          | isp_tsp_instruction_word::culling_mode::no_culling;
+
+  const uint32_t alpha = // blend factors chosen when use_alpha is true
+    tsp_instruction_word::src_alpha_instr::inverse_src_alpha |
+    tsp_instruction_word::dst_alpha_instr::src_alpha;
+  const uint32_t no_alpha = // blend factors chosen when use_alpha is false
+    tsp_instruction_word::src_alpha_instr::one |
+    tsp_instruction_word::dst_alpha_instr::zero;
+
+  const uint32_t tsp_instruction_word = (use_alpha ? alpha : no_alpha)
+                                      | tsp_instruction_word::fog_control::no_fog
+                                      | tsp_instruction_word::texture_u_size::from_int(1024) // same 1024 that divides x_uv
+                                      | tsp_instruction_word::texture_v_size::from_int(512) // same 512 that divides y_uv
+                                      | (use_alpha ? tsp_instruction_word::use_alpha : 0);
+
+  const uint32_t texture_control_word = texture_control_word::pixel_format::_4444
+                                      | texture_control_word::scan_order::non_twiddled
+                                      | texture_control_word::texture_address(texture_address / 8) // NOTE(review): presumably the field counts 8-byte units; confirm
+                                      | texture_control_word::stride_select; // NOTE(review): presumably takes the row stride main() writes to TEXT_CONTROL; confirm
+
+  const uint32_t base_color = 0xffff00ff;
+  *reinterpret_cast<ta_global_parameter::sprite *>(store_queue) =
+    ta_global_parameter::sprite(parameter_control_word,
+                                isp_tsp_instruction_word,
+                                tsp_instruction_word,
+                                texture_control_word,
+                                base_color,
+                                0,  // offset_color
+                                0,  // data_size_for_sort_dma
+                                0); // next_address_for_sort_dma
+  sq_transfer_32byte(ta_fifo_polygon_converter); // sends the global parameter written above
+
+  *reinterpret_cast<ta_vertex_parameter::sprite_type_1 *>(store_queue) =
+    ta_vertex_parameter::sprite_type_1(para_control::para_type::vertex_parameter,
+				       quad_vertices[0].x,
+				       quad_vertices[0].y,
+				       quad_vertices[0].z,
+				       quad_vertices[1].x,
+				       quad_vertices[1].y,
+				       quad_vertices[1].z,
+				       quad_vertices[2].x,
+				       quad_vertices[2].y,
+				       quad_vertices[2].z,
+				       quad_vertices[3].x,
+				       quad_vertices[3].y, // the fourth vertex is passed without z
+                                       uv_16bit(quad_vertices[0].u, quad_vertices[0].v),
+                                       uv_16bit(quad_vertices[1].u, quad_vertices[1].v),
+                                       uv_16bit(quad_vertices[2].u, quad_vertices[2].v)); // only three UV pairs are passed
+  sq_transfer_64byte(ta_fifo_polygon_converter); // sends the vertex parameter written above
+}
+
 void dma_transfer(uint32_t source, uint32_t destination, uint32_t transfers)
 {
  using namespace dmac;
@ -227,13 +306,14 @@ void main()
  dma_init();
  video_output::set_mode_vga();

+  const int render_passes = 1;
+
  constexpr uint32_t ta_alloc = ta_alloc_ctrl::pt_opb::no_list
 			      | ta_alloc_ctrl::tm_opb::no_list
 			      | ta_alloc_ctrl::t_opb::_16x4byte
 			      | ta_alloc_ctrl::om_opb::no_list
                              | ta_alloc_ctrl::o_opb::no_list;

-  const int render_passes = 1;
  const struct opb_size opb_size[render_passes] = {
    {
      .opaque = 0,
@ -244,6 +324,22 @@ void main()
    }
  };

+  constexpr uint32_t ta_alloc2 = ta_alloc_ctrl::pt_opb::no_list
+                               | ta_alloc_ctrl::tm_opb::no_list
+                               | ta_alloc_ctrl::t_opb::_16x4byte
+                               | ta_alloc_ctrl::om_opb::no_list
+                               | ta_alloc_ctrl::o_opb::no_list;
+
+  const struct opb_size opb_size2[render_passes] = {
+    {
+      .opaque = 0,
+      .opaque_modifier = 0,
+      .translucent = 16 * 4,
+      .translucent_modifier = 0,
+      .punch_through = 0
+    }
+  };
+
  holly.SOFTRESET = softreset::pipeline_soft_reset
 		  | softreset::ta_soft_reset;
  holly.SOFTRESET = 0;
@ -265,66 +361,45 @@ void main()
 			 render_passes,
 			 texture_memory_alloc.region_array[0].start,
 			 texture_memory_alloc.object_list[0].start);
-
  background_parameter2(texture_memory_alloc.background[0].start,
 			0xffc0c0c0);

+  region_array_multipass(tile_width,
+			 tile_height,
+			 opb_size2,
+			 render_passes,
+			 texture_memory_alloc.region_array[1].start,
+			 texture_memory_alloc.object_list[1].start);
+  background_parameter2(texture_memory_alloc.background[1].start,
+			0xffc0c0c0);

  holly.FB_R_SOF1 = texture_memory_alloc.framebuffer[0].start;

  holly.FB_R_CTRL = fb_r_ctrl::vclk_div::pclk_vclk_1
-                  | fb_r_ctrl::fb_depth::_0888_rgb_32bit
+                  | fb_r_ctrl::fb_depth::_565_rgb_16bit
                  | fb_r_ctrl::fb_enable;

  holly.FB_R_SIZE = fb_r_size::fb_modulus(1)
                  | fb_r_size::fb_y_size(480 - 3)
-                  | fb_r_size::fb_x_size((640 * 32) / 32 - 1);
+                  | fb_r_size::fb_x_size((640 * 16) / 32 - 1);

-  holly.FB_W_CTRL = fb_w_ctrl::fb_packmode::_8888_argb_32bit;
+  holly.TEXT_CONTROL = text_control::stride(20); // 640 pixels

-  system.LMMODE0 = 1;
-  system.LMMODE1 = 1; // 32-bit
-
-  uint32_t * out = (uint32_t *)&texture_memory32[texture_memory_alloc.framebuffer[0].start / 4];
-  for (int i = 0; i < 640 * 480; i++) {
-    out[i] = 0xffff0000;
-  }
-
-  ta_polygon_converter_init2(texture_memory_alloc.isp_tsp_parameters[0].start,
-                             texture_memory_alloc.isp_tsp_parameters[0].end,
-                             texture_memory_alloc.object_list[0].start,
-                             texture_memory_alloc.object_list[0].end,
-                             opb_size[0].total(),
-                             ta_alloc,
-                             tile_width,
-                             tile_height);
-  transfer_scene(theta);
-  ta_wait_translucent_list();
-
-  const uint32_t bytes_per_pixel = 4;
-  core_start_render3(texture_memory_alloc.region_array[0].start,
-                     texture_memory_alloc.isp_tsp_parameters[0].start,
-                     texture_memory_alloc.background[0].start,
-                     //texture_memory_alloc.framebuffer[0].start,
-                     0x100'0000 | texture_memory_alloc.texture.start, // 64-bit area
-                     framebuffer_width,
-                     bytes_per_pixel);
-
-  ta_polygon_converter_init2(texture_memory_alloc.isp_tsp_parameters[0].start,
-                             texture_memory_alloc.isp_tsp_parameters[0].end,
-                             texture_memory_alloc.object_list[0].start,
-                             texture_memory_alloc.object_list[0].end,
-                             opb_size[0].total(),
-                             ta_alloc,
-                             tile_width,
-                             tile_height);
-  transfer_scene(theta);
+  //system.LMMODE0 = 1;
+  //system.LMMODE1 = 1; // 32-bit
+  system.LMMODE0 = 0;
+  system.LMMODE1 = 0; // 64-bit

  uint32_t * in = (uint32_t *)&texture_memory64[texture_memory_alloc.texture.start / 4];
-  uint32_t * framebuffer = (uint32_t *)(0x11000000 + texture_memory_alloc.framebuffer[0].start);
+
+  /*
+  for (int i = 0; i < 640 * 480; i++) {
+    uint32_t * framebuffer = (uint32_t *)(0x11000000 + texture_memory_alloc.framebuffer[0].start);
+    framebuffer[i] = 0xffff0000;
+  }
+  */

  while (1) {
-    ta_wait_translucent_list();
    ta_polygon_converter_init2(texture_memory_alloc.isp_tsp_parameters[0].start,
 			       texture_memory_alloc.isp_tsp_parameters[0].end,
 			       texture_memory_alloc.object_list[0].start,
@ -333,32 +408,75 @@ void main()
 			       ta_alloc,
 			       tile_width,
 			       tile_height);
-
    transfer_scene(theta);
+    //serial::string("wait_tl1\n");
+    ta_wait_translucent_list();
+    //serial::string("wait_tl1 end\n");
+
+    holly.FB_W_CTRL = fb_w_ctrl::fb_packmode::_4444_argb_16bit;

-    core_wait_end_of_render_video();
    core_start_render3(texture_memory_alloc.region_array[0].start,
                       texture_memory_alloc.isp_tsp_parameters[0].start,
                       texture_memory_alloc.background[0].start,
-                       //texture_memory_alloc.framebuffer[0].start,
                       0x100'0000 | texture_memory_alloc.texture.start, // 64-bit area
                       framebuffer_width,
-                       bytes_per_pixel);
+                       2); // bytes_per_pixel
+    //serial::string("wait_eorv1\n");
+    core_wait_end_of_render_video();
+    //serial::string("wait_eorv1 end\n");

-    dma_transfer((uint32_t)in, (uint32_t)inbuf, 640 * 480 * 4 / 32);
+    dma_transfer((uint32_t)in, (uint32_t)inbuf, 640 * 480 * 2 / 32);
    while ((sh7091.DMAC.CHCR1 & dmac::chcr::te::transfers_completed) == 0);

-    sobel_fipr_store_queue2(inbuf, framebuffer, temp);
+    //sobel_fipr_store_queue2(inbuf, out, temp);
+    int frame = frame_ix & 1;
+    uint32_t * framebuffer = (uint32_t *)(0x11000000 + texture_memory_alloc.framebuffer[0].start);
+    uint32_t * out = (uint32_t *)(0x11000000 + texture_memory_alloc.texture.start + 640 * 480 * 2);
+    //serial::string("sobel\n");
+    //sobel_fipr_store_queue2(inbuf, framebuffer, temp);
+    sobel_fipr_store_queue2(inbuf, out, temp);
+
+    ta_polygon_converter_init2(texture_memory_alloc.isp_tsp_parameters[1].start,
+			       texture_memory_alloc.isp_tsp_parameters[1].end,
+			       texture_memory_alloc.object_list[1].start,
+			       texture_memory_alloc.object_list[1].end,
+			       opb_size2[0].total(),
+			       ta_alloc2,
+			       tile_width,
+			       tile_height);
+
+    const uint32_t texture_address0 = texture_memory_alloc.texture.start;
+    transfer_translucent_quad(texture_address0, false);
+    const uint32_t texture_address1 = texture_memory_alloc.texture.start + 640 * 480 * 2;
+    transfer_translucent_quad(texture_address1, true);
+    *reinterpret_cast<ta_global_parameter::end_of_list *>(store_queue) =
+    ta_global_parameter::end_of_list(para_control::para_type::end_of_list);
+    sq_transfer_32byte(ta_fifo_polygon_converter);
+
+    //serial::string("wait_tl2\n");
+    ta_wait_translucent_list();
+    //serial::string("wait_tl2 end\n");
+
+    holly.FB_W_CTRL = fb_w_ctrl::fb_packmode::_565_rgb_16bit;
+
+    core_start_render3(texture_memory_alloc.region_array[1].start,
+                       texture_memory_alloc.isp_tsp_parameters[1].start,
+                       texture_memory_alloc.background[1].start,
+                       texture_memory_alloc.framebuffer[frame].start,
+                       framebuffer_width,
+                       2); // bytes_per_pixel
+    //serial::string("wait_eorv2\n");
+    core_wait_end_of_render_video();
+    //serial::string("wait_eorv2 end\n");
+
+    while (!spg_status::vsync(holly.SPG_STATUS));
+    holly.FB_R_SOF1 = texture_memory_alloc.framebuffer[frame].start;
+    while (spg_status::vsync(holly.SPG_STATUS));

    theta += half_degree;
    frame_ix += 1;
-    if (frame_ix > 100)
-      break;
  }

-  ta_wait_translucent_list();
-  core_wait_end_of_render_video();
-
  serial::string("return\n");
  serial::string("return\n");
  serial::string("return\n");
--- a/holly/core.cpp
+++ b/holly/core.cpp
@ -140,18 +140,43 @@ void core_wait_end_of_render_video()
    "Furthermore, it is strongly recommended that the End of ISP and End of Video interrupts
    be cleared at the same time in order to make debugging easier when an error occurs."
  */
-  while ((system.ISTNRM & istnrm::end_of_render_tsp) == 0) {
+  //serial::string("eorv\n");
+  int64_t count = 0;
+  while (1) {
+    uint32_t istnrm = system.ISTNRM;
+    if ((istnrm & istnrm::end_of_render_tsp) != 0)
+      break;
+    if (istnrm & 0xc0000000) {
+      serial::string("istnrm ");
+      serial::integer<uint32_t>(istnrm);
+      serial::string("isterr ");
+      serial::integer<uint32_t>(system.ISTERR);
+    }
+
+    //serial::integer<uint32_t>(system.ISTERR);
    if (system.ISTERR) {
      //serial::string("core ");
      //serial::integer<uint32_t>(system.ISTERR);
      holly.SOFTRESET = softreset::pipeline_soft_reset;
      holly.SOFTRESET = 0;
+      //break;
+    }
+    if (count > 10000000) {
+      serial::string("core timeout:\n");
+      serial::string("isterr ");
+      serial::integer<uint32_t>(system.ISTERR);
+      serial::string("istnrm ");
+      serial::integer<uint32_t>(system.ISTNRM);
      break;
    }
+    count += 1;
  };
  system.ISTNRM = istnrm::end_of_render_tsp
 		| istnrm::end_of_render_isp
 		| istnrm::end_of_render_video;
+
+  holly.SOFTRESET = softreset::pipeline_soft_reset;
+  holly.SOFTRESET = 0;
 }

 void core_flip(uint32_t frame_ix)
--- a/holly/texture_memory_alloc3.hpp
+++ b/holly/texture_memory_alloc3.hpp
@ -30,10 +30,10 @@ constexpr texture_memory_alloc texture_memory_alloc = {
  .background         = {{0x07'ffe0, 0x08'0000}, {0x47'ffe0, 0x48'0000}},
  .object_list        = {{0x08'0000, 0x0f'ffe0}, {0x48'0000, 0x4f'ffe0}}, // ~122880 object list pointers
  .region_array       = {{0x10'0000, 0x11'0000}, {0x50'0000, 0x51'0000}}, // ~9 render passes
-  //.framebuffer        = {{0x11'0000, 0x1b'8c00}, {0x51'0000, 0x5b'8c00}}, // 720x480*2
-  .framebuffer        = {{0x11'0000, 0x23'c000}, {0x51'0000, 0x63'c000}}, // 640x480*4
+  .framebuffer        = {{0x11'0000, 0x1b'8c00}, {0x51'0000, 0x5b'8c00}}, // 720x480*2
+  //.framebuffer        = {{0x11'0000, 0x23'c000}, {0x51'0000, 0x63'c000}}, // 640x480*4

  // 64-bit addresses
-  //.texture = {0x37'1800, 0x80'0000}
-  .texture = {0x57'1800, 0x80'0000}
+  .texture = {0x37'1800, 0x80'0000}
+  //.texture = {0x57'1800, 0x80'0000}
 };
--- a/sobel_fipr_inner2.s
+++ b/sobel_fipr_inner2.s
@ -1,3 +1,5 @@
+        .macro inner_multiplication
+
        /* y multiplication */
        mov #4,r1           /* r1 : temporary */
        fmov.s @r0,fr0      /* 0 */
@ -49,11 +51,27 @@
        fsts FPUL,fr3
        fadd fr3,fr7

+        add #4,r0 /* next pixel */
+
        fschg
        fmov xd0,dr0 /* load 100.f constant */
        fcmp/gt fr0,fr7
        fschg
+
+        .endm
+
+        .macro sobel_fipr_inner_2px
+        mov #0,r9
+        /* build two 16-bit results: first pixel in r9[15:0], second in r9[31:16]; r1 is clobbered */
+        inner_multiplication
         movt r9
         add #-1,r9
+        extu.w r9,r9 /* r9 = 0x0000 if T was set by the fcmp/gt, 0xffff otherwise */
 
-        add #4,r0 /* next pixel */
+        inner_multiplication
+        movt r1
+        add #-1,r1
+        extu.w r1,r1 /* same 0x0000 / 0xffff encoding for the second pixel */
+        shll16 r1 /* move the second pixel into the upper halfword */
+        or r1,r9
+        .endm
--- a/sobel_fipr_store_queue2.s
+++ b/sobel_fipr_store_queue2.s
@ -15,7 +15,7 @@ _sobel_fipr_store_queue2:
        /* r11: var   (y loop counter)                               */
        /* r12: var   (prefetch address: input address + 1280  4)    */
        /* r13: var   (input address) */
-        /* r14: -    */
+        /* r14: (temporary)    */

 __setup:
        mov.l r8,@-r15
@ -24,6 +24,7 @@ __setup:
        mov.l r11,@-r15
        mov.l r12,@-r15
        mov.l r13,@-r15
+        mov.l r14,@-r15
        fmov.s  fr12,@-r15
        fmov.s  fr13,@-r15
        fmov.s  fr14,@-r15
@ -81,7 +82,7 @@ __setup:
        nop

        .align 4
-_const_100f:    .float 3900
+_const_100f:    .float 50

 _const_store_queue:             .long 0xe0000000
 _const_store_queue_mask:        .long 0x03ffffc0 /* (0xffffffff & (~0b111111)) & (~(0b111111 << 26)) */
@ -93,54 +94,56 @@ _const_1280:    .short (1280 * 4)
 _const_1281:    .short (1281 * 4)
 _const_1282:    .short (1282 * 4)

-        /* use r10 as temporary to load the first 1280 pixels; 8 pixels per loop iteration */
+        /* use r10 as temporary to load the first 1280 pixels; 16 pixels per loop iteration */
+        .include "unpack_pixel.s"
        .align 4
 _prime_pixels_loop_init:
-        mov #80,r10               /* 1280 / 8 */
-        shll r10
        mov r0,r12
+        mov #80,r10               /* 80, doubled by the shll below: 160 iterations of 16 pixels */
+        shll r10

 _prime_pixels_loop:
-        .include "unpack_pixel.s"
+        unpack_pixel_16
        dt r10
        bt _loop_init
        bra _prime_pixels_loop
        nop

-        .align 4
 _loop_init:
-        /* skip first row */
-        add r3,r0 /* r3: const (640 * 4) */
-        add r3,r8
+        /* skip first output row */
+        mov r3,r1
+        shlr r1
+        add r1,r8 /* r1 = r3 / 2, where r3 is 640 * 4 */

        mov.w _const_height,r11   /* 478      */
        bra _loop
-        mov #80,r10               /* 640 / 8 */
+        mov #40,r10               /* 640 / 16 */

-_const_height:     .short 478
+_const_height:     .short 476
+/*_const_height:     .short 238*/

+        .include "sobel_fipr_inner2.s"
 _loop:
 _loop_width:
        /* prefetch at r8 + 1280 */
+        unpack_pixel_16

-        /* process the next 8 pixels */
-        .include "unpack_pixel.s"
-
-        .include "sobel_fipr_inner2.s"
+        /* process the next 16 pixels */
+        sobel_fipr_inner_2px
        mov.l r9,@r8     /* save result in the store queue */
-        .include "sobel_fipr_inner2.s"
+        sobel_fipr_inner_2px
        mov.l r9,@(4,r8) /* save result in the store queue */
-        .include "sobel_fipr_inner2.s"
+        sobel_fipr_inner_2px
        mov.l r9,@(8,r8) /* save result in the store queue */
-        .include "sobel_fipr_inner2.s"
+        sobel_fipr_inner_2px
        mov.l r9,@(12,r8) /* save result in the store queue */
-        .include "sobel_fipr_inner2.s"
+        sobel_fipr_inner_2px
        mov.l r9,@(16,r8) /* save result in the store queue */
-        .include "sobel_fipr_inner2.s"
+        sobel_fipr_inner_2px
        mov.l r9,@(20,r8) /* save result in the store queue */
-        .include "sobel_fipr_inner2.s"
+        sobel_fipr_inner_2px
        mov.l r9,@(24,r8) /* save result in the store queue */
-        .include "sobel_fipr_inner2.s"
+        sobel_fipr_inner_2px
        mov.l r9,@(28,r8) /* save result in the store queue */

        /* send the store queue */
@ -158,7 +161,7 @@ _row_decrement:
        dt r11
        bt _return
        bra _loop
-        mov #80,r10 /* 640 / 8 */
+        mov #40,r10 /* 640 / 16 */

        /* restore registers */
 _return:
@ -166,6 +169,7 @@ _return:
        fmov.s  @r15+,fr14
        fmov.s  @r15+,fr13
        fmov.s  @r15+,fr12
+        mov.l @r15+,r14
        mov.l @r15+,r13
        mov.l @r15+,r12
        mov.l @r15+,r11
--- a/unpack_pixel.s
+++ b/unpack_pixel.s
@ -1,25 +1,63 @@
-        ocbi @r13
-        pref @r13 /* 32 bytes, 8 pixels */
-
        /* unpack the next 8 pixels */

-        fschg
+        /*
+        mov.l @r13,r9
+        extu.b r9,r1
+        shlr8 r9
+        extu.b r9,r2
+        add r1,r2
+        shlr8 r9
+        extu.b r9,r1
+        add r1,r2
+        shlr8 r9
+        add r2,r9
+        lds r9,fpul
+        add #4,r13
+        */
+        .macro unpack_pixel_inner_nibs
+        /* FPUL = sum of the four 4-bit fields of the 16-bit word at @r13; r13 += 2; needs r14 = 15; clobbers r1, r2, r9 */
-        .include "unpack_pixel_inner.s"
+        mov.w @r13+,r9
+
+        mov r9,r1 /* nib0 */
+        shlr2 r9
+        shlr2 r9
+        and r14,r1
+
+        mov r9,r2 /* nib1 */
+        shlr2 r9
+        shlr2 r9
+        and r14,r2
+        add r2,r1
+
+        mov r9,r2  /* nib2 */
+        shlr2 r9
+        shlr2 r9
+        and r14,r2
+        add r2,r1
+
+        and r14,r9 /* nib3; the mask also drops the sign extension from mov.w */
+        add r9,r1
+
+        lds r1,fpul
+
+        .endm
+
+        .macro unpack_pixel_8
+        unpack_pixel_inner_nibs
        float fpul,fr0
-        .include "unpack_pixel_inner.s"
+        unpack_pixel_inner_nibs
        float fpul,fr1
-        .include "unpack_pixel_inner.s"
+        unpack_pixel_inner_nibs
        float fpul,fr2
-        .include "unpack_pixel_inner.s"
+        unpack_pixel_inner_nibs
        float fpul,fr3
-        .include "unpack_pixel_inner.s"
+        unpack_pixel_inner_nibs
        float fpul,fr4
-        .include "unpack_pixel_inner.s"
+        unpack_pixel_inner_nibs
        float fpul,fr5
-        .include "unpack_pixel_inner.s"
+        unpack_pixel_inner_nibs
        float fpul,fr6
-        .include "unpack_pixel_inner.s"
+        unpack_pixel_inner_nibs
        float fpul,fr7

        fmov dr0,@r12
@ -30,5 +68,17 @@
        add #8,r12
        fmov dr6,@r12
        add #8,r12
+        .endm
+
+        .macro unpack_pixel_16
+        ocbi @r13 /* invalidate the cache block at r13, presumably so the pref below reloads it */
+        pref @r13 /* 32 bytes, 16 pixels */
+        mov #15,r14 /* 4-bit mask consumed by unpack_pixel_inner_nibs */

         fschg
+
+        unpack_pixel_8
+        unpack_pixel_8
+
+        fschg /* undo the fschg above */
+        .endm
--- a/unpack_pixel_inner.s
+++ b/unpack_pixel_inner.s
@ -1,12 +1 @@
-        mov.l @r13,r9
-        extu.b r9,r1
-        shlr8 r9
-        extu.b r9,r2
-        add r1,r2
-        shlr8 r9
-        extu.b r9,r1
-        add r1,r2
-        shlr8 r9
-        add r2,r9
-        lds r9,fpul
-        add #4,r13
+