wiffle_screen_space_store_queue2: add alpha blending

This commit is contained in:
Zack Buhman 2025-01-28 02:05:14 -06:00
parent 306294cfff
commit b156e2d24e
9 changed files with 316 additions and 111 deletions

View File

@ -4,7 +4,7 @@ include base.mk
include common.mk include common.mk
include headers.mk include headers.mk
OPT = -O2 OPT = -Og
MAKEFILE_PATH := $(patsubst %/,%,$(dir $(abspath $(firstword $(MAKEFILE_LIST))))) MAKEFILE_PATH := $(patsubst %/,%,$(dir $(abspath $(firstword $(MAKEFILE_LIST)))))
CFLAGS += -I$(MAKEFILE_PATH) CFLAGS += -I$(MAKEFILE_PATH)
LIB ?= $(MAKEFILE_PATH) LIB ?= $(MAKEFILE_PATH)

View File

@ -244,7 +244,8 @@ WIFFLE_SCREEN_SPACE_STORE_QUEUE2_OBJ = \
holly/background.o \ holly/background.o \
holly/ta_fifo_polygon_converter.o \ holly/ta_fifo_polygon_converter.o \
sh7091/serial.o \ sh7091/serial.o \
sobel_fipr_store_queue2.o sobel_fipr_store_queue2.o \
$(LIBGCC)
example/wiffle_screen_space_store_queue2.elf: LDSCRIPT = $(LIB)/main.lds example/wiffle_screen_space_store_queue2.elf: LDSCRIPT = $(LIB)/main.lds
example/wiffle_screen_space_store_queue2.elf: $(START_OBJ) $(WIFFLE_SCREEN_SPACE_STORE_QUEUE2_OBJ) example/wiffle_screen_space_store_queue2.elf: $(START_OBJ) $(WIFFLE_SCREEN_SPACE_STORE_QUEUE2_OBJ)

View File

@ -143,6 +143,85 @@ void transfer_scene(float theta)
sq_transfer_32byte(ta_fifo_polygon_converter); sq_transfer_32byte(ta_fifo_polygon_converter);
} }
// One corner of the full-screen quad: a position (x, y, z) followed by a
// texture coordinate (u, v).
struct quad_vertex {
  float x; // horizontal position
  float y; // vertical position
  float z; // depth value
  float u; // horizontal texture coordinate
  float v; // vertical texture coordinate
};

// Portion of the texture covered by the 640x480 image: the image width and
// height divided by 1024 and 512 respectively.
constexpr float x_uv = 640.0f / 1024.0f;
constexpr float y_uv = 480.0f / 512.0f;

// Screen-space corners of a 640x480 rectangle, listed as top-left, top-right,
// bottom-right, bottom-left, all at z = 0.1.
const quad_vertex quad_vertices[] = {
  {   0.0f,   0.0f, 0.1f, 0.0f, 0.0f },
  { 640.0f,   0.0f, 0.1f, x_uv, 0.0f },
  { 640.0f, 480.0f, 0.1f, x_uv, y_uv },
  {   0.0f, 480.0f, 0.1f, 0.0f, y_uv },
};
void transfer_translucent_quad(uint32_t texture_address, bool use_alpha)
{
const uint32_t parameter_control_word = para_control::para_type::sprite
| para_control::list_type::translucent
| obj_control::col_type::packed_color
| obj_control::texture
| obj_control::_16bit_uv;
const uint32_t isp_tsp_instruction_word = isp_tsp_instruction_word::depth_compare_mode::always
| isp_tsp_instruction_word::culling_mode::no_culling;
const uint32_t alpha =
tsp_instruction_word::src_alpha_instr::inverse_src_alpha |
tsp_instruction_word::dst_alpha_instr::src_alpha;
const uint32_t no_alpha =
tsp_instruction_word::src_alpha_instr::one |
tsp_instruction_word::dst_alpha_instr::zero;
const uint32_t tsp_instruction_word = (use_alpha ? alpha : no_alpha)
| tsp_instruction_word::fog_control::no_fog
| tsp_instruction_word::texture_u_size::from_int(1024)
| tsp_instruction_word::texture_v_size::from_int(512)
| (use_alpha ? tsp_instruction_word::use_alpha : 0);
const uint32_t texture_control_word = texture_control_word::pixel_format::_4444
| texture_control_word::scan_order::non_twiddled
| texture_control_word::texture_address(texture_address / 8)
| texture_control_word::stride_select;
const uint32_t base_color = 0xffff00ff;
*reinterpret_cast<ta_global_parameter::sprite *>(store_queue) =
ta_global_parameter::sprite(parameter_control_word,
isp_tsp_instruction_word,
tsp_instruction_word,
texture_control_word,
base_color,
0, // offset_color
0, // data_size_for_sort_dma
0); // next_address_for_sort_dma
sq_transfer_32byte(ta_fifo_polygon_converter);
*reinterpret_cast<ta_vertex_parameter::sprite_type_1 *>(store_queue) =
ta_vertex_parameter::sprite_type_1(para_control::para_type::vertex_parameter,
quad_vertices[0].x,
quad_vertices[0].y,
quad_vertices[0].z,
quad_vertices[1].x,
quad_vertices[1].y,
quad_vertices[1].z,
quad_vertices[2].x,
quad_vertices[2].y,
quad_vertices[2].z,
quad_vertices[3].x,
quad_vertices[3].y,
uv_16bit(quad_vertices[0].u, quad_vertices[0].v),
uv_16bit(quad_vertices[1].u, quad_vertices[1].v),
uv_16bit(quad_vertices[2].u, quad_vertices[2].v));
sq_transfer_64byte(ta_fifo_polygon_converter);
}
void dma_transfer(uint32_t source, uint32_t destination, uint32_t transfers) void dma_transfer(uint32_t source, uint32_t destination, uint32_t transfers)
{ {
using namespace dmac; using namespace dmac;
@ -227,13 +306,14 @@ void main()
dma_init(); dma_init();
video_output::set_mode_vga(); video_output::set_mode_vga();
const int render_passes = 1;
constexpr uint32_t ta_alloc = ta_alloc_ctrl::pt_opb::no_list constexpr uint32_t ta_alloc = ta_alloc_ctrl::pt_opb::no_list
| ta_alloc_ctrl::tm_opb::no_list | ta_alloc_ctrl::tm_opb::no_list
| ta_alloc_ctrl::t_opb::_16x4byte | ta_alloc_ctrl::t_opb::_16x4byte
| ta_alloc_ctrl::om_opb::no_list | ta_alloc_ctrl::om_opb::no_list
| ta_alloc_ctrl::o_opb::no_list; | ta_alloc_ctrl::o_opb::no_list;
const int render_passes = 1;
const struct opb_size opb_size[render_passes] = { const struct opb_size opb_size[render_passes] = {
{ {
.opaque = 0, .opaque = 0,
@ -244,6 +324,22 @@ void main()
} }
}; };
constexpr uint32_t ta_alloc2 = ta_alloc_ctrl::pt_opb::no_list
| ta_alloc_ctrl::tm_opb::no_list
| ta_alloc_ctrl::t_opb::_16x4byte
| ta_alloc_ctrl::om_opb::no_list
| ta_alloc_ctrl::o_opb::no_list;
const struct opb_size opb_size2[render_passes] = {
{
.opaque = 0,
.opaque_modifier = 0,
.translucent = 16 * 4,
.translucent_modifier = 0,
.punch_through = 0
}
};
holly.SOFTRESET = softreset::pipeline_soft_reset holly.SOFTRESET = softreset::pipeline_soft_reset
| softreset::ta_soft_reset; | softreset::ta_soft_reset;
holly.SOFTRESET = 0; holly.SOFTRESET = 0;
@ -265,66 +361,45 @@ void main()
render_passes, render_passes,
texture_memory_alloc.region_array[0].start, texture_memory_alloc.region_array[0].start,
texture_memory_alloc.object_list[0].start); texture_memory_alloc.object_list[0].start);
background_parameter2(texture_memory_alloc.background[0].start, background_parameter2(texture_memory_alloc.background[0].start,
0xffc0c0c0); 0xffc0c0c0);
region_array_multipass(tile_width,
tile_height,
opb_size2,
render_passes,
texture_memory_alloc.region_array[1].start,
texture_memory_alloc.object_list[1].start);
background_parameter2(texture_memory_alloc.background[1].start,
0xffc0c0c0);
holly.FB_R_SOF1 = texture_memory_alloc.framebuffer[0].start; holly.FB_R_SOF1 = texture_memory_alloc.framebuffer[0].start;
holly.FB_R_CTRL = fb_r_ctrl::vclk_div::pclk_vclk_1 holly.FB_R_CTRL = fb_r_ctrl::vclk_div::pclk_vclk_1
| fb_r_ctrl::fb_depth::_0888_rgb_32bit | fb_r_ctrl::fb_depth::_565_rgb_16bit
| fb_r_ctrl::fb_enable; | fb_r_ctrl::fb_enable;
holly.FB_R_SIZE = fb_r_size::fb_modulus(1) holly.FB_R_SIZE = fb_r_size::fb_modulus(1)
| fb_r_size::fb_y_size(480 - 3) | fb_r_size::fb_y_size(480 - 3)
| fb_r_size::fb_x_size((640 * 32) / 32 - 1); | fb_r_size::fb_x_size((640 * 16) / 32 - 1);
holly.FB_W_CTRL = fb_w_ctrl::fb_packmode::_8888_argb_32bit; holly.TEXT_CONTROL = text_control::stride(20); // 640 pixels
system.LMMODE0 = 1; //system.LMMODE0 = 1;
system.LMMODE1 = 1; // 32-bit //system.LMMODE1 = 1; // 32-bit
system.LMMODE0 = 0;
uint32_t * out = (uint32_t *)&texture_memory32[texture_memory_alloc.framebuffer[0].start / 4]; system.LMMODE1 = 0; // 64-bit
for (int i = 0; i < 640 * 480; i++) {
out[i] = 0xffff0000;
}
ta_polygon_converter_init2(texture_memory_alloc.isp_tsp_parameters[0].start,
texture_memory_alloc.isp_tsp_parameters[0].end,
texture_memory_alloc.object_list[0].start,
texture_memory_alloc.object_list[0].end,
opb_size[0].total(),
ta_alloc,
tile_width,
tile_height);
transfer_scene(theta);
ta_wait_translucent_list();
const uint32_t bytes_per_pixel = 4;
core_start_render3(texture_memory_alloc.region_array[0].start,
texture_memory_alloc.isp_tsp_parameters[0].start,
texture_memory_alloc.background[0].start,
//texture_memory_alloc.framebuffer[0].start,
0x100'0000 | texture_memory_alloc.texture.start, // 64-bit area
framebuffer_width,
bytes_per_pixel);
ta_polygon_converter_init2(texture_memory_alloc.isp_tsp_parameters[0].start,
texture_memory_alloc.isp_tsp_parameters[0].end,
texture_memory_alloc.object_list[0].start,
texture_memory_alloc.object_list[0].end,
opb_size[0].total(),
ta_alloc,
tile_width,
tile_height);
transfer_scene(theta);
uint32_t * in = (uint32_t *)&texture_memory64[texture_memory_alloc.texture.start / 4]; uint32_t * in = (uint32_t *)&texture_memory64[texture_memory_alloc.texture.start / 4];
/*
for (int i = 0; i < 640 * 480; i++) {
uint32_t * framebuffer = (uint32_t *)(0x11000000 + texture_memory_alloc.framebuffer[0].start); uint32_t * framebuffer = (uint32_t *)(0x11000000 + texture_memory_alloc.framebuffer[0].start);
framebuffer[i] = 0xffff0000;
}
*/
while (1) { while (1) {
ta_wait_translucent_list();
ta_polygon_converter_init2(texture_memory_alloc.isp_tsp_parameters[0].start, ta_polygon_converter_init2(texture_memory_alloc.isp_tsp_parameters[0].start,
texture_memory_alloc.isp_tsp_parameters[0].end, texture_memory_alloc.isp_tsp_parameters[0].end,
texture_memory_alloc.object_list[0].start, texture_memory_alloc.object_list[0].start,
@ -333,32 +408,75 @@ void main()
ta_alloc, ta_alloc,
tile_width, tile_width,
tile_height); tile_height);
transfer_scene(theta); transfer_scene(theta);
//serial::string("wait_tl1\n");
ta_wait_translucent_list();
//serial::string("wait_tl1 end\n");
holly.FB_W_CTRL = fb_w_ctrl::fb_packmode::_4444_argb_16bit;
core_wait_end_of_render_video();
core_start_render3(texture_memory_alloc.region_array[0].start, core_start_render3(texture_memory_alloc.region_array[0].start,
texture_memory_alloc.isp_tsp_parameters[0].start, texture_memory_alloc.isp_tsp_parameters[0].start,
texture_memory_alloc.background[0].start, texture_memory_alloc.background[0].start,
//texture_memory_alloc.framebuffer[0].start,
0x100'0000 | texture_memory_alloc.texture.start, // 64-bit area 0x100'0000 | texture_memory_alloc.texture.start, // 64-bit area
framebuffer_width, framebuffer_width,
bytes_per_pixel); 2); // bytes_per_pixel
//serial::string("wait_eorv1\n");
core_wait_end_of_render_video();
//serial::string("wait_eorv1 end\n");
dma_transfer((uint32_t)in, (uint32_t)inbuf, 640 * 480 * 4 / 32); dma_transfer((uint32_t)in, (uint32_t)inbuf, 640 * 480 * 2 / 32);
while ((sh7091.DMAC.CHCR1 & dmac::chcr::te::transfers_completed) == 0); while ((sh7091.DMAC.CHCR1 & dmac::chcr::te::transfers_completed) == 0);
sobel_fipr_store_queue2(inbuf, framebuffer, temp); //sobel_fipr_store_queue2(inbuf, out, temp);
int frame = frame_ix & 1;
uint32_t * framebuffer = (uint32_t *)(0x11000000 + texture_memory_alloc.framebuffer[0].start);
uint32_t * out = (uint32_t *)(0x11000000 + texture_memory_alloc.texture.start + 640 * 480 * 2);
//serial::string("sobel\n");
//sobel_fipr_store_queue2(inbuf, framebuffer, temp);
sobel_fipr_store_queue2(inbuf, out, temp);
ta_polygon_converter_init2(texture_memory_alloc.isp_tsp_parameters[1].start,
texture_memory_alloc.isp_tsp_parameters[1].end,
texture_memory_alloc.object_list[1].start,
texture_memory_alloc.object_list[1].end,
opb_size2[0].total(),
ta_alloc2,
tile_width,
tile_height);
const uint32_t texture_address0 = texture_memory_alloc.texture.start;
transfer_translucent_quad(texture_address0, false);
const uint32_t texture_address1 = texture_memory_alloc.texture.start + 640 * 480 * 2;
transfer_translucent_quad(texture_address1, true);
*reinterpret_cast<ta_global_parameter::end_of_list *>(store_queue) =
ta_global_parameter::end_of_list(para_control::para_type::end_of_list);
sq_transfer_32byte(ta_fifo_polygon_converter);
//serial::string("wait_tl2\n");
ta_wait_translucent_list();
//serial::string("wait_tl2 end\n");
holly.FB_W_CTRL = fb_w_ctrl::fb_packmode::_565_rgb_16bit;
core_start_render3(texture_memory_alloc.region_array[1].start,
texture_memory_alloc.isp_tsp_parameters[1].start,
texture_memory_alloc.background[1].start,
texture_memory_alloc.framebuffer[frame].start,
framebuffer_width,
2); // bytes_per_pixel
//serial::string("wait_eorv2\n");
core_wait_end_of_render_video();
//serial::string("wait_eorv2 end\n");
while (!spg_status::vsync(holly.SPG_STATUS));
holly.FB_R_SOF1 = texture_memory_alloc.framebuffer[frame].start;
while (spg_status::vsync(holly.SPG_STATUS));
theta += half_degree; theta += half_degree;
frame_ix += 1; frame_ix += 1;
if (frame_ix > 100)
break;
} }
ta_wait_translucent_list();
core_wait_end_of_render_video();
serial::string("return\n"); serial::string("return\n");
serial::string("return\n"); serial::string("return\n");
serial::string("return\n"); serial::string("return\n");

View File

@ -140,18 +140,43 @@ void core_wait_end_of_render_video()
"Furthermore, it is strongly recommended that the End of ISP and End of Video interrupts "Furthermore, it is strongly recommended that the End of ISP and End of Video interrupts
be cleared at the same time in order to make debugging easier when an error occurs." be cleared at the same time in order to make debugging easier when an error occurs."
*/ */
while ((system.ISTNRM & istnrm::end_of_render_tsp) == 0) { //serial::string("eorv\n");
int64_t count = 0;
while (1) {
uint32_t istnrm = system.ISTNRM;
if ((istnrm & istnrm::end_of_render_tsp) != 0)
break;
if (istnrm & 0xc0000000) {
serial::string("istnrm ");
serial::integer<uint32_t>(istnrm);
serial::string("isterr ");
serial::integer<uint32_t>(system.ISTERR);
}
//serial::integer<uint32_t>(system.ISTERR);
if (system.ISTERR) { if (system.ISTERR) {
//serial::string("core "); //serial::string("core ");
//serial::integer<uint32_t>(system.ISTERR); //serial::integer<uint32_t>(system.ISTERR);
holly.SOFTRESET = softreset::pipeline_soft_reset; holly.SOFTRESET = softreset::pipeline_soft_reset;
holly.SOFTRESET = 0; holly.SOFTRESET = 0;
//break;
}
if (count > 10000000) {
serial::string("core timeout:\n");
serial::string("isterr ");
serial::integer<uint32_t>(system.ISTERR);
serial::string("istnrm ");
serial::integer<uint32_t>(system.ISTNRM);
break; break;
} }
count += 1;
}; };
system.ISTNRM = istnrm::end_of_render_tsp system.ISTNRM = istnrm::end_of_render_tsp
| istnrm::end_of_render_isp | istnrm::end_of_render_isp
| istnrm::end_of_render_video; | istnrm::end_of_render_video;
holly.SOFTRESET = softreset::pipeline_soft_reset;
holly.SOFTRESET = 0;
} }
void core_flip(uint32_t frame_ix) void core_flip(uint32_t frame_ix)

View File

@ -30,10 +30,10 @@ constexpr texture_memory_alloc texture_memory_alloc = {
.background = {{0x07'ffe0, 0x08'0000}, {0x47'ffe0, 0x48'0000}}, .background = {{0x07'ffe0, 0x08'0000}, {0x47'ffe0, 0x48'0000}},
.object_list = {{0x08'0000, 0x0f'ffe0}, {0x48'0000, 0x4f'ffe0}}, // ~122880 object list pointers .object_list = {{0x08'0000, 0x0f'ffe0}, {0x48'0000, 0x4f'ffe0}}, // ~122880 object list pointers
.region_array = {{0x10'0000, 0x11'0000}, {0x50'0000, 0x51'0000}}, // ~9 render passes .region_array = {{0x10'0000, 0x11'0000}, {0x50'0000, 0x51'0000}}, // ~9 render passes
//.framebuffer = {{0x11'0000, 0x1b'8c00}, {0x51'0000, 0x5b'8c00}}, // 720x480*2 .framebuffer = {{0x11'0000, 0x1b'8c00}, {0x51'0000, 0x5b'8c00}}, // 720x480*2
.framebuffer = {{0x11'0000, 0x23'c000}, {0x51'0000, 0x63'c000}}, // 640x480*4 //.framebuffer = {{0x11'0000, 0x23'c000}, {0x51'0000, 0x63'c000}}, // 640x480*4
// 64-bit addresses // 64-bit addresses
//.texture = {0x37'1800, 0x80'0000} .texture = {0x37'1800, 0x80'0000}
.texture = {0x57'1800, 0x80'0000} //.texture = {0x57'1800, 0x80'0000}
}; };

View File

@ -1,3 +1,5 @@
.macro inner_multiplication
/* y multiplication */ /* y multiplication */
mov #4,r1 /* r1 : temporary */ mov #4,r1 /* r1 : temporary */
fmov.s @r0,fr0 /* 0 */ fmov.s @r0,fr0 /* 0 */
@ -49,11 +51,27 @@
fsts FPUL,fr3 fsts FPUL,fr3
fadd fr3,fr7 fadd fr3,fr7
add #4,r0 /* next pixel */
fschg fschg
fmov xd0,dr0 /* load 100.f constant */ fmov xd0,dr0 /* load 100.f constant */
fcmp/gt fr0,fr7 fcmp/gt fr0,fr7
fschg fschg
.endm
.macro sobel_fipr_inner_2px
mov #0,r9
inner_multiplication
movt r9 movt r9
add #-1,r9 add #-1,r9
extu.w r9,r9
add #4,r0 /* next pixel */ inner_multiplication
movt r1
add #-1,r1
extu.w r1,r1
shll16 r1
or r1,r9
.endm

View File

@ -15,7 +15,7 @@ _sobel_fipr_store_queue2:
/* r11: var (y loop counter) */ /* r11: var (y loop counter) */
/* r12: var (prefetch address: input address + 1280 4) */ /* r12: var (prefetch address: input address + 1280 4) */
/* r13: var (input address) */ /* r13: var (input address) */
/* r14: - */ /* r14: (temporary) */
__setup: __setup:
mov.l r8,@-r15 mov.l r8,@-r15
@ -24,6 +24,7 @@ __setup:
mov.l r11,@-r15 mov.l r11,@-r15
mov.l r12,@-r15 mov.l r12,@-r15
mov.l r13,@-r15 mov.l r13,@-r15
mov.l r14,@-r15
fmov.s fr12,@-r15 fmov.s fr12,@-r15
fmov.s fr13,@-r15 fmov.s fr13,@-r15
fmov.s fr14,@-r15 fmov.s fr14,@-r15
@ -81,7 +82,7 @@ __setup:
nop nop
.align 4 .align 4
_const_100f: .float 3900 _const_100f: .float 50
_const_store_queue: .long 0xe0000000 _const_store_queue: .long 0xe0000000
_const_store_queue_mask: .long 0x03ffffc0 /* (0xffffffff & (~0b111111)) & (~(0b111111 << 26)) */ _const_store_queue_mask: .long 0x03ffffc0 /* (0xffffffff & (~0b111111)) & (~(0b111111 << 26)) */
@ -93,54 +94,56 @@ _const_1280: .short (1280 * 4)
_const_1281: .short (1281 * 4) _const_1281: .short (1281 * 4)
_const_1282: .short (1282 * 4) _const_1282: .short (1282 * 4)
/* use r10 as temporary to load the first 1280 pixels; 8 pixels per loop iteration */ /* use r10 as temporary to load the first 1280 pixels; 16 pixels per loop iteration */
.include "unpack_pixel.s"
.align 4 .align 4
_prime_pixels_loop_init: _prime_pixels_loop_init:
mov #80,r10 /* 1280 / 8 */
shll r10
mov r0,r12 mov r0,r12
mov #80,r10 /* 1280 / 16 */
shll r10
_prime_pixels_loop: _prime_pixels_loop:
.include "unpack_pixel.s" unpack_pixel_16
dt r10 dt r10
bt _loop_init bt _loop_init
bra _prime_pixels_loop bra _prime_pixels_loop
nop nop
.align 4
_loop_init: _loop_init:
/* skip first row */ /* skip first output row */
add r3,r0 /* r3: const (640 * 4) */ mov r3,r1
add r3,r8 shlr r1
add r1,r8 /* r3: 640 * 4 */
mov.w _const_height,r11 /* 478 */ mov.w _const_height,r11 /* 478 */
bra _loop bra _loop
mov #80,r10 /* 640 / 8 */ mov #40,r10 /* 640 / 8 */
_const_height: .short 478 _const_height: .short 476
/*_const_height: .short 238*/
.include "sobel_fipr_inner2.s"
_loop: _loop:
_loop_width: _loop_width:
/* prefetch at r8 + 1280 */ /* prefetch at r8 + 1280 */
unpack_pixel_16
/* process the next 8 pixels */ /* process the next 16 pixels */
.include "unpack_pixel.s" sobel_fipr_inner_2px
.include "sobel_fipr_inner2.s"
mov.l r9,@r8 /* save result in the store queue */ mov.l r9,@r8 /* save result in the store queue */
.include "sobel_fipr_inner2.s" sobel_fipr_inner_2px
mov.l r9,@(4,r8) /* save result in the store queue */ mov.l r9,@(4,r8) /* save result in the store queue */
.include "sobel_fipr_inner2.s" sobel_fipr_inner_2px
mov.l r9,@(8,r8) /* save result in the store queue */ mov.l r9,@(8,r8) /* save result in the store queue */
.include "sobel_fipr_inner2.s" sobel_fipr_inner_2px
mov.l r9,@(12,r8) /* save result in the store queue */ mov.l r9,@(12,r8) /* save result in the store queue */
.include "sobel_fipr_inner2.s" sobel_fipr_inner_2px
mov.l r9,@(16,r8) /* save result in the store queue */ mov.l r9,@(16,r8) /* save result in the store queue */
.include "sobel_fipr_inner2.s" sobel_fipr_inner_2px
mov.l r9,@(20,r8) /* save result in the store queue */ mov.l r9,@(20,r8) /* save result in the store queue */
.include "sobel_fipr_inner2.s" sobel_fipr_inner_2px
mov.l r9,@(24,r8) /* save result in the store queue */ mov.l r9,@(24,r8) /* save result in the store queue */
.include "sobel_fipr_inner2.s" sobel_fipr_inner_2px
mov.l r9,@(28,r8) /* save result in the store queue */ mov.l r9,@(28,r8) /* save result in the store queue */
/* send the store queue */ /* send the store queue */
@ -158,7 +161,7 @@ _row_decrement:
dt r11 dt r11
bt _return bt _return
bra _loop bra _loop
mov #80,r10 /* 640 / 8 */ mov #40,r10 /* 640 / 8 */
/* restore registers */ /* restore registers */
_return: _return:
@ -166,6 +169,7 @@ _return:
fmov.s @r15+,fr14 fmov.s @r15+,fr14
fmov.s @r15+,fr13 fmov.s @r15+,fr13
fmov.s @r15+,fr12 fmov.s @r15+,fr12
mov.l @r15+,r14
mov.l @r15+,r13 mov.l @r15+,r13
mov.l @r15+,r12 mov.l @r15+,r12
mov.l @r15+,r11 mov.l @r15+,r11

View File

@ -1,25 +1,63 @@
ocbi @r13
pref @r13 /* 32 bytes, 8 pixels */
/* unpack the next 8 pixels */ /* unpack the next 8 pixels */
fschg /*
mov.l @r13,r9
extu.b r9,r1
shlr8 r9
extu.b r9,r2
add r1,r2
shlr8 r9
extu.b r9,r1
add r1,r2
shlr8 r9
add r2,r9
lds r9,fpul
add #4,r13
*/
/*
 * unpack_pixel_inner_nibs: read one 16-bit pixel through r13 (post-increment),
 * add its four 4-bit fields together and leave the sum in FPUL.
 *
 * in:       r13 = input pointer
 *           r14 = nibble mask (unpack_pixel_16 loads #15 before expanding this)
 * out:      FPUL = nib0 + nib1 + nib2 + nib3, r13 advanced past the pixel
 * clobbers: r1, r2, r9
 */
.macro unpack_pixel_inner_nibs
.include "unpack_pixel_inner.s" mov.w @r13+,r9
mov r9,r1 /* nib0 */
shlr2 r9 /* two shlr2 = shift right by 4: the next nibble moves into bits 3..0 */
shlr2 r9
and r14,r1 /* r1 = nib0 */
mov r9,r2 /* nib1 */
shlr2 r9
shlr2 r9
and r14,r2
add r2,r1 /* r1 = nib0 + nib1 */
mov r9,r2 /* nib2 */
shlr2 r9
shlr2 r9
and r14,r2
add r2,r1 /* r1 = nib0 + nib1 + nib2 */
and r14,r9 /* nib3 */
add r9,r1 /* r1 = sum of all four nibbles */
lds r1,fpul /* hand the integer sum to the FPU; the caller converts it with float fpul,frN */
.endm
.macro unpack_pixel_8
unpack_pixel_inner_nibs
float fpul,fr0 float fpul,fr0
.include "unpack_pixel_inner.s" unpack_pixel_inner_nibs
float fpul,fr1 float fpul,fr1
.include "unpack_pixel_inner.s" unpack_pixel_inner_nibs
float fpul,fr2 float fpul,fr2
.include "unpack_pixel_inner.s" unpack_pixel_inner_nibs
float fpul,fr3 float fpul,fr3
.include "unpack_pixel_inner.s" unpack_pixel_inner_nibs
float fpul,fr4 float fpul,fr4
.include "unpack_pixel_inner.s" unpack_pixel_inner_nibs
float fpul,fr5 float fpul,fr5
.include "unpack_pixel_inner.s" unpack_pixel_inner_nibs
float fpul,fr6 float fpul,fr6
.include "unpack_pixel_inner.s" unpack_pixel_inner_nibs
float fpul,fr7 float fpul,fr7
fmov dr0,@r12 fmov dr0,@r12
@ -30,5 +68,17 @@
add #8,r12 add #8,r12
fmov dr6,@r12 fmov dr6,@r12
add #8,r12 add #8,r12
.endm
/*
 * unpack_pixel_16: unpack the 16 pixels (32 bytes) at r13 by expanding
 * unpack_pixel_8 twice.
 *
 * Sets r14 to the nibble mask used by unpack_pixel_inner_nibs and brackets the
 * two expansions with fschg, so FPSCR.SZ is toggled on entry and toggled back
 * on exit.
 */
.macro unpack_pixel_16
ocbi @r13 /* invalidate the operand cache block for r13 before prefetching it; presumably so data written by DMA is re-read from memory (confirm against caller) */
pref @r13 /* 32 bytes, 16 pixels */
mov #15,r14 /* r14: 4-bit mask consumed by unpack_pixel_inner_nibs */
fschg /* toggle FPSCR.SZ: unpack_pixel_8 stores results with fmov drN,@r12 */
unpack_pixel_8
unpack_pixel_8
fschg /* toggle FPSCR.SZ back to its value on entry */
.endm

View File

@ -1,12 +1 @@
mov.l @r13,r9
extu.b r9,r1
shlr8 r9
extu.b r9,r2
add r1,r2
shlr8 r9
extu.b r9,r1
add r1,r2
shlr8 r9
add r2,r9
lds r9,fpul
add #4,r13