wiffle_screen_space_store_queue2: add alpha blending

This commit is contained in:
Zack Buhman 2025-01-28 02:05:14 -06:00
parent 306294cfff
commit b156e2d24e
9 changed files with 316 additions and 111 deletions

View File

@ -4,7 +4,7 @@ include base.mk
include common.mk
include headers.mk
OPT = -O2
OPT = -Og
MAKEFILE_PATH := $(patsubst %/,%,$(dir $(abspath $(firstword $(MAKEFILE_LIST)))))
CFLAGS += -I$(MAKEFILE_PATH)
LIB ?= $(MAKEFILE_PATH)

View File

@ -244,7 +244,8 @@ WIFFLE_SCREEN_SPACE_STORE_QUEUE2_OBJ = \
holly/background.o \
holly/ta_fifo_polygon_converter.o \
sh7091/serial.o \
sobel_fipr_store_queue2.o
sobel_fipr_store_queue2.o \
$(LIBGCC)
example/wiffle_screen_space_store_queue2.elf: LDSCRIPT = $(LIB)/main.lds
example/wiffle_screen_space_store_queue2.elf: $(START_OBJ) $(WIFFLE_SCREEN_SPACE_STORE_QUEUE2_OBJ)

View File

@ -143,6 +143,85 @@ void transfer_scene(float theta)
sq_transfer_32byte(ta_fifo_polygon_converter);
}
/* One corner of the full-screen quad: a position and a texture coordinate. */
struct quad_vertex {
  float x, y, z; // position
  float u, v;    // texture coordinate
};

// Texture coordinates of the right/bottom screen edge: the 640x480 screen
// expressed as a fraction of 1024 (u) and 512 (v).
constexpr float x_uv = 640.f / 1024.f;
constexpr float y_uv = 480.f / 512.f;

// Screen space corners, listed top-left, top-right, bottom-right, bottom-left.
const quad_vertex quad_vertices[4] = {
  //   x      y      z     u     v
  {   0.f,   0.f, 0.1f, 0.0f, 0.0f },
  { 640.f,   0.f, 0.1f, x_uv, 0.0f },
  { 640.f, 480.f, 0.1f, x_uv, y_uv },
  {   0.f, 480.f, 0.1f, 0.0f, y_uv },
};
void transfer_translucent_quad(uint32_t texture_address, bool use_alpha)
{
const uint32_t parameter_control_word = para_control::para_type::sprite
| para_control::list_type::translucent
| obj_control::col_type::packed_color
| obj_control::texture
| obj_control::_16bit_uv;
const uint32_t isp_tsp_instruction_word = isp_tsp_instruction_word::depth_compare_mode::always
| isp_tsp_instruction_word::culling_mode::no_culling;
const uint32_t alpha =
tsp_instruction_word::src_alpha_instr::inverse_src_alpha |
tsp_instruction_word::dst_alpha_instr::src_alpha;
const uint32_t no_alpha =
tsp_instruction_word::src_alpha_instr::one |
tsp_instruction_word::dst_alpha_instr::zero;
const uint32_t tsp_instruction_word = (use_alpha ? alpha : no_alpha)
| tsp_instruction_word::fog_control::no_fog
| tsp_instruction_word::texture_u_size::from_int(1024)
| tsp_instruction_word::texture_v_size::from_int(512)
| (use_alpha ? tsp_instruction_word::use_alpha : 0);
const uint32_t texture_control_word = texture_control_word::pixel_format::_4444
| texture_control_word::scan_order::non_twiddled
| texture_control_word::texture_address(texture_address / 8)
| texture_control_word::stride_select;
const uint32_t base_color = 0xffff00ff;
*reinterpret_cast<ta_global_parameter::sprite *>(store_queue) =
ta_global_parameter::sprite(parameter_control_word,
isp_tsp_instruction_word,
tsp_instruction_word,
texture_control_word,
base_color,
0, // offset_color
0, // data_size_for_sort_dma
0); // next_address_for_sort_dma
sq_transfer_32byte(ta_fifo_polygon_converter);
*reinterpret_cast<ta_vertex_parameter::sprite_type_1 *>(store_queue) =
ta_vertex_parameter::sprite_type_1(para_control::para_type::vertex_parameter,
quad_vertices[0].x,
quad_vertices[0].y,
quad_vertices[0].z,
quad_vertices[1].x,
quad_vertices[1].y,
quad_vertices[1].z,
quad_vertices[2].x,
quad_vertices[2].y,
quad_vertices[2].z,
quad_vertices[3].x,
quad_vertices[3].y,
uv_16bit(quad_vertices[0].u, quad_vertices[0].v),
uv_16bit(quad_vertices[1].u, quad_vertices[1].v),
uv_16bit(quad_vertices[2].u, quad_vertices[2].v));
sq_transfer_64byte(ta_fifo_polygon_converter);
}
void dma_transfer(uint32_t source, uint32_t destination, uint32_t transfers)
{
using namespace dmac;
@ -227,13 +306,14 @@ void main()
dma_init();
video_output::set_mode_vga();
const int render_passes = 1;
constexpr uint32_t ta_alloc = ta_alloc_ctrl::pt_opb::no_list
| ta_alloc_ctrl::tm_opb::no_list
| ta_alloc_ctrl::t_opb::_16x4byte
| ta_alloc_ctrl::om_opb::no_list
| ta_alloc_ctrl::o_opb::no_list;
const int render_passes = 1;
const struct opb_size opb_size[render_passes] = {
{
.opaque = 0,
@ -244,6 +324,22 @@ void main()
}
};
constexpr uint32_t ta_alloc2 = ta_alloc_ctrl::pt_opb::no_list
| ta_alloc_ctrl::tm_opb::no_list
| ta_alloc_ctrl::t_opb::_16x4byte
| ta_alloc_ctrl::om_opb::no_list
| ta_alloc_ctrl::o_opb::no_list;
const struct opb_size opb_size2[render_passes] = {
{
.opaque = 0,
.opaque_modifier = 0,
.translucent = 16 * 4,
.translucent_modifier = 0,
.punch_through = 0
}
};
holly.SOFTRESET = softreset::pipeline_soft_reset
| softreset::ta_soft_reset;
holly.SOFTRESET = 0;
@ -265,66 +361,45 @@ void main()
render_passes,
texture_memory_alloc.region_array[0].start,
texture_memory_alloc.object_list[0].start);
background_parameter2(texture_memory_alloc.background[0].start,
0xffc0c0c0);
region_array_multipass(tile_width,
tile_height,
opb_size2,
render_passes,
texture_memory_alloc.region_array[1].start,
texture_memory_alloc.object_list[1].start);
background_parameter2(texture_memory_alloc.background[1].start,
0xffc0c0c0);
holly.FB_R_SOF1 = texture_memory_alloc.framebuffer[0].start;
holly.FB_R_CTRL = fb_r_ctrl::vclk_div::pclk_vclk_1
| fb_r_ctrl::fb_depth::_0888_rgb_32bit
| fb_r_ctrl::fb_depth::_565_rgb_16bit
| fb_r_ctrl::fb_enable;
holly.FB_R_SIZE = fb_r_size::fb_modulus(1)
| fb_r_size::fb_y_size(480 - 3)
| fb_r_size::fb_x_size((640 * 32) / 32 - 1);
| fb_r_size::fb_x_size((640 * 16) / 32 - 1);
holly.FB_W_CTRL = fb_w_ctrl::fb_packmode::_8888_argb_32bit;
holly.TEXT_CONTROL = text_control::stride(20); // 640 pixels
system.LMMODE0 = 1;
system.LMMODE1 = 1; // 32-bit
uint32_t * out = (uint32_t *)&texture_memory32[texture_memory_alloc.framebuffer[0].start / 4];
for (int i = 0; i < 640 * 480; i++) {
out[i] = 0xffff0000;
}
ta_polygon_converter_init2(texture_memory_alloc.isp_tsp_parameters[0].start,
texture_memory_alloc.isp_tsp_parameters[0].end,
texture_memory_alloc.object_list[0].start,
texture_memory_alloc.object_list[0].end,
opb_size[0].total(),
ta_alloc,
tile_width,
tile_height);
transfer_scene(theta);
ta_wait_translucent_list();
const uint32_t bytes_per_pixel = 4;
core_start_render3(texture_memory_alloc.region_array[0].start,
texture_memory_alloc.isp_tsp_parameters[0].start,
texture_memory_alloc.background[0].start,
//texture_memory_alloc.framebuffer[0].start,
0x100'0000 | texture_memory_alloc.texture.start, // 64-bit area
framebuffer_width,
bytes_per_pixel);
ta_polygon_converter_init2(texture_memory_alloc.isp_tsp_parameters[0].start,
texture_memory_alloc.isp_tsp_parameters[0].end,
texture_memory_alloc.object_list[0].start,
texture_memory_alloc.object_list[0].end,
opb_size[0].total(),
ta_alloc,
tile_width,
tile_height);
transfer_scene(theta);
//system.LMMODE0 = 1;
//system.LMMODE1 = 1; // 32-bit
system.LMMODE0 = 0;
system.LMMODE1 = 0; // 64-bit
uint32_t * in = (uint32_t *)&texture_memory64[texture_memory_alloc.texture.start / 4];
uint32_t * framebuffer = (uint32_t *)(0x11000000 + texture_memory_alloc.framebuffer[0].start);
/*
for (int i = 0; i < 640 * 480; i++) {
uint32_t * framebuffer = (uint32_t *)(0x11000000 + texture_memory_alloc.framebuffer[0].start);
framebuffer[i] = 0xffff0000;
}
*/
while (1) {
ta_wait_translucent_list();
ta_polygon_converter_init2(texture_memory_alloc.isp_tsp_parameters[0].start,
texture_memory_alloc.isp_tsp_parameters[0].end,
texture_memory_alloc.object_list[0].start,
@ -333,32 +408,75 @@ void main()
ta_alloc,
tile_width,
tile_height);
transfer_scene(theta);
//serial::string("wait_tl1\n");
ta_wait_translucent_list();
//serial::string("wait_tl1 end\n");
holly.FB_W_CTRL = fb_w_ctrl::fb_packmode::_4444_argb_16bit;
core_wait_end_of_render_video();
core_start_render3(texture_memory_alloc.region_array[0].start,
texture_memory_alloc.isp_tsp_parameters[0].start,
texture_memory_alloc.background[0].start,
//texture_memory_alloc.framebuffer[0].start,
0x100'0000 | texture_memory_alloc.texture.start, // 64-bit area
framebuffer_width,
bytes_per_pixel);
2); // bytes_per_pixel
//serial::string("wait_eorv1\n");
core_wait_end_of_render_video();
//serial::string("wait_eorv1 end\n");
dma_transfer((uint32_t)in, (uint32_t)inbuf, 640 * 480 * 4 / 32);
dma_transfer((uint32_t)in, (uint32_t)inbuf, 640 * 480 * 2 / 32);
while ((sh7091.DMAC.CHCR1 & dmac::chcr::te::transfers_completed) == 0);
sobel_fipr_store_queue2(inbuf, framebuffer, temp);
//sobel_fipr_store_queue2(inbuf, out, temp);
int frame = frame_ix & 1;
uint32_t * framebuffer = (uint32_t *)(0x11000000 + texture_memory_alloc.framebuffer[0].start);
uint32_t * out = (uint32_t *)(0x11000000 + texture_memory_alloc.texture.start + 640 * 480 * 2);
//serial::string("sobel\n");
//sobel_fipr_store_queue2(inbuf, framebuffer, temp);
sobel_fipr_store_queue2(inbuf, out, temp);
ta_polygon_converter_init2(texture_memory_alloc.isp_tsp_parameters[1].start,
texture_memory_alloc.isp_tsp_parameters[1].end,
texture_memory_alloc.object_list[1].start,
texture_memory_alloc.object_list[1].end,
opb_size2[0].total(),
ta_alloc2,
tile_width,
tile_height);
const uint32_t texture_address0 = texture_memory_alloc.texture.start;
transfer_translucent_quad(texture_address0, false);
const uint32_t texture_address1 = texture_memory_alloc.texture.start + 640 * 480 * 2;
transfer_translucent_quad(texture_address1, true);
*reinterpret_cast<ta_global_parameter::end_of_list *>(store_queue) =
ta_global_parameter::end_of_list(para_control::para_type::end_of_list);
sq_transfer_32byte(ta_fifo_polygon_converter);
//serial::string("wait_tl2\n");
ta_wait_translucent_list();
//serial::string("wait_tl2 end\n");
holly.FB_W_CTRL = fb_w_ctrl::fb_packmode::_565_rgb_16bit;
core_start_render3(texture_memory_alloc.region_array[1].start,
texture_memory_alloc.isp_tsp_parameters[1].start,
texture_memory_alloc.background[1].start,
texture_memory_alloc.framebuffer[frame].start,
framebuffer_width,
2); // bytes_per_pixel
//serial::string("wait_eorv2\n");
core_wait_end_of_render_video();
//serial::string("wait_eorv2 end\n");
while (!spg_status::vsync(holly.SPG_STATUS));
holly.FB_R_SOF1 = texture_memory_alloc.framebuffer[frame].start;
while (spg_status::vsync(holly.SPG_STATUS));
theta += half_degree;
frame_ix += 1;
if (frame_ix > 100)
break;
}
ta_wait_translucent_list();
core_wait_end_of_render_video();
serial::string("return\n");
serial::string("return\n");
serial::string("return\n");

View File

@ -140,18 +140,43 @@ void core_wait_end_of_render_video()
"Furthermore, it is strongly recommended that the End of ISP and End of Video interrupts
be cleared at the same time in order to make debugging easier when an error occurs."
*/
while ((system.ISTNRM & istnrm::end_of_render_tsp) == 0) {
//serial::string("eorv\n");
int64_t count = 0;
while (1) {
uint32_t istnrm = system.ISTNRM;
if ((istnrm & istnrm::end_of_render_tsp) != 0)
break;
if (istnrm & 0xc0000000) {
serial::string("istnrm ");
serial::integer<uint32_t>(istnrm);
serial::string("isterr ");
serial::integer<uint32_t>(system.ISTERR);
}
//serial::integer<uint32_t>(system.ISTERR);
if (system.ISTERR) {
//serial::string("core ");
//serial::integer<uint32_t>(system.ISTERR);
holly.SOFTRESET = softreset::pipeline_soft_reset;
holly.SOFTRESET = 0;
//break;
}
if (count > 10000000) {
serial::string("core timeout:\n");
serial::string("isterr ");
serial::integer<uint32_t>(system.ISTERR);
serial::string("istnrm ");
serial::integer<uint32_t>(system.ISTNRM);
break;
}
count += 1;
};
system.ISTNRM = istnrm::end_of_render_tsp
| istnrm::end_of_render_isp
| istnrm::end_of_render_video;
holly.SOFTRESET = softreset::pipeline_soft_reset;
holly.SOFTRESET = 0;
}
void core_flip(uint32_t frame_ix)

View File

@ -30,10 +30,10 @@ constexpr texture_memory_alloc texture_memory_alloc = {
.background = {{0x07'ffe0, 0x08'0000}, {0x47'ffe0, 0x48'0000}},
.object_list = {{0x08'0000, 0x0f'ffe0}, {0x48'0000, 0x4f'ffe0}}, // ~122880 object list pointers
.region_array = {{0x10'0000, 0x11'0000}, {0x50'0000, 0x51'0000}}, // ~9 render passes
//.framebuffer = {{0x11'0000, 0x1b'8c00}, {0x51'0000, 0x5b'8c00}}, // 720x480*2
.framebuffer = {{0x11'0000, 0x23'c000}, {0x51'0000, 0x63'c000}}, // 640x480*4
.framebuffer = {{0x11'0000, 0x1b'8c00}, {0x51'0000, 0x5b'8c00}}, // 720x480*2
//.framebuffer = {{0x11'0000, 0x23'c000}, {0x51'0000, 0x63'c000}}, // 640x480*4
// 64-bit addresses
//.texture = {0x37'1800, 0x80'0000}
.texture = {0x57'1800, 0x80'0000}
.texture = {0x37'1800, 0x80'0000}
//.texture = {0x57'1800, 0x80'0000}
};

View File

@ -1,3 +1,5 @@
.macro inner_multiplication
/* y multiplication */
mov #4,r1 /* r1 : temporary */
fmov.s @r0,fr0 /* 0 */
@ -49,11 +51,27 @@
fsts FPUL,fr3
fadd fr3,fr7
add #4,r0 /* next pixel */
fschg
fmov xd0,dr0 /* load 100.f constant */
fcmp/gt fr0,fr7
fschg
.endm
/*
 * sobel_fipr_inner_2px: evaluate two consecutive pixels and pack the two
 * 16-bit results into r9 (first pixel in bits 0-15, second in bits 16-31).
 *
 * inner_multiplication ends with `fcmp/gt fr0,fr7`, which sets T when fr7 is
 * greater than the constant loaded into fr0. Each result halfword is
 *   T = 1 -> 0x0000
 *   T = 0 -> 0xffff
 *
 * Writes r1 and r9, and advances r0 by 4 between the two pixels (in addition
 * to whatever inner_multiplication itself does to these registers).
 */
.macro sobel_fipr_inner_2px
mov #0,r9 /* NOTE(review): movt below overwrites r9; looks redundant unless inner_multiplication reads r9 -- confirm */
inner_multiplication
movt r9 /* r9 = T (0 or 1) */
add #-1,r9 /* 1 -> 0x00000000, 0 -> 0xffffffff */
extu.w r9,r9 /* keep the low halfword: 0x0000 or 0xffff */
add #4,r0 /* next pixel */
inner_multiplication
movt r1 /* same T -> mask conversion for the second pixel, in r1 */
add #-1,r1
extu.w r1,r1
shll16 r1 /* move the second result into the upper halfword */
or r1,r9 /* r9 = (second << 16) | first */
.endm

View File

@ -15,7 +15,7 @@ _sobel_fipr_store_queue2:
/* r11: var (y loop counter) */
/* r12: var (prefetch address: input address + 1280 * 4) */
/* r13: var (input address) */
/* r14: - */
/* r14: (temporary) */
__setup:
mov.l r8,@-r15
@ -24,6 +24,7 @@ __setup:
mov.l r11,@-r15
mov.l r12,@-r15
mov.l r13,@-r15
mov.l r14,@-r15
fmov.s fr12,@-r15
fmov.s fr13,@-r15
fmov.s fr14,@-r15
@ -81,7 +82,7 @@ __setup:
nop
.align 4
_const_100f: .float 3900
_const_100f: .float 50
_const_store_queue: .long 0xe0000000
_const_store_queue_mask: .long 0x03ffffc0 /* (0xffffffff & (~0b111111)) & (~(0b111111 << 26)) */
@ -93,54 +94,56 @@ _const_1280: .short (1280 * 4)
_const_1281: .short (1281 * 4)
_const_1282: .short (1282 * 4)
/* use r10 as temporary to load the first 1280 pixels; 8 pixels per loop iteration */
/* use r10 as temporary to load the first 1280 pixels; 16 pixels per loop iteration */
.include "unpack_pixel.s"
.align 4
_prime_pixels_loop_init:
mov #80,r10 /* 1280 / 8 */
shll r10
mov r0,r12
mov #80,r10 /* 1280 / 16 */
shll r10
_prime_pixels_loop:
.include "unpack_pixel.s"
unpack_pixel_16
dt r10
bt _loop_init
bra _prime_pixels_loop
nop
.align 4
_loop_init:
/* skip first row */
add r3,r0 /* r3: const (640 * 4) */
add r3,r8
/* skip first output row */
mov r3,r1
shlr r1
add r1,r8 /* r3: 640 * 4 */
mov.w _const_height,r11 /* 478 */
bra _loop
mov #80,r10 /* 640 / 8 */
mov #40,r10 /* 640 / 8 */
_const_height: .short 478
_const_height: .short 476
/*_const_height: .short 238*/
.include "sobel_fipr_inner2.s"
_loop:
_loop_width:
/* prefetch at r8 + 1280 */
unpack_pixel_16
/* process the next 8 pixels */
.include "unpack_pixel.s"
.include "sobel_fipr_inner2.s"
/* process the next 16 pixels */
sobel_fipr_inner_2px
mov.l r9,@r8 /* save result in the store queue */
.include "sobel_fipr_inner2.s"
sobel_fipr_inner_2px
mov.l r9,@(4,r8) /* save result in the store queue */
.include "sobel_fipr_inner2.s"
sobel_fipr_inner_2px
mov.l r9,@(8,r8) /* save result in the store queue */
.include "sobel_fipr_inner2.s"
sobel_fipr_inner_2px
mov.l r9,@(12,r8) /* save result in the store queue */
.include "sobel_fipr_inner2.s"
sobel_fipr_inner_2px
mov.l r9,@(16,r8) /* save result in the store queue */
.include "sobel_fipr_inner2.s"
sobel_fipr_inner_2px
mov.l r9,@(20,r8) /* save result in the store queue */
.include "sobel_fipr_inner2.s"
sobel_fipr_inner_2px
mov.l r9,@(24,r8) /* save result in the store queue */
.include "sobel_fipr_inner2.s"
sobel_fipr_inner_2px
mov.l r9,@(28,r8) /* save result in the store queue */
/* send the store queue */
@ -158,7 +161,7 @@ _row_decrement:
dt r11
bt _return
bra _loop
mov #80,r10 /* 640 / 8 */
mov #40,r10 /* 640 / 8 */
/* restore registers */
_return:
@ -166,6 +169,7 @@ _return:
fmov.s @r15+,fr14
fmov.s @r15+,fr13
fmov.s @r15+,fr12
mov.l @r15+,r14
mov.l @r15+,r13
mov.l @r15+,r12
mov.l @r15+,r11

View File

@ -1,25 +1,63 @@
ocbi @r13
pref @r13 /* 32 bytes, 8 pixels */
/* unpack the next 8 pixels */
fschg
/*
mov.l @r13,r9
extu.b r9,r1
shlr8 r9
extu.b r9,r2
add r1,r2
shlr8 r9
extu.b r9,r1
add r1,r2
shlr8 r9
add r2,r9
lds r9,fpul
add #4,r13
*/
/*
 * unpack_pixel_inner_nibs: read one 16-bit word from @r13 (post-increment)
 * and leave the sum of its four 4-bit fields in FPUL.
 *
 * Expects r14 to hold the nibble mask (unpack_pixel_16 loads it with 15).
 * Writes r1, r2, r9 and FPUL; advances r13 by 2.
 */
.macro unpack_pixel_inner_nibs
.include "unpack_pixel_inner.s"
mov.w @r13+,r9 /* mov.w sign-extends; with a 4-bit mask only bits 0-15 ever reach the sum */
mov r9,r1 /* nib0 */
shlr2 r9 /* two shlr2 = logical shift right by 4 */
shlr2 r9
and r14,r1
mov r9,r2 /* nib1 */
shlr2 r9
shlr2 r9
and r14,r2
add r2,r1
mov r9,r2 /* nib2 */
shlr2 r9
shlr2 r9
and r14,r2
add r2,r1
and r14,r9 /* nib3 */
add r9,r1
lds r1,fpul /* FPUL = nib0 + nib1 + nib2 + nib3 */
.endm
.macro unpack_pixel_8
unpack_pixel_inner_nibs
float fpul,fr0
.include "unpack_pixel_inner.s"
unpack_pixel_inner_nibs
float fpul,fr1
.include "unpack_pixel_inner.s"
unpack_pixel_inner_nibs
float fpul,fr2
.include "unpack_pixel_inner.s"
unpack_pixel_inner_nibs
float fpul,fr3
.include "unpack_pixel_inner.s"
unpack_pixel_inner_nibs
float fpul,fr4
.include "unpack_pixel_inner.s"
unpack_pixel_inner_nibs
float fpul,fr5
.include "unpack_pixel_inner.s"
unpack_pixel_inner_nibs
float fpul,fr6
.include "unpack_pixel_inner.s"
unpack_pixel_inner_nibs
float fpul,fr7
fmov dr0,@r12
@ -30,5 +68,17 @@
add #8,r12
fmov dr6,@r12
add #8,r12
.endm
/*
 * unpack_pixel_16: unpack the next 16 pixels (one 32-byte block at r13) by
 * expanding unpack_pixel_8 twice.
 *
 * Sets r14 to the nibble mask used by unpack_pixel_inner_nibs and brackets
 * the two expansions with fschg, since unpack_pixel_8 stores its results as
 * register pairs (fmov drN,@r12).
 */
.macro unpack_pixel_16
ocbi @r13 /* invalidate the operand cache block at r13 (no write-back) so the pref below refetches it */
pref @r13 /* 32 bytes, 16 pixels */
mov #15,r14 /* 4-bit mask for unpack_pixel_inner_nibs */
fschg /* toggle FPSCR.SZ for the pair stores in unpack_pixel_8 */
unpack_pixel_8
unpack_pixel_8
fschg /* toggle FPSCR.SZ back */
.endm

View File

@ -1,12 +1 @@
mov.l @r13,r9
extu.b r9,r1
shlr8 r9
extu.b r9,r2
add r1,r2
shlr8 r9
extu.b r9,r1
add r1,r2
shlr8 r9
add r2,r9
lds r9,fpul
add #4,r13