diff --git a/Makefile b/Makefile index 652c1e2..9bd51b8 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ include base.mk include common.mk include headers.mk -OPT = -O2 +OPT = -Og MAKEFILE_PATH := $(patsubst %/,%,$(dir $(abspath $(firstword $(MAKEFILE_LIST))))) CFLAGS += -I$(MAKEFILE_PATH) LIB ?= $(MAKEFILE_PATH) diff --git a/example/example.mk b/example/example.mk index 0b7aae0..3594b7a 100644 --- a/example/example.mk +++ b/example/example.mk @@ -244,7 +244,8 @@ WIFFLE_SCREEN_SPACE_STORE_QUEUE2_OBJ = \ holly/background.o \ holly/ta_fifo_polygon_converter.o \ sh7091/serial.o \ - sobel_fipr_store_queue2.o + sobel_fipr_store_queue2.o \ + $(LIBGCC) example/wiffle_screen_space_store_queue2.elf: LDSCRIPT = $(LIB)/main.lds example/wiffle_screen_space_store_queue2.elf: $(START_OBJ) $(WIFFLE_SCREEN_SPACE_STORE_QUEUE2_OBJ) diff --git a/example/wiffle_screen_space_store_queue2.cpp b/example/wiffle_screen_space_store_queue2.cpp index a6e5142..2a679d6 100644 --- a/example/wiffle_screen_space_store_queue2.cpp +++ b/example/wiffle_screen_space_store_queue2.cpp @@ -143,6 +143,85 @@ void transfer_scene(float theta) sq_transfer_32byte(ta_fifo_polygon_converter); } +struct quad_vertex { + float x; + float y; + float z; + float u; + float v; +}; + +// screen space coordinates +constexpr float x_uv = 640.f / 1024.f; +constexpr float y_uv = 480.f / 512.f; + +const struct quad_vertex quad_vertices[] = { + { 0.f, 0.f, 0.1f, 0.0f, 0.0f }, + { 640.f, 0.f, 0.1f, x_uv, 0.0f }, + { 640.f, 480.f, 0.1f, x_uv, y_uv }, + { 0.f, 480.f, 0.1f, 0.0f, y_uv }, +}; + +void transfer_translucent_quad(uint32_t texture_address, bool use_alpha) +{ + const uint32_t parameter_control_word = para_control::para_type::sprite + | para_control::list_type::translucent + | obj_control::col_type::packed_color + | obj_control::texture + | obj_control::_16bit_uv; + + const uint32_t isp_tsp_instruction_word = isp_tsp_instruction_word::depth_compare_mode::always + | isp_tsp_instruction_word::culling_mode::no_culling; 
+ + const uint32_t alpha = + tsp_instruction_word::src_alpha_instr::inverse_src_alpha | + tsp_instruction_word::dst_alpha_instr::src_alpha; + const uint32_t no_alpha = + tsp_instruction_word::src_alpha_instr::one | + tsp_instruction_word::dst_alpha_instr::zero; + + const uint32_t tsp_instruction_word = (use_alpha ? alpha : no_alpha) + | tsp_instruction_word::fog_control::no_fog + | tsp_instruction_word::texture_u_size::from_int(1024) + | tsp_instruction_word::texture_v_size::from_int(512) + | (use_alpha ? tsp_instruction_word::use_alpha : 0); + + const uint32_t texture_control_word = texture_control_word::pixel_format::_4444 + | texture_control_word::scan_order::non_twiddled + | texture_control_word::texture_address(texture_address / 8) + | texture_control_word::stride_select; + + const uint32_t base_color = 0xffff00ff; + *reinterpret_cast(store_queue) = + ta_global_parameter::sprite(parameter_control_word, + isp_tsp_instruction_word, + tsp_instruction_word, + texture_control_word, + base_color, + 0, // offset_color + 0, // data_size_for_sort_dma + 0); // next_address_for_sort_dma + sq_transfer_32byte(ta_fifo_polygon_converter); + + *reinterpret_cast(store_queue) = + ta_vertex_parameter::sprite_type_1(para_control::para_type::vertex_parameter, + quad_vertices[0].x, + quad_vertices[0].y, + quad_vertices[0].z, + quad_vertices[1].x, + quad_vertices[1].y, + quad_vertices[1].z, + quad_vertices[2].x, + quad_vertices[2].y, + quad_vertices[2].z, + quad_vertices[3].x, + quad_vertices[3].y, + uv_16bit(quad_vertices[0].u, quad_vertices[0].v), + uv_16bit(quad_vertices[1].u, quad_vertices[1].v), + uv_16bit(quad_vertices[2].u, quad_vertices[2].v)); + sq_transfer_64byte(ta_fifo_polygon_converter); +} + void dma_transfer(uint32_t source, uint32_t destination, uint32_t transfers) { using namespace dmac; @@ -227,13 +306,14 @@ void main() dma_init(); video_output::set_mode_vga(); + const int render_passes = 1; + constexpr uint32_t ta_alloc = ta_alloc_ctrl::pt_opb::no_list | 
ta_alloc_ctrl::tm_opb::no_list | ta_alloc_ctrl::t_opb::_16x4byte | ta_alloc_ctrl::om_opb::no_list | ta_alloc_ctrl::o_opb::no_list; - const int render_passes = 1; const struct opb_size opb_size[render_passes] = { { .opaque = 0, @@ -244,6 +324,22 @@ void main() } }; + constexpr uint32_t ta_alloc2 = ta_alloc_ctrl::pt_opb::no_list + | ta_alloc_ctrl::tm_opb::no_list + | ta_alloc_ctrl::t_opb::_16x4byte + | ta_alloc_ctrl::om_opb::no_list + | ta_alloc_ctrl::o_opb::no_list; + + const struct opb_size opb_size2[render_passes] = { + { + .opaque = 0, + .opaque_modifier = 0, + .translucent = 16 * 4, + .translucent_modifier = 0, + .punch_through = 0 + } + }; + holly.SOFTRESET = softreset::pipeline_soft_reset | softreset::ta_soft_reset; holly.SOFTRESET = 0; @@ -265,66 +361,45 @@ void main() render_passes, texture_memory_alloc.region_array[0].start, texture_memory_alloc.object_list[0].start); - background_parameter2(texture_memory_alloc.background[0].start, 0xffc0c0c0); + region_array_multipass(tile_width, + tile_height, + opb_size2, + render_passes, + texture_memory_alloc.region_array[1].start, + texture_memory_alloc.object_list[1].start); + background_parameter2(texture_memory_alloc.background[1].start, + 0xffc0c0c0); holly.FB_R_SOF1 = texture_memory_alloc.framebuffer[0].start; holly.FB_R_CTRL = fb_r_ctrl::vclk_div::pclk_vclk_1 - | fb_r_ctrl::fb_depth::_0888_rgb_32bit + | fb_r_ctrl::fb_depth::_565_rgb_16bit | fb_r_ctrl::fb_enable; holly.FB_R_SIZE = fb_r_size::fb_modulus(1) | fb_r_size::fb_y_size(480 - 3) - | fb_r_size::fb_x_size((640 * 32) / 32 - 1); + | fb_r_size::fb_x_size((640 * 16) / 32 - 1); - holly.FB_W_CTRL = fb_w_ctrl::fb_packmode::_8888_argb_32bit; + holly.TEXT_CONTROL = text_control::stride(20); // 640 pixels - system.LMMODE0 = 1; - system.LMMODE1 = 1; // 32-bit - - uint32_t * out = (uint32_t *)&texture_memory32[texture_memory_alloc.framebuffer[0].start / 4]; - for (int i = 0; i < 640 * 480; i++) { - out[i] = 0xffff0000; - } - - 
ta_polygon_converter_init2(texture_memory_alloc.isp_tsp_parameters[0].start, - texture_memory_alloc.isp_tsp_parameters[0].end, - texture_memory_alloc.object_list[0].start, - texture_memory_alloc.object_list[0].end, - opb_size[0].total(), - ta_alloc, - tile_width, - tile_height); - transfer_scene(theta); - ta_wait_translucent_list(); - - const uint32_t bytes_per_pixel = 4; - core_start_render3(texture_memory_alloc.region_array[0].start, - texture_memory_alloc.isp_tsp_parameters[0].start, - texture_memory_alloc.background[0].start, - //texture_memory_alloc.framebuffer[0].start, - 0x100'0000 | texture_memory_alloc.texture.start, // 64-bit area - framebuffer_width, - bytes_per_pixel); - - ta_polygon_converter_init2(texture_memory_alloc.isp_tsp_parameters[0].start, - texture_memory_alloc.isp_tsp_parameters[0].end, - texture_memory_alloc.object_list[0].start, - texture_memory_alloc.object_list[0].end, - opb_size[0].total(), - ta_alloc, - tile_width, - tile_height); - transfer_scene(theta); + //system.LMMODE0 = 1; + //system.LMMODE1 = 1; // 32-bit + system.LMMODE0 = 0; + system.LMMODE1 = 0; // 64-bit uint32_t * in = (uint32_t *)&texture_memory64[texture_memory_alloc.texture.start / 4]; - uint32_t * framebuffer = (uint32_t *)(0x11000000 + texture_memory_alloc.framebuffer[0].start); + + /* + for (int i = 0; i < 640 * 480; i++) { + uint32_t * framebuffer = (uint32_t *)(0x11000000 + texture_memory_alloc.framebuffer[0].start); + framebuffer[i] = 0xffff0000; + } + */ while (1) { - ta_wait_translucent_list(); ta_polygon_converter_init2(texture_memory_alloc.isp_tsp_parameters[0].start, texture_memory_alloc.isp_tsp_parameters[0].end, texture_memory_alloc.object_list[0].start, @@ -333,32 +408,75 @@ void main() ta_alloc, tile_width, tile_height); - transfer_scene(theta); + //serial::string("wait_tl1\n"); + ta_wait_translucent_list(); + //serial::string("wait_tl1 end\n"); + + holly.FB_W_CTRL = fb_w_ctrl::fb_packmode::_4444_argb_16bit; - core_wait_end_of_render_video(); 
core_start_render3(texture_memory_alloc.region_array[0].start, texture_memory_alloc.isp_tsp_parameters[0].start, texture_memory_alloc.background[0].start, - //texture_memory_alloc.framebuffer[0].start, 0x100'0000 | texture_memory_alloc.texture.start, // 64-bit area framebuffer_width, - bytes_per_pixel); + 2); // bytes_per_pixel + //serial::string("wait_eorv1\n"); + core_wait_end_of_render_video(); + //serial::string("wait_eorv1 end\n"); - dma_transfer((uint32_t)in, (uint32_t)inbuf, 640 * 480 * 4 / 32); + dma_transfer((uint32_t)in, (uint32_t)inbuf, 640 * 480 * 2 / 32); while ((sh7091.DMAC.CHCR1 & dmac::chcr::te::transfers_completed) == 0); - sobel_fipr_store_queue2(inbuf, framebuffer, temp); + //sobel_fipr_store_queue2(inbuf, out, temp); + int frame = frame_ix & 1; + uint32_t * framebuffer = (uint32_t *)(0x11000000 + texture_memory_alloc.framebuffer[0].start); + uint32_t * out = (uint32_t *)(0x11000000 + texture_memory_alloc.texture.start + 640 * 480 * 2); + //serial::string("sobel\n"); + //sobel_fipr_store_queue2(inbuf, framebuffer, temp); + sobel_fipr_store_queue2(inbuf, out, temp); + + ta_polygon_converter_init2(texture_memory_alloc.isp_tsp_parameters[1].start, + texture_memory_alloc.isp_tsp_parameters[1].end, + texture_memory_alloc.object_list[1].start, + texture_memory_alloc.object_list[1].end, + opb_size2[0].total(), + ta_alloc2, + tile_width, + tile_height); + + const uint32_t texture_address0 = texture_memory_alloc.texture.start; + transfer_translucent_quad(texture_address0, false); + const uint32_t texture_address1 = texture_memory_alloc.texture.start + 640 * 480 * 2; + transfer_translucent_quad(texture_address1, true); + *reinterpret_cast(store_queue) = + ta_global_parameter::end_of_list(para_control::para_type::end_of_list); + sq_transfer_32byte(ta_fifo_polygon_converter); + + //serial::string("wait_tl2\n"); + ta_wait_translucent_list(); + //serial::string("wait_tl2 end\n"); + + holly.FB_W_CTRL = fb_w_ctrl::fb_packmode::_565_rgb_16bit; + + 
core_start_render3(texture_memory_alloc.region_array[1].start, + texture_memory_alloc.isp_tsp_parameters[1].start, + texture_memory_alloc.background[1].start, + texture_memory_alloc.framebuffer[frame].start, + framebuffer_width, + 2); // bytes_per_pixel + //serial::string("wait_eorv2\n"); + core_wait_end_of_render_video(); + //serial::string("wait_eorv2 end\n"); + + while (!spg_status::vsync(holly.SPG_STATUS)); + holly.FB_R_SOF1 = texture_memory_alloc.framebuffer[frame].start; + while (spg_status::vsync(holly.SPG_STATUS)); theta += half_degree; frame_ix += 1; - if (frame_ix > 100) - break; } - ta_wait_translucent_list(); - core_wait_end_of_render_video(); - serial::string("return\n"); serial::string("return\n"); serial::string("return\n"); diff --git a/holly/core.cpp b/holly/core.cpp index dc46718..435fdaa 100644 --- a/holly/core.cpp +++ b/holly/core.cpp @@ -140,18 +140,43 @@ void core_wait_end_of_render_video() "Furthermore, it is strongly recommended that the End of ISP and End of Video interrupts be cleared at the same time in order to make debugging easier when an error occurs." 
*/ - while ((system.ISTNRM & istnrm::end_of_render_tsp) == 0) { + //serial::string("eorv\n"); + int64_t count = 0; + while (1) { + uint32_t istnrm = system.ISTNRM; + if ((istnrm & istnrm::end_of_render_tsp) != 0) + break; + if (istnrm & 0xc0000000) { + serial::string("istnrm "); + serial::integer(istnrm); + serial::string("isterr "); + serial::integer(system.ISTERR); + } + + //serial::integer(system.ISTERR); if (system.ISTERR) { //serial::string("core "); //serial::integer(system.ISTERR); holly.SOFTRESET = softreset::pipeline_soft_reset; holly.SOFTRESET = 0; + //break; + } + if (count > 10000000) { + serial::string("core timeout:\n"); + serial::string("isterr "); + serial::integer(system.ISTERR); + serial::string("istnrm "); + serial::integer(system.ISTNRM); break; } + count += 1; }; system.ISTNRM = istnrm::end_of_render_tsp | istnrm::end_of_render_isp | istnrm::end_of_render_video; + + holly.SOFTRESET = softreset::pipeline_soft_reset; + holly.SOFTRESET = 0; } void core_flip(uint32_t frame_ix) diff --git a/holly/texture_memory_alloc3.hpp b/holly/texture_memory_alloc3.hpp index fcd1c92..7bdd9ff 100644 --- a/holly/texture_memory_alloc3.hpp +++ b/holly/texture_memory_alloc3.hpp @@ -30,10 +30,10 @@ constexpr texture_memory_alloc texture_memory_alloc = { .background = {{0x07'ffe0, 0x08'0000}, {0x47'ffe0, 0x48'0000}}, .object_list = {{0x08'0000, 0x0f'ffe0}, {0x48'0000, 0x4f'ffe0}}, // ~122880 object list pointers .region_array = {{0x10'0000, 0x11'0000}, {0x50'0000, 0x51'0000}}, // ~9 render passes - //.framebuffer = {{0x11'0000, 0x1b'8c00}, {0x51'0000, 0x5b'8c00}}, // 720x480*2 - .framebuffer = {{0x11'0000, 0x23'c000}, {0x51'0000, 0x63'c000}}, // 640x480*4 + .framebuffer = {{0x11'0000, 0x1b'8c00}, {0x51'0000, 0x5b'8c00}}, // 720x480*2 + //.framebuffer = {{0x11'0000, 0x23'c000}, {0x51'0000, 0x63'c000}}, // 640x480*4 // 64-bit addresses - //.texture = {0x37'1800, 0x80'0000} - .texture = {0x57'1800, 0x80'0000} + .texture = {0x37'1800, 0x80'0000} + //.texture = {0x57'1800, 
0x80'0000} }; diff --git a/sobel_fipr_inner2.s b/sobel_fipr_inner2.s index 59528f1..a24afe2 100644 --- a/sobel_fipr_inner2.s +++ b/sobel_fipr_inner2.s @@ -1,3 +1,5 @@ + .macro inner_multiplication + /* y multiplication */ mov #4,r1 /* r1 : temporary */ fmov.s @r0,fr0 /* 0 */ @@ -49,11 +51,27 @@ fsts FPUL,fr3 fadd fr3,fr7 + add #4,r0 /* next pixel */ + fschg fmov xd0,dr0 /* load 100.f constant */ fcmp/gt fr0,fr7 fschg + + .endm + + .macro sobel_fipr_inner_2px + mov #0,r9 + + inner_multiplication movt r9 add #-1,r9 + extu.w r9,r9 - add #4,r0 /* next pixel */ + inner_multiplication + movt r1 + add #-1,r1 + extu.w r1,r1 + shll16 r1 + or r1,r9 + .endm diff --git a/sobel_fipr_store_queue2.s b/sobel_fipr_store_queue2.s index f32a62e..0bcac70 100644 --- a/sobel_fipr_store_queue2.s +++ b/sobel_fipr_store_queue2.s @@ -15,7 +15,7 @@ _sobel_fipr_store_queue2: /* r11: var (y loop counter) */ /* r12: var (prefetch address: input address + 1280 4) */ /* r13: var (input address) */ - /* r14: - */ + /* r14: (temporary) */ __setup: mov.l r8,@-r15 @@ -24,6 +24,7 @@ __setup: mov.l r11,@-r15 mov.l r12,@-r15 mov.l r13,@-r15 + mov.l r14,@-r15 fmov.s fr12,@-r15 fmov.s fr13,@-r15 fmov.s fr14,@-r15 @@ -81,7 +82,7 @@ __setup: nop .align 4 -_const_100f: .float 3900 +_const_100f: .float 50 _const_store_queue: .long 0xe0000000 _const_store_queue_mask: .long 0x03ffffc0 /* (0xffffffff & (~0b111111)) & (~(0b111111 << 26)) */ @@ -93,54 +94,56 @@ _const_1280: .short (1280 * 4) _const_1281: .short (1281 * 4) _const_1282: .short (1282 * 4) - /* use r10 as temporary to load the first 1280 pixels; 8 pixels per loop iteration */ + /* use r10 as temporary to load the first 1280 pixels; 16 pixels per loop iteration */ + .include "unpack_pixel.s" .align 4 _prime_pixels_loop_init: - mov #80,r10 /* 1280 / 8 */ - shll r10 mov r0,r12 + mov #80,r10 /* 1280 / 16 */ + shll r10 _prime_pixels_loop: - .include "unpack_pixel.s" + unpack_pixel_16 dt r10 bt _loop_init bra _prime_pixels_loop nop - .align 4 _loop_init: - 
 /* skip first row */ - add r3,r0 /* r3: const (640 * 4) */ - add r3,r8 + /* skip first output row */ + mov r3,r1 + shlr r1 + add r1,r8 /* r1 = r3 / 2 (r3: 640 * 4) */ mov.w _const_height,r11 /* 478 */ bra _loop - mov #80,r10 /* 640 / 8 */ + mov #40,r10 /* 640 / 16 */ -_const_height: .short 478 +_const_height: .short 476 +/*_const_height: .short 238*/ + .include "sobel_fipr_inner2.s" _loop: _loop_width: /* prefetch at r8 + 1280 */ + unpack_pixel_16 - /* process the next 8 pixels */ - .include "unpack_pixel.s" - - .include "sobel_fipr_inner2.s" + /* process the next 16 pixels */ + sobel_fipr_inner_2px mov.l r9,@r8 /* save result in the store queue */ - .include "sobel_fipr_inner2.s" + sobel_fipr_inner_2px mov.l r9,@(4,r8) /* save result in the store queue */ - .include "sobel_fipr_inner2.s" + sobel_fipr_inner_2px mov.l r9,@(8,r8) /* save result in the store queue */ - .include "sobel_fipr_inner2.s" + sobel_fipr_inner_2px mov.l r9,@(12,r8) /* save result in the store queue */ - .include "sobel_fipr_inner2.s" + sobel_fipr_inner_2px mov.l r9,@(16,r8) /* save result in the store queue */ - .include "sobel_fipr_inner2.s" + sobel_fipr_inner_2px mov.l r9,@(20,r8) /* save result in the store queue */ - .include "sobel_fipr_inner2.s" + sobel_fipr_inner_2px mov.l r9,@(24,r8) /* save result in the store queue */ - .include "sobel_fipr_inner2.s" + sobel_fipr_inner_2px mov.l r9,@(28,r8) /* save result in the store queue */ /* send the store queue */ @@ -158,7 +161,7 @@ _row_decrement: dt r11 bt _return bra _loop - mov #80,r10 /* 640 / 8 */ + mov #40,r10 /* 640 / 16 */ /* restore registers */ _return: @@ -166,6 +169,7 @@ _return: fmov.s @r15+,fr14 fmov.s @r15+,fr13 fmov.s @r15+,fr12 + mov.l @r15+,r14 mov.l @r15+,r13 mov.l @r15+,r12 mov.l @r15+,r11 diff --git a/unpack_pixel.s b/unpack_pixel.s index f5b1c29..e86dc3a 100644 --- a/unpack_pixel.s +++ b/unpack_pixel.s @@ -1,25 +1,63 @@ - ocbi @r13 - pref @r13 /* 32 bytes, 8 pixels */ - /* unpack the next 8 pixels */ - fschg + /* + mov.l @r13,r9 + 
 extu.b r9,r1 + shlr8 r9 + extu.b r9,r2 + add r1,r2 + shlr8 r9 + extu.b r9,r1 + add r1,r2 + shlr8 r9 + add r2,r9 + lds r9,fpul + add #4,r13 + */ + .macro unpack_pixel_inner_nibs - .include "unpack_pixel_inner.s" + mov.w @r13+,r9 + + mov r9,r1 /* nib0 */ + shlr2 r9 + shlr2 r9 + and r14,r1 + + mov r9,r2 /* nib1 */ + shlr2 r9 + shlr2 r9 + and r14,r2 + add r2,r1 + + mov r9,r2 /* nib2 */ + shlr2 r9 + shlr2 r9 + and r14,r2 + add r2,r1 + + and r14,r9 /* nib3 */ + add r9,r1 + + lds r1,fpul + + .endm + + .macro unpack_pixel_8 + unpack_pixel_inner_nibs float fpul,fr0 - .include "unpack_pixel_inner.s" + unpack_pixel_inner_nibs float fpul,fr1 - .include "unpack_pixel_inner.s" + unpack_pixel_inner_nibs float fpul,fr2 - .include "unpack_pixel_inner.s" + unpack_pixel_inner_nibs float fpul,fr3 - .include "unpack_pixel_inner.s" + unpack_pixel_inner_nibs float fpul,fr4 - .include "unpack_pixel_inner.s" + unpack_pixel_inner_nibs float fpul,fr5 - .include "unpack_pixel_inner.s" + unpack_pixel_inner_nibs float fpul,fr6 - .include "unpack_pixel_inner.s" + unpack_pixel_inner_nibs float fpul,fr7 fmov dr0,@r12 @@ -30,5 +68,17 @@ add #8,r12 fmov dr6,@r12 add #8,r12 + .endm + + .macro unpack_pixel_16 + ocbi @r13 + pref @r13 /* 32 bytes, 16 pixels */ + mov #15,r14 fschg + + unpack_pixel_8 + unpack_pixel_8 + + fschg + .endm diff --git a/unpack_pixel_inner.s b/unpack_pixel_inner.s index 2dcddf7..8b13789 100644 --- a/unpack_pixel_inner.s +++ b/unpack_pixel_inner.s @@ -1,12 +1 @@ - mov.l @r13,r9 - extu.b r9,r1 - shlr8 r9 - extu.b r9,r2 - add r1,r2 - shlr8 r9 - extu.b r9,r1 - add r1,r2 - shlr8 r9 - add r2,r9 - lds r9,fpul - add #4,r13 +