From a71ac1c4b1f7095677e439892c532f0c7224d736 Mon Sep 17 00:00:00 2001 From: Zack Buhman Date: Fri, 2 Feb 2024 13:11:32 +0800 Subject: [PATCH] suzanne_profile: remove tearing Though I did spend much time thinking about this, my idea was not correct. The "tearing" and "previous frame is being shown while it is being drawn" are simply because that's exactly what the logic in holly/core.cpp did. This is no longer the case--by the time the newly-created core_flip function is called, the core render is complete, and we should switch the FB_R_SOF1 to the current framebuffer, not the one that is going to be written on the next frame. This also modifies alt.lds so that (non-startup) code now runs in the P1 area, with operand/instruction/copyback caches enabled. This caused a 10x speed increase in my testing. --- alt.lds | 17 ++--- example/suzanne_profile.cpp | 135 ++++++++++++++++++++++++++++-------- font/font_bitmap.cpp | 2 +- holly/core.cpp | 9 +-- holly/core.hpp | 2 +- sh7091/cache.cpp | 2 +- sh7091/cache.hpp | 2 +- sh7091/serial.cpp | 2 +- vga.cpp | 4 -- 9 files changed, 123 insertions(+), 52 deletions(-) diff --git a/alt.lds b/alt.lds index 77d5d52..89d4c0b 100644 --- a/alt.lds +++ b/alt.lds @@ -1,16 +1,23 @@ OUTPUT_FORMAT("elf32-shl", "elf32-shl", "elf32-shl") MEMORY { - p1ram : ORIGIN = 0xac020000, LENGTH = 0xff0000 + p1ram : ORIGIN = 0x8c020000, LENGTH = 0xff0000 + p2ram : ORIGIN = 0xac020000, LENGTH = 0xff0000 } SECTIONS { - . = ORIGIN(p1ram); + . = ORIGIN(p2ram); .text ALIGN(4) : SUBALIGN(4) { KEEP(*(.text.start)) *(.text.startup.*) + } > p2ram AT>p1ram + + . = ORIGIN(p1ram) + (. 
- ORIGIN(p2ram)); + + .text ALIGN(4) : SUBALIGN(4) + { *(.text.*) *(.text) } > p1ram @@ -33,12 +40,6 @@ SECTIONS KEEP(*(.ctors.*)) } > p1ram - .text.p2ram ALIGN(4) : SUBALIGN(4) - { - *(.p2ram) - *(.p2ram.*) - } > p1ram - .bss ALIGN(4) (NOLOAD) : SUBALIGN(4) { *(.bss) diff --git a/example/suzanne_profile.cpp b/example/suzanne_profile.cpp index f427313..5d3c4f9 100644 --- a/example/suzanne_profile.cpp +++ b/example/suzanne_profile.cpp @@ -16,12 +16,17 @@ #include "holly/background.hpp" #include "holly/texture_memory_alloc.hpp" #include "memorymap.hpp" +#include "sh7091/sh7091.hpp" +#include "sh7091/sh7091_bits.hpp" +#include "sh7091/serial.hpp" #include "geometry/suzanne.hpp" +#include "geometry/circle.hpp" #include "math/vec4.hpp" #include "font/font_bitmap.hpp" #include "verite_8x16.hpp" +#include "string.hpp" constexpr float half_degree = 0.01745329f / 2; @@ -235,8 +240,39 @@ void init_texture_memory(const struct opb_size& opb_size) uint32_t _ta_parameter_buf[((32 * 8192) + 32) / 4]; +static inline void label_number(ta_parameter_writer& parameter, + const char * label, + const uint32_t len, + const uint32_t number, + const uint32_t row) +{ + constexpr uint32_t max_label_len = 10; + char buf[8]; + + string::hex(buf, 8, number); + font_bitmap::transform_string(parameter, + 8, 16, // texture + 8, 16, // glyph + 16 + (8 * (max_label_len - len)), // position x + 16 * row, // position y + label, len); + font_bitmap::transform_string(parameter, + 8, 16, // texture + 8, 16, // glyph + 16 + (8 * (max_label_len + 1)), // position x + 16 * row, // position y + buf, 8); +} + void main() { + sh7091.TMU.TSTR = 0; // stop all timers + sh7091.TMU.TOCR = tmu::tocr::tcoe::tclk_is_external_clock_or_input_capture; + sh7091.TMU.TCR0 = tmu::tcr0::tpsc::p_phi_256; // 256 / 200MHz = 1.28 μs ; underflows in ~1 hour + sh7091.TMU.TCOR0 = 0xffff'ffff; + sh7091.TMU.TCNT0 = 0xffff'ffff; + sh7091.TMU.TSTR = tmu::tstr::str0::counter_start; + vga(); auto src = 
reinterpret_cast(&_binary_verite_8x16_data_start); @@ -282,45 +318,86 @@ void main() {0.f, 0.f, 0.f}, }; + uint32_t t_transform_start = 0; + uint32_t t_transform_end = 0; + uint32_t t_text_start = 0; + uint32_t t_text_end = 0; + uint32_t t_transfer_start = 0; + uint32_t t_transfer_end = 0; + uint32_t t_render_start = 0; + uint32_t t_render_end = 0; + while (1) { ta_polygon_converter_init(opb_size.total(), ta_alloc, 640 / 32, 480 / 32); - - float theta2 = 3.14 * 2 * sin(theta / 7); - - lights[0].x = cos(theta) * 15; - lights[0].z = sin(theta) * 15; - - lights[1].x = cos(theta2 + half_degree * 180.f) * 15; - lights[1].z = sin(theta2 + half_degree * 180.f) * 15; - - lights[2].x = cos(theta + half_degree * 360.f) * 15; - lights[2].z = sin(theta + half_degree * 360.f) * 15; - auto parameter = ta_parameter_writer(ta_parameter_buf); - for (uint32_t i = 0; i < MODEL::num_faces; i++) { - transform(parameter, i, theta, lights); - } - transform2(parameter, lights[0], {1.f, 0.f, 0.f, 1.f}); - transform2(parameter, lights[1], {0.f, 1.f, 0.f, 1.f}); - transform2(parameter, lights[2], {0.f, 0.f, 1.f, 1.f}); - font_bitmap::transform_string(parameter, - 8, 16, // texture - 8, 16, // glyph - 40, 40, // position - "test", 4); + // transform start + t_transform_start = sh7091.TMU.TCNT0; + { + const float theta2 = 3.14 * 2 * sin(theta / 7); + + lights[0].x = cos(theta) * 15; + lights[0].z = sin(theta) * 15; + + lights[1].x = cos(theta2 + half_degree * 180.f) * 15; + lights[1].z = sin(theta2 + half_degree * 180.f) * 15; + + lights[2].x = cos(theta + half_degree * 360.f) * 15; + lights[2].z = sin(theta + half_degree * 360.f) * 15; + + for (uint32_t i = 0; i < MODEL::num_faces; i++) { + transform(parameter, i, theta, lights); + } + transform2(parameter, lights[0], {1.f, 0.f, 0.f, 1.f}); + transform2(parameter, lights[1], {0.f, 1.f, 0.f, 1.f}); + transform2(parameter, lights[2], {0.f, 0.f, 1.f, 1.f}); + } + t_transform_end = sh7091.TMU.TCNT0; + // transform end + + uint32_t _t_text_start = 
sh7091.TMU.TCNT0; + { + + const uint32_t transform = t_transform_start - t_transform_end; + label_number(parameter, "transform:", 10, transform, 1); + + const uint32_t text = t_text_start - t_text_end; + label_number(parameter, "text:", 5, text, 2); + + const uint32_t transfer = t_transfer_start - t_transfer_end; + label_number(parameter, "transfer:", 9, transfer, 3); + + const uint32_t render = t_render_start - t_render_end; + label_number(parameter, "render:", 7, render, 4); + } + t_text_start = _t_text_start; + t_text_end = sh7091.TMU.TCNT0; parameter.append() = ta_global_parameter::end_of_list(para_control::para_type::end_of_list); - ta_polygon_converter_transfer(ta_parameter_buf, parameter.offset); - ta_wait_opaque_list(); - core_start_render(frame_ix, num_frames); - v_sync_out(); - core_wait_end_of_render_video(frame_ix, num_frames); - theta += half_degree; + // transfer start + t_transfer_start = sh7091.TMU.TCNT0; + { + ta_polygon_converter_transfer(ta_parameter_buf, parameter.offset); + ta_wait_opaque_list(); + } + t_transfer_end = sh7091.TMU.TCNT0; + + t_render_start = sh7091.TMU.TCNT0; + core_start_render(frame_ix, num_frames); + core_wait_end_of_render_video(); + t_render_end = sh7091.TMU.TCNT0; + + while (!spg_status::vsync(holly.SPG_STATUS)) { + } + core_flip(frame_ix, num_frames); + while (spg_status::vsync(holly.SPG_STATUS)) { + } + + theta += half_degree * 0.5; frame_ix += 1; } } diff --git a/font/font_bitmap.cpp b/font/font_bitmap.cpp index 813ceba..f609380 100644 --- a/font/font_bitmap.cpp +++ b/font/font_bitmap.cpp @@ -152,7 +152,7 @@ void transform_string(ta_parameter_writer& parameter, x *= static_cast(glyph_width * 1); y *= static_cast(glyph_height * 1); - x += static_cast(position_x + glyph_width * 4 * string_ix); + x += static_cast(position_x + glyph_width * string_ix); y += static_cast(position_y); z = 1.f / (z + 10.f); diff --git a/holly/core.cpp b/holly/core.cpp index 0ea30c0..90a73c1 100644 --- a/holly/core.cpp +++ b/holly/core.cpp @@ 
-67,7 +67,7 @@ void core_start_render(uint32_t frame_address, holly.FB_W_CTRL = fb_w_ctrl::fb_dither | fb_w_ctrl::fb_packmode::_565_rgb_16bit; holly.FB_W_LINESTRIDE = (frame_linestride * 2) / 8; - uint32_t w_fb = ((frame_ix + 0) & num_frames) * frame_size; + uint32_t w_fb = (frame_ix & num_frames) * frame_size; holly.FB_W_SOF1 = frame_address + w_fb; holly.STARTRENDER = 1; @@ -95,11 +95,8 @@ void core_wait_end_of_render_video() | ISTNRM__END_OF_RENDER_VIDEO; } -void core_wait_end_of_render_video(uint32_t frame_ix, uint32_t num_frames) +void core_flip(uint32_t frame_ix, uint32_t num_frames) { - core_wait_end_of_render_video(); - - // hmm hacky... - uint32_t r_fb = ((frame_ix + 1) & num_frames) * 0x00096000; + uint32_t r_fb = (frame_ix & num_frames) * 0x00096000; holly.FB_R_SOF1 = (offsetof (struct texture_memory_alloc, framebuffer)) + r_fb; } diff --git a/holly/core.hpp b/holly/core.hpp index b9761dc..95c25e3 100644 --- a/holly/core.hpp +++ b/holly/core.hpp @@ -9,4 +9,4 @@ void core_start_render(uint32_t frame_address, void core_start_render(uint32_t frame_ix, uint32_t num_frames); void core_wait_end_of_render_video(); -void core_wait_end_of_render_video(uint32_t frame_ix, uint32_t num_frames); +void core_flip(uint32_t frame_ix, uint32_t num_frames); diff --git a/sh7091/cache.cpp b/sh7091/cache.cpp index f223b83..ee12709 100644 --- a/sh7091/cache.cpp +++ b/sh7091/cache.cpp @@ -25,7 +25,7 @@ void init() | ice::ic_used // instruction cache enable | oci::clear_v_and_u_bits_of_all_oc_entries // operand cache invalidate | oce::oc_used // operand cache enable - // | cb::copy_back_mode // enable copy-back mode for the P1 area + | cb::copy_back_mode // enable copy-back mode for the P1 area ; sh7091.CCN.MMUCR = ccn::mmucr::at::mmu_disabled; diff --git a/sh7091/cache.hpp b/sh7091/cache.hpp index 192c228..cf2e602 100644 --- a/sh7091/cache.hpp +++ b/sh7091/cache.hpp @@ -2,6 +2,6 @@ namespace cache { -void init() __attribute__ ((section (".p2ram.cache_init"))); +void init() 
__attribute__ ((section (".text.startup.cache_init"))); } diff --git a/sh7091/serial.cpp b/sh7091/serial.cpp index b3475ed..de42b71 100644 --- a/sh7091/serial.cpp +++ b/sh7091/serial.cpp @@ -33,7 +33,7 @@ void character(const char c) // wait for transmit fifo to become empty while ((sh7091.SCIF.SCFSR2 & scfsr2::tdfe::bit_mask) == 0); - for (int i = 0; i < 100000; i++) { + for (int i = 0; i < 10000; i++) { asm volatile ("nop;"); } diff --git a/vga.cpp b/vga.cpp index 7f4123c..a5af9ad 100644 --- a/vga.cpp +++ b/vga.cpp @@ -100,20 +100,16 @@ void vga2() void v_sync_in() { while (!spg_status::vsync(holly.SPG_STATUS)) { - asm volatile ("nop"); } while (spg_status::vsync(holly.SPG_STATUS)) { - asm volatile ("nop"); } } void v_sync_out() { while (spg_status::vsync(holly.SPG_STATUS)) { - asm volatile ("nop"); } while (!spg_status::vsync(holly.SPG_STATUS)) { - asm volatile ("nop"); } }