From 05e0a36ca73078391c852031add14ca652dad82c Mon Sep 17 00:00:00 2001
From: Zack Buhman
Date: Sat, 25 Jan 2025 20:51:30 -0600
Subject: [PATCH] wiffle_screen_space: use DMA for transfers to/from texture memory

---
Review notes (below the three-dash line; not part of the commit message):

 * example/holly_recv_dma.cpp: dma() is now called with (sizeof (buf)) / 32.
   DMATCR1 counts transfers of the CHCR1 transfer size (_32_byte), so the
   byte length requested 32 KiB into the 1 KiB `buf`.
 * example/holly_recv_dma.cpp: the ocbi loop now indexes buf[i * 8]. `buf`
   is a uint32_t array, so i * 32 advanced 128 bytes per iteration and ran
   past the end of the array.
 * example/wiffle_screen_space.cpp: the post-DMA cache loop is bounded by
   (sizeof (inbuf)) / 32. sizeof (640 * 480 * 4) is sizeof (int), which made
   the loop run zero times.
 * example/wiffle_screen_space.cpp: that loop uses ocbi instead of ocbp, in
   line with the maple.cpp / serial_load.cpp change in this same patch.
 * NOTE(review): serial_load.cpp rounds the invalidated range outwards; ocbi
   discards dirty data in any cache block shared with neighbouring objects.
   Confirm nothing else lives in those blocks.
 * NOTE(review): include names and template arguments written in <...> were
   missing from the copy under review and are reconstructed here (the
   <cstdint> -> <stdint.h> pair in particular is a guess). Indentation and
   blank context lines were rebuilt from the hunk line counts. Verify
   against the tree before applying.

 example/example.mk              |   7 ++
 example/holly_recv_dma.cpp      |  80 +++++++++++++++++
 example/wiffle_screen_space.cpp | 154 ++++++++++++++++++++++++++------
 maple/maple.cpp                 |   4 +-
 serial_load.cpp                 |   4 +-
 5 files changed, 216 insertions(+), 33 deletions(-)
 create mode 100644 example/holly_recv_dma.cpp

diff --git a/example/example.mk b/example/example.mk
index 05728ed..3ced61a 100644
--- a/example/example.mk
+++ b/example/example.mk
@@ -745,3 +745,10 @@ TEXTURE_MEMORY_OBJ = \
 
 example/texture_memory.elf: LDSCRIPT = $(LIB)/main.lds
 example/texture_memory.elf: $(START_OBJ) $(TEXTURE_MEMORY_OBJ)
+
+HOLLY_RECV_DMA_OBJ = \
+	example/holly_recv_dma.o \
+	sh7091/serial.o
+
+example/holly_recv_dma.elf: LDSCRIPT = $(LIB)/main.lds
+example/holly_recv_dma.elf: $(START_OBJ) $(HOLLY_RECV_DMA_OBJ)
diff --git a/example/holly_recv_dma.cpp b/example/holly_recv_dma.cpp
new file mode 100644
index 0000000..874c2bb
--- /dev/null
+++ b/example/holly_recv_dma.cpp
@@ -0,0 +1,80 @@
+#include "sh7091/sh7091.hpp"
+#include "sh7091/sh7091_bits.hpp"
+#include "sh7091/serial.hpp"
+#include "memorymap.hpp"
+
+static void dma(uint32_t source, uint32_t destination, uint32_t length)
+{
+  using namespace dmac;
+
+  sh7091.DMAC.CHCR1 = 0;
+
+  sh7091.DMAC.SAR1 = source;
+  sh7091.DMAC.DAR1 = destination;
+  sh7091.DMAC.DMATCR1 = length & 0x00ff'ffff;
+
+  sh7091.DMAC.CHCR1 = chcr::dm::destination_address_incremented
+                    | chcr::sm::source_address_incremented
+                    | chcr::rs::resource_select(0b0100) /* external address space → external address space */
+                    | chcr::tm::cycle_burst_mode /* transmit mode */
+                    //| chcr::tm::cycle_steal_mode /* transmit mode */
+                    | chcr::ts::_32_byte         /* transfer size */
+                    //| chcr::ie::interrupt_request_generated
+                    | chcr::de::channel_operation_enabled;
+}
+
+static uint32_t buf[256] __attribute__((aligned(32)));
+
+void main()
+{
+  for (int i = 0; i < 256; i++) {
+    buf[i] = 0;
+    texture_memory32[i] = (1 << 31) | i;
+  }
+
+  for (uint32_t i = 0; i < (sizeof (buf)) / 32; i++) {
+    uint32_t address = (uint32_t)&buf[0];
+    asm volatile ("ocbp @%0"
+                  : // output
+                  : "r" (address + (i * 32)) // input
+                  );
+  }
+
+  sh7091.DMAC.DMAOR = 0;
+
+  serial::integer(sh7091.DMAC.DMAOR);
+
+  serial::integer((uint32_t)&buf[0]);
+
+  dma((uint32_t)&texture_memory32[0], (uint32_t)&buf[0], (sizeof (buf)) / 32);
+
+  uint32_t last_dar = sh7091.DMAC.DAR1;
+  uint32_t count = 0;
+  while ((sh7091.DMAC.CHCR1 & dmac::chcr::te::transfers_completed) == 0) {
+    uint32_t dar = sh7091.DMAC.DAR1;
+    if (dar == last_dar)
+      count += 1;
+    else
+      count = 0;
+    if (count > 100)
+      return;
+  };
+  serial::integer(sh7091.DMAC.DMAOR);
+
+  for (uint32_t i = 0; i < (sizeof (buf)) / 32; i++) {
+    uint32_t address = (uint32_t)&buf[i * 8];
+    asm volatile ("ocbi @%0"
+                  : // output
+                  : "r" (address) // input
+                  );
+  }
+
+  serial::string("buf:\n");
+  for (int i = 0; i < 256; i++) {
+    serial::integer(buf[i]);
+  }
+  serial::string("return\n");
+  serial::string("return\n");
+  serial::string("return\n");
+  serial::string("return\n");
+}
diff --git a/example/wiffle_screen_space.cpp b/example/wiffle_screen_space.cpp
index bbc6664..8b5685c 100644
--- a/example/wiffle_screen_space.cpp
+++ b/example/wiffle_screen_space.cpp
@@ -1,24 +1,29 @@
-#include <cstdint>
+#include <stdint.h>
 
-#include "align.hpp"
-#include "holly/video_output.hpp"
-
-#include "holly/holly.hpp"
+#include "holly/background.hpp"
 #include "holly/core.hpp"
 #include "holly/core_bits.hpp"
-#include "holly/ta_fifo_polygon_converter.hpp"
-#include "holly/ta_parameter.hpp"
-#include "holly/ta_global_parameter.hpp"
-#include "holly/ta_vertex_parameter.hpp"
+#include "holly/holly.hpp"
 #include "holly/isp_tsp.hpp"
-#include "holly/ta_bits.hpp"
 #include "holly/region_array.hpp"
-#include "holly/background.hpp"
+#include "holly/ta_bits.hpp"
+#include "holly/ta_fifo_polygon_converter.hpp"
+#include "holly/ta_global_parameter.hpp"
+#include "holly/ta_parameter.hpp"
+#include "holly/ta_vertex_parameter.hpp"
 #include "holly/texture_memory_alloc3.hpp"
+#include "holly/video_output.hpp"
+
+#include "sh7091/sh7091.hpp"
+#include "sh7091/sh7091_bits.hpp"
+#include "sh7091/store_queue.hpp"
+#include "sh7091/serial.hpp"
+
 #include "memorymap.hpp"
+#include "systembus.hpp"
+#include "systembus_bits.hpp"
 
 #include "geometry/wiffle.hpp"
-#include "math/vec4.hpp"
 
 void convolve(uint32_t * in, uint32_t * out);
 
@@ -46,8 +51,7 @@ vec3 rotate(const vec3& vertex, float theta)
   return vec3(x, y, z);
 }
 
-void transform(ta_parameter_writer& parameter,
-               const uint32_t face_ix,
+void transform(const uint32_t face_ix,
                const float theta,
                const vec3 lights[3])
 {
@@ -65,7 +69,7 @@ void transform(ta_parameter_writer& parameter,
     | tsp_instruction_word::fog_control::no_fog
     | tsp_instruction_word::use_alpha;
 
-  parameter.append<ta_global_parameter::polygon_type_0>() =
+  *reinterpret_cast<ta_global_parameter::polygon_type_0 *>(store_queue) =
     ta_global_parameter::polygon_type_0(parameter_control_word,
                                         isp_tsp_instruction_word,
                                         tsp_instruction_word,
@@ -73,6 +77,7 @@ void transform(ta_parameter_writer& parameter,
                                         0, // data_size_for_sort_dma
                                         0  // next_address_for_sort_dma
                                         );
+  sq_transfer_32byte(ta_fifo_polygon_converter);
 
   auto& face = MODEL::faces[face_ix];
 
@@ -158,7 +163,7 @@ void transform(ta_parameter_writer& parameter,
 
     bool end_of_strip = i == strip_length - 1;
 
-    parameter.append<ta_vertex_parameter::polygon_type_1>() =
+    *reinterpret_cast<ta_vertex_parameter::polygon_type_1 *>(store_queue) =
       ta_vertex_parameter::polygon_type_1(polygon_vertex_parameter_control_word(end_of_strip),
                                           x, y, z,
                                           scale_z,  // alpha
@@ -166,22 +171,92 @@ void transform(ta_parameter_writer& parameter,
                                           scale_ny, // g
                                           scale_nz  // b
                                           );
+    sq_transfer_32byte(ta_fifo_polygon_converter);
   }
 }
 
-uint32_t ta_parameter_buf[((32 * 8192) + 32) / 4]
-__attribute__((aligned(32)));
+void dma_transfer(uint32_t source, uint32_t destination, uint32_t transfers)
+{
+  using namespace dmac;
 
-uint32_t inbuf[640 * 480];
-uint32_t outbuf[640 * 480];
+  volatile uint32_t _dummy = sh7091.DMAC.CHCR1;
+  (void)_dummy;
+
+  sh7091.DMAC.CHCR1 = 0;
+
+  sh7091.DMAC.SAR1 = source;
+  sh7091.DMAC.DAR1 = destination;
+  sh7091.DMAC.DMATCR1 = transfers & 0x00ff'ffff;
+
+  sh7091.DMAC.CHCR1 = chcr::dm::destination_address_incremented
+                    | chcr::sm::source_address_incremented
+                    | chcr::rs::resource_select(0b0100) /* auto request; external address space → external address space */
+                    | chcr::tm::cycle_burst_mode /* transmit mode */
+                    //| chcr::tm::cycle_steal_mode /* transmit mode */
+                    | chcr::ts::_32_byte         /* transfer size */
+                    //| chcr::ie::interrupt_request_generated
+                    | chcr::de::channel_operation_enabled;
+}
+
+void ch2_dma_transfer(uint32_t source, uint32_t destination, uint32_t transfers)
+{
+  using namespace dmac;
+
+  for (uint32_t i = 0; i < transfers; i++) {
+    asm volatile ("ocbwb @%0"
+                  : // output
+                  : "r" (source + (32 * i)) // input
+                  );
+  }
+
+  // this dummy read appears to be required on real hardware.
+  volatile uint32_t _dummy = sh7091.DMAC.CHCR2;
+  (void)_dummy;
+
+  /* start a new CH2-DMA transfer from "system memory" to "TA FIFO polygon converter" */
+  sh7091.DMAC.CHCR2 = 0; /* disable DMA channel */
+  sh7091.DMAC.SAR2 = reinterpret_cast<uint32_t>(source); /* start address, must be aligned to a CHCHR__TS-sized (32-byte) boundary */
+  sh7091.DMAC.DMATCR2 = dmatcr::transfer_count(transfers); /* transfer count, in CHCHR__TS-sized (32-byte) units */
+  sh7091.DMAC.CHCR2 = chcr::dm::destination_address_incremented
+                    | chcr::sm::source_address_incremented
+                    | chcr::rs::resource_select(0b0010) /* external request, single address mode;
+                                                           external address space → external device */
+                    | chcr::tm::cycle_burst_mode /* transmit mode */
+                    | chcr::ts::_32_byte         /* transfer size */
+                    | chcr::de::channel_operation_enabled;
+
+  system.C2DSTAT = c2dstat::texture_memory_start_address(destination); /* CH2-DMA destination address */
+  system.C2DLEN  = c2dlen::transfer_length(transfers * 32); /* CH2-DMA length (must be a multiple of 32) */
+  system.C2DST   = 1;          /* CH2-DMA start (an 'external' request from SH7091's perspective) */
+
+  // wait for ch2-dma completion
+  while ((system.ISTNRM & istnrm::end_of_dma_ch2_dma) == 0);
+  // reset ch2-dma interrupt status
+  system.ISTNRM = istnrm::end_of_dma_ch2_dma;
+}
+
+void dma_init()
+{
+  using namespace dmac;
+
+  sh7091.DMAC.CHCR0 = 0;
+  sh7091.DMAC.CHCR1 = 0;
+  sh7091.DMAC.CHCR2 = 0;
+  sh7091.DMAC.CHCR3 = 0;
+  sh7091.DMAC.DMAOR = dmaor::ddt::on_demand_data_transfer_mode /* on-demand data transfer mode */
+                    | dmaor::pr::ch2_ch0_ch1_ch3 /* priority mode; CH2 > CH0 > CH1 > CH3 */
+                    | dmaor::dme::operation_enabled_on_all_channels; /* DMAC master enable */
+
+}
+
+static uint32_t inbuf[640 * 480] __attribute__((aligned(32)));
+static uint32_t outbuf[640 * 480] __attribute__((aligned(32)));
 
 void main()
 {
+  dma_init();
   video_output::set_mode_vga();
 
-  // The address of `ta_parameter_buf` must be a multiple of 32 bytes.
-  // This is mandatory for ch2-dma to the ta fifo polygon converter.
-
   constexpr uint32_t ta_alloc = ta_alloc_ctrl::pt_opb::no_list
                               | ta_alloc_ctrl::tm_opb::no_list
                               | ta_alloc_ctrl::t_opb::_16x4byte
@@ -250,9 +325,8 @@ void main()
     lights[2].x = cos(theta + half_degree * 360.f) * 20;
     lights[2].z = sin(theta + half_degree * 360.f) * 20;
 
-    auto parameter = ta_parameter_writer(ta_parameter_buf);
     for (uint32_t i = 0; i < MODEL::num_faces; i++) {
-      transform(parameter, i, theta, lights);
+      transform(i, theta, lights);
     }
     /*
     transform2(parameter, lights[0], {1.f, 0.f, 0.f, 1.f});
@@ -260,8 +334,10 @@ void main()
     transform2(parameter, lights[2], {0.f, 0.f, 1.f, 1.f});
     */
 
-    parameter.append<ta_global_parameter::end_of_list>() = ta_global_parameter::end_of_list(para_control::para_type::end_of_list);
-    ta_polygon_converter_transfer(ta_parameter_buf, parameter.offset);
+    *reinterpret_cast<ta_global_parameter::end_of_list *>(store_queue) =
+      ta_global_parameter::end_of_list(para_control::para_type::end_of_list);
+    sq_transfer_32byte(ta_fifo_polygon_converter);
+
     ta_wait_translucent_list();
 
     holly.FB_W_CTRL = fb_w_ctrl::fb_packmode::_8888_argb_32bit;
@@ -276,11 +352,31 @@ void main()
     core_wait_end_of_render_video();
 
     uint32_t * in = (uint32_t *)&texture_memory64[texture_memory_alloc.texture.start / 4];
+    //uint32_t * out = (uint32_t *)&texture_memory32[texture_memory_alloc.framebuffer[0].start / 4];
+    serial::string("ch1 dma start\n");
+    dma_transfer((uint32_t)in, (uint32_t)inbuf, 640 * 480 * 4 / 32);
+    while ((sh7091.DMAC.CHCR1 & dmac::chcr::te::transfers_completed) == 0);
+    serial::string("ch1 dma end\n");
+    for (uint32_t i = 0; i < (sizeof (inbuf)) / 32; i++) {
+      uint32_t address = (uint32_t)&inbuf[0];
+      asm volatile ("ocbi @%0"
+                    : // output
+                    : "r" (address + (i * 32)) // input
+                    );
+    }
 
 
 
-    uint32_t * out = (uint32_t *)&texture_memory32[texture_memory_alloc.framebuffer[0].start / 4];
-    convolve(in, out);
+    serial::string("convolve start\n");
+    convolve(inbuf, outbuf);
+    serial::string("convolve end\n");
+
+    uint32_t framebuffer = 0x11000000 + texture_memory_alloc.framebuffer[0].start; // TA FIFO - Direct Texture Path
+    system.LMMODE0 = 1;
+    system.LMMODE1 = 1; // 32-bit
+    serial::string("ch2 dma start\n");
+    ch2_dma_transfer((uint32_t)outbuf, framebuffer, (640 * 480 * 4) / 32);
+    serial::string("ch2 dma end\n");
 
     while (!spg_status::vsync(holly.SPG_STATUS));
     holly.FB_R_SOF1 = texture_memory_alloc.framebuffer[0].start;
diff --git a/maple/maple.cpp b/maple/maple.cpp
index c14c5a8..55939c7 100644
--- a/maple/maple.cpp
+++ b/maple/maple.cpp
@@ -176,9 +176,9 @@ void dma_start(uint8_t const * const send_buf,
   // start maple DMA
   _dma_start(send_buf);
 
-  // purge operand cache block for recv buffer, prior to returning to the caller
+  // invalidate operand cache block for recv buffer, prior to returning to the caller
   for (uint32_t i = 0; i < align_32byte(recv_size) / 32; i++) {
-    asm volatile ("ocbp @%0"
+    asm volatile ("ocbi @%0"
                   : // output
                   : "r" (reinterpret_cast<uint32_t>(&recv_buf[32 * i])) // input
                   );
diff --git a/serial_load.cpp b/serial_load.cpp
index 29ce99e..cfe402a 100644
--- a/serial_load.cpp
+++ b/serial_load.cpp
@@ -192,9 +192,9 @@ void tick(struct maple_poll_state& poll_state)
         const uint8_t * buf = reinterpret_cast<const uint8_t *>(state.reply_crc.offset);
         const uint8_t * buf32 = reinterpret_cast<const uint8_t *>((state.reply_crc.offset / 32) * 32); // round down
 
-        // purge operand cache blocks for the data written by DMA, rounding up twice
+        // invalidate operand cache blocks for the data written by DMA, rounding up twice
         for (uint32_t i = 0; i < align_32byte(len) + 32; i += 32) {
-          asm volatile ("ocbp @%0"
+          asm volatile ("ocbi @%0"
                         : // output
                         : "r" (reinterpret_cast<uint32_t>(&buf32[i])) // input
                         );