diff --git a/example/example.mk b/example/example.mk
index e1ea0c0..5846cb3 100644
--- a/example/example.mk
+++ b/example/example.mk
@@ -223,6 +223,20 @@ WIFFLE_SCREEN_SPACE_OBJ = \
 example/wiffle_screen_space.elf: LDSCRIPT = $(LIB)/main.lds
 example/wiffle_screen_space.elf: $(START_OBJ) $(WIFFLE_SCREEN_SPACE_OBJ)
 
+WIFFLE_SCREEN_SPACE_STORE_QUEUE_OBJ = \
+	example/wiffle_screen_space_store_queue.o \
+	holly/video_output.o \
+	holly/core.o \
+	holly/region_array.o \
+	holly/background.o \
+	holly/ta_fifo_polygon_converter.o \
+	sh7091/serial.o \
+	sobel_fipr_store_queue.o \
+	sobel.o
+
+example/wiffle_screen_space_store_queue.elf: LDSCRIPT = $(LIB)/main.lds
+example/wiffle_screen_space_store_queue.elf: $(START_OBJ) $(WIFFLE_SCREEN_SPACE_STORE_QUEUE_OBJ)
+
 MODIFIER_VOLUME_OBJ = \
 	example/modifier_volume.o \
 	holly/video_output.o \
diff --git a/example/wiffle_screen_space_store_queue.cpp b/example/wiffle_screen_space_store_queue.cpp
new file mode 100644
index 0000000..f53a405
--- /dev/null
+++ b/example/wiffle_screen_space_store_queue.cpp
@@ -0,0 +1,443 @@
+#include <cstdint>
+
+#include "holly/background.hpp"
+#include "holly/core.hpp"
+#include "holly/core_bits.hpp"
+#include "holly/holly.hpp"
+#include "holly/isp_tsp.hpp"
+#include "holly/region_array.hpp"
+#include "holly/ta_bits.hpp"
+#include "holly/ta_fifo_polygon_converter.hpp"
+#include "holly/ta_global_parameter.hpp"
+#include "holly/ta_parameter.hpp"
+#include "holly/ta_vertex_parameter.hpp"
+#include "holly/texture_memory_alloc3.hpp"
+#include "holly/video_output.hpp"
+
+#include "sh7091/sh7091.hpp"
+#include "sh7091/sh7091_bits.hpp"
+#include "sh7091/store_queue.hpp"
+#include "sh7091/serial.hpp"
+
+#include "memorymap.hpp"
+#include "systembus.hpp"
+#include "systembus_bits.hpp"
+
+#include "geometry/wiffle.hpp"
+
+#include "sobel.hpp"
+
+constexpr float half_degree = 0.01745329f / 2;
+
+#define MODEL wiffle
+
+// Rotate `vertex` about the x axis by `theta`, then about the y axis by
+// `3.14 * sin(theta / 2)`.
+vec3 rotate(const vec3& vertex, float theta)
+{
+  float x = vertex.x;
+  float y = vertex.y;
+  float z = vertex.z;
+  float t;
+
+  t = y * cos(theta) - z * sin(theta);
+  z = y * sin(theta) + z * cos(theta);
+  y = t;
+
+  float theta2 = 3.14 * sin(theta / 2);
+
+  t = x * cos(theta2) - z * sin(theta2);
+  z = x * sin(theta2) + z * cos(theta2);
+  x = t;
+
+  return vec3(x, y, z);
+}
+
+// Send face `face_ix` of MODEL to ta_fifo_polygon_converter through the store
+// queue: one global parameter, then three vertex parameters. `lights` is only
+// referenced by the commented-out intensity calculation.
+void transform(const uint32_t face_ix,
+               const float theta,
+               const vec3 lights[3])
+{
+  const uint32_t parameter_control_word = para_control::para_type::polygon_or_modifier_volume
+                                        | para_control::list_type::translucent
+                                     // | obj_control::texture
+                                        | obj_control::col_type::floating_color
+                                        | obj_control::gouraud;
+
+  const uint32_t isp_tsp_instruction_word = isp_tsp_instruction_word::depth_compare_mode::greater
+                                          | isp_tsp_instruction_word::culling_mode::cull_if_positive;
+
+  const uint32_t tsp_instruction_word = tsp_instruction_word::src_alpha_instr::one
+                                      | tsp_instruction_word::dst_alpha_instr::zero
+                                      | tsp_instruction_word::fog_control::no_fog
+                                      | tsp_instruction_word::use_alpha;
+
+  *reinterpret_cast<ta_global_parameter::polygon_type_0 *>(store_queue) =
+    ta_global_parameter::polygon_type_0(parameter_control_word,
+                                        isp_tsp_instruction_word,
+                                        tsp_instruction_word,
+                                        0, // texture_control_word
+                                        0, // data_size_for_sort_dma
+                                        0  // next_address_for_sort_dma
+                                        );
+  sq_transfer_32byte(ta_fifo_polygon_converter);
+
+  auto& face = MODEL::faces[face_ix];
+
+  constexpr uint32_t strip_length = 3;
+  for (uint32_t i = 0; i < strip_length; i++) {
+    // world transform
+    uint32_t vertex_ix = face[i].vertex;
+    auto& vertex = MODEL::vertices[vertex_ix];
+    auto point = rotate(vertex, theta);
+
+    // lighting transform
+    uint32_t normal_ix = face[i].normal;
+    auto& normal = MODEL::normals[normal_ix];
+    auto n = rotate(normal, theta);
+
+    /*
+    vec4 color = {0.0, 0.0, 0.0, 1.0};
+
+    // intensity calculation
+    {
+      auto l = lights[0] - point;
+      auto n_dot_l = dot(n, l);
+      if (n_dot_l > 0) {
+        float distance = magnitude(lights[0] - point);
+        float attenuation = 1.0 / (1.0f
+                                   + 0.07f * distance
+                                   + 0.007f * (distance * distance));
+        color.x += 5.0 * attenuation;
+      }
+    }
+
+    {
+      auto l = lights[1] - point;
+      auto n_dot_l = dot(n, l);
+      if (n_dot_l > 0) {
+        float distance = magnitude(lights[1] - point);
+        float attenuation = 1.0 / (1.0f
+                                   + 0.07f * distance
+                                   + 0.007f * (distance * distance));
+        color.y += 5.0 * attenuation;
+      }
+    }
+
+    {
+      auto l = lights[2] - point;
+      auto n_dot_l = dot(n, l);
+      if (n_dot_l > 0) {
+        float distance = magnitude(lights[2] - point);
+        float attenuation = 1.0 / (1.0f
+                                   + 0.07f * distance
+                                   + 0.007f * (distance * distance));
+        color.z += 9.0 * attenuation;
+      }
+    }
+    */
+
+    float x = point.x;
+    float y = point.y;
+    float z = point.z;
+
+    x *= 1;
+    y *= 1;
+    z *= 1;
+
+    // camera transform
+    z += 90;
+
+    // perspective
+    x = x / z;
+    y = y / z;
+
+    // screen space transform
+    x *= 240.f;
+    y *= 240.f;
+    x += 320.f;
+    y += 240.f;
+    z = 1 / z;
+
+    float scale_nx = ((n.x - -1) / (1 - -1)) * (1 - 0);
+    float scale_ny = ((n.y - -1) / (1 - -1)) * (1 - 0);
+    float scale_nz = ((n.z - -1) / (1 - -1)) * (1 - 0);
+    float scale_z = ((point.z - -46) / (46 - -46)) * (1 - 0);
+
+    bool end_of_strip = i == strip_length - 1;
+
+    *reinterpret_cast<ta_vertex_parameter::polygon_type_1 *>(store_queue) =
+      ta_vertex_parameter::polygon_type_1(polygon_vertex_parameter_control_word(end_of_strip),
+                                          x, y, z,
+                                          scale_z,  // alpha
+                                          scale_nx, // r
+                                          scale_ny, // g
+                                          scale_nz  // b
+                                          );
+    sq_transfer_32byte(ta_fifo_polygon_converter);
+  }
+}
+
+// Start a DMAC channel 1 transfer of `transfers` 32-byte units from `source`
+// to `destination`; returns without waiting for the transfer to complete.
+void dma_transfer(uint32_t source, uint32_t destination, uint32_t transfers)
+{
+  using namespace dmac;
+
+  volatile uint32_t _dummy = sh7091.DMAC.CHCR1;
+  (void)_dummy;
+
+  sh7091.DMAC.CHCR1 = 0;
+
+  sh7091.DMAC.SAR1 = source;
+  sh7091.DMAC.DAR1 = destination;
+  sh7091.DMAC.DMATCR1 = transfers & 0x00ff'ffff;
+
+  sh7091.DMAC.CHCR1 = chcr::dm::destination_address_incremented
+                    | chcr::sm::source_address_incremented
+                    | chcr::rs::resource_select(0b0100) /* auto request; external address space → external address space */
+                    | chcr::tm::cycle_burst_mode /* transmit mode */
+                    //| chcr::tm::cycle_steal_mode /* transmit mode */
+                    | chcr::ts::_32_byte         /* transfer size */
+                    //| chcr::ie::interrupt_request_generated
+                    | chcr::de::channel_operation_enabled;
+}
+
+// Write back (ocbwb) the cache lines covering `transfers` 32-byte units at
+// `source`, then run a CH2-DMA transfer to `destination` and busy-wait for it.
+void ch2_dma_transfer(uint32_t source, uint32_t destination, uint32_t transfers)
+{
+  using namespace dmac;
+
+  for (uint32_t i = 0; i < transfers; i++) {
+    asm volatile ("ocbwb @%0"
+                  :                            // output
+                  : "r" (source + (32 * i))    // input
+                  );
+  }
+
+  // this dummy read appears to be required on real hardware.
+  volatile uint32_t _dummy = sh7091.DMAC.CHCR2;
+  (void)_dummy;
+
+  /* start a new CH2-DMA transfer from "system memory" to "TA FIFO polygon converter" */
+  sh7091.DMAC.CHCR2 = 0; /* disable DMA channel */
+  sh7091.DMAC.SAR2 = reinterpret_cast<uint32_t>(source); /* start address, must be aligned to a CHCHR__TS-sized (32-byte) boundary */
+  sh7091.DMAC.DMATCR2 = dmatcr::transfer_count(transfers); /* transfer count, in CHCHR__TS-sized (32-byte) units */
+  sh7091.DMAC.CHCR2 = chcr::dm::destination_address_incremented
+                    | chcr::sm::source_address_incremented
+                    | chcr::rs::resource_select(0b0010) /* external request, single address mode;
+                                                           external address space → external device */
+                    | chcr::tm::cycle_burst_mode /* transmit mode */
+                    | chcr::ts::_32_byte         /* transfer size */
+                    | chcr::de::channel_operation_enabled;
+
+  system.C2DSTAT = c2dstat::texture_memory_start_address(destination); /* CH2-DMA destination address */
+  system.C2DLEN  = c2dlen::transfer_length(transfers * 32);            /* CH2-DMA length (must be a multiple of 32) */
+  system.C2DST   = 1;                                                  /* CH2-DMA start (an 'external' request from SH7091's perspective) */
+
+  // wait for ch2-dma completion
+  while ((system.ISTNRM & istnrm::end_of_dma_ch2_dma) == 0);
+  // reset ch2-dma interrupt status
+  system.ISTNRM = istnrm::end_of_dma_ch2_dma;
+}
+
+// Clear CHCR0-3 (all four channels disabled), then program DMAOR.
+void dma_init()
+{
+  using namespace dmac;
+
+  sh7091.DMAC.CHCR0 = 0;
+  sh7091.DMAC.CHCR1 = 0;
+  sh7091.DMAC.CHCR2 = 0;
+  sh7091.DMAC.CHCR3 = 0;
+  sh7091.DMAC.DMAOR = dmaor::ddt::on_demand_data_transfer_mode       /* on-demand data transfer mode */
+                    | dmaor::pr::ch2_ch0_ch1_ch3                     /* priority mode; CH2 > CH0 > CH1 > CH3 */
+                    | dmaor::dme::operation_enabled_on_all_channels; /* DMAC master enable */
+
+}
+
+static uint32_t inbuf[640 * 480] __attribute__((aligned(32)));
+static float temp[640 * 480] __attribute__((aligned(32)));
+
+extern "C" int sobel_fipr_store_queue(float * a, uint32_t * i);
+
+// Convert inbuf to temp: each output float is the average of the four bytes
+// of the corresponding 32-bit input pixel.
+void make_temp()
+{
+  for (int i = 0; i < 640 * 480; i++) {
+    if ((i & 31) == 0) {
+      asm volatile ("pref @%0"
+                    :                             // output
+                    : "r" ((uint32_t)&inbuf[i])   // input
+                    );
+    }
+    uint32_t n = inbuf[i];
+    uint32_t sum;
+    sum = n & 0xff;
+    n >>= 8;
+    sum += n & 0xff;
+    n >>= 8;
+    sum += n & 0xff;
+    n >>= 8;
+    sum += n & 0xff;
+    temp[i] = (float)(sum * 0.25);
+  }
+}
+
+void main()
+{
+  dma_init();
+  video_output::set_mode_vga();
+
+  constexpr uint32_t ta_alloc = ta_alloc_ctrl::pt_opb::no_list
+                              | ta_alloc_ctrl::tm_opb::no_list
+                              | ta_alloc_ctrl::t_opb::_16x4byte
+                              | ta_alloc_ctrl::om_opb::no_list
+                              | ta_alloc_ctrl::o_opb::no_list;
+
+  const int render_passes = 1;
+  const struct opb_size opb_size[render_passes] = {
+    {
+      .opaque = 0,
+      .opaque_modifier = 0,
+      .translucent = 16 * 4,
+      .translucent_modifier = 0,
+      .punch_through = 0
+    }
+  };
+
+  holly.SOFTRESET = softreset::pipeline_soft_reset
+                  | softreset::ta_soft_reset;
+  holly.SOFTRESET = 0;
+
+  core_init();
+
+  uint32_t frame_ix = 0;
+
+  float theta = 0;
+  vec3 lights[3] = {
+    {0.f, 0.f, 0.f},
+    {0.f, 0.f, 0.f},
+    {0.f, 0.f, 0.f},
+  };
+
+  const int framebuffer_width = 640;
+  const int framebuffer_height = 480;
+  const int tile_width = framebuffer_width / 32;
+  const int tile_height = framebuffer_height / 32;
+
+  region_array_multipass(tile_width,
+                         tile_height,
+                         opb_size,
+                         render_passes,
+                         texture_memory_alloc.region_array[0].start,
+                         texture_memory_alloc.object_list[0].start);
+
+  background_parameter2(texture_memory_alloc.background[0].start,
+                        0xffc0c0c0);
+
+
+  holly.FB_R_SOF1 = texture_memory_alloc.framebuffer[0].start;
+
+  holly.FB_R_CTRL = fb_r_ctrl::vclk_div::pclk_vclk_1
+                  | fb_r_ctrl::fb_depth::_0888_rgb_32bit
+                  | fb_r_ctrl::fb_enable;
+
+  holly.FB_R_SIZE = fb_r_size::fb_modulus(1)
+                  | fb_r_size::fb_y_size(480 - 3)
+                  | fb_r_size::fb_x_size((640 * 32) / 32 - 1);
+
+  system.LMMODE0 = 1;
+  system.LMMODE1 = 1; // 32-bit
+
+  uint32_t * out = (uint32_t *)&texture_memory32[texture_memory_alloc.framebuffer[0].start / 4];
+  for (int i = 0; i < 640 * 480; i++) {
+    out[i] = 0xffff0000;
+  }
+
+  while (1) {
+    ta_polygon_converter_init2(texture_memory_alloc.isp_tsp_parameters[0].start,
+                               texture_memory_alloc.isp_tsp_parameters[0].end,
+                               texture_memory_alloc.object_list[0].start,
+                               texture_memory_alloc.object_list[0].end,
+                               opb_size[0].total(),
+                               ta_alloc,
+                               tile_width,
+                               tile_height);
+
+    float theta2 = 3.14 * 2 * sin(theta / 7);
+
+    lights[0].x = cos(theta) * 20;
+    lights[0].z = sin(theta) * 20;
+
+    lights[1].x = cos(theta2 + half_degree * 180.f) * 20;
+    lights[1].z = sin(theta2 + half_degree * 180.f) * 20;
+
+    lights[2].x = cos(theta + half_degree * 360.f) * 20;
+    lights[2].z = sin(theta + half_degree * 360.f) * 20;
+
+    for (uint32_t i = 0; i < MODEL::num_faces; i++) {
+      transform(i, theta, lights);
+    }
+    /*
+    transform2(parameter, lights[0], {1.f, 0.f, 0.f, 1.f});
+    transform2(parameter, lights[1], {0.f, 1.f, 0.f, 1.f});
+    transform2(parameter, lights[2], {0.f, 0.f, 1.f, 1.f});
+    */
+
+    *reinterpret_cast<ta_global_parameter::end_of_list *>(store_queue) =
+      ta_global_parameter::end_of_list(para_control::para_type::end_of_list);
+    sq_transfer_32byte(ta_fifo_polygon_converter);
+
+    ta_wait_translucent_list();
+
+    holly.FB_W_CTRL = fb_w_ctrl::fb_packmode::_8888_argb_32bit;
+    uint32_t bytes_per_pixel = 4;
+    core_start_render3(texture_memory_alloc.region_array[0].start,
+                       texture_memory_alloc.isp_tsp_parameters[0].start,
+                       texture_memory_alloc.background[0].start,
+                       //texture_memory_alloc.framebuffer[0].start,
+                       0x100'0000 | texture_memory_alloc.texture.start, // 64-bit area
+                       framebuffer_width,
+                       bytes_per_pixel);
+    core_wait_end_of_render_video();
+
+    uint32_t * in = (uint32_t *)&texture_memory64[texture_memory_alloc.texture.start / 4];
+    uint32_t * framebuffer = (uint32_t *)(0x11000000 + texture_memory_alloc.framebuffer[0].start);
+
+
+    //serial::string("ch1 dma start\n");
+    dma_transfer((uint32_t)in, (uint32_t)inbuf, 640 * 480 * 4 / 32);
+
+    // purge (write back + invalidate) every 32-byte cache line of inbuf so
+    // that make_temp does not read stale cached data after the DMA
+    for (uint32_t i = 0; i < (sizeof (inbuf)) / 32; i++) {
+      uint32_t address = (uint32_t)&inbuf[0];
+      asm volatile ("ocbp @%0"
+                    :                            // output
+                    : "r" (address + (i * 32))   // input
+                    );
+    }
+
+    while ((sh7091.DMAC.CHCR1 & dmac::chcr::te::transfers_completed) == 0);
+    //serial::string("ch1 dma end\n");
+
+    //serial::string("temp start\n");
+    make_temp();
+    //serial::string("temp end\n");
+
+    //serial::string("convolve start\n");
+
+    //convolve(temp, outbuf);
+    sobel_fipr_store_queue(temp, framebuffer);
+    //serial::integer((uint32_t)temp);
+    //serial::integer(a);
+    //serial::string("convolve end\n");
+
+    theta += half_degree;
+  }
+}
diff --git a/sobel_fipr_inner.s b/sobel_fipr_inner.s
new file mode 100644
index 0000000..82a8d5b
--- /dev/null
+++ b/sobel_fipr_inner.s
@@ -0,0 +1,56 @@
+	/* y multiplication */
+	fmov.s @r0,fr0		/* 0 */
+	fmov.s @(r0,r1),fr1	/* 1 */
+	fmov.s @(r0,r2),fr2	/* 2 */
+	fldi0 fr3
+	fipr fv8,fv0
+
+	fmov.s @(r0,r5),fr4	/* 1280 */
+	fmov.s @(r0,r6),fr5	/* 1281 */
+	fmov.s @(r0,r7),fr6	/* 1282 */
+	fldi0 fr7
+	fipr fv12,fv4
+
+	fadd fr3,fr7
+	fmul fr7,fr7
+
+	/* save fr7 in FPUL */
+	flds fr7,FPUL
+
+	/* x multiplication */
+	/* transpose and load
+	before →
+	fr0, fr1, fr2, _,
+	   ,    ,    ,  ,
+	fr4, fr5, fr6, _,
+
+	after →
+	fr0,    , fr4, _,
+	fr1,    , fr5, _,
+	fr2,    , fr6, _,
+	*/
+	/* exchange fr4/fr2 */
+	fmov fr4,fr3
+	fmov fr2,fr4
+	fmov fr3,fr2
+	/* load fr1,fr5 */
+	fmov.s @(r0,r3),fr1	/* 640 */
+	fldi0 fr3
+	fipr fv8,fv0
+	fmov.s @(r0,r4),fr5	/* 642 */
+	fldi0 fr7
+	fipr fv12,fv4
+
+	fadd fr3,fr7
+	fmul fr7,fr7
+	/* restore FPUL from y multiplication */
+	fsts FPUL,fr3
+	fadd fr3,fr7
+
+	fmov dr0,xd0		/* load 100.f constant */
+
+	fcmp/gt fr0,fr7
+	movt r9
+	add #-1,r9
+
+	add #4,r0		/* next pixel */
diff --git a/sobel_fipr_store_queue.s b/sobel_fipr_store_queue.s
new file mode 100644
index 0000000..06f1426
--- /dev/null
+++ b/sobel_fipr_store_queue.s
@@ -0,0 +1,174 @@
+	/* fv0 fv4 fv8 fv12 */
+	.global _sobel_fipr_store_queue
+_sobel_fipr_store_queue:
+	/* r0: var (input address) */
+	/* r1: const 1 * 4 */
+	/* r2: const 2 * 4 */
+	/* r3: const 640 * 4 */
+	/* r4: const 642 * 4 */
+	/* r5: const 1280 * 4 */
+	/* r6: const 1281 * 4 */
+	/* r7: const 1282 * 4 */
+	/* r8: var (output address / store queue) */
+	/* r9: var (result temporary) */
+	/* r10: var (x loop counter) */
+	/* r11: var (y loop counter) */
+	/* r12: var (prefetch address: input address + 1280 * 4) */
+	/* r13: - */
+	/* r14: - */
+
+__setup:
+	mov.l r8,@-r15
+	mov.l r9,@-r15
+	mov.l r10,@-r15
+	mov.l r11,@-r15
+	mov.l r12,@-r15
+	fmov.s fr12,@-r15
+	fmov.s fr13,@-r15
+	fmov.s fr14,@-r15
+	fmov.s fr15,@-r15
+
+	fldi1 fr8		/* 1.0 */
+	fldi1 fr9		/* 2.0 */
+	fldi1 fr10		/* 1.0 */
+	fldi0 fr11		/* 0.0 */
+	fadd fr9,fr9
+
+	fldi1 fr12
+	fmov fr9,fr13
+	fldi1 fr14
+	fldi0 fr15
+	fneg fr12
+	fneg fr13
+	fneg fr14
+
+	/* constants */
+	mova _const_100f,r0	/* use r0 as temporary */
+	fmov.s @r0,fr0
+
+	/* set qacr0 */
+	mov r5,r0		/* r5: C argument */
+	shlr16 r0		/* use r0 as temporary */
+	mov.l _const_qacr0,r9	/* use r9 as temporary */
+	shlr8 r0
+	and #28,r0		/* 0b11100 */
+	mov.l r0,@r9
+	mov.l r0,@(4,r9)	/* qacr1 */
+
+	/* translate r8 to store queue address; keep bits [25:6] */
+	mov r5,r8		/* r5: C argument */
+	mov.l _const_store_queue_mask,r0	/* use r0 as temporary */
+	and r0,r8
+	mov.l _const_store_queue,r9	/* use r9 as temporary */
+	or r9,r8		/* 0xe0000000 | (in_addr & 0x03ffffc0) */
+
+	/* save C input argument */
+	mov r4,r0		/* r4 saved as r0 */
+
+	/* offsets */
+	mov #(1 * 4),r1
+	mov #(2 * 4),r2
+	mov.w _const_640,r3
+	mov.w _const_642,r4
+	mov.w _const_1280,r5
+	mov.w _const_1281,r6
+	mov.w _const_1282,r7
+
+	mov #80,r10		/* 640 / 8 */
+	mov.w _const_height,r11	/* 478 */
+
+	bra _prefetch
+	nop
+
+	.align 4
+_const_100f:	.float 100
+
+_const_store_queue:	.long 0xe0000000
+_const_store_queue_mask:	.long 0x03ffffc0 /* (0xffffffff & (~0b111111)) & (~(0b111111 << 26)) */
+_const_qacr0:	.long 0xff000038
+
+_const_640:	.short (640 * 4)
+_const_642:	.short (642 * 4)
+_const_1280:	.short (1280 * 4)
+_const_1281:	.short (1281 * 4)
+_const_1282:	.short (1282 * 4)
+
+_const_height:	.short 478
+
+	.align 4
+_prefetch:
+	/* prefetch first 1280 pixels (160 prefetches) */
+	mov r0,r12		/* set once: r12 advances 32 bytes per prefetch */
+	.rept 160
+	ocbi @r12
+	pref @r12
+	add #32,r12
+	.endr
+
+	/* skip first row */
+	/* NOTE(review): advancing r0 here makes the last of the 478 rows read
+	   input row 480 (r0 + 1280 * 4) of a 480-row buffer -- confirm intent */
+	add r3,r0		/* r3: const (640 * 4) */
+	add r3,r8
+
+	/*add #4,r0 /* skip first pixel */
+	/*add #4,r8*/
+
+_loop:
+_loop_width:
+	/* prefetch at r8 + 1280 */
+	ocbi @r12
+	pref @r12
+	add #32,r12
+
+	/* process the next 8 pixels */
+	.include "sobel_fipr_inner.s"
+	mov.l r9,@r8		/* save result in the store queue */
+	.include "sobel_fipr_inner.s"
+	mov.l r9,@(4,r8)	/* save result in the store queue */
+	.include "sobel_fipr_inner.s"
+	mov.l r9,@(8,r8)	/* save result in the store queue */
+	.include "sobel_fipr_inner.s"
+	mov.l r9,@(12,r8)	/* save result in the store queue */
+	.include "sobel_fipr_inner.s"
+	mov.l r9,@(16,r8)	/* save result in the store queue */
+	.include "sobel_fipr_inner.s"
+	mov.l r9,@(20,r8)	/* save result in the store queue */
+	.include "sobel_fipr_inner.s"
+	mov.l r9,@(24,r8)	/* save result in the store queue */
+	.include "sobel_fipr_inner.s"
+	mov.l r9,@(28,r8)	/* save result in the store queue */
+
+	/* send the store queue */
+	pref @r8
+	add #32,r8
+
+	dt r10
+	bt _row_decrement
+	bra _loop_width
+	nop
+	/* end of _loop_width */
+
+_row_decrement:
+	/* row decrement */
+	dt r11
+	bt _return
+	bra _loop
+	mov #80,r10		/* 640 / 8 */
+
+	/* restore registers */
+_return:
+	fmov.s @r15+,fr15
+	fmov.s @r15+,fr14
+	fmov.s @r15+,fr13
+	fmov.s @r15+,fr12
+	mov.l @r15+,r12
+	mov.l @r15+,r11
+	mov.l @r15+,r10
+	mov.l @r15+,r9
+	mov.l @r15+,r8
+
+	rts
+	nop
+
+_const_638_b:	.short 638