diff --git a/drm/texture.vs.asm b/drm/texture.vs.asm new file mode 100644 index 0000000..a63e3d2 --- /dev/null +++ b/drm/texture.vs.asm @@ -0,0 +1,2 @@ +out[0].xyzw = VE_ADD input[0].xyz1 input[0].0000 ; +out[1].xy = VE_ADD input[1].xy__ input[1].0000 ; diff --git a/drm/texture.vs.inc b/drm/texture.vs.inc new file mode 100644 index 0000000..7609903 --- /dev/null +++ b/drm/texture.vs.inc @@ -0,0 +1,2 @@ +0x00f00203, 0x01510001, 0x01248001, 0x01ffe001, +0x00302203, 0x01f90021, 0x01248021, 0x01ffe021, diff --git a/drm/texture_blur.c b/drm/texture_blur.c index e3d7835..e252340 100644 --- a/drm/texture_blur.c +++ b/drm/texture_blur.c @@ -502,7 +502,7 @@ int indirect_buffer() ////////////////////////////////////////////////////////////////////////////// const uint32_t fragment_shader[] = { - #include "texture_blur.fs.inc" + #include "texture_blur_horizontal.fs.inc" }; const int fragment_shader_length = (sizeof (fragment_shader)) / (sizeof (fragment_shader[0])); printf("fs length %d\n", fragment_shader_length); diff --git a/drm/texture_blur_combined.c b/drm/texture_blur_combined.c new file mode 100644 index 0000000..38f76d2 --- /dev/null +++ b/drm/texture_blur_combined.c @@ -0,0 +1,889 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "3d_registers.h" +#include "3d_registers_undocumented.h" +#include "3d_registers_bits.h" +#include "command_processor.h" + +static void * read_file(const char * filename) +{ + int fd = open(filename, O_RDONLY); + if (fd == -1) { + fprintf(stderr, "open(%s): %s\n", filename, strerror(errno)); + return NULL; + } + + off_t size = lseek(fd, 0, SEEK_END); + if (size == (off_t)-1) { + fprintf(stderr, "lseek(%s, SEEK_END): %s\n", filename, strerror(errno)); + return NULL; + } + + off_t start = lseek(fd, 0, SEEK_SET); + if (start == (off_t)-1) { + fprintf(stderr, "lseek(%s, SEEK_SET): %s\n", filename, strerror(errno)); + return NULL; + } + + void * buf = 
malloc(size+1); + + ssize_t read_size = read(fd, buf, size); + if (read_size == -1) { + fprintf(stderr, "read(%s): %s\n", filename, strerror(errno)); + return NULL; + } + ((char*)buf)[read_size] = 0; + + close(fd); + + return buf; +} + +union u32_f32 { + uint32_t u32; + float f32; +}; + +static union u32_f32 ib[16384]; + +int indirect_buffer(int ix, + int width, + int height, + int colorbuffer_reloc_ix, + int texturebuffer_reloc_ix, + int shader_ix, + bool intermediate) +{ + T0V(RB3D_DSTCACHE_CTLSTAT + , RB3D_DSTCACHE_CTLSTAT__DC_FLUSH(0x2) // Flush dirty 3D data + | RB3D_DSTCACHE_CTLSTAT__DC_FREE(0x2) // Free 3D tags + ); + + T0V(ZB_ZCACHE_CTLSTAT + , ZB_ZCACHE_CTLSTAT__ZC_FLUSH(1) + | ZB_ZCACHE_CTLSTAT__ZC_FREE(1) + ); + + T0V(WAIT_UNTIL, 0x00020000); + + T0V(GB_AA_CONFIG, 0x00000000); + + T0V(RB3D_AARESOLVE_CTL, 0x00000000); + + T0V(RB3D_CCTL + , RB3D_CCTL__INDEPENDENT_COLORFORMAT_ENABLE(1) + ); + + T0V(ZB_BW_CNTL, 0x00000000); + T0V(ZB_DEPTHCLEARVALUE, 0x00000000); + T0V(SC_HYPERZ_EN, 0x00000000); + T0V(GB_Z_PEQ_CONFIG, 0x00000000); + T0V(ZB_ZTOP + , ZB_ZTOP__ZTOP(1) + ); + T0V(FG_ALPHA_FUNC, 0x00000000); + T0V(ZB_CNTL, 0x00000000); + T0V(ZB_ZSTENCILCNTL, 0x00000000); + T0V(ZB_STENCILREFMASK, 0x00000000); + T0V(ZB_STENCILREFMASK_BF, 0x00000000); + + T0V(FG_ALPHA_VALUE, 0x00000000); + T0V(RB3D_ROPCNTL, 0x00000000); + T0V(RB3D_BLENDCNTL, 0x00000000); + T0V(RB3D_ABLENDCNTL, 0x00000000); + T0V(RB3D_COLOR_CHANNEL_MASK + , RB3D_COLOR_CHANNEL_MASK__BLUE_MASK(1) + | RB3D_COLOR_CHANNEL_MASK__GREEN_MASK(1) + | RB3D_COLOR_CHANNEL_MASK__RED_MASK(1) + | RB3D_COLOR_CHANNEL_MASK__ALPHA_MASK(1) + ); + T0V(RB3D_DITHER_CTL, 0x00000000); + T0V(RB3D_CONSTANT_COLOR_AR, 0x00000000); + T0V(RB3D_CONSTANT_COLOR_GB, 0x00000000); + + T0V(SC_CLIP_0_A, 0x00000000); + T0V(SC_CLIP_0_B, 0xffffffff); + T0V(SC_SCREENDOOR, 0x00ffffff); + + T0V(GB_SELECT, 0x00000000); + T0V(FG_FOG_BLEND, 0x00000000); + T0V(GA_OFFSET, 0x00000000); + T0V(SU_TEX_WRAP, 0x00000000); + T0Vf(SU_DEPTH_SCALE, 
16777215.0f); + T0V(SU_DEPTH_OFFSET, 0x00000000); + T0V(SC_EDGERULE + , SC_EDGERULE__ER_TRI(5) // L-in,R-out,HT-in,HB-in + | SC_EDGERULE__ER_POINT(9) // L-out,R-in,HT-in,HB-out + | SC_EDGERULE__ER_LINE_LR(5) // L-in,R-out,HT-in,HB-out + | SC_EDGERULE__ER_LINE_RL(9) // L-out,R-in,HT-in,HB-out + | SC_EDGERULE__ER_LINE_TB(26) // T-in,B-out,VL-out,VR-in + | SC_EDGERULE__ER_LINE_BT(22) // T-out,B-in,VL-out,VR-in + ); + T0V(RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD + , RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD__BLUE(1) + | RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD__GREEN(1) + | RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD__RED(1) + | RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD__ALPHA(1) + ); + T0V(RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD + , RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD__BLUE(254) + | RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD__GREEN(254) + | RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD__RED(254) + | RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD__ALPHA(254) + ); + T0V(GA_COLOR_CONTROL_PS3, 0x00000000); + T0V(SU_TEX_WRAP_PS3, 0x00000000); + T0V(VAP_PVS_STATE_FLUSH_REG, 0x00000000); + T0V(VAP_PVS_VTX_TIMEOUT_REG + , VAP_PVS_VTX_TIMEOUT_REG__CLK_COUNT(0xffff) + ); + T0Vf(VAP_GB_VERT_CLIP_ADJ, 1.0f); + T0Vf(VAP_GB_VERT_DISC_ADJ, 1.0f); + T0Vf(VAP_GB_HORZ_CLIP_ADJ, 1.0f); + T0Vf(VAP_GB_HORZ_DISC_ADJ, 1.0f); + T0V(VAP_PSC_SGN_NORM_CNTL + , VAP_PSC_SGN_NORM_CNTL__SGN_NORM_METHOD_0(2) + | VAP_PSC_SGN_NORM_CNTL__SGN_NORM_METHOD_1(2) + | VAP_PSC_SGN_NORM_CNTL__SGN_NORM_METHOD_2(2) + | VAP_PSC_SGN_NORM_CNTL__SGN_NORM_METHOD_3(2) + | VAP_PSC_SGN_NORM_CNTL__SGN_NORM_METHOD_4(2) + | VAP_PSC_SGN_NORM_CNTL__SGN_NORM_METHOD_5(2) + | VAP_PSC_SGN_NORM_CNTL__SGN_NORM_METHOD_6(2) + | VAP_PSC_SGN_NORM_CNTL__SGN_NORM_METHOD_7(2) + | VAP_PSC_SGN_NORM_CNTL__SGN_NORM_METHOD_8(2) + | VAP_PSC_SGN_NORM_CNTL__SGN_NORM_METHOD_9(2) + | VAP_PSC_SGN_NORM_CNTL__SGN_NORM_METHOD_10(2) + | VAP_PSC_SGN_NORM_CNTL__SGN_NORM_METHOD_11(2) + | VAP_PSC_SGN_NORM_CNTL__SGN_NORM_METHOD_12(2) + | VAP_PSC_SGN_NORM_CNTL__SGN_NORM_METHOD_13(2) + | 
VAP_PSC_SGN_NORM_CNTL__SGN_NORM_METHOD_14(2) + | VAP_PSC_SGN_NORM_CNTL__SGN_NORM_METHOD_15(2) + ); + T0V(VAP_TEX_TO_COLOR_CNTL, 0x00000000); + + T0V(VAP_CNTL + , VAP_CNTL__PVS_NUM_SLOTS(10) + | VAP_CNTL__PVS_NUM_CNTLRS(5) + | VAP_CNTL__PVS_NUM_FPUS(5) + | VAP_CNTL__VAP_NO_RENDER(0) + | VAP_CNTL__VF_MAX_VTX_NUM(12) + | VAP_CNTL__DX_CLIP_SPACE_DEF(0) + | VAP_CNTL__TCL_STATE_OPTIMIZATION(1) + ); + T0V(VAP_PVS_FLOW_CNTL_OPC, 0x00000000); + + T0(VAP_PVS_FLOW_CNTL_ADDRS_LW_0, 31); + for (int i = 0; i < 32; i++) + ib[ix++].u32 = 0x00000000; + + T0(VAP_PVS_FLOW_CNTL_LOOP_INDEX_0, 15); + for (int i = 0; i < 16; i++) + ib[ix++].u32 = 0x00000000; + + T0V(VAP_PVS_VECTOR_INDX_REG + , VAP_PVS_VECTOR_INDX_REG__OCTWORD_OFFSET(1536)); + T0_ONE_REG(VAP_PVS_VECTOR_DATA_REG_128, 23); + for (int i = 0; i < 24; i++) + ib[ix++].u32 = 0x00000000; + + T0V(VAP_VTX_STATE_CNTL + , VAP_VTX_STATE_CNTL__COLOR_0_ASSEMBLY_CNTL(1) + | VAP_VTX_STATE_CNTL__COLOR_1_ASSEMBLY_CNTL(1) + | VAP_VTX_STATE_CNTL__COLOR_2_ASSEMBLY_CNTL(1) + | VAP_VTX_STATE_CNTL__COLOR_3_ASSEMBLY_CNTL(1) + | VAP_VTX_STATE_CNTL__COLOR_4_ASSEMBLY_CNTL(1) + | VAP_VTX_STATE_CNTL__COLOR_5_ASSEMBLY_CNTL(1) + | VAP_VTX_STATE_CNTL__COLOR_6_ASSEMBLY_CNTL(1) + | VAP_VTX_STATE_CNTL__COLOR_7_ASSEMBLY_CNTL(1) + | VAP_VTX_STATE_CNTL__UPDATE_USER_COLOR_0_ENA(0) + ); + + T0V(GB_ENABLE, 0x00000000); + T0V(VAP_CNTL_STATUS, 0x00000000); + T0V(VAP_CLIP_CNTL + , VAP_CLIP_CNTL__PS_UCP_MODE(3) + ); + T0V(GA_POINT_SIZE + , GA_POINT_SIZE__HEIGHT(6) + | GA_POINT_SIZE__WIDTH(6) + ); + T0V(GA_POINT_MINMAX + , GA_POINT_MINMAX__MIN_SIZE(6) + | GA_POINT_MINMAX__MAX_SIZE(6) + ); + T0V(GA_LINE_CNTL + , GA_LINE_CNTL__WIDTH(6) + | GA_LINE_CNTL__END_TYPE(2) + | GA_LINE_CNTL__SORT(0) + ); + T0V(SU_POLY_OFFSET_ENABLE, 0x00000000); + T0V(SU_CULL_MODE, 0x00000000); + T0V(GA_LINE_STIPPLE_CONFIG, 0x00000000); + T0V(GA_LINE_STIPPLE_VALUE, 0x00000000); + T0V(GA_POLY_MODE, 0x00000000); + T0V(GA_ROUND_MODE + , GA_ROUND_MODE__GEOMETRY_ROUND(1) + | 
GA_ROUND_MODE__COLOR_ROUND(0) + | GA_ROUND_MODE__RGB_CLAMP(1) + | GA_ROUND_MODE__ALPHA_CLAMP(1) + | GA_ROUND_MODE__GEOMETRY_MASK(0) + ); + T0V(SC_CLIP_RULE + , SC_CLIP_RULE__CLIP_RULE(0xffff)); + T0Vf(GA_POINT_S0, 0.0f); + T0Vf(GA_POINT_T0, 1.0f); + T0Vf(GA_POINT_S1, 1.0f); + T0Vf(GA_POINT_T1, 0.0f); + if (intermediate) { + T0V(US_OUT_FMT_0 + , US_OUT_FMT__OUT_FMT(0) // C4_8 + | US_OUT_FMT__C0_SEL(1) // Blue + | US_OUT_FMT__C1_SEL(2) // Green + | US_OUT_FMT__C2_SEL(3) // Red + | US_OUT_FMT__C3_SEL(0) // Alpha + | US_OUT_FMT__OUT_SIGN(0) + ); + } else { + T0V(US_OUT_FMT_0 + , US_OUT_FMT__OUT_FMT(0) // C4_8 + | US_OUT_FMT__C0_SEL(3) // Blue + | US_OUT_FMT__C1_SEL(2) // Green + | US_OUT_FMT__C2_SEL(1) // Red + | US_OUT_FMT__C3_SEL(0) // Alpha + | US_OUT_FMT__OUT_SIGN(0) + ); + } + T0V(US_OUT_FMT_1 + , US_OUT_FMT__OUT_FMT(15) // render target is not used + ); + T0V(US_OUT_FMT_2 + , US_OUT_FMT__OUT_FMT(15) // render target is not used + ); + T0V(US_OUT_FMT_3 + , US_OUT_FMT__OUT_FMT(15) // render target is not used + ); + T0V(GB_MSPOS0 + , GB_MSPOS0__MS_X0(6) + | GB_MSPOS0__MS_Y0(6) + | GB_MSPOS0__MS_X1(6) + | GB_MSPOS0__MS_Y1(6) + | GB_MSPOS0__MS_X2(6) + | GB_MSPOS0__MS_Y2(6) + | GB_MSPOS0__MSBD0_Y(6) + | GB_MSPOS0__MSBD0_X(6) + ); + T0V(GB_MSPOS1 + , GB_MSPOS1__MS_X3(6) + | GB_MSPOS1__MS_Y3(6) + | GB_MSPOS1__MS_X4(6) + | GB_MSPOS1__MS_Y4(6) + | GB_MSPOS1__MS_X5(6) + | GB_MSPOS1__MS_Y5(6) + | GB_MSPOS1__MSBD1(6) + ); + T0V(US_CONFIG + , US_CONFIG__ZERO_TIMES_ANYTHING_EQUALS_ZERO(1) + ); + T0V(US_PIXSIZE + , US_PIXSIZE__PIX_SIZE(9) + ); + T0V(US_FC_CTRL, 0); + + T0V(FG_DEPTH_SRC, 0x00000000); + T0V(US_W_FMT, 0x00000000); + T0V(VAP_PVS_CONST_CNTL, 0x00000000); + T0V(VAP_INDEX_OFFSET, 0x00000000); + T0V(GA_COLOR_CONTROL + , GA_COLOR_CONTROL__RGB0_SHADING(2) + | GA_COLOR_CONTROL__ALPHA0_SHADING(2) + | GA_COLOR_CONTROL__RGB1_SHADING(2) + | GA_COLOR_CONTROL__ALPHA1_SHADING(2) + | GA_COLOR_CONTROL__RGB2_SHADING(2) + | GA_COLOR_CONTROL__ALPHA2_SHADING(2) + |
GA_COLOR_CONTROL__RGB3_SHADING(2) + | GA_COLOR_CONTROL__ALPHA3_SHADING(2) + | GA_COLOR_CONTROL__PROVOKING_VERTEX(3) + ); + + ////////////////////////////////////////////////////////////////////////////// + // CB + ////////////////////////////////////////////////////////////////////////////// + + T0V(RB3D_COLOROFFSET0 + , 0x00000000 // value replaced by kernel from relocs + ); + T3(_NOP, 0); + ib[ix++].u32 = colorbuffer_reloc_ix * 4; // index into relocs array + + T0V(RB3D_COLORPITCH0 + , RB3D_COLORPITCH__COLORPITCH(width >> 1) + | RB3D_COLORPITCH__COLORFORMAT(6) // ARGB8888 + ); + // The COLORPITCH NOP is ignored/not applied due to + // RADEON_CS_KEEP_TILING_FLAGS, but is still required. + T3(_NOP, 0); + ib[ix++].u32 = colorbuffer_reloc_ix * 4; // index into relocs array + + ////////////////////////////////////////////////////////////////////////////// + // SC + ////////////////////////////////////////////////////////////////////////////// + + T0V(SC_SCISSOR0 + , SC_SCISSOR0__XS0(0) + | SC_SCISSOR0__YS0(0) + ); + T0V(SC_SCISSOR1 + , SC_SCISSOR1__XS1(width - 1) + | SC_SCISSOR1__YS1(height - 1) + ); + + ////////////////////////////////////////////////////////////////////////////// + // VAP + ////////////////////////////////////////////////////////////////////////////// + + T0Vf(VAP_VPORT_XSCALE, ((float)height) * 0.5f); + T0Vf(VAP_VPORT_XOFFSET, ((float)width) * 0.5f); + T0Vf(VAP_VPORT_YSCALE, ((float)height) * -0.5f); + T0Vf(VAP_VPORT_YOFFSET, ((float)height) * 0.5f); + T0Vf(VAP_VPORT_ZSCALE, 0.5f); + T0Vf(VAP_VPORT_ZOFFSET, 0.5f); + + T0V(VAP_VTE_CNTL + , VAP_VTE_CNTL__VPORT_X_SCALE_ENA(1) + | VAP_VTE_CNTL__VPORT_X_OFFSET_ENA(1) + | VAP_VTE_CNTL__VPORT_Y_SCALE_ENA(1) + | VAP_VTE_CNTL__VPORT_Y_OFFSET_ENA(1) + | VAP_VTE_CNTL__VPORT_Z_SCALE_ENA(1) + | VAP_VTE_CNTL__VPORT_Z_OFFSET_ENA(1) + | VAP_VTE_CNTL__VTX_XY_FMT(0) + | VAP_VTE_CNTL__VTX_Z_FMT(0) + | VAP_VTE_CNTL__VTX_W0_FMT(1) + | VAP_VTE_CNTL__SERIAL_PROC_ENA(0) + ); + + T0V(VAP_VF_MAX_VTX_INDX + , 
VAP_VF_MAX_VTX_INDX__MAX_INDX(5) + ); + T0V(VAP_VF_MIN_VTX_INDX + , VAP_VF_MIN_VTX_INDX__MIN_INDX(0) + ); + T0V(VAP_VTX_SIZE + , VAP_VTX_SIZE__DWORDS_PER_VTX(5) + ); + + T0V(VAP_PROG_STREAM_CNTL_0 + , VAP_PROG_STREAM_CNTL__DATA_TYPE_0__FLOAT_3 + | VAP_PROG_STREAM_CNTL__SKIP_DWORDS_0(0) + | VAP_PROG_STREAM_CNTL__DST_VEC_LOC_0(0) + | VAP_PROG_STREAM_CNTL__LAST_VEC_0(0) + | VAP_PROG_STREAM_CNTL__DATA_TYPE_1__FLOAT_2 + | VAP_PROG_STREAM_CNTL__SKIP_DWORDS_1(0) + | VAP_PROG_STREAM_CNTL__DST_VEC_LOC_1(1) + | VAP_PROG_STREAM_CNTL__LAST_VEC_1(1) + ); + T0V(VAP_PROG_STREAM_CNTL_EXT_0 + , VAP_PROG_STREAM_CNTL_EXT__SWIZZLE_SELECT_X_0__SELECT_X + | VAP_PROG_STREAM_CNTL_EXT__SWIZZLE_SELECT_Y_0__SELECT_Y + | VAP_PROG_STREAM_CNTL_EXT__SWIZZLE_SELECT_Z_0__SELECT_Z + | VAP_PROG_STREAM_CNTL_EXT__SWIZZLE_SELECT_W_0__SELECT_FP_ONE + | VAP_PROG_STREAM_CNTL_EXT__WRITE_ENA_0(0b1111) // XYZW + | VAP_PROG_STREAM_CNTL_EXT__SWIZZLE_SELECT_X_1__SELECT_X + | VAP_PROG_STREAM_CNTL_EXT__SWIZZLE_SELECT_Y_1__SELECT_Y + | VAP_PROG_STREAM_CNTL_EXT__SWIZZLE_SELECT_Z_1__SELECT_FP_ZERO + | VAP_PROG_STREAM_CNTL_EXT__SWIZZLE_SELECT_W_1__SELECT_FP_ONE + | VAP_PROG_STREAM_CNTL_EXT__WRITE_ENA_1(0b1111) // XYZW + ); + + T0V(VAP_VSM_VTX_ASSM + , 0x00000401); // undocumented + T0V(VAP_OUT_VTX_FMT_0 + , VAP_OUT_VTX_FMT_0__VTX_POS_PRESENT(1)); + T0V(VAP_OUT_VTX_FMT_1 + , VAP_OUT_VTX_FMT_1__TEX_0_COMP_CNT(4)); + + ////////////////////////////////////////////////////////////////////////////// + // VAP_PVS + ////////////////////////////////////////////////////////////////////////////// + + const uint32_t vertex_shader[] = { + #include "texture.vs.inc" + }; + const int vertex_shader_length = (sizeof (vertex_shader)) / (sizeof (vertex_shader[0])); + printf("vs length %d\n", vertex_shader_length); + assert(vertex_shader_length % 4 == 0); + const int vertex_shader_instructions = vertex_shader_length / 4; + printf("vs instructions %d\n", vertex_shader_instructions); + + T0V(VAP_PVS_CODE_CNTL_0 + , 
VAP_PVS_CODE_CNTL_0__PVS_FIRST_INST(0) + | VAP_PVS_CODE_CNTL_0__PVS_XYZW_VALID_INST((vertex_shader_instructions - 1)) + | VAP_PVS_CODE_CNTL_0__PVS_LAST_INST((vertex_shader_instructions - 1)) + ); + T0V(VAP_PVS_CODE_CNTL_1 + , VAP_PVS_CODE_CNTL_1__PVS_LAST_VTX_SRC_INST((vertex_shader_instructions - 1)) + ); + + T0V(VAP_PVS_VECTOR_INDX_REG + , VAP_PVS_VECTOR_INDX_REG__OCTWORD_OFFSET(0) + ); + T0_ONE_REG(VAP_PVS_VECTOR_DATA_REG_128, vertex_shader_length - 1); + for (int i = 0; i < vertex_shader_length; i++) { + ib[ix++].u32 = vertex_shader[i]; + } + + ////////////////////////////////////////////////////////////////////////////// + // RS + ////////////////////////////////////////////////////////////////////////////// + + T0V(RS_IP_0 + , RS_IP__TEX_PTR_S(0) + | RS_IP__TEX_PTR_T(1) + | RS_IP__TEX_PTR_R(2) + | RS_IP__TEX_PTR_Q(3) + | RS_IP__COL_PTR(0) + | RS_IP__COL_FMT(0) + | RS_IP__OFFSET_EN(0) + ); + T0V(RS_COUNT + , RS_COUNT__IT_COUNT(4) + | RS_COUNT__IC_COUNT(0) + | RS_COUNT__W_ADDR(0) + | RS_COUNT__HIRES_EN(1) + ); + T0V(RS_INST_COUNT, 0x00000000); + T0V(RS_INST_0 + , RS_INST__TEX_ID(0) + | RS_INST__TEX_CN(1) + | RS_INST__TEX_ADDR(0) + ); + + ////////////////////////////////////////////////////////////////////////////// + // TX + ////////////////////////////////////////////////////////////////////////////// + + T0V(TX_INVALTAGS, 0x00000000); + + T0V(TX_ENABLE + , TX_ENABLE__TEX_0_ENABLE__ENABLE); + T0V(TX_FILTER0_0 + //, TX_FILTER0__CLAMP_S(2) // clamp to (0.0, 1.0) + //| TX_FILTER0__CLAMP_T(2) // clamp to (0.0, 1.0) + , TX_FILTER0__MAG_FILTER__POINT + | TX_FILTER0__MIN_FILTER__POINT + ); + T0V(TX_FILTER1_0 + , TX_FILTER1__LOD_BIAS(1) + ); + T0V(TX_BORDER_COLOR_0, 0); + T0V(TX_FORMAT0_0 + , TX_FORMAT0__TXWIDTH(128 - 1) + | TX_FORMAT0__TXHEIGHT(128 - 1) + ); + + T0V(TX_FORMAT1_0 + , TX_FORMAT1__TXFORMAT__TX_FMT_8_8_8_8 + | TX_FORMAT1__SEL_ALPHA(5) + | TX_FORMAT1__SEL_RED(0) + | TX_FORMAT1__SEL_GREEN(1) + | TX_FORMAT1__SEL_BLUE(2) + | TX_FORMAT1__TEX_COORD_TYPE__2D + 
); + T0V(TX_FORMAT2_0, 0); + + T0V(TX_OFFSET_0 + //, TX_OFFSET__MACRO_TILE(1) + //| TX_OFFSET__MICRO_TILE(1) + , 0 + ); + + T3(_NOP, 0); + ib[ix++].u32 = texturebuffer_reloc_ix * 4; // index into relocs array + + ////////////////////////////////////////////////////////////////////////////// + // GA_US + ////////////////////////////////////////////////////////////////////////////// + + const uint32_t fragment_shader0[] = { + #include "texture_blur_horizontal.fs.inc" + }; + const uint32_t fragment_shader1[] = { + #include "texture_blur_vertical.fs.inc" + }; + const int fragment_shader0_length = (sizeof (fragment_shader0)) / (sizeof (fragment_shader0[0])); + const int fragment_shader1_length = (sizeof (fragment_shader1)) / (sizeof (fragment_shader1[0])); + assert(fragment_shader0_length % 6 == 0); + assert(fragment_shader1_length % 6 == 0); + const int fragment_shader0_instructions = fragment_shader0_length / 6; + const int fragment_shader1_instructions = fragment_shader1_length / 6; // derived from shader1's own dword count + + struct shader { + const uint32_t * buf; + int instructions; + int start; + }; + const struct shader shaders[] = { + { + .buf = fragment_shader0, + .instructions = fragment_shader0_instructions, + .start = 0, + }, + { + .buf = fragment_shader1, + .instructions = fragment_shader1_instructions, + .start = fragment_shader0_instructions, + } + }; + int shaders_length = (sizeof (shaders)) / (sizeof (shaders[0])); + + int fragment_shader_total_length = 0; + for (int i = 0; i < shaders_length; i++) { + printf("fs[%d] offset=%d instructions=%d\n", i, fragment_shader_total_length, shaders[i].instructions); + fragment_shader_total_length += shaders[i].instructions * 6; + } + printf("fs total=%d\n", fragment_shader_total_length); + T0V(GA_US_VECTOR_INDEX, 0x00000000); + T0_ONE_REG(GA_US_VECTOR_DATA, fragment_shader_total_length - 1); + for (int j = 0; j < shaders_length; j++) { + for (int i = 0; i < shaders[j].instructions * 6; i++) { + ib[ix++].u32 = shaders[j].buf[i]; + } + } + + const float
fragment_consts[] = { + -1.0f / 128.f, 1.0f / 128.f, -2.0f / 128.f, 2.0f / 128.f, + -3.0f / 128.f, 3.0f / 128.f, 0.0f, 0.0f, + 0.24609375, 0.205078125, 0.1171875, 0.0439453125, + }; + int fragment_consts_length = (sizeof (fragment_consts)) / (sizeof (fragment_consts[0])); + T0V(GA_US_VECTOR_INDEX + , GA_US_VECTOR_INDEX__INDEX(0) + | GA_US_VECTOR_INDEX__TYPE(1) + ); + T0_ONE_REG(GA_US_VECTOR_DATA, (fragment_consts_length - 1)); + for (int i = 0; i < fragment_consts_length; i++) + ib[ix++].f32 = fragment_consts[i]; + + + // program selection + + assert(shader_ix >= 0 && shader_ix < shaders_length); + printf("fs shader_ix %d\n", shader_ix); + + T0V(US_CODE_RANGE + , US_CODE_RANGE__CODE_ADDR(shaders[shader_ix].start) // absolute + | US_CODE_RANGE__CODE_SIZE(shaders[shader_ix].instructions - 1) // relative to CODE_ADDR + ); + T0V(US_CODE_OFFSET + , US_CODE_OFFSET__OFFSET_ADDR(shaders[shader_ix].start) // absolute + ); + T0V(US_CODE_ADDR + , US_CODE_ADDR__START_ADDR(0) // relative to OFFSET_ADDR + | US_CODE_ADDR__END_ADDR(shaders[shader_ix].instructions - 1) // relative to OFFSET_ADDR + ); + + ////////////////////////////////////////////////////////////////////////////// + // 3D_DRAW + ////////////////////////////////////////////////////////////////////////////// + + const float vertices[] = { + 1.0, 1.0, 0.0, 1.0, 0.0, + 1.0, -1.0, 0.0, 1.0, 1.0, + -1.0, -1.0, 0.0, 0.0, 1.0, + -1.0, 1.0, 0.0, 0.0, 0.0, + 1.0, 1.0, 0.0, 1.0, 0.0, + -1.0, -1.0, 0.0, 0.0, 1.0, + }; + const int vertices_length = (sizeof (vertices)) / (sizeof (vertices[0])); + printf("vtx length %d\n", vertices_length); + T3(_3D_DRAW_IMMD_2, (1 + vertices_length) - 1); + ib[ix++].u32 + = VAP_VF_CNTL__PRIM_TYPE(4) + | VAP_VF_CNTL__PRIM_WALK(3) + | VAP_VF_CNTL__INDEX_SIZE(0) + | VAP_VF_CNTL__VTX_REUSE_DIS(0) + | VAP_VF_CNTL__DUAL_INDEX_MODE(0) + | VAP_VF_CNTL__USE_ALT_NUM_VERTS(0) + | VAP_VF_CNTL__NUM_VERTICES(6) + ; + for (int i = 0; i < vertices_length; i++) { + ib[ix++].f32 = vertices[i]; + } + + 
////////////////////////////////////////////////////////////////////////////// + // padding + ////////////////////////////////////////////////////////////////////////////// + + while ((ix % 8) != 0) { + ib[ix++].u32 = 0x80000000; + } + + return ix; +} + +int create_colorbuffer(int fd, int colorbuffer_size, void ** out_ptr) +{ + int ret; + + struct drm_radeon_gem_create args = { + .size = colorbuffer_size, + .alignment = 4096, + .handle = 0, + .initial_domain = 4, // RADEON_GEM_DOMAIN_VRAM + .flags = 4 + }; + + ret = drmCommandWriteRead(fd, DRM_RADEON_GEM_CREATE, &args, (sizeof (struct drm_radeon_gem_create))); + if (ret != 0) { + perror("drmCommandWriteRead(DRM_RADEON_GEM_CREATE)"); + } + assert(args.handle != 0); + + struct drm_radeon_gem_mmap mmap_args = { + .handle = args.handle, + .offset = 0, + .size = colorbuffer_size, + }; + ret = drmCommandWriteRead(fd, DRM_RADEON_GEM_MMAP, &mmap_args, (sizeof (struct drm_radeon_gem_mmap))); + if (ret != 0) { + perror("drmCommandWriteRead(DRM_RADEON_GEM_MMAP)"); + } + + void * ptr = mmap(0, + colorbuffer_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + fd, + mmap_args.addr_ptr); + assert(ptr != MAP_FAILED); + + // clear colorbuffer + for (int i = 0; i < colorbuffer_size / 4; i++) { + ((uint32_t*)ptr)[i] = 0x00000000; + } + asm volatile ("" ::: "memory"); + + if (out_ptr != NULL) { + *out_ptr = ptr; + } else { + munmap(ptr, colorbuffer_size); + } + + return args.handle; +} + +int main() +{ + int ret; + int fd = open("/dev/dri/card0", O_RDWR | O_CLOEXEC); + + const int texture_size = 128 * 128 * 4; + const int colorbuffer_size = 1600 * 1200 * 4; + int intermediate_handle[2]; + int colorbuffer_handle; + int texturebuffer_handle; + void * texturebuffer_ptr; + void * colorbuffer_ptr; + int flush_handle; + + // colorbuffer + colorbuffer_handle = create_colorbuffer(fd, colorbuffer_size, &colorbuffer_ptr); + texturebuffer_handle = create_colorbuffer(fd, texture_size, &texturebuffer_ptr); + intermediate_handle[0] = 
create_colorbuffer(fd, texture_size, NULL); + intermediate_handle[1] = create_colorbuffer(fd, texture_size, NULL); + + { + void * texture_buf = read_file("../texture/butterfly_128x128_argb8888.data"); + assert(texture_buf != NULL); + for (int i = 0; i < texture_size / 4; i++) { + ((uint32_t*)texturebuffer_ptr)[i] = ((uint32_t*)texture_buf)[i]; + } + asm volatile ("" ::: "memory"); + munmap(texturebuffer_ptr, texture_size); + free(texture_buf); + } + + // flush + { + struct drm_radeon_gem_create args = { + .size = 4096, + .alignment = 4096, + .handle = 0, + .initial_domain = 2, // GTT + .flags = 0 + }; + + ret = drmCommandWriteRead(fd, DRM_RADEON_GEM_CREATE, + &args, (sizeof (args))); + if (ret != 0) { + perror("drmCommandWriteRead(DRM_RADEON_GEM_CREATE)"); + } + assert(args.handle != 0); + flush_handle = args.handle; + } + + fprintf(stderr, "colorbuffer handle %d\n", colorbuffer_handle); + + struct drm_radeon_cs_reloc relocs[] = { + { + .handle = colorbuffer_handle, + .read_domains = 4, // RADEON_GEM_DOMAIN_VRAM + .write_domain = 4, // RADEON_GEM_DOMAIN_VRAM + .flags = 8, + }, + { + .handle = texturebuffer_handle, + .read_domains = 4, // RADEON_GEM_DOMAIN_VRAM + .write_domain = 4, // RADEON_GEM_DOMAIN_VRAM + .flags = 8, + }, + { + .handle = intermediate_handle[0], + .read_domains = 4, // RADEON_GEM_DOMAIN_VRAM + .write_domain = 4, // RADEON_GEM_DOMAIN_VRAM + .flags = 8, + }, + { + .handle = intermediate_handle[1], + .read_domains = 4, // RADEON_GEM_DOMAIN_VRAM + .write_domain = 4, // RADEON_GEM_DOMAIN_VRAM + .flags = 8, + }, + { + .handle = flush_handle, + .read_domains = 2, // RADEON_GEM_DOMAIN_GTT + .write_domain = 2, // RADEON_GEM_DOMAIN_GTT + .flags = 0, + } + }; + + uint32_t flags[2] = { + 5, // RADEON_CS_KEEP_TILING_FLAGS | RADEON_CS_END_OF_FRAME + 0, // RADEON_CS_RING_GFX + }; + + int ib_dwords = 0; + { + int texturebuffer_reloc_ix = 1; + int colorbuffer_reloc_ix = 2; + int shader_ix = 0; + ib_dwords = indirect_buffer(ib_dwords, + 128, 128, + 
colorbuffer_reloc_ix, + texturebuffer_reloc_ix, + shader_ix, + true); + } + for (int i = 0; i < 0; i++) { + { + int texturebuffer_reloc_ix = 2; + int colorbuffer_reloc_ix = 3; + int shader_ix = 1; + ib_dwords = indirect_buffer(ib_dwords, + 128, 128, + colorbuffer_reloc_ix, + texturebuffer_reloc_ix, + shader_ix, + true); + } + { + int texturebuffer_reloc_ix = 3; + int colorbuffer_reloc_ix = 2; + int shader_ix = 0; + ib_dwords = indirect_buffer(ib_dwords, + 128, 128, + colorbuffer_reloc_ix, + texturebuffer_reloc_ix, + shader_ix, + true); + } + } + { + int texturebuffer_reloc_ix = 2; + int colorbuffer_reloc_ix = 0; + int shader_ix = 1; + ib_dwords = indirect_buffer(ib_dwords, + 1600, 1200, + colorbuffer_reloc_ix, + texturebuffer_reloc_ix, + shader_ix, + false); + } + + struct drm_radeon_cs_chunk chunks[3] = { + { + .chunk_id = RADEON_CHUNK_ID_IB, + .length_dw = ib_dwords, + .chunk_data = (uint64_t)(uintptr_t)ib, + }, + { + .chunk_id = RADEON_CHUNK_ID_RELOCS, + .length_dw = (sizeof (relocs)) / (sizeof (uint32_t)), + .chunk_data = (uint64_t)(uintptr_t)relocs, + }, + { + .chunk_id = RADEON_CHUNK_ID_FLAGS, + .length_dw = (sizeof (flags)) / (sizeof (uint32_t)), + .chunk_data = (uint64_t)(uintptr_t)&flags, + }, + }; + + uint64_t chunks_array[3] = { + (uint64_t)(uintptr_t)&chunks[0], + (uint64_t)(uintptr_t)&chunks[1], + (uint64_t)(uintptr_t)&chunks[2], + }; + + struct drm_radeon_cs cs = { + .num_chunks = 3, + .cs_id = 0, + .chunks = (uint64_t)(uintptr_t)chunks_array, + .gart_limit = 0, + .vram_limit = 0, + }; + + ret = drmCommandWriteRead(fd, DRM_RADEON_CS, &cs, (sizeof (struct drm_radeon_cs))); + if (ret != 0) { + perror("drmCommandWriteRead(DRM_RADEON_CS)"); + } + + struct drm_radeon_gem_wait_idle args = { + .handle = flush_handle + }; + while (drmCommandWrite(fd, DRM_RADEON_GEM_WAIT_IDLE, &args, (sizeof (struct drm_radeon_gem_wait_idle))) == -EBUSY); + + int out_fd = open("colorbuffer.data", O_RDWR|O_CREAT, 0644); // O_CREAT requires an explicit mode argument + assert(out_fd >= 0); + ssize_t write_length = write(out_fd,
colorbuffer_ptr, colorbuffer_size); + assert(write_length == colorbuffer_size); + close(out_fd); + + int mm_fd = open("/sys/kernel/debug/radeon_vram_mm", O_RDONLY); + assert(mm_fd >= 0); + char buf[4096]; + while (true) { + ssize_t read_length = read(mm_fd, buf, 4096); + assert(read_length >= 0); + write(STDOUT_FILENO, buf, read_length); + if (read_length < 4096) { + break; + } + } + close(mm_fd); + + munmap(colorbuffer_ptr, colorbuffer_size); + + close(fd); +} diff --git a/drm/texture_blur.fs.asm b/drm/texture_blur_horizontal.fs.asm similarity index 100% rename from drm/texture_blur.fs.asm rename to drm/texture_blur_horizontal.fs.asm diff --git a/drm/texture_blur.fs.inc b/drm/texture_blur_horizontal.fs.inc similarity index 90% rename from drm/texture_blur.fs.inc rename to drm/texture_blur_horizontal.fs.inc index 9d2f610..dd4f59d 100644 --- a/drm/texture_blur.fs.inc +++ b/drm/texture_blur_horizontal.fs.inc @@ -1,31 +1,24 @@ 0x00007800, 0x08040000, -0x08040080, -0x00920020, -0x00804010, +0x08020080, +0x00db0020, +0x00c04010, 0x22181010, 0x00007800, 0x08040000, -0x08040080, -0x00920020, -0x00804020, +0x08020080, +0x00db0020, +0x00c04020, 0x22389020, 0x00007800, 0x08040400, -0x08040480, -0x00920020, -0x00804030, +0x08020080, +0x00db0020, +0x00c04030, 0x22181030, -0x00007803, -0x00400000, -0xe400f400, -0x00000000, -0x00000000, -0x00000000, - 0x00007803, 0x00400000, 0xe404f401, @@ -61,15 +54,22 @@ 0x00000000, 0x00000000, -0x00007807, -0x02400000, +0x00007803, +0x00400000, 0xe409fe03, 0x00000000, 0x00000000, 0x00000000, +0x00007807, +0x02400000, +0xe400f400, +0x00000000, +0x00000000, +0x00000000, + 0x00003804, -0x08040800, +0x00040800, 0x08020080, 0x00002220, 0x00000000, diff --git a/drm/texture_blur_vertical.fs.asm b/drm/texture_blur_vertical.fs.asm new file mode 100644 index 0000000..645a6f0 --- /dev/null +++ b/drm/texture_blur_vertical.fs.asm @@ -0,0 +1,94 @@ +-- CONST[0] { -1/D, 1/D, -2/D, 2/D } +-- CONST[1] { -3/D, 3/D, _, _ } +-- CONST[2] { 0.2460, 0.2050, 0.1171, 
0.0439 } + +-- uv1 = vec4(vec2(uv0.x, uv0.y + const[0].x), +-- vec2(uv0.x, uv0.y + const[0].y)) +src0.rgb = temp[0] , +src1.rgb = const[0] : + temp[1].rgb = MAD src0.rgr src0.111 src1.0r0 , + temp[1].a = MAD src0.g src0.1 src1.g ; + +-- uv2 = vec4(vec2(uv0.x, uv0.y + const[0].z), +-- vec2(uv0.x, uv0.y + const[0].w)) +src0.rgb = temp[0] , +src1.rgb = const[0] : + temp[2].rgb = MAD src0.rgr src0.111 src1.0b0 , + temp[2].a = MAD src0.g src0.1 src1.a ; + +-- uv3 = vec4(vec2(uv0.x, uv0.y + const[1].x), +-- vec2(uv0.x, uv0.y + const[1].y)) +src0.rgb = temp[0] , +src1.rgb = const[1] : + temp[3].rgb = MAD src0.rgr src0.111 src1.0r0 , + temp[3].a = MAD src0.g src0.1 src1.g ; + +-- s1n = texture2D(tex, uv1n) +-- s1p = texture2D(tex, uv1p) +TEX + temp[4].rgba = LD tex[0].rgba temp[1].rgaa ; +TEX + temp[5].rgba = LD tex[0].rgba temp[1].baaa ; + +-- s2n = texture2D(tex, uv2n) +-- s2p = texture2D(tex, uv2p) +TEX + temp[6].rgba = LD tex[0].rgba temp[2].rgaa ; +TEX + temp[7].rgba = LD tex[0].rgba temp[2].baaa ; + +-- s3n = texture2D(tex, uv3n) +-- s3p = texture2D(tex, uv3p) +TEX + temp[8].rgba = LD tex[0].rgba temp[3].rgaa ; +TEX + temp[9].rgba = LD tex[0].rgba temp[3].baaa ; + +-- s0 = texture2D(tex, uv0) +TEX TEX_SEM_ACQUIRE TEX_SEM_WAIT + temp[0].rgba = LD tex[0].rgba temp[0].rgaa ; + +-- col = s0 * weight[0] + 0 +TEX_SEM_WAIT +src0.rgb = temp[0] , +src1.rgb = const[2] , +src2.rgb = temp[0] : + temp[0].rgb = MAD src0.rgb src1.rrr src2.000 ; + +-- col = s1n * weight[1] + col +src0.rgb = temp[4] , +src1.rgb = const[2] , +src2.rgb = temp[0] : + temp[0].rgb = MAD src0.rgb src1.ggg src2.rgb ; + +-- col = s1p * weight[1] + col +src0.rgb = temp[5] , +src1.rgb = const[2] , +src2.rgb = temp[0] : + temp[0].rgb = MAD src0.rgb src1.ggg src2.rgb ; + +-- col = s2n * weight[2] + col +src0.rgb = temp[6] , +src1.rgb = const[2] , +src2.rgb = temp[0] : + temp[0].rgb = MAD src0.rgb src1.bbb src2.rgb ; + +-- col = s2p * weight[2] + col +src0.rgb = temp[7] , +src1.rgb = const[2] , +src2.rgb =
temp[0] : + temp[0].rgb = MAD src0.rgb src1.bbb src2.rgb ; + +-- col = s3n * weight[3] + col +src0.rgb = temp[8] , +src1.a = const[2] , +src2.rgb = temp[0] : + temp[0].rgb = MAD src0.rgb src1.aaa src2.rgb ; + +-- col = s3p * weight[3] + col +OUT TEX_SEM_WAIT +src0.rgb = temp[9] , +src1.a = const[2] , +src2.rgb = temp[0] : + out[0].rgb = MAD src0.rgb src1.aaa src2.rgb , + out[0].a = MAD src0.0 src0.0 src0.1 ; diff --git a/conv/bgra_to_rgba.py b/tools/bgra_to_rgba.py similarity index 67% rename from conv/bgra_to_rgba.py rename to tools/bgra_to_rgba.py index 9010b6b..ceac35a 100644 --- a/conv/bgra_to_rgba.py +++ b/tools/bgra_to_rgba.py @@ -1,4 +1,5 @@ import sys +from PIL import Image with open(sys.argv[1], 'rb') as f: buf = f.read() @@ -16,5 +17,7 @@ for i in range(len(buf) // 4): out[i * 4 + 2] = b out[i * 4 + 3] = a -with open(sys.argv[2], 'wb') as f: - f.write(out) +im = Image.frombuffer("RGBA", (1600, 1200), out) +im.save(sys.argv[2]) +#with open(sys.argv[2], 'wb') as f: +# f.write(out)