842 lines
30 KiB
C
842 lines
30 KiB
C
#include <stdint.h>
|
|
|
|
/*
|
|
This demo does not work in emulators:
|
|
|
|
- Flycast does not work because it strangely is incapable of displaying a
|
|
single rendered frame.
|
|
|
|
- Devcast does not work because it does not perform (the equivalent of) boot
|
|
rom initialization when loading .elf files
|
|
|
|
In an attempt to reduce boilerplate, this demo presumes the boot rom has
|
|
initialized Holly with the values needed to display the "PRODUCED BY OR UNDER
|
|
LICENSE FROM SEGA ENTERPRISES, LTD." screen, and that no register values have
|
|
been modified beyond boot rom initialization.
|
|
*/
|
|
|
|
/* Texture memory access
|
|
|
|
texture_memory64 and texture_memory32 refer to two different addressing schemes
|
|
over the same 8MB of physical texture memory.
|
|
|
|
Generally speaking the texture_memory64 address scheme is used for textures
|
|
(any texture memory address referenced by `texture_control_word`), and
|
|
texture_memory32 is used for everything else.
|
|
|
|
E_DC_HW_outline.pdf "2.4 System memory mapping" (PDF page 10)
|
|
*/
|
|
/* Base address of the "64-bit" addressing scheme over texture memory. */
const uint32_t texture_memory64 = 0xa4000000;
/* Base address of the "32-bit" addressing scheme over texture memory. */
const uint32_t texture_memory32 = 0xa5000000;

/* The "TA Polygon Converter FIFO" is a Holly functional unit. */
const uint32_t ta_polygon_converter_fifo = 0x10000000;

/* The "Store Queue" is a SH4 functional unit. */
const uint32_t store_queue = 0xe0000000;
|
|
|
|
/******************************************************************************
|
|
Region array
|
|
******************************************************************************/
|
|
|
|
/*
|
|
These "region array entries" are briefly illustrated in DCDBSysArc990907E.pdf
|
|
page 168, 177-180.
|
|
|
|
The number of list pointers per region array entry is affected by
|
|
FPU_PARAM_CFG "Region Header Type" (page 368). This struct models the
|
|
"6 × 32bit/Tile Type 2" mode.
|
|
*/
|
|
/*
  One region array entry: a tile word followed by five list pointers
  (six 32-bit words in total).
*/
typedef struct region_array_entry {
  /* tile position and flags; built from the REGION_ARRAY__TILE__* macros */
  uint32_t tile;
  /* one pointer per list type; built from the REGION_ARRAY__LIST_POINTER__* macros */
  struct {
    uint32_t opaque;
    uint32_t opaque_modifier_volume;
    uint32_t translucent;
    uint32_t translucent_modifier_volume;
    uint32_t punch_through;
  } list_pointer;
} region_array_entry;

static_assert((sizeof (struct region_array_entry)) == 4 * 6,
              "region_array_entry must be exactly six 32-bit words");
|
|
|
|
/*
|
|
DCDBSysArc990907E.pdf page 216-217 describes the REGION_ARRAY__ bit fields:
|
|
*/
|
|
/*
  Bit 31 is expressed with an unsigned literal: `1 << 31` overflows a 32-bit
  signed int, which is undefined behavior in C.
*/
#define REGION_ARRAY__TILE__LAST_REGION (1u << 31)
#define REGION_ARRAY__TILE__Y_POSITION(n) (((n) & 0x3f) << 8)
#define REGION_ARRAY__TILE__X_POSITION(n) (((n) & 0x3f) << 2)

#define REGION_ARRAY__LIST_POINTER__EMPTY (1u << 31)
#define REGION_ARRAY__LIST_POINTER__OBJECT_LIST(n) (((n) & 0xfffffc) << 0)
|
|
|
|
void transfer_region_array(uint32_t region_array_start,
|
|
uint32_t opaque_list_pointer)
|
|
{
|
|
/*
|
|
Create a minimal region array with a single entry:
|
|
- one tile at tile coordinate (0, 0) with one opaque list pointer
|
|
*/
|
|
|
|
/*
|
|
Holly reads the region array from "32-bit" texture memory address space,
|
|
so the region array is correspondingly written from "32-bit" address space.
|
|
*/
|
|
volatile region_array_entry * region_array = (volatile region_array_entry *)(texture_memory32 + region_array_start);
|
|
|
|
const int num_tiles_x = 640 / 32;
|
|
const int num_tiles_y = 480 / 32;
|
|
const int num_tiles = num_tiles_x * num_tiles_y;
|
|
|
|
for (int i = 0; i < num_tiles; i++) {
|
|
/* define one region array entry per 32×32 px tile over a 640x480 px area */
|
|
|
|
int x = i % num_tiles_x;
|
|
int y = i / num_tiles_x;
|
|
|
|
bool last_tile = (i == (num_tiles - 1));
|
|
|
|
region_array[i].tile
|
|
= (last_tile ? REGION_ARRAY__TILE__LAST_REGION : 0)
|
|
| REGION_ARRAY__TILE__Y_POSITION(y)
|
|
| REGION_ARRAY__TILE__X_POSITION(x);
|
|
|
|
/*
|
|
list pointers are offsets relative to the beginning of "32-bit" texture memory.
|
|
|
|
Each list type uses different rasterization steps, "opaque" being the fastest and most efficient.
|
|
*/
|
|
|
|
/*
|
|
In all previous demos, a single `opaque_list_pointer` was used for all
|
|
tiles. This was correct in the cases where:
|
|
|
|
- we were generating our own object lists
|
|
- we were using the TA with a single tile
|
|
|
|
However, this is no longer correct for this example, where we are both
|
|
using the TA and multiple tiles simultaneously. In this case, the TA's
|
|
"object pointer block" allocation strategy needs to implemented here.
|
|
|
|
See DCDBSysArc990907E.pdf page 178,179 and 186 for a relatively weak
|
|
explanation of the TA's OPB allocation behavior.
|
|
*/
|
|
|
|
|
|
// This example is using TA_ALLOC_CTRL__O_OPB__8X4BYTE, so each OPB is 8×4
|
|
// bytes. They are (un)coincidentally stored in the same order that the
|
|
// tile x position and tile y position coordinates are calculated above.
|
|
int opb_pointer = opaque_list_pointer + i * 8 * 4;
|
|
region_array[i].list_pointer.opaque = REGION_ARRAY__LIST_POINTER__OBJECT_LIST(opb_pointer);
|
|
region_array[i].list_pointer.opaque_modifier_volume = REGION_ARRAY__LIST_POINTER__EMPTY;
|
|
region_array[i].list_pointer.translucent = REGION_ARRAY__LIST_POINTER__EMPTY;
|
|
region_array[i].list_pointer.translucent_modifier_volume = REGION_ARRAY__LIST_POINTER__EMPTY;
|
|
region_array[i].list_pointer.punch_through = REGION_ARRAY__LIST_POINTER__EMPTY;
|
|
}
|
|
}
|
|
|
|
/******************************************************************************
|
|
ISP/TSP Parameter
|
|
******************************************************************************/
|
|
|
|
/*
|
|
Other examples of possible ISP/TSP parameter formats are shown on
|
|
DCDBSysArc990907E.pdf page 221. Page 221 is non-exhaustive, and many
|
|
permutations are possible.
|
|
|
|
Parameter format selection is controlled mostly by the value of the
|
|
`isp_tsp_instruction_word` (always present).
|
|
|
|
This is most similar to the "2 Stripped Triangle Polygon (Non-Textured,
|
|
Gouraud)" example (except this is for a non-strip triangle).
|
|
*/
|
|
/* One ISP/TSP vertex: position followed by a 32-bit color word. */
typedef struct isp_tsp_parameter__vertex {
  float x;
  float y;
  float z;
  uint32_t color;
} isp_tsp_parameter__vertex;

static_assert((sizeof (struct isp_tsp_parameter__vertex)) == 4 * 4,
              "isp_tsp_parameter__vertex must be exactly four 32-bit words");

/* One ISP/TSP triangle: three control words followed by three vertices. */
typedef struct isp_tsp_parameter__polygon {
  uint32_t isp_tsp_instruction_word;
  uint32_t tsp_instruction_word;
  uint32_t texture_control_word;
  isp_tsp_parameter__vertex a;
  isp_tsp_parameter__vertex b;
  isp_tsp_parameter__vertex c;
} isp_tsp_parameter__polygon;

static_assert((sizeof (struct isp_tsp_parameter__polygon)) == 4 * 3 + 4 * 4 * 3,
              "isp_tsp_parameter__polygon must be packed with no padding");
|
|
|
|
/*
|
|
isp_tsp_instruction_word bits
|
|
|
|
DCDBSysArc990907E.pdf page 222-225
|
|
*/
|
|
/*
  Unsigned literals are used because `7 << 29` and `4 << 29` do not fit in a
  32-bit signed int (undefined behavior in C).
*/
#define ISP_TSP_INSTRUCTION_WORD__DEPTH_COMPARE_MODE__ALWAYS (7u << 29)
#define ISP_TSP_INSTRUCTION_WORD__DEPTH_COMPARE_MODE__GREATER (4u << 29)

#define ISP_TSP_INSTRUCTION_WORD__CULLING_MODE__NO_CULLING (0u << 27)

#define ISP_TSP_INSTRUCTION_WORD__GOURAUD_SHADING (1u << 23)
|
|
|
|
/*
|
|
tsp_instruction_word bits
|
|
|
|
DCDBSysArc990907E.pdf page 226-232
|
|
*/
|
|
#define TSP_INSTRUCTION_WORD__SRC_ALPHA_INSTR__ONE (1 << 29)
#define TSP_INSTRUCTION_WORD__DST_ALPHA_INSTR__ZERO (0 << 26)
#define TSP_INSTRUCTION_WORD__FOG_CONTROL__NO_FOG (0b10 << 22)
#define TSP_INSTRUCTION_WORD__FILTER_MODE__POINT_SAMPLED (0b00 << 13)
#define TSP_INSTRUCTION_WORD__TEXTURE_SHADING_INSTRUCTION__DECAL (0 << 6)
#define TSP_INSTRUCTION_WORD__TEXTURE_U_SIZE__256 (5 << 3)
#define TSP_INSTRUCTION_WORD__TEXTURE_V_SIZE__256 (5 << 0)
|
|
|
|
void transfer_isp_tsp_background_parameter(uint32_t isp_tsp_parameter_start)
|
|
{
|
|
/*
|
|
Create a minimal background parameter:
|
|
- non-textured
|
|
- packed color
|
|
- single volume
|
|
*/
|
|
|
|
volatile isp_tsp_parameter__polygon * params = (volatile isp_tsp_parameter__polygon *)(texture_memory32 + isp_tsp_parameter_start);
|
|
|
|
params[0].isp_tsp_instruction_word = ISP_TSP_INSTRUCTION_WORD__DEPTH_COMPARE_MODE__ALWAYS
|
|
| ISP_TSP_INSTRUCTION_WORD__CULLING_MODE__NO_CULLING;
|
|
|
|
params[0].tsp_instruction_word = TSP_INSTRUCTION_WORD__SRC_ALPHA_INSTR__ONE
|
|
| TSP_INSTRUCTION_WORD__DST_ALPHA_INSTR__ZERO
|
|
| TSP_INSTRUCTION_WORD__FOG_CONTROL__NO_FOG;
|
|
|
|
params[0].texture_control_word = 0;
|
|
|
|
// top left
|
|
params[0].a.x = 0.0f;
|
|
params[0].a.y = 0.0f;
|
|
params[0].a.z = 0.00001f;
|
|
params[0].a.color = 0xff00ff; // magenta
|
|
|
|
// top right
|
|
params[0].b.x = 32.0f;
|
|
params[0].b.y = 0.0f;
|
|
params[0].b.z = 0.00001f;
|
|
params[0].b.color = 0xff00ff; // magenta
|
|
|
|
// bottom right
|
|
params[0].c.x = 32.0f;
|
|
params[0].c.y = 32.0f;
|
|
params[0].c.z = 0.00001f;
|
|
params[0].c.color = 0xff00ff; // magenta
|
|
|
|
// bottom left (implied)
|
|
}
|
|
|
|
/* background */
|
|
#define ISP_BACKGND_T__SKIP(n) (((n) & 0x7) << 24)
#define ISP_BACKGND_T__TAG_ADDRESS(n) (((n) & 0x1fffff) << 3)
#define ISP_BACKGND_T__TAG_OFFSET(n) (((n) & 0x7) << 0)
|
|
|
|
/******************************************************************************
|
|
SH4 store queue
|
|
******************************************************************************/
|
|
|
|
/*
|
|
The TA polygon converter FIFO requires 32-byte bus access. Attempts to access
|
|
the TA with smaller bus accesses will result in incorrect TA operation. The
|
|
Dreamcast has three mechanisms that can generate 32-byte writes:
|
|
|
|
- SH4 store queue (commonly used)
|
|
|
|
- Holly CH2-DMA (commonly used)
|
|
|
|
- meticulous and clever use of SH4 cache writeback (esoteric forbidden technique)
|
|
|
|
Of these, the mechanism that requires the least code is the SH4 store queue,
|
|
so this demo will also use the SH4 store queue for that reason.
|
|
|
|
The SH4 store queue is described in sh7091pm_e.pdf printed page 61-64 and
|
|
79-81.
|
|
*/
|
|
|
|
// sh7091pm_e.pdf:
|
|
// > Issuing a PREF instruction for P4 area H'E000 0000 to H'E3FF FFFC starts a
|
|
// > burst transfer from the SQs to external memory.
|
|
// Wrapped in do/while(0) so that `pref(x);` is a single statement (safe
// inside an unbraced if/else). `__asm__` is used because plain `asm` is not
// a keyword in strict ISO C modes.
#define pref(address) \
  do { __asm__ __volatile__ ("pref @%0" : : "r" (address) : "memory"); } while (0)
|
|
|
|
// SH4 cache controller (CCN) queue address control registers; QACR0 and
// QACR1 are written with the store queue destination area before use.
volatile uint32_t * SH7091__CCN__QACR0 = (volatile uint32_t *)(0xff000000 + 0x38);
volatile uint32_t * SH7091__CCN__QACR1 = (volatile uint32_t *)(0xff000000 + 0x3c);
|
|
|
|
/******************************************************************************
|
|
TA Parameters
|
|
******************************************************************************/
|
|
|
|
/*
|
|
The primary advantage of using the TA: it will generate object lists on your
|
|
behalf, and does a reasonable job of excluding object list entries from tiles
|
|
that are entirely outside the area of that triangle.
|
|
|
|
In addition, the TA can be used to perform floating point to integer color
|
|
packing, including color component clamping. On the SH4, each floating point
|
|
to integer color conversion requires at least 50-60 clock cycles, whereas the
|
|
TA can do the same conversion much more quickly (~1 clock cycle).
|
|
|
|
Floating point color is typical when performing (colored) lighting/shading
|
|
calculations.
|
|
*/
|
|
|
|
/*
|
|
TA parameters are roughly a superset of CORE ISP/TSP parameters.
|
|
|
|
There are a few differences:
|
|
|
|
- the TA overwrites certain ISP/TSP Instruction Word bits, based on duplicated
|
|
values in the TA Parameter Control Word (DCDBSysArc990907E.pdf page 200)
|
|
|
|
- the TA supports several (floating point) vertex color formats, whereas CORE
|
|
exclusively supports 32-bit packed integer ARGB color.
|
|
*/
|
|
|
|
/* TA global parameter, polygon type 0: one 32-byte FIFO transfer. */
typedef struct ta_global_parameter__polygon_type_0 {
  uint32_t parameter_control_word;
  uint32_t isp_tsp_instruction_word;
  uint32_t tsp_instruction_word;
  uint32_t texture_control_word;
  uint32_t _res0;
  uint32_t _res1;
  uint32_t data_size_for_sort_dma;
  uint32_t next_address_for_sort_dma;
} ta_global_parameter__polygon_type_0;

static_assert((sizeof (struct ta_global_parameter__polygon_type_0)) == 32,
              "TA parameters must be exactly 32 bytes");
|
|
|
|
/* TA "end of list" global parameter: a control word padded to 32 bytes. */
typedef struct ta_global_parameter__end_of_list {
  uint32_t parameter_control_word;
  uint32_t _res0;
  uint32_t _res1;
  uint32_t _res2;
  uint32_t _res3;
  uint32_t _res4;
  uint32_t _res5;
  uint32_t _res6;
} ta_global_parameter__end_of_list;

static_assert((sizeof (struct ta_global_parameter__end_of_list)) == 32,
              "TA parameters must be exactly 32 bytes");
|
|
|
|
/*
|
|
The TA only supports polygon/triangle vertex input represented as a triangle
|
|
strip. TA triangle strips can be any length between 1 and infinity (or the end
|
|
of texture memory, whichever comes first). CORE triangle strips can be any
|
|
length between 1 and 6. The TA automatically splits infinite-length strips
|
|
into strip lengths that CORE supports.
|
|
|
|
See DCDBSysArc990907E.pdf page 181.
|
|
*/
|
|
/*
  TA vertex parameter, polygon type 3: position, texture coordinates, and
  two 32-bit color words, in one 32-byte FIFO transfer.
*/
typedef struct ta_vertex_parameter__polygon_type_3 {
  uint32_t parameter_control_word;
  float x;
  float y;
  float z;
  float u;
  float v;
  uint32_t base_color;
  uint32_t offset_color;
} ta_vertex_parameter__polygon_type_3;

static_assert((sizeof (struct ta_vertex_parameter__polygon_type_3)) == 32,
              "TA parameters must be exactly 32 bytes");
|
|
|
|
/*
  Unsigned literals are used because `4 << 29` and `7 << 29` do not fit in a
  32-bit signed int (undefined behavior in C).
*/
#define PARAMETER_CONTROL_WORD__PARA_CONTROL__PARA_TYPE__END_OF_LIST (0u << 29)
#define PARAMETER_CONTROL_WORD__PARA_CONTROL__PARA_TYPE__POLYGON_OR_MODIFIER_VOLUME (4u << 29)
#define PARAMETER_CONTROL_WORD__PARA_CONTROL__PARA_TYPE__VERTEX_PARAMETER (7u << 29)
#define PARAMETER_CONTROL_WORD__PARA_CONTROL__END_OF_STRIP (1u << 28)
#define PARAMETER_CONTROL_WORD__PARA_CONTROL__LIST_TYPE__OPAQUE (0u << 24)
#define PARAMETER_CONTROL_WORD__OBJ_CONTROL__COL_TYPE__PACKED_COLOR (0u << 4)
#define PARAMETER_CONTROL_WORD__OBJ_CONTROL__TEXTURE (1u << 3)
#define PARAMETER_CONTROL_WORD__OBJ_CONTROL__GOURAUD (1u << 1)

#define TEXTURE_CONTROL_WORD__PIXEL_FORMAT__565 (1 << 27)
#define TEXTURE_CONTROL_WORD__SCAN_ORDER__NON_TWIDDLED (1 << 26)
#define TEXTURE_CONTROL_WORD__TEXTURE_ADDRESS(a) (((a) & 0x1fffff) << 0)
|
|
|
|
static inline uint32_t transfer_ta_global_end_of_list(uint32_t store_queue_ix)
|
|
{
|
|
//
|
|
// TA "end of list" global transfer
|
|
//
|
|
|
|
volatile ta_global_parameter__end_of_list * end_of_list = (volatile ta_global_parameter__end_of_list *)store_queue_ix;
|
|
|
|
end_of_list->parameter_control_word = PARAMETER_CONTROL_WORD__PARA_CONTROL__PARA_TYPE__END_OF_LIST;
|
|
|
|
// start store queue transfer of `end_of_list` to the TA
|
|
pref(store_queue_ix);
|
|
|
|
store_queue_ix += (sizeof (ta_global_parameter__end_of_list));
|
|
|
|
return store_queue_ix;
|
|
}
|
|
|
|
static inline uint32_t transfer_ta_vertex_triangle(uint32_t store_queue_ix,
|
|
float ax, float ay, float az, float au, float av, uint32_t ac,
|
|
float bx, float by, float bz, float bu, float bv, uint32_t bc,
|
|
float cx, float cy, float cz, float cu, float cv, uint32_t cc)
|
|
{
|
|
//
|
|
// TA polygon vertex transfer
|
|
//
|
|
|
|
volatile ta_vertex_parameter__polygon_type_3 * vertex = (volatile ta_vertex_parameter__polygon_type_3 *)store_queue_ix;
|
|
|
|
// bottom left
|
|
vertex[0].parameter_control_word = PARAMETER_CONTROL_WORD__PARA_CONTROL__PARA_TYPE__VERTEX_PARAMETER;
|
|
vertex[0].x = ax;
|
|
vertex[0].y = ay;
|
|
vertex[0].z = az;
|
|
vertex[0].u = au;
|
|
vertex[0].v = av;
|
|
vertex[0].base_color = ac;
|
|
vertex[0].offset_color = 0;
|
|
|
|
// start store queue transfer of `vertex[0]` to the TA
|
|
pref(store_queue_ix + 32 * 0);
|
|
|
|
// top center
|
|
vertex[1].parameter_control_word = PARAMETER_CONTROL_WORD__PARA_CONTROL__PARA_TYPE__VERTEX_PARAMETER;
|
|
vertex[1].x = bx;
|
|
vertex[1].y = by;
|
|
vertex[1].z = bz;
|
|
vertex[1].u = bu;
|
|
vertex[1].v = bv;
|
|
vertex[1].base_color = bc;
|
|
vertex[1].offset_color = 0;
|
|
|
|
// start store queue transfer of `vertex[1]` to the TA
|
|
pref(store_queue_ix + 32 * 1);
|
|
|
|
// bottom right
|
|
vertex[2].parameter_control_word = PARAMETER_CONTROL_WORD__PARA_CONTROL__PARA_TYPE__VERTEX_PARAMETER
|
|
| PARAMETER_CONTROL_WORD__PARA_CONTROL__END_OF_STRIP;
|
|
vertex[2].x = cx;
|
|
vertex[2].y = cy;
|
|
vertex[2].z = cz;
|
|
vertex[2].u = cu;
|
|
vertex[2].v = cv;
|
|
vertex[2].base_color = cc;
|
|
vertex[2].offset_color = 0;
|
|
|
|
// start store queue transfer of `params[2]` to the TA
|
|
pref(store_queue_ix + 32 * 2);
|
|
|
|
store_queue_ix += (sizeof (ta_vertex_parameter__polygon_type_3)) * 3;
|
|
|
|
return store_queue_ix;
|
|
}
|
|
|
|
static inline uint32_t transfer_ta_global_polygon(uint32_t store_queue_ix, uint32_t texture_address)
|
|
{
|
|
//
|
|
// TA polygon global transfer
|
|
//
|
|
|
|
volatile ta_global_parameter__polygon_type_0 * polygon = (volatile ta_global_parameter__polygon_type_0 *)store_queue_ix;
|
|
|
|
polygon->parameter_control_word = PARAMETER_CONTROL_WORD__PARA_CONTROL__PARA_TYPE__POLYGON_OR_MODIFIER_VOLUME
|
|
| PARAMETER_CONTROL_WORD__PARA_CONTROL__LIST_TYPE__OPAQUE
|
|
| PARAMETER_CONTROL_WORD__OBJ_CONTROL__COL_TYPE__PACKED_COLOR
|
|
| PARAMETER_CONTROL_WORD__OBJ_CONTROL__TEXTURE
|
|
| PARAMETER_CONTROL_WORD__OBJ_CONTROL__GOURAUD;
|
|
|
|
polygon->isp_tsp_instruction_word = ISP_TSP_INSTRUCTION_WORD__DEPTH_COMPARE_MODE__GREATER
|
|
| ISP_TSP_INSTRUCTION_WORD__CULLING_MODE__NO_CULLING;
|
|
// Note that it is not possible to use
|
|
// ISP_TSP_INSTRUCTION_WORD__GOURAUD_SHADING in this isp_tsp_instruction_word,
|
|
// because `gouraud` is one of the bits overwritten by the value in
|
|
// parameter_control_word. See DCDBSysArc990907E.pdf page 200.
|
|
|
|
polygon->tsp_instruction_word = TSP_INSTRUCTION_WORD__SRC_ALPHA_INSTR__ONE
|
|
| TSP_INSTRUCTION_WORD__DST_ALPHA_INSTR__ZERO
|
|
| TSP_INSTRUCTION_WORD__FOG_CONTROL__NO_FOG
|
|
| TSP_INSTRUCTION_WORD__FILTER_MODE__POINT_SAMPLED
|
|
| TSP_INSTRUCTION_WORD__TEXTURE_SHADING_INSTRUCTION__DECAL
|
|
| TSP_INSTRUCTION_WORD__TEXTURE_U_SIZE__256
|
|
| TSP_INSTRUCTION_WORD__TEXTURE_V_SIZE__256;
|
|
|
|
polygon->texture_control_word = TEXTURE_CONTROL_WORD__PIXEL_FORMAT__565
|
|
| TEXTURE_CONTROL_WORD__SCAN_ORDER__NON_TWIDDLED
|
|
| TEXTURE_CONTROL_WORD__TEXTURE_ADDRESS(texture_address / 8);
|
|
|
|
polygon->data_size_for_sort_dma = 0;
|
|
polygon->next_address_for_sort_dma = 0;
|
|
|
|
// start store queue transfer of `polygon` to the TA
|
|
pref(store_queue_ix);
|
|
|
|
store_queue_ix += (sizeof (ta_global_parameter__polygon_type_0));
|
|
|
|
return store_queue_ix;
|
|
}
|
|
|
|
/*
|
|
These vertex and face definitions are a trivial transformation of the default
|
|
Blender cube, as exported by the .obj exporter (with triangulation enabled).
|
|
*/
|
|
/* Three-component float vector (position). */
typedef struct vec3 {
  float x;
  float y;
  float z;
} vec3;
|
|
|
|
/* Two-component float vector (texture coordinate). */
typedef struct vec2 {
  float u;
  float v;
} vec2;
|
|
|
|
/* The eight cube corner positions; indexed by `position_texture.position`. */
static const vec3 cube_vertex_position[] = {
  {  1.0f,  1.0f, -1.0f },
  {  1.0f, -1.0f, -1.0f },
  {  1.0f,  1.0f,  1.0f },
  {  1.0f, -1.0f,  1.0f },
  { -1.0f,  1.0f, -1.0f },
  { -1.0f, -1.0f, -1.0f },
  { -1.0f,  1.0f,  1.0f },
  { -1.0f, -1.0f,  1.0f },
};
|
|
|
|
/* The four texture coordinates; indexed by `position_texture.texture`. */
static const vec2 cube_vertex_texture[] = {
  { 1.0f, 0.0f },
  { 0.0f, 1.0f },
  { 0.0f, 0.0f },
  { 1.0f, 1.0f },
};
|
|
|
|
/* A pair of indices: one into the position table, one into the texture table. */
typedef struct position_texture {
  int position;
  int texture;
} position_texture;

/* One triangle, described by three position/texture index pairs. */
typedef struct face {
  position_texture a;
  position_texture b;
  position_texture c;
} face;
|
|
|
|
/*
|
|
It is also possible to submit each cube face as a 4-vertex triangle strip, or
|
|
submit the entire cube as a single triangle strip.
|
|
|
|
Separate 3-vertex triangles are chosen to make this example more
|
|
straightforward, but this is not the best approach if high performance is
|
|
desired.
|
|
*/
|
|
/* Twelve triangles (two per cube side), as {position, texture} index pairs. */
static const face cube_faces[] = {
  {{4, 0}, {2, 1}, {0, 2}},
  {{2, 0}, {7, 1}, {3, 2}},
  {{6, 0}, {5, 1}, {7, 2}},
  {{1, 0}, {7, 1}, {5, 2}},
  {{0, 0}, {3, 1}, {1, 2}},
  {{4, 0}, {1, 1}, {5, 2}},
  {{4, 0}, {6, 3}, {2, 1}},
  {{2, 0}, {6, 3}, {7, 1}},
  {{6, 0}, {4, 3}, {5, 1}},
  {{1, 0}, {3, 3}, {7, 1}},
  {{0, 0}, {2, 3}, {3, 1}},
  {{4, 0}, {0, 3}, {1, 1}},
};
/* number of entries in `cube_faces` */
static const int cube_faces_length = (sizeof (cube_faces)) / (sizeof (cube_faces[0]));
|
|
|
|
// single-precision trigonometry via compiler builtins
#define cos(n) __builtin_cosf(n)
#define sin(n) __builtin_sinf(n)

// current rotation angle in radians; read by `vertex_rotate`
float theta = 0.7853981633974483f; // pi / 4
|
|
|
|
// Rotate `v` by the global angle `theta`, first in the x/z plane and then in
// the y/z plane, and return the rotated vector.
static inline vec3 vertex_rotate(vec3 v)
{
  // to make the cube's appearance more interesting, rotate the vertex on two
  // axes

  // both rotations use the same angle, so evaluate the trigonometry once
  const float c = cos(theta);
  const float s = sin(theta);

  float x0 = v.x;
  float y0 = v.y;
  float z0 = v.z;

  // first rotation: x/z plane
  float x1 = x0 * c - z0 * s;
  float y1 = y0;
  float z1 = x0 * s + z0 * c;

  // second rotation: y/z plane
  float x2 = x1;
  float y2 = y1 * c - z1 * s;
  float z2 = y1 * s + z1 * c;

  return (vec3){x2, y2, z2};
}
|
|
|
|
// Perspective divide: scale x and y by w = 1 / (z + 3) and return w in the z
// component. The +3 offset keeps the divisor positive for z > -3.
static inline vec3 vertex_perspective_divide(vec3 v)
{
  float w = 1.0f / (v.z + 3.0f);
  return (vec3){v.x * w, v.y * w, w};
}
|
|
|
|
// Scale x and y by 240 and translate them by (320, 240); z is passed
// through unchanged.
static inline vec3 vertex_screen_space(vec3 v)
{
  return (vec3){
    v.x * 240.f + 320.f,
    v.y * 240.f + 240.f,
    v.z,
  };
}
|
|
|
|
void transfer_ta_cube(uint32_t texture_address)
|
|
{
|
|
// set the store queue destination address to the TA Polygon Converter FIFO
|
|
*SH7091__CCN__QACR0 = ((ta_polygon_converter_fifo >> 24) & 0b11100);
|
|
*SH7091__CCN__QACR1 = ((ta_polygon_converter_fifo >> 24) & 0b11100);
|
|
|
|
uint32_t store_queue_ix = store_queue;
|
|
|
|
// See sh7091pm_e.pdf, printed page 79:
|
|
//
|
|
// > While the contents of one SQ are being transferred to external memory,
|
|
// > the other SQ can be written to without a penalty cycle, but writing to
|
|
// > the SQ involved in the transfer to external memory is deferred until the
|
|
// > transfer is completed.
|
|
//
|
|
// The reason for incrementing store_queue_ix is that it is a cheap way to
|
|
// track which store queue is the most/least recently used--encoded in bit 5
|
|
// of the store queue address.
|
|
|
|
store_queue_ix = transfer_ta_global_polygon(store_queue_ix, texture_address);
|
|
|
|
for (int face_ix = 0; face_ix < cube_faces_length; face_ix++) {
|
|
int ipa = cube_faces[face_ix].a.position;
|
|
int ipb = cube_faces[face_ix].b.position;
|
|
int ipc = cube_faces[face_ix].c.position;
|
|
|
|
vec3 vpa = vertex_screen_space(
|
|
vertex_perspective_divide(
|
|
vertex_rotate(cube_vertex_position[ipa])));
|
|
|
|
vec3 vpb = vertex_screen_space(
|
|
vertex_perspective_divide(
|
|
vertex_rotate(cube_vertex_position[ipb])));
|
|
|
|
vec3 vpc = vertex_screen_space(
|
|
vertex_perspective_divide(
|
|
vertex_rotate(cube_vertex_position[ipc])));
|
|
|
|
|
|
int ita = cube_faces[face_ix].a.texture;
|
|
int itb = cube_faces[face_ix].b.texture;
|
|
int itc = cube_faces[face_ix].c.texture;
|
|
|
|
vec2 vta = cube_vertex_texture[ita];
|
|
vec2 vtb = cube_vertex_texture[itb];
|
|
vec2 vtc = cube_vertex_texture[itc];
|
|
|
|
// vertex color is irrelevant in "decal" mode
|
|
uint32_t va_color = 0;
|
|
uint32_t vb_color = 0;
|
|
uint32_t vc_color = 0;
|
|
|
|
store_queue_ix = transfer_ta_vertex_triangle(store_queue_ix,
|
|
vpa.x, vpa.y, vpa.z, vta.u, vta.v, va_color,
|
|
vpb.x, vpb.y, vpb.z, vtb.u, vtb.v, vb_color,
|
|
vpc.x, vpc.y, vpc.z, vtc.u, vtc.v, vc_color);
|
|
}
|
|
|
|
store_queue_ix = transfer_ta_global_end_of_list(store_queue_ix);
|
|
}
|
|
|
|
/******************************************************************************
|
|
Holly register definitions
|
|
******************************************************************************/
|
|
|
|
// Holly registers, as offsets from the register base 0xa05f8000.
volatile uint32_t * SOFTRESET     = (volatile uint32_t *)(0xa05f8000 + 0x08);
volatile uint32_t * STARTRENDER   = (volatile uint32_t *)(0xa05f8000 + 0x14);
volatile uint32_t * PARAM_BASE    = (volatile uint32_t *)(0xa05f8000 + 0x20);
volatile uint32_t * REGION_BASE   = (volatile uint32_t *)(0xa05f8000 + 0x2c);
volatile uint32_t * FB_R_SOF1     = (volatile uint32_t *)(0xa05f8000 + 0x50);
volatile uint32_t * FB_W_SOF1     = (volatile uint32_t *)(0xa05f8000 + 0x60);
volatile uint32_t * ISP_BACKGND_T = (volatile uint32_t *)(0xa05f8000 + 0x8c);

volatile uint32_t * SPG_STATUS    = (volatile uint32_t *)(0xa05f8000 + 0x10c);

// SPG_STATUS bit polled by `main` to detect vertical sync
#define SPG_STATUS__VSYNC (1 << 13)

volatile uint32_t * TA_OL_BASE        = (volatile uint32_t *)(0xa05f8000 + 0x124);
volatile uint32_t * TA_ISP_BASE       = (volatile uint32_t *)(0xa05f8000 + 0x128);
volatile uint32_t * TA_OL_LIMIT       = (volatile uint32_t *)(0xa05f8000 + 0x12c);
volatile uint32_t * TA_ISP_LIMIT      = (volatile uint32_t *)(0xa05f8000 + 0x130);
volatile uint32_t * TA_GLOB_TILE_CLIP = (volatile uint32_t *)(0xa05f8000 + 0x13c);
volatile uint32_t * TA_ALLOC_CTRL     = (volatile uint32_t *)(0xa05f8000 + 0x140);
volatile uint32_t * TA_LIST_INIT      = (volatile uint32_t *)(0xa05f8000 + 0x144);
|
|
|
|
#define TA_GLOB_TILE_CLIP__TILE_Y_NUM(n) (((n) & 0xf) << 16)
#define TA_GLOB_TILE_CLIP__TILE_X_NUM(n) (((n) & 0x1f) << 0)
#define TA_ALLOC_CTRL__OPB_MODE__INCREASING_ADDRESSES (0 << 20)
#define TA_ALLOC_CTRL__O_OPB__8X4BYTE (1 << 0)
/* unsigned literal: `1 << 31` overflows a 32-bit signed int (undefined behavior) */
#define TA_LIST_INIT__LIST_INIT (1u << 31)
|
|
|
|
// Raw texture image bytes, embedded at compile time; 4-byte aligned so that
// `transfer_texture` can read it as 32-bit words.
const uint8_t texture[] __attribute__((aligned(4))) = {
#embed "pavement_256x256.rgb565"
};
|
|
|
|
void transfer_texture(uint32_t texture_start)
|
|
{
|
|
// use 4-byte transfers to texture memory, for slightly increased transfer
|
|
// speed
|
|
//
|
|
// It would be even faster to use the SH4 store queue for this operation, or
|
|
// SH4 DMA.
|
|
|
|
const uint32_t * texture4 = (const uint32_t *)texture;
|
|
|
|
for (int i = 0; i < (sizeof (texture)) / 4; i++) {
|
|
// Holly samples texture images from "64-bit" texture memory address space
|
|
*((volatile uint32_t *)(texture_memory64 + texture_start + i * 4)) = texture4[i];
|
|
}
|
|
}
|
|
|
|
void main()
|
|
{
|
|
/*
|
|
a very simple memory map:
|
|
|
|
the ordering within texture memory is not significant, and could be
|
|
anything
|
|
*/
|
|
|
|
// all of the following are addresses in "32-bit" texture memory address
|
|
// space:
|
|
uint32_t framebuffer_start = 0x200000; // intentionally the same address that the boot rom used to draw the SEGA logo
|
|
uint32_t isp_tsp_parameter_start = 0x400000;
|
|
uint32_t region_array_start = 0x500000;
|
|
uint32_t object_list_start = 0x100000;
|
|
uint32_t opaque_list_pointer = object_list_start;
|
|
|
|
// these addresses are in "64-bit" texture memory address space:
|
|
uint32_t texture_start = 0x700000;
|
|
|
|
// background_offset is relative to the beginning of isp_tsp_parameter_start
|
|
uint32_t background_offset = (sizeof (isp_tsp_parameter__polygon)) * 0;
|
|
|
|
transfer_region_array(region_array_start, opaque_list_pointer);
|
|
|
|
transfer_isp_tsp_background_parameter(isp_tsp_parameter_start);
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
// transfer the texture image to texture ram
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
transfer_texture(texture_start);
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
// configure the TA
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
const int tile_y_num = 480 / 32;
|
|
const int tile_x_num = 640 / 32;
|
|
|
|
// TA_GLOB_TILE_CLIP restricts which "object pointer blocks" are written
|
|
// to.
|
|
//
|
|
// This can also be used to implement "windowing", as long as the desired
|
|
// window size happens to be a multiple of 32 pixels. The "User Tile Clip" TA
|
|
// control parameter can also ~equivalently be used as many times as desired
|
|
// within a single TA initialization to produce an identical effect.
|
|
//
|
|
// See DCDBSysArc990907E.pdf page 183.
|
|
*TA_GLOB_TILE_CLIP = TA_GLOB_TILE_CLIP__TILE_Y_NUM(tile_y_num - 1)
|
|
| TA_GLOB_TILE_CLIP__TILE_X_NUM(tile_x_num - 1);
|
|
|
|
// While CORE supports arbitrary-length object lists, the TA uses "object
|
|
// pointer blocks" as a memory allocation strategy. These fixed-length blocks
|
|
// can still have infinite length via "object pointer block links". This
|
|
// mechanism is illustrated in DCDBSysArc990907E.pdf page 188.
|
|
*TA_ALLOC_CTRL = TA_ALLOC_CTRL__OPB_MODE__INCREASING_ADDRESSES
|
|
| TA_ALLOC_CTRL__O_OPB__8X4BYTE;
|
|
|
|
// While building object lists, the TA contains an internal index (exposed as
|
|
// the read-only TA_ITP_CURRENT) for the next address that new ISP/TSP will be
|
|
// stored at. The initial value of this index is TA_ISP_BASE.
|
|
*TA_ISP_BASE = isp_tsp_parameter_start + (sizeof (isp_tsp_parameter__polygon)) * 1;
|
|
*TA_ISP_LIMIT = isp_tsp_parameter_start + 0x100000;
|
|
|
|
// Similarly, the TA also contains, for up to 600 tiles, an internal index for
|
|
// the next address that an object list entry will be stored for each
|
|
// tile. These internal indicies are partially exposed via the read-only
|
|
// TA_OL_POINTERS.
|
|
*TA_OL_BASE = object_list_start;
|
|
|
|
// TA_OL_LIMIT, DCDBSysArc990907E.pdf page 385:
|
|
//
|
|
// > Because the TA may automatically store data in the address that is
|
|
// > specified by this register, it must not be used for other data. For
|
|
// > example, the address specified here must not be the same as the address
|
|
// > in the TA_ISP_BASE register.
|
|
*TA_OL_LIMIT = object_list_start + 0x100000 - 32;
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
// configure CORE
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
// REGION_BASE is the (texture memory-relative) address of the region array.
|
|
*REGION_BASE = region_array_start;
|
|
|
|
// PARAM_BASE is the (texture memory-relative) address of ISP/TSP parameters.
|
|
// Anything that references an ISP/TSP parameter does so relative to this
|
|
// address (and not relative to the beginning of texture memory).
|
|
*PARAM_BASE = isp_tsp_parameter_start;
|
|
|
|
// Set the offset of the background ISP/TSP parameter, relative to PARAM_BASE
|
|
// SKIP is related to the size of each vertex
|
|
*ISP_BACKGND_T = ISP_BACKGND_T__TAG_ADDRESS(background_offset / 4)
|
|
| ISP_BACKGND_T__TAG_OFFSET(0)
|
|
| ISP_BACKGND_T__SKIP(1);
|
|
|
|
// FB_W_SOF1 is the (texture memory-relative) address of the framebuffer that
|
|
// will be written to when a tile is rendered/flushed.
|
|
*FB_W_SOF1 = framebuffer_start;
|
|
|
|
// without waiting for rendering to actually complete, immediately display the
|
|
// framebuffer.
|
|
*FB_R_SOF1 = framebuffer_start;
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
// animated drawing
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
// draw 500 frames of cube rotation
|
|
for (int i = 0; i < 500; i++) {
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
// transfer cube to texture memory via the TA polygon converter FIFO
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
// TA_LIST_INIT needs to be written (every frame) prior to the first FIFO
|
|
// write.
|
|
*TA_LIST_INIT = TA_LIST_INIT__LIST_INIT;
|
|
|
|
// dummy TA_LIST_INIT read; DCDBSysArc990907E.pdf in multiple places says this
|
|
// step is required.
|
|
(void)*TA_LIST_INIT;
|
|
|
|
transfer_ta_cube(texture_start);
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
// wait for vertical synchronization (and the TA)
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
while (!((*SPG_STATUS) & SPG_STATUS__VSYNC));
|
|
while (((*SPG_STATUS) & SPG_STATUS__VSYNC));
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
// start the actual rasterization
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
// start the actual render--the rendering process begins by interpreting the
|
|
// region array
|
|
*STARTRENDER = 1;
|
|
|
|
// increment theta for the cube rotation animation
|
|
// (used by the `vertex_rotate` function)
|
|
theta += 0.01f;
|
|
}
|
|
|
|
// return from main; this will effectively jump back to the serial loader
|
|
}
|