reorganize math and libgcc

I'd like to include bits of libgcc piecemeal--I don't want to "accidentally" start depending on libgcc bits that I'm not aware of. Reworked division so that it uses the on-chip division register.
2023-01-24 23:27:30 -08:00 · 2023-01-24 23:27:30 -08:00 · ab809791cd
commit ab809791cd
parent a4b72e2f85
11 changed files with 2678 additions and 29 deletions
--- a/.gitignore
+++ b/.gitignore
@ -3,5 +3,6 @@
 *.elf
 *.bin
 *.iso
+*.cue
 *.ppm
 *.png
--- a/11
+++ b/11
@ -1,10 +1,17 @@
 CFLAGS = -Isaturn -Imath
 OPT = -O3
+LIBGCC = $(shell $(CC) -print-file-name=libgcc.a)

 all: raytracing.iso

 LIB = ./saturn
 include $(LIB)/common.mk

-LIBGCC = $(shell $(CC) -print-file-name=libgcc.a)
-raytracing.elf: main-saturn.o raytracing.o $(LIBGCC)
+sh/lib1funcs.o: CFLAGS += -DL_ashiftrt
+
+raytracing.elf: main-saturn.o raytracing.o sh/lib1funcs.o
+
+# clean: also remove the objects built from the sources under sh/.
+# Declared phony so a stray file named `clean` or `clean-sh` cannot
+# make these rules appear up to date.
+.PHONY: clean clean-sh
+clean: clean-sh
+clean-sh:
+	rm -f sh/*.o
--- a/main-hosted.cpp
+++ b/main-hosted.cpp
@ -31,15 +31,15 @@ void put_pixel(int32_t x, int32_t y, const vec3& color)
    return;
  }

-  vec3 px255 = functor1(clamp, color) * fp16_16(255);
-  frame[sy][sx] = functor1(to_uint8_t, px255);
+  vec3 px31 = functor1(clamp, color) * fp16_16(31);
+  frame[sy][sx] = functor1(to_uint8_t, px31);
 }

 void render_ppm(ostream& out)
 {
  using namespace canvas;

-  out << "P3 " << width << ' ' << height << " 255\n";
+  out << "P3 " << width << ' ' << height << " 31\n";
  for (int sy = 0; sy < height; sy++) {
    for (int sx = 0; sx < width; sx++) {
      const pixel& px = frame[sy][sx];
@ -51,4 +51,6 @@ void render_ppm(ostream& out)
 int main()
 {
  render(put_pixel);
+
+  render_ppm(cout);
 }
--- a/main-saturn.cpp
+++ b/main-saturn.cpp
@ -13,23 +13,45 @@ fp16_16 clamp(fp16_16 const& n)
  return (n > fp16_16(1) ? fp16_16(1) : (n < fp16_16(0) ? fp16_16(0) : n));
 };

-uint16_t rgb15(const vec3& color)
+// Packs a color into one pixel word of type T using P bits per channel.
+//
+// Each channel is clamped with `clamp`, scaled by the largest P-bit value and
+// reduced to its integer part (the raw fixed-point value shifted right by 16).
+// Channels are laid out blue, green, red from high to low bits, and the most
+// significant bit of T is always set.
+template<typename T, int P>
+inline constexpr T rgb(const vec3& color)
 {
-  vec3 c = functor1(clamp, color) * fp16_16(255);
+  constexpr int channel_mask = (1 << P) - 1;
+  constexpr int last_bit = ((sizeof(T) * 8) - 1);

-  uint8_t red = (c.r.value >> 16) & 0xff;
-  uint8_t green = (c.g.value >> 16) & 0xff;
-  uint8_t blue = (c.b.value >> 16) & 0xff;
+  vec3 c = functor1(clamp, color) * fp16_16(channel_mask);

-  return (blue << 10) | (green << 5) | (red << 0);
+  T red = static_cast<T>(c.r.value >> 16);
+  T green = static_cast<T>(c.g.value >> 16);
+  T blue = static_cast<T>(c.b.value >> 16);
+
+  // Form the top-bit flag in T rather than int: with T = uint32_t the shift
+  // is by 31, which does not fit in a 32-bit int, and a wider T would shift
+  // past the width of int altogether.
+  return (static_cast<T>(1) << last_bit)
+       | (blue  << (P * 2))
+       | (green << (P * 1))
+       | (red   << (P * 0));
 }

+constexpr auto rgb15 = rgb<uint16_t, 5>;
+constexpr auto rgb24 = rgb<uint32_t, 8>;
+
 void put_pixel(int32_t x, int32_t y, const vec3& color)
 {
+  // Writes one pixel into VDP2 VRAM. (x, y) is centered on the screen with y
+  // pointing up; (sx, sy) is the matching position in a 320x240 area whose
+  // origin is the top-left corner.
  int sx = 320 / 2 + x;
  int sy = 240 / 2 - y;

-  vdp2.vram.u16[512 * sy + sx] = (1 << 15) | rgb15(color);
+  // Ignore anything that falls outside the 320x240 area.
+  if (sx >= 320 || sx < 0 || sy >= 240 || sy < 0)
+    return;
+
+  // Consecutive rows are 512 entries apart in the u16 view of VRAM.
+  vdp2.vram.u16[512 * sy + sx] = rgb15(color);
+}
+
+// Stores `v` into consecutive elements starting at `buf`.
+// `n` is a size in bytes, not an element count: every store uses up
+// sizeof(T) of it, and writing stops once nothing is left.
+template <class T>
+void fill(T * buf, T v, int32_t n) noexcept
+{
+  for (T * cursor = buf; n > 0; n -= (sizeof (T))) {
+    *cursor++ = v;
+  }
 }

 void main_asdf()
@ -40,13 +62,33 @@ void main_asdf()

  vdp2.reg.BGON = BGON__N0ON;

-  vdp2.reg.CHCTLA = ( CHCTLA__N0CHCN__32K_COLOR     // 15 bits per pixel, RGB
+  vdp2.reg.CHCTLA = (
+                      CHCTLA__N0CHCN__32K_COLOR // 15 bits per pixel, RGB
+                      //CHCTLA__N0CHCN__16M_COLOR // 24 bits per pixel
                    | CHCTLA__N0BMSZ__512x256_DOT
                    | CHCTLA__N0BMEN__BITMAP_FORMAT
                    );

  vdp2.reg.MPOFN = MPOFN__N0MP(0);

+  constexpr s32 plane_size = 512 * 256 * 2;
+  fill<volatile uint32_t>(&vdp2.vram.u32[0x0 / 4], (1 << 31) | (1 << 15), plane_size);
+
+  vdp2.reg.SCXIN0 = 0;
+  vdp2.reg.SCXDN0 = 0;
+  vdp2.reg.SCYIN0 = 0;
+  vdp2.reg.SCYDN0 = 0;
+  vdp2.reg.ZMXIN0 = 1;
+  vdp2.reg.ZMXDN0 = 0;
+  vdp2.reg.ZMYIN0 = 1;
+  vdp2.reg.ZMYDN0 = 0;
+
+  vdp2.reg.VCSTA = 0;
+
+  vdp2.reg.WCTLA = 0;
+  vdp2.reg.WCTLB = 0;
+  vdp2.reg.WCTLC = 0;
+
  render(put_pixel);
 }

--- a/math/div.hpp
+++ b/math/div.hpp
@ -0,0 +1,120 @@
+#pragma once
+
+#include <stdint.h>
+
+#ifndef USE_SH2_DVSR
+// Unsigned 32-bit division by bitwise long division; returns the quotient
+// of n / d. The dividend is consumed from its most significant bit downwards,
+// one bit per iteration, so the loop always runs 32 times.
+inline constexpr uint32_t
+__udiv32(uint32_t n, uint32_t d)
+{
+  uint32_t quotient = 0;
+  uint32_t remainder = 0;
+
+  for (uint32_t bit = 0x80000000u; bit != 0; bit >>= 1) {
+    // Bring the next dividend bit into the running remainder.
+    remainder = (remainder << 1) | ((n & bit) ? 1u : 0u);
+
+    // If the divisor fits, subtract it and record a 1 at this bit position.
+    if (remainder >= d) {
+      remainder -= d;
+      quotient |= bit;
+    }
+  }
+
+  return quotient;
+}
+
+// Software fallback: divides a 64-bit unsigned dividend by a 32-bit unsigned
+// divisor by shift-and-subtract and returns the quotient.
+// The quotient is accumulated in 64 bits but narrowed to uint32_t on return,
+// so any bits above the low 32 are discarded.
+inline constexpr uint32_t
+__udiv64_32(uint64_t n, uint32_t base)
+{
+  uint64_t rem = n;
+  uint64_t b = base;
+  uint64_t res = 0, d = 1;
+  uint32_t high = rem >> 32;
+
+  // Divide the upper word first so that what is left of it is below `base`.
+  if (high >= base) {
+    high = __udiv32(high, base);
+    res = (uint64_t)high << 32;
+    rem -= (uint64_t)(high*base) << 32;
+  }
+
+  // Double the divisor (and its matching quotient bit `d`) until it reaches
+  // `rem` or doubling again would set its top bit.
+  while ((int64_t)b > 0 && b < rem) {
+    b = b+b;
+    d = d+d;
+  }
+
+  // Walk back down, subtracting every scaled divisor that still fits and
+  // adding its quotient bit to the result.
+  do {
+    if (rem >= b) {
+      rem -= b;
+      res += d;
+    }
+    b >>= 1;
+    d >>= 1;
+  } while (d);
+
+  return res;
+}
+#else
+#include "sh2.h"
+// Hardware variant: divides a 64-bit dividend by a 32-bit divisor through the
+// sh2.reg division registers and returns the value read back from DVDNTL.
+// NOTE(review): presumably the write to DVDNTL starts the divider and the
+// quotient is available in DVDNTL once it finishes -- confirm against the
+// SH-2 hardware manual.
+inline uint32_t
+__udiv64_32(uint64_t n, uint32_t d)
+{
+  // Divisor first, then the dividend's upper and lower halves.
+  sh2.reg.DVSR = d;
+  sh2.reg.DVDNTH = (uint32_t)(n >> 32);
+  sh2.reg.DVDNTL = (uint32_t)(n);
+
+  // 39 cycles: pad with 39 nops before reading the result back.
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+  asm volatile ("nop");
+
+  return sh2.reg.DVDNTL;
+}
+#endif
+
+// Signed 64-by-32 division built on __udiv64_32: divides the magnitudes of
+// `n` and `d`, then restores the sign. The quotient is negated exactly when
+// the operand signs differ.
+inline int32_t
+__div64_32(int64_t n, int32_t d)
+{
+  // Magnitudes are formed in unsigned arithmetic so the most negative inputs
+  // do not overflow when negated.
+  uint64_t n_abs = n >= 0 ? (uint64_t)n : -(uint64_t)n;
+  uint32_t d_abs = d >= 0 ? (uint32_t)d : -(uint32_t)d;
+  uint32_t q_abs = __udiv64_32(n_abs, d_abs);
+
+  // Negate while still unsigned: a quotient magnitude of 0x80000000 would
+  // overflow if it were converted to int32_t first and negated afterwards.
+  return (n < 0) == (d < 0) ? (int32_t)q_abs : (int32_t)(0u - q_abs);
+}
--- a/math/fp.hpp
+++ b/math/fp.hpp
@ -1,6 +1,7 @@
 #pragma once

 #include <stdint.h>
+#include "div.hpp"

 struct fp_raw_tag {};

@ -21,8 +22,35 @@ struct fp
  {
    return fp(-value, fp_raw_tag{});
  }
+
+  inline constexpr fp<T, I, B>& operator=(fp<T, I, B> const& v);
+
+  inline constexpr fp<T, I, B>& operator+=(fp<T, I, B> const& v);
+
+  inline constexpr fp<T, I, B>& operator-=(fp<T, I, B> const& v);
 };

+// Copy-assignment: takes over the raw stored value of `v`.
+template <typename T, typename I, int B>
+inline constexpr fp<T, I, B>& fp<T, I, B>::operator=(fp<T, I, B> const& v)
+{
+  value = v.value;
+  return *this;
+}
+
+// In-place addition, expressed through the binary operator+.
+template <typename T, typename I, int B>
+inline constexpr fp<T, I, B>& fp<T, I, B>::operator+=(fp<T, I, B> const& v)
+{
+  return *this = *this + v;
+}
+
+// In-place subtraction, expressed through the binary operator-.
+template <typename T, typename I, int B>
+inline constexpr fp<T, I, B>& fp<T, I, B>::operator-=(fp<T, I, B> const& v)
+{
+  return *this = *this - v;
+}
+
 template <typename T, typename I, int B>
 constexpr inline fp<T, I, B> operator+(const fp<T, I, B>& a, const fp<T, I, B>& b) noexcept
 {
@ -59,7 +87,10 @@ constexpr inline fp<T, I, B> operator*(T b, const fp<T, I, B>& a) noexcept
 template <typename T, typename I, int B>
 constexpr inline fp<T, I, B> operator/(const fp<T, I, B>& a, const fp<T, I, B>& b) noexcept
 {
-  I p = (static_cast<I>(a.value) * (static_cast<I>(1) << B)) / static_cast<I>(b.value);
+  // Fixed-point quotient a / b in the same fp<T, I, B> format.
+  // The dividend is pre-scaled by 2^B so the quotient keeps B fractional bits,
+  // then divided with __div64_32 rather than the built-in wide `/`.
+  // The scaling is a multiplication by (1 << B): it follows the template's B
+  // instead of assuming 16, and avoids left-shifting a negative value.
+  I p = __div64_32(static_cast<I>(a.value) * (static_cast<I>(1) << B), static_cast<T>(b.value));
+
  return fp<T, I, B>(static_cast<T>(p), fp_raw_tag{});
 }

--- a/math/math.hpp
+++ b/math/math.hpp
--- a/math/vec.hpp
+++ b/math/vec.hpp
@ -25,14 +25,11 @@ struct vec<3, T>

  inline constexpr T const& operator[](int i) const;

-  template<typename U>
-  inline constexpr vec<3, T>& operator=(vec<3, U> const& v);
+  inline constexpr vec<3, T>& operator=(vec<3, T> const& v);

-  template<typename U>
-  inline constexpr vec<3, T>& operator+=(vec<3, U> const& v);
+  inline constexpr vec<3, T>& operator+=(vec<3, T> const& v);

-  template<typename U>
-  inline constexpr vec<3, T>& operator-=(vec<3, U> const& v);
+  inline constexpr vec<3, T>& operator-=(vec<3, T> const& v);
 };

 template <typename T>
@ -66,8 +63,7 @@ inline constexpr T const& vec<3, T>::operator[](int i) const
 }

 template<typename T>
-template<typename U>
-inline constexpr vec<3, T>& vec<3, T>::operator=(vec<3, U> const& v)
+inline constexpr vec<3, T>& vec<3, T>::operator=(vec<3, T> const& v)
 {
  this->x = static_cast<T>(v.x);
  this->y = static_cast<T>(v.y);
@ -76,16 +72,14 @@ inline constexpr vec<3, T>& vec<3, T>::operator=(vec<3, U> const& v)
 }

 template<typename T>
-template<typename U>
-inline constexpr vec<3, T>& vec<3, T>::operator+=(vec<3, U> const& v)
+inline constexpr vec<3, T>& vec<3, T>::operator+=(vec<3, T> const& v)
 {
  *this = *this + vec<3, T>(v);
  return *this;
 }

 template<typename T>
-template<typename U>
-inline constexpr vec<3, T>& vec<3, T>::operator-=(vec<3, U> const& v)
+inline constexpr vec<3, T>& vec<3, T>::operator-=(vec<3, T> const& v)
 {
-  *this = *this + vec<3, T>(v);
+  // In-place subtraction via the binary operator-. This previously added `v`,
+  // which made -= behave exactly like +=.
+  *this = *this - vec<3, T>(v);
  return *this;
@ -115,12 +109,30 @@ inline constexpr vec<3, T> operator*(vec<3, T> const& v1, vec<3, T> const& v2)
                   v1.z * v2.z);
 }

+/*
+template <typename T>
+inline constexpr vec<3, T> operator/(vec<3, T> const& v1, vec<3, T> const& v2)
+{
+  return vec<3, T>(v1.x / v2.x,
+                   v1.y / v2.y,
+                   v1.z / v2.z);
+}
+*/
+
 template <typename T>
 inline constexpr vec<3, T> operator*(vec<3, T> const& v1, T const& scalar)
 {
  return v1 * vec<3, T>(scalar);
 }

+/*
+template <typename T>
+inline constexpr vec<3, T> operator/(vec<3, T> const& v1, T const& scalar)
+{
+  return v1 / vec<3, T>(scalar);
+}
+*/
+
 template <typename T>
 inline constexpr T dot(vec<3, T> const& v1, vec<3, T> const& v2)
 {
--- a/raytracing.cpp
+++ b/raytracing.cpp
@ -24,8 +24,24 @@ struct sphere {
  vec3 color;
 };

+// Kind of light source; selects how compute_lighting treats a `light`.
+enum class light_type {
+  ambient,     // adds its intensity directly, regardless of the surface
+  point,       // light vector runs from the shaded point to `position`
+  directional  // light vector is the stored `direction`
+};
+
+// One light source in the scene.
+struct light {
+  light_type type;
+  fp16_16 intensity;  // weight of this light's contribution in compute_lighting
+  // Both members share storage. compute_lighting reads `position` for point
+  // lights and `direction` for directional lights; ambient lights use neither.
+  union {
+    vec3 position;
+    vec3 direction;
+  };
+};
+
 struct scene {
-  sphere spheres[3];
+  sphere spheres[4];
+  light lights[3];
 };

 constexpr scene scene {
@ -44,17 +60,65 @@ constexpr scene scene {
      {-2, 0, 4},
      fp16_16(1),
      {0, 1, 0},
+    },
+    {
+      {0, -61, 0},
+      fp16_16(60),
+      {1, 1, 0},
+    }
+  },
+  { // lights
+    {
+      light_type::ambient, // type
+      fp16_16(65536 * 0.2, fp_raw_tag{}),        // intensity
+      {{0, 0, 0}}          //
+    },
+    {
+      light_type::point,   // type
+      fp16_16(65536 * 0.6, fp_raw_tag{}),        // intensity
+      {{2, 1, 0}}          // position
+    },
+    {
+      light_type::directional, // type
+      fp16_16(65536 * 0.6, fp_raw_tag{}),        // intensity
+      {{1, 4, 4}}          // direction
    }
  }
 };

 static_assert(scene.spheres[0].center.z.value == (3 << 16));
+static_assert(scene.lights[0].intensity.value != 0);
+static_assert(scene.lights[1].position.x.value == (2 << 16));

 struct t1_t2 {
  fp16_16 t1;
  fp16_16 t2;
 };

+// Sums the light arriving at a surface point from every light in the scene.
+//
+// point:  position being shaded
+// normal: surface normal at `point`; only the light vector's length is divided
+//         out below, so the normal is expected to already be unit length
+// Returns the accumulated intensity: every ambient term plus each non-ambient
+// light whose n-dot-l is positive.
+fp16_16 compute_lighting(const vec3& point, const vec3& normal)
+{
+  fp16_16 intensity{0};
+
+  // Iterate the array itself so the loop cannot drift from its declared size.
+  for (const auto& source : scene.lights) {
+    if (source.type == light_type::ambient) {
+      intensity += source.intensity;
+      continue;
+    }
+
+    vec3 light_vector;
+    if (source.type == light_type::point) {
+      light_vector = source.position - point;
+    } else {
+      light_vector = source.direction;
+    }
+
+    // A non-positive n-dot-l means the light is behind the surface, so it
+    // contributes nothing.
+    auto n_dot_l = dot(normal, light_vector);
+    if (n_dot_l > fp16_16(0)) {
+      intensity += source.intensity * n_dot_l * (fp16_16(1) / length(light_vector));
+    }
+  }
+  return intensity;
+}
+
 t1_t2 intersect_ray_sphere(const vec3& origin, const vec3& direction, const sphere& sphere)
 {
  fp16_16 r = sphere.radius;
@ -87,7 +151,7 @@ static vec3 trace_ray
 {
  fp16_16 closest_t = fp_limits<fp16_16>::max();
  const sphere * closest_sphere = nullptr;
-  for (int i = 0; i < 3; i++) {
+  for (int i = 0; i < 4; i++) {
    auto& sphere = scene.spheres[i];
    auto [t1, t2] = intersect_ray_sphere(origin, direction, sphere);
    if (t1 >= t_min && t1 < t_max && t1 < closest_t) {
@ -102,7 +166,10 @@ static vec3 trace_ray
  if (closest_sphere == nullptr) {
    return vec3(0, 0, 0);
  } else {
-    return closest_sphere->color;
+    vec3 point = origin + direction * closest_t;
+    vec3 normal = point - closest_sphere->center;
+    normal = normal * (fp16_16(1) / length(normal));
+    return closest_sphere->color * compute_lighting(point, normal);
  }
 }

--- a/sh/lib1funcs.S
+++ b/sh/lib1funcs.S
--- a/sh/lib1funcs.h
+++ b/sh/lib1funcs.h
@ -0,0 +1,74 @@
+/* Copyright (C) 1994-2022 Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#ifdef __ELF__
+#define LOCAL(X)	.L_##X
+#define FUNC(X)		.type X,@function
+#define HIDDEN_FUNC(X)	FUNC(X); .hidden X
+#define HIDDEN_ALIAS(X,Y) ALIAS (X,Y); .hidden GLOBAL(X)
+#define ENDFUNC0(X)	.Lfe_##X: .size X,.Lfe_##X-X
+#define ENDFUNC(X)	ENDFUNC0(X)
+#else
+#define LOCAL(X)	L_##X
+#define FUNC(X)
+#define HIDDEN_FUNC(X)
+#define HIDDEN_ALIAS(X,Y) ALIAS (X,Y)
+#define ENDFUNC(X)
+#endif
+
+#define	CONCAT(A,B)	A##B
+#define	GLOBAL0(U,X)	CONCAT(U,__##X)
+#define	GLOBAL(X)	GLOBAL0(__USER_LABEL_PREFIX__,X)
+
+#define ALIAS(X,Y)	.global GLOBAL(X); .set GLOBAL(X),GLOBAL(Y)
+
+#if defined __SH2A__ && defined __FMOVD_ENABLED__
+#undef  FMOVD_WORKS
+#define FMOVD_WORKS
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define DR00 fr1
+#define DR01 fr0
+#define DR20 fr3
+#define DR21 fr2
+#define DR40 fr5
+#define DR41 fr4
+#else /* !__LITTLE_ENDIAN__ */
+#define DR00 fr0
+#define DR01 fr1
+#define DR20 fr2
+#define DR21 fr3
+#define DR40 fr4
+#define DR41 fr5
+#endif /* !__LITTLE_ENDIAN__ */
+
+#ifdef __sh1__
+#define SL(branch, dest, in_slot, in_slot_arg2) \
+	in_slot, in_slot_arg2; branch dest
+#define SL1(branch, dest, in_slot) \
+	in_slot; branch dest
+#else /* ! __sh1__ */
+#define SL(branch, dest, in_slot, in_slot_arg2) \
+	branch##.s dest; in_slot, in_slot_arg2
+#define SL1(branch, dest, in_slot) \
+	branch##/s dest; in_slot
+#endif /* !__sh1__ */