From 4d13e821b5726e3a8fbcc5935eea30203889668c Mon Sep 17 00:00:00 2001 From: Falco Girgis Date: Tue, 15 Apr 2025 12:06:12 -0500 Subject: [PATCH 1/6] Liberty/Miami Perf Gainz Phase 3 1) synced dca3-kos repo which has some gainzy commits 2) rwdc_common.h - all low-level and matrix/vector routines for SH4 are now shared in this common file, included in both RW and Liberty/Miami engines 3) CMatrix a. assignment operator: now uses asm-optimized mat_copy() b. multiplication operator: now use mat_mult() SH4 routine c. Scale(): applies a scale matrix via mat_scale d. MultiplyInverse: fipr-optimizations 4) CQuaternion a. multiplication: SH4 ASM FIPR optimized b. Get(V3d& axis, float &angle): fast inversion/division c. Set(RWMatrix&): fast division 5) CVector a. Multiply3x3() now accelerated with mat_transpose 5) RwQuat a. mult(): FIPR accelerated b. length(): FIPR/FSRRA accelerated --- liberty/common.mk | 1 + miami/common.mk | 1 + src/common/vmu/vmu.cpp | 41 +- src/liberty/core/common.h | 21 +- src/liberty/math/Matrix.cpp | 84 +--- src/liberty/math/Matrix.h | 21 + src/liberty/math/Quaternion.cpp | 17 +- src/liberty/math/Quaternion.h | 14 +- src/liberty/math/Vector.cpp | 9 + src/liberty/math/VuVector.h | 6 +- src/liberty/math/maths.h | 71 +-- src/miami/core/common.h | 29 +- src/miami/math/Matrix.cpp | 40 +- src/miami/math/Matrix.h | 32 +- src/miami/math/Quaternion.cpp | 17 +- src/miami/math/Quaternion.h | 14 +- src/miami/math/Vector.cpp | 9 + src/miami/math/VuVector.h | 7 +- src/miami/math/maths.h | 44 +- vendor/dca3-kos | 2 +- vendor/librw/src/base.cpp | 8 + vendor/librw/src/dc/rwdc.cpp | 7 - vendor/librw/src/dc/rwdc_common.h | 739 ++++++++++++++++++++++++++++++ vendor/librw/src/rwbase.h | 11 +- 24 files changed, 967 insertions(+), 278 deletions(-) create mode 100644 vendor/librw/src/dc/rwdc_common.h diff --git a/liberty/common.mk b/liberty/common.mk index d6d585ec..e5fd01a4 100644 --- a/liberty/common.mk +++ b/liberty/common.mk @@ -381,6 +381,7 @@ INCLUDE = \ -I../src/liberty/skel/win \ \ -I../vendor/librw \ +-I../vendor/librw/src/dc \ \ -I../vendor/miniLZO \ \ diff --git a/miami/common.mk b/miami/common.mk index dc043f7b..7e2c915b 100644 --- a/miami/common.mk +++ b/miami/common.mk @@ -396,6 +396,7 @@ INCLUDE = \ -I../src/miami/skel/win \ \ -I../vendor/librw \ +-I../vendor/librw/src/dc \ \ -I../vendor/miniLZO \ \ diff --git a/src/common/vmu/vmu.cpp b/src/common/vmu/vmu.cpp index 4e04e09a..57b5de41 100644 --- a/src/common/vmu/vmu.cpp +++ b/src/common/vmu/vmu.cpp @@ -94,23 +94,26 @@ void VmuProfiler::run() { pvr_stats_t pvrStats; pvr_get_stats(&pvrStats); uint32_t sramStats = snd_mem_available(); size_t pvrAvail = pvr_mem_available(); - float fps = std::accumulate(std::begin(fps_), std::end(fps_), 0.0f) - / static_cast(fpsSamples); float sh4Mem = heapUtilization(); float pvrMem = (8_MB - pvrAvail ) / 8_MB * 100.0f; float armMem = (2_MB - sramStats) / 2_MB * 100.0f; - float vtxBuf = vertBuffUse_; - { - std::shared_lock lk(mtx_); - vmu_printf("FPS :%6.2f\n" - "SH4 :%6.2f%%\n" - "PVR :%6.2f%%\n" - "ARM :%6.2f%%\n" - "VTX :%6.2f%%", - fps, sh4Mem, pvrMem, armMem, vtxBuf); + float vtxBuf; + float fps; + { /* Critical section with main thread. */ + std::shared_lock lk(mtx_); + vtxBuf = vertBuffUse_; + fps = std::accumulate(std::begin(fps_), std::end(fps_), 0.0f) + / static_cast(fpsSamples); } + + vmu_printf(" FPS:%6.2f\n" + " RAM:%6.2f%%\n" + "VRAM:%6.2f%%\n" + "SRAM:%6.2f%%\n" + " VTX:%6.2f%%", + fps, sh4Mem, pvrMem, armMem, vtxBuf); } #endif @@ -119,15 +122,21 @@ void VmuProfiler::run() { } void VmuProfiler::updateVertexBufferUsage() { +#ifndef DC_SH4 std::unique_lock lk(mtx_); updated_ = true; -#ifdef DC_SH4 - vertBuffUse_ = vertexBufferUtilization(); +#else + pvr_stats_t pvrStats; + pvr_get_stats(&pvrStats); + float vtxUtil = vertexBufferUtilization(); - pvr_stats_t pvrStats; - pvr_get_stats(&pvrStats); - fps_[fpsFrame_++] = pvrStats.frame_rate; + { /* Critical section with VMU thread. */ + std::unique_lock lk(mtx_); + vertBuffUse_ = vtxUtil; + updated_ = true; + fps_[fpsFrame_++] = pvrStats.frame_rate; + } if(fpsFrame_ >= fpsSamples) fpsFrame_ = 0; diff --git a/src/liberty/core/common.h b/src/liberty/core/common.h index 661499d6..ed1a23d3 100644 --- a/src/liberty/core/common.h +++ b/src/liberty/core/common.h @@ -82,8 +82,6 @@ #define rwVENDORID_ROCKSTAR 0x0253F2 -__always_inline auto Max(auto a, auto b) { return ((a > b)? a : b); } -__always_inline auto Min(auto a, auto b) { return ((a < b)? a : b); } // Use this to add const that wasn't there in the original code #define Const const @@ -299,15 +297,6 @@ extern int strcasecmp(const char *str1, const char *str2); extern wchar *AllocUnicode(const char*src); -template -__always_inline T Clamp(T v, auto low, auto high) { - return std::clamp(v, static_cast(low), static_cast(high)); -} - -__always_inline auto Clamp2(auto v, auto center, auto radius) { - return (v > center) ? Min(v, center + radius) : Max(v, center - radius); -} - #define SQR(x) ((x) * (x)) __always_inline auto sq(auto x) { return SQR(x); } @@ -418,15 +407,7 @@ template struct check_size { #endif #define BIT(num) (1<<(num)) -#define ABS(a) std::abs(a) +#define ABS(a) Abs(a) -__always_inline auto norm(auto value, auto min, auto max) { - return (Clamp(value, min, max) - min) / (max - min); -} // we use std::lerp now //#define lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) ) - -#define STRINGIFY(x) #x -#define STR(x) STRINGIFY(x) -#define CONCAT_(x,y) x##y -#define CONCAT(x,y) CONCAT_(x,y) diff --git a/src/liberty/math/Matrix.cpp b/src/liberty/math/Matrix.cpp index d275d963..a919baaf 100644 --- a/src/liberty/math/Matrix.cpp +++ b/src/liberty/math/Matrix.cpp @@ -4,7 +4,7 @@ CMatrix::CMatrix(CMatrix const &m) { m_attachment = nil; m_hasRwMatrix = false; - *this = m; + mat_copy(*this, m); } CMatrix::CMatrix(RwMatrix *matrix, bool owner) @@ -75,7 +75,7 @@ CMatrix::UpdateRW(void) void CMatrix::operator=(CMatrix const &rhs) { - memcpy(this, &rhs, sizeof(f)); + mat_copy(*this, rhs); if (m_attachment) UpdateRW(); } @@ -83,7 +83,7 @@ CMatrix::operator=(CMatrix const &rhs) void CMatrix::CopyOnlyMatrix(const CMatrix &other) { - memcpy(this, &other, sizeof(f)); + mat_copy(*this, other); } CMatrix & @@ -277,9 +277,9 @@ void CMatrix::RotateX(float x) { #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - mat_load(reinterpret_cast(this)); - mat_rotate_x(x); - mat_store(reinterpret_cast(this)); + mat_load(reinterpret_cast(this)); + mat_rotate_x(x); + mat_store(reinterpret_cast(this)); #else auto [s, c] = SinCos(x); @@ -307,9 +307,9 @@ void CMatrix::RotateY(float y) { #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - mat_load(reinterpret_cast(this)); - mat_rotate_y(y); - mat_store(reinterpret_cast(this)); + mat_load(reinterpret_cast(this)); + mat_rotate_y(y); + mat_store(reinterpret_cast(this)); #else auto [s, c] = SinCos(y); @@ -337,9 +337,9 @@ void CMatrix::RotateZ(float z) { #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - mat_load(reinterpret_cast(this)); - mat_rotate_z(z); - mat_store(reinterpret_cast(this)); + mat_load(reinterpret_cast(this)); + mat_rotate_z(z); + mat_store(reinterpret_cast(this)); #else auto [s, c] = SinCos(z); @@ -367,9 +367,9 @@ void CMatrix::Rotate(float x, float y, float z) { #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - mat_load(reinterpret_cast(this)); - mat_rotate(x, y, z); - mat_store(reinterpret_cast(this)); + mat_load(reinterpret_cast(this)); + mat_rotate(x, y, z); + mat_store(reinterpret_cast(this)); #else auto [sX, cX] = SinCos(x); auto [sY, cY] = SinCos(y); @@ -449,65 +449,13 @@ CMatrix::Reorthogonalise(void) f = CrossProduct(u, r); } -#ifdef DC_SH4 -static __always_inline void MATH_Load_Matrix_Product(const matrix_t* matrix1, const matrix_t* matrix2) -{ - unsigned int prefetch_scratch; - - asm volatile ( - "mov %[bmtrx], %[pref_scratch]\n\t" // (MT) - "add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle) - "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE) - "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS) - // back matrix - "fmov.d @%[bmtrx]+, XD0\n\t" // (LS) - "fmov.d @%[bmtrx]+, XD2\n\t" - "fmov.d @%[bmtrx]+, XD4\n\t" - "fmov.d @%[bmtrx]+, XD6\n\t" - "pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait (LS) - "fmov.d @%[bmtrx]+, XD8\n\t" // bmtrx prefetch should work for here - "fmov.d @%[bmtrx]+, XD10\n\t" - "fmov.d @%[bmtrx]+, XD12\n\t" - "mov %[fmtrx], %[pref_scratch]\n\t" // (MT) - "add #32, %[pref_scratch]\n\t" // store offset by 32 in r0 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle) - "fmov.d @%[bmtrx], XD14\n\t" - "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS) - // front matrix - // interleave loads and matrix multiply 4x4 - "fmov.d @%[fmtrx]+, DR0\n\t" - "fmov.d @%[fmtrx]+, DR2\n\t" - "fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec - "ftrv XMTRX, FV0\n\t" // (FE) - - "fmov.d @%[fmtrx]+, DR6\n\t" - "fmov.d @%[fmtrx]+, DR8\n\t" - "ftrv XMTRX, FV4\n\t" - - "fmov.d @%[fmtrx]+, DR10\n\t" - "fmov.d @%[fmtrx]+, DR12\n\t" - "ftrv XMTRX, FV8\n\t" - - "fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles) - "fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE) - "ftrv XMTRX, FV12\n\t" // (FE) - // Save output in XF regs - "frchg\n" - : [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed - : // no inputs - : "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered) - ); -} -#endif - CMatrix operator*(const CMatrix &m1, const CMatrix &m2) { // TODO: VU0 code CMatrix out; #if defined(RW_DC) - mat_load(reinterpret_cast(&m1)); - mat_apply(reinterpret_cast(&m2)); - mat_store(reinterpret_cast(&out)); + mat_mult(out, m1, m2); #else out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz; out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz; diff --git a/src/liberty/math/Matrix.h b/src/liberty/math/Matrix.h index d2ff6067..06dab72a 100644 --- a/src/liberty/math/Matrix.h +++ b/src/liberty/math/Matrix.h @@ -1,5 +1,7 @@ #pragma once +#include "rwdc_common.h" + class alignas(8) CMatrix { public: @@ -27,6 +29,8 @@ public: SetScale(scale); } ~CMatrix(void); + operator matrix_t *() { return reinterpret_cast(this); } + operator const matrix_t *() const { return reinterpret_cast(this); } void Attach(RwMatrix *matrix, bool owner = false); void AttachRW(RwMatrix *matrix, bool owner = false); void Detach(void); @@ -59,6 +63,7 @@ public: void SetScale(float s); void Scale(float scale) { +#ifndef DC_SH4 for (int i = 0; i < 3; i++) #ifdef FIX_BUGS // BUGFIX from VC for (int j = 0; j < 3; j++) @@ -66,6 +71,11 @@ public: for (int j = 0; j < 4; j++) #endif f[i][j] *= scale; +#else + mat_load(*this); + mat_scale(scale, scale, scale); + mat_store(*this); +#endif } @@ -102,11 +112,22 @@ CMatrix Invert(const CMatrix &matrix); CMatrix operator*(const CMatrix &m1, const CMatrix &m2); inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec) { +#ifndef DC_SH4 CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz); return CVector( mat.rx * v.x + mat.ry * v.y + mat.rz * v.z, mat.fx * v.x + mat.fy * v.y + mat.fz * v.z, mat.ux * v.x + mat.uy * v.y + mat.uz * v.z); +#else + register float x asm(KOS_FPARG(0)) = vec.x - mat.px; + register float y asm(KOS_FPARG(1)) = vec.y - mat.py; + register float z asm(KOS_FPARG(2)) = vec.z - mat.pz; + return CVector( + fipr(x, y, z, 0.0f, mat.rx, mat.ry, mat.rz, 0.0f), + fipr(x, y, z, 0.0f, mat.fx, mat.fy, mat.fz, 0.0f), + fipr(x, y, z, 0.0f, mat.ux, mat.uy, mat.uz, 0.0f) + ); +#endif } diff --git a/src/liberty/math/Quaternion.cpp b/src/liberty/math/Quaternion.cpp index b0e782e2..9f55b234 100644 --- a/src/liberty/math/Quaternion.cpp +++ b/src/liberty/math/Quaternion.cpp @@ -39,10 +39,14 @@ CQuaternion::Slerp(const CQuaternion &q1, const CQuaternion &q2, float theta, fl void CQuaternion::Multiply(const CQuaternion &q1, const CQuaternion &q2) { +#ifndef DC_SH4 x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w); y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w); z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w); w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z); +#else + quat_mult(*this, q1, q2); +#endif } void @@ -51,9 +55,16 @@ CQuaternion::Get(RwV3d *axis, float *angle) *angle = Acos(w); float s = Sin(*angle); +#ifndef DC_SH4 axis->x = x * (1.0f / s); axis->y = y * (1.0f / s); axis->z = z * (1.0f / s); +#else + float invS = dc::Invert(s); + axis->x = x * invS; + axis->y = y * invS; + axis->z = z * invS; +#endif } void @@ -104,7 +115,7 @@ CQuaternion::Set(const RwMatrix &matrix) if (f >= 0.0f) { s = Sqrt(f + 1.0f); w = 0.5f * s; - m = 0.5f / s; + m = Div(0.5f, s); x = (matrix.up.z - matrix.at.y) * m; y = (matrix.at.x - matrix.right.z) * m; z = (matrix.right.y - matrix.up.x) * m; @@ -115,7 +126,7 @@ CQuaternion::Set(const RwMatrix &matrix) if (f >= 0.0f) { s = Sqrt(f + 1.0f); x = 0.5f * s; - m = 0.5f / s; + m = Div(0.5f, s); y = (matrix.up.x + matrix.right.y) * m; z = (matrix.at.x + matrix.right.z) * m; w = (matrix.up.z - matrix.at.y) * m; @@ -126,7 +137,7 @@ CQuaternion::Set(const RwMatrix &matrix) if (f >= 0.0f) { s = Sqrt(f + 1.0f); y = 0.5f * s; - m = 0.5f / s; + m = Div(0.5f, s); w = (matrix.at.x - matrix.right.z) * m; x = (matrix.up.x - matrix.right.y) * m; z = (matrix.at.y + matrix.up.z) * m; diff --git a/src/liberty/math/Quaternion.h b/src/liberty/math/Quaternion.h index 47581368..c1b0e481 100644 --- a/src/liberty/math/Quaternion.h +++ b/src/liberty/math/Quaternion.h @@ -1,6 +1,7 @@ #pragma once #include "src/common_defines.h" +#include "rwdc_common.h" // TODO: actually implement this class CQuaternion @@ -10,6 +11,10 @@ public: CQuaternion(void) {} CQuaternion(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {} + operator quaternion_t *() { return reinterpret_cast(this); } + operator const quaternion_t *() const { return reinterpret_cast(this); } + operator quaternion_t &() { return *reinterpret_cast(this); } + operator const quaternion_t &() const { return *reinterpret_cast(this); } float Magnitude(void) const { #ifndef DC_SH4 return Sqrt(x*x + y*y + z*z + w*w); @@ -57,10 +62,11 @@ public: } const CQuaternion &operator/=(float right) { - x /= right; - y /= right; - z /= right; - w /= right; + right = dc::Invert(right); + x *= right; + y *= right; + z *= right; + w *= right; return *this; } diff --git a/src/liberty/math/Vector.cpp b/src/liberty/math/Vector.cpp index 9ac3c2d7..4aebb789 100644 --- a/src/liberty/math/Vector.cpp +++ b/src/liberty/math/Vector.cpp @@ -50,9 +50,18 @@ Multiply3x3(const CMatrix &mat, const CVector &vec) CVector Multiply3x3(const CVector &vec, const CMatrix &mat) { +#ifndef DC_SH4 return CVector(mat.rx * vec.x + mat.ry * vec.y + mat.rz * vec.z, mat.fx * vec.x + mat.fy * vec.y + mat.fz * vec.z, mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z); +#else + CVector out; + mat_load(mat); + mat_transpose(); + mat_trans_vec3_nomod(vec.x, vec.y, vec.z, + out.x, out.y, out.z); + return out; +#endif } CVector diff --git a/src/liberty/math/VuVector.h b/src/liberty/math/VuVector.h index 4f0239b7..f2f4a8e4 100644 --- a/src/liberty/math/VuVector.h +++ b/src/liberty/math/VuVector.h @@ -44,7 +44,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const CV sqc2 vf06,0x0(%0)\n\ ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); #elif defined(DC_SH4) - mat_load(reinterpret_cast(const_cast(&mat))); + mat_load(mat); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); #else out = mat * in; @@ -71,7 +71,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw sqc2 vf06,0x0(%0)\n\ ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); #elif defined(DC_SH4) - mat_load(reinterpret_cast(const_cast(&mat))); + mat_load(mat); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); #else out = mat * in; @@ -108,7 +108,7 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat, bnez %1,1b\n\ ": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory"); #elif defined(DC_SH4) - mat_load(reinterpret_cast(const_cast(&mat))); + mat_load(mat); while(n--) { mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z); in = reinterpret_cast(reinterpret_cast(in) + stride); diff --git a/src/liberty/math/maths.h b/src/liberty/math/maths.h index 9959bf61..5e1cf016 100644 --- a/src/liberty/math/maths.h +++ b/src/liberty/math/maths.h @@ -1,73 +1,6 @@ #pragma once #include "src/common_defines.h" +#include "rwdc_common.h" -#include -#include - -#ifdef DC_SH4 - -#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \ - register float __x __asm__("fr12") = (x); \ - register float __y __asm__("fr13") = (y); \ - register float __z __asm__("fr14") = (z); \ - register float __w __asm__("fr15") = 1.0f; \ - __asm__ __volatile__( "ftrv xmtrx, fv12\n" \ - : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \ - : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \ - x2 = __x; y2 = __y; z2 = __z; w2 = __w; \ - } while(false) - -#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \ - register float __x __asm__("fr12") = (x); \ - register float __y __asm__("fr13") = (y); \ - register float __z __asm__("fr14") = (z); \ - register float __w __asm__("fr15") = 1.0f; \ - __asm__ __volatile__( "ftrv xmtrx, fv12\n" \ - : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \ - : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \ - w = __w; \ - } while(false) - -__always_inline float Fmac(float a, float b, float c) { - asm volatile ("fmac fr0, %[floatb], %[floatc]\n" - : [floatc] "+f" (c) : "w" (a), [floatb] "f" (b) : ); - return c; -} - -#else - -#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \ - vector_t tmp = { x_, y_, z_, 1.0f }; \ - mat_transform(&tmp, &tmp, 1, 0); \ - x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; w2 = tmp.w; \ - } while(false) - -#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \ - vector_t tmp = { x_, y_, z_, 1.0f }; \ - mat_transform(&tmp, &tmp, 1, 0); \ - w_ = tmp.w; \ - } while(false) - -__always_inline float Fmac(float a, float b, float c) { return a * b + c; } - -#endif - -__always_inline float Sin(float x) { return __builtin_sinf(x); } -__always_inline float Cos(float x) { return __builtin_cosf(x); } -__always_inline auto SinCos(float x) { return std::pair { Sin(x), Cos(x) }; } -__always_inline float Tan(float x) { return __builtin_tanf(x); } -__always_inline float Abs(float x) { return __builtin_fabsf(x); } -__always_inline float Sqrt(float x) { return __builtin_sqrtf(x); } -__always_inline float RecipSqrt(float x) { return 1.0f / __builtin_sqrtf(x); } -__always_inline float Asin(float x) { return __builtin_asinf(x); } -__always_inline float Acos(float x) { return __builtin_acosf(x); } -__always_inline float Atan(float x) { return __builtin_atanf(x); } -__always_inline float Atan2(float y, float x) { return __builtin_atan2f(y, x); } -__always_inline float RecipSqrt(float x, float y) { return x / __builtin_sqrtf(y); /*y = RecipSqrt(y); return x * y * y;*/ } -__always_inline float Pow(float x, float y) { return __builtin_powf(x, y); } -__always_inline float Floor(float x) { return __builtin_floorf(x); } -__always_inline float Ceil(float x) { return __builtin_ceilf(x); } -__always_inline float Invert(float x) { return (((x) < 0.0f)? -1.0f : 1.0f) * RecipSqrt((x) * (x)); } -__always_inline float Div(float x, float y) { return x * Invert(y); } -__always_inline float Lerp(float a, float b, float t) { return Fmac(t, (b - a), a); } \ No newline at end of file +using namespace dc; diff --git a/src/miami/core/common.h b/src/miami/core/common.h index 71911ec3..426ebeca 100644 --- a/src/miami/core/common.h +++ b/src/miami/core/common.h @@ -85,8 +85,6 @@ #define rwVENDORID_ROCKSTAR 0x0253F2 -#define Max(a,b) ((a) > (b) ? (a) : (b)) -#define Min(a,b) ((a) < (b) ? (a) : (b)) // Use this to add const that wasn't there in the original code #define Const const @@ -303,12 +301,8 @@ extern int strncasecmp(const char *str1, const char *str2, size_t len); extern wchar *AllocUnicode(const char*src); -#define Clamp(v, low, high) ((v)<(low) ? (low) : (v)>(high) ? (high) : (v)) - -#define Clamp2(v, center, radius) ((v) > (center) ? Min(v, center + radius) : Max(v, center - radius)) - -inline float sq(float x) { return x*x; } #define SQR(x) ((x) * (x)) +__always_inline auto sq(auto x) { return SQR(x); } #ifdef __MWERKS__ #define M_E 2.71828182845904523536 // e @@ -326,7 +320,10 @@ inline float sq(float x) { return x*x; } #define M_SQRT1_2 0.707106781186547524401 // 1/sqrt(2) #endif -#define PI (float)M_PI +#ifndef DC_SH4 +#define F_PI M_PI +#endif +#define PI (float)F_PI #define TWOPI (PI*2) #define HALFPI (PI/2) #define DEGTORAD(x) ((x) * PI / 180.0f) @@ -380,7 +377,10 @@ __inline__ void TRACE(char *f, ...) { } // this is re3 only, and so the function #endif #endif -#ifndef MASTER +#ifdef assert +#undef assert +#endif +#if !defined(MASTER) #define assert(_Expression) (void)( (!!(_Expression)) || (re3_assert(#_Expression, __FILE__, __LINE__, __FUNCTION__), 0) ) #else #define assert(_Expression) (_Expression) @@ -411,12 +411,7 @@ template struct check_size { #endif #define BIT(num) (1<<(num)) -#define ABS(a) (((a) < 0) ? (-(a)) : (a)) -#define norm(value, min, max) (((value) < (min)) ? 0 : (((value) > (max)) ? 1 : (((value) - (min)) / ((max) - (min))))) -#define Lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) ) - -#define STRINGIFY(x) #x -#define STR(x) STRINGIFY(x) -#define CONCAT_(x,y) x##y -#define CONCAT(x,y) CONCAT_(x,y) +#define ABS(a) Abs(a) +// we use std::lerp now +//#define lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) ) diff --git a/src/miami/math/Matrix.cpp b/src/miami/math/Matrix.cpp index 62b7029c..f90e3c06 100644 --- a/src/miami/math/Matrix.cpp +++ b/src/miami/math/Matrix.cpp @@ -4,7 +4,7 @@ CMatrix::CMatrix(CMatrix const &m) { m_attachment = nil; m_hasRwMatrix = false; - *this = m; + mat_copy(*this, m); } CMatrix::CMatrix(RwMatrix *matrix, bool owner) @@ -75,7 +75,7 @@ CMatrix::UpdateRW(void) void CMatrix::operator=(CMatrix const &rhs) { - memcpy(this, &rhs, sizeof(f)); + mat_copy(*this, rhs); if (m_attachment) UpdateRW(); } @@ -83,7 +83,7 @@ CMatrix::operator=(CMatrix const &rhs) void CMatrix::CopyOnlyMatrix(const CMatrix &other) { - memcpy(this, &other, sizeof(f)); + mat_copy(*this, other); } CMatrix & @@ -358,12 +358,14 @@ CMatrix::RotateZ(float z) void CMatrix::Rotate(float x, float y, float z) { - float cX = Cos(x); - float sX = Sin(x); - float cY = Cos(y); - float sY = Sin(y); - float cZ = Cos(z); - float sZ = Sin(z); +#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results + mat_load(reinterpret_cast(this)); + mat_rotate(x, y, z); + mat_store(reinterpret_cast(this)); +#else + auto [sX, cX] = SinCos(x); + auto [sY, cY] = SinCos(y); + auto [sZ, cZ] = SinCos(z); float rx = this->rx; float ry = this->ry; @@ -388,6 +390,20 @@ CMatrix::Rotate(float x, float y, float z) float z2 = sZ * sY - (cZ * sX) * cY; float z3 = cX * cY; + #if !defined(DC_TEXCONV) && !defined(DC_SIM) + this->rx = fipr(x1, y1, z1, 0, rx, ry, rz, 0); + this->ry = fipr(x2, y2, z2, 0, rx, ry, rz, 0); + this->rz = fipr(x3, y3, z3, 0, rx, ry, rz, 0); + this->fx = fipr(x1, y1, z1, 0, ux, uy, uz, 0); + this->fy = fipr(x2, y2, z2, 0, ux, uy, uz, 0); + this->fz = fipr(x3, y3, z3, 0, ux, uy, uz, 0); + this->ux = fipr(x1, y1, z1, 0, ax, ay, az, 0); + this->uy = fipr(x2, y2, z2, 0, ax, ay, az, 0); + this->uz = fipr(x3, y3, z3, 0, ax, ay, az, 0); + this->px = fipr(x1, y1, z1, 0, px, py, pz, 0); + this->py = fipr(x2, y2, z2, 0, px, py, pz, 0); + this->pz = fipr(x3, y3, z3, 0, px, py, pz, 0); + #else this->rx = x1 * rx + y1 * ry + z1 * rz; this->ry = x2 * rx + y2 * ry + z2 * rz; this->rz = x3 * rx + y3 * ry + z3 * rz; @@ -400,6 +416,8 @@ CMatrix::Rotate(float x, float y, float z) this->px = x1 * px + y1 * py + z1 * pz; this->py = x2 * px + y2 * py + z2 * pz; this->pz = x3 * px + y3 * py + z3 * pz; + #endif +#endif } CMatrix & @@ -429,9 +447,7 @@ operator*(const CMatrix &m1, const CMatrix &m2) // TODO: VU0 code CMatrix out; #if defined(RW_DC) - mat_load(reinterpret_cast(&m1)); - mat_apply(reinterpret_cast(&m2)); - mat_store(reinterpret_cast(&out)); + mat_mult(out, m1, m2); #else out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz; out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz; diff --git a/src/miami/math/Matrix.h b/src/miami/math/Matrix.h index dd6f892e..456c6f05 100644 --- a/src/miami/math/Matrix.h +++ b/src/miami/math/Matrix.h @@ -43,6 +43,8 @@ public: SetScale(scale); } ~CMatrix(void); + operator matrix_t *() { return reinterpret_cast(this); } + operator const matrix_t *() const { return reinterpret_cast(this); } void Attach(RwMatrix *matrix, bool owner = false); void AttachRW(RwMatrix *matrix, bool owner = false); void Detach(void); @@ -75,20 +77,31 @@ public: void SetScale(float s); void Scale(float scale) { +#if !defined(DC_SH4) || 1 /* TODO */ for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) f[i][j] *= scale; +#else + mat_load(*this); + mat_scale(scale, scale, scale); + mat_store(*this); +#endif } void Scale(float sx, float sy, float sz) { +#if !defined(DC_SH4) || 1 /* TODO */ for (int i = 0; i < 3; i++){ - f[i][0] *= sx; - f[i][1] *= sy; - f[i][2] *= sz; + f[i][0] *= sx; + f[i][1] *= sy; + f[i][2] *= sz; } +#else + mat_load(*this); + mat_scale(sx, sy, sz); + mat_store(*this); +#endif } - void SetRotateXOnly(float angle); void SetRotateYOnly(float angle); void SetRotateZOnly(float angle); @@ -125,11 +138,22 @@ CMatrix Invert(const CMatrix &matrix); CMatrix operator*(const CMatrix &m1, const CMatrix &m2); inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec) { +#ifndef DC_SH4 CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz); return CVector( mat.rx * v.x + mat.ry * v.y + mat.rz * v.z, mat.fx * v.x + mat.fy * v.y + mat.fz * v.z, mat.ux * v.x + mat.uy * v.y + mat.uz * v.z); +#else + register float x asm(KOS_FPARG(0)) = vec.x - mat.px; + register float y asm(KOS_FPARG(1)) = vec.y - mat.py; + register float z asm(KOS_FPARG(2)) = vec.z - mat.pz; + return CVector( + fipr(x, y, z, 0.0f, mat.rx, mat.ry, mat.rz, 0.0f), + fipr(x, y, z, 0.0f, mat.fx, mat.fy, mat.fz, 0.0f), + fipr(x, y, z, 0.0f, mat.ux, mat.uy, mat.uz, 0.0f) + ); +#endif } diff --git a/src/miami/math/Quaternion.cpp b/src/miami/math/Quaternion.cpp index b0e782e2..9f55b234 100644 --- a/src/miami/math/Quaternion.cpp +++ b/src/miami/math/Quaternion.cpp @@ -39,10 +39,14 @@ CQuaternion::Slerp(const CQuaternion &q1, const CQuaternion &q2, float theta, fl void CQuaternion::Multiply(const CQuaternion &q1, const CQuaternion &q2) { +#ifndef DC_SH4 x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w); y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w); z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w); w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z); +#else + quat_mult(*this, q1, q2); +#endif } void @@ -51,9 +55,16 @@ CQuaternion::Get(RwV3d *axis, float *angle) *angle = Acos(w); float s = Sin(*angle); +#ifndef DC_SH4 axis->x = x * (1.0f / s); axis->y = y * (1.0f / s); axis->z = z * (1.0f / s); +#else + float invS = dc::Invert(s); + axis->x = x * invS; + axis->y = y * invS; + axis->z = z * invS; +#endif } void @@ -104,7 +115,7 @@ CQuaternion::Set(const RwMatrix &matrix) if (f >= 0.0f) { s = Sqrt(f + 1.0f); w = 0.5f * s; - m = 0.5f / s; + m = Div(0.5f, s); x = (matrix.up.z - matrix.at.y) * m; y = (matrix.at.x - matrix.right.z) * m; z = (matrix.right.y - matrix.up.x) * m; @@ -115,7 +126,7 @@ CQuaternion::Set(const RwMatrix &matrix) if (f >= 0.0f) { s = Sqrt(f + 1.0f); x = 0.5f * s; - m = 0.5f / s; + m = Div(0.5f, s); y = (matrix.up.x + matrix.right.y) * m; z = (matrix.at.x + matrix.right.z) * m; w = (matrix.up.z - matrix.at.y) * m; @@ -126,7 +137,7 @@ CQuaternion::Set(const RwMatrix &matrix) if (f >= 0.0f) { s = Sqrt(f + 1.0f); y = 0.5f * s; - m = 0.5f / s; + m = Div(0.5f, s); w = (matrix.at.x - matrix.right.z) * m; x = (matrix.up.x - matrix.right.y) * m; z = (matrix.at.y + matrix.up.z) * m; diff --git a/src/miami/math/Quaternion.h b/src/miami/math/Quaternion.h index 47581368..c1b0e481 100644 --- a/src/miami/math/Quaternion.h +++ b/src/miami/math/Quaternion.h @@ -1,6 +1,7 @@ #pragma once #include "src/common_defines.h" +#include "rwdc_common.h" // TODO: actually implement this class CQuaternion @@ -10,6 +11,10 @@ public: CQuaternion(void) {} CQuaternion(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {} + operator quaternion_t *() { return reinterpret_cast(this); } + operator const quaternion_t *() const { return reinterpret_cast(this); } + operator quaternion_t &() { return *reinterpret_cast(this); } + operator const quaternion_t &() const { return *reinterpret_cast(this); } float Magnitude(void) const { #ifndef DC_SH4 return Sqrt(x*x + y*y + z*z + w*w); @@ -57,10 +62,11 @@ public: } const CQuaternion &operator/=(float right) { - x /= right; - y /= right; - z /= right; - w /= right; + right = dc::Invert(right); + x *= right; + y *= right; + z *= right; + w *= right; return *this; } diff --git a/src/miami/math/Vector.cpp b/src/miami/math/Vector.cpp index a3510a6c..e2bc5f7e 100644 --- a/src/miami/math/Vector.cpp +++ b/src/miami/math/Vector.cpp @@ -50,9 +50,18 @@ Multiply3x3(const CMatrix &mat, const CVector &vec) CVector Multiply3x3(const CVector &vec, const CMatrix &mat) { +#ifndef DC_SH4 return CVector(mat.rx * vec.x + mat.ry * vec.y + mat.rz * vec.z, mat.fx * vec.x + mat.fy * vec.y + mat.fz * vec.z, mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z); +#else + CVector out; + mat_load(mat); + mat_transpose(); + mat_trans_vec3_nomod(vec.x, vec.y, vec.z, + out.x, out.y, out.z); + return out; +#endif } CVector diff --git a/src/miami/math/VuVector.h b/src/miami/math/VuVector.h index 1b184f97..3c5933c4 100644 --- a/src/miami/math/VuVector.h +++ b/src/miami/math/VuVector.h @@ -50,7 +50,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const CV sqc2 vf06,0x0(%0)\n\ ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); #elif defined(DC_SH4) - mat_load(reinterpret_cast(const_cast(&mat))); + mat_load(mat); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); #else out = mat * in; @@ -77,7 +77,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw sqc2 vf06,0x0(%0)\n\ ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); #elif defined(DC_SH4) - mat_load(reinterpret_cast(const_cast(&mat))); + mat_load(mat); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); #else out = mat * in; @@ -114,7 +114,7 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat, bnez %1,1b\n\ ": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory"); #elif defined(DC_SH4) - mat_load(reinterpret_cast(const_cast(&mat))); + mat_load(mat); while(n--) { mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z); in = reinterpret_cast(reinterpret_cast(in) + stride); @@ -137,7 +137,6 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat, lqc2 vf03,0x10(%2)\n\ lqc2 vf04,0x20(%2)\n\ lqc2 vf05,0x30(%2)\n\ - lqc2 vf01,0x0(%3)\n\ nop\n\ 1: vmulax.xyz ACC, vf02,vf01\n\ vmadday.xyz ACC, vf03,vf01\n\ diff --git a/src/miami/math/maths.h b/src/miami/math/maths.h index 6f2458a2..5e1cf016 100644 --- a/src/miami/math/maths.h +++ b/src/miami/math/maths.h @@ -1,46 +1,6 @@ #pragma once #include "src/common_defines.h" +#include "rwdc_common.h" -#include - -#ifdef DC_SH4 - -#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \ - register float __x __asm__("fr12") = (x); \ - register float __y __asm__("fr13") = (y); \ - register float __z __asm__("fr14") = (z); \ - register float __w __asm__("fr15") = 1.0f; \ - __asm__ __volatile__( "ftrv xmtrx, fv12\n" \ - : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \ - : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \ - x2 = __x; y2 = __y; z2 = __z; w2 = __w; \ - } while(false) - -#else - -#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \ - vector_t tmp = { x_, y_, z_, 1.0f }; \ - mat_transform(&tmp, &tmp, 1, 0); \ - x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; w2 = tmp.w; \ - } while(false) -#endif - - -// wrapper around float versions of functions -// in gta they are in CMaths but that makes the code rather noisy - -inline float Sin(float x) { return sinf(x); } -inline float Asin(float x) { return asinf(x); } -inline float Cos(float x) { return cosf(x); } -inline float Acos(float x) { return acosf(x); } -inline float Tan(float x) { return tanf(x); } -inline float Atan(float x) { return atanf(x); } -inline float Atan2(float y, float x) { return atan2f(y, x); } -inline float Abs(float x) { return fabsf(x); } -inline float Sqrt(float x) { return sqrtf(x); } -inline float RecipSqrt(float x, float y) { return x/Sqrt(y); } -inline float RecipSqrt(float x) { return RecipSqrt(1.0f, x); } -inline float Pow(float x, float y) { return powf(x, y); } -inline float Floor(float x) { return floorf(x); } -inline float Ceil(float x) { return ceilf(x); } +using namespace dc; diff --git a/vendor/dca3-kos b/vendor/dca3-kos index 5f69d048..144eaeab 160000 --- a/vendor/dca3-kos +++ b/vendor/dca3-kos @@ -1 +1 @@ -Subproject commit 5f69d048aaf2548687c33afc7726d603a22e29b7 +Subproject commit 144eaeab940246d9c84c1f14b45d9e2847705c5e diff --git a/vendor/librw/src/base.cpp b/vendor/librw/src/base.cpp index 391af2ab..a9b7b5d6 100644 --- a/vendor/librw/src/base.cpp +++ b/vendor/librw/src/base.cpp @@ -89,10 +89,18 @@ strncmp_ci(const char *s1, const char *s2, int n) Quat mult(const Quat &q, const Quat &p) { +#ifndef DC_SH4 return makeQuat(q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z, q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y, q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z, q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x); +#else + Quat o; + dc::quat_mult(reinterpret_cast(&o), + reinterpret_cast(q), + reinterpret_cast(p)); + return o; +#endif } diff --git a/vendor/librw/src/dc/rwdc.cpp b/vendor/librw/src/dc/rwdc.cpp index d282b317..d33de9b0 100644 --- a/vendor/librw/src/dc/rwdc.cpp +++ b/vendor/librw/src/dc/rwdc.cpp @@ -728,13 +728,6 @@ __always_inline void DCE_RenderSubmitVertexIM3D(float x, float y, float w, /* END TA Submission Functions*/ - - - -#if defined(DC_TEXCONV) -void malloc_stats() { } -#endif - #if 0 #define UNIMPL_LOG() printf("TODO: Implement %s @ %s:%d\n", __func__, __FILE__, __LINE__); #define UNIMPL_LOGV(fmt, ...) printf("TODO: Implement %s @ %s:%d " fmt "\n", __func__, __FILE__, __LINE__, __VA_ARGS__); diff --git a/vendor/librw/src/dc/rwdc_common.h b/vendor/librw/src/dc/rwdc_common.h new file mode 100644 index 00000000..e0f940b7 --- /dev/null +++ b/vendor/librw/src/dc/rwdc_common.h @@ -0,0 +1,739 @@ +#ifndef __RWDC_COMMON_H +#define __RWDC_COMMON_H + +#include +#include +#include +#include +#include +#include + +#ifndef RW_DC +# pragma warning(disable: 4244) // int to float +# pragma warning(disable: 4800) // int to bool +# pragma warning(disable: 4838) // narrowing conversion +# pragma warning(disable: 4996) // POSIX names +#else +# if !defined(DC_TEXCONV) && !defined(DC_SIM) +# include +# define DC_SH4 +# define VIDEO_MODE_WIDTH static_cast(vid_mode->width) +# define VIDEO_MODE_HEIGHT static_cast(vid_mode->height) +# define memcpy4 memcpy +# else +# ifdef DC_TEXCONV +# define malloc_stats() +# endif +# include +# include +# include +# define VIDEO_MODE_WIDTH 640.0f +# define VIDEO_MODE_HEIGHT 480.0f +# define memcpy4 memcpy +# define dcache_pref_block(a) __builtin_prefetch(a) +# define F_PI M_PI +# ifndef __always_inline +# define __always_inline inline +# endif +# endif +# ifdef PVR_TXRFMT_STRIDE +# undef PVR_TXRFMT_STRIDE +# define PVR_TXRFMT_STRIDE (1 << 25) +# endif + static_assert(PVR_TXRFMT_STRIDE == (1 << 25), + "PVR_TXRFMT_STRIDE is bugged in your KOS version"); +#endif + +#define F_PI_2 (F_PI * 0.5f) +#define __hot __attribute__((hot)) +#define __cold __attribute__((cold)) + +#define STRINGIFY(x) #x +#define STR(x) STRINGIFY(x) +#define CONCAT_(x,y) x##y +#define CONCAT(x,y) CONCAT_(x,y) + +namespace rw { + class Matrix; +} + +namespace dc { + +struct quaternion_t { + float x, y, z, w; +}; + +__always_inline __hot constexpr float Sin(float x) { return sinf(x); } +__always_inline __hot constexpr float Cos(float x) { return cosf(x); } +__always_inline __hot constexpr auto SinCos(float x) { return std::pair { Sin(x), Cos(x) }; } +__always_inline __hot constexpr float Abs(float x) { return fabsf(x); } +__always_inline __hot constexpr float Sqrt(float x) { return sqrtf(x); } +__always_inline __hot constexpr float RecipSqrt(float x, float y) { return x / Sqrt(y); } +__always_inline __hot constexpr float Pow(float x, float y) { return powf(x, y); } +__always_inline __hot constexpr float Floor(float x) { return floorf(x); } +__always_inline __hot constexpr float Ceil(float x) { return ceilf(x); } +__always_inline __hot constexpr float Fmac(auto a, auto b, auto c) { return a * b + c; } +__always_inline __hot constexpr float Lerp(float a, float b, float t) { return Fmac(t, (b - a), a); } +__always_inline __hot constexpr auto Max(auto a, auto b) { return ((a > b)? a : b); } +__always_inline __hot constexpr auto Min(auto a, auto b) { return ((a < b)? a : b); } + +template +__always_inline __hot constexpr float RecipSqrt(float x) { + if constexpr(CHECK_ZERO) + if(x == 0.0f) + return 0.0f; + + return 1.0f / Sqrt(x); +} + +template +__always_inline __hot constexpr T Clamp(T v, auto low, auto high) { + return std::clamp(v, static_cast(low), static_cast(high)); +} + +__always_inline constexpr auto Clamp2(auto v, auto center, auto radius) { + return (v > center) ? Min(v, center + radius) : Max(v, center - radius); +} + +template +__always_inline __hot constexpr float Invert(float x) { + float value; + + if(!std::is_constant_evaluated() && FAST_APPROX) { + value = RecipSqrt(x * x); + + if constexpr(COPY_SIGN) + if(x < 0.0f) + value = -value; + + } else value = 1.0f / x; + + return value; +} + +template +__always_inline __hot constexpr float Div(float x, float y) { + if(FAST_APPROX && !std::is_constant_evaluated()) + return x * Invert(y); + else + return x / y; +} + +template +__always_inline __hot constexpr auto Norm(auto value, auto min, auto max) { + auto numerator = Clamp(value, min, max) - min; + auto denominator = (max - min); + + if(FAST_APPROX && !std::is_constant_evaluated()) + return Div(numerator, denominator); + else + return numerator / denominator; +} + +template +__always_inline __hot constexpr float Tan(float x) { + if(!std::is_constant_evaluated() && FAST_APPROX) { + constexpr float pisqby4 = 2.4674011002723397f; + constexpr float adjpisqby4 = 2.471688400562703f; + constexpr float adj1minus8bypisq = 0.189759681063053f; + float xsq = x * x; + + return x * Div(adjpisqby4 - adj1minus8bypisq * xsq, + pisqby4 - xsq); + } else + return tanf(x); +} + +template +__always_inline __hot constexpr float Atan(float x) { + if(FAST_APPROX && !std::is_constant_evaluated()) { + constexpr float a[3] = { // + 0.998418889819911f, -2.9993501171084700E-01f, 0.0869142852883849f}; + float xx = x * x; + return ((a[2] * xx + a[1]) * xx + a[0]) * x; + } else return atanf(x); +} + +template +__hot constexpr float Atan2(float y, float x) { + if(FAST_APPROX && !std::is_constant_evaluated()) { +#if 0 + constexpr float halfpi_i754 = M_PI * 0.5f; + constexpr float quarterpi_i754 = M_PI * 0.25f; + // kludge to prevent 0/0 condition + float abs_y = Abs(y) + std::numeric_limits::epsilon(); + float absy_plus_absx = abs_y + Abs(x); + float inv_absy_plus_absx = Invert(absy_plus_absx); + float angle = halfpi_i754 - copysignf(quarterpi_i754, x); + float r = (x - copysignf(abs_y, x)) * inv_absy_plus_absx; + angle += (0.1963f * r * r - 0.9817f) * r; + return copysignf(angle, y); +#else + // Ensure input is in [-1, +1] + bool swap = fabs(x) < fabs(y); + float atan_input = (swap ? x : y) / (swap ? y : x); + + // Approximate atan + float res = Atan(atan_input); + + // If swapped, adjust atan output + res = swap ? (atan_input >= 0.0f ? F_PI_2 : -F_PI_2) - res : res; + // Adjust quadrants + if (x >= 0.0f && y >= 0.0f) {} // 1st quadrant + else if (x < 0.0f && y >= 0.0f) { res = F_PI + res; } // 2nd quadrant + else if (x < 0.0f && y < 0.0f) { res = -F_PI + res; } // 3rd quadrant + else if (x >= 0.0f && y < 0.0f) {} // 4th quadrant + + // Store result + return res; +#endif + } else return atan2f(y, x); +} + +template +__always_inline __hot constexpr float Asin(float x) { + if(FAST_APPROX && !std::is_constant_evaluated()) { + Atan(Div(x, Sqrt(1.0f-(x*x)))); + } else return asinf(x); +} + +template +__always_inline __hot constexpr float Acos(float x) { + if(FAST_APPROX && !std::is_constant_evaluated()) { + return (-0.69813170079773212f * x * x - 0.87266462599716477f) * x + 1.5707963267948966f; + } else return acosf(x); +} + +#ifdef RW_DC +# ifdef DC_SH4 + +#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \ + register float __x __asm__("fr12") = (x); \ + register float __y __asm__("fr13") = (y); \ + register float __z __asm__("fr14") = (z); \ + register float __w __asm__("fr15") = 1.0f; \ + __asm__ __volatile__( "ftrv xmtrx, fv12\n" \ + : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \ + : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \ + x2 = __x; y2 = __y; z2 = __z; w2 = __w; \ + } while(false) + +#define mat_trans_nodiv_nomod_zerow(x, y, z, x2, y2, z2, w2) do { \ + register float __x __asm__("fr12") = (x); \ + register float __y __asm__("fr13") = (y); \ + register float __z __asm__("fr14") = (z); \ + register float __w __asm__("fr15") = 0.0f; \ + __asm__ __volatile__( "ftrv xmtrx, fv12\n" \ + : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \ + : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \ + x2 = __x; y2 = __y; z2 = __z; w2 = __w; \ + } while(false) + +#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \ + register float __x __asm__("fr12") = (x); \ + register float __y __asm__("fr13") = (y); \ + register float __z __asm__("fr14") = (z); \ + register float __w __asm__("fr15") = 1.0f; \ + __asm__ __volatile__( "ftrv xmtrx, fv12\n" \ + : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \ + : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \ + w = __w; \ + } while(false) + +#define mat_trans_vec3_nomod(x, y, z, x2, y2, z2) { \ + register float __x __asm__("fr12") = (x); \ + register float __y __asm__("fr13") = (y); \ + register float __z __asm__("fr14") = (z); \ + __asm__ __volatile__( \ + "fldi0 fr15\n" \ + "ftrv xmtrx, fv8\n" \ + : "=f" (__x), "=f" (__y), "=f" (__z) \ + : "0" (__x), "1" (__y), "2" (__z) \ + : "fr15" ); \ + x2 = __x; y2 = __y; z2 = __z; \ + } + +// no declspec naked, so can't do rts / fschg. instead compiler pads with nop? +inline __hot void mat_load_3x3(const matrix_t* mtx) { + __asm__ __volatile__ ( + R"( + fschg + frchg + + fmov @%[mtx]+, dr0 + fldi0 fr12 + + fmov @%[mtx]+, dr2 + fldi0 fr13 + + fmov @%[mtx]+, dr4 + fldi0 fr3 + + fmov @%[mtx]+, dr6 + fmov dr12, dr14 + + fmov @%[mtx]+, dr8 + fldi0 fr7 + + fmov @%[mtx]+, dr10 + fldi0 fr11 + + fschg + frchg + )" + : [mtx] "+r" (mtx) + ); +} + +// sets pos.w to 1 +inline __hot void rw_mat_load_4x4(const rw::Matrix* mtx) { + __asm__ __volatile__ ( + R"( + fschg + frchg + fmov @%[mtx]+, dr0 + + fmov @%[mtx]+, dr2 + fmov @%[mtx]+, dr4 + fmov @%[mtx]+, dr6 + fmov @%[mtx]+, dr8 + fmov @%[mtx]+, dr10 + fmov @%[mtx]+, dr12 + fmov @%[mtx]+, dr14 + fldi1 fr15 + + fschg + frchg + )" + : [mtx] "+r" (mtx) + ); +} + +__hot inline void mat_transpose(void) { + asm volatile ( + "frchg\n\t" // fmov for singles only works on front bank + // FR0, FR5, FR10, and FR15 are already in place + // swap FR1 and FR4 + "flds FR1, FPUL\n\t" + "fmov FR4, FR1\n\t" + "fsts FPUL, FR4\n\t" + // swap FR2 and FR8 + "flds FR2, FPUL\n\t" + "fmov FR8, FR2\n\t" + "fsts FPUL, FR8\n\t" + // swap FR3 and FR12 + "flds FR3, FPUL\n\t" + "fmov FR12, FR3\n\t" + "fsts FPUL, FR12\n\t" + // swap FR6 and FR9 + "flds FR6, FPUL\n\t" + "fmov FR9, FR6\n\t" + "fsts FPUL, FR9\n\t" + // swap FR7 and FR13 + "flds FR7, FPUL\n\t" + "fmov FR13, FR7\n\t" + "fsts FPUL, FR13\n\t" + // swap FR11 and FR14 + "flds FR11, FPUL\n\t" + "fmov FR14, FR11\n\t" + "fsts FPUL, FR14\n\t" + // restore XMTRX to back bank + "frchg\n" + : // no outputs + : // no inputs + : "fpul" // clobbers + ); +} + +__attribute__((optimize("align-functions=32"))) +__hot inline void mat_copy(matrix_t *dst, const matrix_t *src) { + asm volatile(R"( + fschg + + pref @%[dst] + + fmov.d @%[src]+, xd0 + fmov.d @%[src]+, xd2 + fmov.d @%[src]+, xd4 + fmov.d @%[src]+, xd6 + + pref @%[src] + + add #32, %[dst] + + fmov.d xd6, @-%[dst] + fmov.d xd4, @-%[dst] + fmov.d xd2, @-%[dst] + fmov.d xd0, @-%[dst] + + add #32, %[dst] + pref @%[dst] + + fmov.d @%[src]+, xd0 + fmov.d @%[src]+, xd2 + fmov.d @%[src]+, xd4 + fmov.d @%[src]+, xd6 + + add #32, %[dst] + + fmov.d xd6, @-%[dst] + fmov.d xd4, @-%[dst] + fmov.d xd2, @-%[dst] + fmov.d xd0, @-%[dst] + + fschg + )": [dst] "+&r" (dst), [src] "+&r" (src) + : + : "memory"); +} + +template +__hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) { + if(FAST_APPROX && !std::is_constant_evaluated()) { + /* + // reorder the coefficients so that q1 stays in constant order {x,y,z,w} + // q2 then needs to be rotated after each inner product + x = (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x); + y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y); + z = (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z); + w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w); + */ + // keep q1 in fv4 + register float q1x __asm__ ("fr4") = (q1.x); + register float q1y __asm__ ("fr5") = (q1.y); + register float q1z __asm__ ("fr6") = (q1.z); + register float q1w __asm__ ("fr7") = (q1.w); + + // load q2 into fv8, use it to get the shuffled reorder into fv0 + register float q2x __asm__ ("fr8") = (q2.x); + register float q2y __asm__ ("fr9") = (q2.y); + register float q2z __asm__ ("fr10") = (q2.z); + register float q2w __asm__ ("fr11") = (q2.w); + + // temporary operand / result in fv0 + register float t1x __asm__ ("fr0"); + register float t1y __asm__ ("fr1"); + register float t1z __asm__ ("fr2"); + register float t1w __asm__ ("fr3"); + + // x = (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x); + t1x = q2w; + t1y = q2z; + t1z = -q2y; + t1w = q2w; + __asm__ ("\n" + " fipr fv4,fv0\n" + : "+f" (t1w) + : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), + "f" (t1x), "f" (t1y), "f" (t1z) + ); + // x = t1w; try to avoid the stall by not reading the fipr result immediately + + // y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y); + t1x = -q2z; + t1y = q2w; + t1z = q2x; + __atomic_thread_fence(1); + r->x = t1w; // get previous result + t1w = q2y; + __asm__ ("\n" + " fipr fv4,fv0\n" + : "+f" (t1w) + : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), + "f" (t1x), "f" (t1y), "f" (t1z) + ); + //y = t1w; + + // z = (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z); + t1x = q2y; + t1y = -q2x; + t1z = q2w; + __atomic_thread_fence(1); + r->y = t1w; // get previous result + t1w = q2z; + __asm__ ("\n" + " fipr fv4,fv0\n" + : "+f" (t1w) + : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), + "f" (t1x), "f" (t1y), "f" (t1z) + ); + //z = t1w; + __atomic_thread_fence(1); + + // w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w); + q2x = -q2x; + q2y = -q2y; + q2z = -q2z; + __asm__ ("\n" + " fipr fv4,fv8\n" + : "+f" (q2w) + : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), + "f" (q2x), "f" (q2y), "f" (q2z) + ); + + __atomic_thread_fence(1); + r->z = t1w; + __atomic_thread_fence(1); + r->w = q2w; + } else { + r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w); + r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w); + r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w); + r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z); + } +} + +__hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) { + unsigned int prefetch_scratch; + + asm volatile ( + "mov %[bmtrx], %[pref_scratch]\n\t" // (MT) + "add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle) + "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE) + "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS) + // back matrix + "fmov.d @%[bmtrx]+, XD0\n\t" // (LS) + "fmov.d @%[bmtrx]+, XD2\n\t" + "fmov.d @%[bmtrx]+, XD4\n\t" + "fmov.d @%[bmtrx]+, XD6\n\t" + "pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait (LS) + "fmov.d @%[bmtrx]+, XD8\n\t" // bmtrx prefetch should work for here + "fmov.d @%[bmtrx]+, XD10\n\t" + "fmov.d @%[bmtrx]+, XD12\n\t" + "mov %[fmtrx], %[pref_scratch]\n\t" // (MT) + "add #32, %[pref_scratch]\n\t" // store offset by 32 in r0 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle) + "fmov.d @%[bmtrx], XD14\n\t" + "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS) + // front matrix + // interleave loads and matrix multiply 4x4 + "fmov.d @%[fmtrx]+, DR0\n\t" + "fmov.d @%[fmtrx]+, DR2\n\t" + "fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec + "ftrv XMTRX, FV0\n\t" // (FE) + + "fmov.d @%[fmtrx]+, DR6\n\t" + "fmov.d @%[fmtrx]+, DR8\n\t" + "ftrv XMTRX, FV4\n\t" + + "fmov.d @%[fmtrx]+, DR10\n\t" + "fmov.d @%[fmtrx]+, DR12\n\t" + "ftrv XMTRX, FV8\n\t" + + "fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles) + "fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE) + "ftrv XMTRX, FV12\n\t" // (FE) + // Save output in XF regs + "frchg\n" + : [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed + : // no inputs + : "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered) + ); +} + +__hot inline void mat_apply_rotate_x(float x) { + x *= 10430.37835f; + asm volatile( + "ftrc %[a], fpul\n\t" + "fsca fpul, dr4\n\t" + "fldi0 fr8\n\t" + "fldi0 fr11\n\t" + "fmov fr5, fr10\n\t" + "fmov fr4, fr9\n\t" + "fneg fr9\n\t" + "ftrv xmtrx, fv8\n\t" + "fmov fr4, fr6\n\t" + "fldi0 fr7\n\t" + "fldi0 fr4\n\t" + "ftrv xmtrx, fv4\n\t" + "fschg\n\t" + "fmov dr8, xd8\n\t" + "fmov dr10, xd10\n\t" + "fmov dr4, xd4\n\t" + "fmov dr6, xd6\n\t" + "fschg\n" + : + : [a] "f"(x) + : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11"); +} + +__hot inline void mat_apply_rotate_y(float y) { + y *= 10430.37835f; + asm volatile( + "ftrc %[a], fpul\n\t" + "fsca fpul, dr6\n\t" + "fldi0 fr9\n\t" + "fldi0 fr11\n\t" + "fmov fr6, fr8\n\t" + "fmov fr7, fr10\n\t" + "ftrv xmtrx, fv8\n\t" + "fmov fr7, fr4\n\t" + "fldi0 fr5\n\t" + "fneg fr6\n\t" + "fldi0 fr7\n\t" + "ftrv xmtrx, fv4\n\t" + "fschg\n\t" + "fmov dr8, xd8\n\t" + "fmov dr10, xd10\n\t" + "fmov dr4, xd0\n\t" + "fmov dr6, xd2\n\t" + "fschg\n" + : + : [a] "f"(y) + : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11"); +} + +__hot inline void mat_apply_rotate_z(float z) { + z *= 10430.37835f; + asm volatile( + "ftrc %[a], fpul\n\t" + "fsca fpul, dr8\n\t" + "fldi0 fr10\n\t" + "fldi0 fr11\n\t" + "fmov fr8, fr5\n\t" + "fneg fr8\n\t" + "ftrv xmtrx, fv8\n\t" + "fmov fr9, fr4\n\t" + "fschg\n\t" + "fmov dr10, dr6\n\t" + "ftrv xmtrx, fv4\n\t" + "fmov dr8, xd4\n\t" + "fmov dr10, xd6\n\t" + "fmov dr4, xd0\n\t" + "fmov dr6, xd2\n\t" + "fschg\n" + : + : [a] "f"(z) + : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11"); +} + +# else +# ifdef DC_TEXCONV +# define mat_transform(a, b, c, d) +# define mat_apply(a) +# define mat_load(a) +# define mat_store(a) +# define mat_identity(a) +# define pvr_fog_table_color(a,r,g,b) +# define pvr_fog_table_linear(s,e) +# endif + +#define mat_trans_vec3_nomod(x_, y_, z_, x2, y2, z2) do { \ + vector_t tmp = { x_, y_, z_, 0.0f }; \ + mat_transform(&tmp, &tmp, 1, 0); \ + x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; \ + } while(false) + +#define mat_trans_single3_nomod(x_, y_, z_, x2, y2, z2) do { \ + vector_t tmp = { x_, y_, z_, 1.0f }; \ + mat_transform(&tmp, &tmp, 1, 0); \ + z2 = 1.0f / tmp.w; \ + x2 = tmp.x * z2; \ + y2 = tmp.y * z2; \ + } while(false) + +#define mat_trans_single3_nodiv_nomod(x_, y_, z_, x2, y2, z2) do { \ + vector_t tmp = { x_, y_, z_, 1.0f }; \ + mat_transform(&tmp, &tmp, 1, 0); \ + z2 = tmp.z; \ + x2 = tmp.x; \ + y2 = tmp.y; \ + } while(false) + +#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \ + vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \ + mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ + x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \ + } while(false) + +#define mat_trans_nodiv_nomod_zerow(x_, y_, z_, x2, y2, z2, w2) do { \ + vector_t tmp1233123 = { x_, y_, z_, 0.0f }; \ + mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ + x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \ + } while(false) + +#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \ + vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \ + mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ + w_ = tmp1233123.w; \ + } while(false) + +inline void mat_load_3x3(const matrix_t* mtx) { + memcpy(XMTRX, mtx, sizeof(matrix_t)); + XMTRX[0][3] = 0.0f; + XMTRX[1][3] = 0.0f; + XMTRX[2][3] = 0.0f; + + XMTRX[3][0] = 0.0f; + XMTRX[3][1] = 0.0f; + XMTRX[3][2] = 0.0f; + XMTRX[3][3] = 0.0f; +} + +inline void rw_mat_load_4x4(const rw::Matrix* mtx) { + memcpy(XMTRX, mtx, sizeof(matrix_t)); + XMTRX[3][3] = 1.0f; +} + +inline void mat_transpose(void) { + matrix_t tmp; + + for(unsigned i = 0; i < 4; ++i) + for(unsigned j = 0; j < 4; ++j) + tmp[j][i] = XMTRX[i][j]; + + mat_load(&tmp); +} + +template +__hot inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) { + r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w); + r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w); + r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w); + r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z); +} + +__hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) { + mat_load(matrix1); + mat_apply(matrix2); +} + +__always_inline __hot void mat_copy(matrix_t *dst, const matrix_t *src) { + mat_load(src); + mat_store(dst); +} + +#endif + +__hot inline void mat_mult(matrix_t *out, const matrix_t* matrix1, const matrix_t* matrix2) { + mat_load_apply(matrix1, matrix2); + mat_store(out); +} + +#if 0 +template +__always_inline __hot +auto vec3_dot_prod(const vec3_t &v1, vec3_t const (&v2)[N]) { + std::array result; + +#ifdef DC_SH4 + register float x asm(KOS_FPARG(0)) = v1.x; + register float y asm(KOS_FPARG(1)) = v1.y; + register float z asm(KOS_FPARG(2)) = v1.z; +#else + float x = v1.x; + float y = v1.y; + float z = v1.z; +#endif + + for(std::size_t v = 0; v < N; ++v) + result[v] = fipr(x, y, z, 0.0f, v2[v].x, v2[v].y, v2[v].z, 0.0f); + + return result; +} +#endif + +#endif + +} + +#endif /* RWDC_COMMON_H */ \ No newline at end of file diff --git a/vendor/librw/src/rwbase.h b/vendor/librw/src/rwbase.h index 4274be0e..8b567709 100644 --- a/vendor/librw/src/rwbase.h +++ b/vendor/librw/src/rwbase.h @@ -1,6 +1,9 @@ #pragma once #include "common_defines.h" +#ifdef RW_DC +#include "rwdc_common.h" +#endif #ifndef RW_PS2 #include @@ -322,7 +325,13 @@ inline float32 dot(const Quat &q, const Quat &p) { #endif } inline Quat scale(const Quat &q, float32 r) { return makeQuat(q.w*r, q.x*r, q.y*r, q.z*r); } -inline float32 length(const Quat &q) { return sqrtf(q.w*q.w + q.x*q.x + q.y*q.y + q.z*q.z); } +inline float32 length(const Quat &q) { +#ifndef DC_SH4 + return sqrtf(q.w*q.w + q.x*q.x + q.y*q.y + q.z*q.z); +#else + return dc::Sqrt(fipr_magnitude_sqr(q.x, q.y, q.z, q.w)); +#endif +} inline Quat normalize(const Quat &q) { return scale(q, 1.0f/length(q)); } inline Quat conj(const Quat &q) { return makeQuat(q.w, -q.x, -q.y, -q.z); } Quat mult(const Quat &q, const Quat &p); From dc96ffc5517d18c7ffcc26ba600b0e3716ca77d6 Mon Sep 17 00:00:00 2001 From: Falco Girgis Date: Tue, 15 Apr 2025 13:53:19 -0500 Subject: [PATCH 2/6] FIXED LIBERTY BOAT ROTATION ISSUES. --- src/liberty/math/Matrix.cpp | 2 +- src/liberty/math/Matrix.h | 8 +------- src/liberty/math/Quaternion.h | 5 +++-- src/liberty/math/Vector.cpp | 24 +++++++++--------------- src/liberty/math/Vector.h | 5 +++-- 5 files changed, 17 insertions(+), 27 deletions(-) diff --git a/src/liberty/math/Matrix.cpp b/src/liberty/math/Matrix.cpp index a919baaf..5ac7d4c4 100644 --- a/src/liberty/math/Matrix.cpp +++ b/src/liberty/math/Matrix.cpp @@ -454,7 +454,7 @@ operator*(const CMatrix &m1, const CMatrix &m2) { // TODO: VU0 code CMatrix out; -#if defined(RW_DC) +#ifdef DC_SH4 mat_mult(out, m1, m2); #else out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz; diff --git a/src/liberty/math/Matrix.h b/src/liberty/math/Matrix.h index 06dab72a..295d5540 100644 --- a/src/liberty/math/Matrix.h +++ b/src/liberty/math/Matrix.h @@ -63,7 +63,6 @@ public: void SetScale(float s); void Scale(float scale) { -#ifndef DC_SH4 for (int i = 0; i < 3; i++) #ifdef FIX_BUGS // BUGFIX from VC for (int j = 0; j < 3; j++) @@ -71,11 +70,6 @@ public: for (int j = 0; j < 4; j++) #endif f[i][j] *= scale; -#else - mat_load(*this); - mat_scale(scale, scale, scale); - mat_store(*this); -#endif } @@ -112,7 +106,7 @@ CMatrix Invert(const CMatrix &matrix); CMatrix operator*(const CMatrix &m1, const CMatrix &m2); inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec) { -#ifndef DC_SH4 +#if 1 CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz); return CVector( mat.rx * v.x + mat.ry * v.y + mat.rz * v.z, diff --git a/src/liberty/math/Quaternion.h b/src/liberty/math/Quaternion.h index c1b0e481..185adb7c 100644 --- a/src/liberty/math/Quaternion.h +++ b/src/liberty/math/Quaternion.h @@ -62,7 +62,7 @@ public: } const CQuaternion &operator/=(float right) { - right = dc::Invert(right); + right = dc::Invert(right); x *= right; y *= right; z *= right; @@ -115,5 +115,6 @@ inline CQuaternion operator*(float left, const CQuaternion &right) inline CQuaternion operator/(const CQuaternion &left, float right) { - return CQuaternion(left.x / right, left.y / right, left.z / right, left.w / right); + right = Invert(right); + return CQuaternion(left.x * right, left.y * right, left.z * right, left.w * right); } diff --git a/src/liberty/math/Vector.cpp b/src/liberty/math/Vector.cpp index 4aebb789..91ac44fe 100644 --- a/src/liberty/math/Vector.cpp +++ b/src/liberty/math/Vector.cpp @@ -27,23 +27,17 @@ CrossProduct(const CVector &v1, const CVector &v2) CVector Multiply3x3(const CMatrix &mat, const CVector &vec) { -#ifdef DC_SH4 - register float __x __asm__("fr12") = vec.x; - register float __y __asm__("fr13") = vec.y; - register float __z __asm__("fr14") = vec.z; - register float __w __asm__("fr15") = 0.0f; - - mat_load(reinterpret_cast(const_cast(&mat))); - - asm volatile( "ftrv xmtrx, fv12\n" - : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) - : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); - return { __x, __y, __z }; -#else +#ifndef DC_SH4 // TODO: VU0 code return CVector(mat.rx * vec.x + mat.fx * vec.y + mat.ux * vec.z, mat.ry * vec.x + mat.fy * vec.y + mat.uy * vec.z, mat.rz * vec.x + mat.fz * vec.y + mat.uz * vec.z); +#else + CVector out; + mat_load(mat); + mat_trans_normal3_nomod(vec.x, vec.y, vec.z, + out.x, out.y, out.z); + return out; #endif } @@ -58,8 +52,8 @@ Multiply3x3(const CVector &vec, const CMatrix &mat) CVector out; mat_load(mat); mat_transpose(); - mat_trans_vec3_nomod(vec.x, vec.y, vec.z, - out.x, out.y, out.z); + mat_trans_normal3_nomod(vec.x, vec.y, vec.z, + out.x, out.y, out.z); return out; #endif } diff --git a/src/liberty/math/Vector.h b/src/liberty/math/Vector.h index c0996510..1501512d 100644 --- a/src/liberty/math/Vector.h +++ b/src/liberty/math/Vector.h @@ -68,7 +68,7 @@ public: } const CVector &operator/=(float right) { - right = Invert(right); + right = Invert(right); x *= right; y *= right; z *= right; @@ -112,7 +112,8 @@ inline CVector operator*(float left, const CVector &right) inline CVector operator/(const CVector &left, float right) { - return CVector(left.x / right, left.y / right, left.z / right); + right = Invert(right); + return CVector(left.x * right, left.y * right, left.z * right); } __always_inline float From 2716147db45d180979f547926bb575d5ee349369 Mon Sep 17 00:00:00 2001 From: Falco Girgis Date: Sat, 26 Apr 2025 10:52:36 -0500 Subject: [PATCH 3/6] Everything tested/working: liberty/miami + gainz! --- src/liberty/collision/Collision.cpp | 12 +- src/liberty/math/Matrix.cpp | 16 +- src/liberty/math/Matrix.h | 4 +- src/liberty/math/Vector.cpp | 6 +- src/liberty/math/VuVector.h | 6 +- src/miami/collision/Collision.cpp | 12 +- src/miami/math/Matrix.cpp | 4 +- src/miami/math/Matrix.h | 12 - src/miami/math/Quaternion.h | 5 +- src/miami/math/Vector.cpp | 28 +- src/miami/math/VuVector.h | 8 +- vendor/dca3-kos | 2 +- vendor/librw/src/base.cpp | 4 +- vendor/librw/src/camera.cpp | 206 ++++++++++++++- vendor/librw/src/dc/rwdc.cpp | 236 ++--------------- vendor/librw/src/dc/rwdc_common.h | 396 ++++++++++++++-------------- 16 files changed, 476 insertions(+), 481 deletions(-) diff --git a/src/liberty/collision/Collision.cpp b/src/liberty/collision/Collision.cpp index 3d85f0c4..05185f9e 100644 --- a/src/liberty/collision/Collision.cpp +++ b/src/liberty/collision/Collision.cpp @@ -1619,7 +1619,7 @@ CCollision::ProcessLineOfSight(const CColLine &line, point.point = matrix * point.point; point.normal = Multiply3x3(matrix, point.normal); #else - mat_load(reinterpret_cast(const_cast(&matrix))); + dc::mat_load2(matrix); mat_trans_single3_nodiv(point.point.x, point.point.y, point.point.z); @@ -1798,7 +1798,7 @@ CCollision::ProcessVerticalLine(const CColLine &line, point.point = matrix * point.point; point.normal = Multiply3x3(matrix, point.normal); #else - mat_load(reinterpret_cast(const_cast(&matrix))); + dc::mat_load2(matrix); mat_trans_single3_nodiv(point.point.x, point.point.y, point.point.z); @@ -2173,8 +2173,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA, #ifndef DC_SH4 matAB *= matrixA; #else - mat_load(reinterpret_cast(&matAB)); - mat_apply(reinterpret_cast(&matrixA)); + dc::mat_load_apply(matAB, matrixA); #endif CColSphere bsphereAB; // bounding sphere of A in B space @@ -2246,8 +2245,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA, #ifndef DC_SH4 matBA *= matrixB; #else - mat_load(reinterpret_cast(&matBA)); - mat_apply(reinterpret_cast(&matrixB)); + dc::mat_load_apply(matBA, matrixB); #endif for(i = 0; i < modelB.numSpheres; i++){ s.radius = modelB.spheres[i].radius; @@ -2309,7 +2307,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA, } #ifdef DC_SH4 - mat_load(reinterpret_cast(const_cast(&matrixB))); + dc::mat_load2(matrixB); #endif for(i = 0; i < numCollisions; i++){ #ifndef DC_SH4 diff --git a/src/liberty/math/Matrix.cpp b/src/liberty/math/Matrix.cpp index 5ac7d4c4..a4b6c73f 100644 --- a/src/liberty/math/Matrix.cpp +++ b/src/liberty/math/Matrix.cpp @@ -277,9 +277,9 @@ void CMatrix::RotateX(float x) { #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - mat_load(reinterpret_cast(this)); + dc::mat_load2(reinterpret_cast(this)); mat_rotate_x(x); - mat_store(reinterpret_cast(this)); + dc::mat_store2(reinterpret_cast(this)); #else auto [s, c] = SinCos(x); @@ -307,9 +307,9 @@ void CMatrix::RotateY(float y) { #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - mat_load(reinterpret_cast(this)); + dc::mat_load2(reinterpret_cast(this)); mat_rotate_y(y); - mat_store(reinterpret_cast(this)); + dc::mat_store2(reinterpret_cast(this)); #else auto [s, c] = SinCos(y); @@ -337,9 +337,9 @@ void CMatrix::RotateZ(float z) { #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - mat_load(reinterpret_cast(this)); + dc::mat_load2(reinterpret_cast(this)); mat_rotate_z(z); - mat_store(reinterpret_cast(this)); + dc::mat_store2(reinterpret_cast(this)); #else auto [s, c] = SinCos(z); @@ -367,9 +367,9 @@ void CMatrix::Rotate(float x, float y, float z) { #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - mat_load(reinterpret_cast(this)); + dc::mat_load2(reinterpret_cast(this)); mat_rotate(x, y, z); - mat_store(reinterpret_cast(this)); + dc::mat_store2(reinterpret_cast(this)); #else auto [sX, cX] = SinCos(x); auto [sY, cY] = SinCos(y); diff --git a/src/liberty/math/Matrix.h b/src/liberty/math/Matrix.h index 295d5540..50f4031e 100644 --- a/src/liberty/math/Matrix.h +++ b/src/liberty/math/Matrix.h @@ -106,7 +106,7 @@ CMatrix Invert(const CMatrix &matrix); CMatrix operator*(const CMatrix &m1, const CMatrix &m2); inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec) { -#if 1 +#ifndef DC_SH4 CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz); return CVector( mat.rx * v.x + mat.ry * v.y + mat.rz * v.z, @@ -124,8 +124,6 @@ inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec) #endif } - - class CCompressedMatrixNotAligned { CVector m_vecPos; diff --git a/src/liberty/math/Vector.cpp b/src/liberty/math/Vector.cpp index 91ac44fe..4f5a21ed 100644 --- a/src/liberty/math/Vector.cpp +++ b/src/liberty/math/Vector.cpp @@ -34,7 +34,7 @@ Multiply3x3(const CMatrix &mat, const CVector &vec) mat.rz * vec.x + mat.fz * vec.y + mat.uz * vec.z); #else CVector out; - mat_load(mat); + dc::mat_load2(mat); mat_trans_normal3_nomod(vec.x, vec.y, vec.z, out.x, out.y, out.z); return out; @@ -50,7 +50,7 @@ Multiply3x3(const CVector &vec, const CMatrix &mat) mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z); #else CVector out; - mat_load(mat); + dc::mat_load2(mat); mat_transpose(); mat_trans_normal3_nomod(vec.x, vec.y, vec.z, out.x, out.y, out.z); @@ -63,7 +63,7 @@ operator*(const CMatrix &mat, const CVector &vec) { #ifdef DC_SH4 CVector out; - mat_load(reinterpret_cast(const_cast(&mat))); + dc::mat_load2(mat); mat_trans_single3_nodiv_nomod(vec.x, vec.y, vec.z, out.x, out.y, out.z); return out; #else diff --git a/src/liberty/math/VuVector.h b/src/liberty/math/VuVector.h index f2f4a8e4..b8228ec8 100644 --- a/src/liberty/math/VuVector.h +++ b/src/liberty/math/VuVector.h @@ -44,7 +44,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const CV sqc2 vf06,0x0(%0)\n\ ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); #elif defined(DC_SH4) - mat_load(mat); + dc::mat_load2(mat); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); #else out = mat * in; @@ -71,7 +71,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw sqc2 vf06,0x0(%0)\n\ ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); #elif defined(DC_SH4) - mat_load(mat); + dc::mat_load2(mat); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); #else out = mat * in; @@ -108,7 +108,7 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat, bnez %1,1b\n\ ": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory"); #elif defined(DC_SH4) - mat_load(mat); + dc::mat_load2(mat); while(n--) { mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z); in = reinterpret_cast(reinterpret_cast(in) + stride); diff --git a/src/miami/collision/Collision.cpp b/src/miami/collision/Collision.cpp index 8db6c42a..683ebd75 100644 --- a/src/miami/collision/Collision.cpp +++ b/src/miami/collision/Collision.cpp @@ -1473,7 +1473,7 @@ CCollision::ProcessLineOfSight(const CColLine &line, point.point = matrix * point.point; point.normal = Multiply3x3(matrix, point.normal); #else - mat_load(reinterpret_cast(const_cast(&matrix))); + dc::mat_load2(matrix); mat_trans_single3_nodiv(point.point.x, point.point.y, point.point.z); @@ -1653,7 +1653,7 @@ CCollision::ProcessVerticalLine(const CColLine &line, point.point = matrix * point.point; point.normal = Multiply3x3(matrix, point.normal); #else - mat_load(reinterpret_cast(const_cast(&matrix))); + dc::mat_load2(matrix); mat_trans_single3_nodiv(point.point.x, point.point.y, point.point.z); @@ -2027,8 +2027,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA, #ifndef DC_SH4 matAB *= matrixA; #else - mat_load(reinterpret_cast(&matAB)); - mat_apply(reinterpret_cast(&matrixA)); + dc::mat_load_apply(matAB, matrixA); #endif CColSphere bsphereAB; // bounding sphere of A in B space @@ -2099,8 +2098,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA, #ifndef DC_SH4 matBA *= matrixB; #else - mat_load(reinterpret_cast(&matBA)); - mat_apply(reinterpret_cast(&matrixB)); + dc::mat_load_apply(matBA, matrixB); #endif for(i = 0; i < modelB.numSpheres; i++){ s.radius = modelB.spheres[i].radius; @@ -2162,7 +2160,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA, } #ifdef DC_SH4 - mat_load(reinterpret_cast(const_cast(&matrixB))); + dc::mat_load2(matrixB); #endif for(i = 0; i < numCollisions; i++){ #ifndef DC_SH4 diff --git a/src/miami/math/Matrix.cpp b/src/miami/math/Matrix.cpp index f90e3c06..6f06c0af 100644 --- a/src/miami/math/Matrix.cpp +++ b/src/miami/math/Matrix.cpp @@ -359,9 +359,9 @@ void CMatrix::Rotate(float x, float y, float z) { #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - mat_load(reinterpret_cast(this)); + dc::mat_load2(reinterpret_cast(this)); mat_rotate(x, y, z); - mat_store(reinterpret_cast(this)); + dc::mat_store2(reinterpret_cast(this)); #else auto [sX, cX] = SinCos(x); auto [sY, cY] = SinCos(y); diff --git a/src/miami/math/Matrix.h b/src/miami/math/Matrix.h index 456c6f05..76b02d34 100644 --- a/src/miami/math/Matrix.h +++ b/src/miami/math/Matrix.h @@ -77,29 +77,17 @@ public: void SetScale(float s); void Scale(float scale) { -#if !defined(DC_SH4) || 1 /* TODO */ for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) f[i][j] *= scale; -#else - mat_load(*this); - mat_scale(scale, scale, scale); - mat_store(*this); -#endif } void Scale(float sx, float sy, float sz) { -#if !defined(DC_SH4) || 1 /* TODO */ for (int i = 0; i < 3; i++){ f[i][0] *= sx; f[i][1] *= sy; f[i][2] *= sz; } -#else - mat_load(*this); - mat_scale(sx, sy, sz); - mat_store(*this); -#endif } void SetRotateXOnly(float angle); diff --git a/src/miami/math/Quaternion.h b/src/miami/math/Quaternion.h index c1b0e481..185adb7c 100644 --- a/src/miami/math/Quaternion.h +++ b/src/miami/math/Quaternion.h @@ -62,7 +62,7 @@ public: } const CQuaternion &operator/=(float right) { - right = dc::Invert(right); + right = dc::Invert(right); x *= right; y *= right; z *= right; @@ -115,5 +115,6 @@ inline CQuaternion operator*(float left, const CQuaternion &right) inline CQuaternion operator/(const CQuaternion &left, float right) { - return CQuaternion(left.x / right, left.y / right, left.z / right, left.w / right); + right = Invert(right); + return CQuaternion(left.x * right, left.y * right, left.z * right, left.w * right); } diff --git a/src/miami/math/Vector.cpp b/src/miami/math/Vector.cpp index e2bc5f7e..2667b969 100644 --- a/src/miami/math/Vector.cpp +++ b/src/miami/math/Vector.cpp @@ -27,23 +27,17 @@ CrossProduct(const CVector &v1, const CVector &v2) CVector Multiply3x3(const CMatrix &mat, const CVector &vec) { -#ifdef DC_SH4 - register float __x __asm__("fr12") = vec.x; - register float __y __asm__("fr13") = vec.y; - register float __z __asm__("fr14") = vec.z; - register float __w __asm__("fr15") = 0.0f; - - mat_load(reinterpret_cast(const_cast(&mat))); - - asm volatile( "ftrv xmtrx, fv12\n" - : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) - : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); - return { __x, __y, __z }; -#else +#ifndef DC_SH4 // TODO: VU0 code return CVector(mat.rx * vec.x + mat.fx * vec.y + mat.ux * vec.z, mat.ry * vec.x + mat.fy * vec.y + mat.uy * vec.z, mat.rz * vec.x + mat.fz * vec.y + mat.uz * vec.z); +#else + CVector out; + dc::mat_load2(mat); + mat_trans_normal3_nomod(vec.x, vec.y, vec.z, + out.x, out.y, out.z); + return out; #endif } @@ -56,10 +50,10 @@ Multiply3x3(const CVector &vec, const CMatrix &mat) mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z); #else CVector out; - mat_load(mat); + dc::mat_load2(mat); mat_transpose(); - mat_trans_vec3_nomod(vec.x, vec.y, vec.z, - out.x, out.y, out.z); + mat_trans_normal3_nomod(vec.x, vec.y, vec.z, + out.x, out.y, out.z); return out; #endif } @@ -69,7 +63,7 @@ operator*(const CMatrix &mat, const CVector &vec) { #ifdef DC_SH4 CVector out; - mat_load(reinterpret_cast(const_cast(&mat))); + dc::mat_load2(mat); mat_trans_single3_nodiv_nomod(vec.x, vec.y, vec.z, out.x, out.y, out.z); return out; #else diff --git a/src/miami/math/VuVector.h b/src/miami/math/VuVector.h index 3c5933c4..d21ad18a 100644 --- a/src/miami/math/VuVector.h +++ b/src/miami/math/VuVector.h @@ -50,7 +50,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const CV sqc2 vf06,0x0(%0)\n\ ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); #elif defined(DC_SH4) - mat_load(mat); + dc::mat_load2(mat); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); #else out = mat * in; @@ -77,7 +77,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw sqc2 vf06,0x0(%0)\n\ ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); #elif defined(DC_SH4) - mat_load(mat); + dc::mat_load2(mat); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); #else out = mat * in; @@ -86,7 +86,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat, const RwV3d *in, int stride) { -#ifdef GTA_PS3 +#ifdef GTA_PS2 __asm__ __volatile__("\n\ paddub $3,%4,$0\n\ lqc2 vf02,0x0(%2)\n\ @@ -114,7 +114,7 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat, bnez %1,1b\n\ ": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory"); #elif defined(DC_SH4) - mat_load(mat); + dc::mat_load2(mat); while(n--) { mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z); in = reinterpret_cast(reinterpret_cast(in) + stride); diff --git a/vendor/dca3-kos b/vendor/dca3-kos index 144eaeab..b1f6bbc9 160000 --- a/vendor/dca3-kos +++ b/vendor/dca3-kos @@ -1 +1 @@ -Subproject commit 144eaeab940246d9c84c1f14b45d9e2847705c5e +Subproject commit b1f6bbc960cd1f5cd05be8d9d0eca4556ca3a56a diff --git a/vendor/librw/src/base.cpp b/vendor/librw/src/base.cpp index a9b7b5d6..9f45b459 100644 --- a/vendor/librw/src/base.cpp +++ b/vendor/librw/src/base.cpp @@ -97,8 +97,8 @@ mult(const Quat &q, const Quat &p) #else Quat o; dc::quat_mult(reinterpret_cast(&o), - reinterpret_cast(q), - reinterpret_cast(p)); + reinterpret_cast(q), + reinterpret_cast(p)); return o; #endif } diff --git a/vendor/librw/src/camera.cpp b/vendor/librw/src/camera.cpp index b0204585..0d2d55af 100644 --- a/vendor/librw/src/camera.cpp +++ b/vendor/librw/src/camera.cpp @@ -429,6 +429,7 @@ Camera::frustumTestSphere(const Sphere *s) const { int32 res = SPHEREINSIDE; const FrustumPlane *p = this->frustumPlanes; +#ifndef DC_SH4 for(int32 i = 0; i < 6; i++){ float32 dist = dot(p->plane.normal, s->center) - p->plane.distance; if(s->radius < dist) @@ -437,6 +438,114 @@ Camera::frustumTestSphere(const Sphere *s) const res = SPHEREBOUNDARY; p++; } +#else + __builtin_prefetch(p); + + register float sx asm("fr0") = s->center.x; + register float sy asm("fr1") = s->center.y; + register float sz asm("fr2") = s->center.z; + register float sw asm("fr3") = -1.0f; + + // far + register float px asm("fr4") = p->plane.normal.x; + register float py asm("fr5") = p->plane.normal.y; + register float pz asm("fr6") = p->plane.normal.z; + register float pw asm("fr7") = p->plane.distance; + + asm volatile("fipr fv0, fv4" + : "+f" (pw) + : "f" (sx), "f" (sy), "f" (sz), "f" (sw), + "f" (px), "f" (py), "f" (pz)); + if(s->radius < pw) + return SPHEREOUTSIDE; + else if(s->radius > -pw) + res = SPHEREBOUNDARY; + p++; + + // near + px = p->plane.normal.x; + py = p->plane.normal.y; + pz = p->plane.normal.z; + pw = p->plane.distance; + asm volatile("fipr fv0, fv4" + : "+f" (pw) + : "f" (sx), "f" (sy), "f" (sz), "f" (sw), + "f" (px), "f" (py), "f" (pz)); + if(s->radius < pw) + return SPHEREOUTSIDE; + if(s->radius > -pw) + res = SPHEREBOUNDARY_NEAR; + p++; + + const float* base_ptr0 = &p[0].plane.normal.x; + const float* base_ptr1 = &p[1].plane.normal.x; + const float* base_ptr2 = &p[2].plane.normal.x; + const float* base_ptr3 = &p[3].plane.normal.x; + + __builtin_prefetch(base_ptr0); + + static_assert(offsetof (decltype (p[0].plane.normal), y) + -offsetof (decltype (p[0].plane.normal), x) == sizeof (float)); + + static_assert(offsetof (decltype (p[0].plane.normal), z) + -offsetof (decltype (p[0].plane.normal), y) == sizeof (float)); + + static_assert(offsetof (decltype (p[0].plane), distance) + -offsetof (decltype (p[0].plane.normal), z) == sizeof (float)); + + asm volatile (R"( + frchg + + fmov.s @%0+,fr0 + fmov.s @%1+,fr1 + fmov.s @%2+,fr2 + fmov.s @%3+,fr3 + + fmov.s @%0+,fr4 + fmov.s @%1+,fr5 + fmov.s @%2+,fr6 + fmov.s @%3+,fr7 + + fmov.s @%0+,fr8 + fmov.s @%1+,fr9 + fmov.s @%2+,fr10 + fmov.s @%3+,fr11 + + fmov.s @%0,fr12 + fmov.s @%1,fr13 + fmov.s @%2,fr14 + fmov.s @%3,fr15 + + frchg + )" : "+&r" (base_ptr0), "+&r" (base_ptr1), "+&r" (base_ptr2), "+&r" (base_ptr3) + : + : ); + + float dists[4]; + mat_trans_vec4_nodiv_nomod(sx, sy, sz, sw, + dists[0], dists[1], dists[2], dists[3]); + + if(s->radius < dists[0]) + return SPHEREOUTSIDE; + else if(s->radius > -dists[0]) + res = SPHEREBOUNDARY; + + if(s->radius < dists[1]) + return SPHEREOUTSIDE; + else if(s->radius > -dists[1]) + res = SPHEREBOUNDARY; + + if(s->radius < dists[2]) + return SPHEREOUTSIDE; + else if(s->radius > -dists[2]) + res = SPHEREBOUNDARY; + + if(s->radius < dists[3]) + return SPHEREOUTSIDE; + else if(s->radius > -dists[3]) + res = SPHEREBOUNDARY; + +#endif return res; } @@ -445,7 +554,7 @@ Camera::frustumTestSphereNear(const Sphere *s) const { int32 res = SPHEREINSIDE; const FrustumPlane *p = this->frustumPlanes; - +#ifndef DC_SH4 // far float32 dist = dot(p->plane.normal, s->center) - p->plane.distance; if(s->radius < dist) @@ -481,6 +590,101 @@ Camera::frustumTestSphereNear(const Sphere *s) const return SPHEREOUTSIDE; p++; +#else + __builtin_prefetch(p); + + register float sx asm("fr0") = s->center.x; + register float sy asm("fr1") = s->center.y; + register float sz asm("fr2") = s->center.z; + register float sw asm("fr3") = -1.0f; + + // far + register float px asm("fr4") = p->plane.normal.x; + register float py asm("fr5") = p->plane.normal.y; + register float pz asm("fr6") = p->plane.normal.z; + register float pw asm("fr7") = p->plane.distance; + + asm volatile("fipr fv0, fv4" + : "+f" (pw) + : "f" (sx), "f" (sy), "f" (sz), "f" (sw), + "f" (px), "f" (py), "f" (pz)); + if(s->radius < pw) + return SPHEREOUTSIDE; + p++; + + // near + px = p->plane.normal.x; + py = p->plane.normal.y; + pz = p->plane.normal.z; + pw = p->plane.distance; + asm volatile("fipr fv0, fv4" + : "+f" (pw) + : "f" (sx), "f" (sy), "f" (sz), "f" (sw), + "f" (px), "f" (py), "f" (pz)); + if(s->radius < pw) + return SPHEREOUTSIDE; + if(s->radius > -pw) + res = SPHEREBOUNDARY_NEAR; + p++; + + const float* base_ptr0 = &p[0].plane.normal.x; + const float* base_ptr1 = &p[1].plane.normal.x; + const float* base_ptr2 = &p[2].plane.normal.x; + const float* base_ptr3 = &p[3].plane.normal.x; + + __builtin_prefetch(base_ptr0); + + static_assert(offsetof (decltype (p[0].plane.normal), y) + -offsetof (decltype (p[0].plane.normal), x) == sizeof (float)); + + static_assert(offsetof (decltype (p[0].plane.normal), z) + -offsetof (decltype (p[0].plane.normal), y) == sizeof (float)); + + static_assert(offsetof (decltype (p[0].plane), distance) + -offsetof (decltype (p[0].plane.normal), z) == sizeof (float)); + + asm volatile (R"( + frchg + + fmov.s @%0+,fr0 + fmov.s @%1+,fr1 + fmov.s @%2+,fr2 + fmov.s @%3+,fr3 + + fmov.s @%0+,fr4 + fmov.s @%1+,fr5 + fmov.s @%2+,fr6 + fmov.s @%3+,fr7 + + fmov.s @%0+,fr8 + fmov.s @%1+,fr9 + fmov.s @%2+,fr10 + fmov.s @%3+,fr11 + + fmov.s @%0,fr12 + fmov.s @%1,fr13 + fmov.s @%2,fr14 + fmov.s @%3,fr15 + + frchg + )" : "+&r" (base_ptr0), "+&r" (base_ptr1), "+&r" (base_ptr2), "+&r" (base_ptr3) + : + : ); + + float dists[4]; + mat_trans_vec4_nodiv_nomod(sx, sy, sz, sw, + dists[0], dists[1], dists[2], dists[3]); + + if(s->radius < dists[0]) + return SPHEREOUTSIDE; + else if(s->radius < dists[1]) + return SPHEREOUTSIDE; + else if(s->radius < dists[2]) + return SPHEREOUTSIDE; + else if(s->radius < dists[3]) + return SPHEREOUTSIDE; + +#endif return res; } diff --git a/vendor/librw/src/dc/rwdc.cpp b/vendor/librw/src/dc/rwdc.cpp index d33de9b0..2849461f 100644 --- a/vendor/librw/src/dc/rwdc.cpp +++ b/vendor/librw/src/dc/rwdc.cpp @@ -37,8 +37,6 @@ extern const char* currentFile; #include #include -#define ARRAY_SIZE(array) (sizeof(array) / sizeof(array[0])) - #define errorf(...) dbglog(DBG_CRITICAL, __VA_ARGS__) #define logf(...) // printf(__VA_ARGS__) bool re3RemoveLeastUsedModel(); @@ -47,21 +45,13 @@ void* re3StreamingAlloc(size_t size); // #include "rwdcimpl.h" -#include -#include #include "alloc.h" -#undef PVR_TXRFMT_STRIDE -#define PVR_TXRFMT_STRIDE (1 << 25) - -static_assert(PVR_TXRFMT_STRIDE == (1 << 25), "PVR_TXRFMT_STRIDE is bugged in your KOS version"); +using namespace dc; // TODO: probably needs a better place to be bool doEnvironmentMaps = true; -#define fclamp0_1(n) ((n) > 1.0f ? 1.0f : n < 0.0f ? 0.0f : n) -#define fclamp1(n) ((n) > 1.0f ? 1.0f : n) - struct alignas(32) pvr_vertex16_t { uint32_t flags; /**< \brief TA command (vertex flags) */ float x; /**< \brief X coordinate */ @@ -170,177 +160,10 @@ struct alignas(32) pvr_vertex32_ut { static_assert(sizeof(pvr_vertex16_t) == 32, "pvr_vertex16_t size mismatch"); static_assert(alignof(pvr_vertex16_t) == 32, "pvr_vertex16_t alignof mismatch"); - -#define MATH_Fast_Invert(x) ({ (((x) < 0.0f)? -1.0f : 1.0f) * frsqrt((x) * (x)); }) - static pvr_dr_state_t drState; -#include - float VIDEO_MODE_SCALE_X; -#if !defined(DC_TEXCONV) && !defined(DC_SIM) -#include - -#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \ - register float __x __asm__("fr12") = (x); \ - register float __y __asm__("fr13") = (y); \ - register float __z __asm__("fr14") = (z); \ - register float __w __asm__("fr15") = 1.0f; \ - __asm__ __volatile__( "ftrv xmtrx, fv12\n" \ - : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \ - : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \ - x2 = __x; y2 = __y; z2 = __z; w2 = __w; \ - } while(false) - -#define mat_trans_nodiv_nomod_zerow(x, y, z, x2, y2, z2, w2) do { \ - register float __x __asm__("fr12") = (x); \ - register float __y __asm__("fr13") = (y); \ - register float __z __asm__("fr14") = (z); \ - register float __w __asm__("fr15") = 0.0f; \ - __asm__ __volatile__( "ftrv xmtrx, fv12\n" \ - : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \ - : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \ - x2 = __x; y2 = __y; z2 = __z; w2 = __w; \ - } while(false) - -#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \ - register float __x __asm__("fr12") = (x); \ - register float __y __asm__("fr13") = (y); \ - register float __z __asm__("fr14") = (z); \ - register float __w __asm__("fr15") = 1.0f; \ - __asm__ __volatile__( "ftrv xmtrx, fv12\n" \ - : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \ - : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \ - w = __w; \ - } while(false) - - // no declspec naked, so can't do rts / fschg. instead compiler pads with nop? - - inline void rw_mat_load_3x3(const rw::Matrix* mtx) { - __asm__ __volatile__ ( - R"( - fschg - frchg - - fmov @%[mtx]+, dr0 - - fldi0 fr12 - fldi0 fr13 - - fmov @%[mtx]+, dr2 - fmov @%[mtx]+, dr4 - fmov @%[mtx]+, dr6 - fmov @%[mtx]+, dr8 - fmov @%[mtx]+, dr10 - - fldi0 fr3 - fldi0 fr7 - fldi0 fr11 - fmov dr12, dr14 - - fschg - frchg - )" - : [mtx] "+r" (mtx) - ); - } - - // sets pos.w to 1 - inline void rw_mat_load_4x4(const rw::Matrix* mtx) { - __asm__ __volatile__ ( - R"( - fschg - frchg - fmov @%[mtx]+, dr0 - - fmov @%[mtx]+, dr2 - fmov @%[mtx]+, dr4 - fmov @%[mtx]+, dr6 - fmov @%[mtx]+, dr8 - fmov @%[mtx]+, dr10 - fmov @%[mtx]+, dr12 - fmov @%[mtx]+, dr14 - fldi1 fr15 - - fschg - frchg - )" - : [mtx] "+r" (mtx) - ); - } - -#else -extern matrix_t XMTRX; - -void rw_mat_load_3x3(rw::Matrix* mtx) { - memcpy(XMTRX, mtx, sizeof(matrix_t)); - XMTRX[0][3] = 0.0f; - XMTRX[1][3] = 0.0f; - XMTRX[2][3] = 0.0f; - - XMTRX[3][0] = 0.0f; - XMTRX[3][1] = 0.0f; - XMTRX[3][2] = 0.0f; - XMTRX[3][3] = 0.0f; -} - -void rw_mat_load_4x4(rw::Matrix* mtx) { - memcpy(XMTRX, mtx, sizeof(matrix_t)); - XMTRX[3][3] = 1.0f; -} - -#include -#define frsqrt(a) (1.0f/sqrt(a)) -#define dcache_pref_block(a) __builtin_prefetch(a) - -#ifndef __always_inline -#define __always_inline __attribute__((always_inline)) inline -#endif - -#ifdef DC_TEXCONV -#define mat_transform(a, b, c, d) -#define mat_apply(a) -#define mat_load(a) -#define mat_store(a) -#define mat_identity(a) -#define pvr_fog_table_color(a,r,g,b) -#define pvr_fog_table_linear(s,e) -#define pvr_fog_table_exp(d) -#define pvr_fog_table_custom(d) -#endif - -#define mat_trans_single3_nomod(x_, y_, z_, x2, y2, z2) do { \ - vector_t tmp = { x_, y_, z_, 1.0f }; \ - mat_transform(&tmp, &tmp, 1, 0); \ - z2 = 1.0f / tmp.w; \ - x2 = tmp.x * z2; \ - y2 = tmp.y * z2; \ - } while(false) - -#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \ - vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \ - mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ - x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \ - } while(false) - -#define mat_trans_nodiv_nomod_zerow(x_, y_, z_, x2, y2, z2, w2) do { \ - vector_t tmp1233123 = { x_, y_, z_, 0.0f }; \ - mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ - x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \ - } while(false) - -#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \ - vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \ - mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ - w_ = tmp1233123.w; \ - } while(false) - -#define memcpy4 memcpy - -// END STUBS -#endif - static pvr_ptr_t fake_tex; alignas(4) static const uint16_t fake_tex_data[] = { @@ -551,9 +374,9 @@ void DCE_MatrixViewport(float x, float y, float width, float height) { void DCE_InitMatrices() { // Setup the screenview matrix. Only need to do once since this matrix does not need to change for single player viewpoint. - mat_identity(); + mat_identity2(); - mat_store(&DCE_MAT_SCREENVIEW); + mat_store2(&DCE_MAT_SCREENVIEW); } } @@ -684,7 +507,7 @@ struct atomic_context_t { __always_inline void DCE_RenderSubmitVertex(const pvr_vertex_t *v, uint32_t flags) { auto *sq = reinterpret_cast(pvr_dr_target(drState)); auto *src = reinterpret_cast(v); - float sz = MATH_Fast_Invert(v->z); + float sz = Invert(v->z); float sx = v->x * sz; float sy = v->y * sz; @@ -711,7 +534,7 @@ __always_inline void DCE_RenderSubmitVertexIM3D(float x, float y, float w, { auto *sq = reinterpret_cast(pvr_dr_target(drState)); auto *uv32 = reinterpret_cast(uv); - float sz = MATH_Fast_Invert(w); + float sz = Invert(w); float sx = x * sz; float sy = y * sz; @@ -801,9 +624,8 @@ void beginUpdate(Camera* cam) { DCE_MatrixViewport(0, 0, cam->frameBuffer->width * VIDEO_MODE_SCALE_X, cam->frameBuffer->height); - mat_load((matrix_t*)&DCE_MAT_SCREENVIEW); - mat_apply((matrix_t*)&cam->devProj); - mat_store((matrix_t*)&cam->devProjScreen); + mat_load_apply((matrix_t*)&DCE_MAT_SCREENVIEW, (matrix_t*)&cam->devProj); + mat_store2((matrix_t*)&cam->devProjScreen); } @@ -1831,7 +1653,7 @@ void im2DRenderPrimitive(PrimitiveType primType, void *vertices, int32_t numVert pvrVert->flags = flags; pvrVert->x = gtaVert.x * VIDEO_MODE_SCALE_X; pvrVert->y = gtaVert.y; - pvrVert->z = MATH_Fast_Invert(gtaVert.w); // this is perfect for almost every case... + pvrVert->z = Invert(gtaVert.w); // this is perfect for almost every case... pvrVert->u = gtaVert.u; pvrVert->v = gtaVert.v; pvrVert->argb = (gtaVert.a << 24) | @@ -1952,7 +1774,7 @@ void im2DRenderIndexedPrimitive(PrimitiveType primType, void *vertices, int32 nu pvrVert->flags = flags; pvrVert->x = gtaVert.x * VIDEO_MODE_SCALE_X; pvrVert->y = gtaVert.y; - pvrVert->z = MATH_Fast_Invert(gtaVert.w); // this is perfect for almost every case... + pvrVert->z = Invert(gtaVert.w); // this is perfect for almost every case... pvrVert->u = gtaVert.u; pvrVert->v = gtaVert.v; pvrVert->argb = (gtaVert.a << 24) | @@ -2015,8 +1837,8 @@ void im3DTransform(void *vertices, int32 numVertices, Matrix *worldMat, uint32 f rw::RawMatrix::mult(&worldview, &world, &cam->devView); rw::RawMatrix::mult(&proj, &worldview, &cam->devProj); rw::RawMatrix::mult(&mtx, &proj, (RawMatrix*)&DCE_MAT_SCREENVIEW); - // mat_load(&DCE_MAT_SCREENVIEW); // ~11 cycles. - mat_load(( matrix_t*)&mtx.right); // Number of cycles: ~32. + // mat_load2(&DCE_MAT_SCREENVIEW); // ~11 cycles. + mat_load2(( matrix_t*)&mtx.right); // Number of cycles: ~32. if (im3dVertices) { free(im3dVertices); } @@ -2110,7 +1932,7 @@ void im3DRenderIndexedPrimitive(PrimitiveType primType, // assuming near plane is 0.0f // gv1 is visible (posi), and gv2 is behind the plane (negative) - float t = (1.0f - gv1.position.z) * MATH_Fast_Invert(gv2.position.z - gv1.position.z); + float t = (1.0f - gv1.position.z) * Invert(gv2.position.z - gv1.position.z); pvr_vertex_t pvrVert; @@ -3388,13 +3210,13 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve auto skinningWeightData = (uint8_t*)skinWeights; if (!matrix0Identity) { - rw_mat_load_4x4(&skinMatrices[0]); + mat_load_4x4(&skinMatrices[0]); if (small_xyz) { mat_apply(&DCE_MESHLET_MAT_DECODE); } } else { if (small_xyz) { - mat_load(&DCE_MESHLET_MAT_DECODE); + mat_load2(&DCE_MESHLET_MAT_DECODE); } } @@ -3458,7 +3280,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve break; } - rw_mat_load_4x4(currentMatrix); + mat_load_4x4(currentMatrix); if (small_xyz){ mat_apply(&DCE_MESHLET_MAT_DECODE); } @@ -3491,7 +3313,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve auto skinningWeightData = (uint8_t*)skinWeights; if (!matrix0Identity) { - rw_mat_load_3x3(&skinMatrices[0]); + mat_load_3x3(&skinMatrices[0]); } for(;;) { @@ -3549,7 +3371,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve break; } - rw_mat_load_3x3(currentMatrix); + mat_load_3x3(currentMatrix); do { auto srcOffset = *skinningIndexData++; @@ -3574,7 +3396,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve __attribute__((noinline)) void tnlMeshletEnvMap(uint8_t* OCR, uint8_t* normal, int vertexCount, int vertexSize, matrix_t* matfxMatrix, float matfxCoefficient) { - mat_load(matfxMatrix); + mat_load2(matfxMatrix); do { pvr_vertex64_t* v = (pvr_vertex64_t*)OCR; @@ -3991,15 +3813,12 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) { lightingCB(atomic, ac->uniform); - rw::RawMatrix world; rw::convMatrix(&world, atomic->getFrame()->getLTM()); - - mat_load((matrix_t*)&cam->devProjScreen); - mat_apply((matrix_t*)&cam->devView); + mat_load_apply((matrix_t*)&cam->devProjScreen, (matrix_t*)&cam->devView); mat_apply((matrix_t*)&world); - mat_store((matrix_t*)&atomicContexts.back().mtx); + mat_store2((matrix_t*)&atomicContexts.back().mtx); auto meshes = geo->meshHeader->getMeshes(); @@ -4238,15 +4057,14 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) { unsigned skinSelector = small_xyz + acp->skinMatrix0Identity*2; tnlMeshletSkinVerticesSelector[skinSelector](OCR_SPACE, normalDst, &dcModel->data[meshlet->vertexOffset], normalSrc, &dcModel->data[meshlet->skinWeightOffset], &dcModel->data[meshlet->skinIndexOffset], meshlet->vertexCount, meshlet->vertexSize, &acp->skinContextPointer->mtx); - mat_load(&mtx); + mat_load2(&mtx); tnlMeshletTransformSelector[clippingRequired * 2](OCR_SPACE, OCR_SPACE + 4, meshlet->vertexCount, 64); } else { if (selector & 8) { - mat_load(&mtx); - mat_apply(&DCE_MESHLET_MAT_DECODE); + mat_load_apply(&mtx, &DCE_MESHLET_MAT_DECODE); } else { - mat_load(&mtx); + mat_load2(&mtx); } tnlMeshletTransformSelector[smallSelector](OCR_SPACE, &dcModel->data[meshlet->vertexOffset], meshlet->vertexCount, meshlet->vertexSize); } @@ -4273,7 +4091,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) { unsigned dstColOffset = textured ? offsetof(pvr_vertex64_t, a) : offsetof(pvr_vertex32_ut, a); dce_set_mat_vertex_color(&residual, &material); - mat_load(&DCE_MESHLET_MAT_VERTEX_COLOR); + mat_load2(&DCE_MESHLET_MAT_VERTEX_COLOR); tnlMeshletVertexColorSelector[0](OCR_SPACE + dstColOffset, (int8_t*)&dcModel->data[meshlet->vertexOffset] + colOffset, meshlet->vertexCount, meshlet->vertexSize); } else { unsigned dstColOffset = textured ? offsetof(pvr_vertex64_t, a) : offsetof(pvr_vertex32_ut, a); @@ -4295,7 +4113,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) { unsigned normalSelector = (pass1 - 1) + (skin != 0) * 4; - mat_load((matrix_t*)&uniformObject.dir[0][0][0]); + mat_load2((matrix_t*)&uniformObject.dir[0][0][0]); auto normalPointer = &dcModel->data[meshlet->vertexOffset] + normalOffset; auto vtxSize = meshlet->vertexSize; if (skin) { @@ -4310,7 +4128,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) { if (pass2) { unsigned normalSelector = (pass2 - 1) + (skin != 0) * 4; - mat_load((matrix_t*)&uniformObject.dir[1][0][0]); + mat_load2((matrix_t*)&uniformObject.dir[1][0][0]); tnlMeshletDiffuseColorSelector[normalSelector](OCR_SPACE + dstColOffset, normalPointer, meshlet->vertexCount, vtxSize, &lightDiffuseColors[4]); } } @@ -4373,7 +4191,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) { indices.back() |= 0x80; pvr_vertex64_t *vd = (pvr_vertex64_t *)OCR_SPACE; - mat_load(&mtx); // Number of cycles: ~11 + mat_load2(&mtx); // Number of cycles: ~11 for (int idx = 0; idx < geo->numVertices; idx++) { auto& vert = vertices[idx]; diff --git a/vendor/librw/src/dc/rwdc_common.h b/vendor/librw/src/dc/rwdc_common.h index e0f940b7..c6d094e6 100644 --- a/vendor/librw/src/dc/rwdc_common.h +++ b/vendor/librw/src/dc/rwdc_common.h @@ -30,6 +30,7 @@ # define VIDEO_MODE_WIDTH 640.0f # define VIDEO_MODE_HEIGHT 480.0f # define memcpy4 memcpy +# define frsqrt(a) (1.0f/sqrtf(a)) # define dcache_pref_block(a) __builtin_prefetch(a) # define F_PI M_PI # ifndef __always_inline @@ -44,14 +45,16 @@ "PVR_TXRFMT_STRIDE is bugged in your KOS version"); #endif -#define F_PI_2 (F_PI * 0.5f) -#define __hot __attribute__((hot)) -#define __cold __attribute__((cold)) +#define F_PI_2 (F_PI * 0.5f) +#define __hot __attribute__((hot)) +#define __cold __attribute__((cold)) +#define __icache_aligned __attribute__((aligned(32))) -#define STRINGIFY(x) #x -#define STR(x) STRINGIFY(x) -#define CONCAT_(x,y) x##y -#define CONCAT(x,y) CONCAT_(x,y) +#define STRINGIFY(x) #x +#define STR(x) STRINGIFY(x) +#define CONCAT_(x,y) x##y +#define CONCAT(x,y) CONCAT_(x,y) +#define ARRAY_SIZE(array) (sizeof(array) / sizeof(array[0])) namespace rw { class Matrix; @@ -66,6 +69,11 @@ struct quaternion_t { __always_inline __hot constexpr float Sin(float x) { return sinf(x); } __always_inline __hot constexpr float Cos(float x) { return cosf(x); } __always_inline __hot constexpr auto SinCos(float x) { return std::pair { Sin(x), Cos(x) }; } +__always_inline __hot constexpr float Tan(float x) { return tanf(x); } +__always_inline __hot constexpr float Atan(float x) { return atanf(x); } +__always_inline __hot constexpr float Atan2(float y, float x) { return atan2f(y, x); } +__always_inline __hot constexpr float Asin(float x) { return asinf(x); } +__always_inline __hot constexpr float Acos(float x) { return acosf(x); } __always_inline __hot constexpr float Abs(float x) { return fabsf(x); } __always_inline __hot constexpr float Sqrt(float x) { return sqrtf(x); } __always_inline __hot constexpr float RecipSqrt(float x, float y) { return x / Sqrt(y); } @@ -130,80 +138,6 @@ __always_inline __hot constexpr auto Norm(auto value, auto min, auto max) { return numerator / denominator; } -template -__always_inline __hot constexpr float Tan(float x) { - if(!std::is_constant_evaluated() && FAST_APPROX) { - constexpr float pisqby4 = 2.4674011002723397f; - constexpr float adjpisqby4 = 2.471688400562703f; - constexpr float adj1minus8bypisq = 0.189759681063053f; - float xsq = x * x; - - return x * Div(adjpisqby4 - adj1minus8bypisq * xsq, - pisqby4 - xsq); - } else - return tanf(x); -} - -template -__always_inline __hot constexpr float Atan(float x) { - if(FAST_APPROX && !std::is_constant_evaluated()) { - constexpr float a[3] = { // - 0.998418889819911f, -2.9993501171084700E-01f, 0.0869142852883849f}; - float xx = x * x; - return ((a[2] * xx + a[1]) * xx + a[0]) * x; - } else return atanf(x); -} - -template -__hot constexpr float Atan2(float y, float x) { - if(FAST_APPROX && !std::is_constant_evaluated()) { -#if 0 - constexpr float halfpi_i754 = M_PI * 0.5f; - constexpr float quarterpi_i754 = M_PI * 0.25f; - // kludge to prevent 0/0 condition - float abs_y = Abs(y) + std::numeric_limits::epsilon(); - float absy_plus_absx = abs_y + Abs(x); - float inv_absy_plus_absx = Invert(absy_plus_absx); - float angle = halfpi_i754 - copysignf(quarterpi_i754, x); - float r = (x - copysignf(abs_y, x)) * inv_absy_plus_absx; - angle += (0.1963f * r * r - 0.9817f) * r; - return copysignf(angle, y); -#else - // Ensure input is in [-1, +1] - bool swap = fabs(x) < fabs(y); - float atan_input = (swap ? x : y) / (swap ? y : x); - - // Approximate atan - float res = Atan(atan_input); - - // If swapped, adjust atan output - res = swap ? (atan_input >= 0.0f ? F_PI_2 : -F_PI_2) - res : res; - // Adjust quadrants - if (x >= 0.0f && y >= 0.0f) {} // 1st quadrant - else if (x < 0.0f && y >= 0.0f) { res = F_PI + res; } // 2nd quadrant - else if (x < 0.0f && y < 0.0f) { res = -F_PI + res; } // 3rd quadrant - else if (x >= 0.0f && y < 0.0f) {} // 4th quadrant - - // Store result - return res; -#endif - } else return atan2f(y, x); -} - -template -__always_inline __hot constexpr float Asin(float x) { - if(FAST_APPROX && !std::is_constant_evaluated()) { - Atan(Div(x, Sqrt(1.0f-(x*x)))); - } else return asinf(x); -} - -template -__always_inline __hot constexpr float Acos(float x) { - if(FAST_APPROX && !std::is_constant_evaluated()) { - return (-0.69813170079773212f * x * x - 0.87266462599716477f) * x + 1.5707963267948966f; - } else return acosf(x); -} - #ifdef RW_DC # ifdef DC_SH4 @@ -240,43 +174,134 @@ __always_inline __hot constexpr float Acos(float x) { w = __w; \ } while(false) -#define mat_trans_vec3_nomod(x, y, z, x2, y2, z2) { \ - register float __x __asm__("fr12") = (x); \ - register float __y __asm__("fr13") = (y); \ - register float __z __asm__("fr14") = (z); \ - __asm__ __volatile__( \ - "fldi0 fr15\n" \ - "ftrv xmtrx, fv8\n" \ - : "=f" (__x), "=f" (__y), "=f" (__z) \ - : "0" (__x), "1" (__y), "2" (__z) \ - : "fr15" ); \ - x2 = __x; y2 = __y; z2 = __z; \ - } +#define mat_trans_vec4_nodiv_nomod(x, y, z, w, x2, y2, z2, w2) { \ + register float __x __asm__("fr0") = (x); \ + register float __y __asm__("fr1") = (y); \ + register float __z __asm__("fr2") = (z); \ + register float __w __asm__("fr3") = (w); \ + __asm__ __volatile__( "ftrv xmtrx, fv0\n" \ + : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \ + : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \ + x2 = __x; y2 = __y; z2 = __z; w2 = __w; \ + } while(false) + +inline __hot __icache_aligned void mat_load2(const matrix_t* mtx) { + asm volatile( + R"( + fschg + fmov.d @%[mtx],xd0 + add #32,%[mtx] + pref @%[mtx] + add #-(32-8),%[mtx] + fmov.d @%[mtx]+,xd2 + fmov.d @%[mtx]+,xd4 + fmov.d @%[mtx]+,xd6 + fmov.d @%[mtx]+,xd8 + fmov.d @%[mtx]+,xd10 + fmov.d @%[mtx]+,xd12 + fmov.d @%[mtx]+,xd14 + fschg + )" + : [mtx] "+r" (mtx) + : + : + ); +} + +inline __hot __icache_aligned void mat_store2(matrix_t *mtx) { + asm volatile( + R"( + fschg + add #64-8,%[mtx] + fmov.d xd14,@%[mtx] + add #-32,%[mtx] + pref @%[mtx] + add #32,%[mtx] + fmov.d xd12,@-%[mtx] + fmov.d xd10,@-%[mtx] + fmov.d xd8,@-%[mtx] + fmov.d xd6,@-%[mtx] + fmov.d xd4,@-%[mtx] + fmov.d xd2,@-%[mtx] + fmov.d xd0,@-%[mtx] + fschg + )" + : [mtx] "+&r" (mtx), "=m" (*mtx) + : + : + ); +} + +inline __hot __icache_aligned void mat_identity2(void) { + asm volatile( + R"( + frchg + fldi1 fr0 + fschg + fldi0 fr1 + fldi0 fr2 + fldi0 fr3 + fldi0 fr4 + fldi1 fr5 + fmov dr2,dr6 + fmov dr2,dr8 + fmov dr0,dr10 + fmov dr2,dr12 + fmov dr4,dr14 + fschg + frchg + )" + ); +} + +inline __hot __icache_aligned void mat_set_scale(float x, float y, float z) { + asm volatile( + R"( + frchg + fldi0 fr1 + fschg + fldi0 fr2 + fldi0 fr3 + fldi0 fr4 + fmov dr2, dr6 + fmov dr2, dr8 + fldi0 fr11 + fmov dr2, dr12 + fldi0 fr14 + fschg + frchg + fmov %[x], xf0 + fmov %[y], xf5 + fmov %[z], xf10 + )" + : + : [x] "r" (x), [y] "r" (y), [z] "r" (z) + : + ); +} // no declspec naked, so can't do rts / fschg. instead compiler pads with nop? -inline __hot void mat_load_3x3(const matrix_t* mtx) { +inline __hot void mat_load_3x3(const rw::Matrix* mtx) { __asm__ __volatile__ ( R"( fschg frchg fmov @%[mtx]+, dr0 - fldi0 fr12 - fmov @%[mtx]+, dr2 + fldi0 fr12 fldi0 fr13 + fmov @%[mtx]+, dr2 fmov @%[mtx]+, dr4 - fldi0 fr3 - fmov @%[mtx]+, dr6 - fmov dr12, dr14 - fmov @%[mtx]+, dr8 - fldi0 fr7 - fmov @%[mtx]+, dr10 + + fldi0 fr3 + fldi0 fr7 fldi0 fr11 + fmov dr12, dr14 fschg frchg @@ -286,28 +311,28 @@ inline __hot void mat_load_3x3(const matrix_t* mtx) { } // sets pos.w to 1 -inline __hot void rw_mat_load_4x4(const rw::Matrix* mtx) { - __asm__ __volatile__ ( - R"( - fschg - frchg - fmov @%[mtx]+, dr0 +inline __hot void mat_load_4x4(const rw::Matrix* mtx) { + __asm__ __volatile__ ( + R"( + fschg + frchg + fmov @%[mtx]+, dr0 - fmov @%[mtx]+, dr2 - fmov @%[mtx]+, dr4 - fmov @%[mtx]+, dr6 - fmov @%[mtx]+, dr8 - fmov @%[mtx]+, dr10 - fmov @%[mtx]+, dr12 - fmov @%[mtx]+, dr14 - fldi1 fr15 + fmov @%[mtx]+, dr2 + fmov @%[mtx]+, dr4 + fmov @%[mtx]+, dr6 + fmov @%[mtx]+, dr8 + fmov @%[mtx]+, dr10 + fmov @%[mtx]+, dr12 + fmov @%[mtx]+, dr14 + fldi1 fr15 - fschg - frchg - )" - : [mtx] "+r" (mtx) - ); -} + fschg + frchg + )" + : [mtx] "+r" (mtx) + ); + } __hot inline void mat_transpose(void) { asm volatile ( @@ -345,8 +370,7 @@ __hot inline void mat_transpose(void) { ); } -__attribute__((optimize("align-functions=32"))) -__hot inline void mat_copy(matrix_t *dst, const matrix_t *src) { +__hot __icache_aligned inline void mat_copy(matrix_t *dst, const matrix_t *src) { asm volatile(R"( fschg @@ -367,7 +391,7 @@ __hot inline void mat_copy(matrix_t *dst, const matrix_t *src) { fmov.d xd0, @-%[dst] add #32, %[dst] - pref @%[dst] + pref @%[dst] fmov.d @%[src]+, xd0 fmov.d @%[src]+, xd2 @@ -382,12 +406,13 @@ __hot inline void mat_copy(matrix_t *dst, const matrix_t *src) { fmov.d xd0, @-%[dst] fschg - )": [dst] "+&r" (dst), [src] "+&r" (src) + )": [dst] "+&r" (dst), [src] "+&r" (src), "=m" (*dst) : - : "memory"); + :); } -template +//TODO: FIXME FOR VC (AND USE FTRV) +template __hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) { if(FAST_APPROX && !std::is_constant_evaluated()) { /* @@ -477,9 +502,9 @@ __hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, c r->w = q2w; } else { r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w); - r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w); - r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w); - r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z); + r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w); + r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w); + r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z); } } @@ -608,70 +633,64 @@ __hot inline void mat_apply_rotate_z(float z) { # else # ifdef DC_TEXCONV -# define mat_transform(a, b, c, d) # define mat_apply(a) -# define mat_load(a) -# define mat_store(a) -# define mat_identity(a) +# define mat_load2(a) +# define mat_store2(a) +# define mat_identity2() +# define mat_transform(a, b, c, d) # define pvr_fog_table_color(a,r,g,b) # define pvr_fog_table_linear(s,e) # endif -#define mat_trans_vec3_nomod(x_, y_, z_, x2, y2, z2) do { \ - vector_t tmp = { x_, y_, z_, 0.0f }; \ +#define mat_trans_single3_nomod(x_, y_, z_, x2, y2, z2) do { \ + vector_t tmp = { x_, y_, z_, 1.0f }; \ mat_transform(&tmp, &tmp, 1, 0); \ - x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; \ + z2 = 1.0f / tmp.w; \ + x2 = tmp.x * z2; \ + y2 = tmp.y * z2; \ } while(false) -#define mat_trans_single3_nomod(x_, y_, z_, x2, y2, z2) do { \ - vector_t tmp = { x_, y_, z_, 1.0f }; \ - mat_transform(&tmp, &tmp, 1, 0); \ - z2 = 1.0f / tmp.w; \ - x2 = tmp.x * z2; \ - y2 = tmp.y * z2; \ - } while(false) - #define mat_trans_single3_nodiv_nomod(x_, y_, z_, x2, y2, z2) do { \ - vector_t tmp = { x_, y_, z_, 1.0f }; \ - mat_transform(&tmp, &tmp, 1, 0); \ - z2 = tmp.z; \ - x2 = tmp.x; \ - y2 = tmp.y; \ - } while(false) + vector_t tmp = { x_, y_, z_, 1.0f }; \ + mat_transform(&tmp, &tmp, 1, 0); \ + z2 = tmp.z; \ + x2 = tmp.x; \ + y2 = tmp.y; \ + } while(false) #define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \ - vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \ - mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ - x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \ - } while(false) + vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \ + mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ + x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \ + } while(false) #define mat_trans_nodiv_nomod_zerow(x_, y_, z_, x2, y2, z2, w2) do { \ - vector_t tmp1233123 = { x_, y_, z_, 0.0f }; \ - mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ - x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \ - } while(false) + vector_t tmp1233123 = { x_, y_, z_, 0.0f }; \ + mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ + x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \ + } while(false) #define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \ - vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \ - mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ - w_ = tmp1233123.w; \ - } while(false) + vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \ + mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ + w_ = tmp1233123.w; \ + } while(false) -inline void mat_load_3x3(const matrix_t* mtx) { - memcpy(XMTRX, mtx, sizeof(matrix_t)); - XMTRX[0][3] = 0.0f; - XMTRX[1][3] = 0.0f; - XMTRX[2][3] = 0.0f; +inline void mat_load_3x3(const rw::Matrix* mtx) { + memcpy(XMTRX, mtx, sizeof(matrix_t)); + XMTRX[0][3] = 0.0f; + XMTRX[1][3] = 0.0f; + XMTRX[2][3] = 0.0f; - XMTRX[3][0] = 0.0f; - XMTRX[3][1] = 0.0f; - XMTRX[3][2] = 0.0f; - XMTRX[3][3] = 0.0f; + XMTRX[3][0] = 0.0f; + XMTRX[3][1] = 0.0f; + XMTRX[3][2] = 0.0f; + XMTRX[3][3] = 0.0f; } -inline void rw_mat_load_4x4(const rw::Matrix* mtx) { - memcpy(XMTRX, mtx, sizeof(matrix_t)); - XMTRX[3][3] = 1.0f; +inline void mat_load_4x4(const rw::Matrix* mtx) { + memcpy(XMTRX, mtx, sizeof(matrix_t)); + XMTRX[3][3] = 1.0f; } inline void mat_transpose(void) { @@ -706,32 +725,9 @@ __always_inline __hot void mat_copy(matrix_t *dst, const matrix_t *src) { __hot inline void mat_mult(matrix_t *out, const matrix_t* matrix1, const matrix_t* matrix2) { mat_load_apply(matrix1, matrix2); - mat_store(out); + mat_store2(out); } -#if 0 -template -__always_inline __hot -auto vec3_dot_prod(const vec3_t &v1, vec3_t const (&v2)[N]) { - std::array result; - -#ifdef DC_SH4 - register float x asm(KOS_FPARG(0)) = v1.x; - register float y asm(KOS_FPARG(1)) = v1.y; - register float z asm(KOS_FPARG(2)) = v1.z; -#else - float x = v1.x; - float y = v1.y; - float z = v1.z; -#endif - - for(std::size_t v = 0; v < N; ++v) - result[v] = fipr(x, y, z, 0.0f, v2[v].x, v2[v].y, v2[v].z, 0.0f); - - return result; -} -#endif - #endif } From 04b11dfb9ddd36a5610f50a5fb357d27d0357a56 Mon Sep 17 00:00:00 2001 From: Falco Girgis Date: Sat, 26 Apr 2025 11:15:07 -0500 Subject: [PATCH 4/6] Synchronized dca3-kos with main branch. Which is where KOS's Gainz Phase 3 work now resides. --- vendor/dca3-kos | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/dca3-kos b/vendor/dca3-kos index b1f6bbc9..a2706087 160000 --- a/vendor/dca3-kos +++ b/vendor/dca3-kos @@ -1 +1 @@ -Subproject commit b1f6bbc960cd1f5cd05be8d9d0eca4556ca3a56a +Subproject commit a2706087626795b69689f06803329c1e37bdf7e2 From fc4a7e37919d49415209b66c38bd351f45db1f50 Mon Sep 17 00:00:00 2001 From: Falco Girgis Date: Sat, 26 Apr 2025 11:28:08 -0500 Subject: [PATCH 5/6] Fixing simulator builds. - Had forgotten to define mat_load2(), mat_store2(), mat_identity2() for simulator builds in rwdc_common.h. - Just defining them to use the regular versions. --- vendor/librw/src/dc/rwdc_common.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vendor/librw/src/dc/rwdc_common.h b/vendor/librw/src/dc/rwdc_common.h index c6d094e6..8e3d899f 100644 --- a/vendor/librw/src/dc/rwdc_common.h +++ b/vendor/librw/src/dc/rwdc_common.h @@ -640,6 +640,10 @@ __hot inline void mat_apply_rotate_z(float z) { # define mat_transform(a, b, c, d) # define pvr_fog_table_color(a,r,g,b) # define pvr_fog_table_linear(s,e) +# else +# define mat_load2(a) mat_load(a) +# define mat_store2(a) mat_store2(a) +# define mat_identity2() mat_identity() # endif #define mat_trans_single3_nomod(x_, y_, z_, x2, y2, z2) do { \ From 1f4dace511991ee4429d01ad95884437d6471684 Mon Sep 17 00:00:00 2001 From: Falco Girgis Date: Sat, 26 Apr 2025 11:36:49 -0500 Subject: [PATCH 6/6] Fixing simulator builds. Ugh. Stupid typo. Accidentally defined mat_store2() to mat_store2(). :( --- vendor/librw/src/dc/rwdc_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/librw/src/dc/rwdc_common.h b/vendor/librw/src/dc/rwdc_common.h index 8e3d899f..9c8f673a 100644 --- a/vendor/librw/src/dc/rwdc_common.h +++ b/vendor/librw/src/dc/rwdc_common.h @@ -642,7 +642,7 @@ __hot inline void mat_apply_rotate_z(float z) { # define pvr_fog_table_linear(s,e) # else # define mat_load2(a) mat_load(a) -# define mat_store2(a) mat_store2(a) +# define mat_store2(a) mat_store(a) # define mat_identity2() mat_identity() # endif