From 4d13e821b5726e3a8fbcc5935eea30203889668c Mon Sep 17 00:00:00 2001 From: Falco Girgis Date: Tue, 15 Apr 2025 12:06:12 -0500 Subject: [PATCH] Liberty/Miami Perf Gainz Phase 3 1) synced dca3-kos repo which has some gainzy commits 2) rwdc_common.h - all low-level and matrix/vector routines for SH4 are now shared in this common file, included in both RW and Liberty/Miami engines 3) CMatrix a. assignment operator: now uses asm-optimized mat_copy() b. multiplication operator: now use mat_mult() SH4 routine c. Scale(): applies a scale matrix via mat_scale d. MultiplyInverse: fipr-optimizations 4) CQuaternion a. multiplication: SH4 ASM FIPR optimized b. Get(V3d& axis, float &angle): fast inversion/division c. Set(RWMatrix&): fast division 5) CVector a. Multiply3x3() now accelerated with mat_transpose 5) RwQuat a. mult(): FIPR accelerated b. length(): FIPR/FSRRA accelerated --- liberty/common.mk | 1 + miami/common.mk | 1 + src/common/vmu/vmu.cpp | 41 +- src/liberty/core/common.h | 21 +- src/liberty/math/Matrix.cpp | 84 +--- src/liberty/math/Matrix.h | 21 + src/liberty/math/Quaternion.cpp | 17 +- src/liberty/math/Quaternion.h | 14 +- src/liberty/math/Vector.cpp | 9 + src/liberty/math/VuVector.h | 6 +- src/liberty/math/maths.h | 71 +-- src/miami/core/common.h | 29 +- src/miami/math/Matrix.cpp | 40 +- src/miami/math/Matrix.h | 32 +- src/miami/math/Quaternion.cpp | 17 +- src/miami/math/Quaternion.h | 14 +- src/miami/math/Vector.cpp | 9 + src/miami/math/VuVector.h | 7 +- src/miami/math/maths.h | 44 +- vendor/dca3-kos | 2 +- vendor/librw/src/base.cpp | 8 + vendor/librw/src/dc/rwdc.cpp | 7 - vendor/librw/src/dc/rwdc_common.h | 739 ++++++++++++++++++++++++++++++ vendor/librw/src/rwbase.h | 11 +- 24 files changed, 967 insertions(+), 278 deletions(-) create mode 100644 vendor/librw/src/dc/rwdc_common.h diff --git a/liberty/common.mk b/liberty/common.mk index d6d585ec..e5fd01a4 100644 --- a/liberty/common.mk +++ b/liberty/common.mk @@ -381,6 +381,7 @@ INCLUDE = \ -I../src/liberty/skel/win \ \ -I../vendor/librw \ +-I../vendor/librw/src/dc \ \ -I../vendor/miniLZO \ \ diff --git a/miami/common.mk b/miami/common.mk index dc043f7b..7e2c915b 100644 --- a/miami/common.mk +++ b/miami/common.mk @@ -396,6 +396,7 @@ INCLUDE = \ -I../src/miami/skel/win \ \ -I../vendor/librw \ +-I../vendor/librw/src/dc \ \ -I../vendor/miniLZO \ \ diff --git a/src/common/vmu/vmu.cpp b/src/common/vmu/vmu.cpp index 4e04e09a..57b5de41 100644 --- a/src/common/vmu/vmu.cpp +++ b/src/common/vmu/vmu.cpp @@ -94,23 +94,26 @@ void VmuProfiler::run() { pvr_stats_t pvrStats; pvr_get_stats(&pvrStats); uint32_t sramStats = snd_mem_available(); size_t pvrAvail = pvr_mem_available(); - float fps = std::accumulate(std::begin(fps_), std::end(fps_), 0.0f) - / static_cast(fpsSamples); float sh4Mem = heapUtilization(); float pvrMem = (8_MB - pvrAvail ) / 8_MB * 100.0f; float armMem = (2_MB - sramStats) / 2_MB * 100.0f; - float vtxBuf = vertBuffUse_; - { - std::shared_lock lk(mtx_); - vmu_printf("FPS :%6.2f\n" - "SH4 :%6.2f%%\n" - "PVR :%6.2f%%\n" - "ARM :%6.2f%%\n" - "VTX :%6.2f%%", - fps, sh4Mem, pvrMem, armMem, vtxBuf); + float vtxBuf; + float fps; + { /* Critical section with main thread. */ + std::shared_lock lk(mtx_); + vtxBuf = vertBuffUse_; + fps = std::accumulate(std::begin(fps_), std::end(fps_), 0.0f) + / static_cast(fpsSamples); } + + vmu_printf(" FPS:%6.2f\n" + " RAM:%6.2f%%\n" + "VRAM:%6.2f%%\n" + "SRAM:%6.2f%%\n" + " VTX:%6.2f%%", + fps, sh4Mem, pvrMem, armMem, vtxBuf); } #endif @@ -119,15 +122,21 @@ void VmuProfiler::run() { } void VmuProfiler::updateVertexBufferUsage() { +#ifndef DC_SH4 std::unique_lock lk(mtx_); updated_ = true; -#ifdef DC_SH4 - vertBuffUse_ = vertexBufferUtilization(); +#else + pvr_stats_t pvrStats; + pvr_get_stats(&pvrStats); + float vtxUtil = vertexBufferUtilization(); - pvr_stats_t pvrStats; - pvr_get_stats(&pvrStats); - fps_[fpsFrame_++] = pvrStats.frame_rate; + { /* Critical section with VMU thread. */ + std::unique_lock lk(mtx_); + vertBuffUse_ = vtxUtil; + updated_ = true; + fps_[fpsFrame_++] = pvrStats.frame_rate; + } if(fpsFrame_ >= fpsSamples) fpsFrame_ = 0; diff --git a/src/liberty/core/common.h b/src/liberty/core/common.h index 661499d6..ed1a23d3 100644 --- a/src/liberty/core/common.h +++ b/src/liberty/core/common.h @@ -82,8 +82,6 @@ #define rwVENDORID_ROCKSTAR 0x0253F2 -__always_inline auto Max(auto a, auto b) { return ((a > b)? a : b); } -__always_inline auto Min(auto a, auto b) { return ((a < b)? a : b); } // Use this to add const that wasn't there in the original code #define Const const @@ -299,15 +297,6 @@ extern int strcasecmp(const char *str1, const char *str2); extern wchar *AllocUnicode(const char*src); -template -__always_inline T Clamp(T v, auto low, auto high) { - return std::clamp(v, static_cast(low), static_cast(high)); -} - -__always_inline auto Clamp2(auto v, auto center, auto radius) { - return (v > center) ? Min(v, center + radius) : Max(v, center - radius); -} - #define SQR(x) ((x) * (x)) __always_inline auto sq(auto x) { return SQR(x); } @@ -418,15 +407,7 @@ template struct check_size { #endif #define BIT(num) (1<<(num)) -#define ABS(a) std::abs(a) +#define ABS(a) Abs(a) -__always_inline auto norm(auto value, auto min, auto max) { - return (Clamp(value, min, max) - min) / (max - min); -} // we use std::lerp now //#define lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) ) - -#define STRINGIFY(x) #x -#define STR(x) STRINGIFY(x) -#define CONCAT_(x,y) x##y -#define CONCAT(x,y) CONCAT_(x,y) diff --git a/src/liberty/math/Matrix.cpp b/src/liberty/math/Matrix.cpp index d275d963..a919baaf 100644 --- a/src/liberty/math/Matrix.cpp +++ b/src/liberty/math/Matrix.cpp @@ -4,7 +4,7 @@ CMatrix::CMatrix(CMatrix const &m) { m_attachment = nil; m_hasRwMatrix = false; - *this = m; + mat_copy(*this, m); } CMatrix::CMatrix(RwMatrix *matrix, bool owner) @@ -75,7 +75,7 @@ CMatrix::UpdateRW(void) void CMatrix::operator=(CMatrix const &rhs) { - memcpy(this, &rhs, sizeof(f)); + mat_copy(*this, rhs); if (m_attachment) UpdateRW(); } @@ -83,7 +83,7 @@ CMatrix::operator=(CMatrix const &rhs) void CMatrix::CopyOnlyMatrix(const CMatrix &other) { - memcpy(this, &other, sizeof(f)); + mat_copy(*this, other); } CMatrix & @@ -277,9 +277,9 @@ void CMatrix::RotateX(float x) { #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - mat_load(reinterpret_cast(this)); - mat_rotate_x(x); - mat_store(reinterpret_cast(this)); + mat_load(reinterpret_cast(this)); + mat_rotate_x(x); + mat_store(reinterpret_cast(this)); #else auto [s, c] = SinCos(x); @@ -307,9 +307,9 @@ void CMatrix::RotateY(float y) { #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - mat_load(reinterpret_cast(this)); - mat_rotate_y(y); - mat_store(reinterpret_cast(this)); + mat_load(reinterpret_cast(this)); + mat_rotate_y(y); + mat_store(reinterpret_cast(this)); #else auto [s, c] = SinCos(y); @@ -337,9 +337,9 @@ void CMatrix::RotateZ(float z) { #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - mat_load(reinterpret_cast(this)); - mat_rotate_z(z); - mat_store(reinterpret_cast(this)); + mat_load(reinterpret_cast(this)); + mat_rotate_z(z); + mat_store(reinterpret_cast(this)); #else auto [s, c] = SinCos(z); @@ -367,9 +367,9 @@ void CMatrix::Rotate(float x, float y, float z) { #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - mat_load(reinterpret_cast(this)); - mat_rotate(x, y, z); - mat_store(reinterpret_cast(this)); + mat_load(reinterpret_cast(this)); + mat_rotate(x, y, z); + mat_store(reinterpret_cast(this)); #else auto [sX, cX] = SinCos(x); auto [sY, cY] = SinCos(y); @@ -449,65 +449,13 @@ CMatrix::Reorthogonalise(void) f = CrossProduct(u, r); } -#ifdef DC_SH4 -static __always_inline void MATH_Load_Matrix_Product(const matrix_t* matrix1, const matrix_t* matrix2) -{ - unsigned int prefetch_scratch; - - asm volatile ( - "mov %[bmtrx], %[pref_scratch]\n\t" // (MT) - "add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle) - "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE) - "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS) - // back matrix - "fmov.d @%[bmtrx]+, XD0\n\t" // (LS) - "fmov.d @%[bmtrx]+, XD2\n\t" - "fmov.d @%[bmtrx]+, XD4\n\t" - "fmov.d @%[bmtrx]+, XD6\n\t" - "pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait (LS) - "fmov.d @%[bmtrx]+, XD8\n\t" // bmtrx prefetch should work for here - "fmov.d @%[bmtrx]+, XD10\n\t" - "fmov.d @%[bmtrx]+, XD12\n\t" - "mov %[fmtrx], %[pref_scratch]\n\t" // (MT) - "add #32, %[pref_scratch]\n\t" // store offset by 32 in r0 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle) - "fmov.d @%[bmtrx], XD14\n\t" - "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS) - // front matrix - // interleave loads and matrix multiply 4x4 - "fmov.d @%[fmtrx]+, DR0\n\t" - "fmov.d @%[fmtrx]+, DR2\n\t" - "fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec - "ftrv XMTRX, FV0\n\t" // (FE) - - "fmov.d @%[fmtrx]+, DR6\n\t" - "fmov.d @%[fmtrx]+, DR8\n\t" - "ftrv XMTRX, FV4\n\t" - - "fmov.d @%[fmtrx]+, DR10\n\t" - "fmov.d @%[fmtrx]+, DR12\n\t" - "ftrv XMTRX, FV8\n\t" - - "fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles) - "fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE) - "ftrv XMTRX, FV12\n\t" // (FE) - // Save output in XF regs - "frchg\n" - : [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed - : // no inputs - : "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered) - ); -} -#endif - CMatrix operator*(const CMatrix &m1, const CMatrix &m2) { // TODO: VU0 code CMatrix out; #if defined(RW_DC) - mat_load(reinterpret_cast(&m1)); - mat_apply(reinterpret_cast(&m2)); - mat_store(reinterpret_cast(&out)); + mat_mult(out, m1, m2); #else out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz; out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz; diff --git a/src/liberty/math/Matrix.h b/src/liberty/math/Matrix.h index d2ff6067..06dab72a 100644 --- a/src/liberty/math/Matrix.h +++ b/src/liberty/math/Matrix.h @@ -1,5 +1,7 @@ #pragma once +#include "rwdc_common.h" + class alignas(8) CMatrix { public: @@ -27,6 +29,8 @@ public: SetScale(scale); } ~CMatrix(void); + operator matrix_t *() { return reinterpret_cast(this); } + operator const matrix_t *() const { return reinterpret_cast(this); } void Attach(RwMatrix *matrix, bool owner = false); void AttachRW(RwMatrix *matrix, bool owner = false); void Detach(void); @@ -59,6 +63,7 @@ public: void SetScale(float s); void Scale(float scale) { +#ifndef DC_SH4 for (int i = 0; i < 3; i++) #ifdef FIX_BUGS // BUGFIX from VC for (int j = 0; j < 3; j++) @@ -66,6 +71,11 @@ public: for (int j = 0; j < 4; j++) #endif f[i][j] *= scale; +#else + mat_load(*this); + mat_scale(scale, scale, scale); + mat_store(*this); +#endif } @@ -102,11 +112,22 @@ CMatrix Invert(const CMatrix &matrix); CMatrix operator*(const CMatrix &m1, const CMatrix &m2); inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec) { +#ifndef DC_SH4 CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz); return CVector( mat.rx * v.x + mat.ry * v.y + mat.rz * v.z, mat.fx * v.x + mat.fy * v.y + mat.fz * v.z, mat.ux * v.x + mat.uy * v.y + mat.uz * v.z); +#else + register float x asm(KOS_FPARG(0)) = vec.x - mat.px; + register float y asm(KOS_FPARG(1)) = vec.y - mat.py; + register float z asm(KOS_FPARG(2)) = vec.z - mat.pz; + return CVector( + fipr(x, y, z, 0.0f, mat.rx, mat.ry, mat.rz, 0.0f), + fipr(x, y, z, 0.0f, mat.fx, mat.fy, mat.fz, 0.0f), + fipr(x, y, z, 0.0f, mat.ux, mat.uy, mat.uz, 0.0f) + ); +#endif } diff --git a/src/liberty/math/Quaternion.cpp b/src/liberty/math/Quaternion.cpp index b0e782e2..9f55b234 100644 --- a/src/liberty/math/Quaternion.cpp +++ b/src/liberty/math/Quaternion.cpp @@ -39,10 +39,14 @@ CQuaternion::Slerp(const CQuaternion &q1, const CQuaternion &q2, float theta, fl void CQuaternion::Multiply(const CQuaternion &q1, const CQuaternion &q2) { +#ifndef DC_SH4 x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w); y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w); z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w); w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z); +#else + quat_mult(*this, q1, q2); +#endif } void @@ -51,9 +55,16 @@ CQuaternion::Get(RwV3d *axis, float *angle) *angle = Acos(w); float s = Sin(*angle); +#ifndef DC_SH4 axis->x = x * (1.0f / s); axis->y = y * (1.0f / s); axis->z = z * (1.0f / s); +#else + float invS = dc::Invert(s); + axis->x = x * invS; + axis->y = y * invS; + axis->z = z * invS; +#endif } void @@ -104,7 +115,7 @@ CQuaternion::Set(const RwMatrix &matrix) if (f >= 0.0f) { s = Sqrt(f + 1.0f); w = 0.5f * s; - m = 0.5f / s; + m = Div(0.5f, s); x = (matrix.up.z - matrix.at.y) * m; y = (matrix.at.x - matrix.right.z) * m; z = (matrix.right.y - matrix.up.x) * m; @@ -115,7 +126,7 @@ CQuaternion::Set(const RwMatrix &matrix) if (f >= 0.0f) { s = Sqrt(f + 1.0f); x = 0.5f * s; - m = 0.5f / s; + m = Div(0.5f, s); y = (matrix.up.x + matrix.right.y) * m; z = (matrix.at.x + matrix.right.z) * m; w = (matrix.up.z - matrix.at.y) * m; @@ -126,7 +137,7 @@ CQuaternion::Set(const RwMatrix &matrix) if (f >= 0.0f) { s = Sqrt(f + 1.0f); y = 0.5f * s; - m = 0.5f / s; + m = Div(0.5f, s); w = (matrix.at.x - matrix.right.z) * m; x = (matrix.up.x - matrix.right.y) * m; z = (matrix.at.y + matrix.up.z) * m; diff --git a/src/liberty/math/Quaternion.h b/src/liberty/math/Quaternion.h index 47581368..c1b0e481 100644 --- a/src/liberty/math/Quaternion.h +++ b/src/liberty/math/Quaternion.h @@ -1,6 +1,7 @@ #pragma once #include "src/common_defines.h" +#include "rwdc_common.h" // TODO: actually implement this class CQuaternion @@ -10,6 +11,10 @@ public: CQuaternion(void) {} CQuaternion(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {} + operator quaternion_t *() { return reinterpret_cast(this); } + operator const quaternion_t *() const { return reinterpret_cast(this); } + operator quaternion_t &() { return *reinterpret_cast(this); } + operator const quaternion_t &() const { return *reinterpret_cast(this); } float Magnitude(void) const { #ifndef DC_SH4 return Sqrt(x*x + y*y + z*z + w*w); @@ -57,10 +62,11 @@ public: } const CQuaternion &operator/=(float right) { - x /= right; - y /= right; - z /= right; - w /= right; + right = dc::Invert(right); + x *= right; + y *= right; + z *= right; + w *= right; return *this; } diff --git a/src/liberty/math/Vector.cpp b/src/liberty/math/Vector.cpp index 9ac3c2d7..4aebb789 100644 --- a/src/liberty/math/Vector.cpp +++ b/src/liberty/math/Vector.cpp @@ -50,9 +50,18 @@ Multiply3x3(const CMatrix &mat, const CVector &vec) CVector Multiply3x3(const CVector &vec, const CMatrix &mat) { +#ifndef DC_SH4 return CVector(mat.rx * vec.x + mat.ry * vec.y + mat.rz * vec.z, mat.fx * vec.x + mat.fy * vec.y + mat.fz * vec.z, mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z); +#else + CVector out; + mat_load(mat); + mat_transpose(); + mat_trans_vec3_nomod(vec.x, vec.y, vec.z, + out.x, out.y, out.z); + return out; +#endif } CVector diff --git a/src/liberty/math/VuVector.h b/src/liberty/math/VuVector.h index 4f0239b7..f2f4a8e4 100644 --- a/src/liberty/math/VuVector.h +++ b/src/liberty/math/VuVector.h @@ -44,7 +44,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const CV sqc2 vf06,0x0(%0)\n\ ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); #elif defined(DC_SH4) - mat_load(reinterpret_cast(const_cast(&mat))); + mat_load(mat); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); #else out = mat * in; @@ -71,7 +71,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw sqc2 vf06,0x0(%0)\n\ ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); #elif defined(DC_SH4) - mat_load(reinterpret_cast(const_cast(&mat))); + mat_load(mat); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); #else out = mat * in; @@ -108,7 +108,7 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat, bnez %1,1b\n\ ": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory"); #elif defined(DC_SH4) - mat_load(reinterpret_cast(const_cast(&mat))); + mat_load(mat); while(n--) { mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z); in = reinterpret_cast(reinterpret_cast(in) + stride); diff --git a/src/liberty/math/maths.h b/src/liberty/math/maths.h index 9959bf61..5e1cf016 100644 --- a/src/liberty/math/maths.h +++ b/src/liberty/math/maths.h @@ -1,73 +1,6 @@ #pragma once #include "src/common_defines.h" +#include "rwdc_common.h" -#include -#include - -#ifdef DC_SH4 - -#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \ - register float __x __asm__("fr12") = (x); \ - register float __y __asm__("fr13") = (y); \ - register float __z __asm__("fr14") = (z); \ - register float __w __asm__("fr15") = 1.0f; \ - __asm__ __volatile__( "ftrv xmtrx, fv12\n" \ - : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \ - : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \ - x2 = __x; y2 = __y; z2 = __z; w2 = __w; \ - } while(false) - -#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \ - register float __x __asm__("fr12") = (x); \ - register float __y __asm__("fr13") = (y); \ - register float __z __asm__("fr14") = (z); \ - register float __w __asm__("fr15") = 1.0f; \ - __asm__ __volatile__( "ftrv xmtrx, fv12\n" \ - : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \ - : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \ - w = __w; \ - } while(false) - -__always_inline float Fmac(float a, float b, float c) { - asm volatile ("fmac fr0, %[floatb], %[floatc]\n" - : [floatc] "+f" (c) : "w" (a), [floatb] "f" (b) : ); - return c; -} - -#else - -#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \ - vector_t tmp = { x_, y_, z_, 1.0f }; \ - mat_transform(&tmp, &tmp, 1, 0); \ - x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; w2 = tmp.w; \ - } while(false) - -#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \ - vector_t tmp = { x_, y_, z_, 1.0f }; \ - mat_transform(&tmp, &tmp, 1, 0); \ - w_ = tmp.w; \ - } while(false) - -__always_inline float Fmac(float a, float b, float c) { return a * b + c; } - -#endif - -__always_inline float Sin(float x) { return __builtin_sinf(x); } -__always_inline float Cos(float x) { return __builtin_cosf(x); } -__always_inline auto SinCos(float x) { return std::pair { Sin(x), Cos(x) }; } -__always_inline float Tan(float x) { return __builtin_tanf(x); } -__always_inline float Abs(float x) { return __builtin_fabsf(x); } -__always_inline float Sqrt(float x) { return __builtin_sqrtf(x); } -__always_inline float RecipSqrt(float x) { return 1.0f / __builtin_sqrtf(x); } -__always_inline float Asin(float x) { return __builtin_asinf(x); } -__always_inline float Acos(float x) { return __builtin_acosf(x); } -__always_inline float Atan(float x) { return __builtin_atanf(x); } -__always_inline float Atan2(float y, float x) { return __builtin_atan2f(y, x); } -__always_inline float RecipSqrt(float x, float y) { return x / __builtin_sqrtf(y); /*y = RecipSqrt(y); return x * y * y;*/ } -__always_inline float Pow(float x, float y) { return __builtin_powf(x, y); } -__always_inline float Floor(float x) { return __builtin_floorf(x); } -__always_inline float Ceil(float x) { return __builtin_ceilf(x); } -__always_inline float Invert(float x) { return (((x) < 0.0f)? -1.0f : 1.0f) * RecipSqrt((x) * (x)); } -__always_inline float Div(float x, float y) { return x * Invert(y); } -__always_inline float Lerp(float a, float b, float t) { return Fmac(t, (b - a), a); } \ No newline at end of file +using namespace dc; diff --git a/src/miami/core/common.h b/src/miami/core/common.h index 71911ec3..426ebeca 100644 --- a/src/miami/core/common.h +++ b/src/miami/core/common.h @@ -85,8 +85,6 @@ #define rwVENDORID_ROCKSTAR 0x0253F2 -#define Max(a,b) ((a) > (b) ? (a) : (b)) -#define Min(a,b) ((a) < (b) ? (a) : (b)) // Use this to add const that wasn't there in the original code #define Const const @@ -303,12 +301,8 @@ extern int strncasecmp(const char *str1, const char *str2, size_t len); extern wchar *AllocUnicode(const char*src); -#define Clamp(v, low, high) ((v)<(low) ? (low) : (v)>(high) ? (high) : (v)) - -#define Clamp2(v, center, radius) ((v) > (center) ? Min(v, center + radius) : Max(v, center - radius)) - -inline float sq(float x) { return x*x; } #define SQR(x) ((x) * (x)) +__always_inline auto sq(auto x) { return SQR(x); } #ifdef __MWERKS__ #define M_E 2.71828182845904523536 // e @@ -326,7 +320,10 @@ inline float sq(float x) { return x*x; } #define M_SQRT1_2 0.707106781186547524401 // 1/sqrt(2) #endif -#define PI (float)M_PI +#ifndef DC_SH4 +#define F_PI M_PI +#endif +#define PI (float)F_PI #define TWOPI (PI*2) #define HALFPI (PI/2) #define DEGTORAD(x) ((x) * PI / 180.0f) @@ -380,7 +377,10 @@ __inline__ void TRACE(char *f, ...) { } // this is re3 only, and so the function #endif #endif -#ifndef MASTER +#ifdef assert +#undef assert +#endif +#if !defined(MASTER) #define assert(_Expression) (void)( (!!(_Expression)) || (re3_assert(#_Expression, __FILE__, __LINE__, __FUNCTION__), 0) ) #else #define assert(_Expression) (_Expression) @@ -411,12 +411,7 @@ template struct check_size { #endif #define BIT(num) (1<<(num)) -#define ABS(a) (((a) < 0) ? (-(a)) : (a)) -#define norm(value, min, max) (((value) < (min)) ? 0 : (((value) > (max)) ? 1 : (((value) - (min)) / ((max) - (min))))) -#define Lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) ) - -#define STRINGIFY(x) #x -#define STR(x) STRINGIFY(x) -#define CONCAT_(x,y) x##y -#define CONCAT(x,y) CONCAT_(x,y) +#define ABS(a) Abs(a) +// we use std::lerp now +//#define lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) ) diff --git a/src/miami/math/Matrix.cpp b/src/miami/math/Matrix.cpp index 62b7029c..f90e3c06 100644 --- a/src/miami/math/Matrix.cpp +++ b/src/miami/math/Matrix.cpp @@ -4,7 +4,7 @@ CMatrix::CMatrix(CMatrix const &m) { m_attachment = nil; m_hasRwMatrix = false; - *this = m; + mat_copy(*this, m); } CMatrix::CMatrix(RwMatrix *matrix, bool owner) @@ -75,7 +75,7 @@ CMatrix::UpdateRW(void) void CMatrix::operator=(CMatrix const &rhs) { - memcpy(this, &rhs, sizeof(f)); + mat_copy(*this, rhs); if (m_attachment) UpdateRW(); } @@ -83,7 +83,7 @@ CMatrix::operator=(CMatrix const &rhs) void CMatrix::CopyOnlyMatrix(const CMatrix &other) { - memcpy(this, &other, sizeof(f)); + mat_copy(*this, other); } CMatrix & @@ -358,12 +358,14 @@ CMatrix::RotateZ(float z) void CMatrix::Rotate(float x, float y, float z) { - float cX = Cos(x); - float sX = Sin(x); - float cY = Cos(y); - float sY = Sin(y); - float cZ = Cos(z); - float sZ = Sin(z); +#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results + mat_load(reinterpret_cast(this)); + mat_rotate(x, y, z); + mat_store(reinterpret_cast(this)); +#else + auto [sX, cX] = SinCos(x); + auto [sY, cY] = SinCos(y); + auto [sZ, cZ] = SinCos(z); float rx = this->rx; float ry = this->ry; @@ -388,6 +390,20 @@ CMatrix::Rotate(float x, float y, float z) float z2 = sZ * sY - (cZ * sX) * cY; float z3 = cX * cY; + #if !defined(DC_TEXCONV) && !defined(DC_SIM) + this->rx = fipr(x1, y1, z1, 0, rx, ry, rz, 0); + this->ry = fipr(x2, y2, z2, 0, rx, ry, rz, 0); + this->rz = fipr(x3, y3, z3, 0, rx, ry, rz, 0); + this->fx = fipr(x1, y1, z1, 0, ux, uy, uz, 0); + this->fy = fipr(x2, y2, z2, 0, ux, uy, uz, 0); + this->fz = fipr(x3, y3, z3, 0, ux, uy, uz, 0); + this->ux = fipr(x1, y1, z1, 0, ax, ay, az, 0); + this->uy = fipr(x2, y2, z2, 0, ax, ay, az, 0); + this->uz = fipr(x3, y3, z3, 0, ax, ay, az, 0); + this->px = fipr(x1, y1, z1, 0, px, py, pz, 0); + this->py = fipr(x2, y2, z2, 0, px, py, pz, 0); + this->pz = fipr(x3, y3, z3, 0, px, py, pz, 0); + #else this->rx = x1 * rx + y1 * ry + z1 * rz; this->ry = x2 * rx + y2 * ry + z2 * rz; this->rz = x3 * rx + y3 * ry + z3 * rz; @@ -400,6 +416,8 @@ CMatrix::Rotate(float x, float y, float z) this->px = x1 * px + y1 * py + z1 * pz; this->py = x2 * px + y2 * py + z2 * pz; this->pz = x3 * px + y3 * py + z3 * pz; + #endif +#endif } CMatrix & @@ -429,9 +447,7 @@ operator*(const CMatrix &m1, const CMatrix &m2) // TODO: VU0 code CMatrix out; #if defined(RW_DC) - mat_load(reinterpret_cast(&m1)); - mat_apply(reinterpret_cast(&m2)); - mat_store(reinterpret_cast(&out)); + mat_mult(out, m1, m2); #else out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz; out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz; diff --git a/src/miami/math/Matrix.h b/src/miami/math/Matrix.h index dd6f892e..456c6f05 100644 --- a/src/miami/math/Matrix.h +++ b/src/miami/math/Matrix.h @@ -43,6 +43,8 @@ public: SetScale(scale); } ~CMatrix(void); + operator matrix_t *() { return reinterpret_cast(this); } + operator const matrix_t *() const { return reinterpret_cast(this); } void Attach(RwMatrix *matrix, bool owner = false); void AttachRW(RwMatrix *matrix, bool owner = false); void Detach(void); @@ -75,20 +77,31 @@ public: void SetScale(float s); void Scale(float scale) { +#if !defined(DC_SH4) || 1 /* TODO */ for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) f[i][j] *= scale; +#else + mat_load(*this); + mat_scale(scale, scale, scale); + mat_store(*this); +#endif } void Scale(float sx, float sy, float sz) { +#if !defined(DC_SH4) || 1 /* TODO */ for (int i = 0; i < 3; i++){ - f[i][0] *= sx; - f[i][1] *= sy; - f[i][2] *= sz; + f[i][0] *= sx; + f[i][1] *= sy; + f[i][2] *= sz; } +#else + mat_load(*this); + mat_scale(sx, sy, sz); + mat_store(*this); +#endif } - void SetRotateXOnly(float angle); void SetRotateYOnly(float angle); void SetRotateZOnly(float angle); @@ -125,11 +138,22 @@ CMatrix Invert(const CMatrix &matrix); CMatrix operator*(const CMatrix &m1, const CMatrix &m2); inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec) { +#ifndef DC_SH4 CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz); return CVector( mat.rx * v.x + mat.ry * v.y + mat.rz * v.z, mat.fx * v.x + mat.fy * v.y + mat.fz * v.z, mat.ux * v.x + mat.uy * v.y + mat.uz * v.z); +#else + register float x asm(KOS_FPARG(0)) = vec.x - mat.px; + register float y asm(KOS_FPARG(1)) = vec.y - mat.py; + register float z asm(KOS_FPARG(2)) = vec.z - mat.pz; + return CVector( + fipr(x, y, z, 0.0f, mat.rx, mat.ry, mat.rz, 0.0f), + fipr(x, y, z, 0.0f, mat.fx, mat.fy, mat.fz, 0.0f), + fipr(x, y, z, 0.0f, mat.ux, mat.uy, mat.uz, 0.0f) + ); +#endif } diff --git a/src/miami/math/Quaternion.cpp b/src/miami/math/Quaternion.cpp index b0e782e2..9f55b234 100644 --- a/src/miami/math/Quaternion.cpp +++ b/src/miami/math/Quaternion.cpp @@ -39,10 +39,14 @@ CQuaternion::Slerp(const CQuaternion &q1, const CQuaternion &q2, float theta, fl void CQuaternion::Multiply(const CQuaternion &q1, const CQuaternion &q2) { +#ifndef DC_SH4 x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w); y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w); z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w); w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z); +#else + quat_mult(*this, q1, q2); +#endif } void @@ -51,9 +55,16 @@ CQuaternion::Get(RwV3d *axis, float *angle) *angle = Acos(w); float s = Sin(*angle); +#ifndef DC_SH4 axis->x = x * (1.0f / s); axis->y = y * (1.0f / s); axis->z = z * (1.0f / s); +#else + float invS = dc::Invert(s); + axis->x = x * invS; + axis->y = y * invS; + axis->z = z * invS; +#endif } void @@ -104,7 +115,7 @@ CQuaternion::Set(const RwMatrix &matrix) if (f >= 0.0f) { s = Sqrt(f + 1.0f); w = 0.5f * s; - m = 0.5f / s; + m = Div(0.5f, s); x = (matrix.up.z - matrix.at.y) * m; y = (matrix.at.x - matrix.right.z) * m; z = (matrix.right.y - matrix.up.x) * m; @@ -115,7 +126,7 @@ CQuaternion::Set(const RwMatrix &matrix) if (f >= 0.0f) { s = Sqrt(f + 1.0f); x = 0.5f * s; - m = 0.5f / s; + m = Div(0.5f, s); y = (matrix.up.x + matrix.right.y) * m; z = (matrix.at.x + matrix.right.z) * m; w = (matrix.up.z - matrix.at.y) * m; @@ -126,7 +137,7 @@ CQuaternion::Set(const RwMatrix &matrix) if (f >= 0.0f) { s = Sqrt(f + 1.0f); y = 0.5f * s; - m = 0.5f / s; + m = Div(0.5f, s); w = (matrix.at.x - matrix.right.z) * m; x = (matrix.up.x - matrix.right.y) * m; z = (matrix.at.y + matrix.up.z) * m; diff --git a/src/miami/math/Quaternion.h b/src/miami/math/Quaternion.h index 47581368..c1b0e481 100644 --- a/src/miami/math/Quaternion.h +++ b/src/miami/math/Quaternion.h @@ -1,6 +1,7 @@ #pragma once #include "src/common_defines.h" +#include "rwdc_common.h" // TODO: actually implement this class CQuaternion @@ -10,6 +11,10 @@ public: CQuaternion(void) {} CQuaternion(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {} + operator quaternion_t *() { return reinterpret_cast(this); } + operator const quaternion_t *() const { return reinterpret_cast(this); } + operator quaternion_t &() { return *reinterpret_cast(this); } + operator const quaternion_t &() const { return *reinterpret_cast(this); } float Magnitude(void) const { #ifndef DC_SH4 return Sqrt(x*x + y*y + z*z + w*w); @@ -57,10 +62,11 @@ public: } const CQuaternion &operator/=(float right) { - x /= right; - y /= right; - z /= right; - w /= right; + right = dc::Invert(right); + x *= right; + y *= right; + z *= right; + w *= right; return *this; } diff --git a/src/miami/math/Vector.cpp b/src/miami/math/Vector.cpp index a3510a6c..e2bc5f7e 100644 --- a/src/miami/math/Vector.cpp +++ b/src/miami/math/Vector.cpp @@ -50,9 +50,18 @@ Multiply3x3(const CMatrix &mat, const CVector &vec) CVector Multiply3x3(const CVector &vec, const CMatrix &mat) { +#ifndef DC_SH4 return CVector(mat.rx * vec.x + mat.ry * vec.y + mat.rz * vec.z, mat.fx * vec.x + mat.fy * vec.y + mat.fz * vec.z, mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z); +#else + CVector out; + mat_load(mat); + mat_transpose(); + mat_trans_vec3_nomod(vec.x, vec.y, vec.z, + out.x, out.y, out.z); + return out; +#endif } CVector diff --git a/src/miami/math/VuVector.h b/src/miami/math/VuVector.h index 1b184f97..3c5933c4 100644 --- a/src/miami/math/VuVector.h +++ b/src/miami/math/VuVector.h @@ -50,7 +50,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const CV sqc2 vf06,0x0(%0)\n\ ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); #elif defined(DC_SH4) - mat_load(reinterpret_cast(const_cast(&mat))); + mat_load(mat); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); #else out = mat * in; @@ -77,7 +77,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw sqc2 vf06,0x0(%0)\n\ ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); #elif defined(DC_SH4) - mat_load(reinterpret_cast(const_cast(&mat))); + mat_load(mat); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); #else out = mat * in; @@ -114,7 +114,7 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat, bnez %1,1b\n\ ": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory"); #elif defined(DC_SH4) - mat_load(reinterpret_cast(const_cast(&mat))); + mat_load(mat); while(n--) { mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z); in = reinterpret_cast(reinterpret_cast(in) + stride); @@ -137,7 +137,6 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat, lqc2 vf03,0x10(%2)\n\ lqc2 vf04,0x20(%2)\n\ lqc2 vf05,0x30(%2)\n\ - lqc2 vf01,0x0(%3)\n\ nop\n\ 1: vmulax.xyz ACC, vf02,vf01\n\ vmadday.xyz ACC, vf03,vf01\n\ diff --git a/src/miami/math/maths.h b/src/miami/math/maths.h index 6f2458a2..5e1cf016 100644 --- a/src/miami/math/maths.h +++ b/src/miami/math/maths.h @@ -1,46 +1,6 @@ #pragma once #include "src/common_defines.h" +#include "rwdc_common.h" -#include - -#ifdef DC_SH4 - -#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \ - register float __x __asm__("fr12") = (x); \ - register float __y __asm__("fr13") = (y); \ - register float __z __asm__("fr14") = (z); \ - register float __w __asm__("fr15") = 1.0f; \ - __asm__ __volatile__( "ftrv xmtrx, fv12\n" \ - : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \ - : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \ - x2 = __x; y2 = __y; z2 = __z; w2 = __w; \ - } while(false) - -#else - -#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \ - vector_t tmp = { x_, y_, z_, 1.0f }; \ - mat_transform(&tmp, &tmp, 1, 0); \ - x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; w2 = tmp.w; \ - } while(false) -#endif - - -// wrapper around float versions of functions -// in gta they are in CMaths but that makes the code rather noisy - -inline float Sin(float x) { return sinf(x); } -inline float Asin(float x) { return asinf(x); } -inline float Cos(float x) { return cosf(x); } -inline float Acos(float x) { return acosf(x); } -inline float Tan(float x) { return tanf(x); } -inline float Atan(float x) { return atanf(x); } -inline float Atan2(float y, float x) { return atan2f(y, x); } -inline float Abs(float x) { return fabsf(x); } -inline float Sqrt(float x) { return sqrtf(x); } -inline float RecipSqrt(float x, float y) { return x/Sqrt(y); } -inline float RecipSqrt(float x) { return RecipSqrt(1.0f, x); } -inline float Pow(float x, float y) { return powf(x, y); } -inline float Floor(float x) { return floorf(x); } -inline float Ceil(float x) { return ceilf(x); } +using namespace dc; diff --git a/vendor/dca3-kos b/vendor/dca3-kos index 5f69d048..144eaeab 160000 --- a/vendor/dca3-kos +++ b/vendor/dca3-kos @@ -1 +1 @@ -Subproject commit 5f69d048aaf2548687c33afc7726d603a22e29b7 +Subproject commit 144eaeab940246d9c84c1f14b45d9e2847705c5e diff --git a/vendor/librw/src/base.cpp b/vendor/librw/src/base.cpp index 391af2ab..a9b7b5d6 100644 --- a/vendor/librw/src/base.cpp +++ b/vendor/librw/src/base.cpp @@ -89,10 +89,18 @@ strncmp_ci(const char *s1, const char *s2, int n) Quat mult(const Quat &q, const Quat &p) { +#ifndef DC_SH4 return makeQuat(q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z, q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y, q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z, q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x); +#else + Quat o; + dc::quat_mult(reinterpret_cast(&o), + reinterpret_cast(q), + reinterpret_cast(p)); + return o; +#endif } diff --git a/vendor/librw/src/dc/rwdc.cpp b/vendor/librw/src/dc/rwdc.cpp index d282b317..d33de9b0 100644 --- a/vendor/librw/src/dc/rwdc.cpp +++ b/vendor/librw/src/dc/rwdc.cpp @@ -728,13 +728,6 @@ __always_inline void DCE_RenderSubmitVertexIM3D(float x, float y, float w, /* END TA Submission Functions*/ - - - -#if defined(DC_TEXCONV) -void malloc_stats() { } -#endif - #if 0 #define UNIMPL_LOG() printf("TODO: Implement %s @ %s:%d\n", __func__, __FILE__, __LINE__); #define UNIMPL_LOGV(fmt, ...) printf("TODO: Implement %s @ %s:%d " fmt "\n", __func__, __FILE__, __LINE__, __VA_ARGS__); diff --git a/vendor/librw/src/dc/rwdc_common.h b/vendor/librw/src/dc/rwdc_common.h new file mode 100644 index 00000000..e0f940b7 --- /dev/null +++ b/vendor/librw/src/dc/rwdc_common.h @@ -0,0 +1,739 @@ +#ifndef __RWDC_COMMON_H +#define __RWDC_COMMON_H + +#include +#include +#include +#include +#include +#include + +#ifndef RW_DC +# pragma warning(disable: 4244) // int to float +# pragma warning(disable: 4800) // int to bool +# pragma warning(disable: 4838) // narrowing conversion +# pragma warning(disable: 4996) // POSIX names +#else +# if !defined(DC_TEXCONV) && !defined(DC_SIM) +# include +# define DC_SH4 +# define VIDEO_MODE_WIDTH static_cast(vid_mode->width) +# define VIDEO_MODE_HEIGHT static_cast(vid_mode->height) +# define memcpy4 memcpy +# else +# ifdef DC_TEXCONV +# define malloc_stats() +# endif +# include +# include +# include +# define VIDEO_MODE_WIDTH 640.0f +# define VIDEO_MODE_HEIGHT 480.0f +# define memcpy4 memcpy +# define dcache_pref_block(a) __builtin_prefetch(a) +# define F_PI M_PI +# ifndef __always_inline +# define __always_inline inline +# endif +# endif +# ifdef PVR_TXRFMT_STRIDE +# undef PVR_TXRFMT_STRIDE +# define PVR_TXRFMT_STRIDE (1 << 25) +# endif + static_assert(PVR_TXRFMT_STRIDE == (1 << 25), + "PVR_TXRFMT_STRIDE is bugged in your KOS version"); +#endif + +#define F_PI_2 (F_PI * 0.5f) +#define __hot __attribute__((hot)) +#define __cold __attribute__((cold)) + +#define STRINGIFY(x) #x +#define STR(x) STRINGIFY(x) +#define CONCAT_(x,y) x##y +#define CONCAT(x,y) CONCAT_(x,y) + +namespace rw { + class Matrix; +} + +namespace dc { + +struct quaternion_t { + float x, y, z, w; +}; + +__always_inline __hot constexpr float Sin(float x) { return sinf(x); } +__always_inline __hot constexpr float Cos(float x) { return cosf(x); } +__always_inline __hot constexpr auto SinCos(float x) { return std::pair { Sin(x), Cos(x) }; } +__always_inline __hot constexpr float Abs(float x) { return fabsf(x); } +__always_inline __hot constexpr float Sqrt(float x) { return sqrtf(x); } +__always_inline __hot constexpr float RecipSqrt(float x, float y) { return x / Sqrt(y); } +__always_inline __hot constexpr float Pow(float x, float y) { return powf(x, y); } +__always_inline __hot constexpr float Floor(float x) { return floorf(x); } +__always_inline __hot constexpr float Ceil(float x) { return ceilf(x); } +__always_inline __hot constexpr float Fmac(auto a, auto b, auto c) { return a * b + c; } +__always_inline __hot constexpr float Lerp(float a, float b, float t) { return Fmac(t, (b - a), a); } +__always_inline __hot constexpr auto Max(auto a, auto b) { return ((a > b)? a : b); } +__always_inline __hot constexpr auto Min(auto a, auto b) { return ((a < b)? a : b); } + +template +__always_inline __hot constexpr float RecipSqrt(float x) { + if constexpr(CHECK_ZERO) + if(x == 0.0f) + return 0.0f; + + return 1.0f / Sqrt(x); +} + +template +__always_inline __hot constexpr T Clamp(T v, auto low, auto high) { + return std::clamp(v, static_cast(low), static_cast(high)); +} + +__always_inline constexpr auto Clamp2(auto v, auto center, auto radius) { + return (v > center) ? Min(v, center + radius) : Max(v, center - radius); +} + +template +__always_inline __hot constexpr float Invert(float x) { + float value; + + if(!std::is_constant_evaluated() && FAST_APPROX) { + value = RecipSqrt(x * x); + + if constexpr(COPY_SIGN) + if(x < 0.0f) + value = -value; + + } else value = 1.0f / x; + + return value; +} + +template +__always_inline __hot constexpr float Div(float x, float y) { + if(FAST_APPROX && !std::is_constant_evaluated()) + return x * Invert(y); + else + return x / y; +} + +template +__always_inline __hot constexpr auto Norm(auto value, auto min, auto max) { + auto numerator = Clamp(value, min, max) - min; + auto denominator = (max - min); + + if(FAST_APPROX && !std::is_constant_evaluated()) + return Div(numerator, denominator); + else + return numerator / denominator; +} + +template +__always_inline __hot constexpr float Tan(float x) { + if(!std::is_constant_evaluated() && FAST_APPROX) { + constexpr float pisqby4 = 2.4674011002723397f; + constexpr float adjpisqby4 = 2.471688400562703f; + constexpr float adj1minus8bypisq = 0.189759681063053f; + float xsq = x * x; + + return x * Div(adjpisqby4 - adj1minus8bypisq * xsq, + pisqby4 - xsq); + } else + return tanf(x); +} + +template +__always_inline __hot constexpr float Atan(float x) { + if(FAST_APPROX && !std::is_constant_evaluated()) { + constexpr float a[3] = { // + 0.998418889819911f, -2.9993501171084700E-01f, 0.0869142852883849f}; + float xx = x * x; + return ((a[2] * xx + a[1]) * xx + a[0]) * x; + } else return atanf(x); +} + +template +__hot constexpr float Atan2(float y, float x) { + if(FAST_APPROX && !std::is_constant_evaluated()) { +#if 0 + constexpr float halfpi_i754 = M_PI * 0.5f; + constexpr float quarterpi_i754 = M_PI * 0.25f; + // kludge to prevent 0/0 condition + float abs_y = Abs(y) + std::numeric_limits::epsilon(); + float absy_plus_absx = abs_y + Abs(x); + float inv_absy_plus_absx = Invert(absy_plus_absx); + float angle = halfpi_i754 - copysignf(quarterpi_i754, x); + float r = (x - copysignf(abs_y, x)) * inv_absy_plus_absx; + angle += (0.1963f * r * r - 0.9817f) * r; + return copysignf(angle, y); +#else + // Ensure input is in [-1, +1] + bool swap = fabs(x) < fabs(y); + float atan_input = (swap ? x : y) / (swap ? y : x); + + // Approximate atan + float res = Atan(atan_input); + + // If swapped, adjust atan output + res = swap ? (atan_input >= 0.0f ? F_PI_2 : -F_PI_2) - res : res; + // Adjust quadrants + if (x >= 0.0f && y >= 0.0f) {} // 1st quadrant + else if (x < 0.0f && y >= 0.0f) { res = F_PI + res; } // 2nd quadrant + else if (x < 0.0f && y < 0.0f) { res = -F_PI + res; } // 3rd quadrant + else if (x >= 0.0f && y < 0.0f) {} // 4th quadrant + + // Store result + return res; +#endif + } else return atan2f(y, x); +} + +template +__always_inline __hot constexpr float Asin(float x) { + if(FAST_APPROX && !std::is_constant_evaluated()) { + Atan(Div(x, Sqrt(1.0f-(x*x)))); + } else return asinf(x); +} + +template +__always_inline __hot constexpr float Acos(float x) { + if(FAST_APPROX && !std::is_constant_evaluated()) { + return (-0.69813170079773212f * x * x - 0.87266462599716477f) * x + 1.5707963267948966f; + } else return acosf(x); +} + +#ifdef RW_DC +# ifdef DC_SH4 + +#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \ + register float __x __asm__("fr12") = (x); \ + register float __y __asm__("fr13") = (y); \ + register float __z __asm__("fr14") = (z); \ + register float __w __asm__("fr15") = 1.0f; \ + __asm__ __volatile__( "ftrv xmtrx, fv12\n" \ + : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \ + : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \ + x2 = __x; y2 = __y; z2 = __z; w2 = __w; \ + } while(false) + +#define mat_trans_nodiv_nomod_zerow(x, y, z, x2, y2, z2, w2) do { \ + register float __x __asm__("fr12") = (x); \ + register float __y __asm__("fr13") = (y); \ + register float __z __asm__("fr14") = (z); \ + register float __w __asm__("fr15") = 0.0f; \ + __asm__ __volatile__( "ftrv xmtrx, fv12\n" \ + : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \ + : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \ + x2 = __x; y2 = __y; z2 = __z; w2 = __w; \ + } while(false) + +#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \ + register float __x __asm__("fr12") = (x); \ + register float __y __asm__("fr13") = (y); \ + register float __z __asm__("fr14") = (z); \ + register float __w __asm__("fr15") = 1.0f; \ + __asm__ __volatile__( "ftrv xmtrx, fv12\n" \ + : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \ + : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \ + w = __w; \ + } while(false) + +#define mat_trans_vec3_nomod(x, y, z, x2, y2, z2) { \ + register float __x __asm__("fr12") = (x); \ + register float __y __asm__("fr13") = (y); \ + register float __z __asm__("fr14") = (z); \ + __asm__ __volatile__( \ + "fldi0 fr15\n" \ + "ftrv xmtrx, fv8\n" \ + : "=f" (__x), "=f" (__y), "=f" (__z) \ + : "0" (__x), "1" (__y), "2" (__z) \ + : "fr15" ); \ + x2 = __x; y2 = __y; z2 = __z; \ + } + +// no declspec naked, so can't do rts / fschg. instead compiler pads with nop? +inline __hot void mat_load_3x3(const matrix_t* mtx) { + __asm__ __volatile__ ( + R"( + fschg + frchg + + fmov @%[mtx]+, dr0 + fldi0 fr12 + + fmov @%[mtx]+, dr2 + fldi0 fr13 + + fmov @%[mtx]+, dr4 + fldi0 fr3 + + fmov @%[mtx]+, dr6 + fmov dr12, dr14 + + fmov @%[mtx]+, dr8 + fldi0 fr7 + + fmov @%[mtx]+, dr10 + fldi0 fr11 + + fschg + frchg + )" + : [mtx] "+r" (mtx) + ); +} + +// sets pos.w to 1 +inline __hot void rw_mat_load_4x4(const rw::Matrix* mtx) { + __asm__ __volatile__ ( + R"( + fschg + frchg + fmov @%[mtx]+, dr0 + + fmov @%[mtx]+, dr2 + fmov @%[mtx]+, dr4 + fmov @%[mtx]+, dr6 + fmov @%[mtx]+, dr8 + fmov @%[mtx]+, dr10 + fmov @%[mtx]+, dr12 + fmov @%[mtx]+, dr14 + fldi1 fr15 + + fschg + frchg + )" + : [mtx] "+r" (mtx) + ); +} + +__hot inline void mat_transpose(void) { + asm volatile ( + "frchg\n\t" // fmov for singles only works on front bank + // FR0, FR5, FR10, and FR15 are already in place + // swap FR1 and FR4 + "flds FR1, FPUL\n\t" + "fmov FR4, FR1\n\t" + "fsts FPUL, FR4\n\t" + // swap FR2 and FR8 + "flds FR2, FPUL\n\t" + "fmov FR8, FR2\n\t" + "fsts FPUL, FR8\n\t" + // swap FR3 and FR12 + "flds FR3, FPUL\n\t" + "fmov FR12, FR3\n\t" + "fsts FPUL, FR12\n\t" + // swap FR6 and FR9 + "flds FR6, FPUL\n\t" + "fmov FR9, FR6\n\t" + "fsts FPUL, FR9\n\t" + // swap FR7 and FR13 + "flds FR7, FPUL\n\t" + "fmov FR13, FR7\n\t" + "fsts FPUL, FR13\n\t" + // swap FR11 and FR14 + "flds FR11, FPUL\n\t" + "fmov FR14, FR11\n\t" + "fsts FPUL, FR14\n\t" + // restore XMTRX to back bank + "frchg\n" + : // no outputs + : // no inputs + : "fpul" // clobbers + ); +} + +__attribute__((optimize("align-functions=32"))) +__hot inline void mat_copy(matrix_t *dst, const matrix_t *src) { + asm volatile(R"( + fschg + + pref @%[dst] + + fmov.d @%[src]+, xd0 + fmov.d @%[src]+, xd2 + fmov.d @%[src]+, xd4 + fmov.d @%[src]+, xd6 + + pref @%[src] + + add #32, %[dst] + + fmov.d xd6, @-%[dst] + fmov.d xd4, @-%[dst] + fmov.d xd2, @-%[dst] + fmov.d xd0, @-%[dst] + + add #32, %[dst] + pref @%[dst] + + fmov.d @%[src]+, xd0 + fmov.d @%[src]+, xd2 + fmov.d @%[src]+, xd4 + fmov.d @%[src]+, xd6 + + add #32, %[dst] + + fmov.d xd6, @-%[dst] + fmov.d xd4, @-%[dst] + fmov.d xd2, @-%[dst] + fmov.d xd0, @-%[dst] + + fschg + )": [dst] "+&r" (dst), [src] "+&r" (src) + : + : "memory"); +} + +template +__hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) { + if(FAST_APPROX && !std::is_constant_evaluated()) { + /* + // reorder the coefficients so that q1 stays in constant order {x,y,z,w} + // q2 then needs to be rotated after each inner product + x = (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x); + y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y); + z = (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z); + w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w); + */ + // keep q1 in fv4 + register float q1x __asm__ ("fr4") = (q1.x); + register float q1y __asm__ ("fr5") = (q1.y); + register float q1z __asm__ ("fr6") = (q1.z); + register float q1w __asm__ ("fr7") = (q1.w); + + // load q2 into fv8, use it to get the shuffled reorder into fv0 + register float q2x __asm__ ("fr8") = (q2.x); + register float q2y __asm__ ("fr9") = (q2.y); + register float q2z __asm__ ("fr10") = (q2.z); + register float q2w __asm__ ("fr11") = (q2.w); + + // temporary operand / result in fv0 + register float t1x __asm__ ("fr0"); + register float t1y __asm__ ("fr1"); + register float t1z __asm__ ("fr2"); + register float t1w __asm__ ("fr3"); + + // x = (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x); + t1x = q2w; + t1y = q2z; + t1z = -q2y; + t1w = q2w; + __asm__ ("\n" + " fipr fv4,fv0\n" + : "+f" (t1w) + : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), + "f" (t1x), "f" (t1y), "f" (t1z) + ); + // x = t1w; try to avoid the stall by not reading the fipr result immediately + + // y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y); + t1x = -q2z; + t1y = q2w; + t1z = q2x; + __atomic_thread_fence(1); + r->x = t1w; // get previous result + t1w = q2y; + __asm__ ("\n" + " fipr fv4,fv0\n" + : "+f" (t1w) + : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), + "f" (t1x), "f" (t1y), "f" (t1z) + ); + //y = t1w; + + // z = (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z); + t1x = q2y; + t1y = -q2x; + t1z = q2w; + __atomic_thread_fence(1); + r->y = t1w; // get previous result + t1w = q2z; + __asm__ ("\n" + " fipr fv4,fv0\n" + : "+f" (t1w) + : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), + "f" (t1x), "f" (t1y), "f" (t1z) + ); + //z = t1w; + __atomic_thread_fence(1); + + // w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w); + q2x = -q2x; + q2y = -q2y; + q2z = -q2z; + __asm__ ("\n" + " fipr fv4,fv8\n" + : "+f" (q2w) + : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), + "f" (q2x), "f" (q2y), "f" (q2z) + ); + + __atomic_thread_fence(1); + r->z = t1w; + __atomic_thread_fence(1); + r->w = q2w; + } else { + r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w); + r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w); + r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w); + r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z); + } +} + +__hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) { + unsigned int prefetch_scratch; + + asm volatile ( + "mov %[bmtrx], %[pref_scratch]\n\t" // (MT) + "add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle) + "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE) + "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS) + // back matrix + "fmov.d @%[bmtrx]+, XD0\n\t" // (LS) + "fmov.d @%[bmtrx]+, XD2\n\t" + "fmov.d @%[bmtrx]+, XD4\n\t" + "fmov.d @%[bmtrx]+, XD6\n\t" + "pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait (LS) + "fmov.d @%[bmtrx]+, XD8\n\t" // bmtrx prefetch should work for here + "fmov.d @%[bmtrx]+, XD10\n\t" + "fmov.d @%[bmtrx]+, XD12\n\t" + "mov %[fmtrx], %[pref_scratch]\n\t" // (MT) + "add #32, %[pref_scratch]\n\t" // store offset by 32 in r0 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle) + "fmov.d @%[bmtrx], XD14\n\t" + "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS) + // front matrix + // interleave loads and matrix multiply 4x4 + "fmov.d @%[fmtrx]+, DR0\n\t" + "fmov.d @%[fmtrx]+, DR2\n\t" + "fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec + "ftrv XMTRX, FV0\n\t" // (FE) + + "fmov.d @%[fmtrx]+, DR6\n\t" + "fmov.d @%[fmtrx]+, DR8\n\t" + "ftrv XMTRX, FV4\n\t" + + "fmov.d @%[fmtrx]+, DR10\n\t" + "fmov.d @%[fmtrx]+, DR12\n\t" + "ftrv XMTRX, FV8\n\t" + + "fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles) + "fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE) + "ftrv XMTRX, FV12\n\t" // (FE) + // Save output in XF regs + "frchg\n" + : [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed + : // no inputs + : "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered) + ); +} + +__hot inline void mat_apply_rotate_x(float x) { + x *= 10430.37835f; + asm volatile( + "ftrc %[a], fpul\n\t" + "fsca fpul, dr4\n\t" + "fldi0 fr8\n\t" + "fldi0 fr11\n\t" + "fmov fr5, fr10\n\t" + "fmov fr4, fr9\n\t" + "fneg fr9\n\t" + "ftrv xmtrx, fv8\n\t" + "fmov fr4, fr6\n\t" + "fldi0 fr7\n\t" + "fldi0 fr4\n\t" + "ftrv xmtrx, fv4\n\t" + "fschg\n\t" + "fmov dr8, xd8\n\t" + "fmov dr10, xd10\n\t" + "fmov dr4, xd4\n\t" + "fmov dr6, xd6\n\t" + "fschg\n" + : + : [a] "f"(x) + : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11"); +} + +__hot inline void mat_apply_rotate_y(float y) { + y *= 10430.37835f; + asm volatile( + "ftrc %[a], fpul\n\t" + "fsca fpul, dr6\n\t" + "fldi0 fr9\n\t" + "fldi0 fr11\n\t" + "fmov fr6, fr8\n\t" + "fmov fr7, fr10\n\t" + "ftrv xmtrx, fv8\n\t" + "fmov fr7, fr4\n\t" + "fldi0 fr5\n\t" + "fneg fr6\n\t" + "fldi0 fr7\n\t" + "ftrv xmtrx, fv4\n\t" + "fschg\n\t" + "fmov dr8, xd8\n\t" + "fmov dr10, xd10\n\t" + "fmov dr4, xd0\n\t" + "fmov dr6, xd2\n\t" + "fschg\n" + : + : [a] "f"(y) + : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11"); +} + +__hot inline void mat_apply_rotate_z(float z) { + z *= 10430.37835f; + asm volatile( + "ftrc %[a], fpul\n\t" + "fsca fpul, dr8\n\t" + "fldi0 fr10\n\t" + "fldi0 fr11\n\t" + "fmov fr8, fr5\n\t" + "fneg fr8\n\t" + "ftrv xmtrx, fv8\n\t" + "fmov fr9, fr4\n\t" + "fschg\n\t" + "fmov dr10, dr6\n\t" + "ftrv xmtrx, fv4\n\t" + "fmov dr8, xd4\n\t" + "fmov dr10, xd6\n\t" + "fmov dr4, xd0\n\t" + "fmov dr6, xd2\n\t" + "fschg\n" + : + : [a] "f"(z) + : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11"); +} + +# else +# ifdef DC_TEXCONV +# define mat_transform(a, b, c, d) +# define mat_apply(a) +# define mat_load(a) +# define mat_store(a) +# define mat_identity(a) +# define pvr_fog_table_color(a,r,g,b) +# define pvr_fog_table_linear(s,e) +# endif + +#define mat_trans_vec3_nomod(x_, y_, z_, x2, y2, z2) do { \ + vector_t tmp = { x_, y_, z_, 0.0f }; \ + mat_transform(&tmp, &tmp, 1, 0); \ + x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; \ + } while(false) + +#define mat_trans_single3_nomod(x_, y_, z_, x2, y2, z2) do { \ + vector_t tmp = { x_, y_, z_, 1.0f }; \ + mat_transform(&tmp, &tmp, 1, 0); \ + z2 = 1.0f / tmp.w; \ + x2 = tmp.x * z2; \ + y2 = tmp.y * z2; \ + } while(false) + +#define mat_trans_single3_nodiv_nomod(x_, y_, z_, x2, y2, z2) do { \ + vector_t tmp = { x_, y_, z_, 1.0f }; \ + mat_transform(&tmp, &tmp, 1, 0); \ + z2 = tmp.z; \ + x2 = tmp.x; \ + y2 = tmp.y; \ + } while(false) + +#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \ + vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \ + mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ + x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \ + } while(false) + +#define mat_trans_nodiv_nomod_zerow(x_, y_, z_, x2, y2, z2, w2) do { \ + vector_t tmp1233123 = { x_, y_, z_, 0.0f }; \ + mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ + x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \ + } while(false) + +#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \ + vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \ + mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ + w_ = tmp1233123.w; \ + } while(false) + +inline void mat_load_3x3(const matrix_t* mtx) { + memcpy(XMTRX, mtx, sizeof(matrix_t)); + XMTRX[0][3] = 0.0f; + XMTRX[1][3] = 0.0f; + XMTRX[2][3] = 0.0f; + + XMTRX[3][0] = 0.0f; + XMTRX[3][1] = 0.0f; + XMTRX[3][2] = 0.0f; + XMTRX[3][3] = 0.0f; +} + +inline void rw_mat_load_4x4(const rw::Matrix* mtx) { + memcpy(XMTRX, mtx, sizeof(matrix_t)); + XMTRX[3][3] = 1.0f; +} + +inline void mat_transpose(void) { + matrix_t tmp; + + for(unsigned i = 0; i < 4; ++i) + for(unsigned j = 0; j < 4; ++j) + tmp[j][i] = XMTRX[i][j]; + + mat_load(&tmp); +} + +template +__hot inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) { + r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w); + r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w); + r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w); + r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z); +} + +__hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) { + mat_load(matrix1); + mat_apply(matrix2); +} + +__always_inline __hot void mat_copy(matrix_t *dst, const matrix_t *src) { + mat_load(src); + mat_store(dst); +} + +#endif + +__hot inline void mat_mult(matrix_t *out, const matrix_t* matrix1, const matrix_t* matrix2) { + mat_load_apply(matrix1, matrix2); + mat_store(out); +} + +#if 0 +template +__always_inline __hot +auto vec3_dot_prod(const vec3_t &v1, vec3_t const (&v2)[N]) { + std::array result; + +#ifdef DC_SH4 + register float x asm(KOS_FPARG(0)) = v1.x; + register float y asm(KOS_FPARG(1)) = v1.y; + register float z asm(KOS_FPARG(2)) = v1.z; +#else + float x = v1.x; + float y = v1.y; + float z = v1.z; +#endif + + for(std::size_t v = 0; v < N; ++v) + result[v] = fipr(x, y, z, 0.0f, v2[v].x, v2[v].y, v2[v].z, 0.0f); + + return result; +} +#endif + +#endif + +} + +#endif /* RWDC_COMMON_H */ \ No newline at end of file diff --git a/vendor/librw/src/rwbase.h b/vendor/librw/src/rwbase.h index 4274be0e..8b567709 100644 --- a/vendor/librw/src/rwbase.h +++ b/vendor/librw/src/rwbase.h @@ -1,6 +1,9 @@ #pragma once #include "common_defines.h" +#ifdef RW_DC +#include "rwdc_common.h" +#endif #ifndef RW_PS2 #include @@ -322,7 +325,13 @@ inline float32 dot(const Quat &q, const Quat &p) { #endif } inline Quat scale(const Quat &q, float32 r) { return makeQuat(q.w*r, q.x*r, q.y*r, q.z*r); } -inline float32 length(const Quat &q) { return sqrtf(q.w*q.w + q.x*q.x + q.y*q.y + q.z*q.z); } +inline float32 length(const Quat &q) { +#ifndef DC_SH4 + return sqrtf(q.w*q.w + q.x*q.x + q.y*q.y + q.z*q.z); +#else + return dc::Sqrt(fipr_magnitude_sqr(q.x, q.y, q.z, q.w)); +#endif +} inline Quat normalize(const Quat &q) { return scale(q, 1.0f/length(q)); } inline Quat conj(const Quat &q) { return makeQuat(q.w, -q.x, -q.y, -q.z); } Quat mult(const Quat &q, const Quat &p);