From 2716147db45d180979f547926bb575d5ee349369 Mon Sep 17 00:00:00 2001 From: Falco Girgis Date: Sat, 26 Apr 2025 10:52:36 -0500 Subject: [PATCH] Everything tested/working: liberty/miami + gainz! --- src/liberty/collision/Collision.cpp | 12 +- src/liberty/math/Matrix.cpp | 16 +- src/liberty/math/Matrix.h | 4 +- src/liberty/math/Vector.cpp | 6 +- src/liberty/math/VuVector.h | 6 +- src/miami/collision/Collision.cpp | 12 +- src/miami/math/Matrix.cpp | 4 +- src/miami/math/Matrix.h | 12 - src/miami/math/Quaternion.h | 5 +- src/miami/math/Vector.cpp | 28 +- src/miami/math/VuVector.h | 8 +- vendor/dca3-kos | 2 +- vendor/librw/src/base.cpp | 4 +- vendor/librw/src/camera.cpp | 206 ++++++++++++++- vendor/librw/src/dc/rwdc.cpp | 236 ++--------------- vendor/librw/src/dc/rwdc_common.h | 396 ++++++++++++++-------------- 16 files changed, 476 insertions(+), 481 deletions(-) diff --git a/src/liberty/collision/Collision.cpp b/src/liberty/collision/Collision.cpp index 3d85f0c4..05185f9e 100644 --- a/src/liberty/collision/Collision.cpp +++ b/src/liberty/collision/Collision.cpp @@ -1619,7 +1619,7 @@ CCollision::ProcessLineOfSight(const CColLine &line, point.point = matrix * point.point; point.normal = Multiply3x3(matrix, point.normal); #else - mat_load(reinterpret_cast(const_cast(&matrix))); + dc::mat_load2(matrix); mat_trans_single3_nodiv(point.point.x, point.point.y, point.point.z); @@ -1798,7 +1798,7 @@ CCollision::ProcessVerticalLine(const CColLine &line, point.point = matrix * point.point; point.normal = Multiply3x3(matrix, point.normal); #else - mat_load(reinterpret_cast(const_cast(&matrix))); + dc::mat_load2(matrix); mat_trans_single3_nodiv(point.point.x, point.point.y, point.point.z); @@ -2173,8 +2173,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA, #ifndef DC_SH4 matAB *= matrixA; #else - mat_load(reinterpret_cast(&matAB)); - mat_apply(reinterpret_cast(&matrixA)); + dc::mat_load_apply(matAB, matrixA); #endif CColSphere bsphereAB; // bounding sphere of A in B space @@ -2246,8 +2245,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA, #ifndef DC_SH4 matBA *= matrixB; #else - mat_load(reinterpret_cast(&matBA)); - mat_apply(reinterpret_cast(&matrixB)); + dc::mat_load_apply(matBA, matrixB); #endif for(i = 0; i < modelB.numSpheres; i++){ s.radius = modelB.spheres[i].radius; @@ -2309,7 +2307,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA, } #ifdef DC_SH4 - mat_load(reinterpret_cast(const_cast(&matrixB))); + dc::mat_load2(matrixB); #endif for(i = 0; i < numCollisions; i++){ #ifndef DC_SH4 diff --git a/src/liberty/math/Matrix.cpp b/src/liberty/math/Matrix.cpp index 5ac7d4c4..a4b6c73f 100644 --- a/src/liberty/math/Matrix.cpp +++ b/src/liberty/math/Matrix.cpp @@ -277,9 +277,9 @@ void CMatrix::RotateX(float x) { #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - mat_load(reinterpret_cast(this)); + dc::mat_load2(reinterpret_cast(this)); mat_rotate_x(x); - mat_store(reinterpret_cast(this)); + dc::mat_store2(reinterpret_cast(this)); #else auto [s, c] = SinCos(x); @@ -307,9 +307,9 @@ void CMatrix::RotateY(float y) { #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - mat_load(reinterpret_cast(this)); + dc::mat_load2(reinterpret_cast(this)); mat_rotate_y(y); - mat_store(reinterpret_cast(this)); + dc::mat_store2(reinterpret_cast(this)); #else auto [s, c] = SinCos(y); @@ -337,9 +337,9 @@ void CMatrix::RotateZ(float z) { #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - mat_load(reinterpret_cast(this)); + dc::mat_load2(reinterpret_cast(this)); mat_rotate_z(z); - mat_store(reinterpret_cast(this)); + dc::mat_store2(reinterpret_cast(this)); #else auto [s, c] = SinCos(z); @@ -367,9 +367,9 @@ void CMatrix::Rotate(float x, float y, float z) { #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - mat_load(reinterpret_cast(this)); + dc::mat_load2(reinterpret_cast(this)); mat_rotate(x, y, z); - mat_store(reinterpret_cast(this)); + dc::mat_store2(reinterpret_cast(this)); #else auto [sX, cX] = SinCos(x); auto [sY, cY] = SinCos(y); diff --git a/src/liberty/math/Matrix.h b/src/liberty/math/Matrix.h index 295d5540..50f4031e 100644 --- a/src/liberty/math/Matrix.h +++ b/src/liberty/math/Matrix.h @@ -106,7 +106,7 @@ CMatrix Invert(const CMatrix &matrix); CMatrix operator*(const CMatrix &m1, const CMatrix &m2); inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec) { -#if 1 +#ifndef DC_SH4 CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz); return CVector( mat.rx * v.x + mat.ry * v.y + mat.rz * v.z, @@ -124,8 +124,6 @@ inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec) #endif } - - class CCompressedMatrixNotAligned { CVector m_vecPos; diff --git a/src/liberty/math/Vector.cpp b/src/liberty/math/Vector.cpp index 91ac44fe..4f5a21ed 100644 --- a/src/liberty/math/Vector.cpp +++ b/src/liberty/math/Vector.cpp @@ -34,7 +34,7 @@ Multiply3x3(const CMatrix &mat, const CVector &vec) mat.rz * vec.x + mat.fz * vec.y + mat.uz * vec.z); #else CVector out; - mat_load(mat); + dc::mat_load2(mat); mat_trans_normal3_nomod(vec.x, vec.y, vec.z, out.x, out.y, out.z); return out; @@ -50,7 +50,7 @@ Multiply3x3(const CVector &vec, const CMatrix &mat) mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z); #else CVector out; - mat_load(mat); + dc::mat_load2(mat); mat_transpose(); mat_trans_normal3_nomod(vec.x, vec.y, vec.z, out.x, out.y, out.z); @@ -63,7 +63,7 @@ operator*(const CMatrix &mat, const CVector &vec) { #ifdef DC_SH4 CVector out; - mat_load(reinterpret_cast(const_cast(&mat))); + dc::mat_load2(mat); mat_trans_single3_nodiv_nomod(vec.x, vec.y, vec.z, out.x, out.y, out.z); return out; #else diff --git a/src/liberty/math/VuVector.h b/src/liberty/math/VuVector.h index f2f4a8e4..b8228ec8 100644 --- a/src/liberty/math/VuVector.h +++ b/src/liberty/math/VuVector.h @@ -44,7 +44,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const CV sqc2 vf06,0x0(%0)\n\ ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); #elif defined(DC_SH4) - mat_load(mat); + dc::mat_load2(mat); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); #else out = mat * in; @@ -71,7 +71,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw sqc2 vf06,0x0(%0)\n\ ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); #elif defined(DC_SH4) - mat_load(mat); + dc::mat_load2(mat); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); #else out = mat * in; @@ -108,7 +108,7 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat, bnez %1,1b\n\ ": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory"); #elif defined(DC_SH4) - mat_load(mat); + dc::mat_load2(mat); while(n--) { mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z); in = reinterpret_cast(reinterpret_cast(in) + stride); diff --git a/src/miami/collision/Collision.cpp b/src/miami/collision/Collision.cpp index 8db6c42a..683ebd75 100644 --- a/src/miami/collision/Collision.cpp +++ b/src/miami/collision/Collision.cpp @@ -1473,7 +1473,7 @@ CCollision::ProcessLineOfSight(const CColLine &line, point.point = matrix * point.point; point.normal = Multiply3x3(matrix, point.normal); #else - mat_load(reinterpret_cast(const_cast(&matrix))); + dc::mat_load2(matrix); mat_trans_single3_nodiv(point.point.x, point.point.y, point.point.z); @@ -1653,7 +1653,7 @@ CCollision::ProcessVerticalLine(const CColLine &line, point.point = matrix * point.point; point.normal = Multiply3x3(matrix, point.normal); #else - mat_load(reinterpret_cast(const_cast(&matrix))); + dc::mat_load2(matrix); mat_trans_single3_nodiv(point.point.x, point.point.y, point.point.z); @@ -2027,8 +2027,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA, #ifndef DC_SH4 matAB *= matrixA; #else - mat_load(reinterpret_cast(&matAB)); - mat_apply(reinterpret_cast(&matrixA)); + dc::mat_load_apply(matAB, matrixA); #endif CColSphere bsphereAB; // bounding sphere of A in B space @@ -2099,8 +2098,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA, #ifndef DC_SH4 matBA *= matrixB; #else - mat_load(reinterpret_cast(&matBA)); - mat_apply(reinterpret_cast(&matrixB)); + dc::mat_load_apply(matBA, matrixB); #endif for(i = 0; i < modelB.numSpheres; i++){ s.radius = modelB.spheres[i].radius; @@ -2162,7 +2160,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA, } #ifdef DC_SH4 - mat_load(reinterpret_cast(const_cast(&matrixB))); + dc::mat_load2(matrixB); #endif for(i = 0; i < numCollisions; i++){ #ifndef DC_SH4 diff --git a/src/miami/math/Matrix.cpp b/src/miami/math/Matrix.cpp index f90e3c06..6f06c0af 100644 --- a/src/miami/math/Matrix.cpp +++ b/src/miami/math/Matrix.cpp @@ -359,9 +359,9 @@ void CMatrix::Rotate(float x, float y, float z) { #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - mat_load(reinterpret_cast(this)); + dc::mat_load2(reinterpret_cast(this)); mat_rotate(x, y, z); - mat_store(reinterpret_cast(this)); + dc::mat_store2(reinterpret_cast(this)); #else auto [sX, cX] = SinCos(x); auto [sY, cY] = SinCos(y); diff --git a/src/miami/math/Matrix.h b/src/miami/math/Matrix.h index 456c6f05..76b02d34 100644 --- a/src/miami/math/Matrix.h +++ b/src/miami/math/Matrix.h @@ -77,29 +77,17 @@ public: void SetScale(float s); void Scale(float scale) { -#if !defined(DC_SH4) || 1 /* TODO */ for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) f[i][j] *= scale; -#else - mat_load(*this); - mat_scale(scale, scale, scale); - mat_store(*this); -#endif } void Scale(float sx, float sy, float sz) { -#if !defined(DC_SH4) || 1 /* TODO */ for (int i = 0; i < 3; i++){ f[i][0] *= sx; f[i][1] *= sy; f[i][2] *= sz; } -#else - mat_load(*this); - mat_scale(sx, sy, sz); - mat_store(*this); -#endif } void SetRotateXOnly(float angle); diff --git a/src/miami/math/Quaternion.h b/src/miami/math/Quaternion.h index c1b0e481..185adb7c 100644 --- a/src/miami/math/Quaternion.h +++ b/src/miami/math/Quaternion.h @@ -62,7 +62,7 @@ public: } const CQuaternion &operator/=(float right) { - right = dc::Invert(right); + right = dc::Invert(right); x *= right; y *= right; z *= right; @@ -115,5 +115,6 @@ inline CQuaternion operator*(float left, const CQuaternion &right) inline CQuaternion operator/(const CQuaternion &left, float right) { - return CQuaternion(left.x / right, left.y / right, left.z / right, left.w / right); + right = Invert(right); + return CQuaternion(left.x * right, left.y * right, left.z * right, left.w * right); } diff --git a/src/miami/math/Vector.cpp b/src/miami/math/Vector.cpp index e2bc5f7e..2667b969 100644 --- a/src/miami/math/Vector.cpp +++ b/src/miami/math/Vector.cpp @@ -27,23 +27,17 @@ CrossProduct(const CVector &v1, const CVector &v2) CVector Multiply3x3(const CMatrix &mat, const CVector &vec) { -#ifdef DC_SH4 - register float __x __asm__("fr12") = vec.x; - register float __y __asm__("fr13") = vec.y; - register float __z __asm__("fr14") = vec.z; - register float __w __asm__("fr15") = 0.0f; - - mat_load(reinterpret_cast(const_cast(&mat))); - - asm volatile( "ftrv xmtrx, fv12\n" - : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) - : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); - return { __x, __y, __z }; -#else +#ifndef DC_SH4 // TODO: VU0 code return CVector(mat.rx * vec.x + mat.fx * vec.y + mat.ux * vec.z, mat.ry * vec.x + mat.fy * vec.y + mat.uy * vec.z, mat.rz * vec.x + mat.fz * vec.y + mat.uz * vec.z); +#else + CVector out; + dc::mat_load2(mat); + mat_trans_normal3_nomod(vec.x, vec.y, vec.z, + out.x, out.y, out.z); + return out; #endif } @@ -56,10 +50,10 @@ Multiply3x3(const CVector &vec, const CMatrix &mat) mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z); #else CVector out; - mat_load(mat); + dc::mat_load2(mat); mat_transpose(); - mat_trans_vec3_nomod(vec.x, vec.y, vec.z, - out.x, out.y, out.z); + mat_trans_normal3_nomod(vec.x, vec.y, vec.z, + out.x, out.y, out.z); return out; #endif } @@ -69,7 +63,7 @@ operator*(const CMatrix &mat, const CVector &vec) { #ifdef DC_SH4 CVector out; - mat_load(reinterpret_cast(const_cast(&mat))); + dc::mat_load2(mat); mat_trans_single3_nodiv_nomod(vec.x, vec.y, vec.z, out.x, out.y, out.z); return out; #else diff --git a/src/miami/math/VuVector.h b/src/miami/math/VuVector.h index 3c5933c4..d21ad18a 100644 --- a/src/miami/math/VuVector.h +++ b/src/miami/math/VuVector.h @@ -50,7 +50,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const CV sqc2 vf06,0x0(%0)\n\ ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); #elif defined(DC_SH4) - mat_load(mat); + dc::mat_load2(mat); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); #else out = mat * in; @@ -77,7 +77,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw sqc2 vf06,0x0(%0)\n\ ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); #elif defined(DC_SH4) - mat_load(mat); + dc::mat_load2(mat); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); #else out = mat * in; @@ -86,7 +86,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat, const RwV3d *in, int stride) { -#ifdef GTA_PS3 +#ifdef GTA_PS2 __asm__ __volatile__("\n\ paddub $3,%4,$0\n\ lqc2 vf02,0x0(%2)\n\ @@ -114,7 +114,7 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat, bnez %1,1b\n\ ": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory"); #elif defined(DC_SH4) - mat_load(mat); + dc::mat_load2(mat); while(n--) { mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z); in = reinterpret_cast(reinterpret_cast(in) + stride); diff --git a/vendor/dca3-kos b/vendor/dca3-kos index 144eaeab..b1f6bbc9 160000 --- a/vendor/dca3-kos +++ b/vendor/dca3-kos @@ -1 +1 @@ -Subproject commit 144eaeab940246d9c84c1f14b45d9e2847705c5e +Subproject commit b1f6bbc960cd1f5cd05be8d9d0eca4556ca3a56a diff --git a/vendor/librw/src/base.cpp b/vendor/librw/src/base.cpp index a9b7b5d6..9f45b459 100644 --- a/vendor/librw/src/base.cpp +++ b/vendor/librw/src/base.cpp @@ -97,8 +97,8 @@ mult(const Quat &q, const Quat &p) #else Quat o; dc::quat_mult(reinterpret_cast(&o), - reinterpret_cast(q), - reinterpret_cast(p)); + reinterpret_cast(q), + reinterpret_cast(p)); return o; #endif } diff --git a/vendor/librw/src/camera.cpp b/vendor/librw/src/camera.cpp index b0204585..0d2d55af 100644 --- a/vendor/librw/src/camera.cpp +++ b/vendor/librw/src/camera.cpp @@ -429,6 +429,7 @@ Camera::frustumTestSphere(const Sphere *s) const { int32 res = SPHEREINSIDE; const FrustumPlane *p = this->frustumPlanes; +#ifndef DC_SH4 for(int32 i = 0; i < 6; i++){ float32 dist = dot(p->plane.normal, s->center) - p->plane.distance; if(s->radius < dist) @@ -437,6 +438,114 @@ Camera::frustumTestSphere(const Sphere *s) const res = SPHEREBOUNDARY; p++; } +#else + __builtin_prefetch(p); + + register float sx asm("fr0") = s->center.x; + register float sy asm("fr1") = s->center.y; + register float sz asm("fr2") = s->center.z; + register float sw asm("fr3") = -1.0f; + + // far + register float px asm("fr4") = p->plane.normal.x; + register float py asm("fr5") = p->plane.normal.y; + register float pz asm("fr6") = p->plane.normal.z; + register float pw asm("fr7") = p->plane.distance; + + asm volatile("fipr fv0, fv4" + : "+f" (pw) + : "f" (sx), "f" (sy), "f" (sz), "f" (sw), + "f" (px), "f" (py), "f" (pz)); + if(s->radius < pw) + return SPHEREOUTSIDE; + else if(s->radius > -pw) + res = SPHEREBOUNDARY; + p++; + + // near + px = p->plane.normal.x; + py = p->plane.normal.y; + pz = p->plane.normal.z; + pw = p->plane.distance; + asm volatile("fipr fv0, fv4" + : "+f" (pw) + : "f" (sx), "f" (sy), "f" (sz), "f" (sw), + "f" (px), "f" (py), "f" (pz)); + if(s->radius < pw) + return SPHEREOUTSIDE; + if(s->radius > -pw) + res = SPHEREBOUNDARY_NEAR; + p++; + + const float* base_ptr0 = &p[0].plane.normal.x; + const float* base_ptr1 = &p[1].plane.normal.x; + const float* base_ptr2 = &p[2].plane.normal.x; + const float* base_ptr3 = &p[3].plane.normal.x; + + __builtin_prefetch(base_ptr0); + + static_assert(offsetof (decltype (p[0].plane.normal), y) + -offsetof (decltype (p[0].plane.normal), x) == sizeof (float)); + + static_assert(offsetof (decltype (p[0].plane.normal), z) + -offsetof (decltype (p[0].plane.normal), y) == sizeof (float)); + + static_assert(offsetof (decltype (p[0].plane), distance) + -offsetof (decltype (p[0].plane.normal), z) == sizeof (float)); + + asm volatile (R"( + frchg + + fmov.s @%0+,fr0 + fmov.s @%1+,fr1 + fmov.s @%2+,fr2 + fmov.s @%3+,fr3 + + fmov.s @%0+,fr4 + fmov.s @%1+,fr5 + fmov.s @%2+,fr6 + fmov.s @%3+,fr7 + + fmov.s @%0+,fr8 + fmov.s @%1+,fr9 + fmov.s @%2+,fr10 + fmov.s @%3+,fr11 + + fmov.s @%0,fr12 + fmov.s @%1,fr13 + fmov.s @%2,fr14 + fmov.s @%3,fr15 + + frchg + )" : "+&r" (base_ptr0), "+&r" (base_ptr1), "+&r" (base_ptr2), "+&r" (base_ptr3) + : + : ); + + float dists[4]; + mat_trans_vec4_nodiv_nomod(sx, sy, sz, sw, + dists[0], dists[1], dists[2], dists[3]); + + if(s->radius < dists[0]) + return SPHEREOUTSIDE; + else if(s->radius > -dists[0]) + res = SPHEREBOUNDARY; + + if(s->radius < dists[1]) + return SPHEREOUTSIDE; + else if(s->radius > -dists[1]) + res = SPHEREBOUNDARY; + + if(s->radius < dists[2]) + return SPHEREOUTSIDE; + else if(s->radius > -dists[2]) + res = SPHEREBOUNDARY; + + if(s->radius < dists[3]) + return SPHEREOUTSIDE; + else if(s->radius > -dists[3]) + res = SPHEREBOUNDARY; + +#endif return res; } @@ -445,7 +554,7 @@ Camera::frustumTestSphereNear(const Sphere *s) const { int32 res = SPHEREINSIDE; const FrustumPlane *p = this->frustumPlanes; - +#ifndef DC_SH4 // far float32 dist = dot(p->plane.normal, s->center) - p->plane.distance; if(s->radius < dist) @@ -481,6 +590,101 @@ Camera::frustumTestSphereNear(const Sphere *s) const return SPHEREOUTSIDE; p++; +#else + __builtin_prefetch(p); + + register float sx asm("fr0") = s->center.x; + register float sy asm("fr1") = s->center.y; + register float sz asm("fr2") = s->center.z; + register float sw asm("fr3") = -1.0f; + + // far + register float px asm("fr4") = p->plane.normal.x; + register float py asm("fr5") = p->plane.normal.y; + register float pz asm("fr6") = p->plane.normal.z; + register float pw asm("fr7") = p->plane.distance; + + asm volatile("fipr fv0, fv4" + : "+f" (pw) + : "f" (sx), "f" (sy), "f" (sz), "f" (sw), + "f" (px), "f" (py), "f" (pz)); + if(s->radius < pw) + return SPHEREOUTSIDE; + p++; + + // near + px = p->plane.normal.x; + py = p->plane.normal.y; + pz = p->plane.normal.z; + pw = p->plane.distance; + asm volatile("fipr fv0, fv4" + : "+f" (pw) + : "f" (sx), "f" (sy), "f" (sz), "f" (sw), + "f" (px), "f" (py), "f" (pz)); + if(s->radius < pw) + return SPHEREOUTSIDE; + if(s->radius > -pw) + res = SPHEREBOUNDARY_NEAR; + p++; + + const float* base_ptr0 = &p[0].plane.normal.x; + const float* base_ptr1 = &p[1].plane.normal.x; + const float* base_ptr2 = &p[2].plane.normal.x; + const float* base_ptr3 = &p[3].plane.normal.x; + + __builtin_prefetch(base_ptr0); + + static_assert(offsetof (decltype (p[0].plane.normal), y) + -offsetof (decltype (p[0].plane.normal), x) == sizeof (float)); + + static_assert(offsetof (decltype (p[0].plane.normal), z) + -offsetof (decltype (p[0].plane.normal), y) == sizeof (float)); + + static_assert(offsetof (decltype (p[0].plane), distance) + -offsetof (decltype (p[0].plane.normal), z) == sizeof (float)); + + asm volatile (R"( + frchg + + fmov.s @%0+,fr0 + fmov.s @%1+,fr1 + fmov.s @%2+,fr2 + fmov.s @%3+,fr3 + + fmov.s @%0+,fr4 + fmov.s @%1+,fr5 + fmov.s @%2+,fr6 + fmov.s @%3+,fr7 + + fmov.s @%0+,fr8 + fmov.s @%1+,fr9 + fmov.s @%2+,fr10 + fmov.s @%3+,fr11 + + fmov.s @%0,fr12 + fmov.s @%1,fr13 + fmov.s @%2,fr14 + fmov.s @%3,fr15 + + frchg + )" : "+&r" (base_ptr0), "+&r" (base_ptr1), "+&r" (base_ptr2), "+&r" (base_ptr3) + : + : ); + + float dists[4]; + mat_trans_vec4_nodiv_nomod(sx, sy, sz, sw, + dists[0], dists[1], dists[2], dists[3]); + + if(s->radius < dists[0]) + return SPHEREOUTSIDE; + else if(s->radius < dists[1]) + return SPHEREOUTSIDE; + else if(s->radius < dists[2]) + return SPHEREOUTSIDE; + else if(s->radius < dists[3]) + return SPHEREOUTSIDE; + +#endif return res; } diff --git a/vendor/librw/src/dc/rwdc.cpp b/vendor/librw/src/dc/rwdc.cpp index d33de9b0..2849461f 100644 --- a/vendor/librw/src/dc/rwdc.cpp +++ b/vendor/librw/src/dc/rwdc.cpp @@ -37,8 +37,6 @@ extern const char* currentFile; #include #include -#define ARRAY_SIZE(array) (sizeof(array) / sizeof(array[0])) - #define errorf(...) dbglog(DBG_CRITICAL, __VA_ARGS__) #define logf(...) // printf(__VA_ARGS__) bool re3RemoveLeastUsedModel(); @@ -47,21 +45,13 @@ void* re3StreamingAlloc(size_t size); // #include "rwdcimpl.h" -#include -#include #include "alloc.h" -#undef PVR_TXRFMT_STRIDE -#define PVR_TXRFMT_STRIDE (1 << 25) - -static_assert(PVR_TXRFMT_STRIDE == (1 << 25), "PVR_TXRFMT_STRIDE is bugged in your KOS version"); +using namespace dc; // TODO: probably needs a better place to be bool doEnvironmentMaps = true; -#define fclamp0_1(n) ((n) > 1.0f ? 1.0f : n < 0.0f ? 0.0f : n) -#define fclamp1(n) ((n) > 1.0f ? 1.0f : n) - struct alignas(32) pvr_vertex16_t { uint32_t flags; /**< \brief TA command (vertex flags) */ float x; /**< \brief X coordinate */ @@ -170,177 +160,10 @@ struct alignas(32) pvr_vertex32_ut { static_assert(sizeof(pvr_vertex16_t) == 32, "pvr_vertex16_t size mismatch"); static_assert(alignof(pvr_vertex16_t) == 32, "pvr_vertex16_t alignof mismatch"); - -#define MATH_Fast_Invert(x) ({ (((x) < 0.0f)? -1.0f : 1.0f) * frsqrt((x) * (x)); }) - static pvr_dr_state_t drState; -#include - float VIDEO_MODE_SCALE_X; -#if !defined(DC_TEXCONV) && !defined(DC_SIM) -#include - -#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \ - register float __x __asm__("fr12") = (x); \ - register float __y __asm__("fr13") = (y); \ - register float __z __asm__("fr14") = (z); \ - register float __w __asm__("fr15") = 1.0f; \ - __asm__ __volatile__( "ftrv xmtrx, fv12\n" \ - : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \ - : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \ - x2 = __x; y2 = __y; z2 = __z; w2 = __w; \ - } while(false) - -#define mat_trans_nodiv_nomod_zerow(x, y, z, x2, y2, z2, w2) do { \ - register float __x __asm__("fr12") = (x); \ - register float __y __asm__("fr13") = (y); \ - register float __z __asm__("fr14") = (z); \ - register float __w __asm__("fr15") = 0.0f; \ - __asm__ __volatile__( "ftrv xmtrx, fv12\n" \ - : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \ - : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \ - x2 = __x; y2 = __y; z2 = __z; w2 = __w; \ - } while(false) - -#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \ - register float __x __asm__("fr12") = (x); \ - register float __y __asm__("fr13") = (y); \ - register float __z __asm__("fr14") = (z); \ - register float __w __asm__("fr15") = 1.0f; \ - __asm__ __volatile__( "ftrv xmtrx, fv12\n" \ - : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \ - : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \ - w = __w; \ - } while(false) - - // no declspec naked, so can't do rts / fschg. instead compiler pads with nop? - - inline void rw_mat_load_3x3(const rw::Matrix* mtx) { - __asm__ __volatile__ ( - R"( - fschg - frchg - - fmov @%[mtx]+, dr0 - - fldi0 fr12 - fldi0 fr13 - - fmov @%[mtx]+, dr2 - fmov @%[mtx]+, dr4 - fmov @%[mtx]+, dr6 - fmov @%[mtx]+, dr8 - fmov @%[mtx]+, dr10 - - fldi0 fr3 - fldi0 fr7 - fldi0 fr11 - fmov dr12, dr14 - - fschg - frchg - )" - : [mtx] "+r" (mtx) - ); - } - - // sets pos.w to 1 - inline void rw_mat_load_4x4(const rw::Matrix* mtx) { - __asm__ __volatile__ ( - R"( - fschg - frchg - fmov @%[mtx]+, dr0 - - fmov @%[mtx]+, dr2 - fmov @%[mtx]+, dr4 - fmov @%[mtx]+, dr6 - fmov @%[mtx]+, dr8 - fmov @%[mtx]+, dr10 - fmov @%[mtx]+, dr12 - fmov @%[mtx]+, dr14 - fldi1 fr15 - - fschg - frchg - )" - : [mtx] "+r" (mtx) - ); - } - -#else -extern matrix_t XMTRX; - -void rw_mat_load_3x3(rw::Matrix* mtx) { - memcpy(XMTRX, mtx, sizeof(matrix_t)); - XMTRX[0][3] = 0.0f; - XMTRX[1][3] = 0.0f; - XMTRX[2][3] = 0.0f; - - XMTRX[3][0] = 0.0f; - XMTRX[3][1] = 0.0f; - XMTRX[3][2] = 0.0f; - XMTRX[3][3] = 0.0f; -} - -void rw_mat_load_4x4(rw::Matrix* mtx) { - memcpy(XMTRX, mtx, sizeof(matrix_t)); - XMTRX[3][3] = 1.0f; -} - -#include -#define frsqrt(a) (1.0f/sqrt(a)) -#define dcache_pref_block(a) __builtin_prefetch(a) - -#ifndef __always_inline -#define __always_inline __attribute__((always_inline)) inline -#endif - -#ifdef DC_TEXCONV -#define mat_transform(a, b, c, d) -#define mat_apply(a) -#define mat_load(a) -#define mat_store(a) -#define mat_identity(a) -#define pvr_fog_table_color(a,r,g,b) -#define pvr_fog_table_linear(s,e) -#define pvr_fog_table_exp(d) -#define pvr_fog_table_custom(d) -#endif - -#define mat_trans_single3_nomod(x_, y_, z_, x2, y2, z2) do { \ - vector_t tmp = { x_, y_, z_, 1.0f }; \ - mat_transform(&tmp, &tmp, 1, 0); \ - z2 = 1.0f / tmp.w; \ - x2 = tmp.x * z2; \ - y2 = tmp.y * z2; \ - } while(false) - -#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \ - vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \ - mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ - x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \ - } while(false) - -#define mat_trans_nodiv_nomod_zerow(x_, y_, z_, x2, y2, z2, w2) do { \ - vector_t tmp1233123 = { x_, y_, z_, 0.0f }; \ - mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ - x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \ - } while(false) - -#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \ - vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \ - mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ - w_ = tmp1233123.w; \ - } while(false) - -#define memcpy4 memcpy - -// END STUBS -#endif - static pvr_ptr_t fake_tex; alignas(4) static const uint16_t fake_tex_data[] = { @@ -551,9 +374,9 @@ void DCE_MatrixViewport(float x, float y, float width, float height) { void DCE_InitMatrices() { // Setup the screenview matrix. Only need to do once since this matrix does not need to change for single player viewpoint. - mat_identity(); + mat_identity2(); - mat_store(&DCE_MAT_SCREENVIEW); + mat_store2(&DCE_MAT_SCREENVIEW); } } @@ -684,7 +507,7 @@ struct atomic_context_t { __always_inline void DCE_RenderSubmitVertex(const pvr_vertex_t *v, uint32_t flags) { auto *sq = reinterpret_cast(pvr_dr_target(drState)); auto *src = reinterpret_cast(v); - float sz = MATH_Fast_Invert(v->z); + float sz = Invert(v->z); float sx = v->x * sz; float sy = v->y * sz; @@ -711,7 +534,7 @@ __always_inline void DCE_RenderSubmitVertexIM3D(float x, float y, float w, { auto *sq = reinterpret_cast(pvr_dr_target(drState)); auto *uv32 = reinterpret_cast(uv); - float sz = MATH_Fast_Invert(w); + float sz = Invert(w); float sx = x * sz; float sy = y * sz; @@ -801,9 +624,8 @@ void beginUpdate(Camera* cam) { DCE_MatrixViewport(0, 0, cam->frameBuffer->width * VIDEO_MODE_SCALE_X, cam->frameBuffer->height); - mat_load((matrix_t*)&DCE_MAT_SCREENVIEW); - mat_apply((matrix_t*)&cam->devProj); - mat_store((matrix_t*)&cam->devProjScreen); + mat_load_apply((matrix_t*)&DCE_MAT_SCREENVIEW, (matrix_t*)&cam->devProj); + mat_store2((matrix_t*)&cam->devProjScreen); } @@ -1831,7 +1653,7 @@ void im2DRenderPrimitive(PrimitiveType primType, void *vertices, int32_t numVert pvrVert->flags = flags; pvrVert->x = gtaVert.x * VIDEO_MODE_SCALE_X; pvrVert->y = gtaVert.y; - pvrVert->z = MATH_Fast_Invert(gtaVert.w); // this is perfect for almost every case... + pvrVert->z = Invert(gtaVert.w); // this is perfect for almost every case... pvrVert->u = gtaVert.u; pvrVert->v = gtaVert.v; pvrVert->argb = (gtaVert.a << 24) | @@ -1952,7 +1774,7 @@ void im2DRenderIndexedPrimitive(PrimitiveType primType, void *vertices, int32 nu pvrVert->flags = flags; pvrVert->x = gtaVert.x * VIDEO_MODE_SCALE_X; pvrVert->y = gtaVert.y; - pvrVert->z = MATH_Fast_Invert(gtaVert.w); // this is perfect for almost every case... + pvrVert->z = Invert(gtaVert.w); // this is perfect for almost every case... pvrVert->u = gtaVert.u; pvrVert->v = gtaVert.v; pvrVert->argb = (gtaVert.a << 24) | @@ -2015,8 +1837,8 @@ void im3DTransform(void *vertices, int32 numVertices, Matrix *worldMat, uint32 f rw::RawMatrix::mult(&worldview, &world, &cam->devView); rw::RawMatrix::mult(&proj, &worldview, &cam->devProj); rw::RawMatrix::mult(&mtx, &proj, (RawMatrix*)&DCE_MAT_SCREENVIEW); - // mat_load(&DCE_MAT_SCREENVIEW); // ~11 cycles. - mat_load(( matrix_t*)&mtx.right); // Number of cycles: ~32. + // mat_load2(&DCE_MAT_SCREENVIEW); // ~11 cycles. + mat_load2(( matrix_t*)&mtx.right); // Number of cycles: ~32. if (im3dVertices) { free(im3dVertices); } @@ -2110,7 +1932,7 @@ void im3DRenderIndexedPrimitive(PrimitiveType primType, // assuming near plane is 0.0f // gv1 is visible (posi), and gv2 is behind the plane (negative) - float t = (1.0f - gv1.position.z) * MATH_Fast_Invert(gv2.position.z - gv1.position.z); + float t = (1.0f - gv1.position.z) * Invert(gv2.position.z - gv1.position.z); pvr_vertex_t pvrVert; @@ -3388,13 +3210,13 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve auto skinningWeightData = (uint8_t*)skinWeights; if (!matrix0Identity) { - rw_mat_load_4x4(&skinMatrices[0]); + mat_load_4x4(&skinMatrices[0]); if (small_xyz) { mat_apply(&DCE_MESHLET_MAT_DECODE); } } else { if (small_xyz) { - mat_load(&DCE_MESHLET_MAT_DECODE); + mat_load2(&DCE_MESHLET_MAT_DECODE); } } @@ -3458,7 +3280,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve break; } - rw_mat_load_4x4(currentMatrix); + mat_load_4x4(currentMatrix); if (small_xyz){ mat_apply(&DCE_MESHLET_MAT_DECODE); } @@ -3491,7 +3313,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve auto skinningWeightData = (uint8_t*)skinWeights; if (!matrix0Identity) { - rw_mat_load_3x3(&skinMatrices[0]); + mat_load_3x3(&skinMatrices[0]); } for(;;) { @@ -3549,7 +3371,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve break; } - rw_mat_load_3x3(currentMatrix); + mat_load_3x3(currentMatrix); do { auto srcOffset = *skinningIndexData++; @@ -3574,7 +3396,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve __attribute__((noinline)) void tnlMeshletEnvMap(uint8_t* OCR, uint8_t* normal, int vertexCount, int vertexSize, matrix_t* matfxMatrix, float matfxCoefficient) { - mat_load(matfxMatrix); + mat_load2(matfxMatrix); do { pvr_vertex64_t* v = (pvr_vertex64_t*)OCR; @@ -3991,15 +3813,12 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) { lightingCB(atomic, ac->uniform); - rw::RawMatrix world; rw::convMatrix(&world, atomic->getFrame()->getLTM()); - - mat_load((matrix_t*)&cam->devProjScreen); - mat_apply((matrix_t*)&cam->devView); + mat_load_apply((matrix_t*)&cam->devProjScreen, (matrix_t*)&cam->devView); mat_apply((matrix_t*)&world); - mat_store((matrix_t*)&atomicContexts.back().mtx); + mat_store2((matrix_t*)&atomicContexts.back().mtx); auto meshes = geo->meshHeader->getMeshes(); @@ -4238,15 +4057,14 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) { unsigned skinSelector = small_xyz + acp->skinMatrix0Identity*2; tnlMeshletSkinVerticesSelector[skinSelector](OCR_SPACE, normalDst, &dcModel->data[meshlet->vertexOffset], normalSrc, &dcModel->data[meshlet->skinWeightOffset], &dcModel->data[meshlet->skinIndexOffset], meshlet->vertexCount, meshlet->vertexSize, &acp->skinContextPointer->mtx); - mat_load(&mtx); + mat_load2(&mtx); tnlMeshletTransformSelector[clippingRequired * 2](OCR_SPACE, OCR_SPACE + 4, meshlet->vertexCount, 64); } else { if (selector & 8) { - mat_load(&mtx); - mat_apply(&DCE_MESHLET_MAT_DECODE); + mat_load_apply(&mtx, &DCE_MESHLET_MAT_DECODE); } else { - mat_load(&mtx); + mat_load2(&mtx); } tnlMeshletTransformSelector[smallSelector](OCR_SPACE, &dcModel->data[meshlet->vertexOffset], meshlet->vertexCount, meshlet->vertexSize); } @@ -4273,7 +4091,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) { unsigned dstColOffset = textured ? offsetof(pvr_vertex64_t, a) : offsetof(pvr_vertex32_ut, a); dce_set_mat_vertex_color(&residual, &material); - mat_load(&DCE_MESHLET_MAT_VERTEX_COLOR); + mat_load2(&DCE_MESHLET_MAT_VERTEX_COLOR); tnlMeshletVertexColorSelector[0](OCR_SPACE + dstColOffset, (int8_t*)&dcModel->data[meshlet->vertexOffset] + colOffset, meshlet->vertexCount, meshlet->vertexSize); } else { unsigned dstColOffset = textured ? offsetof(pvr_vertex64_t, a) : offsetof(pvr_vertex32_ut, a); @@ -4295,7 +4113,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) { unsigned normalSelector = (pass1 - 1) + (skin != 0) * 4; - mat_load((matrix_t*)&uniformObject.dir[0][0][0]); + mat_load2((matrix_t*)&uniformObject.dir[0][0][0]); auto normalPointer = &dcModel->data[meshlet->vertexOffset] + normalOffset; auto vtxSize = meshlet->vertexSize; if (skin) { @@ -4310,7 +4128,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) { if (pass2) { unsigned normalSelector = (pass2 - 1) + (skin != 0) * 4; - mat_load((matrix_t*)&uniformObject.dir[1][0][0]); + mat_load2((matrix_t*)&uniformObject.dir[1][0][0]); tnlMeshletDiffuseColorSelector[normalSelector](OCR_SPACE + dstColOffset, normalPointer, meshlet->vertexCount, vtxSize, &lightDiffuseColors[4]); } } @@ -4373,7 +4191,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) { indices.back() |= 0x80; pvr_vertex64_t *vd = (pvr_vertex64_t *)OCR_SPACE; - mat_load(&mtx); // Number of cycles: ~11 + mat_load2(&mtx); // Number of cycles: ~11 for (int idx = 0; idx < geo->numVertices; idx++) { auto& vert = vertices[idx]; diff --git a/vendor/librw/src/dc/rwdc_common.h b/vendor/librw/src/dc/rwdc_common.h index e0f940b7..c6d094e6 100644 --- a/vendor/librw/src/dc/rwdc_common.h +++ b/vendor/librw/src/dc/rwdc_common.h @@ -30,6 +30,7 @@ # define VIDEO_MODE_WIDTH 640.0f # define VIDEO_MODE_HEIGHT 480.0f # define memcpy4 memcpy +# define frsqrt(a) (1.0f/sqrtf(a)) # define dcache_pref_block(a) __builtin_prefetch(a) # define F_PI M_PI # ifndef __always_inline @@ -44,14 +45,16 @@ "PVR_TXRFMT_STRIDE is bugged in your KOS version"); #endif -#define F_PI_2 (F_PI * 0.5f) -#define __hot __attribute__((hot)) -#define __cold __attribute__((cold)) +#define F_PI_2 (F_PI * 0.5f) +#define __hot __attribute__((hot)) +#define __cold __attribute__((cold)) +#define __icache_aligned __attribute__((aligned(32))) -#define STRINGIFY(x) #x -#define STR(x) STRINGIFY(x) -#define CONCAT_(x,y) x##y -#define CONCAT(x,y) CONCAT_(x,y) +#define STRINGIFY(x) #x +#define STR(x) STRINGIFY(x) +#define CONCAT_(x,y) x##y +#define CONCAT(x,y) CONCAT_(x,y) +#define ARRAY_SIZE(array) (sizeof(array) / sizeof(array[0])) namespace rw { class Matrix; @@ -66,6 +69,11 @@ struct quaternion_t { __always_inline __hot constexpr float Sin(float x) { return sinf(x); } __always_inline __hot constexpr float Cos(float x) { return cosf(x); } __always_inline __hot constexpr auto SinCos(float x) { return std::pair { Sin(x), Cos(x) }; } +__always_inline __hot constexpr float Tan(float x) { return tanf(x); } +__always_inline __hot constexpr float Atan(float x) { return atanf(x); } +__always_inline __hot constexpr float Atan2(float y, float x) { return atan2f(y, x); } +__always_inline __hot constexpr float Asin(float x) { return asinf(x); } +__always_inline __hot constexpr float Acos(float x) { return acosf(x); } __always_inline __hot constexpr float Abs(float x) { return fabsf(x); } __always_inline __hot constexpr float Sqrt(float x) { return sqrtf(x); } __always_inline __hot constexpr float RecipSqrt(float x, float y) { return x / Sqrt(y); } @@ -130,80 +138,6 @@ __always_inline __hot constexpr auto Norm(auto value, auto min, auto max) { return numerator / denominator; } -template -__always_inline __hot constexpr float Tan(float x) { - if(!std::is_constant_evaluated() && FAST_APPROX) { - constexpr float pisqby4 = 2.4674011002723397f; - constexpr float adjpisqby4 = 2.471688400562703f; - constexpr float adj1minus8bypisq = 0.189759681063053f; - float xsq = x * x; - - return x * Div(adjpisqby4 - adj1minus8bypisq * xsq, - pisqby4 - xsq); - } else - return tanf(x); -} - -template -__always_inline __hot constexpr float Atan(float x) { - if(FAST_APPROX && !std::is_constant_evaluated()) { - constexpr float a[3] = { // - 0.998418889819911f, -2.9993501171084700E-01f, 0.0869142852883849f}; - float xx = x * x; - return ((a[2] * xx + a[1]) * xx + a[0]) * x; - } else return atanf(x); -} - -template -__hot constexpr float Atan2(float y, float x) { - if(FAST_APPROX && !std::is_constant_evaluated()) { -#if 0 - constexpr float halfpi_i754 = M_PI * 0.5f; - constexpr float quarterpi_i754 = M_PI * 0.25f; - // kludge to prevent 0/0 condition - float abs_y = Abs(y) + std::numeric_limits::epsilon(); - float absy_plus_absx = abs_y + Abs(x); - float inv_absy_plus_absx = Invert(absy_plus_absx); - float angle = halfpi_i754 - copysignf(quarterpi_i754, x); - float r = (x - copysignf(abs_y, x)) * inv_absy_plus_absx; - angle += (0.1963f * r * r - 0.9817f) * r; - return copysignf(angle, y); -#else - // Ensure input is in [-1, +1] - bool swap = fabs(x) < fabs(y); - float atan_input = (swap ? x : y) / (swap ? y : x); - - // Approximate atan - float res = Atan(atan_input); - - // If swapped, adjust atan output - res = swap ? (atan_input >= 0.0f ? F_PI_2 : -F_PI_2) - res : res; - // Adjust quadrants - if (x >= 0.0f && y >= 0.0f) {} // 1st quadrant - else if (x < 0.0f && y >= 0.0f) { res = F_PI + res; } // 2nd quadrant - else if (x < 0.0f && y < 0.0f) { res = -F_PI + res; } // 3rd quadrant - else if (x >= 0.0f && y < 0.0f) {} // 4th quadrant - - // Store result - return res; -#endif - } else return atan2f(y, x); -} - -template -__always_inline __hot constexpr float Asin(float x) { - if(FAST_APPROX && !std::is_constant_evaluated()) { - Atan(Div(x, Sqrt(1.0f-(x*x)))); - } else return asinf(x); -} - -template -__always_inline __hot constexpr float Acos(float x) { - if(FAST_APPROX && !std::is_constant_evaluated()) { - return (-0.69813170079773212f * x * x - 0.87266462599716477f) * x + 1.5707963267948966f; - } else return acosf(x); -} - #ifdef RW_DC # ifdef DC_SH4 @@ -240,43 +174,134 @@ __always_inline __hot constexpr float Acos(float x) { w = __w; \ } while(false) -#define mat_trans_vec3_nomod(x, y, z, x2, y2, z2) { \ - register float __x __asm__("fr12") = (x); \ - register float __y __asm__("fr13") = (y); \ - register float __z __asm__("fr14") = (z); \ - __asm__ __volatile__( \ - "fldi0 fr15\n" \ - "ftrv xmtrx, fv8\n" \ - : "=f" (__x), "=f" (__y), "=f" (__z) \ - : "0" (__x), "1" (__y), "2" (__z) \ - : "fr15" ); \ - x2 = __x; y2 = __y; z2 = __z; \ - } +#define mat_trans_vec4_nodiv_nomod(x, y, z, w, x2, y2, z2, w2) { \ + register float __x __asm__("fr0") = (x); \ + register float __y __asm__("fr1") = (y); \ + register float __z __asm__("fr2") = (z); \ + register float __w __asm__("fr3") = (w); \ + __asm__ __volatile__( "ftrv xmtrx, fv0\n" \ + : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \ + : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \ + x2 = __x; y2 = __y; z2 = __z; w2 = __w; \ + } while(false) + +inline __hot __icache_aligned void mat_load2(const matrix_t* mtx) { + asm volatile( + R"( + fschg + fmov.d @%[mtx],xd0 + add #32,%[mtx] + pref @%[mtx] + add #-(32-8),%[mtx] + fmov.d @%[mtx]+,xd2 + fmov.d @%[mtx]+,xd4 + fmov.d @%[mtx]+,xd6 + fmov.d @%[mtx]+,xd8 + fmov.d @%[mtx]+,xd10 + fmov.d @%[mtx]+,xd12 + fmov.d @%[mtx]+,xd14 + fschg + )" + : [mtx] "+r" (mtx) + : + : + ); +} + +inline __hot __icache_aligned void mat_store2(matrix_t *mtx) { + asm volatile( + R"( + fschg + add #64-8,%[mtx] + fmov.d xd14,@%[mtx] + add #-32,%[mtx] + pref @%[mtx] + add #32,%[mtx] + fmov.d xd12,@-%[mtx] + fmov.d xd10,@-%[mtx] + fmov.d xd8,@-%[mtx] + fmov.d xd6,@-%[mtx] + fmov.d xd4,@-%[mtx] + fmov.d xd2,@-%[mtx] + fmov.d xd0,@-%[mtx] + fschg + )" + : [mtx] "+&r" (mtx), "=m" (*mtx) + : + : + ); +} + +inline __hot __icache_aligned void mat_identity2(void) { + asm volatile( + R"( + frchg + fldi1 fr0 + fschg + fldi0 fr1 + fldi0 fr2 + fldi0 fr3 + fldi0 fr4 + fldi1 fr5 + fmov dr2,dr6 + fmov dr2,dr8 + fmov dr0,dr10 + fmov dr2,dr12 + fmov dr4,dr14 + fschg + frchg + )" + ); +} + +inline __hot __icache_aligned void mat_set_scale(float x, float y, float z) { + asm volatile( + R"( + frchg + fldi0 fr1 + fschg + fldi0 fr2 + fldi0 fr3 + fldi0 fr4 + fmov dr2, dr6 + fmov dr2, dr8 + fldi0 fr11 + fmov dr2, dr12 + fldi0 fr14 + fschg + frchg + fmov %[x], xf0 + fmov %[y], xf5 + fmov %[z], xf10 + )" + : + : [x] "r" (x), [y] "r" (y), [z] "r" (z) + : + ); +} // no declspec naked, so can't do rts / fschg. instead compiler pads with nop? -inline __hot void mat_load_3x3(const matrix_t* mtx) { +inline __hot void mat_load_3x3(const rw::Matrix* mtx) { __asm__ __volatile__ ( R"( fschg frchg fmov @%[mtx]+, dr0 - fldi0 fr12 - fmov @%[mtx]+, dr2 + fldi0 fr12 fldi0 fr13 + fmov @%[mtx]+, dr2 fmov @%[mtx]+, dr4 - fldi0 fr3 - fmov @%[mtx]+, dr6 - fmov dr12, dr14 - fmov @%[mtx]+, dr8 - fldi0 fr7 - fmov @%[mtx]+, dr10 + + fldi0 fr3 + fldi0 fr7 fldi0 fr11 + fmov dr12, dr14 fschg frchg @@ -286,28 +311,28 @@ inline __hot void mat_load_3x3(const matrix_t* mtx) { } // sets pos.w to 1 -inline __hot void rw_mat_load_4x4(const rw::Matrix* mtx) { - __asm__ __volatile__ ( - R"( - fschg - frchg - fmov @%[mtx]+, dr0 +inline __hot void mat_load_4x4(const rw::Matrix* mtx) { + __asm__ __volatile__ ( + R"( + fschg + frchg + fmov @%[mtx]+, dr0 - fmov @%[mtx]+, dr2 - fmov @%[mtx]+, dr4 - fmov @%[mtx]+, dr6 - fmov @%[mtx]+, dr8 - fmov @%[mtx]+, dr10 - fmov @%[mtx]+, dr12 - fmov @%[mtx]+, dr14 - fldi1 fr15 + fmov @%[mtx]+, dr2 + fmov @%[mtx]+, dr4 + fmov @%[mtx]+, dr6 + fmov @%[mtx]+, dr8 + fmov @%[mtx]+, dr10 + fmov @%[mtx]+, dr12 + fmov @%[mtx]+, dr14 + fldi1 fr15 - fschg - frchg - )" - : [mtx] "+r" (mtx) - ); -} + fschg + frchg + )" + : [mtx] "+r" (mtx) + ); + } __hot inline void mat_transpose(void) { asm volatile ( @@ -345,8 +370,7 @@ __hot inline void mat_transpose(void) { ); } -__attribute__((optimize("align-functions=32"))) -__hot inline void mat_copy(matrix_t *dst, const matrix_t *src) { +__hot __icache_aligned inline void mat_copy(matrix_t *dst, const matrix_t *src) { asm volatile(R"( fschg @@ -367,7 +391,7 @@ __hot inline void mat_copy(matrix_t *dst, const matrix_t *src) { fmov.d xd0, @-%[dst] add #32, %[dst] - pref @%[dst] + pref @%[dst] fmov.d @%[src]+, xd0 fmov.d @%[src]+, xd2 @@ -382,12 +406,13 @@ __hot inline void mat_copy(matrix_t *dst, const matrix_t *src) { fmov.d xd0, @-%[dst] fschg - )": [dst] "+&r" (dst), [src] "+&r" (src) + )": [dst] "+&r" (dst), [src] "+&r" (src), "=m" (*dst) : - : "memory"); + :); } -template +//TODO: FIXME FOR VC (AND USE FTRV) +template __hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) { if(FAST_APPROX && !std::is_constant_evaluated()) { /* @@ -477,9 +502,9 @@ __hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, c r->w = q2w; } else { r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w); - r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w); - r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w); - r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z); + r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w); + r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w); + r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z); } } @@ -608,70 +633,64 @@ __hot inline void mat_apply_rotate_z(float z) { # else # ifdef DC_TEXCONV -# define mat_transform(a, b, c, d) # define mat_apply(a) -# define mat_load(a) -# define mat_store(a) -# define mat_identity(a) +# define mat_load2(a) +# define mat_store2(a) +# define mat_identity2() +# define mat_transform(a, b, c, d) # define pvr_fog_table_color(a,r,g,b) # define pvr_fog_table_linear(s,e) # endif -#define mat_trans_vec3_nomod(x_, y_, z_, x2, y2, z2) do { \ - vector_t tmp = { x_, y_, z_, 0.0f }; \ +#define mat_trans_single3_nomod(x_, y_, z_, x2, y2, z2) do { \ + vector_t tmp = { x_, y_, z_, 1.0f }; \ mat_transform(&tmp, &tmp, 1, 0); \ - x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; \ + z2 = 1.0f / tmp.w; \ + x2 = tmp.x * z2; \ + y2 = tmp.y * z2; \ } while(false) -#define mat_trans_single3_nomod(x_, y_, z_, x2, y2, z2) do { \ - vector_t tmp = { x_, y_, z_, 1.0f }; \ - mat_transform(&tmp, &tmp, 1, 0); \ - z2 = 1.0f / tmp.w; \ - x2 = tmp.x * z2; \ - y2 = tmp.y * z2; \ - } while(false) - #define mat_trans_single3_nodiv_nomod(x_, y_, z_, x2, y2, z2) do { \ - vector_t tmp = { x_, y_, z_, 1.0f }; \ - mat_transform(&tmp, &tmp, 1, 0); \ - z2 = tmp.z; \ - x2 = tmp.x; \ - y2 = tmp.y; \ - } while(false) + vector_t tmp = { x_, y_, z_, 1.0f }; \ + mat_transform(&tmp, &tmp, 1, 0); \ + z2 = tmp.z; \ + x2 = tmp.x; \ + y2 = tmp.y; \ + } while(false) #define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \ - vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \ - mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ - x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \ - } while(false) + vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \ + mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ + x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \ + } while(false) #define mat_trans_nodiv_nomod_zerow(x_, y_, z_, x2, y2, z2, w2) do { \ - vector_t tmp1233123 = { x_, y_, z_, 0.0f }; \ - mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ - x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \ - } while(false) + vector_t tmp1233123 = { x_, y_, z_, 0.0f }; \ + mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ + x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \ + } while(false) #define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \ - vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \ - mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ - w_ = tmp1233123.w; \ - } while(false) + vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \ + mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ + w_ = tmp1233123.w; \ + } while(false) -inline void mat_load_3x3(const matrix_t* mtx) { - memcpy(XMTRX, mtx, sizeof(matrix_t)); - XMTRX[0][3] = 0.0f; - XMTRX[1][3] = 0.0f; - XMTRX[2][3] = 0.0f; +inline void mat_load_3x3(const rw::Matrix* mtx) { + memcpy(XMTRX, mtx, sizeof(matrix_t)); + XMTRX[0][3] = 0.0f; + XMTRX[1][3] = 0.0f; + XMTRX[2][3] = 0.0f; - XMTRX[3][0] = 0.0f; - XMTRX[3][1] = 0.0f; - XMTRX[3][2] = 0.0f; - XMTRX[3][3] = 0.0f; + XMTRX[3][0] = 0.0f; + XMTRX[3][1] = 0.0f; + XMTRX[3][2] = 0.0f; + XMTRX[3][3] = 0.0f; } -inline void rw_mat_load_4x4(const rw::Matrix* mtx) { - memcpy(XMTRX, mtx, sizeof(matrix_t)); - XMTRX[3][3] = 1.0f; +inline void mat_load_4x4(const rw::Matrix* mtx) { + memcpy(XMTRX, mtx, sizeof(matrix_t)); + XMTRX[3][3] = 1.0f; } inline void mat_transpose(void) { @@ -706,32 +725,9 @@ __always_inline __hot void mat_copy(matrix_t *dst, const matrix_t *src) { __hot inline void mat_mult(matrix_t *out, const matrix_t* matrix1, const matrix_t* matrix2) { mat_load_apply(matrix1, matrix2); - mat_store(out); + mat_store2(out); } -#if 0 -template -__always_inline __hot -auto vec3_dot_prod(const vec3_t &v1, vec3_t const (&v2)[N]) { - std::array result; - -#ifdef DC_SH4 - register float x asm(KOS_FPARG(0)) = v1.x; - register float y asm(KOS_FPARG(1)) = v1.y; - register float z asm(KOS_FPARG(2)) = v1.z; -#else - float x = v1.x; - float y = v1.y; - float z = v1.z; -#endif - - for(std::size_t v = 0; v < N; ++v) - result[v] = fipr(x, y, z, 0.0f, v2[v].x, v2[v].y, v2[v].z, 0.0f); - - return result; -} -#endif - #endif }