Merge branch 'falco/gainz_phase_3' into 'main'

Liberty/Miami Perf Gainz Phase 3 See merge request skmp/dca3-game!97
2025-04-28 04:57:59 +03:00 · 2025-04-26 17:00:40 +00:00 · 2025-04-26 17:00:40 +00:00 · 1f2f270da9
commit 1f2f270da9
parent cca7b3c6fa 1f4dace511
28 changed files with 1216 additions and 538 deletions
--- a/liberty/common.mk
+++ b/liberty/common.mk
@ -381,6 +381,7 @@ INCLUDE = \
 -I../src/liberty/skel/win \
 \
 -I../vendor/librw \
+-I../vendor/librw/src/dc \
 \
 -I../vendor/miniLZO \
 \
--- a/miami/common.mk
+++ b/miami/common.mk
@ -396,6 +396,7 @@ INCLUDE = \
 -I../src/miami/skel/win \
 \
 -I../vendor/librw \
+-I../vendor/librw/src/dc \
 \
 -I../vendor/miniLZO \
 \
--- a/src/common/vmu/vmu.cpp
+++ b/src/common/vmu/vmu.cpp
@ -94,23 +94,26 @@ void VmuProfiler::run() {
            pvr_stats_t pvrStats;   pvr_get_stats(&pvrStats);
            uint32_t    sramStats = snd_mem_available();
            size_t      pvrAvail  = pvr_mem_available();
-            float       fps       = std::accumulate(std::begin(fps_), std::end(fps_), 0.0f) 
-                                    / static_cast<float>(fpsSamples);

            float sh4Mem = heapUtilization();
            float pvrMem = (8_MB - pvrAvail ) / 8_MB * 100.0f;
            float armMem = (2_MB - sramStats) / 2_MB * 100.0f;
-            float vtxBuf = vertBuffUse_;
-            {
-                std::shared_lock lk(mtx_);

-                vmu_printf("FPS :%6.2f\n"
-                           "SH4 :%6.2f%%\n"
-                           "PVR :%6.2f%%\n"
-                           "ARM :%6.2f%%\n"
-                           "VTX :%6.2f%%",
-                           fps, sh4Mem, pvrMem, armMem, vtxBuf);
+            float vtxBuf;
+            float fps;
+            { /* Critical section with main thread. */
+                std::shared_lock lk(mtx_);
+                vtxBuf = vertBuffUse_;
+                fps    = std::accumulate(std::begin(fps_), std::end(fps_), 0.0f)
+                       / static_cast<float>(fpsSamples);
            }
+
+            vmu_printf(" FPS:%6.2f\n"
+                       " RAM:%6.2f%%\n"
+                       "VRAM:%6.2f%%\n"
+                       "SRAM:%6.2f%%\n"
+                       " VTX:%6.2f%%",
+                       fps, sh4Mem, pvrMem, armMem, vtxBuf);
        }
 #endif

@ -119,15 +122,21 @@ void VmuProfiler::run() {
 }

 void VmuProfiler::updateVertexBufferUsage() {
+#ifndef DC_SH4
    std::unique_lock lk(mtx_);
    updated_ = true;

-#ifdef DC_SH4
-    vertBuffUse_ = vertexBufferUtilization();
+#else
+    pvr_stats_t pvrStats;
+    pvr_get_stats(&pvrStats);
+    float vtxUtil  = vertexBufferUtilization();

-    pvr_stats_t pvrStats; 
-    pvr_get_stats(&pvrStats);       
-    fps_[fpsFrame_++] = pvrStats.frame_rate;
+    { /* Critical section with VMU thread. */
+        std::unique_lock lk(mtx_);
+        vertBuffUse_ = vtxUtil;
+        updated_ = true;
+        fps_[fpsFrame_++] = pvrStats.frame_rate;
+    }

    if(fpsFrame_ >= fpsSamples)
        fpsFrame_ = 0;
--- a/src/liberty/collision/Collision.cpp
+++ b/src/liberty/collision/Collision.cpp
@ -1619,7 +1619,7 @@ CCollision::ProcessLineOfSight(const CColLine &line,
 		point.point = matrix * point.point;
 		point.normal = Multiply3x3(matrix, point.normal);
 #else
-		mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrix)));
+		dc::mat_load2(matrix);
 		mat_trans_single3_nodiv(point.point.x,
 		                        point.point.y,
 		                        point.point.z);
@ -1798,7 +1798,7 @@ CCollision::ProcessVerticalLine(const CColLine &line,
 		point.point = matrix * point.point;
 		point.normal = Multiply3x3(matrix, point.normal);
 #else
-		mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrix)));
+		dc::mat_load2(matrix);
 		mat_trans_single3_nodiv(point.point.x,
 		                        point.point.y,
 		                        point.point.z);
@ -2173,8 +2173,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
 #ifndef DC_SH4
 	matAB *= matrixA;
 #else
-	mat_load(reinterpret_cast<const matrix_t*>(&matAB));
-	mat_apply(reinterpret_cast<const matrix_t*>(&matrixA));
+	dc::mat_load_apply(matAB, matrixA);
 #endif

 	CColSphere bsphereAB;	// bounding sphere of A in B space
@ -2246,8 +2245,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
 #ifndef DC_SH4
 	matBA *= matrixB;
 #else
-	mat_load(reinterpret_cast<const matrix_t*>(&matBA));
-	mat_apply(reinterpret_cast<const matrix_t*>(&matrixB));
+	dc::mat_load_apply(matBA, matrixB);
 #endif
 	for(i = 0; i < modelB.numSpheres; i++){
 		s.radius = modelB.spheres[i].radius;
@ -2309,7 +2307,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
 	}

 #ifdef DC_SH4
-	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrixB)));
+	dc::mat_load2(matrixB);
 #endif
 	for(i = 0; i < numCollisions; i++){
 #ifndef DC_SH4
--- a/src/liberty/core/common.h
+++ b/src/liberty/core/common.h
@ -82,8 +82,6 @@

 #define rwVENDORID_ROCKSTAR 0x0253F2

-__always_inline auto Max(auto a, auto b) { return ((a > b)? a : b); }
-__always_inline auto Min(auto a, auto b) { return ((a < b)? a : b); }

 // Use this to add const that wasn't there in the original code
 #define Const const
@ -299,15 +297,6 @@ extern int strcasecmp(const char *str1, const char *str2);

 extern wchar *AllocUnicode(const char*src);

-template<typename T>
-__always_inline T Clamp(T v, auto low, auto high) {
-	return std::clamp(v, static_cast<T>(low), static_cast<T>(high));
-}
-
-__always_inline auto Clamp2(auto v, auto center, auto radius) {
-	return (v > center) ? Min(v, center + radius) : Max(v, center - radius);
-}
-
 #define SQR(x) ((x) * (x))
 __always_inline auto sq(auto x) { return SQR(x); }

@ -418,15 +407,7 @@ template<int s, int t> struct check_size {
 #endif
 #define BIT(num)                         (1<<(num))

-#define ABS(a) std::abs(a)
+#define ABS(a) Abs(a)

-__always_inline auto norm(auto value, auto min, auto max) {
-	return (Clamp(value, min, max) - min) / (max - min);
-}
 // we use std::lerp now
 //#define lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) )
-
-#define STRINGIFY(x)                    #x
-#define STR(x)                          STRINGIFY(x)
-#define CONCAT_(x,y) x##y
-#define CONCAT(x,y) CONCAT_(x,y)
--- a/src/liberty/math/Matrix.cpp
+++ b/src/liberty/math/Matrix.cpp
@ -4,7 +4,7 @@ CMatrix::CMatrix(CMatrix const &m)
 {
 	m_attachment = nil;
 	m_hasRwMatrix = false;
-	*this = m;
+	mat_copy(*this, m);
 }

 CMatrix::CMatrix(RwMatrix *matrix, bool owner)
@ -75,7 +75,7 @@ CMatrix::UpdateRW(void)
 void
 CMatrix::operator=(CMatrix const &rhs)
 {
-	memcpy(this, &rhs, sizeof(f));
+	mat_copy(*this, rhs);
 	if (m_attachment)
 		UpdateRW();
 }
@ -83,7 +83,7 @@ CMatrix::operator=(CMatrix const &rhs)
 void
 CMatrix::CopyOnlyMatrix(const CMatrix &other)
 {
-	memcpy(this, &other, sizeof(f));
+	mat_copy(*this, other);
 }

 CMatrix &
@ -277,9 +277,9 @@ void
 CMatrix::RotateX(float x)
 {
 #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
-	mat_load(reinterpret_cast<matrix_t *>(this));
-	mat_rotate_x(x);
-	mat_store(reinterpret_cast<matrix_t *>(this));
+       dc::mat_load2(reinterpret_cast<matrix_t *>(this));
+       mat_rotate_x(x);
+       dc::mat_store2(reinterpret_cast<matrix_t *>(this));
 #else
 	auto [s, c] = SinCos(x);

@ -307,9 +307,9 @@ void
 CMatrix::RotateY(float y)
 {
 #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
-	mat_load(reinterpret_cast<matrix_t *>(this));
-	mat_rotate_y(y);
-	mat_store(reinterpret_cast<matrix_t *>(this));
+       dc::mat_load2(reinterpret_cast<matrix_t *>(this));
+       mat_rotate_y(y);
+       dc::mat_store2(reinterpret_cast<matrix_t *>(this));
 #else
 	auto [s, c] = SinCos(y);

@ -337,9 +337,9 @@ void
 CMatrix::RotateZ(float z)
 {
 #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
-	mat_load(reinterpret_cast<matrix_t *>(this));
-	mat_rotate_z(z);
-	mat_store(reinterpret_cast<matrix_t *>(this));
+       dc::mat_load2(reinterpret_cast<matrix_t *>(this));
+       mat_rotate_z(z);
+       dc::mat_store2(reinterpret_cast<matrix_t *>(this));
 #else	
 	auto [s, c] = SinCos(z);

@ -367,9 +367,9 @@ void
 CMatrix::Rotate(float x, float y, float z)
 {
 #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
-	mat_load(reinterpret_cast<matrix_t *>(this));
-	mat_rotate(x, y, z);
-	mat_store(reinterpret_cast<matrix_t *>(this));
+       dc::mat_load2(reinterpret_cast<matrix_t *>(this));
+       mat_rotate(x, y, z);
+       dc::mat_store2(reinterpret_cast<matrix_t *>(this));
 #else
 	auto [sX, cX] = SinCos(x);
 	auto [sY, cY] = SinCos(y);
@ -449,65 +449,13 @@ CMatrix::Reorthogonalise(void)
 	f = CrossProduct(u, r);
 }

-#ifdef DC_SH4
-static __always_inline void MATH_Load_Matrix_Product(const matrix_t* matrix1, const matrix_t* matrix2)
-{
-    unsigned int prefetch_scratch;
-
-    asm volatile (
-        "mov %[bmtrx], %[pref_scratch]\n\t" // (MT)
-        "add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
-        "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE)
-        "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
-        // back matrix
-        "fmov.d @%[bmtrx]+, XD0\n\t" // (LS)
-        "fmov.d @%[bmtrx]+, XD2\n\t"
-        "fmov.d @%[bmtrx]+, XD4\n\t"
-        "fmov.d @%[bmtrx]+, XD6\n\t"
-        "pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait (LS)
-        "fmov.d @%[bmtrx]+, XD8\n\t" // bmtrx prefetch should work for here
-        "fmov.d @%[bmtrx]+, XD10\n\t"
-        "fmov.d @%[bmtrx]+, XD12\n\t"
-        "mov %[fmtrx], %[pref_scratch]\n\t" // (MT)
-        "add #32, %[pref_scratch]\n\t" // store offset by 32 in r0 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
-        "fmov.d @%[bmtrx], XD14\n\t"
-        "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
-        // front matrix
-        // interleave loads and matrix multiply 4x4
-        "fmov.d @%[fmtrx]+, DR0\n\t"
-        "fmov.d @%[fmtrx]+, DR2\n\t"
-        "fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec
-        "ftrv XMTRX, FV0\n\t" // (FE)
-
-        "fmov.d @%[fmtrx]+, DR6\n\t"
-        "fmov.d @%[fmtrx]+, DR8\n\t"
-        "ftrv XMTRX, FV4\n\t"
-
-        "fmov.d @%[fmtrx]+, DR10\n\t"
-        "fmov.d @%[fmtrx]+, DR12\n\t"
-        "ftrv XMTRX, FV8\n\t"
-
-        "fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles)
-        "fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE)
-        "ftrv XMTRX, FV12\n\t" // (FE)
-        // Save output in XF regs
-        "frchg\n"
-        : [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
-        : // no inputs
-        : "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered)
-    );
-}
-#endif
-
 CMatrix
 operator*(const CMatrix &m1, const CMatrix &m2)
 {
 	// TODO: VU0 code
 	CMatrix out;
-#if defined(RW_DC)
-	mat_load(reinterpret_cast<const matrix_t *>(&m1));
-	mat_apply(reinterpret_cast<const matrix_t *>(&m2));
-	mat_store(reinterpret_cast<matrix_t *>(&out));
+#ifdef DC_SH4
+	mat_mult(out, m1, m2);
 #else
 	out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz;
 	out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz;
--- a/src/liberty/math/Matrix.h
+++ b/src/liberty/math/Matrix.h
@ -1,5 +1,7 @@
 #pragma once

+#include "rwdc_common.h"
+
 class alignas(8) CMatrix
 {
 public:
@ -27,6 +29,8 @@ public:
 		SetScale(scale);
 	}
 	~CMatrix(void);
+	operator matrix_t *() { return reinterpret_cast<matrix_t *>(this); }
+	operator const matrix_t *() const { return reinterpret_cast<const matrix_t *>(this); }
 	void Attach(RwMatrix *matrix, bool owner = false);
 	void AttachRW(RwMatrix *matrix, bool owner = false);
 	void Detach(void);
@ -102,15 +106,24 @@ CMatrix Invert(const CMatrix &matrix);
 CMatrix operator*(const CMatrix &m1, const CMatrix &m2);
 inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec)
 {
+#ifndef DC_SH4
 	CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz);
 	return CVector(
 		mat.rx * v.x + mat.ry * v.y + mat.rz * v.z,
 		mat.fx * v.x + mat.fy * v.y + mat.fz * v.z,
 		mat.ux * v.x + mat.uy * v.y + mat.uz * v.z);
+#else
+    register float x asm(KOS_FPARG(0)) = vec.x - mat.px;
+    register float y asm(KOS_FPARG(1)) = vec.y - mat.py;
+    register float z asm(KOS_FPARG(2)) = vec.z - mat.pz;
+	return CVector(
+		fipr(x, y, z, 0.0f, mat.rx, mat.ry, mat.rz, 0.0f),
+		fipr(x, y, z, 0.0f, mat.fx, mat.fy, mat.fz, 0.0f),
+		fipr(x, y, z, 0.0f, mat.ux, mat.uy, mat.uz, 0.0f)
+	);
+#endif
 }

-
-
 class CCompressedMatrixNotAligned
 {
 	CVector m_vecPos;
--- a/src/liberty/math/Quaternion.cpp
+++ b/src/liberty/math/Quaternion.cpp
@ -39,10 +39,14 @@ CQuaternion::Slerp(const CQuaternion &q1, const CQuaternion &q2, float theta, fl
 void
 CQuaternion::Multiply(const CQuaternion &q1, const CQuaternion &q2)
 {
+#ifndef DC_SH4
 	x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
 	y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
 	z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
 	w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
+#else
+	quat_mult(*this, q1, q2);
+#endif
 }

 void
@ -51,9 +55,16 @@ CQuaternion::Get(RwV3d *axis, float *angle)
 	*angle = Acos(w);
 	float s = Sin(*angle);

+#ifndef DC_SH4
 	axis->x = x * (1.0f / s);
 	axis->y = y * (1.0f / s);
 	axis->z = z * (1.0f / s);
+#else
+	float invS = dc::Invert<true, false>(s);
+	axis->x = x * invS;
+	axis->y = y * invS;
+	axis->z = z * invS;
+#endif
 }

 void
@ -104,7 +115,7 @@ CQuaternion::Set(const RwMatrix &matrix)
 	if (f >= 0.0f) {
 		s = Sqrt(f + 1.0f);
 		w = 0.5f * s;
-		m = 0.5f / s;
+		m = Div<true, false>(0.5f, s);
 		x = (matrix.up.z - matrix.at.y) * m;
 		y = (matrix.at.x - matrix.right.z) * m;
 		z = (matrix.right.y - matrix.up.x) * m;
@ -115,7 +126,7 @@ CQuaternion::Set(const RwMatrix &matrix)
 	if (f >= 0.0f) {
 		s = Sqrt(f + 1.0f);
 		x = 0.5f * s;
-		m = 0.5f / s;
+		m = Div<true, false>(0.5f, s);
 		y = (matrix.up.x + matrix.right.y) * m;
 		z = (matrix.at.x + matrix.right.z) * m;
 		w = (matrix.up.z - matrix.at.y) * m;
@ -126,7 +137,7 @@ CQuaternion::Set(const RwMatrix &matrix)
 	if (f >= 0.0f) {
 		s = Sqrt(f + 1.0f);
 		y = 0.5f * s;
-		m = 0.5f / s;
+		m = Div<true, false>(0.5f, s);
 		w = (matrix.at.x - matrix.right.z) * m;
 		x = (matrix.up.x - matrix.right.y) * m;
 		z = (matrix.at.y + matrix.up.z) * m;
--- a/src/liberty/math/Quaternion.h
+++ b/src/liberty/math/Quaternion.h
@ -1,6 +1,7 @@
 #pragma once

 #include "src/common_defines.h"
+#include "rwdc_common.h"

 // TODO: actually implement this
 class CQuaternion
@ -10,6 +11,10 @@ public:
 	CQuaternion(void) {}
 	CQuaternion(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}

+	operator quaternion_t *() { return reinterpret_cast<quaternion_t *>(this); }
+	operator const quaternion_t *() const { return reinterpret_cast<const quaternion_t *>(this); }
+	operator quaternion_t &() { return *reinterpret_cast<quaternion_t *>(this); }
+	operator const quaternion_t &() const { return *reinterpret_cast<const quaternion_t *>(this); }
 	float Magnitude(void) const {
 #ifndef DC_SH4
 		return Sqrt(x*x + y*y + z*z + w*w);
@ -57,10 +62,11 @@ public:
 	}

 	const CQuaternion &operator/=(float right) {
-		x /= right;
-		y /= right;
-		z /= right;
-		w /= right;
+		right = dc::Invert<false>(right);
+		x *= right;
+		y *= right;
+		z *= right;
+		w *= right;
 		return *this;
 	}

@ -109,5 +115,6 @@ inline CQuaternion operator*(float left, const CQuaternion &right)

 inline CQuaternion operator/(const CQuaternion &left, float right)
 {
-	return CQuaternion(left.x / right, left.y / right, left.z / right, left.w / right);
+	right = Invert<false>(right);
+	return CQuaternion(left.x * right, left.y * right, left.z * right, left.w * right);
 }
--- a/src/liberty/math/Vector.cpp
+++ b/src/liberty/math/Vector.cpp
@ -27,32 +27,35 @@ CrossProduct(const CVector &v1, const CVector &v2)
 CVector
 Multiply3x3(const CMatrix &mat, const CVector &vec)
 {
-#ifdef DC_SH4
-    register float __x __asm__("fr12") = vec.x;
-    register float __y __asm__("fr13") = vec.y;
-    register float __z __asm__("fr14") = vec.z;
-    register float __w __asm__("fr15") = 0.0f;
-
-	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
-
-	asm volatile( "ftrv  xmtrx, fv12\n"
-                : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w)
-                : "0" (__x), "1" (__y), "2" (__z), "3" (__w) );
-    return { __x, __y, __z };
-#else
+#ifndef DC_SH4
 	// TODO: VU0 code
 	return CVector(mat.rx * vec.x + mat.fx * vec.y + mat.ux * vec.z,
 	               mat.ry * vec.x + mat.fy * vec.y + mat.uy * vec.z,
 	               mat.rz * vec.x + mat.fz * vec.y + mat.uz * vec.z);
+#else
+	CVector out;
+	dc::mat_load2(mat);
+	mat_trans_normal3_nomod(vec.x, vec.y, vec.z,
+	                        out.x, out.y, out.z);
+    return out;
 #endif
 }

 CVector
 Multiply3x3(const CVector &vec, const CMatrix &mat)
 {
+#ifndef DC_SH4
 	return CVector(mat.rx * vec.x + mat.ry * vec.y + mat.rz * vec.z,
 	               mat.fx * vec.x + mat.fy * vec.y + mat.fz * vec.z,
 	               mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z);
+#else
+	CVector out;
+	dc::mat_load2(mat);
+	mat_transpose();
+	mat_trans_normal3_nomod(vec.x, vec.y, vec.z,
+	                        out.x, out.y, out.z);
+	return out;
+#endif
 }

 CVector
@ -60,7 +63,7 @@ operator*(const CMatrix &mat, const CVector &vec)
 {
 #ifdef DC_SH4
 	CVector out;
-	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
+	dc::mat_load2(mat);
 	mat_trans_single3_nodiv_nomod(vec.x, vec.y, vec.z, out.x, out.y, out.z);
 	return out;
 #else
--- a/src/liberty/math/Vector.h
+++ b/src/liberty/math/Vector.h
@ -68,7 +68,7 @@ public:
  }

  const CVector &operator/=(float right) {
-    right = Invert(right);
+    right = Invert<false>(right);
    x *= right;
    y *= right;
    z *= right;
@ -112,7 +112,8 @@ inline CVector operator*(float left, const CVector &right)

 inline CVector operator/(const CVector &left, float right)
 {
-  return CVector(left.x / right, left.y / right, left.z / right);
+  right = Invert<false>(right);
+  return CVector(left.x * right, left.y * right, left.z * right);
 }

 __always_inline float
--- a/src/liberty/math/VuVector.h
+++ b/src/liberty/math/VuVector.h
@ -44,7 +44,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const CV
 		sqc2    vf06,0x0(%0)\n\
 		": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
 #elif defined(DC_SH4)
-	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
+	dc::mat_load2(mat);
 	mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
 #else
 	out = mat * in;
@ -71,7 +71,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw
 		sqc2    vf06,0x0(%0)\n\
 		": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
 #elif defined(DC_SH4)
-	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
+	dc::mat_load2(mat);
 	mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
 #else
 	out = mat * in;
@ -108,7 +108,7 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat,
 		bnez	%1,1b\n\
 		": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory");
 #elif defined(DC_SH4)
-    mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
+    dc::mat_load2(mat);
 	while(n--) {
 		mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z);
 		in = reinterpret_cast<const RwV3d *>(reinterpret_cast<const uint8_t *>(in) + stride);
--- a/src/liberty/math/maths.h
+++ b/src/liberty/math/maths.h
@ -1,73 +1,6 @@
 #pragma once

 #include "src/common_defines.h"
+#include "rwdc_common.h"

-#include <tuple>
-#include <dc/matrix.h>
-
-#ifdef DC_SH4
-
-#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
-        register float __x __asm__("fr12") = (x); \
-        register float __y __asm__("fr13") = (y); \
-        register float __z __asm__("fr14") = (z); \
-        register float __w __asm__("fr15") = 1.0f; \
-        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
-                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
-                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
-        x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
-    } while(false)
-
-#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \
-        register float __x __asm__("fr12") = (x); \
-        register float __y __asm__("fr13") = (y); \
-        register float __z __asm__("fr14") = (z); \
-        register float __w __asm__("fr15") = 1.0f; \
-        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
-                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
-                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
-        w = __w; \
-    } while(false)
-
-__always_inline float Fmac(float a, float b, float c) {
-    asm volatile ("fmac fr0, %[floatb], %[floatc]\n"
-        : [floatc] "+f" (c) : "w" (a), [floatb] "f" (b) : );
-    return c;
-}
-
-#else
-
-#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
-		vector_t tmp = { x_, y_, z_, 1.0f }; \
-		mat_transform(&tmp, &tmp, 1, 0); \
-		x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; w2 = tmp.w; \
-	} while(false)
-
-#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \
-		vector_t tmp = { x_, y_, z_, 1.0f }; \
-		mat_transform(&tmp, &tmp, 1, 0); \
-		w_ = tmp.w; \
-	} while(false)
-
-__always_inline float Fmac(float a, float b, float c) { return a * b + c; }
-
-#endif
-
-__always_inline float Sin(float x) { return __builtin_sinf(x); }
-__always_inline float Cos(float x) { return __builtin_cosf(x); }
-__always_inline auto SinCos(float x) { return std::pair { Sin(x), Cos(x) }; }
-__always_inline float Tan(float x) { return __builtin_tanf(x); }
-__always_inline float Abs(float x) { return __builtin_fabsf(x); }
-__always_inline float Sqrt(float x) { return __builtin_sqrtf(x); }
-__always_inline float RecipSqrt(float x) { return 1.0f / __builtin_sqrtf(x); }
-__always_inline float Asin(float x) { return __builtin_asinf(x); }
-__always_inline float Acos(float x) { return __builtin_acosf(x); }
-__always_inline float Atan(float x) { return __builtin_atanf(x); }
-__always_inline float Atan2(float y, float x) { return __builtin_atan2f(y, x); }
-__always_inline float RecipSqrt(float x, float y) { return x / __builtin_sqrtf(y); /*y = RecipSqrt(y); return x * y * y;*/ }
-__always_inline float Pow(float x, float y) { return __builtin_powf(x, y); }
-__always_inline float Floor(float x) { return __builtin_floorf(x); }
-__always_inline float Ceil(float x) { return __builtin_ceilf(x); }
-__always_inline float Invert(float x) { return (((x) < 0.0f)? -1.0f : 1.0f) * RecipSqrt((x) * (x)); } 
-__always_inline float Div(float x, float y) { return x * Invert(y); }
-__always_inline float Lerp(float a, float b, float t) { return Fmac(t, (b - a), a); }
+using namespace dc;
--- a/src/miami/collision/Collision.cpp
+++ b/src/miami/collision/Collision.cpp
@ -1473,7 +1473,7 @@ CCollision::ProcessLineOfSight(const CColLine &line,
 		point.point = matrix * point.point;
 		point.normal = Multiply3x3(matrix, point.normal);
 #else
-		mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrix)));
+		dc::mat_load2(matrix);
 		mat_trans_single3_nodiv(point.point.x,
 		                        point.point.y,
 		                        point.point.z);
@ -1653,7 +1653,7 @@ CCollision::ProcessVerticalLine(const CColLine &line,
 		point.point = matrix * point.point;
 		point.normal = Multiply3x3(matrix, point.normal);
 #else
-		mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrix)));
+		dc::mat_load2(matrix);
 		mat_trans_single3_nodiv(point.point.x,
 		                        point.point.y,
 		                        point.point.z);
@ -2027,8 +2027,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
 #ifndef DC_SH4
 	matAB *= matrixA;
 #else
-	mat_load(reinterpret_cast<const matrix_t*>(&matAB));
-	mat_apply(reinterpret_cast<const matrix_t*>(&matrixA));
+	dc::mat_load_apply(matAB, matrixA);
 #endif

 	CColSphere bsphereAB;	// bounding sphere of A in B space
@ -2099,8 +2098,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
 #ifndef DC_SH4
 	matBA *= matrixB;
 #else
-	mat_load(reinterpret_cast<const matrix_t*>(&matBA));
-	mat_apply(reinterpret_cast<const matrix_t*>(&matrixB));
+	dc::mat_load_apply(matBA, matrixB);
 #endif
 	for(i = 0; i < modelB.numSpheres; i++){
 		s.radius = modelB.spheres[i].radius;
@ -2162,7 +2160,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
 	}

 #ifdef DC_SH4
-	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrixB)));
+	dc::mat_load2(matrixB);
 #endif
 	for(i = 0; i < numCollisions; i++){
 #ifndef DC_SH4
--- a/src/miami/core/common.h
+++ b/src/miami/core/common.h
@ -85,8 +85,6 @@

 #define rwVENDORID_ROCKSTAR 0x0253F2

-#define Max(a,b) ((a) > (b) ? (a) : (b))
-#define Min(a,b) ((a) < (b) ? (a) : (b))

 // Use this to add const that wasn't there in the original code
 #define Const const
@ -303,12 +301,8 @@ extern int strncasecmp(const char *str1, const char *str2, size_t len);

 extern wchar *AllocUnicode(const char*src);

-#define Clamp(v, low, high) ((v)<(low) ? (low) : (v)>(high) ? (high) : (v))
-
-#define Clamp2(v, center, radius) ((v) > (center) ? Min(v, center + radius) : Max(v, center - radius))
-
-inline float sq(float x) { return x*x; }
 #define SQR(x) ((x) * (x))
+__always_inline auto sq(auto x) { return SQR(x); }

 #ifdef __MWERKS__
 #define M_E        2.71828182845904523536   // e
@ -326,7 +320,10 @@ inline float sq(float x) { return x*x; }
 #define M_SQRT1_2  0.707106781186547524401  // 1/sqrt(2)
 #endif

-#define PI (float)M_PI
+#ifndef DC_SH4
+#define F_PI M_PI
+#endif
+#define PI (float)F_PI
 #define TWOPI (PI*2)
 #define HALFPI (PI/2)
 #define DEGTORAD(x) ((x) * PI / 180.0f)
@ -380,7 +377,10 @@ __inline__ void TRACE(char *f, ...) { } // this is re3 only, and so the function
 #endif
 #endif

-#ifndef MASTER
+#ifdef assert
+#undef assert
+#endif
+#if !defined(MASTER)
 #define assert(_Expression) (void)( (!!(_Expression)) || (re3_assert(#_Expression, __FILE__, __LINE__, __FUNCTION__), 0) )
 #else
 #define assert(_Expression) (_Expression)
@ -411,12 +411,7 @@ template<int s, int t> struct check_size {
 #endif
 #define BIT(num)                         (1<<(num))

-#define ABS(a)  (((a) < 0) ? (-(a)) : (a))
-#define norm(value, min, max) (((value) < (min)) ? 0 : (((value) > (max)) ? 1 : (((value) - (min)) / ((max) - (min)))))
-#define Lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) )
-
-#define STRINGIFY(x)                    #x
-#define STR(x)                          STRINGIFY(x)
-#define CONCAT_(x,y) x##y
-#define CONCAT(x,y) CONCAT_(x,y)
+#define ABS(a) Abs(a)

+// we use std::lerp now
+//#define lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) )
--- a/src/miami/math/Matrix.cpp
+++ b/src/miami/math/Matrix.cpp
@ -4,7 +4,7 @@ CMatrix::CMatrix(CMatrix const &m)
 {
 	m_attachment = nil;
 	m_hasRwMatrix = false;
-	*this = m;
+	mat_copy(*this, m);
 }

 CMatrix::CMatrix(RwMatrix *matrix, bool owner)
@ -75,7 +75,7 @@ CMatrix::UpdateRW(void)
 void
 CMatrix::operator=(CMatrix const &rhs)
 {
-	memcpy(this, &rhs, sizeof(f));
+	mat_copy(*this, rhs);
 	if (m_attachment)
 		UpdateRW();
 }
@ -83,7 +83,7 @@ CMatrix::operator=(CMatrix const &rhs)
 void
 CMatrix::CopyOnlyMatrix(const CMatrix &other)
 {
-	memcpy(this, &other, sizeof(f));
+	mat_copy(*this, other);
 }

 CMatrix &
@ -358,12 +358,14 @@ CMatrix::RotateZ(float z)
 void
 CMatrix::Rotate(float x, float y, float z)
 {
-	float cX = Cos(x);
-	float sX = Sin(x);
-	float cY = Cos(y);
-	float sY = Sin(y);
-	float cZ = Cos(z);
-	float sZ = Sin(z);
+#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
+       dc::mat_load2(reinterpret_cast<matrix_t *>(this));
+       mat_rotate(x, y, z);
+       dc::mat_store2(reinterpret_cast<matrix_t *>(this));
+#else
+	auto [sX, cX] = SinCos(x);
+	auto [sY, cY] = SinCos(y);
+	auto [sZ, cZ] = SinCos(z);
 	
 	float rx = this->rx;
 	float ry = this->ry;
@ -388,6 +390,20 @@ CMatrix::Rotate(float x, float y, float z)
 	float z2 = sZ * sY - (cZ * sX) * cY;
 	float z3 = cX * cY;

+	#if !defined(DC_TEXCONV) && !defined(DC_SIM)
+	this->rx = fipr(x1, y1, z1, 0,  rx, ry, rz, 0);
+	this->ry = fipr(x2, y2, z2, 0,  rx, ry, rz, 0);
+	this->rz = fipr(x3, y3, z3, 0,  rx, ry, rz, 0);
+	this->fx = fipr(x1, y1, z1, 0,  ux, uy, uz, 0);
+	this->fy = fipr(x2, y2, z2, 0,  ux, uy, uz, 0);
+	this->fz = fipr(x3, y3, z3, 0,  ux, uy, uz, 0);
+	this->ux = fipr(x1, y1, z1, 0,  ax, ay, az, 0);
+	this->uy = fipr(x2, y2, z2, 0,  ax, ay, az, 0);
+	this->uz = fipr(x3, y3, z3, 0,  ax, ay, az, 0);
+	this->px = fipr(x1, y1, z1, 0,  px, py, pz, 0);
+	this->py = fipr(x2, y2, z2, 0,  px, py, pz, 0);
+	this->pz = fipr(x3, y3, z3, 0,  px, py, pz, 0);
+	#else
 	this->rx = x1 * rx + y1 * ry + z1 * rz;
 	this->ry = x2 * rx + y2 * ry + z2 * rz;
 	this->rz = x3 * rx + y3 * ry + z3 * rz;
@ -400,6 +416,8 @@ CMatrix::Rotate(float x, float y, float z)
 	this->px = x1 * px + y1 * py + z1 * pz;
 	this->py = x2 * px + y2 * py + z2 * pz;
 	this->pz = x3 * px + y3 * py + z3 * pz;
+	#endif
+#endif
 }

 CMatrix &
@ -429,9 +447,7 @@ operator*(const CMatrix &m1, const CMatrix &m2)
 	// TODO: VU0 code
 	CMatrix out;
 #if defined(RW_DC)
-	mat_load(reinterpret_cast<const matrix_t *>(&m1));
-	mat_apply(reinterpret_cast<const matrix_t *>(&m2));
-	mat_store(reinterpret_cast<matrix_t *>(&out));
+	mat_mult(out, m1, m2);
 #else
 	out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz;
 	out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz;
--- a/src/miami/math/Matrix.h
+++ b/src/miami/math/Matrix.h
@ -43,6 +43,8 @@ public:
 		SetScale(scale);
 	}
 	~CMatrix(void);
+	operator matrix_t *() { return reinterpret_cast<matrix_t *>(this); }
+	operator const matrix_t *() const { return reinterpret_cast<const matrix_t *>(this); }
 	void Attach(RwMatrix *matrix, bool owner = false);
 	void AttachRW(RwMatrix *matrix, bool owner = false);
 	void Detach(void);
@ -82,13 +84,12 @@ public:
 	void Scale(float sx, float sy, float sz)
 	{
 		for (int i = 0; i < 3; i++){
-			f[i][0] *= sx;
-			f[i][1] *= sy;
-			f[i][2] *= sz;
+				f[i][0] *= sx;
+				f[i][1] *= sy;
+				f[i][2] *= sz;
 		}
 	}

-
 	void SetRotateXOnly(float angle);
 	void SetRotateYOnly(float angle);
 	void SetRotateZOnly(float angle);
@ -125,11 +126,22 @@ CMatrix Invert(const CMatrix &matrix);
 CMatrix operator*(const CMatrix &m1, const CMatrix &m2);
 inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec)
 {
+#ifndef DC_SH4
 	CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz);
 	return CVector(
 		mat.rx * v.x + mat.ry * v.y + mat.rz * v.z,
 		mat.fx * v.x + mat.fy * v.y + mat.fz * v.z,
 		mat.ux * v.x + mat.uy * v.y + mat.uz * v.z);
+#else
+    register float x asm(KOS_FPARG(0)) = vec.x - mat.px;
+    register float y asm(KOS_FPARG(1)) = vec.y - mat.py;
+    register float z asm(KOS_FPARG(2)) = vec.z - mat.pz;
+	return CVector(
+		fipr(x, y, z, 0.0f, mat.rx, mat.ry, mat.rz, 0.0f),
+		fipr(x, y, z, 0.0f, mat.fx, mat.fy, mat.fz, 0.0f),
+		fipr(x, y, z, 0.0f, mat.ux, mat.uy, mat.uz, 0.0f)
+	);
+#endif
 }


--- a/src/miami/math/Quaternion.cpp
+++ b/src/miami/math/Quaternion.cpp
@ -39,10 +39,14 @@ CQuaternion::Slerp(const CQuaternion &q1, const CQuaternion &q2, float theta, fl
 void
 CQuaternion::Multiply(const CQuaternion &q1, const CQuaternion &q2)
 {
+#ifndef DC_SH4
 	x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
 	y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
 	z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
 	w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
+#else
+	quat_mult(*this, q1, q2);
+#endif
 }

 void
@ -51,9 +55,16 @@ CQuaternion::Get(RwV3d *axis, float *angle)
 	*angle = Acos(w);
 	float s = Sin(*angle);

+#ifndef DC_SH4
 	axis->x = x * (1.0f / s);
 	axis->y = y * (1.0f / s);
 	axis->z = z * (1.0f / s);
+#else
+	float invS = dc::Invert<true, false>(s);
+	axis->x = x * invS;
+	axis->y = y * invS;
+	axis->z = z * invS;
+#endif
 }

 void
@ -104,7 +115,7 @@ CQuaternion::Set(const RwMatrix &matrix)
 	if (f >= 0.0f) {
 		s = Sqrt(f + 1.0f);
 		w = 0.5f * s;
-		m = 0.5f / s;
+		m = Div<true, false>(0.5f, s);
 		x = (matrix.up.z - matrix.at.y) * m;
 		y = (matrix.at.x - matrix.right.z) * m;
 		z = (matrix.right.y - matrix.up.x) * m;
@ -115,7 +126,7 @@ CQuaternion::Set(const RwMatrix &matrix)
 	if (f >= 0.0f) {
 		s = Sqrt(f + 1.0f);
 		x = 0.5f * s;
-		m = 0.5f / s;
+		m = Div<true, false>(0.5f, s);
 		y = (matrix.up.x + matrix.right.y) * m;
 		z = (matrix.at.x + matrix.right.z) * m;
 		w = (matrix.up.z - matrix.at.y) * m;
@ -126,7 +137,7 @@ CQuaternion::Set(const RwMatrix &matrix)
 	if (f >= 0.0f) {
 		s = Sqrt(f + 1.0f);
 		y = 0.5f * s;
-		m = 0.5f / s;
+		m = Div<true, false>(0.5f, s);
 		w = (matrix.at.x - matrix.right.z) * m;
 		x = (matrix.up.x - matrix.right.y) * m;
 		z = (matrix.at.y + matrix.up.z) * m;
--- a/src/miami/math/Quaternion.h
+++ b/src/miami/math/Quaternion.h
@ -1,6 +1,7 @@
 #pragma once

 #include "src/common_defines.h"
+#include "rwdc_common.h"

 // TODO: actually implement this
 class CQuaternion
@ -10,6 +11,10 @@ public:
 	CQuaternion(void) {}
 	CQuaternion(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}

+	operator quaternion_t *() { return reinterpret_cast<quaternion_t *>(this); }
+	operator const quaternion_t *() const { return reinterpret_cast<const quaternion_t *>(this); }
+	operator quaternion_t &() { return *reinterpret_cast<quaternion_t *>(this); }
+	operator const quaternion_t &() const { return *reinterpret_cast<const quaternion_t *>(this); }
 	float Magnitude(void) const {
 #ifndef DC_SH4
 		return Sqrt(x*x + y*y + z*z + w*w);
@ -57,10 +62,11 @@ public:
 	}

 	const CQuaternion &operator/=(float right) {
-		x /= right;
-		y /= right;
-		z /= right;
-		w /= right;
+		right = dc::Invert<false>(right);
+		x *= right;
+		y *= right;
+		z *= right;
+		w *= right;
 		return *this;
 	}

@ -109,5 +115,6 @@ inline CQuaternion operator*(float left, const CQuaternion &right)

 inline CQuaternion operator/(const CQuaternion &left, float right)
 {
-	return CQuaternion(left.x / right, left.y / right, left.z / right, left.w / right);
+	right = Invert<false>(right);
+	return CQuaternion(left.x * right, left.y * right, left.z * right, left.w * right);
 }
--- a/src/miami/math/Vector.cpp
+++ b/src/miami/math/Vector.cpp
@ -27,32 +27,35 @@ CrossProduct(const CVector &v1, const CVector &v2)
 CVector
 Multiply3x3(const CMatrix &mat, const CVector &vec)
 {
-#ifdef DC_SH4
-    register float __x __asm__("fr12") = vec.x;
-    register float __y __asm__("fr13") = vec.y;
-    register float __z __asm__("fr14") = vec.z;
-    register float __w __asm__("fr15") = 0.0f;
-
-	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
-
-	asm volatile( "ftrv  xmtrx, fv12\n"
-                : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w)
-                : "0" (__x), "1" (__y), "2" (__z), "3" (__w) );
-    return { __x, __y, __z };
-#else
+#ifndef DC_SH4
 	// TODO: VU0 code
 	return CVector(mat.rx * vec.x + mat.fx * vec.y + mat.ux * vec.z,
 	               mat.ry * vec.x + mat.fy * vec.y + mat.uy * vec.z,
 	               mat.rz * vec.x + mat.fz * vec.y + mat.uz * vec.z);
+#else
+	CVector out;
+	dc::mat_load2(mat);
+	mat_trans_normal3_nomod(vec.x, vec.y, vec.z,
+	                        out.x, out.y, out.z);
+    return out;
 #endif
 }

 CVector
 Multiply3x3(const CVector &vec, const CMatrix &mat)
 {
+#ifndef DC_SH4
 	return CVector(mat.rx * vec.x + mat.ry * vec.y + mat.rz * vec.z,
 	               mat.fx * vec.x + mat.fy * vec.y + mat.fz * vec.z,
 	               mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z);
+#else
+	CVector out;
+	dc::mat_load2(mat);
+	mat_transpose();
+	mat_trans_normal3_nomod(vec.x, vec.y, vec.z,
+	                        out.x, out.y, out.z);
+	return out;
+#endif
 }

 CVector
@ -60,7 +63,7 @@ operator*(const CMatrix &mat, const CVector &vec)
 {
 #ifdef DC_SH4
 	CVector out;
-	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
+	dc::mat_load2(mat);
 	mat_trans_single3_nodiv_nomod(vec.x, vec.y, vec.z, out.x, out.y, out.z);
 	return out;
 #else
--- a/src/miami/math/VuVector.h
+++ b/src/miami/math/VuVector.h
@ -50,7 +50,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const CV
 		sqc2    vf06,0x0(%0)\n\
 		": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
 #elif defined(DC_SH4)
-	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
+	dc::mat_load2(mat);
 	mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
 #else
 	out = mat * in;
@ -77,7 +77,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw
 		sqc2    vf06,0x0(%0)\n\
 		": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
 #elif defined(DC_SH4)
-	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
+	dc::mat_load2(mat);
 	mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
 #else
 	out = mat * in;
@ -86,7 +86,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw

 __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat, const RwV3d *in, int stride)
 {
-#ifdef GTA_PS3
+#ifdef GTA_PS2
 	__asm__ __volatile__("\n\
 		paddub	$3,%4,$0\n\
 		lqc2    vf02,0x0(%2)\n\
@ -114,7 +114,7 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat,
 		bnez	%1,1b\n\
 		": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory");
 #elif defined(DC_SH4)
-    mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
+    dc::mat_load2(mat);
 	while(n--) {
 		mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z);
 		in = reinterpret_cast<const RwV3d *>(reinterpret_cast<const uint8_t *>(in) + stride);
@ -137,7 +137,6 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat,
 		lqc2    vf03,0x10(%2)\n\
 		lqc2    vf04,0x20(%2)\n\
 		lqc2    vf05,0x30(%2)\n\
-		lqc2    vf01,0x0(%3)\n\
 		nop\n\
 	1:	vmulax.xyz	ACC,   vf02,vf01\n\
 		vmadday.xyz	ACC,   vf03,vf01\n\
--- a/src/miami/math/maths.h
+++ b/src/miami/math/maths.h
@ -1,46 +1,6 @@
 #pragma once

 #include "src/common_defines.h"
+#include "rwdc_common.h"

-#include <dc/matrix.h>
-
-#ifdef DC_SH4
-
-#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
-        register float __x __asm__("fr12") = (x); \
-        register float __y __asm__("fr13") = (y); \
-        register float __z __asm__("fr14") = (z); \
-        register float __w __asm__("fr15") = 1.0f; \
-        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
-                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
-                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
-        x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
-    } while(false)
-
-#else
-
-#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
-		vector_t tmp = { x_, y_, z_, 1.0f }; \
-		mat_transform(&tmp, &tmp, 1, 0); \
-		x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; w2 = tmp.w; \
-	} while(false)
-#endif
-
-
-// wrapper around float versions of functions
-// in gta they are in CMaths but that makes the code rather noisy
-
-inline float Sin(float x) { return sinf(x); }
-inline float Asin(float x) { return asinf(x); }
-inline float Cos(float x) { return cosf(x); }
-inline float Acos(float x) { return acosf(x); }
-inline float Tan(float x) { return tanf(x); }
-inline float Atan(float x) { return atanf(x); }
-inline float Atan2(float y, float x) { return atan2f(y, x); }
-inline float Abs(float x) { return fabsf(x); }
-inline float Sqrt(float x) { return sqrtf(x); }
-inline float RecipSqrt(float x, float y) { return x/Sqrt(y); }
-inline float RecipSqrt(float x) { return RecipSqrt(1.0f, x); }
-inline float Pow(float x, float y) { return powf(x, y); }
-inline float Floor(float x) { return floorf(x); }
-inline float Ceil(float x) { return ceilf(x); }
+using namespace dc;
--- a/vendor/dca3-kos
+++ b/vendor/dca3-kos
@ -1 +1 @@
-Subproject commit 5f69d048aaf2548687c33afc7726d603a22e29b7
+Subproject commit a2706087626795b69689f06803329c1e37bdf7e2
--- a/vendor/librw/src/base.cpp
+++ b/vendor/librw/src/base.cpp
@ -89,10 +89,18 @@ strncmp_ci(const char *s1, const char *s2, int n)
 Quat
 mult(const Quat &q, const Quat &p)
 {
+#ifndef DC_SH4
 	return makeQuat(q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z,
 	                q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y,
 	                q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z,
 	                q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x);
+#else
+	Quat o;
+	dc::quat_mult(reinterpret_cast<dc::quaternion_t *>(&o),
+	              reinterpret_cast<const dc::quaternion_t &>(q),
+				  reinterpret_cast<const dc::quaternion_t &>(p));
+	return o;
+#endif
 }


--- a/vendor/librw/src/camera.cpp
+++ b/vendor/librw/src/camera.cpp
@ -429,6 +429,7 @@ Camera::frustumTestSphere(const Sphere *s) const
 {
 	int32 res = SPHEREINSIDE;
 	const FrustumPlane *p = this->frustumPlanes;
+#ifndef DC_SH4
 	for(int32 i = 0; i < 6; i++){
 		float32 dist = dot(p->plane.normal, s->center) - p->plane.distance;
 		if(s->radius < dist)
@ -437,6 +438,114 @@ Camera::frustumTestSphere(const Sphere *s) const
 			res = SPHEREBOUNDARY;
 		p++;
 	}
+#else
+	__builtin_prefetch(p);
+
+	register float sx asm("fr0") = s->center.x;
+	register float sy asm("fr1") = s->center.y;
+	register float sz asm("fr2") = s->center.z;
+	register float sw asm("fr3") = -1.0f;
+
+	// far
+	register float px asm("fr4") = p->plane.normal.x;
+	register float py asm("fr5") = p->plane.normal.y;
+	register float pz asm("fr6") = p->plane.normal.z;
+	register float pw asm("fr7") = p->plane.distance;
+
+	asm volatile("fipr fv0, fv4"
+				: "+f" (pw)
+				: "f" (sx), "f" (sy), "f" (sz), "f" (sw),
+				"f" (px), "f" (py), "f" (pz));
+	if(s->radius < pw)
+		return SPHEREOUTSIDE;
+	else if(s->radius > -pw)
+		res = SPHEREBOUNDARY;
+	p++;
+
+	// near
+	px = p->plane.normal.x;
+	py = p->plane.normal.y;
+	pz = p->plane.normal.z;
+	pw = p->plane.distance;
+	asm volatile("fipr fv0, fv4"
+				: "+f" (pw)
+				: "f" (sx), "f" (sy), "f" (sz), "f" (sw),
+				"f" (px), "f" (py), "f" (pz));
+	if(s->radius < pw)
+		return SPHEREOUTSIDE;
+	if(s->radius > -pw)
+		res = SPHEREBOUNDARY_NEAR;
+	p++;
+
+	const float* base_ptr0 = &p[0].plane.normal.x;
+	const float* base_ptr1 = &p[1].plane.normal.x;
+	const float* base_ptr2 = &p[2].plane.normal.x;
+	const float* base_ptr3 = &p[3].plane.normal.x;
+
+	__builtin_prefetch(base_ptr0);
+
+	static_assert(offsetof (decltype (p[0].plane.normal), y)
+				 -offsetof (decltype (p[0].plane.normal), x) == sizeof (float));
+
+	static_assert(offsetof (decltype (p[0].plane.normal), z)
+				  -offsetof (decltype (p[0].plane.normal), y) == sizeof (float));
+
+	static_assert(offsetof (decltype (p[0].plane), distance)
+				 -offsetof (decltype (p[0].plane.normal), z) == sizeof (float));
+
+	asm volatile (R"(
+		frchg
+
+		fmov.s  @%0+,fr0
+		fmov.s  @%1+,fr1
+		fmov.s  @%2+,fr2
+		fmov.s  @%3+,fr3
+
+		fmov.s  @%0+,fr4
+		fmov.s  @%1+,fr5
+		fmov.s  @%2+,fr6
+		fmov.s  @%3+,fr7
+
+		fmov.s  @%0+,fr8
+		fmov.s  @%1+,fr9
+		fmov.s  @%2+,fr10
+		fmov.s  @%3+,fr11
+
+		fmov.s  @%0,fr12
+		fmov.s  @%1,fr13
+		fmov.s  @%2,fr14
+		fmov.s  @%3,fr15
+
+		frchg
+	)" : "+&r" (base_ptr0), "+&r" (base_ptr1), "+&r" (base_ptr2), "+&r" (base_ptr3)
+	:
+	: );
+
+	float dists[4];
+	mat_trans_vec4_nodiv_nomod(sx, sy, sz, sw,
+	                           dists[0], dists[1], dists[2], dists[3]);
+
+	if(s->radius < dists[0])
+		return SPHEREOUTSIDE;
+	else if(s->radius > -dists[0])
+		res = SPHEREBOUNDARY;
+
+	if(s->radius < dists[1])
+		return SPHEREOUTSIDE;
+	else if(s->radius > -dists[1])
+		res = SPHEREBOUNDARY;
+
+	if(s->radius < dists[2])
+		return SPHEREOUTSIDE;
+	else if(s->radius > -dists[2])
+		res = SPHEREBOUNDARY;
+
+	if(s->radius < dists[3])
+		return SPHEREOUTSIDE;
+	else if(s->radius > -dists[3])
+		res = SPHEREBOUNDARY;
+
+#endif
 	return res;
 }

@ -445,7 +554,7 @@ Camera::frustumTestSphereNear(const Sphere *s) const
 {
 	int32 res = SPHEREINSIDE;
 	const FrustumPlane *p = this->frustumPlanes;
-
+#ifndef DC_SH4
 	// far
 	float32 dist = dot(p->plane.normal, s->center) - p->plane.distance;
 	if(s->radius < dist)
@ -481,6 +590,101 @@ Camera::frustumTestSphereNear(const Sphere *s) const
 		return SPHEREOUTSIDE;
 	p++;

+#else
+	__builtin_prefetch(p);
+
+	register float sx asm("fr0") = s->center.x;
+	register float sy asm("fr1") = s->center.y;
+	register float sz asm("fr2") = s->center.z;
+	register float sw asm("fr3") = -1.0f;
+
+	// far
+	register float px asm("fr4") = p->plane.normal.x;
+	register float py asm("fr5") = p->plane.normal.y;
+	register float pz asm("fr6") = p->plane.normal.z;
+	register float pw asm("fr7") = p->plane.distance;
+
+	asm volatile("fipr fv0, fv4"
+				 : "+f" (pw)
+				 : "f" (sx), "f" (sy), "f" (sz), "f" (sw),
+				   "f" (px), "f" (py), "f" (pz));
+	if(s->radius < pw)
+		return SPHEREOUTSIDE;
+	p++;
+
+	// near
+	px = p->plane.normal.x;
+	py = p->plane.normal.y;
+	pz = p->plane.normal.z;
+	pw = p->plane.distance;
+	asm volatile("fipr fv0, fv4"
+				 : "+f" (pw)
+				 : "f" (sx), "f" (sy), "f" (sz), "f" (sw),
+				   "f" (px), "f" (py), "f" (pz));
+	if(s->radius < pw)
+		return SPHEREOUTSIDE;
+	if(s->radius > -pw)
+		res = SPHEREBOUNDARY_NEAR;
+	p++;
+
+	const float* base_ptr0 = &p[0].plane.normal.x;
+	const float* base_ptr1 = &p[1].plane.normal.x;
+	const float* base_ptr2 = &p[2].plane.normal.x;
+	const float* base_ptr3 = &p[3].plane.normal.x;
+
+	__builtin_prefetch(base_ptr0);
+
+	static_assert(offsetof (decltype (p[0].plane.normal), y)
+				 -offsetof (decltype (p[0].plane.normal), x) == sizeof (float));
+
+	static_assert(offsetof (decltype (p[0].plane.normal), z)
+				  -offsetof (decltype (p[0].plane.normal), y) == sizeof (float));
+
+	static_assert(offsetof (decltype (p[0].plane), distance)
+				 -offsetof (decltype (p[0].plane.normal), z) == sizeof (float));
+
+	asm volatile (R"(
+		frchg
+
+		fmov.s  @%0+,fr0
+		fmov.s  @%1+,fr1
+		fmov.s  @%2+,fr2
+		fmov.s  @%3+,fr3
+
+		fmov.s  @%0+,fr4
+		fmov.s  @%1+,fr5
+		fmov.s  @%2+,fr6
+		fmov.s  @%3+,fr7
+
+		fmov.s  @%0+,fr8
+		fmov.s  @%1+,fr9
+		fmov.s  @%2+,fr10
+		fmov.s  @%3+,fr11
+
+		fmov.s  @%0,fr12
+		fmov.s  @%1,fr13
+		fmov.s  @%2,fr14
+		fmov.s  @%3,fr15
+
+		frchg
+	)" : "+&r" (base_ptr0), "+&r" (base_ptr1), "+&r" (base_ptr2), "+&r" (base_ptr3)
+	:
+	: );
+
+	float dists[4];
+	mat_trans_vec4_nodiv_nomod(sx, sy, sz, sw,
+	                           dists[0], dists[1], dists[2], dists[3]);
+
+	if(s->radius < dists[0])
+		return SPHEREOUTSIDE;
+	else if(s->radius < dists[1])
+		return SPHEREOUTSIDE;
+	else if(s->radius < dists[2])
+		return SPHEREOUTSIDE;
+	else if(s->radius < dists[3])
+		return SPHEREOUTSIDE;
+
+#endif
 	return res;
 }

--- a/vendor/librw/src/dc/rwdc.cpp
+++ b/vendor/librw/src/dc/rwdc.cpp
@ -37,8 +37,6 @@ extern const char* currentFile;
 #include <functional>
 #include <fstream>

-#define ARRAY_SIZE(array)                (sizeof(array) / sizeof(array[0]))
-
 #define errorf(...) dbglog(DBG_CRITICAL, __VA_ARGS__)
 #define logf(...) // printf(__VA_ARGS__)
 bool re3RemoveLeastUsedModel();
@ -47,21 +45,13 @@ void* re3StreamingAlloc(size_t size);

 // #include "rwdcimpl.h"

-#include <dc/pvr.h>
-#include <dc/matrix.h>
 #include "alloc.h"

-#undef PVR_TXRFMT_STRIDE
-#define PVR_TXRFMT_STRIDE       (1 << 25)
-
-static_assert(PVR_TXRFMT_STRIDE == (1 << 25), "PVR_TXRFMT_STRIDE is bugged in your KOS version");
+using namespace dc;

 // TODO: probably needs a better place to be
 bool doEnvironmentMaps = true;

-#define fclamp0_1(n) ((n) > 1.0f ? 1.0f : n < 0.0f ? 0.0f : n)
-#define fclamp1(n) ((n) > 1.0f ? 1.0f : n)
-
 struct alignas(32) pvr_vertex16_t {
 	uint32_t flags;			/**< \brief TA command (vertex flags) */
 	float	 x;				/**< \brief X coordinate */
@ -170,177 +160,10 @@ struct alignas(32) pvr_vertex32_ut {
 static_assert(sizeof(pvr_vertex16_t) == 32, "pvr_vertex16_t size mismatch");
 static_assert(alignof(pvr_vertex16_t) == 32, "pvr_vertex16_t alignof mismatch");

-
-#define MATH_Fast_Invert(x) ({ (((x) < 0.0f)? -1.0f : 1.0f) * frsqrt((x) * (x)); }) 
-
 static pvr_dr_state_t drState;

-#include <kos/dbglog.h>
-
 float VIDEO_MODE_SCALE_X;

-#if !defined(DC_TEXCONV) && !defined(DC_SIM)
-#include <kos.h>
-
-#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
-        register float __x __asm__("fr12") = (x); \
-        register float __y __asm__("fr13") = (y); \
-        register float __z __asm__("fr14") = (z); \
-        register float __w __asm__("fr15") = 1.0f; \
-        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
-                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
-                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
-        x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
-    } while(false)
-
-#define mat_trans_nodiv_nomod_zerow(x, y, z, x2, y2, z2, w2) do { \
-        register float __x __asm__("fr12") = (x); \
-        register float __y __asm__("fr13") = (y); \
-        register float __z __asm__("fr14") = (z); \
-        register float __w __asm__("fr15") = 0.0f; \
-        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
-                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
-                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
-        x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
-    } while(false)
-
-#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \
-        register float __x __asm__("fr12") = (x); \
-        register float __y __asm__("fr13") = (y); \
-        register float __z __asm__("fr14") = (z); \
-        register float __w __asm__("fr15") = 1.0f; \
-        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
-                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
-                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
-        w = __w; \
-    } while(false)
-
-	// no declspec naked, so can't do rts / fschg. instead compiler pads with nop?
-
-	inline void rw_mat_load_3x3(const rw::Matrix* mtx) {
-		__asm__ __volatile__ (
-			R"(
-				fschg
-				frchg
-
-				fmov        @%[mtx]+, dr0
-
-				fldi0 		fr12
-				fldi0 		fr13
-
-				fmov        @%[mtx]+, dr2
-				fmov        @%[mtx]+, dr4
-				fmov        @%[mtx]+, dr6
-				fmov        @%[mtx]+, dr8
-				fmov        @%[mtx]+, dr10
-
-				fldi0	    fr3
-				fldi0	    fr7
-				fldi0	    fr11
-				fmov        dr12, dr14
-
-				fschg
-				frchg
-			)"
-			: [mtx] "+r" (mtx)
-		);
-	}
-
-	// sets pos.w to 1
-	inline void rw_mat_load_4x4(const rw::Matrix* mtx) {
-		__asm__ __volatile__ (
-			R"(
-				fschg
-				frchg
-				fmov        @%[mtx]+, dr0
-
-				fmov        @%[mtx]+, dr2
-				fmov        @%[mtx]+, dr4
-				fmov        @%[mtx]+, dr6
-				fmov        @%[mtx]+, dr8
-				fmov        @%[mtx]+, dr10
-				fmov        @%[mtx]+, dr12
-				fmov        @%[mtx]+, dr14
-				fldi1 	 	fr15
-
-				fschg
-				frchg
-			)"
-			: [mtx] "+r" (mtx)
-		);
-	}
-
-#else
-extern matrix_t XMTRX;
-
-void rw_mat_load_3x3(rw::Matrix* mtx) {
-	memcpy(XMTRX, mtx, sizeof(matrix_t));
-	XMTRX[0][3] = 0.0f;
-	XMTRX[1][3] = 0.0f;
-	XMTRX[2][3] = 0.0f;
-
-	XMTRX[3][0] = 0.0f;
-	XMTRX[3][1] = 0.0f;
-	XMTRX[3][2] = 0.0f;
-	XMTRX[3][3] = 0.0f;
-}
-
-void rw_mat_load_4x4(rw::Matrix* mtx) {
-	memcpy(XMTRX, mtx, sizeof(matrix_t));
-	XMTRX[3][3] = 1.0f;
-}
-
-#include <dc/matrix.h>
-#define frsqrt(a) 				(1.0f/sqrt(a))
-#define dcache_pref_block(a)	__builtin_prefetch(a)
-
-#ifndef __always_inline
-#define __always_inline 		__attribute__((always_inline)) inline
-#endif
-
-#ifdef DC_TEXCONV
-#define mat_transform(a, b, c, d)
-#define mat_apply(a)
-#define mat_load(a)
-#define mat_store(a)
-#define mat_identity(a)
-#define pvr_fog_table_color(a,r,g,b)
-#define pvr_fog_table_linear(s,e)
-#define pvr_fog_table_exp(d)
-#define pvr_fog_table_custom(d)
-#endif
-
-#define mat_trans_single3_nomod(x_, y_, z_, x2, y2, z2) do { \
-		vector_t tmp = { x_, y_, z_, 1.0f }; \
-		mat_transform(&tmp, &tmp, 1, 0); \
-		z2 = 1.0f / tmp.w; \
-		x2 = tmp.x * z2; \
-		y2 = tmp.y * z2; \
-	} while(false)
-
-#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
-		vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \
-		mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
-		x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \
-	} while(false)
-
-#define mat_trans_nodiv_nomod_zerow(x_, y_, z_, x2, y2, z2, w2) do { \
-		vector_t tmp1233123 = { x_, y_, z_, 0.0f }; \
-		mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
-		x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \
-	} while(false)
-
-#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \
-		vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \
-		mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
-		w_ = tmp1233123.w; \
-	} while(false)
-
-#define memcpy4 memcpy
-
-// END STUBS
-#endif
-
 static pvr_ptr_t fake_tex;

 alignas(4) static const uint16_t fake_tex_data[] = {
@ -551,9 +374,9 @@ void DCE_MatrixViewport(float x, float y, float width, float height) {

 void DCE_InitMatrices() {
 	// Setup the screenview matrix.  Only need to do once since this matrix does not need to change for single player viewpoint.
-	mat_identity();
+	mat_identity2();
 	
-	mat_store(&DCE_MAT_SCREENVIEW);
+	mat_store2(&DCE_MAT_SCREENVIEW);
 }

 }
@ -684,7 +507,7 @@ struct atomic_context_t {
 __always_inline void DCE_RenderSubmitVertex(const pvr_vertex_t *v, uint32_t flags) {
    auto *sq  = reinterpret_cast<uint32_t *>(pvr_dr_target(drState));
    auto *src = reinterpret_cast<const uint32_t *>(v);
-    float sz  = MATH_Fast_Invert(v->z);
+    float sz  = Invert<true, false>(v->z);
    float sx  = v->x * sz;
    float sy  = v->y * sz;
    
@ -711,7 +534,7 @@ __always_inline void DCE_RenderSubmitVertexIM3D(float x, float y, float w,
 {
    auto *sq   = reinterpret_cast<uint32_t *>(pvr_dr_target(drState));
    auto *uv32 = reinterpret_cast<const uint32_t *>(uv);
-    float sz   = MATH_Fast_Invert(w);
+    float sz   = Invert<true, false>(w);
    float sx   = x * sz;
    float sy   = y * sz;

@ -728,13 +551,6 @@ __always_inline void DCE_RenderSubmitVertexIM3D(float x, float y, float w,

 /* END TA Submission Functions*/

-
-
-
-#if defined(DC_TEXCONV)
-void malloc_stats() { }
-#endif
-
 #if 0
 #define UNIMPL_LOG() printf("TODO: Implement %s @ %s:%d\n", __func__, __FILE__, __LINE__);
 #define UNIMPL_LOGV(fmt, ...) printf("TODO: Implement %s @ %s:%d " fmt "\n", __func__, __FILE__, __LINE__, __VA_ARGS__);
@ -808,9 +624,8 @@ void beginUpdate(Camera* cam)  {
 	
 	DCE_MatrixViewport(0, 0, cam->frameBuffer->width * VIDEO_MODE_SCALE_X, cam->frameBuffer->height);
 	
-	mat_load((matrix_t*)&DCE_MAT_SCREENVIEW);
-	mat_apply((matrix_t*)&cam->devProj);
-	mat_store((matrix_t*)&cam->devProjScreen);
+	mat_load_apply((matrix_t*)&DCE_MAT_SCREENVIEW, (matrix_t*)&cam->devProj);
+	mat_store2((matrix_t*)&cam->devProjScreen);

 }

@ -1838,7 +1653,7 @@ void im2DRenderPrimitive(PrimitiveType primType, void *vertices, int32_t numVert
 			pvrVert->flags = flags;
 			pvrVert->x 	   = gtaVert.x * VIDEO_MODE_SCALE_X;
 			pvrVert->y	   = gtaVert.y;
-			pvrVert->z 	   = MATH_Fast_Invert(gtaVert.w); // this is perfect for almost every case...
+			pvrVert->z 	   = Invert<true, false>(gtaVert.w); // this is perfect for almost every case...
 			pvrVert->u 	   = gtaVert.u;
 			pvrVert->v 	   = gtaVert.v;
 			pvrVert->argb  = (gtaVert.a << 24) |
@ -1959,7 +1774,7 @@ void im2DRenderIndexedPrimitive(PrimitiveType primType, void *vertices, int32 nu
 			pvrVert->flags = flags;
 			pvrVert->x 	   = gtaVert.x * VIDEO_MODE_SCALE_X;
 			pvrVert->y	   = gtaVert.y;
-			pvrVert->z 	   = MATH_Fast_Invert(gtaVert.w); // this is perfect for almost every case...
+			pvrVert->z 	   = Invert<true, false>(gtaVert.w); // this is perfect for almost every case...
 			pvrVert->u 	   = gtaVert.u;
 			pvrVert->v 	   = gtaVert.v;
 			pvrVert->argb  = (gtaVert.a << 24) |
@ -2022,8 +1837,8 @@ void im3DTransform(void *vertices, int32 numVertices, Matrix *worldMat, uint32 f
 	rw::RawMatrix::mult(&worldview, &world, &cam->devView);
 	rw::RawMatrix::mult(&proj, &worldview, &cam->devProj);
 	rw::RawMatrix::mult(&mtx, &proj, (RawMatrix*)&DCE_MAT_SCREENVIEW);
-	// mat_load(&DCE_MAT_SCREENVIEW);     // ~11 cycles.
-	mat_load(( matrix_t*)&mtx.right);  // Number of cycles: ~32.
+	// mat_load2(&DCE_MAT_SCREENVIEW);     // ~11 cycles.
+	mat_load2(( matrix_t*)&mtx.right);  // Number of cycles: ~32.
    if (im3dVertices) {
 		free(im3dVertices);
 	}
@ -2117,7 +1932,7 @@ void im3DRenderIndexedPrimitive(PrimitiveType primType,

 			// assuming near plane is 0.0f
 			// gv1 is visible (posi), and gv2 is behind the plane (negative)
-			float t = (1.0f - gv1.position.z) * MATH_Fast_Invert(gv2.position.z - gv1.position.z);
+			float t = (1.0f - gv1.position.z) * Invert<true, true>(gv2.position.z - gv1.position.z);

 			pvr_vertex_t pvrVert; 

@ -3395,13 +3210,13 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
 		auto skinningWeightData = (uint8_t*)skinWeights;

 		if (!matrix0Identity) {
-			rw_mat_load_4x4(&skinMatrices[0]);
+			mat_load_4x4(&skinMatrices[0]);
 			if (small_xyz) {
 				mat_apply(&DCE_MESHLET_MAT_DECODE);
 			}
 		} else {
 			if (small_xyz) {
-				mat_load(&DCE_MESHLET_MAT_DECODE);
+				mat_load2(&DCE_MESHLET_MAT_DECODE);
 			}
 		}

@ -3465,7 +3280,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
 				break;
 			}

-			rw_mat_load_4x4(currentMatrix);
+			mat_load_4x4(currentMatrix);
 			if (small_xyz){
 				mat_apply(&DCE_MESHLET_MAT_DECODE);
 			}
@ -3498,7 +3313,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
 		auto skinningWeightData = (uint8_t*)skinWeights;

 		if (!matrix0Identity) {
-			rw_mat_load_3x3(&skinMatrices[0]);
+			mat_load_3x3(&skinMatrices[0]);
 		}

 		for(;;) {
@ -3556,7 +3371,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
 				break;
 			}

-			rw_mat_load_3x3(currentMatrix);
+			mat_load_3x3(currentMatrix);

 			do {
 				auto srcOffset = *skinningIndexData++;
@ -3581,7 +3396,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
 __attribute__((noinline))
 void tnlMeshletEnvMap(uint8_t* OCR, uint8_t* normal, int vertexCount, int vertexSize, matrix_t* matfxMatrix, float matfxCoefficient) {

-	mat_load(matfxMatrix);
+	mat_load2(matfxMatrix);

 	do {
 		pvr_vertex64_t* v = (pvr_vertex64_t*)OCR;
@ -3998,15 +3813,12 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {

 	lightingCB(atomic, ac->uniform);

-
 	rw::RawMatrix world;
 	rw::convMatrix(&world, atomic->getFrame()->getLTM());
 	
-
-	mat_load((matrix_t*)&cam->devProjScreen);
-	mat_apply((matrix_t*)&cam->devView);
+	mat_load_apply((matrix_t*)&cam->devProjScreen, (matrix_t*)&cam->devView);
 	mat_apply((matrix_t*)&world);
-	mat_store((matrix_t*)&atomicContexts.back().mtx);
+	mat_store2((matrix_t*)&atomicContexts.back().mtx);

 	auto meshes = geo->meshHeader->getMeshes();

@ -4245,15 +4057,14 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
 						unsigned skinSelector = small_xyz + acp->skinMatrix0Identity*2;
 						tnlMeshletSkinVerticesSelector[skinSelector](OCR_SPACE, normalDst, &dcModel->data[meshlet->vertexOffset],  normalSrc, &dcModel->data[meshlet->skinWeightOffset], &dcModel->data[meshlet->skinIndexOffset], meshlet->vertexCount, meshlet->vertexSize, &acp->skinContextPointer->mtx);
 						
-						mat_load(&mtx);
+						mat_load2(&mtx);
 						tnlMeshletTransformSelector[clippingRequired * 2](OCR_SPACE, OCR_SPACE + 4, meshlet->vertexCount, 64);
 					} else {
 						
 						if (selector & 8) {
-							mat_load(&mtx);
-							mat_apply(&DCE_MESHLET_MAT_DECODE);
+							mat_load_apply(&mtx, &DCE_MESHLET_MAT_DECODE);
 						} else {
-							mat_load(&mtx);
+							mat_load2(&mtx);
 						}
 						tnlMeshletTransformSelector[smallSelector](OCR_SPACE, &dcModel->data[meshlet->vertexOffset], meshlet->vertexCount, meshlet->vertexSize);
 					}
@ -4280,7 +4091,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {

 						unsigned dstColOffset = textured ? offsetof(pvr_vertex64_t, a) : offsetof(pvr_vertex32_ut, a);
 						dce_set_mat_vertex_color(&residual, &material);
-						mat_load(&DCE_MESHLET_MAT_VERTEX_COLOR);
+						mat_load2(&DCE_MESHLET_MAT_VERTEX_COLOR);
 						tnlMeshletVertexColorSelector[0](OCR_SPACE + dstColOffset, (int8_t*)&dcModel->data[meshlet->vertexOffset] + colOffset, meshlet->vertexCount, meshlet->vertexSize);
 					} else {
 						unsigned dstColOffset = textured ? offsetof(pvr_vertex64_t, a) : offsetof(pvr_vertex32_ut, a);
@ -4302,7 +4113,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {

 						
 						unsigned normalSelector = (pass1 - 1) + (skin != 0) * 4;
-						mat_load((matrix_t*)&uniformObject.dir[0][0][0]);
+						mat_load2((matrix_t*)&uniformObject.dir[0][0][0]);
 						auto normalPointer = &dcModel->data[meshlet->vertexOffset] + normalOffset;
 						auto vtxSize = meshlet->vertexSize;
 						if (skin) {
@ -4317,7 +4128,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
 					
 						if (pass2) {
 							unsigned normalSelector = (pass2 - 1) + (skin != 0) * 4;
-							mat_load((matrix_t*)&uniformObject.dir[1][0][0]);
+							mat_load2((matrix_t*)&uniformObject.dir[1][0][0]);
 							tnlMeshletDiffuseColorSelector[normalSelector](OCR_SPACE + dstColOffset, normalPointer, meshlet->vertexCount, vtxSize, &lightDiffuseColors[4]);
 						}
 					}
@ -4380,7 +4191,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
 					indices.back() |= 0x80;

 					pvr_vertex64_t *vd = (pvr_vertex64_t *)OCR_SPACE;
-					mat_load(&mtx);  // Number of cycles: ~11
+					mat_load2(&mtx);  // Number of cycles: ~11

 					for (int idx = 0; idx < geo->numVertices; idx++) {
 						auto& vert = vertices[idx];
--- a/vendor/librw/src/dc/rwdc_common.h
+++ b/vendor/librw/src/dc/rwdc_common.h
@ -0,0 +1,739 @@
+#ifndef __RWDC_COMMON_H
+#define __RWDC_COMMON_H
+
+#include <tuple>
+#include <limits>
+#include <cmath>
+#include <type_traits>
+#include <algorithm>
+#include <cstring>
+
+#ifndef RW_DC
+#   pragma warning(disable: 4244)  // int to float
+#   pragma warning(disable: 4800)  // int to bool
+#   pragma warning(disable: 4838)  // narrowing conversion
+#   pragma warning(disable: 4996)  // POSIX names
+#else
+#   if !defined(DC_TEXCONV) && !defined(DC_SIM)
+#       include <kos.h>
+#       define DC_SH4
+#       define VIDEO_MODE_WIDTH     static_cast<float>(vid_mode->width)
+#       define VIDEO_MODE_HEIGHT    static_cast<float>(vid_mode->height)
+#       define memcpy4              memcpy
+#   else
+#       ifdef DC_TEXCONV
+#           define malloc_stats()
+#       endif
+#       include <dc/matrix.h>
+#       include <dc/pvr.h>
+#       include <kos/dbglog.h>
+#       define VIDEO_MODE_WIDTH     640.0f
+#       define VIDEO_MODE_HEIGHT    480.0f
+#       define memcpy4              memcpy
+#       define frsqrt(a)            (1.0f/sqrtf(a))
+#       define dcache_pref_block(a)	__builtin_prefetch(a)
+#       define F_PI                 M_PI
+#       ifndef __always_inline 
+#           define __always_inline  inline
+#       endif
+#   endif
+#   ifdef PVR_TXRFMT_STRIDE
+#       undef PVR_TXRFMT_STRIDE
+#       define PVR_TXRFMT_STRIDE (1 << 25)
+#   endif
+    static_assert(PVR_TXRFMT_STRIDE == (1 << 25), 
+                  "PVR_TXRFMT_STRIDE is bugged in your KOS version");
+#endif
+
+#define F_PI_2              (F_PI * 0.5f)
+#define __hot               __attribute__((hot))
+#define __cold              __attribute__((cold))
+#define __icache_aligned    __attribute__((aligned(32)))
+
+#define STRINGIFY(x)        #x
+#define STR(x)              STRINGIFY(x)
+#define CONCAT_(x,y)        x##y
+#define CONCAT(x,y)         CONCAT_(x,y)
+#define ARRAY_SIZE(array)   (sizeof(array) / sizeof(array[0]))
+
+namespace rw {
+    class Matrix;
+}
+
+namespace dc {
+
+struct quaternion_t {
+    float x, y, z, w;
+};
+
+__always_inline __hot constexpr float Sin(float x) { return sinf(x); }
+__always_inline __hot constexpr float Cos(float x) { return cosf(x); }
+__always_inline __hot constexpr auto  SinCos(float x) { return std::pair { Sin(x), Cos(x) }; }
+__always_inline __hot constexpr float Tan(float x) { return tanf(x); }
+__always_inline __hot constexpr float Atan(float x) { return atanf(x); }
+__always_inline __hot constexpr float Atan2(float y, float x) { return atan2f(y, x); }
+__always_inline __hot constexpr float Asin(float x) { return asinf(x);  }
+__always_inline __hot constexpr float Acos(float x) { return acosf(x); }
+__always_inline __hot constexpr float Abs(float x) { return fabsf(x); }
+__always_inline __hot constexpr float Sqrt(float x) { return sqrtf(x); }
+__always_inline __hot constexpr float RecipSqrt(float x, float y) { return x / Sqrt(y); }
+__always_inline __hot constexpr float Pow(float x, float y) { return powf(x, y); }
+__always_inline __hot constexpr float Floor(float x) { return floorf(x); }
+__always_inline __hot constexpr float Ceil(float x) { return ceilf(x); }
+__always_inline __hot constexpr float Fmac(auto a, auto b, auto c) { return a * b + c; }
+__always_inline __hot constexpr float Lerp(float a, float b, float t) { return Fmac(t, (b - a), a); }
+__always_inline __hot constexpr auto Max(auto a, auto b) { return ((a > b)? a : b); }
+__always_inline __hot constexpr auto Min(auto a, auto b) { return ((a < b)? a : b); }
+
+template<bool CHECK_ZERO=false>
+__always_inline __hot constexpr float RecipSqrt(float x) { 
+    if constexpr(CHECK_ZERO)
+        if(x == 0.0f) 
+            return 0.0f;
+    
+    return 1.0f / Sqrt(x); 
+}
+
+template<typename T>
+__always_inline __hot constexpr T Clamp(T v, auto low, auto high) {
+    return std::clamp(v, static_cast<T>(low), static_cast<T>(high));
+}
+
+__always_inline constexpr auto Clamp2(auto v, auto center, auto radius) {
+    return (v > center) ? Min(v, center + radius) : Max(v, center - radius);
+}
+
+template<bool FAST_APPROX=true, bool COPY_SIGN=true>
+__always_inline __hot constexpr float Invert(float x) { 
+    float value;
+    
+    if(!std::is_constant_evaluated() && FAST_APPROX) {
+        value = RecipSqrt(x * x);
+
+        if constexpr(COPY_SIGN)
+            if(x < 0.0f)
+                value = -value;
+
+    } else value = 1.0f / x;
+
+    return value;
+} 
+
+template<bool FAST_APPROX=true, bool COPY_SIGN=true>
+__always_inline __hot constexpr float Div(float x, float y) { 
+    if(FAST_APPROX && !std::is_constant_evaluated())
+        return x * Invert<true, COPY_SIGN>(y);
+    else
+        return x / y;
+}
+
+template<bool FAST_APPROX=false, bool COPY_SIGN=true>
+__always_inline __hot constexpr auto Norm(auto value, auto min, auto max) {
+    auto numerator = Clamp(value, min, max) - min;
+    auto denominator = (max - min);
+
+    if(FAST_APPROX && !std::is_constant_evaluated())
+        return Div<true, COPY_SIGN>(numerator, denominator);
+    else
+        return numerator / denominator;
+}
+
+#ifdef RW_DC
+#   ifdef DC_SH4
+
+#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
+        register float __x __asm__("fr12") = (x); \
+        register float __y __asm__("fr13") = (y); \
+        register float __z __asm__("fr14") = (z); \
+        register float __w __asm__("fr15") = 1.0f; \
+        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
+                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
+                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
+        x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
+    } while(false)
+
+#define mat_trans_nodiv_nomod_zerow(x, y, z, x2, y2, z2, w2) do { \
+        register float __x __asm__("fr12") = (x); \
+        register float __y __asm__("fr13") = (y); \
+        register float __z __asm__("fr14") = (z); \
+        register float __w __asm__("fr15") = 0.0f; \
+        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
+                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
+                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
+        x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
+    } while(false)
+
+#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \
+        register float __x __asm__("fr12") = (x); \
+        register float __y __asm__("fr13") = (y); \
+        register float __z __asm__("fr14") = (z); \
+        register float __w __asm__("fr15") = 1.0f; \
+        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
+                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
+                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
+        w = __w; \
+    } while(false)
+
+#define mat_trans_vec4_nodiv_nomod(x, y, z, w, x2, y2, z2, w2) { \
+        register float __x __asm__("fr0") = (x); \
+        register float __y __asm__("fr1") = (y); \
+        register float __z __asm__("fr2") = (z); \
+        register float __w __asm__("fr3") = (w); \
+        __asm__ __volatile__( "ftrv  xmtrx, fv0\n" \
+                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
+                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
+        x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
+    } while(false)
+
+inline __hot __icache_aligned void mat_load2(const matrix_t* mtx) {
+    asm volatile(
+        R"(
+            fschg
+            fmov.d	@%[mtx],xd0
+            add	    #32,%[mtx]
+            pref	@%[mtx]
+            add	    #-(32-8),%[mtx]
+            fmov.d	@%[mtx]+,xd2
+            fmov.d	@%[mtx]+,xd4
+            fmov.d	@%[mtx]+,xd6
+            fmov.d	@%[mtx]+,xd8
+            fmov.d	@%[mtx]+,xd10
+            fmov.d	@%[mtx]+,xd12
+            fmov.d	@%[mtx]+,xd14
+            fschg
+        )"
+        : [mtx] "+r" (mtx)
+        :
+        :
+    );
+}
+
+inline __hot __icache_aligned void mat_store2(matrix_t *mtx) {
+    asm volatile(
+        R"(
+            fschg
+            add	    #64-8,%[mtx]
+            fmov.d	xd14,@%[mtx]
+            add	    #-32,%[mtx]
+            pref	@%[mtx]
+            add	    #32,%[mtx]
+            fmov.d	xd12,@-%[mtx]
+            fmov.d	xd10,@-%[mtx]
+            fmov.d	xd8,@-%[mtx]
+            fmov.d	xd6,@-%[mtx]
+            fmov.d	xd4,@-%[mtx]
+            fmov.d	xd2,@-%[mtx]
+            fmov.d	xd0,@-%[mtx]
+            fschg
+        )"
+        : [mtx] "+&r" (mtx), "=m" (*mtx)
+        :
+        :
+    );
+}
+
+inline __hot __icache_aligned void mat_identity2(void) {
+    asm volatile(
+        R"(
+            frchg
+            fldi1	fr0
+            fschg
+            fldi0	fr1
+            fldi0	fr2
+            fldi0	fr3
+            fldi0	fr4
+            fldi1	fr5
+            fmov	dr2,dr6
+            fmov	dr2,dr8
+            fmov	dr0,dr10
+            fmov	dr2,dr12
+            fmov	dr4,dr14
+            fschg
+            frchg
+        )"
+    );
+}
+
+inline __hot __icache_aligned void mat_set_scale(float x, float y, float z) {
+    asm volatile(
+        R"(
+            frchg
+            fldi0	fr1
+            fschg
+            fldi0	fr2
+            fldi0	fr3
+            fldi0	fr4
+            fmov	dr2, dr6
+            fmov	dr2, dr8
+            fldi0	fr11
+            fmov	dr2, dr12
+            fldi0	fr14
+            fschg
+            frchg
+            fmov    %[x], xf0
+            fmov    %[y], xf5
+            fmov    %[z], xf10
+        )"
+        :
+        : [x] "r" (x), [y] "r" (y), [z] "r" (z)
+        :
+    );
+}
+
+// no declspec naked, so can't do rts / fschg. instead compiler pads with nop?
+inline __hot void mat_load_3x3(const rw::Matrix* mtx) {
+    __asm__ __volatile__ (
+        R"(
+            fschg
+            frchg
+
+            fmov        @%[mtx]+, dr0
+
+            fldi0 		fr12
+            fldi0 		fr13
+
+            fmov        @%[mtx]+, dr2
+            fmov        @%[mtx]+, dr4
+            fmov        @%[mtx]+, dr6
+            fmov        @%[mtx]+, dr8
+            fmov        @%[mtx]+, dr10
+
+            fldi0	    fr3
+            fldi0	    fr7
+            fldi0	    fr11
+            fmov        dr12, dr14
+
+            fschg
+            frchg
+        )"
+        : [mtx] "+r" (mtx)
+    );
+}
+
+// sets pos.w to 1
+inline __hot void mat_load_4x4(const rw::Matrix* mtx) {
+		__asm__ __volatile__ (
+			R"(
+				fschg
+				frchg
+				fmov        @%[mtx]+, dr0
+
+				fmov        @%[mtx]+, dr2
+				fmov        @%[mtx]+, dr4
+				fmov        @%[mtx]+, dr6
+				fmov        @%[mtx]+, dr8
+				fmov        @%[mtx]+, dr10
+				fmov        @%[mtx]+, dr12
+				fmov        @%[mtx]+, dr14
+				fldi1 	 	fr15
+
+				fschg
+				frchg
+			)"
+			: [mtx] "+r" (mtx)
+		);
+	}
+
+__hot inline void mat_transpose(void) {
+    asm volatile (
+        "frchg\n\t" // fmov for singles only works on front bank
+        // FR0, FR5, FR10, and FR15 are already in place
+        // swap FR1 and FR4
+        "flds FR1, FPUL\n\t"
+        "fmov FR4, FR1\n\t"
+        "fsts FPUL, FR4\n\t"
+        // swap FR2 and FR8
+        "flds FR2, FPUL\n\t"
+        "fmov FR8, FR2\n\t"
+        "fsts FPUL, FR8\n\t"
+        // swap FR3 and FR12
+        "flds FR3, FPUL\n\t"
+        "fmov FR12, FR3\n\t"
+        "fsts FPUL, FR12\n\t"
+        // swap FR6 and FR9
+        "flds FR6, FPUL\n\t"
+        "fmov FR9, FR6\n\t"
+        "fsts FPUL, FR9\n\t"
+        // swap FR7 and FR13
+        "flds FR7, FPUL\n\t"
+        "fmov FR13, FR7\n\t"
+        "fsts FPUL, FR13\n\t"
+        // swap FR11 and FR14
+        "flds FR11, FPUL\n\t"
+        "fmov FR14, FR11\n\t"
+        "fsts FPUL, FR14\n\t"
+        // restore XMTRX to back bank
+        "frchg\n"
+        : // no outputs
+        : // no inputs
+        : "fpul" // clobbers
+    );
+}
+
+__hot __icache_aligned inline void mat_copy(matrix_t *dst, const matrix_t *src) {
+    asm volatile(R"(
+        fschg
+
+        pref    @%[dst]
+
+        fmov.d  @%[src]+, xd0
+        fmov.d  @%[src]+, xd2
+        fmov.d  @%[src]+, xd4
+        fmov.d  @%[src]+, xd6
+    
+        pref    @%[src]
+
+        add     #32, %[dst]
+
+        fmov.d  xd6, @-%[dst]
+        fmov.d  xd4, @-%[dst]
+        fmov.d  xd2, @-%[dst]
+        fmov.d  xd0, @-%[dst]
+
+        add     #32, %[dst]
+        pref    @%[dst]
+
+        fmov.d  @%[src]+, xd0
+        fmov.d  @%[src]+, xd2
+        fmov.d  @%[src]+, xd4
+        fmov.d  @%[src]+, xd6
+
+        add     #32, %[dst]
+
+        fmov.d  xd6, @-%[dst]
+        fmov.d  xd4, @-%[dst]
+        fmov.d  xd2, @-%[dst]
+        fmov.d  xd0, @-%[dst]
+
+        fschg
+    )": [dst] "+&r" (dst), [src] "+&r" (src), "=m" (*dst)
+      :
+      :);
+}
+
+//TODO: FIXME FOR VC (AND USE FTRV)
+template<bool FAST_APPROX=false>
+__hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) {
+    if(FAST_APPROX && !std::is_constant_evaluated()) {
+    /*
+        // reorder the coefficients so that q1 stays in constant order {x,y,z,w}
+        // q2 then needs to be rotated after each inner product
+        x =  (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
+        y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
+        z =  (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
+        w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
+    */
+        // keep q1 in fv4
+        register float q1x __asm__ ("fr4") = (q1.x);
+        register float q1y __asm__ ("fr5") = (q1.y);
+        register float q1z __asm__ ("fr6") = (q1.z);
+        register float q1w __asm__ ("fr7") = (q1.w);
+
+        // load q2 into fv8, use it to get the shuffled reorder into fv0
+        register float q2x __asm__ ("fr8")  = (q2.x);
+        register float q2y __asm__ ("fr9")  = (q2.y);
+        register float q2z __asm__ ("fr10") = (q2.z);
+        register float q2w __asm__ ("fr11") = (q2.w);
+
+        // temporary operand / result in fv0
+        register float t1x __asm__ ("fr0");
+        register float t1y __asm__ ("fr1");
+        register float t1z __asm__ ("fr2");
+        register float t1w __asm__ ("fr3");
+
+        // x =  (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
+        t1x = q2w;
+        t1y = q2z;
+        t1z = -q2y;
+        t1w = q2w;
+        __asm__ ("\n"
+            " fipr	fv4,fv0\n"
+            : "+f" (t1w)
+            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
+              "f" (t1x), "f" (t1y), "f" (t1z)
+        );
+        // x = t1w;  try to avoid the stall by not reading the fipr result immediately
+
+        // y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
+        t1x = -q2z;
+        t1y = q2w;
+        t1z = q2x;
+        __atomic_thread_fence(1);
+        r->x = t1w;   // get previous result
+        t1w = q2y;
+        __asm__ ("\n"
+            "	fipr	fv4,fv0\n"
+            : "+f" (t1w)
+            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
+              "f" (t1x), "f" (t1y), "f" (t1z)
+        );
+        //y = t1w;
+
+        // z =  (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
+        t1x = q2y;
+        t1y = -q2x;
+        t1z = q2w;
+        __atomic_thread_fence(1);
+        r->y = t1w;   // get previous result
+        t1w = q2z;
+        __asm__ ("\n"
+            "	fipr	fv4,fv0\n"
+            : "+f" (t1w)
+            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
+              "f" (t1x), "f" (t1y), "f" (t1z)
+        );
+        //z = t1w;
+        __atomic_thread_fence(1);
+
+        // w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
+        q2x = -q2x;
+        q2y = -q2y;
+        q2z = -q2z;
+        __asm__ ("\n"
+            "	fipr	fv4,fv8\n"
+            : "+f" (q2w)
+            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
+              "f" (q2x), "f" (q2y), "f" (q2z)
+        );
+
+        __atomic_thread_fence(1);
+        r->z = t1w;
+        __atomic_thread_fence(1);
+        r->w = q2w;
+    } else {
+        r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
+        r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
+        r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
+        r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
+    }
+}
+
+__hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) {
+    unsigned int prefetch_scratch;
+
+    asm volatile (
+        "mov %[bmtrx], %[pref_scratch]\n\t" // (MT)
+        "add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
+        "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE)
+        "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
+        // back matrix
+        "fmov.d @%[bmtrx]+, XD0\n\t" // (LS)
+        "fmov.d @%[bmtrx]+, XD2\n\t"
+        "fmov.d @%[bmtrx]+, XD4\n\t"
+        "fmov.d @%[bmtrx]+, XD6\n\t"
+        "pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait (LS)
+        "fmov.d @%[bmtrx]+, XD8\n\t" // bmtrx prefetch should work for here
+        "fmov.d @%[bmtrx]+, XD10\n\t"
+        "fmov.d @%[bmtrx]+, XD12\n\t"
+        "mov %[fmtrx], %[pref_scratch]\n\t" // (MT)
+        "add #32, %[pref_scratch]\n\t" // store offset by 32 in r0 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
+        "fmov.d @%[bmtrx], XD14\n\t"
+        "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
+        // front matrix
+        // interleave loads and matrix multiply 4x4
+        "fmov.d @%[fmtrx]+, DR0\n\t"
+        "fmov.d @%[fmtrx]+, DR2\n\t"
+        "fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec
+        "ftrv XMTRX, FV0\n\t" // (FE)
+
+        "fmov.d @%[fmtrx]+, DR6\n\t"
+        "fmov.d @%[fmtrx]+, DR8\n\t"
+        "ftrv XMTRX, FV4\n\t"
+
+        "fmov.d @%[fmtrx]+, DR10\n\t"
+        "fmov.d @%[fmtrx]+, DR12\n\t"
+        "ftrv XMTRX, FV8\n\t"
+
+        "fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles)
+        "fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE)
+        "ftrv XMTRX, FV12\n\t" // (FE)
+        // Save output in XF regs
+        "frchg\n"
+        : [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
+        : // no inputs
+        : "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered)
+    );
+}
+
+__hot inline void mat_apply_rotate_x(float x) {
+    x *= 10430.37835f;
+    asm volatile(
+        "ftrc	%[a], fpul\n\t"
+        "fsca 	fpul, dr4\n\t"
+        "fldi0	fr8\n\t"
+        "fldi0	fr11\n\t"
+        "fmov	fr5, fr10\n\t"
+        "fmov	fr4, fr9\n\t"
+        "fneg	fr9\n\t"
+        "ftrv	xmtrx, fv8\n\t"
+        "fmov	fr4, fr6\n\t"
+        "fldi0	fr7\n\t"
+        "fldi0	fr4\n\t"
+        "ftrv	xmtrx, fv4\n\t"
+        "fschg\n\t"
+        "fmov	dr8, xd8\n\t"
+        "fmov	dr10, xd10\n\t"
+        "fmov	dr4, xd4\n\t"
+        "fmov	dr6, xd6\n\t"
+        "fschg\n"
+        :
+        : [a] "f"(x)
+        : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
+}
+
+__hot inline void mat_apply_rotate_y(float y) {
+    y *= 10430.37835f;
+    asm volatile(
+        "ftrc	%[a], fpul\n\t"
+        "fsca   fpul, dr6\n\t"
+        "fldi0	fr9\n\t"
+        "fldi0	fr11\n\t"
+        "fmov	fr6, fr8\n\t"
+        "fmov	fr7, fr10\n\t"
+        "ftrv	xmtrx, fv8\n\t"
+        "fmov	fr7, fr4\n\t"
+        "fldi0	fr5\n\t"
+        "fneg	fr6\n\t"
+        "fldi0	fr7\n\t"
+        "ftrv	xmtrx, fv4\n\t"
+        "fschg\n\t"
+        "fmov	dr8, xd8\n\t"
+        "fmov	dr10, xd10\n\t"
+        "fmov	dr4, xd0\n\t"
+        "fmov	dr6, xd2\n\t"
+        "fschg\n"
+        :
+        : [a] "f"(y)
+        : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
+}
+
+__hot inline void mat_apply_rotate_z(float z) {
+    z *= 10430.37835f;
+    asm volatile(
+        "ftrc	%[a], fpul\n\t"
+        "fsca   fpul, dr8\n\t"
+        "fldi0	fr10\n\t"
+        "fldi0	fr11\n\t"
+        "fmov	fr8, fr5\n\t"
+        "fneg	fr8\n\t"
+        "ftrv	xmtrx, fv8\n\t"
+        "fmov	fr9, fr4\n\t"
+        "fschg\n\t"
+        "fmov	dr10, dr6\n\t"
+        "ftrv	xmtrx, fv4\n\t"
+        "fmov	dr8, xd4\n\t"
+        "fmov	dr10, xd6\n\t"
+        "fmov	dr4, xd0\n\t"
+        "fmov	dr6, xd2\n\t"
+        "fschg\n"
+        :
+        : [a] "f"(z)
+        : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
+}
+
+#   else
+#       ifdef DC_TEXCONV
+#           define mat_apply(a)
+#           define mat_load2(a)
+#           define mat_store2(a)
+#           define mat_identity2()
+#           define mat_transform(a, b, c, d)
+#           define pvr_fog_table_color(a,r,g,b)
+#           define pvr_fog_table_linear(s,e)
+#       else
+#           define mat_load2(a)    mat_load(a)
+#           define mat_store2(a)   mat_store(a)
+#           define mat_identity2() mat_identity()
+#       endif
+
+#define mat_trans_single3_nomod(x_, y_, z_, x2, y2, z2) do { \
+        vector_t tmp = { x_, y_, z_, 1.0f }; \
+        mat_transform(&tmp, &tmp, 1, 0); \
+        z2 = 1.0f / tmp.w; \
+        x2 = tmp.x * z2; \
+        y2 = tmp.y * z2; \
+    } while(false)
+
+#define mat_trans_single3_nodiv_nomod(x_, y_, z_, x2, y2, z2) do { \
+        vector_t tmp = { x_, y_, z_, 1.0f }; \
+        mat_transform(&tmp, &tmp, 1, 0); \
+        z2 = tmp.z; \
+        x2 = tmp.x; \
+        y2 = tmp.y; \
+    } while(false)
+
+#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
+        vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \
+        mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
+        x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \
+    } while(false)
+
+#define mat_trans_nodiv_nomod_zerow(x_, y_, z_, x2, y2, z2, w2) do { \
+        vector_t tmp1233123 = { x_, y_, z_, 0.0f }; \
+        mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
+        x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \
+    } while(false)
+
+#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \
+        vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \
+        mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
+        w_ = tmp1233123.w; \
+    } while(false)
+
+inline void mat_load_3x3(const rw::Matrix* mtx) {
+    memcpy(XMTRX, mtx, sizeof(matrix_t));
+    XMTRX[0][3] = 0.0f;
+    XMTRX[1][3] = 0.0f;
+    XMTRX[2][3] = 0.0f;
+
+    XMTRX[3][0] = 0.0f;
+    XMTRX[3][1] = 0.0f;
+    XMTRX[3][2] = 0.0f;
+    XMTRX[3][3] = 0.0f;
+}
+
+inline void mat_load_4x4(const rw::Matrix* mtx) {
+    memcpy(XMTRX, mtx, sizeof(matrix_t));
+    XMTRX[3][3] = 1.0f;
+}
+
+inline void mat_transpose(void) {
+    matrix_t tmp;
+
+    for(unsigned i = 0; i < 4; ++i)
+        for(unsigned j = 0; j < 4; ++j)
+            tmp[j][i] = XMTRX[i][j];
+    
+    mat_load(&tmp);
+}
+
+template<bool FAST_APPROX=true>
+__hot inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) {
+    r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
+    r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
+    r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
+    r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
+}
+
+__hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) {
+    mat_load(matrix1);
+    mat_apply(matrix2);
+}
+
+__always_inline __hot void mat_copy(matrix_t *dst, const matrix_t *src) {
+    mat_load(src);
+    mat_store(dst);
+}
+
+#endif
+
+__hot inline void mat_mult(matrix_t *out, const matrix_t* matrix1, const matrix_t* matrix2) {
+    mat_load_apply(matrix1, matrix2);
+    mat_store2(out);
+}
+
+#endif
+
+}
+
+#endif /* RWDC_COMMON_H */
--- a/vendor/librw/src/rwbase.h
+++ b/vendor/librw/src/rwbase.h
@ -1,6 +1,9 @@
 #pragma once

 #include "common_defines.h"
+#ifdef RW_DC
+#include "rwdc_common.h"
+#endif

 #ifndef RW_PS2
 #include <stdint.h>
@ -322,7 +325,13 @@ inline float32 dot(const Quat &q, const Quat &p) {
 #endif
 }
 inline Quat scale(const Quat &q, float32 r) { return makeQuat(q.w*r, q.x*r, q.y*r, q.z*r); }
-inline float32 length(const Quat &q) { return sqrtf(q.w*q.w + q.x*q.x + q.y*q.y + q.z*q.z); }
+inline float32 length(const Quat &q) {
+#ifndef DC_SH4
+	return sqrtf(q.w*q.w + q.x*q.x + q.y*q.y + q.z*q.z);
+#else
+	return dc::Sqrt(fipr_magnitude_sqr(q.x, q.y, q.z, q.w));
+#endif
+}
 inline Quat normalize(const Quat &q) { return scale(q, 1.0f/length(q)); }
 inline Quat conj(const Quat &q) { return makeQuat(q.w, -q.x, -q.y, -q.z); }
 Quat mult(const Quat &q, const Quat &p);