Liberty/Miami Perf Gainz Phase 3

1) synced dca3-kos repo which has some gainzy commits
2) rwdc_common.h
    - all low-level and matrix/vector routines for SH4 are now shared in
      this common file, included in both RW and Liberty/Miami engines
3) CMatrix
    a. assignment operator: now uses asm-optimized mat_copy()
    b. multiplication operator: now use mat_mult() SH4 routine
    c. Scale(): applies a scale matrix via mat_scale
    d. MultiplyInverse: fipr-optimizations
4) CQuaternion
    a. multiplication: SH4 ASM FIPR optimized
    b. Get(V3d& axis, float &angle): fast inversion/division
    c. Set(RWMatrix&): fast division
5) CVector
    a. Multiply3x3() now accelerated with mat_transpose
5) RwQuat
    a. mult(): FIPR accelerated
    b. length(): FIPR/FSRRA accelerated
This commit is contained in:
Falco Girgis 2025-04-15 12:06:12 -05:00
parent cca7b3c6fa
commit 4d13e821b5
24 changed files with 967 additions and 278 deletions

View file

@ -381,6 +381,7 @@ INCLUDE = \
-I../src/liberty/skel/win \ -I../src/liberty/skel/win \
\ \
-I../vendor/librw \ -I../vendor/librw \
-I../vendor/librw/src/dc \
\ \
-I../vendor/miniLZO \ -I../vendor/miniLZO \
\ \

View file

@ -396,6 +396,7 @@ INCLUDE = \
-I../src/miami/skel/win \ -I../src/miami/skel/win \
\ \
-I../vendor/librw \ -I../vendor/librw \
-I../vendor/librw/src/dc \
\ \
-I../vendor/miniLZO \ -I../vendor/miniLZO \
\ \

View file

@ -94,23 +94,26 @@ void VmuProfiler::run() {
pvr_stats_t pvrStats; pvr_get_stats(&pvrStats); pvr_stats_t pvrStats; pvr_get_stats(&pvrStats);
uint32_t sramStats = snd_mem_available(); uint32_t sramStats = snd_mem_available();
size_t pvrAvail = pvr_mem_available(); size_t pvrAvail = pvr_mem_available();
float fps = std::accumulate(std::begin(fps_), std::end(fps_), 0.0f)
/ static_cast<float>(fpsSamples);
float sh4Mem = heapUtilization(); float sh4Mem = heapUtilization();
float pvrMem = (8_MB - pvrAvail ) / 8_MB * 100.0f; float pvrMem = (8_MB - pvrAvail ) / 8_MB * 100.0f;
float armMem = (2_MB - sramStats) / 2_MB * 100.0f; float armMem = (2_MB - sramStats) / 2_MB * 100.0f;
float vtxBuf = vertBuffUse_;
{
std::shared_lock lk(mtx_);
vmu_printf("FPS :%6.2f\n" float vtxBuf;
"SH4 :%6.2f%%\n" float fps;
"PVR :%6.2f%%\n" { /* Critical section with main thread. */
"ARM :%6.2f%%\n" std::shared_lock lk(mtx_);
"VTX :%6.2f%%", vtxBuf = vertBuffUse_;
fps, sh4Mem, pvrMem, armMem, vtxBuf); fps = std::accumulate(std::begin(fps_), std::end(fps_), 0.0f)
/ static_cast<float>(fpsSamples);
} }
vmu_printf(" FPS:%6.2f\n"
" RAM:%6.2f%%\n"
"VRAM:%6.2f%%\n"
"SRAM:%6.2f%%\n"
" VTX:%6.2f%%",
fps, sh4Mem, pvrMem, armMem, vtxBuf);
} }
#endif #endif
@ -119,15 +122,21 @@ void VmuProfiler::run() {
} }
void VmuProfiler::updateVertexBufferUsage() { void VmuProfiler::updateVertexBufferUsage() {
#ifndef DC_SH4
std::unique_lock lk(mtx_); std::unique_lock lk(mtx_);
updated_ = true; updated_ = true;
#ifdef DC_SH4 #else
vertBuffUse_ = vertexBufferUtilization(); pvr_stats_t pvrStats;
pvr_get_stats(&pvrStats);
float vtxUtil = vertexBufferUtilization();
pvr_stats_t pvrStats; { /* Critical section with VMU thread. */
pvr_get_stats(&pvrStats); std::unique_lock lk(mtx_);
fps_[fpsFrame_++] = pvrStats.frame_rate; vertBuffUse_ = vtxUtil;
updated_ = true;
fps_[fpsFrame_++] = pvrStats.frame_rate;
}
if(fpsFrame_ >= fpsSamples) if(fpsFrame_ >= fpsSamples)
fpsFrame_ = 0; fpsFrame_ = 0;

View file

@ -82,8 +82,6 @@
#define rwVENDORID_ROCKSTAR 0x0253F2 #define rwVENDORID_ROCKSTAR 0x0253F2
__always_inline auto Max(auto a, auto b) { return ((a > b)? a : b); }
__always_inline auto Min(auto a, auto b) { return ((a < b)? a : b); }
// Use this to add const that wasn't there in the original code // Use this to add const that wasn't there in the original code
#define Const const #define Const const
@ -299,15 +297,6 @@ extern int strcasecmp(const char *str1, const char *str2);
extern wchar *AllocUnicode(const char*src); extern wchar *AllocUnicode(const char*src);
template<typename T>
__always_inline T Clamp(T v, auto low, auto high) {
return std::clamp(v, static_cast<T>(low), static_cast<T>(high));
}
__always_inline auto Clamp2(auto v, auto center, auto radius) {
return (v > center) ? Min(v, center + radius) : Max(v, center - radius);
}
#define SQR(x) ((x) * (x)) #define SQR(x) ((x) * (x))
__always_inline auto sq(auto x) { return SQR(x); } __always_inline auto sq(auto x) { return SQR(x); }
@ -418,15 +407,7 @@ template<int s, int t> struct check_size {
#endif #endif
#define BIT(num) (1<<(num)) #define BIT(num) (1<<(num))
#define ABS(a) std::abs(a) #define ABS(a) Abs(a)
__always_inline auto norm(auto value, auto min, auto max) {
return (Clamp(value, min, max) - min) / (max - min);
}
// we use std::lerp now // we use std::lerp now
//#define lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) ) //#define lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) )
#define STRINGIFY(x) #x
#define STR(x) STRINGIFY(x)
#define CONCAT_(x,y) x##y
#define CONCAT(x,y) CONCAT_(x,y)

View file

@ -4,7 +4,7 @@ CMatrix::CMatrix(CMatrix const &m)
{ {
m_attachment = nil; m_attachment = nil;
m_hasRwMatrix = false; m_hasRwMatrix = false;
*this = m; mat_copy(*this, m);
} }
CMatrix::CMatrix(RwMatrix *matrix, bool owner) CMatrix::CMatrix(RwMatrix *matrix, bool owner)
@ -75,7 +75,7 @@ CMatrix::UpdateRW(void)
void void
CMatrix::operator=(CMatrix const &rhs) CMatrix::operator=(CMatrix const &rhs)
{ {
memcpy(this, &rhs, sizeof(f)); mat_copy(*this, rhs);
if (m_attachment) if (m_attachment)
UpdateRW(); UpdateRW();
} }
@ -83,7 +83,7 @@ CMatrix::operator=(CMatrix const &rhs)
void void
CMatrix::CopyOnlyMatrix(const CMatrix &other) CMatrix::CopyOnlyMatrix(const CMatrix &other)
{ {
memcpy(this, &other, sizeof(f)); mat_copy(*this, other);
} }
CMatrix & CMatrix &
@ -277,9 +277,9 @@ void
CMatrix::RotateX(float x) CMatrix::RotateX(float x)
{ {
#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
mat_load(reinterpret_cast<matrix_t *>(this)); mat_load(reinterpret_cast<matrix_t *>(this));
mat_rotate_x(x); mat_rotate_x(x);
mat_store(reinterpret_cast<matrix_t *>(this)); mat_store(reinterpret_cast<matrix_t *>(this));
#else #else
auto [s, c] = SinCos(x); auto [s, c] = SinCos(x);
@ -307,9 +307,9 @@ void
CMatrix::RotateY(float y) CMatrix::RotateY(float y)
{ {
#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
mat_load(reinterpret_cast<matrix_t *>(this)); mat_load(reinterpret_cast<matrix_t *>(this));
mat_rotate_y(y); mat_rotate_y(y);
mat_store(reinterpret_cast<matrix_t *>(this)); mat_store(reinterpret_cast<matrix_t *>(this));
#else #else
auto [s, c] = SinCos(y); auto [s, c] = SinCos(y);
@ -337,9 +337,9 @@ void
CMatrix::RotateZ(float z) CMatrix::RotateZ(float z)
{ {
#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
mat_load(reinterpret_cast<matrix_t *>(this)); mat_load(reinterpret_cast<matrix_t *>(this));
mat_rotate_z(z); mat_rotate_z(z);
mat_store(reinterpret_cast<matrix_t *>(this)); mat_store(reinterpret_cast<matrix_t *>(this));
#else #else
auto [s, c] = SinCos(z); auto [s, c] = SinCos(z);
@ -367,9 +367,9 @@ void
CMatrix::Rotate(float x, float y, float z) CMatrix::Rotate(float x, float y, float z)
{ {
#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
mat_load(reinterpret_cast<matrix_t *>(this)); mat_load(reinterpret_cast<matrix_t *>(this));
mat_rotate(x, y, z); mat_rotate(x, y, z);
mat_store(reinterpret_cast<matrix_t *>(this)); mat_store(reinterpret_cast<matrix_t *>(this));
#else #else
auto [sX, cX] = SinCos(x); auto [sX, cX] = SinCos(x);
auto [sY, cY] = SinCos(y); auto [sY, cY] = SinCos(y);
@ -449,65 +449,13 @@ CMatrix::Reorthogonalise(void)
f = CrossProduct(u, r); f = CrossProduct(u, r);
} }
#ifdef DC_SH4
static __always_inline void MATH_Load_Matrix_Product(const matrix_t* matrix1, const matrix_t* matrix2)
{
unsigned int prefetch_scratch;
asm volatile (
"mov %[bmtrx], %[pref_scratch]\n\t" // (MT)
"add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
"fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE)
"pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
// back matrix
"fmov.d @%[bmtrx]+, XD0\n\t" // (LS)
"fmov.d @%[bmtrx]+, XD2\n\t"
"fmov.d @%[bmtrx]+, XD4\n\t"
"fmov.d @%[bmtrx]+, XD6\n\t"
"pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait (LS)
"fmov.d @%[bmtrx]+, XD8\n\t" // bmtrx prefetch should work for here
"fmov.d @%[bmtrx]+, XD10\n\t"
"fmov.d @%[bmtrx]+, XD12\n\t"
"mov %[fmtrx], %[pref_scratch]\n\t" // (MT)
"add #32, %[pref_scratch]\n\t" // store offset by 32 in r0 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
"fmov.d @%[bmtrx], XD14\n\t"
"pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
// front matrix
// interleave loads and matrix multiply 4x4
"fmov.d @%[fmtrx]+, DR0\n\t"
"fmov.d @%[fmtrx]+, DR2\n\t"
"fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec
"ftrv XMTRX, FV0\n\t" // (FE)
"fmov.d @%[fmtrx]+, DR6\n\t"
"fmov.d @%[fmtrx]+, DR8\n\t"
"ftrv XMTRX, FV4\n\t"
"fmov.d @%[fmtrx]+, DR10\n\t"
"fmov.d @%[fmtrx]+, DR12\n\t"
"ftrv XMTRX, FV8\n\t"
"fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles)
"fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE)
"ftrv XMTRX, FV12\n\t" // (FE)
// Save output in XF regs
"frchg\n"
: [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
: // no inputs
: "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered)
);
}
#endif
CMatrix CMatrix
operator*(const CMatrix &m1, const CMatrix &m2) operator*(const CMatrix &m1, const CMatrix &m2)
{ {
// TODO: VU0 code // TODO: VU0 code
CMatrix out; CMatrix out;
#if defined(RW_DC) #if defined(RW_DC)
mat_load(reinterpret_cast<const matrix_t *>(&m1)); mat_mult(out, m1, m2);
mat_apply(reinterpret_cast<const matrix_t *>(&m2));
mat_store(reinterpret_cast<matrix_t *>(&out));
#else #else
out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz; out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz;
out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz; out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz;

View file

@ -1,5 +1,7 @@
#pragma once #pragma once
#include "rwdc_common.h"
class alignas(8) CMatrix class alignas(8) CMatrix
{ {
public: public:
@ -27,6 +29,8 @@ public:
SetScale(scale); SetScale(scale);
} }
~CMatrix(void); ~CMatrix(void);
operator matrix_t *() { return reinterpret_cast<matrix_t *>(this); }
operator const matrix_t *() const { return reinterpret_cast<const matrix_t *>(this); }
void Attach(RwMatrix *matrix, bool owner = false); void Attach(RwMatrix *matrix, bool owner = false);
void AttachRW(RwMatrix *matrix, bool owner = false); void AttachRW(RwMatrix *matrix, bool owner = false);
void Detach(void); void Detach(void);
@ -59,6 +63,7 @@ public:
void SetScale(float s); void SetScale(float s);
void Scale(float scale) void Scale(float scale)
{ {
#ifndef DC_SH4
for (int i = 0; i < 3; i++) for (int i = 0; i < 3; i++)
#ifdef FIX_BUGS // BUGFIX from VC #ifdef FIX_BUGS // BUGFIX from VC
for (int j = 0; j < 3; j++) for (int j = 0; j < 3; j++)
@ -66,6 +71,11 @@ public:
for (int j = 0; j < 4; j++) for (int j = 0; j < 4; j++)
#endif #endif
f[i][j] *= scale; f[i][j] *= scale;
#else
mat_load(*this);
mat_scale(scale, scale, scale);
mat_store(*this);
#endif
} }
@ -102,11 +112,22 @@ CMatrix Invert(const CMatrix &matrix);
CMatrix operator*(const CMatrix &m1, const CMatrix &m2); CMatrix operator*(const CMatrix &m1, const CMatrix &m2);
inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec) inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec)
{ {
#ifndef DC_SH4
CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz); CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz);
return CVector( return CVector(
mat.rx * v.x + mat.ry * v.y + mat.rz * v.z, mat.rx * v.x + mat.ry * v.y + mat.rz * v.z,
mat.fx * v.x + mat.fy * v.y + mat.fz * v.z, mat.fx * v.x + mat.fy * v.y + mat.fz * v.z,
mat.ux * v.x + mat.uy * v.y + mat.uz * v.z); mat.ux * v.x + mat.uy * v.y + mat.uz * v.z);
#else
register float x asm(KOS_FPARG(0)) = vec.x - mat.px;
register float y asm(KOS_FPARG(1)) = vec.y - mat.py;
register float z asm(KOS_FPARG(2)) = vec.z - mat.pz;
return CVector(
fipr(x, y, z, 0.0f, mat.rx, mat.ry, mat.rz, 0.0f),
fipr(x, y, z, 0.0f, mat.fx, mat.fy, mat.fz, 0.0f),
fipr(x, y, z, 0.0f, mat.ux, mat.uy, mat.uz, 0.0f)
);
#endif
} }

View file

@ -39,10 +39,14 @@ CQuaternion::Slerp(const CQuaternion &q1, const CQuaternion &q2, float theta, fl
void void
CQuaternion::Multiply(const CQuaternion &q1, const CQuaternion &q2) CQuaternion::Multiply(const CQuaternion &q1, const CQuaternion &q2)
{ {
#ifndef DC_SH4
x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w); x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w); y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w); z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z); w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
#else
quat_mult(*this, q1, q2);
#endif
} }
void void
@ -51,9 +55,16 @@ CQuaternion::Get(RwV3d *axis, float *angle)
*angle = Acos(w); *angle = Acos(w);
float s = Sin(*angle); float s = Sin(*angle);
#ifndef DC_SH4
axis->x = x * (1.0f / s); axis->x = x * (1.0f / s);
axis->y = y * (1.0f / s); axis->y = y * (1.0f / s);
axis->z = z * (1.0f / s); axis->z = z * (1.0f / s);
#else
float invS = dc::Invert<true, false>(s);
axis->x = x * invS;
axis->y = y * invS;
axis->z = z * invS;
#endif
} }
void void
@ -104,7 +115,7 @@ CQuaternion::Set(const RwMatrix &matrix)
if (f >= 0.0f) { if (f >= 0.0f) {
s = Sqrt(f + 1.0f); s = Sqrt(f + 1.0f);
w = 0.5f * s; w = 0.5f * s;
m = 0.5f / s; m = Div<true, false>(0.5f, s);
x = (matrix.up.z - matrix.at.y) * m; x = (matrix.up.z - matrix.at.y) * m;
y = (matrix.at.x - matrix.right.z) * m; y = (matrix.at.x - matrix.right.z) * m;
z = (matrix.right.y - matrix.up.x) * m; z = (matrix.right.y - matrix.up.x) * m;
@ -115,7 +126,7 @@ CQuaternion::Set(const RwMatrix &matrix)
if (f >= 0.0f) { if (f >= 0.0f) {
s = Sqrt(f + 1.0f); s = Sqrt(f + 1.0f);
x = 0.5f * s; x = 0.5f * s;
m = 0.5f / s; m = Div<true, false>(0.5f, s);
y = (matrix.up.x + matrix.right.y) * m; y = (matrix.up.x + matrix.right.y) * m;
z = (matrix.at.x + matrix.right.z) * m; z = (matrix.at.x + matrix.right.z) * m;
w = (matrix.up.z - matrix.at.y) * m; w = (matrix.up.z - matrix.at.y) * m;
@ -126,7 +137,7 @@ CQuaternion::Set(const RwMatrix &matrix)
if (f >= 0.0f) { if (f >= 0.0f) {
s = Sqrt(f + 1.0f); s = Sqrt(f + 1.0f);
y = 0.5f * s; y = 0.5f * s;
m = 0.5f / s; m = Div<true, false>(0.5f, s);
w = (matrix.at.x - matrix.right.z) * m; w = (matrix.at.x - matrix.right.z) * m;
x = (matrix.up.x - matrix.right.y) * m; x = (matrix.up.x - matrix.right.y) * m;
z = (matrix.at.y + matrix.up.z) * m; z = (matrix.at.y + matrix.up.z) * m;

View file

@ -1,6 +1,7 @@
#pragma once #pragma once
#include "src/common_defines.h" #include "src/common_defines.h"
#include "rwdc_common.h"
// TODO: actually implement this // TODO: actually implement this
class CQuaternion class CQuaternion
@ -10,6 +11,10 @@ public:
CQuaternion(void) {} CQuaternion(void) {}
CQuaternion(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {} CQuaternion(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}
operator quaternion_t *() { return reinterpret_cast<quaternion_t *>(this); }
operator const quaternion_t *() const { return reinterpret_cast<const quaternion_t *>(this); }
operator quaternion_t &() { return *reinterpret_cast<quaternion_t *>(this); }
operator const quaternion_t &() const { return *reinterpret_cast<const quaternion_t *>(this); }
float Magnitude(void) const { float Magnitude(void) const {
#ifndef DC_SH4 #ifndef DC_SH4
return Sqrt(x*x + y*y + z*z + w*w); return Sqrt(x*x + y*y + z*z + w*w);
@ -57,10 +62,11 @@ public:
} }
const CQuaternion &operator/=(float right) { const CQuaternion &operator/=(float right) {
x /= right; right = dc::Invert(right);
y /= right; x *= right;
z /= right; y *= right;
w /= right; z *= right;
w *= right;
return *this; return *this;
} }

View file

@ -50,9 +50,18 @@ Multiply3x3(const CMatrix &mat, const CVector &vec)
CVector CVector
Multiply3x3(const CVector &vec, const CMatrix &mat) Multiply3x3(const CVector &vec, const CMatrix &mat)
{ {
#ifndef DC_SH4
return CVector(mat.rx * vec.x + mat.ry * vec.y + mat.rz * vec.z, return CVector(mat.rx * vec.x + mat.ry * vec.y + mat.rz * vec.z,
mat.fx * vec.x + mat.fy * vec.y + mat.fz * vec.z, mat.fx * vec.x + mat.fy * vec.y + mat.fz * vec.z,
mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z); mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z);
#else
CVector out;
mat_load(mat);
mat_transpose();
mat_trans_vec3_nomod(vec.x, vec.y, vec.z,
out.x, out.y, out.z);
return out;
#endif
} }
CVector CVector

View file

@ -44,7 +44,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const CV
sqc2 vf06,0x0(%0)\n\ sqc2 vf06,0x0(%0)\n\
": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
#elif defined(DC_SH4) #elif defined(DC_SH4)
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat))); mat_load(mat);
mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
#else #else
out = mat * in; out = mat * in;
@ -71,7 +71,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw
sqc2 vf06,0x0(%0)\n\ sqc2 vf06,0x0(%0)\n\
": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
#elif defined(DC_SH4) #elif defined(DC_SH4)
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat))); mat_load(mat);
mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
#else #else
out = mat * in; out = mat * in;
@ -108,7 +108,7 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat,
bnez %1,1b\n\ bnez %1,1b\n\
": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory"); ": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory");
#elif defined(DC_SH4) #elif defined(DC_SH4)
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat))); mat_load(mat);
while(n--) { while(n--) {
mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z); mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z);
in = reinterpret_cast<const RwV3d *>(reinterpret_cast<const uint8_t *>(in) + stride); in = reinterpret_cast<const RwV3d *>(reinterpret_cast<const uint8_t *>(in) + stride);

View file

@ -1,73 +1,6 @@
#pragma once #pragma once
#include "src/common_defines.h" #include "src/common_defines.h"
#include "rwdc_common.h"
#include <tuple> using namespace dc;
#include <dc/matrix.h>
#ifdef DC_SH4
#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
register float __x __asm__("fr12") = (x); \
register float __y __asm__("fr13") = (y); \
register float __z __asm__("fr14") = (z); \
register float __w __asm__("fr15") = 1.0f; \
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
} while(false)
#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \
register float __x __asm__("fr12") = (x); \
register float __y __asm__("fr13") = (y); \
register float __z __asm__("fr14") = (z); \
register float __w __asm__("fr15") = 1.0f; \
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
w = __w; \
} while(false)
__always_inline float Fmac(float a, float b, float c) {
asm volatile ("fmac fr0, %[floatb], %[floatc]\n"
: [floatc] "+f" (c) : "w" (a), [floatb] "f" (b) : );
return c;
}
#else
#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
vector_t tmp = { x_, y_, z_, 1.0f }; \
mat_transform(&tmp, &tmp, 1, 0); \
x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; w2 = tmp.w; \
} while(false)
#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \
vector_t tmp = { x_, y_, z_, 1.0f }; \
mat_transform(&tmp, &tmp, 1, 0); \
w_ = tmp.w; \
} while(false)
__always_inline float Fmac(float a, float b, float c) { return a * b + c; }
#endif
__always_inline float Sin(float x) { return __builtin_sinf(x); }
__always_inline float Cos(float x) { return __builtin_cosf(x); }
__always_inline auto SinCos(float x) { return std::pair { Sin(x), Cos(x) }; }
__always_inline float Tan(float x) { return __builtin_tanf(x); }
__always_inline float Abs(float x) { return __builtin_fabsf(x); }
__always_inline float Sqrt(float x) { return __builtin_sqrtf(x); }
__always_inline float RecipSqrt(float x) { return 1.0f / __builtin_sqrtf(x); }
__always_inline float Asin(float x) { return __builtin_asinf(x); }
__always_inline float Acos(float x) { return __builtin_acosf(x); }
__always_inline float Atan(float x) { return __builtin_atanf(x); }
__always_inline float Atan2(float y, float x) { return __builtin_atan2f(y, x); }
__always_inline float RecipSqrt(float x, float y) { return x / __builtin_sqrtf(y); /*y = RecipSqrt(y); return x * y * y;*/ }
__always_inline float Pow(float x, float y) { return __builtin_powf(x, y); }
__always_inline float Floor(float x) { return __builtin_floorf(x); }
__always_inline float Ceil(float x) { return __builtin_ceilf(x); }
__always_inline float Invert(float x) { return (((x) < 0.0f)? -1.0f : 1.0f) * RecipSqrt((x) * (x)); }
__always_inline float Div(float x, float y) { return x * Invert(y); }
__always_inline float Lerp(float a, float b, float t) { return Fmac(t, (b - a), a); }

View file

@ -85,8 +85,6 @@
#define rwVENDORID_ROCKSTAR 0x0253F2 #define rwVENDORID_ROCKSTAR 0x0253F2
#define Max(a,b) ((a) > (b) ? (a) : (b))
#define Min(a,b) ((a) < (b) ? (a) : (b))
// Use this to add const that wasn't there in the original code // Use this to add const that wasn't there in the original code
#define Const const #define Const const
@ -303,12 +301,8 @@ extern int strncasecmp(const char *str1, const char *str2, size_t len);
extern wchar *AllocUnicode(const char*src); extern wchar *AllocUnicode(const char*src);
#define Clamp(v, low, high) ((v)<(low) ? (low) : (v)>(high) ? (high) : (v))
#define Clamp2(v, center, radius) ((v) > (center) ? Min(v, center + radius) : Max(v, center - radius))
inline float sq(float x) { return x*x; }
#define SQR(x) ((x) * (x)) #define SQR(x) ((x) * (x))
__always_inline auto sq(auto x) { return SQR(x); }
#ifdef __MWERKS__ #ifdef __MWERKS__
#define M_E 2.71828182845904523536 // e #define M_E 2.71828182845904523536 // e
@ -326,7 +320,10 @@ inline float sq(float x) { return x*x; }
#define M_SQRT1_2 0.707106781186547524401 // 1/sqrt(2) #define M_SQRT1_2 0.707106781186547524401 // 1/sqrt(2)
#endif #endif
#define PI (float)M_PI #ifndef DC_SH4
#define F_PI M_PI
#endif
#define PI (float)F_PI
#define TWOPI (PI*2) #define TWOPI (PI*2)
#define HALFPI (PI/2) #define HALFPI (PI/2)
#define DEGTORAD(x) ((x) * PI / 180.0f) #define DEGTORAD(x) ((x) * PI / 180.0f)
@ -380,7 +377,10 @@ __inline__ void TRACE(char *f, ...) { } // this is re3 only, and so the function
#endif #endif
#endif #endif
#ifndef MASTER #ifdef assert
#undef assert
#endif
#if !defined(MASTER)
#define assert(_Expression) (void)( (!!(_Expression)) || (re3_assert(#_Expression, __FILE__, __LINE__, __FUNCTION__), 0) ) #define assert(_Expression) (void)( (!!(_Expression)) || (re3_assert(#_Expression, __FILE__, __LINE__, __FUNCTION__), 0) )
#else #else
#define assert(_Expression) (_Expression) #define assert(_Expression) (_Expression)
@ -411,12 +411,7 @@ template<int s, int t> struct check_size {
#endif #endif
#define BIT(num) (1<<(num)) #define BIT(num) (1<<(num))
#define ABS(a) (((a) < 0) ? (-(a)) : (a)) #define ABS(a) Abs(a)
#define norm(value, min, max) (((value) < (min)) ? 0 : (((value) > (max)) ? 1 : (((value) - (min)) / ((max) - (min)))))
#define Lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) )
#define STRINGIFY(x) #x
#define STR(x) STRINGIFY(x)
#define CONCAT_(x,y) x##y
#define CONCAT(x,y) CONCAT_(x,y)
// we use std::lerp now
//#define lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) )

View file

@ -4,7 +4,7 @@ CMatrix::CMatrix(CMatrix const &m)
{ {
m_attachment = nil; m_attachment = nil;
m_hasRwMatrix = false; m_hasRwMatrix = false;
*this = m; mat_copy(*this, m);
} }
CMatrix::CMatrix(RwMatrix *matrix, bool owner) CMatrix::CMatrix(RwMatrix *matrix, bool owner)
@ -75,7 +75,7 @@ CMatrix::UpdateRW(void)
void void
CMatrix::operator=(CMatrix const &rhs) CMatrix::operator=(CMatrix const &rhs)
{ {
memcpy(this, &rhs, sizeof(f)); mat_copy(*this, rhs);
if (m_attachment) if (m_attachment)
UpdateRW(); UpdateRW();
} }
@ -83,7 +83,7 @@ CMatrix::operator=(CMatrix const &rhs)
void void
CMatrix::CopyOnlyMatrix(const CMatrix &other) CMatrix::CopyOnlyMatrix(const CMatrix &other)
{ {
memcpy(this, &other, sizeof(f)); mat_copy(*this, other);
} }
CMatrix & CMatrix &
@ -358,12 +358,14 @@ CMatrix::RotateZ(float z)
void void
CMatrix::Rotate(float x, float y, float z) CMatrix::Rotate(float x, float y, float z)
{ {
float cX = Cos(x); #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
float sX = Sin(x); mat_load(reinterpret_cast<matrix_t *>(this));
float cY = Cos(y); mat_rotate(x, y, z);
float sY = Sin(y); mat_store(reinterpret_cast<matrix_t *>(this));
float cZ = Cos(z); #else
float sZ = Sin(z); auto [sX, cX] = SinCos(x);
auto [sY, cY] = SinCos(y);
auto [sZ, cZ] = SinCos(z);
float rx = this->rx; float rx = this->rx;
float ry = this->ry; float ry = this->ry;
@ -388,6 +390,20 @@ CMatrix::Rotate(float x, float y, float z)
float z2 = sZ * sY - (cZ * sX) * cY; float z2 = sZ * sY - (cZ * sX) * cY;
float z3 = cX * cY; float z3 = cX * cY;
#if !defined(DC_TEXCONV) && !defined(DC_SIM)
this->rx = fipr(x1, y1, z1, 0, rx, ry, rz, 0);
this->ry = fipr(x2, y2, z2, 0, rx, ry, rz, 0);
this->rz = fipr(x3, y3, z3, 0, rx, ry, rz, 0);
this->fx = fipr(x1, y1, z1, 0, ux, uy, uz, 0);
this->fy = fipr(x2, y2, z2, 0, ux, uy, uz, 0);
this->fz = fipr(x3, y3, z3, 0, ux, uy, uz, 0);
this->ux = fipr(x1, y1, z1, 0, ax, ay, az, 0);
this->uy = fipr(x2, y2, z2, 0, ax, ay, az, 0);
this->uz = fipr(x3, y3, z3, 0, ax, ay, az, 0);
this->px = fipr(x1, y1, z1, 0, px, py, pz, 0);
this->py = fipr(x2, y2, z2, 0, px, py, pz, 0);
this->pz = fipr(x3, y3, z3, 0, px, py, pz, 0);
#else
this->rx = x1 * rx + y1 * ry + z1 * rz; this->rx = x1 * rx + y1 * ry + z1 * rz;
this->ry = x2 * rx + y2 * ry + z2 * rz; this->ry = x2 * rx + y2 * ry + z2 * rz;
this->rz = x3 * rx + y3 * ry + z3 * rz; this->rz = x3 * rx + y3 * ry + z3 * rz;
@ -400,6 +416,8 @@ CMatrix::Rotate(float x, float y, float z)
this->px = x1 * px + y1 * py + z1 * pz; this->px = x1 * px + y1 * py + z1 * pz;
this->py = x2 * px + y2 * py + z2 * pz; this->py = x2 * px + y2 * py + z2 * pz;
this->pz = x3 * px + y3 * py + z3 * pz; this->pz = x3 * px + y3 * py + z3 * pz;
#endif
#endif
} }
CMatrix & CMatrix &
@ -429,9 +447,7 @@ operator*(const CMatrix &m1, const CMatrix &m2)
// TODO: VU0 code // TODO: VU0 code
CMatrix out; CMatrix out;
#if defined(RW_DC) #if defined(RW_DC)
mat_load(reinterpret_cast<const matrix_t *>(&m1)); mat_mult(out, m1, m2);
mat_apply(reinterpret_cast<const matrix_t *>(&m2));
mat_store(reinterpret_cast<matrix_t *>(&out));
#else #else
out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz; out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz;
out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz; out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz;

View file

@ -43,6 +43,8 @@ public:
SetScale(scale); SetScale(scale);
} }
~CMatrix(void); ~CMatrix(void);
operator matrix_t *() { return reinterpret_cast<matrix_t *>(this); }
operator const matrix_t *() const { return reinterpret_cast<const matrix_t *>(this); }
void Attach(RwMatrix *matrix, bool owner = false); void Attach(RwMatrix *matrix, bool owner = false);
void AttachRW(RwMatrix *matrix, bool owner = false); void AttachRW(RwMatrix *matrix, bool owner = false);
void Detach(void); void Detach(void);
@ -75,20 +77,31 @@ public:
void SetScale(float s); void SetScale(float s);
void Scale(float scale) void Scale(float scale)
{ {
#if !defined(DC_SH4) || 1 /* TODO */
for (int i = 0; i < 3; i++) for (int i = 0; i < 3; i++)
for (int j = 0; j < 3; j++) for (int j = 0; j < 3; j++)
f[i][j] *= scale; f[i][j] *= scale;
#else
mat_load(*this);
mat_scale(scale, scale, scale);
mat_store(*this);
#endif
} }
void Scale(float sx, float sy, float sz) void Scale(float sx, float sy, float sz)
{ {
#if !defined(DC_SH4) || 1 /* TODO */
for (int i = 0; i < 3; i++){ for (int i = 0; i < 3; i++){
f[i][0] *= sx; f[i][0] *= sx;
f[i][1] *= sy; f[i][1] *= sy;
f[i][2] *= sz; f[i][2] *= sz;
} }
#else
mat_load(*this);
mat_scale(sx, sy, sz);
mat_store(*this);
#endif
} }
void SetRotateXOnly(float angle); void SetRotateXOnly(float angle);
void SetRotateYOnly(float angle); void SetRotateYOnly(float angle);
void SetRotateZOnly(float angle); void SetRotateZOnly(float angle);
@ -125,11 +138,22 @@ CMatrix Invert(const CMatrix &matrix);
CMatrix operator*(const CMatrix &m1, const CMatrix &m2); CMatrix operator*(const CMatrix &m1, const CMatrix &m2);
inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec) inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec)
{ {
#ifndef DC_SH4
CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz); CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz);
return CVector( return CVector(
mat.rx * v.x + mat.ry * v.y + mat.rz * v.z, mat.rx * v.x + mat.ry * v.y + mat.rz * v.z,
mat.fx * v.x + mat.fy * v.y + mat.fz * v.z, mat.fx * v.x + mat.fy * v.y + mat.fz * v.z,
mat.ux * v.x + mat.uy * v.y + mat.uz * v.z); mat.ux * v.x + mat.uy * v.y + mat.uz * v.z);
#else
register float x asm(KOS_FPARG(0)) = vec.x - mat.px;
register float y asm(KOS_FPARG(1)) = vec.y - mat.py;
register float z asm(KOS_FPARG(2)) = vec.z - mat.pz;
return CVector(
fipr(x, y, z, 0.0f, mat.rx, mat.ry, mat.rz, 0.0f),
fipr(x, y, z, 0.0f, mat.fx, mat.fy, mat.fz, 0.0f),
fipr(x, y, z, 0.0f, mat.ux, mat.uy, mat.uz, 0.0f)
);
#endif
} }

View file

@ -39,10 +39,14 @@ CQuaternion::Slerp(const CQuaternion &q1, const CQuaternion &q2, float theta, fl
void void
CQuaternion::Multiply(const CQuaternion &q1, const CQuaternion &q2) CQuaternion::Multiply(const CQuaternion &q1, const CQuaternion &q2)
{ {
#ifndef DC_SH4
x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w); x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w); y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w); z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z); w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
#else
quat_mult(*this, q1, q2);
#endif
} }
void void
@ -51,9 +55,16 @@ CQuaternion::Get(RwV3d *axis, float *angle)
*angle = Acos(w); *angle = Acos(w);
float s = Sin(*angle); float s = Sin(*angle);
#ifndef DC_SH4
axis->x = x * (1.0f / s); axis->x = x * (1.0f / s);
axis->y = y * (1.0f / s); axis->y = y * (1.0f / s);
axis->z = z * (1.0f / s); axis->z = z * (1.0f / s);
#else
float invS = dc::Invert<true, false>(s);
axis->x = x * invS;
axis->y = y * invS;
axis->z = z * invS;
#endif
} }
void void
@ -104,7 +115,7 @@ CQuaternion::Set(const RwMatrix &matrix)
if (f >= 0.0f) { if (f >= 0.0f) {
s = Sqrt(f + 1.0f); s = Sqrt(f + 1.0f);
w = 0.5f * s; w = 0.5f * s;
m = 0.5f / s; m = Div<true, false>(0.5f, s);
x = (matrix.up.z - matrix.at.y) * m; x = (matrix.up.z - matrix.at.y) * m;
y = (matrix.at.x - matrix.right.z) * m; y = (matrix.at.x - matrix.right.z) * m;
z = (matrix.right.y - matrix.up.x) * m; z = (matrix.right.y - matrix.up.x) * m;
@ -115,7 +126,7 @@ CQuaternion::Set(const RwMatrix &matrix)
if (f >= 0.0f) { if (f >= 0.0f) {
s = Sqrt(f + 1.0f); s = Sqrt(f + 1.0f);
x = 0.5f * s; x = 0.5f * s;
m = 0.5f / s; m = Div<true, false>(0.5f, s);
y = (matrix.up.x + matrix.right.y) * m; y = (matrix.up.x + matrix.right.y) * m;
z = (matrix.at.x + matrix.right.z) * m; z = (matrix.at.x + matrix.right.z) * m;
w = (matrix.up.z - matrix.at.y) * m; w = (matrix.up.z - matrix.at.y) * m;
@ -126,7 +137,7 @@ CQuaternion::Set(const RwMatrix &matrix)
if (f >= 0.0f) { if (f >= 0.0f) {
s = Sqrt(f + 1.0f); s = Sqrt(f + 1.0f);
y = 0.5f * s; y = 0.5f * s;
m = 0.5f / s; m = Div<true, false>(0.5f, s);
w = (matrix.at.x - matrix.right.z) * m; w = (matrix.at.x - matrix.right.z) * m;
x = (matrix.up.x - matrix.right.y) * m; x = (matrix.up.x - matrix.right.y) * m;
z = (matrix.at.y + matrix.up.z) * m; z = (matrix.at.y + matrix.up.z) * m;

View file

@ -1,6 +1,7 @@
#pragma once #pragma once
#include "src/common_defines.h" #include "src/common_defines.h"
#include "rwdc_common.h"
// TODO: actually implement this // TODO: actually implement this
class CQuaternion class CQuaternion
@ -10,6 +11,10 @@ public:
CQuaternion(void) {} CQuaternion(void) {}
CQuaternion(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {} CQuaternion(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}
operator quaternion_t *() { return reinterpret_cast<quaternion_t *>(this); }
operator const quaternion_t *() const { return reinterpret_cast<const quaternion_t *>(this); }
operator quaternion_t &() { return *reinterpret_cast<quaternion_t *>(this); }
operator const quaternion_t &() const { return *reinterpret_cast<const quaternion_t *>(this); }
float Magnitude(void) const { float Magnitude(void) const {
#ifndef DC_SH4 #ifndef DC_SH4
return Sqrt(x*x + y*y + z*z + w*w); return Sqrt(x*x + y*y + z*z + w*w);
@ -57,10 +62,11 @@ public:
} }
const CQuaternion &operator/=(float right) { const CQuaternion &operator/=(float right) {
x /= right; right = dc::Invert(right);
y /= right; x *= right;
z /= right; y *= right;
w /= right; z *= right;
w *= right;
return *this; return *this;
} }

View file

@ -50,9 +50,18 @@ Multiply3x3(const CMatrix &mat, const CVector &vec)
CVector CVector
Multiply3x3(const CVector &vec, const CMatrix &mat) Multiply3x3(const CVector &vec, const CMatrix &mat)
{ {
#ifndef DC_SH4
return CVector(mat.rx * vec.x + mat.ry * vec.y + mat.rz * vec.z, return CVector(mat.rx * vec.x + mat.ry * vec.y + mat.rz * vec.z,
mat.fx * vec.x + mat.fy * vec.y + mat.fz * vec.z, mat.fx * vec.x + mat.fy * vec.y + mat.fz * vec.z,
mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z); mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z);
#else
CVector out;
mat_load(mat);
mat_transpose();
mat_trans_vec3_nomod(vec.x, vec.y, vec.z,
out.x, out.y, out.z);
return out;
#endif
} }
CVector CVector

View file

@ -50,7 +50,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const CV
sqc2 vf06,0x0(%0)\n\ sqc2 vf06,0x0(%0)\n\
": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
#elif defined(DC_SH4) #elif defined(DC_SH4)
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat))); mat_load(mat);
mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
#else #else
out = mat * in; out = mat * in;
@ -77,7 +77,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw
sqc2 vf06,0x0(%0)\n\ sqc2 vf06,0x0(%0)\n\
": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
#elif defined(DC_SH4) #elif defined(DC_SH4)
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat))); mat_load(mat);
mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
#else #else
out = mat * in; out = mat * in;
@ -114,7 +114,7 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat,
bnez %1,1b\n\ bnez %1,1b\n\
": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory"); ": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory");
#elif defined(DC_SH4) #elif defined(DC_SH4)
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat))); mat_load(mat);
while(n--) { while(n--) {
mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z); mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z);
in = reinterpret_cast<const RwV3d *>(reinterpret_cast<const uint8_t *>(in) + stride); in = reinterpret_cast<const RwV3d *>(reinterpret_cast<const uint8_t *>(in) + stride);
@ -137,7 +137,6 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat,
lqc2 vf03,0x10(%2)\n\ lqc2 vf03,0x10(%2)\n\
lqc2 vf04,0x20(%2)\n\ lqc2 vf04,0x20(%2)\n\
lqc2 vf05,0x30(%2)\n\ lqc2 vf05,0x30(%2)\n\
lqc2 vf01,0x0(%3)\n\
nop\n\ nop\n\
1: vmulax.xyz ACC, vf02,vf01\n\ 1: vmulax.xyz ACC, vf02,vf01\n\
vmadday.xyz ACC, vf03,vf01\n\ vmadday.xyz ACC, vf03,vf01\n\

View file

@ -1,46 +1,6 @@
#pragma once #pragma once
#include "src/common_defines.h" #include "src/common_defines.h"
#include "rwdc_common.h"
#include <dc/matrix.h> using namespace dc;
#ifdef DC_SH4
#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
register float __x __asm__("fr12") = (x); \
register float __y __asm__("fr13") = (y); \
register float __z __asm__("fr14") = (z); \
register float __w __asm__("fr15") = 1.0f; \
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
} while(false)
#else
#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
vector_t tmp = { x_, y_, z_, 1.0f }; \
mat_transform(&tmp, &tmp, 1, 0); \
x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; w2 = tmp.w; \
} while(false)
#endif
// wrapper around float versions of functions
// in gta they are in CMaths but that makes the code rather noisy
inline float Sin(float x) { return sinf(x); }
inline float Asin(float x) { return asinf(x); }
inline float Cos(float x) { return cosf(x); }
inline float Acos(float x) { return acosf(x); }
inline float Tan(float x) { return tanf(x); }
inline float Atan(float x) { return atanf(x); }
inline float Atan2(float y, float x) { return atan2f(y, x); }
inline float Abs(float x) { return fabsf(x); }
inline float Sqrt(float x) { return sqrtf(x); }
inline float RecipSqrt(float x, float y) { return x/Sqrt(y); }
inline float RecipSqrt(float x) { return RecipSqrt(1.0f, x); }
inline float Pow(float x, float y) { return powf(x, y); }
inline float Floor(float x) { return floorf(x); }
inline float Ceil(float x) { return ceilf(x); }

2
vendor/dca3-kos vendored

@ -1 +1 @@
Subproject commit 5f69d048aaf2548687c33afc7726d603a22e29b7 Subproject commit 144eaeab940246d9c84c1f14b45d9e2847705c5e

View file

@ -89,10 +89,18 @@ strncmp_ci(const char *s1, const char *s2, int n)
Quat Quat
mult(const Quat &q, const Quat &p) mult(const Quat &q, const Quat &p)
{ {
#ifndef DC_SH4
return makeQuat(q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z, return makeQuat(q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z,
q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y, q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y,
q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z, q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z,
q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x); q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x);
#else
Quat o;
dc::quat_mult(reinterpret_cast<dc::quaternion_t *>(&o),
reinterpret_cast<const dc::quaternion_t&>(q),
reinterpret_cast<const dc::quaternion_t&>(p));
return o;
#endif
} }

View file

@ -728,13 +728,6 @@ __always_inline void DCE_RenderSubmitVertexIM3D(float x, float y, float w,
/* END TA Submission Functions*/ /* END TA Submission Functions*/
#if defined(DC_TEXCONV)
void malloc_stats() { }
#endif
#if 0 #if 0
#define UNIMPL_LOG() printf("TODO: Implement %s @ %s:%d\n", __func__, __FILE__, __LINE__); #define UNIMPL_LOG() printf("TODO: Implement %s @ %s:%d\n", __func__, __FILE__, __LINE__);
#define UNIMPL_LOGV(fmt, ...) printf("TODO: Implement %s @ %s:%d " fmt "\n", __func__, __FILE__, __LINE__, __VA_ARGS__); #define UNIMPL_LOGV(fmt, ...) printf("TODO: Implement %s @ %s:%d " fmt "\n", __func__, __FILE__, __LINE__, __VA_ARGS__);

739
vendor/librw/src/dc/rwdc_common.h vendored Normal file
View file

@ -0,0 +1,739 @@
#ifndef __RWDC_COMMON_H
#define __RWDC_COMMON_H
#include <tuple>
#include <limits>
#include <cmath>
#include <type_traits>
#include <algorithm>
#include <cstring>
#ifndef RW_DC
# pragma warning(disable: 4244) // int to float
# pragma warning(disable: 4800) // int to bool
# pragma warning(disable: 4838) // narrowing conversion
# pragma warning(disable: 4996) // POSIX names
#else
# if !defined(DC_TEXCONV) && !defined(DC_SIM)
# include <kos.h>
# define DC_SH4
# define VIDEO_MODE_WIDTH static_cast<float>(vid_mode->width)
# define VIDEO_MODE_HEIGHT static_cast<float>(vid_mode->height)
# define memcpy4 memcpy
# else
# ifdef DC_TEXCONV
# define malloc_stats()
# endif
# include <dc/matrix.h>
# include <dc/pvr.h>
# include <kos/dbglog.h>
# define VIDEO_MODE_WIDTH 640.0f
# define VIDEO_MODE_HEIGHT 480.0f
# define memcpy4 memcpy
# define dcache_pref_block(a) __builtin_prefetch(a)
# define F_PI M_PI
# ifndef __always_inline
# define __always_inline inline
# endif
# endif
# ifdef PVR_TXRFMT_STRIDE
# undef PVR_TXRFMT_STRIDE
# define PVR_TXRFMT_STRIDE (1 << 25)
# endif
static_assert(PVR_TXRFMT_STRIDE == (1 << 25),
"PVR_TXRFMT_STRIDE is bugged in your KOS version");
#endif
#define F_PI_2 (F_PI * 0.5f)
#define __hot __attribute__((hot))
#define __cold __attribute__((cold))
#define STRINGIFY(x) #x
#define STR(x) STRINGIFY(x)
#define CONCAT_(x,y) x##y
#define CONCAT(x,y) CONCAT_(x,y)
namespace rw {
class Matrix;
}
namespace dc {
struct quaternion_t {
float x, y, z, w;
};
__always_inline __hot constexpr float Sin(float x) { return sinf(x); }
__always_inline __hot constexpr float Cos(float x) { return cosf(x); }
__always_inline __hot constexpr auto SinCos(float x) { return std::pair { Sin(x), Cos(x) }; }
__always_inline __hot constexpr float Abs(float x) { return fabsf(x); }
__always_inline __hot constexpr float Sqrt(float x) { return sqrtf(x); }
__always_inline __hot constexpr float RecipSqrt(float x, float y) { return x / Sqrt(y); }
__always_inline __hot constexpr float Pow(float x, float y) { return powf(x, y); }
__always_inline __hot constexpr float Floor(float x) { return floorf(x); }
__always_inline __hot constexpr float Ceil(float x) { return ceilf(x); }
__always_inline __hot constexpr float Fmac(auto a, auto b, auto c) { return a * b + c; }
__always_inline __hot constexpr float Lerp(float a, float b, float t) { return Fmac(t, (b - a), a); }
__always_inline __hot constexpr auto Max(auto a, auto b) { return ((a > b)? a : b); }
__always_inline __hot constexpr auto Min(auto a, auto b) { return ((a < b)? a : b); }
template<bool CHECK_ZERO=false>
__always_inline __hot constexpr float RecipSqrt(float x) {
if constexpr(CHECK_ZERO)
if(x == 0.0f)
return 0.0f;
return 1.0f / Sqrt(x);
}
template<typename T>
__always_inline __hot constexpr T Clamp(T v, auto low, auto high) {
return std::clamp(v, static_cast<T>(low), static_cast<T>(high));
}
__always_inline constexpr auto Clamp2(auto v, auto center, auto radius) {
return (v > center) ? Min(v, center + radius) : Max(v, center - radius);
}
template<bool FAST_APPROX=true, bool COPY_SIGN=true>
__always_inline __hot constexpr float Invert(float x) {
float value;
if(!std::is_constant_evaluated() && FAST_APPROX) {
value = RecipSqrt(x * x);
if constexpr(COPY_SIGN)
if(x < 0.0f)
value = -value;
} else value = 1.0f / x;
return value;
}
template<bool FAST_APPROX=true, bool COPY_SIGN=true>
__always_inline __hot constexpr float Div(float x, float y) {
if(FAST_APPROX && !std::is_constant_evaluated())
return x * Invert<true, COPY_SIGN>(y);
else
return x / y;
}
template<bool FAST_APPROX=false, bool COPY_SIGN=true>
__always_inline __hot constexpr auto Norm(auto value, auto min, auto max) {
auto numerator = Clamp(value, min, max) - min;
auto denominator = (max - min);
if(FAST_APPROX && !std::is_constant_evaluated())
return Div<true, COPY_SIGN>(numerator, denominator);
else
return numerator / denominator;
}
template<bool FAST_APPROX=false, bool FAST_DIV=false, bool DIV_COPY_SIGN=false>
__always_inline __hot constexpr float Tan(float x) {
if(!std::is_constant_evaluated() && FAST_APPROX) {
constexpr float pisqby4 = 2.4674011002723397f;
constexpr float adjpisqby4 = 2.471688400562703f;
constexpr float adj1minus8bypisq = 0.189759681063053f;
float xsq = x * x;
return x * Div<FAST_DIV, DIV_COPY_SIGN>(adjpisqby4 - adj1minus8bypisq * xsq,
pisqby4 - xsq);
} else
return tanf(x);
}
template<bool FAST_APPROX=false>
__always_inline __hot constexpr float Atan(float x) {
if(FAST_APPROX && !std::is_constant_evaluated()) {
constexpr float a[3] = { //
0.998418889819911f, -2.9993501171084700E-01f, 0.0869142852883849f};
float xx = x * x;
return ((a[2] * xx + a[1]) * xx + a[0]) * x;
} else return atanf(x);
}
template<bool FAST_APPROX=false>
__hot constexpr float Atan2(float y, float x) {
if(FAST_APPROX && !std::is_constant_evaluated()) {
#if 0
constexpr float halfpi_i754 = M_PI * 0.5f;
constexpr float quarterpi_i754 = M_PI * 0.25f;
// kludge to prevent 0/0 condition
float abs_y = Abs(y) + std::numeric_limits<float>::epsilon();
float absy_plus_absx = abs_y + Abs(x);
float inv_absy_plus_absx = Invert<true, true>(absy_plus_absx);
float angle = halfpi_i754 - copysignf(quarterpi_i754, x);
float r = (x - copysignf(abs_y, x)) * inv_absy_plus_absx;
angle += (0.1963f * r * r - 0.9817f) * r;
return copysignf(angle, y);
#else
// Ensure input is in [-1, +1]
bool swap = fabs(x) < fabs(y);
float atan_input = (swap ? x : y) / (swap ? y : x);
// Approximate atan
float res = Atan<true>(atan_input);
// If swapped, adjust atan output
res = swap ? (atan_input >= 0.0f ? F_PI_2 : -F_PI_2) - res : res;
// Adjust quadrants
if (x >= 0.0f && y >= 0.0f) {} // 1st quadrant
else if (x < 0.0f && y >= 0.0f) { res = F_PI + res; } // 2nd quadrant
else if (x < 0.0f && y < 0.0f) { res = -F_PI + res; } // 3rd quadrant
else if (x >= 0.0f && y < 0.0f) {} // 4th quadrant
// Store result
return res;
#endif
} else return atan2f(y, x);
}
template<bool FAST_APPROX=false>
__always_inline __hot constexpr float Asin(float x) {
if(FAST_APPROX && !std::is_constant_evaluated()) {
Atan(Div<true, false>(x, Sqrt(1.0f-(x*x))));
} else return asinf(x);
}
template<bool FAST_APPROX=false>
__always_inline __hot constexpr float Acos(float x) {
if(FAST_APPROX && !std::is_constant_evaluated()) {
return (-0.69813170079773212f * x * x - 0.87266462599716477f) * x + 1.5707963267948966f;
} else return acosf(x);
}
#ifdef RW_DC
# ifdef DC_SH4
#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
register float __x __asm__("fr12") = (x); \
register float __y __asm__("fr13") = (y); \
register float __z __asm__("fr14") = (z); \
register float __w __asm__("fr15") = 1.0f; \
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
} while(false)
#define mat_trans_nodiv_nomod_zerow(x, y, z, x2, y2, z2, w2) do { \
register float __x __asm__("fr12") = (x); \
register float __y __asm__("fr13") = (y); \
register float __z __asm__("fr14") = (z); \
register float __w __asm__("fr15") = 0.0f; \
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
} while(false)
#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \
register float __x __asm__("fr12") = (x); \
register float __y __asm__("fr13") = (y); \
register float __z __asm__("fr14") = (z); \
register float __w __asm__("fr15") = 1.0f; \
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
w = __w; \
} while(false)
#define mat_trans_vec3_nomod(x, y, z, x2, y2, z2) { \
register float __x __asm__("fr12") = (x); \
register float __y __asm__("fr13") = (y); \
register float __z __asm__("fr14") = (z); \
__asm__ __volatile__( \
"fldi0 fr15\n" \
"ftrv xmtrx, fv8\n" \
: "=f" (__x), "=f" (__y), "=f" (__z) \
: "0" (__x), "1" (__y), "2" (__z) \
: "fr15" ); \
x2 = __x; y2 = __y; z2 = __z; \
}
// no declspec naked, so can't do rts / fschg. instead compiler pads with nop?
inline __hot void mat_load_3x3(const matrix_t* mtx) {
__asm__ __volatile__ (
R"(
fschg
frchg
fmov @%[mtx]+, dr0
fldi0 fr12
fmov @%[mtx]+, dr2
fldi0 fr13
fmov @%[mtx]+, dr4
fldi0 fr3
fmov @%[mtx]+, dr6
fmov dr12, dr14
fmov @%[mtx]+, dr8
fldi0 fr7
fmov @%[mtx]+, dr10
fldi0 fr11
fschg
frchg
)"
: [mtx] "+r" (mtx)
);
}
// sets pos.w to 1
inline __hot void rw_mat_load_4x4(const rw::Matrix* mtx) {
__asm__ __volatile__ (
R"(
fschg
frchg
fmov @%[mtx]+, dr0
fmov @%[mtx]+, dr2
fmov @%[mtx]+, dr4
fmov @%[mtx]+, dr6
fmov @%[mtx]+, dr8
fmov @%[mtx]+, dr10
fmov @%[mtx]+, dr12
fmov @%[mtx]+, dr14
fldi1 fr15
fschg
frchg
)"
: [mtx] "+r" (mtx)
);
}
__hot inline void mat_transpose(void) {
asm volatile (
"frchg\n\t" // fmov for singles only works on front bank
// FR0, FR5, FR10, and FR15 are already in place
// swap FR1 and FR4
"flds FR1, FPUL\n\t"
"fmov FR4, FR1\n\t"
"fsts FPUL, FR4\n\t"
// swap FR2 and FR8
"flds FR2, FPUL\n\t"
"fmov FR8, FR2\n\t"
"fsts FPUL, FR8\n\t"
// swap FR3 and FR12
"flds FR3, FPUL\n\t"
"fmov FR12, FR3\n\t"
"fsts FPUL, FR12\n\t"
// swap FR6 and FR9
"flds FR6, FPUL\n\t"
"fmov FR9, FR6\n\t"
"fsts FPUL, FR9\n\t"
// swap FR7 and FR13
"flds FR7, FPUL\n\t"
"fmov FR13, FR7\n\t"
"fsts FPUL, FR13\n\t"
// swap FR11 and FR14
"flds FR11, FPUL\n\t"
"fmov FR14, FR11\n\t"
"fsts FPUL, FR14\n\t"
// restore XMTRX to back bank
"frchg\n"
: // no outputs
: // no inputs
: "fpul" // clobbers
);
}
__attribute__((optimize("align-functions=32")))
__hot inline void mat_copy(matrix_t *dst, const matrix_t *src) {
asm volatile(R"(
fschg
pref @%[dst]
fmov.d @%[src]+, xd0
fmov.d @%[src]+, xd2
fmov.d @%[src]+, xd4
fmov.d @%[src]+, xd6
pref @%[src]
add #32, %[dst]
fmov.d xd6, @-%[dst]
fmov.d xd4, @-%[dst]
fmov.d xd2, @-%[dst]
fmov.d xd0, @-%[dst]
add #32, %[dst]
pref @%[dst]
fmov.d @%[src]+, xd0
fmov.d @%[src]+, xd2
fmov.d @%[src]+, xd4
fmov.d @%[src]+, xd6
add #32, %[dst]
fmov.d xd6, @-%[dst]
fmov.d xd4, @-%[dst]
fmov.d xd2, @-%[dst]
fmov.d xd0, @-%[dst]
fschg
)": [dst] "+&r" (dst), [src] "+&r" (src)
:
: "memory");
}
template<bool FAST_APPROX=true>
__hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) {
if(FAST_APPROX && !std::is_constant_evaluated()) {
/*
// reorder the coefficients so that q1 stays in constant order {x,y,z,w}
// q2 then needs to be rotated after each inner product
x = (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
z = (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
*/
// keep q1 in fv4
register float q1x __asm__ ("fr4") = (q1.x);
register float q1y __asm__ ("fr5") = (q1.y);
register float q1z __asm__ ("fr6") = (q1.z);
register float q1w __asm__ ("fr7") = (q1.w);
// load q2 into fv8, use it to get the shuffled reorder into fv0
register float q2x __asm__ ("fr8") = (q2.x);
register float q2y __asm__ ("fr9") = (q2.y);
register float q2z __asm__ ("fr10") = (q2.z);
register float q2w __asm__ ("fr11") = (q2.w);
// temporary operand / result in fv0
register float t1x __asm__ ("fr0");
register float t1y __asm__ ("fr1");
register float t1z __asm__ ("fr2");
register float t1w __asm__ ("fr3");
// x = (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
t1x = q2w;
t1y = q2z;
t1z = -q2y;
t1w = q2w;
__asm__ ("\n"
" fipr fv4,fv0\n"
: "+f" (t1w)
: "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
"f" (t1x), "f" (t1y), "f" (t1z)
);
// x = t1w; try to avoid the stall by not reading the fipr result immediately
// y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
t1x = -q2z;
t1y = q2w;
t1z = q2x;
__atomic_thread_fence(1);
r->x = t1w; // get previous result
t1w = q2y;
__asm__ ("\n"
" fipr fv4,fv0\n"
: "+f" (t1w)
: "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
"f" (t1x), "f" (t1y), "f" (t1z)
);
//y = t1w;
// z = (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
t1x = q2y;
t1y = -q2x;
t1z = q2w;
__atomic_thread_fence(1);
r->y = t1w; // get previous result
t1w = q2z;
__asm__ ("\n"
" fipr fv4,fv0\n"
: "+f" (t1w)
: "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
"f" (t1x), "f" (t1y), "f" (t1z)
);
//z = t1w;
__atomic_thread_fence(1);
// w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
q2x = -q2x;
q2y = -q2y;
q2z = -q2z;
__asm__ ("\n"
" fipr fv4,fv8\n"
: "+f" (q2w)
: "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
"f" (q2x), "f" (q2y), "f" (q2z)
);
__atomic_thread_fence(1);
r->z = t1w;
__atomic_thread_fence(1);
r->w = q2w;
} else {
r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
}
}
__hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) {
unsigned int prefetch_scratch;
asm volatile (
"mov %[bmtrx], %[pref_scratch]\n\t" // (MT)
"add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
"fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE)
"pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
// back matrix
"fmov.d @%[bmtrx]+, XD0\n\t" // (LS)
"fmov.d @%[bmtrx]+, XD2\n\t"
"fmov.d @%[bmtrx]+, XD4\n\t"
"fmov.d @%[bmtrx]+, XD6\n\t"
"pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait (LS)
"fmov.d @%[bmtrx]+, XD8\n\t" // bmtrx prefetch should work for here
"fmov.d @%[bmtrx]+, XD10\n\t"
"fmov.d @%[bmtrx]+, XD12\n\t"
"mov %[fmtrx], %[pref_scratch]\n\t" // (MT)
"add #32, %[pref_scratch]\n\t" // store offset by 32 in r0 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
"fmov.d @%[bmtrx], XD14\n\t"
"pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
// front matrix
// interleave loads and matrix multiply 4x4
"fmov.d @%[fmtrx]+, DR0\n\t"
"fmov.d @%[fmtrx]+, DR2\n\t"
"fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec
"ftrv XMTRX, FV0\n\t" // (FE)
"fmov.d @%[fmtrx]+, DR6\n\t"
"fmov.d @%[fmtrx]+, DR8\n\t"
"ftrv XMTRX, FV4\n\t"
"fmov.d @%[fmtrx]+, DR10\n\t"
"fmov.d @%[fmtrx]+, DR12\n\t"
"ftrv XMTRX, FV8\n\t"
"fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles)
"fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE)
"ftrv XMTRX, FV12\n\t" // (FE)
// Save output in XF regs
"frchg\n"
: [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
: // no inputs
: "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered)
);
}
__hot inline void mat_apply_rotate_x(float x) {
x *= 10430.37835f;
asm volatile(
"ftrc %[a], fpul\n\t"
"fsca fpul, dr4\n\t"
"fldi0 fr8\n\t"
"fldi0 fr11\n\t"
"fmov fr5, fr10\n\t"
"fmov fr4, fr9\n\t"
"fneg fr9\n\t"
"ftrv xmtrx, fv8\n\t"
"fmov fr4, fr6\n\t"
"fldi0 fr7\n\t"
"fldi0 fr4\n\t"
"ftrv xmtrx, fv4\n\t"
"fschg\n\t"
"fmov dr8, xd8\n\t"
"fmov dr10, xd10\n\t"
"fmov dr4, xd4\n\t"
"fmov dr6, xd6\n\t"
"fschg\n"
:
: [a] "f"(x)
: "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
}
__hot inline void mat_apply_rotate_y(float y) {
y *= 10430.37835f;
asm volatile(
"ftrc %[a], fpul\n\t"
"fsca fpul, dr6\n\t"
"fldi0 fr9\n\t"
"fldi0 fr11\n\t"
"fmov fr6, fr8\n\t"
"fmov fr7, fr10\n\t"
"ftrv xmtrx, fv8\n\t"
"fmov fr7, fr4\n\t"
"fldi0 fr5\n\t"
"fneg fr6\n\t"
"fldi0 fr7\n\t"
"ftrv xmtrx, fv4\n\t"
"fschg\n\t"
"fmov dr8, xd8\n\t"
"fmov dr10, xd10\n\t"
"fmov dr4, xd0\n\t"
"fmov dr6, xd2\n\t"
"fschg\n"
:
: [a] "f"(y)
: "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
}
__hot inline void mat_apply_rotate_z(float z) {
z *= 10430.37835f;
asm volatile(
"ftrc %[a], fpul\n\t"
"fsca fpul, dr8\n\t"
"fldi0 fr10\n\t"
"fldi0 fr11\n\t"
"fmov fr8, fr5\n\t"
"fneg fr8\n\t"
"ftrv xmtrx, fv8\n\t"
"fmov fr9, fr4\n\t"
"fschg\n\t"
"fmov dr10, dr6\n\t"
"ftrv xmtrx, fv4\n\t"
"fmov dr8, xd4\n\t"
"fmov dr10, xd6\n\t"
"fmov dr4, xd0\n\t"
"fmov dr6, xd2\n\t"
"fschg\n"
:
: [a] "f"(z)
: "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
}
# else
# ifdef DC_TEXCONV
# define mat_transform(a, b, c, d)
# define mat_apply(a)
# define mat_load(a)
# define mat_store(a)
# define mat_identity(a)
# define pvr_fog_table_color(a,r,g,b)
# define pvr_fog_table_linear(s,e)
# endif
#define mat_trans_vec3_nomod(x_, y_, z_, x2, y2, z2) do { \
vector_t tmp = { x_, y_, z_, 0.0f }; \
mat_transform(&tmp, &tmp, 1, 0); \
x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; \
} while(false)
#define mat_trans_single3_nomod(x_, y_, z_, x2, y2, z2) do { \
vector_t tmp = { x_, y_, z_, 1.0f }; \
mat_transform(&tmp, &tmp, 1, 0); \
z2 = 1.0f / tmp.w; \
x2 = tmp.x * z2; \
y2 = tmp.y * z2; \
} while(false)
#define mat_trans_single3_nodiv_nomod(x_, y_, z_, x2, y2, z2) do { \
vector_t tmp = { x_, y_, z_, 1.0f }; \
mat_transform(&tmp, &tmp, 1, 0); \
z2 = tmp.z; \
x2 = tmp.x; \
y2 = tmp.y; \
} while(false)
#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \
mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \
} while(false)
#define mat_trans_nodiv_nomod_zerow(x_, y_, z_, x2, y2, z2, w2) do { \
vector_t tmp1233123 = { x_, y_, z_, 0.0f }; \
mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \
} while(false)
#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \
vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \
mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
w_ = tmp1233123.w; \
} while(false)
inline void mat_load_3x3(const matrix_t* mtx) {
memcpy(XMTRX, mtx, sizeof(matrix_t));
XMTRX[0][3] = 0.0f;
XMTRX[1][3] = 0.0f;
XMTRX[2][3] = 0.0f;
XMTRX[3][0] = 0.0f;
XMTRX[3][1] = 0.0f;
XMTRX[3][2] = 0.0f;
XMTRX[3][3] = 0.0f;
}
inline void rw_mat_load_4x4(const rw::Matrix* mtx) {
memcpy(XMTRX, mtx, sizeof(matrix_t));
XMTRX[3][3] = 1.0f;
}
inline void mat_transpose(void) {
matrix_t tmp;
for(unsigned i = 0; i < 4; ++i)
for(unsigned j = 0; j < 4; ++j)
tmp[j][i] = XMTRX[i][j];
mat_load(&tmp);
}
template<bool FAST_APPROX=true>
__hot inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) {
r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
}
__hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) {
mat_load(matrix1);
mat_apply(matrix2);
}
__always_inline __hot void mat_copy(matrix_t *dst, const matrix_t *src) {
mat_load(src);
mat_store(dst);
}
#endif
__hot inline void mat_mult(matrix_t *out, const matrix_t* matrix1, const matrix_t* matrix2) {
mat_load_apply(matrix1, matrix2);
mat_store(out);
}
#if 0
template<std::size_t N>
__always_inline __hot
auto vec3_dot_prod(const vec3_t &v1, vec3_t const (&v2)[N]) {
std::array<float, N> result;
#ifdef DC_SH4
register float x asm(KOS_FPARG(0)) = v1.x;
register float y asm(KOS_FPARG(1)) = v1.y;
register float z asm(KOS_FPARG(2)) = v1.z;
#else
float x = v1.x;
float y = v1.y;
float z = v1.z;
#endif
for(std::size_t v = 0; v < N; ++v)
result[v] = fipr(x, y, z, 0.0f, v2[v].x, v2[v].y, v2[v].z, 0.0f);
return result;
}
#endif
#endif
}
#endif /* RWDC_COMMON_H */

View file

@ -1,6 +1,9 @@
#pragma once #pragma once
#include "common_defines.h" #include "common_defines.h"
#ifdef RW_DC
#include "rwdc_common.h"
#endif
#ifndef RW_PS2 #ifndef RW_PS2
#include <stdint.h> #include <stdint.h>
@ -322,7 +325,13 @@ inline float32 dot(const Quat &q, const Quat &p) {
#endif #endif
} }
inline Quat scale(const Quat &q, float32 r) { return makeQuat(q.w*r, q.x*r, q.y*r, q.z*r); } inline Quat scale(const Quat &q, float32 r) { return makeQuat(q.w*r, q.x*r, q.y*r, q.z*r); }
inline float32 length(const Quat &q) { return sqrtf(q.w*q.w + q.x*q.x + q.y*q.y + q.z*q.z); } inline float32 length(const Quat &q) {
#ifndef DC_SH4
return sqrtf(q.w*q.w + q.x*q.x + q.y*q.y + q.z*q.z);
#else
return dc::Sqrt(fipr_magnitude_sqr(q.x, q.y, q.z, q.w));
#endif
}
inline Quat normalize(const Quat &q) { return scale(q, 1.0f/length(q)); } inline Quat normalize(const Quat &q) { return scale(q, 1.0f/length(q)); }
inline Quat conj(const Quat &q) { return makeQuat(q.w, -q.x, -q.y, -q.z); } inline Quat conj(const Quat &q) { return makeQuat(q.w, -q.x, -q.y, -q.z); }
Quat mult(const Quat &q, const Quat &p); Quat mult(const Quat &q, const Quat &p);