Merge branch 'falco/gainz_phase_3' into 'main'
Some checks failed
re3 premake amd64 / build (Debug, win-amd64-librw_d3d9-oal) (push) Has been cancelled
re3 conan+cmake / build-cmake (openal, glfw, macos-latest, gl3) (push) Has been cancelled
re3 premake amd64 / build (Debug, win-amd64-librw_gl3_glfw-oal) (push) Has been cancelled
re3 conan+cmake / build-cmake (openal, glfw, ubuntu-18.04, gl3) (push) Has been cancelled
re3 conan+cmake / build-cmake (openal, glfw, windows-latest, gl3) (push) Has been cancelled
re3 premake x86 / build (Debug, win-x86-librw_d3d9-mss) (push) Has been cancelled
re3 premake x86 / build (Debug, win-x86-librw_d3d9-oal) (push) Has been cancelled
re3 premake x86 / build (Debug, win-x86-librw_gl3_glfw-mss) (push) Has been cancelled
re3 premake x86 / build (Debug, win-x86-librw_gl3_glfw-oal) (push) Has been cancelled
re3 conan+cmake / build-cmake (openal, windows-latest, d3d9) (push) Has been cancelled
re3 cmake devkitA64 (Nintendo Switch) / build-nintendo-switch (push) Has been cancelled
re3 premake amd64 / build (Release, win-amd64-librw_d3d9-oal) (push) Has been cancelled
re3 premake amd64 / build (Release, win-amd64-librw_gl3_glfw-oal) (push) Has been cancelled
re3 premake x86 / build (Release, win-x86-librw_gl3_glfw-mss) (push) Has been cancelled
re3 premake x86 / build (Release, win-x86-librw_gl3_glfw-oal) (push) Has been cancelled
re3 premake x86 / build (Vanilla, win-x86-librw_d3d9-mss) (push) Has been cancelled
re3 premake x86 / build (Vanilla, win-x86-librw_d3d9-oal) (push) Has been cancelled
re3 premake x86 / build (Vanilla, win-x86-librw_gl3_glfw-mss) (push) Has been cancelled
re3 premake x86 / build (Vanilla, win-x86-librw_gl3_glfw-oal) (push) Has been cancelled
re3 premake x86 / build (Release, win-x86-librw_d3d9-mss) (push) Has been cancelled
re3 premake x86 / build (Release, win-x86-librw_d3d9-oal) (push) Has been cancelled

Liberty/Miami Perf Gainz Phase 3

See merge request skmp/dca3-game!97
This commit is contained in:
Stefanos Kornilios Mitsis Poiitidis 2025-04-26 17:00:40 +00:00
commit 1f2f270da9
28 changed files with 1216 additions and 538 deletions

View file

@ -381,6 +381,7 @@ INCLUDE = \
-I../src/liberty/skel/win \ -I../src/liberty/skel/win \
\ \
-I../vendor/librw \ -I../vendor/librw \
-I../vendor/librw/src/dc \
\ \
-I../vendor/miniLZO \ -I../vendor/miniLZO \
\ \

View file

@ -396,6 +396,7 @@ INCLUDE = \
-I../src/miami/skel/win \ -I../src/miami/skel/win \
\ \
-I../vendor/librw \ -I../vendor/librw \
-I../vendor/librw/src/dc \
\ \
-I../vendor/miniLZO \ -I../vendor/miniLZO \
\ \

View file

@ -94,23 +94,26 @@ void VmuProfiler::run() {
pvr_stats_t pvrStats; pvr_get_stats(&pvrStats); pvr_stats_t pvrStats; pvr_get_stats(&pvrStats);
uint32_t sramStats = snd_mem_available(); uint32_t sramStats = snd_mem_available();
size_t pvrAvail = pvr_mem_available(); size_t pvrAvail = pvr_mem_available();
float fps = std::accumulate(std::begin(fps_), std::end(fps_), 0.0f)
/ static_cast<float>(fpsSamples);
float sh4Mem = heapUtilization(); float sh4Mem = heapUtilization();
float pvrMem = (8_MB - pvrAvail ) / 8_MB * 100.0f; float pvrMem = (8_MB - pvrAvail ) / 8_MB * 100.0f;
float armMem = (2_MB - sramStats) / 2_MB * 100.0f; float armMem = (2_MB - sramStats) / 2_MB * 100.0f;
float vtxBuf = vertBuffUse_;
{
std::shared_lock lk(mtx_);
vmu_printf("FPS :%6.2f\n" float vtxBuf;
"SH4 :%6.2f%%\n" float fps;
"PVR :%6.2f%%\n" { /* Critical section with main thread. */
"ARM :%6.2f%%\n" std::shared_lock lk(mtx_);
"VTX :%6.2f%%", vtxBuf = vertBuffUse_;
fps, sh4Mem, pvrMem, armMem, vtxBuf); fps = std::accumulate(std::begin(fps_), std::end(fps_), 0.0f)
/ static_cast<float>(fpsSamples);
} }
vmu_printf(" FPS:%6.2f\n"
" RAM:%6.2f%%\n"
"VRAM:%6.2f%%\n"
"SRAM:%6.2f%%\n"
" VTX:%6.2f%%",
fps, sh4Mem, pvrMem, armMem, vtxBuf);
} }
#endif #endif
@ -119,15 +122,21 @@ void VmuProfiler::run() {
} }
void VmuProfiler::updateVertexBufferUsage() { void VmuProfiler::updateVertexBufferUsage() {
#ifndef DC_SH4
std::unique_lock lk(mtx_); std::unique_lock lk(mtx_);
updated_ = true; updated_ = true;
#ifdef DC_SH4 #else
vertBuffUse_ = vertexBufferUtilization();
pvr_stats_t pvrStats; pvr_stats_t pvrStats;
pvr_get_stats(&pvrStats); pvr_get_stats(&pvrStats);
fps_[fpsFrame_++] = pvrStats.frame_rate; float vtxUtil = vertexBufferUtilization();
{ /* Critical section with VMU thread. */
std::unique_lock lk(mtx_);
vertBuffUse_ = vtxUtil;
updated_ = true;
fps_[fpsFrame_++] = pvrStats.frame_rate;
}
if(fpsFrame_ >= fpsSamples) if(fpsFrame_ >= fpsSamples)
fpsFrame_ = 0; fpsFrame_ = 0;

View file

@ -1619,7 +1619,7 @@ CCollision::ProcessLineOfSight(const CColLine &line,
point.point = matrix * point.point; point.point = matrix * point.point;
point.normal = Multiply3x3(matrix, point.normal); point.normal = Multiply3x3(matrix, point.normal);
#else #else
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrix))); dc::mat_load2(matrix);
mat_trans_single3_nodiv(point.point.x, mat_trans_single3_nodiv(point.point.x,
point.point.y, point.point.y,
point.point.z); point.point.z);
@ -1798,7 +1798,7 @@ CCollision::ProcessVerticalLine(const CColLine &line,
point.point = matrix * point.point; point.point = matrix * point.point;
point.normal = Multiply3x3(matrix, point.normal); point.normal = Multiply3x3(matrix, point.normal);
#else #else
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrix))); dc::mat_load2(matrix);
mat_trans_single3_nodiv(point.point.x, mat_trans_single3_nodiv(point.point.x,
point.point.y, point.point.y,
point.point.z); point.point.z);
@ -2173,8 +2173,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
#ifndef DC_SH4 #ifndef DC_SH4
matAB *= matrixA; matAB *= matrixA;
#else #else
mat_load(reinterpret_cast<const matrix_t*>(&matAB)); dc::mat_load_apply(matAB, matrixA);
mat_apply(reinterpret_cast<const matrix_t*>(&matrixA));
#endif #endif
CColSphere bsphereAB; // bounding sphere of A in B space CColSphere bsphereAB; // bounding sphere of A in B space
@ -2246,8 +2245,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
#ifndef DC_SH4 #ifndef DC_SH4
matBA *= matrixB; matBA *= matrixB;
#else #else
mat_load(reinterpret_cast<const matrix_t*>(&matBA)); dc::mat_load_apply(matBA, matrixB);
mat_apply(reinterpret_cast<const matrix_t*>(&matrixB));
#endif #endif
for(i = 0; i < modelB.numSpheres; i++){ for(i = 0; i < modelB.numSpheres; i++){
s.radius = modelB.spheres[i].radius; s.radius = modelB.spheres[i].radius;
@ -2309,7 +2307,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
} }
#ifdef DC_SH4 #ifdef DC_SH4
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrixB))); dc::mat_load2(matrixB);
#endif #endif
for(i = 0; i < numCollisions; i++){ for(i = 0; i < numCollisions; i++){
#ifndef DC_SH4 #ifndef DC_SH4

View file

@ -82,8 +82,6 @@
#define rwVENDORID_ROCKSTAR 0x0253F2 #define rwVENDORID_ROCKSTAR 0x0253F2
__always_inline auto Max(auto a, auto b) { return ((a > b)? a : b); }
__always_inline auto Min(auto a, auto b) { return ((a < b)? a : b); }
// Use this to add const that wasn't there in the original code // Use this to add const that wasn't there in the original code
#define Const const #define Const const
@ -299,15 +297,6 @@ extern int strcasecmp(const char *str1, const char *str2);
extern wchar *AllocUnicode(const char*src); extern wchar *AllocUnicode(const char*src);
template<typename T>
__always_inline T Clamp(T v, auto low, auto high) {
return std::clamp(v, static_cast<T>(low), static_cast<T>(high));
}
__always_inline auto Clamp2(auto v, auto center, auto radius) {
return (v > center) ? Min(v, center + radius) : Max(v, center - radius);
}
#define SQR(x) ((x) * (x)) #define SQR(x) ((x) * (x))
__always_inline auto sq(auto x) { return SQR(x); } __always_inline auto sq(auto x) { return SQR(x); }
@ -418,15 +407,7 @@ template<int s, int t> struct check_size {
#endif #endif
#define BIT(num) (1<<(num)) #define BIT(num) (1<<(num))
#define ABS(a) std::abs(a) #define ABS(a) Abs(a)
__always_inline auto norm(auto value, auto min, auto max) {
return (Clamp(value, min, max) - min) / (max - min);
}
// we use std::lerp now // we use std::lerp now
//#define lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) ) //#define lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) )
#define STRINGIFY(x) #x
#define STR(x) STRINGIFY(x)
#define CONCAT_(x,y) x##y
#define CONCAT(x,y) CONCAT_(x,y)

View file

@ -4,7 +4,7 @@ CMatrix::CMatrix(CMatrix const &m)
{ {
m_attachment = nil; m_attachment = nil;
m_hasRwMatrix = false; m_hasRwMatrix = false;
*this = m; mat_copy(*this, m);
} }
CMatrix::CMatrix(RwMatrix *matrix, bool owner) CMatrix::CMatrix(RwMatrix *matrix, bool owner)
@ -75,7 +75,7 @@ CMatrix::UpdateRW(void)
void void
CMatrix::operator=(CMatrix const &rhs) CMatrix::operator=(CMatrix const &rhs)
{ {
memcpy(this, &rhs, sizeof(f)); mat_copy(*this, rhs);
if (m_attachment) if (m_attachment)
UpdateRW(); UpdateRW();
} }
@ -83,7 +83,7 @@ CMatrix::operator=(CMatrix const &rhs)
void void
CMatrix::CopyOnlyMatrix(const CMatrix &other) CMatrix::CopyOnlyMatrix(const CMatrix &other)
{ {
memcpy(this, &other, sizeof(f)); mat_copy(*this, other);
} }
CMatrix & CMatrix &
@ -277,9 +277,9 @@ void
CMatrix::RotateX(float x) CMatrix::RotateX(float x)
{ {
#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
mat_load(reinterpret_cast<matrix_t *>(this)); dc::mat_load2(reinterpret_cast<matrix_t *>(this));
mat_rotate_x(x); mat_rotate_x(x);
mat_store(reinterpret_cast<matrix_t *>(this)); dc::mat_store2(reinterpret_cast<matrix_t *>(this));
#else #else
auto [s, c] = SinCos(x); auto [s, c] = SinCos(x);
@ -307,9 +307,9 @@ void
CMatrix::RotateY(float y) CMatrix::RotateY(float y)
{ {
#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
mat_load(reinterpret_cast<matrix_t *>(this)); dc::mat_load2(reinterpret_cast<matrix_t *>(this));
mat_rotate_y(y); mat_rotate_y(y);
mat_store(reinterpret_cast<matrix_t *>(this)); dc::mat_store2(reinterpret_cast<matrix_t *>(this));
#else #else
auto [s, c] = SinCos(y); auto [s, c] = SinCos(y);
@ -337,9 +337,9 @@ void
CMatrix::RotateZ(float z) CMatrix::RotateZ(float z)
{ {
#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
mat_load(reinterpret_cast<matrix_t *>(this)); dc::mat_load2(reinterpret_cast<matrix_t *>(this));
mat_rotate_z(z); mat_rotate_z(z);
mat_store(reinterpret_cast<matrix_t *>(this)); dc::mat_store2(reinterpret_cast<matrix_t *>(this));
#else #else
auto [s, c] = SinCos(z); auto [s, c] = SinCos(z);
@ -367,9 +367,9 @@ void
CMatrix::Rotate(float x, float y, float z) CMatrix::Rotate(float x, float y, float z)
{ {
#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
mat_load(reinterpret_cast<matrix_t *>(this)); dc::mat_load2(reinterpret_cast<matrix_t *>(this));
mat_rotate(x, y, z); mat_rotate(x, y, z);
mat_store(reinterpret_cast<matrix_t *>(this)); dc::mat_store2(reinterpret_cast<matrix_t *>(this));
#else #else
auto [sX, cX] = SinCos(x); auto [sX, cX] = SinCos(x);
auto [sY, cY] = SinCos(y); auto [sY, cY] = SinCos(y);
@ -449,65 +449,13 @@ CMatrix::Reorthogonalise(void)
f = CrossProduct(u, r); f = CrossProduct(u, r);
} }
#ifdef DC_SH4
static __always_inline void MATH_Load_Matrix_Product(const matrix_t* matrix1, const matrix_t* matrix2)
{
unsigned int prefetch_scratch;
asm volatile (
"mov %[bmtrx], %[pref_scratch]\n\t" // (MT)
"add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
"fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE)
"pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
// back matrix
"fmov.d @%[bmtrx]+, XD0\n\t" // (LS)
"fmov.d @%[bmtrx]+, XD2\n\t"
"fmov.d @%[bmtrx]+, XD4\n\t"
"fmov.d @%[bmtrx]+, XD6\n\t"
"pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait (LS)
"fmov.d @%[bmtrx]+, XD8\n\t" // bmtrx prefetch should work for here
"fmov.d @%[bmtrx]+, XD10\n\t"
"fmov.d @%[bmtrx]+, XD12\n\t"
"mov %[fmtrx], %[pref_scratch]\n\t" // (MT)
"add #32, %[pref_scratch]\n\t" // store offset by 32 in r0 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
"fmov.d @%[bmtrx], XD14\n\t"
"pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
// front matrix
// interleave loads and matrix multiply 4x4
"fmov.d @%[fmtrx]+, DR0\n\t"
"fmov.d @%[fmtrx]+, DR2\n\t"
"fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec
"ftrv XMTRX, FV0\n\t" // (FE)
"fmov.d @%[fmtrx]+, DR6\n\t"
"fmov.d @%[fmtrx]+, DR8\n\t"
"ftrv XMTRX, FV4\n\t"
"fmov.d @%[fmtrx]+, DR10\n\t"
"fmov.d @%[fmtrx]+, DR12\n\t"
"ftrv XMTRX, FV8\n\t"
"fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles)
"fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE)
"ftrv XMTRX, FV12\n\t" // (FE)
// Save output in XF regs
"frchg\n"
: [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
: // no inputs
: "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered)
);
}
#endif
CMatrix CMatrix
operator*(const CMatrix &m1, const CMatrix &m2) operator*(const CMatrix &m1, const CMatrix &m2)
{ {
// TODO: VU0 code // TODO: VU0 code
CMatrix out; CMatrix out;
#if defined(RW_DC) #ifdef DC_SH4
mat_load(reinterpret_cast<const matrix_t *>(&m1)); mat_mult(out, m1, m2);
mat_apply(reinterpret_cast<const matrix_t *>(&m2));
mat_store(reinterpret_cast<matrix_t *>(&out));
#else #else
out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz; out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz;
out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz; out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz;

View file

@ -1,5 +1,7 @@
#pragma once #pragma once
#include "rwdc_common.h"
class alignas(8) CMatrix class alignas(8) CMatrix
{ {
public: public:
@ -27,6 +29,8 @@ public:
SetScale(scale); SetScale(scale);
} }
~CMatrix(void); ~CMatrix(void);
operator matrix_t *() { return reinterpret_cast<matrix_t *>(this); }
operator const matrix_t *() const { return reinterpret_cast<const matrix_t *>(this); }
void Attach(RwMatrix *matrix, bool owner = false); void Attach(RwMatrix *matrix, bool owner = false);
void AttachRW(RwMatrix *matrix, bool owner = false); void AttachRW(RwMatrix *matrix, bool owner = false);
void Detach(void); void Detach(void);
@ -102,15 +106,24 @@ CMatrix Invert(const CMatrix &matrix);
CMatrix operator*(const CMatrix &m1, const CMatrix &m2); CMatrix operator*(const CMatrix &m1, const CMatrix &m2);
inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec) inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec)
{ {
#ifndef DC_SH4
CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz); CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz);
return CVector( return CVector(
mat.rx * v.x + mat.ry * v.y + mat.rz * v.z, mat.rx * v.x + mat.ry * v.y + mat.rz * v.z,
mat.fx * v.x + mat.fy * v.y + mat.fz * v.z, mat.fx * v.x + mat.fy * v.y + mat.fz * v.z,
mat.ux * v.x + mat.uy * v.y + mat.uz * v.z); mat.ux * v.x + mat.uy * v.y + mat.uz * v.z);
#else
register float x asm(KOS_FPARG(0)) = vec.x - mat.px;
register float y asm(KOS_FPARG(1)) = vec.y - mat.py;
register float z asm(KOS_FPARG(2)) = vec.z - mat.pz;
return CVector(
fipr(x, y, z, 0.0f, mat.rx, mat.ry, mat.rz, 0.0f),
fipr(x, y, z, 0.0f, mat.fx, mat.fy, mat.fz, 0.0f),
fipr(x, y, z, 0.0f, mat.ux, mat.uy, mat.uz, 0.0f)
);
#endif
} }
class CCompressedMatrixNotAligned class CCompressedMatrixNotAligned
{ {
CVector m_vecPos; CVector m_vecPos;

View file

@ -39,10 +39,14 @@ CQuaternion::Slerp(const CQuaternion &q1, const CQuaternion &q2, float theta, fl
void void
CQuaternion::Multiply(const CQuaternion &q1, const CQuaternion &q2) CQuaternion::Multiply(const CQuaternion &q1, const CQuaternion &q2)
{ {
#ifndef DC_SH4
x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w); x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w); y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w); z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z); w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
#else
quat_mult(*this, q1, q2);
#endif
} }
void void
@ -51,9 +55,16 @@ CQuaternion::Get(RwV3d *axis, float *angle)
*angle = Acos(w); *angle = Acos(w);
float s = Sin(*angle); float s = Sin(*angle);
#ifndef DC_SH4
axis->x = x * (1.0f / s); axis->x = x * (1.0f / s);
axis->y = y * (1.0f / s); axis->y = y * (1.0f / s);
axis->z = z * (1.0f / s); axis->z = z * (1.0f / s);
#else
float invS = dc::Invert<true, false>(s);
axis->x = x * invS;
axis->y = y * invS;
axis->z = z * invS;
#endif
} }
void void
@ -104,7 +115,7 @@ CQuaternion::Set(const RwMatrix &matrix)
if (f >= 0.0f) { if (f >= 0.0f) {
s = Sqrt(f + 1.0f); s = Sqrt(f + 1.0f);
w = 0.5f * s; w = 0.5f * s;
m = 0.5f / s; m = Div<true, false>(0.5f, s);
x = (matrix.up.z - matrix.at.y) * m; x = (matrix.up.z - matrix.at.y) * m;
y = (matrix.at.x - matrix.right.z) * m; y = (matrix.at.x - matrix.right.z) * m;
z = (matrix.right.y - matrix.up.x) * m; z = (matrix.right.y - matrix.up.x) * m;
@ -115,7 +126,7 @@ CQuaternion::Set(const RwMatrix &matrix)
if (f >= 0.0f) { if (f >= 0.0f) {
s = Sqrt(f + 1.0f); s = Sqrt(f + 1.0f);
x = 0.5f * s; x = 0.5f * s;
m = 0.5f / s; m = Div<true, false>(0.5f, s);
y = (matrix.up.x + matrix.right.y) * m; y = (matrix.up.x + matrix.right.y) * m;
z = (matrix.at.x + matrix.right.z) * m; z = (matrix.at.x + matrix.right.z) * m;
w = (matrix.up.z - matrix.at.y) * m; w = (matrix.up.z - matrix.at.y) * m;
@ -126,7 +137,7 @@ CQuaternion::Set(const RwMatrix &matrix)
if (f >= 0.0f) { if (f >= 0.0f) {
s = Sqrt(f + 1.0f); s = Sqrt(f + 1.0f);
y = 0.5f * s; y = 0.5f * s;
m = 0.5f / s; m = Div<true, false>(0.5f, s);
w = (matrix.at.x - matrix.right.z) * m; w = (matrix.at.x - matrix.right.z) * m;
x = (matrix.up.x - matrix.right.y) * m; x = (matrix.up.x - matrix.right.y) * m;
z = (matrix.at.y + matrix.up.z) * m; z = (matrix.at.y + matrix.up.z) * m;

View file

@ -1,6 +1,7 @@
#pragma once #pragma once
#include "src/common_defines.h" #include "src/common_defines.h"
#include "rwdc_common.h"
// TODO: actually implement this // TODO: actually implement this
class CQuaternion class CQuaternion
@ -10,6 +11,10 @@ public:
CQuaternion(void) {} CQuaternion(void) {}
CQuaternion(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {} CQuaternion(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}
operator quaternion_t *() { return reinterpret_cast<quaternion_t *>(this); }
operator const quaternion_t *() const { return reinterpret_cast<const quaternion_t *>(this); }
operator quaternion_t &() { return *reinterpret_cast<quaternion_t *>(this); }
operator const quaternion_t &() const { return *reinterpret_cast<const quaternion_t *>(this); }
float Magnitude(void) const { float Magnitude(void) const {
#ifndef DC_SH4 #ifndef DC_SH4
return Sqrt(x*x + y*y + z*z + w*w); return Sqrt(x*x + y*y + z*z + w*w);
@ -57,10 +62,11 @@ public:
} }
const CQuaternion &operator/=(float right) { const CQuaternion &operator/=(float right) {
x /= right; right = dc::Invert<false>(right);
y /= right; x *= right;
z /= right; y *= right;
w /= right; z *= right;
w *= right;
return *this; return *this;
} }
@ -109,5 +115,6 @@ inline CQuaternion operator*(float left, const CQuaternion &right)
inline CQuaternion operator/(const CQuaternion &left, float right) inline CQuaternion operator/(const CQuaternion &left, float right)
{ {
return CQuaternion(left.x / right, left.y / right, left.z / right, left.w / right); right = Invert<false>(right);
return CQuaternion(left.x * right, left.y * right, left.z * right, left.w * right);
} }

View file

@ -27,32 +27,35 @@ CrossProduct(const CVector &v1, const CVector &v2)
CVector CVector
Multiply3x3(const CMatrix &mat, const CVector &vec) Multiply3x3(const CMatrix &mat, const CVector &vec)
{ {
#ifdef DC_SH4 #ifndef DC_SH4
register float __x __asm__("fr12") = vec.x;
register float __y __asm__("fr13") = vec.y;
register float __z __asm__("fr14") = vec.z;
register float __w __asm__("fr15") = 0.0f;
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
asm volatile( "ftrv xmtrx, fv12\n"
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w)
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) );
return { __x, __y, __z };
#else
// TODO: VU0 code // TODO: VU0 code
return CVector(mat.rx * vec.x + mat.fx * vec.y + mat.ux * vec.z, return CVector(mat.rx * vec.x + mat.fx * vec.y + mat.ux * vec.z,
mat.ry * vec.x + mat.fy * vec.y + mat.uy * vec.z, mat.ry * vec.x + mat.fy * vec.y + mat.uy * vec.z,
mat.rz * vec.x + mat.fz * vec.y + mat.uz * vec.z); mat.rz * vec.x + mat.fz * vec.y + mat.uz * vec.z);
#else
CVector out;
dc::mat_load2(mat);
mat_trans_normal3_nomod(vec.x, vec.y, vec.z,
out.x, out.y, out.z);
return out;
#endif #endif
} }
CVector CVector
Multiply3x3(const CVector &vec, const CMatrix &mat) Multiply3x3(const CVector &vec, const CMatrix &mat)
{ {
#ifndef DC_SH4
return CVector(mat.rx * vec.x + mat.ry * vec.y + mat.rz * vec.z, return CVector(mat.rx * vec.x + mat.ry * vec.y + mat.rz * vec.z,
mat.fx * vec.x + mat.fy * vec.y + mat.fz * vec.z, mat.fx * vec.x + mat.fy * vec.y + mat.fz * vec.z,
mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z); mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z);
#else
CVector out;
dc::mat_load2(mat);
mat_transpose();
mat_trans_normal3_nomod(vec.x, vec.y, vec.z,
out.x, out.y, out.z);
return out;
#endif
} }
CVector CVector
@ -60,7 +63,7 @@ operator*(const CMatrix &mat, const CVector &vec)
{ {
#ifdef DC_SH4 #ifdef DC_SH4
CVector out; CVector out;
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat))); dc::mat_load2(mat);
mat_trans_single3_nodiv_nomod(vec.x, vec.y, vec.z, out.x, out.y, out.z); mat_trans_single3_nodiv_nomod(vec.x, vec.y, vec.z, out.x, out.y, out.z);
return out; return out;
#else #else

View file

@ -68,7 +68,7 @@ public:
} }
const CVector &operator/=(float right) { const CVector &operator/=(float right) {
right = Invert(right); right = Invert<false>(right);
x *= right; x *= right;
y *= right; y *= right;
z *= right; z *= right;
@ -112,7 +112,8 @@ inline CVector operator*(float left, const CVector &right)
inline CVector operator/(const CVector &left, float right) inline CVector operator/(const CVector &left, float right)
{ {
return CVector(left.x / right, left.y / right, left.z / right); right = Invert<false>(right);
return CVector(left.x * right, left.y * right, left.z * right);
} }
__always_inline float __always_inline float

View file

@ -44,7 +44,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const CV
sqc2 vf06,0x0(%0)\n\ sqc2 vf06,0x0(%0)\n\
": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
#elif defined(DC_SH4) #elif defined(DC_SH4)
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat))); dc::mat_load2(mat);
mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
#else #else
out = mat * in; out = mat * in;
@ -71,7 +71,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw
sqc2 vf06,0x0(%0)\n\ sqc2 vf06,0x0(%0)\n\
": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
#elif defined(DC_SH4) #elif defined(DC_SH4)
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat))); dc::mat_load2(mat);
mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
#else #else
out = mat * in; out = mat * in;
@ -108,7 +108,7 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat,
bnez %1,1b\n\ bnez %1,1b\n\
": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory"); ": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory");
#elif defined(DC_SH4) #elif defined(DC_SH4)
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat))); dc::mat_load2(mat);
while(n--) { while(n--) {
mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z); mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z);
in = reinterpret_cast<const RwV3d *>(reinterpret_cast<const uint8_t *>(in) + stride); in = reinterpret_cast<const RwV3d *>(reinterpret_cast<const uint8_t *>(in) + stride);

View file

@ -1,73 +1,6 @@
#pragma once #pragma once
#include "src/common_defines.h" #include "src/common_defines.h"
#include "rwdc_common.h"
#include <tuple> using namespace dc;
#include <dc/matrix.h>
#ifdef DC_SH4
#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
register float __x __asm__("fr12") = (x); \
register float __y __asm__("fr13") = (y); \
register float __z __asm__("fr14") = (z); \
register float __w __asm__("fr15") = 1.0f; \
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
} while(false)
#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \
register float __x __asm__("fr12") = (x); \
register float __y __asm__("fr13") = (y); \
register float __z __asm__("fr14") = (z); \
register float __w __asm__("fr15") = 1.0f; \
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
w = __w; \
} while(false)
__always_inline float Fmac(float a, float b, float c) {
asm volatile ("fmac fr0, %[floatb], %[floatc]\n"
: [floatc] "+f" (c) : "w" (a), [floatb] "f" (b) : );
return c;
}
#else
#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
vector_t tmp = { x_, y_, z_, 1.0f }; \
mat_transform(&tmp, &tmp, 1, 0); \
x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; w2 = tmp.w; \
} while(false)
#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \
vector_t tmp = { x_, y_, z_, 1.0f }; \
mat_transform(&tmp, &tmp, 1, 0); \
w_ = tmp.w; \
} while(false)
__always_inline float Fmac(float a, float b, float c) { return a * b + c; }
#endif
__always_inline float Sin(float x) { return __builtin_sinf(x); }
__always_inline float Cos(float x) { return __builtin_cosf(x); }
__always_inline auto SinCos(float x) { return std::pair { Sin(x), Cos(x) }; }
__always_inline float Tan(float x) { return __builtin_tanf(x); }
__always_inline float Abs(float x) { return __builtin_fabsf(x); }
__always_inline float Sqrt(float x) { return __builtin_sqrtf(x); }
__always_inline float RecipSqrt(float x) { return 1.0f / __builtin_sqrtf(x); }
__always_inline float Asin(float x) { return __builtin_asinf(x); }
__always_inline float Acos(float x) { return __builtin_acosf(x); }
__always_inline float Atan(float x) { return __builtin_atanf(x); }
__always_inline float Atan2(float y, float x) { return __builtin_atan2f(y, x); }
__always_inline float RecipSqrt(float x, float y) { return x / __builtin_sqrtf(y); /*y = RecipSqrt(y); return x * y * y;*/ }
__always_inline float Pow(float x, float y) { return __builtin_powf(x, y); }
__always_inline float Floor(float x) { return __builtin_floorf(x); }
__always_inline float Ceil(float x) { return __builtin_ceilf(x); }
__always_inline float Invert(float x) { return (((x) < 0.0f)? -1.0f : 1.0f) * RecipSqrt((x) * (x)); }
__always_inline float Div(float x, float y) { return x * Invert(y); }
__always_inline float Lerp(float a, float b, float t) { return Fmac(t, (b - a), a); }

View file

@ -1473,7 +1473,7 @@ CCollision::ProcessLineOfSight(const CColLine &line,
point.point = matrix * point.point; point.point = matrix * point.point;
point.normal = Multiply3x3(matrix, point.normal); point.normal = Multiply3x3(matrix, point.normal);
#else #else
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrix))); dc::mat_load2(matrix);
mat_trans_single3_nodiv(point.point.x, mat_trans_single3_nodiv(point.point.x,
point.point.y, point.point.y,
point.point.z); point.point.z);
@ -1653,7 +1653,7 @@ CCollision::ProcessVerticalLine(const CColLine &line,
point.point = matrix * point.point; point.point = matrix * point.point;
point.normal = Multiply3x3(matrix, point.normal); point.normal = Multiply3x3(matrix, point.normal);
#else #else
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrix))); dc::mat_load2(matrix);
mat_trans_single3_nodiv(point.point.x, mat_trans_single3_nodiv(point.point.x,
point.point.y, point.point.y,
point.point.z); point.point.z);
@ -2027,8 +2027,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
#ifndef DC_SH4 #ifndef DC_SH4
matAB *= matrixA; matAB *= matrixA;
#else #else
mat_load(reinterpret_cast<const matrix_t*>(&matAB)); dc::mat_load_apply(matAB, matrixA);
mat_apply(reinterpret_cast<const matrix_t*>(&matrixA));
#endif #endif
CColSphere bsphereAB; // bounding sphere of A in B space CColSphere bsphereAB; // bounding sphere of A in B space
@ -2099,8 +2098,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
#ifndef DC_SH4 #ifndef DC_SH4
matBA *= matrixB; matBA *= matrixB;
#else #else
mat_load(reinterpret_cast<const matrix_t*>(&matBA)); dc::mat_load_apply(matBA, matrixB);
mat_apply(reinterpret_cast<const matrix_t*>(&matrixB));
#endif #endif
for(i = 0; i < modelB.numSpheres; i++){ for(i = 0; i < modelB.numSpheres; i++){
s.radius = modelB.spheres[i].radius; s.radius = modelB.spheres[i].radius;
@ -2162,7 +2160,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
} }
#ifdef DC_SH4 #ifdef DC_SH4
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrixB))); dc::mat_load2(matrixB);
#endif #endif
for(i = 0; i < numCollisions; i++){ for(i = 0; i < numCollisions; i++){
#ifndef DC_SH4 #ifndef DC_SH4

View file

@ -85,8 +85,6 @@
#define rwVENDORID_ROCKSTAR 0x0253F2 #define rwVENDORID_ROCKSTAR 0x0253F2
#define Max(a,b) ((a) > (b) ? (a) : (b))
#define Min(a,b) ((a) < (b) ? (a) : (b))
// Use this to add const that wasn't there in the original code // Use this to add const that wasn't there in the original code
#define Const const #define Const const
@ -303,12 +301,8 @@ extern int strncasecmp(const char *str1, const char *str2, size_t len);
extern wchar *AllocUnicode(const char*src); extern wchar *AllocUnicode(const char*src);
#define Clamp(v, low, high) ((v)<(low) ? (low) : (v)>(high) ? (high) : (v))
#define Clamp2(v, center, radius) ((v) > (center) ? Min(v, center + radius) : Max(v, center - radius))
inline float sq(float x) { return x*x; }
#define SQR(x) ((x) * (x)) #define SQR(x) ((x) * (x))
__always_inline auto sq(auto x) { return SQR(x); }
#ifdef __MWERKS__ #ifdef __MWERKS__
#define M_E 2.71828182845904523536 // e #define M_E 2.71828182845904523536 // e
@ -326,7 +320,10 @@ inline float sq(float x) { return x*x; }
#define M_SQRT1_2 0.707106781186547524401 // 1/sqrt(2) #define M_SQRT1_2 0.707106781186547524401 // 1/sqrt(2)
#endif #endif
#define PI (float)M_PI #ifndef DC_SH4
#define F_PI M_PI
#endif
#define PI (float)F_PI
#define TWOPI (PI*2) #define TWOPI (PI*2)
#define HALFPI (PI/2) #define HALFPI (PI/2)
#define DEGTORAD(x) ((x) * PI / 180.0f) #define DEGTORAD(x) ((x) * PI / 180.0f)
@ -380,7 +377,10 @@ __inline__ void TRACE(char *f, ...) { } // this is re3 only, and so the function
#endif #endif
#endif #endif
#ifndef MASTER #ifdef assert
#undef assert
#endif
#if !defined(MASTER)
#define assert(_Expression) (void)( (!!(_Expression)) || (re3_assert(#_Expression, __FILE__, __LINE__, __FUNCTION__), 0) ) #define assert(_Expression) (void)( (!!(_Expression)) || (re3_assert(#_Expression, __FILE__, __LINE__, __FUNCTION__), 0) )
#else #else
#define assert(_Expression) (_Expression) #define assert(_Expression) (_Expression)
@ -411,12 +411,7 @@ template<int s, int t> struct check_size {
#endif #endif
#define BIT(num) (1<<(num)) #define BIT(num) (1<<(num))
#define ABS(a) (((a) < 0) ? (-(a)) : (a)) #define ABS(a) Abs(a)
#define norm(value, min, max) (((value) < (min)) ? 0 : (((value) > (max)) ? 1 : (((value) - (min)) / ((max) - (min)))))
#define Lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) )
#define STRINGIFY(x) #x
#define STR(x) STRINGIFY(x)
#define CONCAT_(x,y) x##y
#define CONCAT(x,y) CONCAT_(x,y)
// we use std::lerp now
//#define lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) )

View file

@ -4,7 +4,7 @@ CMatrix::CMatrix(CMatrix const &m)
{ {
m_attachment = nil; m_attachment = nil;
m_hasRwMatrix = false; m_hasRwMatrix = false;
*this = m; mat_copy(*this, m);
} }
CMatrix::CMatrix(RwMatrix *matrix, bool owner) CMatrix::CMatrix(RwMatrix *matrix, bool owner)
@ -75,7 +75,7 @@ CMatrix::UpdateRW(void)
void void
CMatrix::operator=(CMatrix const &rhs) CMatrix::operator=(CMatrix const &rhs)
{ {
memcpy(this, &rhs, sizeof(f)); mat_copy(*this, rhs);
if (m_attachment) if (m_attachment)
UpdateRW(); UpdateRW();
} }
@ -83,7 +83,7 @@ CMatrix::operator=(CMatrix const &rhs)
void void
CMatrix::CopyOnlyMatrix(const CMatrix &other) CMatrix::CopyOnlyMatrix(const CMatrix &other)
{ {
memcpy(this, &other, sizeof(f)); mat_copy(*this, other);
} }
CMatrix & CMatrix &
@ -358,12 +358,14 @@ CMatrix::RotateZ(float z)
void void
CMatrix::Rotate(float x, float y, float z) CMatrix::Rotate(float x, float y, float z)
{ {
float cX = Cos(x); #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
float sX = Sin(x); dc::mat_load2(reinterpret_cast<matrix_t *>(this));
float cY = Cos(y); mat_rotate(x, y, z);
float sY = Sin(y); dc::mat_store2(reinterpret_cast<matrix_t *>(this));
float cZ = Cos(z); #else
float sZ = Sin(z); auto [sX, cX] = SinCos(x);
auto [sY, cY] = SinCos(y);
auto [sZ, cZ] = SinCos(z);
float rx = this->rx; float rx = this->rx;
float ry = this->ry; float ry = this->ry;
@ -388,6 +390,20 @@ CMatrix::Rotate(float x, float y, float z)
float z2 = sZ * sY - (cZ * sX) * cY; float z2 = sZ * sY - (cZ * sX) * cY;
float z3 = cX * cY; float z3 = cX * cY;
#if !defined(DC_TEXCONV) && !defined(DC_SIM)
this->rx = fipr(x1, y1, z1, 0, rx, ry, rz, 0);
this->ry = fipr(x2, y2, z2, 0, rx, ry, rz, 0);
this->rz = fipr(x3, y3, z3, 0, rx, ry, rz, 0);
this->fx = fipr(x1, y1, z1, 0, ux, uy, uz, 0);
this->fy = fipr(x2, y2, z2, 0, ux, uy, uz, 0);
this->fz = fipr(x3, y3, z3, 0, ux, uy, uz, 0);
this->ux = fipr(x1, y1, z1, 0, ax, ay, az, 0);
this->uy = fipr(x2, y2, z2, 0, ax, ay, az, 0);
this->uz = fipr(x3, y3, z3, 0, ax, ay, az, 0);
this->px = fipr(x1, y1, z1, 0, px, py, pz, 0);
this->py = fipr(x2, y2, z2, 0, px, py, pz, 0);
this->pz = fipr(x3, y3, z3, 0, px, py, pz, 0);
#else
this->rx = x1 * rx + y1 * ry + z1 * rz; this->rx = x1 * rx + y1 * ry + z1 * rz;
this->ry = x2 * rx + y2 * ry + z2 * rz; this->ry = x2 * rx + y2 * ry + z2 * rz;
this->rz = x3 * rx + y3 * ry + z3 * rz; this->rz = x3 * rx + y3 * ry + z3 * rz;
@ -400,6 +416,8 @@ CMatrix::Rotate(float x, float y, float z)
this->px = x1 * px + y1 * py + z1 * pz; this->px = x1 * px + y1 * py + z1 * pz;
this->py = x2 * px + y2 * py + z2 * pz; this->py = x2 * px + y2 * py + z2 * pz;
this->pz = x3 * px + y3 * py + z3 * pz; this->pz = x3 * px + y3 * py + z3 * pz;
#endif
#endif
} }
CMatrix & CMatrix &
@ -429,9 +447,7 @@ operator*(const CMatrix &m1, const CMatrix &m2)
// TODO: VU0 code // TODO: VU0 code
CMatrix out; CMatrix out;
#if defined(RW_DC) #if defined(RW_DC)
mat_load(reinterpret_cast<const matrix_t *>(&m1)); mat_mult(out, m1, m2);
mat_apply(reinterpret_cast<const matrix_t *>(&m2));
mat_store(reinterpret_cast<matrix_t *>(&out));
#else #else
out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz; out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz;
out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz; out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz;

View file

@ -43,6 +43,8 @@ public:
SetScale(scale); SetScale(scale);
} }
~CMatrix(void); ~CMatrix(void);
operator matrix_t *() { return reinterpret_cast<matrix_t *>(this); }
operator const matrix_t *() const { return reinterpret_cast<const matrix_t *>(this); }
void Attach(RwMatrix *matrix, bool owner = false); void Attach(RwMatrix *matrix, bool owner = false);
void AttachRW(RwMatrix *matrix, bool owner = false); void AttachRW(RwMatrix *matrix, bool owner = false);
void Detach(void); void Detach(void);
@ -82,13 +84,12 @@ public:
void Scale(float sx, float sy, float sz) void Scale(float sx, float sy, float sz)
{ {
for (int i = 0; i < 3; i++){ for (int i = 0; i < 3; i++){
f[i][0] *= sx; f[i][0] *= sx;
f[i][1] *= sy; f[i][1] *= sy;
f[i][2] *= sz; f[i][2] *= sz;
} }
} }
void SetRotateXOnly(float angle); void SetRotateXOnly(float angle);
void SetRotateYOnly(float angle); void SetRotateYOnly(float angle);
void SetRotateZOnly(float angle); void SetRotateZOnly(float angle);
@ -125,11 +126,22 @@ CMatrix Invert(const CMatrix &matrix);
CMatrix operator*(const CMatrix &m1, const CMatrix &m2); CMatrix operator*(const CMatrix &m1, const CMatrix &m2);
inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec) inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec)
{ {
#ifndef DC_SH4
CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz); CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz);
return CVector( return CVector(
mat.rx * v.x + mat.ry * v.y + mat.rz * v.z, mat.rx * v.x + mat.ry * v.y + mat.rz * v.z,
mat.fx * v.x + mat.fy * v.y + mat.fz * v.z, mat.fx * v.x + mat.fy * v.y + mat.fz * v.z,
mat.ux * v.x + mat.uy * v.y + mat.uz * v.z); mat.ux * v.x + mat.uy * v.y + mat.uz * v.z);
#else
register float x asm(KOS_FPARG(0)) = vec.x - mat.px;
register float y asm(KOS_FPARG(1)) = vec.y - mat.py;
register float z asm(KOS_FPARG(2)) = vec.z - mat.pz;
return CVector(
fipr(x, y, z, 0.0f, mat.rx, mat.ry, mat.rz, 0.0f),
fipr(x, y, z, 0.0f, mat.fx, mat.fy, mat.fz, 0.0f),
fipr(x, y, z, 0.0f, mat.ux, mat.uy, mat.uz, 0.0f)
);
#endif
} }

View file

@ -39,10 +39,14 @@ CQuaternion::Slerp(const CQuaternion &q1, const CQuaternion &q2, float theta, fl
void void
CQuaternion::Multiply(const CQuaternion &q1, const CQuaternion &q2) CQuaternion::Multiply(const CQuaternion &q1, const CQuaternion &q2)
{ {
#ifndef DC_SH4
x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w); x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w); y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w); z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z); w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
#else
quat_mult(*this, q1, q2);
#endif
} }
void void
@ -51,9 +55,16 @@ CQuaternion::Get(RwV3d *axis, float *angle)
*angle = Acos(w); *angle = Acos(w);
float s = Sin(*angle); float s = Sin(*angle);
#ifndef DC_SH4
axis->x = x * (1.0f / s); axis->x = x * (1.0f / s);
axis->y = y * (1.0f / s); axis->y = y * (1.0f / s);
axis->z = z * (1.0f / s); axis->z = z * (1.0f / s);
#else
float invS = dc::Invert<true, false>(s);
axis->x = x * invS;
axis->y = y * invS;
axis->z = z * invS;
#endif
} }
void void
@ -104,7 +115,7 @@ CQuaternion::Set(const RwMatrix &matrix)
if (f >= 0.0f) { if (f >= 0.0f) {
s = Sqrt(f + 1.0f); s = Sqrt(f + 1.0f);
w = 0.5f * s; w = 0.5f * s;
m = 0.5f / s; m = Div<true, false>(0.5f, s);
x = (matrix.up.z - matrix.at.y) * m; x = (matrix.up.z - matrix.at.y) * m;
y = (matrix.at.x - matrix.right.z) * m; y = (matrix.at.x - matrix.right.z) * m;
z = (matrix.right.y - matrix.up.x) * m; z = (matrix.right.y - matrix.up.x) * m;
@ -115,7 +126,7 @@ CQuaternion::Set(const RwMatrix &matrix)
if (f >= 0.0f) { if (f >= 0.0f) {
s = Sqrt(f + 1.0f); s = Sqrt(f + 1.0f);
x = 0.5f * s; x = 0.5f * s;
m = 0.5f / s; m = Div<true, false>(0.5f, s);
y = (matrix.up.x + matrix.right.y) * m; y = (matrix.up.x + matrix.right.y) * m;
z = (matrix.at.x + matrix.right.z) * m; z = (matrix.at.x + matrix.right.z) * m;
w = (matrix.up.z - matrix.at.y) * m; w = (matrix.up.z - matrix.at.y) * m;
@ -126,7 +137,7 @@ CQuaternion::Set(const RwMatrix &matrix)
if (f >= 0.0f) { if (f >= 0.0f) {
s = Sqrt(f + 1.0f); s = Sqrt(f + 1.0f);
y = 0.5f * s; y = 0.5f * s;
m = 0.5f / s; m = Div<true, false>(0.5f, s);
w = (matrix.at.x - matrix.right.z) * m; w = (matrix.at.x - matrix.right.z) * m;
x = (matrix.up.x - matrix.right.y) * m; x = (matrix.up.x - matrix.right.y) * m;
z = (matrix.at.y + matrix.up.z) * m; z = (matrix.at.y + matrix.up.z) * m;

View file

@ -1,6 +1,7 @@
#pragma once #pragma once
#include "src/common_defines.h" #include "src/common_defines.h"
#include "rwdc_common.h"
// TODO: actually implement this // TODO: actually implement this
class CQuaternion class CQuaternion
@ -10,6 +11,10 @@ public:
CQuaternion(void) {} CQuaternion(void) {}
CQuaternion(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {} CQuaternion(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}
operator quaternion_t *() { return reinterpret_cast<quaternion_t *>(this); }
operator const quaternion_t *() const { return reinterpret_cast<const quaternion_t *>(this); }
operator quaternion_t &() { return *reinterpret_cast<quaternion_t *>(this); }
operator const quaternion_t &() const { return *reinterpret_cast<const quaternion_t *>(this); }
float Magnitude(void) const { float Magnitude(void) const {
#ifndef DC_SH4 #ifndef DC_SH4
return Sqrt(x*x + y*y + z*z + w*w); return Sqrt(x*x + y*y + z*z + w*w);
@ -57,10 +62,11 @@ public:
} }
const CQuaternion &operator/=(float right) { const CQuaternion &operator/=(float right) {
x /= right; right = dc::Invert<false>(right);
y /= right; x *= right;
z /= right; y *= right;
w /= right; z *= right;
w *= right;
return *this; return *this;
} }
@ -109,5 +115,6 @@ inline CQuaternion operator*(float left, const CQuaternion &right)
inline CQuaternion operator/(const CQuaternion &left, float right) inline CQuaternion operator/(const CQuaternion &left, float right)
{ {
return CQuaternion(left.x / right, left.y / right, left.z / right, left.w / right); right = Invert<false>(right);
return CQuaternion(left.x * right, left.y * right, left.z * right, left.w * right);
} }

View file

@ -27,32 +27,35 @@ CrossProduct(const CVector &v1, const CVector &v2)
CVector CVector
Multiply3x3(const CMatrix &mat, const CVector &vec) Multiply3x3(const CMatrix &mat, const CVector &vec)
{ {
#ifdef DC_SH4 #ifndef DC_SH4
register float __x __asm__("fr12") = vec.x;
register float __y __asm__("fr13") = vec.y;
register float __z __asm__("fr14") = vec.z;
register float __w __asm__("fr15") = 0.0f;
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
asm volatile( "ftrv xmtrx, fv12\n"
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w)
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) );
return { __x, __y, __z };
#else
// TODO: VU0 code // TODO: VU0 code
return CVector(mat.rx * vec.x + mat.fx * vec.y + mat.ux * vec.z, return CVector(mat.rx * vec.x + mat.fx * vec.y + mat.ux * vec.z,
mat.ry * vec.x + mat.fy * vec.y + mat.uy * vec.z, mat.ry * vec.x + mat.fy * vec.y + mat.uy * vec.z,
mat.rz * vec.x + mat.fz * vec.y + mat.uz * vec.z); mat.rz * vec.x + mat.fz * vec.y + mat.uz * vec.z);
#else
CVector out;
dc::mat_load2(mat);
mat_trans_normal3_nomod(vec.x, vec.y, vec.z,
out.x, out.y, out.z);
return out;
#endif #endif
} }
CVector CVector
Multiply3x3(const CVector &vec, const CMatrix &mat) Multiply3x3(const CVector &vec, const CMatrix &mat)
{ {
#ifndef DC_SH4
return CVector(mat.rx * vec.x + mat.ry * vec.y + mat.rz * vec.z, return CVector(mat.rx * vec.x + mat.ry * vec.y + mat.rz * vec.z,
mat.fx * vec.x + mat.fy * vec.y + mat.fz * vec.z, mat.fx * vec.x + mat.fy * vec.y + mat.fz * vec.z,
mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z); mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z);
#else
CVector out;
dc::mat_load2(mat);
mat_transpose();
mat_trans_normal3_nomod(vec.x, vec.y, vec.z,
out.x, out.y, out.z);
return out;
#endif
} }
CVector CVector
@ -60,7 +63,7 @@ operator*(const CMatrix &mat, const CVector &vec)
{ {
#ifdef DC_SH4 #ifdef DC_SH4
CVector out; CVector out;
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat))); dc::mat_load2(mat);
mat_trans_single3_nodiv_nomod(vec.x, vec.y, vec.z, out.x, out.y, out.z); mat_trans_single3_nodiv_nomod(vec.x, vec.y, vec.z, out.x, out.y, out.z);
return out; return out;
#else #else

View file

@ -50,7 +50,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const CV
sqc2 vf06,0x0(%0)\n\ sqc2 vf06,0x0(%0)\n\
": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
#elif defined(DC_SH4) #elif defined(DC_SH4)
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat))); dc::mat_load2(mat);
mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
#else #else
out = mat * in; out = mat * in;
@ -77,7 +77,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw
sqc2 vf06,0x0(%0)\n\ sqc2 vf06,0x0(%0)\n\
": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
#elif defined(DC_SH4) #elif defined(DC_SH4)
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat))); dc::mat_load2(mat);
mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
#else #else
out = mat * in; out = mat * in;
@ -86,7 +86,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw
__always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat, const RwV3d *in, int stride) __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat, const RwV3d *in, int stride)
{ {
#ifdef GTA_PS3 #ifdef GTA_PS2
__asm__ __volatile__("\n\ __asm__ __volatile__("\n\
paddub $3,%4,$0\n\ paddub $3,%4,$0\n\
lqc2 vf02,0x0(%2)\n\ lqc2 vf02,0x0(%2)\n\
@ -114,7 +114,7 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat,
bnez %1,1b\n\ bnez %1,1b\n\
": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory"); ": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory");
#elif defined(DC_SH4) #elif defined(DC_SH4)
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat))); dc::mat_load2(mat);
while(n--) { while(n--) {
mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z); mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z);
in = reinterpret_cast<const RwV3d *>(reinterpret_cast<const uint8_t *>(in) + stride); in = reinterpret_cast<const RwV3d *>(reinterpret_cast<const uint8_t *>(in) + stride);
@ -137,7 +137,6 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat,
lqc2 vf03,0x10(%2)\n\ lqc2 vf03,0x10(%2)\n\
lqc2 vf04,0x20(%2)\n\ lqc2 vf04,0x20(%2)\n\
lqc2 vf05,0x30(%2)\n\ lqc2 vf05,0x30(%2)\n\
lqc2 vf01,0x0(%3)\n\
nop\n\ nop\n\
1: vmulax.xyz ACC, vf02,vf01\n\ 1: vmulax.xyz ACC, vf02,vf01\n\
vmadday.xyz ACC, vf03,vf01\n\ vmadday.xyz ACC, vf03,vf01\n\

View file

@ -1,46 +1,6 @@
#pragma once #pragma once
#include "src/common_defines.h" #include "src/common_defines.h"
#include "rwdc_common.h"
#include <dc/matrix.h> using namespace dc;
#ifdef DC_SH4
#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
register float __x __asm__("fr12") = (x); \
register float __y __asm__("fr13") = (y); \
register float __z __asm__("fr14") = (z); \
register float __w __asm__("fr15") = 1.0f; \
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
} while(false)
#else
#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
vector_t tmp = { x_, y_, z_, 1.0f }; \
mat_transform(&tmp, &tmp, 1, 0); \
x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; w2 = tmp.w; \
} while(false)
#endif
// wrapper around float versions of functions
// in gta they are in CMaths but that makes the code rather noisy
inline float Sin(float x) { return sinf(x); }
inline float Asin(float x) { return asinf(x); }
inline float Cos(float x) { return cosf(x); }
inline float Acos(float x) { return acosf(x); }
inline float Tan(float x) { return tanf(x); }
inline float Atan(float x) { return atanf(x); }
inline float Atan2(float y, float x) { return atan2f(y, x); }
inline float Abs(float x) { return fabsf(x); }
inline float Sqrt(float x) { return sqrtf(x); }
inline float RecipSqrt(float x, float y) { return x/Sqrt(y); }
inline float RecipSqrt(float x) { return RecipSqrt(1.0f, x); }
inline float Pow(float x, float y) { return powf(x, y); }
inline float Floor(float x) { return floorf(x); }
inline float Ceil(float x) { return ceilf(x); }

2
vendor/dca3-kos vendored

@ -1 +1 @@
Subproject commit 5f69d048aaf2548687c33afc7726d603a22e29b7 Subproject commit a2706087626795b69689f06803329c1e37bdf7e2

View file

@ -89,10 +89,18 @@ strncmp_ci(const char *s1, const char *s2, int n)
Quat Quat
mult(const Quat &q, const Quat &p) mult(const Quat &q, const Quat &p)
{ {
#ifndef DC_SH4
return makeQuat(q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z, return makeQuat(q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z,
q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y, q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y,
q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z, q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z,
q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x); q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x);
#else
Quat o;
dc::quat_mult(reinterpret_cast<dc::quaternion_t *>(&o),
reinterpret_cast<const dc::quaternion_t &>(q),
reinterpret_cast<const dc::quaternion_t &>(p));
return o;
#endif
} }

View file

@ -429,6 +429,7 @@ Camera::frustumTestSphere(const Sphere *s) const
{ {
int32 res = SPHEREINSIDE; int32 res = SPHEREINSIDE;
const FrustumPlane *p = this->frustumPlanes; const FrustumPlane *p = this->frustumPlanes;
#ifndef DC_SH4
for(int32 i = 0; i < 6; i++){ for(int32 i = 0; i < 6; i++){
float32 dist = dot(p->plane.normal, s->center) - p->plane.distance; float32 dist = dot(p->plane.normal, s->center) - p->plane.distance;
if(s->radius < dist) if(s->radius < dist)
@ -437,6 +438,114 @@ Camera::frustumTestSphere(const Sphere *s) const
res = SPHEREBOUNDARY; res = SPHEREBOUNDARY;
p++; p++;
} }
#else
__builtin_prefetch(p);
register float sx asm("fr0") = s->center.x;
register float sy asm("fr1") = s->center.y;
register float sz asm("fr2") = s->center.z;
register float sw asm("fr3") = -1.0f;
// far
register float px asm("fr4") = p->plane.normal.x;
register float py asm("fr5") = p->plane.normal.y;
register float pz asm("fr6") = p->plane.normal.z;
register float pw asm("fr7") = p->plane.distance;
asm volatile("fipr fv0, fv4"
: "+f" (pw)
: "f" (sx), "f" (sy), "f" (sz), "f" (sw),
"f" (px), "f" (py), "f" (pz));
if(s->radius < pw)
return SPHEREOUTSIDE;
else if(s->radius > -pw)
res = SPHEREBOUNDARY;
p++;
// near
px = p->plane.normal.x;
py = p->plane.normal.y;
pz = p->plane.normal.z;
pw = p->plane.distance;
asm volatile("fipr fv0, fv4"
: "+f" (pw)
: "f" (sx), "f" (sy), "f" (sz), "f" (sw),
"f" (px), "f" (py), "f" (pz));
if(s->radius < pw)
return SPHEREOUTSIDE;
if(s->radius > -pw)
res = SPHEREBOUNDARY_NEAR;
p++;
const float* base_ptr0 = &p[0].plane.normal.x;
const float* base_ptr1 = &p[1].plane.normal.x;
const float* base_ptr2 = &p[2].plane.normal.x;
const float* base_ptr3 = &p[3].plane.normal.x;
__builtin_prefetch(base_ptr0);
static_assert(offsetof (decltype (p[0].plane.normal), y)
-offsetof (decltype (p[0].plane.normal), x) == sizeof (float));
static_assert(offsetof (decltype (p[0].plane.normal), z)
-offsetof (decltype (p[0].plane.normal), y) == sizeof (float));
static_assert(offsetof (decltype (p[0].plane), distance)
-offsetof (decltype (p[0].plane.normal), z) == sizeof (float));
asm volatile (R"(
frchg
fmov.s @%0+,fr0
fmov.s @%1+,fr1
fmov.s @%2+,fr2
fmov.s @%3+,fr3
fmov.s @%0+,fr4
fmov.s @%1+,fr5
fmov.s @%2+,fr6
fmov.s @%3+,fr7
fmov.s @%0+,fr8
fmov.s @%1+,fr9
fmov.s @%2+,fr10
fmov.s @%3+,fr11
fmov.s @%0,fr12
fmov.s @%1,fr13
fmov.s @%2,fr14
fmov.s @%3,fr15
frchg
)" : "+&r" (base_ptr0), "+&r" (base_ptr1), "+&r" (base_ptr2), "+&r" (base_ptr3)
:
: );
float dists[4];
mat_trans_vec4_nodiv_nomod(sx, sy, sz, sw,
dists[0], dists[1], dists[2], dists[3]);
if(s->radius < dists[0])
return SPHEREOUTSIDE;
else if(s->radius > -dists[0])
res = SPHEREBOUNDARY;
if(s->radius < dists[1])
return SPHEREOUTSIDE;
else if(s->radius > -dists[1])
res = SPHEREBOUNDARY;
if(s->radius < dists[2])
return SPHEREOUTSIDE;
else if(s->radius > -dists[2])
res = SPHEREBOUNDARY;
if(s->radius < dists[3])
return SPHEREOUTSIDE;
else if(s->radius > -dists[3])
res = SPHEREBOUNDARY;
#endif
return res; return res;
} }
@ -445,7 +554,7 @@ Camera::frustumTestSphereNear(const Sphere *s) const
{ {
int32 res = SPHEREINSIDE; int32 res = SPHEREINSIDE;
const FrustumPlane *p = this->frustumPlanes; const FrustumPlane *p = this->frustumPlanes;
#ifndef DC_SH4
// far // far
float32 dist = dot(p->plane.normal, s->center) - p->plane.distance; float32 dist = dot(p->plane.normal, s->center) - p->plane.distance;
if(s->radius < dist) if(s->radius < dist)
@ -481,6 +590,101 @@ Camera::frustumTestSphereNear(const Sphere *s) const
return SPHEREOUTSIDE; return SPHEREOUTSIDE;
p++; p++;
#else
__builtin_prefetch(p);
register float sx asm("fr0") = s->center.x;
register float sy asm("fr1") = s->center.y;
register float sz asm("fr2") = s->center.z;
register float sw asm("fr3") = -1.0f;
// far
register float px asm("fr4") = p->plane.normal.x;
register float py asm("fr5") = p->plane.normal.y;
register float pz asm("fr6") = p->plane.normal.z;
register float pw asm("fr7") = p->plane.distance;
asm volatile("fipr fv0, fv4"
: "+f" (pw)
: "f" (sx), "f" (sy), "f" (sz), "f" (sw),
"f" (px), "f" (py), "f" (pz));
if(s->radius < pw)
return SPHEREOUTSIDE;
p++;
// near
px = p->plane.normal.x;
py = p->plane.normal.y;
pz = p->plane.normal.z;
pw = p->plane.distance;
asm volatile("fipr fv0, fv4"
: "+f" (pw)
: "f" (sx), "f" (sy), "f" (sz), "f" (sw),
"f" (px), "f" (py), "f" (pz));
if(s->radius < pw)
return SPHEREOUTSIDE;
if(s->radius > -pw)
res = SPHEREBOUNDARY_NEAR;
p++;
const float* base_ptr0 = &p[0].plane.normal.x;
const float* base_ptr1 = &p[1].plane.normal.x;
const float* base_ptr2 = &p[2].plane.normal.x;
const float* base_ptr3 = &p[3].plane.normal.x;
__builtin_prefetch(base_ptr0);
static_assert(offsetof (decltype (p[0].plane.normal), y)
-offsetof (decltype (p[0].plane.normal), x) == sizeof (float));
static_assert(offsetof (decltype (p[0].plane.normal), z)
-offsetof (decltype (p[0].plane.normal), y) == sizeof (float));
static_assert(offsetof (decltype (p[0].plane), distance)
-offsetof (decltype (p[0].plane.normal), z) == sizeof (float));
asm volatile (R"(
frchg
fmov.s @%0+,fr0
fmov.s @%1+,fr1
fmov.s @%2+,fr2
fmov.s @%3+,fr3
fmov.s @%0+,fr4
fmov.s @%1+,fr5
fmov.s @%2+,fr6
fmov.s @%3+,fr7
fmov.s @%0+,fr8
fmov.s @%1+,fr9
fmov.s @%2+,fr10
fmov.s @%3+,fr11
fmov.s @%0,fr12
fmov.s @%1,fr13
fmov.s @%2,fr14
fmov.s @%3,fr15
frchg
)" : "+&r" (base_ptr0), "+&r" (base_ptr1), "+&r" (base_ptr2), "+&r" (base_ptr3)
:
: );
float dists[4];
mat_trans_vec4_nodiv_nomod(sx, sy, sz, sw,
dists[0], dists[1], dists[2], dists[3]);
if(s->radius < dists[0])
return SPHEREOUTSIDE;
else if(s->radius < dists[1])
return SPHEREOUTSIDE;
else if(s->radius < dists[2])
return SPHEREOUTSIDE;
else if(s->radius < dists[3])
return SPHEREOUTSIDE;
#endif
return res; return res;
} }

View file

@ -37,8 +37,6 @@ extern const char* currentFile;
#include <functional> #include <functional>
#include <fstream> #include <fstream>
#define ARRAY_SIZE(array) (sizeof(array) / sizeof(array[0]))
#define errorf(...) dbglog(DBG_CRITICAL, __VA_ARGS__) #define errorf(...) dbglog(DBG_CRITICAL, __VA_ARGS__)
#define logf(...) // printf(__VA_ARGS__) #define logf(...) // printf(__VA_ARGS__)
bool re3RemoveLeastUsedModel(); bool re3RemoveLeastUsedModel();
@ -47,21 +45,13 @@ void* re3StreamingAlloc(size_t size);
// #include "rwdcimpl.h" // #include "rwdcimpl.h"
#include <dc/pvr.h>
#include <dc/matrix.h>
#include "alloc.h" #include "alloc.h"
#undef PVR_TXRFMT_STRIDE using namespace dc;
#define PVR_TXRFMT_STRIDE (1 << 25)
static_assert(PVR_TXRFMT_STRIDE == (1 << 25), "PVR_TXRFMT_STRIDE is bugged in your KOS version");
// TODO: probably needs a better place to be // TODO: probably needs a better place to be
bool doEnvironmentMaps = true; bool doEnvironmentMaps = true;
#define fclamp0_1(n) ((n) > 1.0f ? 1.0f : n < 0.0f ? 0.0f : n)
#define fclamp1(n) ((n) > 1.0f ? 1.0f : n)
struct alignas(32) pvr_vertex16_t { struct alignas(32) pvr_vertex16_t {
uint32_t flags; /**< \brief TA command (vertex flags) */ uint32_t flags; /**< \brief TA command (vertex flags) */
float x; /**< \brief X coordinate */ float x; /**< \brief X coordinate */
@ -170,177 +160,10 @@ struct alignas(32) pvr_vertex32_ut {
static_assert(sizeof(pvr_vertex16_t) == 32, "pvr_vertex16_t size mismatch"); static_assert(sizeof(pvr_vertex16_t) == 32, "pvr_vertex16_t size mismatch");
static_assert(alignof(pvr_vertex16_t) == 32, "pvr_vertex16_t alignof mismatch"); static_assert(alignof(pvr_vertex16_t) == 32, "pvr_vertex16_t alignof mismatch");
#define MATH_Fast_Invert(x) ({ (((x) < 0.0f)? -1.0f : 1.0f) * frsqrt((x) * (x)); })
static pvr_dr_state_t drState; static pvr_dr_state_t drState;
#include <kos/dbglog.h>
float VIDEO_MODE_SCALE_X; float VIDEO_MODE_SCALE_X;
#if !defined(DC_TEXCONV) && !defined(DC_SIM)
#include <kos.h>
#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
register float __x __asm__("fr12") = (x); \
register float __y __asm__("fr13") = (y); \
register float __z __asm__("fr14") = (z); \
register float __w __asm__("fr15") = 1.0f; \
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
} while(false)
#define mat_trans_nodiv_nomod_zerow(x, y, z, x2, y2, z2, w2) do { \
register float __x __asm__("fr12") = (x); \
register float __y __asm__("fr13") = (y); \
register float __z __asm__("fr14") = (z); \
register float __w __asm__("fr15") = 0.0f; \
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
} while(false)
#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \
register float __x __asm__("fr12") = (x); \
register float __y __asm__("fr13") = (y); \
register float __z __asm__("fr14") = (z); \
register float __w __asm__("fr15") = 1.0f; \
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
w = __w; \
} while(false)
// no declspec naked, so can't do rts / fschg. instead compiler pads with nop?
inline void rw_mat_load_3x3(const rw::Matrix* mtx) {
__asm__ __volatile__ (
R"(
fschg
frchg
fmov @%[mtx]+, dr0
fldi0 fr12
fldi0 fr13
fmov @%[mtx]+, dr2
fmov @%[mtx]+, dr4
fmov @%[mtx]+, dr6
fmov @%[mtx]+, dr8
fmov @%[mtx]+, dr10
fldi0 fr3
fldi0 fr7
fldi0 fr11
fmov dr12, dr14
fschg
frchg
)"
: [mtx] "+r" (mtx)
);
}
// sets pos.w to 1
inline void rw_mat_load_4x4(const rw::Matrix* mtx) {
__asm__ __volatile__ (
R"(
fschg
frchg
fmov @%[mtx]+, dr0
fmov @%[mtx]+, dr2
fmov @%[mtx]+, dr4
fmov @%[mtx]+, dr6
fmov @%[mtx]+, dr8
fmov @%[mtx]+, dr10
fmov @%[mtx]+, dr12
fmov @%[mtx]+, dr14
fldi1 fr15
fschg
frchg
)"
: [mtx] "+r" (mtx)
);
}
#else
extern matrix_t XMTRX;
void rw_mat_load_3x3(rw::Matrix* mtx) {
memcpy(XMTRX, mtx, sizeof(matrix_t));
XMTRX[0][3] = 0.0f;
XMTRX[1][3] = 0.0f;
XMTRX[2][3] = 0.0f;
XMTRX[3][0] = 0.0f;
XMTRX[3][1] = 0.0f;
XMTRX[3][2] = 0.0f;
XMTRX[3][3] = 0.0f;
}
void rw_mat_load_4x4(rw::Matrix* mtx) {
memcpy(XMTRX, mtx, sizeof(matrix_t));
XMTRX[3][3] = 1.0f;
}
#include <dc/matrix.h>
#define frsqrt(a) (1.0f/sqrt(a))
#define dcache_pref_block(a) __builtin_prefetch(a)
#ifndef __always_inline
#define __always_inline __attribute__((always_inline)) inline
#endif
#ifdef DC_TEXCONV
#define mat_transform(a, b, c, d)
#define mat_apply(a)
#define mat_load(a)
#define mat_store(a)
#define mat_identity(a)
#define pvr_fog_table_color(a,r,g,b)
#define pvr_fog_table_linear(s,e)
#define pvr_fog_table_exp(d)
#define pvr_fog_table_custom(d)
#endif
#define mat_trans_single3_nomod(x_, y_, z_, x2, y2, z2) do { \
vector_t tmp = { x_, y_, z_, 1.0f }; \
mat_transform(&tmp, &tmp, 1, 0); \
z2 = 1.0f / tmp.w; \
x2 = tmp.x * z2; \
y2 = tmp.y * z2; \
} while(false)
#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \
mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \
} while(false)
#define mat_trans_nodiv_nomod_zerow(x_, y_, z_, x2, y2, z2, w2) do { \
vector_t tmp1233123 = { x_, y_, z_, 0.0f }; \
mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \
} while(false)
#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \
vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \
mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
w_ = tmp1233123.w; \
} while(false)
#define memcpy4 memcpy
// END STUBS
#endif
static pvr_ptr_t fake_tex; static pvr_ptr_t fake_tex;
alignas(4) static const uint16_t fake_tex_data[] = { alignas(4) static const uint16_t fake_tex_data[] = {
@ -551,9 +374,9 @@ void DCE_MatrixViewport(float x, float y, float width, float height) {
void DCE_InitMatrices() { void DCE_InitMatrices() {
// Setup the screenview matrix. Only need to do once since this matrix does not need to change for single player viewpoint. // Setup the screenview matrix. Only need to do once since this matrix does not need to change for single player viewpoint.
mat_identity(); mat_identity2();
mat_store(&DCE_MAT_SCREENVIEW); mat_store2(&DCE_MAT_SCREENVIEW);
} }
} }
@ -684,7 +507,7 @@ struct atomic_context_t {
__always_inline void DCE_RenderSubmitVertex(const pvr_vertex_t *v, uint32_t flags) { __always_inline void DCE_RenderSubmitVertex(const pvr_vertex_t *v, uint32_t flags) {
auto *sq = reinterpret_cast<uint32_t *>(pvr_dr_target(drState)); auto *sq = reinterpret_cast<uint32_t *>(pvr_dr_target(drState));
auto *src = reinterpret_cast<const uint32_t *>(v); auto *src = reinterpret_cast<const uint32_t *>(v);
float sz = MATH_Fast_Invert(v->z); float sz = Invert<true, false>(v->z);
float sx = v->x * sz; float sx = v->x * sz;
float sy = v->y * sz; float sy = v->y * sz;
@ -711,7 +534,7 @@ __always_inline void DCE_RenderSubmitVertexIM3D(float x, float y, float w,
{ {
auto *sq = reinterpret_cast<uint32_t *>(pvr_dr_target(drState)); auto *sq = reinterpret_cast<uint32_t *>(pvr_dr_target(drState));
auto *uv32 = reinterpret_cast<const uint32_t *>(uv); auto *uv32 = reinterpret_cast<const uint32_t *>(uv);
float sz = MATH_Fast_Invert(w); float sz = Invert<true, false>(w);
float sx = x * sz; float sx = x * sz;
float sy = y * sz; float sy = y * sz;
@ -728,13 +551,6 @@ __always_inline void DCE_RenderSubmitVertexIM3D(float x, float y, float w,
/* END TA Submission Functions*/ /* END TA Submission Functions*/
#if defined(DC_TEXCONV)
void malloc_stats() { }
#endif
#if 0 #if 0
#define UNIMPL_LOG() printf("TODO: Implement %s @ %s:%d\n", __func__, __FILE__, __LINE__); #define UNIMPL_LOG() printf("TODO: Implement %s @ %s:%d\n", __func__, __FILE__, __LINE__);
#define UNIMPL_LOGV(fmt, ...) printf("TODO: Implement %s @ %s:%d " fmt "\n", __func__, __FILE__, __LINE__, __VA_ARGS__); #define UNIMPL_LOGV(fmt, ...) printf("TODO: Implement %s @ %s:%d " fmt "\n", __func__, __FILE__, __LINE__, __VA_ARGS__);
@ -808,9 +624,8 @@ void beginUpdate(Camera* cam) {
DCE_MatrixViewport(0, 0, cam->frameBuffer->width * VIDEO_MODE_SCALE_X, cam->frameBuffer->height); DCE_MatrixViewport(0, 0, cam->frameBuffer->width * VIDEO_MODE_SCALE_X, cam->frameBuffer->height);
mat_load((matrix_t*)&DCE_MAT_SCREENVIEW); mat_load_apply((matrix_t*)&DCE_MAT_SCREENVIEW, (matrix_t*)&cam->devProj);
mat_apply((matrix_t*)&cam->devProj); mat_store2((matrix_t*)&cam->devProjScreen);
mat_store((matrix_t*)&cam->devProjScreen);
} }
@ -1838,7 +1653,7 @@ void im2DRenderPrimitive(PrimitiveType primType, void *vertices, int32_t numVert
pvrVert->flags = flags; pvrVert->flags = flags;
pvrVert->x = gtaVert.x * VIDEO_MODE_SCALE_X; pvrVert->x = gtaVert.x * VIDEO_MODE_SCALE_X;
pvrVert->y = gtaVert.y; pvrVert->y = gtaVert.y;
pvrVert->z = MATH_Fast_Invert(gtaVert.w); // this is perfect for almost every case... pvrVert->z = Invert<true, false>(gtaVert.w); // this is perfect for almost every case...
pvrVert->u = gtaVert.u; pvrVert->u = gtaVert.u;
pvrVert->v = gtaVert.v; pvrVert->v = gtaVert.v;
pvrVert->argb = (gtaVert.a << 24) | pvrVert->argb = (gtaVert.a << 24) |
@ -1959,7 +1774,7 @@ void im2DRenderIndexedPrimitive(PrimitiveType primType, void *vertices, int32 nu
pvrVert->flags = flags; pvrVert->flags = flags;
pvrVert->x = gtaVert.x * VIDEO_MODE_SCALE_X; pvrVert->x = gtaVert.x * VIDEO_MODE_SCALE_X;
pvrVert->y = gtaVert.y; pvrVert->y = gtaVert.y;
pvrVert->z = MATH_Fast_Invert(gtaVert.w); // this is perfect for almost every case... pvrVert->z = Invert<true, false>(gtaVert.w); // this is perfect for almost every case...
pvrVert->u = gtaVert.u; pvrVert->u = gtaVert.u;
pvrVert->v = gtaVert.v; pvrVert->v = gtaVert.v;
pvrVert->argb = (gtaVert.a << 24) | pvrVert->argb = (gtaVert.a << 24) |
@ -2022,8 +1837,8 @@ void im3DTransform(void *vertices, int32 numVertices, Matrix *worldMat, uint32 f
rw::RawMatrix::mult(&worldview, &world, &cam->devView); rw::RawMatrix::mult(&worldview, &world, &cam->devView);
rw::RawMatrix::mult(&proj, &worldview, &cam->devProj); rw::RawMatrix::mult(&proj, &worldview, &cam->devProj);
rw::RawMatrix::mult(&mtx, &proj, (RawMatrix*)&DCE_MAT_SCREENVIEW); rw::RawMatrix::mult(&mtx, &proj, (RawMatrix*)&DCE_MAT_SCREENVIEW);
// mat_load(&DCE_MAT_SCREENVIEW); // ~11 cycles. // mat_load2(&DCE_MAT_SCREENVIEW); // ~11 cycles.
mat_load(( matrix_t*)&mtx.right); // Number of cycles: ~32. mat_load2(( matrix_t*)&mtx.right); // Number of cycles: ~32.
if (im3dVertices) { if (im3dVertices) {
free(im3dVertices); free(im3dVertices);
} }
@ -2117,7 +1932,7 @@ void im3DRenderIndexedPrimitive(PrimitiveType primType,
// assuming near plane is 0.0f // assuming near plane is 0.0f
// gv1 is visible (posi), and gv2 is behind the plane (negative) // gv1 is visible (posi), and gv2 is behind the plane (negative)
float t = (1.0f - gv1.position.z) * MATH_Fast_Invert(gv2.position.z - gv1.position.z); float t = (1.0f - gv1.position.z) * Invert<true, true>(gv2.position.z - gv1.position.z);
pvr_vertex_t pvrVert; pvr_vertex_t pvrVert;
@ -3395,13 +3210,13 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
auto skinningWeightData = (uint8_t*)skinWeights; auto skinningWeightData = (uint8_t*)skinWeights;
if (!matrix0Identity) { if (!matrix0Identity) {
rw_mat_load_4x4(&skinMatrices[0]); mat_load_4x4(&skinMatrices[0]);
if (small_xyz) { if (small_xyz) {
mat_apply(&DCE_MESHLET_MAT_DECODE); mat_apply(&DCE_MESHLET_MAT_DECODE);
} }
} else { } else {
if (small_xyz) { if (small_xyz) {
mat_load(&DCE_MESHLET_MAT_DECODE); mat_load2(&DCE_MESHLET_MAT_DECODE);
} }
} }
@ -3465,7 +3280,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
break; break;
} }
rw_mat_load_4x4(currentMatrix); mat_load_4x4(currentMatrix);
if (small_xyz){ if (small_xyz){
mat_apply(&DCE_MESHLET_MAT_DECODE); mat_apply(&DCE_MESHLET_MAT_DECODE);
} }
@ -3498,7 +3313,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
auto skinningWeightData = (uint8_t*)skinWeights; auto skinningWeightData = (uint8_t*)skinWeights;
if (!matrix0Identity) { if (!matrix0Identity) {
rw_mat_load_3x3(&skinMatrices[0]); mat_load_3x3(&skinMatrices[0]);
} }
for(;;) { for(;;) {
@ -3556,7 +3371,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
break; break;
} }
rw_mat_load_3x3(currentMatrix); mat_load_3x3(currentMatrix);
do { do {
auto srcOffset = *skinningIndexData++; auto srcOffset = *skinningIndexData++;
@ -3581,7 +3396,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
__attribute__((noinline)) __attribute__((noinline))
void tnlMeshletEnvMap(uint8_t* OCR, uint8_t* normal, int vertexCount, int vertexSize, matrix_t* matfxMatrix, float matfxCoefficient) { void tnlMeshletEnvMap(uint8_t* OCR, uint8_t* normal, int vertexCount, int vertexSize, matrix_t* matfxMatrix, float matfxCoefficient) {
mat_load(matfxMatrix); mat_load2(matfxMatrix);
do { do {
pvr_vertex64_t* v = (pvr_vertex64_t*)OCR; pvr_vertex64_t* v = (pvr_vertex64_t*)OCR;
@ -3998,15 +3813,12 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
lightingCB(atomic, ac->uniform); lightingCB(atomic, ac->uniform);
rw::RawMatrix world; rw::RawMatrix world;
rw::convMatrix(&world, atomic->getFrame()->getLTM()); rw::convMatrix(&world, atomic->getFrame()->getLTM());
mat_load_apply((matrix_t*)&cam->devProjScreen, (matrix_t*)&cam->devView);
mat_load((matrix_t*)&cam->devProjScreen);
mat_apply((matrix_t*)&cam->devView);
mat_apply((matrix_t*)&world); mat_apply((matrix_t*)&world);
mat_store((matrix_t*)&atomicContexts.back().mtx); mat_store2((matrix_t*)&atomicContexts.back().mtx);
auto meshes = geo->meshHeader->getMeshes(); auto meshes = geo->meshHeader->getMeshes();
@ -4245,15 +4057,14 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
unsigned skinSelector = small_xyz + acp->skinMatrix0Identity*2; unsigned skinSelector = small_xyz + acp->skinMatrix0Identity*2;
tnlMeshletSkinVerticesSelector[skinSelector](OCR_SPACE, normalDst, &dcModel->data[meshlet->vertexOffset], normalSrc, &dcModel->data[meshlet->skinWeightOffset], &dcModel->data[meshlet->skinIndexOffset], meshlet->vertexCount, meshlet->vertexSize, &acp->skinContextPointer->mtx); tnlMeshletSkinVerticesSelector[skinSelector](OCR_SPACE, normalDst, &dcModel->data[meshlet->vertexOffset], normalSrc, &dcModel->data[meshlet->skinWeightOffset], &dcModel->data[meshlet->skinIndexOffset], meshlet->vertexCount, meshlet->vertexSize, &acp->skinContextPointer->mtx);
mat_load(&mtx); mat_load2(&mtx);
tnlMeshletTransformSelector[clippingRequired * 2](OCR_SPACE, OCR_SPACE + 4, meshlet->vertexCount, 64); tnlMeshletTransformSelector[clippingRequired * 2](OCR_SPACE, OCR_SPACE + 4, meshlet->vertexCount, 64);
} else { } else {
if (selector & 8) { if (selector & 8) {
mat_load(&mtx); mat_load_apply(&mtx, &DCE_MESHLET_MAT_DECODE);
mat_apply(&DCE_MESHLET_MAT_DECODE);
} else { } else {
mat_load(&mtx); mat_load2(&mtx);
} }
tnlMeshletTransformSelector[smallSelector](OCR_SPACE, &dcModel->data[meshlet->vertexOffset], meshlet->vertexCount, meshlet->vertexSize); tnlMeshletTransformSelector[smallSelector](OCR_SPACE, &dcModel->data[meshlet->vertexOffset], meshlet->vertexCount, meshlet->vertexSize);
} }
@ -4280,7 +4091,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
unsigned dstColOffset = textured ? offsetof(pvr_vertex64_t, a) : offsetof(pvr_vertex32_ut, a); unsigned dstColOffset = textured ? offsetof(pvr_vertex64_t, a) : offsetof(pvr_vertex32_ut, a);
dce_set_mat_vertex_color(&residual, &material); dce_set_mat_vertex_color(&residual, &material);
mat_load(&DCE_MESHLET_MAT_VERTEX_COLOR); mat_load2(&DCE_MESHLET_MAT_VERTEX_COLOR);
tnlMeshletVertexColorSelector[0](OCR_SPACE + dstColOffset, (int8_t*)&dcModel->data[meshlet->vertexOffset] + colOffset, meshlet->vertexCount, meshlet->vertexSize); tnlMeshletVertexColorSelector[0](OCR_SPACE + dstColOffset, (int8_t*)&dcModel->data[meshlet->vertexOffset] + colOffset, meshlet->vertexCount, meshlet->vertexSize);
} else { } else {
unsigned dstColOffset = textured ? offsetof(pvr_vertex64_t, a) : offsetof(pvr_vertex32_ut, a); unsigned dstColOffset = textured ? offsetof(pvr_vertex64_t, a) : offsetof(pvr_vertex32_ut, a);
@ -4302,7 +4113,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
unsigned normalSelector = (pass1 - 1) + (skin != 0) * 4; unsigned normalSelector = (pass1 - 1) + (skin != 0) * 4;
mat_load((matrix_t*)&uniformObject.dir[0][0][0]); mat_load2((matrix_t*)&uniformObject.dir[0][0][0]);
auto normalPointer = &dcModel->data[meshlet->vertexOffset] + normalOffset; auto normalPointer = &dcModel->data[meshlet->vertexOffset] + normalOffset;
auto vtxSize = meshlet->vertexSize; auto vtxSize = meshlet->vertexSize;
if (skin) { if (skin) {
@ -4317,7 +4128,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
if (pass2) { if (pass2) {
unsigned normalSelector = (pass2 - 1) + (skin != 0) * 4; unsigned normalSelector = (pass2 - 1) + (skin != 0) * 4;
mat_load((matrix_t*)&uniformObject.dir[1][0][0]); mat_load2((matrix_t*)&uniformObject.dir[1][0][0]);
tnlMeshletDiffuseColorSelector[normalSelector](OCR_SPACE + dstColOffset, normalPointer, meshlet->vertexCount, vtxSize, &lightDiffuseColors[4]); tnlMeshletDiffuseColorSelector[normalSelector](OCR_SPACE + dstColOffset, normalPointer, meshlet->vertexCount, vtxSize, &lightDiffuseColors[4]);
} }
} }
@ -4380,7 +4191,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
indices.back() |= 0x80; indices.back() |= 0x80;
pvr_vertex64_t *vd = (pvr_vertex64_t *)OCR_SPACE; pvr_vertex64_t *vd = (pvr_vertex64_t *)OCR_SPACE;
mat_load(&mtx); // Number of cycles: ~11 mat_load2(&mtx); // Number of cycles: ~11
for (int idx = 0; idx < geo->numVertices; idx++) { for (int idx = 0; idx < geo->numVertices; idx++) {
auto& vert = vertices[idx]; auto& vert = vertices[idx];

739
vendor/librw/src/dc/rwdc_common.h vendored Normal file
View file

@ -0,0 +1,739 @@
#ifndef __RWDC_COMMON_H
#define __RWDC_COMMON_H
#include <tuple>
#include <limits>
#include <cmath>
#include <type_traits>
#include <algorithm>
#include <cstring>
#ifndef RW_DC
# pragma warning(disable: 4244) // int to float
# pragma warning(disable: 4800) // int to bool
# pragma warning(disable: 4838) // narrowing conversion
# pragma warning(disable: 4996) // POSIX names
#else
# if !defined(DC_TEXCONV) && !defined(DC_SIM)
# include <kos.h>
# define DC_SH4
# define VIDEO_MODE_WIDTH static_cast<float>(vid_mode->width)
# define VIDEO_MODE_HEIGHT static_cast<float>(vid_mode->height)
# define memcpy4 memcpy
# else
# ifdef DC_TEXCONV
# define malloc_stats()
# endif
# include <dc/matrix.h>
# include <dc/pvr.h>
# include <kos/dbglog.h>
# define VIDEO_MODE_WIDTH 640.0f
# define VIDEO_MODE_HEIGHT 480.0f
# define memcpy4 memcpy
# define frsqrt(a) (1.0f/sqrtf(a))
# define dcache_pref_block(a) __builtin_prefetch(a)
# define F_PI M_PI
# ifndef __always_inline
# define __always_inline inline
# endif
# endif
# ifdef PVR_TXRFMT_STRIDE
# undef PVR_TXRFMT_STRIDE
# define PVR_TXRFMT_STRIDE (1 << 25)
# endif
static_assert(PVR_TXRFMT_STRIDE == (1 << 25),
"PVR_TXRFMT_STRIDE is bugged in your KOS version");
#endif
#define F_PI_2 (F_PI * 0.5f)
#define __hot __attribute__((hot))
#define __cold __attribute__((cold))
#define __icache_aligned __attribute__((aligned(32)))
#define STRINGIFY(x) #x
#define STR(x) STRINGIFY(x)
#define CONCAT_(x,y) x##y
#define CONCAT(x,y) CONCAT_(x,y)
#define ARRAY_SIZE(array) (sizeof(array) / sizeof(array[0]))
namespace rw {
class Matrix;
}
namespace dc {
struct quaternion_t {
float x, y, z, w;
};
__always_inline __hot constexpr float Sin(float x) { return sinf(x); }
__always_inline __hot constexpr float Cos(float x) { return cosf(x); }
__always_inline __hot constexpr auto SinCos(float x) { return std::pair { Sin(x), Cos(x) }; }
__always_inline __hot constexpr float Tan(float x) { return tanf(x); }
__always_inline __hot constexpr float Atan(float x) { return atanf(x); }
__always_inline __hot constexpr float Atan2(float y, float x) { return atan2f(y, x); }
__always_inline __hot constexpr float Asin(float x) { return asinf(x); }
__always_inline __hot constexpr float Acos(float x) { return acosf(x); }
__always_inline __hot constexpr float Abs(float x) { return fabsf(x); }
__always_inline __hot constexpr float Sqrt(float x) { return sqrtf(x); }
__always_inline __hot constexpr float RecipSqrt(float x, float y) { return x / Sqrt(y); }
__always_inline __hot constexpr float Pow(float x, float y) { return powf(x, y); }
__always_inline __hot constexpr float Floor(float x) { return floorf(x); }
__always_inline __hot constexpr float Ceil(float x) { return ceilf(x); }
__always_inline __hot constexpr float Fmac(auto a, auto b, auto c) { return a * b + c; }
__always_inline __hot constexpr float Lerp(float a, float b, float t) { return Fmac(t, (b - a), a); }
__always_inline __hot constexpr auto Max(auto a, auto b) { return ((a > b)? a : b); }
__always_inline __hot constexpr auto Min(auto a, auto b) { return ((a < b)? a : b); }
template<bool CHECK_ZERO=false>
__always_inline __hot constexpr float RecipSqrt(float x) {
if constexpr(CHECK_ZERO)
if(x == 0.0f)
return 0.0f;
return 1.0f / Sqrt(x);
}
template<typename T>
__always_inline __hot constexpr T Clamp(T v, auto low, auto high) {
return std::clamp(v, static_cast<T>(low), static_cast<T>(high));
}
__always_inline constexpr auto Clamp2(auto v, auto center, auto radius) {
return (v > center) ? Min(v, center + radius) : Max(v, center - radius);
}
template<bool FAST_APPROX=true, bool COPY_SIGN=true>
__always_inline __hot constexpr float Invert(float x) {
float value;
if(!std::is_constant_evaluated() && FAST_APPROX) {
value = RecipSqrt(x * x);
if constexpr(COPY_SIGN)
if(x < 0.0f)
value = -value;
} else value = 1.0f / x;
return value;
}
template<bool FAST_APPROX=true, bool COPY_SIGN=true>
__always_inline __hot constexpr float Div(float x, float y) {
if(FAST_APPROX && !std::is_constant_evaluated())
return x * Invert<true, COPY_SIGN>(y);
else
return x / y;
}
template<bool FAST_APPROX=false, bool COPY_SIGN=true>
__always_inline __hot constexpr auto Norm(auto value, auto min, auto max) {
auto numerator = Clamp(value, min, max) - min;
auto denominator = (max - min);
if(FAST_APPROX && !std::is_constant_evaluated())
return Div<true, COPY_SIGN>(numerator, denominator);
else
return numerator / denominator;
}
#ifdef RW_DC
# ifdef DC_SH4
#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
register float __x __asm__("fr12") = (x); \
register float __y __asm__("fr13") = (y); \
register float __z __asm__("fr14") = (z); \
register float __w __asm__("fr15") = 1.0f; \
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
} while(false)
#define mat_trans_nodiv_nomod_zerow(x, y, z, x2, y2, z2, w2) do { \
register float __x __asm__("fr12") = (x); \
register float __y __asm__("fr13") = (y); \
register float __z __asm__("fr14") = (z); \
register float __w __asm__("fr15") = 0.0f; \
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
} while(false)
#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \
register float __x __asm__("fr12") = (x); \
register float __y __asm__("fr13") = (y); \
register float __z __asm__("fr14") = (z); \
register float __w __asm__("fr15") = 1.0f; \
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
w = __w; \
} while(false)
#define mat_trans_vec4_nodiv_nomod(x, y, z, w, x2, y2, z2, w2) { \
register float __x __asm__("fr0") = (x); \
register float __y __asm__("fr1") = (y); \
register float __z __asm__("fr2") = (z); \
register float __w __asm__("fr3") = (w); \
__asm__ __volatile__( "ftrv xmtrx, fv0\n" \
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
} while(false)
inline __hot __icache_aligned void mat_load2(const matrix_t* mtx) {
asm volatile(
R"(
fschg
fmov.d @%[mtx],xd0
add #32,%[mtx]
pref @%[mtx]
add #-(32-8),%[mtx]
fmov.d @%[mtx]+,xd2
fmov.d @%[mtx]+,xd4
fmov.d @%[mtx]+,xd6
fmov.d @%[mtx]+,xd8
fmov.d @%[mtx]+,xd10
fmov.d @%[mtx]+,xd12
fmov.d @%[mtx]+,xd14
fschg
)"
: [mtx] "+r" (mtx)
:
:
);
}
inline __hot __icache_aligned void mat_store2(matrix_t *mtx) {
asm volatile(
R"(
fschg
add #64-8,%[mtx]
fmov.d xd14,@%[mtx]
add #-32,%[mtx]
pref @%[mtx]
add #32,%[mtx]
fmov.d xd12,@-%[mtx]
fmov.d xd10,@-%[mtx]
fmov.d xd8,@-%[mtx]
fmov.d xd6,@-%[mtx]
fmov.d xd4,@-%[mtx]
fmov.d xd2,@-%[mtx]
fmov.d xd0,@-%[mtx]
fschg
)"
: [mtx] "+&r" (mtx), "=m" (*mtx)
:
:
);
}
inline __hot __icache_aligned void mat_identity2(void) {
asm volatile(
R"(
frchg
fldi1 fr0
fschg
fldi0 fr1
fldi0 fr2
fldi0 fr3
fldi0 fr4
fldi1 fr5
fmov dr2,dr6
fmov dr2,dr8
fmov dr0,dr10
fmov dr2,dr12
fmov dr4,dr14
fschg
frchg
)"
);
}
inline __hot __icache_aligned void mat_set_scale(float x, float y, float z) {
asm volatile(
R"(
frchg
fldi0 fr1
fschg
fldi0 fr2
fldi0 fr3
fldi0 fr4
fmov dr2, dr6
fmov dr2, dr8
fldi0 fr11
fmov dr2, dr12
fldi0 fr14
fschg
frchg
fmov %[x], xf0
fmov %[y], xf5
fmov %[z], xf10
)"
:
: [x] "r" (x), [y] "r" (y), [z] "r" (z)
:
);
}
// no declspec naked, so can't do rts / fschg. instead compiler pads with nop?
inline __hot void mat_load_3x3(const rw::Matrix* mtx) {
__asm__ __volatile__ (
R"(
fschg
frchg
fmov @%[mtx]+, dr0
fldi0 fr12
fldi0 fr13
fmov @%[mtx]+, dr2
fmov @%[mtx]+, dr4
fmov @%[mtx]+, dr6
fmov @%[mtx]+, dr8
fmov @%[mtx]+, dr10
fldi0 fr3
fldi0 fr7
fldi0 fr11
fmov dr12, dr14
fschg
frchg
)"
: [mtx] "+r" (mtx)
);
}
// sets pos.w to 1
inline __hot void mat_load_4x4(const rw::Matrix* mtx) {
__asm__ __volatile__ (
R"(
fschg
frchg
fmov @%[mtx]+, dr0
fmov @%[mtx]+, dr2
fmov @%[mtx]+, dr4
fmov @%[mtx]+, dr6
fmov @%[mtx]+, dr8
fmov @%[mtx]+, dr10
fmov @%[mtx]+, dr12
fmov @%[mtx]+, dr14
fldi1 fr15
fschg
frchg
)"
: [mtx] "+r" (mtx)
);
}
__hot inline void mat_transpose(void) {
asm volatile (
"frchg\n\t" // fmov for singles only works on front bank
// FR0, FR5, FR10, and FR15 are already in place
// swap FR1 and FR4
"flds FR1, FPUL\n\t"
"fmov FR4, FR1\n\t"
"fsts FPUL, FR4\n\t"
// swap FR2 and FR8
"flds FR2, FPUL\n\t"
"fmov FR8, FR2\n\t"
"fsts FPUL, FR8\n\t"
// swap FR3 and FR12
"flds FR3, FPUL\n\t"
"fmov FR12, FR3\n\t"
"fsts FPUL, FR12\n\t"
// swap FR6 and FR9
"flds FR6, FPUL\n\t"
"fmov FR9, FR6\n\t"
"fsts FPUL, FR9\n\t"
// swap FR7 and FR13
"flds FR7, FPUL\n\t"
"fmov FR13, FR7\n\t"
"fsts FPUL, FR13\n\t"
// swap FR11 and FR14
"flds FR11, FPUL\n\t"
"fmov FR14, FR11\n\t"
"fsts FPUL, FR14\n\t"
// restore XMTRX to back bank
"frchg\n"
: // no outputs
: // no inputs
: "fpul" // clobbers
);
}
__hot __icache_aligned inline void mat_copy(matrix_t *dst, const matrix_t *src) {
asm volatile(R"(
fschg
pref @%[dst]
fmov.d @%[src]+, xd0
fmov.d @%[src]+, xd2
fmov.d @%[src]+, xd4
fmov.d @%[src]+, xd6
pref @%[src]
add #32, %[dst]
fmov.d xd6, @-%[dst]
fmov.d xd4, @-%[dst]
fmov.d xd2, @-%[dst]
fmov.d xd0, @-%[dst]
add #32, %[dst]
pref @%[dst]
fmov.d @%[src]+, xd0
fmov.d @%[src]+, xd2
fmov.d @%[src]+, xd4
fmov.d @%[src]+, xd6
add #32, %[dst]
fmov.d xd6, @-%[dst]
fmov.d xd4, @-%[dst]
fmov.d xd2, @-%[dst]
fmov.d xd0, @-%[dst]
fschg
)": [dst] "+&r" (dst), [src] "+&r" (src), "=m" (*dst)
:
:);
}
//TODO: FIXME FOR VC (AND USE FTRV)
template<bool FAST_APPROX=false>
__hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) {
if(FAST_APPROX && !std::is_constant_evaluated()) {
/*
// reorder the coefficients so that q1 stays in constant order {x,y,z,w}
// q2 then needs to be rotated after each inner product
x = (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
z = (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
*/
// keep q1 in fv4
register float q1x __asm__ ("fr4") = (q1.x);
register float q1y __asm__ ("fr5") = (q1.y);
register float q1z __asm__ ("fr6") = (q1.z);
register float q1w __asm__ ("fr7") = (q1.w);
// load q2 into fv8, use it to get the shuffled reorder into fv0
register float q2x __asm__ ("fr8") = (q2.x);
register float q2y __asm__ ("fr9") = (q2.y);
register float q2z __asm__ ("fr10") = (q2.z);
register float q2w __asm__ ("fr11") = (q2.w);
// temporary operand / result in fv0
register float t1x __asm__ ("fr0");
register float t1y __asm__ ("fr1");
register float t1z __asm__ ("fr2");
register float t1w __asm__ ("fr3");
// x = (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
t1x = q2w;
t1y = q2z;
t1z = -q2y;
t1w = q2w;
__asm__ ("\n"
" fipr fv4,fv0\n"
: "+f" (t1w)
: "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
"f" (t1x), "f" (t1y), "f" (t1z)
);
// x = t1w; try to avoid the stall by not reading the fipr result immediately
// y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
t1x = -q2z;
t1y = q2w;
t1z = q2x;
__atomic_thread_fence(1);
r->x = t1w; // get previous result
t1w = q2y;
__asm__ ("\n"
" fipr fv4,fv0\n"
: "+f" (t1w)
: "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
"f" (t1x), "f" (t1y), "f" (t1z)
);
//y = t1w;
// z = (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
t1x = q2y;
t1y = -q2x;
t1z = q2w;
__atomic_thread_fence(1);
r->y = t1w; // get previous result
t1w = q2z;
__asm__ ("\n"
" fipr fv4,fv0\n"
: "+f" (t1w)
: "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
"f" (t1x), "f" (t1y), "f" (t1z)
);
//z = t1w;
__atomic_thread_fence(1);
// w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
q2x = -q2x;
q2y = -q2y;
q2z = -q2z;
__asm__ ("\n"
" fipr fv4,fv8\n"
: "+f" (q2w)
: "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
"f" (q2x), "f" (q2y), "f" (q2z)
);
__atomic_thread_fence(1);
r->z = t1w;
__atomic_thread_fence(1);
r->w = q2w;
} else {
r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
}
}
__hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) {
unsigned int prefetch_scratch;
asm volatile (
"mov %[bmtrx], %[pref_scratch]\n\t" // (MT)
"add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
"fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE)
"pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
// back matrix
"fmov.d @%[bmtrx]+, XD0\n\t" // (LS)
"fmov.d @%[bmtrx]+, XD2\n\t"
"fmov.d @%[bmtrx]+, XD4\n\t"
"fmov.d @%[bmtrx]+, XD6\n\t"
"pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait (LS)
"fmov.d @%[bmtrx]+, XD8\n\t" // bmtrx prefetch should work for here
"fmov.d @%[bmtrx]+, XD10\n\t"
"fmov.d @%[bmtrx]+, XD12\n\t"
"mov %[fmtrx], %[pref_scratch]\n\t" // (MT)
"add #32, %[pref_scratch]\n\t" // store offset by 32 in r0 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
"fmov.d @%[bmtrx], XD14\n\t"
"pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
// front matrix
// interleave loads and matrix multiply 4x4
"fmov.d @%[fmtrx]+, DR0\n\t"
"fmov.d @%[fmtrx]+, DR2\n\t"
"fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec
"ftrv XMTRX, FV0\n\t" // (FE)
"fmov.d @%[fmtrx]+, DR6\n\t"
"fmov.d @%[fmtrx]+, DR8\n\t"
"ftrv XMTRX, FV4\n\t"
"fmov.d @%[fmtrx]+, DR10\n\t"
"fmov.d @%[fmtrx]+, DR12\n\t"
"ftrv XMTRX, FV8\n\t"
"fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles)
"fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE)
"ftrv XMTRX, FV12\n\t" // (FE)
// Save output in XF regs
"frchg\n"
: [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
: // no inputs
: "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered)
);
}
__hot inline void mat_apply_rotate_x(float x) {
x *= 10430.37835f;
asm volatile(
"ftrc %[a], fpul\n\t"
"fsca fpul, dr4\n\t"
"fldi0 fr8\n\t"
"fldi0 fr11\n\t"
"fmov fr5, fr10\n\t"
"fmov fr4, fr9\n\t"
"fneg fr9\n\t"
"ftrv xmtrx, fv8\n\t"
"fmov fr4, fr6\n\t"
"fldi0 fr7\n\t"
"fldi0 fr4\n\t"
"ftrv xmtrx, fv4\n\t"
"fschg\n\t"
"fmov dr8, xd8\n\t"
"fmov dr10, xd10\n\t"
"fmov dr4, xd4\n\t"
"fmov dr6, xd6\n\t"
"fschg\n"
:
: [a] "f"(x)
: "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
}
__hot inline void mat_apply_rotate_y(float y) {
y *= 10430.37835f;
asm volatile(
"ftrc %[a], fpul\n\t"
"fsca fpul, dr6\n\t"
"fldi0 fr9\n\t"
"fldi0 fr11\n\t"
"fmov fr6, fr8\n\t"
"fmov fr7, fr10\n\t"
"ftrv xmtrx, fv8\n\t"
"fmov fr7, fr4\n\t"
"fldi0 fr5\n\t"
"fneg fr6\n\t"
"fldi0 fr7\n\t"
"ftrv xmtrx, fv4\n\t"
"fschg\n\t"
"fmov dr8, xd8\n\t"
"fmov dr10, xd10\n\t"
"fmov dr4, xd0\n\t"
"fmov dr6, xd2\n\t"
"fschg\n"
:
: [a] "f"(y)
: "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
}
__hot inline void mat_apply_rotate_z(float z) {
z *= 10430.37835f;
asm volatile(
"ftrc %[a], fpul\n\t"
"fsca fpul, dr8\n\t"
"fldi0 fr10\n\t"
"fldi0 fr11\n\t"
"fmov fr8, fr5\n\t"
"fneg fr8\n\t"
"ftrv xmtrx, fv8\n\t"
"fmov fr9, fr4\n\t"
"fschg\n\t"
"fmov dr10, dr6\n\t"
"ftrv xmtrx, fv4\n\t"
"fmov dr8, xd4\n\t"
"fmov dr10, xd6\n\t"
"fmov dr4, xd0\n\t"
"fmov dr6, xd2\n\t"
"fschg\n"
:
: [a] "f"(z)
: "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
}
# else
# ifdef DC_TEXCONV
# define mat_apply(a)
# define mat_load2(a)
# define mat_store2(a)
# define mat_identity2()
# define mat_transform(a, b, c, d)
# define pvr_fog_table_color(a,r,g,b)
# define pvr_fog_table_linear(s,e)
# else
# define mat_load2(a) mat_load(a)
# define mat_store2(a) mat_store(a)
# define mat_identity2() mat_identity()
# endif
#define mat_trans_single3_nomod(x_, y_, z_, x2, y2, z2) do { \
vector_t tmp = { x_, y_, z_, 1.0f }; \
mat_transform(&tmp, &tmp, 1, 0); \
z2 = 1.0f / tmp.w; \
x2 = tmp.x * z2; \
y2 = tmp.y * z2; \
} while(false)
#define mat_trans_single3_nodiv_nomod(x_, y_, z_, x2, y2, z2) do { \
vector_t tmp = { x_, y_, z_, 1.0f }; \
mat_transform(&tmp, &tmp, 1, 0); \
z2 = tmp.z; \
x2 = tmp.x; \
y2 = tmp.y; \
} while(false)
#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \
mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \
} while(false)
#define mat_trans_nodiv_nomod_zerow(x_, y_, z_, x2, y2, z2, w2) do { \
vector_t tmp1233123 = { x_, y_, z_, 0.0f }; \
mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \
} while(false)
#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \
vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \
mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
w_ = tmp1233123.w; \
} while(false)
inline void mat_load_3x3(const rw::Matrix* mtx) {
memcpy(XMTRX, mtx, sizeof(matrix_t));
XMTRX[0][3] = 0.0f;
XMTRX[1][3] = 0.0f;
XMTRX[2][3] = 0.0f;
XMTRX[3][0] = 0.0f;
XMTRX[3][1] = 0.0f;
XMTRX[3][2] = 0.0f;
XMTRX[3][3] = 0.0f;
}
inline void mat_load_4x4(const rw::Matrix* mtx) {
memcpy(XMTRX, mtx, sizeof(matrix_t));
XMTRX[3][3] = 1.0f;
}
inline void mat_transpose(void) {
matrix_t tmp;
for(unsigned i = 0; i < 4; ++i)
for(unsigned j = 0; j < 4; ++j)
tmp[j][i] = XMTRX[i][j];
mat_load(&tmp);
}
template<bool FAST_APPROX=true>
__hot inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) {
r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
}
__hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) {
mat_load(matrix1);
mat_apply(matrix2);
}
__always_inline __hot void mat_copy(matrix_t *dst, const matrix_t *src) {
mat_load(src);
mat_store(dst);
}
#endif
__hot inline void mat_mult(matrix_t *out, const matrix_t* matrix1, const matrix_t* matrix2) {
mat_load_apply(matrix1, matrix2);
mat_store2(out);
}
#endif
}
#endif /* RWDC_COMMON_H */

View file

@ -1,6 +1,9 @@
#pragma once #pragma once
#include "common_defines.h" #include "common_defines.h"
#ifdef RW_DC
#include "rwdc_common.h"
#endif
#ifndef RW_PS2 #ifndef RW_PS2
#include <stdint.h> #include <stdint.h>
@ -322,7 +325,13 @@ inline float32 dot(const Quat &q, const Quat &p) {
#endif #endif
} }
inline Quat scale(const Quat &q, float32 r) { return makeQuat(q.w*r, q.x*r, q.y*r, q.z*r); } inline Quat scale(const Quat &q, float32 r) { return makeQuat(q.w*r, q.x*r, q.y*r, q.z*r); }
inline float32 length(const Quat &q) { return sqrtf(q.w*q.w + q.x*q.x + q.y*q.y + q.z*q.z); } inline float32 length(const Quat &q) {
#ifndef DC_SH4
return sqrtf(q.w*q.w + q.x*q.x + q.y*q.y + q.z*q.z);
#else
return dc::Sqrt(fipr_magnitude_sqr(q.x, q.y, q.z, q.w));
#endif
}
inline Quat normalize(const Quat &q) { return scale(q, 1.0f/length(q)); } inline Quat normalize(const Quat &q) { return scale(q, 1.0f/length(q)); }
inline Quat conj(const Quat &q) { return makeQuat(q.w, -q.x, -q.y, -q.z); } inline Quat conj(const Quat &q) { return makeQuat(q.w, -q.x, -q.y, -q.z); }
Quat mult(const Quat &q, const Quat &p); Quat mult(const Quat &q, const Quat &p);