mirror of
https://gitlab.com/skmp/dca3-game.git
synced 2025-04-28 13:07:59 +03:00
Liberty/Miami Perf Gainz Phase 3
1) synced dca3-kos repo which has some gainzy commits 2) rwdc_common.h - all low-level and matrix/vector routines for SH4 are now shared in this common file, included in both RW and Liberty/Miami engines 3) CMatrix a. assignment operator: now uses asm-optimized mat_copy() b. multiplication operator: now use mat_mult() SH4 routine c. Scale(): applies a scale matrix via mat_scale d. MultiplyInverse: fipr-optimizations 4) CQuaternion a. multiplication: SH4 ASM FIPR optimized b. Get(V3d& axis, float &angle): fast inversion/division c. Set(RWMatrix&): fast division 5) CVector a. Multiply3x3() now accelerated with mat_transpose 5) RwQuat a. mult(): FIPR accelerated b. length(): FIPR/FSRRA accelerated
This commit is contained in:
parent
cca7b3c6fa
commit
4d13e821b5
24 changed files with 967 additions and 278 deletions
|
@ -381,6 +381,7 @@ INCLUDE = \
|
|||
-I../src/liberty/skel/win \
|
||||
\
|
||||
-I../vendor/librw \
|
||||
-I../vendor/librw/src/dc \
|
||||
\
|
||||
-I../vendor/miniLZO \
|
||||
\
|
||||
|
|
|
@ -396,6 +396,7 @@ INCLUDE = \
|
|||
-I../src/miami/skel/win \
|
||||
\
|
||||
-I../vendor/librw \
|
||||
-I../vendor/librw/src/dc \
|
||||
\
|
||||
-I../vendor/miniLZO \
|
||||
\
|
||||
|
|
|
@ -94,23 +94,26 @@ void VmuProfiler::run() {
|
|||
pvr_stats_t pvrStats; pvr_get_stats(&pvrStats);
|
||||
uint32_t sramStats = snd_mem_available();
|
||||
size_t pvrAvail = pvr_mem_available();
|
||||
float fps = std::accumulate(std::begin(fps_), std::end(fps_), 0.0f)
|
||||
/ static_cast<float>(fpsSamples);
|
||||
|
||||
float sh4Mem = heapUtilization();
|
||||
float pvrMem = (8_MB - pvrAvail ) / 8_MB * 100.0f;
|
||||
float armMem = (2_MB - sramStats) / 2_MB * 100.0f;
|
||||
float vtxBuf = vertBuffUse_;
|
||||
{
|
||||
std::shared_lock lk(mtx_);
|
||||
|
||||
vmu_printf("FPS :%6.2f\n"
|
||||
"SH4 :%6.2f%%\n"
|
||||
"PVR :%6.2f%%\n"
|
||||
"ARM :%6.2f%%\n"
|
||||
"VTX :%6.2f%%",
|
||||
fps, sh4Mem, pvrMem, armMem, vtxBuf);
|
||||
float vtxBuf;
|
||||
float fps;
|
||||
{ /* Critical section with main thread. */
|
||||
std::shared_lock lk(mtx_);
|
||||
vtxBuf = vertBuffUse_;
|
||||
fps = std::accumulate(std::begin(fps_), std::end(fps_), 0.0f)
|
||||
/ static_cast<float>(fpsSamples);
|
||||
}
|
||||
|
||||
vmu_printf(" FPS:%6.2f\n"
|
||||
" RAM:%6.2f%%\n"
|
||||
"VRAM:%6.2f%%\n"
|
||||
"SRAM:%6.2f%%\n"
|
||||
" VTX:%6.2f%%",
|
||||
fps, sh4Mem, pvrMem, armMem, vtxBuf);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -119,15 +122,21 @@ void VmuProfiler::run() {
|
|||
}
|
||||
|
||||
void VmuProfiler::updateVertexBufferUsage() {
|
||||
#ifndef DC_SH4
|
||||
std::unique_lock lk(mtx_);
|
||||
updated_ = true;
|
||||
|
||||
#ifdef DC_SH4
|
||||
vertBuffUse_ = vertexBufferUtilization();
|
||||
#else
|
||||
pvr_stats_t pvrStats;
|
||||
pvr_get_stats(&pvrStats);
|
||||
float vtxUtil = vertexBufferUtilization();
|
||||
|
||||
pvr_stats_t pvrStats;
|
||||
pvr_get_stats(&pvrStats);
|
||||
fps_[fpsFrame_++] = pvrStats.frame_rate;
|
||||
{ /* Critical section with VMU thread. */
|
||||
std::unique_lock lk(mtx_);
|
||||
vertBuffUse_ = vtxUtil;
|
||||
updated_ = true;
|
||||
fps_[fpsFrame_++] = pvrStats.frame_rate;
|
||||
}
|
||||
|
||||
if(fpsFrame_ >= fpsSamples)
|
||||
fpsFrame_ = 0;
|
||||
|
|
|
@ -82,8 +82,6 @@
|
|||
|
||||
#define rwVENDORID_ROCKSTAR 0x0253F2
|
||||
|
||||
__always_inline auto Max(auto a, auto b) { return ((a > b)? a : b); }
|
||||
__always_inline auto Min(auto a, auto b) { return ((a < b)? a : b); }
|
||||
|
||||
// Use this to add const that wasn't there in the original code
|
||||
#define Const const
|
||||
|
@ -299,15 +297,6 @@ extern int strcasecmp(const char *str1, const char *str2);
|
|||
|
||||
extern wchar *AllocUnicode(const char*src);
|
||||
|
||||
template<typename T>
|
||||
__always_inline T Clamp(T v, auto low, auto high) {
|
||||
return std::clamp(v, static_cast<T>(low), static_cast<T>(high));
|
||||
}
|
||||
|
||||
__always_inline auto Clamp2(auto v, auto center, auto radius) {
|
||||
return (v > center) ? Min(v, center + radius) : Max(v, center - radius);
|
||||
}
|
||||
|
||||
#define SQR(x) ((x) * (x))
|
||||
__always_inline auto sq(auto x) { return SQR(x); }
|
||||
|
||||
|
@ -418,15 +407,7 @@ template<int s, int t> struct check_size {
|
|||
#endif
|
||||
#define BIT(num) (1<<(num))
|
||||
|
||||
#define ABS(a) std::abs(a)
|
||||
#define ABS(a) Abs(a)
|
||||
|
||||
__always_inline auto norm(auto value, auto min, auto max) {
|
||||
return (Clamp(value, min, max) - min) / (max - min);
|
||||
}
|
||||
// we use std::lerp now
|
||||
//#define lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) )
|
||||
|
||||
#define STRINGIFY(x) #x
|
||||
#define STR(x) STRINGIFY(x)
|
||||
#define CONCAT_(x,y) x##y
|
||||
#define CONCAT(x,y) CONCAT_(x,y)
|
||||
|
|
|
@ -4,7 +4,7 @@ CMatrix::CMatrix(CMatrix const &m)
|
|||
{
|
||||
m_attachment = nil;
|
||||
m_hasRwMatrix = false;
|
||||
*this = m;
|
||||
mat_copy(*this, m);
|
||||
}
|
||||
|
||||
CMatrix::CMatrix(RwMatrix *matrix, bool owner)
|
||||
|
@ -75,7 +75,7 @@ CMatrix::UpdateRW(void)
|
|||
void
|
||||
CMatrix::operator=(CMatrix const &rhs)
|
||||
{
|
||||
memcpy(this, &rhs, sizeof(f));
|
||||
mat_copy(*this, rhs);
|
||||
if (m_attachment)
|
||||
UpdateRW();
|
||||
}
|
||||
|
@ -83,7 +83,7 @@ CMatrix::operator=(CMatrix const &rhs)
|
|||
void
|
||||
CMatrix::CopyOnlyMatrix(const CMatrix &other)
|
||||
{
|
||||
memcpy(this, &other, sizeof(f));
|
||||
mat_copy(*this, other);
|
||||
}
|
||||
|
||||
CMatrix &
|
||||
|
@ -277,9 +277,9 @@ void
|
|||
CMatrix::RotateX(float x)
|
||||
{
|
||||
#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
|
||||
mat_load(reinterpret_cast<matrix_t *>(this));
|
||||
mat_rotate_x(x);
|
||||
mat_store(reinterpret_cast<matrix_t *>(this));
|
||||
mat_load(reinterpret_cast<matrix_t *>(this));
|
||||
mat_rotate_x(x);
|
||||
mat_store(reinterpret_cast<matrix_t *>(this));
|
||||
#else
|
||||
auto [s, c] = SinCos(x);
|
||||
|
||||
|
@ -307,9 +307,9 @@ void
|
|||
CMatrix::RotateY(float y)
|
||||
{
|
||||
#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
|
||||
mat_load(reinterpret_cast<matrix_t *>(this));
|
||||
mat_rotate_y(y);
|
||||
mat_store(reinterpret_cast<matrix_t *>(this));
|
||||
mat_load(reinterpret_cast<matrix_t *>(this));
|
||||
mat_rotate_y(y);
|
||||
mat_store(reinterpret_cast<matrix_t *>(this));
|
||||
#else
|
||||
auto [s, c] = SinCos(y);
|
||||
|
||||
|
@ -337,9 +337,9 @@ void
|
|||
CMatrix::RotateZ(float z)
|
||||
{
|
||||
#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
|
||||
mat_load(reinterpret_cast<matrix_t *>(this));
|
||||
mat_rotate_z(z);
|
||||
mat_store(reinterpret_cast<matrix_t *>(this));
|
||||
mat_load(reinterpret_cast<matrix_t *>(this));
|
||||
mat_rotate_z(z);
|
||||
mat_store(reinterpret_cast<matrix_t *>(this));
|
||||
#else
|
||||
auto [s, c] = SinCos(z);
|
||||
|
||||
|
@ -367,9 +367,9 @@ void
|
|||
CMatrix::Rotate(float x, float y, float z)
|
||||
{
|
||||
#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
|
||||
mat_load(reinterpret_cast<matrix_t *>(this));
|
||||
mat_rotate(x, y, z);
|
||||
mat_store(reinterpret_cast<matrix_t *>(this));
|
||||
mat_load(reinterpret_cast<matrix_t *>(this));
|
||||
mat_rotate(x, y, z);
|
||||
mat_store(reinterpret_cast<matrix_t *>(this));
|
||||
#else
|
||||
auto [sX, cX] = SinCos(x);
|
||||
auto [sY, cY] = SinCos(y);
|
||||
|
@ -449,65 +449,13 @@ CMatrix::Reorthogonalise(void)
|
|||
f = CrossProduct(u, r);
|
||||
}
|
||||
|
||||
#ifdef DC_SH4
|
||||
static __always_inline void MATH_Load_Matrix_Product(const matrix_t* matrix1, const matrix_t* matrix2)
|
||||
{
|
||||
unsigned int prefetch_scratch;
|
||||
|
||||
asm volatile (
|
||||
"mov %[bmtrx], %[pref_scratch]\n\t" // (MT)
|
||||
"add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
|
||||
"fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE)
|
||||
"pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
|
||||
// back matrix
|
||||
"fmov.d @%[bmtrx]+, XD0\n\t" // (LS)
|
||||
"fmov.d @%[bmtrx]+, XD2\n\t"
|
||||
"fmov.d @%[bmtrx]+, XD4\n\t"
|
||||
"fmov.d @%[bmtrx]+, XD6\n\t"
|
||||
"pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait (LS)
|
||||
"fmov.d @%[bmtrx]+, XD8\n\t" // bmtrx prefetch should work for here
|
||||
"fmov.d @%[bmtrx]+, XD10\n\t"
|
||||
"fmov.d @%[bmtrx]+, XD12\n\t"
|
||||
"mov %[fmtrx], %[pref_scratch]\n\t" // (MT)
|
||||
"add #32, %[pref_scratch]\n\t" // store offset by 32 in r0 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
|
||||
"fmov.d @%[bmtrx], XD14\n\t"
|
||||
"pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
|
||||
// front matrix
|
||||
// interleave loads and matrix multiply 4x4
|
||||
"fmov.d @%[fmtrx]+, DR0\n\t"
|
||||
"fmov.d @%[fmtrx]+, DR2\n\t"
|
||||
"fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec
|
||||
"ftrv XMTRX, FV0\n\t" // (FE)
|
||||
|
||||
"fmov.d @%[fmtrx]+, DR6\n\t"
|
||||
"fmov.d @%[fmtrx]+, DR8\n\t"
|
||||
"ftrv XMTRX, FV4\n\t"
|
||||
|
||||
"fmov.d @%[fmtrx]+, DR10\n\t"
|
||||
"fmov.d @%[fmtrx]+, DR12\n\t"
|
||||
"ftrv XMTRX, FV8\n\t"
|
||||
|
||||
"fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles)
|
||||
"fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE)
|
||||
"ftrv XMTRX, FV12\n\t" // (FE)
|
||||
// Save output in XF regs
|
||||
"frchg\n"
|
||||
: [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
|
||||
: // no inputs
|
||||
: "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered)
|
||||
);
|
||||
}
|
||||
#endif
|
||||
|
||||
CMatrix
|
||||
operator*(const CMatrix &m1, const CMatrix &m2)
|
||||
{
|
||||
// TODO: VU0 code
|
||||
CMatrix out;
|
||||
#if defined(RW_DC)
|
||||
mat_load(reinterpret_cast<const matrix_t *>(&m1));
|
||||
mat_apply(reinterpret_cast<const matrix_t *>(&m2));
|
||||
mat_store(reinterpret_cast<matrix_t *>(&out));
|
||||
mat_mult(out, m1, m2);
|
||||
#else
|
||||
out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz;
|
||||
out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz;
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "rwdc_common.h"
|
||||
|
||||
class alignas(8) CMatrix
|
||||
{
|
||||
public:
|
||||
|
@ -27,6 +29,8 @@ public:
|
|||
SetScale(scale);
|
||||
}
|
||||
~CMatrix(void);
|
||||
operator matrix_t *() { return reinterpret_cast<matrix_t *>(this); }
|
||||
operator const matrix_t *() const { return reinterpret_cast<const matrix_t *>(this); }
|
||||
void Attach(RwMatrix *matrix, bool owner = false);
|
||||
void AttachRW(RwMatrix *matrix, bool owner = false);
|
||||
void Detach(void);
|
||||
|
@ -59,6 +63,7 @@ public:
|
|||
void SetScale(float s);
|
||||
void Scale(float scale)
|
||||
{
|
||||
#ifndef DC_SH4
|
||||
for (int i = 0; i < 3; i++)
|
||||
#ifdef FIX_BUGS // BUGFIX from VC
|
||||
for (int j = 0; j < 3; j++)
|
||||
|
@ -66,6 +71,11 @@ public:
|
|||
for (int j = 0; j < 4; j++)
|
||||
#endif
|
||||
f[i][j] *= scale;
|
||||
#else
|
||||
mat_load(*this);
|
||||
mat_scale(scale, scale, scale);
|
||||
mat_store(*this);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
@ -102,11 +112,22 @@ CMatrix Invert(const CMatrix &matrix);
|
|||
CMatrix operator*(const CMatrix &m1, const CMatrix &m2);
|
||||
inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec)
|
||||
{
|
||||
#ifndef DC_SH4
|
||||
CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz);
|
||||
return CVector(
|
||||
mat.rx * v.x + mat.ry * v.y + mat.rz * v.z,
|
||||
mat.fx * v.x + mat.fy * v.y + mat.fz * v.z,
|
||||
mat.ux * v.x + mat.uy * v.y + mat.uz * v.z);
|
||||
#else
|
||||
register float x asm(KOS_FPARG(0)) = vec.x - mat.px;
|
||||
register float y asm(KOS_FPARG(1)) = vec.y - mat.py;
|
||||
register float z asm(KOS_FPARG(2)) = vec.z - mat.pz;
|
||||
return CVector(
|
||||
fipr(x, y, z, 0.0f, mat.rx, mat.ry, mat.rz, 0.0f),
|
||||
fipr(x, y, z, 0.0f, mat.fx, mat.fy, mat.fz, 0.0f),
|
||||
fipr(x, y, z, 0.0f, mat.ux, mat.uy, mat.uz, 0.0f)
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -39,10 +39,14 @@ CQuaternion::Slerp(const CQuaternion &q1, const CQuaternion &q2, float theta, fl
|
|||
void
|
||||
CQuaternion::Multiply(const CQuaternion &q1, const CQuaternion &q2)
|
||||
{
|
||||
#ifndef DC_SH4
|
||||
x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
|
||||
y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
|
||||
z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
|
||||
w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
|
||||
#else
|
||||
quat_mult(*this, q1, q2);
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -51,9 +55,16 @@ CQuaternion::Get(RwV3d *axis, float *angle)
|
|||
*angle = Acos(w);
|
||||
float s = Sin(*angle);
|
||||
|
||||
#ifndef DC_SH4
|
||||
axis->x = x * (1.0f / s);
|
||||
axis->y = y * (1.0f / s);
|
||||
axis->z = z * (1.0f / s);
|
||||
#else
|
||||
float invS = dc::Invert<true, false>(s);
|
||||
axis->x = x * invS;
|
||||
axis->y = y * invS;
|
||||
axis->z = z * invS;
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -104,7 +115,7 @@ CQuaternion::Set(const RwMatrix &matrix)
|
|||
if (f >= 0.0f) {
|
||||
s = Sqrt(f + 1.0f);
|
||||
w = 0.5f * s;
|
||||
m = 0.5f / s;
|
||||
m = Div<true, false>(0.5f, s);
|
||||
x = (matrix.up.z - matrix.at.y) * m;
|
||||
y = (matrix.at.x - matrix.right.z) * m;
|
||||
z = (matrix.right.y - matrix.up.x) * m;
|
||||
|
@ -115,7 +126,7 @@ CQuaternion::Set(const RwMatrix &matrix)
|
|||
if (f >= 0.0f) {
|
||||
s = Sqrt(f + 1.0f);
|
||||
x = 0.5f * s;
|
||||
m = 0.5f / s;
|
||||
m = Div<true, false>(0.5f, s);
|
||||
y = (matrix.up.x + matrix.right.y) * m;
|
||||
z = (matrix.at.x + matrix.right.z) * m;
|
||||
w = (matrix.up.z - matrix.at.y) * m;
|
||||
|
@ -126,7 +137,7 @@ CQuaternion::Set(const RwMatrix &matrix)
|
|||
if (f >= 0.0f) {
|
||||
s = Sqrt(f + 1.0f);
|
||||
y = 0.5f * s;
|
||||
m = 0.5f / s;
|
||||
m = Div<true, false>(0.5f, s);
|
||||
w = (matrix.at.x - matrix.right.z) * m;
|
||||
x = (matrix.up.x - matrix.right.y) * m;
|
||||
z = (matrix.at.y + matrix.up.z) * m;
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "src/common_defines.h"
|
||||
#include "rwdc_common.h"
|
||||
|
||||
// TODO: actually implement this
|
||||
class CQuaternion
|
||||
|
@ -10,6 +11,10 @@ public:
|
|||
CQuaternion(void) {}
|
||||
CQuaternion(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}
|
||||
|
||||
operator quaternion_t *() { return reinterpret_cast<quaternion_t *>(this); }
|
||||
operator const quaternion_t *() const { return reinterpret_cast<const quaternion_t *>(this); }
|
||||
operator quaternion_t &() { return *reinterpret_cast<quaternion_t *>(this); }
|
||||
operator const quaternion_t &() const { return *reinterpret_cast<const quaternion_t *>(this); }
|
||||
float Magnitude(void) const {
|
||||
#ifndef DC_SH4
|
||||
return Sqrt(x*x + y*y + z*z + w*w);
|
||||
|
@ -57,10 +62,11 @@ public:
|
|||
}
|
||||
|
||||
const CQuaternion &operator/=(float right) {
|
||||
x /= right;
|
||||
y /= right;
|
||||
z /= right;
|
||||
w /= right;
|
||||
right = dc::Invert(right);
|
||||
x *= right;
|
||||
y *= right;
|
||||
z *= right;
|
||||
w *= right;
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
|
|
@ -50,9 +50,18 @@ Multiply3x3(const CMatrix &mat, const CVector &vec)
|
|||
CVector
|
||||
Multiply3x3(const CVector &vec, const CMatrix &mat)
|
||||
{
|
||||
#ifndef DC_SH4
|
||||
return CVector(mat.rx * vec.x + mat.ry * vec.y + mat.rz * vec.z,
|
||||
mat.fx * vec.x + mat.fy * vec.y + mat.fz * vec.z,
|
||||
mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z);
|
||||
#else
|
||||
CVector out;
|
||||
mat_load(mat);
|
||||
mat_transpose();
|
||||
mat_trans_vec3_nomod(vec.x, vec.y, vec.z,
|
||||
out.x, out.y, out.z);
|
||||
return out;
|
||||
#endif
|
||||
}
|
||||
|
||||
CVector
|
||||
|
|
|
@ -44,7 +44,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const CV
|
|||
sqc2 vf06,0x0(%0)\n\
|
||||
": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
|
||||
#elif defined(DC_SH4)
|
||||
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
|
||||
mat_load(mat);
|
||||
mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
|
||||
#else
|
||||
out = mat * in;
|
||||
|
@ -71,7 +71,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw
|
|||
sqc2 vf06,0x0(%0)\n\
|
||||
": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
|
||||
#elif defined(DC_SH4)
|
||||
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
|
||||
mat_load(mat);
|
||||
mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
|
||||
#else
|
||||
out = mat * in;
|
||||
|
@ -108,7 +108,7 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat,
|
|||
bnez %1,1b\n\
|
||||
": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory");
|
||||
#elif defined(DC_SH4)
|
||||
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
|
||||
mat_load(mat);
|
||||
while(n--) {
|
||||
mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z);
|
||||
in = reinterpret_cast<const RwV3d *>(reinterpret_cast<const uint8_t *>(in) + stride);
|
||||
|
|
|
@ -1,73 +1,6 @@
|
|||
#pragma once
|
||||
|
||||
#include "src/common_defines.h"
|
||||
#include "rwdc_common.h"
|
||||
|
||||
#include <tuple>
|
||||
#include <dc/matrix.h>
|
||||
|
||||
#ifdef DC_SH4
|
||||
|
||||
#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
|
||||
register float __x __asm__("fr12") = (x); \
|
||||
register float __y __asm__("fr13") = (y); \
|
||||
register float __z __asm__("fr14") = (z); \
|
||||
register float __w __asm__("fr15") = 1.0f; \
|
||||
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
|
||||
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
|
||||
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
|
||||
x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
|
||||
} while(false)
|
||||
|
||||
#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \
|
||||
register float __x __asm__("fr12") = (x); \
|
||||
register float __y __asm__("fr13") = (y); \
|
||||
register float __z __asm__("fr14") = (z); \
|
||||
register float __w __asm__("fr15") = 1.0f; \
|
||||
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
|
||||
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
|
||||
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
|
||||
w = __w; \
|
||||
} while(false)
|
||||
|
||||
__always_inline float Fmac(float a, float b, float c) {
|
||||
asm volatile ("fmac fr0, %[floatb], %[floatc]\n"
|
||||
: [floatc] "+f" (c) : "w" (a), [floatb] "f" (b) : );
|
||||
return c;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
|
||||
vector_t tmp = { x_, y_, z_, 1.0f }; \
|
||||
mat_transform(&tmp, &tmp, 1, 0); \
|
||||
x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; w2 = tmp.w; \
|
||||
} while(false)
|
||||
|
||||
#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \
|
||||
vector_t tmp = { x_, y_, z_, 1.0f }; \
|
||||
mat_transform(&tmp, &tmp, 1, 0); \
|
||||
w_ = tmp.w; \
|
||||
} while(false)
|
||||
|
||||
__always_inline float Fmac(float a, float b, float c) { return a * b + c; }
|
||||
|
||||
#endif
|
||||
|
||||
__always_inline float Sin(float x) { return __builtin_sinf(x); }
|
||||
__always_inline float Cos(float x) { return __builtin_cosf(x); }
|
||||
__always_inline auto SinCos(float x) { return std::pair { Sin(x), Cos(x) }; }
|
||||
__always_inline float Tan(float x) { return __builtin_tanf(x); }
|
||||
__always_inline float Abs(float x) { return __builtin_fabsf(x); }
|
||||
__always_inline float Sqrt(float x) { return __builtin_sqrtf(x); }
|
||||
__always_inline float RecipSqrt(float x) { return 1.0f / __builtin_sqrtf(x); }
|
||||
__always_inline float Asin(float x) { return __builtin_asinf(x); }
|
||||
__always_inline float Acos(float x) { return __builtin_acosf(x); }
|
||||
__always_inline float Atan(float x) { return __builtin_atanf(x); }
|
||||
__always_inline float Atan2(float y, float x) { return __builtin_atan2f(y, x); }
|
||||
__always_inline float RecipSqrt(float x, float y) { return x / __builtin_sqrtf(y); /*y = RecipSqrt(y); return x * y * y;*/ }
|
||||
__always_inline float Pow(float x, float y) { return __builtin_powf(x, y); }
|
||||
__always_inline float Floor(float x) { return __builtin_floorf(x); }
|
||||
__always_inline float Ceil(float x) { return __builtin_ceilf(x); }
|
||||
__always_inline float Invert(float x) { return (((x) < 0.0f)? -1.0f : 1.0f) * RecipSqrt((x) * (x)); }
|
||||
__always_inline float Div(float x, float y) { return x * Invert(y); }
|
||||
__always_inline float Lerp(float a, float b, float t) { return Fmac(t, (b - a), a); }
|
||||
using namespace dc;
|
||||
|
|
|
@ -85,8 +85,6 @@
|
|||
|
||||
#define rwVENDORID_ROCKSTAR 0x0253F2
|
||||
|
||||
#define Max(a,b) ((a) > (b) ? (a) : (b))
|
||||
#define Min(a,b) ((a) < (b) ? (a) : (b))
|
||||
|
||||
// Use this to add const that wasn't there in the original code
|
||||
#define Const const
|
||||
|
@ -303,12 +301,8 @@ extern int strncasecmp(const char *str1, const char *str2, size_t len);
|
|||
|
||||
extern wchar *AllocUnicode(const char*src);
|
||||
|
||||
#define Clamp(v, low, high) ((v)<(low) ? (low) : (v)>(high) ? (high) : (v))
|
||||
|
||||
#define Clamp2(v, center, radius) ((v) > (center) ? Min(v, center + radius) : Max(v, center - radius))
|
||||
|
||||
inline float sq(float x) { return x*x; }
|
||||
#define SQR(x) ((x) * (x))
|
||||
__always_inline auto sq(auto x) { return SQR(x); }
|
||||
|
||||
#ifdef __MWERKS__
|
||||
#define M_E 2.71828182845904523536 // e
|
||||
|
@ -326,7 +320,10 @@ inline float sq(float x) { return x*x; }
|
|||
#define M_SQRT1_2 0.707106781186547524401 // 1/sqrt(2)
|
||||
#endif
|
||||
|
||||
#define PI (float)M_PI
|
||||
#ifndef DC_SH4
|
||||
#define F_PI M_PI
|
||||
#endif
|
||||
#define PI (float)F_PI
|
||||
#define TWOPI (PI*2)
|
||||
#define HALFPI (PI/2)
|
||||
#define DEGTORAD(x) ((x) * PI / 180.0f)
|
||||
|
@ -380,7 +377,10 @@ __inline__ void TRACE(char *f, ...) { } // this is re3 only, and so the function
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef MASTER
|
||||
#ifdef assert
|
||||
#undef assert
|
||||
#endif
|
||||
#if !defined(MASTER)
|
||||
#define assert(_Expression) (void)( (!!(_Expression)) || (re3_assert(#_Expression, __FILE__, __LINE__, __FUNCTION__), 0) )
|
||||
#else
|
||||
#define assert(_Expression) (_Expression)
|
||||
|
@ -411,12 +411,7 @@ template<int s, int t> struct check_size {
|
|||
#endif
|
||||
#define BIT(num) (1<<(num))
|
||||
|
||||
#define ABS(a) (((a) < 0) ? (-(a)) : (a))
|
||||
#define norm(value, min, max) (((value) < (min)) ? 0 : (((value) > (max)) ? 1 : (((value) - (min)) / ((max) - (min)))))
|
||||
#define Lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) )
|
||||
|
||||
#define STRINGIFY(x) #x
|
||||
#define STR(x) STRINGIFY(x)
|
||||
#define CONCAT_(x,y) x##y
|
||||
#define CONCAT(x,y) CONCAT_(x,y)
|
||||
#define ABS(a) Abs(a)
|
||||
|
||||
// we use std::lerp now
|
||||
//#define lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) )
|
||||
|
|
|
@ -4,7 +4,7 @@ CMatrix::CMatrix(CMatrix const &m)
|
|||
{
|
||||
m_attachment = nil;
|
||||
m_hasRwMatrix = false;
|
||||
*this = m;
|
||||
mat_copy(*this, m);
|
||||
}
|
||||
|
||||
CMatrix::CMatrix(RwMatrix *matrix, bool owner)
|
||||
|
@ -75,7 +75,7 @@ CMatrix::UpdateRW(void)
|
|||
void
|
||||
CMatrix::operator=(CMatrix const &rhs)
|
||||
{
|
||||
memcpy(this, &rhs, sizeof(f));
|
||||
mat_copy(*this, rhs);
|
||||
if (m_attachment)
|
||||
UpdateRW();
|
||||
}
|
||||
|
@ -83,7 +83,7 @@ CMatrix::operator=(CMatrix const &rhs)
|
|||
void
|
||||
CMatrix::CopyOnlyMatrix(const CMatrix &other)
|
||||
{
|
||||
memcpy(this, &other, sizeof(f));
|
||||
mat_copy(*this, other);
|
||||
}
|
||||
|
||||
CMatrix &
|
||||
|
@ -358,12 +358,14 @@ CMatrix::RotateZ(float z)
|
|||
void
|
||||
CMatrix::Rotate(float x, float y, float z)
|
||||
{
|
||||
float cX = Cos(x);
|
||||
float sX = Sin(x);
|
||||
float cY = Cos(y);
|
||||
float sY = Sin(y);
|
||||
float cZ = Cos(z);
|
||||
float sZ = Sin(z);
|
||||
#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
|
||||
mat_load(reinterpret_cast<matrix_t *>(this));
|
||||
mat_rotate(x, y, z);
|
||||
mat_store(reinterpret_cast<matrix_t *>(this));
|
||||
#else
|
||||
auto [sX, cX] = SinCos(x);
|
||||
auto [sY, cY] = SinCos(y);
|
||||
auto [sZ, cZ] = SinCos(z);
|
||||
|
||||
float rx = this->rx;
|
||||
float ry = this->ry;
|
||||
|
@ -388,6 +390,20 @@ CMatrix::Rotate(float x, float y, float z)
|
|||
float z2 = sZ * sY - (cZ * sX) * cY;
|
||||
float z3 = cX * cY;
|
||||
|
||||
#if !defined(DC_TEXCONV) && !defined(DC_SIM)
|
||||
this->rx = fipr(x1, y1, z1, 0, rx, ry, rz, 0);
|
||||
this->ry = fipr(x2, y2, z2, 0, rx, ry, rz, 0);
|
||||
this->rz = fipr(x3, y3, z3, 0, rx, ry, rz, 0);
|
||||
this->fx = fipr(x1, y1, z1, 0, ux, uy, uz, 0);
|
||||
this->fy = fipr(x2, y2, z2, 0, ux, uy, uz, 0);
|
||||
this->fz = fipr(x3, y3, z3, 0, ux, uy, uz, 0);
|
||||
this->ux = fipr(x1, y1, z1, 0, ax, ay, az, 0);
|
||||
this->uy = fipr(x2, y2, z2, 0, ax, ay, az, 0);
|
||||
this->uz = fipr(x3, y3, z3, 0, ax, ay, az, 0);
|
||||
this->px = fipr(x1, y1, z1, 0, px, py, pz, 0);
|
||||
this->py = fipr(x2, y2, z2, 0, px, py, pz, 0);
|
||||
this->pz = fipr(x3, y3, z3, 0, px, py, pz, 0);
|
||||
#else
|
||||
this->rx = x1 * rx + y1 * ry + z1 * rz;
|
||||
this->ry = x2 * rx + y2 * ry + z2 * rz;
|
||||
this->rz = x3 * rx + y3 * ry + z3 * rz;
|
||||
|
@ -400,6 +416,8 @@ CMatrix::Rotate(float x, float y, float z)
|
|||
this->px = x1 * px + y1 * py + z1 * pz;
|
||||
this->py = x2 * px + y2 * py + z2 * pz;
|
||||
this->pz = x3 * px + y3 * py + z3 * pz;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
CMatrix &
|
||||
|
@ -429,9 +447,7 @@ operator*(const CMatrix &m1, const CMatrix &m2)
|
|||
// TODO: VU0 code
|
||||
CMatrix out;
|
||||
#if defined(RW_DC)
|
||||
mat_load(reinterpret_cast<const matrix_t *>(&m1));
|
||||
mat_apply(reinterpret_cast<const matrix_t *>(&m2));
|
||||
mat_store(reinterpret_cast<matrix_t *>(&out));
|
||||
mat_mult(out, m1, m2);
|
||||
#else
|
||||
out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz;
|
||||
out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz;
|
||||
|
|
|
@ -43,6 +43,8 @@ public:
|
|||
SetScale(scale);
|
||||
}
|
||||
~CMatrix(void);
|
||||
operator matrix_t *() { return reinterpret_cast<matrix_t *>(this); }
|
||||
operator const matrix_t *() const { return reinterpret_cast<const matrix_t *>(this); }
|
||||
void Attach(RwMatrix *matrix, bool owner = false);
|
||||
void AttachRW(RwMatrix *matrix, bool owner = false);
|
||||
void Detach(void);
|
||||
|
@ -75,20 +77,31 @@ public:
|
|||
void SetScale(float s);
|
||||
void Scale(float scale)
|
||||
{
|
||||
#if !defined(DC_SH4) || 1 /* TODO */
|
||||
for (int i = 0; i < 3; i++)
|
||||
for (int j = 0; j < 3; j++)
|
||||
f[i][j] *= scale;
|
||||
#else
|
||||
mat_load(*this);
|
||||
mat_scale(scale, scale, scale);
|
||||
mat_store(*this);
|
||||
#endif
|
||||
}
|
||||
void Scale(float sx, float sy, float sz)
|
||||
{
|
||||
#if !defined(DC_SH4) || 1 /* TODO */
|
||||
for (int i = 0; i < 3; i++){
|
||||
f[i][0] *= sx;
|
||||
f[i][1] *= sy;
|
||||
f[i][2] *= sz;
|
||||
f[i][0] *= sx;
|
||||
f[i][1] *= sy;
|
||||
f[i][2] *= sz;
|
||||
}
|
||||
#else
|
||||
mat_load(*this);
|
||||
mat_scale(sx, sy, sz);
|
||||
mat_store(*this);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
void SetRotateXOnly(float angle);
|
||||
void SetRotateYOnly(float angle);
|
||||
void SetRotateZOnly(float angle);
|
||||
|
@ -125,11 +138,22 @@ CMatrix Invert(const CMatrix &matrix);
|
|||
CMatrix operator*(const CMatrix &m1, const CMatrix &m2);
|
||||
inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec)
|
||||
{
|
||||
#ifndef DC_SH4
|
||||
CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz);
|
||||
return CVector(
|
||||
mat.rx * v.x + mat.ry * v.y + mat.rz * v.z,
|
||||
mat.fx * v.x + mat.fy * v.y + mat.fz * v.z,
|
||||
mat.ux * v.x + mat.uy * v.y + mat.uz * v.z);
|
||||
#else
|
||||
register float x asm(KOS_FPARG(0)) = vec.x - mat.px;
|
||||
register float y asm(KOS_FPARG(1)) = vec.y - mat.py;
|
||||
register float z asm(KOS_FPARG(2)) = vec.z - mat.pz;
|
||||
return CVector(
|
||||
fipr(x, y, z, 0.0f, mat.rx, mat.ry, mat.rz, 0.0f),
|
||||
fipr(x, y, z, 0.0f, mat.fx, mat.fy, mat.fz, 0.0f),
|
||||
fipr(x, y, z, 0.0f, mat.ux, mat.uy, mat.uz, 0.0f)
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -39,10 +39,14 @@ CQuaternion::Slerp(const CQuaternion &q1, const CQuaternion &q2, float theta, fl
|
|||
void
|
||||
CQuaternion::Multiply(const CQuaternion &q1, const CQuaternion &q2)
|
||||
{
|
||||
#ifndef DC_SH4
|
||||
x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
|
||||
y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
|
||||
z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
|
||||
w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
|
||||
#else
|
||||
quat_mult(*this, q1, q2);
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -51,9 +55,16 @@ CQuaternion::Get(RwV3d *axis, float *angle)
|
|||
*angle = Acos(w);
|
||||
float s = Sin(*angle);
|
||||
|
||||
#ifndef DC_SH4
|
||||
axis->x = x * (1.0f / s);
|
||||
axis->y = y * (1.0f / s);
|
||||
axis->z = z * (1.0f / s);
|
||||
#else
|
||||
float invS = dc::Invert<true, false>(s);
|
||||
axis->x = x * invS;
|
||||
axis->y = y * invS;
|
||||
axis->z = z * invS;
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -104,7 +115,7 @@ CQuaternion::Set(const RwMatrix &matrix)
|
|||
if (f >= 0.0f) {
|
||||
s = Sqrt(f + 1.0f);
|
||||
w = 0.5f * s;
|
||||
m = 0.5f / s;
|
||||
m = Div<true, false>(0.5f, s);
|
||||
x = (matrix.up.z - matrix.at.y) * m;
|
||||
y = (matrix.at.x - matrix.right.z) * m;
|
||||
z = (matrix.right.y - matrix.up.x) * m;
|
||||
|
@ -115,7 +126,7 @@ CQuaternion::Set(const RwMatrix &matrix)
|
|||
if (f >= 0.0f) {
|
||||
s = Sqrt(f + 1.0f);
|
||||
x = 0.5f * s;
|
||||
m = 0.5f / s;
|
||||
m = Div<true, false>(0.5f, s);
|
||||
y = (matrix.up.x + matrix.right.y) * m;
|
||||
z = (matrix.at.x + matrix.right.z) * m;
|
||||
w = (matrix.up.z - matrix.at.y) * m;
|
||||
|
@ -126,7 +137,7 @@ CQuaternion::Set(const RwMatrix &matrix)
|
|||
if (f >= 0.0f) {
|
||||
s = Sqrt(f + 1.0f);
|
||||
y = 0.5f * s;
|
||||
m = 0.5f / s;
|
||||
m = Div<true, false>(0.5f, s);
|
||||
w = (matrix.at.x - matrix.right.z) * m;
|
||||
x = (matrix.up.x - matrix.right.y) * m;
|
||||
z = (matrix.at.y + matrix.up.z) * m;
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "src/common_defines.h"
|
||||
#include "rwdc_common.h"
|
||||
|
||||
// TODO: actually implement this
|
||||
class CQuaternion
|
||||
|
@ -10,6 +11,10 @@ public:
|
|||
CQuaternion(void) {}
|
||||
CQuaternion(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}
|
||||
|
||||
operator quaternion_t *() { return reinterpret_cast<quaternion_t *>(this); }
|
||||
operator const quaternion_t *() const { return reinterpret_cast<const quaternion_t *>(this); }
|
||||
operator quaternion_t &() { return *reinterpret_cast<quaternion_t *>(this); }
|
||||
operator const quaternion_t &() const { return *reinterpret_cast<const quaternion_t *>(this); }
|
||||
float Magnitude(void) const {
|
||||
#ifndef DC_SH4
|
||||
return Sqrt(x*x + y*y + z*z + w*w);
|
||||
|
@ -57,10 +62,11 @@ public:
|
|||
}
|
||||
|
||||
const CQuaternion &operator/=(float right) {
|
||||
x /= right;
|
||||
y /= right;
|
||||
z /= right;
|
||||
w /= right;
|
||||
right = dc::Invert(right);
|
||||
x *= right;
|
||||
y *= right;
|
||||
z *= right;
|
||||
w *= right;
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
|
|
@ -50,9 +50,18 @@ Multiply3x3(const CMatrix &mat, const CVector &vec)
|
|||
CVector
|
||||
Multiply3x3(const CVector &vec, const CMatrix &mat)
|
||||
{
|
||||
#ifndef DC_SH4
|
||||
return CVector(mat.rx * vec.x + mat.ry * vec.y + mat.rz * vec.z,
|
||||
mat.fx * vec.x + mat.fy * vec.y + mat.fz * vec.z,
|
||||
mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z);
|
||||
#else
|
||||
CVector out;
|
||||
mat_load(mat);
|
||||
mat_transpose();
|
||||
mat_trans_vec3_nomod(vec.x, vec.y, vec.z,
|
||||
out.x, out.y, out.z);
|
||||
return out;
|
||||
#endif
|
||||
}
|
||||
|
||||
CVector
|
||||
|
|
|
@ -50,7 +50,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const CV
|
|||
sqc2 vf06,0x0(%0)\n\
|
||||
": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
|
||||
#elif defined(DC_SH4)
|
||||
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
|
||||
mat_load(mat);
|
||||
mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
|
||||
#else
|
||||
out = mat * in;
|
||||
|
@ -77,7 +77,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw
|
|||
sqc2 vf06,0x0(%0)\n\
|
||||
": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
|
||||
#elif defined(DC_SH4)
|
||||
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
|
||||
mat_load(mat);
|
||||
mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
|
||||
#else
|
||||
out = mat * in;
|
||||
|
@ -114,7 +114,7 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat,
|
|||
bnez %1,1b\n\
|
||||
": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory");
|
||||
#elif defined(DC_SH4)
|
||||
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
|
||||
mat_load(mat);
|
||||
while(n--) {
|
||||
mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z);
|
||||
in = reinterpret_cast<const RwV3d *>(reinterpret_cast<const uint8_t *>(in) + stride);
|
||||
|
@ -137,7 +137,6 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat,
|
|||
lqc2 vf03,0x10(%2)\n\
|
||||
lqc2 vf04,0x20(%2)\n\
|
||||
lqc2 vf05,0x30(%2)\n\
|
||||
lqc2 vf01,0x0(%3)\n\
|
||||
nop\n\
|
||||
1: vmulax.xyz ACC, vf02,vf01\n\
|
||||
vmadday.xyz ACC, vf03,vf01\n\
|
||||
|
|
|
@ -1,46 +1,6 @@
|
|||
#pragma once
|
||||
|
||||
#include "src/common_defines.h"
|
||||
#include "rwdc_common.h"
|
||||
|
||||
#include <dc/matrix.h>
|
||||
|
||||
#ifdef DC_SH4
|
||||
|
||||
#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
|
||||
register float __x __asm__("fr12") = (x); \
|
||||
register float __y __asm__("fr13") = (y); \
|
||||
register float __z __asm__("fr14") = (z); \
|
||||
register float __w __asm__("fr15") = 1.0f; \
|
||||
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
|
||||
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
|
||||
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
|
||||
x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
|
||||
} while(false)
|
||||
|
||||
#else
|
||||
|
||||
#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
|
||||
vector_t tmp = { x_, y_, z_, 1.0f }; \
|
||||
mat_transform(&tmp, &tmp, 1, 0); \
|
||||
x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; w2 = tmp.w; \
|
||||
} while(false)
|
||||
#endif
|
||||
|
||||
|
||||
// wrapper around float versions of functions
|
||||
// in gta they are in CMaths but that makes the code rather noisy
|
||||
|
||||
inline float Sin(float x) { return sinf(x); }
|
||||
inline float Asin(float x) { return asinf(x); }
|
||||
inline float Cos(float x) { return cosf(x); }
|
||||
inline float Acos(float x) { return acosf(x); }
|
||||
inline float Tan(float x) { return tanf(x); }
|
||||
inline float Atan(float x) { return atanf(x); }
|
||||
inline float Atan2(float y, float x) { return atan2f(y, x); }
|
||||
inline float Abs(float x) { return fabsf(x); }
|
||||
inline float Sqrt(float x) { return sqrtf(x); }
|
||||
inline float RecipSqrt(float x, float y) { return x/Sqrt(y); }
|
||||
inline float RecipSqrt(float x) { return RecipSqrt(1.0f, x); }
|
||||
inline float Pow(float x, float y) { return powf(x, y); }
|
||||
inline float Floor(float x) { return floorf(x); }
|
||||
inline float Ceil(float x) { return ceilf(x); }
|
||||
using namespace dc;
|
||||
|
|
2
vendor/dca3-kos
vendored
2
vendor/dca3-kos
vendored
|
@ -1 +1 @@
|
|||
Subproject commit 5f69d048aaf2548687c33afc7726d603a22e29b7
|
||||
Subproject commit 144eaeab940246d9c84c1f14b45d9e2847705c5e
|
8
vendor/librw/src/base.cpp
vendored
8
vendor/librw/src/base.cpp
vendored
|
@ -89,10 +89,18 @@ strncmp_ci(const char *s1, const char *s2, int n)
|
|||
Quat
|
||||
mult(const Quat &q, const Quat &p)
|
||||
{
|
||||
#ifndef DC_SH4
|
||||
return makeQuat(q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z,
|
||||
q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y,
|
||||
q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z,
|
||||
q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x);
|
||||
#else
|
||||
Quat o;
|
||||
dc::quat_mult(reinterpret_cast<dc::quaternion_t *>(&o),
|
||||
reinterpret_cast<const dc::quaternion_t&>(q),
|
||||
reinterpret_cast<const dc::quaternion_t&>(p));
|
||||
return o;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
|
7
vendor/librw/src/dc/rwdc.cpp
vendored
7
vendor/librw/src/dc/rwdc.cpp
vendored
|
@ -728,13 +728,6 @@ __always_inline void DCE_RenderSubmitVertexIM3D(float x, float y, float w,
|
|||
|
||||
/* END TA Submission Functions*/
|
||||
|
||||
|
||||
|
||||
|
||||
#if defined(DC_TEXCONV)
|
||||
void malloc_stats() { }
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define UNIMPL_LOG() printf("TODO: Implement %s @ %s:%d\n", __func__, __FILE__, __LINE__);
|
||||
#define UNIMPL_LOGV(fmt, ...) printf("TODO: Implement %s @ %s:%d " fmt "\n", __func__, __FILE__, __LINE__, __VA_ARGS__);
|
||||
|
|
739
vendor/librw/src/dc/rwdc_common.h
vendored
Normal file
739
vendor/librw/src/dc/rwdc_common.h
vendored
Normal file
|
@ -0,0 +1,739 @@
|
|||
#ifndef __RWDC_COMMON_H
|
||||
#define __RWDC_COMMON_H
|
||||
|
||||
#include <tuple>
|
||||
#include <limits>
|
||||
#include <cmath>
|
||||
#include <type_traits>
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
|
||||
#ifndef RW_DC
|
||||
# pragma warning(disable: 4244) // int to float
|
||||
# pragma warning(disable: 4800) // int to bool
|
||||
# pragma warning(disable: 4838) // narrowing conversion
|
||||
# pragma warning(disable: 4996) // POSIX names
|
||||
#else
|
||||
# if !defined(DC_TEXCONV) && !defined(DC_SIM)
|
||||
# include <kos.h>
|
||||
# define DC_SH4
|
||||
# define VIDEO_MODE_WIDTH static_cast<float>(vid_mode->width)
|
||||
# define VIDEO_MODE_HEIGHT static_cast<float>(vid_mode->height)
|
||||
# define memcpy4 memcpy
|
||||
# else
|
||||
# ifdef DC_TEXCONV
|
||||
# define malloc_stats()
|
||||
# endif
|
||||
# include <dc/matrix.h>
|
||||
# include <dc/pvr.h>
|
||||
# include <kos/dbglog.h>
|
||||
# define VIDEO_MODE_WIDTH 640.0f
|
||||
# define VIDEO_MODE_HEIGHT 480.0f
|
||||
# define memcpy4 memcpy
|
||||
# define dcache_pref_block(a) __builtin_prefetch(a)
|
||||
# define F_PI M_PI
|
||||
# ifndef __always_inline
|
||||
# define __always_inline inline
|
||||
# endif
|
||||
# endif
|
||||
# ifdef PVR_TXRFMT_STRIDE
|
||||
# undef PVR_TXRFMT_STRIDE
|
||||
# define PVR_TXRFMT_STRIDE (1 << 25)
|
||||
# endif
|
||||
static_assert(PVR_TXRFMT_STRIDE == (1 << 25),
|
||||
"PVR_TXRFMT_STRIDE is bugged in your KOS version");
|
||||
#endif
|
||||
|
||||
#define F_PI_2 (F_PI * 0.5f)
|
||||
#define __hot __attribute__((hot))
|
||||
#define __cold __attribute__((cold))
|
||||
|
||||
#define STRINGIFY(x) #x
|
||||
#define STR(x) STRINGIFY(x)
|
||||
#define CONCAT_(x,y) x##y
|
||||
#define CONCAT(x,y) CONCAT_(x,y)
|
||||
|
||||
namespace rw {
|
||||
class Matrix;
|
||||
}
|
||||
|
||||
namespace dc {
|
||||
|
||||
struct quaternion_t {
|
||||
float x, y, z, w;
|
||||
};
|
||||
|
||||
__always_inline __hot constexpr float Sin(float x) { return sinf(x); }
|
||||
__always_inline __hot constexpr float Cos(float x) { return cosf(x); }
|
||||
__always_inline __hot constexpr auto SinCos(float x) { return std::pair { Sin(x), Cos(x) }; }
|
||||
__always_inline __hot constexpr float Abs(float x) { return fabsf(x); }
|
||||
__always_inline __hot constexpr float Sqrt(float x) { return sqrtf(x); }
|
||||
__always_inline __hot constexpr float RecipSqrt(float x, float y) { return x / Sqrt(y); }
|
||||
__always_inline __hot constexpr float Pow(float x, float y) { return powf(x, y); }
|
||||
__always_inline __hot constexpr float Floor(float x) { return floorf(x); }
|
||||
__always_inline __hot constexpr float Ceil(float x) { return ceilf(x); }
|
||||
__always_inline __hot constexpr float Fmac(auto a, auto b, auto c) { return a * b + c; }
|
||||
__always_inline __hot constexpr float Lerp(float a, float b, float t) { return Fmac(t, (b - a), a); }
|
||||
__always_inline __hot constexpr auto Max(auto a, auto b) { return ((a > b)? a : b); }
|
||||
__always_inline __hot constexpr auto Min(auto a, auto b) { return ((a < b)? a : b); }
|
||||
|
||||
template<bool CHECK_ZERO=false>
|
||||
__always_inline __hot constexpr float RecipSqrt(float x) {
|
||||
if constexpr(CHECK_ZERO)
|
||||
if(x == 0.0f)
|
||||
return 0.0f;
|
||||
|
||||
return 1.0f / Sqrt(x);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
__always_inline __hot constexpr T Clamp(T v, auto low, auto high) {
|
||||
return std::clamp(v, static_cast<T>(low), static_cast<T>(high));
|
||||
}
|
||||
|
||||
__always_inline constexpr auto Clamp2(auto v, auto center, auto radius) {
|
||||
return (v > center) ? Min(v, center + radius) : Max(v, center - radius);
|
||||
}
|
||||
|
||||
template<bool FAST_APPROX=true, bool COPY_SIGN=true>
|
||||
__always_inline __hot constexpr float Invert(float x) {
|
||||
float value;
|
||||
|
||||
if(!std::is_constant_evaluated() && FAST_APPROX) {
|
||||
value = RecipSqrt(x * x);
|
||||
|
||||
if constexpr(COPY_SIGN)
|
||||
if(x < 0.0f)
|
||||
value = -value;
|
||||
|
||||
} else value = 1.0f / x;
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
template<bool FAST_APPROX=true, bool COPY_SIGN=true>
|
||||
__always_inline __hot constexpr float Div(float x, float y) {
|
||||
if(FAST_APPROX && !std::is_constant_evaluated())
|
||||
return x * Invert<true, COPY_SIGN>(y);
|
||||
else
|
||||
return x / y;
|
||||
}
|
||||
|
||||
template<bool FAST_APPROX=false, bool COPY_SIGN=true>
|
||||
__always_inline __hot constexpr auto Norm(auto value, auto min, auto max) {
|
||||
auto numerator = Clamp(value, min, max) - min;
|
||||
auto denominator = (max - min);
|
||||
|
||||
if(FAST_APPROX && !std::is_constant_evaluated())
|
||||
return Div<true, COPY_SIGN>(numerator, denominator);
|
||||
else
|
||||
return numerator / denominator;
|
||||
}
|
||||
|
||||
template<bool FAST_APPROX=false, bool FAST_DIV=false, bool DIV_COPY_SIGN=false>
|
||||
__always_inline __hot constexpr float Tan(float x) {
|
||||
if(!std::is_constant_evaluated() && FAST_APPROX) {
|
||||
constexpr float pisqby4 = 2.4674011002723397f;
|
||||
constexpr float adjpisqby4 = 2.471688400562703f;
|
||||
constexpr float adj1minus8bypisq = 0.189759681063053f;
|
||||
float xsq = x * x;
|
||||
|
||||
return x * Div<FAST_DIV, DIV_COPY_SIGN>(adjpisqby4 - adj1minus8bypisq * xsq,
|
||||
pisqby4 - xsq);
|
||||
} else
|
||||
return tanf(x);
|
||||
}
|
||||
|
||||
template<bool FAST_APPROX=false>
|
||||
__always_inline __hot constexpr float Atan(float x) {
|
||||
if(FAST_APPROX && !std::is_constant_evaluated()) {
|
||||
constexpr float a[3] = { //
|
||||
0.998418889819911f, -2.9993501171084700E-01f, 0.0869142852883849f};
|
||||
float xx = x * x;
|
||||
return ((a[2] * xx + a[1]) * xx + a[0]) * x;
|
||||
} else return atanf(x);
|
||||
}
|
||||
|
||||
template<bool FAST_APPROX=false>
|
||||
__hot constexpr float Atan2(float y, float x) {
|
||||
if(FAST_APPROX && !std::is_constant_evaluated()) {
|
||||
#if 0
|
||||
constexpr float halfpi_i754 = M_PI * 0.5f;
|
||||
constexpr float quarterpi_i754 = M_PI * 0.25f;
|
||||
// kludge to prevent 0/0 condition
|
||||
float abs_y = Abs(y) + std::numeric_limits<float>::epsilon();
|
||||
float absy_plus_absx = abs_y + Abs(x);
|
||||
float inv_absy_plus_absx = Invert<true, true>(absy_plus_absx);
|
||||
float angle = halfpi_i754 - copysignf(quarterpi_i754, x);
|
||||
float r = (x - copysignf(abs_y, x)) * inv_absy_plus_absx;
|
||||
angle += (0.1963f * r * r - 0.9817f) * r;
|
||||
return copysignf(angle, y);
|
||||
#else
|
||||
// Ensure input is in [-1, +1]
|
||||
bool swap = fabs(x) < fabs(y);
|
||||
float atan_input = (swap ? x : y) / (swap ? y : x);
|
||||
|
||||
// Approximate atan
|
||||
float res = Atan<true>(atan_input);
|
||||
|
||||
// If swapped, adjust atan output
|
||||
res = swap ? (atan_input >= 0.0f ? F_PI_2 : -F_PI_2) - res : res;
|
||||
// Adjust quadrants
|
||||
if (x >= 0.0f && y >= 0.0f) {} // 1st quadrant
|
||||
else if (x < 0.0f && y >= 0.0f) { res = F_PI + res; } // 2nd quadrant
|
||||
else if (x < 0.0f && y < 0.0f) { res = -F_PI + res; } // 3rd quadrant
|
||||
else if (x >= 0.0f && y < 0.0f) {} // 4th quadrant
|
||||
|
||||
// Store result
|
||||
return res;
|
||||
#endif
|
||||
} else return atan2f(y, x);
|
||||
}
|
||||
|
||||
template<bool FAST_APPROX=false>
|
||||
__always_inline __hot constexpr float Asin(float x) {
|
||||
if(FAST_APPROX && !std::is_constant_evaluated()) {
|
||||
Atan(Div<true, false>(x, Sqrt(1.0f-(x*x))));
|
||||
} else return asinf(x);
|
||||
}
|
||||
|
||||
template<bool FAST_APPROX=false>
|
||||
__always_inline __hot constexpr float Acos(float x) {
|
||||
if(FAST_APPROX && !std::is_constant_evaluated()) {
|
||||
return (-0.69813170079773212f * x * x - 0.87266462599716477f) * x + 1.5707963267948966f;
|
||||
} else return acosf(x);
|
||||
}
|
||||
|
||||
#ifdef RW_DC
|
||||
# ifdef DC_SH4
|
||||
|
||||
#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
|
||||
register float __x __asm__("fr12") = (x); \
|
||||
register float __y __asm__("fr13") = (y); \
|
||||
register float __z __asm__("fr14") = (z); \
|
||||
register float __w __asm__("fr15") = 1.0f; \
|
||||
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
|
||||
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
|
||||
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
|
||||
x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
|
||||
} while(false)
|
||||
|
||||
#define mat_trans_nodiv_nomod_zerow(x, y, z, x2, y2, z2, w2) do { \
|
||||
register float __x __asm__("fr12") = (x); \
|
||||
register float __y __asm__("fr13") = (y); \
|
||||
register float __z __asm__("fr14") = (z); \
|
||||
register float __w __asm__("fr15") = 0.0f; \
|
||||
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
|
||||
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
|
||||
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
|
||||
x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
|
||||
} while(false)
|
||||
|
||||
#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \
|
||||
register float __x __asm__("fr12") = (x); \
|
||||
register float __y __asm__("fr13") = (y); \
|
||||
register float __z __asm__("fr14") = (z); \
|
||||
register float __w __asm__("fr15") = 1.0f; \
|
||||
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
|
||||
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
|
||||
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
|
||||
w = __w; \
|
||||
} while(false)
|
||||
|
||||
#define mat_trans_vec3_nomod(x, y, z, x2, y2, z2) { \
|
||||
register float __x __asm__("fr12") = (x); \
|
||||
register float __y __asm__("fr13") = (y); \
|
||||
register float __z __asm__("fr14") = (z); \
|
||||
__asm__ __volatile__( \
|
||||
"fldi0 fr15\n" \
|
||||
"ftrv xmtrx, fv8\n" \
|
||||
: "=f" (__x), "=f" (__y), "=f" (__z) \
|
||||
: "0" (__x), "1" (__y), "2" (__z) \
|
||||
: "fr15" ); \
|
||||
x2 = __x; y2 = __y; z2 = __z; \
|
||||
}
|
||||
|
||||
// no declspec naked, so can't do rts / fschg. instead compiler pads with nop?
|
||||
inline __hot void mat_load_3x3(const matrix_t* mtx) {
|
||||
__asm__ __volatile__ (
|
||||
R"(
|
||||
fschg
|
||||
frchg
|
||||
|
||||
fmov @%[mtx]+, dr0
|
||||
fldi0 fr12
|
||||
|
||||
fmov @%[mtx]+, dr2
|
||||
fldi0 fr13
|
||||
|
||||
fmov @%[mtx]+, dr4
|
||||
fldi0 fr3
|
||||
|
||||
fmov @%[mtx]+, dr6
|
||||
fmov dr12, dr14
|
||||
|
||||
fmov @%[mtx]+, dr8
|
||||
fldi0 fr7
|
||||
|
||||
fmov @%[mtx]+, dr10
|
||||
fldi0 fr11
|
||||
|
||||
fschg
|
||||
frchg
|
||||
)"
|
||||
: [mtx] "+r" (mtx)
|
||||
);
|
||||
}
|
||||
|
||||
// sets pos.w to 1
|
||||
inline __hot void rw_mat_load_4x4(const rw::Matrix* mtx) {
|
||||
__asm__ __volatile__ (
|
||||
R"(
|
||||
fschg
|
||||
frchg
|
||||
fmov @%[mtx]+, dr0
|
||||
|
||||
fmov @%[mtx]+, dr2
|
||||
fmov @%[mtx]+, dr4
|
||||
fmov @%[mtx]+, dr6
|
||||
fmov @%[mtx]+, dr8
|
||||
fmov @%[mtx]+, dr10
|
||||
fmov @%[mtx]+, dr12
|
||||
fmov @%[mtx]+, dr14
|
||||
fldi1 fr15
|
||||
|
||||
fschg
|
||||
frchg
|
||||
)"
|
||||
: [mtx] "+r" (mtx)
|
||||
);
|
||||
}
|
||||
|
||||
__hot inline void mat_transpose(void) {
|
||||
asm volatile (
|
||||
"frchg\n\t" // fmov for singles only works on front bank
|
||||
// FR0, FR5, FR10, and FR15 are already in place
|
||||
// swap FR1 and FR4
|
||||
"flds FR1, FPUL\n\t"
|
||||
"fmov FR4, FR1\n\t"
|
||||
"fsts FPUL, FR4\n\t"
|
||||
// swap FR2 and FR8
|
||||
"flds FR2, FPUL\n\t"
|
||||
"fmov FR8, FR2\n\t"
|
||||
"fsts FPUL, FR8\n\t"
|
||||
// swap FR3 and FR12
|
||||
"flds FR3, FPUL\n\t"
|
||||
"fmov FR12, FR3\n\t"
|
||||
"fsts FPUL, FR12\n\t"
|
||||
// swap FR6 and FR9
|
||||
"flds FR6, FPUL\n\t"
|
||||
"fmov FR9, FR6\n\t"
|
||||
"fsts FPUL, FR9\n\t"
|
||||
// swap FR7 and FR13
|
||||
"flds FR7, FPUL\n\t"
|
||||
"fmov FR13, FR7\n\t"
|
||||
"fsts FPUL, FR13\n\t"
|
||||
// swap FR11 and FR14
|
||||
"flds FR11, FPUL\n\t"
|
||||
"fmov FR14, FR11\n\t"
|
||||
"fsts FPUL, FR14\n\t"
|
||||
// restore XMTRX to back bank
|
||||
"frchg\n"
|
||||
: // no outputs
|
||||
: // no inputs
|
||||
: "fpul" // clobbers
|
||||
);
|
||||
}
|
||||
|
||||
__attribute__((optimize("align-functions=32")))
|
||||
__hot inline void mat_copy(matrix_t *dst, const matrix_t *src) {
|
||||
asm volatile(R"(
|
||||
fschg
|
||||
|
||||
pref @%[dst]
|
||||
|
||||
fmov.d @%[src]+, xd0
|
||||
fmov.d @%[src]+, xd2
|
||||
fmov.d @%[src]+, xd4
|
||||
fmov.d @%[src]+, xd6
|
||||
|
||||
pref @%[src]
|
||||
|
||||
add #32, %[dst]
|
||||
|
||||
fmov.d xd6, @-%[dst]
|
||||
fmov.d xd4, @-%[dst]
|
||||
fmov.d xd2, @-%[dst]
|
||||
fmov.d xd0, @-%[dst]
|
||||
|
||||
add #32, %[dst]
|
||||
pref @%[dst]
|
||||
|
||||
fmov.d @%[src]+, xd0
|
||||
fmov.d @%[src]+, xd2
|
||||
fmov.d @%[src]+, xd4
|
||||
fmov.d @%[src]+, xd6
|
||||
|
||||
add #32, %[dst]
|
||||
|
||||
fmov.d xd6, @-%[dst]
|
||||
fmov.d xd4, @-%[dst]
|
||||
fmov.d xd2, @-%[dst]
|
||||
fmov.d xd0, @-%[dst]
|
||||
|
||||
fschg
|
||||
)": [dst] "+&r" (dst), [src] "+&r" (src)
|
||||
:
|
||||
: "memory");
|
||||
}
|
||||
|
||||
template<bool FAST_APPROX=true>
|
||||
__hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) {
|
||||
if(FAST_APPROX && !std::is_constant_evaluated()) {
|
||||
/*
|
||||
// reorder the coefficients so that q1 stays in constant order {x,y,z,w}
|
||||
// q2 then needs to be rotated after each inner product
|
||||
x = (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
|
||||
y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
|
||||
z = (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
|
||||
w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
|
||||
*/
|
||||
// keep q1 in fv4
|
||||
register float q1x __asm__ ("fr4") = (q1.x);
|
||||
register float q1y __asm__ ("fr5") = (q1.y);
|
||||
register float q1z __asm__ ("fr6") = (q1.z);
|
||||
register float q1w __asm__ ("fr7") = (q1.w);
|
||||
|
||||
// load q2 into fv8, use it to get the shuffled reorder into fv0
|
||||
register float q2x __asm__ ("fr8") = (q2.x);
|
||||
register float q2y __asm__ ("fr9") = (q2.y);
|
||||
register float q2z __asm__ ("fr10") = (q2.z);
|
||||
register float q2w __asm__ ("fr11") = (q2.w);
|
||||
|
||||
// temporary operand / result in fv0
|
||||
register float t1x __asm__ ("fr0");
|
||||
register float t1y __asm__ ("fr1");
|
||||
register float t1z __asm__ ("fr2");
|
||||
register float t1w __asm__ ("fr3");
|
||||
|
||||
// x = (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
|
||||
t1x = q2w;
|
||||
t1y = q2z;
|
||||
t1z = -q2y;
|
||||
t1w = q2w;
|
||||
__asm__ ("\n"
|
||||
" fipr fv4,fv0\n"
|
||||
: "+f" (t1w)
|
||||
: "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
|
||||
"f" (t1x), "f" (t1y), "f" (t1z)
|
||||
);
|
||||
// x = t1w; try to avoid the stall by not reading the fipr result immediately
|
||||
|
||||
// y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
|
||||
t1x = -q2z;
|
||||
t1y = q2w;
|
||||
t1z = q2x;
|
||||
__atomic_thread_fence(1);
|
||||
r->x = t1w; // get previous result
|
||||
t1w = q2y;
|
||||
__asm__ ("\n"
|
||||
" fipr fv4,fv0\n"
|
||||
: "+f" (t1w)
|
||||
: "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
|
||||
"f" (t1x), "f" (t1y), "f" (t1z)
|
||||
);
|
||||
//y = t1w;
|
||||
|
||||
// z = (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
|
||||
t1x = q2y;
|
||||
t1y = -q2x;
|
||||
t1z = q2w;
|
||||
__atomic_thread_fence(1);
|
||||
r->y = t1w; // get previous result
|
||||
t1w = q2z;
|
||||
__asm__ ("\n"
|
||||
" fipr fv4,fv0\n"
|
||||
: "+f" (t1w)
|
||||
: "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
|
||||
"f" (t1x), "f" (t1y), "f" (t1z)
|
||||
);
|
||||
//z = t1w;
|
||||
__atomic_thread_fence(1);
|
||||
|
||||
// w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
|
||||
q2x = -q2x;
|
||||
q2y = -q2y;
|
||||
q2z = -q2z;
|
||||
__asm__ ("\n"
|
||||
" fipr fv4,fv8\n"
|
||||
: "+f" (q2w)
|
||||
: "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
|
||||
"f" (q2x), "f" (q2y), "f" (q2z)
|
||||
);
|
||||
|
||||
__atomic_thread_fence(1);
|
||||
r->z = t1w;
|
||||
__atomic_thread_fence(1);
|
||||
r->w = q2w;
|
||||
} else {
|
||||
r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
|
||||
r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
|
||||
r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
|
||||
r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
|
||||
}
|
||||
}
|
||||
|
||||
__hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) {
|
||||
unsigned int prefetch_scratch;
|
||||
|
||||
asm volatile (
|
||||
"mov %[bmtrx], %[pref_scratch]\n\t" // (MT)
|
||||
"add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
|
||||
"fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE)
|
||||
"pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
|
||||
// back matrix
|
||||
"fmov.d @%[bmtrx]+, XD0\n\t" // (LS)
|
||||
"fmov.d @%[bmtrx]+, XD2\n\t"
|
||||
"fmov.d @%[bmtrx]+, XD4\n\t"
|
||||
"fmov.d @%[bmtrx]+, XD6\n\t"
|
||||
"pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait (LS)
|
||||
"fmov.d @%[bmtrx]+, XD8\n\t" // bmtrx prefetch should work for here
|
||||
"fmov.d @%[bmtrx]+, XD10\n\t"
|
||||
"fmov.d @%[bmtrx]+, XD12\n\t"
|
||||
"mov %[fmtrx], %[pref_scratch]\n\t" // (MT)
|
||||
"add #32, %[pref_scratch]\n\t" // store offset by 32 in r0 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
|
||||
"fmov.d @%[bmtrx], XD14\n\t"
|
||||
"pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
|
||||
// front matrix
|
||||
// interleave loads and matrix multiply 4x4
|
||||
"fmov.d @%[fmtrx]+, DR0\n\t"
|
||||
"fmov.d @%[fmtrx]+, DR2\n\t"
|
||||
"fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec
|
||||
"ftrv XMTRX, FV0\n\t" // (FE)
|
||||
|
||||
"fmov.d @%[fmtrx]+, DR6\n\t"
|
||||
"fmov.d @%[fmtrx]+, DR8\n\t"
|
||||
"ftrv XMTRX, FV4\n\t"
|
||||
|
||||
"fmov.d @%[fmtrx]+, DR10\n\t"
|
||||
"fmov.d @%[fmtrx]+, DR12\n\t"
|
||||
"ftrv XMTRX, FV8\n\t"
|
||||
|
||||
"fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles)
|
||||
"fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE)
|
||||
"ftrv XMTRX, FV12\n\t" // (FE)
|
||||
// Save output in XF regs
|
||||
"frchg\n"
|
||||
: [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
|
||||
: // no inputs
|
||||
: "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered)
|
||||
);
|
||||
}
|
||||
|
||||
__hot inline void mat_apply_rotate_x(float x) {
|
||||
x *= 10430.37835f;
|
||||
asm volatile(
|
||||
"ftrc %[a], fpul\n\t"
|
||||
"fsca fpul, dr4\n\t"
|
||||
"fldi0 fr8\n\t"
|
||||
"fldi0 fr11\n\t"
|
||||
"fmov fr5, fr10\n\t"
|
||||
"fmov fr4, fr9\n\t"
|
||||
"fneg fr9\n\t"
|
||||
"ftrv xmtrx, fv8\n\t"
|
||||
"fmov fr4, fr6\n\t"
|
||||
"fldi0 fr7\n\t"
|
||||
"fldi0 fr4\n\t"
|
||||
"ftrv xmtrx, fv4\n\t"
|
||||
"fschg\n\t"
|
||||
"fmov dr8, xd8\n\t"
|
||||
"fmov dr10, xd10\n\t"
|
||||
"fmov dr4, xd4\n\t"
|
||||
"fmov dr6, xd6\n\t"
|
||||
"fschg\n"
|
||||
:
|
||||
: [a] "f"(x)
|
||||
: "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
|
||||
}
|
||||
|
||||
__hot inline void mat_apply_rotate_y(float y) {
|
||||
y *= 10430.37835f;
|
||||
asm volatile(
|
||||
"ftrc %[a], fpul\n\t"
|
||||
"fsca fpul, dr6\n\t"
|
||||
"fldi0 fr9\n\t"
|
||||
"fldi0 fr11\n\t"
|
||||
"fmov fr6, fr8\n\t"
|
||||
"fmov fr7, fr10\n\t"
|
||||
"ftrv xmtrx, fv8\n\t"
|
||||
"fmov fr7, fr4\n\t"
|
||||
"fldi0 fr5\n\t"
|
||||
"fneg fr6\n\t"
|
||||
"fldi0 fr7\n\t"
|
||||
"ftrv xmtrx, fv4\n\t"
|
||||
"fschg\n\t"
|
||||
"fmov dr8, xd8\n\t"
|
||||
"fmov dr10, xd10\n\t"
|
||||
"fmov dr4, xd0\n\t"
|
||||
"fmov dr6, xd2\n\t"
|
||||
"fschg\n"
|
||||
:
|
||||
: [a] "f"(y)
|
||||
: "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
|
||||
}
|
||||
|
||||
__hot inline void mat_apply_rotate_z(float z) {
|
||||
z *= 10430.37835f;
|
||||
asm volatile(
|
||||
"ftrc %[a], fpul\n\t"
|
||||
"fsca fpul, dr8\n\t"
|
||||
"fldi0 fr10\n\t"
|
||||
"fldi0 fr11\n\t"
|
||||
"fmov fr8, fr5\n\t"
|
||||
"fneg fr8\n\t"
|
||||
"ftrv xmtrx, fv8\n\t"
|
||||
"fmov fr9, fr4\n\t"
|
||||
"fschg\n\t"
|
||||
"fmov dr10, dr6\n\t"
|
||||
"ftrv xmtrx, fv4\n\t"
|
||||
"fmov dr8, xd4\n\t"
|
||||
"fmov dr10, xd6\n\t"
|
||||
"fmov dr4, xd0\n\t"
|
||||
"fmov dr6, xd2\n\t"
|
||||
"fschg\n"
|
||||
:
|
||||
: [a] "f"(z)
|
||||
: "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
|
||||
}
|
||||
|
||||
# else
|
||||
# ifdef DC_TEXCONV
|
||||
# define mat_transform(a, b, c, d)
|
||||
# define mat_apply(a)
|
||||
# define mat_load(a)
|
||||
# define mat_store(a)
|
||||
# define mat_identity(a)
|
||||
# define pvr_fog_table_color(a,r,g,b)
|
||||
# define pvr_fog_table_linear(s,e)
|
||||
# endif
|
||||
|
||||
#define mat_trans_vec3_nomod(x_, y_, z_, x2, y2, z2) do { \
|
||||
vector_t tmp = { x_, y_, z_, 0.0f }; \
|
||||
mat_transform(&tmp, &tmp, 1, 0); \
|
||||
x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; \
|
||||
} while(false)
|
||||
|
||||
#define mat_trans_single3_nomod(x_, y_, z_, x2, y2, z2) do { \
|
||||
vector_t tmp = { x_, y_, z_, 1.0f }; \
|
||||
mat_transform(&tmp, &tmp, 1, 0); \
|
||||
z2 = 1.0f / tmp.w; \
|
||||
x2 = tmp.x * z2; \
|
||||
y2 = tmp.y * z2; \
|
||||
} while(false)
|
||||
|
||||
#define mat_trans_single3_nodiv_nomod(x_, y_, z_, x2, y2, z2) do { \
|
||||
vector_t tmp = { x_, y_, z_, 1.0f }; \
|
||||
mat_transform(&tmp, &tmp, 1, 0); \
|
||||
z2 = tmp.z; \
|
||||
x2 = tmp.x; \
|
||||
y2 = tmp.y; \
|
||||
} while(false)
|
||||
|
||||
#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
|
||||
vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \
|
||||
mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
|
||||
x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \
|
||||
} while(false)
|
||||
|
||||
#define mat_trans_nodiv_nomod_zerow(x_, y_, z_, x2, y2, z2, w2) do { \
|
||||
vector_t tmp1233123 = { x_, y_, z_, 0.0f }; \
|
||||
mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
|
||||
x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \
|
||||
} while(false)
|
||||
|
||||
#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \
|
||||
vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \
|
||||
mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
|
||||
w_ = tmp1233123.w; \
|
||||
} while(false)
|
||||
|
||||
inline void mat_load_3x3(const matrix_t* mtx) {
|
||||
memcpy(XMTRX, mtx, sizeof(matrix_t));
|
||||
XMTRX[0][3] = 0.0f;
|
||||
XMTRX[1][3] = 0.0f;
|
||||
XMTRX[2][3] = 0.0f;
|
||||
|
||||
XMTRX[3][0] = 0.0f;
|
||||
XMTRX[3][1] = 0.0f;
|
||||
XMTRX[3][2] = 0.0f;
|
||||
XMTRX[3][3] = 0.0f;
|
||||
}
|
||||
|
||||
inline void rw_mat_load_4x4(const rw::Matrix* mtx) {
|
||||
memcpy(XMTRX, mtx, sizeof(matrix_t));
|
||||
XMTRX[3][3] = 1.0f;
|
||||
}
|
||||
|
||||
inline void mat_transpose(void) {
|
||||
matrix_t tmp;
|
||||
|
||||
for(unsigned i = 0; i < 4; ++i)
|
||||
for(unsigned j = 0; j < 4; ++j)
|
||||
tmp[j][i] = XMTRX[i][j];
|
||||
|
||||
mat_load(&tmp);
|
||||
}
|
||||
|
||||
template<bool FAST_APPROX=true>
|
||||
__hot inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) {
|
||||
r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
|
||||
r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
|
||||
r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
|
||||
r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
|
||||
}
|
||||
|
||||
__hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) {
|
||||
mat_load(matrix1);
|
||||
mat_apply(matrix2);
|
||||
}
|
||||
|
||||
__always_inline __hot void mat_copy(matrix_t *dst, const matrix_t *src) {
|
||||
mat_load(src);
|
||||
mat_store(dst);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
__hot inline void mat_mult(matrix_t *out, const matrix_t* matrix1, const matrix_t* matrix2) {
|
||||
mat_load_apply(matrix1, matrix2);
|
||||
mat_store(out);
|
||||
}
|
||||
|
||||
#if 0
|
||||
template<std::size_t N>
|
||||
__always_inline __hot
|
||||
auto vec3_dot_prod(const vec3_t &v1, vec3_t const (&v2)[N]) {
|
||||
std::array<float, N> result;
|
||||
|
||||
#ifdef DC_SH4
|
||||
register float x asm(KOS_FPARG(0)) = v1.x;
|
||||
register float y asm(KOS_FPARG(1)) = v1.y;
|
||||
register float z asm(KOS_FPARG(2)) = v1.z;
|
||||
#else
|
||||
float x = v1.x;
|
||||
float y = v1.y;
|
||||
float z = v1.z;
|
||||
#endif
|
||||
|
||||
for(std::size_t v = 0; v < N; ++v)
|
||||
result[v] = fipr(x, y, z, 0.0f, v2[v].x, v2[v].y, v2[v].z, 0.0f);
|
||||
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
#endif /* RWDC_COMMON_H */
|
11
vendor/librw/src/rwbase.h
vendored
11
vendor/librw/src/rwbase.h
vendored
|
@ -1,6 +1,9 @@
|
|||
#pragma once
|
||||
|
||||
#include "common_defines.h"
|
||||
#ifdef RW_DC
|
||||
#include "rwdc_common.h"
|
||||
#endif
|
||||
|
||||
#ifndef RW_PS2
|
||||
#include <stdint.h>
|
||||
|
@ -322,7 +325,13 @@ inline float32 dot(const Quat &q, const Quat &p) {
|
|||
#endif
|
||||
}
|
||||
inline Quat scale(const Quat &q, float32 r) { return makeQuat(q.w*r, q.x*r, q.y*r, q.z*r); }
|
||||
inline float32 length(const Quat &q) { return sqrtf(q.w*q.w + q.x*q.x + q.y*q.y + q.z*q.z); }
|
||||
inline float32 length(const Quat &q) {
|
||||
#ifndef DC_SH4
|
||||
return sqrtf(q.w*q.w + q.x*q.x + q.y*q.y + q.z*q.z);
|
||||
#else
|
||||
return dc::Sqrt(fipr_magnitude_sqr(q.x, q.y, q.z, q.w));
|
||||
#endif
|
||||
}
|
||||
inline Quat normalize(const Quat &q) { return scale(q, 1.0f/length(q)); }
|
||||
inline Quat conj(const Quat &q) { return makeQuat(q.w, -q.x, -q.y, -q.z); }
|
||||
Quat mult(const Quat &q, const Quat &p);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue