Everything tested/working: liberty/miami + gainz!

This commit is contained in:
Falco Girgis 2025-04-26 10:52:36 -05:00
parent dc96ffc551
commit 2716147db4
16 changed files with 476 additions and 481 deletions

View file

@ -1619,7 +1619,7 @@ CCollision::ProcessLineOfSight(const CColLine &line,
point.point = matrix * point.point; point.point = matrix * point.point;
point.normal = Multiply3x3(matrix, point.normal); point.normal = Multiply3x3(matrix, point.normal);
#else #else
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrix))); dc::mat_load2(matrix);
mat_trans_single3_nodiv(point.point.x, mat_trans_single3_nodiv(point.point.x,
point.point.y, point.point.y,
point.point.z); point.point.z);
@ -1798,7 +1798,7 @@ CCollision::ProcessVerticalLine(const CColLine &line,
point.point = matrix * point.point; point.point = matrix * point.point;
point.normal = Multiply3x3(matrix, point.normal); point.normal = Multiply3x3(matrix, point.normal);
#else #else
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrix))); dc::mat_load2(matrix);
mat_trans_single3_nodiv(point.point.x, mat_trans_single3_nodiv(point.point.x,
point.point.y, point.point.y,
point.point.z); point.point.z);
@ -2173,8 +2173,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
#ifndef DC_SH4 #ifndef DC_SH4
matAB *= matrixA; matAB *= matrixA;
#else #else
mat_load(reinterpret_cast<const matrix_t*>(&matAB)); dc::mat_load_apply(matAB, matrixA);
mat_apply(reinterpret_cast<const matrix_t*>(&matrixA));
#endif #endif
CColSphere bsphereAB; // bounding sphere of A in B space CColSphere bsphereAB; // bounding sphere of A in B space
@ -2246,8 +2245,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
#ifndef DC_SH4 #ifndef DC_SH4
matBA *= matrixB; matBA *= matrixB;
#else #else
mat_load(reinterpret_cast<const matrix_t*>(&matBA)); dc::mat_load_apply(matBA, matrixB);
mat_apply(reinterpret_cast<const matrix_t*>(&matrixB));
#endif #endif
for(i = 0; i < modelB.numSpheres; i++){ for(i = 0; i < modelB.numSpheres; i++){
s.radius = modelB.spheres[i].radius; s.radius = modelB.spheres[i].radius;
@ -2309,7 +2307,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
} }
#ifdef DC_SH4 #ifdef DC_SH4
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrixB))); dc::mat_load2(matrixB);
#endif #endif
for(i = 0; i < numCollisions; i++){ for(i = 0; i < numCollisions; i++){
#ifndef DC_SH4 #ifndef DC_SH4

View file

@ -277,9 +277,9 @@ void
CMatrix::RotateX(float x) CMatrix::RotateX(float x)
{ {
#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
mat_load(reinterpret_cast<matrix_t *>(this)); dc::mat_load2(reinterpret_cast<matrix_t *>(this));
mat_rotate_x(x); mat_rotate_x(x);
mat_store(reinterpret_cast<matrix_t *>(this)); dc::mat_store2(reinterpret_cast<matrix_t *>(this));
#else #else
auto [s, c] = SinCos(x); auto [s, c] = SinCos(x);
@ -307,9 +307,9 @@ void
CMatrix::RotateY(float y) CMatrix::RotateY(float y)
{ {
#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
mat_load(reinterpret_cast<matrix_t *>(this)); dc::mat_load2(reinterpret_cast<matrix_t *>(this));
mat_rotate_y(y); mat_rotate_y(y);
mat_store(reinterpret_cast<matrix_t *>(this)); dc::mat_store2(reinterpret_cast<matrix_t *>(this));
#else #else
auto [s, c] = SinCos(y); auto [s, c] = SinCos(y);
@ -337,9 +337,9 @@ void
CMatrix::RotateZ(float z) CMatrix::RotateZ(float z)
{ {
#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
mat_load(reinterpret_cast<matrix_t *>(this)); dc::mat_load2(reinterpret_cast<matrix_t *>(this));
mat_rotate_z(z); mat_rotate_z(z);
mat_store(reinterpret_cast<matrix_t *>(this)); dc::mat_store2(reinterpret_cast<matrix_t *>(this));
#else #else
auto [s, c] = SinCos(z); auto [s, c] = SinCos(z);
@ -367,9 +367,9 @@ void
CMatrix::Rotate(float x, float y, float z) CMatrix::Rotate(float x, float y, float z)
{ {
#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
mat_load(reinterpret_cast<matrix_t *>(this)); dc::mat_load2(reinterpret_cast<matrix_t *>(this));
mat_rotate(x, y, z); mat_rotate(x, y, z);
mat_store(reinterpret_cast<matrix_t *>(this)); dc::mat_store2(reinterpret_cast<matrix_t *>(this));
#else #else
auto [sX, cX] = SinCos(x); auto [sX, cX] = SinCos(x);
auto [sY, cY] = SinCos(y); auto [sY, cY] = SinCos(y);

View file

@ -106,7 +106,7 @@ CMatrix Invert(const CMatrix &matrix);
CMatrix operator*(const CMatrix &m1, const CMatrix &m2); CMatrix operator*(const CMatrix &m1, const CMatrix &m2);
inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec) inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec)
{ {
#if 1 #ifndef DC_SH4
CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz); CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz);
return CVector( return CVector(
mat.rx * v.x + mat.ry * v.y + mat.rz * v.z, mat.rx * v.x + mat.ry * v.y + mat.rz * v.z,
@ -124,8 +124,6 @@ inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec)
#endif #endif
} }
class CCompressedMatrixNotAligned class CCompressedMatrixNotAligned
{ {
CVector m_vecPos; CVector m_vecPos;

View file

@ -34,7 +34,7 @@ Multiply3x3(const CMatrix &mat, const CVector &vec)
mat.rz * vec.x + mat.fz * vec.y + mat.uz * vec.z); mat.rz * vec.x + mat.fz * vec.y + mat.uz * vec.z);
#else #else
CVector out; CVector out;
mat_load(mat); dc::mat_load2(mat);
mat_trans_normal3_nomod(vec.x, vec.y, vec.z, mat_trans_normal3_nomod(vec.x, vec.y, vec.z,
out.x, out.y, out.z); out.x, out.y, out.z);
return out; return out;
@ -50,7 +50,7 @@ Multiply3x3(const CVector &vec, const CMatrix &mat)
mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z); mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z);
#else #else
CVector out; CVector out;
mat_load(mat); dc::mat_load2(mat);
mat_transpose(); mat_transpose();
mat_trans_normal3_nomod(vec.x, vec.y, vec.z, mat_trans_normal3_nomod(vec.x, vec.y, vec.z,
out.x, out.y, out.z); out.x, out.y, out.z);
@ -63,7 +63,7 @@ operator*(const CMatrix &mat, const CVector &vec)
{ {
#ifdef DC_SH4 #ifdef DC_SH4
CVector out; CVector out;
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat))); dc::mat_load2(mat);
mat_trans_single3_nodiv_nomod(vec.x, vec.y, vec.z, out.x, out.y, out.z); mat_trans_single3_nodiv_nomod(vec.x, vec.y, vec.z, out.x, out.y, out.z);
return out; return out;
#else #else

View file

@ -44,7 +44,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const CV
sqc2 vf06,0x0(%0)\n\ sqc2 vf06,0x0(%0)\n\
": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
#elif defined(DC_SH4) #elif defined(DC_SH4)
mat_load(mat); dc::mat_load2(mat);
mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
#else #else
out = mat * in; out = mat * in;
@ -71,7 +71,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw
sqc2 vf06,0x0(%0)\n\ sqc2 vf06,0x0(%0)\n\
": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
#elif defined(DC_SH4) #elif defined(DC_SH4)
mat_load(mat); dc::mat_load2(mat);
mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
#else #else
out = mat * in; out = mat * in;
@ -108,7 +108,7 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat,
bnez %1,1b\n\ bnez %1,1b\n\
": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory"); ": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory");
#elif defined(DC_SH4) #elif defined(DC_SH4)
mat_load(mat); dc::mat_load2(mat);
while(n--) { while(n--) {
mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z); mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z);
in = reinterpret_cast<const RwV3d *>(reinterpret_cast<const uint8_t *>(in) + stride); in = reinterpret_cast<const RwV3d *>(reinterpret_cast<const uint8_t *>(in) + stride);

View file

@ -1473,7 +1473,7 @@ CCollision::ProcessLineOfSight(const CColLine &line,
point.point = matrix * point.point; point.point = matrix * point.point;
point.normal = Multiply3x3(matrix, point.normal); point.normal = Multiply3x3(matrix, point.normal);
#else #else
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrix))); dc::mat_load2(matrix);
mat_trans_single3_nodiv(point.point.x, mat_trans_single3_nodiv(point.point.x,
point.point.y, point.point.y,
point.point.z); point.point.z);
@ -1653,7 +1653,7 @@ CCollision::ProcessVerticalLine(const CColLine &line,
point.point = matrix * point.point; point.point = matrix * point.point;
point.normal = Multiply3x3(matrix, point.normal); point.normal = Multiply3x3(matrix, point.normal);
#else #else
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrix))); dc::mat_load2(matrix);
mat_trans_single3_nodiv(point.point.x, mat_trans_single3_nodiv(point.point.x,
point.point.y, point.point.y,
point.point.z); point.point.z);
@ -2027,8 +2027,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
#ifndef DC_SH4 #ifndef DC_SH4
matAB *= matrixA; matAB *= matrixA;
#else #else
mat_load(reinterpret_cast<const matrix_t*>(&matAB)); dc::mat_load_apply(matAB, matrixA);
mat_apply(reinterpret_cast<const matrix_t*>(&matrixA));
#endif #endif
CColSphere bsphereAB; // bounding sphere of A in B space CColSphere bsphereAB; // bounding sphere of A in B space
@ -2099,8 +2098,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
#ifndef DC_SH4 #ifndef DC_SH4
matBA *= matrixB; matBA *= matrixB;
#else #else
mat_load(reinterpret_cast<const matrix_t*>(&matBA)); dc::mat_load_apply(matBA, matrixB);
mat_apply(reinterpret_cast<const matrix_t*>(&matrixB));
#endif #endif
for(i = 0; i < modelB.numSpheres; i++){ for(i = 0; i < modelB.numSpheres; i++){
s.radius = modelB.spheres[i].radius; s.radius = modelB.spheres[i].radius;
@ -2162,7 +2160,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
} }
#ifdef DC_SH4 #ifdef DC_SH4
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrixB))); dc::mat_load2(matrixB);
#endif #endif
for(i = 0; i < numCollisions; i++){ for(i = 0; i < numCollisions; i++){
#ifndef DC_SH4 #ifndef DC_SH4

View file

@ -359,9 +359,9 @@ void
CMatrix::Rotate(float x, float y, float z) CMatrix::Rotate(float x, float y, float z)
{ {
#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
mat_load(reinterpret_cast<matrix_t *>(this)); dc::mat_load2(reinterpret_cast<matrix_t *>(this));
mat_rotate(x, y, z); mat_rotate(x, y, z);
mat_store(reinterpret_cast<matrix_t *>(this)); dc::mat_store2(reinterpret_cast<matrix_t *>(this));
#else #else
auto [sX, cX] = SinCos(x); auto [sX, cX] = SinCos(x);
auto [sY, cY] = SinCos(y); auto [sY, cY] = SinCos(y);

View file

@ -77,29 +77,17 @@ public:
void SetScale(float s); void SetScale(float s);
void Scale(float scale) void Scale(float scale)
{ {
#if !defined(DC_SH4) || 1 /* TODO */
for (int i = 0; i < 3; i++) for (int i = 0; i < 3; i++)
for (int j = 0; j < 3; j++) for (int j = 0; j < 3; j++)
f[i][j] *= scale; f[i][j] *= scale;
#else
mat_load(*this);
mat_scale(scale, scale, scale);
mat_store(*this);
#endif
} }
void Scale(float sx, float sy, float sz) void Scale(float sx, float sy, float sz)
{ {
#if !defined(DC_SH4) || 1 /* TODO */
for (int i = 0; i < 3; i++){ for (int i = 0; i < 3; i++){
f[i][0] *= sx; f[i][0] *= sx;
f[i][1] *= sy; f[i][1] *= sy;
f[i][2] *= sz; f[i][2] *= sz;
} }
#else
mat_load(*this);
mat_scale(sx, sy, sz);
mat_store(*this);
#endif
} }
void SetRotateXOnly(float angle); void SetRotateXOnly(float angle);

View file

@ -62,7 +62,7 @@ public:
} }
const CQuaternion &operator/=(float right) { const CQuaternion &operator/=(float right) {
right = dc::Invert(right); right = dc::Invert<false>(right);
x *= right; x *= right;
y *= right; y *= right;
z *= right; z *= right;
@ -115,5 +115,6 @@ inline CQuaternion operator*(float left, const CQuaternion &right)
inline CQuaternion operator/(const CQuaternion &left, float right) inline CQuaternion operator/(const CQuaternion &left, float right)
{ {
return CQuaternion(left.x / right, left.y / right, left.z / right, left.w / right); right = Invert<false>(right);
return CQuaternion(left.x * right, left.y * right, left.z * right, left.w * right);
} }

View file

@ -27,23 +27,17 @@ CrossProduct(const CVector &v1, const CVector &v2)
CVector CVector
Multiply3x3(const CMatrix &mat, const CVector &vec) Multiply3x3(const CMatrix &mat, const CVector &vec)
{ {
#ifdef DC_SH4 #ifndef DC_SH4
register float __x __asm__("fr12") = vec.x;
register float __y __asm__("fr13") = vec.y;
register float __z __asm__("fr14") = vec.z;
register float __w __asm__("fr15") = 0.0f;
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
asm volatile( "ftrv xmtrx, fv12\n"
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w)
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) );
return { __x, __y, __z };
#else
// TODO: VU0 code // TODO: VU0 code
return CVector(mat.rx * vec.x + mat.fx * vec.y + mat.ux * vec.z, return CVector(mat.rx * vec.x + mat.fx * vec.y + mat.ux * vec.z,
mat.ry * vec.x + mat.fy * vec.y + mat.uy * vec.z, mat.ry * vec.x + mat.fy * vec.y + mat.uy * vec.z,
mat.rz * vec.x + mat.fz * vec.y + mat.uz * vec.z); mat.rz * vec.x + mat.fz * vec.y + mat.uz * vec.z);
#else
CVector out;
dc::mat_load2(mat);
mat_trans_normal3_nomod(vec.x, vec.y, vec.z,
out.x, out.y, out.z);
return out;
#endif #endif
} }
@ -56,9 +50,9 @@ Multiply3x3(const CVector &vec, const CMatrix &mat)
mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z); mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z);
#else #else
CVector out; CVector out;
mat_load(mat); dc::mat_load2(mat);
mat_transpose(); mat_transpose();
mat_trans_vec3_nomod(vec.x, vec.y, vec.z, mat_trans_normal3_nomod(vec.x, vec.y, vec.z,
out.x, out.y, out.z); out.x, out.y, out.z);
return out; return out;
#endif #endif
@ -69,7 +63,7 @@ operator*(const CMatrix &mat, const CVector &vec)
{ {
#ifdef DC_SH4 #ifdef DC_SH4
CVector out; CVector out;
mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat))); dc::mat_load2(mat);
mat_trans_single3_nodiv_nomod(vec.x, vec.y, vec.z, out.x, out.y, out.z); mat_trans_single3_nodiv_nomod(vec.x, vec.y, vec.z, out.x, out.y, out.z);
return out; return out;
#else #else

View file

@ -50,7 +50,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const CV
sqc2 vf06,0x0(%0)\n\ sqc2 vf06,0x0(%0)\n\
": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
#elif defined(DC_SH4) #elif defined(DC_SH4)
mat_load(mat); dc::mat_load2(mat);
mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
#else #else
out = mat * in; out = mat * in;
@ -77,7 +77,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw
sqc2 vf06,0x0(%0)\n\ sqc2 vf06,0x0(%0)\n\
": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory"); ": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
#elif defined(DC_SH4) #elif defined(DC_SH4)
mat_load(mat); dc::mat_load2(mat);
mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z); mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
#else #else
out = mat * in; out = mat * in;
@ -86,7 +86,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw
__always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat, const RwV3d *in, int stride) __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat, const RwV3d *in, int stride)
{ {
#ifdef GTA_PS3 #ifdef GTA_PS2
__asm__ __volatile__("\n\ __asm__ __volatile__("\n\
paddub $3,%4,$0\n\ paddub $3,%4,$0\n\
lqc2 vf02,0x0(%2)\n\ lqc2 vf02,0x0(%2)\n\
@ -114,7 +114,7 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat,
bnez %1,1b\n\ bnez %1,1b\n\
": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory"); ": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory");
#elif defined(DC_SH4) #elif defined(DC_SH4)
mat_load(mat); dc::mat_load2(mat);
while(n--) { while(n--) {
mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z); mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z);
in = reinterpret_cast<const RwV3d *>(reinterpret_cast<const uint8_t *>(in) + stride); in = reinterpret_cast<const RwV3d *>(reinterpret_cast<const uint8_t *>(in) + stride);

2
vendor/dca3-kos vendored

@ -1 +1 @@
Subproject commit 144eaeab940246d9c84c1f14b45d9e2847705c5e Subproject commit b1f6bbc960cd1f5cd05be8d9d0eca4556ca3a56a

View file

@ -97,8 +97,8 @@ mult(const Quat &q, const Quat &p)
#else #else
Quat o; Quat o;
dc::quat_mult(reinterpret_cast<dc::quaternion_t *>(&o), dc::quat_mult(reinterpret_cast<dc::quaternion_t *>(&o),
reinterpret_cast<const dc::quaternion_t&>(q), reinterpret_cast<const dc::quaternion_t &>(q),
reinterpret_cast<const dc::quaternion_t&>(p)); reinterpret_cast<const dc::quaternion_t &>(p));
return o; return o;
#endif #endif
} }

View file

@ -429,6 +429,7 @@ Camera::frustumTestSphere(const Sphere *s) const
{ {
int32 res = SPHEREINSIDE; int32 res = SPHEREINSIDE;
const FrustumPlane *p = this->frustumPlanes; const FrustumPlane *p = this->frustumPlanes;
#ifndef DC_SH4
for(int32 i = 0; i < 6; i++){ for(int32 i = 0; i < 6; i++){
float32 dist = dot(p->plane.normal, s->center) - p->plane.distance; float32 dist = dot(p->plane.normal, s->center) - p->plane.distance;
if(s->radius < dist) if(s->radius < dist)
@ -437,6 +438,114 @@ Camera::frustumTestSphere(const Sphere *s) const
res = SPHEREBOUNDARY; res = SPHEREBOUNDARY;
p++; p++;
} }
#else
__builtin_prefetch(p);
register float sx asm("fr0") = s->center.x;
register float sy asm("fr1") = s->center.y;
register float sz asm("fr2") = s->center.z;
register float sw asm("fr3") = -1.0f;
// far
register float px asm("fr4") = p->plane.normal.x;
register float py asm("fr5") = p->plane.normal.y;
register float pz asm("fr6") = p->plane.normal.z;
register float pw asm("fr7") = p->plane.distance;
asm volatile("fipr fv0, fv4"
: "+f" (pw)
: "f" (sx), "f" (sy), "f" (sz), "f" (sw),
"f" (px), "f" (py), "f" (pz));
if(s->radius < pw)
return SPHEREOUTSIDE;
else if(s->radius > -pw)
res = SPHEREBOUNDARY;
p++;
// near
px = p->plane.normal.x;
py = p->plane.normal.y;
pz = p->plane.normal.z;
pw = p->plane.distance;
asm volatile("fipr fv0, fv4"
: "+f" (pw)
: "f" (sx), "f" (sy), "f" (sz), "f" (sw),
"f" (px), "f" (py), "f" (pz));
if(s->radius < pw)
return SPHEREOUTSIDE;
if(s->radius > -pw)
res = SPHEREBOUNDARY_NEAR;
p++;
const float* base_ptr0 = &p[0].plane.normal.x;
const float* base_ptr1 = &p[1].plane.normal.x;
const float* base_ptr2 = &p[2].plane.normal.x;
const float* base_ptr3 = &p[3].plane.normal.x;
__builtin_prefetch(base_ptr0);
static_assert(offsetof (decltype (p[0].plane.normal), y)
-offsetof (decltype (p[0].plane.normal), x) == sizeof (float));
static_assert(offsetof (decltype (p[0].plane.normal), z)
-offsetof (decltype (p[0].plane.normal), y) == sizeof (float));
static_assert(offsetof (decltype (p[0].plane), distance)
-offsetof (decltype (p[0].plane.normal), z) == sizeof (float));
asm volatile (R"(
frchg
fmov.s @%0+,fr0
fmov.s @%1+,fr1
fmov.s @%2+,fr2
fmov.s @%3+,fr3
fmov.s @%0+,fr4
fmov.s @%1+,fr5
fmov.s @%2+,fr6
fmov.s @%3+,fr7
fmov.s @%0+,fr8
fmov.s @%1+,fr9
fmov.s @%2+,fr10
fmov.s @%3+,fr11
fmov.s @%0,fr12
fmov.s @%1,fr13
fmov.s @%2,fr14
fmov.s @%3,fr15
frchg
)" : "+&r" (base_ptr0), "+&r" (base_ptr1), "+&r" (base_ptr2), "+&r" (base_ptr3)
:
: );
float dists[4];
mat_trans_vec4_nodiv_nomod(sx, sy, sz, sw,
dists[0], dists[1], dists[2], dists[3]);
if(s->radius < dists[0])
return SPHEREOUTSIDE;
else if(s->radius > -dists[0])
res = SPHEREBOUNDARY;
if(s->radius < dists[1])
return SPHEREOUTSIDE;
else if(s->radius > -dists[1])
res = SPHEREBOUNDARY;
if(s->radius < dists[2])
return SPHEREOUTSIDE;
else if(s->radius > -dists[2])
res = SPHEREBOUNDARY;
if(s->radius < dists[3])
return SPHEREOUTSIDE;
else if(s->radius > -dists[3])
res = SPHEREBOUNDARY;
#endif
return res; return res;
} }
@ -445,7 +554,7 @@ Camera::frustumTestSphereNear(const Sphere *s) const
{ {
int32 res = SPHEREINSIDE; int32 res = SPHEREINSIDE;
const FrustumPlane *p = this->frustumPlanes; const FrustumPlane *p = this->frustumPlanes;
#ifndef DC_SH4
// far // far
float32 dist = dot(p->plane.normal, s->center) - p->plane.distance; float32 dist = dot(p->plane.normal, s->center) - p->plane.distance;
if(s->radius < dist) if(s->radius < dist)
@ -481,6 +590,101 @@ Camera::frustumTestSphereNear(const Sphere *s) const
return SPHEREOUTSIDE; return SPHEREOUTSIDE;
p++; p++;
#else
__builtin_prefetch(p);
register float sx asm("fr0") = s->center.x;
register float sy asm("fr1") = s->center.y;
register float sz asm("fr2") = s->center.z;
register float sw asm("fr3") = -1.0f;
// far
register float px asm("fr4") = p->plane.normal.x;
register float py asm("fr5") = p->plane.normal.y;
register float pz asm("fr6") = p->plane.normal.z;
register float pw asm("fr7") = p->plane.distance;
asm volatile("fipr fv0, fv4"
: "+f" (pw)
: "f" (sx), "f" (sy), "f" (sz), "f" (sw),
"f" (px), "f" (py), "f" (pz));
if(s->radius < pw)
return SPHEREOUTSIDE;
p++;
// near
px = p->plane.normal.x;
py = p->plane.normal.y;
pz = p->plane.normal.z;
pw = p->plane.distance;
asm volatile("fipr fv0, fv4"
: "+f" (pw)
: "f" (sx), "f" (sy), "f" (sz), "f" (sw),
"f" (px), "f" (py), "f" (pz));
if(s->radius < pw)
return SPHEREOUTSIDE;
if(s->radius > -pw)
res = SPHEREBOUNDARY_NEAR;
p++;
const float* base_ptr0 = &p[0].plane.normal.x;
const float* base_ptr1 = &p[1].plane.normal.x;
const float* base_ptr2 = &p[2].plane.normal.x;
const float* base_ptr3 = &p[3].plane.normal.x;
__builtin_prefetch(base_ptr0);
static_assert(offsetof (decltype (p[0].plane.normal), y)
-offsetof (decltype (p[0].plane.normal), x) == sizeof (float));
static_assert(offsetof (decltype (p[0].plane.normal), z)
-offsetof (decltype (p[0].plane.normal), y) == sizeof (float));
static_assert(offsetof (decltype (p[0].plane), distance)
-offsetof (decltype (p[0].plane.normal), z) == sizeof (float));
asm volatile (R"(
frchg
fmov.s @%0+,fr0
fmov.s @%1+,fr1
fmov.s @%2+,fr2
fmov.s @%3+,fr3
fmov.s @%0+,fr4
fmov.s @%1+,fr5
fmov.s @%2+,fr6
fmov.s @%3+,fr7
fmov.s @%0+,fr8
fmov.s @%1+,fr9
fmov.s @%2+,fr10
fmov.s @%3+,fr11
fmov.s @%0,fr12
fmov.s @%1,fr13
fmov.s @%2,fr14
fmov.s @%3,fr15
frchg
)" : "+&r" (base_ptr0), "+&r" (base_ptr1), "+&r" (base_ptr2), "+&r" (base_ptr3)
:
: );
float dists[4];
mat_trans_vec4_nodiv_nomod(sx, sy, sz, sw,
dists[0], dists[1], dists[2], dists[3]);
if(s->radius < dists[0])
return SPHEREOUTSIDE;
else if(s->radius < dists[1])
return SPHEREOUTSIDE;
else if(s->radius < dists[2])
return SPHEREOUTSIDE;
else if(s->radius < dists[3])
return SPHEREOUTSIDE;
#endif
return res; return res;
} }

View file

@ -37,8 +37,6 @@ extern const char* currentFile;
#include <functional> #include <functional>
#include <fstream> #include <fstream>
#define ARRAY_SIZE(array) (sizeof(array) / sizeof(array[0]))
#define errorf(...) dbglog(DBG_CRITICAL, __VA_ARGS__) #define errorf(...) dbglog(DBG_CRITICAL, __VA_ARGS__)
#define logf(...) // printf(__VA_ARGS__) #define logf(...) // printf(__VA_ARGS__)
bool re3RemoveLeastUsedModel(); bool re3RemoveLeastUsedModel();
@ -47,21 +45,13 @@ void* re3StreamingAlloc(size_t size);
// #include "rwdcimpl.h" // #include "rwdcimpl.h"
#include <dc/pvr.h>
#include <dc/matrix.h>
#include "alloc.h" #include "alloc.h"
#undef PVR_TXRFMT_STRIDE using namespace dc;
#define PVR_TXRFMT_STRIDE (1 << 25)
static_assert(PVR_TXRFMT_STRIDE == (1 << 25), "PVR_TXRFMT_STRIDE is bugged in your KOS version");
// TODO: probably needs a better place to be // TODO: probably needs a better place to be
bool doEnvironmentMaps = true; bool doEnvironmentMaps = true;
#define fclamp0_1(n) ((n) > 1.0f ? 1.0f : n < 0.0f ? 0.0f : n)
#define fclamp1(n) ((n) > 1.0f ? 1.0f : n)
struct alignas(32) pvr_vertex16_t { struct alignas(32) pvr_vertex16_t {
uint32_t flags; /**< \brief TA command (vertex flags) */ uint32_t flags; /**< \brief TA command (vertex flags) */
float x; /**< \brief X coordinate */ float x; /**< \brief X coordinate */
@ -170,177 +160,10 @@ struct alignas(32) pvr_vertex32_ut {
static_assert(sizeof(pvr_vertex16_t) == 32, "pvr_vertex16_t size mismatch"); static_assert(sizeof(pvr_vertex16_t) == 32, "pvr_vertex16_t size mismatch");
static_assert(alignof(pvr_vertex16_t) == 32, "pvr_vertex16_t alignof mismatch"); static_assert(alignof(pvr_vertex16_t) == 32, "pvr_vertex16_t alignof mismatch");
#define MATH_Fast_Invert(x) ({ (((x) < 0.0f)? -1.0f : 1.0f) * frsqrt((x) * (x)); })
static pvr_dr_state_t drState; static pvr_dr_state_t drState;
#include <kos/dbglog.h>
float VIDEO_MODE_SCALE_X; float VIDEO_MODE_SCALE_X;
#if !defined(DC_TEXCONV) && !defined(DC_SIM)
#include <kos.h>
#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
register float __x __asm__("fr12") = (x); \
register float __y __asm__("fr13") = (y); \
register float __z __asm__("fr14") = (z); \
register float __w __asm__("fr15") = 1.0f; \
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
} while(false)
#define mat_trans_nodiv_nomod_zerow(x, y, z, x2, y2, z2, w2) do { \
register float __x __asm__("fr12") = (x); \
register float __y __asm__("fr13") = (y); \
register float __z __asm__("fr14") = (z); \
register float __w __asm__("fr15") = 0.0f; \
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
} while(false)
#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \
register float __x __asm__("fr12") = (x); \
register float __y __asm__("fr13") = (y); \
register float __z __asm__("fr14") = (z); \
register float __w __asm__("fr15") = 1.0f; \
__asm__ __volatile__( "ftrv xmtrx, fv12\n" \
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
: "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
w = __w; \
} while(false)
// no declspec naked, so can't do rts / fschg. instead compiler pads with nop?
inline void rw_mat_load_3x3(const rw::Matrix* mtx) {
__asm__ __volatile__ (
R"(
fschg
frchg
fmov @%[mtx]+, dr0
fldi0 fr12
fldi0 fr13
fmov @%[mtx]+, dr2
fmov @%[mtx]+, dr4
fmov @%[mtx]+, dr6
fmov @%[mtx]+, dr8
fmov @%[mtx]+, dr10
fldi0 fr3
fldi0 fr7
fldi0 fr11
fmov dr12, dr14
fschg
frchg
)"
: [mtx] "+r" (mtx)
);
}
// sets pos.w to 1
inline void rw_mat_load_4x4(const rw::Matrix* mtx) {
__asm__ __volatile__ (
R"(
fschg
frchg
fmov @%[mtx]+, dr0
fmov @%[mtx]+, dr2
fmov @%[mtx]+, dr4
fmov @%[mtx]+, dr6
fmov @%[mtx]+, dr8
fmov @%[mtx]+, dr10
fmov @%[mtx]+, dr12
fmov @%[mtx]+, dr14
fldi1 fr15
fschg
frchg
)"
: [mtx] "+r" (mtx)
);
}
#else
extern matrix_t XMTRX;
void rw_mat_load_3x3(rw::Matrix* mtx) {
memcpy(XMTRX, mtx, sizeof(matrix_t));
XMTRX[0][3] = 0.0f;
XMTRX[1][3] = 0.0f;
XMTRX[2][3] = 0.0f;
XMTRX[3][0] = 0.0f;
XMTRX[3][1] = 0.0f;
XMTRX[3][2] = 0.0f;
XMTRX[3][3] = 0.0f;
}
void rw_mat_load_4x4(rw::Matrix* mtx) {
memcpy(XMTRX, mtx, sizeof(matrix_t));
XMTRX[3][3] = 1.0f;
}
#include <dc/matrix.h>
#define frsqrt(a) (1.0f/sqrt(a))
#define dcache_pref_block(a) __builtin_prefetch(a)
#ifndef __always_inline
#define __always_inline __attribute__((always_inline)) inline
#endif
#ifdef DC_TEXCONV
#define mat_transform(a, b, c, d)
#define mat_apply(a)
#define mat_load(a)
#define mat_store(a)
#define mat_identity(a)
#define pvr_fog_table_color(a,r,g,b)
#define pvr_fog_table_linear(s,e)
#define pvr_fog_table_exp(d)
#define pvr_fog_table_custom(d)
#endif
#define mat_trans_single3_nomod(x_, y_, z_, x2, y2, z2) do { \
vector_t tmp = { x_, y_, z_, 1.0f }; \
mat_transform(&tmp, &tmp, 1, 0); \
z2 = 1.0f / tmp.w; \
x2 = tmp.x * z2; \
y2 = tmp.y * z2; \
} while(false)
#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \
mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \
} while(false)
#define mat_trans_nodiv_nomod_zerow(x_, y_, z_, x2, y2, z2, w2) do { \
vector_t tmp1233123 = { x_, y_, z_, 0.0f }; \
mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \
} while(false)
#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \
vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \
mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
w_ = tmp1233123.w; \
} while(false)
#define memcpy4 memcpy
// END STUBS
#endif
static pvr_ptr_t fake_tex; static pvr_ptr_t fake_tex;
alignas(4) static const uint16_t fake_tex_data[] = { alignas(4) static const uint16_t fake_tex_data[] = {
@ -551,9 +374,9 @@ void DCE_MatrixViewport(float x, float y, float width, float height) {
void DCE_InitMatrices() { void DCE_InitMatrices() {
// Setup the screenview matrix. Only need to do once since this matrix does not need to change for single player viewpoint. // Setup the screenview matrix. Only need to do once since this matrix does not need to change for single player viewpoint.
mat_identity(); mat_identity2();
mat_store(&DCE_MAT_SCREENVIEW); mat_store2(&DCE_MAT_SCREENVIEW);
} }
} }
@ -684,7 +507,7 @@ struct atomic_context_t {
__always_inline void DCE_RenderSubmitVertex(const pvr_vertex_t *v, uint32_t flags) { __always_inline void DCE_RenderSubmitVertex(const pvr_vertex_t *v, uint32_t flags) {
auto *sq = reinterpret_cast<uint32_t *>(pvr_dr_target(drState)); auto *sq = reinterpret_cast<uint32_t *>(pvr_dr_target(drState));
auto *src = reinterpret_cast<const uint32_t *>(v); auto *src = reinterpret_cast<const uint32_t *>(v);
float sz = MATH_Fast_Invert(v->z); float sz = Invert<true, false>(v->z);
float sx = v->x * sz; float sx = v->x * sz;
float sy = v->y * sz; float sy = v->y * sz;
@ -711,7 +534,7 @@ __always_inline void DCE_RenderSubmitVertexIM3D(float x, float y, float w,
{ {
auto *sq = reinterpret_cast<uint32_t *>(pvr_dr_target(drState)); auto *sq = reinterpret_cast<uint32_t *>(pvr_dr_target(drState));
auto *uv32 = reinterpret_cast<const uint32_t *>(uv); auto *uv32 = reinterpret_cast<const uint32_t *>(uv);
float sz = MATH_Fast_Invert(w); float sz = Invert<true, false>(w);
float sx = x * sz; float sx = x * sz;
float sy = y * sz; float sy = y * sz;
@ -801,9 +624,8 @@ void beginUpdate(Camera* cam) {
DCE_MatrixViewport(0, 0, cam->frameBuffer->width * VIDEO_MODE_SCALE_X, cam->frameBuffer->height); DCE_MatrixViewport(0, 0, cam->frameBuffer->width * VIDEO_MODE_SCALE_X, cam->frameBuffer->height);
mat_load((matrix_t*)&DCE_MAT_SCREENVIEW); mat_load_apply((matrix_t*)&DCE_MAT_SCREENVIEW, (matrix_t*)&cam->devProj);
mat_apply((matrix_t*)&cam->devProj); mat_store2((matrix_t*)&cam->devProjScreen);
mat_store((matrix_t*)&cam->devProjScreen);
} }
@ -1831,7 +1653,7 @@ void im2DRenderPrimitive(PrimitiveType primType, void *vertices, int32_t numVert
pvrVert->flags = flags; pvrVert->flags = flags;
pvrVert->x = gtaVert.x * VIDEO_MODE_SCALE_X; pvrVert->x = gtaVert.x * VIDEO_MODE_SCALE_X;
pvrVert->y = gtaVert.y; pvrVert->y = gtaVert.y;
pvrVert->z = MATH_Fast_Invert(gtaVert.w); // this is perfect for almost every case... pvrVert->z = Invert<true, false>(gtaVert.w); // this is perfect for almost every case...
pvrVert->u = gtaVert.u; pvrVert->u = gtaVert.u;
pvrVert->v = gtaVert.v; pvrVert->v = gtaVert.v;
pvrVert->argb = (gtaVert.a << 24) | pvrVert->argb = (gtaVert.a << 24) |
@ -1952,7 +1774,7 @@ void im2DRenderIndexedPrimitive(PrimitiveType primType, void *vertices, int32 nu
pvrVert->flags = flags; pvrVert->flags = flags;
pvrVert->x = gtaVert.x * VIDEO_MODE_SCALE_X; pvrVert->x = gtaVert.x * VIDEO_MODE_SCALE_X;
pvrVert->y = gtaVert.y; pvrVert->y = gtaVert.y;
pvrVert->z = MATH_Fast_Invert(gtaVert.w); // this is perfect for almost every case... pvrVert->z = Invert<true, false>(gtaVert.w); // this is perfect for almost every case...
pvrVert->u = gtaVert.u; pvrVert->u = gtaVert.u;
pvrVert->v = gtaVert.v; pvrVert->v = gtaVert.v;
pvrVert->argb = (gtaVert.a << 24) | pvrVert->argb = (gtaVert.a << 24) |
@ -2015,8 +1837,8 @@ void im3DTransform(void *vertices, int32 numVertices, Matrix *worldMat, uint32 f
rw::RawMatrix::mult(&worldview, &world, &cam->devView); rw::RawMatrix::mult(&worldview, &world, &cam->devView);
rw::RawMatrix::mult(&proj, &worldview, &cam->devProj); rw::RawMatrix::mult(&proj, &worldview, &cam->devProj);
rw::RawMatrix::mult(&mtx, &proj, (RawMatrix*)&DCE_MAT_SCREENVIEW); rw::RawMatrix::mult(&mtx, &proj, (RawMatrix*)&DCE_MAT_SCREENVIEW);
// mat_load(&DCE_MAT_SCREENVIEW); // ~11 cycles. // mat_load2(&DCE_MAT_SCREENVIEW); // ~11 cycles.
mat_load(( matrix_t*)&mtx.right); // Number of cycles: ~32. mat_load2(( matrix_t*)&mtx.right); // Number of cycles: ~32.
if (im3dVertices) { if (im3dVertices) {
free(im3dVertices); free(im3dVertices);
} }
@ -2110,7 +1932,7 @@ void im3DRenderIndexedPrimitive(PrimitiveType primType,
// assuming near plane is 0.0f // assuming near plane is 0.0f
// gv1 is visible (posi), and gv2 is behind the plane (negative) // gv1 is visible (posi), and gv2 is behind the plane (negative)
float t = (1.0f - gv1.position.z) * MATH_Fast_Invert(gv2.position.z - gv1.position.z); float t = (1.0f - gv1.position.z) * Invert<true, true>(gv2.position.z - gv1.position.z);
pvr_vertex_t pvrVert; pvr_vertex_t pvrVert;
@ -3388,13 +3210,13 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
auto skinningWeightData = (uint8_t*)skinWeights; auto skinningWeightData = (uint8_t*)skinWeights;
if (!matrix0Identity) { if (!matrix0Identity) {
rw_mat_load_4x4(&skinMatrices[0]); mat_load_4x4(&skinMatrices[0]);
if (small_xyz) { if (small_xyz) {
mat_apply(&DCE_MESHLET_MAT_DECODE); mat_apply(&DCE_MESHLET_MAT_DECODE);
} }
} else { } else {
if (small_xyz) { if (small_xyz) {
mat_load(&DCE_MESHLET_MAT_DECODE); mat_load2(&DCE_MESHLET_MAT_DECODE);
} }
} }
@ -3458,7 +3280,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
break; break;
} }
rw_mat_load_4x4(currentMatrix); mat_load_4x4(currentMatrix);
if (small_xyz){ if (small_xyz){
mat_apply(&DCE_MESHLET_MAT_DECODE); mat_apply(&DCE_MESHLET_MAT_DECODE);
} }
@ -3491,7 +3313,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
auto skinningWeightData = (uint8_t*)skinWeights; auto skinningWeightData = (uint8_t*)skinWeights;
if (!matrix0Identity) { if (!matrix0Identity) {
rw_mat_load_3x3(&skinMatrices[0]); mat_load_3x3(&skinMatrices[0]);
} }
for(;;) { for(;;) {
@ -3549,7 +3371,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
break; break;
} }
rw_mat_load_3x3(currentMatrix); mat_load_3x3(currentMatrix);
do { do {
auto srcOffset = *skinningIndexData++; auto srcOffset = *skinningIndexData++;
@ -3574,7 +3396,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
__attribute__((noinline)) __attribute__((noinline))
void tnlMeshletEnvMap(uint8_t* OCR, uint8_t* normal, int vertexCount, int vertexSize, matrix_t* matfxMatrix, float matfxCoefficient) { void tnlMeshletEnvMap(uint8_t* OCR, uint8_t* normal, int vertexCount, int vertexSize, matrix_t* matfxMatrix, float matfxCoefficient) {
mat_load(matfxMatrix); mat_load2(matfxMatrix);
do { do {
pvr_vertex64_t* v = (pvr_vertex64_t*)OCR; pvr_vertex64_t* v = (pvr_vertex64_t*)OCR;
@ -3991,15 +3813,12 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
lightingCB(atomic, ac->uniform); lightingCB(atomic, ac->uniform);
rw::RawMatrix world; rw::RawMatrix world;
rw::convMatrix(&world, atomic->getFrame()->getLTM()); rw::convMatrix(&world, atomic->getFrame()->getLTM());
mat_load_apply((matrix_t*)&cam->devProjScreen, (matrix_t*)&cam->devView);
mat_load((matrix_t*)&cam->devProjScreen);
mat_apply((matrix_t*)&cam->devView);
mat_apply((matrix_t*)&world); mat_apply((matrix_t*)&world);
mat_store((matrix_t*)&atomicContexts.back().mtx); mat_store2((matrix_t*)&atomicContexts.back().mtx);
auto meshes = geo->meshHeader->getMeshes(); auto meshes = geo->meshHeader->getMeshes();
@ -4238,15 +4057,14 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
unsigned skinSelector = small_xyz + acp->skinMatrix0Identity*2; unsigned skinSelector = small_xyz + acp->skinMatrix0Identity*2;
tnlMeshletSkinVerticesSelector[skinSelector](OCR_SPACE, normalDst, &dcModel->data[meshlet->vertexOffset], normalSrc, &dcModel->data[meshlet->skinWeightOffset], &dcModel->data[meshlet->skinIndexOffset], meshlet->vertexCount, meshlet->vertexSize, &acp->skinContextPointer->mtx); tnlMeshletSkinVerticesSelector[skinSelector](OCR_SPACE, normalDst, &dcModel->data[meshlet->vertexOffset], normalSrc, &dcModel->data[meshlet->skinWeightOffset], &dcModel->data[meshlet->skinIndexOffset], meshlet->vertexCount, meshlet->vertexSize, &acp->skinContextPointer->mtx);
mat_load(&mtx); mat_load2(&mtx);
tnlMeshletTransformSelector[clippingRequired * 2](OCR_SPACE, OCR_SPACE + 4, meshlet->vertexCount, 64); tnlMeshletTransformSelector[clippingRequired * 2](OCR_SPACE, OCR_SPACE + 4, meshlet->vertexCount, 64);
} else { } else {
if (selector & 8) { if (selector & 8) {
mat_load(&mtx); mat_load_apply(&mtx, &DCE_MESHLET_MAT_DECODE);
mat_apply(&DCE_MESHLET_MAT_DECODE);
} else { } else {
mat_load(&mtx); mat_load2(&mtx);
} }
tnlMeshletTransformSelector[smallSelector](OCR_SPACE, &dcModel->data[meshlet->vertexOffset], meshlet->vertexCount, meshlet->vertexSize); tnlMeshletTransformSelector[smallSelector](OCR_SPACE, &dcModel->data[meshlet->vertexOffset], meshlet->vertexCount, meshlet->vertexSize);
} }
@ -4273,7 +4091,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
unsigned dstColOffset = textured ? offsetof(pvr_vertex64_t, a) : offsetof(pvr_vertex32_ut, a); unsigned dstColOffset = textured ? offsetof(pvr_vertex64_t, a) : offsetof(pvr_vertex32_ut, a);
dce_set_mat_vertex_color(&residual, &material); dce_set_mat_vertex_color(&residual, &material);
mat_load(&DCE_MESHLET_MAT_VERTEX_COLOR); mat_load2(&DCE_MESHLET_MAT_VERTEX_COLOR);
tnlMeshletVertexColorSelector[0](OCR_SPACE + dstColOffset, (int8_t*)&dcModel->data[meshlet->vertexOffset] + colOffset, meshlet->vertexCount, meshlet->vertexSize); tnlMeshletVertexColorSelector[0](OCR_SPACE + dstColOffset, (int8_t*)&dcModel->data[meshlet->vertexOffset] + colOffset, meshlet->vertexCount, meshlet->vertexSize);
} else { } else {
unsigned dstColOffset = textured ? offsetof(pvr_vertex64_t, a) : offsetof(pvr_vertex32_ut, a); unsigned dstColOffset = textured ? offsetof(pvr_vertex64_t, a) : offsetof(pvr_vertex32_ut, a);
@ -4295,7 +4113,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
unsigned normalSelector = (pass1 - 1) + (skin != 0) * 4; unsigned normalSelector = (pass1 - 1) + (skin != 0) * 4;
mat_load((matrix_t*)&uniformObject.dir[0][0][0]); mat_load2((matrix_t*)&uniformObject.dir[0][0][0]);
auto normalPointer = &dcModel->data[meshlet->vertexOffset] + normalOffset; auto normalPointer = &dcModel->data[meshlet->vertexOffset] + normalOffset;
auto vtxSize = meshlet->vertexSize; auto vtxSize = meshlet->vertexSize;
if (skin) { if (skin) {
@ -4310,7 +4128,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
if (pass2) { if (pass2) {
unsigned normalSelector = (pass2 - 1) + (skin != 0) * 4; unsigned normalSelector = (pass2 - 1) + (skin != 0) * 4;
mat_load((matrix_t*)&uniformObject.dir[1][0][0]); mat_load2((matrix_t*)&uniformObject.dir[1][0][0]);
tnlMeshletDiffuseColorSelector[normalSelector](OCR_SPACE + dstColOffset, normalPointer, meshlet->vertexCount, vtxSize, &lightDiffuseColors[4]); tnlMeshletDiffuseColorSelector[normalSelector](OCR_SPACE + dstColOffset, normalPointer, meshlet->vertexCount, vtxSize, &lightDiffuseColors[4]);
} }
} }
@ -4373,7 +4191,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
indices.back() |= 0x80; indices.back() |= 0x80;
pvr_vertex64_t *vd = (pvr_vertex64_t *)OCR_SPACE; pvr_vertex64_t *vd = (pvr_vertex64_t *)OCR_SPACE;
mat_load(&mtx); // Number of cycles: ~11 mat_load2(&mtx); // Number of cycles: ~11
for (int idx = 0; idx < geo->numVertices; idx++) { for (int idx = 0; idx < geo->numVertices; idx++) {
auto& vert = vertices[idx]; auto& vert = vertices[idx];

View file

@ -30,6 +30,7 @@
# define VIDEO_MODE_WIDTH 640.0f # define VIDEO_MODE_WIDTH 640.0f
# define VIDEO_MODE_HEIGHT 480.0f # define VIDEO_MODE_HEIGHT 480.0f
# define memcpy4 memcpy # define memcpy4 memcpy
# define frsqrt(a) (1.0f/sqrtf(a))
# define dcache_pref_block(a) __builtin_prefetch(a) # define dcache_pref_block(a) __builtin_prefetch(a)
# define F_PI M_PI # define F_PI M_PI
# ifndef __always_inline # ifndef __always_inline
@ -47,11 +48,13 @@
#define F_PI_2 (F_PI * 0.5f) #define F_PI_2 (F_PI * 0.5f)
#define __hot __attribute__((hot)) #define __hot __attribute__((hot))
#define __cold __attribute__((cold)) #define __cold __attribute__((cold))
#define __icache_aligned __attribute__((aligned(32)))
#define STRINGIFY(x) #x #define STRINGIFY(x) #x
#define STR(x) STRINGIFY(x) #define STR(x) STRINGIFY(x)
#define CONCAT_(x,y) x##y #define CONCAT_(x,y) x##y
#define CONCAT(x,y) CONCAT_(x,y) #define CONCAT(x,y) CONCAT_(x,y)
#define ARRAY_SIZE(array) (sizeof(array) / sizeof(array[0]))
namespace rw { namespace rw {
class Matrix; class Matrix;
@ -66,6 +69,11 @@ struct quaternion_t {
__always_inline __hot constexpr float Sin(float x) { return sinf(x); } __always_inline __hot constexpr float Sin(float x) { return sinf(x); }
__always_inline __hot constexpr float Cos(float x) { return cosf(x); } __always_inline __hot constexpr float Cos(float x) { return cosf(x); }
__always_inline __hot constexpr auto SinCos(float x) { return std::pair { Sin(x), Cos(x) }; } __always_inline __hot constexpr auto SinCos(float x) { return std::pair { Sin(x), Cos(x) }; }
__always_inline __hot constexpr float Tan(float x) { return tanf(x); }
__always_inline __hot constexpr float Atan(float x) { return atanf(x); }
__always_inline __hot constexpr float Atan2(float y, float x) { return atan2f(y, x); }
__always_inline __hot constexpr float Asin(float x) { return asinf(x); }
__always_inline __hot constexpr float Acos(float x) { return acosf(x); }
__always_inline __hot constexpr float Abs(float x) { return fabsf(x); } __always_inline __hot constexpr float Abs(float x) { return fabsf(x); }
__always_inline __hot constexpr float Sqrt(float x) { return sqrtf(x); } __always_inline __hot constexpr float Sqrt(float x) { return sqrtf(x); }
__always_inline __hot constexpr float RecipSqrt(float x, float y) { return x / Sqrt(y); } __always_inline __hot constexpr float RecipSqrt(float x, float y) { return x / Sqrt(y); }
@ -130,80 +138,6 @@ __always_inline __hot constexpr auto Norm(auto value, auto min, auto max) {
return numerator / denominator; return numerator / denominator;
} }
template<bool FAST_APPROX=false, bool FAST_DIV=false, bool DIV_COPY_SIGN=false>
__always_inline __hot constexpr float Tan(float x) {
if(!std::is_constant_evaluated() && FAST_APPROX) {
constexpr float pisqby4 = 2.4674011002723397f;
constexpr float adjpisqby4 = 2.471688400562703f;
constexpr float adj1minus8bypisq = 0.189759681063053f;
float xsq = x * x;
return x * Div<FAST_DIV, DIV_COPY_SIGN>(adjpisqby4 - adj1minus8bypisq * xsq,
pisqby4 - xsq);
} else
return tanf(x);
}
template<bool FAST_APPROX=false>
__always_inline __hot constexpr float Atan(float x) {
if(FAST_APPROX && !std::is_constant_evaluated()) {
constexpr float a[3] = { //
0.998418889819911f, -2.9993501171084700E-01f, 0.0869142852883849f};
float xx = x * x;
return ((a[2] * xx + a[1]) * xx + a[0]) * x;
} else return atanf(x);
}
template<bool FAST_APPROX=false>
__hot constexpr float Atan2(float y, float x) {
if(FAST_APPROX && !std::is_constant_evaluated()) {
#if 0
constexpr float halfpi_i754 = M_PI * 0.5f;
constexpr float quarterpi_i754 = M_PI * 0.25f;
// kludge to prevent 0/0 condition
float abs_y = Abs(y) + std::numeric_limits<float>::epsilon();
float absy_plus_absx = abs_y + Abs(x);
float inv_absy_plus_absx = Invert<true, true>(absy_plus_absx);
float angle = halfpi_i754 - copysignf(quarterpi_i754, x);
float r = (x - copysignf(abs_y, x)) * inv_absy_plus_absx;
angle += (0.1963f * r * r - 0.9817f) * r;
return copysignf(angle, y);
#else
// Ensure input is in [-1, +1]
bool swap = fabs(x) < fabs(y);
float atan_input = (swap ? x : y) / (swap ? y : x);
// Approximate atan
float res = Atan<true>(atan_input);
// If swapped, adjust atan output
res = swap ? (atan_input >= 0.0f ? F_PI_2 : -F_PI_2) - res : res;
// Adjust quadrants
if (x >= 0.0f && y >= 0.0f) {} // 1st quadrant
else if (x < 0.0f && y >= 0.0f) { res = F_PI + res; } // 2nd quadrant
else if (x < 0.0f && y < 0.0f) { res = -F_PI + res; } // 3rd quadrant
else if (x >= 0.0f && y < 0.0f) {} // 4th quadrant
// Store result
return res;
#endif
} else return atan2f(y, x);
}
template<bool FAST_APPROX=false>
__always_inline __hot constexpr float Asin(float x) {
if(FAST_APPROX && !std::is_constant_evaluated()) {
Atan(Div<true, false>(x, Sqrt(1.0f-(x*x))));
} else return asinf(x);
}
template<bool FAST_APPROX=false>
__always_inline __hot constexpr float Acos(float x) {
if(FAST_APPROX && !std::is_constant_evaluated()) {
return (-0.69813170079773212f * x * x - 0.87266462599716477f) * x + 1.5707963267948966f;
} else return acosf(x);
}
#ifdef RW_DC #ifdef RW_DC
# ifdef DC_SH4 # ifdef DC_SH4
@ -240,43 +174,134 @@ __always_inline __hot constexpr float Acos(float x) {
w = __w; \ w = __w; \
} while(false) } while(false)
#define mat_trans_vec3_nomod(x, y, z, x2, y2, z2) { \ #define mat_trans_vec4_nodiv_nomod(x, y, z, w, x2, y2, z2, w2) { \
register float __x __asm__("fr12") = (x); \ register float __x __asm__("fr0") = (x); \
register float __y __asm__("fr13") = (y); \ register float __y __asm__("fr1") = (y); \
register float __z __asm__("fr14") = (z); \ register float __z __asm__("fr2") = (z); \
__asm__ __volatile__( \ register float __w __asm__("fr3") = (w); \
"fldi0 fr15\n" \ __asm__ __volatile__( "ftrv xmtrx, fv0\n" \
"ftrv xmtrx, fv8\n" \ : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
: "=f" (__x), "=f" (__y), "=f" (__z) \ : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
: "0" (__x), "1" (__y), "2" (__z) \ x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
: "fr15" ); \ } while(false)
x2 = __x; y2 = __y; z2 = __z; \
} inline __hot __icache_aligned void mat_load2(const matrix_t* mtx) {
asm volatile(
R"(
fschg
fmov.d @%[mtx],xd0
add #32,%[mtx]
pref @%[mtx]
add #-(32-8),%[mtx]
fmov.d @%[mtx]+,xd2
fmov.d @%[mtx]+,xd4
fmov.d @%[mtx]+,xd6
fmov.d @%[mtx]+,xd8
fmov.d @%[mtx]+,xd10
fmov.d @%[mtx]+,xd12
fmov.d @%[mtx]+,xd14
fschg
)"
: [mtx] "+r" (mtx)
:
:
);
}
inline __hot __icache_aligned void mat_store2(matrix_t *mtx) {
asm volatile(
R"(
fschg
add #64-8,%[mtx]
fmov.d xd14,@%[mtx]
add #-32,%[mtx]
pref @%[mtx]
add #32,%[mtx]
fmov.d xd12,@-%[mtx]
fmov.d xd10,@-%[mtx]
fmov.d xd8,@-%[mtx]
fmov.d xd6,@-%[mtx]
fmov.d xd4,@-%[mtx]
fmov.d xd2,@-%[mtx]
fmov.d xd0,@-%[mtx]
fschg
)"
: [mtx] "+&r" (mtx), "=m" (*mtx)
:
:
);
}
inline __hot __icache_aligned void mat_identity2(void) {
asm volatile(
R"(
frchg
fldi1 fr0
fschg
fldi0 fr1
fldi0 fr2
fldi0 fr3
fldi0 fr4
fldi1 fr5
fmov dr2,dr6
fmov dr2,dr8
fmov dr0,dr10
fmov dr2,dr12
fmov dr4,dr14
fschg
frchg
)"
);
}
inline __hot __icache_aligned void mat_set_scale(float x, float y, float z) {
asm volatile(
R"(
frchg
fldi0 fr1
fschg
fldi0 fr2
fldi0 fr3
fldi0 fr4
fmov dr2, dr6
fmov dr2, dr8
fldi0 fr11
fmov dr2, dr12
fldi0 fr14
fschg
frchg
fmov %[x], xf0
fmov %[y], xf5
fmov %[z], xf10
)"
:
: [x] "r" (x), [y] "r" (y), [z] "r" (z)
:
);
}
// no declspec naked, so can't do rts / fschg. instead compiler pads with nop? // no declspec naked, so can't do rts / fschg. instead compiler pads with nop?
inline __hot void mat_load_3x3(const matrix_t* mtx) { inline __hot void mat_load_3x3(const rw::Matrix* mtx) {
__asm__ __volatile__ ( __asm__ __volatile__ (
R"( R"(
fschg fschg
frchg frchg
fmov @%[mtx]+, dr0 fmov @%[mtx]+, dr0
fldi0 fr12
fmov @%[mtx]+, dr2 fldi0 fr12
fldi0 fr13 fldi0 fr13
fmov @%[mtx]+, dr2
fmov @%[mtx]+, dr4 fmov @%[mtx]+, dr4
fldi0 fr3
fmov @%[mtx]+, dr6 fmov @%[mtx]+, dr6
fmov dr12, dr14
fmov @%[mtx]+, dr8 fmov @%[mtx]+, dr8
fldi0 fr7
fmov @%[mtx]+, dr10 fmov @%[mtx]+, dr10
fldi0 fr3
fldi0 fr7
fldi0 fr11 fldi0 fr11
fmov dr12, dr14
fschg fschg
frchg frchg
@ -286,7 +311,7 @@ inline __hot void mat_load_3x3(const matrix_t* mtx) {
} }
// sets pos.w to 1 // sets pos.w to 1
inline __hot void rw_mat_load_4x4(const rw::Matrix* mtx) { inline __hot void mat_load_4x4(const rw::Matrix* mtx) {
__asm__ __volatile__ ( __asm__ __volatile__ (
R"( R"(
fschg fschg
@ -307,7 +332,7 @@ inline __hot void rw_mat_load_4x4(const rw::Matrix* mtx) {
)" )"
: [mtx] "+r" (mtx) : [mtx] "+r" (mtx)
); );
} }
__hot inline void mat_transpose(void) { __hot inline void mat_transpose(void) {
asm volatile ( asm volatile (
@ -345,8 +370,7 @@ __hot inline void mat_transpose(void) {
); );
} }
__attribute__((optimize("align-functions=32"))) __hot __icache_aligned inline void mat_copy(matrix_t *dst, const matrix_t *src) {
__hot inline void mat_copy(matrix_t *dst, const matrix_t *src) {
asm volatile(R"( asm volatile(R"(
fschg fschg
@ -382,12 +406,13 @@ __hot inline void mat_copy(matrix_t *dst, const matrix_t *src) {
fmov.d xd0, @-%[dst] fmov.d xd0, @-%[dst]
fschg fschg
)": [dst] "+&r" (dst), [src] "+&r" (src) )": [dst] "+&r" (dst), [src] "+&r" (src), "=m" (*dst)
: :
: "memory"); :);
} }
template<bool FAST_APPROX=true> //TODO: FIXME FOR VC (AND USE FTRV)
template<bool FAST_APPROX=false>
__hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) { __hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) {
if(FAST_APPROX && !std::is_constant_evaluated()) { if(FAST_APPROX && !std::is_constant_evaluated()) {
/* /*
@ -608,21 +633,15 @@ __hot inline void mat_apply_rotate_z(float z) {
# else # else
# ifdef DC_TEXCONV # ifdef DC_TEXCONV
# define mat_transform(a, b, c, d)
# define mat_apply(a) # define mat_apply(a)
# define mat_load(a) # define mat_load2(a)
# define mat_store(a) # define mat_store2(a)
# define mat_identity(a) # define mat_identity2()
# define mat_transform(a, b, c, d)
# define pvr_fog_table_color(a,r,g,b) # define pvr_fog_table_color(a,r,g,b)
# define pvr_fog_table_linear(s,e) # define pvr_fog_table_linear(s,e)
# endif # endif
#define mat_trans_vec3_nomod(x_, y_, z_, x2, y2, z2) do { \
vector_t tmp = { x_, y_, z_, 0.0f }; \
mat_transform(&tmp, &tmp, 1, 0); \
x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; \
} while(false)
#define mat_trans_single3_nomod(x_, y_, z_, x2, y2, z2) do { \ #define mat_trans_single3_nomod(x_, y_, z_, x2, y2, z2) do { \
vector_t tmp = { x_, y_, z_, 1.0f }; \ vector_t tmp = { x_, y_, z_, 1.0f }; \
mat_transform(&tmp, &tmp, 1, 0); \ mat_transform(&tmp, &tmp, 1, 0); \
@ -657,7 +676,7 @@ __hot inline void mat_apply_rotate_z(float z) {
w_ = tmp1233123.w; \ w_ = tmp1233123.w; \
} while(false) } while(false)
inline void mat_load_3x3(const matrix_t* mtx) { inline void mat_load_3x3(const rw::Matrix* mtx) {
memcpy(XMTRX, mtx, sizeof(matrix_t)); memcpy(XMTRX, mtx, sizeof(matrix_t));
XMTRX[0][3] = 0.0f; XMTRX[0][3] = 0.0f;
XMTRX[1][3] = 0.0f; XMTRX[1][3] = 0.0f;
@ -669,7 +688,7 @@ inline void mat_load_3x3(const matrix_t* mtx) {
XMTRX[3][3] = 0.0f; XMTRX[3][3] = 0.0f;
} }
inline void rw_mat_load_4x4(const rw::Matrix* mtx) { inline void mat_load_4x4(const rw::Matrix* mtx) {
memcpy(XMTRX, mtx, sizeof(matrix_t)); memcpy(XMTRX, mtx, sizeof(matrix_t));
XMTRX[3][3] = 1.0f; XMTRX[3][3] = 1.0f;
} }
@ -706,32 +725,9 @@ __always_inline __hot void mat_copy(matrix_t *dst, const matrix_t *src) {
__hot inline void mat_mult(matrix_t *out, const matrix_t* matrix1, const matrix_t* matrix2) { __hot inline void mat_mult(matrix_t *out, const matrix_t* matrix1, const matrix_t* matrix2) {
mat_load_apply(matrix1, matrix2); mat_load_apply(matrix1, matrix2);
mat_store(out); mat_store2(out);
} }
#if 0
template<std::size_t N>
__always_inline __hot
auto vec3_dot_prod(const vec3_t &v1, vec3_t const (&v2)[N]) {
std::array<float, N> result;
#ifdef DC_SH4
register float x asm(KOS_FPARG(0)) = v1.x;
register float y asm(KOS_FPARG(1)) = v1.y;
register float z asm(KOS_FPARG(2)) = v1.z;
#else
float x = v1.x;
float y = v1.y;
float z = v1.z;
#endif
for(std::size_t v = 0; v < N; ++v)
result[v] = fipr(x, y, z, 0.0f, v2[v].x, v2[v].y, v2[v].z, 0.0f);
return result;
}
#endif
#endif #endif
} }