rpcs3/rpcs3/Emu/Cell/SPUThread.cpp

#include "stdafx.h"
#include "Utilities/JIT.h"
#include "Utilities/date_time.h"
#include "Emu/Memory/vm.h"
#include "Emu/Memory/vm_ptr.h"
#include "Emu/Memory/vm_reservation.h"
#include "Loader/ELF.h"
#include "Emu/VFS.h"
#include "Emu/IdManager.h"
#include "Emu/perf_meter.hpp"
#include "Emu/RSX/RSXThread.h"
#include "Emu/Cell/PPUThread.h"
#include "Emu/Cell/ErrorCodes.h"
#include "Emu/Cell/lv2/sys_spu.h"
#include "Emu/Cell/lv2/sys_event_flag.h"
#include "Emu/Cell/lv2/sys_event.h"
#include "Emu/Cell/lv2/sys_interrupt.h"
#include "Emu/Cell/SPUDisAsm.h"
#include "Emu/Cell/SPUAnalyser.h"
#include "Emu/Cell/SPUThread.h"
#include "Emu/Cell/SPURecompiler.h"
#include "Emu/Cell/timers.hpp"
#include <cmath>
#include <cfenv>
#include <thread>
#include <shared_mutex>
#include "util/vm.hpp"
#include "util/asm.hpp"
#include "util/v128.hpp"
#include "util/v128sse.hpp"
#include "util/sysinfo.hpp"
using spu_rdata_t = decltype(spu_thread::rdata);
template <>
void fmt_class_string<mfc_atomic_status>::format(std::string& out, u64 arg)
{
format_enum(out, arg, [](mfc_atomic_status arg)
{
switch (arg)
{
case MFC_PUTLLC_SUCCESS: return "PUTLLC";
case MFC_PUTLLC_FAILURE: return "PUTLLC-FAIL";
case MFC_PUTLLUC_SUCCESS: return "PUTLLUC";
case MFC_GETLLAR_SUCCESS: return "GETLLAR";
}
return unknown;
});
}
template <>
void fmt_class_string<mfc_tag_update>::format(std::string& out, u64 arg)
{
format_enum(out, arg, [](mfc_tag_update arg)
{
switch (arg)
{
case MFC_TAG_UPDATE_IMMEDIATE: return "empty";
case MFC_TAG_UPDATE_ANY: return "ANY";
case MFC_TAG_UPDATE_ALL: return "ALL";
}
return unknown;
});
}
template <>
void fmt_class_string<spu_type>::format(std::string& out, u64 arg)
{
format_enum(out, arg, [](spu_type arg)
{
switch (arg)
{
case spu_type::threaded: return "Threaded";
case spu_type::raw: return "Raw";
case spu_type::isolated: return "Isolated";
}
return unknown;
});
}
// Verify AVX availability for TSX transactions
static const bool s_tsx_avx = utils::has_avx();
// Special case: Haswell-class TSX is detected as RTM support without MPX
static const bool s_tsx_haswell = utils::has_rtm() && !utils::has_mpx();
static FORCE_INLINE bool cmp_rdata_avx(const __m256i* lhs, const __m256i* rhs)
{
#if defined(_MSC_VER) || defined(__AVX__)
const __m256 x0 = _mm256_xor_ps(_mm256_castsi256_ps(_mm256_loadu_si256(lhs + 0)), _mm256_castsi256_ps(_mm256_loadu_si256(rhs + 0)));
const __m256 x1 = _mm256_xor_ps(_mm256_castsi256_ps(_mm256_loadu_si256(lhs + 1)), _mm256_castsi256_ps(_mm256_loadu_si256(rhs + 1)));
const __m256 x2 = _mm256_xor_ps(_mm256_castsi256_ps(_mm256_loadu_si256(lhs + 2)), _mm256_castsi256_ps(_mm256_loadu_si256(rhs + 2)));
const __m256 x3 = _mm256_xor_ps(_mm256_castsi256_ps(_mm256_loadu_si256(lhs + 3)), _mm256_castsi256_ps(_mm256_loadu_si256(rhs + 3)));
const __m256 c0 = _mm256_or_ps(x0, x1);
const __m256 c1 = _mm256_or_ps(x2, x3);
const __m256 c2 = _mm256_or_ps(c0, c1);
return _mm256_testz_si256(_mm256_castps_si256(c2), _mm256_castps_si256(c2)) != 0;
#else
bool result = false;
__asm__(
"vmovups 0*32(%[lhs]), %%ymm0;" // load
"vmovups 1*32(%[lhs]), %%ymm1;"
"vmovups 2*32(%[lhs]), %%ymm2;"
"vmovups 3*32(%[lhs]), %%ymm3;"
"vxorps 0*32(%[rhs]), %%ymm0, %%ymm0;" // compare
"vxorps 1*32(%[rhs]), %%ymm1, %%ymm1;"
"vxorps 2*32(%[rhs]), %%ymm2, %%ymm2;"
"vxorps 3*32(%[rhs]), %%ymm3, %%ymm3;"
"vorps %%ymm0, %%ymm1, %%ymm0;" // merge
"vorps %%ymm2, %%ymm3, %%ymm2;"
"vorps %%ymm0, %%ymm2, %%ymm0;"
"vptest %%ymm0, %%ymm0;" // test
"vzeroupper"
: "=@ccz" (result)
: [lhs] "r" (lhs)
, [rhs] "r" (rhs)
: "cc" // Clobber flags
, "xmm0" // Clobber registers ymm0-ymm3 (see mov_rdata_avx)
, "xmm1"
, "xmm2"
, "xmm3"
);
return result;
#endif
}
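// Compare two 128-byte reservation blocks (the SPU reservation granule) for equality.
// Takes the AVX path above when available at runtime, otherwise falls back to 16-byte v128 compares.
// Illustrative use (hypothetical locals, not an actual call site):
//   spu_rdata_t snapshot;
//   mov_rdata(snapshot, *vm::_ptr<spu_rdata_t>(addr));
//   if (!cmp_rdata(snapshot, rdata)) { /* reservation data has changed */ }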
#ifdef _MSC_VER
__forceinline
#endif
extern bool cmp_rdata(const spu_rdata_t& _lhs, const spu_rdata_t& _rhs)
{
#ifndef __AVX__
if (s_tsx_avx) [[likely]]
#endif
{
return cmp_rdata_avx(reinterpret_cast<const __m256i*>(_lhs), reinterpret_cast<const __m256i*>(_rhs));
}
const auto lhs = reinterpret_cast<const v128*>(_lhs);
const auto rhs = reinterpret_cast<const v128*>(_rhs);
const v128 a = (lhs[0] ^ rhs[0]) | (lhs[1] ^ rhs[1]);
const v128 b = (lhs[2] ^ rhs[2]) | (lhs[3] ^ rhs[3]);
const v128 c = (lhs[4] ^ rhs[4]) | (lhs[5] ^ rhs[5]);
const v128 d = (lhs[6] ^ rhs[6]) | (lhs[7] ^ rhs[7]);
const v128 r = (a | b) | (c | d);
return r == v128{};
}
static FORCE_INLINE void mov_rdata_avx(__m256i* dst, const __m256i* src)
{
#ifdef _MSC_VER
_mm256_storeu_si256(dst + 0, _mm256_loadu_si256(src + 0));
_mm256_storeu_si256(dst + 1, _mm256_loadu_si256(src + 1));
_mm256_storeu_si256(dst + 2, _mm256_loadu_si256(src + 2));
_mm256_storeu_si256(dst + 3, _mm256_loadu_si256(src + 3));
#else
__asm__(
"vmovdqu 0*32(%[src]), %%ymm0;" // load
"vmovdqu %%ymm0, 0*32(%[dst]);" // store
"vmovdqu 1*32(%[src]), %%ymm0;"
"vmovdqu %%ymm0, 1*32(%[dst]);"
"vmovdqu 2*32(%[src]), %%ymm0;"
"vmovdqu %%ymm0, 2*32(%[dst]);"
"vmovdqu 3*32(%[src]), %%ymm0;"
"vmovdqu %%ymm0, 3*32(%[dst]);"
#ifndef __AVX__
"vzeroupper" // Don't need in AVX mode (should be emitted automatically)
#endif
:
: [src] "r" (src)
, [dst] "r" (dst)
#ifdef __AVX__
: "ymm0" // Clobber ymm0 register (acknowledge its modification)
#else
: "xmm0" // ymm0 is "unknown" if not compiled in AVX mode, so clobber xmm0 only
#endif
);
#endif
}
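// Copy a full 128-byte reservation block. The copy is not atomic as a whole (it is a series of
// 16/32-byte loads and stores), so callers that need a consistent snapshot must verify the
// reservation counter or cmp_rdata() the result around the copy, as the DMA code below does.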
#ifdef _MSC_VER
__forceinline
#endif
extern void mov_rdata(spu_rdata_t& _dst, const spu_rdata_t& _src)
{
#ifndef __AVX__
if (s_tsx_avx) [[likely]]
#endif
{
mov_rdata_avx(reinterpret_cast<__m256i*>(_dst), reinterpret_cast<const __m256i*>(_src));
return;
}
{
const __m128i v0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 0));
const __m128i v1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 16));
const __m128i v2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 32));
const __m128i v3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 48));
_mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 0), v0);
_mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 16), v1);
_mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 32), v2);
_mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 48), v3);
}
const __m128i v0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 64));
const __m128i v1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 80));
const __m128i v2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 96));
const __m128i v3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_src + 112));
_mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 64), v0);
_mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 80), v1);
_mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 96), v2);
_mm_storeu_si128(reinterpret_cast<__m128i*>(_dst + 112), v3);
}
static FORCE_INLINE void mov_rdata_nt_avx(__m256i* dst, const __m256i* src)
{
#ifdef _MSC_VER
_mm256_stream_si256(dst + 0, _mm256_load_si256(src + 0));
_mm256_stream_si256(dst + 1, _mm256_load_si256(src + 1));
_mm256_stream_si256(dst + 2, _mm256_load_si256(src + 2));
_mm256_stream_si256(dst + 3, _mm256_load_si256(src + 3));
#else
__asm__(
"vmovdqa 0*32(%[src]), %%ymm0;" // load
"vmovntdq %%ymm0, 0*32(%[dst]);" // store
"vmovdqa 1*32(%[src]), %%ymm0;"
"vmovntdq %%ymm0, 1*32(%[dst]);"
"vmovdqa 2*32(%[src]), %%ymm0;"
"vmovntdq %%ymm0, 2*32(%[dst]);"
"vmovdqa 3*32(%[src]), %%ymm0;"
"vmovntdq %%ymm0, 3*32(%[dst]);"
#ifndef __AVX__
"vzeroupper" // Don't need in AVX mode (should be emitted automatically)
#endif
:
: [src] "r" (src)
, [dst] "r" (dst)
#ifdef __AVX__
: "ymm0" // Clobber ymm0 register (acknowledge its modification)
#else
: "xmm0" // ymm0 is "unknown" if not compiled in AVX mode, so clobber xmm0 only
#endif
);
#endif
}
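// Same 128-byte copy as mov_rdata, but using aligned loads and non-temporal (streaming) stores
// that bypass the cache; intended for transfers whose destination is unlikely to be read back soon.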
extern void mov_rdata_nt(spu_rdata_t& _dst, const spu_rdata_t& _src)
{
#ifndef __AVX__
if (s_tsx_avx) [[likely]]
#endif
{
mov_rdata_nt_avx(reinterpret_cast<__m256i*>(_dst), reinterpret_cast<const __m256i*>(_src));
return;
}
{
const __m128i v0 = _mm_load_si128(reinterpret_cast<const __m128i*>(_src + 0));
const __m128i v1 = _mm_load_si128(reinterpret_cast<const __m128i*>(_src + 16));
const __m128i v2 = _mm_load_si128(reinterpret_cast<const __m128i*>(_src + 32));
const __m128i v3 = _mm_load_si128(reinterpret_cast<const __m128i*>(_src + 48));
_mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 0), v0);
_mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 16), v1);
_mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 32), v2);
_mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 48), v3);
}
const __m128i v0 = _mm_load_si128(reinterpret_cast<const __m128i*>(_src + 64));
const __m128i v1 = _mm_load_si128(reinterpret_cast<const __m128i*>(_src + 80));
const __m128i v2 = _mm_load_si128(reinterpret_cast<const __m128i*>(_src + 96));
const __m128i v3 = _mm_load_si128(reinterpret_cast<const __m128i*>(_src + 112));
_mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 64), v0);
_mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 80), v1);
_mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 96), v2);
_mm_stream_si128(reinterpret_cast<__m128i*>(_dst + 112), v3);
}
void do_cell_atomic_128_store(u32 addr, const void* to_write);
extern thread_local u64 g_tls_fault_spu;
const spu_decoder<spu_itype> s_spu_itype;
namespace spu
{
namespace scheduler
{
std::array<atomic_t<u8>, 65536> atomic_instruction_table = {};
constexpr u32 native_jiffy_duration_us = 1500; // About 1 ms of OS sleep resolution plus a half-millisecond offset
void acquire_pc_address(spu_thread& spu, u32 pc, u32 timeout_ms, u32 max_concurrent_instructions)
{
const u32 pc_offset = pc >> 2;
if (atomic_instruction_table[pc_offset].observe() >= max_concurrent_instructions)
{
spu.state += cpu_flag::wait + cpu_flag::temp;
if (timeout_ms > 0)
{
const u64 timeout = timeout_ms * 1000u; //convert to microseconds
const u64 start = get_system_time();
auto remaining = timeout;
while (atomic_instruction_table[pc_offset].observe() >= max_concurrent_instructions)
{
if (remaining >= native_jiffy_duration_us)
std::this_thread::sleep_for(1ms);
else
std::this_thread::yield();
const auto now = get_system_time();
const auto elapsed = now - start;
if (elapsed > timeout) break;
remaining = timeout - elapsed;
}
}
else
{
// Slight pause if this code address is overburdened
const auto count = atomic_instruction_table[pc_offset].observe() * 100ull;
busy_wait(count);
}
ensure(!spu.check_state());
}
atomic_instruction_table[pc_offset]++;
}
void release_pc_address(u32 pc)
{
const u32 pc_offset = pc >> 2;
atomic_instruction_table[pc_offset]--;
}
struct concurrent_execution_watchdog
{
u32 pc = 0;
bool active = false;
concurrent_execution_watchdog(spu_thread& spu)
:pc(spu.pc)
{
if (const u32 max_concurrent_instructions = g_cfg.core.preferred_spu_threads)
{
acquire_pc_address(spu, pc, g_cfg.core.spu_delay_penalty, max_concurrent_instructions);
active = true;
}
}
~concurrent_execution_watchdog()
{
if (active)
release_pc_address(pc);
}
};
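// Illustrative RAII usage of the watchdog (hypothetical handler, not an actual call site):
//   void heavy_channel_op(spu_thread& spu)
//   {
//       concurrent_execution_watchdog watchdog(spu); // throttles if too many SPU threads sit on this PC
//       // ... perform the slow operation ...
//   } // destructor releases the per-PC slot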
}
}
std::array<u32, 2> op_branch_targets(u32 pc, spu_opcode_t op)
{
std::array<u32, 2> res{spu_branch_target(pc + 4), UINT32_MAX};
switch (const auto type = s_spu_itype.decode(op.opcode))
{
case spu_itype::BR:
case spu_itype::BRA:
case spu_itype::BRNZ:
case spu_itype::BRZ:
case spu_itype::BRHNZ:
case spu_itype::BRHZ:
case spu_itype::BRSL:
case spu_itype::BRASL:
{
const int index = (type == spu_itype::BR || type == spu_itype::BRA || type == spu_itype::BRSL || type == spu_itype::BRASL ? 0 : 1);
res[index] = (spu_branch_target(type == spu_itype::BRASL || type == spu_itype::BRA ? 0 : pc, op.i16));
break;
}
case spu_itype::IRET:
case spu_itype::BI:
case spu_itype::BISLED:
case spu_itype::BISL:
case spu_itype::BIZ:
case spu_itype::BINZ:
case spu_itype::BIHZ:
case spu_itype::BIHNZ: // TODO (detect constant address branches, such as for interrupts enable/disable pattern)
case spu_itype::UNK:
{
res[0] = UINT32_MAX;
break;
}
default: break;
}
return res;
}
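// Return convention, derived from the cases above:
//   BR/BRA/BRSL/BRASL   -> { direct target, UINT32_MAX }
//   BRNZ/BRZ/BRHNZ/BRHZ -> { spu_branch_target(pc + 4), direct target }
//   BI/BISL/IRET/...    -> { UINT32_MAX, UINT32_MAX } (target not statically known)
//   any other opcode    -> { spu_branch_target(pc + 4), UINT32_MAX }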
const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
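// TSX-based PUTLLC: conditionally store the 128-byte _new block at raddr if the memory still
// matches the captured _old data and the reservation time stamp rtime.
// Return value, as inferred from the code paths below: elapsed TSC ticks on success, 0 when the
// comparison failed (reservation lost), and u64{umax} when no transaction could complete and the
// caller must fall back to the non-TSX path.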
Label fall = c.newLabel();
Label fail = c.newLabel();
Label _ret = c.newLabel();
Label skip = c.newLabel();
Label next = c.newLabel();
Label load = c.newLabel();
//if (utils::has_avx() && !s_tsx_avx)
//{
// c.vzeroupper();
//}
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
c.push(x86::rbp);
c.push(x86::r13);
c.push(x86::r12);
c.push(x86::rbx);
c.sub(x86::rsp, 168);
#ifdef _WIN32
if (s_tsx_avx)
{
c.vmovups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
c.vmovups(x86::oword_ptr(x86::rsp, 16), x86::xmm7);
}
else
{
c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7);
c.movups(x86::oword_ptr(x86::rsp, 32), x86::xmm8);
c.movups(x86::oword_ptr(x86::rsp, 48), x86::xmm9);
c.movups(x86::oword_ptr(x86::rsp, 64), x86::xmm10);
c.movups(x86::oword_ptr(x86::rsp, 80), x86::xmm11);
c.movups(x86::oword_ptr(x86::rsp, 96), x86::xmm12);
c.movups(x86::oword_ptr(x86::rsp, 112), x86::xmm13);
c.movups(x86::oword_ptr(x86::rsp, 128), x86::xmm14);
c.movups(x86::oword_ptr(x86::rsp, 144), x86::xmm15);
}
#endif
// Prepare registers
build_swap_rdx_with(c, args, x86::r12);
c.mov(x86::rbp, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
c.and_(args[0].r32(), 0xff80);
c.shr(args[0].r32(), 1);
c.lea(x86::rbx, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
c.prefetchw(x86::byte_ptr(x86::rbx));
c.mov(x86::r13, args[1]);
// Prepare data
if (s_tsx_avx)
{
c.vmovups(x86::ymm0, x86::yword_ptr(args[2], 0));
c.vmovups(x86::ymm1, x86::yword_ptr(args[2], 32));
c.vmovups(x86::ymm2, x86::yword_ptr(args[2], 64));
c.vmovups(x86::ymm3, x86::yword_ptr(args[2], 96));
c.vmovups(x86::ymm4, x86::yword_ptr(args[3], 0));
c.vmovups(x86::ymm5, x86::yword_ptr(args[3], 32));
c.vmovups(x86::ymm6, x86::yword_ptr(args[3], 64));
c.vmovups(x86::ymm7, x86::yword_ptr(args[3], 96));
}
else
{
c.movaps(x86::xmm0, x86::oword_ptr(args[2], 0));
c.movaps(x86::xmm1, x86::oword_ptr(args[2], 16));
c.movaps(x86::xmm2, x86::oword_ptr(args[2], 32));
c.movaps(x86::xmm3, x86::oword_ptr(args[2], 48));
c.movaps(x86::xmm4, x86::oword_ptr(args[2], 64));
c.movaps(x86::xmm5, x86::oword_ptr(args[2], 80));
c.movaps(x86::xmm6, x86::oword_ptr(args[2], 96));
c.movaps(x86::xmm7, x86::oword_ptr(args[2], 112));
c.movaps(x86::xmm8, x86::oword_ptr(args[3], 0));
c.movaps(x86::xmm9, x86::oword_ptr(args[3], 16));
c.movaps(x86::xmm10, x86::oword_ptr(args[3], 32));
c.movaps(x86::xmm11, x86::oword_ptr(args[3], 48));
c.movaps(x86::xmm12, x86::oword_ptr(args[3], 64));
c.movaps(x86::xmm13, x86::oword_ptr(args[3], 80));
c.movaps(x86::xmm14, x86::oword_ptr(args[3], 96));
c.movaps(x86::xmm15, x86::oword_ptr(args[3], 112));
}
// Alloc args[0] to stamp0
const auto stamp0 = args[0];
const auto stamp1 = args[1];
build_get_tsc(c, stamp0);
// Begin transaction
Label tx0 = build_transaction_enter(c, fall, [&]()
{
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx) - ::offset32(&spu_thread::rdata)), 1);
build_get_tsc(c, stamp1);
c.sub(stamp1, stamp0);
c.xor_(x86::eax, x86::eax);
c.cmp(stamp1, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit1)));
c.jae(fall);
});
c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
c.jc(fall);
c.xbegin(tx0);
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
c.test(x86::eax, vm::rsrv_unique_lock);
c.jnz(skip);
c.and_(x86::rax, -128);
c.cmp(x86::rax, x86::r13);
c.jne(fail);
if (s_tsx_avx)
{
c.vxorps(x86::ymm0, x86::ymm0, x86::yword_ptr(x86::rbp, 0));
c.vxorps(x86::ymm1, x86::ymm1, x86::yword_ptr(x86::rbp, 32));
c.vxorps(x86::ymm2, x86::ymm2, x86::yword_ptr(x86::rbp, 64));
c.vxorps(x86::ymm3, x86::ymm3, x86::yword_ptr(x86::rbp, 96));
c.vorps(x86::ymm0, x86::ymm0, x86::ymm1);
c.vorps(x86::ymm1, x86::ymm2, x86::ymm3);
c.vorps(x86::ymm0, x86::ymm1, x86::ymm0);
c.vptest(x86::ymm0, x86::ymm0);
}
else
{
c.xorps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
c.xorps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
c.xorps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
c.xorps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
c.xorps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
c.xorps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
c.xorps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
c.xorps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
c.orps(x86::xmm0, x86::xmm1);
c.orps(x86::xmm2, x86::xmm3);
c.orps(x86::xmm4, x86::xmm5);
c.orps(x86::xmm6, x86::xmm7);
c.orps(x86::xmm0, x86::xmm2);
c.orps(x86::xmm4, x86::xmm6);
c.orps(x86::xmm0, x86::xmm4);
c.ptest(x86::xmm0, x86::xmm0);
}
c.jnz(fail);
if (s_tsx_avx)
{
c.vmovaps(x86::yword_ptr(x86::rbp, 0), x86::ymm4);
c.vmovaps(x86::yword_ptr(x86::rbp, 32), x86::ymm5);
c.vmovaps(x86::yword_ptr(x86::rbp, 64), x86::ymm6);
c.vmovaps(x86::yword_ptr(x86::rbp, 96), x86::ymm7);
}
else
{
c.movaps(x86::oword_ptr(x86::rbp, 0), x86::xmm8);
c.movaps(x86::oword_ptr(x86::rbp, 16), x86::xmm9);
c.movaps(x86::oword_ptr(x86::rbp, 32), x86::xmm10);
c.movaps(x86::oword_ptr(x86::rbp, 48), x86::xmm11);
c.movaps(x86::oword_ptr(x86::rbp, 64), x86::xmm12);
c.movaps(x86::oword_ptr(x86::rbp, 80), x86::xmm13);
c.movaps(x86::oword_ptr(x86::rbp, 96), x86::xmm14);
c.movaps(x86::oword_ptr(x86::rbp, 112), x86::xmm15);
}
c.sub(x86::qword_ptr(x86::rbx), -128);
c.xend();
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
build_get_tsc(c);
c.sub(x86::rax, stamp0);
c.jmp(_ret);
// XABORT is expensive so finish with xend instead
c.bind(fail);
// Load old data to store back in rdata
if (s_tsx_avx)
{
c.vmovaps(x86::ymm0, x86::yword_ptr(x86::rbp, 0));
c.vmovaps(x86::ymm1, x86::yword_ptr(x86::rbp, 32));
c.vmovaps(x86::ymm2, x86::yword_ptr(x86::rbp, 64));
c.vmovaps(x86::ymm3, x86::yword_ptr(x86::rbp, 96));
}
else
{
c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
}
c.xend();
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
c.jmp(load);
c.bind(skip);
c.xend();
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
build_get_tsc(c, stamp1);
//c.jmp(fall);
c.bind(fall);
Label fall2 = c.newLabel();
Label fail2 = c.newLabel();
Label fail3 = c.newLabel();
// Lightened transaction: only compare and swap data
c.bind(next);
// Try to "lock" reservation
c.mov(x86::eax, 1);
c.lock().xadd(x86::qword_ptr(x86::rbx), x86::rax);
c.test(x86::eax, vm::rsrv_unique_lock);
c.jnz(fail2);
// Check if already updated
c.and_(x86::rax, -128);
c.cmp(x86::rax, x86::r13);
c.jne(fail2);
// Exclude some time spent on touching memory: stamp1 contains last success or failure
c.mov(x86::rax, stamp1);
c.sub(x86::rax, stamp0);
build_get_tsc(c, stamp1);
c.sub(stamp1, x86::rax);
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
c.jae(fall2);
Label tx1 = build_transaction_enter(c, fall2, [&]()
{
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx) - ::offset32(&spu_thread::rdata)), 1);
build_get_tsc(c);
c.sub(x86::rax, stamp1);
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
c.jae(fall2);
c.test(x86::qword_ptr(x86::rbx), 127 - 1);
c.jnz(fall2);
});
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
// Check pause flag
c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
c.jc(fall2);
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
c.and_(x86::rax, -128);
c.cmp(x86::rax, x86::r13);
c.jne(fail2);
c.xbegin(tx1);
if (s_tsx_avx)
{
c.vxorps(x86::ymm0, x86::ymm0, x86::yword_ptr(x86::rbp, 0));
c.vxorps(x86::ymm1, x86::ymm1, x86::yword_ptr(x86::rbp, 32));
c.vxorps(x86::ymm2, x86::ymm2, x86::yword_ptr(x86::rbp, 64));
c.vxorps(x86::ymm3, x86::ymm3, x86::yword_ptr(x86::rbp, 96));
c.vorps(x86::ymm0, x86::ymm0, x86::ymm1);
c.vorps(x86::ymm1, x86::ymm2, x86::ymm3);
c.vorps(x86::ymm0, x86::ymm1, x86::ymm0);
c.vptest(x86::ymm0, x86::ymm0);
}
else
{
c.xorps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
c.xorps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
c.xorps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
c.xorps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
c.xorps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
c.xorps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
c.xorps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
c.xorps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
c.orps(x86::xmm0, x86::xmm1);
c.orps(x86::xmm2, x86::xmm3);
c.orps(x86::xmm4, x86::xmm5);
c.orps(x86::xmm6, x86::xmm7);
c.orps(x86::xmm0, x86::xmm2);
c.orps(x86::xmm4, x86::xmm6);
c.orps(x86::xmm0, x86::xmm4);
c.ptest(x86::xmm0, x86::xmm0);
}
c.jnz(fail3);
if (s_tsx_avx)
{
c.vmovaps(x86::yword_ptr(x86::rbp, 0), x86::ymm4);
c.vmovaps(x86::yword_ptr(x86::rbp, 32), x86::ymm5);
c.vmovaps(x86::yword_ptr(x86::rbp, 64), x86::ymm6);
c.vmovaps(x86::yword_ptr(x86::rbp, 96), x86::ymm7);
}
else
{
c.movaps(x86::oword_ptr(x86::rbp, 0), x86::xmm8);
c.movaps(x86::oword_ptr(x86::rbp, 16), x86::xmm9);
c.movaps(x86::oword_ptr(x86::rbp, 32), x86::xmm10);
c.movaps(x86::oword_ptr(x86::rbp, 48), x86::xmm11);
c.movaps(x86::oword_ptr(x86::rbp, 64), x86::xmm12);
c.movaps(x86::oword_ptr(x86::rbp, 80), x86::xmm13);
c.movaps(x86::oword_ptr(x86::rbp, 96), x86::xmm14);
c.movaps(x86::oword_ptr(x86::rbp, 112), x86::xmm15);
}
c.xend();
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
c.lock().add(x86::qword_ptr(x86::rbx), 127);
build_get_tsc(c);
c.sub(x86::rax, stamp0);
c.jmp(_ret);
// XABORT is expensive so try to finish with xend instead
c.bind(fail3);
// Load previous data to store back to rdata
if (s_tsx_avx)
{
c.vmovaps(x86::ymm0, x86::yword_ptr(x86::rbp, 0));
c.vmovaps(x86::ymm1, x86::yword_ptr(x86::rbp, 32));
c.vmovaps(x86::ymm2, x86::yword_ptr(x86::rbp, 64));
c.vmovaps(x86::ymm3, x86::yword_ptr(x86::rbp, 96));
}
else
{
c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
}
c.xend();
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
c.jmp(fail2);
c.bind(fall2);
c.mov(x86::rax, -1);
c.jmp(_ret);
c.bind(fail2);
c.lock().sub(x86::qword_ptr(x86::rbx), 1);
c.bind(load);
// Store previous data back to rdata
if (s_tsx_avx)
{
c.vmovaps(x86::yword_ptr(args[2], 0), x86::ymm0);
c.vmovaps(x86::yword_ptr(args[2], 32), x86::ymm1);
c.vmovaps(x86::yword_ptr(args[2], 64), x86::ymm2);
c.vmovaps(x86::yword_ptr(args[2], 96), x86::ymm3);
}
else
{
c.movaps(x86::oword_ptr(args[2], 0), x86::xmm0);
c.movaps(x86::oword_ptr(args[2], 16), x86::xmm1);
c.movaps(x86::oword_ptr(args[2], 32), x86::xmm2);
c.movaps(x86::oword_ptr(args[2], 48), x86::xmm3);
c.movaps(x86::oword_ptr(args[2], 64), x86::xmm4);
c.movaps(x86::oword_ptr(args[2], 80), x86::xmm5);
c.movaps(x86::oword_ptr(args[2], 96), x86::xmm6);
c.movaps(x86::oword_ptr(args[2], 112), x86::xmm7);
}
c.mov(x86::rax, -1);
c.mov(x86::qword_ptr(args[2], ::offset32(&spu_thread::last_ftime) - ::offset32(&spu_thread::rdata)), x86::rax);
c.xor_(x86::eax, x86::eax);
//c.jmp(_ret);
c.bind(_ret);
#ifdef _WIN32
if (s_tsx_avx)
{
c.vmovups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
c.vmovups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
}
else
{
c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
c.movups(x86::xmm8, x86::oword_ptr(x86::rsp, 32));
c.movups(x86::xmm9, x86::oword_ptr(x86::rsp, 48));
c.movups(x86::xmm10, x86::oword_ptr(x86::rsp, 64));
c.movups(x86::xmm11, x86::oword_ptr(x86::rsp, 80));
c.movups(x86::xmm12, x86::oword_ptr(x86::rsp, 96));
c.movups(x86::xmm13, x86::oword_ptr(x86::rsp, 112));
c.movups(x86::xmm14, x86::oword_ptr(x86::rsp, 128));
c.movups(x86::xmm15, x86::oword_ptr(x86::rsp, 144));
}
#endif
if (s_tsx_avx)
{
c.vzeroupper();
}
c.add(x86::rsp, 168);
c.pop(x86::rbx);
c.pop(x86::r12);
c.pop(x86::r13);
c.pop(x86::rbp);
c.ret();
});
const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rdata, cpu_thread* _spu)>([](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
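// TSX-based PUTLLUC: unconditionally store the 128-byte rdata block at raddr inside a transaction.
// As inferred from the code paths below, it returns the elapsed TSC ticks on success and 0 when
// both transaction attempts fail, in which case the caller falls back to the locked store path.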
Label fall = c.newLabel();
Label _ret = c.newLabel();
Label skip = c.newLabel();
Label next = c.newLabel();
//if (utils::has_avx() && !s_tsx_avx)
//{
// c.vzeroupper();
//}
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
c.push(x86::rbp);
c.push(x86::r13);
c.push(x86::r12);
c.push(x86::rbx);
c.sub(x86::rsp, 40);
#ifdef _WIN32
if (!s_tsx_avx)
{
c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7);
}
#endif
// Prepare registers
build_swap_rdx_with(c, args, x86::r12);
c.mov(x86::rbp, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
c.and_(args[0].r32(), 0xff80);
c.shr(args[0].r32(), 1);
c.lea(x86::rbx, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
c.prefetchw(x86::byte_ptr(x86::rbx));
c.mov(x86::r13, args[1]);
// Prepare data
if (s_tsx_avx)
{
c.vmovups(x86::ymm0, x86::yword_ptr(args[1], 0));
c.vmovups(x86::ymm1, x86::yword_ptr(args[1], 32));
c.vmovups(x86::ymm2, x86::yword_ptr(args[1], 64));
c.vmovups(x86::ymm3, x86::yword_ptr(args[1], 96));
}
else
{
c.movaps(x86::xmm0, x86::oword_ptr(args[1], 0));
c.movaps(x86::xmm1, x86::oword_ptr(args[1], 16));
c.movaps(x86::xmm2, x86::oword_ptr(args[1], 32));
c.movaps(x86::xmm3, x86::oword_ptr(args[1], 48));
c.movaps(x86::xmm4, x86::oword_ptr(args[1], 64));
c.movaps(x86::xmm5, x86::oword_ptr(args[1], 80));
c.movaps(x86::xmm6, x86::oword_ptr(args[1], 96));
c.movaps(x86::xmm7, x86::oword_ptr(args[1], 112));
}
// Alloc args[0] to stamp0
const auto stamp0 = args[0];
const auto stamp1 = args[1];
build_get_tsc(c, stamp0);
// Begin transaction
Label tx0 = build_transaction_enter(c, fall, [&]()
{
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx)), 1);
build_get_tsc(c, stamp1);
c.sub(stamp1, stamp0);
c.xor_(x86::eax, x86::eax);
c.cmp(stamp1, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit1)));
c.jae(fall);
});
c.xbegin(tx0);
c.test(x86::qword_ptr(x86::rbx), vm::rsrv_unique_lock);
c.jnz(skip);
if (s_tsx_avx)
{
c.vmovaps(x86::yword_ptr(x86::rbp, 0), x86::ymm0);
c.vmovaps(x86::yword_ptr(x86::rbp, 32), x86::ymm1);
c.vmovaps(x86::yword_ptr(x86::rbp, 64), x86::ymm2);
c.vmovaps(x86::yword_ptr(x86::rbp, 96), x86::ymm3);
}
else
{
c.movaps(x86::oword_ptr(x86::rbp, 0), x86::xmm0);
c.movaps(x86::oword_ptr(x86::rbp, 16), x86::xmm1);
c.movaps(x86::oword_ptr(x86::rbp, 32), x86::xmm2);
c.movaps(x86::oword_ptr(x86::rbp, 48), x86::xmm3);
c.movaps(x86::oword_ptr(x86::rbp, 64), x86::xmm4);
c.movaps(x86::oword_ptr(x86::rbp, 80), x86::xmm5);
c.movaps(x86::oword_ptr(x86::rbp, 96), x86::xmm6);
c.movaps(x86::oword_ptr(x86::rbp, 112), x86::xmm7);
}
c.sub(x86::qword_ptr(x86::rbx), -128);
c.xend();
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1);
build_get_tsc(c);
c.sub(x86::rax, stamp0);
c.jmp(_ret);
c.bind(skip);
c.xend();
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1);
build_get_tsc(c, stamp1);
//c.jmp(fall);
c.bind(fall);
Label fall2 = c.newLabel();
// Lightened transaction
c.bind(next);
// Lock reservation
c.mov(x86::eax, 1);
c.lock().xadd(x86::qword_ptr(x86::rbx), x86::rax);
c.test(x86::eax, 127 - 1);
c.jnz(fall2);
// Exclude some time spent on touching memory: stamp1 contains last success or failure
c.mov(x86::rax, stamp1);
c.sub(x86::rax, stamp0);
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
c.jae(fall2);
build_get_tsc(c, stamp1);
c.sub(stamp1, x86::rax);
Label tx1 = build_transaction_enter(c, fall2, [&]()
{
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx)), 1);
build_get_tsc(c);
c.sub(x86::rax, stamp1);
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
c.jae(fall2);
});
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
// Check pause flag
c.bt(x86::dword_ptr(args[2], ::offset32(&cpu_thread::state)), static_cast<u32>(cpu_flag::pause));
c.jc(fall2);
// Check contention
c.test(x86::qword_ptr(x86::rbx), 127 - 1);
c.jc(fall2);
c.xbegin(tx1);
if (s_tsx_avx)
{
c.vmovaps(x86::yword_ptr(x86::rbp, 0), x86::ymm0);
c.vmovaps(x86::yword_ptr(x86::rbp, 32), x86::ymm1);
c.vmovaps(x86::yword_ptr(x86::rbp, 64), x86::ymm2);
c.vmovaps(x86::yword_ptr(x86::rbp, 96), x86::ymm3);
}
else
{
c.movaps(x86::oword_ptr(x86::rbp, 0), x86::xmm0);
c.movaps(x86::oword_ptr(x86::rbp, 16), x86::xmm1);
c.movaps(x86::oword_ptr(x86::rbp, 32), x86::xmm2);
c.movaps(x86::oword_ptr(x86::rbp, 48), x86::xmm3);
c.movaps(x86::oword_ptr(x86::rbp, 64), x86::xmm4);
c.movaps(x86::oword_ptr(x86::rbp, 80), x86::xmm5);
c.movaps(x86::oword_ptr(x86::rbp, 96), x86::xmm6);
c.movaps(x86::oword_ptr(x86::rbp, 112), x86::xmm7);
}
c.xend();
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1);
c.lock().add(x86::qword_ptr(x86::rbx), 127);
build_get_tsc(c);
c.sub(x86::rax, stamp0);
c.jmp(_ret);
c.bind(fall2);
c.xor_(x86::eax, x86::eax);
//c.jmp(_ret);
c.bind(_ret);
#ifdef _WIN32
if (!s_tsx_avx)
{
c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
}
#endif
if (s_tsx_avx)
{
c.vzeroupper();
}
c.add(x86::rsp, 40);
c.pop(x86::rbx);
c.pop(x86::r12);
c.pop(x86::r13);
c.pop(x86::rbp);
c.ret();
});
const extern auto spu_getllar_tx = build_function_asm<u64(*)(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime)>([](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
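// TSX-based GETLLAR: read a consistent 128-byte snapshot into rdata, but only if the reservation
// time stamp still equals rtime. As inferred from the code paths below, it returns the elapsed
// TSC ticks on success and 0 if the transaction aborted or the time stamp no longer matches.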
Label fall = c.newLabel();
Label _ret = c.newLabel();
//if (utils::has_avx() && !s_tsx_avx)
//{
// c.vzeroupper();
//}
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
c.push(x86::rbp);
c.push(x86::r13);
c.push(x86::r12);
c.push(x86::rbx);
c.sub(x86::rsp, 40);
#ifdef _WIN32
if (!s_tsx_avx)
{
c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7);
}
#endif
// Prepare registers
build_swap_rdx_with(c, args, x86::r12);
c.mov(x86::rbp, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
c.and_(args[0].r32(), 0xff80);
c.shr(args[0].r32(), 1);
c.lea(x86::rbx, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
c.mov(x86::r13, args[1]);
// Alloc args[0] to stamp0
const auto stamp0 = args[0];
build_get_tsc(c, stamp0);
// Begin transaction
Label tx0 = build_transaction_enter(c, fall, [&]()
{
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx)), 1);
build_get_tsc(c);
c.sub(x86::rax, stamp0);
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit1)));
c.jae(fall);
});
// Check pause flag
c.bt(x86::dword_ptr(args[2], ::offset32(&cpu_thread::state)), static_cast<u32>(cpu_flag::pause));
c.jc(fall);
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
c.and_(x86::rax, ~vm::rsrv_shared_mask);
c.cmp(x86::rax, args[3]);
c.jne(fall);
c.xbegin(tx0);
// Just read data to registers
if (s_tsx_avx)
{
c.vmovups(x86::ymm0, x86::yword_ptr(x86::rbp, 0));
c.vmovups(x86::ymm1, x86::yword_ptr(x86::rbp, 32));
c.vmovups(x86::ymm2, x86::yword_ptr(x86::rbp, 64));
c.vmovups(x86::ymm3, x86::yword_ptr(x86::rbp, 96));
}
else
{
c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
}
c.xend();
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1);
build_get_tsc(c);
c.sub(x86::rax, stamp0);
// Store data
if (s_tsx_avx)
{
c.vmovaps(x86::yword_ptr(args[1], 0), x86::ymm0);
c.vmovaps(x86::yword_ptr(args[1], 32), x86::ymm1);
c.vmovaps(x86::yword_ptr(args[1], 64), x86::ymm2);
c.vmovaps(x86::yword_ptr(args[1], 96), x86::ymm3);
}
else
{
c.movaps(x86::oword_ptr(args[1], 0), x86::xmm0);
c.movaps(x86::oword_ptr(args[1], 16), x86::xmm1);
c.movaps(x86::oword_ptr(args[1], 32), x86::xmm2);
c.movaps(x86::oword_ptr(args[1], 48), x86::xmm3);
c.movaps(x86::oword_ptr(args[1], 64), x86::xmm4);
c.movaps(x86::oword_ptr(args[1], 80), x86::xmm5);
c.movaps(x86::oword_ptr(args[1], 96), x86::xmm6);
c.movaps(x86::oword_ptr(args[1], 112), x86::xmm7);
}
c.jmp(_ret);
c.bind(fall);
c.xor_(x86::eax, x86::eax);
//c.jmp(_ret);
c.bind(_ret);
#ifdef _WIN32
if (!s_tsx_avx)
{
c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
}
#endif
if (s_tsx_avx)
{
c.vzeroupper();
}
c.add(x86::rsp, 40);
c.pop(x86::rbx);
c.pop(x86::r12);
c.pop(x86::r13);
c.pop(x86::rbp);
c.ret();
});
void spu_int_ctrl_t::set(u64 ints)
{
// leave only enabled interrupts
ints &= mask;
// notify if at least 1 bit was set
if (ints && ~stat.fetch_or(ints) & ints)
{
std::shared_lock rlock(id_manager::g_mutex);
if (const auto tag_ptr = tag.lock())
{
if (auto handler = tag_ptr->handler.lock())
{
rlock.unlock();
handler->exec();
}
}
}
}
const spu_imm_table_t g_spu_imm;
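// Precomputed lookup tables for SPU execution:
// - scale_table: powers of two 2^i for i in [-155, 173], indexed at i + 155; presumably used as
//   scale factors by the float<->integer conversion instructions, which take a scale immediate.
// - sldq/srdq/rldq_pshufb: byte-shuffle masks implementing quadword byte shifts left/right and
//   rotates via pshufb (indices outside the quadword select zero).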
spu_imm_table_t::scale_table_t::scale_table_t()
{
for (s32 i = -155; i < 174; i++)
{
m_data[i + 155].vf = _mm_set1_ps(static_cast<float>(std::exp2(i)));
}
}
spu_imm_table_t::spu_imm_table_t()
{
for (u32 i = 0; i < std::size(sldq_pshufb); i++)
{
for (u32 j = 0; j < 16; j++)
{
sldq_pshufb[i]._u8[j] = static_cast<u8>(j - i);
}
}
for (u32 i = 0; i < std::size(srdq_pshufb); i++)
{
const u32 im = (0u - i) & 0x1f;
for (u32 j = 0; j < 16; j++)
{
srdq_pshufb[i]._u8[j] = (j + im > 15) ? 0xff : static_cast<u8>(j + im);
}
}
for (u32 i = 0; i < std::size(rldq_pshufb); i++)
{
for (u32 j = 0; j < 16; j++)
{
rldq_pshufb[i]._u8[j] = static_cast<u8>((j - i) & 0xf);
}
}
}
std::string spu_thread::dump_regs() const
{
std::string ret;
const bool floats_only = debugger_float_mode.load();
for (u32 i = 0; i < 128; i++, ret += '\n')
{
fmt::append(ret, "%s: ", spu_reg_name[i]);
const auto r = gpr[i];
if (auto [size, dst, src] = SPUDisAsm::try_get_insert_mask_info(r); size)
{
// Special: insertion masks
const std::string_view type =
size == 1 ? "byte" :
size == 2 ? "half" :
size == 4 ? "word" :
size == 8 ? "dword" : "error";
if ((size >= 4u && !src) || (size == 2u && src == 1u) || (size == 1u && src == 3u))
{
fmt::append(ret, "insert -> %s[%u]", type, dst);
continue;
}
}
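// Decode the raw 32-bit pattern with SPU single-precision semantics for display purposes:
// there are no NaN/Inf encodings and any value with a zero exponent field flushes to zero.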
auto to_f64 = [](u32 bits)
{
const u32 abs = bits & 0x7fff'ffff;
constexpr u32 scale = (1 << 23);
return std::copysign(abs < scale ? 0 : std::ldexp((scale + (abs % scale)) / f64{scale}, static_cast<int>(abs >> 23) - 127), bits >> 31 ? -1 : 1);
};
const double array[]{to_f64(r.u32r[0]), to_f64(r.u32r[1]), to_f64(r.u32r[2]), to_f64(r.u32r[3])};
const u32 i3 = r._u32[3];
const bool is_packed = v128::from32p(i3) == r;
if (floats_only)
{
fmt::append(ret, "%g, %g, %g, %g", array[0], array[1], array[2], array[3]);
continue;
}
if (is_packed)
{
// Shorthand formatting
fmt::append(ret, "%08x", i3);
}
else
{
fmt::append(ret, "%08x %08x %08x %08x", r.u32r[0], r.u32r[1], r.u32r[2], r.u32r[3]);
}
if (i3 >= 0x80 && is_exec_code(i3))
{
SPUDisAsm dis_asm(cpu_disasm_mode::normal, ls);
dis_asm.disasm(i3);
fmt::append(ret, " -> %s", dis_asm.last_opcode);
}
if (std::any_of(std::begin(array), std::end(array), [](f64 v){ return v != 0; }))
{
if (is_packed)
{
fmt::append(ret, " (%g)", array[0]);
}
else
{
fmt::append(ret, " (%g, %g, %g, %g)", array[0], array[1], array[2], array[3]);
}
}
}
const auto events = ch_events.load();
fmt::append(ret, "\nEvent Stat: 0x%x\n", events.events);
fmt::append(ret, "Event Mask: 0x%x\n", events.mask);
fmt::append(ret, "Event Count: %u\n", events.count);
fmt::append(ret, "SRR0: 0x%05x\n", srr0);
fmt::append(ret, "Stall Stat: %s\n", ch_stall_stat);
fmt::append(ret, "Stall Mask: 0x%x\n", ch_stall_mask);
fmt::append(ret, "Tag Stat: %s\n", ch_tag_stat);
fmt::append(ret, "Tag Update: %s\n", mfc_tag_update{ch_tag_upd});
fmt::append(ret, "Atomic Stat: %s\n", ch_atomic_stat); // TODO: use mfc_atomic_status formatting
fmt::append(ret, "Interrupts: %s\n", interrupts_enabled ? "Enabled" : "Disabled");
fmt::append(ret, "Inbound Mailbox: %s\n", ch_in_mbox);
fmt::append(ret, "Out Mailbox: %s\n", ch_out_mbox);
fmt::append(ret, "Out Interrupts Mailbox: %s\n", ch_out_intr_mbox);
fmt::append(ret, "SNR config: 0x%llx\n", snr_config);
fmt::append(ret, "SNR1: %s\n", ch_snr1);
fmt::append(ret, "SNR2: %s\n", ch_snr2);
const u32 addr = raddr;
fmt::append(ret, "Reservation Addr: %s\n", addr ? fmt::format("0x%x", addr) : "N/A");
fmt::append(ret, "Reservation Data:\n");
be_t<u32> data[32]{};
std::memcpy(data, rdata, sizeof(rdata)); // Show the data even if the reservation was lost inside the atomic loop
for (usz i = 0; i < std::size(data); i += 4)
{
fmt::append(ret, "[0x%02x] %08x %08x %08x %08x\n", i * sizeof(data[0])
, data[i + 0], data[i + 1], data[i + 2], data[i + 3]);
}
return ret;
}
std::string spu_thread::dump_callstack() const
{
std::string ret;
fmt::append(ret, "Call stack:\n=========\n0x%08x (0x0) called\n", pc);
for (const auto& sp : dump_callstack_list())
{
// TODO: function addresses too
fmt::append(ret, "> from 0x%08x (sp=0x%08x)\n", sp.first, sp.second);
}
return ret;
}
std::vector<std::pair<u32, u32>> spu_thread::dump_callstack_list() const
{
std::vector<std::pair<u32, u32>> call_stack_list;
bool first = true;
// Treat the first 128 bytes as invalid for the stack pointer (common values such as 0 make no sense here)
for (u32 sp = gpr[1]._u32[3]; (sp & 0xF) == 0u && sp >= 0x80u && sp <= 0x3FFE0u; sp = _ref<u32>(sp), first = false)
{
v128 lr = _ref<v128>(sp + 16);
auto is_invalid = [this](v128 v)
{
const u32 addr = v._u32[3] & 0x3FFFC;
if (v != v128::from32r(addr))
{
// 1) Non-zero lower words are invalid (because BRSL-like instructions generate only zeroes)
// 2) Bits normally masked out by indirect branches are considered invalid
return true;
}
return !addr || !is_exec_code(addr);
};
if (is_invalid(lr))
{
if (first)
{
// Function hasn't saved LR, could be because it's a leaf function
// Use LR directly instead
lr = gpr[0];
if (is_invalid(lr))
{
// Skip it, workaround
continue;
}
}
else
{
break;
}
}
// TODO: function addresses too
call_stack_list.emplace_back(lr._u32[3], sp);
}
return call_stack_list;
}
std::string spu_thread::dump_misc() const
{
std::string ret;
fmt::append(ret, "Block Weight: %u (Retreats: %u)", block_counter, block_failure);
if (g_cfg.core.spu_prof)
{
// Get short function hash
const u64 name = atomic_storage<u64>::load(block_hash);
fmt::append(ret, "\nCurrent block: %s", fmt::base57(be_t<u64>{name}));
// Print only 7 hash characters out of 11 (which covers roughly 48 bits)
ret.resize(ret.size() - 4);
// Print chunk address from lowest 16 bits
fmt::append(ret, "...chunk-0x%05x", (name & 0xffff) * 4);
}
const u32 offset = group ? SPU_FAKE_BASE_ADDR + (id & 0xffffff) * SPU_LS_SIZE : RAW_SPU_BASE_ADDR + index * RAW_SPU_OFFSET;
fmt::append(ret, "\n[%s]", ch_mfc_cmd);
fmt::append(ret, "\nLocal Storage: 0x%08x..0x%08x", offset, offset + 0x3ffff);
if (const u64 _time = start_time)
{
if (const auto func = current_func)
{
ret += "\nIn function: ";
ret += func;
}
else
{
ret += '\n';
}
fmt::append(ret, "\nWaiting: %fs", (get_system_time() - _time) / 1000000.);
}
else
{
ret += "\n\n";
}
fmt::append(ret, "\nTag Mask: 0x%08x", ch_tag_mask);
fmt::append(ret, "\nMFC Queue Size: %u", mfc_size);
for (u32 i = 0; i < 16; i++)
{
if (i < mfc_size)
{
fmt::append(ret, "\n%s", mfc_queue[i]);
}
else
{
break;
}
}
return ret;
}
void spu_thread::cpu_init()
{
std::memset(gpr.data(), 0, gpr.size() * sizeof(gpr[0]));
fpscr.Reset();
ch_mfc_cmd = {};
srr0 = 0;
mfc_size = 0;
mfc_barrier = 0;
mfc_fence = 0;
ch_tag_upd = 0;
ch_tag_mask = 0;
ch_tag_stat.data.raw() = {};
ch_stall_mask = 0;
ch_stall_stat.data.raw() = {};
ch_atomic_stat.data.raw() = {};
ch_out_mbox.data.raw() = {};
ch_out_intr_mbox.data.raw() = {};
ch_events.raw() = {};
interrupts_enabled = false;
raddr = 0;
ch_dec_start_timestamp = get_timebased_time();
ch_dec_value = option & SYS_SPU_THREAD_OPTION_DEC_SYNC_TB_ENABLE ? ~static_cast<u32>(ch_dec_start_timestamp) : 0;
if (get_type() >= spu_type::raw)
{
ch_in_mbox.clear();
ch_snr1.data.raw() = {};
ch_snr2.data.raw() = {};
snr_config = 0;
mfc_prxy_mask.raw() = 0;
mfc_prxy_write_state = {};
}
status_npc.raw() = {get_type() == spu_type::isolated ? SPU_STATUS_IS_ISOLATED : 0, 0};
run_ctrl.raw() = 0;
int_ctrl[0].clear();
int_ctrl[1].clear();
int_ctrl[2].clear();
gpr[1]._u32[3] = 0x3FFF0; // initial stack frame pointer
}
void spu_thread::cpu_return()
{
if (get_type() >= spu_type::raw)
{
if (status_npc.fetch_op([this](status_npc_sync_var& state)
{
if (state.status & SPU_STATUS_RUNNING)
{
// Save next PC and current SPU Interrupt Status
// Used only by RunCtrl stop requests
state.status &= ~SPU_STATUS_RUNNING;
state.npc = pc | +interrupts_enabled;
return true;
}
return false;
}).second)
{
status_npc.notify_one();
}
}
else if (is_stopped())
{
ch_in_mbox.clear();
if (ensure(group->running)-- == 1)
{
{
std::lock_guard lock(group->mutex);
group->run_state = SPU_THREAD_GROUP_STATUS_INITIALIZED;
if (!group->join_state)
{
group->join_state = SYS_SPU_THREAD_GROUP_JOIN_ALL_THREADS_EXIT;
}
for (const auto& thread : group->threads)
{
if (thread && thread.get() != this && thread->status_npc.load().status >> 16 == SYS_SPU_THREAD_STOP_THREAD_EXIT)
{
// Wait for all threads to have error codes if exited by sys_spu_thread_exit
for (u32 status; !thread->exit_status.try_read(status)
|| status != thread->last_exit_status;)
{
utils::pause();
}
}
}
if (status_npc.load().status >> 16 == SYS_SPU_THREAD_STOP_THREAD_EXIT)
{
// Set exit status now, in conjunction with group state changes
exit_status.set_value(last_exit_status);
}
group->stop_count++;
if (const auto ppu = std::exchange(group->waiter, nullptr))
{
// Send exit status directly to the joining thread
ppu->gpr[4] = group->join_state;
ppu->gpr[5] = group->exit_status;
group->join_state.release(0);
lv2_obj::awake(ppu);
}
}
// Notify on last thread stopped
group->stop_count.notify_all();
}
else if (status_npc.load().status >> 16 == SYS_SPU_THREAD_STOP_THREAD_EXIT)
{
exit_status.set_value(last_exit_status);
}
}
}
extern thread_local std::string(*g_tls_log_prefix)();
void spu_thread::cpu_task()
{
// Get next PC and SPU Interrupt status
pc = status_npc.load().npc;
// Note: works both on RawSPU and threaded SPU!
set_interrupt_status((pc & 1) != 0);
pc &= 0x3fffc;
std::fesetround(FE_TOWARDZERO);
g_tls_log_prefix = []
{
const auto cpu = static_cast<spu_thread*>(get_current_cpu_thread());
static thread_local shared_ptr<std::string> name_cache;
if (!cpu->spu_tname.is_equal(name_cache)) [[unlikely]]
{
cpu->spu_tname.peek_op([&](const shared_ptr<std::string>& ptr)
{
if (ptr != name_cache)
{
name_cache = ptr;
}
});
}
const auto type = cpu->get_type();
return fmt::format("%sSPU[0x%07x] Thread (%s) [0x%05x]", type >= spu_type::raw ? type == spu_type::isolated ? "Iso" : "Raw" : "", cpu->lv2_id, *name_cache.get(), cpu->pc);
};
if (jit)
{
while (true)
{
if (state) [[unlikely]]
{
if (check_state())
break;
}
if (_ref<u32>(pc) == 0x0u)
{
if (spu_thread::stop_and_signal(0x0))
pc += 4;
continue;
}
spu_runtime::g_gateway(*this, _ptr<u8>(0), nullptr);
}
// Print some stats
spu_log.notice("Stats: Block Weight: %u (Retreats: %u);", block_counter, block_failure);
}
else
{
ensure(spu_runtime::g_interpreter);
while (true)
{
if (state) [[unlikely]]
{
if (check_state())
break;
}
spu_runtime::g_interpreter(*this, _ptr<u8>(0), nullptr);
}
}
}
struct raw_spu_cleanup
{
raw_spu_cleanup() = default;
raw_spu_cleanup(const raw_spu_cleanup&) = delete;
raw_spu_cleanup& operator =(const raw_spu_cleanup&) = delete;
~raw_spu_cleanup()
{
std::memset(spu_thread::g_raw_spu_id, 0, sizeof(spu_thread::g_raw_spu_id));
spu_thread::g_raw_spu_ctr = 0;
g_fxo->get<raw_spu_cleanup>(); // Register destructor
}
};
void spu_thread::cleanup()
{
// Deallocate local storage
ensure(vm::dealloc(vm_offset(), vm::spu, &shm));
if (g_cfg.core.mfc_debug)
{
utils::memory_decommit(vm::g_stat_addr + vm_offset(), SPU_LS_SIZE);
}
// Deallocate RawSPU ID
if (get_type() >= spu_type::raw)
{
g_raw_spu_id[index] = 0;
g_raw_spu_ctr--;
}
// Free the range lock (this also signals to the destructor that cleanup was called)
vm::free_range_lock(range_lock);
// Signal the debugger about the termination
state += cpu_flag::exit;
}
spu_thread::~spu_thread()
{
// Unmap LS and its mirrors
shm->unmap(ls + SPU_LS_SIZE);
shm->unmap(ls);
shm->unmap(ls - SPU_LS_SIZE);
perf_log.notice("Perf stats for transactions: success %u, failure %u", stx, ftx);
perf_log.notice("Perf stats for PUTLLC reload: successs %u, failure %u", last_succ, last_fail);
}
spu_thread::spu_thread(lv2_spu_group* group, u32 index, std::string_view name, u32 lv2_id, bool is_isolated, u32 option)
: cpu_thread(idm::last_id())
, group(group)
, index(index)
, shm(std::make_shared<utils::shm>(SPU_LS_SIZE))
, ls([&]()
{
if (g_cfg.core.mfc_debug)
{
utils::memory_commit(vm::g_stat_addr + vm_offset(), SPU_LS_SIZE);
}
if (!group)
{
ensure(vm::get(vm::spu)->falloc(vm_offset(), SPU_LS_SIZE, &shm, 0x200));
}
else
{
// The extra 0x1000 flag tells falloc to allocate the page with no access rights in base memory
ensure(vm::get(vm::spu)->falloc(vm_offset(), SPU_LS_SIZE, &shm, 0x1200));
}
// Try to guess free area
const auto start = vm::g_free_addr + SPU_LS_SIZE * (cpu_thread::id & 0xffffff) * 12;
u32 total = 0;
// Map LS and its mirrors
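// The same 256 KiB SHM block is mapped at three consecutive addresses and the middle mapping is
// returned, so accesses that run slightly past either end of LS land in a valid mirror instead of
// faulting (presumably to avoid extra address masking on hot paths).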
for (u64 addr = reinterpret_cast<u64>(start); addr < 0x8000'0000'0000;)
{
if (auto ptr = shm->try_map(reinterpret_cast<u8*>(addr)))
{
if (++total == 3)
{
// Use the middle mirror
return ptr - SPU_LS_SIZE;
}
addr += SPU_LS_SIZE;
}
else
{
// Reset, cleanup and start again
for (u32 i = 1; i <= total; i++)
{
shm->unmap(reinterpret_cast<u8*>(addr - i * SPU_LS_SIZE));
}
total = 0;
addr += 0x10000;
}
}
fmt::throw_exception("Failed to map SPU LS memory");
}())
, thread_type(group ? spu_type::threaded : is_isolated ? spu_type::isolated : spu_type::raw)
, option(option)
, lv2_id(lv2_id)
, spu_tname(make_single<std::string>(name))
{
if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
{
jit = spu_recompiler_base::make_asmjit_recompiler();
}
if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
jit = spu_recompiler_base::make_fast_llvm_recompiler();
}
if (g_cfg.core.spu_decoder != spu_decoder_type::fast && g_cfg.core.spu_decoder != spu_decoder_type::precise)
{
if (g_cfg.core.spu_block_size != spu_block_size_type::safe)
{
// Initialize stack mirror
std::memset(stack_mirror.data(), 0xff, sizeof(stack_mirror));
}
}
if (get_type() >= spu_type::raw)
{
cpu_init();
}
range_lock = vm::alloc_range_lock();
}
void spu_thread::push_snr(u32 number, u32 value)
{
// Get channel
const auto channel = number & 1 ? &ch_snr2 : &ch_snr1;
// Prepare some data
const u32 event_bit = SPU_EVENT_S1 >> (number & 1);
const u32 bitor_bit = (snr_config >> number) & 1;
// Redundant, g_use_rtm is checked inside tx_start now.
if (g_use_rtm)
{
bool channel_notify = false;
bool thread_notify = false;
const bool ok = utils::tx_start([&]
{
channel_notify = (channel->data.raw() & spu_channel::bit_wait) != 0;
thread_notify = (channel->data.raw() & spu_channel::bit_count) == 0;
if (bitor_bit)
{
channel->data.raw() &= ~spu_channel::bit_wait;
channel->data.raw() |= spu_channel::bit_count | value;
}
else
{
channel->data.raw() = spu_channel::bit_count | value;
}
if (thread_notify)
{
ch_events.raw().events |= event_bit;
if (ch_events.raw().mask & event_bit)
{
ch_events.raw().count = 1;
thread_notify = ch_events.raw().waiting != 0;
}
else
{
thread_notify = false;
}
}
});
if (ok)
{
if (channel_notify)
channel->data.notify_one();
if (thread_notify)
this->notify();
return;
}
}
// Lock event channel in case it needs event notification
ch_events.atomic_op([](ch_events_t& ev)
{
ev.locks++;
});
// Check corresponding SNR register settings
if (bitor_bit)
{
if (channel->push_or(value))
set_events(event_bit);
}
else
{
if (channel->push(value))
set_events(event_bit);
}
ch_events.atomic_op([](ch_events_t& ev)
{
ev.locks--;
});
}
void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8* ls)
{
perf_meter<"DMA"_u32> perf_;
const bool is_get = (args.cmd & ~(MFC_BARRIER_MASK | MFC_FENCE_MASK | MFC_START_MASK)) == MFC_GET_CMD;
u32 eal = args.eal;
u32 lsa = args.lsa & 0x3ffff;
// Keep src as a pointer to const
u8* dst = nullptr;
const u8* src = nullptr;
std::tie(dst, src) = [&]() -> std::pair<u8*, const u8*>
{
u8* dst = vm::_ptr<u8>(eal);
u8* src = ls + lsa;
if (is_get)
{
std::swap(src, dst);
}
return {dst, src};
}();
// SPU Thread Group MMIO (LS and SNR) and RawSPU MMIO
if (_this && eal >= RAW_SPU_BASE_ADDR)
{
if (g_cfg.core.mfc_debug && _this)
{
// TODO
}
const u32 index = (eal - SYS_SPU_THREAD_BASE_LOW) / SYS_SPU_THREAD_OFFSET; // thread number in group
const u32 offset = (eal - SYS_SPU_THREAD_BASE_LOW) % SYS_SPU_THREAD_OFFSET; // LS offset or MMIO register
if (eal < SYS_SPU_THREAD_BASE_LOW)
{
// RawSPU MMIO
auto thread = idm::get<named_thread<spu_thread>>(find_raw_spu((eal - RAW_SPU_BASE_ADDR) / RAW_SPU_OFFSET));
if (!thread)
{
// Access Violation
}
else if ((eal - RAW_SPU_BASE_ADDR) % RAW_SPU_OFFSET + args.size - 1 < SPU_LS_SIZE) // LS access
{
}
else if (u32 value; args.size == 4 && is_get && thread->read_reg(eal, value))
{
_this->_ref<u32>(lsa) = value;
return;
}
else if (args.size == 4 && !is_get && thread->write_reg(eal, args.cmd != MFC_SDCRZ_CMD ? + _this->_ref<u32>(lsa) : 0))
{
return;
}
else
{
fmt::throw_exception("Invalid RawSPU MMIO offset (cmd=[%s])", args);
}
}
else if (_this->get_type() >= spu_type::raw)
{
// Access Violation
}
else if (_this->group && _this->group->threads_map[index] != -1)
{
auto& spu = static_cast<spu_thread&>(*_this->group->threads[_this->group->threads_map[index]]);
if (offset + args.size <= SPU_LS_SIZE) // LS access
{
// redirect access
if (auto ptr = spu.ls + offset; is_get)
src = ptr;
else
dst = ptr;
}
else if (!is_get && args.size == 4 && (offset == SYS_SPU_THREAD_SNR1 || offset == SYS_SPU_THREAD_SNR2))
{
spu.push_snr(SYS_SPU_THREAD_SNR2 == offset, args.cmd != MFC_SDCRZ_CMD ? +_this->_ref<u32>(lsa) : 0);
return;
}
else
{
fmt::throw_exception("Invalid MMIO offset (cmd=[%s])", args);
}
}
else
{
// Access Violation
}
}
// Cleanup: if PUT or GET happens after PUTLLC failure, it's too complicated and it's easier to just give up
if (_this)
{
_this->last_faddr = 0;
}
// This case is so rare that optimizations are not implemented (TODO)
alignas(64) static constexpr u8 zero_buf[0x10000]{};
if (args.cmd == MFC_SDCRZ_CMD)
{
src = zero_buf;
}
if ((!g_use_rtm && !is_get) || g_cfg.core.spu_accurate_dma) [[unlikely]]
{
perf_meter<"ADMA_GET"_u64> perf_get = perf_;
perf_meter<"ADMA_PUT"_u64> perf_put = perf_;
cpu_thread* _cpu = _this ? _this : get_current_cpu_thread();
atomic_t<u64, 64>* range_lock = nullptr;
if (!_this) [[unlikely]]
{
if (_cpu->id_type() == 2)
{
// Use range_lock of current SPU thread for range locks
range_lock = static_cast<spu_thread*>(_cpu)->range_lock;
}
else
{
goto plain_access;
}
}
else
{
range_lock = _this->range_lock;
}
utils::prefetch_write(range_lock);
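// Accurate GET: copy in chunks of at most 128 bytes that never cross a cache line, retrying each
// chunk until the reservation counter shows no writer (low bits clear) and is unchanged across
// the copy, so every chunk is a consistent snapshot of its line.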
for (u32 size = args.size, size0; is_get; size -= size0, dst += size0, src += size0, eal += size0)
{
size0 = std::min<u32>(128 - (eal & 127), std::min<u32>(size, 128));
for (u64 i = 0;; [&]()
{
if (_cpu->state)
{
_cpu->check_state();
}
else if (++i < 25) [[likely]]
{
busy_wait(300);
}
else
{
_cpu->state += cpu_flag::wait + cpu_flag::temp;
std::this_thread::yield();
_cpu->check_state();
}
}())
{
const u64 time0 = vm::reservation_acquire(eal);
if (time0 & 127)
{
continue;
}
const auto cpu = static_cast<spu_thread*>(get_current_cpu_thread());
alignas(64) u8 temp[128];
u8* dst0 = cpu && cpu->id_type() != 1 && (eal & -128) == cpu->raddr ? temp : dst;
if (dst0 == +temp && time0 != cpu->rtime)
{
// Validate rtime for read data
cpu->set_events(SPU_EVENT_LR);
cpu->raddr = 0;
}
switch (size0)
{
case 1:
{
*reinterpret_cast<u8*>(dst0) = *reinterpret_cast<const u8*>(src);
break;
}
case 2:
{
*reinterpret_cast<u16*>(dst0) = *reinterpret_cast<const u16*>(src);
break;
}
case 4:
{
*reinterpret_cast<u32*>(dst0) = *reinterpret_cast<const u32*>(src);
break;
}
case 8:
{
*reinterpret_cast<u64*>(dst0) = *reinterpret_cast<const u64*>(src);
break;
}
case 128:
{
mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst0), *reinterpret_cast<const spu_rdata_t*>(src));
break;
}
default:
{
auto dst1 = dst0;
auto src1 = src;
auto size1 = size0;
while (size1)
{
*reinterpret_cast<v128*>(dst1) = *reinterpret_cast<const v128*>(src1);
dst1 += 16;
src1 += 16;
size1 -= 16;
}
break;
}
}
if (time0 != vm::reservation_acquire(eal) || (size0 == 128 && !cmp_rdata(*reinterpret_cast<spu_rdata_t*>(dst0), *reinterpret_cast<const spu_rdata_t*>(src))))
{
continue;
}
if (dst0 == +temp)
{
// Write to LS
std::memcpy(dst, dst0, size0);
// Validate data
if (std::memcmp(dst0, &cpu->rdata[eal & 127], size0) != 0)
{
cpu->set_events(SPU_EVENT_LR);
cpu->raddr = 0;
}
}
break;
}
if (size == size0)
{
if (g_cfg.core.mfc_debug && _this)
{
auto& dump = reinterpret_cast<mfc_cmd_dump*>(vm::g_stat_addr + _this->vm_offset())[_this->mfc_dump_idx++ % spu_thread::max_mfc_dump_idx];
dump.cmd = args;
dump.cmd.eah = _this->pc;
std::memcpy(dump.data, is_get ? dst : src, std::min<u32>(args.size, 128));
}
return;
}
}
if (g_cfg.core.spu_accurate_dma) [[unlikely]]
{
for (u32 size0, size = args.size;; size -= size0, dst += size0, src += size0, eal += size0)
{
size0 = std::min<u32>(128 - (eal & 127), std::min<u32>(size, 128));
if (size0 == 128u && g_cfg.core.accurate_cache_line_stores)
{
// As atomic as PUTLLUC
do_cell_atomic_128_store(eal, src);
if (size == size0)
{
break;
}
continue;
}
// Lock each cache line
auto& res = vm::reservation_acquire(eal);
// Lock each bit corresponding to a byte being written, using some free space in reservation memory
auto* bits = utils::bless<atomic_t<u128>>(vm::g_reservations + ((eal & 0xff80) / 2 + 16));
// Get writing mask
const u128 wmask = (~u128{} << (eal & 127)) & (~u128{} >> (127 - ((eal + size0 - 1) & 127)));
//const u64 start = (eal & 127) / 2;
//const u64 _end_ = ((eal + size0 - 1) & 127) / 2;
//const u64 wmask = (UINT64_MAX << start) & (UINT64_MAX >> (63 - _end_));
u128 old = 0;
for (u64 i = 0; i != umax; [&]()
{
if (_cpu->state & cpu_flag::pause)
{
const bool ok = cpu_thread::if_suspended<0>(_cpu, {dst, dst + 64, &res}, [&]
{
std::memcpy(dst, src, size0);
res += 128;
});
if (ok)
{
// Exit loop and function
i = -1;
bits = nullptr;
return;
}
}
if (++i < 10)
{
busy_wait(500);
}
else
{
// Wait
_cpu->state += cpu_flag::wait + cpu_flag::temp;
bits->wait(old, wmask);
_cpu->check_state();
}
}())
{
// Completed in suspend_all()
if (!bits)
{
break;
}
bool ok = false;
std::tie(old, ok) = bits->fetch_op([&](auto& v)
{
if (v & wmask)
{
return false;
}
v |= wmask;
return true;
});
if (ok) [[likely]]
{
break;
}
}
if (!bits)
{
if (size == size0)
{
break;
}
continue;
}
// Lock reservation (shared)
auto [_oldd, _ok] = res.fetch_op([&](u64& r)
{
if (r & vm::rsrv_unique_lock)
{
return false;
}
r += 1;
return true;
});
if (!_ok)
{
vm::reservation_shared_lock_internal(res);
}
// Obtain range lock as normal store
vm::range_lock(range_lock, eal, size0);
switch (size0)
{
case 1:
{
*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
break;
}
case 2:
{
*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
break;
}
case 4:
{
*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
break;
}
case 8:
{
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
break;
}
case 128:
{
mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
break;
}
default:
{
auto _dst = dst;
auto _src = src;
auto _size = size0;
while (_size)
{
*reinterpret_cast<v128*>(_dst) = *reinterpret_cast<const v128*>(_src);
_dst += 16;
_src += 16;
_size -= 16;
}
break;
}
}
range_lock->release(0);
res += 127;
// Release bits and notify
bits->atomic_op([&](auto& v)
{
v &= ~wmask;
});
bits->notify_all(wmask);
if (size == size0)
{
break;
}
}
//atomic_fence_seq_cst();
if (g_cfg.core.mfc_debug && _this)
{
auto& dump = reinterpret_cast<mfc_cmd_dump*>(vm::g_stat_addr + _this->vm_offset())[_this->mfc_dump_idx++ % spu_thread::max_mfc_dump_idx];
dump.cmd = args;
dump.cmd.eah = _this->pc;
std::memcpy(dump.data, is_get ? dst : src, std::min<u32>(args.size, 128));
}
return;
}
else
{
perf_put.reset();
perf_get.reset();
}
perf_meter<"DMA_PUT"_u64> perf2 = perf_;
switch (u32 size = args.size)
{
case 1:
{
vm::range_lock<1>(range_lock, eal, 1);
*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
range_lock->release(0);
break;
}
case 2:
{
vm::range_lock<2>(range_lock, eal, 2);
*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
range_lock->release(0);
break;
}
case 4:
{
vm::range_lock<4>(range_lock, eal, 4);
*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
range_lock->release(0);
break;
}
case 8:
{
vm::range_lock<8>(range_lock, eal, 8);
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
range_lock->release(0);
break;
}
default:
{
if (((eal & 127) + size) <= 128)
{
vm::range_lock(range_lock, eal, size);
while (size)
{
*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
dst += 16;
src += 16;
size -= 16;
}
range_lock->release(0);
break;
}
u32 range_addr = eal & -128;
u32 range_end = utils::align(eal + size, 128);
// Handle the case of crossing 64K page borders (TODO: maybe split in 4K fragments?)
if (range_addr >> 16 != (range_end - 1) >> 16)
{
u32 nexta = range_end & -65536;
u32 size0 = nexta - eal;
size -= size0;
// Split locking + transfer in two parts (before 64K border, and after it)
vm::range_lock(range_lock, range_addr, size0);
// Avoid unaligned stores in mov_rdata_avx
if (reinterpret_cast<u64>(dst) & 0x10)
{
*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
dst += 16;
src += 16;
size0 -= 16;
}
while (size0 >= 128)
{
mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
dst += 128;
src += 128;
size0 -= 128;
}
while (size0)
{
*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
dst += 16;
src += 16;
size0 -= 16;
}
range_lock->release(0);
range_addr = nexta;
}
vm::range_lock(range_lock, range_addr, range_end - range_addr);
// Avoid unaligned stores in mov_rdata_avx
if (reinterpret_cast<u64>(dst) & 0x10)
{
*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
dst += 16;
src += 16;
size -= 16;
}
while (size >= 128)
{
mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
dst += 128;
src += 128;
size -= 128;
}
while (size)
{
*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
dst += 16;
src += 16;
size -= 16;
}
range_lock->release(0);
break;
}
}
if (g_cfg.core.mfc_debug && _this)
{
auto& dump = reinterpret_cast<mfc_cmd_dump*>(vm::g_stat_addr + _this->vm_offset())[_this->mfc_dump_idx++ % spu_thread::max_mfc_dump_idx];
dump.cmd = args;
dump.cmd.eah = _this->pc;
std::memcpy(dump.data, is_get ? dst : src, std::min<u32>(args.size, 128));
}
return;
}
plain_access:
switch (u32 size = args.size)
{
case 1:
{
*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
break;
}
case 2:
{
*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
break;
}
case 4:
{
*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
break;
}
case 8:
{
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
break;
}
default:
{
// Avoid unaligned stores in mov_rdata_avx
if (reinterpret_cast<u64>(dst) & 0x10)
{
*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
dst += 16;
src += 16;
size -= 16;
}
while (size >= 128)
{
mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
dst += 128;
src += 128;
size -= 128;
}
while (size)
{
*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
dst += 16;
src += 16;
size -= 16;
}
break;
}
}
if (g_cfg.core.mfc_debug && _this)
{
auto& dump = reinterpret_cast<mfc_cmd_dump*>(vm::g_stat_addr + _this->vm_offset())[_this->mfc_dump_idx++ % spu_thread::max_mfc_dump_idx];
dump.cmd = args;
dump.cmd.eah = _this->pc;
std::memcpy(dump.data, is_get ? dst : src, std::min<u32>(args.size, 128));
}
}
bool spu_thread::do_dma_check(const spu_mfc_cmd& args)
{
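// Select the bit for this command's tag in the barrier/fence masks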
const u32 mask = utils::rol32(1, args.tag);
if (mfc_barrier & mask || (args.cmd & (MFC_BARRIER_MASK | MFC_FENCE_MASK) && mfc_fence & mask)) [[unlikely]]
{
// Check for special value combination (normally impossible)
if (false)
{
// Update barrier/fence masks if necessary
mfc_barrier = 0;
mfc_fence = 0;
for (u32 i = 0; i < mfc_size; i++)
{
if ((mfc_queue[i].cmd & ~0xc) == MFC_BARRIER_CMD)
{
mfc_barrier |= -1;
mfc_fence |= utils::rol32(1, mfc_queue[i].tag);
continue;
}
if (true)
{
const u32 _mask = utils::rol32(1u, mfc_queue[i].tag);
// A command with barrier hard blocks that tag until it's been dealt with
if (mfc_queue[i].cmd & MFC_BARRIER_MASK)
{
mfc_barrier |= _mask;
}
// A new command that has a fence can't be executed until the stalled list has been dealt with
mfc_fence |= _mask;
}
}
if (mfc_barrier & mask || (args.cmd & MFC_FENCE_MASK && mfc_fence & mask))
{
return false;
}
return true;
}
return false;
}
return true;
}
bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
{
// Number of list elements to fetch in one go
constexpr u32 fetch_size = 6;
struct alignas(8) list_element
{
be_t<u16> sb; // Stall-and-Notify bit (0x8000)
be_t<u16> ts; // List Transfer Size
be_t<u32> ea; // External Address Low
};
union
{
list_element items[fetch_size];
alignas(v128) char bufitems[sizeof(items)];
};
spu_mfc_cmd transfer;
transfer.eah = 0;
transfer.tag = args.tag;
transfer.cmd = MFC(args.cmd & ~MFC_LIST_MASK);
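// For list commands LSA is kept 16-byte aligned and EAL holds the 8-byte aligned list address in local storage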
args.lsa &= 0x3fff0;
args.eal &= 0x3fff8;
u32 index = fetch_size;
// Assume called with size greater than 0
while (true)
{
// Check if fetching is needed
if (index == fetch_size)
{
// Reset to elements array head
index = 0;
const auto src = _ptr<const void>(args.eal);
const v128 data0 = v128::loadu(src, 0);
const v128 data1 = v128::loadu(src, 1);
const v128 data2 = v128::loadu(src, 2);
reinterpret_cast<v128*>(bufitems)[0] = data0;
reinterpret_cast<v128*>(bufitems)[1] = data1;
reinterpret_cast<v128*>(bufitems)[2] = data2;
}
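// Transfer size is the low 15 bits of 'ts'; the stall-and-notify flag is bit 15 of 'sb'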
const u32 size = items[index].ts & 0x7fff;
const u32 addr = items[index].ea;
spu_log.trace("LIST: item=0x%016x, lsa=0x%05x", std::bit_cast<be_t<u64>>(items[index]), args.lsa | (addr & 0xf));
if (size)
{
transfer.eal = addr;
transfer.lsa = args.lsa | (addr & 0xf);
transfer.size = size;
do_dma_transfer(this, transfer, ls);
const u32 add_size = std::max<u32>(size, 16);
args.lsa += add_size;
}
args.size -= 8;
if (!args.size)
{
// No more elements
break;
}
args.eal += 8;
if (items[index].sb & 0x8000) [[unlikely]]
{
ch_stall_mask |= utils::rol32(1, args.tag);
if (!ch_stall_stat.get_count())
{
set_events(SPU_EVENT_SN);
}
ch_stall_stat.set_value(utils::rol32(1, args.tag) | ch_stall_stat.get_value());
args.tag |= 0x80; // Set stalled status
return false;
}
index++;
}
return true;
}
bool spu_thread::do_putllc(const spu_mfc_cmd& args)
{
perf_meter<"PUTLLC-"_u64> perf0;
perf_meter<"PUTLLC+"_u64> perf1 = perf0;
// Store conditionally
const u32 addr = args.eal & -128;
if ([&]()
{
perf_meter<"PUTLLC."_u64> perf2 = perf0;
if (raddr != addr)
{
return false;
}
const auto& to_write = _ref<spu_rdata_t>(args.lsa & 0x3ff80);
auto& res = vm::reservation_acquire(addr);
// TODO: Limit scope!!
rsx::reservation_lock rsx_lock(addr, 128);
if (!g_use_rtm && rtime != res)
{
return false;
}
if (!g_use_rtm && cmp_rdata(to_write, rdata))
{
// Write-back of unchanged data: only need to verify that memory has not changed
return cmp_rdata(rdata, vm::_ref<spu_rdata_t>(addr)) && res.compare_and_swap_test(rtime, rtime + 128);
}
if (g_use_rtm) [[likely]]
{
switch (u64 count = spu_putllc_tx(addr, rtime, rdata, to_write))
{
case UINT64_MAX:
{
auto& data = *vm::get_super_ptr<spu_rdata_t>(addr);
const bool ok = cpu_thread::suspend_all<+3>(this, {data, data + 64, &res}, [&]()
{
if ((res & -128) == rtime)
{
if (cmp_rdata(rdata, data))
{
mov_rdata(data, to_write);
res += 127;
return true;
}
}
// Save previous data
mov_rdata_nt(rdata, data);
res -= 1;
return false;
});
const u64 count2 = __rdtsc() - perf2.get();
if (count2 > 20000 && g_cfg.core.perf_report) [[unlikely]]
{
perf_log.warning(u8"PUTLLC: took too long: %.3fµs (%u c) (addr=0x%x) (S)", count2 / (utils::get_tsc_freq() / 1000'000.), count2, addr);
}
if (ok)
{
break;
}
last_ftime = -1;
[[fallthrough]];
}
case 0:
{
if (addr == last_faddr)
{
last_fail++;
}
if (last_ftime != umax)
{
last_faddr = 0;
return false;
}
utils::prefetch_read(rdata);
utils::prefetch_read(rdata + 64);
last_faddr = addr;
last_ftime = res.load() & -128;
last_ftsc = __rdtsc();
return false;
}
default:
{
if (count > 20000 && g_cfg.core.perf_report) [[unlikely]]
{
perf_log.warning(u8"PUTLLC: took too long: %.3fµs (%u c) (addr = 0x%x)", count / (utils::get_tsc_freq() / 1000'000.), count, addr);
}
break;
}
}
if (addr == last_faddr)
{
last_succ++;
}
last_faddr = 0;
return true;
}
auto [_oldd, _ok] = res.fetch_op([&](u64& r)
{
if ((r & -128) != rtime || (r & 127))
{
return false;
}
r += vm::rsrv_unique_lock;
return true;
});
if (!_ok)
{
// Already locked or updated: give up
return false;
}
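// Dummy atomic RMW touches the target line before taking the heavyweight lock below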
vm::_ref<atomic_t<u32>>(addr) += 0;
auto& super_data = *vm::get_super_ptr<spu_rdata_t>(addr);
const bool success = [&]()
{
// Full lock (heavyweight)
// TODO: vm::check_addr
vm::writer_lock lock(addr);
if (cmp_rdata(rdata, super_data))
{
mov_rdata(super_data, to_write);
res += 64;
return true;
}
res -= 64;
return false;
}();
return success;
}())
{
vm::reservation_notifier(addr).notify_all(-128);
raddr = 0;
perf0.reset();
return true;
}
else
{
if (raddr)
{
// Last check for event before we clear the reservation
if (raddr == addr)
{
set_events(SPU_EVENT_LR);
}
else
{
get_events(SPU_EVENT_LR);
}
}
if (!vm::check_addr(addr, vm::page_writable))
{
vm::_ref<atomic_t<u8>>(addr) += 0; // Raise an access violation
}
raddr = 0;
perf1.reset();
return false;
}
}
void do_cell_atomic_128_store(u32 addr, const void* to_write)
{
perf_meter<"STORE128"_u64> perf0;
const auto cpu = get_current_cpu_thread();
rsx::reservation_lock rsx_lock(addr, 128);
if (g_use_rtm) [[likely]]
{
u64 result = spu_putlluc_tx(addr, to_write, cpu);
if (result == 0)
{
auto& sdata = *vm::get_super_ptr<spu_rdata_t>(addr);
auto& res = vm::reservation_acquire(addr);
cpu_thread::suspend_all<+2>(cpu, {&res}, [&]
{
mov_rdata_nt(sdata, *static_cast<const spu_rdata_t*>(to_write));
res += 127;
});
}
if (!result)
{
result = __rdtsc() - perf0.get();
}
if (result > 20000 && g_cfg.core.perf_report) [[unlikely]]
{
perf_log.warning(u8"STORE128: took too long: %.3fµs (%u c) (addr=0x%x)", result / (utils::get_tsc_freq() / 1000'000.), result, addr);
}
static_cast<void>(cpu->test_stopped());
}
else
{
auto& data = vm::_ref<spu_rdata_t>(addr);
auto [res, time0] = vm::reservation_lock(addr);
*reinterpret_cast<atomic_t<u32>*>(&data) += 0;
auto& super_data = *vm::get_super_ptr<spu_rdata_t>(addr);
{
// Full lock (heavyweight)
// TODO: vm::check_addr
vm::writer_lock lock(addr);
mov_rdata(super_data, *static_cast<const spu_rdata_t*>(to_write));
res += 64;
}
}
}
void spu_thread::do_putlluc(const spu_mfc_cmd& args)
{
perf_meter<"PUTLLUC"_u64> perf0;
const u32 addr = args.eal & -128;
if (raddr && addr == raddr)
{
// Try to process PUTLLUC using PUTLLC when a reservation is active:
// If it fails, the reservation is cleared, the LR event is set, and we fall back to the main implementation
// All of this is done atomically in PUTLLC
if (do_putllc(args))
{
// Success, return as our job was done here
return;
}
// Failure: fall back to the main implementation
}
do_cell_atomic_128_store(addr, _ptr<spu_rdata_t>(args.lsa & 0x3ff80));
vm::reservation_notifier(addr).notify_all(-128);
}
void spu_thread::do_mfc(bool /*wait*/)
{
u32 removed = 0;
u32 barrier = 0;
u32 fence = 0;
// Process enqueued commands
static_cast<void>(std::remove_if(mfc_queue + 0, mfc_queue + mfc_size, [&](spu_mfc_cmd& args)
{
// Select tag bit in the tag mask or the stall mask
const u32 mask = utils::rol32(1, args.tag);
if ((args.cmd & ~0xc) == MFC_BARRIER_CMD)
{
if (&args - mfc_queue <= removed)
{
// Remove barrier-class command if it's the first in the queue
atomic_fence_seq_cst();
removed++;
return true;
}
// Block all tags
barrier |= -1;
fence |= mask;
return false;
}
if (barrier & mask)
{
fence |= mask;
return false;
}
if (args.cmd & (MFC_BARRIER_MASK | MFC_FENCE_MASK) && fence & mask)
{
if (args.cmd & MFC_BARRIER_MASK)
{
barrier |= mask;
}
return false;
}
if (args.cmd & MFC_LIST_MASK)
{
if (!(args.tag & 0x80))
{
if (do_list_transfer(args))
{
removed++;
return true;
}
}
if (args.cmd & MFC_BARRIER_MASK)
{
barrier |= mask;
}
fence |= mask;
return false;
}
if (args.cmd == MFC_PUTQLLUC_CMD)
{
if (fence & mask)
{
return false;
}
do_putlluc(args);
}
else if (args.size)
{
do_dma_transfer(this, args, ls);
}
removed++;
return true;
}));
mfc_size -= removed;
mfc_barrier = barrier;
mfc_fence = fence;
if (removed && ch_tag_upd)
{
const u32 completed = get_mfc_completed();
if (completed && ch_tag_upd == MFC_TAG_UPDATE_ANY)
{
ch_tag_stat.set_value(completed);
ch_tag_upd = MFC_TAG_UPDATE_IMMEDIATE;
}
else if (completed == ch_tag_mask && ch_tag_upd == MFC_TAG_UPDATE_ALL)
{
ch_tag_stat.set_value(completed);
ch_tag_upd = MFC_TAG_UPDATE_IMMEDIATE;
}
}
if (check_mfc_interrupts(pc + 4))
{
spu_runtime::g_escape(this);
}
}
bool spu_thread::check_mfc_interrupts(u32 next_pc)
{
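// An interrupt is taken only if the event channel count is set and interrupts are enabled; SRR0 receives the return address and execution resumes at the handler at LS address 0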
if (ch_events.load().count && std::exchange(interrupts_enabled, false))
{
srr0 = next_pc;
// Test for BR/BRA instructions (they are equivalent at zero pc)
const u32 br = _ref<u32>(0);
pc = (br & 0xfd80007f) == 0x30000000 ? (br >> 5) & 0x3fffc : 0;
return true;
}
return false;
}
bool spu_thread::is_exec_code(u32 addr) const
{
if (addr & ~0x3FFFC)
{
return false;
}
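// Scan up to 30 instructions; zero or unknown opcodes are treated as non-executable, scanning stops at the first branch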
for (u32 i = 0; i < 30; i++)
{
const u32 addr0 = addr + (i * 4);
const u32 op = _ref<u32>(addr0);
const auto type = s_spu_itype.decode(op);
if (type == spu_itype::UNK || !op)
{
return false;
}
if (type & spu_itype::branch)
{
// TODO
break;
}
}
return true;
}
u32 spu_thread::get_mfc_completed() const
{
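// A tag counts as completed when no queued command is fencing it (its bit is clear in mfc_fence)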
return ch_tag_mask & ~mfc_fence;
}
bool spu_thread::process_mfc_cmd()
{
// Stall infinitely if MFC queue is full
while (mfc_size >= 16) [[unlikely]]
{
auto old = state.add_fetch(cpu_flag::wait);
if (is_stopped(old))
{
return false;
}
thread_ctrl::wait_on(state, old);
}
spu::scheduler::concurrent_execution_watchdog watchdog(*this);
spu_log.trace("DMAC: (%s)", ch_mfc_cmd);
switch (ch_mfc_cmd.cmd)
{
case MFC_SDCRT_CMD:
case MFC_SDCRTST_CMD:
return true;
case MFC_GETLLAR_CMD:
{
perf_meter<"GETLLAR"_u64> perf0;
const u32 addr = ch_mfc_cmd.eal & -128;
const auto& data = vm::_ref<spu_rdata_t>(addr);
if (addr == last_faddr)
{
// TODO: make this configurable and possible to disable
spu_log.trace(u8"GETLLAR after fail: addr=0x%x, time=%u c", last_faddr, (perf0.get() - last_ftsc));
}
if (addr == last_faddr && perf0.get() - last_ftsc < 1000 && (vm::reservation_acquire(addr) & -128) == last_ftime)
{
rtime = last_ftime;
raddr = last_faddr;
last_ftime = 0;
mov_rdata(_ref<spu_rdata_t>(ch_mfc_cmd.lsa & 0x3ff80), rdata);
ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS);
return true;
}
else
{
// Silent failure
last_faddr = 0;
}
if (addr == raddr && !g_use_rtm && g_cfg.core.spu_getllar_polling_detection && rtime == vm::reservation_acquire(addr) && cmp_rdata(rdata, data))
{
// Spinning, might as well yield cpu resources
std::this_thread::yield();
// Reset perf
perf0.restart();
}
alignas(64) spu_rdata_t temp;
u64 ntime;
rsx::reservation_lock rsx_lock(addr, 128);
if (raddr)
{
// Save rdata from previous reservation
mov_rdata(temp, rdata);
}
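// Spin until a consistent snapshot of the line and its timestamp is obtained (no writer holds the reservation lock)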
for (u64 i = 0; i != umax; [&]()
{
if (state & cpu_flag::pause)
{
auto& sdata = *vm::get_super_ptr<spu_rdata_t>(addr);
const bool ok = cpu_thread::if_suspended<0>(this, {&ntime}, [&]
{
// Guaranteed success
ntime = vm::reservation_acquire(addr);
mov_rdata_nt(rdata, sdata);
});
// Exit loop
if (ok && (ntime & 127) == 0)
{
atomic_fence_seq_cst();
i = -1;
return;
}
}
if (++i < 25) [[likely]]
{
busy_wait(300);
}
else
{
state += cpu_flag::wait + cpu_flag::temp;
std::this_thread::yield();
!check_state();
}
}())
{
ntime = vm::reservation_acquire(addr);
if (ntime & vm::rsrv_unique_lock)
{
// There's an ongoing reservation store, wait
continue;
}
u64 test_mask = -1;
if (ntime & 127)
{
// Try to use TSX to obtain data atomically
if (!g_use_rtm || !spu_getllar_tx(addr, rdata, this, ntime & -128))
{
// See previous ntime check.
continue;
}
else
{
// If succeeded, only need to check unique lock bit
test_mask = ~vm::rsrv_shared_mask;
}
}
else
{
mov_rdata(rdata, data);
}
if (u64 time0 = vm::reservation_acquire(addr); (ntime & test_mask) != (time0 & test_mask))
{
// Reservation data has been modified recently
if (time0 & vm::rsrv_unique_lock) i += 12;
continue;
}
if (g_cfg.core.spu_accurate_getllar && !cmp_rdata(rdata, data))
{
i += 2;
continue;
}
if (i >= 15 && g_cfg.core.perf_report) [[unlikely]]
{
perf_log.warning("GETLLAR: took too long: %u", i);
}
break;
}
if (raddr && raddr != addr)
{
// Last check for event before we replace the reservation with a new one
if (reservation_check(raddr, temp))
{
set_events(SPU_EVENT_LR);
}
}
else if (raddr == addr)
{
// Lost previous reservation on polling
if (ntime != rtime || !cmp_rdata(rdata, temp))
{
set_events(SPU_EVENT_LR);
}
}
raddr = addr;
rtime = ntime;
mov_rdata(_ref<spu_rdata_t>(ch_mfc_cmd.lsa & 0x3ff80), rdata);
ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS);
if (g_cfg.core.mfc_debug)
{
auto& dump = reinterpret_cast<mfc_cmd_dump*>(vm::g_stat_addr + vm_offset())[mfc_dump_idx++ % spu_thread::max_mfc_dump_idx];
dump.cmd = ch_mfc_cmd;
dump.cmd.eah = pc;
std::memcpy(dump.data, rdata, 128);
}
return true;
}
case MFC_PUTLLC_CMD:
{
// Avoid logging useless commands if there is no reservation
const bool dump = g_cfg.core.mfc_debug && raddr;
if (do_putllc(ch_mfc_cmd))
{
ch_atomic_stat.set_value(MFC_PUTLLC_SUCCESS);
}
else
{
ch_atomic_stat.set_value(MFC_PUTLLC_FAILURE);
}
if (dump)
{
auto& dump = reinterpret_cast<mfc_cmd_dump*>(vm::g_stat_addr + vm_offset())[mfc_dump_idx++ % spu_thread::max_mfc_dump_idx];
dump.cmd = ch_mfc_cmd;
dump.cmd.eah = pc;
dump.cmd.tag = static_cast<u32>(ch_atomic_stat.get_value()); // Use tag as atomic status
std::memcpy(dump.data, _ptr<u8>(ch_mfc_cmd.lsa & 0x3ff80), 128);
}
return !test_stopped();
2014-08-23 01:15:02 +04:00
}
case MFC_PUTLLUC_CMD:
{
if (g_cfg.core.mfc_debug)
{
auto& dump = reinterpret_cast<mfc_cmd_dump*>(vm::g_stat_addr + vm_offset())[mfc_dump_idx++ % spu_thread::max_mfc_dump_idx];
dump.cmd = ch_mfc_cmd;
dump.cmd.eah = pc;
std::memcpy(dump.data, _ptr<u8>(ch_mfc_cmd.lsa & 0x3ff80), 128);
}
do_putlluc(ch_mfc_cmd);
ch_atomic_stat.set_value(MFC_PUTLLUC_SUCCESS);
return !test_stopped();
}
case MFC_PUTQLLUC_CMD:
2014-08-23 01:15:02 +04:00
{
if (g_cfg.core.mfc_debug)
{
auto& dump = reinterpret_cast<mfc_cmd_dump*>(vm::g_stat_addr + vm_offset())[mfc_dump_idx++ % spu_thread::max_mfc_dump_idx];
dump.cmd = ch_mfc_cmd;
dump.cmd.eah = pc;
std::memcpy(dump.data, _ptr<u8>(ch_mfc_cmd.lsa & 0x3ff80), 128);
}
const u32 mask = utils::rol32(1, ch_mfc_cmd.tag);
2018-04-28 20:11:16 +03:00
if ((mfc_barrier | mfc_fence) & mask) [[unlikely]]
{
mfc_queue[mfc_size++] = ch_mfc_cmd;
2018-04-28 20:11:16 +03:00
mfc_fence |= mask;
}
else
{
do_putlluc(ch_mfc_cmd);
}
return true;
}
case MFC_SNDSIG_CMD:
case MFC_SNDSIGB_CMD:
case MFC_SNDSIGF_CMD:
{
if (ch_mfc_cmd.size != 4)
{
break;
}
2018-09-06 13:28:12 +02:00
[[fallthrough]];
}
2014-08-23 01:15:02 +04:00
case MFC_PUT_CMD:
2015-03-02 05:10:41 +03:00
case MFC_PUTB_CMD:
case MFC_PUTF_CMD:
case MFC_PUTR_CMD:
case MFC_PUTRB_CMD:
case MFC_PUTRF_CMD:
2014-08-23 01:15:02 +04:00
case MFC_GET_CMD:
2015-03-02 05:10:41 +03:00
case MFC_GETB_CMD:
case MFC_GETF_CMD:
2020-05-16 20:03:27 +03:00
case MFC_SDCRZ_CMD:
2014-08-23 01:15:02 +04:00
{
if (ch_mfc_cmd.size <= 0x4000) [[likely]]
{
if (do_dma_check(ch_mfc_cmd)) [[likely]]
{
if (ch_mfc_cmd.size)
{
do_dma_transfer(this, ch_mfc_cmd, ls);
}
2018-05-23 11:55:15 +03:00
return true;
}
2014-08-23 01:15:02 +04:00
mfc_queue[mfc_size++] = ch_mfc_cmd;
mfc_fence |= utils::rol32(1, ch_mfc_cmd.tag);
if (ch_mfc_cmd.cmd & MFC_BARRIER_MASK)
{
mfc_barrier |= utils::rol32(1, ch_mfc_cmd.tag);
}
return true;
}
2017-12-09 17:57:43 +03:00
break;
}
2014-08-23 01:15:02 +04:00
case MFC_PUTL_CMD:
2015-03-02 05:10:41 +03:00
case MFC_PUTLB_CMD:
case MFC_PUTLF_CMD:
case MFC_PUTRL_CMD:
case MFC_PUTRLB_CMD:
case MFC_PUTRLF_CMD:
2014-08-23 01:15:02 +04:00
case MFC_GETL_CMD:
2015-03-02 05:10:41 +03:00
case MFC_GETLB_CMD:
case MFC_GETLF_CMD:
2014-08-23 01:15:02 +04:00
{
if (ch_mfc_cmd.size <= 0x4000) [[likely]]
2015-07-21 23:14:04 +03:00
{
auto& cmd = mfc_queue[mfc_size];
cmd = ch_mfc_cmd;
//if (g_cfg.core.mfc_debug)
//{
2021-03-05 22:05:37 +03:00
// TODO: This needs a disambiguator with list elements dumping
// auto& dump = reinterpret_cast<mfc_cmd_dump*>(vm::g_stat_addr + vm_offset())[mfc_dump_idx++ % spu_thread::max_mfc_dump_idx];
// dump.cmd = ch_mfc_cmd;
// dump.cmd.eah = pc;
// std::memcpy(dump.data, _ptr<u8>(ch_mfc_cmd.eah & 0x3fff0), std::min<u32>(ch_mfc_cmd.size, 128));
//}
if (do_dma_check(cmd)) [[likely]]
{
if (!cmd.size || do_list_transfer(cmd)) [[likely]]
{
2018-05-23 11:55:15 +03:00
return true;
}
}
mfc_size++;
mfc_fence |= utils::rol32(1, cmd.tag);
if (cmd.cmd & MFC_BARRIER_MASK)
2015-07-21 23:14:04 +03:00
{
mfc_barrier |= utils::rol32(1, cmd.tag);
2015-07-21 23:14:04 +03:00
}
2020-05-16 20:29:39 +03:00
if (check_mfc_interrupts(pc + 4))
{
spu_runtime::g_escape(this);
}
return true;
2015-03-02 05:10:41 +03:00
}
break;
2014-08-23 01:15:02 +04:00
}
2015-12-20 10:16:31 +02:00
case MFC_BARRIER_CMD:
case MFC_EIEIO_CMD:
2018-04-01 20:52:54 +03:00
case MFC_SYNC_CMD:
{
if (mfc_size == 0)
2018-04-01 20:52:54 +03:00
{
atomic_fence_seq_cst();
2018-04-01 20:52:54 +03:00
}
else
{
mfc_queue[mfc_size++] = ch_mfc_cmd;
mfc_barrier |= -1;
mfc_fence |= utils::rol32(1, ch_mfc_cmd.tag);
2018-04-01 20:52:54 +03:00
}
return true;
}
default:
{
break;
}
2015-02-16 04:53:53 +03:00
}
2015-03-02 05:10:41 +03:00
fmt::throw_exception("Unknown command (cmd=%s, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x)",
ch_mfc_cmd.cmd, ch_mfc_cmd.lsa, ch_mfc_cmd.eal, ch_mfc_cmd.tag, ch_mfc_cmd.size);
2014-08-23 01:15:02 +04:00
}
2021-04-09 21:12:47 +02:00
bool spu_thread::reservation_check(u32 addr, const decltype(rdata)& data) const
{
if (!addr)
{
// No reservation to be lost in the first place
return false;
}
2021-03-05 22:05:37 +03:00
if ((vm::reservation_acquire(addr) & -128) != rtime)
{
return true;
}
// Ensure data is allocated (HACK: would raise LR event if not)
// Set range_lock first optimistically
range_lock->store(u64{128} << 32 | addr);
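// Encoded as the lock size (128) in the upper 32 bits and the address in the lower 32 bits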
u64 lock_val = vm::g_range_lock;
u64 old_lock = 0;
while (lock_val != old_lock)
{
// Since we want to read data, let's check readability first
if (!(lock_val & vm::range_readable))
{
// Only one abnormal operation is "unreadable"
if ((lock_val >> vm::range_pos) == (vm::range_locked >> vm::range_pos))
{
// All page flags are untouched and can be read safely
if (!vm::check_addr(addr))
{
// Assume our memory is being (de)allocated
range_lock->release(0);
break;
}
// g_shmem values are unchanged too
const u64 is_shmem = vm::g_shmem[addr >> 16];
const u64 test_addr = is_shmem ? (is_shmem | static_cast<u16>(addr)) / 128 : u64{addr} / 128;
const u64 lock_addr = lock_val / 128;
if (test_addr == lock_addr)
{
// Our reservation is locked
range_lock->release(0);
break;
}
break;
}
}
// Fallback to normal range check
const u64 lock_addr = static_cast<u32>(lock_val);
const u32 lock_size = static_cast<u32>(lock_val << 3 >> 35);
if (lock_addr + lock_size <= addr || lock_addr >= addr + 128)
{
// We are outside locked range, so page flags are unaffected
if (!vm::check_addr(addr))
{
range_lock->release(0);
break;
}
}
else if (!(lock_val & vm::range_readable))
{
range_lock->release(0);
break;
}
old_lock = std::exchange(lock_val, vm::g_range_lock);
}
if (!range_lock->load()) [[unlikely]]
{
return true;
}
const bool res = cmp_rdata(data, vm::_ref<decltype(rdata)>(addr));
range_lock->release(0);
return !res;
}
spu_thread::ch_events_t spu_thread::get_events(u32 mask_hint, bool waiting, bool reading)
2015-07-21 23:14:04 +03:00
{
if (auto mask1 = ch_events.load().mask; mask1 & ~SPU_EVENT_IMPLEMENTED)
{
fmt::throw_exception("SPU Events not implemented (mask=0x%x)", mask1);
}
retry:
u32 collect = 0;
// Check reservation status and set SPU_EVENT_LR if lost
if (mask_hint & SPU_EVENT_LR)
2015-07-21 23:14:04 +03:00
{
if (reservation_check(raddr, rdata))
{
collect |= SPU_EVENT_LR;
raddr = 0;
}
2015-07-15 21:11:32 +03:00
}
2020-08-20 20:36:49 +03:00
// SPU Decrementer Event on underflow (use the upper 32-bits to determine it)
if (mask_hint & SPU_EVENT_TM)
{
if (const u64 res = (ch_dec_value - (get_timebased_time() - ch_dec_start_timestamp)) >> 32)
2017-02-13 16:12:24 +03:00
{
// Set next event to the next time the decrementer underflows
ch_dec_start_timestamp -= res << 32;
collect |= SPU_EVENT_TM;
2017-02-13 16:12:24 +03:00
}
}
if (collect)
2020-08-28 01:18:24 +03:00
{
set_events(collect);
}
2020-08-28 01:18:24 +03:00
auto [res, ok] = ch_events.fetch_op([&](ch_events_t& events)
{
if (!reading)
return false;
if (waiting)
events.waiting = !events.count;
events.count = false;
return true;
});
if (reading && res.locks && mask_hint & (SPU_EVENT_S1 | SPU_EVENT_S2))
{
busy_wait(100);
goto retry;
}
return res;
2015-07-21 23:14:04 +03:00
}
void spu_thread::set_events(u32 bits)
2015-07-15 21:11:32 +03:00
{
if (ch_events.atomic_op([&](ch_events_t& events)
2015-07-15 21:11:32 +03:00
{
events.events |= bits;
2015-07-15 21:11:32 +03:00
// If one masked event was fired, set the channel count (even if the event bit was already 1)
if (events.mask & bits)
{
events.count = true;
return !!events.waiting;
}
2015-07-15 21:11:32 +03:00
return false;
}))
2015-07-15 21:11:32 +03:00
{
notify();
2015-07-15 21:11:32 +03:00
}
}
void spu_thread::set_interrupt_status(bool enable)
{
if (enable)
{
// Detect enabling interrupts with events masked
if (auto mask = ch_events.load().mask; mask & ~SPU_EVENT_INTR_IMPLEMENTED)
{
fmt::throw_exception("SPU Interrupts not implemented (mask=0x%x)", mask);
}
}
interrupts_enabled = enable;
}
u32 spu_thread::get_ch_count(u32 ch)
2014-08-23 01:15:02 +04:00
{
2020-10-22 18:28:56 +03:00
if (ch < 128) spu_log.trace("get_ch_count(ch=%s)", spu_ch_name[ch]);
2015-03-02 05:10:41 +03:00
switch (ch)
{
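// Write channels report 1 when there is free space, hence the inverted occupancy bit below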
case SPU_WrOutMbox: return ch_out_mbox.get_count() ^ 1;
case SPU_WrOutIntrMbox: return ch_out_intr_mbox.get_count() ^ 1;
case SPU_RdInMbox: return ch_in_mbox.get_count();
case MFC_RdTagStat: return ch_tag_stat.get_count();
case MFC_RdListStallStat: return ch_stall_stat.get_count();
case MFC_WrTagUpdate: return 1;
case SPU_RdSigNotify1: return ch_snr1.get_count();
case SPU_RdSigNotify2: return ch_snr2.get_count();
case MFC_RdAtomicStat: return ch_atomic_stat.get_count();
case SPU_RdEventStat: return get_events().count;
case MFC_Cmd: return 16 - mfc_size;
// Channels with a constant count of 1:
case SPU_WrEventMask:
case SPU_WrEventAck:
case SPU_WrDec:
case SPU_RdDec:
case SPU_RdEventMask:
case SPU_RdMachStat:
case SPU_WrSRR0:
case SPU_RdSRR0:
case SPU_Set_Bkmk_Tag:
case SPU_PM_Start_Ev:
case SPU_PM_Stop_Ev:
case MFC_RdTagMask:
case MFC_LSA:
case MFC_EAH:
case MFC_EAL:
case MFC_Size:
case MFC_TagID:
case MFC_WrTagMask:
case MFC_WrListStallAck:
return 1;
default: break;
2015-03-02 05:10:41 +03:00
}
ensure(ch < 128u);
2020-10-22 18:28:56 +03:00
spu_log.error("Unknown/illegal channel in RCHCNT (ch=%s)", spu_ch_name[ch]);
return 0; // Default count
2014-08-23 01:15:02 +04:00
}
s64 spu_thread::get_ch_value(u32 ch)
2014-08-23 01:15:02 +04:00
{
2020-10-22 18:28:56 +03:00
if (ch < 128) spu_log.trace("get_ch_value(ch=%s)", spu_ch_name[ch]);
2014-10-02 14:29:20 +04:00
auto read_channel = [&](spu_channel& channel) -> s64
2015-07-03 19:07:36 +03:00
{
if (channel.get_count() == 0)
{
state += cpu_flag::wait + cpu_flag::temp;
}
for (int i = 0; i < 10 && channel.get_count() == 0; i++)
{
busy_wait();
}
const s64 out = channel.pop_wait(*this);
static_cast<void>(test_stopped());
return out;
2015-07-03 19:07:36 +03:00
};
2014-08-23 01:15:02 +04:00
switch (ch)
{
2017-02-13 16:12:24 +03:00
case SPU_RdSRR0:
{
return srr0;
2017-02-13 16:12:24 +03:00
}
2015-03-02 05:10:41 +03:00
case SPU_RdInMbox:
{
if (ch_in_mbox.get_count() == 0)
{
state += cpu_flag::wait;
}
2015-07-17 19:27:12 +03:00
while (true)
2015-03-02 05:10:41 +03:00
{
for (int i = 0; i < 10 && ch_in_mbox.get_count() == 0; i++)
{
busy_wait();
}
u32 out = 0;
2016-04-19 16:04:02 +03:00
if (const uint old_count = ch_in_mbox.try_pop(out))
2015-07-17 19:27:12 +03:00
{
2016-04-19 16:04:02 +03:00
if (old_count == 4 /* SPU_IN_MBOX_THRESHOLD */) // TODO: check this
2015-07-17 19:27:12 +03:00
{
int_ctrl[2].set(SPU_INT2_STAT_SPU_MAILBOX_THRESHOLD_INT);
}
check_state();
return out;
2015-07-17 19:27:12 +03:00
}
auto old = +state;
if (is_stopped(old))
2016-04-14 02:09:41 +03:00
{
return -1;
2016-04-14 02:09:41 +03:00
}
2015-07-03 19:07:36 +03:00
thread_ctrl::wait_on(state, old);
2015-03-02 05:10:41 +03:00
}
}
2014-08-23 01:15:02 +04:00
2015-03-02 05:10:41 +03:00
case MFC_RdTagStat:
{
if (u32 out; ch_tag_stat.try_read(out))
{
ch_tag_stat.set_value(0, false);
return out;
}
// Will stall infinitely
2015-07-03 19:07:36 +03:00
return read_channel(ch_tag_stat);
2015-03-02 05:10:41 +03:00
}
case MFC_RdTagMask:
{
return ch_tag_mask;
2015-03-02 05:10:41 +03:00
}
case SPU_RdSigNotify1:
{
2015-07-03 19:07:36 +03:00
return read_channel(ch_snr1);
2015-03-02 05:10:41 +03:00
}
case SPU_RdSigNotify2:
{
2015-07-03 19:07:36 +03:00
return read_channel(ch_snr2);
2015-03-02 05:10:41 +03:00
}
case MFC_RdAtomicStat:
{
if (u32 out; ch_atomic_stat.try_read(out))
{
ch_atomic_stat.set_value(0, false);
return out;
}
// Will stall infinitely
2015-07-03 19:07:36 +03:00
return read_channel(ch_atomic_stat);
2015-03-02 05:10:41 +03:00
}
case MFC_RdListStallStat:
{
if (u32 out; ch_stall_stat.try_read(out))
{
ch_stall_stat.set_value(0, false);
return out;
}
// Will stall infinitely
2015-07-03 19:07:36 +03:00
return read_channel(ch_stall_stat);
2015-03-02 05:10:41 +03:00
}
case SPU_RdDec:
{
2019-12-03 00:31:34 +03:00
u32 out = ch_dec_value - static_cast<u32>(get_timebased_time() - ch_dec_start_timestamp);
// Polling: we might as well hint to the scheduler to slot in another thread since this one is counting down
if (g_cfg.core.spu_loop_detection && out > spu::scheduler::native_jiffy_duration_us)
2019-06-20 04:32:19 +03:00
{
state += cpu_flag::wait;
std::this_thread::yield();
2019-06-20 04:32:19 +03:00
}
return out;
2015-03-02 05:10:41 +03:00
}
case SPU_RdEventMask:
{
return ch_events.load().mask;
2015-03-02 05:10:41 +03:00
}
case SPU_RdEventStat:
{
const u32 mask1 = ch_events.load().mask;
auto events = get_events(mask1, false, true);
if (events.count)
2015-03-02 05:10:41 +03:00
{
return events.events & mask1;
2015-07-15 21:11:32 +03:00
}
2015-07-04 02:22:24 +03:00
spu_function_logger logger(*this, "MFC Events read");
if (mask1 & SPU_EVENT_LR && raddr)
2015-07-15 21:11:32 +03:00
{
if (mask1 != SPU_EVENT_LR && mask1 != SPU_EVENT_LR + SPU_EVENT_TM)
{
// Combining LR with other flags needs another solution
fmt::throw_exception("Not supported: event mask 0x%x", mask1);
}
for (; !events.count; events = get_events(mask1, false, true))
{
const auto old = state.add_fetch(cpu_flag::wait);
if (is_stopped(old))
2020-12-19 09:48:37 +02:00
{
return -1;
2020-12-19 09:48:37 +02:00
}
if (is_paused(old))
{
// Ensure reservation data won't change while paused for debugging purposes
check_state();
continue;
}
2021-03-05 22:05:37 +03:00
vm::reservation_notifier(raddr).wait(rtime, -128, atomic_wait_timeout{100'000});
}
check_state();
return events.events & mask1;
2015-07-15 21:11:32 +03:00
}
for (; !events.count; events = get_events(mask1, true, true))
2015-07-15 21:11:32 +03:00
{
const auto old = state.add_fetch(cpu_flag::wait);
if (is_stopped(old))
2020-12-19 09:48:37 +02:00
{
return -1;
2020-12-19 09:48:37 +02:00
}
if (is_paused(old))
{
check_state();
continue;
2015-07-15 21:11:32 +03:00
}
thread_ctrl::wait_on(state, old, 100);
2016-04-14 02:09:41 +03:00
}
2017-12-09 17:57:43 +03:00
check_state();
return events.events & mask1;
2015-03-02 05:10:41 +03:00
}
case SPU_RdMachStat:
2014-08-23 01:15:02 +04:00
{
// Return SPU Interrupt status in LSB
2020-07-17 11:18:04 +03:00
return u32{interrupts_enabled} | (u32{get_type() == spu_type::isolated} << 1);
2014-08-23 01:15:02 +04:00
}
}
2014-10-02 14:29:20 +04:00
fmt::throw_exception("Unknown/illegal channel in RDCH (ch=%d [%s])", ch, ch < 128 ? spu_ch_name[ch] : "???");
2014-08-23 01:15:02 +04:00
}
bool spu_thread::set_ch_value(u32 ch, u32 value)
2014-08-23 01:15:02 +04:00
{
2020-10-22 18:28:56 +03:00
if (ch < 128) spu_log.trace("set_ch_value(ch=%s, value=0x%x)", spu_ch_name[ch], value);
2014-10-02 14:29:20 +04:00
2014-08-23 01:15:02 +04:00
switch (ch)
{
2017-02-13 16:12:24 +03:00
case SPU_WrSRR0:
{
2019-10-01 10:06:34 +03:00
srr0 = value & 0x3fffc;
return true;
2017-02-13 16:12:24 +03:00
}
2017-12-09 17:57:43 +03:00
2014-08-23 01:15:02 +04:00
case SPU_WrOutIntrMbox:
{
2020-07-17 11:18:04 +03:00
if (get_type() >= spu_type::raw)
2014-08-23 01:15:02 +04:00
{
if (ch_out_intr_mbox.get_count())
2014-08-23 01:15:02 +04:00
{
state += cpu_flag::wait;
}
if (!ch_out_intr_mbox.push_wait(*this, value))
{
return false;
2014-08-23 01:15:02 +04:00
}
2015-03-02 05:10:41 +03:00
2015-07-13 00:02:02 +03:00
int_ctrl[2].set(SPU_INT2_STAT_MAILBOX_INT);
check_state();
2016-04-19 16:04:02 +03:00
return true;
2014-08-23 01:15:02 +04:00
}
2017-12-09 17:57:43 +03:00
state += cpu_flag::wait;
2017-02-05 02:26:57 +03:00
const u32 code = value >> 24;
2014-08-23 01:15:02 +04:00
{
if (code < 64)
{
/* ===== sys_spu_thread_send_event (used by spu_printf) ===== */
2017-02-05 02:26:57 +03:00
u32 spup = code & 63;
u32 data = 0;
2014-08-23 01:15:02 +04:00
2017-02-05 02:26:57 +03:00
if (!ch_out_mbox.try_pop(data))
2014-08-23 01:15:02 +04:00
{
fmt::throw_exception("sys_spu_thread_send_event(value=0x%x, spup=%d): Out_MBox is empty", value, spup);
2014-08-23 01:15:02 +04:00
}
spu_log.trace("sys_spu_thread_send_event(spup=%d, data0=0x%x, data1=0x%x)", spup, value & 0x00ffffff, data);
2014-08-23 01:15:02 +04:00
std::lock_guard lock(group->mutex);
2014-08-23 01:15:02 +04:00
const auto queue = this->spup[spup].lock();
const auto res = ch_in_mbox.get_count() ? CELL_EBUSY :
!queue ? CELL_ENOTCONN :
queue->send(SYS_SPU_THREAD_EVENT_USER_KEY, lv2_id, (u64{spup} << 32) | (value & 0x00ffffff), data);
2017-02-24 16:19:11 +03:00
if (ch_in_mbox.get_count())
{
spu_log.warning("sys_spu_thread_send_event(spup=%d, data0=0x%x, data1=0x%x): In_MBox is not empty (%d)", spup, (value & 0x00ffffff), data, ch_in_mbox.get_count());
}
else if (res == CELL_ENOTCONN)
2014-08-23 01:15:02 +04:00
{
spu_log.warning("sys_spu_thread_send_event(spup=%d, data0=0x%x, data1=0x%x): error (%s)", spup, (value & 0x00ffffff), data, res);
2014-08-23 01:15:02 +04:00
}
ch_in_mbox.set_values(1, res);
2017-02-24 16:19:11 +03:00
return true;
2014-08-23 01:15:02 +04:00
}
else if (code < 128)
{
/* ===== sys_spu_thread_throw_event ===== */
2017-02-05 02:26:57 +03:00
u32 spup = code & 63;
u32 data = 0;
2015-07-17 19:27:12 +03:00
2017-02-05 02:26:57 +03:00
if (!ch_out_mbox.try_pop(data))
2014-08-23 01:15:02 +04:00
{
fmt::throw_exception("sys_spu_thread_throw_event(value=0x%x, spup=%d): Out_MBox is empty", value, spup);
2014-08-23 01:15:02 +04:00
}
spu_log.trace("sys_spu_thread_throw_event(spup=%d, data0=0x%x, data1=0x%x)", spup, value & 0x00ffffff, data);
2014-08-23 01:15:02 +04:00
const auto queue = (std::lock_guard{group->mutex}, this->spup[spup].lock());
2014-08-23 01:15:02 +04:00
// TODO: check passing spup value
if (auto res = queue ? queue->send(SYS_SPU_THREAD_EVENT_USER_KEY, lv2_id, (u64{spup} << 32) | (value & 0x00ffffff), data) : CELL_ENOTCONN)
2014-08-23 01:15:02 +04:00
{
spu_log.warning("sys_spu_thread_throw_event(spup=%d, data0=0x%x, data1=0x%x) failed (error=%s)", spup, (value & 0x00ffffff), data, res);
2014-08-23 01:15:02 +04:00
}
2016-04-19 16:04:02 +03:00
return true;
2014-08-23 01:15:02 +04:00
}
else if (code == 128)
{
/* ===== sys_event_flag_set_bit ===== */
2017-02-05 02:26:57 +03:00
u32 flag = value & 0xffffff;
u32 data = 0;
2015-07-17 19:27:12 +03:00
2017-02-05 02:26:57 +03:00
if (!ch_out_mbox.try_pop(data))
2014-08-23 01:15:02 +04:00
{
fmt::throw_exception("sys_event_flag_set_bit(value=0x%x (flag=%d)): Out_MBox is empty", value, flag);
2014-08-23 01:15:02 +04:00
}
spu_log.trace("sys_event_flag_set_bit(id=%d, value=0x%x (flag=%d))", data, value, flag);
2014-08-23 01:15:02 +04:00
std::lock_guard lock(group->mutex);
2017-02-03 19:27:03 +03:00
// Use the syscall to set flag
const auto res = ch_in_mbox.get_count() ? CELL_EBUSY : 0u + sys_event_flag_set(*this, data, 1ull << flag);
2017-02-24 16:19:11 +03:00
if (res == CELL_EBUSY)
{
spu_log.warning("sys_event_flag_set_bit(value=0x%x (flag=%d)): In_MBox is not empty (%d)", value, flag, ch_in_mbox.get_count());
}
ch_in_mbox.set_values(1, res);
2017-02-24 16:19:11 +03:00
return true;
2014-08-23 01:15:02 +04:00
}
else if (code == 192)
{
/* ===== sys_event_flag_set_bit_impatient ===== */
2017-02-05 02:26:57 +03:00
u32 flag = value & 0xffffff;
u32 data = 0;
2015-07-17 19:27:12 +03:00
2017-02-05 02:26:57 +03:00
if (!ch_out_mbox.try_pop(data))
2014-08-23 01:15:02 +04:00
{
fmt::throw_exception("sys_event_flag_set_bit_impatient(value=0x%x (flag=%d)): Out_MBox is empty", value, flag);
2014-08-23 01:15:02 +04:00
}
spu_log.trace("sys_event_flag_set_bit_impatient(id=%d, value=0x%x (flag=%d))", data, value, flag);
2014-08-23 01:15:02 +04:00
2017-02-03 19:27:03 +03:00
// Use the syscall to set flag
sys_event_flag_set(*this, data, 1ull << flag);
2016-04-19 16:04:02 +03:00
return true;
2014-08-23 01:15:02 +04:00
}
else
{
fmt::throw_exception("SPU_WrOutIntrMbox: unknown data (value=0x%x, Out_MBox=%s)", value, ch_out_mbox);
2014-08-23 01:15:02 +04:00
}
}
}
case SPU_WrOutMbox:
{
if (ch_out_mbox.get_count())
{
state += cpu_flag::wait;
}
if (!ch_out_mbox.push_wait(*this, value))
{
return false;
2014-12-23 02:31:11 +03:00
}
2015-03-02 05:10:41 +03:00
check_state();
2016-04-19 16:04:02 +03:00
return true;
2014-08-23 01:15:02 +04:00
}
case MFC_WrTagMask:
{
2015-03-02 05:10:41 +03:00
ch_tag_mask = value;
if (ch_tag_upd)
{
const u32 completed = get_mfc_completed();
if (completed && ch_tag_upd == MFC_TAG_UPDATE_ANY)
{
ch_tag_stat.set_value(completed);
ch_tag_upd = MFC_TAG_UPDATE_IMMEDIATE;
}
else if (completed == value && ch_tag_upd == MFC_TAG_UPDATE_ALL)
{
ch_tag_stat.set_value(completed);
ch_tag_upd = MFC_TAG_UPDATE_IMMEDIATE;
}
}
2016-04-19 16:04:02 +03:00
return true;
2014-08-23 01:15:02 +04:00
}
case MFC_WrTagUpdate:
{
if (value > MFC_TAG_UPDATE_ALL)
{
break;
}
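// A request that is already satisfied completes immediately; otherwise the update mode is latched until enough tags complete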
const u32 completed = get_mfc_completed();
if (!value)
2017-05-15 21:54:12 +03:00
{
ch_tag_upd = MFC_TAG_UPDATE_IMMEDIATE;
ch_tag_stat.set_value(completed);
2017-05-15 21:54:12 +03:00
}
else if (completed && value == MFC_TAG_UPDATE_ANY)
{
ch_tag_upd = MFC_TAG_UPDATE_IMMEDIATE;
ch_tag_stat.set_value(completed);
}
else if (completed == ch_tag_mask && value == MFC_TAG_UPDATE_ALL)
{
ch_tag_upd = MFC_TAG_UPDATE_IMMEDIATE;
ch_tag_stat.set_value(completed);
}
else
{
ch_tag_upd = value;
}
2016-04-19 16:04:02 +03:00
return true;
2014-08-23 01:15:02 +04:00
}
case MFC_LSA:
{
ch_mfc_cmd.lsa = value;
2016-04-19 16:04:02 +03:00
return true;
2014-08-23 01:15:02 +04:00
}
case MFC_EAH:
{
ch_mfc_cmd.eah = value;
2016-04-19 16:04:02 +03:00
return true;
2014-08-23 01:15:02 +04:00
}
case MFC_EAL:
{
ch_mfc_cmd.eal = value;
2016-04-19 16:04:02 +03:00
return true;
2014-08-23 01:15:02 +04:00
}
case MFC_Size:
{
2017-11-29 14:28:41 +02:00
ch_mfc_cmd.size = value & 0x7fff;
2016-04-19 16:04:02 +03:00
return true;
2014-08-23 01:15:02 +04:00
}
2015-03-02 05:10:41 +03:00
case MFC_TagID:
2014-08-23 01:15:02 +04:00
{
2017-11-29 14:28:41 +02:00
ch_mfc_cmd.tag = value & 0x1f;
2016-04-19 16:04:02 +03:00
return true;
2014-08-23 01:15:02 +04:00
}
2015-03-02 05:10:41 +03:00
case MFC_Cmd:
2014-08-23 01:15:02 +04:00
{
ch_mfc_cmd.cmd = MFC(value & 0xff);
return process_mfc_cmd();
2014-08-23 01:15:02 +04:00
}
2015-03-02 05:10:41 +03:00
case MFC_WrListStallAck:
2014-08-23 01:15:02 +04:00
{
value &= 0x1f;
// Reset stall status for specified tag
const u32 tag_mask = utils::rol32(1, value);
if (ch_stall_mask & tag_mask)
2014-10-02 14:29:20 +04:00
{
ch_stall_mask &= ~tag_mask;
for (u32 i = 0; i < mfc_size; i++)
{
if (mfc_queue[i].tag == (value | 0x80))
{
// Unset stall bit
mfc_queue[i].tag &= 0x7f;
}
}
2018-04-08 14:03:00 +03:00
do_mfc(true);
2014-10-02 14:29:20 +04:00
}
2015-03-02 05:10:41 +03:00
2016-04-19 16:04:02 +03:00
return true;
2014-08-23 01:15:02 +04:00
}
2015-03-02 05:10:41 +03:00
case SPU_WrDec:
2014-08-23 01:15:02 +04:00
{
get_events(SPU_EVENT_TM); // Don't discard an old event that may have already occurred
ch_dec_start_timestamp = get_timebased_time();
2015-03-02 05:10:41 +03:00
ch_dec_value = value;
2016-04-19 16:04:02 +03:00
return true;
2014-08-23 01:15:02 +04:00
}
2015-03-02 05:10:41 +03:00
case SPU_WrEventMask:
2014-08-23 01:15:02 +04:00
{
get_events(value);
if (ch_events.atomic_op([&](ch_events_t& events)
{
events.mask = value;
if (events.events & events.mask)
{
events.count = true;
return true;
}
return false;
}))
{
// Check interrupts in case count is 1
if (check_mfc_interrupts(pc + 4))
{
spu_runtime::g_escape(this);
}
}
2016-04-19 16:04:02 +03:00
return true;
2014-08-23 01:15:02 +04:00
}
2015-03-02 05:10:41 +03:00
case SPU_WrEventAck:
2014-08-23 01:15:02 +04:00
{
// "Collect" events before final acknowledgment
get_events(value);
if (ch_events.atomic_op([&](ch_events_t& events)
{
events.events &= ~value;
if (events.events & events.mask)
{
events.count = true;
return true;
}
return false;
}))
{
// Check interrupts in case count is 1
if (check_mfc_interrupts(pc + 4))
{
spu_runtime::g_escape(this);
}
}
2016-04-19 16:04:02 +03:00
return true;
2014-08-23 01:15:02 +04:00
}
2017-03-11 21:55:50 +03:00
case SPU_Set_Bkmk_Tag:
case SPU_PM_Start_Ev:
case SPU_PM_Stop_Ev:
2017-03-11 21:55:50 +03:00
{
return true;
}
2014-08-23 01:15:02 +04:00
}
fmt::throw_exception("Unknown/illegal channel in WRCH (ch=%d [%s], value=0x%x)", ch, ch < 128 ? spu_ch_name[ch] : "???", value);
2015-03-02 05:10:41 +03:00
}
bool spu_thread::stop_and_signal(u32 code)
2015-03-02 05:10:41 +03:00
{
spu_log.trace("stop_and_signal(code=0x%x)", code);
2014-08-23 01:15:02 +04:00
auto set_status_npc = [&]()
2014-08-23 01:15:02 +04:00
{
status_npc.atomic_op([&](status_npc_sync_var& state)
2015-03-02 05:10:41 +03:00
{
state.status = (state.status & 0xffff) | (code << 16);
state.status |= SPU_STATUS_STOPPED_BY_STOP;
state.status &= ~SPU_STATUS_RUNNING;
state.npc = (pc + 4) | +interrupts_enabled;
2015-03-02 05:10:41 +03:00
});
};
2020-07-17 11:18:04 +03:00
if (get_type() >= spu_type::raw)
{
// Save next PC and current SPU Interrupt Status
state += cpu_flag::stop + cpu_flag::wait + cpu_flag::ret;
set_status_npc();
2014-08-23 01:15:02 +04:00
status_npc.notify_one();
2015-07-13 00:02:02 +03:00
int_ctrl[2].set(SPU_INT2_STAT_SPU_STOP_AND_SIGNAL_INT);
check_state();
return true;
2015-03-02 05:10:41 +03:00
}
2014-08-23 01:15:02 +04:00
switch (code)
{
2014-10-02 14:29:20 +04:00
case 0x001:
{
state += cpu_flag::wait;
std::this_thread::sleep_for(1ms); // hack
check_state();
2016-04-19 16:04:02 +03:00
return true;
2014-10-02 14:29:20 +04:00
}
case 0x002:
{
state += cpu_flag::ret;
2016-04-19 16:04:02 +03:00
return true;
2014-10-02 14:29:20 +04:00
}
case SYS_SPU_THREAD_STOP_RECEIVE_EVENT:
2014-08-23 01:15:02 +04:00
{
2014-08-28 01:04:55 +04:00
/* ===== sys_spu_thread_receive_event ===== */
u32 spuq = 0;
2015-07-17 19:27:12 +03:00
2017-02-05 02:26:57 +03:00
if (!ch_out_mbox.try_pop(spuq))
2014-08-23 01:15:02 +04:00
{
fmt::throw_exception("sys_spu_thread_receive_event(): Out_MBox is empty");
2014-08-23 01:15:02 +04:00
}
2015-07-17 19:27:12 +03:00
if (u32 count = ch_in_mbox.get_count())
2014-08-23 01:15:02 +04:00
{
spu_log.error("sys_spu_thread_receive_event(): In_MBox is not empty (%d)", count);
2016-04-19 16:04:02 +03:00
return ch_in_mbox.set_values(1, CELL_EBUSY), true;
2014-08-23 01:15:02 +04:00
}
spu_log.trace("sys_spu_thread_receive_event(spuq=0x%x)", spuq);
2014-08-23 01:15:02 +04:00
if (!group->has_scheduler_context /*|| group->type & 0xf00*/)
2015-07-03 19:07:36 +03:00
{
spu_log.error("sys_spu_thread_receive_event(): Incompatible group type = 0x%x", group->type);
2016-04-19 16:04:02 +03:00
return ch_in_mbox.set_values(1, CELL_EINVAL), true;
2015-07-03 19:07:36 +03:00
}
std::shared_ptr<lv2_event_queue> queue;
state += cpu_flag::wait;
spu_function_logger logger(*this, "sys_spu_thread_receive_event");
2017-02-05 02:26:57 +03:00
while (true)
{
2017-02-05 02:26:57 +03:00
queue.reset();
2017-02-05 02:26:57 +03:00
// Check group status, wait if necessary
for (auto _state = +group->run_state;
_state >= SPU_THREAD_GROUP_STATUS_WAITING && _state <= SPU_THREAD_GROUP_STATUS_WAITING_AND_SUSPENDED;
_state = group->run_state)
2017-02-05 02:26:57 +03:00
{
const auto old = state.load();
if (is_stopped(old))
{
2017-02-05 02:26:57 +03:00
return false;
}
2017-02-05 02:26:57 +03:00
thread_ctrl::wait_on(state, old);
}
2017-02-05 02:26:57 +03:00
reader_lock rlock(id_manager::g_mutex);
2014-12-23 02:31:11 +03:00
std::lock_guard lock(group->mutex);
2015-07-03 19:07:36 +03:00
if (is_stopped())
{
return false;
}
2017-02-05 02:26:57 +03:00
if (group->run_state >= SPU_THREAD_GROUP_STATUS_WAITING && group->run_state <= SPU_THREAD_GROUP_STATUS_WAITING_AND_SUSPENDED)
2016-04-14 02:09:41 +03:00
{
2017-02-05 02:26:57 +03:00
// Try again
continue;
2016-04-14 02:09:41 +03:00
}
2015-07-03 19:07:36 +03:00
2017-02-05 02:26:57 +03:00
for (auto& v : this->spuq)
2014-12-23 02:31:11 +03:00
{
2017-02-05 02:26:57 +03:00
if (spuq == v.first)
2016-04-14 02:09:41 +03:00
{
2017-02-05 02:26:57 +03:00
queue = v.second.lock();
if (lv2_event_queue::check(queue))
2017-02-05 02:26:57 +03:00
{
break;
}
2016-04-14 02:09:41 +03:00
}
2014-12-23 02:31:11 +03:00
}
2015-07-03 19:07:36 +03:00
if (!lv2_event_queue::check(queue))
2017-02-05 02:26:57 +03:00
{
return ch_in_mbox.set_values(1, CELL_EINVAL), true;
2017-02-05 02:26:57 +03:00
}
std::lock_guard qlock(queue->mutex);
2015-07-19 15:58:11 +03:00
if (!queue->exists)
{
return ch_in_mbox.set_values(1, CELL_EINVAL), true;
}
2017-02-04 00:36:04 +03:00
if (queue->events.empty())
2015-07-19 15:58:11 +03:00
{
2017-02-04 00:36:04 +03:00
queue->sq.emplace_back(this);
2017-02-05 02:26:57 +03:00
group->run_state = SPU_THREAD_GROUP_STATUS_WAITING;
for (auto& thread : group->threads)
{
if (thread)
{
thread->state += cpu_flag::suspend;
}
}
// Wait
break;
2017-02-04 00:36:04 +03:00
}
else
{
2017-02-05 02:26:57 +03:00
// Return the event immediately
2017-02-04 00:36:04 +03:00
const auto event = queue->events.front();
const auto data1 = static_cast<u32>(std::get<1>(event));
const auto data2 = static_cast<u32>(std::get<2>(event));
const auto data3 = static_cast<u32>(std::get<3>(event));
ch_in_mbox.set_values(4, CELL_OK, data1, data2, data3);
queue->events.pop_front();
2017-02-05 02:26:57 +03:00
return true;
2017-02-04 00:36:04 +03:00
}
}
2015-07-01 01:25:52 +03:00
while (auto old = state.fetch_sub(cpu_flag::signal))
2017-02-04 00:36:04 +03:00
{
if (is_stopped(old))
2017-02-04 00:36:04 +03:00
{
// The thread group cannot be stopped while waiting for an event
ensure(!(old & cpu_flag::stop));
2017-02-04 00:36:04 +03:00
return false;
2015-07-19 15:58:11 +03:00
}
2015-03-04 07:42:04 +03:00
if (old & cpu_flag::signal)
2017-02-05 02:26:57 +03:00
{
break;
}
thread_ctrl::wait_on(state, old);
2015-07-03 19:07:36 +03:00
}
2017-02-05 02:26:57 +03:00
std::lock_guard lock(group->mutex);
2017-12-09 17:57:43 +03:00
2017-02-05 02:26:57 +03:00
if (group->run_state == SPU_THREAD_GROUP_STATUS_WAITING)
2015-07-03 19:07:36 +03:00
{
2017-02-05 02:26:57 +03:00
group->run_state = SPU_THREAD_GROUP_STATUS_RUNNING;
2015-07-03 19:07:36 +03:00
}
2017-02-05 02:26:57 +03:00
else if (group->run_state == SPU_THREAD_GROUP_STATUS_WAITING_AND_SUSPENDED)
2015-07-03 19:07:36 +03:00
{
2017-02-05 02:26:57 +03:00
group->run_state = SPU_THREAD_GROUP_STATUS_SUSPENDED;
2015-07-03 19:07:36 +03:00
}
2015-03-06 00:29:05 +03:00
2015-07-19 15:58:11 +03:00
for (auto& thread : group->threads)
2015-03-06 00:29:05 +03:00
{
2017-02-05 02:26:57 +03:00
if (thread)
2016-04-14 02:09:41 +03:00
{
thread->state -= cpu_flag::suspend;
2017-02-05 02:26:57 +03:00
if (thread.get() != this)
{
thread->state.notify_one(cpu_flag::suspend);
2017-02-05 02:26:57 +03:00
}
2016-04-14 02:09:41 +03:00
}
2015-03-06 00:29:05 +03:00
}
2016-04-19 16:04:02 +03:00
return true;
2014-08-23 01:15:02 +04:00
}
2014-08-28 01:04:55 +04:00
case SYS_SPU_THREAD_STOP_TRY_RECEIVE_EVENT:
{
/* ===== sys_spu_thread_tryreceive_event ===== */
u32 spuq = 0;
if (!ch_out_mbox.try_pop(spuq))
{
fmt::throw_exception("sys_spu_thread_tryreceive_event(): Out_MBox is empty");
}
if (u32 count = ch_in_mbox.get_count())
{
spu_log.error("sys_spu_thread_tryreceive_event(): In_MBox is not empty (%d)", count);
return ch_in_mbox.set_values(1, CELL_EBUSY), true;
}
spu_log.trace("sys_spu_thread_tryreceive_event(spuq=0x%x)", spuq);
std::lock_guard lock(group->mutex);
std::shared_ptr<lv2_event_queue> queue;
for (auto& v : this->spuq)
{
if (spuq == v.first)
{
if (queue = v.second.lock(); lv2_event_queue::check(queue))
{
break;
}
}
}
if (!lv2_event_queue::check(queue))
{
return ch_in_mbox.set_values(1, CELL_EINVAL), true;
}
std::lock_guard qlock(queue->mutex);
if (!queue->exists)
{
return ch_in_mbox.set_values(1, CELL_EINVAL), true;
}
if (queue->events.empty())
{
return ch_in_mbox.set_values(1, CELL_EBUSY), true;
}
const auto event = queue->events.front();
const auto data1 = static_cast<u32>(std::get<1>(event));
const auto data2 = static_cast<u32>(std::get<2>(event));
const auto data3 = static_cast<u32>(std::get<3>(event));
ch_in_mbox.set_values(4, CELL_OK, data1, data2, data3);
queue->events.pop_front();
return true;
}
case SYS_SPU_THREAD_STOP_YIELD:
{
// SPU thread group yield (TODO)
if (ch_out_mbox.get_count())
{
fmt::throw_exception("STOP code 0x100: Out_MBox is not empty");
}
atomic_fence_seq_cst();
return true;
}
case SYS_SPU_THREAD_STOP_GROUP_EXIT:
{
/* ===== sys_spu_thread_group_exit ===== */
state += cpu_flag::wait;
u32 value = 0;
if (!ch_out_mbox.try_pop(value))
{
fmt::throw_exception("sys_spu_thread_group_exit(): Out_MBox is empty");
}
spu_log.trace("sys_spu_thread_group_exit(status=0x%x)", value);
while (true)
{
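// The group cannot exit while it is waiting on an SPU event queue;
// wait here until it leaves the waiting state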
for (auto _state = +group->run_state;
_state >= SPU_THREAD_GROUP_STATUS_WAITING && _state <= SPU_THREAD_GROUP_STATUS_WAITING_AND_SUSPENDED;
_state = group->run_state)
{
const auto old = +state;
if (is_stopped(old))
{
return false;
}
thread_ctrl::wait_on(state, old);
}
std::lock_guard lock(group->mutex);
if (auto _state = +group->run_state;
_state >= SPU_THREAD_GROUP_STATUS_WAITING && _state <= SPU_THREAD_GROUP_STATUS_WAITING_AND_SUSPENDED)
{
// We can't exit while we are waiting on an SPU event
continue;
}
if (std::exchange(group->set_terminate, true))
{
// Whoever terminated first decides the error status + cause
return true;
}
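// Raise stop + ret on every thread in the group so they all leave execution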
for (auto& thread : group->threads)
{
if (thread)
{
thread->state.fetch_op([](bs_t<cpu_flag>& flags)
{
if (flags & cpu_flag::stop)
{
// In case the thread raised the ret flag itself at some point do not raise it again
return false;
}
flags += cpu_flag::stop + cpu_flag::ret;
return true;
});
if (thread.get() != this)
thread_ctrl::notify(*thread);
}
}
group->exit_status = value;
group->join_state = SYS_SPU_THREAD_GROUP_JOIN_GROUP_EXIT;
set_status_npc();
break;
}
check_state();
return true;
}
case SYS_SPU_THREAD_STOP_THREAD_EXIT:
{
/* ===== sys_spu_thread_exit ===== */
state += cpu_flag::wait;
u32 value;
if (!ch_out_mbox.try_pop(value))
{
fmt::throw_exception("sys_spu_thread_exit(): Out_MBox is empty");
}
spu_log.trace("sys_spu_thread_exit(status=0x%x)", value);
last_exit_status.release(value);
set_status_npc();
state += cpu_flag::stop + cpu_flag::ret;
check_state();
return true;
}
}
fmt::throw_exception("Unknown STOP code: 0x%x (op=0x%x, Out_MBox=%s)", code, _ref<u32>(pc), ch_out_mbox);
}
void spu_thread::halt()
{
spu_log.trace("halt()");
if (get_type() >= spu_type::raw)
{
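// Raw/isolated SPU: publish the halt through the status/NPC registers and
// raise the class 2 halt interrupt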
state += cpu_flag::stop + cpu_flag::wait;
status_npc.atomic_op([this](status_npc_sync_var& state)
{
state.status |= SPU_STATUS_STOPPED_BY_HALT;
state.status &= ~SPU_STATUS_RUNNING;
state.npc = pc | +interrupts_enabled;
});
status_npc.notify_one();
int_ctrl[2].set(SPU_INT2_STAT_SPU_HALT_OR_STEP_INT);
spu_runtime::g_escape(this);
}
spu_log.fatal("Halt");
spu_runtime::g_escape(this);
}
void spu_thread::fast_call(u32 ls_addr)
{
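// Run the code at ls_addr on the current thread; $LR is pointed at LS:0, which is
// patched below with a STOP 2 so that returning from the callee stops execution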
// LS:0x0: this is originally the entry point of the interrupt handler, but interrupts are not implemented
_ref<u32>(0) = 0x00000002; // STOP 2
auto old_pc = pc;
auto old_lr = gpr[0]._u32[3];
auto old_stack = gpr[1]._u32[3]; // only saved and restored (may be wrong)
pc = ls_addr;
gpr[0]._u32[3] = 0x0;
cpu_task();
state -= cpu_flag::ret;
pc = old_pc;
gpr[0]._u32[3] = old_lr;
gpr[1]._u32[3] = old_stack;
}
bool spu_thread::capture_local_storage() const
{
spu_exec_object spu_exec;
// Save data as an executable segment, even the SPU stack
// In the past, an optimization was made here to save only non-zero chunks of data
// But Ghidra didn't handle accesses outside the saved chunks well (which is pretty common)
// So it has been reverted
auto& prog = spu_exec.progs.emplace_back(SYS_SPU_SEGMENT_TYPE_COPY, 0x7, 0, SPU_LS_SIZE, 8, std::vector<uchar>(ls, ls + SPU_LS_SIZE));
prog.p_paddr = prog.p_vaddr;
spu_log.success("Segment: p_type=0x%x, p_vaddr=0x%x, p_filesz=0x%x, p_memsz=0x%x", prog.p_type, prog.p_vaddr, prog.p_filesz, prog.p_memsz);
std::string name;
if (get_type() == spu_type::threaded)
{
name = *spu_tname.load();
if (name.empty())
{
// TODO: Maybe add thread group name here
fmt::append(name, "SPU.0x%07x", lv2_id);
}
}
else
{
fmt::append(name, "RawSPU.%u", lv2_id);
}
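// Heuristically scan backwards from the current PC for a likely function entry,
// which is stored as the ELF entry point (e_entry)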
u32 pc0 = pc;
for (; pc0; pc0 -= 4)
{
be_t<u32> op;
std::memcpy(&op, prog.bin.data() + pc0 - 4, 4);
// Try to find the function entry (if functions are placed sequentially, search for the BI $LR of the previous function)
if (!op || op == 0x35000000u || s_spu_itype.decode(op) == spu_itype::UNK)
{
break;
}
}
spu_exec.header.e_entry = pc0;
name = vfs::escape(name, true);
std::replace(name.begin(), name.end(), ' ', '_');
auto get_filename = [&]() -> std::string
{
return fs::get_cache_dir() + "spu_progs/" + Emu.GetTitleID() + "_" + vfs::escape(name, true) + '_' + date_time::current_time_narrow() + "_capture.elf";
};
auto elf_path = get_filename();
if (fs::exists(elf_path))
2020-10-08 09:14:35 +03:00
{
// Wait 1 second so current_time_narrow() will return a different string
std::this_thread::sleep_for(1s);
if (elf_path = get_filename(); fs::exists(elf_path))
2020-10-08 09:14:35 +03:00
{
spu_log.error("Failed to create '%s' (error=%s)", elf_path, fs::g_tls_error);
return false;
}
}
fs::pending_file temp(elf_path);
if (!temp.file)
{
spu_log.error("Failed to create temporary file for '%s' (error=%s)", elf_path, fs::g_tls_error);
return false;
}
temp.file.write(spu_exec.save());
if (!temp.commit(false))
{
spu_log.error("Failed to create rename temporary file to '%s' (error=%s)", elf_path, fs::g_tls_error);
return false;
}
spu_log.success("SPU Local Storage image saved to '%s'", elf_path);
return true;
}
spu_function_logger::spu_function_logger(spu_thread& spu, const char* func)
: spu(spu)
{
spu.current_func = func;
spu.start_time = get_system_time();
}
template <>
void fmt_class_string<spu_channel>::format(std::string& out, u64 arg)
{
const auto& ch = get_object(arg);
u32 data = 0;
if (ch.try_read(data))
{
fmt::append(out, "0x%08x", data);
}
else
{
out += "empty";
}
}
template <>
void fmt_class_string<spu_channel_4_t>::format(std::string& out, u64 arg)
{
const auto& ch = get_object(arg);
// TODO (use try_read)
fmt::append(out, "count = %d", ch.get_count());
}
DECLARE(spu_thread::g_raw_spu_ctr){};
DECLARE(spu_thread::g_raw_spu_id){};