rpcs3/Utilities/Thread.cpp
Elad 575a245f8d
IDM: Implement lock-free smart pointers (#16403)
Replaces `std::shared_ptr` with `stx::atomic_ptr` and `stx::shared_ptr`.

Notes to programmers:

* This PR removes the use of `dynamic_cast`, `std::dynamic_pointer_cast` and `std::weak_ptr` on IDM objects. A possible replacement is to save the object ID on the base object, then use idm::check/get_unlocked to obtain the destination type via the saved ID, which may be null. A null pointer check is how you can tell a type mismatch (as with dynamic cast) or object destruction (as with weak_ptr locking).
* Double-inheritance on IDM objects should be used with care, `stx::shared_ptr` does not support constant-evaluated pointer offsetting to parent/child type.
* `idm::check/get_unlocked` can now be used anywhere.

Misc fixes:
* Fixes some segfaults with RPCN with interaction with IDM.
* Fix deadlocks in access violation handler due to locking recursion.
* Fixes race condition in process exit-spawn on memory containers read.
* Fix bug that theoretically can prevent RPCS3 from booting - fix `id_manager::typeinfo` comparison to compare members instead of `memcmp` which can fail spuriously on padding bytes.
* Ensure all IDM inherited types of base, either has `id_base` or `id_type` defined locally, this allows to make getters such as `idm::get_unlocked<lv2_socket, lv2_socket_raw>()` which were broken before. (requires save-states invalidation)
* Removes broken operator[] overload of `stx::shared_ptr` and `stx::single_ptr` for non-array types.
2024-12-22 20:59:48 +02:00

3281 lines
76 KiB
C++

#include "stdafx.h"
#include "Emu/System.h"
#include "Emu/Cell/SPUThread.h"
#include "Emu/Cell/PPUThread.h"
#include "Emu/Cell/lv2/sys_mmapper.h"
#include "Emu/Cell/lv2/sys_event.h"
#include "Emu/Cell/lv2/sys_process.h"
#include "Emu/RSX/RSXThread.h"
#include "Thread.h"
#include "Utilities/JIT.h"
#include <thread>
#include <cfenv>
#ifdef ARCH_ARM64
#include "Emu/CPU/Backends/AArch64/AArch64Signal.h"
#endif
#ifdef _WIN32
#include <Windows.h>
#include <Psapi.h>
#include <process.h>
#include <sysinfoapi.h>
#include "util/dyn_lib.hpp"
DYNAMIC_IMPORT_RENAME("Kernel32.dll", SetThreadDescriptionImport, "SetThreadDescription", HRESULT(HANDLE hThread, PCWSTR lpThreadDescription));
#else
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#ifdef __APPLE__
#define _XOPEN_SOURCE
#define __USE_GNU
#include <mach/thread_act.h>
#include <mach/thread_policy.h>
#endif
#if defined(__DragonFly__) || defined(__FreeBSD__) || defined(__OpenBSD__)
#include <pthread_np.h>
#define cpu_set_t cpuset_t
#endif
#include <errno.h>
#include <signal.h>
#ifndef __OpenBSD__
#include <ucontext.h>
#endif
#include <pthread.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <time.h>
#endif
#ifdef __linux__
#include <sys/syscall.h>
#include <sys/timerfd.h>
#include <unistd.h>
#endif
#if defined(__APPLE__) || defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)
# include <sys/sysctl.h>
# include <unistd.h>
# if defined(__DragonFly__) || defined(__FreeBSD__)
# include <sys/user.h>
# endif
# if defined(__OpenBSD__)
# include <sys/param.h>
# include <sys/proc.h>
# endif
# if defined(__NetBSD__)
# undef KERN_PROC
# define KERN_PROC KERN_PROC2
# define kinfo_proc kinfo_proc2
# endif
# if defined(__APPLE__)
# define KP_FLAGS kp_proc.p_flag
# elif defined(__DragonFly__)
# define KP_FLAGS kp_flags
# elif defined(__FreeBSD__)
# define KP_FLAGS ki_flag
# elif defined(__NetBSD__)
# define KP_FLAGS p_flag
# elif defined(__OpenBSD__)
# define KP_FLAGS p_psflags
# define P_TRACED PS_TRACED
# endif
#endif
#include "util/vm.hpp"
#include "util/logs.hpp"
#include "util/asm.hpp"
#include "util/v128.hpp"
#include "util/simd.hpp"
#include "util/sysinfo.hpp"
#include "Emu/Memory/vm_locking.h"
// Log channels used in this file ("SIG" is the one used by the instruction decoder and fault handler below)
LOG_CHANNEL(sig_log, "SIG");
LOG_CHANNEL(sys_log, "SYS");
LOG_CHANNEL(vm_log, "VM");
// Per-thread fault statistics, incremented by handle_access_violation():
// every fault entered, faults resolved by the RSX handler, faults resolved as RawSPU MMIO accesses
thread_local u64 g_tls_fault_all = 0;
thread_local u64 g_tls_fault_rsx = 0;
thread_local u64 g_tls_fault_spu = 0;
// Per-thread wait counters (updated outside the visible part of this file; presumably wait statistics — TODO confirm)
thread_local u64 g_tls_wait_time = 0;
thread_local u64 g_tls_wait_fail = 0;
// Set to true by the "hack_alloc" recovery path inside handle_access_violation()
thread_local bool g_tls_access_violation_recovered = false;
// Per-thread log prefix callback, defined elsewhere
extern thread_local std::string(*g_tls_log_prefix)();
// Report error and call std::abort(), defined in main.cpp
[[noreturn]] void report_fatal_error(std::string_view text, bool is_html = false, bool include_help_text = true);
// Tag type wrapping a cpu thread id; passed to the "%s" formatter by dump_useful_thread_info()
enum cpu_threads_emulation_info_dump_t : u32 {};
// Build a diagnostic string for the calling thread.
// Returns an empty string when the caller is not an emulated cpu thread; otherwise the
// text produced by formatting the thread id as cpu_threads_emulation_info_dump_t.
std::string dump_useful_thread_info()
{
	std::string info;

	const auto current = get_current_cpu_thread();

	if (!current)
	{
		// Not running on a cpu thread: nothing to dump
		return info;
	}

	fmt::append(info, "%s", cpu_threads_emulation_info_dump_t{current->id});
	return info;
}
#ifndef _WIN32
// Non-Windows replacement for the Win32 IsDebuggerPresent() API.
// BSD/macOS: queries the process entry via sysctl() and tests the P_TRACED flag.
// Otherwise: parses the "TracerPid:" field of /proc/self/status; a value starting with a
// non-zero digit means a tracer is attached.
// Returns false whenever the information cannot be obtained.
bool IsDebuggerPresent()
{
#if defined(__APPLE__) || defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)
	int mib[] = {
		CTL_KERN,
		KERN_PROC,
		KERN_PROC_PID,
		getpid(),
# if defined(__NetBSD__) || defined(__OpenBSD__)
		sizeof(struct kinfo_proc),
		1,
# endif
	};
	u_int miblen = std::size(mib);
	struct kinfo_proc info;
	usz size = sizeof(info);

	if (sysctl(mib, miblen, &info, &size, NULL, 0))
	{
		return false;
	}

	return info.KP_FLAGS & P_TRACED;
#else
	char buf[4096];
	fs::file status_fd("/proc/self/status");
	if (!status_fd)
	{
		std::fprintf(stderr, "Failed to open /proc/self/status\n");
		return false;
	}

	// Leave room for the terminator written below
	const auto num_read = status_fd.read(buf, sizeof(buf) - 1);
	if (num_read == 0 || num_read == umax)
	{
		std::fprintf(stderr, "Failed to read /proc/self/status (%d)\n", errno);
		return false;
	}

	buf[num_read] = '\0';
	std::string_view status = buf;

	const auto found = status.find("TracerPid:");
	if (found == umax)
	{
		std::fprintf(stderr, "Failed to find 'TracerPid:' in /proc/self/status\n");
		return false;
	}

	// Skip whitespace after the 10-character "TracerPid:" label; the bound includes the terminator at buf[num_read]
	for (const char* cp = status.data() + found + 10; cp <= status.data() + num_read; ++cp)
	{
		// <cctype> classifiers require a value representable as unsigned char (negative char is undefined behaviour)
		const auto ch = static_cast<unsigned char>(*cp);

		if (!std::isspace(ch))
		{
			return std::isdigit(ch) != 0 && ch != '0';
		}
	}

	return false;
#endif
}
#endif
// True when the "external debugger" core setting is enabled or the platform reports an attached debugger.
// The platform query is only made when the setting is off.
bool is_debugger_present()
{
	return g_cfg.core.external_debugger || IsDebuggerPresent();
}
#if defined(ARCH_X64)
// Operand identifiers produced by decode_x64_reg_op() and consumed by get_x64_reg_value()/put_x64_reg_value().
// Several groups intentionally share numeric values, so the meaning of a value depends on the decoded operation.
enum x64_reg_t : u32
{
// General-purpose registers, numbered 0..15 (used as the index passed to X64REG)
X64R_RAX = 0,
X64R_RCX,
X64R_RDX,
X64R_RBX,
X64R_RSP,
X64R_RBP,
X64R_RSI,
X64R_RDI,
X64R_R8,
X64R_R9,
X64R_R10,
X64R_R11,
X64R_R12,
X64R_R13,
X64R_R14,
X64R_R15,
// XMM registers restart at 0, i.e. they alias the general-purpose values above
X64R_XMM0 = 0,
X64R_XMM1,
X64R_XMM2,
X64R_XMM3,
X64R_XMM4,
X64R_XMM5,
X64R_XMM6,
X64R_XMM7,
X64R_XMM8,
X64R_XMM9,
X64R_XMM10,
X64R_XMM11,
X64R_XMM12,
X64R_XMM13,
X64R_XMM14,
X64R_XMM15,
// Low-byte registers (values continue after X64R_XMM15); read as the low 8 bits of RAX/RCX/RDX/RBX
X64R_AL,
X64R_CL,
X64R_DL,
X64R_BL,
// High-byte registers; read as bits 8..15 of RAX/RCX/RDX/RBX
X64R_AH,
X64R_CH,
X64R_DH,
X64R_BH,
// No register/immediate operand
X64_NOT_SET,
// Immediate operand of the given width, read from the tail of the instruction bytes
X64_IMM8,
X64_IMM16,
X64_IMM32,
// Condition codes: the value equals the second opcode byte of the matching SETcc instruction (0x90..0x9f).
// Odd values are the negation of the preceding even value.
X64_BIT_O = 0x90,
X64_BIT_NO,
X64_BIT_C,
X64_BIT_NC,
X64_BIT_Z,
X64_BIT_NZ,
X64_BIT_BE,
X64_BIT_NBE,
X64_BIT_S,
X64_BIT_NS,
X64_BIT_P,
X64_BIT_NP,
X64_BIT_L,
X64_BIT_NL,
X64_BIT_LE,
X64_BIT_NLE,
// Alias of X64R_CL: get_x64_reg_value() reads it as the low 32 bits of RCX when the requested size is not 1
X64R_ECX = X64R_CL,
};
// Operation classes reported by decode_x64_reg_op(); X64OP_NONE means the instruction was not recognized
enum x64_op_t : u32
{
X64OP_NONE,
X64OP_LOAD, // obtain and put the value into x64 register
X64OP_LOAD_BE, // MOVBE load
X64OP_LOAD_CMP, // memory operand compared against an immediate (flags only)
X64OP_LOAD_TEST, // memory operand tested against an immediate (flags only)
X64OP_STORE, // take the value from x64 register or an immediate and use it
X64OP_STORE_BE, // MOVBE store
X64OP_MOVS,
X64OP_STOS,
X64OP_XCHG,
X64OP_CMPXCHG,
X64OP_AND, // lock and [mem], ...
X64OP_OR, // lock or [mem], ...
X64OP_XOR, // lock xor [mem], ...
X64OP_INC, // lock inc [mem]
X64OP_DEC, // lock dec [mem]
X64OP_ADD, // lock add [mem], ...
X64OP_ADC, // lock adc [mem], ...
X64OP_SUB, // lock sub [mem], ...
X64OP_SBB, // lock sbb [mem], ...
X64OP_BEXTR, // VEX-encoded BEXTR with a memory source
};
// Third register operand (VEX.vvvv) captured by decode_x64_reg_op() whenever it decodes a VEX-prefixed instruction
static thread_local x64_reg_t s_tls_reg3{};

// Decode the single x64 instruction located at `code`.
//
// Outputs:
//   out_op     - operation class, X64OP_NONE if the instruction is not recognized
//   out_reg    - register / immediate / condition operand (see x64_reg_t), X64_NOT_SET if none
//   out_size   - operand size in bytes
//   out_length - bytes accounted for (prefixes, opcode, ModRM/SIB/displacement, immediate);
//                for MOVS/STOS only prefixes and the opcode byte are counted
// On failure every output is reset (X64OP_NONE, X64_NOT_SET, 0, 0).
//
// Only a small subset of instructions is understood; address-size override (0x67) is rejected.
void decode_x64_reg_op(const u8* code, x64_op_t& out_op, x64_reg_t& out_reg, usz& out_size, usz& out_length)
{
	// simple analysis of x64 code allows to reinterpret MOV or other instructions in any desired way
	out_length = 0;

	u8 rex = 0, pg2 = 0;

	bool oso = false, lock = false, repne = false, repe = false;

	enum : u8
	{
		LOCK = 0xf0,
		REPNE = 0xf2,
		REPE = 0xf3,
	};

	// check prefixes:
	for (;; code++, out_length++)
	{
		switch (const u8 prefix = *code)
		{
		case LOCK: // group 1
		{
			if (lock)
			{
				sig_log.error("decode_x64_reg_op(%016llxh): LOCK prefix found twice", code - out_length);
			}

			lock = true;
			continue;
		}
		case REPNE: // group 1
		{
			if (repne)
			{
				sig_log.error("decode_x64_reg_op(%016llxh): REPNE/REPNZ prefix found twice", code - out_length);
			}

			repne = true;
			continue;
		}
		case REPE: // group 1
		{
			if (repe)
			{
				sig_log.error("decode_x64_reg_op(%016llxh): REP/REPE/REPZ prefix found twice", code - out_length);
			}

			repe = true;
			continue;
		}
		case 0x2e: // group 2
		case 0x36:
		case 0x3e:
		case 0x26:
		case 0x64:
		case 0x65:
		{
			if (pg2)
			{
				sig_log.error("decode_x64_reg_op(%016llxh): 0x%02x (group 2 prefix) found after 0x%02x", code - out_length, prefix, pg2);
			}
			else
			{
				pg2 = prefix; // probably, segment register
			}

			continue;
		}
		case 0x66: // group 3
		{
			if (oso)
			{
				sig_log.error("decode_x64_reg_op(%016llxh): operand-size override prefix found twice", code - out_length);
			}

			oso = true;
			continue;
		}
		case 0x67: // group 4
		{
			sig_log.error("decode_x64_reg_op(%016llxh): address-size override prefix found", code - out_length, prefix);
			out_op = X64OP_NONE;
			out_reg = X64_NOT_SET;
			out_size = 0;
			out_length = 0;
			return;
		}
		default:
		{
			if ((prefix & 0xf0) == 0x40) // check REX prefix
			{
				if (rex)
				{
					sig_log.error("decode_x64_reg_op(%016llxh): 0x%02x (REX prefix) found after 0x%02x", code - out_length, prefix, rex);
				}
				else
				{
					rex = prefix;
				}

				continue;
			}
		}
		}

		// Not a prefix byte: `code` now points at the first opcode byte
		break;
	}

	// ModRM.reg as a general-purpose register, extended by REX.R
	auto get_modRM_reg = [](const u8* code, const u8 rex) -> x64_reg_t
	{
		return x64_reg_t{((*code & 0x38) >> 3 | (/* check REX.R bit */ rex & 4 ? 8 : 0)) + X64R_RAX};
	};

	// ModRM.reg as an XMM register, extended by REX.R
	auto get_modRM_reg_xmm = [](const u8* code, const u8 rex) -> x64_reg_t
	{
		return x64_reg_t{((*code & 0x38) >> 3 | (/* check REX.R bit */ rex & 4 ? 8 : 0)) + X64R_XMM0};
	};

	// ModRM.reg as a legacy byte register (AL, CL, DL, BL, AH, CH, DH, BH)
	auto get_modRM_reg_lh = [](const u8* code) -> x64_reg_t
	{
		return x64_reg_t{((*code & 0x38) >> 3) + X64R_AL};
	};

	// ModRM.reg as an 8-bit operand. The presence of ANY REX prefix (not only REX.W) replaces
	// AH/CH/DH/BH with SPL/BPL/SIL/DIL and enables REX.R, so the low byte of the full register is used then.
	auto get_modRM_reg8 = [&get_modRM_reg, &get_modRM_reg_lh](const u8* modrm, const u8 rex_byte) -> x64_reg_t
	{
		return rex_byte ? get_modRM_reg(modrm, rex_byte) : get_modRM_reg_lh(modrm);
	};

	// Operand size for non-byte forms: REX.W -> 8, operand-size override -> 2, otherwise 4
	auto get_op_size = [](const u8 rex, const bool oso) -> usz
	{
		return rex & 8 ? 8 : (oso ? 2 : 4);
	};

	// Length of ModRM plus optional SIB and displacement bytes
	auto get_modRM_size = [](const u8* code) -> usz
	{
		switch (*code >> 6) // check Mod
		{
		case 0: return (*code & 0x07) == 4 ? 2 : 1; // check SIB
		case 1: return (*code & 0x07) == 4 ? 3 : 2; // check SIB (disp8)
		case 2: return (*code & 0x07) == 4 ? 6 : 5; // check SIB (disp32)
		default: return 1;
		}
	};

	// Consume the first opcode byte; op2/op3 peek at the following two bytes
	const u8 op1 = (out_length++, *code++), op2 = code[0], op3 = code[1];

	switch (op1)
	{
	case 0x0f:
	{
		out_length++, code++;

		switch (op2)
		{
		case 0x11:
		case 0x29:
		{
			if (!repe && !repne) // MOVUPS/MOVAPS/MOVUPD/MOVAPD xmm/m, xmm
			{
				out_op = X64OP_STORE;
				out_reg = get_modRM_reg_xmm(code, rex);
				out_size = 16;
				out_length += get_modRM_size(code);
				return;
			}
			break;
		}
		case 0x7f:
		{
			if (repe != oso) // MOVDQU/MOVDQA xmm/m, xmm
			{
				out_op = X64OP_STORE;
				out_reg = get_modRM_reg_xmm(code, rex);
				out_size = 16;
				out_length += get_modRM_size(code);
				return;
			}
			break;
		}
		case 0xb0:
		{
			if (!oso) // CMPXCHG r8/m8, r8
			{
				out_op = X64OP_CMPXCHG;
				out_reg = get_modRM_reg8(code, rex);
				out_size = 1;
				out_length += get_modRM_size(code);
				return;
			}
			break;
		}
		case 0xb1:
		{
			if (true) // CMPXCHG r/m, r (16, 32, 64)
			{
				out_op = X64OP_CMPXCHG;
				out_reg = get_modRM_reg(code, rex);
				out_size = get_op_size(rex, oso);
				out_length += get_modRM_size(code);
				return;
			}
			break;
		}
		case 0x90:
		case 0x91:
		case 0x92:
		case 0x93:
		case 0x94:
		case 0x95:
		case 0x96:
		case 0x97:
		case 0x98:
		case 0x99: // SETNS (X64_BIT_NS)
		case 0x9a:
		case 0x9b:
		case 0x9c:
		case 0x9d:
		case 0x9e:
		case 0x9f:
		{
			if (!lock) // SETcc
			{
				out_op = X64OP_STORE;
				out_reg = x64_reg_t(X64_BIT_O + op2 - 0x90); // 0x90 .. 0x9f
				out_size = 1;
				out_length += get_modRM_size(code);
				return;
			}
			break;
		}
		case 0x38:
		{
			out_length++, code++;

			switch (op3)
			{
			case 0xf0:
			case 0xf1:
			{
				if (!repne) // MOVBE
				{
					out_op = op3 == 0xf0 ? X64OP_LOAD_BE : X64OP_STORE_BE;
					out_reg = get_modRM_reg(code, rex);
					out_size = get_op_size(rex, oso);
					out_length += get_modRM_size(code);
					return;
				}
				break;
			}
			}

			break;
		}
		}

		break;
	}
	case 0x20:
	{
		if (!oso) // AND r8/m8, r8
		{
			out_op = X64OP_AND;
			out_reg = get_modRM_reg8(code, rex);
			out_size = 1;
			out_length += get_modRM_size(code);
			return;
		}
		break;
	}
	case 0x21:
	{
		if (true) // AND r/m, r (16, 32, 64)
		{
			out_op = X64OP_AND;
			out_reg = get_modRM_reg(code, rex);
			out_size = get_op_size(rex, oso);
			out_length += get_modRM_size(code);
			return;
		}
		break;
	}
	case 0x80: // group 1: r/m8, imm8 (operation selected by ModRM.reg)
	{
		switch (get_modRM_reg(code, 0))
		{
		//case 0: out_op = X64OP_ADD; break; // TODO: strange info in instruction manual
		case 1: out_op = X64OP_OR; break;
		case 2: out_op = X64OP_ADC; break;
		case 3: out_op = X64OP_SBB; break;
		case 4: out_op = X64OP_AND; break;
		case 5: out_op = X64OP_SUB; break;
		case 6: out_op = X64OP_XOR; break;
		default: out_op = X64OP_LOAD_CMP; break;
		}

		out_reg = X64_IMM8;
		out_size = 1;
		out_length += get_modRM_size(code) + 1;
		return;
	}
	case 0x81: // group 1: r/m, imm16/imm32
	{
		switch (get_modRM_reg(code, 0))
		{
		case 0: out_op = X64OP_ADD; break;
		case 1: out_op = X64OP_OR; break;
		case 2: out_op = X64OP_ADC; break;
		case 3: out_op = X64OP_SBB; break;
		case 4: out_op = X64OP_AND; break;
		case 5: out_op = X64OP_SUB; break;
		case 6: out_op = X64OP_XOR; break;
		default: out_op = X64OP_LOAD_CMP; break;
		}

		out_reg = oso ? X64_IMM16 : X64_IMM32;
		out_size = get_op_size(rex, oso);
		out_length += get_modRM_size(code) + (oso ? 2 : 4);
		return;
	}
	case 0x83: // group 1: r/m, imm8 (sign-extended)
	{
		switch (get_modRM_reg(code, 0))
		{
		case 0: out_op = X64OP_ADD; break;
		case 1: out_op = X64OP_OR; break;
		case 2: out_op = X64OP_ADC; break;
		case 3: out_op = X64OP_SBB; break;
		case 4: out_op = X64OP_AND; break;
		case 5: out_op = X64OP_SUB; break;
		case 6: out_op = X64OP_XOR; break;
		default: out_op = X64OP_LOAD_CMP; break;
		}

		out_reg = X64_IMM8;
		out_size = get_op_size(rex, oso);
		out_length += get_modRM_size(code) + 1;
		return;
	}
	case 0x86:
	{
		if (!oso) // XCHG r8/m8, r8
		{
			out_op = X64OP_XCHG;
			out_reg = get_modRM_reg8(code, rex);
			out_size = 1;
			out_length += get_modRM_size(code);
			return;
		}
		break;
	}
	case 0x87:
	{
		if (true) // XCHG r/m, r (16, 32, 64)
		{
			out_op = X64OP_XCHG;
			out_reg = get_modRM_reg(code, rex);
			out_size = get_op_size(rex, oso);
			out_length += get_modRM_size(code);
			return;
		}
		break;
	}
	case 0x88:
	{
		if (!lock && !oso) // MOV r8/m8, r8
		{
			out_op = X64OP_STORE;
			out_reg = get_modRM_reg8(code, rex);
			out_size = 1;
			out_length += get_modRM_size(code);
			return;
		}
		break;
	}
	case 0x89:
	{
		if (!lock) // MOV r/m, r (16, 32, 64)
		{
			out_op = X64OP_STORE;
			out_reg = get_modRM_reg(code, rex);
			out_size = get_op_size(rex, oso);
			out_length += get_modRM_size(code);
			return;
		}
		break;
	}
	case 0x8a:
	{
		if (!lock && !oso) // MOV r8, r8/m8
		{
			out_op = X64OP_LOAD;
			out_reg = get_modRM_reg8(code, rex);
			out_size = 1;
			out_length += get_modRM_size(code);
			return;
		}
		break;
	}
	case 0x8b:
	{
		if (!lock) // MOV r, r/m (16, 32, 64)
		{
			out_op = X64OP_LOAD;
			out_reg = get_modRM_reg(code, rex);
			out_size = get_op_size(rex, oso);
			out_length += get_modRM_size(code);
			return;
		}
		break;
	}
	case 0xa4:
	{
		if (!oso && !lock && !repe && !rex) // MOVS
		{
			out_op = X64OP_MOVS;
			out_reg = X64_NOT_SET;
			out_size = 1;
			return;
		}
		if (!oso && !lock && repe) // REP MOVS
		{
			// The repeat counter register is reported so the caller can compute the full access size
			out_op = X64OP_MOVS;
			out_reg = rex & 8 ? X64R_RCX : X64R_ECX;
			out_size = 1;
			return;
		}
		break;
	}
	case 0xaa:
	{
		if (!oso && !lock && !repe && !rex) // STOS
		{
			out_op = X64OP_STOS;
			out_reg = X64_NOT_SET;
			out_size = 1;
			return;
		}
		if (!oso && !lock && repe) // REP STOS
		{
			out_op = X64OP_STOS;
			out_reg = rex & 8 ? X64R_RCX : X64R_ECX;
			out_size = 1;
			return;
		}
		break;
	}
	case 0xc4: // 3-byte VEX prefix
	case 0xc5: // 2-byte VEX prefix
	{
		// Last prefix byte: op2 or op3
		const u8 opx = op1 == 0xc5 ? op2 : op3;

		// Implied prefixes
		rex |= op2 & 0x80 ? 0 : 0x4; // REX.R
		rex |= op1 == 0xc4 && op3 & 0x80 ? 0x8 : 0; // REX.W ???
		oso = (opx & 0x3) == 0x1;
		repe = (opx & 0x3) == 0x2;
		repne = (opx & 0x3) == 0x3;

		const u8 vopm = op1 == 0xc5 ? 1 : op2 & 0x1f;
		const u8 vop1 = op1 == 0xc5 ? op3 : code[2];
		const u8 vlen = (opx & 0x4) ? 32 : 16;
		const u8 vreg = (~opx >> 3) & 0xf;

		// Skip the remaining VEX bytes and the opcode byte; `code` then points at ModRM
		out_length += op1 == 0xc5 ? 2 : 3;
		code += op1 == 0xc5 ? 2 : 3;

		// Remember the (inverted) vvvv operand for the caller
		s_tls_reg3 = x64_reg_t{vreg};

		if (vopm == 0x1) switch (vop1) // Implied leading byte 0x0F
		{
		case 0x11:
		case 0x29:
		{
			if (!repe && !repne) // VMOVAPS/VMOVAPD/VMOVUPS/VMOVUPD mem,reg
			{
				out_op = X64OP_STORE;
				out_reg = get_modRM_reg_xmm(code, rex);
				out_size = vlen;
				out_length += get_modRM_size(code);
				return;
			}
			break;
		}
		case 0x7f:
		{
			if (repe || oso) // VMOVDQU/VMOVDQA mem,reg
			{
				out_op = X64OP_STORE;
				out_reg = get_modRM_reg_xmm(code, rex);
				out_size = vlen;
				out_length += get_modRM_size(code);
				return;
			}
			break;
		}
		}

		if (vopm == 0x2) switch (vop1) // Implied leading bytes 0x0F 0x38
		{
		case 0xf7:
		{
			if (!repe && !repne && vlen == 16) // BEXTR r32,mem,r32
			{
				out_op = X64OP_BEXTR;
				out_reg = get_modRM_reg_xmm(code, rex); // X64R_XMM0 == X64R_RAX, so this yields a general-purpose index
				out_size = opx & 0x80 ? 8 : 4;
				out_length += get_modRM_size(code);
				return;
			}
			break;
		}
		}

		break;
	}
	case 0xc6:
	{
		if (!lock && !oso && get_modRM_reg(code, 0) == 0) // MOV r8/m8, imm8
		{
			out_op = X64OP_STORE;
			out_reg = X64_IMM8;
			out_size = 1;
			out_length += get_modRM_size(code) + 1;
			return;
		}
		break;
	}
	case 0xc7:
	{
		if (!lock && get_modRM_reg(code, 0) == 0) // MOV r/m, imm16/imm32 (16, 32, 64)
		{
			out_op = X64OP_STORE;
			out_reg = oso ? X64_IMM16 : X64_IMM32;
			out_size = get_op_size(rex, oso);
			out_length += get_modRM_size(code) + (oso ? 2 : 4);
			return;
		}
		break;
	}
	case 0xf6: // group 3: only TEST r/m8, imm8 is recognized
	{
		switch (get_modRM_reg(code, 0))
		{
		case 0: out_op = X64OP_LOAD_TEST; break;
		default: out_op = X64OP_NONE; break; // TODO...
		}

		out_reg = X64_IMM8;
		out_size = 1;
		out_length += get_modRM_size(code) + 1;
		return;
	}
	case 0xf7: // group 3: only TEST r/m, imm16/imm32 is recognized
	{
		switch (get_modRM_reg(code, 0))
		{
		case 0: out_op = X64OP_LOAD_TEST; break;
		default: out_op = X64OP_NONE; break; // TODO...
		}

		out_reg = oso ? X64_IMM16 : X64_IMM32;
		out_size = get_op_size(rex, oso);
		out_length += get_modRM_size(code) + (oso ? 2 : 4);
		return;
	}
	}

	// Unrecognized instruction
	out_op = X64OP_NONE;
	out_reg = X64_NOT_SET;
	out_size = 0;
	out_length = 0;
}
#ifdef _WIN32
// Windows: the thread CONTEXT structure plays the role of both x64_context and ucontext_t
typedef CONTEXT x64_context;
typedef CONTEXT ucontext_t;
// Pointer to general-purpose register `reg`, addressed as an array starting at CONTEXT::Rax
#define X64REG(context, reg) (&(&(context)->Rax)[reg])
// Pointer to XMM register `reg`, addressed as an array starting at CONTEXT::Xmm0
#define XMMREG(context, reg) (reinterpret_cast<v128*>(&(&(context)->Xmm0)[reg]))
#define EFLAGS(context) ((context)->EFlags)
// First and second argument registers map to RCX and RDX here
#define ARG1(context) RCX(context)
#define ARG2(context) RDX(context)
#else
typedef ucontext_t x64_context;
#ifdef __APPLE__
#define X64REG(context, reg) (darwin_x64reg(context, reg))
#define XMMREG(context, reg) (reinterpret_cast<v128*>(&(context)->uc_mcontext->__fs.__fpu_xmm0.__xmm_reg[reg]))
#define EFLAGS(context) ((context)->uc_mcontext->__ss.__rflags)

// Map a register index (0..15 in x64_reg_t order, 16 = RIP) to its slot in the Darwin thread state.
// Logs an error and returns nullptr for any other index.
u64* darwin_x64reg(x64_context *context, int reg)
{
	auto& ss = context->uc_mcontext->__ss;

	// Slot order follows the x64 register encoding, with RIP appended
	u64* const slots[] =
	{
		&ss.__rax, &ss.__rcx, &ss.__rdx, &ss.__rbx,
		&ss.__rsp, &ss.__rbp, &ss.__rsi, &ss.__rdi,
		&ss.__r8,  &ss.__r9,  &ss.__r10, &ss.__r11,
		&ss.__r12, &ss.__r13, &ss.__r14, &ss.__r15,
		&ss.__rip,
	};

	constexpr int slot_count = static_cast<int>(sizeof(slots) / sizeof(slots[0]));

	if (reg >= 0 && reg < slot_count)
	{
		return slots[reg];
	}

	sig_log.error("Invalid register index: %d", reg);
	return nullptr;
}
#elif defined(__DragonFly__) || defined(__FreeBSD__)
#define X64REG(context, reg) (freebsd_x64reg(context, reg))
#ifdef __DragonFly__
# define XMMREG(context, reg) (reinterpret_cast<v128*>((reinterpret_cast<union savefpu*>(context)->uc_mcontext.mc_fpregs)->sv_xmm.sv_xmm[reg]))
#else
# define XMMREG(context, reg) (reinterpret_cast<v128*>((reinterpret_cast<struct savefpu*>(context)->uc_mcontext.mc_fpstate)->sv_xmm[reg]))
#endif
#define EFLAGS(context) ((context)->uc_mcontext.mc_rflags)

// Map a register index (0..15 in x64_reg_t order, 16 = RIP) to its field in the FreeBSD/DragonFly mcontext.
// Logs an error and returns nullptr for any other index.
register_t* freebsd_x64reg(x64_context *context, int reg)
{
	auto& mc = context->uc_mcontext;

	// Slot order follows the x64 register encoding, with RIP appended
	register_t* const slots[] =
	{
		&mc.mc_rax, &mc.mc_rcx, &mc.mc_rdx, &mc.mc_rbx,
		&mc.mc_rsp, &mc.mc_rbp, &mc.mc_rsi, &mc.mc_rdi,
		&mc.mc_r8,  &mc.mc_r9,  &mc.mc_r10, &mc.mc_r11,
		&mc.mc_r12, &mc.mc_r13, &mc.mc_r14, &mc.mc_r15,
		&mc.mc_rip,
	};

	constexpr int slot_count = static_cast<int>(sizeof(slots) / sizeof(slots[0]));

	if (reg >= 0 && reg < slot_count)
	{
		return slots[reg];
	}

	sig_log.error("Invalid register index: %d", reg);
	return nullptr;
}
#elif defined(__OpenBSD__)
#define X64REG(context, reg) (openbsd_x64reg(context, reg))
#define XMMREG(context, reg) (reinterpret_cast<v128*>((context)->sc_fpstate->fx_xmm[reg]))
#define EFLAGS(context) ((context)->sc_rflags)

// Map a register index (0..15 in x64_reg_t order, 16 = RIP) to its field in the OpenBSD signal context.
// Logs an error and returns nullptr for any other index.
long* openbsd_x64reg(x64_context *context, int reg)
{
	// The context pointer itself addresses the sc_* fields (taking its address would yield a pointer-to-pointer)
	auto *state = context;

	switch(reg)
	{
	case 0: return &state->sc_rax;
	case 1: return &state->sc_rcx;
	case 2: return &state->sc_rdx;
	case 3: return &state->sc_rbx;
	case 4: return &state->sc_rsp;
	case 5: return &state->sc_rbp;
	case 6: return &state->sc_rsi;
	case 7: return &state->sc_rdi;
	case 8: return &state->sc_r8;
	case 9: return &state->sc_r9;
	case 10: return &state->sc_r10;
	case 11: return &state->sc_r11;
	case 12: return &state->sc_r12;
	case 13: return &state->sc_r13;
	case 14: return &state->sc_r14;
	case 15: return &state->sc_r15;
	case 16: return &state->sc_rip;
	default:
		sig_log.error("Invalid register index: %d", reg);
		return nullptr;
	}
}
#elif defined(__NetBSD__)
// Translates the register indices used by X64REG (0..15 in x64_reg_t order, 16 = RIP) to NetBSD __gregs slots
static const decltype(_REG_RAX) reg_table[] =
{
_REG_RAX, _REG_RCX, _REG_RDX, _REG_RBX, _REG_RSP, _REG_RBP, _REG_RSI, _REG_RDI,
_REG_R8, _REG_R9, _REG_R10, _REG_R11, _REG_R12, _REG_R13, _REG_R14, _REG_R15, _REG_RIP
};
#define X64REG(context, reg) (&(context)->uc_mcontext.__gregs[reg_table[reg]])
// NOTE(review): every other platform branch names this accessor XMMREG; "XMM_sig" looks like a naming slip — confirm before relying on XMMREG here
#define XMM_sig(context, reg) (reinterpret_cast<v128*>(((struct fxsave64*)(context)->uc_mcontext.__fpregs)->fx_xmm[reg]))
#define EFLAGS(context) ((context)->uc_mcontext.__gregs[_REG_RFL])
#else
// Fallback branch (no Windows/Apple/BSD match): translates the register indices used by X64REG
// (0..15 in x64_reg_t order, 16 = RIP) to uc_mcontext.gregs slots
static const int reg_table[] =
{
REG_RAX, REG_RCX, REG_RDX, REG_RBX, REG_RSP, REG_RBP, REG_RSI, REG_RDI,
REG_R8, REG_R9, REG_R10, REG_R11, REG_R12, REG_R13, REG_R14, REG_R15, REG_RIP
};
#define X64REG(context, reg) (&(context)->uc_mcontext.gregs[reg_table[reg]])
#ifdef __sun
// NOTE(review): this indexes xmm[] through reg_table, which holds general-purpose register ids — confirm this is intended
#define XMMREG(context, reg) (reinterpret_cast<v128*>(&(context)->uc_mcontext.fpregs.fp_reg_set.fpchip_state.xmm[reg_table[reg]]))
#else
#define XMMREG(context, reg) (reinterpret_cast<v128*>(&(context)->uc_mcontext.fpregs->_xmm[reg]))
#endif // __sun
#define EFLAGS(context) ((context)->uc_mcontext.gregs[REG_EFL])
#endif // __APPLE__
// Non-Windows: first and second argument registers map to RDI and RSI
#define ARG1(context) RDI(context)
#define ARG2(context) RSI(context)
#endif
// Lvalue shorthands over X64REG; indices follow x64_reg_t, and 16 selects the instruction pointer
#define RAX(c) (*X64REG((c), 0))
#define RCX(c) (*X64REG((c), 1))
#define RDX(c) (*X64REG((c), 2))
#define RSP(c) (*X64REG((c), 4))
#define RSI(c) (*X64REG((c), 6))
#define RDI(c) (*X64REG((c), 7))
#define RIP(c) (*X64REG((c), 16))
// Fetch the value of the operand `reg` for an instruction of `i_size` bytes located at RIP(context).
//
// reg may be a general-purpose register, a low/high byte register, an immediate placed at the
// end of the instruction, X64R_ECX (low 32 bits of RCX) or a SETcc condition code evaluated
// against EFLAGS. d_size is the operand size in bytes.
//
// Returns true and writes out_value on success; logs an error and returns false when the
// register/size combination is not supported.
bool get_x64_reg_value(x64_context* context, x64_reg_t reg, usz d_size, usz i_size, u64& out_value)
{
	// Common failure path
	const auto reject = [&]() -> bool
	{
		sig_log.error("get_x64_reg_value(): invalid arguments (reg=%d, d_size=%lld, i_size=%lld)", +reg, d_size, i_size);
		return false;
	};

	// General-purpose register, truncated to the operand size
	if (reg - X64R_RAX < 16)
	{
		const u64 full = *X64REG(context, reg - X64R_RAX);

		if (d_size == 1)
		{
			out_value = static_cast<u8>(full);
			return true;
		}

		if (d_size == 2)
		{
			out_value = static_cast<u16>(full);
			return true;
		}

		if (d_size == 4)
		{
			out_value = static_cast<u32>(full);
			return true;
		}

		if (d_size == 8)
		{
			out_value = full;
			return true;
		}

		return reject();
	}

	// AL, CL, DL, BL: bits 0..7
	if (reg - X64R_AL < 4 && d_size == 1)
	{
		out_value = static_cast<u8>(*X64REG(context, reg - X64R_AL));
		return true;
	}

	// AH, CH, DH, BH: bits 8..15
	if (reg - X64R_AH < 4 && d_size == 1)
	{
		out_value = static_cast<u8>(*X64REG(context, reg - X64R_AH) >> 8);
		return true;
	}

	// Immediates are assumed to be the trailing bytes of the instruction
	if (reg == X64_IMM8)
	{
		const s8 imm = *reinterpret_cast<s8*>(RIP(context) + i_size - 1);

		// Wider operand sizes receive the sign-extended immediate
		switch (d_size)
		{
		case 1: out_value = static_cast<u8>(imm); return true;
		case 2: out_value = static_cast<u16>(imm); return true;
		case 4: out_value = static_cast<u32>(imm); return true;
		case 8: out_value = static_cast<u64>(imm); return true;
		}

		return reject();
	}

	if (reg == X64_IMM16)
	{
		const s16 imm = *reinterpret_cast<s16*>(RIP(context) + i_size - 2);

		if (d_size == 2)
		{
			out_value = static_cast<u16>(imm);
			return true;
		}

		return reject();
	}

	if (reg == X64_IMM32)
	{
		const s32 imm = *reinterpret_cast<s32*>(RIP(context) + i_size - 4);

		switch (d_size)
		{
		case 4: out_value = static_cast<u32>(imm); return true;
		case 8: out_value = static_cast<u64>(imm); return true; // sign-extended
		}

		return reject();
	}

	// Same numeric value as X64R_CL; only reached when the byte-register test above did not match
	if (reg == X64R_ECX)
	{
		out_value = static_cast<u32>(RCX(context));
		return true;
	}

	// SETcc condition: evaluate against the saved flags
	if (reg >= X64_BIT_O && reg <= X64_BIT_NLE)
	{
		const u32 carry = EFLAGS(context) & 0x1;
		const u32 zero = EFLAGS(context) & 0x40;
		const u32 negative = EFLAGS(context) & 0x80;
		const u32 overflow = EFLAGS(context) & 0x800;
		const u32 parity = EFLAGS(context) & 0x4;
		const u32 less = (negative << 4) ^ overflow; // SF != OF

		// Indexed by (condition - X64_BIT_O) / 2: O, C, Z, BE, S, P, L, LE
		const bool positive_form[8] =
		{
			overflow != 0,
			carry != 0,
			zero != 0,
			(carry | zero) != 0,
			negative != 0,
			parity != 0,
			less != 0,
			(less | zero) != 0,
		};

		// Odd condition codes are the negated form of the preceding even one
		out_value = positive_form[(reg - X64_BIT_O) >> 1] ^ (reg & 1);
		return true;
	}

	return reject();
}
// Store `value` into general-purpose register `reg` of the saved context (used for emulated loads).
//
// d_size selects the written width in bytes, following x86-64 register write rules:
//   1, 2 - only the low 8/16 bits are replaced, all upper bits are preserved
//   4    - the value is zero-extended to 64 bits
//   8    - the full register is replaced
// Only X64R_RAX..X64R_R15 are accepted (byte-register ids such as X64R_AL/X64R_AH are not).
// Returns false and logs an error for any other register or size.
bool put_x64_reg_value(x64_context* context, x64_reg_t reg, usz d_size, u64 value)
{
	// save x64 reg value (for load operations)
	if (reg - X64R_RAX < 16)
	{
		auto& slot = *X64REG(context, reg - X64R_RAX);
		const u64 previous = static_cast<u64>(slot);

		// save the value into x64 register
		switch (d_size)
		{
		// 64-bit masks: a 32-bit literal such as 0xffffff00 would wipe bits 32..63 of the register
		case 1: slot = (value & 0xff) | (previous & ~0xffull); return true;
		case 2: slot = (value & 0xffff) | (previous & ~0xffffull); return true;
		case 4: slot = value & 0xffffffff; return true;
		case 8: slot = value; return true;
		}
	}

	sig_log.error("put_x64_reg_value(): invalid destination (reg=%d, d_size=%lld, value=0x%llx)", +reg, d_size, value);
	return false;
}
// Update EFLAGS in the saved context as a compare (x - y) of d_size bytes would:
// ZF, SF, OF, PF and AF are always written; CF is written only when `carry` is true.
// Both operands are truncated to the operand size before the flags are derived.
// With y == 0 this yields CF = OF = AF = 0 and ZF/SF/PF describing x, which is what the
// TEST-style callers rely on.
// Returns false (after logging) when d_size is not 1, 2, 4 or 8.
bool set_x64_cmp_flags(x64_context* context, usz d_size, u64 x, u64 y, bool carry = true)
{
	switch (d_size)
	{
	case 1: break;
	case 2: break;
	case 4: break;
	case 8: break;
	default: sig_log.error("set_x64_cmp_flags(): invalid d_size (%lld)", d_size); return false;
	}

	const u64 sign = 1ull << (d_size * 8 - 1); // sign mask
	const u64 mask = sign | (sign - 1); // all bits of the operand
	const u64 lhs = x & mask;
	const u64 rhs = y & mask;
	const u64 diff = (lhs - rhs) & mask;

	// CF: unsigned borrow
	if (carry && lhs < rhs)
	{
		EFLAGS(context) |= 0x1; // set CF
	}
	else if (carry)
	{
		EFLAGS(context) &= ~0x1; // clear CF
	}

	if (diff == 0)
	{
		EFLAGS(context) |= 0x40; // set ZF
	}
	else
	{
		EFLAGS(context) &= ~0x40; // clear ZF
	}

	if (diff & sign)
	{
		EFLAGS(context) |= 0x80; // set SF
	}
	else
	{
		EFLAGS(context) &= ~0x80; // clear SF
	}

	// OF: operands have different signs and the result's sign differs from the minuend's
	if ((lhs ^ rhs) & (lhs ^ diff) & sign)
	{
		EFLAGS(context) |= 0x800; // set OF
	}
	else
	{
		EFLAGS(context) &= ~0x800; // clear OF
	}

	// PF: even parity of the low result byte (fold the byte down to one bit)
	const u8 p1 = static_cast<u8>(diff) ^ (static_cast<u8>(diff) >> 4);
	const u8 p2 = p1 ^ (p1 >> 2);
	const u8 p3 = p2 ^ (p2 >> 1);

	if ((p3 & 1) == 0)
	{
		EFLAGS(context) |= 0x4; // set PF
	}
	else
	{
		EFLAGS(context) &= ~0x4; // clear PF
	}

	// AF: borrow out of the low nibble
	if ((lhs ^ rhs ^ diff) & 0x10)
	{
		EFLAGS(context) |= 0x10; // set AF
	}
	else
	{
		EFLAGS(context) &= ~0x10; // clear AF
	}

	return true;
}
// Compute the number of bytes touched by the decoded instruction.
//
// For everything except MOVS/STOS this is simply d_size. For MOVS/STOS:
//   - returns 0 when the direction flag is set (not supported yet),
//   - returns d_size when no counter register was decoded (no REP prefix),
//   - otherwise returns d_size multiplied by the counter read from `reg`,
//     or usz(-1) if the counter cannot be read.
usz get_x64_access_size(x64_context* context, x64_op_t op, x64_reg_t reg, usz d_size, usz i_size)
{
	const bool is_string_op = op == X64OP_MOVS || op == X64OP_STOS;

	if (!is_string_op)
	{
		return d_size;
	}

	if (EFLAGS(context) & 0x400 /* direction flag */)
	{
		// TODO
		return 0;
	}

	if (reg == X64_NOT_SET)
	{
		// Single iteration
		return d_size;
	}

	// get "full" access size from RCX register
	u64 repeat = 1;

	if (!get_x64_reg_value(context, reg, 8, i_size, repeat))
	{
		return -1;
	}

	return d_size * repeat;
}
#elif defined(ARCH_ARM64)
// ARM64: RIP(context) names the program counter field of the platform's signal context
#if defined(__APPLE__)
// https://github.com/bombela/backward-cpp/issues/200
#define RIP(context) ((context)->uc_mcontext->__ss.__pc)
#elif defined(__FreeBSD__)
#define RIP(context) ((context)->uc_mcontext.mc_gpregs.gp_elr)
#elif defined(__NetBSD__)
#define RIP(context) ((context)->uc_mcontext.__gregs[_REG_PC])
#elif defined(__OpenBSD__)
#define RIP(context) ((context)->sc_elr)
#else
#define RIP(context) ((context)->uc_mcontext.pc)
#endif
#endif /* ARCH_ */
namespace rsx
{
// Optional RSX hook, defined elsewhere. handle_access_violation() calls it (when set) for mapped
// addresses below RAW_SPU_BASE_ADDR; a true result means the fault was handled.
extern std::function<bool(u32 addr, bool is_writing)> g_access_violation_handler;
}
bool handle_access_violation(u32 addr, bool is_writing, ucontext_t* context) noexcept
{
g_tls_fault_all++;
const auto cpu = get_current_cpu_thread();
struct spu_unsavable
{
spu_thread* _spu;
spu_unsavable(cpu_thread* cpu) noexcept
: _spu(cpu ? cpu->try_get<spu_thread>() : nullptr)
{
if (_spu)
{
if (_spu->unsavable)
{
_spu = nullptr;
}
else
{
// Must not be saved inside access violation handler because it is unpredictable
_spu->unsavable = true;
}
}
}
~spu_unsavable() noexcept
{
if (_spu)
{
_spu->unsavable = false;
}
}
} spu_protection{cpu};
if (addr < RAW_SPU_BASE_ADDR && vm::check_addr(addr) && rsx::g_access_violation_handler)
{
bool state_changed = false;
if (cpu)
{
state_changed = vm::temporary_unlock(*cpu);
}
bool handled = rsx::g_access_violation_handler(addr, is_writing);
if (state_changed && (cpu->state += cpu_flag::temp, cpu->test_stopped()))
{
//
}
if (handled)
{
g_tls_fault_rsx++;
return true;
}
}
#if defined(ARCH_X64)
const u8* const code = reinterpret_cast<u8*>(RIP(context));
x64_op_t op;
x64_reg_t reg;
usz d_size;
usz i_size;
// decode single x64 instruction that causes memory access
decode_x64_reg_op(code, op, reg, d_size, i_size);
auto report_opcode = [=]()
{
if (op == X64OP_NONE)
{
be_t<v128> dump;
std::memcpy(&dump, code, sizeof(dump));
sig_log.error("decode_x64_reg_op(%p): unsupported opcode: %s", code, dump);
}
};
if (0x1'0000'0000ull - addr < d_size)
{
sig_log.error("Invalid d_size (0x%llx)", d_size);
report_opcode();
return false;
}
// get length of data being accessed
usz a_size = get_x64_access_size(context, op, reg, d_size, i_size);
if (0x1'0000'0000ull - addr < a_size)
{
sig_log.error("Invalid a_size (0x%llx)", a_size);
report_opcode();
return false;
}
// check if address is RawSPU MMIO register
do if (addr - RAW_SPU_BASE_ADDR < (6 * RAW_SPU_OFFSET) && (addr % RAW_SPU_OFFSET) >= RAW_SPU_PROB_OFFSET)
{
auto thread = idm::get_unlocked<named_thread<spu_thread>>(spu_thread::find_raw_spu((addr - RAW_SPU_BASE_ADDR) / RAW_SPU_OFFSET));
if (!thread)
{
break;
}
if (!a_size || !d_size || !i_size)
{
sig_log.error("Invalid or unsupported instruction (op=%d, reg=%d, d_size=%lld, a_size=0x%llx, i_size=%lld)", +op, +reg, d_size, a_size, i_size);
report_opcode();
return false;
}
if (a_size != 4)
{
// Might be unimplemented, such as writing MFC proxy EAL+EAH using 64-bit store
break;
}
switch (op)
{
case X64OP_LOAD:
case X64OP_LOAD_BE:
case X64OP_LOAD_CMP:
case X64OP_LOAD_TEST:
{
u32 value;
if (is_writing || !thread->read_reg(addr, value))
{
return false;
}
if (op != X64OP_LOAD_BE)
{
value = stx::se_storage<u32>::swap(value);
}
if (op == X64OP_LOAD_CMP)
{
u64 rvalue;
if (!get_x64_reg_value(context, reg, d_size, i_size, rvalue) || !set_x64_cmp_flags(context, d_size, value, rvalue))
{
return false;
}
break;
}
if (op == X64OP_LOAD_TEST)
{
u64 rvalue;
if (!get_x64_reg_value(context, reg, d_size, i_size, rvalue) || !set_x64_cmp_flags(context, d_size, value & rvalue, 0))
{
return false;
}
break;
}
if (!put_x64_reg_value(context, reg, d_size, value))
{
return false;
}
break;
}
case X64OP_BEXTR:
{
u32 value;
if (is_writing || !thread->read_reg(addr, value))
{
return false;
}
value = stx::se_storage<u32>::swap(value);
u64 ctrl;
if (!get_x64_reg_value(context, s_tls_reg3, d_size, i_size, ctrl))
{
return false;
}
u8 start = ctrl & 0xff;
u8 _len = (ctrl & 0xff00) >> 8;
if (_len > 32)
_len = 32;
if (start > 32)
start = 32;
value = (u64{value} >> start) & ~(u64{umax} << _len);
if (!put_x64_reg_value(context, reg, d_size, value) || !set_x64_cmp_flags(context, d_size, value, 0))
{
return false;
}
break;
}
case X64OP_STORE:
case X64OP_STORE_BE:
{
u64 reg_value;
if (!is_writing || !get_x64_reg_value(context, reg, d_size, i_size, reg_value))
{
return false;
}
u32 val32 = static_cast<u32>(reg_value);
if (!thread->write_reg(addr, op == X64OP_STORE ? stx::se_storage<u32>::swap(val32) : val32))
{
return false;
}
break;
}
case X64OP_MOVS: // possibly, TODO
case X64OP_STOS:
default:
{
sig_log.error("Invalid or unsupported operation (op=%d, reg=%d, d_size=%lld, i_size=%lld)", +op, +reg, d_size, i_size);
report_opcode();
return false;
}
}
// skip processed instruction
RIP(context) += i_size;
g_tls_fault_spu++;
return true;
} while (0);
#else
static_cast<void>(context);
#endif /* ARCH_ */
if (vm::check_addr(addr, is_writing ? vm::page_writable : vm::page_readable))
{
return true;
}
// Hack: allocate memory in case the emulator is stopping
const auto hack_alloc = [&]()
{
g_tls_access_violation_recovered = true;
if (vm::check_addr(addr, is_writing ? vm::page_writable : vm::page_readable))
{
return true;
}
const auto area = vm::reserve_map(vm::any, addr & -0x10000, 0x10000);
if (!area)
{
return false;
}
if (vm::writer_lock mlock; area->flags & vm::preallocated || vm::check_addr(addr, 0))
{
// For allocated memory with protection lower than required (such as protection::no or read-only while writing to it)
utils::memory_protect(vm::base(addr & -0x1000), 0x1000, utils::protection::rw);
return true;
}
return area->falloc(addr & -0x10000, 0x10000) || vm::check_addr(addr, is_writing ? vm::page_writable : vm::page_readable);
};
if (cpu && (cpu->get_class() == thread_class::ppu || cpu->get_class() == thread_class::spu))
{
vm::temporary_unlock(*cpu);
u32 pf_port_id = 0;
if (auto& pf_entries = g_fxo->get<page_fault_notification_entries>(); true)
{
if (auto mem = vm::get(vm::any, addr))
{
reader_lock lock(pf_entries.mutex);
for (const auto& entry : pf_entries.entries)
{
if (entry.start_addr == mem->addr)
{
pf_port_id = entry.port_id;
break;
}
}
}
}
if (auto pf_port = idm::get_unlocked<lv2_obj, lv2_event_port>(pf_port_id); pf_port && pf_port->queue)
{
// We notify the game that a page fault occurred so it can rectify it.
// Note, for data3, were the memory readable AND we got a page fault, it must be due to a write violation since reads are allowed.
u64 data1 = addr;
u64 data2 = 0;
if (cpu->try_get<ppu_thread>())
{
data2 = (SYS_MEMORY_PAGE_FAULT_TYPE_PPU_THREAD << 32) | cpu->id;
}
else if (auto spu = cpu->try_get<spu_thread>())
{
const u64 type = spu->get_type() == spu_type::threaded ?
SYS_MEMORY_PAGE_FAULT_TYPE_SPU_THREAD :
SYS_MEMORY_PAGE_FAULT_TYPE_RAW_SPU;
data2 = (type << 32) | spu->lv2_id;
}
u64 data3;
{
vm::writer_lock rlock;
if (vm::check_addr(addr, is_writing ? vm::page_writable : vm::page_readable))
{
// Memory was allocated inbetween, retry
return true;
}
else if (vm::check_addr(addr))
{
data3 = SYS_MEMORY_PAGE_FAULT_CAUSE_READ_ONLY; // TODO
}
else
{
data3 = SYS_MEMORY_PAGE_FAULT_CAUSE_NON_MAPPED;
}
}
// Now, place the page fault event onto table so that other functions [sys_mmapper_free_address and pagefault recovery funcs etc]
// know that this thread is page faulted and where.
auto& pf_events = g_fxo->get<page_fault_event_entries>();
// De-schedule
if (cpu->get_class() == thread_class::ppu)
{
cpu->state -= cpu_flag::signal; // Cannot use check_state here and signal must be removed if exists
lv2_obj::sleep(*cpu);
}
auto send_event = [&]() -> error_code
{
lv2_obj::notify_all_t notify_later{};
std::lock_guard pf_lock(pf_events.pf_mutex);
if (auto error = pf_port->queue->send(pf_port->name ? pf_port->name : ((u64{process_getpid() + 0u} << 32) | u64{pf_port_id}), data1, data2, data3))
{
return error;
}
pf_events.events.emplace(cpu, addr);
return {};
};
sig_log.warning("Page_fault %s location 0x%x because of %s memory", is_writing ? "writing" : "reading",
addr, data3 == SYS_MEMORY_PAGE_FAULT_CAUSE_READ_ONLY ? "writing read-only" : "using unmapped");
if (cpu->get_class() == thread_class::ppu)
{
if (const auto func = static_cast<ppu_thread*>(cpu)->current_function)
{
sig_log.warning("Page_fault while in function %s", func);
}
}
error_code sending_error = not_an_error(CELL_EBUSY);
// If we fail due to being busy, wait a bit and try again.
for (u64 sleep_until = get_system_time(); static_cast<u32>(sending_error) == CELL_EBUSY; thread_ctrl::wait_until(&sleep_until, 1000))
{
sending_error = send_event();
if (cpu->is_stopped())
{
sending_error = {};
break;
}
}
if (sending_error)
{
vm_log.error("Unknown error 0x%x while trying to pass page fault.", +sending_error);
return false;
}
else
{
// Wait until the thread is recovered
while (auto state = cpu->state.fetch_sub(cpu_flag::signal))
{
if (is_stopped(state) || state & cpu_flag::signal)
{
break;
}
thread_ctrl::wait_on(cpu->state, state);
}
}
// Reschedule, test cpu state and try recovery if stopped
if (cpu->test_stopped() && !hack_alloc())
{
return false;
}
return true;
}
if (cpu->get_class() == thread_class::spu)
{
if (!g_tls_access_violation_recovered)
{
vm_log.notice("\n%s", dump_useful_thread_info());
vm_log.always()("[%s] Access violation %s location 0x%x (%s)", cpu->get_name(), is_writing ? "writing" : "reading", addr, (is_writing && vm::check_addr(addr)) ? "read-only memory" : "unmapped memory");
}
// TODO:
// RawSPU: Send appropriate interrupt
// SPUThread: Send sys_spu exception event
cpu->state += cpu_flag::dbg_pause;
if (cpu->check_state() && !hack_alloc())
{
return false;
}
return true;
}
else
{
if (auto last_func = static_cast<ppu_thread*>(cpu)->current_function)
{
ppu_log.fatal("Function aborted: %s", last_func);
}
lv2_obj::sleep(*cpu);
}
}
if (cpu)
{
cpu->state += cpu_flag::wait;
}
Emu.Pause(true);
if (!g_tls_access_violation_recovered)
{
vm_log.notice("\n%s", dump_useful_thread_info());
}
// Note: a thread may access violate more than once after hack_alloc recovery
// Do not log any further access violations in this case.
if (!g_tls_access_violation_recovered)
{
vm_log.fatal("Access violation %s location 0x%x (%s)", is_writing ? "writing" : (cpu && cpu->get_class() == thread_class::ppu && cpu->get_pc() == addr ? "executing" : "reading"), addr, (is_writing && vm::check_addr(addr)) ? "read-only memory" : "unmapped memory");
}
while (Emu.IsPaused())
{
thread_ctrl::wait();
}
if (Emu.IsStopped() && !hack_alloc())
{
return false;
}
return true;
}
// Appends a single line identifying the calling thread to `msg`:
// the emulator thread name when a thread_ctrl context exists, a fixed label
// for the main thread, or the numeric thread id otherwise.
static void append_thread_name(std::string& msg)
{
	// Emulator-managed thread: report its registered name
	if (thread_ctrl::get_current())
	{
		fmt::append(msg, "Emu Thread Name: '%s'.\n", thread_ctrl::get_name());
		return;
	}

	// Not emulator-managed, but it is the main thread
	if (thread_ctrl::is_main())
	{
		fmt::append(msg, "Thread: Main Thread.\n");
		return;
	}

	// Any other thread: only the id is available
	fmt::append(msg, "Thread id = %u.\n", thread_ctrl::get_tid());
}
#ifdef _WIN32
// Vectored exception handler.
// For non-executing access violations it translates the faulting host pointer
// into a 32-bit address and lets handle_access_violation() try to recover.
// Returns EXCEPTION_CONTINUE_EXECUTION when recovery succeeded and
// EXCEPTION_CONTINUE_SEARCH in every other case (after dumping thread info
// for the exception codes listed below).
static LONG exception_handler(PEXCEPTION_POINTERS pExp) noexcept
{
	const auto record = pExp->ExceptionRecord;
	const auto code = record->ExceptionCode;

	// Breakpoints are not ours to handle
	if (code == EXCEPTION_BREAKPOINT)
	{
		return EXCEPTION_CONTINUE_SEARCH;
	}

	const auto fault_ptr = reinterpret_cast<u8*>(record->ExceptionInformation[1]);
	const auto access_kind = record->ExceptionInformation[0];
	const bool is_writing = access_kind == 1;
	const bool is_executing = access_kind == 8;

	if (code == EXCEPTION_ACCESS_VIOLATION && !is_executing)
	{
		u32 addr = 0;

		const auto [vm_addr, vm_ok] = vm::try_get_addr(fault_ptr);

		if (vm_ok)
		{
			addr = vm_addr;
		}
		else
		{
			// Fall back to an offset relative to vm::g_exec_addr (halved)
			const usz exec64 = (fault_ptr - vm::g_exec_addr) / 2;

			if (exec64 > u32{umax})
			{
				// Pointer does not map to any 32-bit address
				return EXCEPTION_CONTINUE_SEARCH;
			}

			addr = static_cast<u32>(exec64);
		}

		// Recovery is attempted only on threads with a thread_ctrl context
		if (thread_ctrl::get_current() && handle_access_violation(addr, is_writing, pExp->ContextRecord))
		{
			return EXCEPTION_CONTINUE_EXECUTION;
		}
	}

	// Decide whether this exception code warrants a thread dump
	bool dump_threads = false;

	switch (code)
	{
	case EXCEPTION_ACCESS_VIOLATION:
	case EXCEPTION_ARRAY_BOUNDS_EXCEEDED:
	case EXCEPTION_DATATYPE_MISALIGNMENT:
	case EXCEPTION_ILLEGAL_INSTRUCTION:
	case EXCEPTION_IN_PAGE_ERROR:
	case EXCEPTION_INT_DIVIDE_BY_ZERO:
	case EXCEPTION_NONCONTINUABLE_EXCEPTION:
	case EXCEPTION_PRIV_INSTRUCTION:
	//case EXCEPTION_STACK_OVERFLOW:
		dump_threads = true;
		break;
	default:
		break;
	}

	if (dump_threads)
	{
		sys_log.notice("\n%s", dump_useful_thread_info());
		logs::listener::sync_all();
	}

	return EXCEPTION_CONTINUE_SEARCH;
}
// Unhandled-exception filter.
// Builds a textual crash report (exception code, fault details, thread identity,
// instruction/function address and the module containing the instruction),
// logs it as fatal, flushes the log listeners and hands the message to
// thread_ctrl::emergency_exit(). There is no return statement: control is not
// expected to come back from emergency_exit().
static LONG exception_filter(PEXCEPTION_POINTERS pExp) noexcept
{
std::string msg = fmt::format("Unhandled Win32 exception 0x%08X.\n", pExp->ExceptionRecord->ExceptionCode);
if (pExp->ExceptionRecord->ExceptionCode == EXCEPTION_ACCESS_VIOLATION)
{
// ExceptionInformation[0] encodes the access kind (8 / 1 / anything else), [1] the faulting address
const auto cause =
pExp->ExceptionRecord->ExceptionInformation[0] == 8 ? "executing" :
pExp->ExceptionRecord->ExceptionInformation[0] == 1 ? "writing" : "reading";
fmt::append(msg, "Segfault %s location %p at %p.\n", cause, pExp->ExceptionRecord->ExceptionInformation[1], pExp->ExceptionRecord->ExceptionAddress);
}
else
{
// Any other exception: dump the address and every raw parameter
fmt::append(msg, "Exception address: %p.\n", pExp->ExceptionRecord->ExceptionAddress);
for (DWORD i = 0; i < pExp->ExceptionRecord->NumberParameters; i++)
{
fmt::append(msg, "ExceptionInformation[0x%x]: %p.\n", i, pExp->ExceptionRecord->ExceptionInformation[i]);
}
}
append_thread_name(msg);
// Enumerate loaded modules. `size` starts as an element count, is overwritten by
// EnumProcessModules (through its last argument) and is divided back into an
// element count by the loop increment; the loop repeats until the vector size
// matches that count. On failure the list is cleared.
std::vector<HMODULE> modules;
for (DWORD size = 256; modules.size() != size; size /= sizeof(HMODULE))
{
modules.resize(size);
if (!EnumProcessModules(GetCurrentProcess(), modules.data(), size * sizeof(HMODULE), &size))
{
modules.clear();
break;
}
}
// Faulting instruction pointer, read from the architecture-specific context field
#if defined(ARCH_X64)
const auto exec_addr = pExp->ContextRecord->Rip;
#elif defined(ARCH_ARM64)
const auto exec_addr = pExp->ContextRecord->Pc;
#else
#error "Unimplemented exception handling for this architecture"
#endif
fmt::append(msg, "Instruction address: %p.\n", exec_addr);
DWORD64 unwind_base;
// If unwind data exists for the instruction, report the containing function's start
if (const auto rtf = RtlLookupFunctionEntry(exec_addr, &unwind_base, nullptr))
{
// Get function address
const DWORD64 func_addr = rtf->BeginAddress + unwind_base;
fmt::append(msg, "Function address: %p (base+0x%x).\n", func_addr, rtf->BeginAddress);
// Access UNWIND_INFO structure
//const auto uw = (u8*)(unwind_base + rtf->UnwindData);
}
// Find the module whose image range contains the instruction address
for (HMODULE _module : modules)
{
MODULEINFO info;
if (GetModuleInformation(GetCurrentProcess(), _module, &info, sizeof(info)))
{
const DWORD64 base = reinterpret_cast<DWORD64>(info.lpBaseOfDll);
if (exec_addr >= base && exec_addr < base + info.SizeOfImage)
{
std::string module_name;
// Resize-and-retry until the string length equals the length returned by the API.
// NOTE(review): convergence relies on GetModuleBaseNameA's return value for
// short buffers - confirm against the API documentation before changing this loop.
for (DWORD size = 15; module_name.size() != size;)
{
module_name.resize(size);
size = GetModuleBaseNameA(GetCurrentProcess(), _module, &module_name.front(), size + 1);
if (!size)
{
// Lookup failed: report an empty name
module_name.clear();
break;
}
}
fmt::append(msg, "Module name: '%s'.\n", module_name);
fmt::append(msg, "Module base: %p.\n", info.lpBaseOfDll);
}
}
}
fmt::append(msg, "RPCS3 image base: %p.\n", GetModuleHandle(NULL));
// TODO: print registers and the callstack
sys_log.fatal("\n%s", msg);
// Make sure the report reaches the log before the thread/process goes down
logs::listener::sync_all();
thread_ctrl::emergency_exit(msg);
}
const bool s_exception_handler_set = []() -> bool
{
#ifdef USE_ASAN
if (!AddVectoredExceptionHandler(FALSE, static_cast<PVECTORED_EXCEPTION_HANDLER>(exception_handler)))
#else
if (!AddVectoredExceptionHandler(1, static_cast<PVECTORED_EXCEPTION_HANDLER>(exception_handler)))
#endif
{
report_fatal_error("AddVectoredExceptionHandler() failed.");
}
if (!SetUnhandledExceptionFilter(static_cast<LPTOP_LEVEL_EXCEPTION_FILTER>(exception_filter)))
{
report_fatal_error("SetUnhandledExceptionFilter() failed.");
}
return true;
}();
#else
// POSIX fault handler (installed for SIGSEGV, and SIGBUS on Apple, by s_exception_handler_set).
// Determines whether the fault was a read, write or instruction fetch, then tries to
// recover through handle_access_violation(): first with the address obtained from
// vm::try_get_addr(), then with an address derived from the offset to vm::g_exec_addr.
// If neither succeeds, the fault is logged as fatal and the thread either raises
// SIGTRAP (debugger attached) or goes through thread_ctrl::emergency_exit().
static void signal_handler(int /*sig*/, siginfo_t* info, void* uct) noexcept
{
ucontext_t* context = static_cast<ucontext_t*>(uct);
#if defined(ARCH_X64)
// Fetch the fault error code from the OS-specific location inside the machine context
#ifdef __APPLE__
const u64 err = context->uc_mcontext->__es.__err;
#elif defined(__DragonFly__) || defined(__FreeBSD__)
const u64 err = context->uc_mcontext.mc_err;
#elif defined(__OpenBSD__)
const u64 err = context->sc_err;
#elif defined(__NetBSD__)
const u64 err = context->uc_mcontext.__gregs[_REG_ERR];
#else
const u64 err = context->uc_mcontext.gregs[REG_ERR];
#endif
// Bit 4 (0x10) is treated as "instruction fetch", bit 1 (0x2) as "write"
const bool is_executing = err & 0x10;
const bool is_writing = err & 0x2;
#elif defined(ARCH_ARM64)
// A fault whose address equals the program counter is treated as an execution fault
const bool is_executing = uptr(info->si_addr) == uptr(RIP(context));
#if defined(__linux__) || defined(__APPLE__)
// Current CPU state decoder is reverse-engineered from the linux kernel and may not work on other platforms.
const auto decoded_reason = aarch64::decode_fault_reason(context);
const bool is_writing = (decoded_reason == aarch64::fault_reason::data_write);
if (decoded_reason != aarch64::fault_reason::data_write &&
decoded_reason != aarch64::fault_reason::data_read)
{
// We don't expect other classes of exceptions during normal executions
sig_log.warning("Unexpected fault. Reason: %d", static_cast<int>(decoded_reason));
}
#else
// No decoder available: read the faulting instruction word (skipped for execution
// faults) and match it against the store-instruction bit patterns listed below
const u32 insn = is_executing ? 0 : *reinterpret_cast<u32*>(RIP(context));
const bool is_writing =
(insn & 0xbfff0000) == 0x0c000000 || // STR <Wt>, [<Xn>, #<imm>] (store word with immediate offset)
(insn & 0xbfe00000) == 0x0c800000 || // STP <Wt1>, <Wt2>, [<Xn>, #<imm>] (store pair of registers with immediate offset)
(insn & 0xbfdf0000) == 0x0d000000 || // STR <Wt>, [<Xn>, <Xm>] (store word with register offset)
(insn & 0xbfc00000) == 0x0d800000 || // STP <Wt1>, <Wt2>, [<Xn>, <Xm>] (store pair of registers with register offset)
(insn & 0x3f400000) == 0x08000000 || // STR <Vd>, [<Xn>, #<imm>] (store SIMD/FP register with immediate offset)
(insn & 0x3bc00000) == 0x39000000 || // STR <Wt>, [<Xn>, #<imm>] (store word with immediate offset)
(insn & 0x3fc00000) == 0x3d800000 || // STR <Vd>, [<Xn>, <Xm>] (store SIMD/FP register with register offset)
(insn & 0x3bc00000) == 0x38000000 || // STR <Wt>, [<Xn>, <Xm>] (store word with register offset)
(insn & 0x3fe00000) == 0x3c800000 || // STUR <Vd>, [<Xn>, #<imm>] (store unprivileged register with immediate offset)
(insn & 0x3fe00000) == 0x3ca00000 || // STR <Vd>, [<Xn>, #<imm>] (store SIMD/FP register with immediate offset)
(insn & 0x3a400000) == 0x28000000 || // STP <Wt1>, <Wt2>, [<Xn>, #<imm>] (store pair of registers with immediate offset)
(insn & 0xbf000000) == 0xad000000 || // STP <Vd1>, <Vd2>, [<Xn>, #<imm>] (store SIMD/FP 128-bit register pair with immediate offset)
(insn & 0xbf000000) == 0x6d000000; // STP <Dd1>, <Dd2>, [<Xn>, #<imm>] (store SIMD/FP 64-bit register pair with immediate offset)
#endif
#else
#error "signal_handler not implemented"
#endif
// Candidate address relative to vm::g_exec_addr (byte offset halved); unsigned arithmetic,
// so a fault address below g_exec_addr yields a huge value that fails the range check below
const u64 exec64 = (reinterpret_cast<u64>(info->si_addr) - reinterpret_cast<u64>(vm::g_exec_addr)) / 2;
const auto cause = is_executing ? "executing" : is_writing ? "writing" : "reading";
if (auto [addr, ok] = vm::try_get_addr(info->si_addr); ok && !is_executing)
{
// Try to process access violation
if (thread_ctrl::get_current() && handle_access_violation(addr, is_writing, context))
{
return;
}
}
// Second attempt: interpret the fault through the g_exec_addr-relative address
if (exec64 < 0x100000000ull && !is_executing)
{
if (thread_ctrl::get_current() && handle_access_violation(static_cast<u32>(exec64), is_writing, context))
{
return;
}
}
// Unrecoverable: report, flush logs, then trap into the debugger or terminate
std::string msg = fmt::format("Segfault %s location %p at %p.\n", cause, info->si_addr, RIP(context));
append_thread_name(msg);
sys_log.fatal("\n%s", msg);
sys_log.notice("\n%s", dump_useful_thread_info());
logs::listener::sync_all();
if (IsDebuggerPresent())
{
// Convert to SIGTRAP
raise(SIGTRAP);
return;
}
thread_ctrl::emergency_exit(msg);
}
// SIGILL handler: logs the faulting address together with the 16 bytes found
// there (formatted as a big-endian u128), dumps thread info and flushes logs.
// With a debugger attached the signal is converted into SIGTRAP; otherwise
// the thread is terminated through thread_ctrl::emergency_exit().
static void sigill_handler(int /*sig*/, siginfo_t* info, void* /*uct*/) noexcept
{
	std::string msg = fmt::format("Illegal instruction at %p (%s).\n", info->si_addr, *reinterpret_cast<be_t<u128>*>(info->si_addr));
	append_thread_name(msg);

	sys_log.fatal("\n%s", msg);
	sys_log.notice("\n%s", dump_useful_thread_info());
	logs::listener::sync_all();

	if (!IsDebuggerPresent())
	{
		// No debugger: this call does not return
		thread_ctrl::emergency_exit(msg);
	}

	// Convert to SIGTRAP
	raise(SIGTRAP);
}
// Intentionally empty handler installed for SIGPIPE.
// NOTE(review): presumably exists so that SIGPIPE does not trigger the default
// action (process termination) while still interrupting blocking calls - confirm intent.
void sigpipe_signaling_handler(int)
{
}
// Installs the process-wide POSIX signal handlers during static initialisation:
//  - SIGSEGV (and SIGBUS on Apple) -> signal_handler (three-argument, SA_SIGINFO)
//  - SIGILL                        -> sigill_handler (three-argument, SA_SIGINFO)
//  - SIGPIPE                       -> sigpipe_signaling_handler (one-argument)
// Any sigaction() failure is reported on stderr and aborts the process.
const bool s_exception_handler_set = []() -> bool
{
	// Value-initialise so that no member is passed to the kernel uninitialised
	struct ::sigaction sa{};
	sa.sa_flags = SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	sa.sa_sigaction = signal_handler;

	if (::sigaction(SIGSEGV, &sa, NULL) == -1)
	{
		std::fprintf(stderr, "sigaction(SIGSEGV) failed (%d).\n", errno);
		std::abort();
	}

#ifdef __APPLE__
	if (::sigaction(SIGBUS, &sa, NULL) == -1)
	{
		std::fprintf(stderr, "sigaction(SIGBUS) failed (%d).\n", errno);
		std::abort();
	}
#endif

	sa.sa_sigaction = sigill_handler;

	if (::sigaction(SIGILL, &sa, NULL) == -1)
	{
		std::fprintf(stderr, "sigaction(SIGILL) failed (%d).\n", errno);
		std::abort();
	}

	// The SIGPIPE handler has the classic one-argument signature, so SA_SIGINFO must be
	// cleared: with the flag set the system uses sa_sigaction (three arguments) instead
	// of sa_handler, and the two members may share storage.
	sa.sa_flags = 0;
	sa.sa_handler = sigpipe_signaling_handler;

	if (::sigaction(SIGPIPE, &sa, NULL) == -1)
	{
		std::fprintf(stderr, "sigaction(SIGPIPE) failed (%d).\n", errno);
		std::abort();
	}

	std::printf("Debugger: %d\n", +IsDebuggerPresent());
	return true;
}();
#endif
// Replaces the std::terminate handler during static initialisation.
// The handler flushes logs and traps when a debugger is attached, and then
// reports a fatal error in either case.
const bool s_terminate_handler_set = []() -> bool
{
	const auto on_terminate = []()
	{
		if (IsDebuggerPresent())
		{
			// Get the log on disk before breaking into the debugger
			logs::listener::sync_all();
			utils::trap();
		}

		report_fatal_error("RPCS3 has abnormally terminated.");
	};

	std::set_terminate(on_terminate);
	return true;
}();
// Definitions of thread_ctrl's static data members.
// NOTE(review): DECLARE presumably expands to the out-of-class definition with the declared type - confirm against the macro.
// Per-thread pointer to the owning thread_base; assigned in thread_base::initialize()
thread_local DECLARE(thread_ctrl::g_tls_this_thread) = nullptr;
// Per-thread error callback; assigned in thread_base::initialize() and invoked by thread_ctrl::emergency_exit()
thread_local DECLARE(thread_ctrl::g_tls_error_callback) = nullptr;
// Detected host core arrangement; starts undefined and is resolved by thread_ctrl::detect_cpu_layout()
DECLARE(thread_ctrl::g_native_core_layout) { native_core_arrangement::undefined };
// Marks the thread as created and spawns the native thread running entry_point(this).
// The native handle/id is stored in m_thread. Any failure to create the thread
// trips ensure().
void thread_base::start()
{
	// Replace the state bits of m_sync with thread_state::created, keeping the other bits
	m_sync.atomic_op([&](u32& v)
	{
		v &= ~static_cast<u32>(thread_state::mask);
		v |= static_cast<u32>(thread_state::created);
	});

#ifdef _WIN32
	// Create suspended so m_thread is assigned before the new thread starts running
	m_thread = ::_beginthreadex(nullptr, 0, entry_point, this, CREATE_SUSPENDED, nullptr);
	ensure(m_thread);
	ensure(::ResumeThread(reinterpret_cast<HANDLE>(+m_thread)) != static_cast<DWORD>(-1));
#elif defined(__APPLE__)
	// Request an explicit 8 MiB (0x800000) stack on Apple platforms
	pthread_attr_t stack_size_attr;
	pthread_attr_init(&stack_size_attr);
	pthread_attr_setstacksize(&stack_size_attr, 0x800000);
	const int create_result = pthread_create(reinterpret_cast<pthread_t*>(&m_thread.raw()), &stack_size_attr, entry_point, this);

	// The attribute object is no longer needed once the thread has been created (or creation failed);
	// release it before checking the result so it is destroyed on both paths
	pthread_attr_destroy(&stack_size_attr);
	ensure(create_result == 0);
#else
	ensure(pthread_create(reinterpret_cast<pthread_t*>(&m_thread.raw()), nullptr, entry_point, this) == 0);
#endif
}
// Per-thread setup executed on the thread itself: records the native id
// (non-Windows), fills in the thread-local bookkeeping (current thread, error
// callback, log prefix), installs the atomic-wait statistics callback and
// applies the cached thread name to the native thread.
void thread_base::initialize(void (*error_cb)())
{
#ifndef _WIN32
	m_thread.release(reinterpret_cast<u64>(pthread_self()));
#endif

	// Initialize TLS variables
	thread_ctrl::g_tls_this_thread = this;
	thread_ctrl::g_tls_error_callback = error_cb;

	// Log lines emitted from this thread are prefixed with its name
	g_tls_log_prefix = []() -> std::string
	{
		return thread_ctrl::get_name_cached();
	};

	// Wait statistics: attempts == umax adds the elapsed TSC time since stamp0,
	// any other value above 1 adds the extra attempts to the failure counter
	atomic_wait_engine::set_wait_callback([](const void*, u64 attempts, u64 stamp0) -> bool
	{
		if (attempts == umax)
		{
			g_tls_wait_time += utils::get_tsc() - stamp0;
			return true;
		}

		if (attempts > 1)
		{
			g_tls_wait_fail += attempts - 1;
		}

		return true;
	});

	set_name(thread_ctrl::get_name_cached());
}
// Applies `name` to the calling native thread using whatever mechanism the platform offers.
// `name` is taken by value because some branches truncate it in place.
void thread_base::set_name(std::string name)
{
#ifdef _WIN32
// SetThreadDescriptionImport is checked for null before use, i.e. it may be unavailable
if (SetThreadDescriptionImport)
{
SetThreadDescriptionImport(GetCurrentThread(), utf8_to_wchar(name).c_str());
}
#endif
#ifdef _MSC_VER
// Payload layout for the 0x406D1388 "set thread name" debugger exception raised below
struct THREADNAME_INFO
{
DWORD dwType;
LPCSTR szName;
DWORD dwThreadID;
DWORD dwFlags;
};
// Set thread name for VS debugger
// The __try/__except block lives in an immediately-invoked, never-inlined lambda
if (IsDebuggerPresent()) [&]() NEVER_INLINE
{
THREADNAME_INFO info;
info.dwType = 0x1000;
info.szName = name.c_str();
info.dwThreadID = -1;
info.dwFlags = 0;
__try
{
RaiseException(0x406D1388, 0, sizeof(info) / sizeof(ULONG_PTR), (ULONG_PTR*)&info);
}
__except (EXCEPTION_EXECUTE_HANDLER)
{
// Swallow the exception if nothing consumed it
}
}();
#endif
#if defined(__APPLE__)
// Truncate to at most 15 characters before handing the name to the OS
name.resize(std::min<usz>(15, name.size()));
pthread_setname_np(name.c_str());
#elif defined(__DragonFly__) || defined(__FreeBSD__) || defined(__OpenBSD__)
pthread_set_name_np(pthread_self(), name.c_str());
#elif defined(__NetBSD__)
pthread_setname_np(pthread_self(), "%s", name.data());
#elif !defined(_WIN32)
// Remaining non-Windows platforms: same 15-character truncation as on Apple
name.resize(std::min<usz>(15, name.size()));
pthread_setname_np(pthread_self(), name.c_str());
#endif
}
// First stage of thread teardown, executed on the finishing thread.
// Reports pending errors, gathers per-thread resource statistics (as deltas against
// thread-local running totals), logs them, removes the atomic-wait callback, stores
// `result_state` into the low bits of m_sync and wakes every waiter.
// Returns the value of m_thread captured before the result state was published.
u64 thread_base::finalize(thread_state result_state) noexcept
{
// Report pending errors
error_code::error_report(0, nullptr, nullptr, nullptr, nullptr);
#ifdef _WIN32
// Running totals already reported by earlier finalize() calls on this native thread
static thread_local ULONG64 tls_cycles{};
static thread_local u64 tls_time{};
ULONG64 cycles{};
QueryThreadCycleTime(GetCurrentThread(), &cycles);
cycles -= tls_cycles;
tls_cycles += cycles;
FILETIME ctime, etime, ktime, utime;
GetThreadTimes(GetCurrentThread(), &ctime, &etime, &ktime, &utime);
// Kernel + user time; the FILETIME sum is multiplied by 100 to obtain nanoseconds
const u64 time = ((ktime.dwLowDateTime | static_cast<u64>(ktime.dwHighDateTime) << 32) + (utime.dwLowDateTime | static_cast<u64>(utime.dwHighDateTime) << 32)) * 100ull - tls_time;
tls_time += time;
// Fault and context-switch counters are not collected on this platform
const u64 fsoft = 0;
const u64 fhard = 0;
const u64 ctxvol = 0;
const u64 ctxinv = 0;
#elif defined(RUSAGE_THREAD)
// Running totals already reported by earlier finalize() calls on this native thread
static thread_local u64 tls_time{}, tls_fsoft{}, tls_fhard{}, tls_ctxvol{}, tls_ctxinv{};
const u64 cycles = 0; // Not supported
struct ::rusage stats{};
::getrusage(RUSAGE_THREAD, &stats);
// User + system time in nanoseconds, minus what was already reported
const u64 time = (stats.ru_utime.tv_sec + stats.ru_stime.tv_sec) * 1000000000ull + (stats.ru_utime.tv_usec + stats.ru_stime.tv_usec) * 1000ull - tls_time;
tls_time += time;
const u64 fsoft = stats.ru_minflt - tls_fsoft;
tls_fsoft += fsoft;
const u64 fhard = stats.ru_majflt - tls_fhard;
tls_fhard += fhard;
const u64 ctxvol = stats.ru_nvcsw - tls_ctxvol;
tls_ctxvol += ctxvol;
const u64 ctxinv = stats.ru_nivcsw - tls_ctxinv;
tls_ctxinv += ctxinv;
#else
// No per-thread statistics source on this platform
const u64 cycles = 0;
const u64 time = 0;
const u64 fsoft = 0;
const u64 fhard = 0;
const u64 ctxvol = 0;
const u64 ctxinv = 0;
#endif
g_tls_log_prefix = []
{
return thread_ctrl::get_name_cached();
};
// CPU threads and threads that recorded any fault log at notice level, everything else at trace
const bool is_cpu_thread = !!cpu_thread::get_current();
auto& thread_log = (is_cpu_thread || g_tls_fault_all ? sig_log.notice : sig_log.trace);
thread_log("Thread time: %fs (%fGc); Faults: %u [rsx:%u, spu:%u]; [soft:%u hard:%u]; Switches:[vol:%u unvol:%u]; Wait:[%.3fs, spur:%u]",
time / 1000000000.,
cycles / 1000000000.,
g_tls_fault_all,
g_tls_fault_rsx,
g_tls_fault_spu,
fsoft, fhard, ctxvol, ctxinv,
g_tls_wait_time / (utils::get_tsc_freq() / 1.),
g_tls_wait_fail);
atomic_wait_engine::set_wait_callback(nullptr);
// Avoid race with the destructor
const u64 _self = m_thread;
// Set result state (errored or finalized)
m_sync.fetch_op([&](u32& v)
{
// -4 as u32 is 0xFFFFFFFC: clear the two low bits before OR-ing in the new state
v &= -4;
v |= static_cast<u32>(result_state);
});
// Signal waiting threads
m_sync.notify_all();
return _self;
}
// Second stage of thread teardown: resets every thread-local statistic and
// the log prefix. When `_self` is umax only the current-thread pointer is
// cleared and null is returned to the caller; for any other value the native
// thread is ended right here.
thread_base::native_entry thread_base::finalize(u64 _self) noexcept
{
	// Reset per-thread counters and flags
	g_tls_fault_all = 0;
	g_tls_fault_rsx = 0;
	g_tls_fault_spu = 0;
	g_tls_wait_time = 0;
	g_tls_wait_fail = 0;
	g_tls_access_violation_recovered = false;

	g_tls_log_prefix = []() -> std::string { return {}; };

	if (_self != umax)
	{
		// Terminate the native thread
#ifdef _WIN32
		_endthreadex(0);
#else
		pthread_exit(nullptr);
#endif
		return nullptr;
	}

	// Keep the native thread alive, but detach it from its thread_base
	thread_ctrl::g_tls_this_thread = nullptr;
	return nullptr;
}
// Generates a native thread entry stub around `entry`.
// On x86-64 the stub calls `entry`, passes its result to finalize(u64) and then either
// returns (finalize returned null) or tail-jumps to the address finalize returned,
// passing thread_ctrl::get_current() as the argument.
// On other architectures no instructions are emitted here.
// NOTE(review): args[0] is presumably the first-argument register of the native calling convention - confirm in build_function_asm.
thread_base::native_entry thread_base::make_trampoline(u64(*entry)(thread_base* _base))
{
return build_function_asm<native_entry>("", [&](native_asm& c, auto& args)
{
using namespace asmjit;
#if defined(ARCH_X64)
Label _ret = c.newLabel();
// Frame: 8 bytes pushed + 0x20 reserved = 0x28, released by the `add rsp, 0x28` on both exits
c.push(x86::rbp);
c.sub(x86::rsp, 0x20);
// Call entry point (TODO: support for detached threads missing?)
c.call(entry);
// Call finalize, return if zero
c.mov(args[0], x86::rax);
c.call(static_cast<native_entry(*)(u64)>(&finalize));
c.test(x86::rax, x86::rax);
c.jz(_ret);
// Otherwise, call it as an entry point with first arg = new current thread
// rbp holds the target address across the get_current call
c.mov(x86::rbp, x86::rax);
c.call(thread_ctrl::get_current);
c.mov(args[0], x86::rax);
c.add(x86::rsp, 0x28);
c.jmp(x86::rbp);
c.bind(_ret);
c.add(x86::rsp, 0x28);
c.ret();
#else
UNUSED(c);
UNUSED(args);
UNUSED(entry);
#endif
});
}
// Returns the state of the calling thread (low two bits of m_sync) after
// draining its pending task queue. The drain is skipped when state() is
// re-entered from within a task that is already being executed by it.
thread_state thread_ctrl::state()
{
	// Guard for recursive calls (TODO: may be more effective to reuse one of m_sync bits)
	static thread_local bool s_tls_exec = false;

	const auto self = g_tls_this_thread;

	// Drain execution queue
	if (!s_tls_exec)
	{
		s_tls_exec = true;
		self->exec();
		s_tls_exec = false;
	}

	return static_cast<thread_state>(self->m_sync & 3);
}
// Blocks the calling thread for up to `usec` microseconds (0 returns immediately).
// alert == true : the wait also ends when m_sync or the task queue changes; returns at once
//                 if a notification bit was pending or tasks are queued.
// alert == false: plain timed sleep, capped at 50000 us; on Linux it is served by a
//                 per-thread timerfd when one could be created.
void thread_ctrl::wait_for(u64 usec, [[maybe_unused]] bool alert /* true */)
{
if (!usec)
{
return;
}
auto _this = g_tls_this_thread;
// Non-alertable waits are limited to 50 ms
if (!alert && usec > 50000)
{
usec = 50000;
}
#ifdef __linux__
// Thread-local RAII wrapper around a timerfd; created on first use, closed at thread exit
static thread_local struct linux_timer_handle_t
{
// Allocate timer only if needed (i.e. someone calls wait_for with alert and short period)
const int m_timer = timerfd_create(CLOCK_MONOTONIC, 0);
linux_timer_handle_t() noexcept
{
if (m_timer == -1)
{
sig_log.error("Linux timer allocation failed, using the fallback instead.");
}
}
operator int() const
{
return m_timer;
}
~linux_timer_handle_t()
{
if (m_timer != -1)
{
close(m_timer);
}
}
} fd_timer;
if (!alert && fd_timer != -1)
{
// Arm a one-shot timer (zero interval) and block in read() until it expires
struct itimerspec timeout;
u64 missed;
timeout.it_value.tv_nsec = usec % 1'000'000 * 1'000ull;
timeout.it_value.tv_sec = usec / 1'000'000;
timeout.it_interval.tv_sec = 0;
timeout.it_interval.tv_nsec = 0;
timerfd_settime(fd_timer, 0, &timeout, NULL);
if (read(fd_timer, &missed, sizeof(missed)) != sizeof(missed))
sig_log.error("timerfd: read() failed");
return;
}
#endif
if (alert)
{
// Consume a pending notification or bail out if there is queued work.
// NOTE(review): bit_test_reset(2) presumably addresses the bit set by thread_base::notify() (m_sync |= 4) - confirm.
if (_this->m_sync.bit_test_reset(2) || _this->m_taskq)
{
return;
}
}
// Wait for signal and thread state abort
atomic_wait::list<2> list{};
if (alert)
{
// Watch both the sync word and the second 32-bit word of the task queue head
list.set<0>(_this->m_sync, 0);
list.set<1>(utils::bless<atomic_t<u32>>(&_this->m_taskq)[1], 0);
}
else
{
// Nothing to be woken by: wait on the dummy variable until the timeout elapses
list.set<0>(_this->m_dummy, 0);
}
// Timeout is usec * 1000, saturated to the maximum u64 when the multiplication would overflow
list.wait(atomic_wait_timeout{usec <= 0xffff'ffff'ffff'ffff / 1000 ? usec * 1000 : 0xffff'ffff'ffff'ffff});
}
// Waits until the time point `*wait_time + add_time` (system time, same unit as get_system_time()).
//
// wait_time              - in/out deadline; advanced by add_time (saturating) and possibly
//                          re-aligned or raised as described below.
// add_time               - period added to the deadline; may be 0.
// min_wait               - when non-zero, the wait lasts at least this long from now.
// update_to_current_time - when the deadline has already passed, move it to the next
//                          multiple of add_time after the current time instead of returning.
//
// If the deadline has passed, update_to_current_time is false and min_wait is 0,
// the function returns without waiting.
void thread_ctrl::wait_until(u64* wait_time, u64 add_time, u64 min_wait, bool update_to_current_time)
{
	*wait_time = utils::add_saturate<u64>(*wait_time, add_time);

	// TODO: Implement proper support for "waiting until" inside atomic wait engine
	const u64 current_time = get_system_time();

	if (current_time > *wait_time)
	{
		if (update_to_current_time)
		{
			// Skip the periods that were missed. With add_time == 0 there is no period to
			// align to (and the modulo would divide by zero), so the deadline becomes "now".
			const u64 overshoot = current_time - *wait_time;
			*wait_time = add_time ? current_time + (add_time - overshoot % add_time) : current_time;
		}
		else if (!min_wait)
		{
			return;
		}
	}

	if (min_wait)
	{
		// Enforce the minimum wait relative to the current time
		*wait_time = std::max<u64>(*wait_time, utils::add_saturate<u64>(current_time, min_wait));
	}

	// At this point *wait_time >= current_time, so the subtraction cannot wrap
	wait_for(*wait_time - current_time);
}
// Sleeps for `usec` microseconds with better precision than wait_for().
// Zero returns immediately; values above 50000 throw. On Linux this simply
// forwards to the non-alertable wait_for(). Elsewhere the remaining time is
// consumed in steps: coarse sleeps for whole quanta, yields for medium
// remainders and busy-waiting for the final stretch.
void thread_ctrl::wait_for_accurate(u64 usec)
{
	if (!usec)
	{
		return;
	}

	if (usec > 50000)
	{
		fmt::throw_exception("thread_ctrl::wait_for_accurate: unsupported amount");
	}

#ifdef __linux__
	return wait_for(usec, false);
#else
	using namespace std::chrono_literals;

	// Host scheduler quantum for windows (worst case)
	constexpr u64 host_min_quantum = 500;

	const auto deadline = std::chrono::steady_clock::now() + 1us * usec;

	for (;;)
	{
		if (usec >= host_min_quantum)
		{
			// Wait on multiple of min quantum for large durations to avoid overloading low thread cpus
			wait_for(usec - (usec % host_min_quantum), false);
		}
		else if (usec >= host_min_quantum / 2)
		{
			// TODO: Determine best value for yield delay
			std::this_thread::yield();
		}
		else
		{
			busy_wait(100);
		}

		const auto now = std::chrono::steady_clock::now();

		if (now >= deadline)
		{
			return;
		}

		// Recompute what is left and go around again
		usec = std::chrono::duration_cast<std::chrono::microseconds>(deadline - now).count();
	}
#endif
}
// Returns a copy of the calling thread's name, or an empty string when the
// caller has no thread_base. A thread-local shared pointer caches the name
// object and is refreshed only when it no longer matches m_tname.
std::string thread_ctrl::get_name_cached()
{
	const auto self = thread_ctrl::g_tls_this_thread;

	if (!self)
	{
		return {};
	}

	static thread_local shared_ptr<std::string> name_cache;

	// Slow path: the name object was replaced since the last call
	if (!self->m_tname.is_equal(name_cache)) [[unlikely]]
	{
		self->m_tname.peek_op([&](const shared_ptr<std::string>& current)
		{
			if (current != name_cache)
			{
				name_cache = current;
			}
		});
	}

	return *name_cache;
}
// Stores the native entry point and the initial thread name.
// No native thread is created here; see start().
thread_base::thread_base(native_entry entry, std::string name)
: entry_point(entry)
, m_tname(make_single_value(std::move(name)))
{
}
// Runs whatever is still queued (so waiters on those tasks get signalled),
// then blocks until the native thread has ended and releases its handle.
thread_base::~thread_base()
{
	// Cleanup abandoned tasks: initialize default results and signal
	exec();

	// Cleanup
#ifdef _WIN32
	const auto native_handle = reinterpret_cast<HANDLE>(m_thread.load());
	WaitForSingleObject(native_handle, INFINITE);
	CloseHandle(native_handle);
#else
	const auto native_thread = reinterpret_cast<pthread_t>(m_thread.load());
	pthread_join(native_thread, nullptr);
#endif
}
bool thread_base::join(bool dtor) const
{
// Check if already finished
if (m_sync & 2)
{
return (m_sync & 3) == 3;
}
// Hacked for too sleepy threads (1ms) TODO: make sure it's unneeded and remove
const auto timeout = dtor && Emu.IsStopped() ? atomic_wait_timeout{1'000'000} : atomic_wait_timeout::inf;
auto stamp0 = utils::get_tsc();
for (u64 i = 0; (m_sync & 3) <= 1; i++)
{
m_sync.wait(m_sync & ~2, timeout);
if (m_sync & 2)
{
break;
}
if (i >= 16 && !(i & (i - 1)) && timeout != atomic_wait_timeout::inf)
{
sig_log.error(u8"Thread [%s] is too sleepy. Waiting for it %.3fus already!", *m_tname.load(), (utils::get_tsc() - stamp0) / (utils::get_tsc_freq() / 1000000.));
}
}
return (m_sync & 3) == 3;
}
// Raises the notification flag (value 4) in m_sync and wakes everything waiting on m_sync.
void thread_base::notify()
{
// Set notification
m_sync |= 4;
m_sync.notify_all();
}
// Returns a native identifier for the thread: on Windows the thread id obtained
// from the stored handle, elsewhere the raw value stored in m_thread.
u64 thread_base::get_native_id() const
{
#ifdef _WIN32
return GetThreadId(reinterpret_cast<HANDLE>(m_thread.load()));
#else
return m_thread.load();
#endif
}
// Returns the CPU usage of the thread accumulated since the previous call.
// Windows reports thread cycles; the Apple and generic POSIX branches compute CPU time in nanoseconds.
// The first successful call (previous stored value 0) reports 0. If the platform query
// fails, the last stored value of m_cycles is returned instead.
//
// Structure note: each preprocessor branch opens the same `if (...) {` block; the code
// after #endif is the shared body of that block, followed by the shared `else`.
u64 thread_base::get_cycles()
{
u64 cycles = 0;
const u64 handle = m_thread;
#ifdef _WIN32
if (QueryThreadCycleTime(reinterpret_cast<HANDLE>(handle), &cycles))
{
#elif __APPLE__
mach_port_name_t port = pthread_mach_thread_np(reinterpret_cast<pthread_t>(handle));
mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT;
thread_basic_info_data_t info;
kern_return_t ret = thread_info(port, THREAD_BASIC_INFO, reinterpret_cast<thread_info_t>(&info), &count);
if (ret == KERN_SUCCESS)
{
// user + system time converted to nanoseconds
cycles = static_cast<u64>(info.user_time.seconds + info.system_time.seconds) * 1'000'000'000 +
static_cast<u64>(info.user_time.microseconds + info.system_time.microseconds) * 1'000;
#else
clockid_t _clock;
struct timespec thread_time;
if (!pthread_getcpuclockid(reinterpret_cast<pthread_t>(handle), &_clock) && !clock_gettime(_clock, &thread_time))
{
// Thread CPU clock converted to nanoseconds
cycles = static_cast<u64>(thread_time.tv_sec) * 1'000'000'000 + thread_time.tv_nsec;
#endif
// Store the new total and report the difference to the previous one
if (const u64 old_cycles = m_cycles.exchange(cycles))
{
return cycles - old_cycles;
}
// Report 0 the first time this function is called
return 0;
}
else
{
// Query failed: fall back to the stored total
return m_cycles;
}
}
// Enqueues `task` at the head of the thread's task queue and wakes one waiter.
void thread_base::push(shared_ptr<thread_future> task)
{
	// Bind the task's link field before `task` is moved from in the call below
	auto& next_link = task->next;

	m_taskq.push_head(next_link, std::move(task));
	m_taskq.notify_one();
}
// Drains the task queue.
// Each pass detaches the whole list from m_taskq, walks it along `next` while recording
// `prev` back-links, then runs the tasks walking back along `prev` (i.e. starting from
// the far end of the detached list). Tasks receive `this` while the low two bits of
// m_sync are 0 and nullptr otherwise. Repeats until m_taskq is observed empty.
// NOTE(review): since push() inserts at the head, the far end is presumably the oldest task, giving FIFO execution - confirm push_head semantics.
void thread_base::exec()
{
// Fast path: nothing queued
if (!m_taskq) [[likely]]
{
return;
}
// Take ownership of the entire pending list at once
while (shared_ptr<thread_future> head = m_taskq.exchange(null_ptr))
{
// TODO: check if adapting reverse algorithm is feasible here
thread_future* prev_head{head.get()};
// Forward walk: link each node back to its predecessor; ends with prev_head at the last node
for (thread_future* prev{};;)
{
utils::prefetch_exec(prev_head->exec.load());
if (auto next = prev_head->next.get())
{
prev = std::exchange(prev_head, next);
prev_head->prev = prev;
}
else
{
break;
}
}
// Backward walk: execute from the last node towards the detached head
for (auto ptr = prev_head; ptr; ptr = ptr->prev)
{
if (auto task = ptr->exec.load()) [[likely]]
{
// Execute or discard (if aborting)
if ((m_sync & 3) == 0) [[likely]]
{
task(this, ptr);
}
else
{
task(nullptr, ptr);
}
// Notify waiters
ptr->done.release(1);
ptr->done.notify_all();
}
if (ptr->next)
{
// Partial cleanup
// Drops this node's reference to the following (already processed) node
ptr->next.reset();
}
}
// Stop unless more tasks were pushed while this batch was running
if (!m_taskq) [[likely]]
{
return;
}
}
}
[[noreturn]] void thread_ctrl::emergency_exit(std::string_view reason)
{
if (const std::string info = dump_useful_thread_info(); !info.empty())
{
sys_log.notice("\n%s", info);
}
std::string reason_buf;
if (auto ppu = cpu_thread::get_current<ppu_thread>())
{
if (auto func = ppu->current_function)
{
fmt::append(reason_buf, "%s (PPU: %s)", reason, func);
}
}
if (!reason_buf.empty())
{
reason = reason_buf;
}
sig_log.fatal("Thread terminated due to fatal error: %s", reason);
logs::listener::sync_all();
if (IsDebuggerPresent())
{
// Prevent repeatedly halting the debugger in case multiple threads crashed at once
static atomic_t<u64> s_last_break = 0;
const u64 current_break = get_system_time() & -2;
if (s_last_break.fetch_op([current_break](u64& v)
{
if (current_break >= (v & -2) && current_break - (v & -2) >= 20'000'000)
{
v = current_break;
return true;
}
// Let's allow a single more thread to halt the debugger so the programmer sees the pattern
if (!(v & 1))
{
v |= 1;
return true;
}
return false;
}).second)
{
utils::trap();
}
}
if (const auto _this = g_tls_this_thread)
{
g_tls_error_callback();
u64 _self = _this->finalize(thread_state::errored);
if (_self == umax)
{
// Unused, detached thread support remnant
delete _this;
}
thread_base::finalize(umax);
#ifdef _WIN32
_endthreadex(0);
#else
pthread_exit(nullptr);
#endif
}
report_fatal_error(reason);
}
// One-time detection of the host core arrangement, stored in g_native_core_layout.
// The first caller switches the value from undefined to generic; it is then
// upgraded to amd_ccx when the CPU brand contains "Ryzen", or to intel_ht when
// the brand contains "Intel" and (Windows only) a core reporting SMT is found.
void thread_ctrl::detect_cpu_layout()
{
	// Only the caller that wins the undefined -> generic transition continues
	if (!g_native_core_layout.compare_and_swap_test(native_core_arrangement::undefined, native_core_arrangement::generic))
	{
		return;
	}

	const auto brand = utils::get_cpu_brand();

	if (brand.find("Ryzen") != umax)
	{
		g_native_core_layout.store(native_core_arrangement::amd_ccx);
		return;
	}

	const bool is_intel = brand.find("Intel") != umax;

	if (!is_intel)
	{
		return;
	}

#ifdef _WIN32
	const LOGICAL_PROCESSOR_RELATIONSHIP relationship = LOGICAL_PROCESSOR_RELATIONSHIP::RelationProcessorCore;
	DWORD buffer_size = 0;

	// If buffer size is set to 0 bytes, it will be overwritten with the required size
	if (GetLogicalProcessorInformationEx(relationship, nullptr, &buffer_size))
	{
		sig_log.error("GetLogicalProcessorInformationEx returned 0 bytes");
		return;
	}

	const DWORD last_error = GetLastError();

	if (last_error != ERROR_INSUFFICIENT_BUFFER)
	{
		sig_log.error("Unexpected windows error code when detecting CPU layout: %u", last_error);
		return;
	}

	std::vector<u8> buffer(buffer_size);

	if (!GetLogicalProcessorInformationEx(relationship,
		reinterpret_cast<SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *>(buffer.data()), &buffer_size))
	{
		sig_log.error("GetLogicalProcessorInformationEx failed (size=%u, error=%s)", buffer_size, fmt::win_error{GetLastError(), nullptr});
		return;
	}

	// Iterate through the buffer until a core with hyperthreading is found
	const uptr begin = reinterpret_cast<uptr>(buffer.data());
	const uptr end = begin + buffer_size;

	for (uptr pos = begin; pos < end;)
	{
		const auto info = reinterpret_cast<SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *>(pos);

		if (info->Relationship == relationship && info->Processor.Flags == LTP_PC_SMT)
		{
			g_native_core_layout.store(native_core_arrangement::intel_ht);
			break;
		}

		// Records are variable-sized: advance by the size stored in the record itself
		pos += info->Size;
	}
#else
	sig_log.todo("Thread scheduler is not implemented for Intel and this OS");
#endif
}
// Returns the CPU affinity mask (one bit per logical processor) suggested for threads of
// the given class.
//
// detect_cpu_layout() is invoked first, then the result is selected by g_native_core_layout:
// - generic (and any unlisted value): the process affinity mask, unchanged.
// - amd_ccx: hard-coded per-class masks chosen by CPU family, model and thread count;
//   thread_class::general (and any unlisted class) still receives the process affinity mask.
// - intel_ht: every second logical processor when the 'alt' scheduler mode is selected and
//   at least 12 threads are present, otherwise the process affinity mask.
//
// If utils::get_thread_count() yields 0 the function returns -1, i.e. all bits set once
// converted to u64.
u64 thread_ctrl::get_affinity_mask(thread_class group)
{
detect_cpu_layout();
if (const auto thread_count = utils::get_thread_count())
{
const u64 all_cores_mask = process_affinity_mask;
switch (g_native_core_layout)
{
default:
case native_core_arrangement::generic:
{
return all_cores_mask;
}
case native_core_arrangement::amd_ccx:
{
if (thread_count <= 8)
{
// Single CCX or not enough threads, do nothing
return all_cores_mask;
}
u64 spu_mask, ppu_mask, rsx_mask;
spu_mask = ppu_mask = rsx_mask = all_cores_mask; // Fallback, in case someone is messing with core config
// NOTE(review): system_id is not referenced anywhere in this function - confirm whether it can be dropped
const auto system_id = utils::get_cpu_brand();
const auto family_id = utils::get_cpu_family();
const auto model_id = utils::get_cpu_model();
// Unlisted families and thread counts keep the fallback masks assigned above
switch (family_id)
{
case 0x17: // Zen, Zen+, Zen2
case 0x18: // Dhyana core (Zen)
{
if (model_id > 0x30)
{
// Zen2 (models 49, 96, 113, 144)
// Much improved inter-CCX latency
switch (thread_count)
{
case 128:
case 64:
case 48:
case 32:
// TR 3000 series, or R9 3950X, Assign threads 9-32
ppu_mask = 0b11111111000000000000000000000000;
spu_mask = 0b00000000111111110000000000000000;
rsx_mask = 0b00000000000000001111111100000000;
break;
case 24:
// 3900X, Assign threads 7-24
ppu_mask = 0b111111000000000000000000;
spu_mask = 0b000000111111000000000000;
rsx_mask = 0b000000000000111111000000;
break;
case 16:
// 3700, 3800 family, Assign threads 1-16
ppu_mask = 0b0000000011110000;
spu_mask = 0b1111111100000000;
rsx_mask = 0b0000000000001111;
break;
case 12:
// 3600 family, Assign threads 1-12
ppu_mask = 0b000000111000;
spu_mask = 0b111111000000;
rsx_mask = 0b000000000111;
break;
default:
break;
}
}
else
{
// Zen, Zen+ (models 1, 8(+), 17, 24(+), 32)
// In this branch PPU and SPU threads share one mask, except for the 16-thread 'alt' scheduler setup
switch (thread_count)
{
case 64:
// TR 2990WX, Assign threads 17-32
ppu_mask = 0b00000000111111110000000000000000;
spu_mask = ppu_mask;
rsx_mask = 0b11111111000000000000000000000000;
break;
case 48:
// TR 2970WX, Assign threads 9-24
ppu_mask = 0b000000111111000000000000;
spu_mask = ppu_mask;
rsx_mask = 0b111111000000000000000000;
break;
case 32:
// TR 2950X, TR 1950X, Assign threads 17-32
ppu_mask = 0b00000000111111110000000000000000;
spu_mask = ppu_mask;
rsx_mask = 0b11111111000000000000000000000000;
break;
case 24:
// TR 1920X, 2920X, Assign threads 13-24
ppu_mask = 0b000000111111000000000000;
spu_mask = ppu_mask;
rsx_mask = 0b111111000000000000000000;
break;
case 16:
// 1700, 1800, 2700, TR 1900X family
if (g_cfg.core.thread_scheduler == thread_scheduler_mode::alt)
{
ppu_mask = 0b0010000010000000;
spu_mask = 0b0000101010101010;
rsx_mask = 0b1000000000000000;
}
else // if (g_cfg.core.thread_scheduler == thread_scheduler_mode::old)
{
ppu_mask = 0b1111111100000000;
spu_mask = ppu_mask;
rsx_mask = 0b0000000000111100;
}
break;
case 12:
// 1600, 2600 family, Assign threads 3-12
ppu_mask = 0b111111000000;
spu_mask = ppu_mask;
rsx_mask = 0b000000111100;
break;
default:
break;
}
}
break;
}
case 0x19: // Zen3
{
// Single-CCX architecture, just disable SMT if wide enough
// CCX now holds up to 16 threads
// Lack of hw availability makes testing difficult
switch (thread_count)
{
case 24:
// 5900X, Use same scheduler as 3900X
// Unverified on windows, may be worse than just disabling SMT and scheduler
ppu_mask = 0b111111000000000000000000;
spu_mask = 0b000000111111000000000000;
rsx_mask = 0b000000000000111111000000;
break;
case 16:
// 5800X
if (g_cfg.core.thread_scheduler == thread_scheduler_mode::alt)
{
ppu_mask = 0b0000000011110000;
spu_mask = 0b1111111100000000;
rsx_mask = 0b0000000000001111;
}
else // if (g_cfg.core.thread_scheduler == thread_scheduler_mode::old)
{
// Verified by more than one windows user on 16-thread CPU
// The literal selects every second logical processor (odd bit positions) among the first 32
ppu_mask = spu_mask = rsx_mask = (0b10101010101010101010101010101010 & all_cores_mask);
}
break;
case 12:
// 5600X
if (g_cfg.core.thread_scheduler == thread_scheduler_mode::alt)
{
ppu_mask = 0b000000001100;
spu_mask = 0b111111110000;
rsx_mask = 0b000000000011;
}
else // if (g_cfg.core.thread_scheduler == thread_scheduler_mode::old)
{
ppu_mask = spu_mask = rsx_mask = all_cores_mask;
}
break;
default:
if (thread_count > 24)
{
// NOTE(review): the literal is only 32 bits wide, so bits 32-63 of all_cores_mask are cleared here - confirm this is intended for CPUs with more than 32 threads
ppu_mask = spu_mask = rsx_mask = (0b10101010101010101010101010101010 & all_cores_mask);
}
break;
}
break;
}
default:
{
break;
}
}
// Hand out the mask that matches the requested thread class
switch (group)
{
default:
case thread_class::general:
return all_cores_mask;
case thread_class::rsx:
return rsx_mask;
case thread_class::ppu:
return ppu_mask;
case thread_class::spu:
return spu_mask;
}
}
case native_core_arrangement::intel_ht:
{
// NOTE(review): same 32-bit-wide literal as in the Zen3 branch, logical processors 32-63 are excluded - confirm intent
if (thread_count >= 12 && g_cfg.core.thread_scheduler == thread_scheduler_mode::alt)
return (0b10101010101010101010101010101010 & all_cores_mask); // Potentially improves performance by mimicking HT off
return all_cores_mask;
}
}
}
// Reached when the thread count is reported as 0: -1 converts to a u64 with all bits set
return -1;
}
void thread_ctrl::set_native_priority(int priority)
{
#ifdef _WIN32
HANDLE _this_thread = GetCurrentThread();
INT native_priority = THREAD_PRIORITY_NORMAL;
if (priority > 0)
native_priority = THREAD_PRIORITY_ABOVE_NORMAL;
if (priority < 0)
native_priority = THREAD_PRIORITY_BELOW_NORMAL;
if (!SetThreadPriority(_this_thread, native_priority))
{
sig_log.error("SetThreadPriority() failed: %s", fmt::win_error{GetLastError(), nullptr});
}
#else
int policy;
struct sched_param param;
pthread_getschedparam(pthread_self(), &policy, &param);
if (priority > 0)
param.sched_priority = sched_get_priority_max(policy);
if (priority < 0)
param.sched_priority = sched_get_priority_min(policy);
if (int err = pthread_setschedparam(pthread_self(), policy, &param))
{
sig_log.error("pthread_setschedparam() failed: %d", err);
}
#endif
}
// Returns the affinity mask of the process.
// The value is computed by the first call and the cached result is returned afterwards.
// Yields 0 on Windows if the mask cannot be queried.
u64 thread_ctrl::get_process_affinity_mask()
{
	static const u64 cached_mask = []() -> u64
	{
#ifdef _WIN32
		DWORD_PTR process_mask, system_mask;

		if (GetProcessAffinityMask(GetCurrentProcess(), &process_mask, &system_mask))
		{
			return process_mask;
		}

		sig_log.error("Failed to get process affinity mask.");
		return 0;
#else
		// Assume it's called from the main thread (this is a bit shaky)
		return thread_ctrl::get_thread_affinity_mask();
#endif
	}();

	return cached_mask;
}
// Definition of thread_ctrl::process_affinity_mask, initialized with the value returned by get_process_affinity_mask().
// NOTE(review): DECLARE is presumably the project's macro for out-of-class static member definitions - confirm.
DECLARE(thread_ctrl::process_affinity_mask) = get_process_affinity_mask();
// Applies an affinity mask (one bit per logical processor) to the calling thread.
// A mask of 0 resets the affinity: on Windows, Linux, DragonFly and FreeBSD the process
// affinity mask is applied instead; on macOS the affinity policy is set with a count of 0.
// Failures are logged where the platform API reports them.
void thread_ctrl::set_thread_affinity_mask(u64 mask)
{
	sig_log.trace("set_thread_affinity_mask called with mask=0x%x", mask);

#ifdef _WIN32
	HANDLE current_thread = GetCurrentThread();

	if (!SetThreadAffinityMask(current_thread, mask ? mask : process_affinity_mask))
	{
		sig_log.error("Failed to set thread affinity 0x%x: error: %s", mask, fmt::win_error{GetLastError(), nullptr});
	}
#elif __APPLE__
	// Supports only one core
	thread_affinity_policy_data_t affinity_policy = { static_cast<integer_t>(std::countr_zero(mask)) };
	thread_port_t port = pthread_mach_thread_np(pthread_self());
	thread_policy_set(port, THREAD_AFFINITY_POLICY, reinterpret_cast<thread_policy_t>(&affinity_policy), mask ? 1 : 0);
#elif defined(__linux__) || defined(__DragonFly__) || defined(__FreeBSD__)
	if (mask == 0)
	{
		// Reset affinity mask
		mask = process_affinity_mask;
	}

	cpu_set_t cpus;
	CPU_ZERO(&cpus);

	// 'remaining' always equals (mask >> cpu); the walk stops once no higher bits are left
	u64 remaining = mask;

	for (u32 cpu = 0; cpu < 64u; ++cpu, remaining >>= 1)
	{
		if (remaining & 1)
		{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
			CPU_SET(cpu, &cpus);
#pragma GCC diagnostic pop
		}

		if (remaining <= 1)
		{
			break;
		}
	}

	if (int err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpus))
	{
		sig_log.error("Failed to set thread affinity 0x%x: error %d.", mask, err);
	}
#endif
}
// Returns the affinity mask of the calling thread (one bit per logical processor).
// - Windows: the mask is obtained as the "previous mask" result of SetThreadAffinityMask(),
//   so the thread is temporarily switched to the process mask and then restored.
// - Linux, DragonFly, FreeBSD: the first 64 CPUs of the thread's cpu_set_t are folded into a u64.
// - Other platforms: returns -1 (all bits set).
// Returns 0 after logging an error when the mask cannot be determined.
u64 thread_ctrl::get_thread_affinity_mask()
{
#ifdef _WIN32
	const u64 process_mask = process_affinity_mask;
	DWORD_PTR previous = SetThreadAffinityMask(GetCurrentThread(), process_mask);

	if (!previous)
	{
		sig_log.error("Failed to get thread affinity mask.");
		return 0;
	}

	if (process_mask != previous)
	{
		// Put back the mask the thread had before it was queried
		SetThreadAffinityMask(GetCurrentThread(), previous);
	}

	return previous;
#elif defined(__linux__) || defined(__DragonFly__) || defined(__FreeBSD__)
	cpu_set_t cpus;
	CPU_ZERO(&cpus);

	if (int err = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpus))
	{
		sig_log.error("Failed to get thread affinity mask: error %d.", err);
		return 0;
	}

	u64 mask = 0;

	for (u32 cpu = 0; cpu < 64u; ++cpu)
	{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
		const bool allowed = CPU_ISSET(cpu, &cpus);
#pragma GCC diagnostic pop

		if (allowed)
		{
			mask |= 1ull << cpu;
		}
	}

	if (!mask)
	{
		// None of the first 64 CPUs is part of the thread's set
		sig_log.error("Thread affinity mask is out of u64 range.");
		return 0;
	}

	return mask;
#else
	return -1;
#endif
}
// Returns the stack of the calling thread as an {address, size} pair, as reported by the
// platform API. On the pthread-attribute based paths the pair is {nullptr, 0} when the
// attributes cannot be queried.
std::pair<void*, usz> thread_ctrl::get_thread_stack()
{
#ifdef _WIN32
	ULONG_PTR _min = 0;
	ULONG_PTR _max = 0;
	GetCurrentThreadStackLimits(&_min, &_max);
	const usz ssize = _max - _min;
	const auto saddr = reinterpret_cast<void*>(_min);
#else
	void* saddr = nullptr;
	usz ssize = 0;
#if defined(__linux__)
	pthread_attr_t attr;

	// pthread_getattr_np() initializes attr only on success; the object it fills in
	// must be released with pthread_attr_destroy() or its resources are leaked
	if (pthread_getattr_np(pthread_self(), &attr) == 0)
	{
		pthread_attr_getstack(&attr, &saddr, &ssize);
		pthread_attr_destroy(&attr);
	}
#elif defined(__APPLE__)
	saddr = pthread_get_stackaddr_np(pthread_self());
	ssize = pthread_get_stacksize_np(pthread_self());
#else
	pthread_attr_t attr;

	// pthread_attr_get_np() expects an attribute object that was already initialized
	if (pthread_attr_init(&attr) == 0)
	{
		if (pthread_attr_get_np(pthread_self(), &attr) == 0)
		{
			pthread_attr_getstackaddr(&attr, &saddr);
			pthread_attr_getstacksize(&attr, &ssize);
		}

		pthread_attr_destroy(&attr);
	}
#endif
#endif
	return {saddr, ssize};
}
// Returns a numeric identifier of the calling thread:
// the Win32 thread id on Windows, the result of the gettid syscall on Linux,
// and the pthread_self() handle reinterpreted as an integer elsewhere.
u64 thread_ctrl::get_tid()
{
#if defined(_WIN32)
	const u64 tid = GetCurrentThreadId();
#elif defined(__linux__)
	const u64 tid = syscall(SYS_gettid);
#else
	const u64 tid = reinterpret_cast<u64>(pthread_self());
#endif
	return tid;
}
// Tells whether the calling thread is the one whose id is recorded in utils::main_tid.
bool thread_ctrl::is_main()
{
	const auto current_tid = get_tid();
	return current_tid == utils::main_tid;
}