Fix thread pool entry point and get_cycles()

Fix a possible race on thread handle availability.
Don't treat a zero thread handle as invalid.
The entry point is now fully in assembly.
Attempt to fix #9282
Also fix some TLS handling.
Nekotekina 2020-11-21 07:56:54 +03:00
parent 60cff6f3d4
commit 71f1021648
5 changed files with 109 additions and 110 deletions

View file

@@ -88,6 +88,7 @@ thread_local u64 g_tls_fault_rsx = 0;
 thread_local u64 g_tls_fault_spu = 0;
 thread_local u64 g_tls_wait_time = 0;
 thread_local u64 g_tls_wait_fail = 0;
+thread_local bool g_tls_access_violation_recovered = false;
 extern thread_local std::string(*g_tls_log_prefix)();
 template <>
@@ -1367,13 +1368,11 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) no
         return true;
     }
-    thread_local bool access_violation_recovered = false;
     // Hack: allocate memory in case the emulator is stopping
     const auto hack_alloc = [&]()
     {
         // If failed the value remains true and std::terminate should be called
-        access_violation_recovered = true;
+        g_tls_access_violation_recovered = true;
         const auto area = vm::reserve_map(vm::any, addr & -0x10000, 0x10000);
@@ -1525,7 +1524,7 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) no
     if (cpu->id_type() != 1)
     {
-        if (!access_violation_recovered)
+        if (!g_tls_access_violation_recovered)
         {
             vm_log.notice("\n%s", cpu->dump_all());
             vm_log.error("Access violation %s location 0x%x (%s) [type=u%u]", is_writing ? "writing" : "reading", addr, (is_writing && vm::check_addr(addr)) ? "read-only memory" : "unmapped memory", d_size * 8);
@@ -1556,14 +1555,14 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) no
     Emu.Pause();
-    if (cpu && !access_violation_recovered)
+    if (cpu && !g_tls_access_violation_recovered)
     {
         vm_log.notice("\n%s", cpu->dump_all());
     }
     // Note: a thread may access violate more than once after hack_alloc recovery
     // Do not log any further access violations in this case.
-    if (!access_violation_recovered)
+    if (!g_tls_access_violation_recovered)
     {
         vm_log.fatal("Access violation %s location 0x%x (%s) [type=u%u]", is_writing ? "writing" : "reading", addr, (is_writing && vm::check_addr(addr)) ? "read-only memory" : "unmapped memory", d_size * 8);
     }
@@ -1850,7 +1849,7 @@ static atomic_t<u128, 64> s_thread_bits{0};
 static atomic_t<thread_base**> s_thread_pool[128]{};
-void thread_base::start(native_entry entry, void(*trampoline)())
+void thread_base::start(native_entry entry)
 {
     for (u128 bits = s_thread_bits.load(); bits; bits &= bits - 1)
     {
@@ -1869,16 +1868,13 @@ void thread_base::start(native_entry entry, void(*trampoline)())
     }
     // Send "this" and entry point
-    m_thread = reinterpret_cast<u64>(trampoline);
+    const u64 entry_val = reinterpret_cast<u64>(entry);
+    m_thread = entry_val;
     atomic_storage<thread_base*>::release(*tls, this);
     s_thread_pool[pos].notify_all();
     // Wait for actual "m_thread" in return
-    while (m_thread == reinterpret_cast<u64>(trampoline))
-    {
-        busy_wait(300);
-    }
+    m_thread.wait(entry_val);
     return;
 }
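Note: the handshake above can be read as a small standalone sketch (illustrative names only; a single C++20 std::atomic stands in for m_thread, and the s_thread_pool[pos] wakeup is folded into the same variable):

#include <atomic>
#include <cstdint>

std::atomic<std::uint64_t> m_thread_model{0};

// Spawner side (start): publish the entry point, then sleep until a pooled
// worker has replaced it with its own native handle.
void start_model(std::uint64_t entry_val)
{
    m_thread_model.store(entry_val);
    m_thread_model.notify_all();      // the real code notifies s_thread_pool[pos] instead
    m_thread_model.wait(entry_val);   // returns once the value changes
}

// Worker side (finalize): swap in the native handle and wake the spawner,
// replacing the old busy_wait(300) polling loop.
std::uint64_t adopt_model(std::uint64_t self_handle)
{
    const std::uint64_t entry = m_thread_model.exchange(self_handle);
    m_thread_model.notify_one();
    return entry;                     // the adopted thread jumps to this entry point
}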
@@ -1892,6 +1888,10 @@ void thread_base::start(native_entry entry, void(*trampoline)())
 void thread_base::initialize(void (*error_cb)())
 {
+#ifndef _WIN32
+    m_thread.release(reinterpret_cast<u64>(pthread_self()));
+#endif
     // Initialize TLS variables
     thread_ctrl::g_tls_this_thread = this;
@@ -2012,9 +2012,6 @@ u64 thread_base::finalize(thread_state result_state) noexcept
     atomic_wait_engine::set_wait_callback(nullptr);
-    const u64 _self = m_thread;
-    m_thread.release(0);
     // Return true if need to delete thread object (no)
     const bool ok = 0 == (3 & ~m_sync.fetch_op([&](u64& v)
     {
@@ -2026,48 +2023,55 @@ u64 thread_base::finalize(thread_state result_state) noexcept
     m_sync.notify_all(2);
     // No detached thread supported atm
-    return _self;
+    return m_thread;
 }
-void thread_base::finalize(u64 _self) noexcept
+u64 thread_base::finalize(u64 _self) noexcept
 {
+    g_tls_fault_all = 0;
+    g_tls_fault_rsx = 0;
+    g_tls_fault_spu = 0;
+    g_tls_wait_time = 0;
+    g_tls_wait_fail = 0;
+    g_tls_access_violation_recovered = false;
     atomic_wait_engine::set_wait_callback(nullptr);
     g_tls_log_prefix = []() -> std::string { return {}; };
     thread_ctrl::g_tls_this_thread = nullptr;
     if (!_self)
     {
-        return;
+        return 0;
     }
     // Try to add self to thread pool
-    const auto [bits, ok] = s_thread_bits.fetch_op([](u128& bits)
-    {
-        if (~bits) [[likely]]
-        {
-            // Set lowest clear bit
-            bits |= bits + 1;
-            return true;
-        }
-        return false;
-    });
-    if (!ok)
-    {
-#ifdef _WIN32
-        _endthread();
-#else
-        pthread_detach(reinterpret_cast<pthread_t>(_self));
-        pthread_exit(0);
-#endif
-        return;
-    }
     set_name("..pool");
-    // Obtain id from atomic op
-    const u32 pos = utils::ctz128(~bits);
+    u32 pos = -1;
+    while (true)
+    {
+        const auto [bits, ok] = s_thread_bits.fetch_op([](u128& bits)
+        {
+            if (~bits) [[likely]]
+            {
+                // Set lowest clear bit
+                bits |= bits + 1;
+                return true;
+            }
+            return false;
+        });
+        if (ok) [[likely]]
+        {
+            pos = utils::ctz128(~bits);
+            break;
+        }
+        s_thread_bits.wait(bits);
+    }
     const auto tls = &thread_ctrl::g_tls_this_thread;
     s_thread_pool[pos] = tls;
@@ -2082,30 +2086,47 @@ void thread_base::finalize(u64 _self) noexcept
         val &= ~(u128(1) << pos);
     });
+    s_thread_bits.notify_one();
     // Restore thread id
     const auto _this = atomic_storage<thread_base*>::load(*tls);
     const auto entry = _this->m_thread.exchange(_self);
+    verify(HERE), entry != _self;
     _this->m_thread.notify_one();
-    // Hack return address to avoid tail call
-#ifdef _MSC_VER
-    *static_cast<u64*>(_AddressOfReturnAddress()) = entry;
-#else
-    static_cast<u64*>(__builtin_frame_address(0))[1] = entry;
-#endif
-    //reinterpret_cast<native_entry>(entry)(_this);
+    // Return new entry
+    return entry;
 }
-void (*thread_base::make_trampoline(native_entry entry))()
+thread_base::native_entry thread_base::make_trampoline(u64(*entry)(thread_base* _base))
 {
-    return build_function_asm<void(*)()>([&](asmjit::X86Assembler& c, auto& args)
+    return build_function_asm<native_entry>([&](asmjit::X86Assembler& c, auto& args)
     {
         using namespace asmjit;
-        // Revert effect of ret instruction (fix stack alignment)
-        c.mov(x86::rax, imm_ptr(entry));
-        c.sub(x86::rsp, 8);
-        c.jmp(x86::rax);
+        Label _ret = c.newLabel();
+        c.push(x86::rbp);
+        c.sub(x86::rsp, 0x20);
+        // Call entry point (TODO: support for detached threads missing?)
+        c.call(imm_ptr(entry));
+        // Call finalize, return if zero
+        c.mov(args[0], x86::rax);
+        c.call(imm_ptr<u64(*)(u64)>(finalize));
+        c.test(x86::rax, x86::rax);
+        c.jz(_ret);
+        // Otherwise, call it as an entry point with first arg = new current thread
+        c.mov(x86::rbp, x86::rax);
+        c.call(imm_ptr(thread_ctrl::get_current));
+        c.mov(args[0], x86::rax);
+        c.add(x86::rsp, 0x28);
+        c.jmp(x86::rbp);
+        c.bind(_ret);
+        c.add(x86::rsp, 0x28);
+        c.ret();
     });
 }
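Note: the generated trampoline is easier to follow as plain C++; roughly (a hand-written approximation, not code from this commit — finalize() is private, so this is illustrative only):

// What the emitted code does: run the type-erased entry point, hand the
// returned native handle to finalize(), and either exit or continue as the
// thread that adopted this OS thread from the pool.
static void trampoline_model(thread_base* base, u64(*entry)(thread_base*))
{
    const u64 self = entry(base);                  // thread body, returns native handle
    const u64 next = thread_base::finalize(self);  // park in the pool; 0 means really exit

    if (next)
    {
        // 'next' is the adopting thread's entry point (its trampoline); the
        // assembly tail-jumps to it (jmp rbp) so the stack does not grow across reuses.
        const auto fn = reinterpret_cast<void(*)(thread_base*)>(next);
        fn(thread_ctrl::get_current());
    }
}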
@@ -2193,12 +2214,13 @@ thread_base::thread_base(std::string_view name)
 thread_base::~thread_base()
 {
-    if (u64 handle = m_thread.exchange(0))
+    // Only cleanup on errored status
+    if ((m_sync & 3) == 2)
     {
 #ifdef _WIN32
-        CloseHandle(reinterpret_cast<HANDLE>(handle));
+        CloseHandle(reinterpret_cast<HANDLE>(m_thread.load()));
 #else
-        pthread_detach(reinterpret_cast<pthread_t>(handle));
+        pthread_join(reinterpret_cast<pthread_t>(m_thread.load()), nullptr);
 #endif
     }
 }
@@ -2255,13 +2277,15 @@ u64 thread_base::get_native_id() const
 u64 thread_base::get_cycles()
 {
-    u64 cycles;
+    u64 cycles = 0;
+    const u64 handle = m_thread;
 #ifdef _WIN32
-    if (QueryThreadCycleTime(reinterpret_cast<HANDLE>(m_thread.load()), &cycles))
+    if (QueryThreadCycleTime(reinterpret_cast<HANDLE>(handle), &cycles))
     {
 #elif __APPLE__
-    mach_port_name_t port = pthread_mach_thread_np(reinterpret_cast<pthread_t>(m_thread.load()));
+    mach_port_name_t port = pthread_mach_thread_np(reinterpret_cast<pthread_t>(handle));
     mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT;
     thread_basic_info_data_t info;
     kern_return_t ret = thread_info(port, THREAD_BASIC_INFO, reinterpret_cast<thread_info_t>(&info), &count);
@@ -2272,7 +2296,7 @@ u64 thread_base::get_cycles()
 #else
     clockid_t _clock;
     struct timespec thread_time;
-    if (!pthread_getcpuclockid(reinterpret_cast<pthread_t>(m_thread.load()), &_clock) && !clock_gettime(_clock, &thread_time))
+    if (!pthread_getcpuclockid(reinterpret_cast<pthread_t>(handle), &_clock) && !clock_gettime(_clock, &thread_time))
     {
         cycles = static_cast<u64>(thread_time.tv_sec) * 1'000'000'000 + thread_time.tv_nsec;
 #endif
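Note: the non-Windows branch is the standard POSIX per-thread CPU-clock query; a self-contained sketch of the same pattern, with the handle read once up front as the new code does (illustrative only):

#include <pthread.h>
#include <time.h>
#include <cstdint>

// CPU time consumed by a thread, in nanoseconds (0 on failure).
static std::uint64_t thread_cpu_time_ns(pthread_t handle)
{
    clockid_t clk;
    timespec ts{};

    if (pthread_getcpuclockid(handle, &clk) == 0 && clock_gettime(clk, &ts) == 0)
    {
        return static_cast<std::uint64_t>(ts.tv_sec) * 1'000'000'000 + ts.tv_nsec;
    }

    return 0;
}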
@@ -2317,19 +2341,15 @@ void thread_ctrl::emergency_exit(std::string_view reason)
     if (!_self)
     {
+        // Unused, detached thread support remnant
         delete _this;
     }
     thread_base::finalize(0);
 #ifdef _WIN32
-    _endthread();
+    _endthreadex(0);
 #else
-    if (_self)
-    {
-        pthread_detach(reinterpret_cast<pthread_t>(_self));
-    }
     pthread_exit(0);
 #endif
 }

View file

@@ -111,22 +111,22 @@ private:
     stx::atomic_cptr<std::string> m_tname;
     // Start thread
-    void start(native_entry, void(*)());
+    void start(native_entry);
     // Called at the thread start
     void initialize(void (*error_cb)());
-    // Called at the thread end, returns true if needs destruction
+    // Called at the thread end, returns self handle
     u64 finalize(thread_state result) noexcept;
     // Cleanup after possibly deleting the thread instance
-    static void finalize(u64 _self) noexcept;
+    static u64 finalize(u64 _self) noexcept;
     // Set name for debugger
     static void set_name(std::string);
-    // Make trampoline with stack fix
-    static void(*make_trampoline(native_entry))();
+    // Make entry point
+    static native_entry make_trampoline(u64(*)(thread_base*));
     friend class thread_ctrl;
@@ -277,35 +277,12 @@ class named_thread final : public Context, result_storage_t<Context>, thread_bas
     using result = result_storage_t<Context>;
     using thread = thread_base;
-    // Type-erased thread entry point
-#ifdef _WIN32
-    static inline uint __stdcall entry_point(void* arg)
-#else
-    static inline void* entry_point(void* arg)
-#endif
+    static u64 entry_point(thread_base* _base)
     {
-        if (auto _this = thread_ctrl::get_current())
-        {
-            arg = _this;
-        }
-        const auto _this = static_cast<named_thread*>(static_cast<thread*>(arg));
-        // Perform self-cleanup if necessary
-        u64 _self = _this->entry_point();
-        if (!_self)
-        {
-            delete _this;
-            thread::finalize(0);
-            return 0;
-        }
-        thread::finalize(_self);
-        return 0;
+        return static_cast<named_thread*>(_base)->entry_point2();
     }
-    u64 entry_point()
+    u64 entry_point2()
     {
         thread::initialize([]()
         {
@@ -330,7 +307,7 @@ class named_thread final : public Context, result_storage_t<Context>, thread_bas
         return thread::finalize(thread_state::finished);
     }
-    static inline void(*trampoline)() = thread::make_trampoline(entry_point);
+    static inline thread::native_entry trampoline = thread::make_trampoline(entry_point);
     friend class thread_ctrl;
@@ -341,7 +318,7 @@ public:
         : Context()
         , thread(Context::thread_name)
     {
-        thread::start(&named_thread::entry_point, trampoline);
+        thread::start(trampoline);
     }
     // Normal forwarding constructor
@@ -350,7 +327,7 @@ public:
         : Context(std::forward<Args>(args)...)
         , thread(name)
     {
-        thread::start(&named_thread::entry_point, trampoline);
+        thread::start(trampoline);
     }
     // Lambda constructor, also the implicit deduction guide candidate
@@ -358,7 +335,7 @@ public:
         : Context(std::forward<Context>(f))
        , thread(name)
     {
-        thread::start(&named_thread::entry_point, trampoline);
+        thread::start(trampoline);
     }
     named_thread(const named_thread&) = delete;

View file

@@ -469,6 +469,8 @@ void GLGSRender::on_exit()
     GSRender::on_exit();
     zcull_ctrl.release();
+    gl::set_primary_context_thread(false);
 }
 void GLGSRender::clear_surface(u32 arg)

View file

@@ -11,16 +11,16 @@ namespace gl
     capabilities g_driver_caps;
     const fbo screen{};
-    thread_local bool tls_primary_context_thread = false;
+    static thread_local bool s_tls_primary_context_thread = false;
-    void set_primary_context_thread()
+    void set_primary_context_thread(bool value)
     {
-        tls_primary_context_thread = true;
+        s_tls_primary_context_thread = value;
     }
     bool is_primary_context_thread()
     {
-        return tls_primary_context_thread;
+        return s_tls_primary_context_thread;
     }
     GLenum draw_mode(rsx::primitive_type in)

View file

@@ -55,7 +55,7 @@ namespace gl
     bool is_primitive_native(rsx::primitive_type in);
     GLenum draw_mode(rsx::primitive_type in);
-    void set_primary_context_thread();
+    void set_primary_context_thread(bool = true);
     bool is_primary_context_thread();
     // Texture helpers
// Texture helpers // Texture helpers