From bd269bccaf68caff66324580c0417ca68fc5805c Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Mon, 21 Dec 2020 17:12:05 +0300 Subject: [PATCH] types.hpp: remove intrinsic includes Replace v128 with u128 in some places. Removed some unused files. --- Utilities/StrFmt.cpp | 12 + Utilities/Thread.cpp | 2 + Utilities/typemap.h | 1043 ----------------- rpcs3/Crypto/unedat.cpp | 62 +- rpcs3/Crypto/unedat.h | 20 +- rpcs3/Crypto/unself.cpp | 6 +- rpcs3/Crypto/unself.h | 3 +- rpcs3/Emu/CMakeLists.txt | 1 - rpcs3/Emu/CPU/CPUThread.cpp | 5 +- rpcs3/Emu/CPU/CPUTranslator.cpp | 3 + rpcs3/Emu/Cell/Modules/cellAudio.cpp | 3 + rpcs3/Emu/Cell/Modules/cellSaveData.h | 4 +- rpcs3/Emu/Cell/Modules/cellSpurs.cpp | 1 + rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp | 1 + rpcs3/Emu/Cell/Modules/sceNp.cpp | 10 +- rpcs3/Emu/Cell/PPUInterpreter.cpp | 1 + rpcs3/Emu/Cell/PPUThread.cpp | 2 + rpcs3/Emu/Cell/PPUTranslator.cpp | 1 + rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp | 7 +- rpcs3/Emu/Cell/SPUASMJITRecompiler.h | 8 +- rpcs3/Emu/Cell/SPUDisAsm.cpp | 29 +- rpcs3/Emu/Cell/SPUDisAsm.h | 30 +- rpcs3/Emu/Cell/SPUInterpreter.cpp | 1 + rpcs3/Emu/Cell/SPURecompiler.cpp | 1 + rpcs3/Emu/Cell/SPUThread.cpp | 3 +- rpcs3/Emu/Cell/SPUThread.h | 2 +- rpcs3/Emu/Cell/lv2/sys_overlay.cpp | 4 +- rpcs3/Emu/Cell/lv2/sys_process.cpp | 5 +- rpcs3/Emu/Cell/lv2/sys_prx.cpp | 4 +- rpcs3/Emu/Cell/lv2/sys_spu.cpp | 4 +- rpcs3/Emu/Memory/vm.cpp | 10 +- rpcs3/Emu/Memory/vm_reservation.h | 38 +- rpcs3/Emu/RSX/Common/BufferUtils.cpp | 18 +- rpcs3/Emu/RSX/Common/ProgramStateCache.cpp | 4 +- rpcs3/Emu/RSX/Common/ProgramStateCache.h | 57 +- rpcs3/Emu/RSX/Common/program_state_cache2.hpp | 64 + rpcs3/Emu/RSX/GL/GLGSRender.cpp | 2 + .../Shaders/shader_loading_dialog.cpp | 6 +- rpcs3/Emu/RSX/RSXOffload.cpp | 5 +- rpcs3/Emu/RSX/RSXThread.cpp | 8 +- rpcs3/Emu/RSX/VK/VKGSRender.cpp | 4 +- rpcs3/Emu/RSX/VK/VKGSRender.h | 4 + rpcs3/Emu/RSX/VK/VKHelpers.cpp | 4 + rpcs3/Emu/RSX/VK/VKHelpers.h | 8 + rpcs3/Emu/System.cpp | 2 +- rpcs3/Emu/System.h | 2 +- rpcs3/Emu/VFS.cpp | 2 +- rpcs3/Emu/perf_meter.cpp | 34 +- rpcs3/Emu/perf_meter.hpp | 45 +- rpcs3/Loader/TRP.cpp | 2 +- rpcs3/emucore.vcxproj | 6 +- rpcs3/emucore.vcxproj.filters | 12 +- rpcs3/main.cpp | 3 - rpcs3/rpcs3qt/main_window.cpp | 18 +- rpcs3/rpcs3qt/register_editor_dialog.cpp | 4 +- rpcs3/util/asm.hpp | 91 +- rpcs3/util/atomic.cpp | 31 +- rpcs3/util/atomic.hpp | 99 +- rpcs3/util/atomic2.cpp | 532 --------- rpcs3/util/atomic2.hpp | 156 --- rpcs3/util/sysinfo.cpp | 28 +- rpcs3/util/types.hpp | 26 +- rpcs3/util/v128.hpp | 306 ++--- rpcs3/util/v128sse.hpp | 255 ++++ 64 files changed, 899 insertions(+), 2265 deletions(-) delete mode 100644 Utilities/typemap.h create mode 100644 rpcs3/Emu/RSX/Common/program_state_cache2.hpp delete mode 100644 rpcs3/util/atomic2.cpp delete mode 100644 rpcs3/util/atomic2.hpp create mode 100644 rpcs3/util/v128sse.hpp diff --git a/Utilities/StrFmt.cpp b/Utilities/StrFmt.cpp index 7ba0092d2f..8ca763f53e 100644 --- a/Utilities/StrFmt.cpp +++ b/Utilities/StrFmt.cpp @@ -243,6 +243,18 @@ void fmt_class_string::format(std::string& out, u64 arg) fmt::append(out, "0x%016llx%016llx", vec._u64[1], vec._u64[0]); } +template <> +void fmt_class_string::format(std::string& out, u64 arg) +{ + // TODO: it should be supported as full-fledged integral type (with %u, %d, etc, fmt) + const u128& num = get_object(arg); +#ifdef _MSC_VER + fmt::append(out, "0x%016llx%016llx", num.hi, num.lo); +#else + fmt::append(out, "0x%016llx%016llx", static_cast(num >> 64), static_cast(num)); +#endif +} + template <> void fmt_class_string::format(std::string& out, u64 arg) { diff --git a/Utilities/Thread.cpp b/Utilities/Thread.cpp index 2b5b5ceb5b..fc650320c6 100644 --- a/Utilities/Thread.cpp +++ b/Utilities/Thread.cpp @@ -76,6 +76,8 @@ #include "util/vm.hpp" #include "util/logs.hpp" #include "util/asm.hpp" +#include "util/v128.hpp" +#include "util/v128sse.hpp" #include "util/sysinfo.hpp" #include "Emu/Memory/vm_locking.h" diff --git a/Utilities/typemap.h b/Utilities/typemap.h deleted file mode 100644 index b68730df37..0000000000 --- a/Utilities/typemap.h +++ /dev/null @@ -1,1043 +0,0 @@ -#pragma once - -#include "util/types.hpp" -#include "mutex.h" -#include "util/atomic.hpp" -#include "util/typeindices.hpp" -#include - -namespace utils -{ - class typemap; - - template - class typeptr; - - class typeptr_base; - - // Special tag for typemap access: request free id - constexpr struct id_new_t{} id_new{}; - - // Special tag for typemap access: unconditionally access the only object (max_count = 1 only) - constexpr struct id_any_t{} id_any{}; - - // Special tag for typemap access: like id_any but also default-construct the object if not exists - constexpr struct id_always_t{} id_always{}; - - // Aggregate with information for more accurate object retrieval, isn't accepted internally - struct weak_typeptr - { - uint id; - uint type; - - // Stamp isn't automatically stored and checked anywhere - ullong stamp; - }; - - // Detect id transformation trait (multiplier) - template - struct typeinfo_step - { - static constexpr uint step = 1; - }; - - template - struct typeinfo_step::id_step)>> - { - static constexpr uint step = uint{std::decay_t::id_step}; - }; - - // Detect id transformation trait (addend) - template - struct typeinfo_bias - { - static constexpr uint bias = 0; - }; - - template - struct typeinfo_bias::id_base)>> - { - static constexpr uint bias = uint{std::decay_t::id_base}; - }; - - // Detect max number of objects, default = 1 - template - struct typeinfo_count - { - static constexpr uint max_count = 1; - }; - - template - struct typeinfo_count::id_count)>> - { - static constexpr uint max_count = uint{std::decay_t::id_count}; - - static_assert(ullong{max_count} * typeinfo_step::step <= 0x1'0000'0000ull); - }; - - // Detect operator -> - template - struct typeinfo_pointer - { - static constexpr bool is_ptr = false; - }; - - template - struct typeinfo_pointer::operator->)>> - { - static constexpr bool is_ptr = true; - }; - - // Type information - struct typeinfo_base - { - uint size = 0; - uint align = 0; - uint count = 0; - void(*clean)(class typemap_block*) = 0; - - constexpr typeinfo_base() noexcept = default; - - template - static void call_destructor(typemap_block* ptr) noexcept; - - template - static constexpr typeinfo_base make_typeinfo() noexcept - { - static_assert(alignof(T) < 4096); - - typeinfo_base r; - r.size = uint{sizeof(T)}; - r.align = uint{alignof(T)}; - r.count = typeinfo_count::max_count; - r.clean = &call_destructor; - return r; - } - }; - - // Internal, control block for a particular object - class typemap_block - { - friend typemap; - - template - friend class typeptr; - - friend class typeptr_base; - - shared_mutex m_mutex; - atomic_t m_type; - public: - typemap_block() = default; - - // Get pointer to the object of type T, with respect to alignment - template - T* get_ptr() - { - constexpr uint offset = alignof(T) < SelfSize ? ::align(SelfSize, alignof(T)) : alignof(T); - return reinterpret_cast(reinterpret_cast(this) + offset); - } - }; - - static_assert(std::is_standard_layout_v); - static_assert(sizeof(typemap_block) == 8); - - template - void typeinfo_base::call_destructor(typemap_block* ptr) noexcept - { - ptr->get_ptr()->~T(); - } - - // An object of type T paired with atomic refcounter - template - class refctr final - { - atomic_t m_ref{1}; - - public: - T object; - - template - refctr(Args&&... args) - : object(std::forward(args)...) - { - } - - void add_ref() noexcept - { - m_ref++; - } - - usz remove_ref() noexcept - { - return --m_ref; - } - }; - - // Simplified "shared" ptr making use of refctr class - template - class refptr final - { - refctr* m_ptr = nullptr; - - void destroy() - { - if (m_ptr && !m_ptr->remove_ref()) - delete m_ptr; - } - - public: - constexpr refptr() = default; - - // Construct directly from refctr pointer - explicit refptr(refctr* ptr) noexcept - : m_ptr(ptr) - { - } - - refptr(const refptr& rhs) noexcept - : m_ptr(rhs.m_ptr) - { - if (m_ptr) - m_ptr->add_ref(); - } - - refptr(refptr&& rhs) noexcept - : m_ptr(rhs.m_ptr) - { - rhs.m_ptr = nullptr; - } - - ~refptr() - { - destroy(); - } - - refptr& operator =(const refptr& rhs) noexcept - { - destroy(); - m_ptr = rhs.m_ptr; - if (m_ptr) - m_ptr->add_ref(); - } - - refptr& operator =(refptr&& rhs) noexcept - { - std::swap(m_ptr, rhs.m_ptr); - } - - void reset() noexcept - { - destroy(); - m_ptr = nullptr; - } - - refctr* release() noexcept - { - return std::exchange(m_ptr, nullptr); - } - - void swap(refptr&& rhs) noexcept - { - std::swap(m_ptr, rhs.m_ptr); - } - - refctr* get() const noexcept - { - return m_ptr; - } - - T& operator *() const noexcept - { - return m_ptr->object; - } - - T* operator ->() const noexcept - { - return &m_ptr->object; - } - - explicit operator bool() const noexcept - { - return !!m_ptr; - } - }; - - // Internal, typemap control block for a particular type - struct alignas(64) typemap_head - { - // Pointer to the uninitialized storage - uchar* m_ptr = nullptr; - - // Free ID counter - atomic_t m_sema{0}; - - // Max ID ever used + 1 - atomic_t m_limit{0}; - - // Increased on each constructor call - atomic_t m_create_count{0}; - - // Increased on each destructor call - atomic_t m_destroy_count{0}; - - // Aligned size of the storage for each object - uint m_ssize = 0; - - // Total object count in the storage - uint m_count = 0; - - // Destructor caller; related to particular type, not the current storage - void(*clean)(typemap_block*) = 0; - }; - - class typeptr_base - { - typemap_head* m_head; - typemap_block* m_block; - - template - friend class typeptr; - - friend typemap; - }; - - // Pointer + lock object, possible states: - // 1) Invalid - bad id, no space, or after release() - // 2) Null - locked, but the object does not exist - // 3) OK - locked and the object exists - template - class typeptr : typeptr_base - { - using typeptr_base::m_head; - using typeptr_base::m_block; - - friend typemap; - - void release() - { - if constexpr (type_const() && type_volatile()) - { - } - else if constexpr (type_const() || type_volatile()) - { - m_block->m_mutex.unlock_shared(); - } - else - { - m_block->m_mutex.unlock(); - } - - if (m_block->m_type == 0) - { - if constexpr (typeinfo_count::max_count > 1) - { - // Return semaphore - m_head->m_sema--; - } - } - } - - public: - constexpr typeptr(typeptr_base base) noexcept - : typeptr_base(base) - { - } - - typeptr(const typeptr&) = delete; - - typeptr& operator=(const typeptr&) = delete; - - ~typeptr() - { - if (m_block) - { - release(); - } - } - - // Verify the object exists - bool exists() const noexcept - { - return m_block->m_type != 0; - } - - // Verify the state is valid - explicit operator bool() const noexcept - { - return m_block != nullptr; - } - - // Get the pointer to the existing object - template > - auto get() const noexcept - { - return m_block->get_ptr(); - } - - auto operator->() const noexcept - { - // Invoke object's operator -> if available - if constexpr (typeinfo_pointer::is_ptr) - { - return get()->operator->(); - } - else - { - return get(); - } - } - - // Release the lock and set invalid state - void unlock() - { - if (m_block) - { - release(); - m_block = nullptr; - } - } - - // Call the constructor, return the stamp - template , typename... Args> - ullong create(Args&&... args) - { - static_assert(!type_const()); - static_assert(!type_volatile()); - - const ullong result = ++m_head->m_create_count; - - if constexpr (typeinfo_count::max_count > 1) - { - // Update hints only if the object is not being recreated - if (!m_block->m_type) - { - const uint this_id = this->get_id(); - - // Update max count - m_head->m_limit.fetch_op([this_id](uint& limit) - { - if (limit <= this_id) - { - limit = this_id + 1; - return true; - } - - return false; - }); - } - } - - if constexpr (true) - { - static_assert(std::is_same_v); - - // Set type; zero value shall not be observed in the case of recreation - if (m_block->m_type.exchange(1) != 0) - { - // Destroy object if it exists - m_block->get_ptr()->~T(); - m_head->m_destroy_count++; - } - - new (m_block->get_ptr()) New(std::forward(args)...); - } - - return result; - } - - // Call the destructor if object exists - void destroy() noexcept - { - static_assert(!type_const()); - - if (!m_block->m_type.exchange(0)) - { - return; - } - - m_block->get_ptr()->~T(); - m_head->m_destroy_count++; - } - - // Get the ID - uint get_id() const - { - // It's not often needed so figure it out instead of storing it - const usz diff = reinterpret_cast(m_block) - m_head->m_ptr; - const usz quot = diff / m_head->m_ssize; - - if (diff % m_head->m_ssize || quot > typeinfo_count::max_count) - { - return -1; - } - - constexpr uint bias = typeinfo_bias::bias; - constexpr uint step = typeinfo_step::step; - return static_cast(quot) * step + bias; - } - - static constexpr bool type_const() - { - return std::is_const_v>; - } - - static constexpr bool type_volatile() - { - return std::is_volatile_v>; - } - }; - - // Dynamic object collection, one or more per any type; shall not be initialized before main() - class typemap - { - // Pointer to the dynamic array - typemap_head* m_map = nullptr; - - // Pointer to the virtual memory - void* m_memory = nullptr; - - // Virtual memory size - usz m_total = 0; - - template - typemap_head* get_head() const - { - return &m_map[stx::typeindex>()]; - } - - public: - typemap(const typemap&) = delete; - - typemap& operator=(const typemap&) = delete; - - // Construct without initialization (suitable for global typemap) - explicit constexpr typemap(std::nullptr_t) noexcept - { - } - - // Construct with initialization - typemap() - { - init(); - } - - ~typemap() - { - delete[] m_map; - - if (m_memory) - { - utils::memory_release(m_memory, m_total); - } - } - - // Recreate, also required if constructed without initialization. - void init() - { - if (!stx::typelist_v.count()) - { - return; - } - - // Recreate and copy some type information - if (m_map == nullptr) - { - m_map = new typemap_head[stx::typelist_v.count()](); - } - else - { - auto type = stx::typelist_v.begin(); - auto _end = stx::typelist_v.end(); - - for (uint i = 0; type != _end; i++, ++type) - { - // Delete objects (there shall be no threads accessing them) - const uint lim = m_map[i].m_count != 1 ? +m_map[i].m_limit : 1; - - for (usz j = 0; j < lim; j++) - { - const auto block = reinterpret_cast(m_map[i].m_ptr + j * m_map[i].m_ssize); - - if (block->m_type) - { - m_map[i].clean(block); - } - } - - // Reset mutable fields - m_map[i].m_sema.raw() = 0; - m_map[i].m_limit.raw() = 0; - - m_map[i].m_create_count.raw() = 0; - m_map[i].m_destroy_count.raw() = 0; - } - } - - // Initialize virtual memory if necessary - if (m_memory == nullptr) - { - // Determine total size, copy typeinfo - auto type = stx::typelist_v.begin(); - auto _end = stx::typelist_v.end(); - - for (uint i = 0; type != _end; i++, ++type) - { - const uint align = type->align; - const uint ssize = ::align(sizeof(typemap_block), align) + ::align(type->size, align); - const auto total = usz{ssize} * type->count; - const auto start = uptr{::align(m_total, align)}; - - if (total) - { - // Move forward hoping there are no usable gaps wasted - m_total = start + total; - - // Store storage size and object count - m_map[i].m_ssize = ssize; - m_map[i].m_count = type->count; - m_map[i].m_ptr = reinterpret_cast(start); - } - - // Copy destructor for indexed access - m_map[i].clean = type->clean; - } - - // Allocate virtual memory - m_memory = utils::memory_reserve(m_total); - utils::memory_commit(m_memory, m_total); - - // Update pointers - for (uint i = 0, n = stx::typelist_v.count(); i < n; i++) - { - if (m_map[i].m_count) - { - m_map[i].m_ptr = static_cast(m_memory) + reinterpret_cast(m_map[i].m_ptr); - } - } - } - else - { - // Reinitialize virtual memory at the same location - utils::memory_reset(m_memory, m_total); - } - } - - // Return allocated virtual memory block size (not aligned) - usz get_memory_size() const - { - return m_total; - } - - private: - - // Prepare pointers - template - typeptr_base init_ptr(Arg&& id) const - { - if constexpr (typeinfo_count::max_count == 0) - { - return {}; - } - - using id_tag = std::decay_t; - - typemap_head* head = get_head(); - typemap_block* block; - - if constexpr (std::is_same_v || std::is_same_v || std::is_same_v) - { - if constexpr (constexpr uint last = typeinfo_count::max_count - 1) - { - // If max_count > 1 only id_new is supported - static_assert(std::is_same_v); - static_assert(!std::is_const_v>); - static_assert(!std::is_volatile_v>); - - // Try to acquire the semaphore - if (!head->m_sema.try_inc(last + 1)) [[unlikely]] - { - block = nullptr; - } - else - { - // Find empty location and lock it, starting from hint index - for (uint lim = head->m_limit, i = (lim > last ? 0 : lim);; i = (i == last ? 0 : i + 1)) - { - block = reinterpret_cast(head->m_ptr + usz{i} * head->m_ssize); - - if (block->m_type == 0 && block->m_mutex.try_lock()) - { - if (block->m_type == 0) [[likely]] - { - break; - } - - block->m_mutex.unlock(); - } - } - } - } - else - { - // Always access first element - block = reinterpret_cast(head->m_ptr); - - if constexpr (std::is_same_v) - { - static_assert(!std::is_const_v>); - static_assert(!std::is_volatile_v>); - - if (block->m_type != 0 || !block->m_mutex.try_lock()) - { - block = nullptr; - } - else if (block->m_type != 0) [[unlikely]] - { - block->m_mutex.unlock(); - block = nullptr; - } - } - } - } - else if constexpr (std::is_invocable_r_v) - { - // Access with a lookup function - for (usz j = 0; j < (typeinfo_count::max_count != 1 ? +head->m_limit : 1); j++) - { - block = reinterpret_cast(head->m_ptr + j * head->m_ssize); - - if (block->m_type) - { - std::lock_guard lock(block->m_mutex); - - if (block->m_type) - { - if (std::invoke(std::forward(id), std::as_const(*block->get_ptr()))) - { - break; - } - } - } - - block = nullptr; - } - } - else - { - // Access by transformed id - constexpr uint bias = typeinfo_bias::bias; - constexpr uint step = typeinfo_step::step; - const uint unbiased = static_cast(std::forward(id)) - bias; - const uint unscaled = unbiased / step; - - block = reinterpret_cast(head->m_ptr + usz{head->m_ssize} * unscaled); - - // Check id range and type - if (unscaled >= typeinfo_count::max_count || unbiased % step) [[unlikely]] - { - block = nullptr; - } - else - { - if (block->m_type == 0) [[unlikely]] - { - block = nullptr; - } - } - } - - typeptr_base result; - result.m_head = head; - result.m_block = block; - return result; - } - - template - void check_ptr(typemap_block*& block, Arg&& id) const - { - using id_tag = std::decay_t; - - if constexpr (std::is_same_v) - { - // No action for id_new - return; - } - else if constexpr (std::is_same_v) - { - // No action for id_any - return; - } - else if constexpr (std::is_same_v) - { - if (block->m_type == 0 && block->m_type.compare_and_swap_test(0, 1)) - { - // Initialize object if necessary - static_assert(!std::is_const_v>); - static_assert(!std::is_volatile_v>); - new (block->get_ptr) Type(); - } - - return; - } - else if constexpr (std::is_invocable_r_v) - { - if (!block) [[unlikely]] - { - return; - } - - if (block->m_type) [[likely]] - { - if (std::invoke(std::forward(id), std::as_const(*block->get_ptr()))) - { - return; - } - } - } - else if (block) - { - if (block->m_type) [[likely]] - { - return; - } - } - else - { - return; - } - - // Fallback: unlock and invalidate - block->m_mutex.unlock(); - block = nullptr; - } - - template - bool lock_ptr(typemap_block* block) const - { - // Use reader lock for const access - constexpr bool is_const = std::is_const_v>; - constexpr bool is_volatile = std::is_volatile_v>; - - // Already locked or lock is unnecessary - if constexpr (!Lock) - { - return true; - } - else - { - // Skip failed ids - if (!block) - { - return true; - } - - if constexpr (Try) - { - if constexpr (is_const || is_volatile) - { - return block->m_mutex.try_lock_shared(); - } - else - { - return block->m_mutex.try_lock(); - } - } - else if constexpr (is_const || is_volatile) - { - if (block->m_mutex.is_lockable()) [[likely]] - { - return true; - } - - block->m_mutex.lock_shared(); - return false; - } - else - { - if (block->m_mutex.is_free()) [[likely]] - { - return true; - } - - block->m_mutex.lock(); - return false; - } - } - } - - template - bool try_lock(const std::array& array, uint locked, std::integer_sequence) const - { - // Try to lock mutex if not locked from the previous step - if (I == locked || lock_ptr(array[I].m_block)) - { - if constexpr (I + 1 < N) - { - // Proceed recursively - if (try_lock(array, locked, std::integer_sequence{})) [[likely]] - { - return true; - } - - // Retire: unlock everything, including (I == locked) case - if constexpr (Lock) - { - if (array[I].m_block) - { - if constexpr (std::is_const_v> || std::is_volatile_v>) - { - array[I].m_block->m_mutex.unlock_shared(); - } - else - { - array[I].m_block->m_mutex.unlock(); - } - } - } - } - else - { - return true; - } - } - - return false; - } - - template - uint lock_array(const std::array& array, std::integer_sequence, std::integer_sequence) const - { - // Verify all mutexes are free or wait for one of them and return its index - uint locked = 0; - ((lock_ptr(array[I].m_block) && ++locked) && ...); - return locked; - } - - template - void check_array(std::array& array, std::integer_sequence, Args&&... ids) const - { - // Check types and unlock on mismatch - (check_ptr(array[I].m_block, std::forward(ids)), ...); - } - - template - std::tuple...> array_to_tuple(const std::array& array, std::integer_sequence) const - { - return {array[I]...}; - } - - template - static constexpr bool does_need_lock() - { - if constexpr (std::is_same_v, id_new_t>) - { - return false; - } - - if constexpr (std::is_const_v> && std::is_volatile_v>) - { - return false; - } - - return true; - } - - // Transform T&& into refptr, moving const qualifier from T to refptr - template > - using decode_t = std::conditional_t, T, - std::conditional_t, const refptr>, refptr>>; - - public: - // Lock any objects by their identifiers, special tags id_new/id_any/id_always, or search predicates - template > - auto lock(Args&&... ids) const - { - static_assert(((!std::is_lvalue_reference_v) && ...)); - static_assert(((!std::is_array_v) && ...)); - static_assert(((!std::is_void_v) && ...)); - - // Initialize pointers - std::array result{this->init_ptr>(std::forward(ids))...}; - - // Whether requires locking after init_ptr - using locks_t = std::integer_sequence, Args>()...>; - - // Array index helper - using seq_t = std::index_sequence_for...>; - - // Lock any number of objects in safe manner - while (true) - { - const uint locked = lock_array...>(result, seq_t{}, locks_t{}); - if (try_lock<0, decode_t...>(result, locked, locks_t{})) [[likely]] - break; - } - - // Verify object types - check_array...>(result, seq_t{}, std::forward(ids)...); - - // Return tuple of possibly locked pointers, or a single pointer - if constexpr (sizeof...(Types) != 1) - { - return array_to_tuple...>(result, seq_t{}); - } - else - { - return typeptr...>(result[0]); - } - } - - // Apply a function to all objects of one or more types - template - ullong apply(F&& func) - { - static_assert(!std::is_lvalue_reference_v); - static_assert(!std::is_array_v); - static_assert(!std::is_void_v); - - typemap_head* head = get_head>(); - - const ullong ix = head->m_create_count; - - for (usz j = 0; j < (typeinfo_count>::max_count != 1 ? +head->m_limit : 1); j++) - { - const auto block = reinterpret_cast(head->m_ptr + j * head->m_ssize); - - if (block->m_type) - { - std::lock_guard lock(block->m_mutex); - - if (block->m_type) - { - std::invoke(std::forward(func), *block->get_ptr>()); - } - } - } - - // Return "unsigned negative" value if the creation index has increased - const ullong result = ix - head->m_create_count; - - if constexpr (sizeof...(Types) > 0) - { - return (result + ... + apply(func)); - } - else - { - return result; - } - } - - template - ullong get_create_count() const - { - return get_head()->m_create_count; - } - - template - ullong get_destroy_count() const - { - return get_head()->m_destroy_count; - } - }; -} // namespace utils diff --git a/rpcs3/Crypto/unedat.cpp b/rpcs3/Crypto/unedat.cpp index 50aab0aa32..3ec0d3c388 100644 --- a/rpcs3/Crypto/unedat.cpp +++ b/rpcs3/Crypto/unedat.cpp @@ -5,7 +5,6 @@ #include "Utilities/mutex.h" #include -#include "util/v128.hpp" #include "util/asm.hpp" LOG_CHANNEL(edat_log, "EDAT"); @@ -138,15 +137,15 @@ std::tuple dec_section(unsigned char* metadata) return std::make_tuple(offset, length, compression_end); } -v128 get_block_key(int block, NPD_HEADER *npd) +u128 get_block_key(int block, NPD_HEADER *npd) { unsigned char empty_key[0x10] = {}; unsigned char *src_key = (npd->version <= 1) ? empty_key : npd->dev_hash; - v128 dest_key{}; - memcpy(dest_key._bytes, src_key, 0xC); + u128 dest_key{}; + std::memcpy(&dest_key, src_key, 0xC); s32 swappedBlock = swap32(block); - memcpy(&dest_key._bytes[0xC], &swappedBlock, sizeof(swappedBlock)); + std::memcpy(reinterpret_cast(&dest_key) + 0xC, &swappedBlock, sizeof(swappedBlock)); return dest_key; } @@ -251,7 +250,7 @@ s64 decrypt_block(const fs::file* in, u8* out, EDAT_HEADER *edat, NPD_HEADER *np auto b_key = get_block_key(block_num, npd); // Encrypt the block key with the crypto key. - aesecb128_encrypt(crypt_key, b_key._bytes, key_result); + aesecb128_encrypt(crypt_key, reinterpret_cast(&b_key), key_result); if ((edat->flags & EDAT_FLAG_0x10) != 0) aesecb128_encrypt(crypt_key, key_result, hash); // If FLAG 0x10 is set, encrypt again to get the final hash. else @@ -556,9 +555,10 @@ int validate_dev_klic(const u8* klicensee, NPD_HEADER *npd) memcpy(dev + 0xC, &type, 4); // Check for an empty dev_hash (can't validate if devklic is NULL); - auto klic = v128::loadu(klicensee); + u128 klic; + std::memcpy(&klic, klicensee, sizeof(klic)); - if (klic == v128{}) + if (!klic) { // Allow empty dev hash. return 1; @@ -566,10 +566,10 @@ int validate_dev_klic(const u8* klicensee, NPD_HEADER *npd) else { // Generate klicensee xor key. - auto key = klic ^ std::bit_cast(NP_OMAC_KEY_2); + u128 key = klic ^ std::bit_cast(NP_OMAC_KEY_2); // Hash with generated key and compare with dev_hash. - return cmac_hash_compare(key._bytes, 0x10, dev, 0x60, npd->dev_hash, 0x10); + return cmac_hash_compare(reinterpret_cast(&key), 0x10, dev, 0x60, npd->dev_hash, 0x10); } } @@ -668,7 +668,7 @@ bool extract_all_data(const fs::file* input, const fs::file* output, const char* } // Set decryption key. - v128 key{}; + u128 key{}; // Check EDAT/SDAT flag. if ((EDAT.flags & SDAT_FLAG) == SDAT_FLAG) @@ -682,7 +682,7 @@ bool extract_all_data(const fs::file* input, const fs::file* output, const char* } // Generate SDAT key. - key = std::bit_cast(NPD.dev_hash) ^ std::bit_cast(SDAT_KEY); + key = std::bit_cast(NPD.dev_hash) ^ std::bit_cast(SDAT_KEY); } else { @@ -715,7 +715,7 @@ bool extract_all_data(const fs::file* input, const fs::file* output, const char* memcpy(&key, rifkey, 0x10); // Make sure we don't have an empty RIF key. - if (key == v128{}) + if (!key) { edat_log.error("EDAT: A valid RAP file is needed for this EDAT file! (local activation)"); return 1; @@ -726,7 +726,7 @@ bool extract_all_data(const fs::file* input, const fs::file* output, const char* memcpy(&key, rifkey, 0x10); // Make sure we don't have an empty RIF key. - if (key == v128{}) + if (!key) { edat_log.error("EDAT: A valid RAP file is needed for this EDAT file! (network activation)"); return 1; @@ -735,7 +735,7 @@ bool extract_all_data(const fs::file* input, const fs::file* output, const char* if (verbose) { - be_t data; + be_t data; std::memcpy(&data, devklic, sizeof(data)); edat_log.notice("DEVKLIC: %s", data); @@ -746,18 +746,18 @@ bool extract_all_data(const fs::file* input, const fs::file* output, const char* if (verbose) { - edat_log.notice("DECRYPTION KEY: %s", std::bit_cast>(key)); + edat_log.notice("DECRYPTION KEY: %s", std::bit_cast>(key)); } input->seek(0); - if (check_data(key._bytes, &EDAT, &NPD, input, verbose)) + if (check_data(reinterpret_cast(&key), &EDAT, &NPD, input, verbose)) { edat_log.error("EDAT: Data parsing failed!"); return 1; } input->seek(0); - if (decrypt_data(input, output, &EDAT, &NPD, key._bytes, verbose)) + if (decrypt_data(input, output, &EDAT, &NPD, reinterpret_cast(&key), verbose)) { edat_log.error("EDAT: Data decryption failed!"); return 1; @@ -766,14 +766,14 @@ bool extract_all_data(const fs::file* input, const fs::file* output, const char* return 0; } -v128 GetEdatRifKeyFromRapFile(const fs::file& rap_file) +u128 GetEdatRifKeyFromRapFile(const fs::file& rap_file) { - v128 rapkey{}; - v128 rifkey{}; + u128 rapkey{}; + u128 rifkey{}; - rap_file.read(rapkey); + rap_file.read(rapkey); - rap_to_rif(rapkey._bytes, rifkey._bytes); + rap_to_rif(reinterpret_cast(&rapkey), reinterpret_cast(&rifkey)); return rifkey; } @@ -824,8 +824,8 @@ fs::file DecryptEDAT(const fs::file& input, const std::string& input_file_name, input.seek(0); // Set keys (RIF and DEVKLIC). - v128 rifKey{}; - v128 devklic{}; + u128 rifKey{}; + u128 devklic{}; // Select the EDAT key mode. switch (mode) @@ -879,7 +879,7 @@ fs::file DecryptEDAT(const fs::file& input, const std::string& input_file_name, // Delete the bad output file if any errors arise. fs::file output = fs::make_stream>(); - if (extract_all_data(&input, &output, input_file_name.c_str(), devklic._bytes, rifKey._bytes, verbose)) + if (extract_all_data(&input, &output, input_file_name.c_str(), reinterpret_cast(&devklic), reinterpret_cast(&rifKey), verbose)) { output.release(); return fs::file{}; @@ -905,12 +905,12 @@ bool EDATADecrypter::ReadHeader() if ((edatHeader.flags & SDAT_FLAG) == SDAT_FLAG) { // Generate SDAT key. - dec_key = std::bit_cast(npdHeader.dev_hash) ^ std::bit_cast(SDAT_KEY); + dec_key = std::bit_cast(npdHeader.dev_hash) ^ std::bit_cast(SDAT_KEY); } else { // verify key - if (validate_dev_klic(dev_key._bytes, &npdHeader) == 0) + if (validate_dev_klic(reinterpret_cast(&dev_key), &npdHeader) == 0) { edat_log.error("EDAT: Failed validating klic"); return false; @@ -923,7 +923,7 @@ bool EDATADecrypter::ReadHeader() { dec_key = std::move(rif_key); - if (dec_key == v128{}) + if (!dec_key) { edat_log.warning("EDAT: Empty Dec key for local activation!"); } @@ -932,7 +932,7 @@ bool EDATADecrypter::ReadHeader() { dec_key = std::move(rif_key); - if (dec_key == v128{}) + if (!dec_key) { edat_log.warning("EDAT: Empty Dec key for network activation!"); } @@ -978,7 +978,7 @@ u64 EDATADecrypter::ReadData(u64 pos, u8* data, u64 size) for (u32 i = starting_block; i < ending_block; ++i) { edata_file.seek(0); - u64 res = decrypt_block(&edata_file, &data_buf[writeOffset], &edatHeader, &npdHeader, dec_key._bytes, i, total_blocks, edatHeader.file_size); + u64 res = decrypt_block(&edata_file, &data_buf[writeOffset], &edatHeader, &npdHeader, reinterpret_cast(&dec_key), i, total_blocks, edatHeader.file_size); if (res == umax) { edat_log.error("Error Decrypting data"); diff --git a/rpcs3/Crypto/unedat.h b/rpcs3/Crypto/unedat.h index e4e9a5a91d..312baef986 100644 --- a/rpcs3/Crypto/unedat.h +++ b/rpcs3/Crypto/unedat.h @@ -6,8 +6,6 @@ #include "Utilities/File.h" -#include "util/v128.hpp" - constexpr u32 SDAT_FLAG = 0x01000000; constexpr u32 EDAT_COMPRESSED_FLAG = 0x00000001; constexpr u32 EDAT_FLAG_0x02 = 0x00000002; @@ -18,8 +16,8 @@ constexpr u32 EDAT_DEBUG_DATA_FLAG = 0x80000000; struct loaded_npdrm_keys { - atomic_t devKlic{}; - atomic_t rifKey{}; + atomic_t devKlic{}; + atomic_t rifKey{}; atomic_t npdrm_fds{0}; }; @@ -49,7 +47,7 @@ extern fs::file DecryptEDAT(const fs::file& input, const std::string& input_file extern bool VerifyEDATHeaderWithKLicense(const fs::file& input, const std::string& input_file_name, const u8* custom_klic, std::string* contentID); -v128 GetEdatRifKeyFromRapFile(const fs::file& rap_file); +u128 GetEdatRifKeyFromRapFile(const fs::file& rap_file); struct EDATADecrypter final : fs::file_base { @@ -66,18 +64,20 @@ struct EDATADecrypter final : fs::file_base std::unique_ptr data_buf; u64 data_buf_size{0}; - v128 dec_key{}; + u128 dec_key{}; // edat usage - v128 rif_key{}; - v128 dev_key{}; + u128 rif_key{}; + u128 dev_key{}; public: // SdataByFd usage EDATADecrypter(fs::file&& input) : edata_file(std::move(input)) {} // Edat usage - EDATADecrypter(fs::file&& input, const v128& dev_key, const v128& rif_key) - : edata_file(std::move(input)), rif_key(rif_key), dev_key(dev_key) {} + EDATADecrypter(fs::file&& input, const u128& dev_key, const u128& rif_key) + : edata_file(std::move(input)) + , rif_key(rif_key) + , dev_key(dev_key) {} ~EDATADecrypter() override {} // false if invalid diff --git a/rpcs3/Crypto/unself.cpp b/rpcs3/Crypto/unself.cpp index ad1dba01c7..54a70b7615 100644 --- a/rpcs3/Crypto/unself.cpp +++ b/rpcs3/Crypto/unself.cpp @@ -9,8 +9,6 @@ #include #include -#include "util/v128.hpp" - inline u8 Read8(const fs::file& f) { u8 ret; @@ -1489,7 +1487,7 @@ bool verify_npdrm_self_headers(const fs::file& self, u8* klic_key) return true; } -v128 get_default_self_klic() +u128 get_default_self_klic() { - return std::bit_cast(NP_KLIC_FREE); + return std::bit_cast(NP_KLIC_FREE); } diff --git a/rpcs3/Crypto/unself.h b/rpcs3/Crypto/unself.h index 382b739e3b..3f3ef6dd21 100644 --- a/rpcs3/Crypto/unself.h +++ b/rpcs3/Crypto/unself.h @@ -509,5 +509,4 @@ private: fs::file decrypt_self(fs::file elf_or_self, u8* klic_key = nullptr, SelfAdditionalInfo* additional_info = nullptr); bool verify_npdrm_self_headers(const fs::file& self, u8* klic_key = nullptr); -union v128; -v128 get_default_self_klic(); +u128 get_default_self_klic(); diff --git a/rpcs3/Emu/CMakeLists.txt b/rpcs3/Emu/CMakeLists.txt index df0dbe6429..96dfb1e995 100644 --- a/rpcs3/Emu/CMakeLists.txt +++ b/rpcs3/Emu/CMakeLists.txt @@ -33,7 +33,6 @@ target_include_directories(rpcs3_emu # Utilities target_sources(rpcs3_emu PRIVATE ../util/atomic.cpp - ../util/atomic2.cpp ../util/fixed_typemap.cpp ../util/logs.cpp ../util/yaml.cpp diff --git a/rpcs3/Emu/CPU/CPUThread.cpp b/rpcs3/Emu/CPU/CPUThread.cpp index d01ff98244..5feaddaa11 100644 --- a/rpcs3/Emu/CPU/CPUThread.cpp +++ b/rpcs3/Emu/CPU/CPUThread.cpp @@ -15,6 +15,9 @@ #include #include +#include +#include + DECLARE(cpu_thread::g_threads_created){0}; DECLARE(cpu_thread::g_threads_deleted){0}; DECLARE(cpu_thread::g_suspend_counter){0}; @@ -938,7 +941,7 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this) noexcept break; } - _mm_pause(); + utils::pause(); } // Second increment: all threads paused diff --git a/rpcs3/Emu/CPU/CPUTranslator.cpp b/rpcs3/Emu/CPU/CPUTranslator.cpp index 1e733a1835..c0320c54d5 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.cpp +++ b/rpcs3/Emu/CPU/CPUTranslator.cpp @@ -2,6 +2,9 @@ #include "CPUTranslator.h" +#include "util/v128.hpp" +#include "util/v128sse.hpp" + llvm::LLVMContext g_llvm_ctx; cpu_translator::cpu_translator(llvm::Module* _module, bool is_be) diff --git a/rpcs3/Emu/Cell/Modules/cellAudio.cpp b/rpcs3/Emu/Cell/Modules/cellAudio.cpp index ec813ff14a..a0fd3dd124 100644 --- a/rpcs3/Emu/Cell/Modules/cellAudio.cpp +++ b/rpcs3/Emu/Cell/Modules/cellAudio.cpp @@ -5,6 +5,9 @@ #include "Emu/Cell/lv2/sys_process.h" #include "Emu/Cell/lv2/sys_event.h" #include "cellAudio.h" + +#include "emmintrin.h" +#include "immintrin.h" #include LOG_CHANNEL(cellAudio); diff --git a/rpcs3/Emu/Cell/Modules/cellSaveData.h b/rpcs3/Emu/Cell/Modules/cellSaveData.h index 2cddfc43ae..71e55d3803 100644 --- a/rpcs3/Emu/Cell/Modules/cellSaveData.h +++ b/rpcs3/Emu/Cell/Modules/cellSaveData.h @@ -3,8 +3,6 @@ #include "stdafx.h" #include -#include "util/v128.hpp" - // Return codes enum CellSaveDataError : u32 { @@ -300,7 +298,7 @@ struct CellSaveDataFileSet be_t fileOperation; vm::bptr reserved; be_t fileType; - be_t secureFileId; + be_t secureFileId; vm::bptr fileName; be_t fileOffset; be_t fileSize; diff --git a/rpcs3/Emu/Cell/Modules/cellSpurs.cpp b/rpcs3/Emu/Cell/Modules/cellSpurs.cpp index f79e8f0bec..e8f43583fb 100644 --- a/rpcs3/Emu/Cell/Modules/cellSpurs.cpp +++ b/rpcs3/Emu/Cell/Modules/cellSpurs.cpp @@ -16,6 +16,7 @@ #include "cellSpurs.h" #include "util/v128.hpp" +#include "util/v128sse.hpp" LOG_CHANNEL(cellSpurs); diff --git a/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp b/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp index 7e5f43afc6..a279db99fb 100644 --- a/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp +++ b/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp @@ -14,6 +14,7 @@ #include #include "util/v128.hpp" +#include "util/v128sse.hpp" LOG_CHANNEL(cellSpurs); diff --git a/rpcs3/Emu/Cell/Modules/sceNp.cpp b/rpcs3/Emu/Cell/Modules/sceNp.cpp index b7b1c9bd99..733ffbbaa7 100644 --- a/rpcs3/Emu/Cell/Modules/sceNp.cpp +++ b/rpcs3/Emu/Cell/Modules/sceNp.cpp @@ -15,8 +15,6 @@ #include "Emu/NP/np_handler.h" #include "Emu/NP/np_contexts.h" -#include "util/v128.hpp" - LOG_CHANNEL(sceNp); template <> @@ -447,12 +445,12 @@ error_code sceNpTerm() error_code npDrmIsAvailable(vm::cptr k_licensee_addr, vm::cptr drm_path) { - v128 k_licensee{}; + u128 k_licensee{}; if (k_licensee_addr) { std::memcpy(&k_licensee, k_licensee_addr.get_ptr(), sizeof(k_licensee)); - sceNp.notice("npDrmIsAvailable(): KLicense key %s", std::bit_cast>(k_licensee)); + sceNp.notice("npDrmIsAvailable(): KLicense key %s", std::bit_cast>(k_licensee)); } if (Emu.GetCat() == "PE") @@ -488,7 +486,7 @@ error_code npDrmIsAvailable(vm::cptr k_licensee_addr, vm::cptr drm_pat if (!k_licensee_addr) k_licensee = get_default_self_klic(); - if (verify_npdrm_self_headers(enc_file, k_licensee._bytes)) + if (verify_npdrm_self_headers(enc_file, reinterpret_cast(&k_licensee))) { npdrmkeys->devKlic = k_licensee; } @@ -504,7 +502,7 @@ error_code npDrmIsAvailable(vm::cptr k_licensee_addr, vm::cptr drm_pat std::string contentID; - if (VerifyEDATHeaderWithKLicense(enc_file, enc_drm_path_local, k_licensee._bytes, &contentID)) + if (VerifyEDATHeaderWithKLicense(enc_file, enc_drm_path_local, reinterpret_cast(&k_licensee), &contentID)) { const std::string rap_file = rap_dir_path + contentID + ".rap"; npdrmkeys->devKlic = k_licensee; diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp index c2450c1ff9..85cedb234c 100644 --- a/rpcs3/Emu/Cell/PPUInterpreter.cpp +++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp @@ -12,6 +12,7 @@ #include "util/asm.hpp" #include "util/v128.hpp" +#include "util/v128sse.hpp" #include "util/sysinfo.hpp" #if !defined(_MSC_VER) && defined(__clang__) diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 4ce1408dd0..f1b3e1e45e 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -66,6 +66,8 @@ #include "util/asm.hpp" #include "util/vm.hpp" #include "util/v128.hpp" +#include "util/v128sse.hpp" +#include "util/sysinfo.hpp" const bool s_use_ssse3 = utils::has_ssse3(); diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp index c38c0881d7..49d526ba28 100644 --- a/rpcs3/Emu/Cell/PPUTranslator.cpp +++ b/rpcs3/Emu/Cell/PPUTranslator.cpp @@ -9,6 +9,7 @@ #include "util/endian.hpp" #include "util/logs.hpp" #include "util/v128.hpp" +#include "util/v128sse.hpp" #include using namespace llvm; diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp index 0b5e180302..d157034137 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp @@ -12,6 +12,7 @@ #include "util/asm.hpp" #include "util/v128.hpp" +#include "util/v128sse.hpp" #include "util/sysinfo.hpp" #include @@ -959,7 +960,7 @@ spu_recompiler::XmmLink spu_recompiler::XmmGet(s8 reg, XmmType type) // get xmm return result; } -inline asmjit::X86Mem spu_recompiler::XmmConst(v128 data) +inline asmjit::X86Mem spu_recompiler::XmmConst(const v128& data) { // Find existing const auto& xmm_label = xmm_consts[std::make_pair(data._u64[0], data._u64[1])]; @@ -980,12 +981,12 @@ inline asmjit::X86Mem spu_recompiler::XmmConst(v128 data) return asmjit::x86::oword_ptr(xmm_label); } -inline asmjit::X86Mem spu_recompiler::XmmConst(__m128 data) +inline asmjit::X86Mem spu_recompiler::XmmConst(const __m128& data) { return XmmConst(v128::fromF(data)); } -inline asmjit::X86Mem spu_recompiler::XmmConst(__m128i data) +inline asmjit::X86Mem spu_recompiler::XmmConst(const __m128i& data) { return XmmConst(v128::fromV(data)); } diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h index 1ed1278333..7d5073d02f 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h @@ -5,7 +5,7 @@ #include -#include "util/v128.hpp" +union v128; // SPU ASMJIT Recompiler class spu_recompiler : public spu_recompiler_base @@ -87,9 +87,9 @@ private: XmmLink XmmAlloc(); XmmLink XmmGet(s8 reg, XmmType type); - asmjit::X86Mem XmmConst(v128 data); - asmjit::X86Mem XmmConst(__m128 data); - asmjit::X86Mem XmmConst(__m128i data); + asmjit::X86Mem XmmConst(const v128& data); + asmjit::X86Mem XmmConst(const __m128& data); + asmjit::X86Mem XmmConst(const __m128i& data); asmjit::X86Mem get_pc(u32 addr); void branch_fixed(u32 target, bool absolute = false); diff --git a/rpcs3/Emu/Cell/SPUDisAsm.cpp b/rpcs3/Emu/Cell/SPUDisAsm.cpp index 5f1fdd4a5e..3fefdc911c 100644 --- a/rpcs3/Emu/Cell/SPUDisAsm.cpp +++ b/rpcs3/Emu/Cell/SPUDisAsm.cpp @@ -8,6 +8,7 @@ const spu_decoder s_spu_itype; const spu_decoder s_spu_iflag; #include "util/v128.hpp" +#include "util/v128sse.hpp" u32 SPUDisAsm::disasm(u32 pc) { @@ -161,7 +162,7 @@ std::pair SPUDisAsm::try_get_const_value(u32 reg, u32 pc) const return {}; } -typename SPUDisAsm::insert_mask_info SPUDisAsm::try_get_insert_mask_info(v128 mask) +typename SPUDisAsm::insert_mask_info SPUDisAsm::try_get_insert_mask_info(const v128& mask) { if ((mask & v128::from8p(0xe0)) != v128{}) { @@ -302,3 +303,29 @@ void SPUDisAsm::IOHL(spu_opcode_t op) DisAsm("iohl", spu_reg_name[op.rt], op.i16); } + +void SPUDisAsm::SHUFB(spu_opcode_t op) +{ + const auto [is_const, value] = try_get_const_value(op.rc); + + if (is_const) + { + const auto [size, dst, src] = try_get_insert_mask_info(value); + + if (size) + { + if ((size >= 4u && !src) || (size == 2u && src == 1u) || (size == 1u && src == 3u)) + { + // Comment insertion pattern for CWD-alike instruction + DisAsm("shufb", spu_reg_name[op.rt4], spu_reg_name[op.ra], spu_reg_name[op.rb], fmt::format("%s #i%u[%u]", spu_reg_name[op.rc], size * 8, dst).c_str()); + return; + } + + // Comment insertion pattern for unknown instruction formations + DisAsm("shufb", spu_reg_name[op.rt4], spu_reg_name[op.ra], spu_reg_name[op.rb], fmt::format("%s #i%u[%u] = [%u]", spu_reg_name[op.rc], size * 8, dst, src).c_str()); + return; + } + } + + DisAsm("shufb", spu_reg_name[op.rt4], spu_reg_name[op.ra], spu_reg_name[op.rb], spu_reg_name[op.rc]); +} diff --git a/rpcs3/Emu/Cell/SPUDisAsm.h b/rpcs3/Emu/Cell/SPUDisAsm.h index 9ec1ff22fd..2edaac9aee 100644 --- a/rpcs3/Emu/Cell/SPUDisAsm.h +++ b/rpcs3/Emu/Cell/SPUDisAsm.h @@ -3,7 +3,7 @@ #include "PPCDisAsm.h" #include "SPUOpcodes.h" -#include "util/v128.hpp" +union v128; static constexpr const char* spu_reg_name[128] = { @@ -172,7 +172,7 @@ public: u32 src_index; }; - static insert_mask_info try_get_insert_mask_info(v128 mask); + static insert_mask_info try_get_insert_mask_info(const v128& mask); //0 - 10 void STOP(spu_opcode_t op) @@ -972,31 +972,7 @@ public: { DisAsm("selb", spu_reg_name[op.rt4], spu_reg_name[op.ra], spu_reg_name[op.rb], spu_reg_name[op.rc]); } - void SHUFB(spu_opcode_t op) - { - const auto [is_const, value] = try_get_const_value(op.rc); - - if (is_const) - { - const auto [size, dst, src] = try_get_insert_mask_info(value); - - if (size) - { - if ((size >= 4u && !src) || (size == 2u && src == 1u) || (size == 1u && src == 3u)) - { - // Comment insertion pattern for CWD-alike instruction - DisAsm("shufb", spu_reg_name[op.rt4], spu_reg_name[op.ra], spu_reg_name[op.rb], fmt::format("%s #i%u[%u]", spu_reg_name[op.rc], size * 8, dst).c_str()); - return; - } - - // Comment insertion pattern for unknown instruction formations - DisAsm("shufb", spu_reg_name[op.rt4], spu_reg_name[op.ra], spu_reg_name[op.rb], fmt::format("%s #i%u[%u] = [%u]", spu_reg_name[op.rc], size * 8, dst, src).c_str()); - return; - } - } - - DisAsm("shufb", spu_reg_name[op.rt4], spu_reg_name[op.ra], spu_reg_name[op.rb], spu_reg_name[op.rc]); - } + void SHUFB(spu_opcode_t op); void MPYA(spu_opcode_t op) { DisAsm("mpya", spu_reg_name[op.rt4], spu_reg_name[op.ra], spu_reg_name[op.rb], spu_reg_name[op.rc]); diff --git a/rpcs3/Emu/Cell/SPUInterpreter.cpp b/rpcs3/Emu/Cell/SPUInterpreter.cpp index 560dcdedf0..e4e701b440 100644 --- a/rpcs3/Emu/Cell/SPUInterpreter.cpp +++ b/rpcs3/Emu/Cell/SPUInterpreter.cpp @@ -7,6 +7,7 @@ #include "util/asm.hpp" #include "util/v128.hpp" +#include "util/v128sse.hpp" #include "util/sysinfo.hpp" #include diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index 0dcf9b565e..6ed3b005ef 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -18,6 +18,7 @@ #include #include "util/v128.hpp" +#include "util/v128sse.hpp" #include "util/sysinfo.hpp" extern atomic_t g_progr; diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index c252e16013..dfa1befbcb 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -31,6 +31,7 @@ #include "util/vm.hpp" #include "util/asm.hpp" #include "util/v128.hpp" +#include "util/v128sse.hpp" #include "util/sysinfo.hpp" using spu_rdata_t = decltype(spu_thread::rdata); @@ -1558,7 +1559,7 @@ void spu_thread::cpu_return() for (u32 status; !thread->exit_status.try_read(status) || status != thread->last_exit_status;) { - _mm_pause(); + utils::pause(); } } } diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h index 97a5aa14b2..784ce5c07f 100644 --- a/rpcs3/Emu/Cell/SPUThread.h +++ b/rpcs3/Emu/Cell/SPUThread.h @@ -504,7 +504,7 @@ struct spu_imm_table_t public: scale_table_t(); - FORCE_INLINE __m128 operator [] (s32 scale) const + FORCE_INLINE const auto& operator [](s32 scale) const { return m_data[scale + 155].vf; } diff --git a/rpcs3/Emu/Cell/lv2/sys_overlay.cpp b/rpcs3/Emu/Cell/lv2/sys_overlay.cpp index 4eb1213a58..da0a1f5921 100644 --- a/rpcs3/Emu/Cell/lv2/sys_overlay.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_overlay.cpp @@ -32,7 +32,9 @@ static error_code overlay_load_module(vm::ptr ovlmid, const std::string& vp src = std::move(lv2_file); } - const ppu_exec_object obj = decrypt_self(std::move(src), g_fxo->get()->devKlic.load()._bytes); + u128 klic = g_fxo->get()->devKlic.load(); + + const ppu_exec_object obj = decrypt_self(std::move(src), reinterpret_cast(&klic)); if (obj != elf_error::ok) { diff --git a/rpcs3/Emu/Cell/lv2/sys_process.cpp b/rpcs3/Emu/Cell/lv2/sys_process.cpp index a5956004d8..2ddcaefb3c 100644 --- a/rpcs3/Emu/Cell/lv2/sys_process.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_process.cpp @@ -403,10 +403,9 @@ void _sys_process_exit2(ppu_thread& ppu, s32 status, vm::ptr ar Emu.disc = std::move(disc); Emu.hdd1 = std::move(hdd1); - if (klic != v128{}) + if (klic) { - // TODO: Use std::optional - Emu.klic.assign(std::begin(klic._bytes), std::end(klic._bytes)); + Emu.klic.emplace_back(klic); } Emu.SetForceBoot(true); diff --git a/rpcs3/Emu/Cell/lv2/sys_prx.cpp b/rpcs3/Emu/Cell/lv2/sys_prx.cpp index 146feb15aa..7031a1a89c 100644 --- a/rpcs3/Emu/Cell/lv2/sys_prx.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_prx.cpp @@ -263,7 +263,9 @@ static error_code prx_load_module(const std::string& vpath, u64 flags, vm::ptrget()->devKlic.load()._bytes); + u128 klic = g_fxo->get()->devKlic.load(); + + const ppu_prx_object obj = decrypt_self(std::move(src), reinterpret_cast(&klic)); if (obj != elf_error::ok) { diff --git a/rpcs3/Emu/Cell/lv2/sys_spu.cpp b/rpcs3/Emu/Cell/lv2/sys_spu.cpp index f715335f00..602a846ca0 100644 --- a/rpcs3/Emu/Cell/lv2/sys_spu.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_spu.cpp @@ -251,7 +251,9 @@ error_code sys_spu_image_open(ppu_thread& ppu, vm::ptr img, vm::c return {fs_error, path}; } - const fs::file elf_file = decrypt_self(std::move(file), g_fxo->get()->devKlic.load()._bytes); + u128 klic = g_fxo->get()->devKlic.load(); + + const fs::file elf_file = decrypt_self(std::move(file), reinterpret_cast(&klic)); if (!elf_file) { diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp index 576126c6fb..99b4f029a8 100644 --- a/rpcs3/Emu/Memory/vm.cpp +++ b/rpcs3/Emu/Memory/vm.cpp @@ -323,7 +323,7 @@ namespace vm break; } - _mm_pause(); + utils::pause(); } } @@ -525,7 +525,7 @@ namespace vm break; } - _mm_pause(); + utils::pause(); } for (auto lock = g_locks.cbegin(), end = lock + g_cfg.core.ppu_threads; lock != end; lock++) @@ -533,7 +533,9 @@ namespace vm if (auto ptr = +*lock) { while (!(ptr->state & cpu_flag::wait)) - _mm_pause(); + { + utils::pause(); + } } } } @@ -1606,7 +1608,7 @@ namespace vm case 2: atomic_storage::release(*static_cast(dst), *static_cast(src)); break; case 4: atomic_storage::release(*static_cast(dst), *static_cast(src)); break; case 8: atomic_storage::release(*static_cast(dst), *static_cast(src)); break; - case 16: _mm_store_si128(static_cast<__m128i*>(dst), _mm_loadu_si128(static_cast<__m128i*>(src))); break; + case 16: atomic_storage::release(*static_cast(dst), *static_cast(src)); break; } return true; diff --git a/rpcs3/Emu/Memory/vm_reservation.h b/rpcs3/Emu/Memory/vm_reservation.h index fec751b5c5..7e37e5dcae 100644 --- a/rpcs3/Emu/Memory/vm_reservation.h +++ b/rpcs3/Emu/Memory/vm_reservation.h @@ -8,8 +8,26 @@ extern bool g_use_rtm; extern u64 g_rtm_tx_limit2; +#ifdef _MSC_VER +extern "C" +{ + u64 __rdtsc(); + u32 _xbegin(); + void _xend(); +} +#endif + namespace vm { + inline u64 get_tsc() + { +#ifdef _MSC_VER + return __rdtsc(); +#else + return __builtin_ia32_rdtsc(); +#endif + } + enum : u64 { rsrv_lock_mask = 127, @@ -81,28 +99,28 @@ namespace vm const auto sptr = vm::get_super_ptr(static_cast(ptr.addr())); // Prefetch some data - _m_prefetchw(sptr); - _m_prefetchw(reinterpret_cast(sptr) + 64); + //_m_prefetchw(sptr); + //_m_prefetchw(reinterpret_cast(sptr) + 64); // Use 128-byte aligned addr const u32 addr = static_cast(ptr.addr()) & -128; auto& res = vm::reservation_acquire(addr, 128); - _m_prefetchw(&res); + //_m_prefetchw(&res); if (g_use_rtm) { // Stage 1: single optimistic transaction attempt - unsigned status = _XBEGIN_STARTED; + unsigned status = -1; u64 _old = 0; - auto stamp0 = __rdtsc(), stamp1 = stamp0, stamp2 = stamp0; + auto stamp0 = get_tsc(), stamp1 = stamp0, stamp2 = stamp0; #ifndef _MSC_VER __asm__ goto ("xbegin %l[stage2];" ::: "memory" : stage2); #else status = _xbegin(); - if (status == _XBEGIN_STARTED) + if (status == umax) #endif { if (res & rsrv_unique_lock) @@ -158,16 +176,16 @@ namespace vm #ifndef _MSC_VER __asm__ volatile ("mov %%eax, %0;" : "=r" (status) :: "memory"); #endif - stamp1 = __rdtsc(); + stamp1 = get_tsc(); // Stage 2: try to lock reservation first _old = res.fetch_add(1); // Compute stamps excluding memory touch - stamp2 = __rdtsc() - (stamp1 - stamp0); + stamp2 = get_tsc() - (stamp1 - stamp0); // Start lightened transaction - for (; !(_old & vm::rsrv_unique_lock) && stamp2 - stamp0 <= g_rtm_tx_limit2; stamp2 = __rdtsc()) + for (; !(_old & vm::rsrv_unique_lock) && stamp2 - stamp0 <= g_rtm_tx_limit2; stamp2 = get_tsc()) { if (cpu.has_pause_flag()) { @@ -179,7 +197,7 @@ namespace vm #else status = _xbegin(); - if (status != _XBEGIN_STARTED) [[unlikely]] + if (status != umax) [[unlikely]] { goto retry; } diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp index 6bce437096..c31f8bc8e5 100644 --- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp +++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp @@ -3,10 +3,12 @@ #include "../rsx_methods.h" #include "../RSXThread.h" -#include "util/v128.hpp" #include "util/to_endian.hpp" #include "util/sysinfo.hpp" +#include "emmintrin.h" +#include "immintrin.h" + #define DEBUG_VERTEX_STREAMING 0 #if !defined(_MSC_VER) && defined(__clang__) @@ -166,7 +168,7 @@ namespace const u32 dword_count = size >> 2; const u32 iterations = dword_count >> 2; - v128 bits_diff{}; + __m128i bits_diff = _mm_setzero_si128(); if (s_use_ssse3) [[likely]] { @@ -177,12 +179,12 @@ namespace if constexpr (!unaligned) { - bits_diff = bits_diff | v128::fromV(_mm_xor_si128(_mm_load_si128(dst_ptr), shuffled_vector)); + bits_diff = _mm_or_si128(bits_diff, _mm_xor_si128(_mm_load_si128(dst_ptr), shuffled_vector)); _mm_stream_si128(dst_ptr, shuffled_vector); } else { - bits_diff = bits_diff | v128::fromV(_mm_xor_si128(_mm_loadu_si128(dst_ptr), shuffled_vector)); + bits_diff = _mm_or_si128(bits_diff, _mm_xor_si128(_mm_loadu_si128(dst_ptr), shuffled_vector)); _mm_storeu_si128(dst_ptr, shuffled_vector); } @@ -200,12 +202,12 @@ namespace if constexpr (!unaligned) { - bits_diff = bits_diff | v128::fromV(_mm_xor_si128(_mm_load_si128(dst_ptr), vec2)); + bits_diff = _mm_or_si128(bits_diff, _mm_xor_si128(_mm_load_si128(dst_ptr), vec2)); _mm_stream_si128(dst_ptr, vec2); } else { - bits_diff = bits_diff | v128::fromV(_mm_xor_si128(_mm_loadu_si128(dst_ptr), vec2)); + bits_diff = _mm_or_si128(bits_diff, _mm_xor_si128(_mm_loadu_si128(dst_ptr), vec2)); _mm_storeu_si128(dst_ptr, vec2); } @@ -228,12 +230,12 @@ namespace if (dst_ptr2[i] != data) { dst_ptr2[i] = data; - bits_diff._u32[0] = UINT32_MAX; + bits_diff = _mm_set1_epi64x(-1); } } } - return bits_diff != v128{}; + return _mm_cvtsi128_si64(_mm_packs_epi32(bits_diff, bits_diff)) != 0; } template bool stream_data_to_memory_swapped_and_compare_u32(void *dst, const void *src, u32 size); diff --git a/rpcs3/Emu/RSX/Common/ProgramStateCache.cpp b/rpcs3/Emu/RSX/Common/ProgramStateCache.cpp index 9172d4d6d9..e8bcb1ddae 100644 --- a/rpcs3/Emu/RSX/Common/ProgramStateCache.cpp +++ b/rpcs3/Emu/RSX/Common/ProgramStateCache.cpp @@ -284,7 +284,7 @@ bool vertex_program_compare::operator()(const RSXVertexProgram &binary1, const R { const auto inst1 = v128::loadu(instBuffer1, instIndex); const auto inst2 = v128::loadu(instBuffer2, instIndex); - if (inst1 != inst2) + if (inst1._u ^ inst2._u) { return false; } @@ -475,7 +475,7 @@ bool fragment_program_compare::operator()(const RSXFragmentProgram& binary1, con const auto inst1 = v128::loadu(instBuffer1, instIndex); const auto inst2 = v128::loadu(instBuffer2, instIndex); - if (inst1 != inst2) + if (inst1._u ^ inst2._u) return false; instIndex++; diff --git a/rpcs3/Emu/RSX/Common/ProgramStateCache.h b/rpcs3/Emu/RSX/Common/ProgramStateCache.h index 6d1acd1808..fcfda0b811 100644 --- a/rpcs3/Emu/RSX/Common/ProgramStateCache.h +++ b/rpcs3/Emu/RSX/Common/ProgramStateCache.h @@ -397,62 +397,7 @@ public: std::forward(args)...); // Other arguments } - void fill_fragment_constants_buffer(gsl::span dst_buffer, const RSXFragmentProgram &fragment_program, bool sanitize = false) const - { - const auto I = m_fragment_shader_cache.find(fragment_program); - if (I == m_fragment_shader_cache.end()) - return; - - ensure((dst_buffer.size_bytes() >= ::narrow(I->second.FragmentConstantOffsetCache.size()) * 16u)); - - f32* dst = dst_buffer.data(); - alignas(16) f32 tmp[4]; - for (usz offset_in_fragment_program : I->second.FragmentConstantOffsetCache) - { - char* data = static_cast(fragment_program.get_data()) + offset_in_fragment_program; - const __m128i vector = _mm_loadu_si128(reinterpret_cast<__m128i*>(data)); - const __m128i shuffled_vector = _mm_or_si128(_mm_slli_epi16(vector, 8), _mm_srli_epi16(vector, 8)); - - if (!patch_table.is_empty()) - { - _mm_store_ps(tmp, _mm_castsi128_ps(shuffled_vector)); - bool patched; - - for (int i = 0; i < 4; ++i) - { - patched = false; - for (auto& e : patch_table.db) - { - //TODO: Use fp comparison with fabsf without hurting performance - patched = e.second.test_and_set(tmp[i], &dst[i]); - if (patched) - { - break; - } - } - - if (!patched) - { - dst[i] = tmp[i]; - } - } - } - else if (sanitize) - { - //Convert NaNs and Infs to 0 - const auto masked = _mm_and_si128(shuffled_vector, _mm_set1_epi32(0x7fffffff)); - const auto valid = _mm_cmplt_epi32(masked, _mm_set1_epi32(0x7f800000)); - const auto result = _mm_and_si128(shuffled_vector, valid); - _mm_stream_si128(std::bit_cast<__m128i*>(dst), result); - } - else - { - _mm_stream_si128(std::bit_cast<__m128i*>(dst), shuffled_vector); - } - - dst += 4; - } - } + void fill_fragment_constants_buffer(gsl::span dst_buffer, const RSXFragmentProgram& fragment_program, bool sanitize = false) const; void clear() { diff --git a/rpcs3/Emu/RSX/Common/program_state_cache2.hpp b/rpcs3/Emu/RSX/Common/program_state_cache2.hpp new file mode 100644 index 0000000000..b1828a0e4a --- /dev/null +++ b/rpcs3/Emu/RSX/Common/program_state_cache2.hpp @@ -0,0 +1,64 @@ +#pragma once + +#include "ProgramStateCache.h" + +#include "emmintrin.h" +#include "immintrin.h" + +template +void program_state_cache::fill_fragment_constants_buffer(gsl::span dst_buffer, const RSXFragmentProgram &fragment_program, bool sanitize) const +{ + const auto I = m_fragment_shader_cache.find(fragment_program); + if (I == m_fragment_shader_cache.end()) + return; + + ensure((dst_buffer.size_bytes() >= ::narrow(I->second.FragmentConstantOffsetCache.size()) * 16u)); + + f32* dst = dst_buffer.data(); + alignas(16) f32 tmp[4]; + for (usz offset_in_fragment_program : I->second.FragmentConstantOffsetCache) + { + char* data = static_cast(fragment_program.get_data()) + offset_in_fragment_program; + const __m128i vector = _mm_loadu_si128(reinterpret_cast<__m128i*>(data)); + const __m128i shuffled_vector = _mm_or_si128(_mm_slli_epi16(vector, 8), _mm_srli_epi16(vector, 8)); + + if (!patch_table.is_empty()) + { + _mm_store_ps(tmp, _mm_castsi128_ps(shuffled_vector)); + bool patched; + + for (int i = 0; i < 4; ++i) + { + patched = false; + for (auto& e : patch_table.db) + { + //TODO: Use fp comparison with fabsf without hurting performance + patched = e.second.test_and_set(tmp[i], &dst[i]); + if (patched) + { + break; + } + } + + if (!patched) + { + dst[i] = tmp[i]; + } + } + } + else if (sanitize) + { + //Convert NaNs and Infs to 0 + const auto masked = _mm_and_si128(shuffled_vector, _mm_set1_epi32(0x7fffffff)); + const auto valid = _mm_cmplt_epi32(masked, _mm_set1_epi32(0x7f800000)); + const auto result = _mm_and_si128(shuffled_vector, valid); + _mm_stream_si128(std::bit_cast<__m128i*>(dst), result); + } + else + { + _mm_stream_si128(std::bit_cast<__m128i*>(dst), shuffled_vector); + } + + dst += 4; + } +} diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index ececc28ef6..2eadea354c 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -7,6 +7,8 @@ #include "Emu/Memory/vm_locking.h" #include "Emu/RSX/rsx_methods.h" +#include "../Common/program_state_cache2.hpp" + #define DUMP_VERTEX_DATA 0 u64 GLGSRender::get_cycles() diff --git a/rpcs3/Emu/RSX/Overlays/Shaders/shader_loading_dialog.cpp b/rpcs3/Emu/RSX/Overlays/Shaders/shader_loading_dialog.cpp index 3a2c4b2f47..a100c9a060 100644 --- a/rpcs3/Emu/RSX/Overlays/Shaders/shader_loading_dialog.cpp +++ b/rpcs3/Emu/RSX/Overlays/Shaders/shader_loading_dialog.cpp @@ -3,6 +3,8 @@ #include "Emu/System.h" #include "Emu/Cell/Modules/cellMsgDialog.h" +#include "util/asm.hpp" + namespace rsx { void shader_loading_dialog::create(const std::string& msg, const std::string& title) @@ -27,7 +29,7 @@ namespace rsx while (ref_cnt.load() && !Emu.IsStopped()) { - _mm_pause(); + utils::pause(); } } @@ -87,7 +89,7 @@ namespace rsx { while (ref_cnt.load() && !Emu.IsStopped()) { - _mm_pause(); + utils::pause(); } } } diff --git a/rpcs3/Emu/RSX/RSXOffload.cpp b/rpcs3/Emu/RSX/RSXOffload.cpp index 4c9986c7ac..ed6c710c46 100644 --- a/rpcs3/Emu/RSX/RSXOffload.cpp +++ b/rpcs3/Emu/RSX/RSXOffload.cpp @@ -6,6 +6,7 @@ #include "rsx_utils.h" #include +#include "util/asm.hpp" namespace rsx { @@ -171,13 +172,13 @@ namespace rsx while (_thr->m_enqueued_count.load() > _thr->m_processed_count.load()) { rsxthr->on_semaphore_acquire_wait(); - _mm_pause(); + utils::pause(); } } else { while (_thr->m_enqueued_count.load() > _thr->m_processed_count.load()) - _mm_pause(); + utils::pause(); } return true; diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index d1067f406f..f98cc52bed 100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -862,7 +862,7 @@ namespace rsx for (; t == now; now = get_time_ns()) { - _mm_pause(); + utils::pause(); } timestamp_ctrl = now; @@ -2662,7 +2662,7 @@ namespace rsx for (u32 ea = address >> 20, end = ea + (size >> 20); ea < end; ea++) { - const u32 io = utils::ror32(iomap_table.io[ea], 20); + const u32 io = utils::rol32(iomap_table.io[ea], 32 - 20); if (io + 1) { @@ -2747,7 +2747,7 @@ namespace rsx if (Emu.IsStopped()) break; - _mm_pause(); + utils::pause(); } } @@ -2771,7 +2771,7 @@ namespace rsx while (external_interrupt_lock) { // TODO: Investigate non busy-spinning method - _mm_pause(); + utils::pause(); } external_interrupt_ack.store(false); diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index 5524b9f52a..54f9315705 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -10,6 +10,8 @@ #include "Emu/RSX/rsx_methods.h" #include "Emu/Memory/vm_locking.h" +#include "../Common/program_state_cache2.hpp" + #include "util/asm.hpp" namespace vk @@ -679,7 +681,7 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing) // Wait for deadlock to clear while (m_queue_status & flush_queue_state::deadlock) { - _mm_pause(); + utils::pause(); } g_fxo->get()->clear_mem_fault_flag(); diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h index 9b1fa5d84d..1e53fed275 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.h +++ b/rpcs3/Emu/RSX/VK/VKGSRender.h @@ -300,7 +300,11 @@ namespace vk { while (num_waiters.load() != 0) { +#ifdef _MSC_VER _mm_pause(); +#else + __builtin_ia32_pause(); +#endif } } diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.cpp b/rpcs3/Emu/RSX/VK/VKHelpers.cpp index a3aaad8907..673a54b53d 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.cpp +++ b/rpcs3/Emu/RSX/VK/VKHelpers.cpp @@ -1006,7 +1006,11 @@ namespace vk } //std::this_thread::yield(); +#ifdef _MSC_VER _mm_pause(); +#else + __builtin_ia32_pause(); +#endif } } diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.h b/rpcs3/Emu/RSX/VK/VKHelpers.h index 85c018d6f9..5e67772f0d 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.h +++ b/rpcs3/Emu/RSX/VK/VKHelpers.h @@ -24,6 +24,10 @@ #include "3rdparty/GPUOpen/include/vk_mem_alloc.h" +#ifdef _MSC_VER +extern "C" void _mm_pause(); +#endif + #ifdef __APPLE__ #define VK_DISABLE_COMPONENT_SWIZZLE 1 #else @@ -1231,7 +1235,11 @@ private: { while (!flushed) { +#ifdef _MSC_VER _mm_pause(); +#else + __builtin_ia32_pause(); +#endif } } diff --git a/rpcs3/Emu/System.cpp b/rpcs3/Emu/System.cpp index 139dd909ea..01381f6069 100644 --- a/rpcs3/Emu/System.cpp +++ b/rpcs3/Emu/System.cpp @@ -1564,7 +1564,7 @@ game_boot_result Emulator::Load(const std::string& title_id, bool add_only, bool elf_file.open(decrypted_path); } // Decrypt SELF - else if ((elf_file = decrypt_self(std::move(elf_file), klic.empty() ? nullptr : klic.data(), &g_ps3_process_info.self_info))) + else if ((elf_file = decrypt_self(std::move(elf_file), klic.empty() ? nullptr : reinterpret_cast(&klic[0]), &g_ps3_process_info.self_info))) { if (true) { diff --git a/rpcs3/Emu/System.h b/rpcs3/Emu/System.h index 449b31b350..33de6f011b 100644 --- a/rpcs3/Emu/System.h +++ b/rpcs3/Emu/System.h @@ -124,7 +124,7 @@ public: std::vector argv; std::vector envp; std::vector data; - std::vector klic; + std::vector klic; std::string disc; std::string hdd1; diff --git a/rpcs3/Emu/VFS.cpp b/rpcs3/Emu/VFS.cpp index b8c6ea97ce..dcacef1c42 100644 --- a/rpcs3/Emu/VFS.cpp +++ b/rpcs3/Emu/VFS.cpp @@ -707,7 +707,7 @@ std::string vfs::unescape(std::string_view name) std::string vfs::host::hash_path(const std::string& path, const std::string& dev_root) { - return fmt::format(u8"%s/$%s%s", dev_root, fmt::base57(std::hash()(path)), fmt::base57(__rdtsc())); + return fmt::format(u8"%s/$%s%s", dev_root, fmt::base57(std::hash()(path)), fmt::base57(utils::get_unique_tsc())); } bool vfs::host::rename(const std::string& from, const std::string& to, const lv2_fs_mount_point* mp, bool overwrite) diff --git a/rpcs3/Emu/perf_meter.cpp b/rpcs3/Emu/perf_meter.cpp index 26387c3272..e64fc939e1 100644 --- a/rpcs3/Emu/perf_meter.cpp +++ b/rpcs3/Emu/perf_meter.cpp @@ -1,6 +1,8 @@ -#include "stdafx.h" +#include "stdafx.h" #include "perf_meter.hpp" +#include "util/sysinfo.hpp" + #include #include @@ -65,6 +67,36 @@ void perf_stat_base::print(const char* name) noexcept } } +#ifdef _MSC_VER +extern "C" void _mm_lfence(); +#endif + +SAFE_BUFFERS void perf_stat_base::push(u64 data[66], u64 start_time, const char* name) noexcept +{ + // Event end +#ifdef _MSC_VER + const u64 end_time = (_mm_lfence(), get_tsc()); +#else + const u64 end_time = (__builtin_ia32_lfence(), get_tsc()); +#endif + + // Compute difference in seconds + const f64 diff = (end_time - start_time) * 1. / utils::get_tsc_freq(); + + // Register perf stat in nanoseconds + const u64 ns = static_cast(diff * 1000'000'000.); + + // Print in microseconds + if (static_cast(diff * 1000'000.) >= g_cfg.core.perf_report_threshold) + { + perf_log.notice(u8"%s: %.3fµs", name, diff * 1000'000.); + } + + data[0] += ns != 0; + data[64 - std::countl_zero(ns)]++; + data[65] += ns; +} + static shared_mutex s_perf_mutex; static std::map s_perf_acc; diff --git a/rpcs3/Emu/perf_meter.hpp b/rpcs3/Emu/perf_meter.hpp index 5625d562fb..0c5c762585 100644 --- a/rpcs3/Emu/perf_meter.hpp +++ b/rpcs3/Emu/perf_meter.hpp @@ -7,10 +7,22 @@ #include #include -#include "util/sysinfo.hpp" - LOG_CHANNEL(perf_log, "PERF"); +#ifdef _MSC_VER +extern "C" u64 __rdtsc(); + +inline u64 get_tsc() +{ + return __rdtsc(); +} +#else +inline u64 get_tsc() +{ + return __builtin_ia32_rdtsc(); +} +#endif + // TODO: constexpr with the help of bitcast template inline const auto perf_name = [] @@ -32,6 +44,9 @@ protected: // Accumulate values from a thread void push(u64 ns[66]) noexcept; + // Get end time; accumulate value to the TLS + static void push(u64 ns[66], u64 start_time, const char* name) noexcept; + // Register TLS storage for stats static void add(u64 ns[66], const char* name) noexcept; @@ -73,27 +88,9 @@ class perf_stat final : public perf_stat_base } g_tls_perf_stat; public: - static NEVER_INLINE void push(u64 start_time) noexcept + static SAFE_BUFFERS FORCE_INLINE void push(u64 start_time) noexcept { - // Event end - const u64 end_time = (_mm_lfence(), __rdtsc()); - - // Compute difference in seconds - const f64 diff = (end_time - start_time) * 1. / utils::get_tsc_freq(); - - // Register perf stat in nanoseconds - const u64 ns = static_cast(diff * 1000'000'000.); - - // Print in microseconds - if (static_cast(diff * 1000'000.) >= g_cfg.core.perf_report_threshold) - { - perf_log.notice(u8"%s: %.3fµs", perf_name.data(), diff * 1000'000.); - } - - auto& data = g_tls_perf_stat.m_log; - data[0] += ns != 0; - data[64 - std::countl_zero(ns)]++; - data[65] += ns; + perf_stat_base::push(g_tls_perf_stat.m_log, start_time, perf_name.data()); } }; @@ -149,7 +146,7 @@ public: if constexpr (std::array{(SubEvents == Event)...}[Index]) { // Push actual timestamp into an array - m_timestamps[Index + 1] = __rdtsc(); + m_timestamps[Index + 1] = get_tsc(); } else if constexpr (Index < sizeof...(SubEvents)) { @@ -173,7 +170,7 @@ public: // Re-initialize first timestamp SAFE_BUFFERS FORCE_INLINE void restart() noexcept { - m_timestamps[0] = __rdtsc(); + m_timestamps[0] = get_tsc(); std::memset(m_timestamps + 1, 0, sizeof(m_timestamps) - sizeof(u64)); } diff --git a/rpcs3/Loader/TRP.cpp b/rpcs3/Loader/TRP.cpp index 8f30a829c0..97293121b2 100644 --- a/rpcs3/Loader/TRP.cpp +++ b/rpcs3/Loader/TRP.cpp @@ -23,7 +23,7 @@ bool TRPLoader::Install(const std::string& dest, bool show) const std::string& local_path = vfs::get(dest); - const auto temp = fmt::format(u8"%s.$temp$%u", local_path, __rdtsc()); + const auto temp = fmt::format(u8"%s.$temp$%u", local_path, utils::get_unique_tsc()); if (!fs::create_dir(temp)) { diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj index 29c79bf29b..61d5e8e1f9 100644 --- a/rpcs3/emucore.vcxproj +++ b/rpcs3/emucore.vcxproj @@ -120,9 +120,6 @@ NotUsing - - NotUsing - NotUsing Sync @@ -516,6 +513,7 @@ + @@ -531,7 +529,6 @@ - @@ -742,6 +739,7 @@ + diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters index 931aff5de0..5cf418f088 100644 --- a/rpcs3/emucore.vcxproj.filters +++ b/rpcs3/emucore.vcxproj.filters @@ -935,9 +935,6 @@ Emu\GPU\RSX\Overlays - - Utilities - Utilities @@ -1072,6 +1069,9 @@ Utilities + + Utilities + Utilities @@ -1153,6 +1153,9 @@ Emu\GPU\RSX\Common + + Emu\GPU\RSX\Common + Emu\GPU\RSX\Common @@ -1819,9 +1822,6 @@ Utilities - - Utilities - Utilities diff --git a/rpcs3/main.cpp b/rpcs3/main.cpp index f78c45a8f1..9e35967c45 100644 --- a/rpcs3/main.cpp +++ b/rpcs3/main.cpp @@ -44,7 +44,6 @@ DYNAMIC_IMPORT("ntdll.dll", NtSetTimerResolution, NTSTATUS(ULONG DesiredResoluti #include #include -#include "util/v128.hpp" #include "util/sysinfo.hpp" inline std::string sstr(const QString& _in) { return _in.toStdString(); } @@ -301,8 +300,6 @@ int main(int argc, char** argv) const u64 intro_time = (intro_stats.ru_utime.tv_sec + intro_stats.ru_stime.tv_sec) * 1000000000ull + (intro_stats.ru_utime.tv_usec + intro_stats.ru_stime.tv_usec) * 1000ull; #endif - v128::use_fma = utils::has_fma3(); - s_argv0 = argv[0]; // Save for report_fatal_error // Only run RPCS3 to display an error diff --git a/rpcs3/rpcs3qt/main_window.cpp b/rpcs3/rpcs3qt/main_window.cpp index 78d90de6ab..09d2a0d130 100644 --- a/rpcs3/rpcs3qt/main_window.cpp +++ b/rpcs3/rpcs3qt/main_window.cpp @@ -885,12 +885,12 @@ void main_window::DecryptSPRXLibraries() gui_log.notice("Decrypting binaries..."); // Always start with no KLIC - std::vector klics{v128{}}; + std::vector klics{u128{}}; if (const auto keys = g_fxo->get()) { // Second klic: get it from a running game - if (const v128 klic = keys->devKlic; klic != v128{}) + if (const u128 klic = keys->devKlic) { klics.emplace_back(klic); } @@ -913,7 +913,7 @@ void main_window::DecryptSPRXLibraries() if (elf_file.open(old_path) && elf_file.size() >= 4 && elf_file.read() == "SCE\0"_u32) { // First KLIC is no KLIC - elf_file = decrypt_self(std::move(elf_file), key_it != 0 ? klics[key_it]._bytes : nullptr); + elf_file = decrypt_self(std::move(elf_file), key_it != 0 ? reinterpret_cast(&klics[key_it]) : nullptr); if (!elf_file) { @@ -985,11 +985,15 @@ void main_window::DecryptSPRXLibraries() ensure(text.size() == 32); // It must succeed (only hex characters are present) - std::from_chars(&text[0], &text[16], klic._u64[1], 16); // Not a typo: on LE systems the u64[1] part will be swapped with u64[0] later - std::from_chars(&text[16], &text[32], klic._u64[0], 16); // And on BE systems it will be already swapped by index internally + u64 lo_ = 0; + u64 hi_ = 0; + std::from_chars(&text[0], &text[16], lo_, 16); + std::from_chars(&text[16], &text[32], hi_, 16); - // Needs to be in big endian because the left to right byte-order means big endian - klic = std::bit_cast>(klic); + be_t lo = std::bit_cast>(lo_); + be_t hi = std::bit_cast>(hi_); + + klic = (u128{+hi} << 64) | +lo; // Retry with specified KLIC key_it -= +std::exchange(tried, true); // Rewind on second and above attempt diff --git a/rpcs3/rpcs3qt/register_editor_dialog.cpp b/rpcs3/rpcs3qt/register_editor_dialog.cpp index b16a288932..d6ac5f0639 100644 --- a/rpcs3/rpcs3qt/register_editor_dialog.cpp +++ b/rpcs3/rpcs3qt/register_editor_dialog.cpp @@ -179,7 +179,7 @@ void register_editor_dialog::updateRegister(int reg) else if (reg >= ppu_v0 && reg <= ppu_v31) { const auto r = ppu.vr[reg_index]; - str = r == v128::from32p(r._u32[0]) ? fmt::format("%08x$", r._u32[0]) : fmt::format("%08x %08x %08x %08x", r.u32r[0], r.u32r[1], r.u32r[2], r.u32r[3]); + str = !r._u ? fmt::format("%08x$", r._u32[0]) : fmt::format("%08x %08x %08x %08x", r.u32r[0], r.u32r[1], r.u32r[2], r.u32r[3]); } } else if (reg == PPU_CR) str = fmt::format("%08x", ppu.cr.pack()); @@ -198,7 +198,7 @@ void register_editor_dialog::updateRegister(int reg) { const u32 reg_index = reg % 128; const auto r = spu.gpr[reg_index]; - str = r == v128::from32p(r._u32[0]) ? fmt::format("%08x$", r._u32[0]) : fmt::format("%08x %08x %08x %08x", r.u32r[0], r.u32r[1], r.u32r[2], r.u32r[3]); + str = !r._u ? fmt::format("%08x$", r._u32[0]) : fmt::format("%08x %08x %08x %08x", r.u32r[0], r.u32r[1], r.u32r[2], r.u32r[3]); } else if (reg == MFC_PEVENTS) str = fmt::format("%08x", +spu.ch_events.load().events); else if (reg == MFC_EVENTS_MASK) str = fmt::format("%08x", +spu.ch_events.load().mask); diff --git a/rpcs3/util/asm.hpp b/rpcs3/util/asm.hpp index ef72fbb50e..4f08439afa 100644 --- a/rpcs3/util/asm.hpp +++ b/rpcs3/util/asm.hpp @@ -5,22 +5,54 @@ extern bool g_use_rtm; extern u64 g_rtm_tx_limit1; +#ifdef _MSC_VER +extern "C" +{ + u64 __rdtsc(); + u32 _xbegin(); + void _xend(); + void _mm_pause(); + void _mm_prefetch(const char*, int); + void _m_prefetchw(const volatile void*); + + uchar _rotl8(uchar, uchar); + ushort _rotl16(ushort, uchar); + uint _rotl(uint, int); + u64 _rotl64(u64, int); + + s64 __mulh(s64, s64); + u64 __umulh(u64, u64); + + s64 _div128(s64, s64, s64, s64*); + u64 _udiv128(u64, u64, u64, u64*); +} +#endif + namespace utils { + inline u64 get_tsc() + { +#ifdef _MSC_VER + return __rdtsc(); +#else + return __builtin_ia32_rdtsc(); +#endif + } + // Transaction helper (result = pair of success and op result, or just bool) template > inline auto tx_start(F op) { uint status = -1; - for (auto stamp0 = __rdtsc(), stamp1 = stamp0; g_use_rtm && stamp1 - stamp0 <= g_rtm_tx_limit1; stamp1 = __rdtsc()) + for (auto stamp0 = get_tsc(), stamp1 = stamp0; g_use_rtm && stamp1 - stamp0 <= g_rtm_tx_limit1; stamp1 = get_tsc()) { #ifndef _MSC_VER __asm__ goto ("xbegin %l[retry];" ::: "memory" : retry); #else status = _xbegin(); - if (status != _XBEGIN_STARTED) [[unlikely]] + if (status != umax) [[unlikely]] { goto retry; } @@ -80,7 +112,7 @@ namespace utils const void* ptr = reinterpret_cast(value); #ifdef _MSC_VER - return _mm_prefetch(reinterpret_cast(ptr), _MM_HINT_T1); + return _mm_prefetch(reinterpret_cast(ptr), 2); #else return __builtin_prefetch(ptr, 0, 2); #endif @@ -95,7 +127,7 @@ namespace utils } #ifdef _MSC_VER - return _mm_prefetch(reinterpret_cast(ptr), _MM_HINT_T0); + return _mm_prefetch(reinterpret_cast(ptr), 3); #else return __builtin_prefetch(ptr, 0, 3); #endif @@ -108,7 +140,11 @@ namespace utils return; } +#ifdef _MSC_VER return _m_prefetchw(ptr); +#else + return __builtin_prefetch(ptr, 1, 0); +#endif } constexpr u8 rol8(u8 x, u8 n) @@ -120,8 +156,10 @@ namespace utils #ifdef _MSC_VER return _rotl8(x, n); +#elif defined(__clang__) + return __builtin_rotateleft8(x, n); #else - return __rolb(x, n); + return __builtin_ia32_rolqi(x, n); #endif } @@ -133,9 +171,11 @@ namespace utils } #ifdef _MSC_VER - return _rotl16(x, n); + return _rotl16(x, static_cast(n)); +#elif defined(__clang__) + return __builtin_rotateleft16(x, n); #else - return __rolw(x, n); + return __builtin_ia32_rolhi(x, n); #endif } @@ -148,22 +188,10 @@ namespace utils #ifdef _MSC_VER return _rotl(x, n); +#elif defined(__clang__) + return __builtin_rotateleft32(x, n); #else - return __rold(x, n); -#endif - } - - constexpr u32 ror32(u32 x, u32 n) - { - if (std::is_constant_evaluated()) - { - return (x >> (n & 31)) | (x << (((0 - n) & 31))); - } - -#ifdef _MSC_VER - return _rotr(x, n); -#else - return __rord(x, n); + return (x << n) | (x >> (32 - n)); #endif } @@ -176,8 +204,10 @@ namespace utils #ifdef _MSC_VER return _rotl64(x, static_cast(n)); +#elif defined(__clang__) + return __builtin_rotateleft64(x, n); #else - return __rolq(x, static_cast(n)); + return (x << n) | (x >> (64 - n)); #endif } @@ -285,12 +315,21 @@ namespace utils #endif } + inline void pause() + { +#ifdef _MSC_VER + _mm_pause(); +#else + __builtin_ia32_pause(); +#endif + } + // Synchronization helper (cache-friendly busy waiting) inline void busy_wait(usz cycles = 3000) { - const u64 start = __rdtsc(); - do _mm_pause(); - while (__rdtsc() - start < cycles); + const u64 start = get_tsc(); + do pause(); + while (get_tsc() - start < cycles); } // Align to power of 2 diff --git a/rpcs3/util/atomic.cpp b/rpcs3/util/atomic.cpp index 86a818a600..bff204a8c2 100644 --- a/rpcs3/util/atomic.cpp +++ b/rpcs3/util/atomic.cpp @@ -6,6 +6,25 @@ #define USE_STD #endif +#ifdef _MSC_VER + +#include "emmintrin.h" +#include "immintrin.h" + +namespace utils +{ + u128 __vectorcall atomic_load16(const void* ptr) + { + return std::bit_cast(_mm_load_si128((__m128i*)ptr)); + } + + void __vectorcall atomic_store16(void* ptr, u128 value) + { + _mm_store_si128((__m128i*)ptr, std::bit_cast<__m128i>(value)); + } +} +#endif + #include "Utilities/sync.h" #include "Utilities/StrFmt.h" @@ -847,9 +866,17 @@ namespace }; } -u64 atomic_wait::get_unique_tsc() +#ifdef _MSC_VER +extern "C" u64 __rdtsc(); +#endif + +u64 utils::get_unique_tsc() { +#ifdef _MSC_VER const u64 stamp0 = __rdtsc(); +#else + const u64 stamp0 = __builtin_ia32_rdtsc(); +#endif return s_min_tsc.atomic_op([&](u64& tsc) { @@ -1026,7 +1053,7 @@ FORCE_INLINE auto root_info::slot_search(uptr iptr, u32 size, u64 thread_id, u12 SAFE_BUFFERS void atomic_wait_engine::wait(const void* data, u32 size, u128 old_value, u64 timeout, u128 mask, atomic_wait::info* ext) { - const auto stamp0 = atomic_wait::get_unique_tsc(); + const auto stamp0 = utils::get_unique_tsc(); if (!s_tls_wait_cb(data, 0, stamp0)) { diff --git a/rpcs3/util/atomic.hpp b/rpcs3/util/atomic.hpp index 2518836024..6fa5a3649d 100644 --- a/rpcs3/util/atomic.hpp +++ b/rpcs3/util/atomic.hpp @@ -7,6 +7,62 @@ #ifdef _MSC_VER #pragma warning(push) #pragma warning(disable: 4996) + +extern "C" +{ + void _ReadWriteBarrier(); + void* _AddressOfReturnAddress(); + + uchar _bittest(const long*, long); + uchar _interlockedbittestandset(volatile long*, long); + uchar _interlockedbittestandreset(volatile long*, long); + + char _InterlockedCompareExchange8(volatile char*, char, char); + char _InterlockedExchange8(volatile char*, char); + char _InterlockedExchangeAdd8(volatile char*, char); + char _InterlockedAnd8(volatile char*, char); + char _InterlockedOr8(volatile char*, char); + char _InterlockedXor8(volatile char*, char); + + short _InterlockedCompareExchange16(volatile short*, short, short); + short _InterlockedExchange16(volatile short*, short); + short _InterlockedExchangeAdd16(volatile short*, short); + short _InterlockedAnd16(volatile short*, short); + short _InterlockedOr16(volatile short*, short); + short _InterlockedXor16(volatile short*, short); + short _InterlockedIncrement16(volatile short*); + short _InterlockedDecrement16(volatile short*); + + long _InterlockedCompareExchange(volatile long*, long, long); + long _InterlockedCompareExchange_HLEAcquire(volatile long*, long, long); + long _InterlockedExchange(volatile long*, long); + long _InterlockedExchangeAdd(volatile long*, long); + long _InterlockedExchangeAdd_HLERelease(volatile long*, long); + long _InterlockedAnd(volatile long*, long); + long _InterlockedOr(volatile long*, long); + long _InterlockedXor(volatile long*, long); + long _InterlockedIncrement(volatile long*); + long _InterlockedDecrement(volatile long*); + + s64 _InterlockedCompareExchange64(volatile s64*, s64, s64); + s64 _InterlockedCompareExchange64_HLEAcquire(volatile s64*, s64, s64); + s64 _InterlockedExchange64(volatile s64*, s64); + s64 _InterlockedExchangeAdd64(volatile s64*, s64); + s64 _InterlockedExchangeAdd64_HLERelease(volatile s64*, s64); + s64 _InterlockedAnd64(volatile s64*, s64); + s64 _InterlockedOr64(volatile s64*, s64); + s64 _InterlockedXor64(volatile s64*, s64); + s64 _InterlockedIncrement64(volatile s64*); + s64 _InterlockedDecrement64(volatile s64*); + + uchar _InterlockedCompareExchange128(volatile s64*, s64, s64, s64*); +} + +namespace utils +{ + u128 __vectorcall atomic_load16(const void*); + void __vectorcall atomic_store16(void*, u128); +} #endif FORCE_INLINE void atomic_fence_consume() @@ -238,7 +294,10 @@ namespace atomic_wait template ().template wait(any_value))...>> list(T&... vars) -> list; +} +namespace utils +{ // RDTSC with adjustment for being unique u64 get_unique_tsc(); } @@ -871,18 +930,14 @@ struct atomic_storage : atomic_storage static inline T load(const T& dest) { atomic_fence_acquire(); - __m128i val = _mm_load_si128(reinterpret_cast(&dest)); + u128 val = utils::atomic_load16(&dest); atomic_fence_acquire(); return std::bit_cast(val); } static inline T observe(const T& dest) { - // Barriers are kept intentionally - atomic_fence_acquire(); - __m128i val = _mm_load_si128(reinterpret_cast(&dest)); - atomic_fence_acquire(); - return std::bit_cast(val); + return load(dest); } static inline bool compare_exchange(T& dest, T& comp, T exch) @@ -906,32 +961,31 @@ struct atomic_storage : atomic_storage static inline void store(T& dest, T value) { atomic_fence_acq_rel(); - _mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value)); + release(dest, value); atomic_fence_seq_cst(); } static inline void release(T& dest, T value) { atomic_fence_release(); - _mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value)); + utils::atomic_store16(&dest, std::bit_cast(value)); atomic_fence_release(); } #else static inline T load(const T& dest) { - __atomic_thread_fence(__ATOMIC_ACQUIRE); - __m128i val = _mm_load_si128(reinterpret_cast(&dest)); - __atomic_thread_fence(__ATOMIC_ACQUIRE); - return std::bit_cast(val); + __m128i r; +#ifdef __AVX__ + __asm__ volatile("vmovdqa %1, %0;" : "=x" (r) : "m" (dest) : "memory"); +#else + __asm__ volatile("movdqa %1, %0;" : "=x" (r) : "m" (dest) : "memory"); +#endif + return std::bit_cast(r); } static inline T observe(const T& dest) { - // Barriers are kept intentionally - __atomic_thread_fence(__ATOMIC_ACQUIRE); - __m128i val = _mm_load_si128(reinterpret_cast(&dest)); - __atomic_thread_fence(__ATOMIC_ACQUIRE); - return std::bit_cast(val); + return load(dest); } static inline bool compare_exchange(T& dest, T& comp, T exch) @@ -987,16 +1041,17 @@ struct atomic_storage : atomic_storage static inline void store(T& dest, T value) { - __atomic_thread_fence(__ATOMIC_ACQ_REL); - _mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value)); + release(dest, value); atomic_fence_seq_cst(); } static inline void release(T& dest, T value) { - __atomic_thread_fence(__ATOMIC_RELEASE); - _mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value)); - __atomic_thread_fence(__ATOMIC_RELEASE); +#ifdef __AVX__ + __asm__ volatile("vmovdqa %0, %1;" :: "x" (reinterpret_cast<__m128i&>(value)), "m" (dest) : "memory"); +#else + __asm__ volatile("movdqa %0, %1;" :: "x" (reinterpret_cast<__m128i&>(value)), "m" (dest) : "memory"); +#endif } #endif diff --git a/rpcs3/util/atomic2.cpp b/rpcs3/util/atomic2.cpp deleted file mode 100644 index 1676d4ffe1..0000000000 --- a/rpcs3/util/atomic2.cpp +++ /dev/null @@ -1,532 +0,0 @@ -#include "atomic2.hpp" -#include "Utilities/JIT.h" - -#include "util/sysinfo.hpp" - -// -static const bool s_use_rtm = utils::has_rtm(); - -template -static const auto commit_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args) -{ - static_assert(Count <= 8); - using namespace asmjit; - - // Fill registers with item data - c.lea(x86::rax, x86::qword_ptr(args[0], 120)); - - if constexpr (Count >= 1) - { - c.mov(x86::rcx, x86::qword_ptr(x86::rax, -120)); - c.mov(x86::rdx, x86::qword_ptr(x86::rax, -112)); - c.mov(x86::r8, x86::qword_ptr(x86::rax, -104)); - } - if constexpr (Count >= 2) - { - c.mov(x86::r9, x86::qword_ptr(x86::rax, -96)); - c.mov(x86::r10, x86::qword_ptr(x86::rax, -88)); - c.mov(x86::r11, x86::qword_ptr(x86::rax, -80)); - } - if constexpr (Count >= 3) - { - if (utils::has_avx()) - { - c.vzeroupper(); - } - -#ifdef _WIN32 - c.push(x86::rsi); -#endif - c.mov(x86::rsi, x86::qword_ptr(x86::rax, -72)); - c.movups(x86::xmm0, x86::oword_ptr(x86::rax, -64)); - } - if constexpr (Count >= 4) - { -#ifdef _WIN32 - c.push(x86::rdi); -#endif - c.mov(x86::rdi, x86::qword_ptr(x86::rax, -48)); - c.movups(x86::xmm1, x86::oword_ptr(x86::rax, -40)); - } - if constexpr (Count >= 5) - { - c.push(x86::rbx); - c.mov(x86::rbx, x86::qword_ptr(x86::rax, -24)); - c.movups(x86::xmm2, x86::oword_ptr(x86::rax, -16)); - } - if constexpr (Count >= 6) - { - c.push(x86::rbp); - c.mov(x86::rbp, x86::qword_ptr(x86::rax)); - c.movups(x86::xmm3, x86::oword_ptr(x86::rax, 8)); - } - if constexpr (Count >= 7) - { - c.push(x86::r12); - c.mov(x86::r12, x86::qword_ptr(x86::rax, 24)); - c.movups(x86::xmm4, x86::oword_ptr(x86::rax, 32)); - } - if constexpr (Count >= 8) - { - c.push(x86::r13); - c.mov(x86::r13, x86::qword_ptr(x86::rax, 48)); - c.movups(x86::xmm5, x86::oword_ptr(x86::rax, 56)); - } - - // Begin transaction - Label begin = c.newLabel(); - Label fall = c.newLabel(); - Label stop = c.newLabel(); - Label wait = c.newLabel(); - Label ret = c.newLabel(); - c.bind(begin); - c.xbegin(fall); - - // Compare phase - if constexpr (Count >= 1) - { - c.cmp(x86::qword_ptr(x86::rcx), x86::rdx); - c.jne(stop); - } - if constexpr (Count >= 2) - { - c.cmp(x86::qword_ptr(x86::r9), x86::r10); - c.jne(stop); - } - if constexpr (Count >= 3) - { - c.movq(x86::rax, x86::xmm0); - c.cmp(x86::qword_ptr(x86::rsi), x86::rax); - c.jne(stop); - } - if constexpr (Count >= 4) - { - c.movq(x86::rax, x86::xmm1); - c.cmp(x86::qword_ptr(x86::rdi), x86::rax); - c.jne(stop); - } - if constexpr (Count >= 5) - { - c.movq(x86::rax, x86::xmm2); - c.cmp(x86::qword_ptr(x86::rbx), x86::rax); - c.jne(stop); - } - if constexpr (Count >= 6) - { - c.movq(x86::rax, x86::xmm3); - c.cmp(x86::qword_ptr(x86::rbp), x86::rax); - c.jne(stop); - } - if constexpr (Count >= 7) - { - c.movq(x86::rax, x86::xmm4); - c.cmp(x86::qword_ptr(x86::r12), x86::rax); - c.jne(stop); - } - if constexpr (Count >= 8) - { - c.movq(x86::rax, x86::xmm5); - c.cmp(x86::qword_ptr(x86::r13), x86::rax); - c.jne(stop); - } - - // Check for transactions in progress - if constexpr (Count >= 1) - { - c.cmp(x86::qword_ptr(x86::rcx, 8), 0); - c.jne(wait); - } - if constexpr (Count >= 2) - { - c.cmp(x86::qword_ptr(x86::r9, 8), 0); - c.jne(wait); - } - if constexpr (Count >= 3) - { - c.cmp(x86::qword_ptr(x86::rsi, 8), 0); - c.jne(wait); - } - if constexpr (Count >= 4) - { - c.cmp(x86::qword_ptr(x86::rdi, 8), 0); - c.jne(wait); - } - if constexpr (Count >= 5) - { - c.cmp(x86::qword_ptr(x86::rbx, 8), 0); - c.jne(wait); - } - if constexpr (Count >= 6) - { - c.cmp(x86::qword_ptr(x86::rbp, 8), 0); - c.jne(wait); - } - if constexpr (Count >= 7) - { - c.cmp(x86::qword_ptr(x86::r12, 8), 0); - c.jne(wait); - } - if constexpr (Count >= 8) - { - c.cmp(x86::qword_ptr(x86::r13, 8), 0); - c.jne(wait); - } - - // Write phase - if constexpr (Count >= 1) - c.mov(x86::qword_ptr(x86::rcx), x86::r8); - if constexpr (Count >= 2) - c.mov(x86::qword_ptr(x86::r9), x86::r11); - if constexpr (Count >= 3) - c.movhps(x86::qword_ptr(x86::rsi), x86::xmm0); - if constexpr (Count >= 4) - c.movhps(x86::qword_ptr(x86::rdi), x86::xmm1); - if constexpr (Count >= 5) - c.movhps(x86::qword_ptr(x86::rbx), x86::xmm2); - if constexpr (Count >= 6) - c.movhps(x86::qword_ptr(x86::rbp), x86::xmm3); - if constexpr (Count >= 7) - c.movhps(x86::qword_ptr(x86::r12), x86::xmm4); - if constexpr (Count >= 8) - c.movhps(x86::qword_ptr(x86::r13), x86::xmm5); - - // End transaction (success) - c.xend(); - c.mov(x86::eax, 1); - c.bind(ret); - if constexpr (Count >= 8) - c.pop(x86::r13); - if constexpr (Count >= 7) - c.pop(x86::r12); - if constexpr (Count >= 6) - c.pop(x86::rbp); - if constexpr (Count >= 5) - c.pop(x86::rbx); -#ifdef _WIN32 - if constexpr (Count >= 4) - c.pop(x86::rdi); - if constexpr (Count >= 3) - c.pop(x86::rsi); -#endif - c.ret(); - - // Transaction abort - c.bind(stop); - c.xend(); - c.xor_(x86::eax, x86::eax); - c.jmp(fall); - - // Abort when there is still a chance of success - c.bind(wait); - c.xend(); - c.mov(x86::eax, 0xffu << 24); - c.jmp(fall); - - // Transaction fallback: return zero - c.bind(fall); - c.test(x86::eax, _XABORT_RETRY); - c.jnz(begin); - c.sar(x86::eax, 24); - c.jmp(ret); -}); - -// 4095 records max -static constexpr u64 s_rec_gcount = 4096 / 64; - -// Global record pool -static stx::multi_cas_record s_records[s_rec_gcount * 64]{}; - -// Allocation bits (without first element) -static atomic_t s_rec_bits[s_rec_gcount]{1}; - -static constexpr u64 s_state_mask = 3; -static constexpr u64 s_state_undef = 0; -static constexpr u64 s_state_failure = 1; -static constexpr u64 s_state_success = 2; -static constexpr u64 s_ref_mask = ~s_state_mask; -static constexpr u64 s_ref_one = s_state_mask + 1; - -static u64 rec_alloc() -{ - const u32 start = static_cast(__rdtsc()); - - for (u32 i = 0;; i++) - { - const u32 group = (i + start) % s_rec_gcount; - - const auto [bits, ok] = s_rec_bits[group].fetch_op([](u64& bits) - { - if (~bits) - { - // Set lowest clear bit - bits |= bits + 1; - return true; - } - - return false; - }); - - if (ok) - { - // Find lowest clear bit - return group * 64 + std::countr_one(bits); - } - } - - // TODO: unreachable - std::abort(); - return 0; -} - -static bool cmpxchg16(s64(&dest)[2], s64(&cmp_res)[2], s64 exch_high, s64 exch_low) -{ -#ifdef _MSC_VER - return !!_InterlockedCompareExchange128(dest, exch_high, exch_low, cmp_res); -#else - s64 exch[2]{exch_low, exch_high}; - return __atomic_compare_exchange(&dest, &cmp_res, &exch, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); -#endif -} - -bool stx::multi_cas_record::commit() const noexcept -{ - // Transaction cancelled - if (m_count == 0) - { - return true; - } - - static auto rec_unref = [](u64 id) - { - if (id && id < s_rec_gcount * 64) - { - auto [_, ok] = s_records[id].m_state.fetch_op([](u64& state) - { - if (state < s_ref_one) - { - return 0; - } - - state -= s_ref_one; - - if (state < s_ref_one) - { - state = 0; - return 2; - } - - return 1; - }); - - if (ok > 1) - { - s_rec_bits[id / 64] &= ~(u64{1} << (id % 64)); - } - } - }; - - // Helper function to complete successful transaction - static auto rec_complete = [](u64 id) - { - for (u32 i = 0; i < s_records[id].m_count; i++) - { - auto& item = s_records[id].m_list[i]; - - atomic2 cmp; - cmp.m_data[0] = item.m_old; - cmp.m_data[1] = id; - - if (item.m_addr->load() == item.m_old && atomic_storage::load(item.m_addr->m_data[1]) == static_cast(id)) - { - if (cmpxchg16(item.m_addr->m_data, cmp.m_data, 0, item.m_new)) - { - } - } - } - }; - - // Helper function to deal with existing transaction - static auto rec_try_abort = [](u64 id) -> u64 - { - if (id >= s_rec_gcount * 64) - { - std::abort(); - } - - auto [_old, ok] = s_records[id].m_state.fetch_op([](u64& state) - { - if (state < s_ref_one) - { - // Don't reference if no references - return false; - } - - if ((state & s_state_mask) == s_state_undef) - { - // Break transaction if possible - state |= s_state_failure; - } - - state += s_ref_one; - return true; - }); - - if (!ok) - { - return 0; - } - - if ((_old & s_state_mask) != s_state_success) - { - // Allow to overwrite failing transaction - return id; - } - - // Help to complete - rec_complete(id); - rec_unref(id); - return 0; - }; - - // Single CAS path - if (m_count == 1) - { - atomic2 cmp; - - while (auto ptr = m_list[0].m_addr) - { - if (ptr->load() != m_list[0].m_old) - { - return false; - } - - cmp.m_data[0] = m_list[0].m_old; - cmp.m_data[1] = atomic_storage::load(ptr->m_data[1]); - - if (!cmp.m_data[1] && cmpxchg16(ptr->m_data, cmp.m_data, 0, m_list[0].m_new)) - { - return true; - } - else if (cmp.m_data[0] != static_cast(m_list[0].m_old)) - { - return false; - } - else if (cmp.m_data[1]) - { - if (u64 _id = rec_try_abort(cmp.m_data[1])) - { - if (cmpxchg16(ptr->m_data, cmp.m_data, 0, m_list[0].m_new)) - { - rec_unref(_id); - return true; - } - - rec_unref(_id); - } - } - } - - // Unreachable - std::abort(); - } - - // Try TSX if available - if (s_use_rtm) - { - switch (m_count) - { - case 2: if (s32 r = commit_tx<2>(m_list)) return r > 0; break; - case 3: if (s32 r = commit_tx<3>(m_list)) return r > 0; break; - case 4: if (s32 r = commit_tx<4>(m_list)) return r > 0; break; - case 5: if (s32 r = commit_tx<5>(m_list)) return r > 0; break; - case 6: if (s32 r = commit_tx<6>(m_list)) return r > 0; break; - case 7: if (s32 r = commit_tx<7>(m_list)) return r > 0; break; - case 8: if (s32 r = commit_tx<8>(m_list)) return r > 0; break; - } - } - - // Allocate global record and copy data - const u64 id = rec_alloc(); - - for (u32 i = 0; i < (m_count + 1) / 2; i++) - { - std::memcpy(s_records[id].m_list + i * 2, m_list + i * 2, sizeof(multi_cas_item) * 2); - } - - s_records[id].m_count = m_count; - s_records[id].m_state = s_ref_one; - - // Try to install CAS items - for (u32 i = 0; i < m_count && (s_records[id].m_state & s_state_mask) == s_state_undef; i++) - { - atomic2 cmp; - - while (auto ptr = m_list[i].m_addr) - { - if (ptr->load() != m_list[i].m_old) - { - s_records[id].m_state |= s_state_failure; - break; - } - - cmp.m_data[0] = m_list[i].m_old; - cmp.m_data[1] = atomic_storage::load(ptr->m_data[1]); - - if (!cmp.m_data[1] && cmpxchg16(ptr->m_data, cmp.m_data, id, m_list[i].m_old)) - { - break; - } - else if (cmp.m_data[0] != static_cast(m_list[i].m_old)) - { - s_records[id].m_state |= s_state_failure; - break; - } - else if (cmp.m_data[1]) - { - if (u64 _id = rec_try_abort(cmp.m_data[1])) - { - if (cmpxchg16(ptr->m_data, cmp.m_data, id, m_list[i].m_old)) - { - rec_unref(_id); - break; - } - - rec_unref(_id); - } - } - } - } - - // Try to acknowledge transaction success - auto [_, ok] = s_records[id].m_state.fetch_op([](u64& state) - { - if (state & s_state_failure) - { - return false; - } - - state |= s_state_success; - return true; - }); - - // Complete transaction on success, or cleanup on failure - for (u32 i = 0; i < m_count; i++) - { - auto& item = m_list[i]; - - atomic2 cmp; - cmp.m_data[0] = item.m_old; - cmp.m_data[1] = id; - - if (item.m_addr->load() == item.m_old && atomic_storage::load(item.m_addr->m_data[1]) == static_cast(id)) - { - // Restore old or set new - if (cmpxchg16(item.m_addr->m_data, cmp.m_data, 0, ok ? item.m_new : item.m_old)) - { - } - } - } - - rec_unref(id); - return ok; -} diff --git a/rpcs3/util/atomic2.hpp b/rpcs3/util/atomic2.hpp deleted file mode 100644 index da0a24541c..0000000000 --- a/rpcs3/util/atomic2.hpp +++ /dev/null @@ -1,156 +0,0 @@ -#pragma once - -#include -#include "util/atomic.hpp" - -namespace stx -{ - // Unsigned 64-bit atomic for multi-cas (occupies 128 bits) - class alignas(16) atomic2 - { - // First 64-bit value is an actual value, second one is an allocated control block pointer (if not zero) - s64 m_data[2]{}; - - friend class multi_cas_record; - - public: - // Can't be really uninitialized or it'll be fundamentally broken - constexpr atomic2() noexcept = default; - - atomic2(const atomic2&) = delete; - - atomic2& operator=(const atomic2&) = delete; - - constexpr atomic2(u64 value) noexcept - : m_data{static_cast(value), s64{0}} - { - } - - // Simply observe the state - u64 load() const noexcept - { - return atomic_storage::load(m_data[0]); - } - - // void wait(u64 old_value) const noexcept; - // void notify_one() noexcept; - // void notify_all() noexcept; - }; - - // Atomic CAS item - class multi_cas_item - { - atomic2* m_addr; - u64 m_old; - u64 m_new; - - friend class multi_cas_record; - - public: - multi_cas_item() noexcept = default; - - multi_cas_item(const multi_cas_item&) = delete; - - multi_cas_item& operator=(const multi_cas_item&) = delete; - - u64 get_old() const noexcept - { - return m_old; - } - - operator u64() const noexcept - { - return m_new; - } - - void operator=(u64 value) noexcept - { - m_new = value; - } - }; - - // An object passed to multi_cas lambda - class alignas(64) multi_cas_record - { - // Ref counter and Multi-CAS state - atomic_t m_state; - - // Total number of CASes - u64 m_count; - - // Support up to 10 CASes - multi_cas_item m_list[10]; - - public: - // Read atomic value and allocate "writable" item - multi_cas_item& load(atomic2& atom) noexcept - { - if (m_count >= std::size(m_list)) - { - std::abort(); - } - - auto& r = m_list[m_count++]; - r.m_addr = &atom; - r.m_old = atom.load(); - r.m_new = r.m_old; - return r; - } - - // Reset transaction (invalidates item references) - void cancel() noexcept - { - m_count = 0; - } - - // Try to commit sudoku (don't call) - bool commit() const noexcept; - }; - - template - struct multi_cas_result - { - static constexpr bool is_void = false; - - T ret; - }; - - template <> - struct multi_cas_result - { - static constexpr bool is_void = true; - }; - - template - class multi_cas final : Context, multi_cas_record, public multi_cas_result> - { - using result = multi_cas_result>; - using record = multi_cas_record; - - public: - // Implicit deduction guide candidate constructor (for lambda) - multi_cas(Context&& f) noexcept - : Context(std::forward(f)) - { - while (true) - { - multi_cas_record& rec = *this; - record::cancel(); - - if constexpr (result::is_void) - { - Context::operator()(rec); - } - else - { - result::ret = Context::operator()(rec); - } - - if (record::commit()) - { - return; - } - } - } - }; -} \ No newline at end of file diff --git a/rpcs3/util/sysinfo.cpp b/rpcs3/util/sysinfo.cpp index d9d798766b..31f8845839 100755 --- a/rpcs3/util/sysinfo.cpp +++ b/rpcs3/util/sysinfo.cpp @@ -17,6 +17,13 @@ #include "util/asm.hpp" +#ifdef _MSC_VER +extern "C" +{ + u64 _xgetbv(u32); +} +#endif + inline std::array utils::get_cpuid(u32 func, u32 subfunc) { int regs[4]; @@ -303,6 +310,19 @@ static constexpr ullong round_tsc(ullong val) return utils::rounded_div(val, 1'000'000) * 1'000'000; } +#ifdef _MSC_VER +extern "C" void _mm_lfence(); +#endif + +static inline void lfence() +{ +#ifdef _MSC_VER + _mm_lfence(); +#else + __builtin_ia32_lfence(); +#endif +} + ullong utils::get_tsc_freq() { static const ullong cal_tsc = []() -> ullong @@ -343,17 +363,17 @@ ullong utils::get_tsc_freq() { #ifdef _WIN32 Sleep(1); - error_data[i] = (_mm_lfence(), __rdtsc()); + error_data[i] = (lfence(), utils::get_tsc()); LARGE_INTEGER ctr; QueryPerformanceCounter(&ctr); - rdtsc_data[i] = (_mm_lfence(), __rdtsc()); + rdtsc_data[i] = (lfence(), utils::get_tsc()); timer_data[i] = ctr.QuadPart; #else usleep(200); - error_data[i] = (_mm_lfence(), __rdtsc()); + error_data[i] = (lfence(), utils::get_tsc()); struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); - rdtsc_data[i] = (_mm_lfence(), __rdtsc()); + rdtsc_data[i] = (lfence(), utils::get_tsc()); timer_data[i] = ts.tv_nsec + (ts.tv_sec - sec_base) * 1'000'000'000; #endif } diff --git a/rpcs3/util/types.hpp b/rpcs3/util/types.hpp index c591badcdb..e07d58123f 100644 --- a/rpcs3/util/types.hpp +++ b/rpcs3/util/types.hpp @@ -1,13 +1,5 @@ #pragma once // No BOM and only basic ASCII in this header, or a neko will die -#ifdef _MSC_VER -#include -#else -#include -#endif -#include -#include - #include #include #include @@ -278,10 +270,28 @@ public: }; #ifndef _MSC_VER + using u128 = __uint128_t; using s128 = __int128_t; + +using __m128i = long long __attribute__((vector_size(16))); +using __m128d = double __attribute__((vector_size(16))); +using __m128 = float __attribute__((vector_size(16))); + #else +extern "C" +{ + union __m128; + union __m128i; + struct __m128d; + + uchar _addcarry_u64(uchar, u64, u64, u64*); + uchar _subborrow_u64(uchar, u64, u64, u64*); + u64 __shiftleft128(u64, u64, uchar); + u64 __shiftright128(u64, u64, uchar); +} + // Unsigned 128-bit integer implementation (TODO) struct alignas(16) u128 { diff --git a/rpcs3/util/v128.hpp b/rpcs3/util/v128.hpp index db099e1af0..fef14bc543 100644 --- a/rpcs3/util/v128.hpp +++ b/rpcs3/util/v128.hpp @@ -1,7 +1,6 @@ #pragma once // No BOM and only basic ASCII in this header, or a neko will die #include "util/types.hpp" -#include // 128-bit vector type union alignas(16) v128 @@ -12,17 +11,17 @@ union alignas(16) v128 template struct masked_array_t // array type accessed as (index ^ M) { - char m_data[16]; + T m_data[N]; public: T& operator[](usz index) { - return reinterpret_cast(m_data)[index ^ M]; + return m_data[index ^ M]; } const T& operator[](usz index) const { - return reinterpret_cast(m_data)[index ^ M]; + return m_data[index ^ M]; } }; @@ -56,88 +55,55 @@ union alignas(16) v128 reversed_array_t fr; reversed_array_t dr; + u128 _u; + //s128 _s; + +#ifdef _MSC_VER + template + struct opaque_wrapper + { + u128 m_data; + + opaque_wrapper() = default; + + opaque_wrapper(const T& value) + : m_data(std::bit_cast(value)) + { + } + + opaque_wrapper& operator=(const T& value) + { + m_data = std::bit_cast(value); + return *this; + } + + operator T() const + { + return std::bit_cast(m_data); + } + }; + + opaque_wrapper<__m128> vf; + opaque_wrapper<__m128i> vi; + opaque_wrapper<__m128d> vd; +#else __m128 vf; __m128i vi; __m128d vd; +#endif struct bit_array_128 { char m_data[16]; public: - class bit_element - { - u64& data; - const u64 mask; - - public: - bit_element(u64& data, const u64 mask) - : data(data) - , mask(mask) - { - } - - operator bool() const - { - return (data & mask) != 0; - } - - bit_element& operator=(const bool right) - { - if (right) - { - data |= mask; - } - else - { - data &= ~mask; - } - return *this; - } - - bit_element& operator=(const bit_element& right) - { - if (right) - { - data |= mask; - } - else - { - data &= ~mask; - } - return *this; - } - }; + class bit_element; // Index 0 returns the MSB and index 127 returns the LSB - bit_element operator[](u32 index) - { - const auto data_ptr = reinterpret_cast(m_data); - - if constexpr (std::endian::little == std::endian::native) - { - return bit_element(data_ptr[1 - (index >> 6)], 0x8000000000000000ull >> (index & 0x3F)); - } - else - { - return bit_element(data_ptr[index >> 6], 0x8000000000000000ull >> (index & 0x3F)); - } - } + [[deprecated]] bit_element operator[](u32 index); // Index 0 returns the MSB and index 127 returns the LSB - bool operator[](u32 index) const - { - const auto data_ptr = reinterpret_cast(m_data); - - if constexpr (std::endian::little == std::endian::native) - { - return (data_ptr[1 - (index >> 6)] & (0x8000000000000000ull >> (index & 0x3F))) != 0; - } - else - { - return (data_ptr[index >> 6] & (0x8000000000000000ull >> (index & 0x3F))) != 0; - } - } + [[deprecated]] bool operator[](u32 index) const; } _bit; static v128 from64(u64 _0, u64 _1 = 0) @@ -171,51 +137,39 @@ union alignas(16) v128 static v128 from32p(u32 value) { v128 ret; - ret.vi = _mm_set1_epi32(static_cast(value)); + ret._u32[0] = value; + ret._u32[1] = value; + ret._u32[2] = value; + ret._u32[3] = value; return ret; } static v128 from16p(u16 value) { v128 ret; - ret.vi = _mm_set1_epi16(static_cast(value)); + ret._u16[0] = value; + ret._u16[1] = value; + ret._u16[2] = value; + ret._u16[3] = value; + ret._u16[4] = value; + ret._u16[5] = value; + ret._u16[6] = value; + ret._u16[7] = value; return ret; } static v128 from8p(u8 value) { v128 ret; - ret.vi = _mm_set1_epi8(static_cast(value)); + std::memset(&ret, value, sizeof(ret)); return ret; } - static v128 fromBit(u32 bit) - { - v128 ret = {}; - ret._bit[bit] = true; - return ret; - } + static inline v128 fromV(const __m128i& value); - static v128 fromV(__m128i value) - { - v128 ret; - ret.vi = value; - return ret; - } + static inline v128 fromF(const __m128& value); - static v128 fromF(__m128 value) - { - v128 ret; - ret.vf = value; - return ret; - } - - static v128 fromD(__m128d value) - { - v128 ret; - ret.vd = value; - return ret; - } + static inline v128 fromD(const __m128d& value); // Unaligned load with optional index offset static v128 loadu(const void* ptr, usz index = 0) @@ -231,136 +185,46 @@ union alignas(16) v128 std::memcpy(static_cast(ptr) + index * sizeof(v128), &value, sizeof(v128)); } - static inline v128 add8(const v128& left, const v128& right) - { - return fromV(_mm_add_epi8(left.vi, right.vi)); - } + static inline v128 add8(const v128& left, const v128& right); - static inline v128 add16(const v128& left, const v128& right) - { - return fromV(_mm_add_epi16(left.vi, right.vi)); - } + static inline v128 add16(const v128& left, const v128& right); - static inline v128 add32(const v128& left, const v128& right) - { - return fromV(_mm_add_epi32(left.vi, right.vi)); - } + static inline v128 add32(const v128& left, const v128& right); - static inline v128 addfs(const v128& left, const v128& right) - { - return fromF(_mm_add_ps(left.vf, right.vf)); - } + static inline v128 addfs(const v128& left, const v128& right); - static inline v128 addfd(const v128& left, const v128& right) - { - return fromD(_mm_add_pd(left.vd, right.vd)); - } + static inline v128 addfd(const v128& left, const v128& right); - static inline v128 sub8(const v128& left, const v128& right) - { - return fromV(_mm_sub_epi8(left.vi, right.vi)); - } + static inline v128 sub8(const v128& left, const v128& right); - static inline v128 sub16(const v128& left, const v128& right) - { - return fromV(_mm_sub_epi16(left.vi, right.vi)); - } + static inline v128 sub16(const v128& left, const v128& right); - static inline v128 sub32(const v128& left, const v128& right) - { - return fromV(_mm_sub_epi32(left.vi, right.vi)); - } + static inline v128 sub32(const v128& left, const v128& right); - static inline v128 subfs(const v128& left, const v128& right) - { - return fromF(_mm_sub_ps(left.vf, right.vf)); - } + static inline v128 subfs(const v128& left, const v128& right); - static inline v128 subfd(const v128& left, const v128& right) - { - return fromD(_mm_sub_pd(left.vd, right.vd)); - } + static inline v128 subfd(const v128& left, const v128& right); - static inline v128 maxu8(const v128& left, const v128& right) - { - return fromV(_mm_max_epu8(left.vi, right.vi)); - } + static inline v128 maxu8(const v128& left, const v128& right); - static inline v128 minu8(const v128& left, const v128& right) - { - return fromV(_mm_min_epu8(left.vi, right.vi)); - } + static inline v128 minu8(const v128& left, const v128& right); - static inline v128 eq8(const v128& left, const v128& right) - { - return fromV(_mm_cmpeq_epi8(left.vi, right.vi)); - } + static inline v128 eq8(const v128& left, const v128& right); - static inline v128 eq16(const v128& left, const v128& right) - { - return fromV(_mm_cmpeq_epi16(left.vi, right.vi)); - } + static inline v128 eq16(const v128& left, const v128& right); - static inline v128 eq32(const v128& left, const v128& right) - { - return fromV(_mm_cmpeq_epi32(left.vi, right.vi)); - } + static inline v128 eq32(const v128& left, const v128& right); - static inline v128 eq32f(const v128& left, const v128& right) - { - return fromF(_mm_cmpeq_ps(left.vf, right.vf)); - } + static inline v128 eq32f(const v128& left, const v128& right); - static inline v128 eq64f(const v128& left, const v128& right) - { - return fromD(_mm_cmpeq_pd(left.vd, right.vd)); - } + static inline v128 fma32f(v128 a, const v128& b, const v128& c); - static inline bool use_fma = false; + bool operator==(const v128& right) const; - static inline v128 fma32f(v128 a, const v128& b, const v128& c) - { -#ifndef __FMA__ - if (use_fma) [[likely]] - { -#ifdef _MSC_VER - a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf); - return a; -#else - __asm__("vfmadd213ps %[c], %[b], %[a]" - : [a] "+x" (a.vf) - : [b] "x" (b.vf) - , [c] "x" (c.vf)); - return a; -#endif - } - - for (int i = 0; i < 4; i++) - { - a._f[i] = std::fmaf(a._f[i], b._f[i], c._f[i]); - } - return a; -#else - a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf); - return a; -#endif - } - - bool operator==(const v128& right) const - { - return _mm_movemask_epi8(v128::eq32(*this, right).vi) == 0xffff; - } - - bool operator!=(const v128& right) const - { - return !operator==(right); - } + bool operator!=(const v128& right) const; // result = (~left) & (right) - static inline v128 andnot(const v128& left, const v128& right) - { - return fromV(_mm_andnot_si128(left.vi, right.vi)); - } + static inline v128 andnot(const v128& left, const v128& right); void clear() { @@ -377,23 +241,3 @@ struct offset32_array> return u32{sizeof(T)} * (static_cast(arg) ^ static_cast(M)); } }; - -inline v128 operator|(const v128& left, const v128& right) -{ - return v128::fromV(_mm_or_si128(left.vi, right.vi)); -} - -inline v128 operator&(const v128& left, const v128& right) -{ - return v128::fromV(_mm_and_si128(left.vi, right.vi)); -} - -inline v128 operator^(const v128& left, const v128& right) -{ - return v128::fromV(_mm_xor_si128(left.vi, right.vi)); -} - -inline v128 operator~(const v128& other) -{ - return other ^ v128::from32p(UINT32_MAX); // XOR with ones -} diff --git a/rpcs3/util/v128sse.hpp b/rpcs3/util/v128sse.hpp new file mode 100644 index 0000000000..eb543aac86 --- /dev/null +++ b/rpcs3/util/v128sse.hpp @@ -0,0 +1,255 @@ +#pragma once + +#include "util/types.hpp" +#include "util/v128.hpp" +#include "util/sysinfo.hpp" + +#ifdef _MSC_VER +#include +#else +#include +#endif + +#include +#include + +#include + +inline bool v128_use_fma = utils::has_fma3(); + +class v128::bit_array_128::bit_element +{ + u64& data; + const u64 mask; + +public: + bit_element(u64& data, const u64 mask) + : data(data) + , mask(mask) + { + } + + operator bool() const + { + return (data & mask) != 0; + } + + bit_element& operator=(const bool right) + { + if (right) + { + data |= mask; + } + else + { + data &= ~mask; + } + return *this; + } + + bit_element& operator=(const bit_element& right) + { + if (right) + { + data |= mask; + } + else + { + data &= ~mask; + } + return *this; + } +}; + +[[deprecated]] inline v128::bit_array_128::bit_element v128::bit_array_128::operator[](u32 index) +{ + const auto data_ptr = reinterpret_cast(m_data); + + if constexpr (std::endian::little == std::endian::native) + { + return bit_element(data_ptr[1 - (index >> 6)], 0x8000000000000000ull >> (index & 0x3F)); + } + else + { + return bit_element(data_ptr[index >> 6], 0x8000000000000000ull >> (index & 0x3F)); + } +} + +[[deprecated]] inline bool v128::bit_array_128::operator[](u32 index) const +{ + const auto data_ptr = reinterpret_cast(m_data); + + if constexpr (std::endian::little == std::endian::native) + { + return (data_ptr[1 - (index >> 6)] & (0x8000000000000000ull >> (index & 0x3F))) != 0; + } + else + { + return (data_ptr[index >> 6] & (0x8000000000000000ull >> (index & 0x3F))) != 0; + } +} + +inline v128 v128::fromV(const __m128i& value) +{ + v128 ret; + ret.vi = value; + return ret; +} + +inline v128 v128::fromF(const __m128& value) +{ + v128 ret; + ret.vf = value; + return ret; +} + +inline v128 v128::fromD(const __m128d& value) +{ + v128 ret; + ret.vd = value; + return ret; +} + +inline v128 v128::add8(const v128& left, const v128& right) +{ + return fromV(_mm_add_epi8(left.vi, right.vi)); +} + +inline v128 v128::add16(const v128& left, const v128& right) +{ + return fromV(_mm_add_epi16(left.vi, right.vi)); +} + +inline v128 v128::add32(const v128& left, const v128& right) +{ + return fromV(_mm_add_epi32(left.vi, right.vi)); +} + +inline v128 v128::addfs(const v128& left, const v128& right) +{ + return fromF(_mm_add_ps(left.vf, right.vf)); +} + +inline v128 v128::addfd(const v128& left, const v128& right) +{ + return fromD(_mm_add_pd(left.vd, right.vd)); +} + +inline v128 v128::sub8(const v128& left, const v128& right) +{ + return fromV(_mm_sub_epi8(left.vi, right.vi)); +} + +inline v128 v128::sub16(const v128& left, const v128& right) +{ + return fromV(_mm_sub_epi16(left.vi, right.vi)); +} + +inline v128 v128::sub32(const v128& left, const v128& right) +{ + return fromV(_mm_sub_epi32(left.vi, right.vi)); +} + +inline v128 v128::subfs(const v128& left, const v128& right) +{ + return fromF(_mm_sub_ps(left.vf, right.vf)); +} + +inline v128 v128::subfd(const v128& left, const v128& right) +{ + return fromD(_mm_sub_pd(left.vd, right.vd)); +} + +inline v128 v128::maxu8(const v128& left, const v128& right) +{ + return fromV(_mm_max_epu8(left.vi, right.vi)); +} + +inline v128 v128::minu8(const v128& left, const v128& right) +{ + return fromV(_mm_min_epu8(left.vi, right.vi)); +} + +inline v128 v128::eq8(const v128& left, const v128& right) +{ + return fromV(_mm_cmpeq_epi8(left.vi, right.vi)); +} + +inline v128 v128::eq16(const v128& left, const v128& right) +{ + return fromV(_mm_cmpeq_epi16(left.vi, right.vi)); +} + +inline v128 v128::eq32(const v128& left, const v128& right) +{ + return fromV(_mm_cmpeq_epi32(left.vi, right.vi)); +} + +inline v128 v128::eq32f(const v128& left, const v128& right) +{ + return fromF(_mm_cmpeq_ps(left.vf, right.vf)); +} + +inline v128 v128::fma32f(v128 a, const v128& b, const v128& c) +{ +#ifndef __FMA__ + if (v128_use_fma) [[likely]] + { +#ifdef _MSC_VER + a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf); + return a; +#else + __asm__("vfmadd213ps %[c], %[b], %[a]" + : [a] "+x" (a.vf) + : [b] "x" (b.vf) + , [c] "x" (c.vf)); + return a; +#endif + } + + for (int i = 0; i < 4; i++) + { + a._f[i] = std::fmaf(a._f[i], b._f[i], c._f[i]); + } + return a; +#else + a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf); + return a; +#endif +} + +inline bool v128::operator==(const v128& right) const +{ + return _mm_movemask_epi8(v128::eq32(*this, right).vi) == 0xffff; +} + +inline bool v128::operator!=(const v128& right) const +{ + return !operator==(right); +} + +// result = (~left) & (right) +inline v128 v128::andnot(const v128& left, const v128& right) +{ + return fromV(_mm_andnot_si128(left.vi, right.vi)); +} + +inline v128 operator|(const v128& left, const v128& right) +{ + return v128::fromV(_mm_or_si128(left.vi, right.vi)); +} + +inline v128 operator&(const v128& left, const v128& right) +{ + return v128::fromV(_mm_and_si128(left.vi, right.vi)); +} + +inline v128 operator^(const v128& left, const v128& right) +{ + return v128::fromV(_mm_xor_si128(left.vi, right.vi)); +} + +inline v128 operator~(const v128& other) +{ + return other ^ v128::from32p(UINT32_MAX); // XOR with ones +}