rsx: Implement host GPU sync labels
This commit is contained in:
parent c5680c29bd
commit c7e49b58a8

7 changed files with 167 additions and 40 deletions
@@ -590,6 +590,7 @@ namespace rsx
 		bool supports_hw_conditional_render;   // Conditional render
 		bool supports_passthrough_dma;         // DMA passthrough
 		bool supports_asynchronous_compute;    // Async compute
+		bool supports_host_gpu_labels;         // Advanced host synchronization
 	};

 	struct sampled_image_descriptor_base;
@@ -859,6 +860,7 @@ namespace rsx
 		void sync();
 		flags32_t read_barrier(u32 memory_address, u32 memory_range, bool unconditional);
 		virtual void sync_hint(FIFO_hint hint, void* args);
+		virtual bool release_GCM_label(u32 /*address*/, u32 /*value*/) { return false; }

 		std::span<const std::byte> get_raw_index_array(const draw_clause& draw_indexed_clause) const;
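The base-class hook above is the whole contract: a backend that can retire a GCM label write on the GPU returns true, while the default returns false so the caller falls back to a plain CPU store (see write_gcm_label later in this diff). A minimal self-contained sketch of that dispatch pattern; g_guest_mem and write_label are hypothetical stand-ins, not RPCS3 code:

#include <cstdint>
#include <unordered_map>

using u32 = std::uint32_t;
std::unordered_map<u32, u32> g_guest_mem; // stand-in for mapped guest memory

struct thread_base
{
    virtual ~thread_base() = default;
    // Default: the backend does not claim the label write.
    virtual bool release_GCM_label(u32 /*address*/, u32 /*value*/) { return false; }
};

void write_label(thread_base* t, u32 addr, u32 value)
{
    if (t->release_GCM_label(addr, value))
        return;                // backend queued a GPU-side write instead
    g_guest_mem[addr] = value; // CPU fallback, mirroring write_gcm_label
}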
@@ -550,11 +550,16 @@ VKGSRender::VKGSRender() : GSRender()
 	// Relaxed query synchronization
 	backend_config.supports_hw_conditional_render = !!g_cfg.video.relaxed_zcull_sync;

+	// Passthrough DMA
+	backend_config.supports_passthrough_dma = m_device->get_external_memory_host_support();
+
+	// Host sync
+	backend_config.supports_host_gpu_labels = !!g_cfg.video.host_label_synchronization;
+
 	// Async compute and related operations
 	if (g_cfg.video.vk.asynchronous_texture_streaming)
 	{
-		// Optimistic, enable async compute and passthrough DMA
-		backend_config.supports_passthrough_dma = m_device->get_external_memory_host_support();
+		// Optimistic, enable async compute
 		backend_config.supports_asynchronous_compute = true;

 		if (m_device->get_graphics_queue() == m_device->get_transfer_queue())
@@ -562,10 +567,14 @@ VKGSRender::VKGSRender() : GSRender()
 			rsx_log.error("Cannot run graphics and async transfer in the same queue. Async uploads are disabled. This is a limitation of your GPU");
 			backend_config.supports_asynchronous_compute = false;
 		}
 	}

-	switch (vk::get_driver_vendor())
+	// Sanity checks
+	switch (vk::get_driver_vendor())
 	{
 	case vk::driver_vendor::NVIDIA:
+		if (backend_config.supports_asynchronous_compute)
+		{
 			if (auto chip_family = vk::get_chip_family();
 				chip_family == vk::chip_class::NV_kepler || chip_family == vk::chip_class::NV_maxwell)
 			{
@@ -574,35 +583,47 @@ VKGSRender::VKGSRender() : GSRender()

 			rsx_log.notice("Forcing safe async compute for NVIDIA device to avoid crashing.");
 			g_cfg.video.vk.asynchronous_scheduler.set(vk_gpu_scheduler_mode::safe);
+		}
 		break;
 #if !defined(_WIN32)
 	// Anything running on AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
 	case vk::driver_vendor::RADV:
 	case vk::driver_vendor::AMD:
 #if !defined(__linux__)
 	// Intel chipsets would fail on BSD in most cases and DRM_IOCTL_i915_GEM_USERPTR unimplemented
 	case vk::driver_vendor::ANV:
 #endif
 		if (backend_config.supports_passthrough_dma)
 		{
 			rsx_log.error("AMDGPU kernel driver on linux and INTEL driver on some platforms cannot support passthrough DMA buffers.");
 			backend_config.supports_passthrough_dma = false;
 		}
 		break;
 #endif
 	case vk::driver_vendor::MVK:
 		// Async compute crashes immediately on Apple GPUs
 		rsx_log.error("Apple GPUs are incompatible with the current implementation of asynchronous texture decoding.");
 		backend_config.supports_asynchronous_compute = false;
 		break;
 	default: break;
 	}

 	if (backend_config.supports_asynchronous_compute)
 	{
 		// Run only if async compute can be used.
 		g_fxo->init<vk::AsyncTaskScheduler>(g_cfg.video.vk.asynchronous_scheduler);
 	}

+	if (backend_config.supports_host_gpu_labels)
+	{
+		m_host_object_data = std::make_unique<vk::buffer>(*m_device,
+			0x100000,
+			memory_map.device_bar, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
+			VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0,
+			VMM_ALLOCATION_POOL_SYSTEM);
+
+		m_host_data_ptr = new (m_host_object_data->map(0, 0x100000)) vk::host_data_t();
+		ensure(m_host_data_ptr->magic == 0xCAFEBABE);
+	}
 }
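Note how the host sync block is allocated: a 1 MiB host-visible buffer in the device BAR is persistently mapped, and a host_data_t is placement-new'd into the mapping; checking `magic` immediately verifies the CPU can read back what it just constructed. A self-contained sketch of that allocation pattern, with ordinary heap memory standing in for the vkMapMemory result:

#include <cstdint>
#include <new>

struct host_data_t
{
    std::uint64_t magic = 0xCAFEBABE; // sentinel validating the mapping
};

int main()
{
    // Stand-in for the persistently mapped, host-visible Vulkan allocation.
    void* mapped = ::operator new(0x100000);

    // Construct the sync block in place, as the commit does with
    // m_host_object_data->map(0, 0x100000).
    auto* host_data = new (mapped) host_data_t();
    const bool ok = (host_data->magic == 0xCAFEBABE);

    host_data->~host_data_t();
    ::operator delete(mapped);
    return ok ? 0 : 1;
}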
@@ -629,6 +650,13 @@ VKGSRender::~VKGSRender()
 		g_fxo->get<vk::AsyncTaskScheduler>().destroy();
 	}

+	// Host data
+	if (m_host_object_data)
+	{
+		m_host_object_data->unmap();
+		m_host_object_data.reset();
+	}
+
 	// Clear flush requests
 	m_flush_requests.clear_pending_flag();
@@ -1453,6 +1481,35 @@ void VKGSRender::flush_command_queue(bool hard_sync, bool do_not_switch)
 		m_current_command_buffer->begin();
 	}

+bool VKGSRender::release_GCM_label(u32 address, u32 args)
+{
+	if (!backend_config.supports_host_gpu_labels)
+	{
+		return false;
+	}
+
+	ensure(m_host_data_ptr);
+	if (m_host_data_ptr->texture_load_complete_event == m_host_data_ptr->texture_load_request_event)
+	{
+		// All texture loads already seen by the host GPU
+		// Wait for all previously submitted labels to be flushed
+		while (m_host_data_ptr->last_label_release_event > m_host_data_ptr->commands_complete_event)
+		{
+			_mm_pause();
+		}
+
+		return false;
+	}
+
+	m_host_data_ptr->last_label_release_event = ++m_host_data_ptr->event_counter;
+
+	const auto mapping = vk::map_dma(address, 4);
+	const auto write_data = std::bit_cast<u32, be_t<u32>>(args);
+	vkCmdUpdateBuffer(*m_current_command_buffer, mapping.second->value, mapping.first, 4, &write_data);
+	flush_command_queue();
+	return true;
+}
+
 void VKGSRender::sync_hint(rsx::FIFO_hint hint, void* args)
 {
 	ensure(args);
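One subtlety in release_GCM_label above: RSX labels live in guest memory in big-endian byte order, and the GPU-side copy is a raw 4-byte write, so the value is converted with std::bit_cast<u32, be_t<u32>> before vkCmdUpdateBuffer records it. A standalone sketch of the equivalent conversion on a little-endian host, using C++23 std::byteswap in place of RPCS3's be_t wrapper:

#include <bit>
#include <cstdint>

// Equivalent of std::bit_cast<u32, be_t<u32>>(args) on a little-endian host.
constexpr std::uint32_t to_guest_order(std::uint32_t host_value)
{
    return std::byteswap(host_value);
}

static_assert(to_guest_order(0x12345678u) == 0x78563412u);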
@@ -2088,6 +2145,15 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore
 		m_current_command_buffer->flags &= ~vk::command_buffer::cb_has_open_query;
 	}

+	if (m_host_data_ptr && m_host_data_ptr->last_label_release_event > m_host_data_ptr->commands_complete_event)
+	{
+		vkCmdUpdateBuffer(*m_current_command_buffer,
+			m_host_object_data->value,
+			::offset32(&vk::host_data_t::commands_complete_event),
+			sizeof(u64),
+			const_cast<u64*>(&m_host_data_ptr->event_counter));
+	}
+
 	m_current_command_buffer->end();
 	m_current_command_buffer->tag();
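This is the producer half of the spin-wait in release_GCM_label: any submission that follows a label release records a GPU-side copy of event_counter into commands_complete_event, so once the command buffer executes, commands_complete_event catches up to last_label_release_event and the CPU loop exits.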
@@ -117,6 +117,9 @@ private:
 	vk::command_buffer_chunk* m_current_command_buffer = nullptr;
 	VkSemaphore m_dangling_semaphore_signal = VK_NULL_HANDLE;

+	volatile vk::host_data_t* m_host_data_ptr = nullptr;
+	std::unique_ptr<vk::buffer> m_host_object_data;
+
 	VkDescriptorSetLayout descriptor_layouts;
 	VkPipelineLayout pipeline_layout;
@@ -242,6 +245,7 @@ public:
 	void bind_viewport();

 	void sync_hint(rsx::FIFO_hint hint, void* args) override;
+	bool release_GCM_label(u32 address, u32 data) override;

 	void begin_occlusion_query(rsx::reports::occlusion_query_info* query) override;
 	void end_occlusion_query(rsx::reports::occlusion_query_info* query) override;
@@ -259,6 +263,9 @@ public:
 	void begin_conditional_rendering(const std::vector<rsx::reports::occlusion_query_info*>& sources) override;
 	void end_conditional_rendering() override;

+	// Host sync object
+	inline std::pair<volatile vk::host_data_t*, VkBuffer> map_host_object_data() { return { m_host_data_ptr, m_host_object_data->value }; }
+
 protected:
 	void clear_surface(u32 mask) override;
 	void begin() override;
@@ -9,6 +9,7 @@
 #include "vkutils/data_heap.h"
 #include "vkutils/image_helpers.h"
+#include "VKGSRender.h"

 #include "../GCM.h"
 #include "../rsx_utils.h"
@@ -1146,6 +1147,17 @@ namespace vk
 			// Release from async chain, the primary chain will acquire later
 			dst_image->queue_release(cmd2, cmd.get_queue_family(), dst_image->current_layout);
 		}
+
+		if (auto rsxthr = rsx::get_current_renderer();
+			rsxthr->get_backend_config().supports_host_gpu_labels)
+		{
+			// Queue a sync update on the CB doing the load
+			auto [host_data, host_buffer] = static_cast<VKGSRender*>(rsxthr)->map_host_object_data();
+			ensure(host_data);
+			const auto event_id = ++host_data->event_counter;
+			host_data->texture_load_request_event = event_id;
+			vkCmdUpdateBuffer(cmd2, host_buffer, ::offset32(&vk::host_data_t::texture_load_complete_event), sizeof(u64), &event_id);
+		}
 	}

 	void blitter::scale_image(vk::command_buffer& cmd, vk::image* src, vk::image* dst, areai src_area, areai dst_area, bool interpolate, const rsx::typeless_xfer& xfer_info)
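This is the other counter pair: the CPU bumps texture_load_request_event immediately when an async texture upload is recorded, and the same ticket is written into texture_load_complete_event by the GPU when the upload command buffer executes. release_GCM_label above only takes the GPU-label path while these two differ, i.e. while a texture load is still in flight on the host GPU.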
@@ -16,6 +16,16 @@ namespace vk
 		gpu = 1
 	};

+	struct host_data_t // Pick a better name
+	{
+		u64 magic = 0xCAFEBABE;
+		u64 event_counter = 0;
+		u64 texture_load_request_event = 0;
+		u64 texture_load_complete_event = 0;
+		u64 last_label_release_event = 0;
+		u64 commands_complete_event = 0;
+	};
+
 	struct fence
 	{
 		atomic_t<bool> flushed = false;
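All five counters are monotonic u64 tickets drawn from event_counter, so any pair can be read as "requested vs. observed". A minimal self-contained model of the label handshake this commit builds on; this is a sketch of how the fields are used, with std::atomic and a thread standing in for the GPU-visible mapped buffer and the command stream:

#include <atomic>
#include <cstdint>
#include <thread>

struct host_data_model
{
    std::atomic<std::uint64_t> event_counter{0};
    std::atomic<std::uint64_t> last_label_release_event{0};
    std::atomic<std::uint64_t> commands_complete_event{0};
};

int main()
{
    host_data_model h;

    // CPU (release_GCM_label): take a ticket for the label being released.
    h.last_label_release_event = ++h.event_counter;

    // "GPU" (close_and_submit_command_buffer): publish the latest counter,
    // as vkCmdUpdateBuffer does into commands_complete_event.
    std::thread gpu([&] { h.commands_complete_event.store(h.event_counter.load()); });

    // CPU: wait until every submitted label write has been flushed.
    while (h.last_label_release_event.load() > h.commands_complete_event.load())
        ; // the real loop uses _mm_pause()

    gpu.join();
    return 0;
}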
@@ -29,6 +29,35 @@ namespace rsx
 		rsx_log.trace("RSX method 0x%x (arg=0x%x)", reg << 2, arg);
 	}

+	template<bool FlushDMA, bool FlushPipe>
+	void write_gcm_label(thread* rsx, u32 address, u32 data)
+	{
+		const bool is_flip_sema = (address == (rsx->label_addr + 0x10) || address == (rsx->label_addr + 0x30));
+		if (!is_flip_sema)
+		{
+			if constexpr (FlushPipe)
+			{
+				// Ignoring these can cause very poor performance due to timestamp queries taking too long.
+				rsx->sync();
+			}
+
+			if (rsx->get_backend_config().supports_host_gpu_labels &&
+				rsx->release_GCM_label(address, data))
+			{
+				// Backend will handle it, nothing to do.
+				// Implicitly handles DMA sync.
+				return;
+			}
+
+			if constexpr (FlushDMA)
+			{
+				g_fxo->get<rsx::dma_manager>().sync();
+			}
+		}
+
+		vm::_ref<RsxSemaphore>(address).val = data;
+	}
+
 	template<typename Type> struct vertex_data_type_from_element_type;
 	template<> struct vertex_data_type_from_element_type<float> { static const vertex_base_type type = vertex_base_type::f; };
 	template<> struct vertex_data_type_from_element_type<f16> { static const vertex_base_type type = vertex_base_type::sf; };
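The two template flags encode what must be flushed before the label becomes visible: FlushPipe forces a full rsx->sync(), while FlushDMA only drains the DMA manager. The call sites below instantiate them accordingly: NV406E semaphore_release uses write_gcm_label<false, true>, the NV4097 texture read release uses <true, true> under strict rendering mode and <true, false> otherwise, and the back-end write release uses <true, true>.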
@@ -74,6 +103,8 @@ namespace rsx
 			rsx->flush_fifo();
 		}

+		//rsx_log.error("Wait for address at 0x%x to change to 0x%x", addr, arg);
+
 		u64 start = get_system_time();
 		while (sema != arg)
 		{
@@ -116,8 +147,6 @@ namespace rsx
 	void semaphore_release(thread* rsx, u32 /*reg*/, u32 arg)
 	{
-		rsx->sync();
-
 		const u32 offset = method_registers.semaphore_offset_406e();

 		if (offset % 4)
@@ -144,7 +173,7 @@ namespace rsx
 			rsx_log.fatal("NV406E semaphore unexpected address. Please report to the developers. (offset=0x%x, addr=0x%x)", offset, addr);
 		}

-		vm::_ref<RsxSemaphore>(addr).val = arg;
+		write_gcm_label<false, true>(rsx, addr, arg);
 	}
 }
@@ -207,11 +236,6 @@ namespace rsx
 	void texture_read_semaphore_release(thread* rsx, u32 /*reg*/, u32 arg)
 	{
 		// Pipeline barrier seems to be equivalent to a SHADER_READ stage barrier
-		g_fxo->get<rsx::dma_manager>().sync();
-		if (g_cfg.video.strict_rendering_mode)
-		{
-			rsx->sync();
-		}

 		// lle-gcm likes to inject system reserved semaphores, presumably for system/vsh usage
 		// Avoid calling render to avoid any havoc(flickering) they may cause from invalid flush/write
@@ -224,14 +248,19 @@ namespace rsx
 			return;
 		}

-		vm::_ref<RsxSemaphore>(get_address(offset, method_registers.semaphore_context_dma_4097())).val = arg;
+		if (g_cfg.video.strict_rendering_mode) [[ unlikely ]]
+		{
+			write_gcm_label<true, true>(rsx, get_address(offset, method_registers.semaphore_context_dma_4097()), arg);
+		}
+		else
+		{
+			write_gcm_label<true, false>(rsx, get_address(offset, method_registers.semaphore_context_dma_4097()), arg);
+		}
 	}

 	void back_end_write_semaphore_release(thread* rsx, u32 /*reg*/, u32 arg)
 	{
 		// Full pipeline barrier
 		g_fxo->get<rsx::dma_manager>().sync();
 		rsx->sync();

 		const u32 offset = method_registers.semaphore_offset_4097();
@@ -243,7 +272,7 @@ namespace rsx
 		}

 		const u32 val = (arg & 0xff00ff00) | ((arg & 0xff) << 16) | ((arg >> 16) & 0xff);
-		vm::_ref<RsxSemaphore>(get_address(offset, method_registers.semaphore_context_dma_4097())).val = val;
+		write_gcm_label<true, true>(rsx, get_address(offset, method_registers.semaphore_context_dma_4097()), val);
 	}

 /**
@@ -156,6 +156,7 @@ struct cfg_root : cfg::node
 		cfg::_int<1, 1800> vblank_rate{ this, "Vblank Rate", 60, true }; // Changing this from 60 may affect game speed in unexpected ways
 		cfg::_bool vblank_ntsc{ this, "Vblank NTSC Fixup", false, true };
 		cfg::_bool decr_memory_layout{ this, "DECR memory layout", false}; // Force enable increased allowed main memory range as DECR console
+		cfg::_bool host_label_synchronization{ this, "Use Host GPU Labels", false };

 		struct node_vk : cfg::node
 		{
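The feature ships disabled. Like the other cfg::_bool entries in this node, the option serializes under its display name, so it would be toggled from config.yml under the Video node. A hedged sketch of the expected entry, assuming the standard cfg serialization rather than output copied from a real file:

Video:
  Use Host GPU Labels: true   # default: false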