rsx: Implement host GPU sync labels

Authored by kd-11 on 2022-02-19 20:22:52 +03:00, committed by kd-11
parent c5680c29bd
commit c7e49b58a8
7 changed files with 167 additions and 40 deletions

View file

@@ -590,6 +590,7 @@ namespace rsx
bool supports_hw_conditional_render; // Conditional render
bool supports_passthrough_dma; // DMA passthrough
bool supports_asynchronous_compute; // Async compute
bool supports_host_gpu_labels; // Advanced host synchronization
};
struct sampled_image_descriptor_base;
@@ -859,6 +860,7 @@ namespace rsx
void sync();
flags32_t read_barrier(u32 memory_address, u32 memory_range, bool unconditional);
virtual void sync_hint(FIFO_hint hint, void* args);
virtual bool release_GCM_label(u32 /*address*/, u32 /*value*/) { return false; }
std::span<const std::byte> get_raw_index_array(const draw_clause& draw_indexed_clause) const;
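For orientation, a minimal sketch (not part of the diff) of the contract behind the new release_GCM_label virtual: the base implementation declines by returning false, and the caller then writes the semaphore from the CPU; a backend that returns true takes over the 32-bit label write and performs it on its own GPU timeline. backend_supports_labels() and queue_gpu_label_write() below are hypothetical stand-ins for backend state and the backend's own write path.

```cpp
// Sketch only - illustrates the contract of rsx::thread::release_GCM_label().
#include <cstdint>
using u32 = std::uint32_t;

bool backend_supports_labels();                      // hypothetical
void queue_gpu_label_write(u32 address, u32 value);  // hypothetical

bool release_GCM_label_sketch(u32 address, u32 value)
{
    if (!backend_supports_labels())
    {
        // Decline: the caller falls back to a plain CPU-side semaphore write.
        return false;
    }

    // Accept: queue the 32-bit write on the GPU timeline so it lands after any
    // in-flight GPU work; returning true also lets the caller skip its DMA sync.
    queue_gpu_label_write(address, value);
    return true;
}
```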

View file

@@ -550,11 +550,16 @@ VKGSRender::VKGSRender() : GSRender()
// Relaxed query synchronization
backend_config.supports_hw_conditional_render = !!g_cfg.video.relaxed_zcull_sync;
// Passthrough DMA
backend_config.supports_passthrough_dma = m_device->get_external_memory_host_support();
// Host sync
backend_config.supports_host_gpu_labels = !!g_cfg.video.host_label_synchronization;
// Async compute and related operations
if (g_cfg.video.vk.asynchronous_texture_streaming)
{
// Optimistic, enable async compute and passthrough DMA
backend_config.supports_passthrough_dma = m_device->get_external_memory_host_support();
// Optimistic, enable async compute
backend_config.supports_asynchronous_compute = true;
if (m_device->get_graphics_queue() == m_device->get_transfer_queue())
@@ -562,10 +567,14 @@ VKGSRender::VKGSRender() : GSRender()
rsx_log.error("Cannot run graphics and async transfer in the same queue. Async uploads are disabled. This is a limitation of your GPU");
backend_config.supports_asynchronous_compute = false;
}
}
switch (vk::get_driver_vendor())
// Sanity checks
switch (vk::get_driver_vendor())
{
case vk::driver_vendor::NVIDIA:
if (backend_config.supports_asynchronous_compute)
{
case vk::driver_vendor::NVIDIA:
if (auto chip_family = vk::get_chip_family();
chip_family == vk::chip_class::NV_kepler || chip_family == vk::chip_class::NV_maxwell)
{
@@ -574,35 +583,47 @@ VKGSRender::VKGSRender() : GSRender()
rsx_log.notice("Forcing safe async compute for NVIDIA device to avoid crashing.");
g_cfg.video.vk.asynchronous_scheduler.set(vk_gpu_scheduler_mode::safe);
break;
}
break;
#if !defined(_WIN32)
// Anything running on AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
case vk::driver_vendor::RADV:
case vk::driver_vendor::AMD:
// Anything running on AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
case vk::driver_vendor::RADV:
case vk::driver_vendor::AMD:
#if !defined(__linux__)
// Intel chipsets would fail on BSD in most cases and DRM_IOCTL_i915_GEM_USERPTR unimplemented
case vk::driver_vendor::ANV:
// Intel chipsets would fail on BSD in most cases and DRM_IOCTL_i915_GEM_USERPTR unimplemented
case vk::driver_vendor::ANV:
#endif
if (backend_config.supports_passthrough_dma)
{
rsx_log.error("AMDGPU kernel driver on linux and INTEL driver on some platforms cannot support passthrough DMA buffers.");
backend_config.supports_passthrough_dma = false;
}
break;
#endif
case vk::driver_vendor::MVK:
// Async compute crashes immediately on Apple GPUs
rsx_log.error("Apple GPUs are incompatible with the current implementation of asynchronous texture decoding.");
backend_config.supports_asynchronous_compute = false;
break;
default: break;
}
if (backend_config.supports_asynchronous_compute)
if (backend_config.supports_passthrough_dma)
{
// Run only if async compute can be used.
g_fxo->init<vk::AsyncTaskScheduler>(g_cfg.video.vk.asynchronous_scheduler);
rsx_log.error("AMDGPU kernel driver on linux and INTEL driver on some platforms cannot support passthrough DMA buffers.");
backend_config.supports_passthrough_dma = false;
}
break;
#endif
case vk::driver_vendor::MVK:
// Async compute crashes immediately on Apple GPUs
rsx_log.error("Apple GPUs are incompatible with the current implementation of asynchronous texture decoding.");
backend_config.supports_asynchronous_compute = false;
break;
default: break;
}
if (backend_config.supports_asynchronous_compute)
{
// Run only if async compute can be used.
g_fxo->init<vk::AsyncTaskScheduler>(g_cfg.video.vk.asynchronous_scheduler);
}
if (backend_config.supports_host_gpu_labels)
{
m_host_object_data = std::make_unique<vk::buffer>(*m_device,
0x100000,
memory_map.device_bar, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0,
VMM_ALLOCATION_POOL_SYSTEM);
m_host_data_ptr = new (m_host_object_data->map(0, 0x100000)) vk::host_data_t();
ensure(m_host_data_ptr->magic == 0xCAFEBABE);
}
}
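A note on the construction just above, shown as a compact sketch (not part of the diff): the label block lives at the start of a persistently mapped, host-visible buffer, placement-new runs host_data_t's default member initializers directly inside that mapping, and the magic check immediately afterwards is a readback sanity check on the freshly constructed block. construct_label_block is a hypothetical name for the pattern the constructor performs inline.

```cpp
// Sketch only - the in-place construction pattern used in the constructor above.
#include <new> // placement-new

void construct_label_block(void* mapped_host_visible_memory)
{
    // Default member initializers of host_data_t run directly inside the mapping.
    auto* labels = new (mapped_host_visible_memory) vk::host_data_t();

    // Immediate readback check: the mapping is usable and the block was initialized.
    ensure(labels->magic == 0xCAFEBABE);
}
```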
@@ -629,6 +650,13 @@ VKGSRender::~VKGSRender()
g_fxo->get<vk::AsyncTaskScheduler>().destroy();
}
// Host data
if (m_host_object_data)
{
m_host_object_data->unmap();
m_host_object_data.reset();
}
// Clear flush requests
m_flush_requests.clear_pending_flag();
@@ -1453,6 +1481,35 @@ void VKGSRender::flush_command_queue(bool hard_sync, bool do_not_switch)
m_current_command_buffer->begin();
}
bool VKGSRender::release_GCM_label(u32 address, u32 args)
{
if (!backend_config.supports_host_gpu_labels)
{
return false;
}
ensure(m_host_data_ptr);
if (m_host_data_ptr->texture_load_complete_event == m_host_data_ptr->texture_load_request_event)
{
// All texture loads already seen by the host GPU
// Wait for all previously submitted labels to be flushed
while (m_host_data_ptr->last_label_release_event > m_host_data_ptr->commands_complete_event)
{
_mm_pause();
}
return false;
}
m_host_data_ptr->last_label_release_event = ++m_host_data_ptr->event_counter;
const auto mapping = vk::map_dma(address, 4);
const auto write_data = std::bit_cast<u32, be_t<u32>>(args);
vkCmdUpdateBuffer(*m_current_command_buffer, mapping.second->value, mapping.first, 4, &write_data);
flush_command_queue();
return true;
}
void VKGSRender::sync_hint(rsx::FIFO_hint hint, void* args)
{
ensure(args);
@@ -2088,6 +2145,15 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore
m_current_command_buffer->flags &= ~vk::command_buffer::cb_has_open_query;
}
if (m_host_data_ptr && m_host_data_ptr->last_label_release_event > m_host_data_ptr->commands_complete_event)
{
vkCmdUpdateBuffer(*m_current_command_buffer,
m_host_object_data->value,
::offset32(&vk::host_data_t::commands_complete_event),
sizeof(u64),
const_cast<u64*>(&m_host_data_ptr->event_counter));
}
m_current_command_buffer->end();
m_current_command_buffer->tag();
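The same vkCmdUpdateBuffer-based write appears three times in this commit: releasing the GCM label into DMA-mapped guest memory, acknowledging texture loads, and (just above) publishing commands_complete_event when the command buffer is closed. A generic sketch of the pattern follows (not part of the diff); write_label_field is a hypothetical helper. The key property is that the source value is captured when the command is recorded, while the destination memory only changes when the GPU executes the command, which is what makes it usable as a GPU-side progress label.

```cpp
// Sketch only - the single-field label write used throughout this commit.
#include <vulkan/vulkan.h>

template <typename T>
void write_label_field(VkCommandBuffer cmd, VkBuffer buffer, VkDeviceSize offset, const T& value)
{
    // vkCmdUpdateBuffer copies `value` into the command buffer at record time;
    // the write to `buffer` happens only when the GPU reaches this command.
    vkCmdUpdateBuffer(cmd, buffer, offset, sizeof(T), &value);
}

// e.g. write_label_field(cmd, m_host_object_data->value,
//          ::offset32(&vk::host_data_t::commands_complete_event), event_id);
```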

View file

@@ -117,6 +117,9 @@ private:
vk::command_buffer_chunk* m_current_command_buffer = nullptr;
VkSemaphore m_dangling_semaphore_signal = VK_NULL_HANDLE;
volatile vk::host_data_t* m_host_data_ptr = nullptr;
std::unique_ptr<vk::buffer> m_host_object_data;
VkDescriptorSetLayout descriptor_layouts;
VkPipelineLayout pipeline_layout;
@@ -242,6 +245,7 @@ public:
void bind_viewport();
void sync_hint(rsx::FIFO_hint hint, void* args) override;
bool release_GCM_label(u32 address, u32 data) override;
void begin_occlusion_query(rsx::reports::occlusion_query_info* query) override;
void end_occlusion_query(rsx::reports::occlusion_query_info* query) override;
@@ -259,6 +263,9 @@ public:
void begin_conditional_rendering(const std::vector<rsx::reports::occlusion_query_info*>& sources) override;
void end_conditional_rendering() override;
// Host sync object
inline std::pair<volatile vk::host_data_t*, VkBuffer> map_host_object_data() { return { m_host_data_ptr, m_host_object_data->value }; }
protected:
void clear_surface(u32 mask) override;
void begin() override;

View file

@@ -9,6 +9,7 @@
#include "vkutils/data_heap.h"
#include "vkutils/image_helpers.h"
#include "VKGSRender.h"
#include "../GCM.h"
#include "../rsx_utils.h"
@@ -1146,6 +1147,17 @@ namespace vk
// Release from async chain, the primary chain will acquire later
dst_image->queue_release(cmd2, cmd.get_queue_family(), dst_image->current_layout);
}
if (auto rsxthr = rsx::get_current_renderer();
rsxthr->get_backend_config().supports_host_gpu_labels)
{
// Queue a sync update on the CB doing the load
auto [host_data, host_buffer] = static_cast<VKGSRender*>(rsxthr)->map_host_object_data();
ensure(host_data);
const auto event_id = ++host_data->event_counter;
host_data->texture_load_request_event = event_id;
vkCmdUpdateBuffer(cmd2, host_buffer, ::offset32(&vk::host_data_t::texture_load_complete_event), sizeof(u64), &event_id);
}
}
void blitter::scale_image(vk::command_buffer& cmd, vk::image* src, vk::image* dst, areai src_area, areai dst_area, bool interpolate, const rsx::typeless_xfer& xfer_info)

View file

@@ -16,6 +16,16 @@ namespace vk
gpu = 1
};
struct host_data_t // Pick a better name
{
u64 magic = 0xCAFEBABE;
u64 event_counter = 0;
u64 texture_load_request_event = 0;
u64 texture_load_complete_event = 0;
u64 last_label_release_event = 0;
u64 commands_complete_event = 0;
};
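To make the pairing of the counters above explicit (a sketch, not part of the diff): the CPU owns event_counter and writes texture_load_request_event and last_label_release_event, while the GPU fills in the matching *_complete_event fields through vkCmdUpdateBuffer as its command buffers execute. The helpers below are hypothetical names for the two CPU-side operations the VKGSRender hunks perform inline.

```cpp
// Sketch only - CPU-side view of the protocol over host_data_t.
#include <immintrin.h> // _mm_pause

// Record that another label write has been handed to the GPU timeline.
inline u64 record_label_release(volatile vk::host_data_t& hd)
{
    hd.last_label_release_event = ++hd.event_counter;
    return hd.last_label_release_event;
}

// Spin until the GPU has retired every command buffer carrying a label write;
// commands_complete_event is advanced by the GPU at the end of such command
// buffers (see close_and_submit_command_buffer above).
inline void wait_for_label_flush(const volatile vk::host_data_t& hd)
{
    while (hd.last_label_release_event > hd.commands_complete_event)
    {
        _mm_pause();
    }
}
```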
struct fence
{
atomic_t<bool> flushed = false;

View file

@@ -29,6 +29,35 @@ namespace rsx
rsx_log.trace("RSX method 0x%x (arg=0x%x)", reg << 2, arg);
}
template<bool FlushDMA, bool FlushPipe>
void write_gcm_label(thread* rsx, u32 address, u32 data)
{
const bool is_flip_sema = (address == (rsx->label_addr + 0x10) || address == (rsx->label_addr + 0x30));
if (!is_flip_sema)
{
if constexpr (FlushPipe)
{
// Ignoring these can cause very poor performance due to timestamp queries taking too long.
rsx->sync();
}
if (rsx->get_backend_config().supports_host_gpu_labels &&
rsx->release_GCM_label(address, data))
{
// Backend will handle it, nothing to do.
// Implicitly handles DMA sync.
return;
}
if constexpr (FlushDMA)
{
g_fxo->get<rsx::dma_manager>().sync();
}
}
vm::_ref<RsxSemaphore>(address).val = data;
}
template<typename Type> struct vertex_data_type_from_element_type;
template<> struct vertex_data_type_from_element_type<float> { static const vertex_base_type type = vertex_base_type::f; };
template<> struct vertex_data_type_from_element_type<f16> { static const vertex_base_type type = vertex_base_type::sf; };
@@ -74,6 +103,8 @@ namespace rsx
rsx->flush_fifo();
}
//rsx_log.error("Wait for address at 0x%x to change to 0x%x", addr, arg);
u64 start = get_system_time();
while (sema != arg)
{
@@ -116,8 +147,6 @@ namespace rsx
void semaphore_release(thread* rsx, u32 /*reg*/, u32 arg)
{
rsx->sync();
const u32 offset = method_registers.semaphore_offset_406e();
if (offset % 4)
@@ -144,7 +173,7 @@ namespace rsx
rsx_log.fatal("NV406E semaphore unexpected address. Please report to the developers. (offset=0x%x, addr=0x%x)", offset, addr);
}
vm::_ref<RsxSemaphore>(addr).val = arg;
write_gcm_label<false, true>(rsx, addr, arg);
}
}
@@ -207,11 +236,6 @@ namespace rsx
void texture_read_semaphore_release(thread* rsx, u32 /*reg*/, u32 arg)
{
// Pipeline barrier seems to be equivalent to a SHADER_READ stage barrier
g_fxo->get<rsx::dma_manager>().sync();
if (g_cfg.video.strict_rendering_mode)
{
rsx->sync();
}
// lle-gcm likes to inject system reserved semaphores, presumably for system/vsh usage
// Avoid calling render to avoid any havoc(flickering) they may cause from invalid flush/write
@@ -224,14 +248,19 @@ namespace rsx
return;
}
vm::_ref<RsxSemaphore>(get_address(offset, method_registers.semaphore_context_dma_4097())).val = arg;
if (g_cfg.video.strict_rendering_mode) [[ unlikely ]]
{
write_gcm_label<true, true>(rsx, get_address(offset, method_registers.semaphore_context_dma_4097()), arg);
}
else
{
write_gcm_label<true, false>(rsx, get_address(offset, method_registers.semaphore_context_dma_4097()), arg);
}
}
void back_end_write_semaphore_release(thread* rsx, u32 /*reg*/, u32 arg)
{
// Full pipeline barrier
g_fxo->get<rsx::dma_manager>().sync();
rsx->sync();
const u32 offset = method_registers.semaphore_offset_4097();
@@ -243,7 +272,7 @@ namespace rsx
}
const u32 val = (arg & 0xff00ff00) | ((arg & 0xff) << 16) | ((arg >> 16) & 0xff);
vm::_ref<RsxSemaphore>(get_address(offset, method_registers.semaphore_context_dma_4097())).val = val;
write_gcm_label<true, true>(rsx, get_address(offset, method_registers.semaphore_context_dma_4097()), val);
}
/**

View file

@@ -156,6 +156,7 @@ struct cfg_root : cfg::node
cfg::_int<1, 1800> vblank_rate{ this, "Vblank Rate", 60, true }; // Changing this from 60 may affect game speed in unexpected ways
cfg::_bool vblank_ntsc{ this, "Vblank NTSC Fixup", false, true };
cfg::_bool decr_memory_layout{ this, "DECR memory layout", false}; // Force enable increased allowed main memory range as DECR console
cfg::_bool host_label_synchronization{ this, "Use Host GPU Labels", false };
struct node_vk : cfg::node
{