rsx: Optimize static heap allocations

kd-11 2025-03-31 19:55:05 +03:00 committed by kd-11
parent 76948b6364
commit 26a7e9653f
4 changed files with 123 additions and 91 deletions

View file

@@ -404,9 +404,9 @@ namespace
 	}
 };

 template <typename T>
 NEVER_INLINE std::tuple<T, T, u32> upload_untouched_skip_restart(std::span<to_be_t<const T>> src, std::span<T> dst, T restart_index)
 {
 	T min_index = index_limit<T>();
 	T max_index = 0;
 	u32 written = 0;
@@ -422,11 +422,11 @@ NEVER_INLINE std::tuple<T, T, u32> upload_untouched_skip_restart(std::span<to_be
 	}

 	return std::make_tuple(min_index, max_index, written);
 }

 template<typename T>
 std::tuple<T, T, u32> upload_untouched(std::span<to_be_t<const T>> src, std::span<T> dst, rsx::primitive_type draw_mode, bool is_primitive_restart_enabled, u32 primitive_restart_index)
 {
 	if (!is_primitive_restart_enabled)
 	{
 		return untouched_impl::upload_untouched(src, dst);
@@ -454,7 +454,26 @@ std::tuple<T, T, u32> upload_untouched(std::span<to_be_t<const T>> src, std::spa
 	{
 		return primitive_restart_impl::upload_untouched(src, dst, primitive_restart_index);
 	}
 }

+void iota16(u16* dst, u32 count)
+{
+	unsigned i = 0;
+
+#if defined(ARCH_X64) || defined(ARCH_ARM64)
+	const unsigned step = 8;                    // We do 8 entries per step
+	const __m128i vec_step = _mm_set1_epi16(8); // Constant to increment the raw values
+	__m128i values = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
+	__m128i* vec_ptr = reinterpret_cast<__m128i*>(dst);
+	for (; (i + step) <= count; i += step, vec_ptr++)
+	{
+		_mm_stream_si128(vec_ptr, values);
+		values = _mm_add_epi16(values, vec_step);
+	}
+#endif
+	for (; i < count; ++i)
+		dst[i] = i;
+}
+
 template<typename T>
 std::tuple<T, T, u32> expand_indexed_triangle_fan(std::span<to_be_t<const T>> src, std::span<T> dst, bool is_primitive_restart_enabled, u32 primitive_restart_index)
@@ -624,8 +643,7 @@ void write_index_array_for_non_indexed_non_native_primitive_to_buffer(char* dst,
 	switch (draw_mode)
 	{
 	case rsx::primitive_type::line_loop:
-		for (unsigned i = 0; i < count; ++i)
-			typedDst[i] = i;
+		iota16(typedDst, count);
 		typedDst[count] = 0;
 		return;
 	case rsx::primitive_type::triangle_fan:
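
For reference, the fill pattern above can be exercised standalone. The following is a minimal sketch assuming an x86-64 compiler with SSE2 headers; the function name iota16_demo, the verification harness in main, and the _mm_sfence() call are illustrative additions, not part of the commit:

#include <cstdint>
#include <cstdio>
#include <emmintrin.h> // SSE2 intrinsics (_mm_set_epi16, _mm_stream_si128, ...)

// Same idea as iota16 above: write 0, 1, 2, ... in 8-entry chunks using
// non-temporal (cache-bypassing) stores, then finish the tail scalar-wise.
static void iota16_demo(uint16_t* dst, uint32_t count)
{
	uint32_t i = 0;
	const __m128i vec_step = _mm_set1_epi16(8);
	__m128i values = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
	__m128i* vec_ptr = reinterpret_cast<__m128i*>(dst); // Requires 16-byte alignment

	for (; (i + 8) <= count; i += 8, vec_ptr++)
	{
		_mm_stream_si128(vec_ptr, values);
		values = _mm_add_epi16(values, vec_step); // Advance each lane by 8
	}
	_mm_sfence(); // Flush write-combining buffers before others read the data

	for (; i < count; ++i)
		dst[i] = static_cast<uint16_t>(i);
}

int main()
{
	alignas(16) static uint16_t buf[1003]; // Deliberately not a multiple of 8
	iota16_demo(buf, 1003);

	for (uint32_t i = 0; i < 1003; ++i)
	{
		if (buf[i] != i)
		{
			std::printf("mismatch at %u\n", i);
			return 1;
		}
	}
	std::printf("ok\n");
	return 0;
}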

View file

@@ -20,30 +20,34 @@ protected:
 	template<int Alignment>
 	bool can_alloc(usz size) const
 	{
-		usz alloc_size = utils::align(size, Alignment);
-		usz aligned_put_pos = utils::align(m_put_pos, Alignment);
+		const usz alloc_size = utils::align(size, Alignment);
+		const usz aligned_put_pos = utils::align(m_put_pos, Alignment);
+		const usz alloc_end = aligned_put_pos + alloc_size;

-		if (aligned_put_pos + alloc_size < m_size)
+		if (alloc_end < m_size) [[ likely ]]
 		{
-			// range before get
-			if (aligned_put_pos + alloc_size < m_get_pos)
+			// Range before get
+			if (alloc_end < m_get_pos)
 				return true;
-			// range after get
+
+			// Range after get
 			if (aligned_put_pos > m_get_pos)
 				return true;
 			return false;
 		}
-		else
-		{
 			// ..]....[..get..
 			if (aligned_put_pos < m_get_pos)
 				return false;
 			// ..get..]...[...
 			// Actually all resources extending beyond heap space starts at 0
 			if (alloc_size > m_get_pos)
 				return false;
 			return true;
-		}
 	}

 	// Grow the buffer to hold at least size bytes
 	virtual bool grow(usz /*size*/)
@@ -54,9 +58,8 @@ protected:
 	usz m_size;
 	usz m_put_pos; // Start of free space
-	usz m_min_guard_size; //If an allocation touches the guard region, reset the heap to avoid going over budget
-	usz m_current_allocated_size;
-	usz m_largest_allocated_pool;
+	usz m_get_pos; // End of free space
+	usz m_min_guard_size; // If an allocation touches the guard region, reset the heap to avoid going over budget

 	char* m_name;

 public:
@@ -65,8 +68,6 @@ public:
 	data_heap(const data_heap&) = delete;
 	data_heap(data_heap&&) = delete;

-	usz m_get_pos; // End of free space
-
 	void init(usz heap_size, const char* buffer_name = "unnamed", usz min_guard_size=0x10000)
 	{
 		m_name = const_cast<char*>(buffer_name);
@@ -75,10 +76,8 @@ public:
 		m_put_pos = 0;
 		m_get_pos = heap_size - 1;

-		//allocation stats
+		// Allocation stats
 		m_min_guard_size = min_guard_size;
-		m_current_allocated_size = 0;
-		m_largest_allocated_pool = 0;
 	}

 	template<int Alignment>
@@ -89,24 +88,45 @@ public:
 		if (!can_alloc<Alignment>(size) && !grow(alloc_size))
 		{
-			fmt::throw_exception("[%s] Working buffer not big enough, buffer_length=%d allocated=%d requested=%d guard=%d largest_pool=%d",
-				m_name, m_size, m_current_allocated_size, size, m_min_guard_size, m_largest_allocated_pool);
+			fmt::throw_exception("[%s] Working buffer not big enough, buffer_length=%d requested=%d guard=%d",
+				m_name, m_size, size, m_min_guard_size);
 		}

-		const usz block_length = (aligned_put_pos - m_put_pos) + alloc_size;
-		m_current_allocated_size += block_length;
-		m_largest_allocated_pool = std::max(m_largest_allocated_pool, block_length);
-
-		if (aligned_put_pos + alloc_size < m_size)
+		const usz alloc_end = aligned_put_pos + alloc_size;
+		if (alloc_end < m_size)
 		{
-			m_put_pos = aligned_put_pos + alloc_size;
+			m_put_pos = alloc_end;
 			return aligned_put_pos;
 		}
-		else
-		{
-			m_put_pos = alloc_size;
-			return 0;
-		}
+
+		m_put_pos = alloc_size;
+		return 0;
 	}

+	/*
+	* For use in cases where we take a fixed amount each time
+	*/
+	template<int Alignment, usz Size = Alignment>
+	usz static_alloc()
+	{
+		static_assert((Size & (Alignment - 1)) == 0);
+		ensure((m_put_pos & (Alignment - 1)) == 0);
+
+		if (!can_alloc<Alignment>(Size) && !grow(Size))
+		{
+			fmt::throw_exception("[%s] Working buffer not big enough, buffer_length=%d requested=%d guard=%d",
+				m_name, m_size, Size, m_min_guard_size);
+		}
+
+		const usz alloc_end = m_put_pos + Size;
+		if (alloc_end < m_size)
+		{
+			// Return the allocation start, then advance the put pointer
+			const usz alloc_start = m_put_pos;
+			m_put_pos = alloc_end;
+			return alloc_start;
+		}
+
+		m_put_pos = Size;
+		return 0;
+	}

 	/**
@@ -117,30 +137,25 @@ public:
 		return (m_put_pos > 0) ? m_put_pos - 1 : m_size - 1;
 	}

+	inline void set_get_pos(usz value)
+	{
+		m_get_pos = value;
+	}
+
 	virtual bool is_critical() const
 	{
-		const usz guard_length = std::max(m_min_guard_size, m_largest_allocated_pool);
-		return (m_current_allocated_size + guard_length) >= m_size;
+		return m_min_guard_size >= m_size;
 	}

 	void reset_allocation_stats()
 	{
-		m_current_allocated_size = 0;
-		m_largest_allocated_pool = 0;
 		m_get_pos = get_current_put_pos_minus_one();
 	}

 	// Updates the current_allocated_size metrics
-	void notify()
+	inline void notify()
 	{
-		if (m_get_pos == umax)
-			m_current_allocated_size = 0;
-		else if (m_get_pos < m_put_pos)
-			m_current_allocated_size = (m_put_pos - m_get_pos - 1);
-		else if (m_get_pos > m_put_pos)
-			m_current_allocated_size = (m_put_pos + (m_size - m_get_pos - 1));
-		else
-			fmt::throw_exception("m_put_pos == m_get_pos!");
+		// @unused
 	}

 	usz size() const
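
To make the intent of static_alloc concrete, here is a minimal toy model of the heap's fast path, assuming only what the diff shows: every caller takes a fixed, compile-time amount. The toy_heap type and its sizes are illustrative, and the can_alloc/grow error paths and guard-region logic are elided:

#include <cstdio>

using usz = unsigned long long;

// Toy model: because each allocation is a compile-time multiple of the
// alignment, the put pointer stays aligned by construction, so no per-call
// utils::align() is needed, unlike the generic alloc() path.
struct toy_heap
{
	usz m_size = 0x1000; // 4 KiB ring
	usz m_put_pos = 0;   // Start of free space

	template <int Alignment, usz Size = Alignment>
	usz static_alloc()
	{
		static_assert((Size & (Alignment - 1)) == 0, "Size must be a multiple of Alignment");

		const usz alloc_end = m_put_pos + Size;
		if (alloc_end < m_size)
		{
			const usz alloc_start = m_put_pos;
			m_put_pos = alloc_end;
			return alloc_start; // Offset of the new allocation
		}

		// Wrap around: the allocation restarts at offset 0
		m_put_pos = Size;
		return 0;
	}
};

int main()
{
	toy_heap heap;
	std::printf("%llu\n", heap.static_alloc<256>());      // 0
	std::printf("%llu\n", heap.static_alloc<256, 768>()); // 256
	std::printf("%llu\n", heap.static_alloc<256>());      // 1024
}

The generic alloc<Alignment>(size) must realign m_put_pos and recompute the rounded size on every call; hoisting the size into a template parameter turns both into compile-time checks, which is the "static heap allocations" optimization the commit title refers to.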

View file

@@ -41,7 +41,7 @@ namespace vk::data_heap_manager
 			continue;
 		}

-		heap->m_get_pos = found->second;
+		heap->set_get_pos(found->second);
 		heap->notify();
 	}
 }

View file

@@ -1162,7 +1162,6 @@ void VKGSRender::check_heap_status(u32 flags)
 	{
 		heap_critical = false;
 		u32 test = 1u << std::countr_zero(flags);
-
 		do
 		{
 			switch (flags & test)
@@ -2046,7 +2045,7 @@ void VKGSRender::load_program_env()
 		check_heap_status(VK_HEAP_CHECK_VERTEX_ENV_STORAGE);

 		// Vertex state
-		const auto mem = m_vertex_env_ring_info.alloc<256>(256);
+		const auto mem = m_vertex_env_ring_info.static_alloc<256>();
 		auto buf = static_cast<u8*>(m_vertex_env_ring_info.map(mem, 148));

 		m_draw_processor.fill_scale_offset_data(buf, false);
@@ -2134,7 +2133,7 @@ void VKGSRender::load_program_env()
 	{
 		check_heap_status(VK_HEAP_CHECK_FRAGMENT_ENV_STORAGE);

-		auto mem = m_fragment_env_ring_info.alloc<256>(256);
+		auto mem = m_fragment_env_ring_info.static_alloc<256>();
 		auto buf = m_fragment_env_ring_info.map(mem, 32);

 		m_draw_processor.fill_fragment_state_buffer(buf, current_fragment_program);
@@ -2146,7 +2145,7 @@ void VKGSRender::load_program_env()
 	{
 		check_heap_status(VK_HEAP_CHECK_TEXTURE_ENV_STORAGE);

-		auto mem = m_fragment_texture_params_ring_info.alloc<256>(768);
+		auto mem = m_fragment_texture_params_ring_info.static_alloc<256, 768>();
 		auto buf = m_fragment_texture_params_ring_info.map(mem, 768);

 		current_fragment_program.texture_params.write_to(buf, current_fp_metadata.referenced_textures_mask);
@@ -2158,7 +2157,7 @@ void VKGSRender::load_program_env()
 	{
 		check_heap_status(VK_HEAP_CHECK_FRAGMENT_ENV_STORAGE);

-		auto mem = m_raster_env_ring_info.alloc<256>(256);
+		auto mem = m_raster_env_ring_info.static_alloc<256>();
 		auto buf = m_raster_env_ring_info.map(mem, 128);

 		std::memcpy(buf, rsx::method_registers.polygon_stipple_pattern(), 128);
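
Taken together, these call sites show the migration pattern: ring buffers that always consume a fixed amount (vertex state, fragment environment, and raster environment at 256 bytes; fragment texture parameters at 768 bytes) move from the runtime-sized alloc<256>(n) to static_alloc<256>() or static_alloc<256, 768>(), so the size and alignment checks happen at compile time. Note also that map(mem, n) may map fewer bytes than were allocated (148 of 256 for vertex state, 32 for the fragment environment): the allocation size appears to be governed by the heap's alignment granularity rather than by the mapped payload.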