diff --git a/CMakeLists.txt b/CMakeLists.txt index f05587d38..185205221 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -771,6 +771,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h src/shader_recompiler/ir/passes/identity_removal_pass.cpp src/shader_recompiler/ir/passes/ir_passes.h src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp + src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp src/shader_recompiler/ir/passes/resource_tracking_pass.cpp src/shader_recompiler/ir/passes/ring_access_elimination.cpp src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -1121,7 +1122,6 @@ cmrc_add_resource_library(embedded-resources src/images/gold.png src/images/platinum.png src/images/silver.png) - target_link_libraries(shadps4 PRIVATE res::embedded) # ImGui resources diff --git a/src/shader_recompiler/frontend/control_flow_graph.cpp b/src/shader_recompiler/frontend/control_flow_graph.cpp index 126cb4eb6..cf1882b8c 100644 --- a/src/shader_recompiler/frontend/control_flow_graph.cpp +++ b/src/shader_recompiler/frontend/control_flow_graph.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include +#include #include "common/assert.h" #include "shader_recompiler/frontend/control_flow_graph.h" @@ -39,9 +40,6 @@ static IR::Condition MakeCondition(const GcnInst& inst) { return IR::Condition::Execz; case Opcode::S_CBRANCH_EXECNZ: return IR::Condition::Execnz; - case Opcode::S_AND_SAVEEXEC_B64: - case Opcode::S_ANDN2_B64: - return IR::Condition::Execnz; default: return IR::Condition::True; } @@ -76,9 +74,9 @@ CFG::CFG(Common::ObjectPool& block_pool_, std::span inst_l index_to_pc.resize(inst_list.size() + 1); labels.reserve(LabelReserveSize); EmitLabels(); - EmitDivergenceLabels(); EmitBlocks(); LinkBlocks(); + SplitDivergenceScopes(); } void CFG::EmitLabels() { @@ -112,7 +110,7 @@ void CFG::EmitLabels() { std::ranges::sort(labels); } -void CFG::EmitDivergenceLabels() { +void CFG::SplitDivergenceScopes() { const auto is_open_scope = [](const GcnInst& inst) { // An open scope instruction is an instruction that modifies EXEC // but also saves the previous value to restore later. This indicates @@ -136,64 +134,97 @@ void CFG::EmitDivergenceLabels() { (inst.opcode == Opcode::S_ANDN2_B64 && inst.dst[0].field == OperandField::ExecLo); }; - // Since we will be adding new labels, avoid iterating those as well. - const size_t end_size = labels.size(); - for (u32 l = 0; l < end_size; l++) { - const Label start = labels[l]; - // Stop if we reached end of existing labels. - if (l == end_size - 1) { - break; - } - const Label end = labels[l + 1]; - const size_t end_index = GetIndex(end); - + for (auto blk = blocks.begin(); blk != blocks.end(); blk++) { + auto next_blk = std::next(blk); s32 curr_begin = -1; - s32 last_exec_idx = -1; - for (size_t index = GetIndex(start); index < end_index; index++) { + for (size_t index = blk->begin_index; index <= blk->end_index; index++) { const auto& inst = inst_list[index]; - if (curr_begin != -1) { - // Keep note of the last instruction that does not ignore exec, so we know where - // to end the divergence block without impacting trailing instructions that do. - if (!IgnoresExecMask(inst)) { - last_exec_idx = index; - } - // Consider a close scope on certain instruction types or at the last instruction - // before the next label. - if (is_close_scope(inst) || index == end_index - 1) { - // Only insert a scope if, since the open-scope instruction, there is at least - // one instruction that does not ignore exec. - if (index - curr_begin > 1 && last_exec_idx != -1) { - // Add a label to the instruction right after the open scope call. - // It is the start of a new basic block. - const auto& save_inst = inst_list[curr_begin]; - AddLabel(index_to_pc[curr_begin] + save_inst.length); - // Add a label to the close scope instruction. - // There are 3 cases where we need to close a scope. - // * Close scope instruction inside the block - // * Close scope instruction at the end of the block (cbranch or endpgm) - // * Normal instruction at the end of the block - // If the instruction we want to close the scope at is at the end of the - // block, we do not need to insert a new label. - if (last_exec_idx != end_index - 1) { - // Add the label after the last instruction affected by exec. - const auto& last_exec_inst = inst_list[last_exec_idx]; - AddLabel(index_to_pc[last_exec_idx] + last_exec_inst.length); - } - } - // Reset scope begin. + const bool is_close = is_close_scope(inst); + if ((is_close || index == blk->end_index) && curr_begin != -1) { + // If there are no instructions inside scope don't do anything. + if (index - curr_begin == 1) { curr_begin = -1; + continue; } + // If all instructions in the scope ignore exec masking, we shouldn't insert a + // scope. + const auto start = inst_list.begin() + curr_begin + 1; + if (!std::ranges::all_of(start, inst_list.begin() + index, IgnoresExecMask)) { + // Determine the first instruction affected by the exec mask. + do { + ++curr_begin; + } while (IgnoresExecMask(inst_list[curr_begin])); + + // Determine the last instruction affected by the exec mask. + s32 curr_end = index; + while (IgnoresExecMask(inst_list[curr_end])) { + --curr_end; + } + + // Create a new block for the divergence scope. + Block* block = block_pool.Create(); + block->begin = index_to_pc[curr_begin]; + block->end = index_to_pc[curr_end]; + block->begin_index = curr_begin; + block->end_index = curr_end; + block->end_inst = inst_list[curr_end]; + blocks.insert_before(next_blk, *block); + + // If we are inside the parent block, make an epilogue block and jump to it. + if (curr_end != blk->end_index) { + Block* epi_block = block_pool.Create(); + epi_block->begin = index_to_pc[curr_end + 1]; + epi_block->end = blk->end; + epi_block->begin_index = curr_end + 1; + epi_block->end_index = blk->end_index; + epi_block->end_inst = blk->end_inst; + epi_block->cond = blk->cond; + epi_block->end_class = blk->end_class; + epi_block->branch_true = blk->branch_true; + epi_block->branch_false = blk->branch_false; + blocks.insert_before(next_blk, *epi_block); + + // Have divergence block always jump to epilogue block. + block->cond = IR::Condition::True; + block->branch_true = epi_block; + block->branch_false = nullptr; + + // If the parent block fails to enter divergence block make it jump to + // epilogue too + blk->branch_false = epi_block; + } else { + // No epilogue block is needed since the divergence block + // also ends the parent block. Inherit the end condition. + auto& parent_blk = *blk; + ASSERT(blk->cond == IR::Condition::True && blk->branch_true); + block->cond = IR::Condition::True; + block->branch_true = blk->branch_true; + block->branch_false = nullptr; + + // If the parent block didn't enter the divergence scope + // have it jump directly to the next one + blk->branch_false = blk->branch_true; + } + + // Shrink parent block to end right before curr_begin + // and make it jump to divergence block + --curr_begin; + blk->end = index_to_pc[curr_begin]; + blk->end_index = curr_begin; + blk->end_inst = inst_list[curr_begin]; + blk->cond = IR::Condition::Execnz; + blk->end_class = EndClass::Branch; + blk->branch_true = block; + } + // Reset scope begin. + curr_begin = -1; } // Mark a potential start of an exec scope. if (is_open_scope(inst)) { curr_begin = index; - last_exec_idx = -1; } } } - - // Sort labels to make sure block insertion is correct. - std::ranges::sort(labels); } void CFG::EmitBlocks() { @@ -234,22 +265,6 @@ void CFG::LinkBlocks() { for (auto it = blocks.begin(); it != blocks.end(); it++) { auto& block = *it; const auto end_inst{block.end_inst}; - // Handle divergence block inserted here. - if (end_inst.opcode == Opcode::S_AND_SAVEEXEC_B64 || - end_inst.opcode == Opcode::S_ANDN2_B64 || end_inst.IsCmpx()) { - // Blocks are stored ordered by address in the set - auto next_it = std::next(it); - auto* target_block = &(*next_it); - ++target_block->num_predecessors; - block.branch_true = target_block; - - auto merge_it = std::next(next_it); - auto* merge_block = &(*merge_it); - ++merge_block->num_predecessors; - block.branch_false = merge_block; - block.end_class = EndClass::Branch; - continue; - } // If the block doesn't end with a branch we simply // need to link with the next block. diff --git a/src/shader_recompiler/frontend/control_flow_graph.h b/src/shader_recompiler/frontend/control_flow_graph.h index d98d4b05d..0acce3306 100644 --- a/src/shader_recompiler/frontend/control_flow_graph.h +++ b/src/shader_recompiler/frontend/control_flow_graph.h @@ -57,9 +57,9 @@ public: private: void EmitLabels(); - void EmitDivergenceLabels(); void EmitBlocks(); void LinkBlocks(); + void SplitDivergenceScopes(); void AddLabel(Label address) { const auto it = std::ranges::find(labels, address); diff --git a/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp b/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp index c8a4b13cb..5c66b1115 100644 --- a/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp +++ b/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp @@ -251,54 +251,6 @@ void FoldCmpClass(IR::Block& block, IR::Inst& inst) { } } -void FoldReadLane(IR::Block& block, IR::Inst& inst) { - const u32 lane = inst.Arg(1).U32(); - IR::Inst* prod = inst.Arg(0).InstRecursive(); - - const auto search_chain = [lane](const IR::Inst* prod) -> IR::Value { - while (prod->GetOpcode() == IR::Opcode::WriteLane) { - if (prod->Arg(2).U32() == lane) { - return prod->Arg(1); - } - prod = prod->Arg(0).InstRecursive(); - } - return {}; - }; - - if (prod->GetOpcode() == IR::Opcode::WriteLane) { - if (const IR::Value value = search_chain(prod); !value.IsEmpty()) { - inst.ReplaceUsesWith(value); - } - return; - } - - if (prod->GetOpcode() == IR::Opcode::Phi) { - boost::container::small_vector phi_args; - for (size_t arg_index = 0; arg_index < prod->NumArgs(); ++arg_index) { - const IR::Inst* arg{prod->Arg(arg_index).InstRecursive()}; - if (arg->GetOpcode() != IR::Opcode::WriteLane) { - return; - } - const IR::Value value = search_chain(arg); - if (value.IsEmpty()) { - continue; - } - phi_args.emplace_back(value); - } - if (std::ranges::all_of(phi_args, [&](IR::Value value) { return value == phi_args[0]; })) { - inst.ReplaceUsesWith(phi_args[0]); - return; - } - const auto insert_point = IR::Block::InstructionList::s_iterator_to(*prod); - IR::Inst* const new_phi{&*block.PrependNewInst(insert_point, IR::Opcode::Phi)}; - new_phi->SetFlags(IR::Type::U32); - for (size_t arg_index = 0; arg_index < phi_args.size(); arg_index++) { - new_phi->AddPhiOperand(prod->PhiBlock(arg_index), phi_args[arg_index]); - } - inst.ReplaceUsesWith(IR::Value{new_phi}); - } -} - void ConstantPropagation(IR::Block& block, IR::Inst& inst) { switch (inst.GetOpcode()) { case IR::Opcode::IAdd32: @@ -408,8 +360,6 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) { case IR::Opcode::SelectF32: case IR::Opcode::SelectF64: return FoldSelect(inst); - case IR::Opcode::ReadLane: - return FoldReadLane(block, inst); case IR::Opcode::FPNeg32: FoldWhenAllImmediates(inst, [](f32 a) { return -a; }); return; diff --git a/src/shader_recompiler/ir/passes/hull_shader_transform.cpp b/src/shader_recompiler/ir/passes/hull_shader_transform.cpp index 48727e32a..5cf8a1525 100644 --- a/src/shader_recompiler/ir/passes/hull_shader_transform.cpp +++ b/src/shader_recompiler/ir/passes/hull_shader_transform.cpp @@ -1,11 +1,13 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later + #include "common/assert.h" #include "shader_recompiler/info.h" #include "shader_recompiler/ir/attribute.h" #include "shader_recompiler/ir/breadth_first_search.h" #include "shader_recompiler/ir/ir_emitter.h" #include "shader_recompiler/ir/opcodes.h" +#include "shader_recompiler/ir/passes/ir_passes.h" #include "shader_recompiler/ir/pattern_matching.h" #include "shader_recompiler/ir/program.h" #include "shader_recompiler/runtime_info.h" @@ -734,6 +736,8 @@ void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info) { } } } + + ConstantPropagationPass(program.post_order_blocks); } } // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir/passes/ir_passes.h b/src/shader_recompiler/ir/passes/ir_passes.h index 69628dbfd..760dbb112 100644 --- a/src/shader_recompiler/ir/passes/ir_passes.h +++ b/src/shader_recompiler/ir/passes/ir_passes.h @@ -17,11 +17,11 @@ void IdentityRemovalPass(IR::BlockList& program); void DeadCodeEliminationPass(IR::Program& program); void ConstantPropagationPass(IR::BlockList& program); void FlattenExtendedUserdataPass(IR::Program& program); +void ReadLaneEliminationPass(IR::Program& program); void ResourceTrackingPass(IR::Program& program); void CollectShaderInfoPass(IR::Program& program); void LowerBufferFormatToRaw(IR::Program& program); -void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info, - Stage stage); +void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info); void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info); void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info); void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info); diff --git a/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp b/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp new file mode 100644 index 000000000..fbe382d41 --- /dev/null +++ b/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp @@ -0,0 +1,115 @@ +// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "shader_recompiler/ir/program.h" + +namespace Shader::Optimization { + +static IR::Inst* SearchChain(IR::Inst* inst, u32 lane) { + while (inst->GetOpcode() == IR::Opcode::WriteLane) { + if (inst->Arg(2).U32() == lane) { + // We found a possible write lane source, return it. + return inst; + } + inst = inst->Arg(0).InstRecursive(); + } + return inst; +} + +static bool IsPossibleToEliminate(IR::Inst* inst, u32 lane) { + // Breadth-first search visiting the right most arguments first + boost::container::small_vector visited; + std::queue queue; + queue.push(inst); + + while (!queue.empty()) { + // Pop one instruction from the queue + IR::Inst* inst{queue.front()}; + queue.pop(); + + // If it's a WriteLane search for possible candidates + if (inst = SearchChain(inst, lane); inst->GetOpcode() == IR::Opcode::WriteLane) { + // We found a possible write lane source, stop looking here. + continue; + } + // If there are other instructions in-between that use the value we can't eliminate. + if (inst->GetOpcode() != IR::Opcode::ReadLane && inst->GetOpcode() != IR::Opcode::Phi) { + return false; + } + // Visit the right most arguments first + for (size_t arg = inst->NumArgs(); arg--;) { + auto arg_value{inst->Arg(arg)}; + if (arg_value.IsImmediate()) { + continue; + } + // Queue instruction if it hasn't been visited + IR::Inst* arg_inst{arg_value.InstRecursive()}; + if (std::ranges::find(visited, arg_inst) == visited.end()) { + visited.push_back(arg_inst); + queue.push(arg_inst); + } + } + } + return true; +} + +using PhiMap = std::unordered_map; + +static IR::Value GetRealValue(PhiMap& phi_map, IR::Inst* inst, u32 lane) { + // If this is a WriteLane op search the chain for a possible candidate. + if (inst = SearchChain(inst, lane); inst->GetOpcode() == IR::Opcode::WriteLane) { + return inst->Arg(1); + } + + // If this is a phi, duplicate it and populate its arguments with real values. + if (inst->GetOpcode() == IR::Opcode::Phi) { + // We are in a phi cycle, use the already duplicated phi. + const auto [it, is_new_phi] = phi_map.try_emplace(inst); + if (!is_new_phi) { + return IR::Value{it->second}; + } + + // Create new phi and insert it right before the old one. + const auto insert_point = IR::Block::InstructionList::s_iterator_to(*inst); + IR::Block* block = inst->GetParent(); + IR::Inst* new_phi{&*block->PrependNewInst(insert_point, IR::Opcode::Phi)}; + new_phi->SetFlags(IR::Type::U32); + it->second = new_phi; + + // Gather all arguments. + for (size_t arg_index = 0; arg_index < inst->NumArgs(); arg_index++) { + IR::Inst* arg_prod = inst->Arg(arg_index).InstRecursive(); + const IR::Value arg = GetRealValue(phi_map, arg_prod, lane); + new_phi->AddPhiOperand(inst->PhiBlock(arg_index), arg); + } + return IR::Value{new_phi}; + } + UNREACHABLE(); +} + +void ReadLaneEliminationPass(IR::Program& program) { + PhiMap phi_map; + for (IR::Block* const block : program.blocks) { + for (IR::Inst& inst : block->Instructions()) { + if (inst.GetOpcode() != IR::Opcode::ReadLane) { + continue; + } + const u32 lane = inst.Arg(1).U32(); + IR::Inst* prod = inst.Arg(0).InstRecursive(); + + // Check simple case of no control flow and phis + if (prod = SearchChain(prod, lane); prod->GetOpcode() == IR::Opcode::WriteLane) { + inst.ReplaceUsesWith(prod->Arg(1)); + continue; + } + + // Traverse the phi tree to see if it's possible to eliminate + if (prod->GetOpcode() == IR::Opcode::Phi && IsPossibleToEliminate(prod, lane)) { + inst.ReplaceUsesWith(GetRealValue(phi_map, prod, lane)); + phi_map.clear(); + } + } + } +} + +} // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp index d6f1efb12..071b94ac0 100644 --- a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp +++ b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp @@ -11,8 +11,7 @@ namespace Shader::Optimization { -void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info, - Stage stage) { +void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info) { auto& info = program.info; const auto& ForEachInstruction = [&](auto func) { @@ -24,7 +23,7 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim } }; - switch (stage) { + switch (program.info.stage) { case Stage::Local: { ForEachInstruction([=](IR::IREmitter& ir, IR::Inst& inst) { const auto opcode = inst.GetOpcode(); diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp index 1c132ebbb..5004e0beb 100644 --- a/src/shader_recompiler/recompiler.cpp +++ b/src/shader_recompiler/recompiler.cpp @@ -1,9 +1,6 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include "common/config.h" -#include "common/io_file.h" -#include "common/path_util.h" #include "shader_recompiler/frontend/control_flow_graph.h" #include "shader_recompiler/frontend/decode.h" #include "shader_recompiler/frontend/structured_control_flow.h" @@ -63,26 +60,18 @@ IR::Program TranslateProgram(std::span code, Pools& pools, Info& info program.post_order_blocks = Shader::IR::PostOrder(program.syntax_list.front()); // Run optimization passes - const auto stage = program.info.stage; - Shader::Optimization::SsaRewritePass(program.post_order_blocks); + Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); Shader::Optimization::IdentityRemovalPass(program.blocks); if (info.l_stage == LogicalStage::TessellationControl) { - // Tess passes require previous const prop passes for now (for simplicity). TODO allow - // fine grained folding or opportunistic folding we set an operand to an immediate - Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); Shader::Optimization::TessellationPreprocess(program, runtime_info); - Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); Shader::Optimization::HullShaderTransform(program, runtime_info); } else if (info.l_stage == LogicalStage::TessellationEval) { - Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); Shader::Optimization::TessellationPreprocess(program, runtime_info); - Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); Shader::Optimization::DomainShaderTransform(program, runtime_info); } - Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); - Shader::Optimization::RingAccessElimination(program, runtime_info, stage); - Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); + Shader::Optimization::RingAccessElimination(program, runtime_info); + Shader::Optimization::ReadLaneEliminationPass(program); Shader::Optimization::FlattenExtendedUserdataPass(program); Shader::Optimization::ResourceTrackingPass(program); Shader::Optimization::LowerBufferFormatToRaw(program);