From f8a45c1e724f14834e6c4b7489c38355d27fcfe0 Mon Sep 17 00:00:00 2001 From: pineappleEA Date: Mon, 12 Jul 2021 20:42:10 +0200 Subject: [PATCH] early-access version 1870 --- README.md | 2 +- .../backend/glasm/emit_context.h | 2 +- .../backend/glasm/emit_glasm.cpp | 7 +- .../backend/glasm/reg_alloc.h | 3 +- .../backend/glsl/emit_glsl.cpp | 20 ++- .../backend/glsl/emit_glsl_floating_point.cpp | 2 +- .../backend/spirv/emit_spirv_image.cpp | 15 +- src/shader_recompiler/frontend/ir/opcodes.h | 3 +- .../frontend/maxwell/control_flow.cpp | 13 +- .../frontend/maxwell/control_flow.h | 1 - .../maxwell/structured_control_flow.cpp | 17 +- .../impl/atomic_operations_global_memory.cpp | 12 +- .../integer_floating_point_conversion.cpp | 4 +- .../translate/impl/load_store_attribute.cpp | 12 +- .../impl/surface_atomic_operations.cpp | 3 - .../translate/impl/surface_load_store.cpp | 8 +- .../frontend/maxwell/translate_program.cpp | 20 ++- .../ir_opt/constant_propagation_pass.cpp | 5 +- .../ir_opt/dead_code_elimination_pass.cpp | 2 - .../ir_opt/dual_vertex_pass.cpp | 6 - .../global_memory_to_storage_buffer_pass.cpp | 7 +- .../ir_opt/lower_int64_to_int32.cpp | 5 +- .../ir_opt/ssa_rewrite_pass.cpp | 10 +- src/shader_recompiler/profile.h | 2 + src/video_core/buffer_cache/buffer_cache.h | 150 ++++++++++++------ src/video_core/engines/maxwell_dma.cpp | 36 +++-- src/video_core/engines/maxwell_dma.h | 17 +- src/video_core/gpu.cpp | 1 + src/video_core/macro/macro_hle.cpp | 63 +++++++- src/video_core/rasterizer_interface.h | 5 + .../renderer_opengl/gl_rasterizer.cpp | 13 +- .../renderer_opengl/gl_rasterizer.h | 13 ++ .../renderer_opengl/gl_shader_cache.cpp | 3 +- .../renderer_vulkan/vk_graphics_pipeline.h | 4 +- .../renderer_vulkan/vk_rasterizer.cpp | 13 +- .../renderer_vulkan/vk_rasterizer.h | 13 ++ 36 files changed, 362 insertions(+), 150 deletions(-) diff --git a/README.md b/README.md index 906dbf41f..5ddb1dc26 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ yuzu emulator early access ============= -This is the source code for early-access 1869. +This is the source code for early-access 1870. ## Legal Notice diff --git a/src/shader_recompiler/backend/glasm/emit_context.h b/src/shader_recompiler/backend/glasm/emit_context.h index 1da51a996..8433e5c00 100755 --- a/src/shader_recompiler/backend/glasm/emit_context.h +++ b/src/shader_recompiler/backend/glasm/emit_context.h @@ -59,7 +59,7 @@ public: } std::string code; - RegAlloc reg_alloc{*this}; + RegAlloc reg_alloc{}; const Info& info; const Profile& profile; const RuntimeInfo& runtime_info; diff --git a/src/shader_recompiler/backend/glasm/emit_glasm.cpp b/src/shader_recompiler/backend/glasm/emit_glasm.cpp index 64787b353..a5e8c9b6e 100755 --- a/src/shader_recompiler/backend/glasm/emit_glasm.cpp +++ b/src/shader_recompiler/backend/glasm/emit_glasm.cpp @@ -2,7 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include +#include #include #include @@ -196,7 +196,10 @@ void PrecolorInst(IR::Inst& phi) { void Precolor(const IR::Program& program) { for (IR::Block* const block : program.blocks) { - for (IR::Inst& phi : block->Instructions() | std::views::take_while(IR::IsPhi)) { + for (IR::Inst& phi : block->Instructions()) { + if (!IR::IsPhi(phi)) { + break; + } PrecolorInst(phi); } } diff --git a/src/shader_recompiler/backend/glasm/reg_alloc.h b/src/shader_recompiler/backend/glasm/reg_alloc.h index 5a703daf2..82aec66c6 100755 --- a/src/shader_recompiler/backend/glasm/reg_alloc.h +++ b/src/shader_recompiler/backend/glasm/reg_alloc.h @@ -86,7 +86,7 @@ struct ScalarF64 : Value {}; class RegAlloc { public: - RegAlloc(EmitContext& ctx_) : ctx{ctx_} {} + RegAlloc() = default; Register Define(IR::Inst& inst); @@ -142,7 +142,6 @@ private: void Free(Id id); - EmitContext& ctx; size_t num_used_registers{}; size_t num_used_long_registers{}; std::bitset register_use{}; diff --git a/src/shader_recompiler/backend/glsl/emit_glsl.cpp b/src/shader_recompiler/backend/glsl/emit_glsl.cpp index ffdc6dbba..8a430d573 100755 --- a/src/shader_recompiler/backend/glsl/emit_glsl.cpp +++ b/src/shader_recompiler/backend/glsl/emit_glsl.cpp @@ -2,8 +2,10 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include +#include #include +#include +#include #include "common/div_ceil.h" #include "common/settings.h" @@ -120,7 +122,10 @@ void PrecolorInst(IR::Inst& phi) { void Precolor(const IR::Program& program) { for (IR::Block* const block : program.blocks) { - for (IR::Inst& phi : block->Instructions() | std::views::take_while(IR::IsPhi)) { + for (IR::Inst& phi : block->Instructions()) { + if (!IR::IsPhi(phi)) { + break; + } PrecolorInst(phi); } } @@ -218,8 +223,15 @@ std::string EmitGLSL(const Profile& profile, const RuntimeInfo& runtime_info, IR const std::string version{fmt::format("#version 450{}\n", GlslVersionSpecifier(ctx))}; ctx.header.insert(0, version); if (program.shared_memory_size > 0) { - ctx.header += - fmt::format("shared uint smem[{}];", Common::DivCeil(program.shared_memory_size, 4U)); + const auto requested_size{program.shared_memory_size}; + const auto max_size{profile.gl_max_compute_smem_size}; + const bool needs_clamp{requested_size > max_size}; + if (needs_clamp) { + LOG_WARNING(Shader_GLSL, "Requested shared memory size ({}) exceeds device limit ({})", + requested_size, max_size); + } + const auto smem_size{needs_clamp ? max_size : requested_size}; + ctx.header += fmt::format("shared uint smem[{}];", Common::DivCeil(smem_size, 4U)); } ctx.header += "void main(){\n"; if (program.local_memory_size > 0) { diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_floating_point.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_floating_point.cpp index b11be5bd7..2edcf592e 100755 --- a/src/shader_recompiler/backend/glsl/emit_glsl_floating_point.cpp +++ b/src/shader_recompiler/backend/glsl/emit_glsl_floating_point.cpp @@ -22,7 +22,7 @@ void Compare(EmitContext& ctx, IR::Inst& inst, std::string_view lhs, std::string } bool IsPrecise(const IR::Inst& inst) { - return {inst.Flags().no_contraction}; + return inst.Flags().no_contraction; } } // Anonymous namespace diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp index 647804814..3588f052b 100755 --- a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp @@ -109,7 +109,7 @@ private: return; } if (offset.IsImmediate()) { - Add(spv::ImageOperandsMask::ConstOffset, ctx.SConst(offset.U32())); + Add(spv::ImageOperandsMask::ConstOffset, ctx.SConst(static_cast(offset.U32()))); return; } IR::Inst* const inst{offset.InstRecursive()}; @@ -117,16 +117,21 @@ private: switch (inst->GetOpcode()) { case IR::Opcode::CompositeConstructU32x2: Add(spv::ImageOperandsMask::ConstOffset, - ctx.SConst(inst->Arg(0).U32(), inst->Arg(1).U32())); + ctx.SConst(static_cast(inst->Arg(0).U32()), + static_cast(inst->Arg(1).U32()))); return; case IR::Opcode::CompositeConstructU32x3: Add(spv::ImageOperandsMask::ConstOffset, - ctx.SConst(inst->Arg(0).U32(), inst->Arg(1).U32(), inst->Arg(2).U32())); + ctx.SConst(static_cast(inst->Arg(0).U32()), + static_cast(inst->Arg(1).U32()), + static_cast(inst->Arg(2).U32()))); return; case IR::Opcode::CompositeConstructU32x4: Add(spv::ImageOperandsMask::ConstOffset, - ctx.SConst(inst->Arg(0).U32(), inst->Arg(1).U32(), inst->Arg(2).U32(), - inst->Arg(3).U32())); + ctx.SConst(static_cast(inst->Arg(0).U32()), + static_cast(inst->Arg(1).U32()), + static_cast(inst->Arg(2).U32()), + static_cast(inst->Arg(3).U32()))); return; default: break; diff --git a/src/shader_recompiler/frontend/ir/opcodes.h b/src/shader_recompiler/frontend/ir/opcodes.h index 56b001902..9ab108292 100755 --- a/src/shader_recompiler/frontend/ir/opcodes.h +++ b/src/shader_recompiler/frontend/ir/opcodes.h @@ -67,7 +67,8 @@ constexpr OpcodeMeta META_TABLE[]{ }; constexpr size_t CalculateNumArgsOf(Opcode op) { const auto& arg_types{META_TABLE[static_cast(op)].arg_types}; - return std::distance(arg_types.begin(), std::ranges::find(arg_types, Type::Void)); + return static_cast( + std::distance(arg_types.begin(), std::ranges::find(arg_types, Type::Void))); } constexpr u8 NUM_ARGS[]{ diff --git a/src/shader_recompiler/frontend/maxwell/control_flow.cpp b/src/shader_recompiler/frontend/maxwell/control_flow.cpp index e7abea82f..1a954a509 100755 --- a/src/shader_recompiler/frontend/maxwell/control_flow.cpp +++ b/src/shader_recompiler/frontend/maxwell/control_flow.cpp @@ -5,7 +5,6 @@ #include #include #include -#include #include #include @@ -151,18 +150,18 @@ std::pair Stack::Pop(Token token) const { } std::optional Stack::Peek(Token token) const { - const auto reverse_entries{entries | std::views::reverse}; - const auto it{std::ranges::find(reverse_entries, token, &StackEntry::token)}; - if (it == reverse_entries.end()) { + const auto it{std::find_if(entries.rbegin(), entries.rend(), + [token](const auto& entry) { return entry.token == token; })}; + if (it == entries.rend()) { return std::nullopt; } return it->target; } Stack Stack::Remove(Token token) const { - const auto reverse_entries{entries | std::views::reverse}; - const auto it{std::ranges::find(reverse_entries, token, &StackEntry::token)}; - const auto pos{std::distance(reverse_entries.begin(), it)}; + const auto it{std::find_if(entries.rbegin(), entries.rend(), + [token](const auto& entry) { return entry.token == token; })}; + const auto pos{std::distance(entries.rbegin(), it)}; Stack result; result.entries.insert(result.entries.end(), entries.begin(), entries.end() - pos - 1); return result; diff --git a/src/shader_recompiler/frontend/maxwell/control_flow.h b/src/shader_recompiler/frontend/maxwell/control_flow.h index 0e515c3b6..a6bd3e196 100755 --- a/src/shader_recompiler/frontend/maxwell/control_flow.h +++ b/src/shader_recompiler/frontend/maxwell/control_flow.h @@ -161,7 +161,6 @@ private: Environment& env; ObjectPool& block_pool; boost::container::small_vector functions; - FunctionId current_function_id{0}; Location program_start; bool exits_to_dispatcher{}; Block* dispatch_block{}; diff --git a/src/shader_recompiler/frontend/maxwell/structured_control_flow.cpp b/src/shader_recompiler/frontend/maxwell/structured_control_flow.cpp index 06fde0017..8b3e0a15c 100755 --- a/src/shader_recompiler/frontend/maxwell/structured_control_flow.cpp +++ b/src/shader_recompiler/frontend/maxwell/structured_control_flow.cpp @@ -4,7 +4,6 @@ #include #include -#include #include #include #include @@ -167,7 +166,7 @@ std::string DumpExpr(const Statement* stmt) { } } -std::string DumpTree(const Tree& tree, u32 indentation = 0) { +[[maybe_unused]] std::string DumpTree(const Tree& tree, u32 indentation = 0) { std::string ret; std::string indent(indentation, ' '); for (auto stmt = tree.begin(); stmt != tree.end(); ++stmt) { @@ -313,12 +312,11 @@ bool NeedsLift(Node goto_stmt, Node label_stmt) noexcept { class GotoPass { public: - explicit GotoPass(Flow::CFG& cfg, ObjectPool& inst_pool_, - ObjectPool& block_pool_, ObjectPool& stmt_pool) - : inst_pool{inst_pool_}, block_pool{block_pool_}, pool{stmt_pool} { + explicit GotoPass(Flow::CFG& cfg, ObjectPool& stmt_pool) : pool{stmt_pool} { std::vector gotos{BuildTree(cfg)}; - for (const Node& goto_stmt : gotos | std::views::reverse) { - RemoveGoto(goto_stmt); + const auto end{gotos.rend()}; + for (auto goto_stmt = gotos.rbegin(); goto_stmt != end; ++goto_stmt) { + RemoveGoto(*goto_stmt); } } @@ -616,8 +614,6 @@ private: return parent_tree.insert(std::next(loop), *new_goto); } - ObjectPool& inst_pool; - ObjectPool& block_pool; ObjectPool& pool; Statement root_stmt{FunctionTag{}}; }; @@ -864,7 +860,6 @@ private: ObjectPool& block_pool; Environment& env; IR::AbstractSyntaxList& syntax_list; - u32 loop_id{}; // TODO: C++20 Remove this when all compilers support constexpr std::vector #if __cpp_lib_constexpr_vector >= 201907 @@ -878,7 +873,7 @@ private: IR::AbstractSyntaxList BuildASL(ObjectPool& inst_pool, ObjectPool& block_pool, Environment& env, Flow::CFG& cfg) { ObjectPool stmt_pool{64}; - GotoPass goto_pass{cfg, inst_pool, block_pool, stmt_pool}; + GotoPass goto_pass{cfg, stmt_pool}; Statement& root{goto_pass.RootStatement()}; IR::AbstractSyntaxList syntax_list; TranslatePass{inst_pool, block_pool, stmt_pool, env, root, syntax_list}; diff --git a/src/shader_recompiler/frontend/maxwell/translate/impl/atomic_operations_global_memory.cpp b/src/shader_recompiler/frontend/maxwell/translate/impl/atomic_operations_global_memory.cpp index 66f39e44e..d9f999e05 100755 --- a/src/shader_recompiler/frontend/maxwell/translate/impl/atomic_operations_global_memory.cpp +++ b/src/shader_recompiler/frontend/maxwell/translate/impl/atomic_operations_global_memory.cpp @@ -59,14 +59,14 @@ IR::U32U64 ApplyIntegerAtomOp(IR::IREmitter& ir, const IR::U32U64& offset, const IR::Value ApplyFpAtomOp(IR::IREmitter& ir, const IR::U64& offset, const IR::Value& op_b, AtomOp op, AtomSize size) { static constexpr IR::FpControl f16_control{ - .no_contraction{false}, - .rounding{IR::FpRounding::RN}, - .fmz_mode{IR::FmzMode::DontCare}, + .no_contraction = false, + .rounding = IR::FpRounding::RN, + .fmz_mode = IR::FmzMode::DontCare, }; static constexpr IR::FpControl f32_control{ - .no_contraction{false}, - .rounding{IR::FpRounding::RN}, - .fmz_mode{IR::FmzMode::FTZ}, + .no_contraction = false, + .rounding = IR::FpRounding::RN, + .fmz_mode = IR::FmzMode::FTZ, }; switch (op) { case AtomOp::ADD: diff --git a/src/shader_recompiler/frontend/maxwell/translate/impl/integer_floating_point_conversion.cpp b/src/shader_recompiler/frontend/maxwell/translate/impl/integer_floating_point_conversion.cpp index e0e157275..0b8119ddd 100755 --- a/src/shader_recompiler/frontend/maxwell/translate/impl/integer_floating_point_conversion.cpp +++ b/src/shader_recompiler/frontend/maxwell/translate/impl/integer_floating_point_conversion.cpp @@ -104,7 +104,9 @@ void I2F(TranslatorVisitor& v, u64 insn, IR::U32U64 src) { .rounding = CastFpRounding(i2f.fp_rounding), .fmz_mode = IR::FmzMode::DontCare, }; - auto value{v.ir.ConvertIToF(dst_bitsize, conversion_src_bitsize, is_signed, src, fp_control)}; + auto value{v.ir.ConvertIToF(static_cast(dst_bitsize), + static_cast(conversion_src_bitsize), is_signed, src, + fp_control)}; if (i2f.neg != 0) { if (i2f.abs != 0 || !is_signed) { // We know the value is positive diff --git a/src/shader_recompiler/frontend/maxwell/translate/impl/load_store_attribute.cpp b/src/shader_recompiler/frontend/maxwell/translate/impl/load_store_attribute.cpp index 7d7dcc3cb..924fb7a40 100755 --- a/src/shader_recompiler/frontend/maxwell/translate/impl/load_store_attribute.cpp +++ b/src/shader_recompiler/frontend/maxwell/translate/impl/load_store_attribute.cpp @@ -80,10 +80,10 @@ void TranslatorVisitor::ALD(u64 insn) { for (u32 element = 0; element < num_elements; ++element) { if (ald.patch != 0) { const IR::Patch patch{offset / 4 + element}; - F(ald.dest_reg + element, ir.GetPatch(patch)); + F(ald.dest_reg + static_cast(element), ir.GetPatch(patch)); } else { const IR::Attribute attr{offset / 4 + element}; - F(ald.dest_reg + element, ir.GetAttribute(attr, vertex)); + F(ald.dest_reg + static_cast(element), ir.GetAttribute(attr, vertex)); } } return; @@ -92,7 +92,7 @@ void TranslatorVisitor::ALD(u64 insn) { throw NotImplementedException("Indirect patch read"); } HandleIndexed(*this, ald.index_reg, num_elements, [&](u32 element, IR::U32 final_offset) { - F(ald.dest_reg + element, ir.GetAttributeIndexed(final_offset, vertex)); + F(ald.dest_reg + static_cast(element), ir.GetAttributeIndexed(final_offset, vertex)); }); } @@ -121,10 +121,10 @@ void TranslatorVisitor::AST(u64 insn) { for (u32 element = 0; element < num_elements; ++element) { if (ast.patch != 0) { const IR::Patch patch{offset / 4 + element}; - ir.SetPatch(patch, F(ast.src_reg + element)); + ir.SetPatch(patch, F(ast.src_reg + static_cast(element))); } else { const IR::Attribute attr{offset / 4 + element}; - ir.SetAttribute(attr, F(ast.src_reg + element), vertex); + ir.SetAttribute(attr, F(ast.src_reg + static_cast(element)), vertex); } } return; @@ -133,7 +133,7 @@ void TranslatorVisitor::AST(u64 insn) { throw NotImplementedException("Indexed tessellation patch store"); } HandleIndexed(*this, ast.index_reg, num_elements, [&](u32 element, IR::U32 final_offset) { - ir.SetAttributeIndexed(final_offset, F(ast.src_reg + element), vertex); + ir.SetAttributeIndexed(final_offset, F(ast.src_reg + static_cast(element)), vertex); }); } diff --git a/src/shader_recompiler/frontend/maxwell/translate/impl/surface_atomic_operations.cpp b/src/shader_recompiler/frontend/maxwell/translate/impl/surface_atomic_operations.cpp index 44144f154..63b588ad4 100755 --- a/src/shader_recompiler/frontend/maxwell/translate/impl/surface_atomic_operations.cpp +++ b/src/shader_recompiler/frontend/maxwell/translate/impl/surface_atomic_operations.cpp @@ -69,9 +69,6 @@ TextureType GetType(Type type) { } IR::Value MakeCoords(TranslatorVisitor& v, IR::Reg reg, Type type) { - const auto array{[&](int index) { - return v.ir.BitFieldExtract(v.X(reg + index), v.ir.Imm32(0), v.ir.Imm32(16)); - }}; switch (type) { case Type::_1D: case Type::BUFFER_1D: diff --git a/src/shader_recompiler/frontend/maxwell/translate/impl/surface_load_store.cpp b/src/shader_recompiler/frontend/maxwell/translate/impl/surface_load_store.cpp index 7dc793ad7..681220a8d 100755 --- a/src/shader_recompiler/frontend/maxwell/translate/impl/surface_load_store.cpp +++ b/src/shader_recompiler/frontend/maxwell/translate/impl/surface_load_store.cpp @@ -160,10 +160,10 @@ unsigned SwizzleMask(u64 swizzle) { IR::Value MakeColor(IR::IREmitter& ir, IR::Reg reg, int num_regs) { std::array colors; for (int i = 0; i < num_regs; ++i) { - colors[i] = ir.GetReg(reg + i); + colors[static_cast(i)] = ir.GetReg(reg + i); } for (int i = num_regs; i < 4; ++i) { - colors[i] = ir.Imm32(0); + colors[static_cast(i)] = ir.Imm32(0); } return ir.CompositeConstruct(colors[0], colors[1], colors[2], colors[3]); } @@ -211,12 +211,12 @@ void TranslatorVisitor::SULD(u64 insn) { if (is_typed) { const int num_regs{SizeInRegs(suld.size)}; for (int i = 0; i < num_regs; ++i) { - X(dest_reg + i, IR::U32{ir.CompositeExtract(result, i)}); + X(dest_reg + i, IR::U32{ir.CompositeExtract(result, static_cast(i))}); } } else { const unsigned mask{SwizzleMask(suld.swizzle)}; const int bits{std::popcount(mask)}; - if (!IR::IsAligned(dest_reg, bits == 3 ? 4 : bits)) { + if (!IR::IsAligned(dest_reg, bits == 3 ? 4 : static_cast(bits))) { throw NotImplementedException("Unaligned destination register"); } for (unsigned component = 0; component < 4; ++component) { diff --git a/src/shader_recompiler/frontend/maxwell/translate_program.cpp b/src/shader_recompiler/frontend/maxwell/translate_program.cpp index 83c77967d..c067d459c 100755 --- a/src/shader_recompiler/frontend/maxwell/translate_program.cpp +++ b/src/shader_recompiler/frontend/maxwell/translate_program.cpp @@ -4,7 +4,6 @@ #include #include -#include #include #include "common/settings.h" @@ -20,12 +19,19 @@ namespace Shader::Maxwell { namespace { IR::BlockList GenerateBlocks(const IR::AbstractSyntaxList& syntax_list) { - auto syntax_blocks{syntax_list | std::views::filter([](const auto& node) { - return node.type == IR::AbstractSyntaxNode::Type::Block; - })}; - IR::BlockList blocks(std::ranges::distance(syntax_blocks)); - std::ranges::transform(syntax_blocks, blocks.begin(), - [](const IR::AbstractSyntaxNode& node) { return node.data.block; }); + size_t num_syntax_blocks{}; + for (const auto& node : syntax_list) { + if (node.type == IR::AbstractSyntaxNode::Type::Block) { + ++num_syntax_blocks; + } + } + IR::BlockList blocks; + blocks.reserve(num_syntax_blocks); + for (const auto& node : syntax_list) { + if (node.type == IR::AbstractSyntaxNode::Type::Block) { + blocks.push_back(node.data.block); + } + } return blocks; } diff --git a/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp b/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp index 3c72203ad..8dd6d6c2c 100755 --- a/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp +++ b/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp @@ -3,7 +3,6 @@ // Refer to the license.txt file included. #include -#include #include #include @@ -599,7 +598,9 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) { } // Anonymous namespace void ConstantPropagationPass(IR::Program& program) { - for (IR::Block* const block : program.post_order_blocks | std::views::reverse) { + const auto end{program.post_order_blocks.rend()}; + for (auto it = program.post_order_blocks.rbegin(); it != end; ++it) { + IR::Block* const block{*it}; for (IR::Inst& inst : block->Instructions()) { ConstantPropagation(*block, inst); } diff --git a/src/shader_recompiler/ir_opt/dead_code_elimination_pass.cpp b/src/shader_recompiler/ir_opt/dead_code_elimination_pass.cpp index 1e4a3fdae..400836301 100755 --- a/src/shader_recompiler/ir_opt/dead_code_elimination_pass.cpp +++ b/src/shader_recompiler/ir_opt/dead_code_elimination_pass.cpp @@ -2,8 +2,6 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include - #include "shader_recompiler/frontend/ir/basic_block.h" #include "shader_recompiler/frontend/ir/value.h" #include "shader_recompiler/ir_opt/passes.h" diff --git a/src/shader_recompiler/ir_opt/dual_vertex_pass.cpp b/src/shader_recompiler/ir_opt/dual_vertex_pass.cpp index 3d2c205c2..055ba9c54 100755 --- a/src/shader_recompiler/ir_opt/dual_vertex_pass.cpp +++ b/src/shader_recompiler/ir_opt/dual_vertex_pass.cpp @@ -2,12 +2,6 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include -#include - -#include "common/bit_cast.h" -#include "common/bit_util.h" -#include "shader_recompiler/exception.h" #include "shader_recompiler/frontend/ir/ir_emitter.h" #include "shader_recompiler/ir_opt/passes.h" diff --git a/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp b/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp index 70449eeca..4197b0095 100755 --- a/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp +++ b/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp @@ -5,7 +5,6 @@ #include #include #include -#include #include #include @@ -314,8 +313,8 @@ std::optional Track(const IR::Value& value, const Bias* bias) return std::nullopt; } const StorageBufferAddr storage_buffer{ - .index{index.U32()}, - .offset{offset.U32()}, + .index = index.U32(), + .offset = offset.U32(), }; if (!Common::IsAligned(storage_buffer.offset, 16)) { // The SSBO pointer has to be aligned @@ -484,7 +483,7 @@ void GlobalMemoryToStorageBufferPass(IR::Program& program) { .cbuf_index = storage_buffer.index, .cbuf_offset = storage_buffer.offset, .count = 1, - .is_written{info.writes.contains(storage_buffer)}, + .is_written = info.writes.contains(storage_buffer), }); } for (const StorageInst& storage_inst : info.to_replace) { diff --git a/src/shader_recompiler/ir_opt/lower_int64_to_int32.cpp b/src/shader_recompiler/ir_opt/lower_int64_to_int32.cpp index abf7c87c7..e80d3d1d9 100755 --- a/src/shader_recompiler/ir_opt/lower_int64_to_int32.cpp +++ b/src/shader_recompiler/ir_opt/lower_int64_to_int32.cpp @@ -2,7 +2,6 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include #include #include "shader_recompiler/exception.h" @@ -207,7 +206,9 @@ void Lower(IR::Block& block, IR::Inst& inst) { } // Anonymous namespace void LowerInt64ToInt32(IR::Program& program) { - for (IR::Block* const block : program.post_order_blocks | std::views::reverse) { + const auto end{program.post_order_blocks.rend()}; + for (auto it = program.post_order_blocks.rbegin(); it != end; ++it) { + IR::Block* const block{*it}; for (IR::Inst& inst : block->Instructions()) { Lower(*block, inst); } diff --git a/src/shader_recompiler/ir_opt/ssa_rewrite_pass.cpp b/src/shader_recompiler/ir_opt/ssa_rewrite_pass.cpp index dcaced83f..53145fb5e 100755 --- a/src/shader_recompiler/ir_opt/ssa_rewrite_pass.cpp +++ b/src/shader_recompiler/ir_opt/ssa_rewrite_pass.cpp @@ -14,7 +14,6 @@ // https://link.springer.com/chapter/10.1007/978-3-642-37051-9_6 // -#include #include #include #include @@ -243,7 +242,9 @@ public: void SealBlock(IR::Block* block) { const auto it{incomplete_phis.find(block)}; if (it != incomplete_phis.end()) { - for (auto& [variant, phi] : it->second) { + for (auto& pair : it->second) { + auto& variant{pair.first}; + auto& phi{pair.second}; std::visit([&](auto& variable) { AddPhiOperands(variable, *phi, block); }, variant); } } @@ -373,8 +374,9 @@ void VisitBlock(Pass& pass, IR::Block* block) { void SsaRewritePass(IR::Program& program) { Pass pass; - for (IR::Block* const block : program.post_order_blocks | std::views::reverse) { - VisitBlock(pass, block); + const auto end{program.post_order_blocks.rend()}; + for (auto block = program.post_order_blocks.rbegin(); block != end; ++block) { + VisitBlock(pass, *block); } } diff --git a/src/shader_recompiler/profile.h b/src/shader_recompiler/profile.h index 501dcaf71..f0c3b3b17 100755 --- a/src/shader_recompiler/profile.h +++ b/src/shader_recompiler/profile.h @@ -67,6 +67,8 @@ struct Profile { bool has_gl_precise_bug{}; /// Ignores SPIR-V ordered vs unordered using GLSL semantics bool ignore_nan_fp_comparisons{}; + + u32 gl_max_compute_smem_size{}; }; } // namespace Shader diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index d5bf26894..798b327d2 100755 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -186,6 +186,8 @@ public: /// Pop asynchronous downloads void PopAsyncFlushes(); + [[nodiscard]] bool DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount); + /// Return true when a CPU region is modified from the GPU [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); @@ -223,6 +225,36 @@ private: } } + template + void ForEachWrittenRange(VAddr cpu_addr, u64 size, Func&& func) { + const VAddr start_address = cpu_addr; + const VAddr end_address = start_address + size; + const VAddr search_base = + static_cast(std::min(0LL, static_cast(start_address - size))); + const IntervalType search_interval{search_base, search_base + 1}; + auto it = common_ranges.lower_bound(search_interval); + if (it == common_ranges.end()) { + it = common_ranges.begin(); + } + for (; it != common_ranges.end(); it++) { + VAddr inter_addr_end = it->upper(); + VAddr inter_addr = it->lower(); + if (inter_addr >= end_address) { + break; + } + if (inter_addr_end <= start_address) { + continue; + } + if (inter_addr_end > end_address) { + inter_addr_end = end_address; + } + if (inter_addr < start_address) { + inter_addr = start_address; + } + func(inter_addr, inter_addr_end); + } + } + static bool IsRangeGranular(VAddr cpu_addr, size_t size) { return (cpu_addr & ~Core::Memory::PAGE_MASK) == ((cpu_addr + size) & ~Core::Memory::PAGE_MASK); @@ -477,6 +509,68 @@ void BufferCache

::DownloadMemory(VAddr cpu_addr, u64 size) { }); } +template +bool BufferCache

::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) { + const std::optional cpu_src_address = gpu_memory.GpuToCpuAddress(src_address); + const std::optional cpu_dest_address = gpu_memory.GpuToCpuAddress(dest_address); + if (!cpu_src_address || !cpu_dest_address) { + return false; + } + const bool source_dirty = IsRegionGpuModified(*cpu_src_address, amount); + const bool dest_dirty = IsRegionGpuModified(*cpu_dest_address, amount); + if (!source_dirty && !dest_dirty) { + return false; + } + + const IntervalType subtract_interval{*cpu_dest_address, *cpu_dest_address + amount}; + uncommitted_ranges.subtract(subtract_interval); + for (auto& interval_set : committed_ranges) { + interval_set.subtract(subtract_interval); + } + + BufferId buffer_a; + BufferId buffer_b; + do { + has_deleted_buffers = false; + buffer_a = FindBuffer(*cpu_src_address, static_cast(amount)); + buffer_b = FindBuffer(*cpu_dest_address, static_cast(amount)); + } while (has_deleted_buffers); + auto& src_buffer = slot_buffers[buffer_a]; + auto& dest_buffer = slot_buffers[buffer_b]; + SynchronizeBuffer(src_buffer, *cpu_src_address, static_cast(amount)); + SynchronizeBuffer(dest_buffer, *cpu_dest_address, static_cast(amount)); + std::array copies{BufferCopy{ + .src_offset = src_buffer.Offset(*cpu_src_address), + .dst_offset = dest_buffer.Offset(*cpu_dest_address), + .size = amount, + }}; + + boost::container::small_vector tmp_intervals; + auto mirror = [&](VAddr base_address, VAddr base_address_end) { + const u64 size = base_address_end - base_address; + const VAddr diff = base_address - *cpu_src_address; + const VAddr new_base_address = *cpu_dest_address + diff; + const IntervalType add_interval{new_base_address, new_base_address + size}; + uncommitted_ranges.add(add_interval); + tmp_intervals.push_back(add_interval); + }; + ForEachWrittenRange(*cpu_src_address, amount, mirror); + // This subtraction in this order is important for overlapping copies. + common_ranges.subtract(subtract_interval); + for (const IntervalType add_interval : tmp_intervals) { + common_ranges.add(add_interval); + } + + runtime.CopyBuffer(dest_buffer, src_buffer, copies); + if (source_dirty) { + dest_buffer.MarkRegionAsGpuModified(*cpu_dest_address, amount); + } + std::vector tmp_buffer(amount); + cpu_memory.ReadBlockUnsafe(*cpu_src_address, tmp_buffer.data(), amount); + cpu_memory.WriteBlockUnsafe(*cpu_dest_address, tmp_buffer.data(), amount); + return true; +} + template void BufferCache

::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) { @@ -711,30 +805,7 @@ void BufferCache

::CommitAsyncFlushesHigh() { const VAddr start_address = buffer_addr + range_offset; const VAddr end_address = start_address + range_size; - const IntervalType search_interval{cpu_addr, 1}; - auto it = common_ranges.lower_bound(search_interval); - if (it == common_ranges.end()) { - it = common_ranges.begin(); - } - while (it != common_ranges.end()) { - VAddr inter_addr_end = it->upper(); - VAddr inter_addr = it->lower(); - if (inter_addr >= end_address) { - break; - } - if (inter_addr_end <= start_address) { - it++; - continue; - } - if (inter_addr_end > end_address) { - inter_addr_end = end_address; - } - if (inter_addr < start_address) { - inter_addr = start_address; - } - add_download(inter_addr, inter_addr_end); - it++; - } + ForEachWrittenRange(start_address, range_size, add_download); const IntervalType subtract_interval{start_address, end_address}; common_ranges.subtract(subtract_interval); }); @@ -832,7 +903,9 @@ void BufferCache

::BindHostIndexBuffer() { const u32 size = index_buffer.size; SynchronizeBuffer(buffer, index_buffer.cpu_addr, size); if constexpr (HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) { - runtime.BindIndexBuffer(buffer, offset, size); + const u32 new_offset = offset + maxwell3d.regs.index_array.first * + maxwell3d.regs.index_array.FormatSizeInBytes(); + runtime.BindIndexBuffer(buffer, new_offset, size); } else { runtime.BindIndexBuffer(maxwell3d.regs.draw.topology, maxwell3d.regs.index_array.format, maxwell3d.regs.index_array.first, maxwell3d.regs.index_array.count, @@ -1113,7 +1186,7 @@ void BufferCache

::UpdateIndexBuffer() { const GPUVAddr gpu_addr_end = index_array.EndAddress(); const std::optional cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin); const u32 address_size = static_cast(gpu_addr_end - gpu_addr_begin); - const u32 draw_size = index_array.count * index_array.FormatSizeInBytes(); + const u32 draw_size = (index_array.count + index_array.first) * index_array.FormatSizeInBytes(); const u32 size = std::min(address_size, draw_size); if (size == 0 || !cpu_addr) { index_buffer = NULL_BINDING; @@ -1535,30 +1608,7 @@ void BufferCache

::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si const VAddr start_address = buffer_addr + range_offset; const VAddr end_address = start_address + range_size; - const IntervalType search_interval{start_address - range_size, 1}; - auto it = common_ranges.lower_bound(search_interval); - if (it == common_ranges.end()) { - it = common_ranges.begin(); - } - while (it != common_ranges.end()) { - VAddr inter_addr_end = it->upper(); - VAddr inter_addr = it->lower(); - if (inter_addr >= end_address) { - break; - } - if (inter_addr_end <= start_address) { - it++; - continue; - } - if (inter_addr_end > end_address) { - inter_addr_end = end_address; - } - if (inter_addr < start_address) { - inter_addr = start_address; - } - add_download(inter_addr, inter_addr_end); - it++; - } + ForEachWrittenRange(start_address, range_size, add_download); const IntervalType subtract_interval{start_address, end_address}; common_ranges.subtract(subtract_interval); }); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 7817456a6..9c657e32d 100755 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -21,6 +21,10 @@ MaxwellDMA::MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_) MaxwellDMA::~MaxwellDMA() = default; +void MaxwellDMA::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) { + rasterizer = rasterizer_; +} + void MaxwellDMA::CallMethod(u32 method, u32 method_argument, bool is_last_call) { ASSERT_MSG(method < NUM_REGS, "Invalid MaxwellDMA register"); @@ -44,7 +48,6 @@ void MaxwellDMA::Launch() { // TODO(Subv): Perform more research and implement all features of this engine. const LaunchDMA& launch = regs.launch_dma; - ASSERT(launch.remap_enable == 0); ASSERT(launch.semaphore_type == LaunchDMA::SemaphoreType::NONE); ASSERT(launch.interrupt_type == LaunchDMA::InterruptType::NONE); ASSERT(launch.data_transfer_type == LaunchDMA::DataTransferType::NON_PIPELINED); @@ -77,11 +80,29 @@ void MaxwellDMA::CopyPitchToPitch() { // When `multi_line_enable` bit is disabled the copy is performed as if we were copying a 1D // buffer of length `line_length_in`. // Otherwise we copy a 2D image of dimensions (line_length_in, line_count). + auto& accelerate = rasterizer->AccessAccelerateDMA(); if (!regs.launch_dma.multi_line_enable) { - memory_manager.CopyBlock(regs.offset_out, regs.offset_in, regs.line_length_in); + const bool is_buffer_clear = regs.launch_dma.remap_enable != 0 && + regs.remap_const.dst_x == RemapConst::Swizzle::CONST_A; + // TODO: allow multisized components. + if (is_buffer_clear) { + ASSERT(regs.remap_const.component_size_minus_one == 3); + std::vector tmp_buffer(regs.line_length_in, regs.remap_consta_value); + memory_manager.WriteBlock(regs.offset_out, reinterpret_cast(tmp_buffer.data()), + regs.line_length_in * sizeof(u32)); + return; + } + UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0); + if (!accelerate.BufferCopy(regs.offset_in, regs.offset_out, regs.line_length_in)) { + std::vector tmp_buffer(regs.line_length_in); + memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(), regs.line_length_in); + memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(), regs.line_length_in); + } return; } + UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0); + // Perform a line-by-line copy. // We're going to take a subrect of size (line_length_in, line_count) from the source rectangle. // There is no need to manually flush/invalidate the regions because CopyBlock does that for us. @@ -106,6 +127,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() { } // Deswizzle the input and copy it over. + UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0); const u32 bytes_per_pixel = regs.pitch_out / regs.line_length_in; const Parameters& src_params = regs.src_params; const u32 width = src_params.width; @@ -135,6 +157,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() { void MaxwellDMA::CopyPitchToBlockLinear() { UNIMPLEMENTED_IF_MSG(regs.dst_params.block_size.width != 0, "Block width is not one"); + UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0); const auto& dst_params = regs.dst_params; const u32 bytes_per_pixel = regs.pitch_in / regs.line_length_in; @@ -157,13 +180,8 @@ void MaxwellDMA::CopyPitchToBlockLinear() { write_buffer.resize(dst_size); } - if (Settings::IsGPULevelExtreme()) { - memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size); - memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); - } else { - memory_manager.ReadBlockUnsafe(regs.offset_in, read_buffer.data(), src_size); - memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size); - } + memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size); + memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); // If the input is linear and the output is tiled, swizzle the input and copy it over. if (regs.dst_params.block_size.depth > 0) { diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h index c77f02a22..4ed0d0996 100755 --- a/src/video_core/engines/maxwell_dma.h +++ b/src/video_core/engines/maxwell_dma.h @@ -21,8 +21,18 @@ namespace Tegra { class MemoryManager; } +namespace VideoCore { +class RasterizerInterface; +} + namespace Tegra::Engines { +class AccelerateDMAInterface { +public: + /// Write the value to the register identified by method. + virtual bool BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) = 0; +}; + /** * This engine is known as gk104_copy. Documentation can be found in: * https://github.com/NVIDIA/open-gpu-doc/blob/master/classes/dma-copy/clb0b5.h @@ -187,6 +197,8 @@ public: }; static_assert(sizeof(RemapConst) == 12); + void BindRasterizer(VideoCore::RasterizerInterface* rasterizer); + explicit MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_); ~MaxwellDMA() override; @@ -213,6 +225,7 @@ private: Core::System& system; MemoryManager& memory_manager; + VideoCore::RasterizerInterface* rasterizer; std::vector read_buffer; std::vector write_buffer; @@ -240,7 +253,9 @@ private: u32 pitch_out; u32 line_length_in; u32 line_count; - u32 reserved06[0xb8]; + u32 reserved06[0xb6]; + u32 remap_consta_value; + u32 remap_constb_value; RemapConst remap_const; Parameters dst_params; u32 reserved07[0x1]; diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index f317ddc2b..ff024f530 100755 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -50,6 +50,7 @@ void GPU::BindRenderer(std::unique_ptr renderer_) { maxwell_3d->BindRasterizer(rasterizer); fermi_2d->BindRasterizer(rasterizer); kepler_compute->BindRasterizer(rasterizer); + maxwell_dma->BindRasterizer(rasterizer); } Engines::Maxwell3D& GPU::Maxwell3D() { diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp index 70ac7c620..121f380f8 100755 --- a/src/video_core/macro/macro_hle.cpp +++ b/src/video_core/macro/macro_hle.cpp @@ -4,6 +4,8 @@ #include #include +#include "common/scope_exit.h" +#include "video_core/dirty_flags.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/macro/macro_hle.h" #include "video_core/rasterizer_interface.h" @@ -56,6 +58,7 @@ void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, const std::vector& maxwell3d.regs.index_array.first = parameters[3]; maxwell3d.regs.reg_array[0x446] = element_base; // vertex id base? maxwell3d.regs.index_array.count = parameters[1]; + maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; maxwell3d.regs.vb_element_base = element_base; maxwell3d.regs.vb_base_instance = base_instance; maxwell3d.mme_draw.instance_count = instance_count; @@ -77,12 +80,70 @@ void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, const std::vector& maxwell3d.CallMethodFromMME(0x8e5, 0x0); maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined; } + +// Multidraw Indirect +void HLE_3f5e74b9c9a50164(Engines::Maxwell3D& maxwell3d, const std::vector& parameters) { + SCOPE_EXIT({ + // Clean everything. + maxwell3d.regs.reg_array[0x446] = 0x0; // vertex id base? + maxwell3d.regs.index_array.count = 0; + maxwell3d.regs.vb_element_base = 0x0; + maxwell3d.regs.vb_base_instance = 0x0; + maxwell3d.mme_draw.instance_count = 0; + maxwell3d.CallMethodFromMME(0x8e3, 0x640); + maxwell3d.CallMethodFromMME(0x8e4, 0x0); + maxwell3d.CallMethodFromMME(0x8e5, 0x0); + maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined; + maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; + }); + const u32 start_indirect = parameters[0]; + const u32 end_indirect = parameters[1]; + if (start_indirect >= end_indirect) { + // Nothing to do. + return; + } + const auto topology = + static_cast(parameters[2]); + maxwell3d.regs.draw.topology.Assign(topology); + const u32 padding = parameters[3]; + const std::size_t max_draws = parameters[4]; + + const u32 indirect_words = 5 + padding; + const std::size_t first_draw = start_indirect; + const std::size_t effective_draws = end_indirect - start_indirect; + const std::size_t last_draw = start_indirect + std::min(effective_draws, max_draws); + + for (std::size_t index = first_draw; index < last_draw; index++) { + const std::size_t base = index * indirect_words + 5; + const u32 num_vertices = parameters[base]; + const u32 instance_count = parameters[base + 1]; + const u32 first_index = parameters[base + 2]; + const u32 base_vertex = parameters[base + 3]; + const u32 base_instance = parameters[base + 4]; + maxwell3d.regs.index_array.first = first_index; + maxwell3d.regs.reg_array[0x446] = base_vertex; + maxwell3d.regs.index_array.count = num_vertices; + maxwell3d.regs.vb_element_base = base_vertex; + maxwell3d.regs.vb_base_instance = base_instance; + maxwell3d.mme_draw.instance_count = instance_count; + maxwell3d.CallMethodFromMME(0x8e3, 0x640); + maxwell3d.CallMethodFromMME(0x8e4, base_vertex); + maxwell3d.CallMethodFromMME(0x8e5, base_instance); + maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; + if (maxwell3d.ShouldExecute()) { + maxwell3d.Rasterizer().Draw(true, true); + } + maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined; + } +} + } // Anonymous namespace -constexpr std::array, 3> hle_funcs{{ +constexpr std::array, 4> hle_funcs{{ {0x771BB18C62444DA0, &HLE_771BB18C62444DA0}, {0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD}, {0x0217920100488FF7, &HLE_0217920100488FF7}, + {0x3f5e74b9c9a50164, &HLE_3f5e74b9c9a50164}, }}; HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {} diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index fcb204768..e9e7a1064 100755 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -14,7 +14,10 @@ namespace Tegra { class MemoryManager; +namespace Engines { +class AccelerateDMAInterface; } +} // namespace Tegra namespace VideoCore { @@ -124,6 +127,8 @@ public: return false; } + [[nodiscard]] virtual Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() = 0; + /// Attempt to use a faster method to display the framebuffer to screen [[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) { diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index da492d773..af6156d90 100755 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -69,7 +69,7 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra buffer_cache(*this, maxwell3d, kepler_compute, gpu_memory, cpu_memory_, buffer_cache_runtime), shader_cache(*this, emu_window_, maxwell3d, kepler_compute, gpu_memory, device, texture_cache, buffer_cache, program_manager, state_tracker, gpu.ShaderNotify()), - query_cache(*this, maxwell3d, gpu_memory), + query_cache(*this, maxwell3d, gpu_memory), accelerate_dma(buffer_cache), fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache) {} RasterizerOpenGL::~RasterizerOpenGL() = default; @@ -502,6 +502,10 @@ bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surf return true; } +Tegra::Engines::AccelerateDMAInterface& RasterizerOpenGL::AccessAccelerateDMA() { + return accelerate_dma; +} + bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) { if (framebuffer_addr == 0) { @@ -1040,4 +1044,11 @@ void RasterizerOpenGL::EndTransformFeedback() { } } +AccelerateDMA::AccelerateDMA(BufferCache& buffer_cache_) : buffer_cache{buffer_cache_} {} + +bool AccelerateDMA::BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) { + std::scoped_lock lock{buffer_cache.mutex}; + return buffer_cache.DMACopy(src_address, dest_address, amount); +} + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index faa6cb521..b7ad2252f 100755 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -19,6 +19,7 @@ #include "common/common_types.h" #include "video_core/engines/const_buffer_info.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/engines/maxwell_dma.h" #include "video_core/rasterizer_accelerated.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" @@ -56,6 +57,16 @@ struct BindlessSSBO { }; static_assert(sizeof(BindlessSSBO) * CHAR_BIT == 128); +class AccelerateDMA : public Tegra::Engines::AccelerateDMAInterface { +public: + explicit AccelerateDMA(BufferCache& buffer_cache); + + bool BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) override; + +private: + BufferCache& buffer_cache; +}; + class RasterizerOpenGL : public VideoCore::RasterizerAccelerated { public: explicit RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, @@ -94,6 +105,7 @@ public: bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) override; + Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) override; void LoadDiskResources(u64 title_id, std::stop_token stop_loading, @@ -203,6 +215,7 @@ private: BufferCache buffer_cache; ShaderCache shader_cache; QueryCache query_cache; + AccelerateDMA accelerate_dma; FenceManagerOpenGL fence_manager; boost::container::static_vector image_view_indices; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 58a4f0fb4..7ecafc862 100755 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -211,6 +211,7 @@ ShaderCache::ShaderCache(RasterizerOpenGL& rasterizer_, Core::Frontend::EmuWindo .has_gl_component_indexing_bug = device.HasComponentIndexingBug(), .has_gl_precise_bug = device.HasPreciseBug(), .ignore_nan_fp_comparisons = true, + .gl_max_compute_smem_size = device.GetMaxComputeSharedMemorySize(), }, host_info{ .support_float16 = false, @@ -318,7 +319,7 @@ GraphicsPipeline* ShaderCache::CurrentGraphicsPipeline() { SetXfbState(graphics_key.xfb_state, regs); } if (current_pipeline && graphics_key == current_pipeline->Key()) { - return current_pipeline->IsBuilt() ? current_pipeline : nullptr; + return BuiltPipeline(current_pipeline); } return CurrentGraphicsPipelineSlowPath(); } diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h index 622267147..2bd48d697 100755 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h @@ -104,9 +104,7 @@ public: template static auto MakeConfigureSpecFunc() { - return [](GraphicsPipeline* pipeline, bool is_indexed) { - pipeline->ConfigureImpl(is_indexed); - }; + return [](GraphicsPipeline* pl, bool is_indexed) { pl->ConfigureImpl(is_indexed); }; } private: diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 0cb733df6..51b8336ec 100755 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -141,7 +141,7 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra pipeline_cache(*this, maxwell3d, kepler_compute, gpu_memory, device, scheduler, descriptor_pool, update_descriptor_queue, render_pass_cache, buffer_cache, texture_cache, gpu.ShaderNotify()), - query_cache{*this, maxwell3d, gpu_memory, device, scheduler}, + query_cache{*this, maxwell3d, gpu_memory, device, scheduler}, accelerate_dma{ buffer_cache }, fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler), wfi_event(device.GetLogical().CreateEvent()) { scheduler.SetQueryCache(query_cache); @@ -494,6 +494,10 @@ bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surf return true; } +Tegra::Engines::AccelerateDMAInterface& RasterizerVulkan::AccessAccelerateDMA() { + return accelerate_dma; +} + bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) { if (!framebuffer_addr) { @@ -535,6 +539,13 @@ void RasterizerVulkan::FlushWork() { draw_counter = 0; } +AccelerateDMA::AccelerateDMA(BufferCache& buffer_cache_) : buffer_cache{buffer_cache_} {} + +bool AccelerateDMA::BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) { + std::scoped_lock lock{buffer_cache.mutex}; + return buffer_cache.DMACopy(src_address, dest_address, amount); +} + void RasterizerVulkan::UpdateDynamicStates() { auto& regs = maxwell3d.regs; UpdateViewportsState(regs); diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 47408dbab..0b6a0da94 100755 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -13,6 +13,7 @@ #include #include "common/common_types.h" +#include "video_core/engines/maxwell_dma.h" #include "video_core/rasterizer_accelerated.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_vulkan/blit_image.h" @@ -48,6 +49,16 @@ struct VKScreenInfo; class StateTracker; +class AccelerateDMA : public Tegra::Engines::AccelerateDMAInterface { +public: + explicit AccelerateDMA(BufferCache& buffer_cache); + + bool BufferCopy(GPUVAddr start_address, GPUVAddr end_address, u64 amount) override; + +private: + BufferCache& buffer_cache; +}; + class RasterizerVulkan final : public VideoCore::RasterizerAccelerated { public: explicit RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, @@ -87,6 +98,7 @@ public: bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) override; + Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) override; void LoadDiskResources(u64 title_id, std::stop_token stop_loading, @@ -150,6 +162,7 @@ private: BufferCache buffer_cache; PipelineCache pipeline_cache; VKQueryCache query_cache; + AccelerateDMA accelerate_dma; VKFenceManager fence_manager; vk::Event wfi_event;