From 2391029e4794861f9c6b1a166c8bc859a28cd34c Mon Sep 17 00:00:00 2001
From: pineappleEA
Date: Fri, 2 Feb 2024 17:58:50 +0100
Subject: [PATCH] early-access version 4109

---
 CMakeLists.txt | 2 +-
 README.md | 2 +-
 src/common/settings.cpp | 3 +
 src/common/settings.h | 32 +
 src/common/settings_common.h | 1 +
 src/common/settings_enums.h | 2 +
 src/core/device_memory_manager.h | 2 +
 src/core/guest_memory.h | 62 +-
 src/core/hle/kernel/k_process.cpp | 7 +-
 src/core/hle/service/am/am_types.h | 6 +-
 .../am/frontend/applet_software_keyboard.cpp | 12 +-
 .../am/frontend/applet_software_keyboard.h | 2 +-
 .../hle/service/am/library_applet_creator.cpp | 57 +-
 src/core/hle/service/am/self_controller.cpp | 3 +-
 .../hle/service/am/system_buffer_manager.cpp | 18 +-
 .../hle/service/am/system_buffer_manager.h | 3 +-
 src/core/hle/service/nvdrv/core/container.cpp | 7 +-
 src/core/hle/service/nvdrv/core/container.h | 4 +-
 src/core/hle/service/nvdrv/core/nvmap.cpp | 8 +-
 .../service/nvdrv/devices/nvdisp_disp0.cpp | 17 +
 .../service/nvdrv/devices/nvhost_nvdec.cpp | 15 +-
 .../nvdrv/devices/nvhost_nvdec_common.cpp | 27 +-
 .../nvdrv/devices/nvhost_nvdec_common.h | 1 +
 .../hle/service/nvdrv/devices/nvhost_vic.cpp | 15 +-
 .../nvnflinger/fb_share_buffer_manager.cpp | 176 ++-
 .../nvnflinger/fb_share_buffer_manager.h | 24 +-
 .../service/nvnflinger/hardware_composer.cpp | 1 +
 src/core/hle/service/nvnflinger/hwc_layer.h | 13 +
 .../hle/service/nvnflinger/nvnflinger.cpp | 7 +-
 src/core/hle/service/nvnflinger/nvnflinger.h | 6 +-
 src/core/hle/service/vi/layer/vi_layer.cpp | 6 +-
 src/core/hle/service/vi/layer/vi_layer.h | 13 +
 src/core/memory.h | 2 +
 src/frontend_common/config.cpp | 18 +
 src/frontend_common/config.h | 2 +
 src/video_core/CMakeLists.txt | 8 +-
 src/video_core/capture.h | 36 +
 src/video_core/cdma_pusher.cpp | 206 ++-
 src/video_core/cdma_pusher.h | 87 +-
 src/video_core/framebuffer_config.h | 7 +
 src/video_core/gpu.cpp | 48 +-
 src/video_core/gpu.h | 11 +-
 src/video_core/gpu_thread.cpp | 1 +
 src/video_core/host1x/codecs/decoder.cpp | 18 +-
 src/video_core/host1x/codecs/decoder.h | 26 +-
 src/video_core/host1x/codecs/h264.cpp | 143 +-
 src/video_core/host1x/codecs/h264.h | 293 ++--
 src/video_core/host1x/codecs/vp8.cpp | 73 +-
 src/video_core/host1x/codecs/vp8.h | 44 +-
 src/video_core/host1x/codecs/vp9.cpp | 152 +-
 src/video_core/host1x/codecs/vp9.h | 46 +-
 src/video_core/host1x/codecs/vp9_types.h | 27 +-
 src/video_core/host1x/control.cpp | 1 +
 src/video_core/host1x/control.h | 10 +-
 src/video_core/host1x/ffmpeg/ffmpeg.cpp | 246 ++-
 src/video_core/host1x/ffmpeg/ffmpeg.h | 61 +-
 src/video_core/host1x/host1x.cpp | 26 +-
 src/video_core/host1x/host1x.h | 139 +-
 src/video_core/host1x/nvdec.cpp | 63 +-
 src/video_core/host1x/nvdec.h | 34 +-
 src/video_core/host1x/nvdec_common.h | 84 +-
 src/video_core/host1x/syncpoint_manager.cpp | 6 +-
 src/video_core/host1x/vic.cpp | 1331 ++++++++++++++---
 src/video_core/host1x/vic.h | 653 +++++++-
 .../host_shaders/fidelityfx_fsr.frag | 21 +-
 src/video_core/host_shaders/fxaa.frag | 2 +-
 .../host_shaders/opengl_fidelityfx_fsr.frag | 19 +-
 .../host_shaders/opengl_present.frag | 2 +-
 .../host_shaders/present_bicubic.frag | 2 +-
 .../host_shaders/present_gaussian.frag | 14 +-
 .../vulkan_fidelityfx_fsr_easu_fp16.frag | 1 +
 .../vulkan_fidelityfx_fsr_easu_fp32.frag | 1 +
 .../vulkan_fidelityfx_fsr_rcas_fp16.frag | 1 +
 .../vulkan_fidelityfx_fsr_rcas_fp32.frag | 1 +
 .../vulkan_present_scaleforce_fp16.frag | 2 +-
 .../vulkan_present_scaleforce_fp32.frag | 2 +-
 src/video_core/memory_manager.h | 2 +
 src/video_core/present.h | 37 +
 src/video_core/renderer_base.h | 3 +
 .../renderer_null/renderer_null.cpp | 5 +
 src/video_core/renderer_null/renderer_null.h | 2 +
 .../renderer_opengl/gl_blit_screen.cpp | 15 +-
 .../renderer_opengl/gl_blit_screen.h | 7 +-
 .../renderer_opengl/present/layer.cpp | 35 +-
 .../renderer_opengl/present/layer.h | 8 +-
 .../present/window_adapt_pass.cpp | 19 +-
 .../present/window_adapt_pass.h | 2 +-
 .../renderer_opengl/renderer_opengl.cpp | 89 +-
 .../renderer_opengl/renderer_opengl.h | 9 +
 .../renderer_vulkan/present/layer.cpp | 19 +-
 .../renderer_vulkan/present/layer.h | 6 +-
 .../renderer_vulkan/present/util.cpp | 92 +-
 src/video_core/renderer_vulkan/present/util.h | 9 +-
 .../present/window_adapt_pass.cpp | 29 +-
 .../present/window_adapt_pass.h | 6 +-
 .../renderer_vulkan/renderer_vulkan.cpp | 114 +-
 .../renderer_vulkan/renderer_vulkan.h | 11 +-
 .../renderer_vulkan/vk_blit_screen.cpp | 14 +-
 .../renderer_vulkan/vk_blit_screen.h | 5 +-
 .../renderer_vulkan/vk_pipeline_cache.cpp | 5 +-
 src/video_core/texture_cache/texture_cache.h | 8 +-
 .../vulkan_common/vulkan_device.cpp | 2 +
 src/yuzu/CMakeLists.txt | 3 +
 src/yuzu/configuration/configure_applets.cpp | 86 ++
 src/yuzu/configuration/configure_applets.h | 48 +
 src/yuzu/configuration/configure_applets.ui | 65 +
 src/yuzu/configuration/configure_dialog.cpp | 7 +-
 src/yuzu/configuration/configure_dialog.h | 2 +
 src/yuzu/configuration/shared_translation.cpp | 22 +
 109 files changed, 4082 insertions(+), 1228 deletions(-)
 create mode 100755 src/video_core/capture.h
 create mode 100755 src/video_core/present.h
 create mode 100755 src/yuzu/configuration/configure_applets.cpp
 create mode 100755 src/yuzu/configuration/configure_applets.h
 create mode 100755 src/yuzu/configuration/configure_applets.ui

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8d851522f..b2e23c379 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -307,7 +307,7 @@ find_package(ZLIB 1.2 REQUIRED)
 find_package(zstd 1.5 REQUIRED)

 if (NOT YUZU_USE_EXTERNAL_VULKAN_HEADERS)
-    find_package(Vulkan 1.3.274 REQUIRED)
+    find_package(VulkanHeaders 1.3.274 REQUIRED)
 endif()

 if (NOT YUZU_USE_EXTERNAL_VULKAN_UTILITY_LIBRARIES)
diff --git a/README.md b/README.md
index a6e4270e1..5a9980269 100755
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 yuzu emulator early access
 =============

-This is the source code for early-access 4107.
+This is the source code for early-access 4109.
## Legal Notice diff --git a/src/common/settings.cpp b/src/common/settings.cpp index 685ee1097..52a20ea61 100755 --- a/src/common/settings.cpp +++ b/src/common/settings.cpp @@ -30,6 +30,7 @@ namespace Settings { #define SETTING(TYPE, RANGED) template class Setting #define SWITCHABLE(TYPE, RANGED) template class SwitchableSetting +SETTING(AppletMode, false); SETTING(AudioEngine, false); SETTING(bool, false); SETTING(int, false); @@ -215,6 +216,8 @@ const char* TranslateCategory(Category category) { return "Debugging"; case Category::GpuDriver: return "GpuDriver"; + case Category::LibraryApplet: + return "LibraryApplet"; case Category::Miscellaneous: return "Miscellaneous"; case Category::Network: diff --git a/src/common/settings.h b/src/common/settings.h index ee5d42644..9ff40d118 100755 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -133,6 +133,38 @@ struct TouchFromButtonMap { struct Values { Linkage linkage{}; + // Applet + Setting cabinet_applet_mode{linkage, AppletMode::LLE, "cabinet_applet_mode", + Category::LibraryApplet}; + Setting controller_applet_mode{linkage, AppletMode::HLE, "controller_applet_mode", + Category::LibraryApplet}; + Setting data_erase_applet_mode{linkage, AppletMode::HLE, "data_erase_applet_mode", + Category::LibraryApplet}; + Setting error_applet_mode{linkage, AppletMode::HLE, "error_applet_mode", + Category::LibraryApplet}; + Setting net_connect_applet_mode{linkage, AppletMode::HLE, "net_connect_applet_mode", + Category::LibraryApplet}; + Setting player_select_applet_mode{ + linkage, AppletMode::HLE, "player_select_applet_mode", Category::LibraryApplet}; + Setting swkbd_applet_mode{linkage, AppletMode::LLE, "swkbd_applet_mode", + Category::LibraryApplet}; + Setting mii_edit_applet_mode{linkage, AppletMode::LLE, "mii_edit_applet_mode", + Category::LibraryApplet}; + Setting web_applet_mode{linkage, AppletMode::HLE, "web_applet_mode", + Category::LibraryApplet}; + Setting shop_applet_mode{linkage, AppletMode::HLE, "shop_applet_mode", + Category::LibraryApplet}; + Setting photo_viewer_applet_mode{ + linkage, AppletMode::LLE, "photo_viewer_applet_mode", Category::LibraryApplet}; + Setting offline_web_applet_mode{linkage, AppletMode::LLE, "offline_web_applet_mode", + Category::LibraryApplet}; + Setting login_share_applet_mode{linkage, AppletMode::HLE, "login_share_applet_mode", + Category::LibraryApplet}; + Setting wifi_web_auth_applet_mode{ + linkage, AppletMode::HLE, "wifi_web_auth_applet_mode", Category::LibraryApplet}; + Setting my_page_applet_mode{linkage, AppletMode::LLE, "my_page_applet_mode", + Category::LibraryApplet}; + // Audio SwitchableSetting sink_id{linkage, AudioEngine::Auto, "output_engine", Category::Audio, Specialization::RuntimeList}; diff --git a/src/common/settings_common.h b/src/common/settings_common.h index 987489e8a..2df3f0809 100755 --- a/src/common/settings_common.h +++ b/src/common/settings_common.h @@ -44,6 +44,7 @@ enum class Category : u32 { Services, Paths, Linux, + LibraryApplet, MaxEnum, }; diff --git a/src/common/settings_enums.h b/src/common/settings_enums.h index 617036588..f42367e67 100755 --- a/src/common/settings_enums.h +++ b/src/common/settings_enums.h @@ -151,6 +151,8 @@ ENUM(AspectRatio, R16_9, R4_3, R21_9, R16_10, Stretch); ENUM(ConsoleMode, Handheld, Docked); +ENUM(AppletMode, HLE, LLE); + template inline std::string CanonicalizeEnum(Type id) { const auto group = EnumMetadata::Canonicalizations(); diff --git a/src/core/device_memory_manager.h b/src/core/device_memory_manager.h index 0568a821b..6dcf7bb22 100755 
--- a/src/core/device_memory_manager.h +++ b/src/core/device_memory_manager.h @@ -43,6 +43,8 @@ public: DeviceMemoryManager(const DeviceMemory& device_memory); ~DeviceMemoryManager(); + static constexpr bool HAS_FLUSH_INVALIDATION = true; + void BindInterface(DeviceInterface* device_inter); DAddr Allocate(size_t size); diff --git a/src/core/guest_memory.h b/src/core/guest_memory.h index 7ee18c126..83292f702 100755 --- a/src/core/guest_memory.h +++ b/src/core/guest_memory.h @@ -44,15 +44,32 @@ public: GuestMemory() = delete; explicit GuestMemory(M& memory, u64 addr, std::size_t size, Common::ScratchBuffer* backup = nullptr) - : m_memory{memory}, m_addr{addr}, m_size{size} { + : m_memory{&memory}, m_addr{addr}, m_size{size} { static_assert(FLAGS & GuestMemoryFlags::Read || FLAGS & GuestMemoryFlags::Write); - if constexpr (FLAGS & GuestMemoryFlags::Read) { + if constexpr (!(FLAGS & GuestMemoryFlags::Read)) { + if (!this->TrySetSpan()) { + if (backup) { + backup->resize_destructive(this->size()); + m_data_span = *backup; + m_span_valid = true; + m_is_data_copy = true; + } else { + m_data_copy.resize(this->size()); + m_data_span = std::span(m_data_copy); + m_span_valid = true; + m_is_data_copy = true; + } + } + } else if constexpr (FLAGS & GuestMemoryFlags::Read) { Read(addr, size, backup); } } ~GuestMemory() = default; + GuestMemory(GuestMemory&& rhs) = default; + GuestMemory& operator=(GuestMemory&& rhs) = default; + T* data() noexcept { return m_data_span.data(); } @@ -109,8 +126,8 @@ public: } if (this->TrySetSpan()) { - if constexpr (FLAGS & GuestMemoryFlags::Safe) { - m_memory.FlushRegion(m_addr, this->size_bytes()); + if constexpr (FLAGS & GuestMemoryFlags::Safe && M::HAS_FLUSH_INVALIDATION) { + m_memory->FlushRegion(m_addr, this->size_bytes()); } } else { if (backup) { @@ -123,9 +140,9 @@ public: m_is_data_copy = true; m_span_valid = true; if constexpr (FLAGS & GuestMemoryFlags::Safe) { - m_memory.ReadBlock(m_addr, this->data(), this->size_bytes()); + m_memory->ReadBlock(m_addr, this->data(), this->size_bytes()); } else { - m_memory.ReadBlockUnsafe(m_addr, this->data(), this->size_bytes()); + m_memory->ReadBlockUnsafe(m_addr, this->data(), this->size_bytes()); } } return m_data_span; @@ -133,18 +150,19 @@ public: void Write(std::span write_data) noexcept { if constexpr (FLAGS & GuestMemoryFlags::Cached) { - m_memory.WriteBlockCached(m_addr, write_data.data(), this->size_bytes()); + m_memory->WriteBlockCached(m_addr, write_data.data(), this->size_bytes()); } else if constexpr (FLAGS & GuestMemoryFlags::Safe) { - m_memory.WriteBlock(m_addr, write_data.data(), this->size_bytes()); + m_memory->WriteBlock(m_addr, write_data.data(), this->size_bytes()); } else { - m_memory.WriteBlockUnsafe(m_addr, write_data.data(), this->size_bytes()); + m_memory->WriteBlockUnsafe(m_addr, write_data.data(), this->size_bytes()); } } bool TrySetSpan() noexcept { - if (u8* ptr = m_memory.GetSpan(m_addr, this->size_bytes()); ptr) { + if (u8* ptr = m_memory->GetSpan(m_addr, this->size_bytes()); ptr) { m_data_span = {reinterpret_cast(ptr), this->size()}; m_span_valid = true; + m_is_data_copy = false; return true; } return false; @@ -159,7 +177,7 @@ protected: return m_addr_changed; } - M& m_memory; + M* m_memory; u64 m_addr{}; size_t m_size{}; std::span m_data_span{}; @@ -175,17 +193,7 @@ public: GuestMemoryScoped() = delete; explicit GuestMemoryScoped(M& memory, u64 addr, std::size_t size, Common::ScratchBuffer* backup = nullptr) - : GuestMemory(memory, addr, size, backup) { - if constexpr (!(FLAGS & 
GuestMemoryFlags::Read)) { - if (!this->TrySetSpan()) { - if (backup) { - this->m_data_span = *backup; - this->m_span_valid = true; - this->m_is_data_copy = true; - } - } - } - } + : GuestMemory(memory, addr, size, backup) {} ~GuestMemoryScoped() { if constexpr (FLAGS & GuestMemoryFlags::Write) { @@ -196,15 +204,17 @@ public: if (this->AddressChanged() || this->IsDataCopy()) { ASSERT(this->m_span_valid); if constexpr (FLAGS & GuestMemoryFlags::Cached) { - this->m_memory.WriteBlockCached(this->m_addr, this->data(), this->size_bytes()); + this->m_memory->WriteBlockCached(this->m_addr, this->data(), + this->size_bytes()); } else if constexpr (FLAGS & GuestMemoryFlags::Safe) { - this->m_memory.WriteBlock(this->m_addr, this->data(), this->size_bytes()); + this->m_memory->WriteBlock(this->m_addr, this->data(), this->size_bytes()); } else { - this->m_memory.WriteBlockUnsafe(this->m_addr, this->data(), this->size_bytes()); + this->m_memory->WriteBlockUnsafe(this->m_addr, this->data(), + this->size_bytes()); } } else if constexpr ((FLAGS & GuestMemoryFlags::Safe) || (FLAGS & GuestMemoryFlags::Cached)) { - this->m_memory.InvalidateRegion(this->m_addr, this->size_bytes()); + this->m_memory->InvalidateRegion(this->m_addr, this->size_bytes()); } } } diff --git a/src/core/hle/kernel/k_process.cpp b/src/core/hle/kernel/k_process.cpp index 6cad999aa..4ad83f8a9 100755 --- a/src/core/hle/kernel/k_process.cpp +++ b/src/core/hle/kernel/k_process.cpp @@ -4,8 +4,9 @@ #include #include "common/scope_exit.h" #include "common/settings.h" +#include "core/arm/dynarmic/arm_dynarmic.h" +#include "core/arm/dynarmic/dynarmic_exclusive_monitor.h" #include "core/core.h" -#include "core/gpu_dirty_memory_manager.h" #include "core/hle/kernel/k_process.h" #include "core/hle/kernel/k_scoped_resource_reservation.h" #include "core/hle/kernel/k_shared_memory.h" @@ -1258,6 +1259,10 @@ void KProcess::InitializeInterfaces() { #ifdef HAS_NCE if (this->IsApplication() && Settings::IsNceEnabled()) { + // Register the scoped JIT handler before creating any NCE instances + // so that its signal handler will appear first in the signal chain. 
+ Core::ScopedJitExecution::RegisterHandler(); + for (size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) { m_arm_interfaces[i] = std::make_unique(m_kernel.System(), true, i); } diff --git a/src/core/hle/service/am/am_types.h b/src/core/hle/service/am/am_types.h index a2b852b12..8c33feb15 100755 --- a/src/core/hle/service/am/am_types.h +++ b/src/core/hle/service/am/am_types.h @@ -130,9 +130,9 @@ enum class AppletProgramId : u64 { enum class LibraryAppletMode : u32 { AllForeground = 0, - Background = 1, - NoUI = 2, - BackgroundIndirectDisplay = 3, + PartialForeground = 1, + NoUi = 2, + PartialForegroundIndirectDisplay = 3, AllForegroundInitiallyHidden = 4, }; diff --git a/src/core/hle/service/am/frontend/applet_software_keyboard.cpp b/src/core/hle/service/am/frontend/applet_software_keyboard.cpp index fbf75d379..034c62f32 100755 --- a/src/core/hle/service/am/frontend/applet_software_keyboard.cpp +++ b/src/core/hle/service/am/frontend/applet_software_keyboard.cpp @@ -68,9 +68,9 @@ void SoftwareKeyboard::Initialize() { case LibraryAppletMode::AllForeground: InitializeForeground(); break; - case LibraryAppletMode::Background: - case LibraryAppletMode::BackgroundIndirectDisplay: - InitializeBackground(applet_mode); + case LibraryAppletMode::PartialForeground: + case LibraryAppletMode::PartialForegroundIndirectDisplay: + InitializePartialForeground(applet_mode); break; default: ASSERT_MSG(false, "Invalid LibraryAppletMode={}", applet_mode); @@ -243,7 +243,7 @@ void SoftwareKeyboard::InitializeForeground() { InitializeFrontendNormalKeyboard(); } -void SoftwareKeyboard::InitializeBackground(LibraryAppletMode library_applet_mode) { +void SoftwareKeyboard::InitializePartialForeground(LibraryAppletMode library_applet_mode) { LOG_INFO(Service_AM, "Initializing Inline Software Keyboard Applet."); is_background = true; @@ -258,9 +258,9 @@ void SoftwareKeyboard::InitializeBackground(LibraryAppletMode library_applet_mod swkbd_inline_initialize_arg.size()); if (swkbd_initialize_arg.library_applet_mode_flag) { - ASSERT(library_applet_mode == LibraryAppletMode::Background); + ASSERT(library_applet_mode == LibraryAppletMode::PartialForeground); } else { - ASSERT(library_applet_mode == LibraryAppletMode::BackgroundIndirectDisplay); + ASSERT(library_applet_mode == LibraryAppletMode::PartialForegroundIndirectDisplay); } } diff --git a/src/core/hle/service/am/frontend/applet_software_keyboard.h b/src/core/hle/service/am/frontend/applet_software_keyboard.h index f464b7e15..2a7d01b96 100755 --- a/src/core/hle/service/am/frontend/applet_software_keyboard.h +++ b/src/core/hle/service/am/frontend/applet_software_keyboard.h @@ -62,7 +62,7 @@ private: void InitializeForeground(); /// Initializes the inline software keyboard. - void InitializeBackground(LibraryAppletMode library_applet_mode); + void InitializePartialForeground(LibraryAppletMode library_applet_mode); /// Processes the text check sent by the application. 
void ProcessTextCheck(); diff --git a/src/core/hle/service/am/library_applet_creator.cpp b/src/core/hle/service/am/library_applet_creator.cpp index 47bab7528..a883a021e 100755 --- a/src/core/hle/service/am/library_applet_creator.cpp +++ b/src/core/hle/service/am/library_applet_creator.cpp @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "common/settings.h" #include "core/hle/kernel/k_transfer_memory.h" #include "core/hle/service/am/applet_data_broker.h" #include "core/hle/service/am/applet_manager.h" @@ -16,6 +17,34 @@ namespace Service::AM { namespace { +bool ShouldCreateGuestApplet(AppletId applet_id) { +#define X(Name, name) \ + if (applet_id == AppletId::Name && \ + Settings::values.name##_applet_mode.GetValue() != Settings::AppletMode::LLE) { \ + return false; \ + } + + X(Cabinet, cabinet) + X(Controller, controller) + X(DataErase, data_erase) + X(Error, error) + X(NetConnect, net_connect) + X(ProfileSelect, player_select) + X(SoftwareKeyboard, swkbd) + X(MiiEdit, mii_edit) + X(Web, web) + X(Shop, shop) + X(PhotoViewer, photo_viewer) + X(OfflineWeb, offline_web) + X(LoginShare, login_share) + X(WebAuth, wifi_web_auth) + X(MyPage, my_page) + +#undef X + + return true; +} + AppletProgramId AppletIdToProgramId(AppletId applet_id) { switch (applet_id) { case AppletId::OverlayDisplay: @@ -63,9 +92,10 @@ AppletProgramId AppletIdToProgramId(AppletId applet_id) { } } -[[maybe_unused]] std::shared_ptr CreateGuestApplet( - Core::System& system, std::shared_ptr caller_applet, AppletId applet_id, - LibraryAppletMode mode) { +std::shared_ptr CreateGuestApplet(Core::System& system, + std::shared_ptr caller_applet, + AppletId applet_id, + LibraryAppletMode mode) { const auto program_id = static_cast(AppletIdToProgramId(applet_id)); if (program_id == 0) { // Unknown applet @@ -87,7 +117,7 @@ AppletProgramId AppletIdToProgramId(AppletId applet_id) { // Set focus state switch (mode) { case LibraryAppletMode::AllForeground: - case LibraryAppletMode::NoUI: + case LibraryAppletMode::NoUi: applet->focus_state = FocusState::InFocus; applet->hid_registration.EnableAppletToGetInput(true); applet->message_queue.PushMessage(AppletMessageQueue::AppletMessage::ChangeIntoForeground); @@ -99,8 +129,8 @@ AppletProgramId AppletIdToProgramId(AppletId applet_id) { applet->hid_registration.EnableAppletToGetInput(false); applet->message_queue.PushMessage(AppletMessageQueue::AppletMessage::FocusStateChanged); break; - case LibraryAppletMode::Background: - case LibraryAppletMode::BackgroundIndirectDisplay: + case LibraryAppletMode::PartialForeground: + case LibraryAppletMode::PartialForegroundIndirectDisplay: default: applet->focus_state = FocusState::Background; applet->hid_registration.EnableAppletToGetInput(true); @@ -117,9 +147,10 @@ AppletProgramId AppletIdToProgramId(AppletId applet_id) { return std::make_shared(system, broker, applet); } -[[maybe_unused]] std::shared_ptr CreateFrontendApplet( - Core::System& system, std::shared_ptr caller_applet, AppletId applet_id, - LibraryAppletMode mode) { +std::shared_ptr CreateFrontendApplet(Core::System& system, + std::shared_ptr caller_applet, + AppletId applet_id, + LibraryAppletMode mode) { const auto program_id = static_cast(AppletIdToProgramId(applet_id)); auto process = std::make_unique(system); @@ -163,7 +194,13 @@ void ILibraryAppletCreator::CreateLibraryApplet(HLERequestContext& ctx) { LOG_DEBUG(Service_AM, "called with applet_id={:08X}, applet_mode={:08X}", applet_id, applet_mode); 
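The ShouldCreateGuestApplet() helper added above is an X-macro over the new per-applet AppletMode settings; expanding X(SoftwareKeyboard, swkbd) by hand gives roughly the following (illustrative expansion, not part of the patch):

    if (applet_id == AppletId::SoftwareKeyboard &&
        Settings::values.swkbd_applet_mode.GetValue() != Settings::AppletMode::LLE) {
        return false;
    }

So the guest (LLE) applet is only attempted when the corresponding setting is explicitly set to LLE; otherwise CreateLibraryApplet falls back to the HLE frontend applet, as the hunk below shows.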
- auto library_applet = CreateFrontendApplet(system, applet, applet_id, applet_mode); + std::shared_ptr library_applet; + if (ShouldCreateGuestApplet(applet_id)) { + library_applet = CreateGuestApplet(system, applet, applet_id, applet_mode); + } + if (!library_applet) { + library_applet = CreateFrontendApplet(system, applet, applet_id, applet_mode); + } if (!library_applet) { LOG_ERROR(Service_AM, "Applet doesn't exist! applet_id={}", applet_id); diff --git a/src/core/hle/service/am/self_controller.cpp b/src/core/hle/service/am/self_controller.cpp index 0289f5cf1..b92663b2b 100755 --- a/src/core/hle/service/am/self_controller.cpp +++ b/src/core/hle/service/am/self_controller.cpp @@ -288,7 +288,8 @@ void ISelfController::GetSystemSharedBufferHandle(HLERequestContext& ctx) { } Result ISelfController::EnsureBufferSharingEnabled(Kernel::KProcess* process) { - if (applet->system_buffer_manager.Initialize(&nvnflinger, process, applet->applet_id)) { + if (applet->system_buffer_manager.Initialize(&nvnflinger, process, applet->applet_id, + applet->library_applet_mode)) { return ResultSuccess; } diff --git a/src/core/hle/service/am/system_buffer_manager.cpp b/src/core/hle/service/am/system_buffer_manager.cpp index 60a9afc9d..7fb9e3a75 100755 --- a/src/core/hle/service/am/system_buffer_manager.cpp +++ b/src/core/hle/service/am/system_buffer_manager.cpp @@ -17,11 +17,12 @@ SystemBufferManager::~SystemBufferManager() { // Clean up shared layers. if (m_buffer_sharing_enabled) { + m_nvnflinger->GetSystemBufferManager().Finalize(m_process); } } bool SystemBufferManager::Initialize(Nvnflinger::Nvnflinger* nvnflinger, Kernel::KProcess* process, - AppletId applet_id) { + AppletId applet_id, LibraryAppletMode mode) { if (m_nvnflinger) { return m_buffer_sharing_enabled; } @@ -36,9 +37,14 @@ bool SystemBufferManager::Initialize(Nvnflinger::Nvnflinger* nvnflinger, Kernel: return false; } + Nvnflinger::LayerBlending blending = Nvnflinger::LayerBlending::None; + if (mode == LibraryAppletMode::PartialForeground) { + blending = Nvnflinger::LayerBlending::Coverage; + } + const auto display_id = m_nvnflinger->OpenDisplay("Default").value(); const auto res = m_nvnflinger->GetSystemBufferManager().Initialize( - &m_system_shared_buffer_id, &m_system_shared_layer_id, display_id); + m_process, &m_system_shared_buffer_id, &m_system_shared_layer_id, display_id, blending); if (res.IsSuccess()) { m_buffer_sharing_enabled = true; @@ -62,8 +68,12 @@ void SystemBufferManager::SetWindowVisibility(bool visible) { Result SystemBufferManager::WriteAppletCaptureBuffer(bool* out_was_written, s32* out_fbshare_layer_index) { - // TODO - R_SUCCEED(); + if (!m_buffer_sharing_enabled) { + return VI::ResultPermissionDenied; + } + + return m_nvnflinger->GetSystemBufferManager().WriteAppletCaptureBuffer(out_was_written, + out_fbshare_layer_index); } } // namespace Service::AM diff --git a/src/core/hle/service/am/system_buffer_manager.h b/src/core/hle/service/am/system_buffer_manager.h index 98c3cf055..0690f68b6 100755 --- a/src/core/hle/service/am/system_buffer_manager.h +++ b/src/core/hle/service/am/system_buffer_manager.h @@ -27,7 +27,8 @@ public: SystemBufferManager(); ~SystemBufferManager(); - bool Initialize(Nvnflinger::Nvnflinger* flinger, Kernel::KProcess* process, AppletId applet_id); + bool Initialize(Nvnflinger::Nvnflinger* flinger, Kernel::KProcess* process, AppletId applet_id, + LibraryAppletMode mode); void GetSystemSharedLayerHandle(u64* out_system_shared_buffer_id, u64* out_system_shared_layer_id) { diff --git 
a/src/core/hle/service/nvdrv/core/container.cpp b/src/core/hle/service/nvdrv/core/container.cpp index 5a64576a4..18103504c 100755 --- a/src/core/hle/service/nvdrv/core/container.cpp +++ b/src/core/hle/service/nvdrv/core/container.cpp @@ -49,6 +49,7 @@ SessionId Container::OpenSession(Kernel::KProcess* process) { continue; } if (session.process == process) { + session.ref_count++; return session.id; } } @@ -66,6 +67,7 @@ SessionId Container::OpenSession(Kernel::KProcess* process) { } auto& session = impl->sessions[new_id]; session.is_active = true; + session.ref_count = 1; // Optimization if (process->IsApplication()) { auto& page_table = process->GetPageTable().GetBasePageTable(); @@ -114,8 +116,11 @@ SessionId Container::OpenSession(Kernel::KProcess* process) { void Container::CloseSession(SessionId session_id) { std::scoped_lock lk(impl->session_guard); - impl->file.UnmapAllHandles(session_id); auto& session = impl->sessions[session_id.id]; + if (--session.ref_count > 0) { + return; + } + impl->file.UnmapAllHandles(session_id); auto& smmu = impl->host1x.MemoryManager(); if (session.has_preallocated_area) { const DAddr region_start = session.mapper->GetRegionStart(); diff --git a/src/core/hle/service/nvdrv/core/container.h b/src/core/hle/service/nvdrv/core/container.h index c4aa9018f..e352d7e70 100755 --- a/src/core/hle/service/nvdrv/core/container.h +++ b/src/core/hle/service/nvdrv/core/container.h @@ -46,6 +46,7 @@ struct Session { bool has_preallocated_area{}; std::unique_ptr mapper{}; bool is_active{}; + s32 ref_count{}; }; class Container { @@ -67,10 +68,7 @@ public: const SyncpointManager& GetSyncpointManager() const; struct Host1xDeviceFileData { - std::unordered_map fd_to_id{}; std::deque syncpts_accumulated{}; - u32 nvdec_next_id{}; - u32 vic_next_id{}; }; Host1xDeviceFileData& Host1xDeviceFile(); diff --git a/src/core/hle/service/nvdrv/core/nvmap.cpp b/src/core/hle/service/nvdrv/core/nvmap.cpp index dc5455231..0e21bf6de 100755 --- a/src/core/hle/service/nvdrv/core/nvmap.cpp +++ b/src/core/hle/service/nvdrv/core/nvmap.cpp @@ -333,9 +333,13 @@ void NvMap::UnmapAllHandles(NvCore::SessionId session_id) { }(); for (auto& [id, handle] : handles_copy) { - if (handle->session_id.id == session_id.id) { - FreeHandle(id, false); + { + std::scoped_lock lk{handle->mutex}; + if (handle->session_id.id != session_id.id || handle->dupes <= 0) { + continue; + } } + FreeHandle(id, false); } } diff --git a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp index 62fefe4b9..e7285a8d8 100755 --- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp +++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp @@ -15,6 +15,22 @@ namespace Service::Nvidia::Devices { +namespace { + +Tegra::BlendMode ConvertBlending(Service::Nvnflinger::LayerBlending blending) { + switch (blending) { + case Service::Nvnflinger::LayerBlending::None: + default: + return Tegra::BlendMode::Opaque; + case Service::Nvnflinger::LayerBlending::Premultiplied: + return Tegra::BlendMode::Premultiplied; + case Service::Nvnflinger::LayerBlending::Coverage: + return Tegra::BlendMode::Coverage; + } +} + +} // namespace + nvdisp_disp0::nvdisp_disp0(Core::System& system_, NvCore::Container& core) : nvdevice{system_}, container{core}, nvmap{core.GetNvMapFile()} {} nvdisp_disp0::~nvdisp_disp0() = default; @@ -56,6 +72,7 @@ void nvdisp_disp0::Composite(std::span sorted_layers .pixel_format = layer.format, .transform_flags = layer.transform, .crop_rect = layer.crop_rect, + .blending = 
ConvertBlending(layer.blending), }); for (size_t i = 0; i < layer.acquire_fence.num_fences; i++) { diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp index 762abc560..0daa97c7f 100755 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp @@ -8,6 +8,7 @@ #include "core/hle/service/nvdrv/core/container.h" #include "core/hle/service/nvdrv/devices/ioctl_serialization.h" #include "core/hle/service/nvdrv/devices/nvhost_nvdec.h" +#include "video_core/host1x/host1x.h" #include "video_core/renderer_base.h" namespace Service::Nvidia::Devices { @@ -21,13 +22,8 @@ NvResult nvhost_nvdec::Ioctl1(DeviceFD fd, Ioctl command, std::span in switch (command.group) { case 0x0: switch (command.cmd) { - case 0x1: { - auto& host1x_file = core.Host1xDeviceFile(); - if (!host1x_file.fd_to_id.contains(fd)) { - host1x_file.fd_to_id[fd] = host1x_file.nvdec_next_id++; - } + case 0x1: return WrapFixedVariable(this, &nvhost_nvdec::Submit, input, output, fd); - } case 0x2: return WrapFixed(this, &nvhost_nvdec::GetSyncpoint, input, output); case 0x3: @@ -72,15 +68,12 @@ void nvhost_nvdec::OnOpen(NvCore::SessionId session_id, DeviceFD fd) { LOG_INFO(Service_NVDRV, "NVDEC video stream started"); system.SetNVDECActive(true); sessions[fd] = session_id; + host1x.StartDevice(fd, Tegra::Host1x::ChannelType::NvDec, channel_syncpoint); } void nvhost_nvdec::OnClose(DeviceFD fd) { LOG_INFO(Service_NVDRV, "NVDEC video stream ended"); - auto& host1x_file = core.Host1xDeviceFile(); - const auto iter = host1x_file.fd_to_id.find(fd); - if (iter != host1x_file.fd_to_id.end()) { - system.GPU().ClearCdmaInstance(iter->second); - } + host1x.StopDevice(fd, Tegra::Host1x::ChannelType::NvDec); system.SetNVDECActive(false); auto it = sessions.find(fd); if (it != sessions.end()) { diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp index 38778c714..da29f332c 100755 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp @@ -55,8 +55,9 @@ std::size_t WriteVectors(std::span dst, const std::vector& src, std::size nvhost_nvdec_common::nvhost_nvdec_common(Core::System& system_, NvCore::Container& core_, NvCore::ChannelType channel_type_) - : nvdevice{system_}, core{core_}, syncpoint_manager{core.GetSyncpointManager()}, - nvmap{core.GetNvMapFile()}, channel_type{channel_type_} { + : nvdevice{system_}, host1x{system_.Host1x()}, core{core_}, + syncpoint_manager{core.GetSyncpointManager()}, nvmap{core.GetNvMapFile()}, + channel_type{channel_type_} { auto& syncpts_accumulated = core.Host1xDeviceFile().syncpts_accumulated; if (syncpts_accumulated.empty()) { channel_syncpoint = syncpoint_manager.AllocateSyncpoint(false); @@ -95,24 +96,24 @@ NvResult nvhost_nvdec_common::Submit(IoctlSubmit& params, std::span data, De offset += SliceVectors(data, syncpt_increments, params.syncpoint_count, offset); offset += SliceVectors(data, fence_thresholds, params.fence_count, offset); - auto& gpu = system.GPU(); auto* session = core.GetSession(sessions[fd]); - if (gpu.UseNvdec()) { - for (std::size_t i = 0; i < syncpt_increments.size(); i++) { - const SyncptIncr& syncpt_incr = syncpt_increments[i]; - fence_thresholds[i] = - syncpoint_manager.IncrementSyncpointMaxExt(syncpt_incr.id, syncpt_incr.increments); - } + for (std::size_t i = 0; i < syncpt_increments.size(); i++) { + const SyncptIncr& 
syncpt_incr = syncpt_increments[i]; + fence_thresholds[i] = + syncpoint_manager.IncrementSyncpointMaxExt(syncpt_incr.id, syncpt_incr.increments); } + for (const auto& cmd_buffer : command_buffers) { const auto object = nvmap.GetHandle(cmd_buffer.memory_id); ASSERT_OR_EXECUTE(object, return NvResult::InvalidState;); - Tegra::ChCommandHeaderList cmdlist(cmd_buffer.word_count); - session->process->GetMemory().ReadBlock(object->address + cmd_buffer.offset, cmdlist.data(), - cmdlist.size() * sizeof(u32)); - gpu.PushCommandBuffer(core.Host1xDeviceFile().fd_to_id[fd], cmdlist); + Core::Memory::CpuGuestMemory + cmdlist(session->process->GetMemory(), object->address + cmd_buffer.offset, + cmd_buffer.word_count); + host1x.PushEntries(fd, std::move(cmdlist)); } + // Some games expect command_buffers to be written back offset = 0; offset += WriteVectors(data, command_buffers, offset); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h index 59ed38fa9..9e6eefb19 100755 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h @@ -119,6 +119,7 @@ protected: Kernel::KEvent* QueryEvent(u32 event_id) override; + Tegra::Host1x::Host1x& host1x; u32 channel_syncpoint; s32_le nvmap_fd{}; u32_le submit_timeout{}; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp index 12ffddeb8..9d5bb298e 100755 --- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp @@ -7,6 +7,7 @@ #include "core/hle/service/nvdrv/core/container.h" #include "core/hle/service/nvdrv/devices/ioctl_serialization.h" #include "core/hle/service/nvdrv/devices/nvhost_vic.h" +#include "video_core/host1x/host1x.h" #include "video_core/renderer_base.h" namespace Service::Nvidia::Devices { @@ -21,13 +22,8 @@ NvResult nvhost_vic::Ioctl1(DeviceFD fd, Ioctl command, std::span inpu switch (command.group) { case 0x0: switch (command.cmd) { - case 0x1: { - auto& host1x_file = core.Host1xDeviceFile(); - if (!host1x_file.fd_to_id.contains(fd)) { - host1x_file.fd_to_id[fd] = host1x_file.vic_next_id++; - } + case 0x1: return WrapFixedVariable(this, &nvhost_vic::Submit, input, output, fd); - } case 0x2: return WrapFixed(this, &nvhost_vic::GetSyncpoint, input, output); case 0x3: @@ -70,14 +66,11 @@ NvResult nvhost_vic::Ioctl3(DeviceFD fd, Ioctl command, std::span inpu void nvhost_vic::OnOpen(NvCore::SessionId session_id, DeviceFD fd) { sessions[fd] = session_id; + host1x.StartDevice(fd, Tegra::Host1x::ChannelType::VIC, channel_syncpoint); } void nvhost_vic::OnClose(DeviceFD fd) { - auto& host1x_file = core.Host1xDeviceFile(); - const auto iter = host1x_file.fd_to_id.find(fd); - if (iter != host1x_file.fd_to_id.end()) { - system.GPU().ClearCdmaInstance(iter->second); - } + host1x.StopDevice(fd, Tegra::Host1x::ChannelType::VIC); sessions.erase(fd); } diff --git a/src/core/hle/service/nvnflinger/fb_share_buffer_manager.cpp b/src/core/hle/service/nvnflinger/fb_share_buffer_manager.cpp index e71652cdf..90f7248a0 100755 --- a/src/core/hle/service/nvnflinger/fb_share_buffer_manager.cpp +++ b/src/core/hle/service/nvnflinger/fb_share_buffer_manager.cpp @@ -14,24 +14,20 @@ #include "core/hle/service/nvnflinger/ui/graphic_buffer.h" #include "core/hle/service/vi/layer/vi_layer.h" #include "core/hle/service/vi/vi_results.h" +#include "video_core/gpu.h" +#include "video_core/host1x/host1x.h" namespace 
Service::Nvnflinger { namespace { -Result AllocateIoForProcessAddressSpace(Common::ProcessAddress* out_map_address, - std::unique_ptr* out_page_group, - Core::System& system, u32 size) { +Result AllocateSharedBufferMemory(std::unique_ptr* out_page_group, + Core::System& system, u32 size) { using Core::Memory::YUZU_PAGESIZE; // Allocate memory for the system shared buffer. - // FIXME: Because the gmmu can only point to cpu addresses, we need - // to map this in the application space to allow it to be used. - // FIXME: Add proper smmu emulation. // FIXME: This memory belongs to vi's .data section. auto& kernel = system.Kernel(); - auto* process = system.ApplicationProcess(); - auto& page_table = process->GetPageTable(); // Hold a temporary page group reference while we try to map it. auto pg = std::make_unique( @@ -43,6 +39,30 @@ Result AllocateIoForProcessAddressSpace(Common::ProcessAddress* out_map_address, Kernel::KMemoryManager::EncodeOption(Kernel::KMemoryManager::Pool::Secure, Kernel::KMemoryManager::Direction::FromBack))); + // Fill the output data with red. + for (auto& block : *pg) { + u32* start = system.DeviceMemory().GetPointer(block.GetAddress()); + u32* end = system.DeviceMemory().GetPointer(block.GetAddress() + block.GetSize()); + + for (; start < end; start++) { + *start = 0xFF0000FF; + } + } + + // Return the mapped page group. + *out_page_group = std::move(pg); + + // We succeeded. + R_SUCCEED(); +} + +Result MapSharedBufferIntoProcessAddressSpace(Common::ProcessAddress* out_map_address, + std::unique_ptr& pg, + Kernel::KProcess* process, Core::System& system) { + using Core::Memory::YUZU_PAGESIZE; + + auto& page_table = process->GetPageTable(); + // Get bounds of where mapping is possible. const VAddr alias_code_begin = GetInteger(page_table.GetAliasCodeRegionStart()); const VAddr alias_code_size = page_table.GetAliasCodeRegionSize() / YUZU_PAGESIZE; @@ -64,9 +84,6 @@ Result AllocateIoForProcessAddressSpace(Common::ProcessAddress* out_map_address, // Return failure, if necessary R_UNLESS(i < 64, res); - // Return the mapped page group. - *out_page_group = std::move(pg); - // We succeeded. R_SUCCEED(); } @@ -135,6 +152,13 @@ Result AllocateHandleForBuffer(u32* out_handle, Nvidia::Module& nvdrv, Nvidia::D R_RETURN(AllocNvMapHandle(*nvmap, *out_handle, buffer, size, nvmap_fd)); } +void FreeHandle(u32 handle, Nvidia::Module& nvdrv, Nvidia::DeviceFD nvmap_fd) { + auto nvmap = nvdrv.GetDevice(nvmap_fd); + ASSERT(nvmap != nullptr); + + R_ASSERT(FreeNvMapHandle(*nvmap, handle, nvmap_fd)); +} + constexpr auto SharedBufferBlockLinearFormat = android::PixelFormat::Rgba8888; constexpr u32 SharedBufferBlockLinearBpp = 4; @@ -186,53 +210,97 @@ FbShareBufferManager::FbShareBufferManager(Core::System& system, Nvnflinger& fli FbShareBufferManager::~FbShareBufferManager() = default; -Result FbShareBufferManager::Initialize(u64* out_buffer_id, u64* out_layer_id, u64 display_id) { +Result FbShareBufferManager::Initialize(Kernel::KProcess* owner_process, u64* out_buffer_id, + u64* out_layer_handle, u64 display_id, + LayerBlending blending) { std::scoped_lock lk{m_guard}; - // Ensure we have not already created a buffer. - R_UNLESS(m_buffer_id == 0, VI::ResultOperationFailed); + // Ensure we haven't already created. + const u64 aruid = owner_process->GetProcessId(); + R_UNLESS(!m_sessions.contains(aruid), VI::ResultPermissionDenied); - // Allocate memory and space for the shared buffer. 
- Common::ProcessAddress map_address; - R_TRY(AllocateIoForProcessAddressSpace(std::addressof(map_address), - std::addressof(m_buffer_page_group), m_system, - SharedBufferSize)); + // Allocate memory for the shared buffer if needed. + if (!m_buffer_page_group) { + R_TRY(AllocateSharedBufferMemory(std::addressof(m_buffer_page_group), m_system, + SharedBufferSize)); + + // Record buffer id. + m_buffer_id = m_next_buffer_id++; + + // Record display id. + m_display_id = display_id; + } + + // Map into process. + Common::ProcessAddress map_address{}; + R_TRY(MapSharedBufferIntoProcessAddressSpace(std::addressof(map_address), m_buffer_page_group, + owner_process, m_system)); + + // Create new session. + auto [it, was_emplaced] = m_sessions.emplace(aruid, FbShareSession{}); + auto& session = it->second; auto& container = m_nvdrv->GetContainer(); - m_session_id = container.OpenSession(m_system.ApplicationProcess()); - m_nvmap_fd = m_nvdrv->Open("/dev/nvmap", m_session_id); + session.session_id = container.OpenSession(owner_process); + session.nvmap_fd = m_nvdrv->Open("/dev/nvmap", session.session_id); // Create an nvmap handle for the buffer and assign the memory to it. - R_TRY(AllocateHandleForBuffer(std::addressof(m_buffer_nvmap_handle), *m_nvdrv, m_nvmap_fd, - map_address, SharedBufferSize)); - - // Record the display id. - m_display_id = display_id; + R_TRY(AllocateHandleForBuffer(std::addressof(session.buffer_nvmap_handle), *m_nvdrv, + session.nvmap_fd, map_address, SharedBufferSize)); // Create and open a layer for the display. - m_layer_id = m_flinger.CreateLayer(m_display_id).value(); - m_flinger.OpenLayer(m_layer_id); - - // Set up the buffer. - m_buffer_id = m_next_buffer_id++; + session.layer_id = m_flinger.CreateLayer(m_display_id, blending).value(); + m_flinger.OpenLayer(session.layer_id); // Get the layer. - VI::Layer* layer = m_flinger.FindLayer(m_display_id, m_layer_id); + VI::Layer* layer = m_flinger.FindLayer(m_display_id, session.layer_id); ASSERT(layer != nullptr); // Get the producer and set preallocated buffers. auto& producer = layer->GetBufferQueue(); - MakeGraphicBuffer(producer, 0, m_buffer_nvmap_handle); - MakeGraphicBuffer(producer, 1, m_buffer_nvmap_handle); + MakeGraphicBuffer(producer, 0, session.buffer_nvmap_handle); + MakeGraphicBuffer(producer, 1, session.buffer_nvmap_handle); // Assign outputs. *out_buffer_id = m_buffer_id; - *out_layer_id = m_layer_id; + *out_layer_handle = session.layer_id; // We succeeded. R_SUCCEED(); } +void FbShareBufferManager::Finalize(Kernel::KProcess* owner_process) { + std::scoped_lock lk{m_guard}; + + if (m_buffer_id == 0) { + return; + } + + const u64 aruid = owner_process->GetProcessId(); + const auto it = m_sessions.find(aruid); + if (it == m_sessions.end()) { + return; + } + + auto& session = it->second; + + // Destroy the layer. + m_flinger.DestroyLayer(session.layer_id); + + // Close nvmap handle. + FreeHandle(session.buffer_nvmap_handle, *m_nvdrv, session.nvmap_fd); + + // Close nvmap device. + m_nvdrv->Close(session.nvmap_fd); + + // Close session. + auto& container = m_nvdrv->GetContainer(); + container.CloseSession(session.session_id); + + // Erase. 
+ m_sessions.erase(it); +} + Result FbShareBufferManager::GetSharedBufferMemoryHandleId(u64* out_buffer_size, s32* out_nvmap_handle, SharedMemoryPoolLayout* out_pool_layout, @@ -242,17 +310,18 @@ Result FbShareBufferManager::GetSharedBufferMemoryHandleId(u64* out_buffer_size, R_UNLESS(m_buffer_id > 0, VI::ResultNotFound); R_UNLESS(buffer_id == m_buffer_id, VI::ResultNotFound); + R_UNLESS(m_sessions.contains(applet_resource_user_id), VI::ResultNotFound); *out_pool_layout = SharedBufferPoolLayout; *out_buffer_size = SharedBufferSize; - *out_nvmap_handle = m_buffer_nvmap_handle; + *out_nvmap_handle = m_sessions[applet_resource_user_id].buffer_nvmap_handle; R_SUCCEED(); } Result FbShareBufferManager::GetLayerFromId(VI::Layer** out_layer, u64 layer_id) { // Ensure the layer id is valid. - R_UNLESS(m_layer_id > 0 && layer_id == m_layer_id, VI::ResultNotFound); + R_UNLESS(layer_id > 0, VI::ResultNotFound); // Get the layer. VI::Layer* layer = m_flinger.FindLayer(m_display_id, layer_id); @@ -309,6 +378,10 @@ Result FbShareBufferManager::PresentSharedFrameBuffer(android::Fence fence, android::Status::NoError, VI::ResultOperationFailed); + ON_RESULT_FAILURE { + producer.CancelBuffer(static_cast(slot), fence); + }; + // Queue the buffer to the producer. android::QueueBufferInput input{}; android::QueueBufferOutput output{}; @@ -342,4 +415,33 @@ Result FbShareBufferManager::GetSharedFrameBufferAcquirableEvent(Kernel::KReadab R_SUCCEED(); } +Result FbShareBufferManager::WriteAppletCaptureBuffer(bool* out_was_written, s32* out_layer_index) { + std::vector capture_buffer(m_system.GPU().GetAppletCaptureBuffer()); + Common::ScratchBuffer scratch; + + // TODO: this could be optimized + s64 e = -1280 * 768 * 4; + for (auto& block : *m_buffer_page_group) { + u8* start = m_system.DeviceMemory().GetPointer(block.GetAddress()); + u8* end = m_system.DeviceMemory().GetPointer(block.GetAddress() + block.GetSize()); + + for (; start < end; start++) { + *start = 0; + + if (e >= 0 && e < static_cast(capture_buffer.size())) { + *start = capture_buffer[e]; + } + e++; + } + + m_system.GPU().Host1x().MemoryManager().ApplyOpOnPointer(start, scratch, [&](DAddr addr) { + m_system.GPU().InvalidateRegion(addr, end - start); + }); + } + + *out_was_written = true; + *out_layer_index = 1; + R_SUCCEED(); +} + } // namespace Service::Nvnflinger diff --git a/src/core/hle/service/nvnflinger/fb_share_buffer_manager.h b/src/core/hle/service/nvnflinger/fb_share_buffer_manager.h index 033bf4bbe..b79a7d23a 100755 --- a/src/core/hle/service/nvnflinger/fb_share_buffer_manager.h +++ b/src/core/hle/service/nvnflinger/fb_share_buffer_manager.h @@ -3,9 +3,12 @@ #pragma once +#include + #include "common/math_util.h" #include "core/hle/service/nvdrv/core/container.h" #include "core/hle/service/nvdrv/nvdata.h" +#include "core/hle/service/nvnflinger/hwc_layer.h" #include "core/hle/service/nvnflinger/nvnflinger.h" #include "core/hle/service/nvnflinger/ui/fence.h" @@ -29,13 +32,18 @@ struct SharedMemoryPoolLayout { }; static_assert(sizeof(SharedMemoryPoolLayout) == 0x188, "SharedMemoryPoolLayout has wrong size"); +struct FbShareSession; + class FbShareBufferManager final { public: explicit FbShareBufferManager(Core::System& system, Nvnflinger& flinger, std::shared_ptr nvdrv); ~FbShareBufferManager(); - Result Initialize(u64* out_buffer_id, u64* out_layer_handle, u64 display_id); + Result Initialize(Kernel::KProcess* owner_process, u64* out_buffer_id, u64* out_layer_handle, + u64 display_id, LayerBlending blending); + void Finalize(Kernel::KProcess* 
owner_process); + Result GetSharedBufferMemoryHandleId(u64* out_buffer_size, s32* out_nvmap_handle, SharedMemoryPoolLayout* out_pool_layout, u64 buffer_id, u64 applet_resource_user_id); @@ -45,6 +53,8 @@ public: u32 transform, s32 swap_interval, u64 layer_id, s64 slot); Result GetSharedFrameBufferAcquirableEvent(Kernel::KReadableEvent** out_event, u64 layer_id); + Result WriteAppletCaptureBuffer(bool* out_was_written, s32* out_layer_index); + private: Result GetLayerFromId(VI::Layer** out_layer, u64 layer_id); @@ -52,11 +62,8 @@ private: u64 m_next_buffer_id = 1; u64 m_display_id = 0; u64 m_buffer_id = 0; - u64 m_layer_id = 0; - u32 m_buffer_nvmap_handle = 0; SharedMemoryPoolLayout m_pool_layout = {}; - Nvidia::DeviceFD m_nvmap_fd = {}; - Nvidia::NvCore::SessionId m_session_id = {}; + std::map m_sessions; std::unique_ptr m_buffer_page_group; std::mutex m_guard; @@ -65,4 +72,11 @@ private: std::shared_ptr m_nvdrv; }; +struct FbShareSession { + Nvidia::DeviceFD nvmap_fd = {}; + Nvidia::NvCore::SessionId session_id = {}; + u64 layer_id = {}; + u32 buffer_nvmap_handle = 0; +}; + } // namespace Service::Nvnflinger diff --git a/src/core/hle/service/nvnflinger/hardware_composer.cpp b/src/core/hle/service/nvnflinger/hardware_composer.cpp index c720dd1f8..b2e35ffd4 100755 --- a/src/core/hle/service/nvnflinger/hardware_composer.cpp +++ b/src/core/hle/service/nvnflinger/hardware_composer.cpp @@ -109,6 +109,7 @@ u32 HardwareComposer::ComposeLocked(f32* out_speed_scale, VI::Display& display, .height = igbp_buffer.Height(), .stride = igbp_buffer.Stride(), .z_index = 0, + .blending = layer.GetBlending(), .transform = static_cast(item.transform), .crop_rect = item.crop, .acquire_fence = item.fence, diff --git a/src/core/hle/service/nvnflinger/hwc_layer.h b/src/core/hle/service/nvnflinger/hwc_layer.h index 3af668a25..f71a5d822 100755 --- a/src/core/hle/service/nvnflinger/hwc_layer.h +++ b/src/core/hle/service/nvnflinger/hwc_layer.h @@ -11,6 +11,18 @@ namespace Service::Nvnflinger { +// hwc_layer_t::blending values +enum class LayerBlending : u32 { + // No blending + None = 0x100, + + // ONE / ONE_MINUS_SRC_ALPHA + Premultiplied = 0x105, + + // SRC_ALPHA / ONE_MINUS_SRC_ALPHA + Coverage = 0x405, +}; + struct HwcLayer { u32 buffer_handle; u32 offset; @@ -19,6 +31,7 @@ struct HwcLayer { u32 height; u32 stride; s32 z_index; + LayerBlending blending; android::BufferTransformFlags transform; Common::Rectangle crop_rect; android::Fence acquire_fence; diff --git a/src/core/hle/service/nvnflinger/nvnflinger.cpp b/src/core/hle/service/nvnflinger/nvnflinger.cpp index a4e848882..0ad3e099a 100755 --- a/src/core/hle/service/nvnflinger/nvnflinger.cpp +++ b/src/core/hle/service/nvnflinger/nvnflinger.cpp @@ -157,7 +157,7 @@ bool Nvnflinger::CloseDisplay(u64 display_id) { return true; } -std::optional Nvnflinger::CreateLayer(u64 display_id) { +std::optional Nvnflinger::CreateLayer(u64 display_id, LayerBlending blending) { const auto lock_guard = Lock(); auto* const display = FindDisplay(display_id); @@ -166,13 +166,14 @@ std::optional Nvnflinger::CreateLayer(u64 display_id) { } const u64 layer_id = next_layer_id++; - CreateLayerAtId(*display, layer_id); + CreateLayerAtId(*display, layer_id, blending); return layer_id; } -void Nvnflinger::CreateLayerAtId(VI::Display& display, u64 layer_id) { +void Nvnflinger::CreateLayerAtId(VI::Display& display, u64 layer_id, LayerBlending blending) { const auto buffer_id = next_buffer_queue_id++; display.CreateLayer(layer_id, buffer_id, nvdrv->container); + 
display.FindLayer(layer_id)->SetBlending(blending); } bool Nvnflinger::OpenLayer(u64 layer_id) { diff --git a/src/core/hle/service/nvnflinger/nvnflinger.h b/src/core/hle/service/nvnflinger/nvnflinger.h index c984d55a0..4cf4f069d 100755 --- a/src/core/hle/service/nvnflinger/nvnflinger.h +++ b/src/core/hle/service/nvnflinger/nvnflinger.h @@ -15,6 +15,7 @@ #include "common/thread.h" #include "core/hle/result.h" #include "core/hle/service/kernel_helpers.h" +#include "core/hle/service/nvnflinger/hwc_layer.h" namespace Common { class Event; @@ -72,7 +73,8 @@ public: /// Creates a layer on the specified display and returns the layer ID. /// /// If an invalid display ID is specified, then an empty optional is returned. - [[nodiscard]] std::optional CreateLayer(u64 display_id); + [[nodiscard]] std::optional CreateLayer(u64 display_id, + LayerBlending blending = LayerBlending::None); /// Opens a layer on all displays for the given layer ID. bool OpenLayer(u64 layer_id); @@ -128,7 +130,7 @@ private: [[nodiscard]] VI::Layer* FindLayer(u64 display_id, u64 layer_id); /// Creates a layer with the specified layer ID in the desired display. - void CreateLayerAtId(VI::Display& display, u64 layer_id); + void CreateLayerAtId(VI::Display& display, u64 layer_id, LayerBlending blending); void SplitVSync(std::stop_token stop_token); diff --git a/src/core/hle/service/vi/layer/vi_layer.cpp b/src/core/hle/service/vi/layer/vi_layer.cpp index 0185d187d..cb9af576c 100755 --- a/src/core/hle/service/vi/layer/vi_layer.cpp +++ b/src/core/hle/service/vi/layer/vi_layer.cpp @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "core/hle/service/nvnflinger/hwc_layer.h" #include "core/hle/service/vi/layer/vi_layer.h" namespace Service::VI { @@ -8,8 +9,9 @@ namespace Service::VI { Layer::Layer(u64 layer_id_, u32 binder_id_, android::BufferQueueCore& core_, android::BufferQueueProducer& binder_, std::shared_ptr&& consumer_) - : layer_id{layer_id_}, binder_id{binder_id_}, core{core_}, binder{binder_}, - consumer{std::move(consumer_)}, open{false}, visible{true} {} + : layer_id{layer_id_}, binder_id{binder_id_}, core{core_}, binder{binder_}, consumer{std::move( + consumer_)}, + blending{Nvnflinger::LayerBlending::None}, open{false}, visible{true} {} Layer::~Layer() = default; diff --git a/src/core/hle/service/vi/layer/vi_layer.h b/src/core/hle/service/vi/layer/vi_layer.h index 859346eab..dd5c84693 100755 --- a/src/core/hle/service/vi/layer/vi_layer.h +++ b/src/core/hle/service/vi/layer/vi_layer.h @@ -14,6 +14,10 @@ class BufferQueueCore; class BufferQueueProducer; } // namespace Service::android +namespace Service::Nvnflinger { +enum class LayerBlending : u32; +} + namespace Service::VI { /// Represents a single display layer. 
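The blending path wired up across these nvnflinger/vi changes can be summarized in one place. A minimal, self-contained sketch follows; the enum values and the conversion mirror hwc_layer.h and nvdisp_disp0.cpp above, while the stand-in types and the main() driver are illustrative only, not yuzu code:

#include <cstdint>
#include <iostream>

// Values match Service::Nvnflinger::LayerBlending (hwc_layer_t::blending).
enum class LayerBlending : std::uint32_t { None = 0x100, Premultiplied = 0x105, Coverage = 0x405 };
// Stand-in for the Tegra::BlendMode consumed by the display engine.
enum class BlendMode { Opaque, Premultiplied, Coverage };

// Mirrors ConvertBlending() in nvdisp_disp0.cpp.
BlendMode ConvertBlending(LayerBlending blending) {
    switch (blending) {
    case LayerBlending::Premultiplied:
        return BlendMode::Premultiplied;
    case LayerBlending::Coverage:
        return BlendMode::Coverage;
    case LayerBlending::None:
    default:
        return BlendMode::Opaque;
    }
}

int main() {
    // SystemBufferManager::Initialize selects Coverage blending only for
    // LibraryAppletMode::PartialForeground; all other applet modes keep an opaque layer.
    const bool partial_foreground = true; // illustrative input
    const LayerBlending blending =
        partial_foreground ? LayerBlending::Coverage : LayerBlending::None;

    // Nvnflinger::CreateLayer() stores the value on the VI::Layer (SetBlending), the hardware
    // composer reads it back (GetBlending), and nvdisp converts it when compositing the layer.
    std::cout << static_cast<int>(ConvertBlending(blending)) << '\n'; // prints 2 (Coverage)
}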
@@ -92,12 +96,21 @@ public: return !std::exchange(open, true); } + Nvnflinger::LayerBlending GetBlending() { + return blending; + } + + void SetBlending(Nvnflinger::LayerBlending b) { + blending = b; + } + private: const u64 layer_id; const u32 binder_id; android::BufferQueueCore& core; android::BufferQueueProducer& binder; std::shared_ptr consumer; + Service::Nvnflinger::LayerBlending blending; bool open; bool visible; }; diff --git a/src/core/memory.h b/src/core/memory.h index 80a93ef90..c8fd99c82 100755 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -64,6 +64,8 @@ public: Memory(Memory&&) = default; Memory& operator=(Memory&&) = delete; + static constexpr bool HAS_FLUSH_INVALIDATION = false; + /** * Resets the state of the Memory system. */ diff --git a/src/frontend_common/config.cpp b/src/frontend_common/config.cpp index d34624d28..cbbb07ac7 100755 --- a/src/frontend_common/config.cpp +++ b/src/frontend_common/config.cpp @@ -401,6 +401,14 @@ void Config::ReadNetworkValues() { EndGroup(); } +void Config::ReadLibraryAppletValues() { + BeginGroup(Settings::TranslateCategory(Settings::Category::LibraryApplet)); + + ReadCategory(Settings::Category::LibraryApplet); + + EndGroup(); +} + void Config::ReadValues() { if (global) { ReadDataStorageValues(); @@ -410,6 +418,7 @@ void Config::ReadValues() { ReadServiceValues(); ReadWebServiceValues(); ReadMiscellaneousValues(); + ReadLibraryAppletValues(); } ReadControlValues(); ReadCoreValues(); @@ -511,6 +520,7 @@ void Config::SaveValues() { SaveNetworkValues(); SaveWebServiceValues(); SaveMiscellaneousValues(); + SaveLibraryAppletValues(); } else { LOG_DEBUG(Config, "Saving only generic configuration values"); } @@ -691,6 +701,14 @@ void Config::SaveWebServiceValues() { EndGroup(); } +void Config::SaveLibraryAppletValues() { + BeginGroup(Settings::TranslateCategory(Settings::Category::LibraryApplet)); + + WriteCategory(Settings::Category::LibraryApplet); + + EndGroup(); +} + bool Config::ReadBooleanSetting(const std::string& key, const std::optional default_value) { std::string full_key = GetFullKey(key, false); if (!default_value.has_value()) { diff --git a/src/frontend_common/config.h b/src/frontend_common/config.h index 4ecb97044..8b0599cc3 100755 --- a/src/frontend_common/config.h +++ b/src/frontend_common/config.h @@ -88,6 +88,7 @@ protected: void ReadSystemValues(); void ReadWebServiceValues(); void ReadNetworkValues(); + void ReadLibraryAppletValues(); // Read platform specific sections virtual void ReadHidbusValues() = 0; @@ -121,6 +122,7 @@ protected: void SaveScreenshotValues(); void SaveSystemValues(); void SaveWebServiceValues(); + void SaveLibraryAppletValues(); // Save platform specific sections virtual void SaveHidbusValues() = 0; diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 1da7b9fbe..578a3f66b 100755 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -18,6 +18,7 @@ add_library(video_core STATIC buffer_cache/usage_tracker.h buffer_cache/word_manager.h cache_types.h + capture.h cdma_pusher.cpp cdma_pusher.h compatible_formats.cpp @@ -59,8 +60,8 @@ add_library(video_core STATIC framebuffer_config.h fsr.cpp fsr.h - host1x/codecs/codec.cpp - host1x/codecs/codec.h + host1x/codecs/decoder.cpp + host1x/codecs/decoder.h host1x/codecs/h264.cpp host1x/codecs/h264.h host1x/codecs/vp8.cpp @@ -79,8 +80,6 @@ add_library(video_core STATIC host1x/nvdec.cpp host1x/nvdec.h host1x/nvdec_common.h - host1x/sync_manager.cpp - host1x/sync_manager.h host1x/syncpoint_manager.cpp 
host1x/syncpoint_manager.h host1x/vic.cpp @@ -101,6 +100,7 @@ add_library(video_core STATIC memory_manager.cpp memory_manager.h precompiled_headers.h + present.h pte_kind.h query_cache/bank_base.h query_cache/query_base.h diff --git a/src/video_core/capture.h b/src/video_core/capture.h new file mode 100755 index 000000000..8db14a8ec --- /dev/null +++ b/src/video_core/capture.h @@ -0,0 +1,36 @@ +// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include "common/alignment.h" +#include "common/bit_util.h" +#include "common/common_types.h" +#include "core/frontend/framebuffer_layout.h" +#include "video_core/surface.h" + +namespace VideoCore::Capture { + +constexpr u32 BlockHeight = 4; +constexpr u32 BlockDepth = 0; +constexpr u32 BppLog2 = 2; + +constexpr auto PixelFormat = Surface::PixelFormat::B8G8R8A8_UNORM; + +constexpr auto LinearWidth = Layout::ScreenUndocked::Width; +constexpr auto LinearHeight = Layout::ScreenUndocked::Height; +constexpr auto LinearDepth = 1U; +constexpr auto BytesPerPixel = 4U; + +constexpr auto TiledWidth = LinearWidth; +constexpr auto TiledHeight = Common::AlignUpLog2(LinearHeight, BlockHeight + BlockDepth + BppLog2); +constexpr auto TiledSize = TiledWidth * TiledHeight * (1 << BppLog2); + +constexpr Layout::FramebufferLayout Layout{ + .width = LinearWidth, + .height = LinearHeight, + .screen = {0, 0, LinearWidth, LinearHeight}, + .is_srgb = false, +}; + +} // namespace VideoCore::Capture diff --git a/src/video_core/cdma_pusher.cpp b/src/video_core/cdma_pusher.cpp index 2e38410cf..6daca0f0a 100755 --- a/src/video_core/cdma_pusher.cpp +++ b/src/video_core/cdma_pusher.cpp @@ -2,136 +2,130 @@ // SPDX-License-Identifier: MIT #include + +#include "common/thread.h" +#include "core/core.h" #include "video_core/cdma_pusher.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/host1x/control.h" #include "video_core/host1x/host1x.h" #include "video_core/host1x/nvdec.h" #include "video_core/host1x/nvdec_common.h" -#include "video_core/host1x/sync_manager.h" #include "video_core/host1x/vic.h" #include "video_core/memory_manager.h" namespace Tegra { -CDmaPusher::CDmaPusher(Host1x::Host1x& host1x_) - : host1x{host1x_}, nvdec_processor(std::make_shared(host1x)), - vic_processor(std::make_unique(host1x, nvdec_processor)), - host1x_processor(std::make_unique(host1x)), - sync_manager(std::make_unique(host1x)) {} + +CDmaPusher::CDmaPusher(Host1x::Host1x& host1x_, s32 id) + : host1x{host1x_}, memory_manager{host1x.GMMU()}, + host_processor{std::make_unique(host1x_)}, current_class{ + static_cast(id)} { + thread = std::jthread([this](std::stop_token stop_token) { ProcessEntries(stop_token); }); +} CDmaPusher::~CDmaPusher() = default; -void CDmaPusher::ProcessEntries(ChCommandHeaderList&& entries) { - for (const auto& value : entries) { - if (mask != 0) { - const auto lbs = static_cast(std::countr_zero(mask)); - mask &= ~(1U << lbs); - ExecuteCommand(offset + lbs, value.raw); - continue; - } else if (count != 0) { - --count; - ExecuteCommand(offset, value.raw); - if (incrementing) { - ++offset; +void CDmaPusher::ProcessEntries(std::stop_token stop_token) { + Common::SetCurrentThreadPriority(Common::ThreadPriority::High); + ChCommandHeaderList command_list{host1x.System().ApplicationMemory(), 0, 0}; + u32 count{}; + u32 method_offset{}; + u32 mask{}; + bool incrementing{}; + + while (!stop_token.stop_requested()) { + { + std::unique_lock l{command_mutex}; + Common::CondvarWait(command_cv, l, 
stop_token, + [this]() { return command_lists.size() > 0; }); + if (stop_token.stop_requested()) { + return; } - continue; + + command_list = std::move(command_lists.front()); + command_lists.pop_front(); } - const auto mode = value.submission_mode.Value(); - switch (mode) { - case ChSubmissionMode::SetClass: { - mask = value.value & 0x3f; - offset = value.method_offset; - current_class = static_cast((value.value >> 6) & 0x3ff); - break; - } - case ChSubmissionMode::Incrementing: - case ChSubmissionMode::NonIncrementing: - count = value.value; - offset = value.method_offset; - incrementing = mode == ChSubmissionMode::Incrementing; - break; - case ChSubmissionMode::Mask: - mask = value.value; - offset = value.method_offset; - break; - case ChSubmissionMode::Immediate: { - const u32 data = value.value & 0xfff; - offset = value.method_offset; - ExecuteCommand(offset, data); - break; - } - default: - UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", static_cast(mode)); - break; + + size_t i = 0; + for (const auto value : command_list) { + i++; + if (mask != 0) { + const auto lbs = static_cast(std::countr_zero(mask)); + mask &= ~(1U << lbs); + ExecuteCommand(method_offset + lbs, value.raw); + continue; + } else if (count != 0) { + --count; + ExecuteCommand(method_offset, value.raw); + if (incrementing) { + ++method_offset; + } + continue; + } + const auto mode = value.submission_mode.Value(); + switch (mode) { + case ChSubmissionMode::SetClass: { + mask = value.value & 0x3f; + method_offset = value.method_offset; + current_class = static_cast((value.value >> 6) & 0x3ff); + break; + } + case ChSubmissionMode::Incrementing: + case ChSubmissionMode::NonIncrementing: + count = value.value; + method_offset = value.method_offset; + incrementing = mode == ChSubmissionMode::Incrementing; + break; + case ChSubmissionMode::Mask: + mask = value.value; + method_offset = value.method_offset; + break; + case ChSubmissionMode::Immediate: { + const u32 data = value.value & 0xfff; + method_offset = value.method_offset; + ExecuteCommand(method_offset, data); + break; + } + default: + LOG_ERROR(HW_GPU, "Bad command at index {} (bytes 0x{:X}), buffer size {}", i - 1, + (i - 1) * sizeof(u32), command_list.size()); + UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", + static_cast(mode)); + break; + } } } } -void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) { +void CDmaPusher::ExecuteCommand(u32 method, u32 arg) { switch (current_class) { - case ChClassId::NvDec: - ThiStateWrite(nvdec_thi_state, offset, data); - switch (static_cast(offset)) { - case ThiMethod::IncSyncpt: { - LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method"); - const auto syncpoint_id = static_cast(data & 0xFF); - const auto cond = static_cast((data >> 8) & 0xFF); - if (cond == 0) { - sync_manager->Increment(syncpoint_id); - } else { - sync_manager->SignalDone( - sync_manager->IncrementWhenDone(static_cast(current_class), syncpoint_id)); - } - break; - } - case ThiMethod::SetMethod1: - LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}", - static_cast(nvdec_thi_state.method_0)); - nvdec_processor->ProcessMethod(nvdec_thi_state.method_0, data); - break; - default: - break; - } - break; - case ChClassId::GraphicsVic: - ThiStateWrite(vic_thi_state, static_cast(state_offset), {data}); - switch (static_cast(state_offset)) { - case ThiMethod::IncSyncpt: { - LOG_DEBUG(Service_NVDRV, "VIC Class IncSyncpt Method"); - const auto syncpoint_id = static_cast(data & 0xFF); - const auto cond = static_cast((data >> 8) & 0xFF); - if 
(cond == 0) { - sync_manager->Increment(syncpoint_id); - } else { - sync_manager->SignalDone( - sync_manager->IncrementWhenDone(static_cast(current_class), syncpoint_id)); - } - break; - } - case ThiMethod::SetMethod1: - LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})", - static_cast(vic_thi_state.method_0), data); - vic_processor->ProcessMethod(static_cast(vic_thi_state.method_0), - data); - break; - default: - break; - } - break; case ChClassId::Control: - // This device is mainly for syncpoint synchronization - LOG_DEBUG(Service_NVDRV, "Host1X Class Method"); - host1x_processor->ProcessMethod(static_cast(offset), data); + LOG_TRACE(Service_NVDRV, "Class {} method 0x{:X} arg 0x{:X}", + static_cast(current_class), method, arg); + host_processor->ProcessMethod(static_cast(method), arg); break; default: - UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast(current_class)); - break; + thi_regs.reg_array[method] = arg; + switch (static_cast(method)) { + case ThiMethod::IncSyncpt: { + const auto syncpoint_id = static_cast(arg & 0xFF); + [[maybe_unused]] const auto cond = static_cast((arg >> 8) & 0xFF); + LOG_TRACE(Service_NVDRV, "Class {} IncSyncpt Method, syncpt {} cond {}", + static_cast(current_class), syncpoint_id, cond); + auto& syncpoint_manager = host1x.GetSyncpointManager(); + syncpoint_manager.IncrementGuest(syncpoint_id); + syncpoint_manager.IncrementHost(syncpoint_id); + break; + } + case ThiMethod::SetMethod1: + LOG_TRACE(Service_NVDRV, "Class {} method 0x{:X} arg 0x{:X}", + static_cast(current_class), static_cast(thi_regs.method_0), arg); + ProcessMethod(thi_regs.method_0, arg); + break; + default: + break; + } } } -void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 state_offset, u32 argument) { - u8* const offset_ptr = reinterpret_cast(&state) + sizeof(u32) * state_offset; - std::memcpy(offset_ptr, &argument, sizeof(u32)); -} - } // namespace Tegra diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h index 2d663cdaa..553654b8f 100755 --- a/src/video_core/cdma_pusher.h +++ b/src/video_core/cdma_pusher.h @@ -3,12 +3,18 @@ #pragma once +#include +#include #include +#include +#include #include #include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" +#include "common/polyfill_thread.h" +#include "core/memory.h" namespace Tegra { @@ -62,23 +68,31 @@ struct ChCommand { std::vector arguments; }; -using ChCommandHeaderList = std::vector; +using ChCommandHeaderList = + Core::Memory::CpuGuestMemory; struct ThiRegisters { - u32_le increment_syncpt{}; - INSERT_PADDING_WORDS(1); - u32_le increment_syncpt_error{}; - u32_le ctx_switch_incremement_syncpt{}; - INSERT_PADDING_WORDS(4); - u32_le ctx_switch{}; - INSERT_PADDING_WORDS(1); - u32_le ctx_syncpt_eof{}; - INSERT_PADDING_WORDS(5); - u32_le method_0{}; - u32_le method_1{}; - INSERT_PADDING_WORDS(12); - u32_le int_status{}; - u32_le int_mask{}; + static constexpr std::size_t NUM_REGS = 0x20; + + union { + struct { + u32_le increment_syncpt; + INSERT_PADDING_WORDS_NOINIT(1); + u32_le increment_syncpt_error; + u32_le ctx_switch_incremement_syncpt; + INSERT_PADDING_WORDS_NOINIT(4); + u32_le ctx_switch; + INSERT_PADDING_WORDS_NOINIT(1); + u32_le ctx_syncpt_eof; + INSERT_PADDING_WORDS_NOINIT(5); + u32_le method_0; + u32_le method_1; + INSERT_PADDING_WORDS_NOINIT(12); + u32_le int_status; + u32_le int_mask; + }; + std::array reg_array; + }; }; enum class ThiMethod : u32 { @@ -89,32 +103,39 @@ enum class ThiMethod : u32 { class CDmaPusher { public: - explicit 
CDmaPusher(Host1x::Host1x& host1x); - ~CDmaPusher(); + CDmaPusher() = delete; + virtual ~CDmaPusher(); - /// Process the command entry - void ProcessEntries(ChCommandHeaderList&& entries); + void PushEntries(ChCommandHeaderList&& entries) { + std::scoped_lock l{command_mutex}; + command_lists.push_back(std::move(entries)); + command_cv.notify_one(); + } + +protected: + explicit CDmaPusher(Host1x::Host1x& host1x, s32 id); + + virtual void ProcessMethod(u32 method, u32 arg) = 0; + + Host1x::Host1x& host1x; + Tegra::MemoryManager& memory_manager; private: + /// Process the command entry + void ProcessEntries(std::stop_token stop_token); + /// Invoke command class devices to execute the command based on the current state void ExecuteCommand(u32 state_offset, u32 data); - /// Write arguments value to the ThiRegisters member at the specified offset - void ThiStateWrite(ThiRegisters& state, u32 offset, u32 argument); + std::unique_ptr host_processor; - Host1x::Host1x& host1x; - std::shared_ptr nvdec_processor; - std::unique_ptr vic_processor; - std::unique_ptr host1x_processor; - std::unique_ptr sync_manager; - ChClassId current_class{}; - ThiRegisters vic_thi_state{}; - ThiRegisters nvdec_thi_state{}; + std::mutex command_mutex; + std::condition_variable_any command_cv; + std::deque command_lists; + std::jthread thread; - u32 count{}; - u32 offset{}; - u32 mask{}; - bool incrementing{}; + ThiRegisters thi_regs{}; + ChClassId current_class; }; } // namespace Tegra diff --git a/src/video_core/framebuffer_config.h b/src/video_core/framebuffer_config.h index 4855fdae3..d196dd96a 100755 --- a/src/video_core/framebuffer_config.h +++ b/src/video_core/framebuffer_config.h @@ -11,6 +11,12 @@ namespace Tegra { +enum class BlendMode { + Opaque, + Premultiplied, + Coverage, +}; + /** * Struct describing framebuffer configuration */ @@ -23,6 +29,7 @@ struct FramebufferConfig { Service::android::PixelFormat pixel_format{}; Service::android::BufferTransformFlags transform_flags{}; Common::Rectangle crop_rect{}; + BlendMode blending{}; }; Common::Rectangle NormalizeCrop(const FramebufferConfig& framebuffer, u32 texture_width, diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 03ad6e68b..59356015b 100755 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -250,30 +250,6 @@ struct GPU::Impl { gpu_thread.SubmitList(channel, std::move(entries)); } - /// Push GPU command buffer entries to be processed - void PushCommandBuffer(u32 id, Tegra::ChCommandHeaderList& entries) { - if (!use_nvdec) { - return; - } - - if (!cdma_pushers.contains(id)) { - cdma_pushers.insert_or_assign(id, std::make_unique(host1x)); - } - - // SubmitCommandBuffer would make the nvdec operations async, this is not currently working - // TODO(ameerj): RE proper async nvdec operation - // gpu_thread.SubmitCommandBuffer(std::move(entries)); - cdma_pushers[id]->ProcessEntries(std::move(entries)); - } - - /// Frees the CDMAPusher instance to free up resources - void ClearCdmaInstance(u32 id) { - const auto iter = cdma_pushers.find(id); - if (iter != cdma_pushers.end()) { - cdma_pushers.erase(iter); - } - } - /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory void FlushRegion(DAddr addr, u64 size) { gpu_thread.FlushRegion(addr, size); @@ -347,11 +323,21 @@ struct GPU::Impl { WaitForSyncOperation(wait_fence); } + std::vector GetAppletCaptureBuffer() { + std::vector out; + + const auto wait_fence = + RequestSyncOperation([&] { out = renderer->GetAppletCaptureBuffer(); }); + 
gpu_thread.TickGPU(); + WaitForSyncOperation(wait_fence); + + return out; + } + GPU& gpu; Core::System& system; Host1x::Host1x& host1x; - std::map> cdma_pushers; std::unique_ptr renderer; VideoCore::RasterizerInterface* rasterizer = nullptr; const bool use_nvdec; @@ -505,6 +491,10 @@ void GPU::RequestComposite(std::vector&& layers, impl->RequestComposite(std::move(layers), std::move(fences)); } +std::vector GPU::GetAppletCaptureBuffer() { + return impl->GetAppletCaptureBuffer(); +} + u64 GPU::GetTicks() const { return impl->GetTicks(); } @@ -541,14 +531,6 @@ void GPU::PushGPUEntries(s32 channel, Tegra::CommandList&& entries) { impl->PushGPUEntries(channel, std::move(entries)); } -void GPU::PushCommandBuffer(u32 id, Tegra::ChCommandHeaderList& entries) { - impl->PushCommandBuffer(id, entries); -} - -void GPU::ClearCdmaInstance(u32 id) { - impl->ClearCdmaInstance(id); -} - VideoCore::RasterizerDownloadArea GPU::OnCPURead(PAddr addr, u64 size) { return impl->OnCPURead(addr, size); } diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 86edf9aaa..25c75a109 100755 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -215,6 +215,8 @@ public: void RequestComposite(std::vector&& layers, std::vector&& fences); + std::vector GetAppletCaptureBuffer(); + /// Performs any additional setup necessary in order to begin GPU emulation. /// This can be used to launch any necessary threads and register any necessary /// core timing events. @@ -232,15 +234,6 @@ public: /// Push GPU command entries to be processed void PushGPUEntries(s32 channel, Tegra::CommandList&& entries); - /// Push GPU command buffer entries to be processed - void PushCommandBuffer(u32 id, Tegra::ChCommandHeaderList& entries); - - /// Frees the CDMAPusher instance to free up resources - void ClearCdmaInstance(u32 id); - - /// Swap buffers (render frame) - void SwapBuffers(const Tegra::FramebufferConfig* framebuffer); - /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory [[nodiscard]] VideoCore::RasterizerDownloadArea OnCPURead(DAddr addr, u64 size); diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index d477743ca..0832234af 100755 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -12,6 +12,7 @@ #include "video_core/dma_pusher.h" #include "video_core/gpu.h" #include "video_core/gpu_thread.h" +#include "video_core/host1x/host1x.h" #include "video_core/renderer_base.h" namespace VideoCommon::GPUThread { diff --git a/src/video_core/host1x/codecs/decoder.cpp b/src/video_core/host1x/codecs/decoder.cpp index 559166b51..8508cc172 100755 --- a/src/video_core/host1x/codecs/decoder.cpp +++ b/src/video_core/host1x/codecs/decoder.cpp @@ -9,8 +9,10 @@ namespace Tegra { -Decoder::Decoder(Host1x::Host1x& host1x_, s32 id_, const Host1x::NvdecCommon::NvdecRegisters& regs_) - : host1x(host1x_), memory_manager{host1x.GMMU()}, regs{regs_}, id{id_} {} +Decoder::Decoder(Host1x::Host1x& host1x_, s32 id_, const Host1x::NvdecCommon::NvdecRegisters& regs_, + Host1x::FrameQueue& frame_queue_) + : host1x(host1x_), memory_manager{host1x.GMMU()}, regs{regs_}, id{id_}, frame_queue{ + frame_queue_} {} Decoder::~Decoder() = default; @@ -43,11 +45,11 @@ void Decoder::Decode() { } if (UsingDecodeOrder()) { - decode_order_frames.insert_or_assign(luma_top, std::move(frame)); - decode_order_frames.insert_or_assign(luma_bottom, std::move(frame_copy)); + frame_queue.PushDecodeOrder(id, luma_top, std::move(frame)); + frame_queue.PushDecodeOrder(id, luma_bottom, 
std::move(frame_copy)); } else { - presentation_order_frames.push(std::move(frame)); - presentation_order_frames.push(std::move(frame_copy)); + frame_queue.PushPresentOrder(id, luma_top, std::move(frame)); + frame_queue.PushPresentOrder(id, luma_bottom, std::move(frame_copy)); } } else { auto [luma_offset, chroma_offset] = GetProgressiveOffsets(); @@ -57,9 +59,9 @@ void Decoder::Decode() { } if (UsingDecodeOrder()) { - decode_order_frames.insert_or_assign(luma_offset, std::move(frame)); + frame_queue.PushDecodeOrder(id, luma_offset, std::move(frame)); } else { - presentation_order_frames.push(std::move(frame)); + frame_queue.PushPresentOrder(id, luma_offset, std::move(frame)); } } } diff --git a/src/video_core/host1x/codecs/decoder.h b/src/video_core/host1x/codecs/decoder.h index 9245b6b8f..9b350ee3e 100755 --- a/src/video_core/host1x/codecs/decoder.h +++ b/src/video_core/host1x/codecs/decoder.h @@ -4,6 +4,7 @@ #pragma once #include +#include #include #include #include @@ -17,6 +18,7 @@ namespace Tegra { namespace Host1x { class Host1x; +class FrameQueue; } // namespace Host1x class Decoder { @@ -30,23 +32,6 @@ public: return decode_api.UsingDecodeOrder(); } - std::shared_ptr GetFrame(u64 luma_offset) { - if (UsingDecodeOrder()) { - auto it = decode_order_frames.find(luma_offset); - if (it == decode_order_frames.end()) { - return {}; - } - return decode_order_frames.extract(it).mapped(); - } - - if (presentation_order_frames.size() == 0) { - return {}; - } - auto frame = std::move(presentation_order_frames.front()); - presentation_order_frames.pop(); - return frame; - } - /// Returns the value of current_codec [[nodiscard]] Host1x::NvdecCommon::VideoCodec GetCurrentCodec() const { return codec; @@ -57,7 +42,8 @@ public: protected: explicit Decoder(Host1x::Host1x& host1x, s32 id, - const Host1x::NvdecCommon::NvdecRegisters& regs); + const Host1x::NvdecCommon::NvdecRegisters& regs, + Host1x::FrameQueue& frame_queue); virtual std::span ComposeFrame() = 0; virtual std::tuple GetProgressiveOffsets() = 0; @@ -68,12 +54,10 @@ protected: Tegra::MemoryManager& memory_manager; const Host1x::NvdecCommon::NvdecRegisters& regs; s32 id; + Host1x::FrameQueue& frame_queue; Host1x::NvdecCommon::VideoCodec codec; FFmpeg::DecodeApi decode_api; bool initialized{}; - std::queue> presentation_order_frames; - std::unordered_map> decode_order_frames; - bool vp9_hidden_frame{}; }; diff --git a/src/video_core/host1x/codecs/h264.cpp b/src/video_core/host1x/codecs/h264.cpp index 77eda05c2..d3b66c20c 100755 --- a/src/video_core/host1x/codecs/h264.cpp +++ b/src/video_core/host1x/codecs/h264.cpp @@ -10,7 +10,7 @@ #include "video_core/host1x/host1x.h" #include "video_core/memory_manager.h" -namespace Tegra::Decoder { +namespace Tegra::Decoders { namespace { // ZigZag LUTs from libavcodec. 
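// (These are libavcodec's zig-zag index tables: zig_zag_scan is the 4x4 pattern and
// zig_zag_direct the 8x8 one; they give the coded order used when the scaling lists
// are emitted further below.)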
constexpr std::array zig_zag_direct{ @@ -25,23 +25,56 @@ constexpr std::array zig_zag_scan{ }; } // Anonymous namespace -H264::H264(Host1x::Host1x& host1x_) : host1x{host1x_} {} +H264::H264(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_, + Host1x::FrameQueue& frame_queue_) + : Decoder{host1x_, id_, regs_, frame_queue_} { + codec = Host1x::NvdecCommon::VideoCodec::H264; + initialized = decode_api.Initialize(codec); +} H264::~H264() = default; -std::span H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state, - size_t* out_configuration_size, bool is_first_frame) { - H264DecoderContext context; - host1x.GMMU().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext)); +std::tuple H264::GetProgressiveOffsets() { + auto pic_idx{current_context.h264_parameter_set.curr_pic_idx}; + auto luma{regs.surface_luma_offsets[pic_idx].Address() + + current_context.h264_parameter_set.luma_frame_offset.Address()}; + auto chroma{regs.surface_chroma_offsets[pic_idx].Address() + + current_context.h264_parameter_set.chroma_frame_offset.Address()}; + return {luma, chroma}; +} - const s64 frame_number = context.h264_parameter_set.frame_number.Value(); +std::tuple H264::GetInterlacedOffsets() { + auto pic_idx{current_context.h264_parameter_set.curr_pic_idx}; + auto luma_top{regs.surface_luma_offsets[pic_idx].Address() + + current_context.h264_parameter_set.luma_top_offset.Address()}; + auto luma_bottom{regs.surface_luma_offsets[pic_idx].Address() + + current_context.h264_parameter_set.luma_bot_offset.Address()}; + auto chroma_top{regs.surface_chroma_offsets[pic_idx].Address() + + current_context.h264_parameter_set.chroma_top_offset.Address()}; + auto chroma_bottom{regs.surface_chroma_offsets[pic_idx].Address() + + current_context.h264_parameter_set.chroma_bot_offset.Address()}; + return {luma_top, luma_bottom, chroma_top, chroma_bottom}; +} + +bool H264::IsInterlaced() { + return current_context.h264_parameter_set.luma_top_offset.Address() != 0 || + current_context.h264_parameter_set.luma_bot_offset.Address() != 0; +} + +std::span H264::ComposeFrame() { + memory_manager.ReadBlock(regs.picture_info_offset.Address(), ¤t_context, + sizeof(H264DecoderContext)); + + const s64 frame_number = current_context.h264_parameter_set.frame_number.Value(); if (!is_first_frame && frame_number != 0) { - frame.resize_destructive(context.stream_len); - host1x.GMMU().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size()); - *out_configuration_size = 0; - return frame; + frame_scratch.resize_destructive(current_context.stream_len); + memory_manager.ReadBlock(regs.frame_bitstream_offset.Address(), frame_scratch.data(), + frame_scratch.size()); + return frame_scratch; } + is_first_frame = false; + // Encode header H264BitWriter writer{}; writer.WriteU(1, 24); @@ -53,7 +86,7 @@ std::span H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters writer.WriteU(31, 8); writer.WriteUe(0); const u32 chroma_format_idc = - static_cast(context.h264_parameter_set.chroma_format_idc.Value()); + static_cast(current_context.h264_parameter_set.chroma_format_idc.Value()); writer.WriteUe(chroma_format_idc); if (chroma_format_idc == 3) { writer.WriteBit(false); @@ -61,42 +94,44 @@ std::span H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters writer.WriteUe(0); writer.WriteUe(0); - writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag + writer.WriteBit(current_context.qpprime_y_zero_transform_bypass_flag.Value() != 0); writer.WriteBit(false); // Scaling matrix 
present flag - writer.WriteUe(static_cast(context.h264_parameter_set.log2_max_frame_num_minus4.Value())); + writer.WriteUe( + static_cast(current_context.h264_parameter_set.log2_max_frame_num_minus4.Value())); const auto order_cnt_type = - static_cast(context.h264_parameter_set.pic_order_cnt_type.Value()); + static_cast(current_context.h264_parameter_set.pic_order_cnt_type.Value()); writer.WriteUe(order_cnt_type); if (order_cnt_type == 0) { - writer.WriteUe(context.h264_parameter_set.log2_max_pic_order_cnt_lsb_minus4); + writer.WriteUe(current_context.h264_parameter_set.log2_max_pic_order_cnt_lsb_minus4); } else if (order_cnt_type == 1) { - writer.WriteBit(context.h264_parameter_set.delta_pic_order_always_zero_flag != 0); + writer.WriteBit(current_context.h264_parameter_set.delta_pic_order_always_zero_flag != 0); writer.WriteSe(0); writer.WriteSe(0); writer.WriteUe(0); } - const s32 pic_height = context.h264_parameter_set.frame_height_in_map_units / - (context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2); + const s32 pic_height = current_context.h264_parameter_set.frame_height_in_mbs / + (current_context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2); - // TODO (ameerj): Where do we get this number, it seems to be particular for each stream - const auto nvdec_decoding = Settings::values.nvdec_emulation.GetValue(); - const bool uses_gpu_decoding = nvdec_decoding == Settings::NvdecEmulation::Gpu; - const u32 max_num_ref_frames = uses_gpu_decoding ? 6u : 16u; + u32 max_num_ref_frames = + std::max(std::max(current_context.h264_parameter_set.num_refidx_l0_default_active, + current_context.h264_parameter_set.num_refidx_l1_default_active) + + 1, + 4); writer.WriteUe(max_num_ref_frames); writer.WriteBit(false); - writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1); + writer.WriteUe(current_context.h264_parameter_set.pic_width_in_mbs - 1); writer.WriteUe(pic_height - 1); - writer.WriteBit(context.h264_parameter_set.frame_mbs_only_flag != 0); + writer.WriteBit(current_context.h264_parameter_set.frame_mbs_only_flag != 0); - if (!context.h264_parameter_set.frame_mbs_only_flag) { - writer.WriteBit(context.h264_parameter_set.flags.mbaff_frame.Value() != 0); + if (!current_context.h264_parameter_set.frame_mbs_only_flag) { + writer.WriteBit(current_context.h264_parameter_set.flags.mbaff_frame.Value() != 0); } - writer.WriteBit(context.h264_parameter_set.flags.direct_8x8_inference.Value() != 0); + writer.WriteBit(current_context.h264_parameter_set.flags.direct_8x8_inference.Value() != 0); writer.WriteBit(false); // Frame cropping flag writer.WriteBit(false); // VUI parameter present flag @@ -111,57 +146,59 @@ std::span H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters writer.WriteUe(0); writer.WriteUe(0); - writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag != 0); - writer.WriteBit(context.h264_parameter_set.pic_order_present_flag != 0); + writer.WriteBit(current_context.h264_parameter_set.entropy_coding_mode_flag != 0); + writer.WriteBit(current_context.h264_parameter_set.pic_order_present_flag != 0); writer.WriteUe(0); - writer.WriteUe(context.h264_parameter_set.num_refidx_l0_default_active); - writer.WriteUe(context.h264_parameter_set.num_refidx_l1_default_active); - writer.WriteBit(context.h264_parameter_set.flags.weighted_pred.Value() != 0); - writer.WriteU(static_cast(context.h264_parameter_set.weighted_bipred_idc.Value()), 2); - s32 pic_init_qp = static_cast(context.h264_parameter_set.pic_init_qp_minus26.Value()); + 
writer.WriteUe(current_context.h264_parameter_set.num_refidx_l0_default_active); + writer.WriteUe(current_context.h264_parameter_set.num_refidx_l1_default_active); + writer.WriteBit(current_context.h264_parameter_set.flags.weighted_pred.Value() != 0); + writer.WriteU(static_cast(current_context.h264_parameter_set.weighted_bipred_idc.Value()), + 2); + s32 pic_init_qp = + static_cast(current_context.h264_parameter_set.pic_init_qp_minus26.Value()); writer.WriteSe(pic_init_qp); writer.WriteSe(0); s32 chroma_qp_index_offset = - static_cast(context.h264_parameter_set.chroma_qp_index_offset.Value()); + static_cast(current_context.h264_parameter_set.chroma_qp_index_offset.Value()); writer.WriteSe(chroma_qp_index_offset); - writer.WriteBit(context.h264_parameter_set.deblocking_filter_control_present_flag != 0); - writer.WriteBit(context.h264_parameter_set.flags.constrained_intra_pred.Value() != 0); - writer.WriteBit(context.h264_parameter_set.redundant_pic_cnt_present_flag != 0); - writer.WriteBit(context.h264_parameter_set.transform_8x8_mode_flag != 0); + writer.WriteBit(current_context.h264_parameter_set.deblocking_filter_control_present_flag != 0); + writer.WriteBit(current_context.h264_parameter_set.flags.constrained_intra_pred.Value() != 0); + writer.WriteBit(current_context.h264_parameter_set.redundant_pic_cnt_present_flag != 0); + writer.WriteBit(current_context.h264_parameter_set.transform_8x8_mode_flag != 0); writer.WriteBit(true); // pic_scaling_matrix_present_flag for (s32 index = 0; index < 6; index++) { writer.WriteBit(true); - std::span matrix{context.weight_scale}; - writer.WriteScalingList(scan, matrix, index * 16, 16); + std::span matrix{current_context.weight_scale_4x4}; + writer.WriteScalingList(scan_scratch, matrix, index * 16, 16); } - if (context.h264_parameter_set.transform_8x8_mode_flag) { + if (current_context.h264_parameter_set.transform_8x8_mode_flag) { for (s32 index = 0; index < 2; index++) { writer.WriteBit(true); - std::span matrix{context.weight_scale_8x8}; - writer.WriteScalingList(scan, matrix, index * 64, 64); + std::span matrix{current_context.weight_scale_8x8}; + writer.WriteScalingList(scan_scratch, matrix, index * 64, 64); } } s32 chroma_qp_index_offset2 = - static_cast(context.h264_parameter_set.second_chroma_qp_index_offset.Value()); + static_cast(current_context.h264_parameter_set.second_chroma_qp_index_offset.Value()); writer.WriteSe(chroma_qp_index_offset2); writer.End(); const auto& encoded_header = writer.GetByteArray(); - frame.resize(encoded_header.size() + context.stream_len); - std::memcpy(frame.data(), encoded_header.data(), encoded_header.size()); + frame_scratch.resize(encoded_header.size() + current_context.stream_len); + std::memcpy(frame_scratch.data(), encoded_header.data(), encoded_header.size()); - *out_configuration_size = encoded_header.size(); - host1x.GMMU().ReadBlock(state.frame_bitstream_offset, frame.data() + encoded_header.size(), - context.stream_len); + memory_manager.ReadBlock(regs.frame_bitstream_offset.Address(), + frame_scratch.data() + encoded_header.size(), + current_context.stream_len); - return frame; + return frame_scratch; } H264BitWriter::H264BitWriter() = default; @@ -278,4 +315,4 @@ void H264BitWriter::Flush() { buffer = 0; buffer_pos = 0; } -} // namespace Tegra::Decoder +} // namespace Tegra::Decoders diff --git a/src/video_core/host1x/codecs/h264.h b/src/video_core/host1x/codecs/h264.h index 80dbb4e4b..9e95c6c68 100755 --- a/src/video_core/host1x/codecs/h264.h +++ b/src/video_core/host1x/codecs/h264.h @@ -10,6 
+10,7 @@ #include "common/common_funcs.h" #include "common/common_types.h" #include "common/scratch_buffer.h" +#include "video_core/host1x/codecs/decoder.h" #include "video_core/host1x/nvdec_common.h" namespace Tegra { @@ -18,7 +19,7 @@ namespace Host1x { class Host1x; } // namespace Host1x -namespace Decoder { +namespace Decoders { class H264BitWriter { public: @@ -60,123 +61,213 @@ private: std::vector byte_array; }; -class H264 { -public: - explicit H264(Host1x::Host1x& host1x); - ~H264(); - - /// Compose the H264 frame for FFmpeg decoding - [[nodiscard]] std::span ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state, - size_t* out_configuration_size, - bool is_first_frame = false); +struct Offset { + constexpr u32 Address() const noexcept { + return offset << 8; + } private: - Common::ScratchBuffer frame; - Common::ScratchBuffer scan; - Host1x::Host1x& host1x; + u32 offset; +}; +static_assert(std::is_trivial_v, "Offset must be trivial"); +static_assert(sizeof(Offset) == 0x4, "Offset has the wrong size!"); - struct H264ParameterSet { - s32 log2_max_pic_order_cnt_lsb_minus4; ///< 0x00 - s32 delta_pic_order_always_zero_flag; ///< 0x04 - s32 frame_mbs_only_flag; ///< 0x08 - u32 pic_width_in_mbs; ///< 0x0C - u32 frame_height_in_map_units; ///< 0x10 - union { ///< 0x14 - BitField<0, 2, u32> tile_format; - BitField<2, 3, u32> gob_height; - }; - u32 entropy_coding_mode_flag; ///< 0x18 - s32 pic_order_present_flag; ///< 0x1C - s32 num_refidx_l0_default_active; ///< 0x20 - s32 num_refidx_l1_default_active; ///< 0x24 - s32 deblocking_filter_control_present_flag; ///< 0x28 - s32 redundant_pic_cnt_present_flag; ///< 0x2C - u32 transform_8x8_mode_flag; ///< 0x30 - u32 pitch_luma; ///< 0x34 - u32 pitch_chroma; ///< 0x38 - u32 luma_top_offset; ///< 0x3C - u32 luma_bot_offset; ///< 0x40 - u32 luma_frame_offset; ///< 0x44 - u32 chroma_top_offset; ///< 0x48 - u32 chroma_bot_offset; ///< 0x4C - u32 chroma_frame_offset; ///< 0x50 - u32 hist_buffer_size; ///< 0x54 - union { ///< 0x58 - union { - BitField<0, 1, u64> mbaff_frame; - BitField<1, 1, u64> direct_8x8_inference; - BitField<2, 1, u64> weighted_pred; - BitField<3, 1, u64> constrained_intra_pred; - BitField<4, 1, u64> ref_pic; - BitField<5, 1, u64> field_pic; - BitField<6, 1, u64> bottom_field; - BitField<7, 1, u64> second_field; - } flags; - BitField<8, 4, u64> log2_max_frame_num_minus4; - BitField<12, 2, u64> chroma_format_idc; - BitField<14, 2, u64> pic_order_cnt_type; - BitField<16, 6, s64> pic_init_qp_minus26; - BitField<22, 5, s64> chroma_qp_index_offset; - BitField<27, 5, s64> second_chroma_qp_index_offset; - BitField<32, 2, u64> weighted_bipred_idc; - BitField<34, 7, u64> curr_pic_idx; - BitField<41, 5, u64> curr_col_idx; - BitField<46, 16, u64> frame_number; - BitField<62, 1, u64> frame_surfaces; - BitField<63, 1, u64> output_memory_layout; - }; +struct H264ParameterSet { + s32 log2_max_pic_order_cnt_lsb_minus4; ///< 0x00 + s32 delta_pic_order_always_zero_flag; ///< 0x04 + s32 frame_mbs_only_flag; ///< 0x08 + u32 pic_width_in_mbs; ///< 0x0C + u32 frame_height_in_mbs; ///< 0x10 + union { ///< 0x14 + BitField<0, 2, u32> tile_format; + BitField<2, 3, u32> gob_height; + BitField<5, 27, u32> reserved_surface_format; }; - static_assert(sizeof(H264ParameterSet) == 0x60, "H264ParameterSet is an invalid size"); - - struct H264DecoderContext { - INSERT_PADDING_WORDS_NOINIT(18); ///< 0x0000 - u32 stream_len; ///< 0x0048 - INSERT_PADDING_WORDS_NOINIT(3); ///< 0x004C - H264ParameterSet h264_parameter_set; ///< 0x0058 - 
INSERT_PADDING_WORDS_NOINIT(66); ///< 0x00B8 - std::array weight_scale; ///< 0x01C0 - std::array weight_scale_8x8; ///< 0x0220 + u32 entropy_coding_mode_flag; ///< 0x18 + s32 pic_order_present_flag; ///< 0x1C + s32 num_refidx_l0_default_active; ///< 0x20 + s32 num_refidx_l1_default_active; ///< 0x24 + s32 deblocking_filter_control_present_flag; ///< 0x28 + s32 redundant_pic_cnt_present_flag; ///< 0x2C + u32 transform_8x8_mode_flag; ///< 0x30 + u32 pitch_luma; ///< 0x34 + u32 pitch_chroma; ///< 0x38 + Offset luma_top_offset; ///< 0x3C + Offset luma_bot_offset; ///< 0x40 + Offset luma_frame_offset; ///< 0x44 + Offset chroma_top_offset; ///< 0x48 + Offset chroma_bot_offset; ///< 0x4C + Offset chroma_frame_offset; ///< 0x50 + u32 hist_buffer_size; ///< 0x54 + union { ///< 0x58 + union { + BitField<0, 1, u64> mbaff_frame; + BitField<1, 1, u64> direct_8x8_inference; + BitField<2, 1, u64> weighted_pred; + BitField<3, 1, u64> constrained_intra_pred; + BitField<4, 1, u64> ref_pic; + BitField<5, 1, u64> field_pic; + BitField<6, 1, u64> bottom_field; + BitField<7, 1, u64> second_field; + } flags; + BitField<8, 4, u64> log2_max_frame_num_minus4; + BitField<12, 2, u64> chroma_format_idc; + BitField<14, 2, u64> pic_order_cnt_type; + BitField<16, 6, s64> pic_init_qp_minus26; + BitField<22, 5, s64> chroma_qp_index_offset; + BitField<27, 5, s64> second_chroma_qp_index_offset; + BitField<32, 2, u64> weighted_bipred_idc; + BitField<34, 7, u64> curr_pic_idx; + BitField<41, 5, u64> curr_col_idx; + BitField<46, 16, u64> frame_number; + BitField<62, 1, u64> frame_surfaces; + BitField<63, 1, u64> output_memory_layout; }; - static_assert(sizeof(H264DecoderContext) == 0x2A0, "H264DecoderContext is an invalid size"); +}; +static_assert(sizeof(H264ParameterSet) == 0x60, "H264ParameterSet is an invalid size"); #define ASSERT_POSITION(field_name, position) \ static_assert(offsetof(H264ParameterSet, field_name) == position, \ "Field " #field_name " has invalid position") - ASSERT_POSITION(log2_max_pic_order_cnt_lsb_minus4, 0x00); - ASSERT_POSITION(delta_pic_order_always_zero_flag, 0x04); - ASSERT_POSITION(frame_mbs_only_flag, 0x08); - ASSERT_POSITION(pic_width_in_mbs, 0x0C); - ASSERT_POSITION(frame_height_in_map_units, 0x10); - ASSERT_POSITION(tile_format, 0x14); - ASSERT_POSITION(entropy_coding_mode_flag, 0x18); - ASSERT_POSITION(pic_order_present_flag, 0x1C); - ASSERT_POSITION(num_refidx_l0_default_active, 0x20); - ASSERT_POSITION(num_refidx_l1_default_active, 0x24); - ASSERT_POSITION(deblocking_filter_control_present_flag, 0x28); - ASSERT_POSITION(redundant_pic_cnt_present_flag, 0x2C); - ASSERT_POSITION(transform_8x8_mode_flag, 0x30); - ASSERT_POSITION(pitch_luma, 0x34); - ASSERT_POSITION(pitch_chroma, 0x38); - ASSERT_POSITION(luma_top_offset, 0x3C); - ASSERT_POSITION(luma_bot_offset, 0x40); - ASSERT_POSITION(luma_frame_offset, 0x44); - ASSERT_POSITION(chroma_top_offset, 0x48); - ASSERT_POSITION(chroma_bot_offset, 0x4C); - ASSERT_POSITION(chroma_frame_offset, 0x50); - ASSERT_POSITION(hist_buffer_size, 0x54); - ASSERT_POSITION(flags, 0x58); +ASSERT_POSITION(log2_max_pic_order_cnt_lsb_minus4, 0x00); +ASSERT_POSITION(delta_pic_order_always_zero_flag, 0x04); +ASSERT_POSITION(frame_mbs_only_flag, 0x08); +ASSERT_POSITION(pic_width_in_mbs, 0x0C); +ASSERT_POSITION(frame_height_in_mbs, 0x10); +ASSERT_POSITION(tile_format, 0x14); +ASSERT_POSITION(entropy_coding_mode_flag, 0x18); +ASSERT_POSITION(pic_order_present_flag, 0x1C); +ASSERT_POSITION(num_refidx_l0_default_active, 0x20); +ASSERT_POSITION(num_refidx_l1_default_active, 
0x24); +ASSERT_POSITION(deblocking_filter_control_present_flag, 0x28); +ASSERT_POSITION(redundant_pic_cnt_present_flag, 0x2C); +ASSERT_POSITION(transform_8x8_mode_flag, 0x30); +ASSERT_POSITION(pitch_luma, 0x34); +ASSERT_POSITION(pitch_chroma, 0x38); +ASSERT_POSITION(luma_top_offset, 0x3C); +ASSERT_POSITION(luma_bot_offset, 0x40); +ASSERT_POSITION(luma_frame_offset, 0x44); +ASSERT_POSITION(chroma_top_offset, 0x48); +ASSERT_POSITION(chroma_bot_offset, 0x4C); +ASSERT_POSITION(chroma_frame_offset, 0x50); +ASSERT_POSITION(hist_buffer_size, 0x54); +ASSERT_POSITION(flags, 0x58); #undef ASSERT_POSITION +struct DpbEntry { + union { + BitField<0, 7, u32> index; + BitField<7, 5, u32> col_idx; + BitField<12, 2, u32> state; + BitField<14, 1, u32> is_long_term; + BitField<15, 1, u32> non_existing; + BitField<16, 1, u32> is_field; + BitField<17, 4, u32> top_field_marking; + BitField<21, 4, u32> bottom_field_marking; + BitField<25, 1, u32> output_memory_layout; + BitField<26, 6, u32> reserved; + } flags; + std::array field_order_cnt; + u32 frame_idx; +}; +static_assert(sizeof(DpbEntry) == 0x10, "DpbEntry has the wrong size!"); + +struct DisplayParam { + union { + BitField<0, 1, u32> enable_tf_output; + BitField<1, 1, u32> vc1_map_y_flag; + BitField<2, 3, u32> map_y_value; + BitField<5, 1, u32> vc1_map_uv_flag; + BitField<6, 3, u32> map_uv_value; + BitField<9, 8, u32> out_stride; + BitField<17, 3, u32> tiling_format; + BitField<20, 1, u32> output_structure; // 0=frame, 1=field + BitField<21, 11, u32> reserved0; + }; + std::array output_top; + std::array output_bottom; + union { + BitField<0, 1, u32> enable_histogram; + BitField<1, 12, u32> histogram_start_x; + BitField<13, 12, u32> histogram_start_y; + BitField<25, 7, u32> reserved1; + }; + union { + BitField<0, 12, u32> histogram_end_x; + BitField<12, 12, u32> histogram_end_y; + BitField<24, 8, u32> reserved2; + }; +}; +static_assert(sizeof(DisplayParam) == 0x1C, "DisplayParam has the wrong size!"); + +struct H264DecoderContext { + INSERT_PADDING_WORDS_NOINIT(13); ///< 0x0000 + std::array eos; ///< 0x0034 + u8 explicit_eos_present_flag; ///< 0x0044 + u8 hint_dump_en; ///< 0x0045 + INSERT_PADDING_BYTES_NOINIT(2); ///< 0x0046 + u32 stream_len; ///< 0x0048 + u32 slice_count; ///< 0x004C + u32 mbhist_buffer_size; ///< 0x0050 + u32 gptimer_timeout_value; ///< 0x0054 + H264ParameterSet h264_parameter_set; ///< 0x0058 + std::array curr_field_order_cnt; ///< 0x00B8 + std::array dpb; ///< 0x00C0 + std::array weight_scale_4x4; ///< 0x01C0 + std::array weight_scale_8x8; ///< 0x0220 + std::array num_inter_view_refs_lX; ///< 0x02A0 + std::array reserved2; ///< 0x02A2 + std::array, 2> inter_view_refidx_lX; ///< 0x02B0 + union { ///< 0x02D0 + BitField<0, 1, u32> lossless_ipred8x8_filter_enable; + BitField<1, 1, u32> qpprime_y_zero_transform_bypass_flag; + BitField<2, 30, u32> reserved3; + }; + DisplayParam display_param; ///< 0x02D4 + std::array reserved4; ///< 0x02F0 +}; +static_assert(sizeof(H264DecoderContext) == 0x2FC, "H264DecoderContext is an invalid size"); + #define ASSERT_POSITION(field_name, position) \ static_assert(offsetof(H264DecoderContext, field_name) == position, \ "Field " #field_name " has invalid position") - ASSERT_POSITION(stream_len, 0x48); - ASSERT_POSITION(h264_parameter_set, 0x58); - ASSERT_POSITION(weight_scale, 0x1C0); +ASSERT_POSITION(stream_len, 0x48); +ASSERT_POSITION(h264_parameter_set, 0x58); +ASSERT_POSITION(dpb, 0xC0); +ASSERT_POSITION(weight_scale_4x4, 0x1C0); #undef ASSERT_POSITION + +class H264 final : public Decoder { +public: + 
explicit H264(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id, + Host1x::FrameQueue& frame_queue); + ~H264() override; + + H264(const H264&) = delete; + H264& operator=(const H264&) = delete; + + H264(H264&&) = delete; + H264& operator=(H264&&) = delete; + + /// Compose the H264 frame for FFmpeg decoding + [[nodiscard]] std::span<const u8> ComposeFrame() override; + + std::tuple<u64, u64> GetProgressiveOffsets() override; + std::tuple<u64, u64, u64, u64> GetInterlacedOffsets() override; + bool IsInterlaced() override; + + std::string_view GetCurrentCodecName() const override { + return "H264"; + } + +private: + bool is_first_frame{true}; + Common::ScratchBuffer<u8> frame_scratch; + Common::ScratchBuffer<u8> scan_scratch; + H264DecoderContext current_context{}; }; -} // namespace Decoder +} // namespace Decoders } // namespace Tegra diff --git a/src/video_core/host1x/codecs/vp8.cpp b/src/video_core/host1x/codecs/vp8.cpp index e8b9bf6bf..94c6ab7f0 100755 --- a/src/video_core/host1x/codecs/vp8.cpp +++ b/src/video_core/host1x/codecs/vp8.cpp @@ -7,47 +7,70 @@ #include "video_core/host1x/host1x.h" #include "video_core/memory_manager.h" -namespace Tegra::Decoder { -VP8::VP8(Host1x::Host1x& host1x_) : host1x{host1x_} {} +namespace Tegra::Decoders { +VP8::VP8(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_, + Host1x::FrameQueue& frame_queue_) + : Decoder{host1x_, id_, regs_, frame_queue_} { + codec = Host1x::NvdecCommon::VideoCodec::VP8; + initialized = decode_api.Initialize(codec); +} VP8::~VP8() = default; -std::span<const u8> VP8::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) { - VP8PictureInfo info; - host1x.GMMU().ReadBlock(state.picture_info_offset, &info, sizeof(VP8PictureInfo)); +std::tuple<u64, u64> VP8::GetProgressiveOffsets() { + auto luma{regs.surface_luma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()}; + auto chroma{regs.surface_chroma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()}; + return {luma, chroma}; +} - const bool is_key_frame = info.key_frame == 1u; - const auto bitstream_size = static_cast<size_t>(info.vld_buffer_size); +std::tuple<u64, u64, u64, u64> VP8::GetInterlacedOffsets() { + auto luma_top{regs.surface_luma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()}; + auto luma_bottom{ + regs.surface_luma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()}; + auto chroma_top{ + regs.surface_chroma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()}; + auto chroma_bottom{ + regs.surface_chroma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()}; + return {luma_top, luma_bottom, chroma_top, chroma_bottom}; +} + +std::span<const u8> VP8::ComposeFrame() { + memory_manager.ReadBlock(regs.picture_info_offset.Address(), &current_context, + sizeof(VP8PictureInfo)); + + const bool is_key_frame = current_context.key_frame == 1u; + const auto bitstream_size = static_cast<size_t>(current_context.vld_buffer_size); const size_t header_size = is_key_frame ? 10u : 3u; - frame.resize(header_size + bitstream_size); + frame_scratch.resize(header_size + bitstream_size); // Based on page 30 of the VP8 specification. // https://datatracker.ietf.org/doc/rfc6386/ - frame[0] = is_key_frame ? 0u : 1u; // 1-bit frame type (0: keyframe, 1: interframes). - frame[0] |= static_cast<u8>((info.version & 7u) << 1u); // 3-bit version number - frame[0] |= static_cast<u8>(1u << 4u); // 1-bit show_frame flag + frame_scratch[0] = is_key_frame ? 0u : 1u; // 1-bit frame type (0: keyframe, 1: interframes). 
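// For reference, the 3-byte VP8 frame tag assembled here packs, LSB first:
//   bit 0      frame type (0 = key frame, 1 = inter frame)
//   bits 1-3   version
//   bit 4      show_frame
//   bits 5-23  first partition size (19 bits, spread over bytes 0-2)
// Key frames are then followed by the start code 9D 01 2A and two 16-bit fields,
// (2-bit horizontal scale << 14) | width and (2-bit vertical scale << 14) | height,
// which is what the key-frame branch below emits.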
+ frame_scratch[0] |= + static_cast((current_context.version & 7u) << 1u); // 3-bit version number + frame_scratch[0] |= static_cast(1u << 4u); // 1-bit show_frame flag // The next 19-bits are the first partition size - frame[0] |= static_cast((info.first_part_size & 7u) << 5u); - frame[1] = static_cast((info.first_part_size & 0x7f8u) >> 3u); - frame[2] = static_cast((info.first_part_size & 0x7f800u) >> 11u); + frame_scratch[0] |= static_cast((current_context.first_part_size & 7u) << 5u); + frame_scratch[1] = static_cast((current_context.first_part_size & 0x7f8u) >> 3u); + frame_scratch[2] = static_cast((current_context.first_part_size & 0x7f800u) >> 11u); if (is_key_frame) { - frame[3] = 0x9du; - frame[4] = 0x01u; - frame[5] = 0x2au; + frame_scratch[3] = 0x9du; + frame_scratch[4] = 0x01u; + frame_scratch[5] = 0x2au; // TODO(ameerj): Horizontal/Vertical Scale // 16 bits: (2 bits Horizontal Scale << 14) | Width (14 bits) - frame[6] = static_cast(info.frame_width & 0xff); - frame[7] = static_cast(((info.frame_width >> 8) & 0x3f)); + frame_scratch[6] = static_cast(current_context.frame_width & 0xff); + frame_scratch[7] = static_cast(((current_context.frame_width >> 8) & 0x3f)); // 16 bits:(2 bits Vertical Scale << 14) | Height (14 bits) - frame[8] = static_cast(info.frame_height & 0xff); - frame[9] = static_cast(((info.frame_height >> 8) & 0x3f)); + frame_scratch[8] = static_cast(current_context.frame_height & 0xff); + frame_scratch[9] = static_cast(((current_context.frame_height >> 8) & 0x3f)); } - const u64 bitstream_offset = state.frame_bitstream_offset; - host1x.GMMU().ReadBlock(bitstream_offset, frame.data() + header_size, bitstream_size); + const u64 bitstream_offset = regs.frame_bitstream_offset.Address(); + memory_manager.ReadBlock(bitstream_offset, frame_scratch.data() + header_size, bitstream_size); - return frame; + return frame_scratch; } -} // namespace Tegra::Decoder +} // namespace Tegra::Decoders diff --git a/src/video_core/host1x/codecs/vp8.h b/src/video_core/host1x/codecs/vp8.h index 36a7e08e0..0ab48ca37 100755 --- a/src/video_core/host1x/codecs/vp8.h +++ b/src/video_core/host1x/codecs/vp8.h @@ -9,6 +9,7 @@ #include "common/common_funcs.h" #include "common/common_types.h" #include "common/scratch_buffer.h" +#include "video_core/host1x/codecs/decoder.h" #include "video_core/host1x/nvdec_common.h" namespace Tegra { @@ -17,20 +18,41 @@ namespace Host1x { class Host1x; } // namespace Host1x -namespace Decoder { +namespace Decoders { +enum class Vp8SurfaceIndex : u32 { + Last = 0, + Golden = 1, + AltRef = 2, + Current = 3, +}; -class VP8 { +class VP8 final : public Decoder { public: - explicit VP8(Host1x::Host1x& host1x); - ~VP8(); + explicit VP8(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id, + Host1x::FrameQueue& frame_queue); + ~VP8() override; - /// Compose the VP8 frame for FFmpeg decoding - [[nodiscard]] std::span ComposeFrame( - const Host1x::NvdecCommon::NvdecRegisters& state); + VP8(const VP8&) = delete; + VP8& operator=(const VP8&) = delete; + + VP8(VP8&&) = delete; + VP8& operator=(VP8&&) = delete; + + [[nodiscard]] std::span ComposeFrame() override; + + std::tuple GetProgressiveOffsets() override; + std::tuple GetInterlacedOffsets() override; + + bool IsInterlaced() override { + return false; + } + + std::string_view GetCurrentCodecName() const override { + return "VP8"; + } private: - Common::ScratchBuffer frame; - Host1x::Host1x& host1x; + Common::ScratchBuffer frame_scratch; struct VP8PictureInfo { INSERT_PADDING_WORDS_NOINIT(14); 
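The decoder contexts in this change (H264DecoderContext, VP8PictureInfo, the VP9 PictureInfo and EntropyProbs) all follow the same pattern: a trivially-copyable struct that mirrors the hardware layout is read straight out of guest memory and size-checked with static_assert. A minimal, self-contained sketch of that pattern follows; the struct name, fields and ReadPictureInfo helper are illustrative assumptions, not part of the codebase:

    #include <cstdint>

    struct ExamplePictureInfo {
        std::uint32_t key_frame;       // fields mirror the guest ABI one-to-one
        std::uint32_t version;
        std::uint32_t first_part_size;
        std::uint32_t vld_buffer_size;
    };
    static_assert(sizeof(ExamplePictureInfo) == 0x10, "ExamplePictureInfo has the wrong size!");

    // MemoryManager is any type exposing ReadBlock(address, void*, size),
    // e.g. Tegra::MemoryManager in this codebase.
    template <typename MemoryManager>
    ExamplePictureInfo ReadPictureInfo(MemoryManager& memory_manager, std::uint64_t addr) {
        ExamplePictureInfo info{};
        memory_manager.ReadBlock(addr, &info, sizeof(info));
        return info;
    }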
@@ -73,7 +95,9 @@ private: INSERT_PADDING_WORDS_NOINIT(3); }; static_assert(sizeof(VP8PictureInfo) == 0xc0, "PictureInfo is an invalid size"); + + VP8PictureInfo current_context{}; }; -} // namespace Decoder +} // namespace Decoders } // namespace Tegra diff --git a/src/video_core/host1x/codecs/vp9.cpp b/src/video_core/host1x/codecs/vp9.cpp index 9701b867b..e286a199a 100755 --- a/src/video_core/host1x/codecs/vp9.cpp +++ b/src/video_core/host1x/codecs/vp9.cpp @@ -4,12 +4,13 @@ #include // for std::copy #include +#include "common/alignment.h" #include "common/assert.h" #include "video_core/host1x/codecs/vp9.h" #include "video_core/host1x/host1x.h" #include "video_core/memory_manager.h" -namespace Tegra::Decoder { +namespace Tegra::Decoders { namespace { constexpr u32 diff_update_probability = 252; constexpr u32 frame_sync_code = 0x498342; @@ -237,7 +238,12 @@ constexpr std::array map_lut{ } } // Anonymous namespace -VP9::VP9(Host1x::Host1x& host1x_) : host1x{host1x_} {} +VP9::VP9(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_, + Host1x::FrameQueue& frame_queue_) + : Decoder{host1x_, id_, regs_, frame_queue_} { + codec = Host1x::NvdecCommon::VideoCodec::VP9; + initialized = decode_api.Initialize(codec); +} VP9::~VP9() = default; @@ -356,35 +362,113 @@ void VP9::WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_ } } -Vp9PictureInfo VP9::GetVp9PictureInfo(const Host1x::NvdecCommon::NvdecRegisters& state) { - PictureInfo picture_info; - host1x.GMMU().ReadBlock(state.picture_info_offset, &picture_info, sizeof(PictureInfo)); - Vp9PictureInfo vp9_info = picture_info.Convert(); +void VP9::WriteSegmentation(VpxBitStreamWriter& writer) { + bool enabled = current_picture_info.segmentation.enabled != 0; + writer.WriteBit(enabled); + if (!enabled) { + return; + } - InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy); + auto update_map = current_picture_info.segmentation.update_map != 0; + writer.WriteBit(update_map); + + if (update_map) { + EntropyProbs entropy_probs{}; + memory_manager.ReadBlock(regs.vp9_prob_tab_buffer_offset.Address(), &entropy_probs, + sizeof(entropy_probs)); + + auto WriteProb = [&](u8 prob) { + bool coded = prob != 255; + writer.WriteBit(coded); + if (coded) { + writer.WriteU(prob, 8); + } + }; + + for (size_t i = 0; i < entropy_probs.mb_segment_tree_probs.size(); i++) { + WriteProb(entropy_probs.mb_segment_tree_probs[i]); + } + + auto temporal_update = current_picture_info.segmentation.temporal_update != 0; + writer.WriteBit(temporal_update); + + if (temporal_update) { + for (s32 i = 0; i < 3; i++) { + WriteProb(entropy_probs.segment_pred_probs[i]); + } + } + } + + if (last_segmentation == current_picture_info.segmentation) { + writer.WriteBit(false); + return; + } + + last_segmentation = current_picture_info.segmentation; + writer.WriteBit(true); + writer.WriteBit(current_picture_info.segmentation.abs_delta != 0); + + constexpr s32 MAX_SEGMENTS = 8; + constexpr std::array SegmentationFeatureBits = {8, 6, 2, 0}; + + for (s32 i = 0; i < MAX_SEGMENTS; i++) { + auto q_enabled = current_picture_info.segmentation.feature_enabled[i][0] != 0; + writer.WriteBit(q_enabled); + if (q_enabled) { + writer.WriteS(current_picture_info.segmentation.feature_data[i][0], + SegmentationFeatureBits[0]); + } + + auto lf_enabled = current_picture_info.segmentation.feature_enabled[i][1] != 0; + writer.WriteBit(lf_enabled); + if (lf_enabled) { + writer.WriteS(current_picture_info.segmentation.feature_data[i][1], + 
SegmentationFeatureBits[1]); + } + + auto ref_enabled = current_picture_info.segmentation.feature_enabled[i][2] != 0; + writer.WriteBit(ref_enabled); + if (ref_enabled) { + writer.WriteU(current_picture_info.segmentation.feature_data[i][2], + SegmentationFeatureBits[2]); + } + + auto skip_enabled = current_picture_info.segmentation.feature_enabled[i][3] != 0; + writer.WriteBit(skip_enabled); + } +} + +Vp9PictureInfo VP9::GetVp9PictureInfo() { + memory_manager.ReadBlock(regs.picture_info_offset.Address(), ¤t_picture_info, + sizeof(PictureInfo)); + Vp9PictureInfo vp9_info = current_picture_info.Convert(); + + InsertEntropy(regs.vp9_prob_tab_buffer_offset.Address(), vp9_info.entropy); // surface_luma_offset[0:3] contains the address of the reference frame offsets in the following // order: last, golden, altref, current. - std::copy(state.surface_luma_offset.begin(), state.surface_luma_offset.begin() + 4, - vp9_info.frame_offsets.begin()); + for (size_t i = 0; i < 4; i++) { + vp9_info.frame_offsets[i] = regs.surface_luma_offsets[i].Address(); + } return vp9_info; } void VP9::InsertEntropy(u64 offset, Vp9EntropyProbs& dst) { EntropyProbs entropy; - host1x.GMMU().ReadBlock(offset, &entropy, sizeof(EntropyProbs)); + memory_manager.ReadBlock(offset, &entropy, sizeof(EntropyProbs)); entropy.Convert(dst); } -Vp9FrameContainer VP9::GetCurrentFrame(const Host1x::NvdecCommon::NvdecRegisters& state) { +Vp9FrameContainer VP9::GetCurrentFrame() { Vp9FrameContainer current_frame{}; { // gpu.SyncGuestHost(); epic, why? - current_frame.info = GetVp9PictureInfo(state); + current_frame.info = GetVp9PictureInfo(); current_frame.bit_stream.resize(current_frame.info.bitstream_size); - host1x.GMMU().ReadBlock(state.frame_bitstream_offset, current_frame.bit_stream.data(), - current_frame.info.bitstream_size); + memory_manager.ReadBlock(regs.frame_bitstream_offset.Address(), + current_frame.bit_stream.data(), + current_frame.info.bitstream_size); } if (!next_frame.bit_stream.empty()) { Vp9FrameContainer temp{ @@ -742,8 +826,7 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() { uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q); uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q); - ASSERT(!current_frame_info.segment_enabled); - uncomp_writer.WriteBit(false); // Segmentation enabled (TODO). 
+ WriteSegmentation(uncomp_writer); const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width); const s32 max_tile_cols_log2 = CalcMaxLog2TileCols(current_frame_info.frame_size.width); @@ -770,10 +853,29 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() { return uncomp_writer; } -void VP9::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) { +std::tuple VP9::GetProgressiveOffsets() { + auto luma{regs.surface_luma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; + auto chroma{regs.surface_chroma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; + return {luma, chroma}; +} + +std::tuple VP9::GetInterlacedOffsets() { + auto luma_top{regs.surface_luma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; + auto luma_bottom{ + regs.surface_luma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; + auto chroma_top{ + regs.surface_chroma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; + auto chroma_bottom{ + regs.surface_chroma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; + return {luma_top, luma_bottom, chroma_top, chroma_bottom}; +} + +std::span VP9::ComposeFrame() { + vp9_hidden_frame = false; + std::vector bitstream; { - Vp9FrameContainer curr_frame = GetCurrentFrame(state); + Vp9FrameContainer curr_frame = GetCurrentFrame(); current_frame_info = curr_frame.info; bitstream = std::move(curr_frame.bit_stream); } @@ -786,12 +888,16 @@ void VP9::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) { std::vector uncompressed_header = uncomp_writer.GetByteArray(); // Write headers and frame to buffer - frame.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size()); - std::copy(uncompressed_header.begin(), uncompressed_header.end(), frame.begin()); + frame_scratch.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size()); + std::copy(uncompressed_header.begin(), uncompressed_header.end(), frame_scratch.begin()); std::copy(compressed_header.begin(), compressed_header.end(), - frame.begin() + uncompressed_header.size()); + frame_scratch.begin() + uncompressed_header.size()); std::copy(bitstream.begin(), bitstream.end(), - frame.begin() + uncompressed_header.size() + compressed_header.size()); + frame_scratch.begin() + uncompressed_header.size() + compressed_header.size()); + + vp9_hidden_frame = WasFrameHidden(); + + return GetFrameBytes(); } VpxRangeEncoder::VpxRangeEncoder() { @@ -944,4 +1050,4 @@ const std::vector& VpxBitStreamWriter::GetByteArray() const { return byte_array; } -} // namespace Tegra::Decoder +} // namespace Tegra::Decoders diff --git a/src/video_core/host1x/codecs/vp9.h b/src/video_core/host1x/codecs/vp9.h index 23abf4a33..b9f4081dc 100755 --- a/src/video_core/host1x/codecs/vp9.h +++ b/src/video_core/host1x/codecs/vp9.h @@ -10,6 +10,7 @@ #include "common/common_types.h" #include "common/scratch_buffer.h" #include "common/stream.h" +#include "video_core/host1x/codecs/decoder.h" #include "video_core/host1x/codecs/vp9_types.h" #include "video_core/host1x/nvdec_common.h" @@ -19,7 +20,7 @@ namespace Host1x { class Host1x; } // namespace Host1x -namespace Decoder { +namespace Decoders { /// The VpxRangeEncoder, and VpxBitStreamWriter classes are used to compose the /// VP9 header bitstreams. 
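The WriteBit/WriteU calls used above when composing the uncompressed header pack fixed-width fields most-significant-bit first, zero-padding the final byte. A stripped-down sketch of that write discipline (this is not the project's VpxBitStreamWriter, only an illustration of the packing):

    #include <cstdint>
    #include <vector>

    class TinyBitWriter {
    public:
        void WriteBit(bool bit) { WriteU(bit ? 1u : 0u, 1); }

        // Writes the low `bits` bits of `value`, MSB first.
        void WriteU(std::uint32_t value, int bits) {
            for (int i = bits - 1; i >= 0; --i) {
                Put((value >> i) & 1u);
            }
        }

        std::vector<std::uint8_t> Finish() {
            if (bit_pos != 0) {
                // Left-align the remaining bits and pad the last byte with zeros.
                bytes.push_back(static_cast<std::uint8_t>(current << (8 - bit_pos)));
            }
            return bytes;
        }

    private:
        void Put(std::uint32_t bit) {
            current = static_cast<std::uint8_t>((current << 1) | bit);
            if (++bit_pos == 8) {
                bytes.push_back(current);
                current = 0;
                bit_pos = 0;
            }
        }

        std::vector<std::uint8_t> bytes;
        std::uint8_t current = 0;
        int bit_pos = 0;
    };

    // Example: the VP9 two-bit frame marker (value 2) followed by two profile bits
    // would be emitted as writer.WriteU(2, 2); writer.WriteBit(false); writer.WriteBit(false);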
@@ -110,21 +111,32 @@ private: std::vector byte_array; }; -class VP9 { +class VP9 final : public Decoder { public: - explicit VP9(Host1x::Host1x& host1x); - ~VP9(); + explicit VP9(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id, + Host1x::FrameQueue& frame_queue); + ~VP9() override; VP9(const VP9&) = delete; VP9& operator=(const VP9&) = delete; - VP9(VP9&&) = default; + VP9(VP9&&) = delete; VP9& operator=(VP9&&) = delete; - /// Composes the VP9 frame from the GPU state information. - /// Based on the official VP9 spec documentation - void ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state); + [[nodiscard]] std::span ComposeFrame() override; + std::tuple GetProgressiveOffsets() override; + std::tuple GetInterlacedOffsets() override; + + bool IsInterlaced() override { + return false; + } + + std::string_view GetCurrentCodecName() const override { + return "VP9"; + } + +private: /// Returns true if the most recent frame was a hidden frame. [[nodiscard]] bool WasFrameHidden() const { return !current_frame_info.show_frame; @@ -132,10 +144,9 @@ public: /// Returns a const span to the composed frame data. [[nodiscard]] std::span GetFrameBytes() const { - return frame; + return frame_scratch; } -private: /// Generates compressed header probability updates in the bitstream writer template void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array& new_prob, @@ -167,23 +178,22 @@ private: /// Write motion vector probability updates. 6.3.17 in the spec void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); + void WriteSegmentation(VpxBitStreamWriter& writer); + /// Returns VP9 information from NVDEC provided offset and size - [[nodiscard]] Vp9PictureInfo GetVp9PictureInfo( - const Host1x::NvdecCommon::NvdecRegisters& state); + [[nodiscard]] Vp9PictureInfo GetVp9PictureInfo(); /// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct void InsertEntropy(u64 offset, Vp9EntropyProbs& dst); /// Returns frame to be decoded after buffering - [[nodiscard]] Vp9FrameContainer GetCurrentFrame( - const Host1x::NvdecCommon::NvdecRegisters& state); + [[nodiscard]] Vp9FrameContainer GetCurrentFrame(); /// Use NVDEC providied information to compose the headers for the current frame [[nodiscard]] std::vector ComposeCompressedHeader(); [[nodiscard]] VpxBitStreamWriter ComposeUncompressedHeader(); - Host1x::Host1x& host1x; - Common::ScratchBuffer frame; + Common::ScratchBuffer frame_scratch; std::array loop_filter_ref_deltas{}; std::array loop_filter_mode_deltas{}; @@ -192,9 +202,11 @@ private: std::array frame_ctxs{}; bool swap_ref_indices{}; + Segmentation last_segmentation{}; + PictureInfo current_picture_info{}; Vp9PictureInfo current_frame_info{}; Vp9EntropyProbs prev_frame_probs{}; }; -} // namespace Decoder +} // namespace Decoders } // namespace Tegra diff --git a/src/video_core/host1x/codecs/vp9_types.h b/src/video_core/host1x/codecs/vp9_types.h index 591fe73de..dad6e8437 100755 --- a/src/video_core/host1x/codecs/vp9_types.h +++ b/src/video_core/host1x/codecs/vp9_types.h @@ -11,7 +11,14 @@ namespace Tegra { -namespace Decoder { +namespace Decoders { +enum class Vp9SurfaceIndex : u32 { + Last = 0, + Golden = 1, + AltRef = 2, + Current = 3, +}; + struct Vp9FrameDimensions { s16 width; s16 height; @@ -48,11 +55,13 @@ enum class TxMode { }; struct Segmentation { + constexpr bool operator==(const Segmentation& rhs) const = default; + u8 enabled; u8 update_map; u8 temporal_update; u8 abs_delta; - std::array 
feature_mask; + std::array, 8> feature_enabled; std::array, 8> feature_data; }; static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size"); @@ -190,7 +199,17 @@ struct PictureInfo { static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size"); struct EntropyProbs { - INSERT_PADDING_BYTES_NOINIT(1024); ///< 0x0000 + std::array kf_bmode_prob; ///< 0x0000 + std::array kf_bmode_probB; ///< 0x0320 + std::array ref_pred_probs; ///< 0x0384 + std::array mb_segment_tree_probs; ///< 0x0387 + std::array segment_pred_probs; ///< 0x038E + std::array ref_scores; ///< 0x0391 + std::array prob_comppred; ///< 0x0395 + INSERT_PADDING_BYTES_NOINIT(9); ///< 0x0397 + std::array kf_uv_mode_prob; ///< 0x03A0 + std::array kf_uv_mode_probB; ///< 0x03F0 + INSERT_PADDING_BYTES_NOINIT(6); ///< 0x03FA std::array inter_mode_prob; ///< 0x0400 std::array intra_inter_prob; ///< 0x041C INSERT_PADDING_BYTES_NOINIT(80); ///< 0x0420 @@ -302,5 +321,5 @@ ASSERT_POSITION(class_0_fr, 0x560); ASSERT_POSITION(coef_probs, 0x5A0); #undef ASSERT_POSITION -}; // namespace Decoder +}; // namespace Decoders }; // namespace Tegra diff --git a/src/video_core/host1x/control.cpp b/src/video_core/host1x/control.cpp index 1406bd41e..b931e854d 100755 --- a/src/video_core/host1x/control.cpp +++ b/src/video_core/host1x/control.cpp @@ -27,6 +27,7 @@ void Control::ProcessMethod(Method method, u32 argument) { } void Control::Execute(u32 data) { + LOG_TRACE(Service_NVDRV, "Control wait syncpt {} value {}", data, syncpoint_value); host1x.GetSyncpointManager().WaitHost(data, syncpoint_value); } diff --git a/src/video_core/host1x/control.h b/src/video_core/host1x/control.h index d76da5ecf..41dfe8ca0 100755 --- a/src/video_core/host1x/control.h +++ b/src/video_core/host1x/control.h @@ -6,9 +6,7 @@ #include "common/common_types.h" -namespace Tegra { - -namespace Host1x { +namespace Tegra::Host1x { class Host1x; class Nvdec; @@ -31,10 +29,8 @@ private: /// For Host1x, execute is waiting on a syncpoint previously written into the state void Execute(u32 data); - u32 syncpoint_value{}; Host1x& host1x; + u32 syncpoint_value{}; }; -} // namespace Host1x - -} // namespace Tegra +} // namespace Tegra::Host1x diff --git a/src/video_core/host1x/ffmpeg/ffmpeg.cpp b/src/video_core/host1x/ffmpeg/ffmpeg.cpp index 96686da59..c80768ca3 100755 --- a/src/video_core/host1x/ffmpeg/ffmpeg.cpp +++ b/src/video_core/host1x/ffmpeg/ffmpeg.cpp @@ -5,7 +5,9 @@ #include "common/logging/log.h" #include "common/scope_exit.h" #include "common/settings.h" +#include "core/memory.h" #include "video_core/host1x/ffmpeg/ffmpeg.h" +#include "video_core/memory_manager.h" extern "C" { #ifdef LIBVA_FOUND @@ -132,7 +134,7 @@ bool HardwareContext::InitializeForDecoder(DecoderContext& decoder_context, const Decoder& decoder) { const auto supported_types = GetSupportedDeviceTypes(); for (const auto type : PreferredGpuDecoders) { - AVPixelFormat hw_pix_fmt; + // AVPixelFormat hw_pix_fmt; if (std::ranges::find(supported_types, type) == supported_types.end()) { LOG_DEBUG(HW_GPU, "{} explicitly unsupported", av_hwdevice_get_type_name(type)); @@ -143,12 +145,14 @@ bool HardwareContext::InitializeForDecoder(DecoderContext& decoder_context, continue; } - if (decoder.SupportsDecodingOnDevice(&hw_pix_fmt, type)) { - decoder_context.InitializeHardwareDecoder(*this, hw_pix_fmt); - return true; - } + // Disable GPU decoding as it cannot return decode frame ordering which breaks everything. 
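For context on the GetSupportedDeviceTypes() call above: libavutil exposes the hardware device types a given build supports through av_hwdevice_iterate_types. A small standalone sketch of that enumeration (illustration only, not yuzu code):

extern "C" {
#include <libavutil/hwcontext.h>
}

#include <cstdio>

int main() {
    // Walk every hardware device type this libavutil build was compiled with.
    for (AVHWDeviceType type = av_hwdevice_iterate_types(AV_HWDEVICE_TYPE_NONE);
         type != AV_HWDEVICE_TYPE_NONE; type = av_hwdevice_iterate_types(type)) {
        std::printf("supported hwdevice: %s\n", av_hwdevice_get_type_name(type));
    }
    return 0;
}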
+ // if (decoder.SupportsDecodingOnDevice(&hw_pix_fmt, type)) { + // decoder_context.InitializeHardwareDecoder(*this, hw_pix_fmt); + // return true; + //} } + LOG_INFO(HW_GPU, "Hardware decoding is disabled due to implementation issues, using CPU."); return false; } @@ -183,8 +187,8 @@ bool HardwareContext::InitializeWithType(AVHWDeviceType type) { return true; } -DecoderContext::DecoderContext(const Decoder& decoder) { - m_codec_context = avcodec_alloc_context3(decoder.GetCodec()); +DecoderContext::DecoderContext(const Decoder& decoder) : m_decoder{decoder} { + m_codec_context = avcodec_alloc_context3(m_decoder.GetCodec()); av_opt_set(m_codec_context->priv_data, "tune", "zerolatency", 0); m_codec_context->thread_count = 0; m_codec_context->thread_type &= ~FF_THREAD_FRAME; @@ -216,6 +220,25 @@ bool DecoderContext::OpenContext(const Decoder& decoder) { } bool DecoderContext::SendPacket(const Packet& packet) { + m_temp_frame = std::make_shared(); + m_got_frame = 0; + +// Android can randomly crash when calling decode directly, so skip. +// TODO update ffmpeg and hope that fixes it. +#ifndef ANDROID + if (!m_codec_context->hw_device_ctx && m_codec_context->codec_id == AV_CODEC_ID_H264) { + m_decode_order = true; + auto* codec{ffcodec(m_decoder.GetCodec())}; + if (const int ret = codec->cb.decode(m_codec_context, m_temp_frame->GetFrame(), + &m_got_frame, packet.GetPacket()); + ret < 0) { + LOG_DEBUG(Service_NVDRV, "avcodec_send_packet error {}", AVError(ret)); + return false; + } + return true; + } +#endif + if (const int ret = avcodec_send_packet(m_codec_context, packet.GetPacket()); ret < 0) { LOG_ERROR(HW_GPU, "avcodec_send_packet error: {}", AVError(ret)); return false; @@ -224,139 +247,73 @@ bool DecoderContext::SendPacket(const Packet& packet) { return true; } -std::unique_ptr DecoderContext::ReceiveFrame(bool* out_is_interlaced) { - auto dst_frame = std::make_unique(); +std::shared_ptr DecoderContext::ReceiveFrame() { + // Android can randomly crash when calling decode directly, so skip. + // TODO update ffmpeg and hope that fixes it. +#ifndef ANDROID + if (!m_codec_context->hw_device_ctx && m_codec_context->codec_id == AV_CODEC_ID_H264) { + m_decode_order = true; + auto* codec{ffcodec(m_decoder.GetCodec())}; + int ret{0}; - const auto ReceiveImpl = [&](AVFrame* frame) { - if (const int ret = avcodec_receive_frame(m_codec_context, frame); ret < 0) { - LOG_ERROR(HW_GPU, "avcodec_receive_frame error: {}", AVError(ret)); - return false; + if (m_got_frame == 0) { + Packet packet{{}}; + auto* pkt = packet.GetPacket(); + pkt->data = nullptr; + pkt->size = 0; + ret = codec->cb.decode(m_codec_context, m_temp_frame->GetFrame(), &m_got_frame, pkt); + m_codec_context->has_b_frames = 0; } - *out_is_interlaced = -#if defined(FF_API_INTERLACED_FRAME) || LIBAVUTIL_VERSION_MAJOR >= 59 - (frame->flags & AV_FRAME_FLAG_INTERLACED) != 0; -#else - frame->interlaced_frame != 0; + if (m_got_frame == 0 || ret < 0) { + LOG_ERROR(Service_NVDRV, "Failed to receive a frame! error {}", ret); + return {}; + } + } else #endif - return true; - }; + { - if (m_codec_context->hw_device_ctx) { - // If we have a hardware context, make a separate frame here to receive the - // hardware result before sending it to the output. 
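The non-Android fallback below still relies on the conventional libavcodec send/receive pair. As a reference, a minimal version of that loop, with a hypothetical on_frame callback standing in for whatever consumes the frames:

extern "C" {
#include <libavcodec/avcodec.h>
}

// ctx and pkt are assumed to be an opened decoder context and a demuxed
// packet; on_frame is a hypothetical callback that consumes each frame.
template <typename F>
int DecodePacket(AVCodecContext* ctx, const AVPacket* pkt, F&& on_frame) {
    int ret = avcodec_send_packet(ctx, pkt);
    if (ret < 0) {
        return ret; // packet rejected
    }
    AVFrame* frame = av_frame_alloc();
    while (true) {
        ret = avcodec_receive_frame(ctx, frame);
        if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
            ret = 0; // decoder needs more input or is fully drained
            break;
        }
        if (ret < 0) {
            break; // real decode error
        }
        on_frame(frame); // hand the decoded frame to the caller
    }
    av_frame_free(&frame);
    return ret;
}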
- Frame intermediate_frame; + const auto ReceiveImpl = [&](AVFrame* frame) { + if (const int ret = avcodec_receive_frame(m_codec_context, frame); ret < 0) { + LOG_ERROR(HW_GPU, "avcodec_receive_frame error: {}", AVError(ret)); + return false; + } - if (!ReceiveImpl(intermediate_frame.GetFrame())) { - return {}; - } + return true; + }; - dst_frame->SetFormat(PreferredGpuFormat); - if (const int ret = - av_hwframe_transfer_data(dst_frame->GetFrame(), intermediate_frame.GetFrame(), 0); - ret < 0) { - LOG_ERROR(HW_GPU, "av_hwframe_transfer_data error: {}", AVError(ret)); - return {}; - } - } else { - // Otherwise, decode the frame as normal. - if (!ReceiveImpl(dst_frame->GetFrame())) { - return {}; + if (m_codec_context->hw_device_ctx) { + // If we have a hardware context, make a separate frame here to receive the + // hardware result before sending it to the output. + Frame intermediate_frame; + + if (!ReceiveImpl(intermediate_frame.GetFrame())) { + return {}; + } + + m_temp_frame->SetFormat(PreferredGpuFormat); + if (const int ret = av_hwframe_transfer_data(m_temp_frame->GetFrame(), + intermediate_frame.GetFrame(), 0); + ret < 0) { + LOG_ERROR(HW_GPU, "av_hwframe_transfer_data error: {}", AVError(ret)); + return {}; + } + } else { + // Otherwise, decode the frame as normal. + if (!ReceiveImpl(m_temp_frame->GetFrame())) { + return {}; + } } } - return dst_frame; -} - -DeinterlaceFilter::DeinterlaceFilter(const Frame& frame) { - const AVFilter* buffer_src = avfilter_get_by_name("buffer"); - const AVFilter* buffer_sink = avfilter_get_by_name("buffersink"); - AVFilterInOut* inputs = avfilter_inout_alloc(); - AVFilterInOut* outputs = avfilter_inout_alloc(); - SCOPE_EXIT({ - avfilter_inout_free(&inputs); - avfilter_inout_free(&outputs); - }); - - // Don't know how to get the accurate time_base but it doesn't matter for yadif filter - // so just use 1/1 to make buffer filter happy - std::string args = fmt::format("video_size={}x{}:pix_fmt={}:time_base=1/1", frame.GetWidth(), - frame.GetHeight(), static_cast(frame.GetPixelFormat())); - - m_filter_graph = avfilter_graph_alloc(); - int ret = avfilter_graph_create_filter(&m_source_context, buffer_src, "in", args.c_str(), - nullptr, m_filter_graph); - if (ret < 0) { - LOG_ERROR(HW_GPU, "avfilter_graph_create_filter source error: {}", AVError(ret)); - return; - } - - ret = avfilter_graph_create_filter(&m_sink_context, buffer_sink, "out", nullptr, nullptr, - m_filter_graph); - if (ret < 0) { - LOG_ERROR(HW_GPU, "avfilter_graph_create_filter sink error: {}", AVError(ret)); - return; - } - - inputs->name = av_strdup("out"); - inputs->filter_ctx = m_sink_context; - inputs->pad_idx = 0; - inputs->next = nullptr; - - outputs->name = av_strdup("in"); - outputs->filter_ctx = m_source_context; - outputs->pad_idx = 0; - outputs->next = nullptr; - - const char* description = "yadif=1:-1:0"; - ret = avfilter_graph_parse_ptr(m_filter_graph, description, &inputs, &outputs, nullptr); - if (ret < 0) { - LOG_ERROR(HW_GPU, "avfilter_graph_parse_ptr error: {}", AVError(ret)); - return; - } - - ret = avfilter_graph_config(m_filter_graph, nullptr); - if (ret < 0) { - LOG_ERROR(HW_GPU, "avfilter_graph_config error: {}", AVError(ret)); - return; - } - - m_initialized = true; -} - -bool DeinterlaceFilter::AddSourceFrame(const Frame& frame) { - if (const int ret = av_buffersrc_add_frame_flags(m_source_context, frame.GetFrame(), - AV_BUFFERSRC_FLAG_KEEP_REF); - ret < 0) { - LOG_ERROR(HW_GPU, "av_buffersrc_add_frame_flags error: {}", AVError(ret)); - return false; - } - - return 
true; -} - -std::unique_ptr DeinterlaceFilter::DrainSinkFrame() { - auto dst_frame = std::make_unique(); - const int ret = av_buffersink_get_frame(m_sink_context, dst_frame->GetFrame()); - - if (ret == AVERROR(EAGAIN) || ret == AVERROR(AVERROR_EOF)) { - return {}; - } - - if (ret < 0) { - LOG_ERROR(HW_GPU, "av_buffersink_get_frame error: {}", AVError(ret)); - return {}; - } - - return dst_frame; -} - -DeinterlaceFilter::~DeinterlaceFilter() { - avfilter_graph_free(&m_filter_graph); +#if defined(FF_API_INTERLACED_FRAME) || LIBAVUTIL_VERSION_MAJOR >= 59 + m_temp_frame->GetFrame()->interlaced_frame = + (m_temp_frame->GetFrame()->flags & AV_FRAME_FLAG_INTERLACED) != 0; +#endif + return std::move(m_temp_frame); } void DecodeApi::Reset() { - m_deinterlace_filter.reset(); m_hardware_context.reset(); m_decoder_context.reset(); m_decoder.reset(); @@ -382,43 +339,14 @@ bool DecodeApi::Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec) { return true; } -bool DecodeApi::SendPacket(std::span packet_data, size_t configuration_size) { +bool DecodeApi::SendPacket(std::span packet_data) { FFmpeg::Packet packet(packet_data); return m_decoder_context->SendPacket(packet); } -void DecodeApi::ReceiveFrames(std::queue>& frame_queue) { +std::shared_ptr DecodeApi::ReceiveFrame() { // Receive raw frame from decoder. - bool is_interlaced; - auto frame = m_decoder_context->ReceiveFrame(&is_interlaced); - if (!frame) { - return; - } - - if (!is_interlaced) { - // If the frame is not interlaced, we can pend it now. - frame_queue.push(std::move(frame)); - } else { - // Create the deinterlacer if needed. - if (!m_deinterlace_filter) { - m_deinterlace_filter.emplace(*frame); - } - - // Add the frame we just received. - if (!m_deinterlace_filter->AddSourceFrame(*frame)) { - return; - } - - // Pend output fields. - while (true) { - auto filter_frame = m_deinterlace_filter->DrainSinkFrame(); - if (!filter_frame) { - break; - } - - frame_queue.push(std::move(filter_frame)); - } - } + return m_decoder_context->ReceiveFrame(); } } // namespace FFmpeg diff --git a/src/video_core/host1x/ffmpeg/ffmpeg.h b/src/video_core/host1x/ffmpeg/ffmpeg.h index 1de0bbd83..a74fcba80 100755 --- a/src/video_core/host1x/ffmpeg/ffmpeg.h +++ b/src/video_core/host1x/ffmpeg/ffmpeg.h @@ -20,17 +20,20 @@ extern "C" { #endif #include -#include -#include -#include -#include #include +#ifndef ANDROID +#include +#endif #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic pop #endif } +namespace Tegra { +class MemoryManager; +} + namespace FFmpeg { class Packet; @@ -90,6 +93,10 @@ public: return m_frame->data[plane]; } + const u8* GetPlane(int plane) const { + return m_frame->data[plane]; + } + u8** GetPlanes() const { return m_frame->data; } @@ -98,6 +105,14 @@ public: m_frame->format = format; } + bool IsInterlaced() const { + return m_frame->interlaced_frame != 0; + } + + bool IsHardwareDecoded() const { + return m_frame->hw_frames_ctx != nullptr; + } + AVFrame* GetFrame() const { return m_frame; } @@ -160,33 +175,22 @@ public: void InitializeHardwareDecoder(const HardwareContext& context, AVPixelFormat hw_pix_fmt); bool OpenContext(const Decoder& decoder); bool SendPacket(const Packet& packet); - std::unique_ptr ReceiveFrame(bool* out_is_interlaced); + std::shared_ptr ReceiveFrame(); AVCodecContext* GetCodecContext() const { return m_codec_context; } + bool UsingDecodeOrder() const { + return m_decode_order; + } + private: + const Decoder& m_decoder; AVCodecContext* m_codec_context{}; -}; - -// Wraps an AVFilterGraph. 
-class DeinterlaceFilter { -public: - YUZU_NON_COPYABLE(DeinterlaceFilter); - YUZU_NON_MOVEABLE(DeinterlaceFilter); - - explicit DeinterlaceFilter(const Frame& frame); - ~DeinterlaceFilter(); - - bool AddSourceFrame(const Frame& frame); - std::unique_ptr DrainSinkFrame(); - -private: - AVFilterGraph* m_filter_graph{}; - AVFilterContext* m_source_context{}; - AVFilterContext* m_sink_context{}; - bool m_initialized{}; + s32 m_got_frame{}; + std::shared_ptr m_temp_frame{}; + bool m_decode_order{}; }; class DecodeApi { @@ -200,14 +204,17 @@ public: bool Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec); void Reset(); - bool SendPacket(std::span packet_data, size_t configuration_size); - void ReceiveFrames(std::queue>& frame_queue); + bool UsingDecodeOrder() const { + return m_decoder_context->UsingDecodeOrder(); + } + + bool SendPacket(std::span packet_data); + std::shared_ptr ReceiveFrame(); private: std::optional m_decoder; std::optional m_decoder_context; std::optional m_hardware_context; - std::optional m_deinterlace_filter; }; } // namespace FFmpeg diff --git a/src/video_core/host1x/host1x.cpp b/src/video_core/host1x/host1x.cpp index 27c59640b..6bc85b9ed 100755 --- a/src/video_core/host1x/host1x.cpp +++ b/src/video_core/host1x/host1x.cpp @@ -3,10 +3,10 @@ #include "core/core.h" #include "video_core/host1x/host1x.h" +#include "video_core/host1x/nvdec.h" +#include "video_core/host1x/vic.h" -namespace Tegra { - -namespace Host1x { +namespace Tegra::Host1x { Host1x::Host1x(Core::System& system_) : system{system_}, syncpoint_manager{}, @@ -15,6 +15,22 @@ Host1x::Host1x(Core::System& system_) Host1x::~Host1x() = default; -} // namespace Host1x +void Host1x::StartDevice(s32 fd, ChannelType type, u32 syncpt) { + switch (type) { + case ChannelType::NvDec: + devices[fd] = std::make_unique(*this, fd, syncpt, frame_queue); + break; + case ChannelType::VIC: + devices[fd] = std::make_unique(*this, fd, syncpt, frame_queue); + break; + default: + LOG_ERROR(HW_GPU, "Unimplemented host1x device {}", static_cast(type)); + break; + } +} -} // namespace Tegra +void Host1x::StopDevice(s32 fd, ChannelType type) { + devices.erase(fd); +} + +} // namespace Tegra::Host1x diff --git a/src/video_core/host1x/host1x.h b/src/video_core/host1x/host1x.h index 958aeda4f..24a46e26c 100755 --- a/src/video_core/host1x/host1x.h +++ b/src/video_core/host1x/host1x.h @@ -3,9 +3,14 @@ #pragma once +#include +#include +#include + #include "common/common_types.h" #include "common/address_space.h" +#include "video_core/cdma_pusher.h" #include "video_core/host1x/gpu_device_memory_manager.h" #include "video_core/host1x/syncpoint_manager.h" #include "video_core/memory_manager.h" @@ -14,15 +19,128 @@ namespace Core { class System; } // namespace Core -namespace Tegra { +namespace FFmpeg { +class Frame; +} // namespace FFmpeg -namespace Host1x { +namespace Tegra::Host1x { +class Nvdec; + +class FrameQueue { +public: + void Open(s32 fd) { + std::scoped_lock l{m_mutex}; + m_presentation_order.insert({fd, {}}); + m_decode_order.insert({fd, {}}); + } + + void Close(s32 fd) { + std::scoped_lock l{m_mutex}; + m_presentation_order.erase(fd); + m_decode_order.erase(fd); + } + + s32 VicFindNvdecFdFromOffset(u64 search_offset) { + std::scoped_lock l{m_mutex}; + // Vic does not know which nvdec is producing frames for it, so search all the fds here for + // the given offset. 
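A stripped-down illustration of the lookup the FrameQueue comment below describes: per-fd queues guarded by one mutex, searched linearly by surface offset when the VIC does not know which nvdec produced a frame. The types here are simplified stand-ins, not the real FrameQueue:

#include <cstdint>
#include <deque>
#include <memory>
#include <mutex>
#include <unordered_map>
#include <utility>

struct DecodedFrame {}; // stand-in for FFmpeg::Frame

class MiniFrameQueue {
public:
    void Push(std::int32_t fd, std::uint64_t offset, std::shared_ptr<DecodedFrame> frame) {
        std::scoped_lock lock{mutex};
        queues[fd].emplace_back(offset, std::move(frame));
    }

    // Returns the owning fd for a surface offset, or -1 when no queue holds it.
    std::int32_t FindFdFromOffset(std::uint64_t offset) {
        std::scoped_lock lock{mutex};
        for (const auto& [fd, queue] : queues) {
            for (const auto& [frame_offset, frame] : queue) {
                if (frame_offset == offset) {
                    return fd;
                }
            }
        }
        return -1;
    }

private:
    std::mutex mutex;
    std::unordered_map<std::int32_t,
                       std::deque<std::pair<std::uint64_t, std::shared_ptr<DecodedFrame>>>>
        queues;
};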
+ for (auto& map : m_presentation_order) { + for (auto& [offset, frame] : map.second) { + if (offset == search_offset) { + return map.first; + } + } + } + + for (auto& map : m_decode_order) { + for (auto& [offset, frame] : map.second) { + if (offset == search_offset) { + return map.first; + } + } + } + + return -1; + } + + void PushPresentOrder(s32 fd, u64 offset, std::shared_ptr&& frame) { + std::scoped_lock l{m_mutex}; + auto map = m_presentation_order.find(fd); + map->second.emplace_back(offset, std::move(frame)); + } + + void PushDecodeOrder(s32 fd, u64 offset, std::shared_ptr&& frame) { + std::scoped_lock l{m_mutex}; + auto map = m_decode_order.find(fd); + map->second.insert_or_assign(offset, std::move(frame)); + } + + std::shared_ptr GetFrame(s32 fd, u64 offset) { + if (fd == -1) { + return {}; + } + + std::scoped_lock l{m_mutex}; + auto present_map = m_presentation_order.find(fd); + if (present_map->second.size() > 0) { + return GetPresentOrderLocked(fd); + } + + auto decode_map = m_decode_order.find(fd); + if (decode_map->second.size() > 0) { + return GetDecodeOrderLocked(fd, offset); + } + + return {}; + } + +private: + std::shared_ptr GetPresentOrderLocked(s32 fd) { + auto map = m_presentation_order.find(fd); + if (map->second.size() == 0) { + return {}; + } + auto frame = std::move(map->second.front().second); + map->second.pop_front(); + return frame; + } + + std::shared_ptr GetDecodeOrderLocked(s32 fd, u64 offset) { + auto map = m_decode_order.find(fd); + auto it = map->second.find(offset); + if (it == map->second.end()) { + return {}; + } + return std::move(map->second.extract(it).mapped()); + } + + using FramePtr = std::shared_ptr; + + std::mutex m_mutex{}; + std::unordered_map>> m_presentation_order; + std::unordered_map> m_decode_order; +}; + +enum class ChannelType : u32 { + MsEnc = 0, + VIC = 1, + GPU = 2, + NvDec = 3, + Display = 4, + NvJpg = 5, + TSec = 6, + Max = 7, +}; class Host1x { public: explicit Host1x(Core::System& system); ~Host1x(); + Core::System& System() { + return system; + } + SyncpointManager& GetSyncpointManager() { return syncpoint_manager; } @@ -55,14 +173,25 @@ public: return *allocator; } + void StartDevice(s32 fd, ChannelType type, u32 syncpt); + void StopDevice(s32 fd, ChannelType type); + + void PushEntries(s32 fd, ChCommandHeaderList&& entries) { + auto it = devices.find(fd); + if (it == devices.end()) { + return; + } + it->second->PushEntries(std::move(entries)); + } + private: Core::System& system; SyncpointManager syncpoint_manager; Tegra::MaxwellDeviceMemoryManager memory_manager; Tegra::MemoryManager gmmu_manager; std::unique_ptr> allocator; + FrameQueue frame_queue; + std::unordered_map> devices; }; -} // namespace Host1x - -} // namespace Tegra +} // namespace Tegra::Host1x diff --git a/src/video_core/host1x/nvdec.cpp b/src/video_core/host1x/nvdec.cpp index 4507a0d26..2972bbf60 100755 --- a/src/video_core/host1x/nvdec.cpp +++ b/src/video_core/host1x/nvdec.cpp @@ -2,6 +2,12 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "common/assert.h" + +#include "common/polyfill_thread.h" +#include "common/settings.h" +#include "video_core/host1x/codecs/h264.h" +#include "video_core/host1x/codecs/vp8.h" +#include "video_core/host1x/codecs/vp9.h" #include "video_core/host1x/host1x.h" #include "video_core/host1x/nvdec.h" @@ -10,37 +16,70 @@ namespace Tegra::Host1x { #define NVDEC_REG_INDEX(field_name) \ (offsetof(NvdecCommon::NvdecRegisters, field_name) / sizeof(u64)) -Nvdec::Nvdec(Host1x& host1x_) - : host1x(host1x_), state{}, 
codec(std::make_unique(host1x, state)) {} +Nvdec::Nvdec(Host1x& host1x_, s32 id_, u32 syncpt, FrameQueue& frame_queue_) + : CDmaPusher{host1x_, id_}, id{id_}, syncpoint{syncpt}, frame_queue{frame_queue_} { + LOG_INFO(HW_GPU, "Created nvdec {}", id); + frame_queue.Open(id); +} -Nvdec::~Nvdec() = default; +Nvdec::~Nvdec() { + LOG_INFO(HW_GPU, "Destroying nvdec {}", id); + frame_queue.Close(id); +} void Nvdec::ProcessMethod(u32 method, u32 argument) { - state.reg_array[method] = static_cast(argument) << 8; + regs.reg_array[method] = argument; switch (method) { case NVDEC_REG_INDEX(set_codec_id): - codec->SetTargetCodec(static_cast(argument)); + CreateDecoder(static_cast(argument)); break; - case NVDEC_REG_INDEX(execute): + case NVDEC_REG_INDEX(execute): { + if (wait_needed) { + std::this_thread::sleep_for(std::chrono::milliseconds(32)); + wait_needed = false; + } Execute(); - break; + } break; } } -std::unique_ptr Nvdec::GetFrame() { - return codec->GetCurrentFrame(); +void Nvdec::CreateDecoder(NvdecCommon::VideoCodec codec) { + if (decoder.get()) { + return; + } + switch (codec) { + case NvdecCommon::VideoCodec::H264: + decoder = std::make_unique(host1x, regs, id, frame_queue); + break; + case NvdecCommon::VideoCodec::VP8: + decoder = std::make_unique(host1x, regs, id, frame_queue); + break; + case NvdecCommon::VideoCodec::VP9: + decoder = std::make_unique(host1x, regs, id, frame_queue); + break; + default: + UNIMPLEMENTED_MSG("Codec {}", decoder->GetCurrentCodecName()); + break; + } + LOG_INFO(HW_GPU, "Created decoder {} for id {}", decoder->GetCurrentCodecName(), id); } void Nvdec::Execute() { - switch (codec->GetCurrentCodec()) { + if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Off) [[unlikely]] { + // Signalling syncpts too fast can cause games to get stuck as they don't expect a <1ms + // execution time. Sleep for half of a 60 fps frame just in case. 
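The NVDEC_REG_INDEX macro used by ProcessMethod above maps a register field to its method index by dividing the field's byte offset by sizeof(u64). A self-contained illustration with a made-up two-register layout (not the real NvdecRegisters):

#include <cstddef>
#include <cstdint>

// ExampleRegs is a made-up layout for illustration only.
struct ExampleRegs {
    std::uint64_t set_codec_id;
    std::uint64_t execute;
};

constexpr std::size_t RegIndex(std::size_t byte_offset) {
    return byte_offset / sizeof(std::uint64_t);
}

static_assert(RegIndex(offsetof(ExampleRegs, set_codec_id)) == 0);
static_assert(RegIndex(offsetof(ExampleRegs, execute)) == 1);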
+ std::this_thread::sleep_for(std::chrono::milliseconds(8)); + return; + } + switch (decoder->GetCurrentCodec()) { case NvdecCommon::VideoCodec::H264: case NvdecCommon::VideoCodec::VP8: case NvdecCommon::VideoCodec::VP9: - codec->Decode(); + decoder->Decode(); break; default: - UNIMPLEMENTED_MSG("Codec {}", codec->GetCurrentCodecName()); + UNIMPLEMENTED_MSG("Codec {}", decoder->GetCurrentCodecName()); break; } } diff --git a/src/video_core/host1x/nvdec.h b/src/video_core/host1x/nvdec.h index 806c30f2c..a1755696b 100755 --- a/src/video_core/host1x/nvdec.h +++ b/src/video_core/host1x/nvdec.h @@ -5,33 +5,47 @@ #include #include + #include "common/common_types.h" -#include "video_core/host1x/codecs/codec.h" +#include "video_core/cdma_pusher.h" +#include "video_core/host1x/codecs/decoder.h" namespace Tegra { namespace Host1x { - class Host1x; +class FrameQueue; -class Nvdec { +class Nvdec final : public CDmaPusher { public: - explicit Nvdec(Host1x& host1x); + explicit Nvdec(Host1x& host1x, s32 id, u32 syncpt, FrameQueue& frame_queue_); ~Nvdec(); /// Writes the method into the state, Invoke Execute() if encountered - void ProcessMethod(u32 method, u32 argument); + void ProcessMethod(u32 method, u32 arg) override; - /// Return most recently decoded frame - [[nodiscard]] std::unique_ptr GetFrame(); + u32 GetSyncpoint() const { + return syncpoint; + } + + void SetWait() { + wait_needed = true; + } private: + /// Create the decoder when the codec id is set + void CreateDecoder(NvdecCommon::VideoCodec codec); + /// Invoke codec to decode a frame void Execute(); - Host1x& host1x; - NvdecCommon::NvdecRegisters state; - std::unique_ptr codec; + s32 id; + u32 syncpoint; + FrameQueue& frame_queue; + + NvdecCommon::NvdecRegisters regs{}; + std::unique_ptr decoder; + bool wait_needed{false}; }; } // namespace Host1x diff --git a/src/video_core/host1x/nvdec_common.h b/src/video_core/host1x/nvdec_common.h index 0fd678269..13b96452f 100755 --- a/src/video_core/host1x/nvdec_common.h +++ b/src/video_core/host1x/nvdec_common.h @@ -17,6 +17,17 @@ enum class VideoCodec : u64 { VP9 = 0x9, }; +struct Offset { + constexpr u64 Address() const noexcept { + return offset << 8; + } + +private: + u64 offset; +}; +static_assert(std::is_trivial_v, "Offset must be trivial"); +static_assert(sizeof(Offset) == 0x8, "Offset has the wrong size!"); + // NVDEC should use a 32-bit address space, but is mapped to 64-bit, // doubling the sizes here is compensating for that. 
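The Offset wrapper introduced above reflects that these registers hold addresses in 256-byte units, so Address() expands the stored value with a left shift by 8. A simplified version with a public member, for illustration only (the real struct keeps the field private):

#include <cstdint>

// Simplified stand-in for the Offset register wrapper; only the shift
// behaviour is the point being illustrated.
struct ExampleOffset {
    constexpr std::uint64_t Address() const noexcept {
        return offset << 8; // 256-byte units -> byte address
    }
    std::uint64_t offset;
};

static_assert(ExampleOffset{0x12'3456}.Address() == 0x1234'5600);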
struct NvdecRegisters { @@ -38,29 +49,40 @@ struct NvdecRegisters { BitField<17, 1, u64> all_intra_frame; }; } control_params; - u64 picture_info_offset; ///< 0x0808 - u64 frame_bitstream_offset; ///< 0x0810 - u64 frame_number; ///< 0x0818 - u64 h264_slice_data_offsets; ///< 0x0820 - u64 h264_mv_dump_offset; ///< 0x0828 - INSERT_PADDING_WORDS_NOINIT(6); ///< 0x0830 - u64 frame_stats_offset; ///< 0x0848 - u64 h264_last_surface_luma_offset; ///< 0x0850 - u64 h264_last_surface_chroma_offset; ///< 0x0858 - std::array surface_luma_offset; ///< 0x0860 - std::array surface_chroma_offset; ///< 0x08E8 - INSERT_PADDING_WORDS_NOINIT(68); ///< 0x0970 - u64 vp8_prob_data_offset; ///< 0x0A80 - u64 vp8_header_partition_buf_offset; ///< 0x0A88 - INSERT_PADDING_WORDS_NOINIT(60); ///< 0x0A90 - u64 vp9_entropy_probs_offset; ///< 0x0B80 - u64 vp9_backward_updates_offset; ///< 0x0B88 - u64 vp9_last_frame_segmap_offset; ///< 0x0B90 - u64 vp9_curr_frame_segmap_offset; ///< 0x0B98 - INSERT_PADDING_WORDS_NOINIT(2); ///< 0x0BA0 - u64 vp9_last_frame_mvs_offset; ///< 0x0BA8 - u64 vp9_curr_frame_mvs_offset; ///< 0x0BB0 - INSERT_PADDING_WORDS_NOINIT(2); ///< 0x0BB8 + Offset picture_info_offset; ///< 0x0808 + Offset frame_bitstream_offset; ///< 0x0810 + u64 frame_number; ///< 0x0818 + Offset h264_slice_data_offsets; ///< 0x0820 + Offset h264_mv_dump_offset; ///< 0x0828 + INSERT_PADDING_WORDS_NOINIT(6); ///< 0x0830 + Offset frame_stats_offset; ///< 0x0848 + Offset h264_last_surface_luma_offset; ///< 0x0850 + Offset h264_last_surface_chroma_offset; ///< 0x0858 + std::array surface_luma_offsets; ///< 0x0860 + std::array surface_chroma_offsets; ///< 0x08E8 + Offset pic_scratch_buf_offset; ///< 0x0970 + Offset external_mvbuffer_offset; ///< 0x0978 + INSERT_PADDING_WORDS_NOINIT(32); ///< 0x0980 + Offset h264_mbhist_buffer_offset; ///< 0x0A00 + INSERT_PADDING_WORDS_NOINIT(30); ///< 0x0A08 + Offset vp8_prob_data_offset; ///< 0x0A80 + Offset vp8_header_partition_buf_offset; ///< 0x0A88 + INSERT_PADDING_WORDS_NOINIT(28); ///< 0x0A90 + Offset hvec_scalist_list_offset; ///< 0x0B00 + Offset hvec_tile_sizes_offset; ///< 0x0B08 + Offset hvec_filter_buffer_offset; ///< 0x0B10 + Offset hvec_sao_buffer_offset; ///< 0x0B18 + Offset hvec_slice_info_buffer_offset; ///< 0x0B20 + Offset hvec_slice_group_index_offset; ///< 0x0B28 + INSERT_PADDING_WORDS_NOINIT(20); ///< 0x0B30 + Offset vp9_prob_tab_buffer_offset; ///< 0x0B80 + Offset vp9_ctx_counter_buffer_offset; ///< 0x0B88 + Offset vp9_segment_read_buffer_offset; ///< 0x0B90 + Offset vp9_segment_write_buffer_offset; ///< 0x0B98 + Offset vp9_tile_size_buffer_offset; ///< 0x0BA0 + Offset vp9_col_mvwrite_buffer_offset; ///< 0x0BA8 + Offset vp9_col_mvread_buffer_offset; ///< 0x0BB0 + Offset vp9_filter_buffer_offset; ///< 0x0BB8 }; std::array reg_array; }; @@ -81,16 +103,16 @@ ASSERT_REG_POSITION(h264_slice_data_offsets, 0x104); ASSERT_REG_POSITION(frame_stats_offset, 0x109); ASSERT_REG_POSITION(h264_last_surface_luma_offset, 0x10A); ASSERT_REG_POSITION(h264_last_surface_chroma_offset, 0x10B); -ASSERT_REG_POSITION(surface_luma_offset, 0x10C); -ASSERT_REG_POSITION(surface_chroma_offset, 0x11D); +ASSERT_REG_POSITION(surface_luma_offsets, 0x10C); +ASSERT_REG_POSITION(surface_chroma_offsets, 0x11D); ASSERT_REG_POSITION(vp8_prob_data_offset, 0x150); ASSERT_REG_POSITION(vp8_header_partition_buf_offset, 0x151); -ASSERT_REG_POSITION(vp9_entropy_probs_offset, 0x170); -ASSERT_REG_POSITION(vp9_backward_updates_offset, 0x171); -ASSERT_REG_POSITION(vp9_last_frame_segmap_offset, 0x172); 
-ASSERT_REG_POSITION(vp9_curr_frame_segmap_offset, 0x173); -ASSERT_REG_POSITION(vp9_last_frame_mvs_offset, 0x175); -ASSERT_REG_POSITION(vp9_curr_frame_mvs_offset, 0x176); +ASSERT_REG_POSITION(vp9_prob_tab_buffer_offset, 0x170); +ASSERT_REG_POSITION(vp9_ctx_counter_buffer_offset, 0x171); +ASSERT_REG_POSITION(vp9_segment_read_buffer_offset, 0x172); +ASSERT_REG_POSITION(vp9_segment_write_buffer_offset, 0x173); +ASSERT_REG_POSITION(vp9_col_mvwrite_buffer_offset, 0x175); +ASSERT_REG_POSITION(vp9_col_mvread_buffer_offset, 0x176); #undef ASSERT_REG_POSITION diff --git a/src/video_core/host1x/syncpoint_manager.cpp b/src/video_core/host1x/syncpoint_manager.cpp index eb5dd8d70..1b5dd4ba6 100755 --- a/src/video_core/host1x/syncpoint_manager.cpp +++ b/src/video_core/host1x/syncpoint_manager.cpp @@ -18,7 +18,7 @@ SyncpointManager::ActionHandle SyncpointManager::RegisterAction( return {}; } - std::unique_lock lk(guard); + std::scoped_lock lk(guard); if (syncpoint.load(std::memory_order_relaxed) >= expected_value) { action(); return {}; @@ -35,7 +35,7 @@ SyncpointManager::ActionHandle SyncpointManager::RegisterAction( void SyncpointManager::DeregisterAction(std::list& action_storage, const ActionHandle& handle) { - std::unique_lock lk(guard); + std::scoped_lock lk(guard); // We want to ensure the iterator still exists prior to erasing it // Otherwise, if an invalid iterator was passed in then it could lead to UB @@ -78,7 +78,7 @@ void SyncpointManager::Increment(std::atomic& syncpoint, std::condition_var std::list& action_storage) { auto new_value{syncpoint.fetch_add(1, std::memory_order_acq_rel) + 1}; - std::unique_lock lk(guard); + std::scoped_lock lk(guard); auto it = action_storage.begin(); while (it != action_storage.end()) { if (it->expected_value > new_value) { diff --git a/src/video_core/host1x/vic.cpp b/src/video_core/host1x/vic.cpp index 34b365803..6c0127782 100755 --- a/src/video_core/host1x/vic.cpp +++ b/src/video_core/host1x/vic.cpp @@ -2,6 +2,18 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include +#include +#include + +#if defined(ARCHITECTURE_x86_64) +#if defined(_MSC_VER) +#include +#else +#include +#endif +#elif defined(ARCHITECTURE_arm64) +#include +#endif extern "C" { #if defined(__GNUC__) || defined(__clang__) @@ -14,228 +26,1181 @@ extern "C" { #endif } +#include "common/alignment.h" #include "common/assert.h" #include "common/bit_field.h" #include "common/logging/log.h" +#include "common/polyfill_thread.h" +#include "common/settings.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/guest_memory.h" #include "video_core/host1x/host1x.h" #include "video_core/host1x/nvdec.h" #include "video_core/host1x/vic.h" #include "video_core/memory_manager.h" #include "video_core/textures/decoders.h" -namespace Tegra { - -namespace Host1x { +#if defined(ARCHITECTURE_x86_64) +#include "common/x64/cpu_detect.h" +#elif defined(ARCHITECTURE_arm64) +// Some ARM64 detect +#endif +namespace Tegra::Host1x { namespace { -enum class VideoPixelFormat : u64_le { - RGBA8 = 0x1f, - BGRA8 = 0x20, - RGBX8 = 0x23, - YUV420 = 0x44, -}; -} // Anonymous namespace +static bool HasSSE41() { +#if defined(ARCHITECTURE_x86_64) + const auto& cpu_caps{Common::GetCPUCaps()}; + return cpu_caps.sse4_1; +#else + return false; +#endif +} -union VicConfig { - u64_le raw{}; - BitField<0, 7, VideoPixelFormat> pixel_format; - BitField<7, 2, u64_le> chroma_loc_horiz; - BitField<9, 2, u64_le> chroma_loc_vert; - BitField<11, 4, u64_le> block_linear_kind; - BitField<15, 4, u64_le> block_linear_height_log2; - 
BitField<32, 14, u64_le> surface_width_minus1; - BitField<46, 14, u64_le> surface_height_minus1; -}; +void SwizzleSurface(std::span output, u32 out_stride, std::span input, u32 in_stride, + u32 height) { + /* + * Taken from https://github.com/averne/FFmpeg/blob/nvtegra/libavutil/hwcontext_nvtegra.c#L949 + * Can only handle block height == 1. + */ + const uint32_t x_mask = 0xFFFFFFD2u; + const uint32_t y_mask = 0x2Cu; + uint32_t offs_x{}; + uint32_t offs_y{}; + uint32_t offs_line{}; -Vic::Vic(Host1x& host1x_, std::shared_ptr nvdec_processor_) - : host1x(host1x_), - nvdec_processor(std::move(nvdec_processor_)), converted_frame_buffer{nullptr, av_free} {} + for (u32 y = 0; y < height; y += 2) { + auto dst_line = output.data() + offs_y * 16; + const auto src_line = input.data() + y * (in_stride / 16) * 16; -Vic::~Vic() = default; + offs_line = offs_x; + for (u32 x = 0; x < in_stride; x += 16) { + std::memcpy(&dst_line[offs_line * 16], &src_line[x], 16); + std::memcpy(&dst_line[offs_line * 16 + 16], &src_line[x + in_stride], 16); + offs_line = (offs_line - x_mask) & x_mask; + } -void Vic::ProcessMethod(Method method, u32 argument) { - LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", static_cast(method)); - const u64 arg = static_cast(argument) << 8; - switch (method) { - case Method::Execute: + offs_y = (offs_y - y_mask) & y_mask; + + /* Wrap into next tile row */ + if (!offs_y) { + offs_x += out_stride; + } + } +} + +} // namespace + +Vic::Vic(Host1x& host1x_, s32 id_, u32 syncpt, FrameQueue& frame_queue_) + : CDmaPusher{host1x_, id_}, id{id_}, syncpoint{syncpt}, + frame_queue{frame_queue_}, has_sse41{HasSSE41()} { + LOG_INFO(HW_GPU, "Created vic {}", id); +} + +Vic::~Vic() { + LOG_INFO(HW_GPU, "Destroying vic {}", id); +} + +void Vic::ProcessMethod(u32 method, u32 arg) { + LOG_TRACE(HW_GPU, "Vic method 0x{:X}", static_cast(method)); + regs.reg_array[method] = arg; + + switch (static_cast(method * sizeof(u32))) { + case Method::Execute: { Execute(); - break; - case Method::SetConfigStructOffset: - config_struct_address = arg; - break; - case Method::SetOutputSurfaceLumaOffset: - output_surface_luma_address = arg; - break; - case Method::SetOutputSurfaceChromaOffset: - output_surface_chroma_address = arg; - break; + } break; default: break; } } void Vic::Execute() { - if (output_surface_luma_address == 0) { - LOG_ERROR(Service_NVDRV, "VIC Luma address not set."); - return; - } - const VicConfig config{host1x.GMMU().Read(config_struct_address + 0x20)}; - auto frame = nvdec_processor->GetFrame(); - if (!frame) { - return; - } - const u64 surface_width = config.surface_width_minus1 + 1; - const u64 surface_height = config.surface_height_minus1 + 1; - if (static_cast(frame->GetWidth()) != surface_width || - static_cast(frame->GetHeight()) != surface_height) { - // TODO: Properly support multiple video streams with differing frame dimensions - LOG_WARNING(Service_NVDRV, "Frame dimensions {}x{} don't match surface dimensions {}x{}", - frame->GetWidth(), frame->GetHeight(), surface_width, surface_height); - } - switch (config.pixel_format) { - case VideoPixelFormat::RGBA8: - case VideoPixelFormat::BGRA8: - case VideoPixelFormat::RGBX8: - WriteRGBFrame(std::move(frame), config); - break; - case VideoPixelFormat::YUV420: - WriteYUVFrame(std::move(frame), config); - break; - default: - UNIMPLEMENTED_MSG("Unknown video pixel format {:X}", config.pixel_format.Value()); - break; - } -} + ConfigStruct config{}; + memory_manager.ReadBlock(regs.config_struct_offset.Address(), &config, sizeof(ConfigStruct)); -void 
Vic::WriteRGBFrame(std::unique_ptr frame, const VicConfig& config) { - LOG_TRACE(Service_NVDRV, "Writing RGB Frame"); + auto output_width{config.output_surface_config.out_surface_width + 1}; + auto output_height{config.output_surface_config.out_surface_height + 1}; + output_surface.resize_destructive(output_width * output_height); - const auto frame_width = frame->GetWidth(); - const auto frame_height = frame->GetHeight(); - const auto frame_format = frame->GetPixelFormat(); - - if (!scaler_ctx || frame_width != scaler_width || frame_height != scaler_height) { - const AVPixelFormat target_format = [pixel_format = config.pixel_format]() { - switch (pixel_format) { - case VideoPixelFormat::RGBA8: - return AV_PIX_FMT_RGBA; - case VideoPixelFormat::BGRA8: - return AV_PIX_FMT_BGRA; - case VideoPixelFormat::RGBX8: - return AV_PIX_FMT_RGB0; - default: - return AV_PIX_FMT_RGBA; - } - }(); - - sws_freeContext(scaler_ctx); - // Frames are decoded into either YUV420 or NV12 formats. Convert to desired RGB format - scaler_ctx = sws_getContext(frame_width, frame_height, frame_format, frame_width, - frame_height, target_format, 0, nullptr, nullptr, nullptr); - scaler_width = frame_width; - scaler_height = frame_height; - converted_frame_buffer.reset(); - } - if (!converted_frame_buffer) { - const size_t frame_size = frame_width * frame_height * 4; - converted_frame_buffer = AVMallocPtr{static_cast(av_malloc(frame_size)), av_free}; - } - const std::array converted_stride{frame_width * 4, frame_height * 4, 0, 0}; - u8* const converted_frame_buf_addr{converted_frame_buffer.get()}; - sws_scale(scaler_ctx, frame->GetPlanes(), frame->GetStrides(), 0, frame_height, - &converted_frame_buf_addr, converted_stride.data()); - - // Use the minimum of surface/frame dimensions to avoid buffer overflow. - const u32 surface_width = static_cast(config.surface_width_minus1) + 1; - const u32 surface_height = static_cast(config.surface_height_minus1) + 1; - const u32 width = std::min(surface_width, static_cast(frame_width)); - const u32 height = std::min(surface_height, static_cast(frame_height)); - const u32 blk_kind = static_cast(config.block_linear_kind); - if (blk_kind != 0) { - // swizzle pitch linear to block linear - const u32 block_height = static_cast(config.block_linear_height_log2); - const auto size = Texture::CalculateSize(true, 4, width, height, 1, block_height, 0); - luma_buffer.resize_destructive(size); - std::span frame_buff(converted_frame_buf_addr, 4 * width * height); - Texture::SwizzleSubrect(luma_buffer, frame_buff, 4, width, height, 1, 0, 0, width, height, - block_height, 0, width * 4); - - host1x.GMMU().WriteBlock(output_surface_luma_address, luma_buffer.data(), size); + if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Off) [[unlikely]] { + // Fill the frame with black, as otherwise they can have random data and be very glitchy. 
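The ReadBlock call above copies raw guest bytes straight into config, which only works while ConfigStruct stays trivially copyable. A minimal sketch of that pattern, with a stand-in struct and a hypothetical guest pointer:

#include <cstdint>
#include <cstring>
#include <type_traits>

// ExampleConfig stands in for ConfigStruct; the point is that a block read
// is a plain byte copy, so the destination must be trivially copyable.
struct ExampleConfig {
    std::uint64_t raw[4];
};
static_assert(std::is_trivially_copyable_v<ExampleConfig>);

void ReadGuestConfig(const std::uint8_t* guest_bytes, ExampleConfig& out) {
    std::memcpy(&out, guest_bytes, sizeof(ExampleConfig)); // no constructors involved
}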
+ std::fill(output_surface.begin(), output_surface.end(), Pixel{}); } else { - // send pitch linear frame - const size_t linear_size = width * height * 4; - host1x.GMMU().WriteBlock(output_surface_luma_address, converted_frame_buf_addr, - linear_size); + for (size_t i = 0; i < config.slot_structs.size(); i++) { + auto& slot_config{config.slot_structs[i]}; + if (!slot_config.config.slot_enable) { + continue; + } + + auto luma_offset{regs.surfaces[i][SurfaceIndex::Current].luma.Address()}; + if (nvdec_id == -1) { + nvdec_id = frame_queue.VicFindNvdecFdFromOffset(luma_offset); + } + + auto frame = frame_queue.GetFrame(nvdec_id, luma_offset); + if (!frame.get()) { + LOG_ERROR(HW_GPU, "Vic failed to get frame with offset 0x{:X}", luma_offset); + continue; + } + + switch (frame->GetPixelFormat()) { + case AV_PIX_FMT_YUV420P: + ReadY8__V8U8_N420(slot_config, regs.surfaces[i], std::move(frame)); + break; + case AV_PIX_FMT_NV12: + ReadY8__V8U8_N420(slot_config, regs.surfaces[i], std::move(frame)); + break; + default: + UNIMPLEMENTED_MSG( + "Unimplemented slot pixel format {}", + static_cast(slot_config.surface_config.slot_pixel_format.Value())); + break; + } + + Blend(config, slot_config); + } + } + + switch (config.output_surface_config.out_pixel_format) { + case VideoPixelFormat::A8B8G8R8: + case VideoPixelFormat::X8B8G8R8: + WriteABGR(config.output_surface_config); + break; + case VideoPixelFormat::A8R8G8B8: + WriteABGR(config.output_surface_config); + break; + case VideoPixelFormat::Y8__V8U8_N420: + WriteY8__V8U8_N420(config.output_surface_config); + break; + default: + UNIMPLEMENTED_MSG("Unknown video pixel format {}", + config.output_surface_config.out_pixel_format.Value()); + break; } } -void Vic::WriteYUVFrame(std::unique_ptr frame, const VicConfig& config) { - LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame"); +template +void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, + std::span offsets, + std::shared_ptr frame) { + const auto out_luma_width{slot.surface_config.slot_surface_width + 1}; + auto out_luma_height{slot.surface_config.slot_surface_height + 1}; + const auto out_luma_stride{out_luma_width}; - const std::size_t surface_width = config.surface_width_minus1 + 1; - const std::size_t surface_height = config.surface_height_minus1 + 1; - const std::size_t aligned_width = (surface_width + 0xff) & ~0xffUL; - // Use the minimum of surface/frame dimensions to avoid buffer overflow. 
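The removed YUV path below padded the luma stride to 256 bytes with (surface_width + 0xff) & ~0xff; the same mask trick generalises to any power-of-two alignment:

#include <cstddef>

// Generic power-of-two align-up; alignment must be a power of two.
constexpr std::size_t AlignUp(std::size_t value, std::size_t alignment) {
    return (value + alignment - 1) & ~(alignment - 1);
}

static_assert(AlignUp(1280, 256) == 1280);
static_assert(AlignUp(1281, 256) == 1536);
static_assert(AlignUp(0, 256) == 0);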
- const auto frame_width = std::min(surface_width, static_cast(frame->GetWidth())); - const auto frame_height = std::min(surface_height, static_cast(frame->GetHeight())); - - const auto stride = static_cast(frame->GetStride(0)); - - luma_buffer.resize_destructive(aligned_width * surface_height); - chroma_buffer.resize_destructive(aligned_width * surface_height / 2); - - // Populate luma buffer - const u8* luma_src = frame->GetData(0); - for (std::size_t y = 0; y < frame_height; ++y) { - const std::size_t src = y * stride; - const std::size_t dst = y * aligned_width; - std::memcpy(luma_buffer.data() + dst, luma_src + src, frame_width); + if constexpr (Interlaced) { + out_luma_height *= 2; } - host1x.GMMU().WriteBlock(output_surface_luma_address, luma_buffer.data(), luma_buffer.size()); - // Chroma - const std::size_t half_height = frame_height / 2; - const auto half_stride = static_cast(frame->GetStride(1)); + slot_surface.resize_destructive(out_luma_width * out_luma_height); - switch (frame->GetPixelFormat()) { - case AV_PIX_FMT_YUV420P: { - // Frame from FFmpeg software - // Populate chroma buffer from both channels with interleaving. - const std::size_t half_width = frame_width / 2; - u8* chroma_buffer_data = chroma_buffer.data(); - const u8* chroma_b_src = frame->GetData(1); - const u8* chroma_r_src = frame->GetData(2); - for (std::size_t y = 0; y < half_height; ++y) { - const std::size_t src = y * half_stride; - const std::size_t dst = y * aligned_width; - for (std::size_t x = 0; x < half_width; ++x) { - chroma_buffer_data[dst + x * 2] = chroma_b_src[src + x]; - chroma_buffer_data[dst + x * 2 + 1] = chroma_r_src[src + x]; + const auto in_luma_width{std::min(frame->GetWidth(), static_cast(out_luma_width))}; + const auto in_luma_height{std::min(frame->GetHeight(), static_cast(out_luma_height))}; + const auto in_luma_stride{frame->GetStride(0)}; + + const auto in_chroma_stride{frame->GetStride(1)}; + + const auto* luma_buffer{frame->GetPlane(0)}; + const auto* chroma_u_buffer{frame->GetPlane(1)}; + const auto* chroma_v_buffer{frame->GetPlane(2)}; + + LOG_TRACE(HW_GPU, + "Reading frame" + "\ninput luma {}x{} stride {} chroma {}x{} stride {}\n" + "output luma {}x{} stride {} chroma {}x{} stride {}", + in_luma_width, in_luma_height, in_luma_stride, in_luma_width / 2, in_luma_height / 2, + in_chroma_stride, out_luma_width, out_luma_height, out_luma_stride, out_luma_width, + out_luma_height, out_luma_stride); + + [[maybe_unused]] auto DecodeLinear = [&]() { + const auto alpha{static_cast(slot.config.planar_alpha.Value())}; + + for (s32 y = 0; y < in_luma_height; y++) { + const auto src_luma{y * in_luma_stride}; + const auto src_chroma{(y / 2) * in_chroma_stride}; + const auto dst{y * out_luma_stride}; + for (s32 x = 0; x < in_luma_width; x++) { + slot_surface[dst + x].r = static_cast(luma_buffer[src_luma + x] << 2); + // Chroma samples are duplicated horizontally and vertically. 
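The surrounding loop samples 4:2:0 chroma, where each U/V pair covers a 2x2 block of luma. A scalar illustration of the planar (I420) and semiplanar (NV12) indexing it uses, with hypothetical plane pointers and strides:

#include <cstddef>
#include <cstdint>

struct ChromaPair {
    std::uint8_t u;
    std::uint8_t v;
};

// I420: U and V live in separate half-resolution planes.
ChromaPair SampleI420(const std::uint8_t* u_plane, const std::uint8_t* v_plane,
                      std::size_t chroma_stride, std::size_t x, std::size_t y) {
    const std::size_t idx = (y / 2) * chroma_stride + (x / 2);
    return {u_plane[idx], v_plane[idx]};
}

// NV12: U and V are interleaved in one half-resolution plane.
ChromaPair SampleNV12(const std::uint8_t* uv_plane, std::size_t chroma_stride,
                      std::size_t x, std::size_t y) {
    const std::size_t idx = (y / 2) * chroma_stride + (x & ~std::size_t{1});
    return {uv_plane[idx], uv_plane[idx + 1]};
}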
+ if constexpr (Planar) { + slot_surface[dst + x].g = + static_cast(chroma_u_buffer[src_chroma + x / 2] << 2); + slot_surface[dst + x].b = + static_cast(chroma_v_buffer[src_chroma + x / 2] << 2); + } else { + slot_surface[dst + x].g = + static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); + slot_surface[dst + x].b = + static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); + } + slot_surface[dst + x].a = alpha; } } - break; + }; + +#if defined(ARCHITECTURE_x86_64) + if (!has_sse41) { + DecodeLinear(); + return; } - case AV_PIX_FMT_NV12: { - // Frame from VA-API hardware - // This is already interleaved so just copy - const u8* chroma_src = frame->GetData(1); - for (std::size_t y = 0; y < half_height; ++y) { - const std::size_t src = y * stride; - const std::size_t dst = y * aligned_width; - std::memcpy(chroma_buffer.data() + dst, chroma_src + src, frame_width); + + const auto alpha = + _mm_slli_epi64(_mm_set1_epi64x(static_cast(slot.config.planar_alpha.Value())), 48); + + const auto shuffle_mask = _mm_set_epi8(13, 15, 14, 12, 9, 11, 10, 8, 5, 7, 6, 4, 1, 3, 2, 0); + + for (s32 y = 0; y < in_luma_height; y++) { + const auto src_luma{y * in_luma_stride}; + const auto src_chroma{(y / 2) * in_chroma_stride}; + const auto dst{y * out_luma_stride}; + for (s32 x = 0; x < in_luma_width; x += 16) { + // clang-format off + // Prefetch next iteration's memory + _mm_prefetch((const char*)&luma_buffer[src_luma + x + 16], _MM_HINT_T0); + + // Load 8 bytes * 2 of 8-bit luma samples + // luma0 = 00 00 00 00 00 00 00 00 LL LL LL LL LL LL LL LL + auto luma0 = _mm_loadl_epi64((__m128i*)&luma_buffer[src_luma + x + 0]); + auto luma1 = _mm_loadl_epi64((__m128i*)&luma_buffer[src_luma + x + 8]); + + __m128i chroma; + + if constexpr (Planar) { + _mm_prefetch((const char*)&chroma_u_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0); + _mm_prefetch((const char*)&chroma_v_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0); + + // If Chroma is planar, we have separate U and V planes, load 8 bytes of each + // chroma_u0 = 00 00 00 00 00 00 00 00 UU UU UU UU UU UU UU UU + // chroma_v0 = 00 00 00 00 00 00 00 00 VV VV VV VV VV VV VV VV + auto chroma_u0 = _mm_loadl_epi64((__m128i*)&chroma_u_buffer[src_chroma + x / 2]); + auto chroma_v0 = _mm_loadl_epi64((__m128i*)&chroma_v_buffer[src_chroma + x / 2]); + + // Interleave the 8 bytes of U and V into a single 16 byte reg + // chroma = VV UU VV UU VV UU VV UU VV UU VV UU VV UU VV UU + chroma = _mm_unpacklo_epi8(chroma_u0, chroma_v0); + } else { + _mm_prefetch((const char*)&chroma_u_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0); + + // Chroma is already interleaved in semiplanar format, just load 16 bytes + // chroma = VV UU VV UU VV UU VV UU VV UU VV UU VV UU VV UU + chroma = _mm_load_si128((__m128i*)&chroma_u_buffer[src_chroma + x]); + } + + // Convert the low 8 bytes of 8-bit luma into 16-bit luma + // luma0 = [00] [00] [00] [00] [00] [00] [00] [00] [LL] [LL] [LL] [LL] [LL] [LL] [LL] [LL] + // -> + // luma0 = [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] + luma0 = _mm_cvtepu8_epi16(luma0); + luma1 = _mm_cvtepu8_epi16(luma1); + + // Treat the 8 bytes of 8-bit chroma as 16-bit channels, this allows us to take both the + // U and V together as one element. Using chroma twice here duplicates the values, as we + // take element 0 from chroma, and then element 0 from chroma again, etc. We need to + // duplicate chroma horitonally as chroma is half the width of luma. 
+ // chroma = [VV8 UU8] [VV7 UU7] [VV6 UU6] [VV5 UU5] [VV4 UU4] [VV3 UU3] [VV2 UU2] [VV1 UU1] + // -> + // chroma00 = [VV4 UU4] [VV4 UU4] [VV3 UU3] [VV3 UU3] [VV2 UU2] [VV2 UU2] [VV1 UU1] [VV1 UU1] + // chroma01 = [VV8 UU8] [VV8 UU8] [VV7 UU7] [VV7 UU7] [VV6 UU6] [VV6 UU6] [VV5 UU5] [VV5 UU5] + auto chroma00 = _mm_unpacklo_epi16(chroma, chroma); + auto chroma01 = _mm_unpackhi_epi16(chroma, chroma); + + // Interleave the 16-bit luma and chroma. + // luma0 = [008 LL8] [007 LL7] [006 LL6] [005 LL5] [004 LL4] [003 LL3] [002 LL2] [001 LL1] + // chroma00 = [VV8 UU8] [VV7 UU7] [VV6 UU6] [VV5 UU5] [VV4 UU4] [VV3 UU3] [VV2 UU2] [VV1 UU1] + // -> + // yuv0 = [VV4 UU4 004 LL4] [VV3 UU3 003 LL3] [VV2 UU2 002 LL2] [VV1 UU1 001 LL1] + // yuv1 = [VV8 UU8 008 LL8] [VV7 UU7 007 LL7] [VV6 UU6 006 LL6] [VV5 UU5 005 LL5] + auto yuv0 = _mm_unpacklo_epi16(luma0, chroma00); + auto yuv1 = _mm_unpackhi_epi16(luma0, chroma00); + auto yuv2 = _mm_unpacklo_epi16(luma1, chroma01); + auto yuv3 = _mm_unpackhi_epi16(luma1, chroma01); + + // Shuffle the luma/chroma into the channel ordering we actually want. The high byte of + // the luma which is now a constant 0 after converting 8-bit -> 16-bit is used as the + // alpha. Luma -> R, U -> G, V -> B, 0 -> A + // yuv0 = [VV4 UU4 004 LL4] [VV3 UU3 003 LL3] [VV2 UU2 002 LL2] [VV1 UU1 001 LL1] + // -> + // yuv0 = [AA4 VV4 UU4 LL4] [AA3 VV3 UU3 LL3] [AA2 VV2 UU2 LL2] [AA1 VV1 UU1 LL1] + yuv0 = _mm_shuffle_epi8(yuv0, shuffle_mask); + yuv1 = _mm_shuffle_epi8(yuv1, shuffle_mask); + yuv2 = _mm_shuffle_epi8(yuv2, shuffle_mask); + yuv3 = _mm_shuffle_epi8(yuv3, shuffle_mask); + + // Extend the 8-bit channels we have into 16-bits, as that's the target surface format. + // Since this turns just the low 8 bytes into 16 bytes, the second of + // each operation here right shifts the register by 8 to get the high pixels. + // yuv0 = [AA4] [VV4] [UU4] [LL4] [AA3] [VV3] [UU3] [LL3] [AA2] [VV2] [UU2] [LL2] [AA1] [VV1] [UU1] [LL1] + // -> + // yuv01 = [002 AA2] [002 VV2] [002 UU2] [002 LL2] [001 AA1] [001 VV1] [001 UU1] [001 LL1] + // yuv23 = [004 AA4] [004 VV4] [004 UU4] [004 LL4] [003 AA3] [003 VV3] ]003 UU3] [003 LL3] + auto yuv01 = _mm_cvtepu8_epi16(yuv0); + auto yuv23 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv0, 8)); + auto yuv45 = _mm_cvtepu8_epi16(yuv1); + auto yuv67 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv1, 8)); + auto yuv89 = _mm_cvtepu8_epi16(yuv2); + auto yuv1011 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv2, 8)); + auto yuv1213 = _mm_cvtepu8_epi16(yuv3); + auto yuv1415 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv3, 8)); + + // Left-shift all 16-bit channels by 2, this is to get us into a 10-bit format instead + // of 8, which is the format alpha is in, as well as other blending values. + yuv01 = _mm_slli_epi16(yuv01, 2); + yuv23 = _mm_slli_epi16(yuv23, 2); + yuv45 = _mm_slli_epi16(yuv45, 2); + yuv67 = _mm_slli_epi16(yuv67, 2); + yuv89 = _mm_slli_epi16(yuv89, 2); + yuv1011 = _mm_slli_epi16(yuv1011, 2); + yuv1213 = _mm_slli_epi16(yuv1213, 2); + yuv1415 = _mm_slli_epi16(yuv1415, 2); + + // OR in the planar alpha, this has already been duplicated and shifted into position, + // and just fills in the AA channels with the actual alpha value. + yuv01 = _mm_or_si128(yuv01, alpha); + yuv23 = _mm_or_si128(yuv23, alpha); + yuv45 = _mm_or_si128(yuv45, alpha); + yuv67 = _mm_or_si128(yuv67, alpha); + yuv89 = _mm_or_si128(yuv89, alpha); + yuv1011 = _mm_or_si128(yuv1011, alpha); + yuv1213 = _mm_or_si128(yuv1213, alpha); + yuv1415 = _mm_or_si128(yuv1415, alpha); + + // Store out the pixels. 
One pixel is now 8 bytes, so each store is 2 pixels. + // [AA AA] [VV VV] [UU UU] [LL LL] [AA AA] [VV VV] [UU UU] [LL LL] + _mm_store_si128((__m128i*)&slot_surface[dst + x + 0], yuv01); + _mm_store_si128((__m128i*)&slot_surface[dst + x + 2], yuv23); + _mm_store_si128((__m128i*)&slot_surface[dst + x + 4], yuv45); + _mm_store_si128((__m128i*)&slot_surface[dst + x + 6], yuv67); + _mm_store_si128((__m128i*)&slot_surface[dst + x + 8], yuv89); + _mm_store_si128((__m128i*)&slot_surface[dst + x + 10], yuv1011); + _mm_store_si128((__m128i*)&slot_surface[dst + x + 12], yuv1213); + _mm_store_si128((__m128i*)&slot_surface[dst + x + 14], yuv1415); + + // clang-format on } - break; } - default: - ASSERT(false); - break; - } - host1x.GMMU().WriteBlock(output_surface_chroma_address, chroma_buffer.data(), - chroma_buffer.size()); +#elif defined(ARCHITECTURE_arm64) + DecodeLinear(); +#else + DecodeLinear(); +#endif } -} // namespace Host1x +template +void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span offsets, + std::shared_ptr frame) { + if constexpr (!Planar) { + ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame)); + return; + } + const auto out_luma_width{slot.surface_config.slot_surface_width + 1}; + const auto out_luma_height{(slot.surface_config.slot_surface_height + 1) * 2}; + const auto out_luma_stride{out_luma_width}; -} // namespace Tegra + slot_surface.resize_destructive(out_luma_width * out_luma_height); + + const auto in_luma_width{std::min(frame->GetWidth(), static_cast(out_luma_width))}; + [[maybe_unused]] const auto in_luma_height{ + std::min(frame->GetHeight(), static_cast(out_luma_height))}; + const auto in_luma_stride{frame->GetStride(0)}; + + [[maybe_unused]] const auto in_chroma_width{(frame->GetWidth() + 1) / 2}; + const auto in_chroma_height{(frame->GetHeight() + 1) / 2}; + const auto in_chroma_stride{frame->GetStride(1)}; + + const auto* luma_buffer{frame->GetPlane(0)}; + const auto* chroma_u_buffer{frame->GetPlane(1)}; + const auto* chroma_v_buffer{frame->GetPlane(2)}; + + LOG_TRACE(HW_GPU, + "Reading frame" + "\ninput luma {}x{} stride {} chroma {}x{} stride {}\n" + "output luma {}x{} stride {} chroma {}x{} stride {}", + in_luma_width, in_luma_height, in_luma_stride, in_chroma_width, in_chroma_height, + in_chroma_stride, out_luma_width, out_luma_height, out_luma_stride, + out_luma_width / 2, out_luma_height / 2, out_luma_stride); + + [[maybe_unused]] auto DecodeLinear = [&]() { + auto DecodeBobField = [&]() { + const auto alpha{static_cast(slot.config.planar_alpha.Value())}; + + for (s32 y = static_cast(TopField == false); y < in_chroma_height * 2; y += 2) { + const auto src_luma{y * in_luma_stride}; + const auto src_chroma{(y / 2) * in_chroma_stride}; + const auto dst{y * out_luma_stride}; + for (s32 x = 0; x < in_luma_width; x++) { + slot_surface[dst + x].r = static_cast(luma_buffer[src_luma + x] << 2); + if constexpr (Planar) { + slot_surface[dst + x].g = + static_cast(chroma_u_buffer[src_chroma + x / 2] << 2); + slot_surface[dst + x].b = + static_cast(chroma_v_buffer[src_chroma + x / 2] << 2); + } else { + slot_surface[dst + x].g = + static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); + slot_surface[dst + x].b = + static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); + } + slot_surface[dst + x].a = alpha; + } + + s32 other_line{}; + if constexpr (TopField) { + other_line = (y + 1) * out_luma_stride; + } else { + other_line = (y - 1) * out_luma_stride; + } + std::memcpy(&slot_surface[other_line], &slot_surface[dst], + 
out_luma_width * sizeof(Pixel)); + } + }; + + switch (slot.config.deinterlace_mode) { + case DXVAHD_DEINTERLACE_MODE_PRIVATE::WEAVE: + // Due to the fact that we do not write to memory in nvdec, we cannot use Weave as it + // relies on the previous frame. + DecodeBobField(); + break; + case DXVAHD_DEINTERLACE_MODE_PRIVATE::BOB_FIELD: + DecodeBobField(); + break; + case DXVAHD_DEINTERLACE_MODE_PRIVATE::DISI1: + // Due to the fact that we do not write to memory in nvdec, we cannot use DISI1 as it + // relies on previous/next frames. + DecodeBobField(); + break; + default: + UNIMPLEMENTED_MSG("Deinterlace mode {} not implemented!", + static_cast(slot.config.deinterlace_mode.Value())); + break; + } + }; + + DecodeLinear(); +} + +template +void Vic::ReadY8__V8U8_N420(const SlotStruct& slot, std::span offsets, + std::shared_ptr frame) { + switch (slot.config.frame_format) { + case DXVAHD_FRAME_FORMAT::PROGRESSIVE: + ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame)); + break; + case DXVAHD_FRAME_FORMAT::TOP_FIELD: + ReadInterlacedY8__V8U8_N420(slot, offsets, std::move(frame)); + break; + case DXVAHD_FRAME_FORMAT::BOTTOM_FIELD: + ReadInterlacedY8__V8U8_N420(slot, offsets, std::move(frame)); + break; + default: + LOG_ERROR(HW_GPU, "Unknown deinterlace format {}", + static_cast(slot.config.frame_format.Value())); + break; + } +} + +void Vic::Blend(const ConfigStruct& config, const SlotStruct& slot) { + constexpr auto add_one([](u32 v) -> u32 { return v != 0 ? v + 1 : 0; }); + + auto source_left{add_one(static_cast(slot.config.source_rect_left.Value()))}; + auto source_right{add_one(static_cast(slot.config.source_rect_right.Value()))}; + auto source_top{add_one(static_cast(slot.config.source_rect_top.Value()))}; + auto source_bottom{add_one(static_cast(slot.config.source_rect_bottom.Value()))}; + + const auto dest_left{add_one(static_cast(slot.config.dest_rect_left.Value()))}; + const auto dest_right{add_one(static_cast(slot.config.dest_rect_right.Value()))}; + const auto dest_top{add_one(static_cast(slot.config.dest_rect_top.Value()))}; + const auto dest_bottom{add_one(static_cast(slot.config.dest_rect_bottom.Value()))}; + + auto rect_left{add_one(config.output_config.target_rect_left.Value())}; + auto rect_right{add_one(config.output_config.target_rect_right.Value())}; + auto rect_top{add_one(config.output_config.target_rect_top.Value())}; + auto rect_bottom{add_one(config.output_config.target_rect_bottom.Value())}; + + rect_left = std::max(rect_left, dest_left); + rect_right = std::min(rect_right, dest_right); + rect_top = std::max(rect_top, dest_top); + rect_bottom = std::min(rect_bottom, dest_bottom); + + source_left = std::max(source_left, rect_left); + source_right = std::min(source_right, rect_right); + source_top = std::max(source_top, rect_top); + source_bottom = std::min(source_bottom, rect_bottom); + + if (source_left >= source_right || source_top >= source_bottom) { + return; + } + + const auto out_surface_width{config.output_surface_config.out_surface_width + 1}; + [[maybe_unused]] const auto out_surface_height{config.output_surface_config.out_surface_height + + 1}; + const auto in_surface_width{slot.surface_config.slot_surface_width + 1}; + + source_bottom = std::min(source_bottom, out_surface_height); + source_right = std::min(source_right, out_surface_width); + + // TODO Alpha blending. No games I've seen use more than a single surface or supply an alpha + // below max, so it's ignored for now. 
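The colour-matrix path below treats its intermediate results as fixed point with 8 fractional bits, bringing them back to integer range with a final shift by 8 (the "S12.8" note in the SIMD comments). A tiny illustration of that fixed-point convention, with made-up coefficient values:

#include <cstdint>

// Multiplying by a .8 fixed-point coefficient and shifting right by 8
// recovers an integer result; 256 represents 1.0, 128 represents 0.5.
constexpr std::int32_t FixedMul(std::int32_t value, std::int32_t coeff_dot8) {
    return (value * coeff_dot8) >> 8;
}

static_assert(FixedMul(100, 256) == 100); // x * 1.0
static_assert(FixedMul(100, 128) == 50);  // x * 0.5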
+ + if (!slot.color_matrix.matrix_enable) { + const auto copy_width = std::min(source_right - source_left, rect_right - rect_left); + + for (u32 y = source_top; y < source_bottom; y++) { + const auto dst_line = y * out_surface_width; + const auto src_line = y * in_surface_width; + std::memcpy(&output_surface[dst_line + rect_left], + &slot_surface[src_line + source_left], copy_width * sizeof(Pixel)); + } + } else { + // clang-format off + // Colour conversion is enabled, this is a 3x4 * 4x1 matrix multiplication, resulting in a 3x1 matrix. + // | r0c0 r0c1 r0c2 r0c3 | | R | | R | + // | r1c0 r1c1 r1c2 r1c3 | * | G | = | G | + // | r2c0 r2c1 r2c2 r2c3 | | B | | B | + // | 1 | + // clang-format on + + [[maybe_unused]] auto DecodeLinear = [&]() { + const auto r0c0 = static_cast(slot.color_matrix.matrix_coeff00.Value()); + const auto r0c1 = static_cast(slot.color_matrix.matrix_coeff01.Value()); + const auto r0c2 = static_cast(slot.color_matrix.matrix_coeff02.Value()); + const auto r0c3 = static_cast(slot.color_matrix.matrix_coeff03.Value()); + const auto r1c0 = static_cast(slot.color_matrix.matrix_coeff10.Value()); + const auto r1c1 = static_cast(slot.color_matrix.matrix_coeff11.Value()); + const auto r1c2 = static_cast(slot.color_matrix.matrix_coeff12.Value()); + const auto r1c3 = static_cast(slot.color_matrix.matrix_coeff13.Value()); + const auto r2c0 = static_cast(slot.color_matrix.matrix_coeff20.Value()); + const auto r2c1 = static_cast(slot.color_matrix.matrix_coeff21.Value()); + const auto r2c2 = static_cast(slot.color_matrix.matrix_coeff22.Value()); + const auto r2c3 = static_cast(slot.color_matrix.matrix_coeff23.Value()); + + const auto shift = static_cast(slot.color_matrix.matrix_r_shift.Value()); + const auto clamp_min = static_cast(slot.config.soft_clamp_low.Value()); + const auto clamp_max = static_cast(slot.config.soft_clamp_high.Value()); + + auto MatMul = [&](const Pixel& in_pixel) -> std::tuple { + auto r = static_cast(in_pixel.r); + auto g = static_cast(in_pixel.g); + auto b = static_cast(in_pixel.b); + + r = in_pixel.r * r0c0 + in_pixel.g * r0c1 + in_pixel.b * r0c2; + g = in_pixel.r * r1c0 + in_pixel.g * r1c1 + in_pixel.b * r1c2; + b = in_pixel.r * r2c0 + in_pixel.g * r2c1 + in_pixel.b * r2c2; + + r >>= shift; + g >>= shift; + b >>= shift; + + r += r0c3; + g += r1c3; + b += r2c3; + + r >>= 8; + g >>= 8; + b >>= 8; + + return {r, g, b, static_cast(in_pixel.a)}; + }; + + for (u32 y = source_top; y < source_bottom; y++) { + const auto src{y * in_surface_width + source_left}; + const auto dst{y * out_surface_width + rect_left}; + for (u32 x = source_left; x < source_right; x++) { + auto [r, g, b, a] = MatMul(slot_surface[src + x]); + + r = std::clamp(r, clamp_min, clamp_max); + g = std::clamp(g, clamp_min, clamp_max); + b = std::clamp(b, clamp_min, clamp_max); + a = std::clamp(a, clamp_min, clamp_max); + + output_surface[dst + x] = {static_cast(r), static_cast(g), + static_cast(b), static_cast(a)}; + } + } + }; + +#if defined(ARCHITECTURE_x86_64) + if (!has_sse41) { + DecodeLinear(); + return; + } + + // Fill the columns, e.g + // c0 = [00 00 00 00] [r2c0 r2c0 r2c0 r2c0] [r1c0 r1c0 r1c0 r1c0] [r0c0 r0c0 r0c0 r0c0] + + const auto c0 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff20.Value()), + static_cast(slot.color_matrix.matrix_coeff10.Value()), + static_cast(slot.color_matrix.matrix_coeff00.Value())); + const auto c1 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff21.Value()), + static_cast(slot.color_matrix.matrix_coeff11.Value()), + 
static_cast(slot.color_matrix.matrix_coeff01.Value())); + const auto c2 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff22.Value()), + static_cast(slot.color_matrix.matrix_coeff12.Value()), + static_cast(slot.color_matrix.matrix_coeff02.Value())); + const auto c3 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff23.Value()), + static_cast(slot.color_matrix.matrix_coeff13.Value()), + static_cast(slot.color_matrix.matrix_coeff03.Value())); + + // Set the matrix right-shift as a single element. + const auto shift = + _mm_set_epi32(0, 0, 0, static_cast(slot.color_matrix.matrix_r_shift.Value())); + + // Set every 16-bit value to the soft clamp values for clamping every 16-bit channel. + const auto clamp_min = _mm_set1_epi16(static_cast(slot.config.soft_clamp_low.Value())); + const auto clamp_max = + _mm_set1_epi16(static_cast(slot.config.soft_clamp_high.Value())); + + // clang-format off + + auto MatMul = [](__m128i& p, const __m128i& col0, const __m128i& col1, const __m128i& col2, + const __m128i& col3, const __m128i& trm_shift) -> __m128i { + // Duplicate the 32-bit channels, e.g + // p = [AA AA AA AA] [BB BB BB BB] [GG GG GG GG] [RR RR RR RR] + // -> + // r = [RR4 RR4 RR4 RR4] [RR3 RR3 RR3 RR3] [RR2 RR2 RR2 RR2] [RR1 RR1 RR1 RR1] + auto r = _mm_shuffle_epi32(p, 0x0); + auto g = _mm_shuffle_epi32(p, 0x55); + auto b = _mm_shuffle_epi32(p, 0xAA); + + // Multiply the rows and columns c0 * r, c1 * g, c2 * b, e.g + // r = [RR4 RR4 RR4 RR4] [ RR3 RR3 RR3 RR3] [ RR2 RR2 RR2 RR2] [ RR1 RR1 RR1 RR1] + // * + // c0 = [ 00 00 00 00] [r2c0 r2c0 r2c0 r2c0] [r1c0 r1c0 r1c0 r1c0] [r0c0 r0c0 r0c0 r0c0] + r = _mm_mullo_epi32(r, col0); + g = _mm_mullo_epi32(g, col1); + b = _mm_mullo_epi32(b, col2); + + // Add them all together vertically, such that the 32-bit element + // out[0] = (r[0] * c0[0]) + (g[0] * c1[0]) + (b[0] * c2[0]) + auto out = _mm_add_epi32(_mm_add_epi32(r, g), b); + + // Shift the result by r_shift, as the TRM says + out = _mm_sra_epi32(out, trm_shift); + + // Add the final column. Because the 4x1 matrix has this row as 1, there's no need to + // multiply by it, and as per the TRM this column ignores r_shift, so it's just added + // here after shifting. + out = _mm_add_epi32(out, col3); + + // Shift the result back from S12.8 to integer values + return _mm_srai_epi32(out, 8); + }; + + for (u32 y = source_top; y < source_bottom; y++) { + const auto src{y * in_surface_width + source_left}; + const auto dst{y * out_surface_width + rect_left}; + for (u32 x = source_left; x < source_right; x += 8) { + // clang-format off + // Prefetch the next iteration's memory + _mm_prefetch((const char*)&slot_surface[src + x + 8], _MM_HINT_T0); + + // Load in pixels + // p01 = [AA AA] [BB BB] [GG GG] [RR RR] [AA AA] [BB BB] [GG GG] [RR RR] + auto p01 = _mm_load_si128((__m128i*)&slot_surface[src + x + 0]); + auto p23 = _mm_load_si128((__m128i*)&slot_surface[src + x + 2]); + auto p45 = _mm_load_si128((__m128i*)&slot_surface[src + x + 4]); + auto p67 = _mm_load_si128((__m128i*)&slot_surface[src + x + 6]); + + // Convert the 16-bit channels into 32-bit (unsigned), as the matrix values are + // 32-bit and to avoid overflow. 
+ // p01 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] + // -> + // p01_lo = [001 001 AA1 AA1] [001 001 BB1 BB1] [001 001 GG1 GG1] [001 001 RR1 RR1] + // p01_hi = [002 002 AA2 AA2] [002 002 BB2 BB2] [002 002 GG2 GG2] [002 002 RR2 RR2] + auto p01_lo = _mm_cvtepu16_epi32(p01); + auto p01_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p01, 8)); + auto p23_lo = _mm_cvtepu16_epi32(p23); + auto p23_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p23, 8)); + auto p45_lo = _mm_cvtepu16_epi32(p45); + auto p45_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p45, 8)); + auto p67_lo = _mm_cvtepu16_epi32(p67); + auto p67_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p67, 8)); + + // Matrix multiply the pixel, doing the colour conversion. + auto out0 = MatMul(p01_lo, c0, c1, c2, c3, shift); + auto out1 = MatMul(p01_hi, c0, c1, c2, c3, shift); + auto out2 = MatMul(p23_lo, c0, c1, c2, c3, shift); + auto out3 = MatMul(p23_hi, c0, c1, c2, c3, shift); + auto out4 = MatMul(p45_lo, c0, c1, c2, c3, shift); + auto out5 = MatMul(p45_hi, c0, c1, c2, c3, shift); + auto out6 = MatMul(p67_lo, c0, c1, c2, c3, shift); + auto out7 = MatMul(p67_hi, c0, c1, c2, c3, shift); + + // Pack the 32-bit channel pixels back into 16-bit using unsigned saturation + // out0 = [001 001 AA1 AA1] [001 001 BB1 BB1] [001 001 GG1 GG1] [001 001 RR1 RR1] + // out1 = [002 002 AA2 AA2] [002 002 BB2 BB2] [002 002 GG2 GG2] [002 002 RR2 RR2] + // -> + // done0 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] + auto done0 = _mm_packus_epi32(out0, out1); + auto done1 = _mm_packus_epi32(out2, out3); + auto done2 = _mm_packus_epi32(out4, out5); + auto done3 = _mm_packus_epi32(out6, out7); + + // Blend the original alpha back into the pixel, as the matrix multiply gives us a + // 3-channel output, not 4. + // 0x88 = b10001000, taking RGB from the first argument, A from the second argument. + // done0 = [002 002] [BB2 BB2] [GG2 GG2] [RR2 RR2] [001 001] [BB1 BB1] [GG1 GG1] [RR1 RR1] + // -> + // done0 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] + done0 = _mm_blend_epi16(done0, p01, 0x88); + done1 = _mm_blend_epi16(done1, p23, 0x88); + done2 = _mm_blend_epi16(done2, p45, 0x88); + done3 = _mm_blend_epi16(done3, p67, 0x88); + + // Clamp the 16-bit channels to the soft-clamp min/max. + done0 = _mm_max_epu16(done0, clamp_min); + done1 = _mm_max_epu16(done1, clamp_min); + done2 = _mm_max_epu16(done2, clamp_min); + done3 = _mm_max_epu16(done3, clamp_min); + + done0 = _mm_min_epu16(done0, clamp_max); + done1 = _mm_min_epu16(done1, clamp_max); + done2 = _mm_min_epu16(done2, clamp_max); + done3 = _mm_min_epu16(done3, clamp_max); + + // Store the pixels to the output surface. 
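The MatMul helper above is easier to follow as scalar arithmetic; here is a minimal sketch (not part of the patch, with invented coefficients, and assuming the same 8-fractional-bit fixed point that the 4x1 luma key matrix in vic.h documents as S12.8):

#include <cstdint>
#include <cstdio>

int main() {
    // One output row of the 3x4 matrix, in fixed point with 8 fractional bits (1.0 == 256).
    const int32_t c0 = 256, c1 = 0, c2 = 0, c3 = 0;
    const int32_t r_shift = 0; // matrix_r_shift from the slot's colour matrix

    // A 10-bit input sample, as produced by the read functions (8-bit video shifted left by 2).
    const int32_t in_r = 940, in_g = 512, in_b = 512;

    int32_t out = in_r * c0 + in_g * c1 + in_b * c2; // integer * fixed-point products
    out >>= r_shift;                                 // per-matrix shift, applied before ...
    out += c3;                                       // ... the constant column, which is not shifted
    out >>= 8;                                       // drop the 8 fractional bits

    std::printf("converted channel = %d\n", out); // prints 940 for this identity-like row
}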
+ _mm_store_si128((__m128i*)&output_surface[dst + x + 0], done0); + _mm_store_si128((__m128i*)&output_surface[dst + x + 2], done1); + _mm_store_si128((__m128i*)&output_surface[dst + x + 4], done2); + _mm_store_si128((__m128i*)&output_surface[dst + x + 6], done3); + + } + } + // clang-format on +#elif defined(ARCHITECTURE_arm64) + DecodeLinear(); +#else + DecodeLinear(); +#endif + } +} + +void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { + constexpr u32 BytesPerPixel = 1; + + auto surface_width{output_surface_config.out_surface_width + 1}; + auto surface_height{output_surface_config.out_surface_height + 1}; + const auto surface_stride{surface_width}; + + const auto out_luma_width = output_surface_config.out_luma_width + 1; + const auto out_luma_height = output_surface_config.out_luma_height + 1; + const auto out_luma_stride = Common::AlignUp(out_luma_width * BytesPerPixel, 0x10); + const auto out_luma_size = out_luma_height * out_luma_stride; + + const auto out_chroma_width = output_surface_config.out_chroma_width + 1; + const auto out_chroma_height = output_surface_config.out_chroma_height + 1; + const auto out_chroma_stride = Common::AlignUp(out_chroma_width * BytesPerPixel * 2, 0x10); + const auto out_chroma_size = out_chroma_height * out_chroma_stride; + + surface_width = std::min(surface_width, out_luma_width); + surface_height = std::min(surface_height, out_luma_height); + + [[maybe_unused]] auto DecodeLinear = [&](std::span out_luma, std::span out_chroma) { + for (u32 y = 0; y < surface_height; ++y) { + const auto src_luma = y * surface_stride; + const auto dst_luma = y * out_luma_stride; + const auto src_chroma = y * surface_stride; + const auto dst_chroma = (y / 2) * out_chroma_stride; + for (u32 x = 0; x < surface_width; x += 2) { + out_luma[dst_luma + x + 0] = + static_cast(output_surface[src_luma + x + 0].r >> 2); + out_luma[dst_luma + x + 1] = + static_cast(output_surface[src_luma + x + 1].r >> 2); + out_chroma[dst_chroma + x + 0] = + static_cast(output_surface[src_chroma + x].g >> 2); + out_chroma[dst_chroma + x + 1] = + static_cast(output_surface[src_chroma + x].b >> 2); + } + } + }; + + auto Decode = [&](std::span out_luma, std::span out_chroma) { +#if defined(ARCHITECTURE_x86_64) + if (!has_sse41) { + DecodeLinear(out_luma, out_chroma); + return; + } + + // luma_mask = [00 00] [00 00] [00 00] [FF FF] [00 00] [00 00] [00 00] [FF FF] + const auto luma_mask = _mm_set_epi16(0, 0, 0, -1, 0, 0, 0, -1); + + for (u32 y = 0; y < surface_height; ++y) { + const auto src = y * surface_stride; + const auto dst_luma = y * out_luma_stride; + const auto dst_chroma = (y / 2) * out_chroma_stride; + for (u32 x = 0; x < surface_width; x += 16) { + // clang-format off + // Prefetch the next cache lines, 2 per iteration + _mm_prefetch((const char*)&output_surface[src + x + 16], _MM_HINT_T0); + _mm_prefetch((const char*)&output_surface[src + x + 24], _MM_HINT_T0); + + // Load the 64-bit pixels, 2 per variable. 
+ auto pixel01 = _mm_load_si128((__m128i*)&output_surface[src + x + 0]); + auto pixel23 = _mm_load_si128((__m128i*)&output_surface[src + x + 2]); + auto pixel45 = _mm_load_si128((__m128i*)&output_surface[src + x + 4]); + auto pixel67 = _mm_load_si128((__m128i*)&output_surface[src + x + 6]); + auto pixel89 = _mm_load_si128((__m128i*)&output_surface[src + x + 8]); + auto pixel1011 = _mm_load_si128((__m128i*)&output_surface[src + x + 10]); + auto pixel1213 = _mm_load_si128((__m128i*)&output_surface[src + x + 12]); + auto pixel1415 = _mm_load_si128((__m128i*)&output_surface[src + x + 14]); + + // Split out the luma of each pixel using the luma_mask above. + // pixel01 = [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] [LL1 LL1] + // -> + // l01 = [002 002] [002 002] [002 002] [LL2 LL2] [001 001] [001 001] [001 001] [LL1 LL1] + auto l01 = _mm_and_si128(pixel01, luma_mask); + auto l23 = _mm_and_si128(pixel23, luma_mask); + auto l45 = _mm_and_si128(pixel45, luma_mask); + auto l67 = _mm_and_si128(pixel67, luma_mask); + auto l89 = _mm_and_si128(pixel89, luma_mask); + auto l1011 = _mm_and_si128(pixel1011, luma_mask); + auto l1213 = _mm_and_si128(pixel1213, luma_mask); + auto l1415 = _mm_and_si128(pixel1415, luma_mask); + + // Pack 32-bit elements from 2 registers down into 16-bit elements in 1 register. + // l01 = [002 002 002 002] [002 002 LL2 LL2] [001 001 001 001] [001 001 LL1 LL1] + // l23 = [004 004 004 004] [004 004 LL4 LL4] [003 003 003 003] [003 003 LL3 LL3] + // -> + // l0123 = [004 004] [LL4 LL4] [003 003] [LL3 LL3] [002 002] [LL2 LL2] [001 001] [LL1 LL1] + auto l0123 = _mm_packus_epi32(l01, l23); + auto l4567 = _mm_packus_epi32(l45, l67); + auto l891011 = _mm_packus_epi32(l89, l1011); + auto l12131415 = _mm_packus_epi32(l1213, l1415); + + // Pack 32-bit elements from 2 registers down into 16-bit elements in 1 register. + // l0123 = [004 004 LL4 LL4] [003 003 LL3 LL3] [002 002 LL2 LL2] [001 001 LL1 LL1] + // l4567 = [008 008 LL8 LL8] [007 007 LL7 LL7] [006 006 LL6 LL6] [005 005 LL5 LL5] + // -> + // luma_lo = [LL8 LL8] [LL7 LL7] [LL6 LL6] [LL5 LL5] [LL4 LL4] [LL3 LL3] [LL2 LL2] [LL1 LL1] + auto luma_lo = _mm_packus_epi32(l0123, l4567); + auto luma_hi = _mm_packus_epi32(l891011, l12131415); + + // Right-shift the 16-bit elements by 2, un-doing the left shift by 2 on read + // and bringing the range back to 8-bit. + luma_lo = _mm_srli_epi16(luma_lo, 2); + luma_hi = _mm_srli_epi16(luma_hi, 2); + + // Pack with unsigned saturation the 16-bit values in 2 registers into 8-bit values in 1 register. + // luma_lo = [LL8 LL8] [LL7 LL7] [LL6 LL6] [LL5 LL5] [LL4 LL4] [LL3 LL3] [LL2 LL2] [LL1 LL1] + // luma_hi = [LL16 LL16] [LL15 LL15] [LL14 LL14] [LL13 LL13] [LL12 LL12] [LL11 LL11] [LL10 LL10] [LL9 LL9] + // -> + // luma = [LL16] [LL15] [LL14] [LL13] [LL12] [LL11] [LL10] [LL9] [LL8] [LL7] [LL6] [LL5] [LL4] [LL3] [LL2] [LL1] + auto luma = _mm_packus_epi16(luma_lo, luma_hi); + + // Store the 16 bytes of luma + _mm_store_si128((__m128i*)&out_luma[dst_luma + x], luma); + + if (y % 2 == 0) { + // Chroma, done every other line as it's half the height of luma. + + // Shift the register right by 2 bytes (not bits), to kick out the 16-bit luma. + // We can do this instead of &'ing a mask and then shifting. 
+                    // pixel01 = [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] [LL1 LL1]
+                    // ->
+                    // c01     = [ 00 00] [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1]
+                    auto c01 = _mm_srli_si128(pixel01, 2);
+                    auto c23 = _mm_srli_si128(pixel23, 2);
+                    auto c45 = _mm_srli_si128(pixel45, 2);
+                    auto c67 = _mm_srli_si128(pixel67, 2);
+                    auto c89 = _mm_srli_si128(pixel89, 2);
+                    auto c1011 = _mm_srli_si128(pixel1011, 2);
+                    auto c1213 = _mm_srli_si128(pixel1213, 2);
+                    auto c1415 = _mm_srli_si128(pixel1415, 2);
+
+                    // Interleave the lower 8 bytes as 32-bit elements from 2 registers into 1 register.
+                    // This has the effect of skipping every other chroma value horizontally;
+                    // notice the high pixels UU2/UU4 are skipped.
+                    // This is intended, as N420 chroma width is half the luma width.
+                    // c01   = [ 00 00 AA2 AA2] [VV2 VV2 UU2 UU2] [LL2 LL2 AA1 AA1] [VV1 VV1 UU1 UU1]
+                    // c23   = [ 00 00 AA4 AA4] [VV4 VV4 UU4 UU4] [LL4 LL4 AA3 AA3] [VV3 VV3 UU3 UU3]
+                    // ->
+                    // c0123 = [LL4 LL4 AA3 AA3] [LL2 LL2 AA1 AA1] [VV3 VV3 UU3 UU3] [VV1 VV1 UU1 UU1]
+                    auto c0123 = _mm_unpacklo_epi32(c01, c23);
+                    auto c4567 = _mm_unpacklo_epi32(c45, c67);
+                    auto c891011 = _mm_unpacklo_epi32(c89, c1011);
+                    auto c12131415 = _mm_unpacklo_epi32(c1213, c1415);
+
+                    // Interleave the low 64-bit elements from 2 registers into 1.
+                    // c0123     = [LL4 LL4 AA3 AA3 LL2 LL2 AA1 AA1] [VV3 VV3 UU3 UU3 VV1 VV1 UU1 UU1]
+                    // c4567     = [LL8 LL8 AA7 AA7 LL6 LL6 AA5 AA5] [VV7 VV7 UU7 UU7 VV5 VV5 UU5 UU5]
+                    // ->
+                    // chroma_lo = [VV7 VV7 UU7 UU7 VV5 VV5 UU5 UU5] [VV3 VV3 UU3 UU3 VV1 VV1 UU1 UU1]
+                    auto chroma_lo = _mm_unpacklo_epi64(c0123, c4567);
+                    auto chroma_hi = _mm_unpacklo_epi64(c891011, c12131415);
+
+                    // Right-shift the 16-bit elements by 2, un-doing the left shift by 2 on read
+                    // and bringing the range back to 8-bit.
+                    chroma_lo = _mm_srli_epi16(chroma_lo, 2);
+                    chroma_hi = _mm_srli_epi16(chroma_hi, 2);
+
+                    // Pack with unsigned saturation the 16-bit elements from 2 registers into 8-bit elements in 1 register.
+                    // chroma_lo = [ VV7 VV7] [ UU7 UU7] [ VV5 VV5] [ UU5 UU5] [ VV3 VV3] [ UU3 UU3] [VV1 VV1] [UU1 UU1]
+                    // chroma_hi = [VV15 VV15] [UU15 UU15] [VV13 VV13] [UU13 UU13] [VV11 VV11] [UU11 UU11] [VV9 VV9] [UU9 UU9]
+                    // ->
+                    // chroma    = [VV15] [UU15] [VV13] [UU13] [VV11] [UU11] [VV9] [UU9] [VV7] [UU7] [VV5] [UU5] [VV3] [UU3] [VV1] [UU1]
+                    auto chroma = _mm_packus_epi16(chroma_lo, chroma_hi);
+
+                    // Store the 16 bytes of chroma.
+                    _mm_store_si128((__m128i*)&out_chroma[dst_chroma + x + 0], chroma);
+                }
+
+                // clang-format on
+            }
+        }
+#elif defined(ARCHITECTURE_arm64)
+        DecodeLinear(out_luma, out_chroma);
+#else
+        DecodeLinear(out_luma, out_chroma);
+#endif
+    };
+
+    switch (output_surface_config.out_block_kind) {
+    case BLK_KIND::GENERIC_16Bx2: {
+        const u32 block_height = static_cast(output_surface_config.out_block_height);
+        const auto out_luma_swizzle_size = Texture::CalculateSize(
+            true, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0);
+        const auto out_chroma_swizzle_size = Texture::CalculateSize(
+            true, BytesPerPixel * 2, out_chroma_width, out_chroma_height, 1, block_height, 0);
+
+        LOG_TRACE(
+            HW_GPU,
+            "Writing Y8__V8U8_N420 swizzled frame\n"
+            "\tinput surface {}x{} stride {} size 0x{:X}\n"
+            "\toutput luma {}x{} stride {} size 0x{:X} block height {} swizzled size 0x{:X}\n"
+            "\toutput chroma {}x{} stride {} size 0x{:X} block height {} swizzled size 0x{:X}",
+            surface_width, surface_height, surface_stride * BytesPerPixel,
+            surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height,
+            out_luma_stride, out_luma_size, block_height, out_luma_swizzle_size, out_chroma_width,
+            out_chroma_height, out_chroma_stride, out_chroma_size, block_height,
+            out_chroma_swizzle_size);
+
+        luma_scratch.resize_destructive(out_luma_size);
+        chroma_scratch.resize_destructive(out_chroma_size);
+
+        Decode(luma_scratch, chroma_scratch);
+
+        Tegra::Memory::GpuGuestMemoryScoped out_luma(
+            memory_manager, regs.output_surface.luma.Address(), out_luma_swizzle_size,
+            &swizzle_scratch);
+
+        if (block_height == 1) {
+            SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride,
+                           out_luma_height);
+        } else {
+            Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width,
+                                    out_luma_height, 1, block_height, 0, 1);
+        }
+
+        Tegra::Memory::GpuGuestMemoryScoped
+            out_chroma(memory_manager, regs.output_surface.chroma_u.Address(),
+                       out_chroma_swizzle_size, &swizzle_scratch);
+
+        if (block_height == 1) {
+            SwizzleSurface(out_chroma, out_chroma_stride, chroma_scratch, out_chroma_stride,
+                           out_chroma_height);
+        } else {
+            Texture::SwizzleTexture(out_chroma, chroma_scratch, BytesPerPixel, out_chroma_width,
+                                    out_chroma_height, 1, block_height, 0, 1);
+        }
+    } break;
+    case BLK_KIND::PITCH: {
+        LOG_TRACE(
+            HW_GPU,
+            "Writing Y8__V8U8_N420 pitch frame\n"
+            "\tinput surface {}x{} stride {} size 0x{:X}\n"
+            "\toutput luma {}x{} stride {} size 0x{:X}\n"
+            "\toutput chroma {}x{} stride {} size 0x{:X}",
+            surface_width, surface_height, surface_stride * BytesPerPixel,
+            surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height,
+            out_luma_stride, out_luma_size, out_chroma_width, out_chroma_height, out_chroma_stride,
+            out_chroma_size);
+
+        // Unfortunately, due to a driver or game bug, the chroma address may not be spaced
+        // appropriately from the luma, so a luma plane of size out_stride * height runs into the
+        // top of the chroma buffer. That rules out an optimisation where we could create guest
+        // spans and decode directly into game memory, avoiding the copy from scratch buffers.
+        // Because of this bug we must write the luma first and the chroma afterwards, so the
+        // chroma overwrites the excess luma.
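As a concrete picture of the overlap described above (not part of the patch; the addresses and sizes are invented), the ordering only matters when the guest places the chroma plane inside the luma allocation:

#include <cstdint>
#include <cstdio>

int main() {
    // A pitch-linear 1280x720 Y8 luma plane; 1280 is already a multiple of 16.
    const uint64_t luma_addr = 0x0;
    const uint64_t luma_size = 1280ULL * 720;       // out_luma_stride * out_luma_height
    const uint64_t chroma_addr = luma_size - 0x800; // placed too close by the guest

    // Whichever plane is written last wins in the overlapping region, so the luma goes out
    // first and the chroma second, which is the order the WriteBlock calls below use.
    const bool overlaps = chroma_addr < luma_addr + luma_size;
    std::printf("planes %s\n", overlaps ? "overlap: write luma, then chroma"
                                        : "are disjoint: either order works");
}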
+        luma_scratch.resize_destructive(out_luma_size);
+        chroma_scratch.resize_destructive(out_chroma_size);
+
+        Decode(luma_scratch, chroma_scratch);
+
+        memory_manager.WriteBlock(regs.output_surface.luma.Address(), luma_scratch.data(),
+                                  out_luma_size);
+        memory_manager.WriteBlock(regs.output_surface.chroma_u.Address(), chroma_scratch.data(),
+                                  out_chroma_size);
+    } break;
+    default:
+        UNREACHABLE();
+        break;
+    }
+}
+
+template
+void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) {
+    constexpr u32 BytesPerPixel = 4;
+
+    auto surface_width{output_surface_config.out_surface_width + 1};
+    auto surface_height{output_surface_config.out_surface_height + 1};
+    const auto surface_stride{surface_width};
+
+    const auto out_luma_width = output_surface_config.out_luma_width + 1;
+    const auto out_luma_height = output_surface_config.out_luma_height + 1;
+    const auto out_luma_stride = Common::AlignUp(out_luma_width * BytesPerPixel, 0x10);
+    const auto out_luma_size = out_luma_height * out_luma_stride;
+
+    surface_width = std::min(surface_width, out_luma_width);
+    surface_height = std::min(surface_height, out_luma_height);
+
+    [[maybe_unused]] auto DecodeLinear = [&](std::span out_buffer) {
+        for (u32 y = 0; y < surface_height; y++) {
+            const auto src = y * surface_stride;
+            const auto dst = y * out_luma_stride;
+            for (u32 x = 0; x < surface_width; x++) {
+                if constexpr (Format == VideoPixelFormat::A8R8G8B8) {
+                    out_buffer[dst + x * 4 + 0] = static_cast(output_surface[src + x].b >> 2);
+                    out_buffer[dst + x * 4 + 1] = static_cast(output_surface[src + x].g >> 2);
+                    out_buffer[dst + x * 4 + 2] = static_cast(output_surface[src + x].r >> 2);
+                    out_buffer[dst + x * 4 + 3] = static_cast(output_surface[src + x].a >> 2);
+                } else {
+                    out_buffer[dst + x * 4 + 0] = static_cast(output_surface[src + x].r >> 2);
+                    out_buffer[dst + x * 4 + 1] = static_cast(output_surface[src + x].g >> 2);
+                    out_buffer[dst + x * 4 + 2] = static_cast(output_surface[src + x].b >> 2);
+                    out_buffer[dst + x * 4 + 3] = static_cast(output_surface[src + x].a >> 2);
+                }
+            }
+        }
+    };
+
+    auto Decode = [&](std::span out_buffer) {
+#if defined(ARCHITECTURE_x86_64)
+        if (!has_sse41) {
+            DecodeLinear(out_buffer);
+            return;
+        }
+
+        for (u32 y = 0; y < surface_height; y++) {
+            const auto src = y * surface_stride;
+            const auto dst = y * out_luma_stride;
+            for (u32 x = 0; x < surface_width; x += 16) {
+                // clang-format off
+                // Prefetch the next 2 cache lines
+                _mm_prefetch((const char*)&output_surface[src + x + 16], _MM_HINT_T0);
+                _mm_prefetch((const char*)&output_surface[src + x + 24], _MM_HINT_T0);
+
+                // Load the pixels, 16-bit channels, 8 bytes per pixel, e.g.
+                // pixel01 = [AA AA BB BB GG GG RR RR AA AA BB BB GG GG RR RR]
+                auto pixel01 = _mm_load_si128((__m128i*)&output_surface[src + x + 0]);
+                auto pixel23 = _mm_load_si128((__m128i*)&output_surface[src + x + 2]);
+                auto pixel45 = _mm_load_si128((__m128i*)&output_surface[src + x + 4]);
+                auto pixel67 = _mm_load_si128((__m128i*)&output_surface[src + x + 6]);
+                auto pixel89 = _mm_load_si128((__m128i*)&output_surface[src + x + 8]);
+                auto pixel1011 = _mm_load_si128((__m128i*)&output_surface[src + x + 10]);
+                auto pixel1213 = _mm_load_si128((__m128i*)&output_surface[src + x + 12]);
+                auto pixel1415 = _mm_load_si128((__m128i*)&output_surface[src + x + 14]);
+
+                // Right-shift the channels by 2 to un-do the left shift on read and bring the range
+                // back to 8-bit.
+ pixel01 = _mm_srli_epi16(pixel01, 2); + pixel23 = _mm_srli_epi16(pixel23, 2); + pixel45 = _mm_srli_epi16(pixel45, 2); + pixel67 = _mm_srli_epi16(pixel67, 2); + pixel89 = _mm_srli_epi16(pixel89, 2); + pixel1011 = _mm_srli_epi16(pixel1011, 2); + pixel1213 = _mm_srli_epi16(pixel1213, 2); + pixel1415 = _mm_srli_epi16(pixel1415, 2); + + // Pack with unsigned saturation 16-bit channels from 2 registers into 8-bit channels in 1 register. + // pixel01 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] + // pixel23 = [AA4 AA4] [BB4 BB4] [GG4 GG4] [RR4 RR4] [AA3 AA3] [BB3 BB3] [GG3 GG3] [RR3 RR3] + // -> + // pixels0_lo = [AA4] [BB4] [GG4] [RR4] [AA3] [BB3] [GG3] [RR3] [AA2] [BB2] [GG2] [RR2] [AA1] [BB1] [GG1] [RR1] + auto pixels0_lo = _mm_packus_epi16(pixel01, pixel23); + auto pixels0_hi = _mm_packus_epi16(pixel45, pixel67); + auto pixels1_lo = _mm_packus_epi16(pixel89, pixel1011); + auto pixels1_hi = _mm_packus_epi16(pixel1213, pixel1415); + + if constexpr (Format == VideoPixelFormat::A8R8G8B8) { + const auto shuffle = + _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2); + + // Our pixels are ABGR (big-endian) by default, if ARGB is needed, we need to shuffle. + // pixels0_lo = [AA4 BB4 GG4 RR4] [AA3 BB3 GG3 RR3] [AA2 BB2 GG2 RR2] [AA1 BB1 GG1 RR1] + // -> + // pixels0_lo = [AA4 RR4 GG4 BB4] [AA3 RR3 GG3 BB3] [AA2 RR2 GG2 BB2] [AA1 RR1 GG1 BB1] + pixels0_lo = _mm_shuffle_epi8(pixels0_lo, shuffle); + pixels0_hi = _mm_shuffle_epi8(pixels0_hi, shuffle); + pixels1_lo = _mm_shuffle_epi8(pixels1_lo, shuffle); + pixels1_hi = _mm_shuffle_epi8(pixels1_hi, shuffle); + } + + // Store the pixels + _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 0], pixels0_lo); + _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 16], pixels0_hi); + _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 32], pixels1_lo); + _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 48], pixels1_hi); + + // clang-format on + } + } +#elif defined(ARCHITECTURE_arm64) + DecodeLinear(out_buffer); +#else + DecodeLinear(out_buffer); +#endif + }; + + switch (output_surface_config.out_block_kind) { + case BLK_KIND::GENERIC_16Bx2: { + const u32 block_height = static_cast(output_surface_config.out_block_height); + const auto out_swizzle_size = Texture::CalculateSize(true, BytesPerPixel, out_luma_width, + out_luma_height, 1, block_height, 0); + + LOG_TRACE( + HW_GPU, + "Writing ABGR swizzled frame\n" + "\tinput surface {}x{} stride {} size 0x{:X}\n" + "\toutput surface {}x{} stride {} size 0x{:X} block height {} swizzled size 0x{:X}", + surface_width, surface_height, surface_stride * BytesPerPixel, + surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height, + out_luma_stride, out_luma_size, block_height, out_swizzle_size); + + luma_scratch.resize_destructive(out_luma_size); + + Decode(luma_scratch); + + Tegra::Memory::GpuGuestMemoryScoped out_luma( + memory_manager, regs.output_surface.luma.Address(), out_swizzle_size, &swizzle_scratch); + + if (block_height == 1) { + SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride, + out_luma_height); + } else { + Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width, + out_luma_height, 1, block_height, 0, 1); + } + + } break; + case BLK_KIND::PITCH: { + LOG_TRACE(HW_GPU, + "Writing ABGR pitch frame\n" + "\tinput surface {}x{} stride {} size 0x{:X}" + "\toutput surface {}x{} stride {} size 0x{:X}", + surface_width, surface_height, surface_stride, + surface_stride * surface_height * 
BytesPerPixel, out_luma_width, out_luma_height, + out_luma_stride, out_luma_size); + + luma_scratch.resize_destructive(out_luma_size); + + Tegra::Memory::GpuGuestMemoryScoped out_luma( + memory_manager, regs.output_surface.luma.Address(), out_luma_size, &luma_scratch); + + Decode(out_luma); + } break; + default: + UNREACHABLE(); + break; + } +} + +} // namespace Tegra::Host1x diff --git a/src/video_core/host1x/vic.h b/src/video_core/host1x/vic.h index 2de77e71e..3245b261c 100755 --- a/src/video_core/host1x/vic.h +++ b/src/video_core/host1x/vic.h @@ -3,65 +3,646 @@ #pragma once +#include +#include #include +#include +#include #include "common/common_types.h" #include "common/scratch_buffer.h" +#include "video_core/cdma_pusher.h" -struct SwsContext; - -namespace Tegra { - -namespace Host1x { - +namespace Tegra::Host1x { class Host1x; class Nvdec; -union VicConfig; -class Vic { +struct Pixel { + u16 r; + u16 g; + u16 b; + u16 a; +}; + +// One underscore represents separate pixels. +// Double underscore represents separate planes. +// _N represents chroma subsampling, not a separate pixel. +enum class VideoPixelFormat : u32 { + A8 = 0, + L8 = 1, + A4L4 = 2, + L4A4 = 3, + R8 = 4, + A8L8 = 5, + L8A8 = 6, + R8G8 = 7, + G8R8 = 8, + B5G6R5 = 9, + R5G6B5 = 10, + B6G5R5 = 11, + R5G5B6 = 12, + A1B5G5R5 = 13, + A1R5G5B5 = 14, + B5G5R5A1 = 15, + R5G5B5A1 = 16, + A5B5G5R1 = 17, + A5R1G5B5 = 18, + B5G5R1A5 = 19, + R1G5B5A5 = 20, + X1B5G5R5 = 21, + X1R5G5B5 = 22, + B5G5R5X1 = 23, + R5G5B5X1 = 24, + A4B4G5R4 = 25, + A4R4G4B4 = 26, + B4G4R4A4 = 27, + R4G4B4A4 = 28, + B8G8R8 = 29, + R8G8B8 = 30, + A8B8G8R8 = 31, + A8R8G8B8 = 32, + B8G8R8A8 = 33, + R8G8B8A8 = 34, + X8B8G8R8 = 35, + X8R8G8B8 = 36, + B8G8R8X8 = 37, + R8G8B8X8 = 38, + A8B10G10R10 = 39, + A2R10G10B10 = 40, + B10G10R10A2 = 41, + R10G10B10A2 = 42, + A4P4 = 43, + P4A4 = 44, + P8A8 = 45, + A8P8 = 46, + P8 = 47, + P1 = 48, + U8V8 = 49, + V8U8 = 50, + A8Y8U8V8 = 51, + V8U8Y8A8 = 52, + Y8U8V8 = 53, + Y8V8U8 = 54, + U8V8Y8 = 55, + V8U8Y8 = 56, + Y8U8_Y8V8 = 57, + Y8V8_Y8U8 = 58, + U8Y8_V8Y8 = 59, + V8Y8_U8Y8 = 60, + Y8__U8V8_N444 = 61, + Y8__V8U8_N444 = 62, + Y8__U8V8_N422 = 63, + Y8__V8U8_N422 = 64, + Y8__U8V8_N422R = 65, + Y8__V8U8_N422R = 66, + Y8__U8V8_N420 = 67, + Y8__V8U8_N420 = 68, + Y8__U8__V8_N444 = 69, + Y8__U8__V8_N422 = 70, + Y8__U8__V8_N422R = 71, + Y8__U8__V8_N420 = 72, + U8 = 73, + V8 = 74, +}; + +struct Offset { + constexpr u32 Address() const noexcept { + return offset << 8; + } + +private: + u32 offset; +}; +static_assert(std::is_trivial_v, "Offset must be trivial"); +static_assert(sizeof(Offset) == 0x4, "Offset has the wrong size!"); + +struct PlaneOffsets { + Offset luma; + Offset chroma_u; + Offset chroma_v; +}; +static_assert(sizeof(PlaneOffsets) == 0xC, "PlaneOffsets has the wrong size!"); + +enum SurfaceIndex : u32 { + Current = 0, + Previous = 1, + Next = 2, + NextNoiseReduced = 3, + CurrentMotion = 4, + PreviousMotion = 5, + PreviousPreviousMotion = 6, + CombinedMotion = 7, +}; + +enum class DXVAHD_ALPHA_FILL_MODE : u32 { + OPAQUE = 0, + BACKGROUND = 1, + DESTINATION = 2, + SOURCE_STREAM = 3, + COMPOSITED = 4, + SOURCE_ALPHA = 5, +}; + +enum class DXVAHD_FRAME_FORMAT : u64 { + PROGRESSIVE = 0, + INTERLACED_TOP_FIELD_FIRST = 1, + INTERLACED_BOTTOM_FIELD_FIRST = 2, + TOP_FIELD = 3, + BOTTOM_FIELD = 4, + SUBPIC_PROGRESSIVE = 5, + SUBPIC_INTERLACED_TOP_FIELD_FIRST = 6, + SUBPIC_INTERLACED_BOTTOM_FIELD_FIRST = 7, + SUBPIC_TOP_FIELD = 8, + SUBPIC_BOTTOM_FIELD = 9, + TOP_FIELD_CHROMA_BOTTOM = 10, + BOTTOM_FIELD_CHROMA_TOP = 11, + 
SUBPIC_TOP_FIELD_CHROMA_BOTTOM = 12, + SUBPIC_BOTTOM_FIELD_CHROMA_TOP = 13, +}; + +enum class DXVAHD_DEINTERLACE_MODE_PRIVATE : u64 { + WEAVE = 0, + BOB_FIELD = 1, + BOB = 2, + NEWBOB = 3, + DISI1 = 4, + WEAVE_LUMA_BOB_FIELD_CHROMA = 5, + MAX = 0xF, +}; + +enum class BLK_KIND { + PITCH = 0, + GENERIC_16Bx2 = 1, + // These are unsupported in the vic + BL_NAIVE = 2, + BL_KEPLER_XBAR_RAW = 3, + VP2_TILED = 15, +}; + +enum class BLEND_SRCFACTC : u32 { + K1 = 0, + K1_TIMES_DST = 1, + NEG_K1_TIMES_DST = 2, + K1_TIMES_SRC = 3, + ZERO = 4, +}; + +enum class BLEND_DSTFACTC : u32 { + K1 = 0, + K2 = 1, + K1_TIMES_DST = 2, + NEG_K1_TIMES_DST = 3, + NEG_K1_TIMES_SRC = 4, + ZERO = 5, + ONE = 6, +}; + +enum class BLEND_SRCFACTA : u32 { + K1 = 0, + K2 = 1, + NEG_K1_TIMES_DST = 2, + ZERO = 3, + MAX = 7, +}; + +enum class BLEND_DSTFACTA : u32 { + K2 = 0, + NEG_K1_TIMES_SRC = 1, + ZERO = 2, + ONE = 3, + MAX = 7, +}; + +struct PipeConfig { + union { + BitField<0, 11, u32> downsample_horiz; + BitField<11, 5, u32> reserved0; + BitField<16, 11, u32> downsample_vert; + BitField<27, 5, u32> reserved1; + }; + u32 reserved2; + u32 reserved3; + u32 reserved4; +}; +static_assert(sizeof(PipeConfig) == 0x10, "PipeConfig has the wrong size!"); + +struct OutputConfig { + union { + BitField<0, 3, DXVAHD_ALPHA_FILL_MODE> alpha_fill_mode; + BitField<3, 3, u64> alpha_fill_slot; + BitField<6, 10, u64> background_a; + BitField<16, 10, u64> background_r; + BitField<26, 10, u64> background_g; + BitField<36, 10, u64> background_b; + BitField<46, 2, u64> regamma_mode; + BitField<48, 1, u64> output_flip_x; + BitField<49, 1, u64> output_flip_y; + BitField<50, 1, u64> output_transpose; + BitField<51, 1, u64> reserved1; + BitField<52, 12, u64> reserved2; + }; + union { + BitField<0, 14, u32> target_rect_left; + BitField<14, 2, u32> reserved3; + BitField<16, 14, u32> target_rect_right; + BitField<30, 2, u32> reserved4; + }; + union { + BitField<0, 14, u32> target_rect_top; + BitField<14, 2, u32> reserved5; + BitField<16, 14, u32> target_rect_bottom; + BitField<30, 2, u32> reserved6; + }; +}; +static_assert(sizeof(OutputConfig) == 0x10, "OutputConfig has the wrong size!"); + +struct OutputSurfaceConfig { + union { + BitField<0, 7, VideoPixelFormat> out_pixel_format; + BitField<7, 2, u32> out_chroma_loc_horiz; + BitField<9, 2, u32> out_chroma_loc_vert; + BitField<11, 4, BLK_KIND> out_block_kind; + BitField<15, 4, u32> out_block_height; // in gobs, log2 + BitField<19, 3, u32> reserved0; + BitField<22, 10, u32> reserved1; + }; + union { + BitField<0, 14, u32> out_surface_width; // - 1 + BitField<14, 14, u32> out_surface_height; // - 1 + BitField<28, 4, u32> reserved2; + }; + union { + BitField<0, 14, u32> out_luma_width; // - 1 + BitField<14, 14, u32> out_luma_height; // - 1 + BitField<28, 4, u32> reserved3; + }; + union { + BitField<0, 14, u32> out_chroma_width; // - 1 + BitField<14, 14, u32> out_chroma_height; // - 1 + BitField<28, 4, u32> reserved4; + }; +}; +static_assert(sizeof(OutputSurfaceConfig) == 0x10, "OutputSurfaceConfig has the wrong size!"); + +struct MatrixStruct { + union { + BitField<0, 20, s64> matrix_coeff00; // (0,0) of 4x3 conversion matrix + BitField<20, 20, s64> matrix_coeff10; // (1,0) of 4x3 conversion matrix + BitField<40, 20, s64> matrix_coeff20; // (2,0) of 4x3 conversion matrix + BitField<60, 4, u64> matrix_r_shift; + }; + union { + BitField<0, 20, s64> matrix_coeff01; // (0,1) of 4x3 conversion matrix + BitField<20, 20, s64> matrix_coeff11; // (1,1) of 4x3 conversion matrix + BitField<40, 20, s64> 
matrix_coeff21; // (2,1) of 4x3 conversion matrix + BitField<60, 3, u64> reserved0; + BitField<63, 1, u64> matrix_enable; + }; + union { + BitField<0, 20, s64> matrix_coeff02; // (0,2) of 4x3 conversion matrix + BitField<20, 20, s64> matrix_coeff12; // (1,2) of 4x3 conversion matrix + BitField<40, 20, s64> matrix_coeff22; // (2,2) of 4x3 conversion matrix + BitField<60, 4, u64> reserved1; + }; + union { + BitField<0, 20, s64> matrix_coeff03; // (0,3) of 4x3 conversion matrix + BitField<20, 20, s64> matrix_coeff13; // (1,3) of 4x3 conversion matrix + BitField<40, 20, s64> matrix_coeff23; // (2,3) of 4x3 conversion matrix + BitField<60, 4, u64> reserved2; + }; +}; +static_assert(sizeof(MatrixStruct) == 0x20, "MatrixStruct has the wrong size!"); + +struct ClearRectStruct { + union { + BitField<0, 14, u32> clear_rect0_left; + BitField<14, 2, u32> reserved0; + BitField<16, 14, u32> clear_rect0_right; + BitField<30, 2, u32> reserved1; + }; + union { + BitField<0, 14, u32> clear_rect0_top; + BitField<14, 2, u32> reserved2; + BitField<16, 14, u32> clear_rect0_bottom; + BitField<30, 2, u32> reserved3; + }; + union { + BitField<0, 14, u32> clear_rect1_left; + BitField<14, 2, u32> reserved4; + BitField<16, 14, u32> clear_rect1_right; + BitField<30, 2, u32> reserved5; + }; + union { + BitField<0, 14, u32> clear_rect1_top; + BitField<14, 2, u32> reserved6; + BitField<16, 14, u32> clear_rect1_bottom; + BitField<30, 2, u32> reserved7; + }; +}; +static_assert(sizeof(ClearRectStruct) == 0x10, "ClearRectStruct has the wrong size!"); + +struct SlotConfig { + union { + BitField<0, 1, u64> slot_enable; + BitField<1, 1, u64> denoise; + BitField<2, 1, u64> advanced_denoise; + BitField<3, 1, u64> cadence_detect; + BitField<4, 1, u64> motion_map; + BitField<5, 1, u64> motion_map_capture; + BitField<6, 1, u64> is_even; + BitField<7, 1, u64> chroma_even; + // fetch control struct + BitField<8, 1, u64> current_field_enable; + BitField<9, 1, u64> prev_field_enable; + BitField<10, 1, u64> next_field_enable; + BitField<11, 1, u64> next_nr_field_enable; // noise reduction + BitField<12, 1, u64> current_motion_field_enable; + BitField<13, 1, u64> prev_motion_field_enable; + BitField<14, 1, u64> prev_prev_motion_field_enable; + BitField<15, 1, u64> combined_motion_field_enable; + + BitField<16, 4, DXVAHD_FRAME_FORMAT> frame_format; + BitField<20, 2, u64> filter_length_y; // 0: 1-tap, 1: 2-tap, 2: 5-tap, 3: 10-tap + BitField<22, 2, u64> filter_length_x; + BitField<24, 12, u64> panoramic; + BitField<36, 22, u64> reserved1; + BitField<58, 6, u64> detail_filter_clamp; + }; + union { + BitField<0, 10, u64> filter_noise; + BitField<10, 10, u64> filter_detail; + BitField<20, 10, u64> chroma_noise; + BitField<30, 10, u64> chroma_detail; + BitField<40, 4, DXVAHD_DEINTERLACE_MODE_PRIVATE> deinterlace_mode; + BitField<44, 3, u64> motion_accumulation_weight; + BitField<47, 11, u64> noise_iir; + BitField<58, 4, u64> light_level; + BitField<62, 2, u64> reserved4; + }; + union { + BitField<0, 10, u64> soft_clamp_low; + BitField<10, 10, u64> soft_clamp_high; + BitField<20, 3, u64> reserved5; + BitField<23, 9, u64> reserved6; + BitField<32, 10, u64> planar_alpha; + BitField<42, 1, u64> constant_alpha; + BitField<43, 3, u64> stereo_interleave; + BitField<46, 1, u64> clip_enabled; + BitField<47, 8, u64> clear_rect_mask; + BitField<55, 2, u64> degamma_mode; + BitField<57, 1, u64> reserved7; + BitField<58, 1, u64> decompress_enable; + BitField<59, 5, u64> reserved9; + }; + union { + BitField<0, 8, u64> decompress_ctb_count; + BitField<8, 32, 
u64> decompress_zbc_count; + BitField<40, 24, u64> reserved12; + }; + union { + BitField<0, 30, u64> source_rect_left; + BitField<30, 2, u64> reserved14; + BitField<32, 30, u64> source_rect_right; + BitField<62, 2, u64> reserved15; + }; + union { + BitField<0, 30, u64> source_rect_top; + BitField<30, 2, u64> reserved16; + BitField<32, 30, u64> source_rect_bottom; + BitField<62, 2, u64> reserved17; + }; + union { + BitField<0, 14, u64> dest_rect_left; + BitField<14, 2, u64> reserved18; + BitField<16, 14, u64> dest_rect_right; + BitField<30, 2, u64> reserved19; + BitField<32, 14, u64> dest_rect_top; + BitField<46, 2, u64> reserved20; + BitField<48, 14, u64> dest_rect_bottom; + BitField<62, 2, u64> reserved21; + }; + u32 reserved22; + u32 reserved23; +}; +static_assert(sizeof(SlotConfig) == 0x40, "SlotConfig has the wrong size!"); + +struct SlotSurfaceConfig { + union { + BitField<0, 7, VideoPixelFormat> slot_pixel_format; + BitField<7, 2, u32> slot_chroma_loc_horiz; + BitField<9, 2, u32> slot_chroma_loc_vert; + BitField<11, 4, u32> slot_block_kind; + BitField<15, 4, u32> slot_block_height; + BitField<19, 3, u32> slot_cache_width; + BitField<22, 10, u32> reserved0; + }; + union { + BitField<0, 14, u32> slot_surface_width; // - 1 + BitField<14, 14, u32> slot_surface_height; // - 1 + BitField<28, 4, u32> reserved1; + }; + union { + BitField<0, 14, u32> slot_luma_width; // padded, - 1 + BitField<14, 14, u32> slot_luma_height; // padded, - 1 + BitField<28, 4, u32> reserved2; + }; + union { + BitField<0, 14, u32> slot_chroma_width; // padded, - 1 + BitField<14, 14, u32> slot_chroma_height; // padded, - 1 + BitField<28, 4, u32> reserved3; + }; +}; +static_assert(sizeof(SlotSurfaceConfig) == 0x10, "SlotSurfaceConfig has the wrong size!"); + +struct LumaKeyStruct { + union { + BitField<0, 20, u64> luma_coeff0; // (0) of 4x1 conversion matrix, S12.8 format + BitField<20, 20, u64> luma_coeff1; // (1) of 4x1 conversion matrix, S12.8 format + BitField<40, 20, u64> luma_coeff2; // (2) of 4x1 conversion matrix, S12.8 format + BitField<60, 4, u64> luma_r_shift; + }; + union { + BitField<0, 20, u64> luma_coeff3; // (3) of 4x1 conversion matrix, S12.8 format + BitField<20, 10, u64> luma_key_lower; + BitField<30, 10, u64> luma_key_upper; + BitField<40, 1, u64> luma_key_enabled; + BitField<41, 2, u64> reserved0; + BitField<43, 21, u64> reserved1; + }; +}; +static_assert(sizeof(LumaKeyStruct) == 0x10, "LumaKeyStruct has the wrong size!"); + +struct BlendingSlotStruct { + union { + BitField<0, 10, u32> alpha_k1; + BitField<10, 6, u32> reserved0; + BitField<16, 10, u32> alpha_k2; + BitField<26, 6, u32> reserved1; + }; + union { + BitField<0, 3, BLEND_SRCFACTC> src_factor_color_match_select; + BitField<3, 1, u32> reserved2; + BitField<4, 3, BLEND_DSTFACTC> dst_factor_color_match_select; + BitField<7, 1, u32> reserved3; + BitField<8, 3, BLEND_SRCFACTA> src_factor_a_match_select; + BitField<11, 1, u32> reserved4; + BitField<12, 3, BLEND_DSTFACTA> dst_factor_a_match_select; + BitField<15, 1, u32> reserved5; + BitField<16, 4, u32> reserved6; + BitField<20, 4, u32> reserved7; + BitField<24, 4, u32> reserved8; + BitField<28, 4, u32> reserved9; + }; + union { + BitField<0, 2, u32> reserved10; + BitField<2, 10, u32> override_r; + BitField<12, 10, u32> override_g; + BitField<22, 10, u32> override_b; + }; + union { + BitField<0, 10, u32> override_a; + BitField<10, 2, u32> reserved11; + BitField<12, 1, u32> use_override_r; + BitField<13, 1, u32> use_override_g; + BitField<14, 1, u32> use_override_b; + BitField<15, 1, u32> 
use_override_a; + BitField<16, 1, u32> mask_r; + BitField<17, 1, u32> mask_g; + BitField<18, 1, u32> mask_b; + BitField<19, 1, u32> mask_a; + BitField<20, 12, u32> reserved12; + }; +}; +static_assert(sizeof(BlendingSlotStruct) == 0x10, "BlendingSlotStruct has the wrong size!"); + +struct SlotStruct { + SlotConfig config; + SlotSurfaceConfig surface_config; + LumaKeyStruct luma_key; + MatrixStruct color_matrix; + MatrixStruct gamut_matrix; + BlendingSlotStruct blending; +}; +static_assert(sizeof(SlotStruct) == 0xB0, "SlotStruct has the wrong size!"); + +struct ConfigStruct { + PipeConfig pipe_config; + OutputConfig output_config; + OutputSurfaceConfig output_surface_config; + MatrixStruct out_color_matrix; + std::array clear_rects; + std::array slot_structs; +}; +static_assert(offsetof(ConfigStruct, pipe_config) == 0x0, "pipe_config is in the wrong place!"); +static_assert(offsetof(ConfigStruct, output_config) == 0x10, + "output_config is in the wrong place!"); +static_assert(offsetof(ConfigStruct, output_surface_config) == 0x20, + "output_surface_config is in the wrong place!"); +static_assert(offsetof(ConfigStruct, out_color_matrix) == 0x30, + "out_color_matrix is in the wrong place!"); +static_assert(offsetof(ConfigStruct, clear_rects) == 0x50, "clear_rects is in the wrong place!"); +static_assert(offsetof(ConfigStruct, slot_structs) == 0x90, "slot_structs is in the wrong place!"); +static_assert(sizeof(ConfigStruct) == 0x610, "ConfigStruct has the wrong size!"); + +struct VicRegisters { + static constexpr std::size_t NUM_REGS = 0x446; + + union { + struct { + INSERT_PADDING_WORDS_NOINIT(0xC0); + u32 execute; + INSERT_PADDING_WORDS_NOINIT(0x3F); + std::array, 8> surfaces; + u32 picture_index; + u32 control_params; + Offset config_struct_offset; + Offset filter_struct_offset; + Offset palette_offset; + Offset hist_offset; + u32 context_id; + u32 fce_ucode_size; + PlaneOffsets output_surface; + Offset fce_ucode_offset; + INSERT_PADDING_WORDS_NOINIT(0x4); + std::array slot_context_ids; + std::array comp_tag_buffer_offsets; + std::array history_buffer_offset; + INSERT_PADDING_WORDS_NOINIT(0x25D); + u32 pm_trigger_end; + }; + std::array reg_array; + }; +}; +static_assert(offsetof(VicRegisters, execute) == 0x300, "execute is in the wrong place!"); +static_assert(offsetof(VicRegisters, surfaces) == 0x400, "surfaces is in the wrong place!"); +static_assert(offsetof(VicRegisters, picture_index) == 0x700, + "picture_index is in the wrong place!"); +static_assert(offsetof(VicRegisters, control_params) == 0x704, + "control_params is in the wrong place!"); +static_assert(offsetof(VicRegisters, config_struct_offset) == 0x708, + "config_struct_offset is in the wrong place!"); +static_assert(offsetof(VicRegisters, output_surface) == 0x720, + "output_surface is in the wrong place!"); +static_assert(offsetof(VicRegisters, slot_context_ids) == 0x740, + "slot_context_ids is in the wrong place!"); +static_assert(offsetof(VicRegisters, history_buffer_offset) == 0x780, + "history_buffer_offset is in the wrong place!"); +static_assert(offsetof(VicRegisters, pm_trigger_end) == 0x1114, + "pm_trigger_end is in the wrong place!"); +static_assert(sizeof(VicRegisters) == 0x1118, "VicRegisters has the wrong size!"); + +class Vic final : public CDmaPusher { public: enum class Method : u32 { - Execute = 0xc0, - SetControlParams = 0x1c1, - SetConfigStructOffset = 0x1c2, - SetOutputSurfaceLumaOffset = 0x1c8, - SetOutputSurfaceChromaOffset = 0x1c9, - SetOutputSurfaceChromaUnusedOffset = 0x1ca + Execute = 
offsetof(VicRegisters, execute), + SetControlParams = offsetof(VicRegisters, control_params), + SetConfigStructOffset = offsetof(VicRegisters, config_struct_offset), + SetOutputSurfaceLumaOffset = offsetof(VicRegisters, output_surface.luma), + SetOutputSurfaceChromaOffset = offsetof(VicRegisters, output_surface.chroma_u), + SetOutputSurfaceChromaUnusedOffset = offsetof(VicRegisters, output_surface.chroma_v) }; - explicit Vic(Host1x& host1x, std::shared_ptr nvdec_processor); - + explicit Vic(Host1x& host1x, s32 id, u32 syncpt, FrameQueue& frame_queue); ~Vic(); /// Write to the device state. - void ProcessMethod(Method method, u32 argument); + void ProcessMethod(u32 method, u32 arg) override; private: void Execute(); - void WriteRGBFrame(std::unique_ptr frame, const VicConfig& config); + void Blend(const ConfigStruct& config, const SlotStruct& slot); - void WriteYUVFrame(std::unique_ptr frame, const VicConfig& config); + template + void ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span offsets, + std::shared_ptr frame); + template + void ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span offsets, + std::shared_ptr frame); - Host1x& host1x; - std::shared_ptr nvdec_processor; + template + void ReadY8__V8U8_N420(const SlotStruct& slot, std::span offsets, + std::shared_ptr frame); - /// Avoid reallocation of the following buffers every frame, as their - /// size does not change during a stream - using AVMallocPtr = std::unique_ptr; - AVMallocPtr converted_frame_buffer; - Common::ScratchBuffer luma_buffer; - Common::ScratchBuffer chroma_buffer; + void WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config); - GPUVAddr config_struct_address{}; - GPUVAddr output_surface_luma_address{}; - GPUVAddr output_surface_chroma_address{}; + template + void WriteABGR(const OutputSurfaceConfig& output_surface_config); - SwsContext* scaler_ctx{}; - s32 scaler_width{}; - s32 scaler_height{}; + s32 id; + s32 nvdec_id{-1}; + u32 syncpoint; + + VicRegisters regs{}; + FrameQueue& frame_queue; + + const bool has_sse41{false}; + + Common::ScratchBuffer output_surface; + Common::ScratchBuffer slot_surface; + Common::ScratchBuffer luma_scratch; + Common::ScratchBuffer chroma_scratch; + Common::ScratchBuffer swizzle_scratch; }; -} // namespace Host1x - -} // namespace Tegra +} // namespace Tegra::Host1x diff --git a/src/video_core/host_shaders/fidelityfx_fsr.frag b/src/video_core/host_shaders/fidelityfx_fsr.frag index a266e1c4e..54eedb450 100755 --- a/src/video_core/host_shaders/fidelityfx_fsr.frag +++ b/src/video_core/host_shaders/fidelityfx_fsr.frag @@ -37,6 +37,7 @@ layout(set=0,binding=0) uniform sampler2D InputTexture; #define A_GPU 1 #define A_GLSL 1 +#define FSR_RCAS_PASSTHROUGH_ALPHA 1 #ifndef YUZU_USE_FP16 #include "ffx_a.h" @@ -71,9 +72,7 @@ layout(set=0,binding=0) uniform sampler2D InputTexture; #include "ffx_fsr1.h" -#if USE_RCAS - layout(location = 0) in vec2 frag_texcoord; -#endif +layout (location = 0) in vec2 frag_texcoord; layout (location = 0) out vec4 frag_color; void CurrFilter(AU2 pos) { @@ -81,22 +80,22 @@ void CurrFilter(AU2 pos) { #ifndef YUZU_USE_FP16 AF3 c; FsrEasuF(c, pos, Const0, Const1, Const2, Const3); - frag_color = AF4(c, 1.0); + frag_color = AF4(c, texture(InputTexture, frag_texcoord).a); #else AH3 c; FsrEasuH(c, pos, Const0, Const1, Const2, Const3); - frag_color = AH4(c, 1.0); + frag_color = AH4(c, texture(InputTexture, frag_texcoord).a); #endif #endif #if USE_RCAS #ifndef YUZU_USE_FP16 - AF3 c; - FsrRcasF(c.r, c.g, c.b, pos, Const0); - frag_color = 
AF4(c, 1.0); + AF4 c; + FsrRcasF(c.r, c.g, c.b, c.a, pos, Const0); + frag_color = c; #else - AH3 c; - FsrRcasH(c.r, c.g, c.b, pos, Const0); - frag_color = AH4(c, 1.0); + AH4 c; + FsrRcasH(c.r, c.g, c.b, c.a, pos, Const0); + frag_color = c; #endif #endif } diff --git a/src/video_core/host_shaders/fxaa.frag b/src/video_core/host_shaders/fxaa.frag index 5c03c3724..012c147c6 100755 --- a/src/video_core/host_shaders/fxaa.frag +++ b/src/video_core/host_shaders/fxaa.frag @@ -71,5 +71,5 @@ vec3 FxaaPixelShader(vec4 posPos, sampler2D tex) { } void main() { - frag_color = vec4(FxaaPixelShader(posPos, input_texture), 1.0); + frag_color = vec4(FxaaPixelShader(posPos, input_texture), texture(input_texture, posPos.xy).a); } diff --git a/src/video_core/host_shaders/opengl_fidelityfx_fsr.frag b/src/video_core/host_shaders/opengl_fidelityfx_fsr.frag index 16d22f58e..fc47d3810 100755 --- a/src/video_core/host_shaders/opengl_fidelityfx_fsr.frag +++ b/src/video_core/host_shaders/opengl_fidelityfx_fsr.frag @@ -31,6 +31,7 @@ layout (location = 0) uniform uvec4 constants[4]; #define A_GPU 1 #define A_GLSL 1 +#define FSR_RCAS_PASSTHROUGH_ALPHA 1 #ifdef YUZU_USE_FP16 #define A_HALF @@ -67,9 +68,7 @@ layout (location = 0) uniform uvec4 constants[4]; #include "ffx_fsr1.h" -#if USE_RCAS - layout(location = 0) in vec2 frag_texcoord; -#endif +layout (location = 0) in vec2 frag_texcoord; layout (location = 0) out vec4 frag_color; void CurrFilter(AU2 pos) @@ -78,22 +77,22 @@ void CurrFilter(AU2 pos) #ifndef YUZU_USE_FP16 AF3 c; FsrEasuF(c, pos, constants[0], constants[1], constants[2], constants[3]); - frag_color = AF4(c, 1.0); + frag_color = AF4(c, texture(InputTexture, frag_texcoord).a); #else AH3 c; FsrEasuH(c, pos, constants[0], constants[1], constants[2], constants[3]); - frag_color = AH4(c, 1.0); + frag_color = AH4(c, texture(InputTexture, frag_texcoord).a); #endif #endif #if USE_RCAS #ifndef YUZU_USE_FP16 - AF3 c; - FsrRcasF(c.r, c.g, c.b, pos, constants[0]); - frag_color = AF4(c, 1.0); + AF4 c; + FsrRcasF(c.r, c.g, c.b, c.a, pos, constants[0]); + frag_color = c; #else AH3 c; - FsrRcasH(c.r, c.g, c.b, pos, constants[0]); - frag_color = AH4(c, 1.0); + FsrRcasH(c.r, c.g, c.b, c.a, pos, constants[0]); + frag_color = c; #endif #endif } diff --git a/src/video_core/host_shaders/opengl_present.frag b/src/video_core/host_shaders/opengl_present.frag index 7644a47ae..cc134180e 100755 --- a/src/video_core/host_shaders/opengl_present.frag +++ b/src/video_core/host_shaders/opengl_present.frag @@ -9,5 +9,5 @@ layout (location = 0) out vec4 color; layout (binding = 0) uniform sampler2D color_texture; void main() { - color = vec4(texture(color_texture, frag_tex_coord).rgb, 1.0f); + color = vec4(texture(color_texture, frag_tex_coord)); } diff --git a/src/video_core/host_shaders/present_bicubic.frag b/src/video_core/host_shaders/present_bicubic.frag index 53d8bc761..a3aa9cb99 100755 --- a/src/video_core/host_shaders/present_bicubic.frag +++ b/src/video_core/host_shaders/present_bicubic.frag @@ -52,5 +52,5 @@ vec4 textureBicubic( sampler2D textureSampler, vec2 texCoords ) { } void main() { - color = vec4(textureBicubic(color_texture, frag_tex_coord).rgb, 1.0f); + color = textureBicubic(color_texture, frag_tex_coord); } diff --git a/src/video_core/host_shaders/present_gaussian.frag b/src/video_core/host_shaders/present_gaussian.frag index de848e386..73a6e1bb5 100755 --- a/src/video_core/host_shaders/present_gaussian.frag +++ b/src/video_core/host_shaders/present_gaussian.frag @@ -46,14 +46,14 @@ vec4 blurDiagonal(sampler2D 
textureSampler, vec2 coord, vec2 norm) {
}
void main() {
- vec3 base = texture(color_texture, vec2(frag_tex_coord)).rgb * weight[0];
+ vec4 base = texture(color_texture, vec2(frag_tex_coord)) * weight[0];
vec2 tex_offset = 1.0f / textureSize(color_texture, 0);
// TODO(Blinkhawk): This code can be optimized through shader group instructions.
- vec3 horizontal = blurHorizontal(color_texture, frag_tex_coord, tex_offset).rgb;
- vec3 vertical = blurVertical(color_texture, frag_tex_coord, tex_offset).rgb;
- vec3 diagonalA = blurDiagonal(color_texture, frag_tex_coord, tex_offset).rgb;
- vec3 diagonalB = blurDiagonal(color_texture, frag_tex_coord, tex_offset * vec2(1.0, -1.0)).rgb;
- vec3 combination = mix(mix(horizontal, vertical, 0.5f), mix(diagonalA, diagonalB, 0.5f), 0.5f);
- color = vec4(combination + base, 1.0f);
+ vec4 horizontal = blurHorizontal(color_texture, frag_tex_coord, tex_offset);
+ vec4 vertical = blurVertical(color_texture, frag_tex_coord, tex_offset);
+ vec4 diagonalA = blurDiagonal(color_texture, frag_tex_coord, tex_offset);
+ vec4 diagonalB = blurDiagonal(color_texture, frag_tex_coord, tex_offset * vec2(1.0, -1.0));
+ vec4 combination = mix(mix(horizontal, vertical, 0.5f), mix(diagonalA, diagonalB, 0.5f), 0.5f);
+ color = combination + base;
}
diff --git a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp16.frag b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp16.frag
index d369bef06..05d033310 100755
--- a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp16.frag
+++ b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp16.frag
@@ -6,5 +6,6 @@
#define YUZU_USE_FP16
#define USE_EASU 1
+#define VERSION 1
#include "fidelityfx_fsr.frag"
diff --git a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp32.frag b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp32.frag
index 6f25ef00f..7ae11dd66 100755
--- a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp32.frag
+++ b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp32.frag
@@ -5,5 +5,6 @@
#extension GL_GOOGLE_include_directive : enable
#define USE_EASU 1
+#define VERSION 1
#include "fidelityfx_fsr.frag"
diff --git a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp16.frag b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp16.frag
index 0c953a900..c017214a5 100755
--- a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp16.frag
+++ b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp16.frag
@@ -6,5 +6,6 @@
#define YUZU_USE_FP16
#define USE_RCAS 1
+#define VERSION 1
#include "fidelityfx_fsr.frag"
diff --git a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp32.frag b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp32.frag
index 02e9a27c6..976825f4b 100755
--- a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp32.frag
+++ b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp32.frag
@@ -5,5 +5,6 @@
#extension GL_GOOGLE_include_directive : enable
#define USE_RCAS 1
+#define VERSION 1
#include "fidelityfx_fsr.frag"
diff --git a/src/video_core/host_shaders/vulkan_present_scaleforce_fp16.frag b/src/video_core/host_shaders/vulkan_present_scaleforce_fp16.frag
index d9ee1111b..072394a6d 100755
--- a/src/video_core/host_shaders/vulkan_present_scaleforce_fp16.frag
+++ b/src/video_core/host_shaders/vulkan_present_scaleforce_fp16.frag
@@ -5,7 +5,7 @@
#extension GL_GOOGLE_include_directive : enable
-#define VERSION 1
+#define VERSION 2
#define YUZU_USE_FP16
#include "opengl_present_scaleforce.frag"
diff --git a/src/video_core/host_shaders/vulkan_present_scaleforce_fp32.frag b/src/video_core/host_shaders/vulkan_present_scaleforce_fp32.frag
index 4d64e340c..67a248b5f 100755
--- a/src/video_core/host_shaders/vulkan_present_scaleforce_fp32.frag
+++ b/src/video_core/host_shaders/vulkan_present_scaleforce_fp32.frag
@@ -5,6 +5,6 @@
#extension GL_GOOGLE_include_directive : enable
-#define VERSION 1
+#define VERSION 2
#include "opengl_present_scaleforce.frag"
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index 1b31cdcee..9b312b112 100755
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -42,6 +42,8 @@ public:
u64 page_bits_ = 12);
~MemoryManager();
+ static constexpr bool HAS_FLUSH_INVALIDATION = true;
+
size_t GetID() const {
return unique_identifier;
}
diff --git a/src/video_core/present.h b/src/video_core/present.h
new file mode 100755
index 000000000..4fdfcca68
--- /dev/null
+++ b/src/video_core/present.h
@@ -0,0 +1,37 @@
+// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include "common/settings.h"
+
+static inline Settings::ScalingFilter GetScalingFilter() {
+ return Settings::values.scaling_filter.GetValue();
+}
+
+static inline Settings::AntiAliasing GetAntiAliasing() {
+ return Settings::values.anti_aliasing.GetValue();
+}
+
+static inline Settings::ScalingFilter GetScalingFilterForAppletCapture() {
+ return Settings::ScalingFilter::Bilinear;
+}
+
+static inline Settings::AntiAliasing GetAntiAliasingForAppletCapture() {
+ return Settings::AntiAliasing::None;
+}
+
+struct PresentFilters {
+ Settings::ScalingFilter (*get_scaling_filter)();
+ Settings::AntiAliasing (*get_anti_aliasing)();
+};
+
+constexpr PresentFilters PresentFiltersForDisplay{
+ .get_scaling_filter = &GetScalingFilter,
+ .get_anti_aliasing = &GetAntiAliasing,
+};
+
+constexpr PresentFilters PresentFiltersForAppletCapture{
+ .get_scaling_filter = &GetScalingFilterForAppletCapture,
+ .get_anti_aliasing = &GetAntiAliasingForAppletCapture,
+};
diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h
index f617e800b..b72ab12fb 100755
--- a/src/video_core/renderer_base.h
+++ b/src/video_core/renderer_base.h
@@ -40,6 +40,9 @@ public:
/// Finalize rendering the guest frame and draw into the presentation texture
virtual void Composite(std::span layers) = 0;
+ /// Get the tiled applet layer capture buffer
+ virtual std::vector GetAppletCaptureBuffer() = 0;
+
[[nodiscard]] virtual RasterizerInterface* ReadRasterizer() = 0;
[[nodiscard]] virtual std::string GetDeviceVendor() const = 0;
diff --git a/src/video_core/renderer_null/renderer_null.cpp b/src/video_core/renderer_null/renderer_null.cpp
index c89daff53..e6147d66c 100755
--- a/src/video_core/renderer_null/renderer_null.cpp
+++ b/src/video_core/renderer_null/renderer_null.cpp
@@ -3,6 +3,7 @@
#include "core/frontend/emu_window.h"
#include "core/frontend/graphics_context.h"
+#include "video_core/capture.h"
#include "video_core/renderer_null/renderer_null.h"
namespace Null {
@@ -22,4 +23,8 @@ void RendererNull::Composite(std::span framebuff
render_window.OnFrameDisplayed();
}
+std::vector RendererNull::GetAppletCaptureBuffer() {
+ return std::vector(VideoCore::Capture::TiledSize);
+}
+
} // namespace Null
diff --git a/src/video_core/renderer_null/renderer_null.h b/src/video_core/renderer_null/renderer_null.h
index 063b476bb..34dbe1e4f 100755
--- a/src/video_core/renderer_null/renderer_null.h
+++
b/src/video_core/renderer_null/renderer_null.h @@ -19,6 +19,8 @@ public: void Composite(std::span framebuffer) override; + std::vector GetAppletCaptureBuffer() override; + VideoCore::RasterizerInterface* ReadRasterizer() override { return &m_rasterizer; } diff --git a/src/video_core/renderer_opengl/gl_blit_screen.cpp b/src/video_core/renderer_opengl/gl_blit_screen.cpp index 6ba8b214b..9260a4dc4 100755 --- a/src/video_core/renderer_opengl/gl_blit_screen.cpp +++ b/src/video_core/renderer_opengl/gl_blit_screen.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "common/settings.h" +#include "video_core/present.h" #include "video_core/renderer_opengl/gl_blit_screen.h" #include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/present/filters.h" @@ -13,14 +14,14 @@ namespace OpenGL { BlitScreen::BlitScreen(RasterizerOpenGL& rasterizer_, Tegra::MaxwellDeviceMemoryManager& device_memory_, StateTracker& state_tracker_, ProgramManager& program_manager_, - Device& device_) + Device& device_, const PresentFilters& filters_) : rasterizer(rasterizer_), device_memory(device_memory_), state_tracker(state_tracker_), - program_manager(program_manager_), device(device_) {} + program_manager(program_manager_), device(device_), filters(filters_) {} BlitScreen::~BlitScreen() = default; void BlitScreen::DrawScreen(std::span framebuffers, - const Layout::FramebufferLayout& layout) { + const Layout::FramebufferLayout& layout, bool invert_y) { // TODO: Signal state tracker about these changes state_tracker.NotifyScreenDrawVertexArray(); state_tracker.NotifyPolygonModes(); @@ -56,22 +57,22 @@ void BlitScreen::DrawScreen(std::span framebuffe glDepthRangeIndexed(0, 0.0, 0.0); while (layers.size() < framebuffers.size()) { - layers.emplace_back(rasterizer, device_memory); + layers.emplace_back(rasterizer, device_memory, filters); } CreateWindowAdapt(); - window_adapt->DrawToFramebuffer(program_manager, layers, framebuffers, layout); + window_adapt->DrawToFramebuffer(program_manager, layers, framebuffers, layout, invert_y); // TODO // program_manager.RestoreGuestPipeline(); } void BlitScreen::CreateWindowAdapt() { - if (window_adapt && Settings::values.scaling_filter.GetValue() == current_window_adapt) { + if (window_adapt && filters.get_scaling_filter() == current_window_adapt) { return; } - current_window_adapt = Settings::values.scaling_filter.GetValue(); + current_window_adapt = filters.get_scaling_filter(); switch (current_window_adapt) { case Settings::ScalingFilter::NearestNeighbor: window_adapt = MakeNearestNeighbor(device); diff --git a/src/video_core/renderer_opengl/gl_blit_screen.h b/src/video_core/renderer_opengl/gl_blit_screen.h index 0c3d838f1..df2da9424 100755 --- a/src/video_core/renderer_opengl/gl_blit_screen.h +++ b/src/video_core/renderer_opengl/gl_blit_screen.h @@ -15,6 +15,8 @@ namespace Layout { struct FramebufferLayout; } +struct PresentFilters; + namespace Tegra { struct FramebufferConfig; } @@ -46,12 +48,12 @@ public: explicit BlitScreen(RasterizerOpenGL& rasterizer, Tegra::MaxwellDeviceMemoryManager& device_memory, StateTracker& state_tracker, ProgramManager& program_manager, - Device& device); + Device& device, const PresentFilters& filters); ~BlitScreen(); /// Draws the emulated screens to the emulator window. 
void DrawScreen(std::span framebuffers, - const Layout::FramebufferLayout& layout); + const Layout::FramebufferLayout& layout, bool invert_y); private: void CreateWindowAdapt(); @@ -61,6 +63,7 @@ private: StateTracker& state_tracker; ProgramManager& program_manager; Device& device; + const PresentFilters& filters; Settings::ScalingFilter current_window_adapt{}; std::unique_ptr window_adapt; diff --git a/src/video_core/renderer_opengl/present/layer.cpp b/src/video_core/renderer_opengl/present/layer.cpp index 8643e07c6..6c7092d22 100755 --- a/src/video_core/renderer_opengl/present/layer.cpp +++ b/src/video_core/renderer_opengl/present/layer.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "video_core/framebuffer_config.h" +#include "video_core/present.h" #include "video_core/renderer_opengl/gl_blit_screen.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/present/fsr.h" @@ -14,8 +15,9 @@ namespace OpenGL { -Layer::Layer(RasterizerOpenGL& rasterizer_, Tegra::MaxwellDeviceMemoryManager& device_memory_) - : rasterizer(rasterizer_), device_memory(device_memory_) { +Layer::Layer(RasterizerOpenGL& rasterizer_, Tegra::MaxwellDeviceMemoryManager& device_memory_, + const PresentFilters& filters_) + : rasterizer(rasterizer_), device_memory(device_memory_), filters(filters_) { // Allocate textures for the screen framebuffer_texture.resource.Create(GL_TEXTURE_2D); @@ -34,12 +36,12 @@ GLuint Layer::ConfigureDraw(std::array& out_matrix, std::array& out_vertices, ProgramManager& program_manager, const Tegra::FramebufferConfig& framebuffer, - const Layout::FramebufferLayout& layout) { + const Layout::FramebufferLayout& layout, bool invert_y) { FramebufferTextureInfo info = PrepareRenderTarget(framebuffer); auto crop = Tegra::NormalizeCrop(framebuffer, info.width, info.height); GLuint texture = info.display_texture; - auto anti_aliasing = Settings::values.anti_aliasing.GetValue(); + auto anti_aliasing = filters.get_anti_aliasing(); if (anti_aliasing != Settings::AntiAliasing::None) { glEnablei(GL_SCISSOR_TEST, 0); auto viewport_width = Settings::values.resolution_info.ScaleUp(framebuffer_texture.width); @@ -64,7 +66,7 @@ GLuint Layer::ConfigureDraw(std::array& out_matrix, glDisablei(GL_SCISSOR_TEST, 0); - if (Settings::values.scaling_filter.GetValue() == Settings::ScalingFilter::Fsr) { + if (filters.get_scaling_filter() == Settings::ScalingFilter::Fsr) { if (!fsr || fsr->NeedsRecreation(layout.screen)) { fsr = std::make_unique(layout.screen.GetWidth(), layout.screen.GetHeight()); } @@ -83,10 +85,15 @@ GLuint Layer::ConfigureDraw(std::array& out_matrix, const auto w = screen.GetWidth(); const auto h = screen.GetHeight(); - out_vertices[0] = ScreenRectVertex(x, y, crop.left, crop.top); - out_vertices[1] = ScreenRectVertex(x + w, y, crop.right, crop.top); - out_vertices[2] = ScreenRectVertex(x, y + h, crop.left, crop.bottom); - out_vertices[3] = ScreenRectVertex(x + w, y + h, crop.right, crop.bottom); + const auto left = crop.left; + const auto right = crop.right; + const auto top = invert_y ? crop.bottom : crop.top; + const auto bottom = invert_y ? 
crop.top : crop.bottom; + + out_vertices[0] = ScreenRectVertex(x, y, left, top); + out_vertices[1] = ScreenRectVertex(x + w, y, right, top); + out_vertices[2] = ScreenRectVertex(x, y + h, left, bottom); + out_vertices[3] = ScreenRectVertex(x + w, y + h, right, bottom); return texture; } @@ -131,10 +138,12 @@ FramebufferTextureInfo Layer::LoadFBToScreenInfo(const Tegra::FramebufferConfig& const u64 size_in_bytes{Tegra::Texture::CalculateSize( true, bytes_per_pixel, framebuffer.stride, framebuffer.height, 1, block_height_log2, 0)}; const u8* const host_ptr{device_memory.GetPointer(framebuffer_addr)}; - const std::span input_data(host_ptr, size_in_bytes); - Tegra::Texture::UnswizzleTexture(gl_framebuffer_data, input_data, bytes_per_pixel, - framebuffer.width, framebuffer.height, 1, block_height_log2, - 0); + if (host_ptr) { + const std::span input_data(host_ptr, size_in_bytes); + Tegra::Texture::UnswizzleTexture(gl_framebuffer_data, input_data, bytes_per_pixel, + framebuffer.width, framebuffer.height, 1, + block_height_log2, 0); + } glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast(framebuffer.stride)); diff --git a/src/video_core/renderer_opengl/present/layer.h b/src/video_core/renderer_opengl/present/layer.h index ef1055abf..5b15b730f 100755 --- a/src/video_core/renderer_opengl/present/layer.h +++ b/src/video_core/renderer_opengl/present/layer.h @@ -13,6 +13,8 @@ namespace Layout { struct FramebufferLayout; } +struct PresentFilters; + namespace Service::android { enum class PixelFormat : u32; }; @@ -44,14 +46,15 @@ struct ScreenRectVertex; class Layer { public: - explicit Layer(RasterizerOpenGL& rasterizer, Tegra::MaxwellDeviceMemoryManager& device_memory); + explicit Layer(RasterizerOpenGL& rasterizer, Tegra::MaxwellDeviceMemoryManager& device_memory, + const PresentFilters& filters); ~Layer(); GLuint ConfigureDraw(std::array& out_matrix, std::array& out_vertices, ProgramManager& program_manager, const Tegra::FramebufferConfig& framebuffer, - const Layout::FramebufferLayout& layout); + const Layout::FramebufferLayout& layout, bool invert_y); private: /// Loads framebuffer from emulated memory into the active OpenGL texture. 
@@ -65,6 +68,7 @@ private: private: RasterizerOpenGL& rasterizer; Tegra::MaxwellDeviceMemoryManager& device_memory; + const PresentFilters& filters; /// OpenGL framebuffer data std::vector gl_framebuffer_data; diff --git a/src/video_core/renderer_opengl/present/window_adapt_pass.cpp b/src/video_core/renderer_opengl/present/window_adapt_pass.cpp index 4d681606b..d8b6a11cb 100755 --- a/src/video_core/renderer_opengl/present/window_adapt_pass.cpp +++ b/src/video_core/renderer_opengl/present/window_adapt_pass.cpp @@ -37,7 +37,7 @@ WindowAdaptPass::~WindowAdaptPass() = default; void WindowAdaptPass::DrawToFramebuffer(ProgramManager& program_manager, std::list& layers, std::span framebuffers, - const Layout::FramebufferLayout& layout) { + const Layout::FramebufferLayout& layout, bool invert_y) { GLint old_read_fb; GLint old_draw_fb; glGetIntegerv(GL_READ_FRAMEBUFFER_BINDING, &old_read_fb); @@ -51,7 +51,7 @@ void WindowAdaptPass::DrawToFramebuffer(ProgramManager& program_manager, std::li auto layer_it = layers.begin(); for (size_t i = 0; i < layer_count; i++) { textures[i] = layer_it->ConfigureDraw(matrices[i], vertices[i], program_manager, - framebuffers[i], layout); + framebuffers[i], layout, invert_y); layer_it++; } @@ -92,6 +92,21 @@ void WindowAdaptPass::DrawToFramebuffer(ProgramManager& program_manager, std::li glClear(GL_COLOR_BUFFER_BIT); for (size_t i = 0; i < layer_count; i++) { + switch (framebuffers[i].blending) { + case Tegra::BlendMode::Opaque: + default: + glDisablei(GL_BLEND, 0); + break; + case Tegra::BlendMode::Premultiplied: + glEnablei(GL_BLEND, 0); + glBlendFuncSeparatei(0, GL_ONE, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ZERO); + break; + case Tegra::BlendMode::Coverage: + glEnablei(GL_BLEND, 0); + glBlendFuncSeparatei(0, GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ZERO); + break; + } + glBindTextureUnit(0, textures[i]); glProgramUniformMatrix3x2fv(vert.handle, ModelViewMatrixLocation, 1, GL_FALSE, matrices[i].data()); diff --git a/src/video_core/renderer_opengl/present/window_adapt_pass.h b/src/video_core/renderer_opengl/present/window_adapt_pass.h index 00975a9c6..0a8bcef2f 100755 --- a/src/video_core/renderer_opengl/present/window_adapt_pass.h +++ b/src/video_core/renderer_opengl/present/window_adapt_pass.h @@ -31,7 +31,7 @@ public: void DrawToFramebuffer(ProgramManager& program_manager, std::list& layers, std::span framebuffers, - const Layout::FramebufferLayout& layout); + const Layout::FramebufferLayout& layout, bool invert_y); private: const Device& device; diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index a4a15a9e7..c5fa63229 100755 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -16,6 +16,8 @@ #include "core/core_timing.h" #include "core/frontend/emu_window.h" #include "core/telemetry_session.h" +#include "video_core/capture.h" +#include "video_core/present.h" #include "video_core/renderer_opengl/gl_blit_screen.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_manager.h" @@ -120,7 +122,15 @@ RendererOpenGL::RendererOpenGL(Core::TelemetrySession& telemetry_session_, glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV); } blit_screen = std::make_unique(rasterizer, device_memory, state_tracker, - program_manager, device); + program_manager, device, PresentFiltersForDisplay); + blit_applet = + std::make_unique(rasterizer, device_memory, state_tracker, program_manager, + device, 
PresentFiltersForAppletCapture); + capture_framebuffer.Create(); + capture_renderbuffer.Create(); + glBindRenderbuffer(GL_RENDERBUFFER, capture_renderbuffer.handle); + glRenderbufferStorage(GL_RENDERBUFFER, GL_SRGB8, VideoCore::Capture::LinearWidth, + VideoCore::Capture::LinearHeight); } RendererOpenGL::~RendererOpenGL() = default; @@ -130,10 +140,11 @@ void RendererOpenGL::Composite(std::span framebu return; } + RenderAppletCaptureLayer(framebuffers); RenderScreenshot(framebuffers); state_tracker.BindFramebuffer(0); - blit_screen->DrawScreen(framebuffers, emu_window.GetFramebufferLayout()); + blit_screen->DrawScreen(framebuffers, emu_window.GetFramebufferLayout(), false); ++m_current_frame; @@ -159,11 +170,8 @@ void RendererOpenGL::AddTelemetryFields() { telemetry_session.AddField(user_system, "GPU_OpenGL_Version", std::string(gl_version)); } -void RendererOpenGL::RenderScreenshot(std::span framebuffers) { - if (!renderer_settings.screenshot_requested) { - return; - } - +void RendererOpenGL::RenderToBuffer(std::span framebuffers, + const Layout::FramebufferLayout& layout, void* dst) { GLint old_read_fb; GLint old_draw_fb; glGetIntegerv(GL_READ_FRAMEBUFFER_BINDING, &old_read_fb); @@ -173,29 +181,86 @@ void RendererOpenGL::RenderScreenshot(std::span screenshot_framebuffer.Create(); glBindFramebuffer(GL_FRAMEBUFFER, screenshot_framebuffer.handle); - const Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; - GLuint renderbuffer; glGenRenderbuffers(1, &renderbuffer); glBindRenderbuffer(GL_RENDERBUFFER, renderbuffer); glRenderbufferStorage(GL_RENDERBUFFER, GL_SRGB8, layout.width, layout.height); glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, renderbuffer); - blit_screen->DrawScreen(framebuffers, layout); + blit_screen->DrawScreen(framebuffers, layout, false); glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); glPixelStorei(GL_PACK_ROW_LENGTH, 0); - glReadPixels(0, 0, layout.width, layout.height, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, - renderer_settings.screenshot_bits); + glReadPixels(0, 0, layout.width, layout.height, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, dst); screenshot_framebuffer.Release(); glDeleteRenderbuffers(1, &renderbuffer); glBindFramebuffer(GL_READ_FRAMEBUFFER, old_read_fb); glBindFramebuffer(GL_DRAW_FRAMEBUFFER, old_draw_fb); +} + +void RendererOpenGL::RenderScreenshot(std::span framebuffers) { + if (!renderer_settings.screenshot_requested) { + return; + } + + RenderToBuffer(framebuffers, renderer_settings.screenshot_framebuffer_layout, + renderer_settings.screenshot_bits); renderer_settings.screenshot_complete_callback(true); renderer_settings.screenshot_requested = false; } +void RendererOpenGL::RenderAppletCaptureLayer( + std::span framebuffers) { + GLint old_read_fb; + GLint old_draw_fb; + glGetIntegerv(GL_READ_FRAMEBUFFER_BINDING, &old_read_fb); + glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &old_draw_fb); + + glBindFramebuffer(GL_FRAMEBUFFER, capture_framebuffer.handle); + glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, + capture_renderbuffer.handle); + + blit_applet->DrawScreen(framebuffers, VideoCore::Capture::Layout, true); + + glBindFramebuffer(GL_READ_FRAMEBUFFER, old_read_fb); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, old_draw_fb); +} + +std::vector RendererOpenGL::GetAppletCaptureBuffer() { + using namespace VideoCore::Capture; + + std::vector linear(TiledSize); + std::vector out(TiledSize); + + GLint old_read_fb; + GLint old_draw_fb; + GLint old_pixel_pack_buffer; + GLint 
old_pack_row_length; + glGetIntegerv(GL_READ_FRAMEBUFFER_BINDING, &old_read_fb); + glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &old_draw_fb); + glGetIntegerv(GL_PIXEL_PACK_BUFFER_BINDING, &old_pixel_pack_buffer); + glGetIntegerv(GL_PACK_ROW_LENGTH, &old_pack_row_length); + + glBindFramebuffer(GL_FRAMEBUFFER, capture_framebuffer.handle); + glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, + capture_renderbuffer.handle); + glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); + glPixelStorei(GL_PACK_ROW_LENGTH, 0); + glReadPixels(0, 0, LinearWidth, LinearHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, + linear.data()); + + glBindFramebuffer(GL_READ_FRAMEBUFFER, old_read_fb); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, old_draw_fb); + glBindBuffer(GL_PIXEL_PACK_BUFFER, old_pixel_pack_buffer); + glPixelStorei(GL_PACK_ROW_LENGTH, old_pack_row_length); + + Tegra::Texture::SwizzleTexture(out, linear, BytesPerPixel, LinearWidth, LinearHeight, + LinearDepth, BlockHeight, BlockDepth); + + return out; +} + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index 48ef6d377..38a24f84c 100755 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -42,6 +42,8 @@ public: void Composite(std::span framebuffers) override; + std::vector GetAppletCaptureBuffer() override; + VideoCore::RasterizerInterface* ReadRasterizer() override { return &rasterizer; } @@ -52,7 +54,11 @@ public: private: void AddTelemetryFields(); + + void RenderToBuffer(std::span framebuffers, + const Layout::FramebufferLayout& layout, void* dst); void RenderScreenshot(std::span framebuffers); + void RenderAppletCaptureLayer(std::span framebuffers); Core::TelemetrySession& telemetry_session; Core::Frontend::EmuWindow& emu_window; @@ -64,8 +70,11 @@ private: ProgramManager program_manager; RasterizerOpenGL rasterizer; OGLFramebuffer screenshot_framebuffer; + OGLFramebuffer capture_framebuffer; + OGLRenderbuffer capture_renderbuffer; std::unique_ptr blit_screen; + std::unique_ptr blit_applet; }; } // namespace OpenGL diff --git a/src/video_core/renderer_vulkan/present/layer.cpp b/src/video_core/renderer_vulkan/present/layer.cpp index cfc04be44..3847a9a13 100755 --- a/src/video_core/renderer_vulkan/present/layer.cpp +++ b/src/video_core/renderer_vulkan/present/layer.cpp @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "video_core/present.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" #include "common/settings.h" @@ -48,12 +49,12 @@ VkFormat GetFormat(const Tegra::FramebufferConfig& framebuffer) { Layer::Layer(const Device& device_, MemoryAllocator& memory_allocator_, Scheduler& scheduler_, Tegra::MaxwellDeviceMemoryManager& device_memory_, size_t image_count_, - VkExtent2D output_size, VkDescriptorSetLayout layout) + VkExtent2D output_size, VkDescriptorSetLayout layout, const PresentFilters& filters_) : device(device_), memory_allocator(memory_allocator_), scheduler(scheduler_), - device_memory(device_memory_), image_count(image_count_) { + device_memory(device_memory_), filters(filters_), image_count(image_count_) { CreateDescriptorPool(); CreateDescriptorSets(layout); - if (Settings::values.scaling_filter.GetValue() == Settings::ScalingFilter::Fsr) { + if (filters.get_scaling_filter() == Settings::ScalingFilter::Fsr) { CreateFSR(output_size); } } @@ -171,11 +172,11 @@ void 
Layer::RefreshResources(const Tegra::FramebufferConfig& framebuffer) { } void Layer::SetAntiAliasPass() { - if (anti_alias && anti_alias_setting == Settings::values.anti_aliasing.GetValue()) { + if (anti_alias && anti_alias_setting == filters.get_anti_aliasing()) { return; } - anti_alias_setting = Settings::values.anti_aliasing.GetValue(); + anti_alias_setting = filters.get_anti_aliasing(); const VkExtent2D render_area{ .width = Settings::values.resolution_info.ScaleUp(raw_width), @@ -270,9 +271,11 @@ void Layer::UpdateRawImage(const Tegra::FramebufferConfig& framebuffer, size_t i const u64 linear_size{GetSizeInBytes(framebuffer)}; const u64 tiled_size{Tegra::Texture::CalculateSize( true, bytes_per_pixel, framebuffer.stride, framebuffer.height, 1, block_height_log2, 0)}; - Tegra::Texture::UnswizzleTexture( - mapped_span.subspan(image_offset, linear_size), std::span(host_ptr, tiled_size), - bytes_per_pixel, framebuffer.width, framebuffer.height, 1, block_height_log2, 0); + if (host_ptr) { + Tegra::Texture::UnswizzleTexture( + mapped_span.subspan(image_offset, linear_size), std::span(host_ptr, tiled_size), + bytes_per_pixel, framebuffer.width, framebuffer.height, 1, block_height_log2, 0); + } const VkBufferImageCopy copy{ .bufferOffset = image_offset, diff --git a/src/video_core/renderer_vulkan/present/layer.h b/src/video_core/renderer_vulkan/present/layer.h index 88d43fc5f..f5effdcd7 100755 --- a/src/video_core/renderer_vulkan/present/layer.h +++ b/src/video_core/renderer_vulkan/present/layer.h @@ -11,6 +11,8 @@ namespace Layout { struct FramebufferLayout; } +struct PresentFilters; + namespace Tegra { struct FramebufferConfig; } @@ -37,7 +39,8 @@ class Layer final { public: explicit Layer(const Device& device, MemoryAllocator& memory_allocator, Scheduler& scheduler, Tegra::MaxwellDeviceMemoryManager& device_memory, size_t image_count, - VkExtent2D output_size, VkDescriptorSetLayout layout); + VkExtent2D output_size, VkDescriptorSetLayout layout, + const PresentFilters& filters); ~Layer(); void ConfigureDraw(PresentPushConstants* out_push_constants, @@ -71,6 +74,7 @@ private: MemoryAllocator& memory_allocator; Scheduler& scheduler; Tegra::MaxwellDeviceMemoryManager& device_memory; + const PresentFilters& filters; const size_t image_count{}; vk::DescriptorPool descriptor_pool{}; vk::DescriptorSets descriptor_sets{}; diff --git a/src/video_core/renderer_vulkan/present/util.cpp b/src/video_core/renderer_vulkan/present/util.cpp index 6ee16595d..7f27c7c1b 100755 --- a/src/video_core/renderer_vulkan/present/util.cpp +++ b/src/video_core/renderer_vulkan/present/util.cpp @@ -362,10 +362,10 @@ vk::PipelineLayout CreateWrappedPipelineLayout(const Device& device, }); } -vk::Pipeline CreateWrappedPipeline(const Device& device, vk::RenderPass& renderpass, - vk::PipelineLayout& layout, - std::tuple shaders, - bool enable_blending) { +static vk::Pipeline CreateWrappedPipelineImpl( + const Device& device, vk::RenderPass& renderpass, vk::PipelineLayout& layout, + std::tuple shaders, + VkPipelineColorBlendAttachmentState blending) { const std::array shader_stages{{ { .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, @@ -443,30 +443,6 @@ vk::Pipeline CreateWrappedPipeline(const Device& device, vk::RenderPass& renderp .alphaToOneEnable = VK_FALSE, }; - constexpr VkPipelineColorBlendAttachmentState color_blend_attachment_disabled{ - .blendEnable = VK_FALSE, - .srcColorBlendFactor = VK_BLEND_FACTOR_ZERO, - .dstColorBlendFactor = VK_BLEND_FACTOR_ZERO, - .colorBlendOp = VK_BLEND_OP_ADD, - 
.srcAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, - .dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, - .alphaBlendOp = VK_BLEND_OP_ADD, - .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | - VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT, - }; - - constexpr VkPipelineColorBlendAttachmentState color_blend_attachment_enabled{ - .blendEnable = VK_TRUE, - .srcColorBlendFactor = VK_BLEND_FACTOR_SRC_ALPHA, - .dstColorBlendFactor = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, - .colorBlendOp = VK_BLEND_OP_ADD, - .srcAlphaBlendFactor = VK_BLEND_FACTOR_ONE, - .dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, - .alphaBlendOp = VK_BLEND_OP_ADD, - .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | - VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT, - }; - const VkPipelineColorBlendStateCreateInfo color_blend_ci{ .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, .pNext = nullptr, @@ -474,8 +450,7 @@ vk::Pipeline CreateWrappedPipeline(const Device& device, vk::RenderPass& renderp .logicOpEnable = VK_FALSE, .logicOp = VK_LOGIC_OP_COPY, .attachmentCount = 1, - .pAttachments = - enable_blending ? &color_blend_attachment_enabled : &color_blend_attachment_disabled, + .pAttachments = &blending, .blendConstants = {0.0f, 0.0f, 0.0f, 0.0f}, }; @@ -515,6 +490,63 @@ vk::Pipeline CreateWrappedPipeline(const Device& device, vk::RenderPass& renderp }); } +vk::Pipeline CreateWrappedPipeline(const Device& device, vk::RenderPass& renderpass, + vk::PipelineLayout& layout, + std::tuple shaders) { + constexpr VkPipelineColorBlendAttachmentState color_blend_attachment_disabled{ + .blendEnable = VK_FALSE, + .srcColorBlendFactor = VK_BLEND_FACTOR_ZERO, + .dstColorBlendFactor = VK_BLEND_FACTOR_ZERO, + .colorBlendOp = VK_BLEND_OP_ADD, + .srcAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, + .dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, + .alphaBlendOp = VK_BLEND_OP_ADD, + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT, + }; + + return CreateWrappedPipelineImpl(device, renderpass, layout, shaders, + color_blend_attachment_disabled); +} + +vk::Pipeline CreateWrappedPremultipliedBlendingPipeline( + const Device& device, vk::RenderPass& renderpass, vk::PipelineLayout& layout, + std::tuple shaders) { + constexpr VkPipelineColorBlendAttachmentState color_blend_attachment_premultiplied{ + .blendEnable = VK_TRUE, + .srcColorBlendFactor = VK_BLEND_FACTOR_ONE, + .dstColorBlendFactor = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, + .colorBlendOp = VK_BLEND_OP_ADD, + .srcAlphaBlendFactor = VK_BLEND_FACTOR_ONE, + .dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, + .alphaBlendOp = VK_BLEND_OP_ADD, + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT, + }; + + return CreateWrappedPipelineImpl(device, renderpass, layout, shaders, + color_blend_attachment_premultiplied); +} + +vk::Pipeline CreateWrappedCoverageBlendingPipeline( + const Device& device, vk::RenderPass& renderpass, vk::PipelineLayout& layout, + std::tuple shaders) { + constexpr VkPipelineColorBlendAttachmentState color_blend_attachment_coverage{ + .blendEnable = VK_TRUE, + .srcColorBlendFactor = VK_BLEND_FACTOR_SRC_ALPHA, + .dstColorBlendFactor = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, + .colorBlendOp = VK_BLEND_OP_ADD, + .srcAlphaBlendFactor = VK_BLEND_FACTOR_ONE, + .dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, + .alphaBlendOp = VK_BLEND_OP_ADD, + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | 
VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT, + }; + + return CreateWrappedPipelineImpl(device, renderpass, layout, shaders, + color_blend_attachment_coverage); +} + VkWriteDescriptorSet CreateWriteDescriptorSet(std::vector& images, VkSampler sampler, VkImageView view, VkDescriptorSet set, u32 binding) { diff --git a/src/video_core/renderer_vulkan/present/util.h b/src/video_core/renderer_vulkan/present/util.h index 1104aaa15..5b22f0fa8 100755 --- a/src/video_core/renderer_vulkan/present/util.h +++ b/src/video_core/renderer_vulkan/present/util.h @@ -42,8 +42,13 @@ vk::PipelineLayout CreateWrappedPipelineLayout(const Device& device, vk::DescriptorSetLayout& layout); vk::Pipeline CreateWrappedPipeline(const Device& device, vk::RenderPass& renderpass, vk::PipelineLayout& layout, - std::tuple shaders, - bool enable_blending = false); + std::tuple shaders); +vk::Pipeline CreateWrappedPremultipliedBlendingPipeline( + const Device& device, vk::RenderPass& renderpass, vk::PipelineLayout& layout, + std::tuple shaders); +vk::Pipeline CreateWrappedCoverageBlendingPipeline( + const Device& device, vk::RenderPass& renderpass, vk::PipelineLayout& layout, + std::tuple shaders); VkWriteDescriptorSet CreateWriteDescriptorSet(std::vector& images, VkSampler sampler, VkImageView view, VkDescriptorSet set, u32 binding); diff --git a/src/video_core/renderer_vulkan/present/window_adapt_pass.cpp b/src/video_core/renderer_vulkan/present/window_adapt_pass.cpp index c5db0230d..22ffacf11 100755 --- a/src/video_core/renderer_vulkan/present/window_adapt_pass.cpp +++ b/src/video_core/renderer_vulkan/present/window_adapt_pass.cpp @@ -22,7 +22,7 @@ WindowAdaptPass::WindowAdaptPass(const Device& device_, VkFormat frame_format, CreatePipelineLayout(); CreateVertexShader(); CreateRenderPass(frame_format); - CreatePipeline(); + CreatePipelines(); } WindowAdaptPass::~WindowAdaptPass() = default; @@ -34,7 +34,6 @@ void WindowAdaptPass::Draw(RasterizerVulkan& rasterizer, Scheduler& scheduler, s const VkFramebuffer host_framebuffer{*dst->framebuffer}; const VkRenderPass renderpass{*render_pass}; - const VkPipeline graphics_pipeline{*pipeline}; const VkPipelineLayout graphics_pipeline_layout{*pipeline_layout}; const VkExtent2D render_area{ .width = dst->width, @@ -44,9 +43,23 @@ void WindowAdaptPass::Draw(RasterizerVulkan& rasterizer, Scheduler& scheduler, s const size_t layer_count = configs.size(); std::vector push_constants(layer_count); std::vector descriptor_sets(layer_count); + std::vector graphics_pipelines(layer_count); auto layer_it = layers.begin(); for (size_t i = 0; i < layer_count; i++) { + switch (configs[i].blending) { + case Tegra::BlendMode::Opaque: + default: + graphics_pipelines[i] = *opaque_pipeline; + break; + case Tegra::BlendMode::Premultiplied: + graphics_pipelines[i] = *premultiplied_pipeline; + break; + case Tegra::BlendMode::Coverage: + graphics_pipelines[i] = *coverage_pipeline; + break; + } + layer_it->ConfigureDraw(&push_constants[i], &descriptor_sets[i], rasterizer, *sampler, image_index, configs[i], layout); layer_it++; @@ -77,8 +90,8 @@ void WindowAdaptPass::Draw(RasterizerVulkan& rasterizer, Scheduler& scheduler, s BeginRenderPass(cmdbuf, renderpass, host_framebuffer, render_area); cmdbuf.ClearAttachments({clear_attachment}, {clear_rect}); - cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, graphics_pipeline); for (size_t i = 0; i < layer_count; i++) { + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, graphics_pipelines[i]); 
cmdbuf.PushConstants(graphics_pipeline_layout, VK_SHADER_STAGE_VERTEX_BIT, push_constants[i]); cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, graphics_pipeline_layout, 0, @@ -129,9 +142,13 @@ void WindowAdaptPass::CreateRenderPass(VkFormat frame_format) { render_pass = CreateWrappedRenderPass(device, frame_format, VK_IMAGE_LAYOUT_UNDEFINED); } -void WindowAdaptPass::CreatePipeline() { - pipeline = CreateWrappedPipeline(device, render_pass, pipeline_layout, - std::tie(vertex_shader, fragment_shader), false); +void WindowAdaptPass::CreatePipelines() { + opaque_pipeline = CreateWrappedPipeline(device, render_pass, pipeline_layout, + std::tie(vertex_shader, fragment_shader)); + premultiplied_pipeline = CreateWrappedPremultipliedBlendingPipeline( + device, render_pass, pipeline_layout, std::tie(vertex_shader, fragment_shader)); + coverage_pipeline = CreateWrappedCoverageBlendingPipeline( + device, render_pass, pipeline_layout, std::tie(vertex_shader, fragment_shader)); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/present/window_adapt_pass.h b/src/video_core/renderer_vulkan/present/window_adapt_pass.h index 0e2edfc31..cf667a4fc 100755 --- a/src/video_core/renderer_vulkan/present/window_adapt_pass.h +++ b/src/video_core/renderer_vulkan/present/window_adapt_pass.h @@ -42,7 +42,7 @@ private: void CreatePipelineLayout(); void CreateVertexShader(); void CreateRenderPass(VkFormat frame_format); - void CreatePipeline(); + void CreatePipelines(); private: const Device& device; @@ -52,7 +52,9 @@ private: vk::ShaderModule vertex_shader; vk::ShaderModule fragment_shader; vk::RenderPass render_pass; - vk::Pipeline pipeline; + vk::Pipeline opaque_pipeline; + vk::Pipeline premultiplied_pipeline; + vk::Pipeline coverage_pipeline; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 81e305bd3..111f5df3c 100755 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -19,7 +19,9 @@ #include "core/core_timing.h" #include "core/frontend/graphics_context.h" #include "core/telemetry_session.h" +#include "video_core/capture.h" #include "video_core/gpu.h" +#include "video_core/present.h" #include "video_core/renderer_vulkan/present/util.h" #include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/vk_blit_screen.h" @@ -38,6 +40,20 @@ namespace Vulkan { namespace { + +constexpr VkExtent2D CaptureImageSize{ + .width = VideoCore::Capture::LinearWidth, + .height = VideoCore::Capture::LinearHeight, +}; + +constexpr VkExtent3D CaptureImageExtent{ + .width = VideoCore::Capture::LinearWidth, + .height = VideoCore::Capture::LinearHeight, + .depth = VideoCore::Capture::LinearDepth, +}; + +constexpr VkFormat CaptureFormat = VK_FORMAT_A8B8G8R8_UNORM_PACK32; + std::string GetReadableVersion(u32 version) { return fmt::format("{}.{}.{}", VK_VERSION_MAJOR(version), VK_VERSION_MINOR(version), VK_VERSION_PATCH(version)); @@ -99,10 +115,15 @@ RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_, render_window.GetFramebufferLayout().height), present_manager(instance, render_window, device, memory_allocator, scheduler, swapchain, surface), - blit_swapchain(device_memory, device, memory_allocator, present_manager, scheduler), - blit_screenshot(device_memory, device, memory_allocator, present_manager, scheduler), + blit_swapchain(device_memory, device, memory_allocator, present_manager, scheduler, + 
PresentFiltersForDisplay), + blit_capture(device_memory, device, memory_allocator, present_manager, scheduler, + PresentFiltersForDisplay), + blit_applet(device_memory, device, memory_allocator, present_manager, scheduler, + PresentFiltersForAppletCapture), rasterizer(render_window, gpu, device_memory, device, memory_allocator, state_tracker, - scheduler) { + scheduler), + applet_frame() { if (Settings::values.renderer_force_max_clock.GetValue() && device.ShouldBoostClocks()) { turbo_mode.emplace(instance, dld); scheduler.RegisterOnSubmit([this] { turbo_mode->QueueSubmitted(); }); @@ -125,6 +146,8 @@ void RendererVulkan::Composite(std::span framebu SCOPE_EXIT({ render_window.OnFrameDisplayed(); }); + RenderAppletCaptureLayer(framebuffers); + if (!render_window.IsShown()) { return; } @@ -167,30 +190,20 @@ void RendererVulkan::Report() const { telemetry_session.AddField(field, "GPU_Vulkan_Extensions", extensions); } -void Vulkan::RendererVulkan::RenderScreenshot( - std::span framebuffers) { - if (!renderer_settings.screenshot_requested) { - return; - } - - constexpr VkFormat ScreenshotFormat{VK_FORMAT_B8G8R8A8_UNORM}; - const Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; - +vk::Buffer RendererVulkan::RenderToBuffer(std::span framebuffers, + const Layout::FramebufferLayout& layout, VkFormat format, + VkDeviceSize buffer_size) { auto frame = [&]() { Frame f{}; - f.image = CreateWrappedImage(memory_allocator, VkExtent2D{layout.width, layout.height}, - ScreenshotFormat); - f.image_view = CreateWrappedImageView(device, f.image, ScreenshotFormat); - f.framebuffer = blit_screenshot.CreateFramebuffer(layout, *f.image_view, ScreenshotFormat); + f.image = + CreateWrappedImage(memory_allocator, VkExtent2D{layout.width, layout.height}, format); + f.image_view = CreateWrappedImageView(device, f.image, format); + f.framebuffer = blit_capture.CreateFramebuffer(layout, *f.image_view, format); return f; }(); - blit_screenshot.DrawToFrame(rasterizer, &frame, framebuffers, layout, 1, - VK_FORMAT_B8G8R8A8_UNORM); - - const auto dst_buffer = CreateWrappedBuffer( - memory_allocator, static_cast(layout.width * layout.height * 4), - MemoryUsage::Download); + auto dst_buffer = CreateWrappedBuffer(memory_allocator, buffer_size, MemoryUsage::Download); + blit_capture.DrawToFrame(rasterizer, &frame, framebuffers, layout, 1, format); scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([&](vk::CommandBuffer cmdbuf) { @@ -198,15 +211,68 @@ void Vulkan::RendererVulkan::RenderScreenshot( VkExtent3D{layout.width, layout.height, 1}); }); - // Ensure the copy is fully completed before saving the screenshot + // Ensure the copy is fully completed before saving the capture scheduler.Finish(); - // Copy backing image data to the QImage screenshot buffer + // Copy backing image data to the capture buffer dst_buffer.Invalidate(); + return dst_buffer; +} + +void RendererVulkan::RenderScreenshot(std::span framebuffers) { + if (!renderer_settings.screenshot_requested) { + return; + } + + const auto& layout{renderer_settings.screenshot_framebuffer_layout}; + const auto dst_buffer = RenderToBuffer(framebuffers, layout, VK_FORMAT_B8G8R8A8_UNORM, + layout.width * layout.height * 4); + std::memcpy(renderer_settings.screenshot_bits, dst_buffer.Mapped().data(), dst_buffer.Mapped().size()); renderer_settings.screenshot_complete_callback(false); renderer_settings.screenshot_requested = false; } +std::vector RendererVulkan::GetAppletCaptureBuffer() { + using namespace VideoCore::Capture; + 
+ std::vector out(VideoCore::Capture::TiledSize); + + if (!applet_frame.image) { + return out; + } + + const auto dst_buffer = + CreateWrappedBuffer(memory_allocator, VideoCore::Capture::TiledSize, MemoryUsage::Download); + + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([&](vk::CommandBuffer cmdbuf) { + DownloadColorImage(cmdbuf, *applet_frame.image, *dst_buffer, CaptureImageExtent); + }); + + // Ensure the copy is fully completed before writing the capture + scheduler.Finish(); + + // Swizzle image data to the capture buffer + dst_buffer.Invalidate(); + Tegra::Texture::SwizzleTexture(out, dst_buffer.Mapped(), BytesPerPixel, LinearWidth, + LinearHeight, LinearDepth, BlockHeight, BlockDepth); + + return out; +} + +void RendererVulkan::RenderAppletCaptureLayer( + std::span framebuffers) { + if (!applet_frame.image) { + applet_frame.image = CreateWrappedImage(memory_allocator, CaptureImageSize, CaptureFormat); + applet_frame.image_view = CreateWrappedImageView(device, applet_frame.image, CaptureFormat); + applet_frame.framebuffer = blit_applet.CreateFramebuffer( + VideoCore::Capture::Layout, *applet_frame.image_view, CaptureFormat); + } + + blit_applet.DrawToFrame(rasterizer, &applet_frame, framebuffers, VideoCore::Capture::Layout, 1, + CaptureFormat); +} + } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index 505f9ddf6..81f97bb59 100755 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -48,6 +48,8 @@ public: void Composite(std::span framebuffers) override; + std::vector GetAppletCaptureBuffer() override; + VideoCore::RasterizerInterface* ReadRasterizer() override { return &rasterizer; } @@ -59,7 +61,11 @@ public: private: void Report() const; + vk::Buffer RenderToBuffer(std::span framebuffers, + const Layout::FramebufferLayout& layout, VkFormat format, + VkDeviceSize buffer_size); void RenderScreenshot(std::span framebuffers); + void RenderAppletCaptureLayer(std::span framebuffers); Core::TelemetrySession& telemetry_session; Tegra::MaxwellDeviceMemoryManager& device_memory; @@ -79,9 +85,12 @@ private: Swapchain swapchain; PresentManager present_manager; BlitScreen blit_swapchain; - BlitScreen blit_screenshot; + BlitScreen blit_capture; + BlitScreen blit_applet; RasterizerVulkan rasterizer; std::optional turbo_mode; + + Frame applet_frame; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index 14699f2b3..5118207bb 100755 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "video_core/framebuffer_config.h" +#include "video_core/present.h" #include "video_core/renderer_vulkan/present/filters.h" #include "video_core/renderer_vulkan/present/layer.h" #include "video_core/renderer_vulkan/vk_blit_screen.h" @@ -12,9 +13,9 @@ namespace Vulkan { BlitScreen::BlitScreen(Tegra::MaxwellDeviceMemoryManager& device_memory_, const Device& device_, MemoryAllocator& memory_allocator_, PresentManager& present_manager_, - Scheduler& scheduler_) + Scheduler& scheduler_, const PresentFilters& filters_) : device_memory{device_memory_}, device{device_}, memory_allocator{memory_allocator_}, - present_manager{present_manager_}, scheduler{scheduler_}, image_count{1}, + present_manager{present_manager_}, scheduler{scheduler_}, 
filters{filters_}, image_count{1}, swapchain_view_format{VK_FORMAT_B8G8R8A8_UNORM} {} BlitScreen::~BlitScreen() = default; @@ -27,7 +28,7 @@ void BlitScreen::WaitIdle() { void BlitScreen::SetWindowAdaptPass() { layers.clear(); - scaling_filter = Settings::values.scaling_filter.GetValue(); + scaling_filter = filters.get_scaling_filter(); switch (scaling_filter) { case Settings::ScalingFilter::NearestNeighbor: @@ -59,7 +60,7 @@ void BlitScreen::DrawToFrame(RasterizerVulkan& rasterizer, Frame* frame, bool presentation_recreate_required = false; // Recreate dynamic resources if the adapting filter changed - if (!window_adapt || scaling_filter != Settings::values.scaling_filter.GetValue()) { + if (!window_adapt || scaling_filter != filters.get_scaling_filter()) { resource_update_required = true; } @@ -102,7 +103,7 @@ void BlitScreen::DrawToFrame(RasterizerVulkan& rasterizer, Frame* frame, while (layers.size() < framebuffers.size()) { layers.emplace_back(device, memory_allocator, scheduler, device_memory, image_count, - window_size, window_adapt->GetDescriptorSetLayout()); + window_size, window_adapt->GetDescriptorSetLayout(), filters); } // Perform the draw @@ -119,8 +120,7 @@ vk::Framebuffer BlitScreen::CreateFramebuffer(const Layout::FramebufferLayout& l VkFormat current_view_format) { const bool format_updated = std::exchange(swapchain_view_format, current_view_format) != current_view_format; - if (!window_adapt || scaling_filter != Settings::values.scaling_filter.GetValue() || - format_updated) { + if (!window_adapt || scaling_filter != filters.get_scaling_filter() || format_updated) { WaitIdle(); SetWindowAdaptPass(); } diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h index 6ac55daa1..2543da1a1 100755 --- a/src/video_core/renderer_vulkan/vk_blit_screen.h +++ b/src/video_core/renderer_vulkan/vk_blit_screen.h @@ -16,6 +16,8 @@ namespace Core { class System; } +struct PresentFilters; + namespace Tegra { struct FramebufferConfig; } @@ -47,7 +49,7 @@ class BlitScreen { public: explicit BlitScreen(Tegra::MaxwellDeviceMemoryManager& device_memory, const Device& device, MemoryAllocator& memory_allocator, PresentManager& present_manager, - Scheduler& scheduler); + Scheduler& scheduler, const PresentFilters& filters); ~BlitScreen(); void DrawToFrame(RasterizerVulkan& rasterizer, Frame* frame, @@ -70,6 +72,7 @@ private: MemoryAllocator& memory_allocator; PresentManager& present_manager; Scheduler& scheduler; + const PresentFilters& filters; std::size_t image_count{}; std::size_t image_index{}; VkFormat swapchain_view_format{}; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index dd9496164..e5e90c069 100755 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -381,8 +381,9 @@ PipelineCache::PipelineCache(Tegra::MaxwellDeviceMemoryManager& device_memory_, .support_float64 = device.IsFloat64Supported(), .support_float16 = device.IsFloat16Supported(), .support_int64 = device.IsShaderInt64Supported(), - .needs_demote_reorder = - driver_id == VK_DRIVER_ID_AMD_PROPRIETARY || driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE, + .needs_demote_reorder = driver_id == VK_DRIVER_ID_AMD_PROPRIETARY || + driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE || + driver_id == VK_DRIVER_ID_SAMSUNG_PROPRIETARY, .support_snorm_render_buffer = true, .support_viewport_index_layer = device.IsExtShaderViewportIndexLayerSupported(), 
.min_ssbo_alignment = static_cast(device.GetStorageBufferAlignment()),
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index c0a2597fa..85046e708 100755
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -746,7 +746,13 @@ std::pair TextureCache
::TryFindFramebufferImage }(); const auto GetImageViewForFramebuffer = [&](ImageId image_id) { - const ImageViewInfo info{ImageViewType::e2D, view_format}; + ImageViewInfo info{ImageViewType::e2D, view_format}; + if (config.blending == Tegra::BlendMode::Opaque) { + info.x_source = static_cast(SwizzleSource::R); + info.y_source = static_cast(SwizzleSource::G); + info.z_source = static_cast(SwizzleSource::B); + info.w_source = static_cast(SwizzleSource::OneFloat); + } return std::make_pair(&slot_image_views[FindOrEmplaceImageView(image_id, info)], slot_images[image_id].IsRescaled()); }; diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 0a494af27..bd77057b0 100755 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -868,6 +868,8 @@ std::string Device::GetDriverName() const { return "Qualcomm"; case VK_DRIVER_ID_ARM_PROPRIETARY: return "Mali"; + case VK_DRIVER_ID_SAMSUNG_PROPRIETARY: + return "Xclipse"; case VK_DRIVER_ID_GOOGLE_SWIFTSHADER: return "SwiftShader"; case VK_DRIVER_ID_BROADCOM_PROPRIETARY: diff --git a/src/yuzu/CMakeLists.txt b/src/yuzu/CMakeLists.txt index ff712d5fd..7c092061e 100755 --- a/src/yuzu/CMakeLists.txt +++ b/src/yuzu/CMakeLists.txt @@ -41,6 +41,9 @@ add_executable(yuzu configuration/configuration_shared.cpp configuration/configuration_shared.h configuration/configure.ui + configuration/configure_applets.cpp + configuration/configure_applets.h + configuration/configure_applets.ui configuration/configure_audio.cpp configuration/configure_audio.h configuration/configure_audio.ui diff --git a/src/yuzu/configuration/configure_applets.cpp b/src/yuzu/configuration/configure_applets.cpp new file mode 100755 index 000000000..513ecb548 --- /dev/null +++ b/src/yuzu/configuration/configure_applets.cpp @@ -0,0 +1,86 @@ +// SPDX-FileCopyrightText: 2024 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "common/settings.h" +#include "core/core.h" +#include "ui_configure_applets.h" +#include "yuzu/configuration/configuration_shared.h" +#include "yuzu/configuration/configure_applets.h" +#include "yuzu/configuration/shared_widget.h" + +ConfigureApplets::ConfigureApplets(Core::System& system_, + std::shared_ptr> group_, + const ConfigurationShared::Builder& builder, QWidget* parent) + : Tab(group_, parent), ui{std::make_unique()}, system{system_} { + ui->setupUi(this); + + Setup(builder); + + SetConfiguration(); +} + +ConfigureApplets::~ConfigureApplets() = default; + +void ConfigureApplets::changeEvent(QEvent* event) { + if (event->type() == QEvent::LanguageChange) { + RetranslateUI(); + } + + QWidget::changeEvent(event); +} + +void ConfigureApplets::RetranslateUI() { + ui->retranslateUi(this); +} + +void ConfigureApplets::Setup(const ConfigurationShared::Builder& builder) { + auto& library_applets_layout = *ui->group_library_applet_modes->layout(); + std::map applets_hold{}; + + std::vector settings; + auto push = [&settings](auto& list) { + for (auto setting : list) { + settings.push_back(setting); + } + }; + + push(Settings::values.linkage.by_category[Settings::Category::LibraryApplet]); + + for (auto setting : settings) { + ConfigurationShared::Widget* widget = builder.BuildWidget(setting, apply_funcs); + + if (widget == nullptr) { + continue; + } + if (!widget->Valid()) { + widget->deleteLater(); + continue; + } + + // Untested applets + if (setting->Id() == Settings::values.data_erase_applet_mode.Id() || + setting->Id() == 
Settings::values.error_applet_mode.Id() || + setting->Id() == Settings::values.net_connect_applet_mode.Id() || + setting->Id() == Settings::values.web_applet_mode.Id() || + setting->Id() == Settings::values.shop_applet_mode.Id() || + setting->Id() == Settings::values.login_share_applet_mode.Id() || + setting->Id() == Settings::values.wifi_web_auth_applet_mode.Id() || + setting->Id() == Settings::values.my_page_applet_mode.Id()) { + widget->setHidden(true); + } + + applets_hold.emplace(setting->Id(), widget); + } + for (const auto& [label, widget] : applets_hold) { + library_applets_layout.addWidget(widget); + } +} + +void ConfigureApplets::SetConfiguration() {} + +void ConfigureApplets::ApplyConfiguration() { + const bool powered_on = system.IsPoweredOn(); + for (const auto& func : apply_funcs) { + func(powered_on); + } +} diff --git a/src/yuzu/configuration/configure_applets.h b/src/yuzu/configuration/configure_applets.h new file mode 100755 index 000000000..54f494d2f --- /dev/null +++ b/src/yuzu/configuration/configure_applets.h @@ -0,0 +1,48 @@ +// SPDX-FileCopyrightText: 2024 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include "yuzu/configuration/configuration_shared.h" + +class QCheckBox; +class QLineEdit; +class QComboBox; +class QDateTimeEdit; +namespace Core { +class System; +} + +namespace Ui { +class ConfigureApplets; +} + +namespace ConfigurationShared { +class Builder; +} + +class ConfigureApplets : public ConfigurationShared::Tab { +public: + explicit ConfigureApplets(Core::System& system_, + std::shared_ptr> group, + const ConfigurationShared::Builder& builder, + QWidget* parent = nullptr); + ~ConfigureApplets() override; + + void ApplyConfiguration() override; + void SetConfiguration() override; + +private: + void changeEvent(QEvent* event) override; + void RetranslateUI(); + + void Setup(const ConfigurationShared::Builder& builder); + + std::vector> apply_funcs{}; + + std::unique_ptr ui; + bool enabled = false; + + Core::System& system; +}; diff --git a/src/yuzu/configuration/configure_applets.ui b/src/yuzu/configuration/configure_applets.ui new file mode 100755 index 000000000..6f2ca66bd --- /dev/null +++ b/src/yuzu/configuration/configure_applets.ui @@ -0,0 +1,65 @@ + + + ConfigureApplets + + + + 0 + 0 + 605 + 300 + + + + Form + + + Applets + + + + + + + + Applet mode preference + + + + + + + 0 + + + 0 + + + 0 + + + + + + + + + + + + + Qt::Vertical + + + + 20 + 40 + + + + + + + + + diff --git a/src/yuzu/configuration/configure_dialog.cpp b/src/yuzu/configuration/configure_dialog.cpp index 14e3580a2..e9c92a0b9 100755 --- a/src/yuzu/configuration/configure_dialog.cpp +++ b/src/yuzu/configuration/configure_dialog.cpp @@ -8,6 +8,7 @@ #include "core/core.h" #include "ui_configure.h" #include "vk_device_info.h" +#include "yuzu/configuration/configure_applets.h" #include "yuzu/configuration/configure_audio.h" #include "yuzu/configuration/configure_cpu.h" #include "yuzu/configuration/configure_debug_tab.h" @@ -34,6 +35,7 @@ ConfigureDialog::ConfigureDialog(QWidget* parent, HotkeyRegistry& registry_, : QDialog(parent), ui{std::make_unique()}, registry(registry_), system{system_}, builder{std::make_unique( this, !system_.IsPoweredOn())}, + applets_tab{std::make_unique(system_, nullptr, *builder, this)}, audio_tab{std::make_unique(system_, nullptr, *builder, this)}, cpu_tab{std::make_unique(system_, nullptr, *builder, this)}, debug_tab_tab{std::make_unique(system_, this)}, @@ -58,6 +60,7 @@ 
ConfigureDialog::ConfigureDialog(QWidget* parent, HotkeyRegistry& registry_, ui->setupUi(this); + ui->tabWidget->addTab(applets_tab.get(), tr("Applets")); ui->tabWidget->addTab(audio_tab.get(), tr("Audio")); ui->tabWidget->addTab(cpu_tab.get(), tr("CPU")); ui->tabWidget->addTab(debug_tab_tab.get(), tr("Debug")); @@ -124,6 +127,7 @@ void ConfigureDialog::ApplyConfiguration() { debug_tab_tab->ApplyConfiguration(); web_tab->ApplyConfiguration(); network_tab->ApplyConfiguration(); + applets_tab->ApplyConfiguration(); system.ApplySettings(); Settings::LogSettings(); } @@ -161,7 +165,8 @@ void ConfigureDialog::PopulateSelectionList() { {{tr("General"), {general_tab.get(), hotkeys_tab.get(), ui_tab.get(), web_tab.get(), debug_tab_tab.get()}}, {tr("System"), - {system_tab.get(), profile_tab.get(), network_tab.get(), filesystem_tab.get()}}, + {system_tab.get(), profile_tab.get(), network_tab.get(), filesystem_tab.get(), + applets_tab.get()}}, {tr("CPU"), {cpu_tab.get()}}, {tr("Graphics"), {graphics_tab.get(), graphics_advanced_tab.get()}}, {tr("Audio"), {audio_tab.get()}}, diff --git a/src/yuzu/configuration/configure_dialog.h b/src/yuzu/configuration/configure_dialog.h index 8f56c9bca..1c507d5c2 100755 --- a/src/yuzu/configuration/configure_dialog.h +++ b/src/yuzu/configuration/configure_dialog.h @@ -15,6 +15,7 @@ namespace Core { class System; } +class ConfigureApplets; class ConfigureAudio; class ConfigureCpu; class ConfigureDebugTab; @@ -75,6 +76,7 @@ private: std::unique_ptr builder; std::vector tab_group; + std::unique_ptr applets_tab; std::unique_ptr audio_tab; std::unique_ptr cpu_tab; std::unique_ptr debug_tab_tab; diff --git a/src/yuzu/configuration/shared_translation.cpp b/src/yuzu/configuration/shared_translation.cpp index ed9c7d859..ce65b2bf1 100755 --- a/src/yuzu/configuration/shared_translation.cpp +++ b/src/yuzu/configuration/shared_translation.cpp @@ -26,6 +26,23 @@ std::unique_ptr InitializeTranslations(QWidget* parent) { // A setting can be ignored by giving it a blank name + // Applets + INSERT(Settings, cabinet_applet_mode, tr("Amiibo editor"), QStringLiteral()); + INSERT(Settings, controller_applet_mode, tr("Controller configuration"), QStringLiteral()); + INSERT(Settings, data_erase_applet_mode, tr("Data erase"), QStringLiteral()); + INSERT(Settings, error_applet_mode, tr("Error"), QStringLiteral()); + INSERT(Settings, net_connect_applet_mode, tr("Net connect"), QStringLiteral()); + INSERT(Settings, player_select_applet_mode, tr("Player select"), QStringLiteral()); + INSERT(Settings, swkbd_applet_mode, tr("Software keyboard"), QStringLiteral()); + INSERT(Settings, mii_edit_applet_mode, tr("Mii Edit"), QStringLiteral()); + INSERT(Settings, web_applet_mode, tr("Online web"), QStringLiteral()); + INSERT(Settings, shop_applet_mode, tr("Shop"), QStringLiteral()); + INSERT(Settings, photo_viewer_applet_mode, tr("Photo viewer"), QStringLiteral()); + INSERT(Settings, offline_web_applet_mode, tr("Offline web"), QStringLiteral()); + INSERT(Settings, login_share_applet_mode, tr("Login share"), QStringLiteral()); + INSERT(Settings, wifi_web_auth_applet_mode, tr("Wifi web auth"), QStringLiteral()); + INSERT(Settings, my_page_applet_mode, tr("My page"), QStringLiteral()); + // Audio INSERT(Settings, sink_id, tr("Output Engine:"), QStringLiteral()); INSERT(Settings, audio_output_device_id, tr("Output Device:"), QStringLiteral()); @@ -203,6 +220,11 @@ std::unique_ptr ComboboxEnumeration(QWidget* parent) { #define PAIR(ENUM, VALUE, TRANSLATION) {static_cast(Settings::ENUM::VALUE), 
(TRANSLATION)}
// Intentionally skipping VSyncMode to let the UI fill that one out
+ translations->insert({Settings::EnumMetadata::Index(),
+ {
+ PAIR(AppletMode, HLE, tr("Custom frontend")),
+ PAIR(AppletMode, LLE, tr("Real applet")),
+ }});
translations->insert({Settings::EnumMetadata::Index(),
{