From 2391029e4794861f9c6b1a166c8bc859a28cd34c Mon Sep 17 00:00:00 2001
From: pineappleEA
Date: Fri, 2 Feb 2024 17:58:50 +0100
Subject: [PATCH] early-access version 4109

---
 CMakeLists.txt | 2 +-
 README.md | 2 +-
 src/common/settings.cpp | 3 +
 src/common/settings.h | 32 +
 src/common/settings_common.h | 1 +
 src/common/settings_enums.h | 2 +
 src/core/device_memory_manager.h | 2 +
 src/core/guest_memory.h | 62 +-
 src/core/hle/kernel/k_process.cpp | 7 +-
 src/core/hle/service/am/am_types.h | 6 +-
 .../am/frontend/applet_software_keyboard.cpp | 12 +-
 .../am/frontend/applet_software_keyboard.h | 2 +-
 .../hle/service/am/library_applet_creator.cpp | 57 +-
 src/core/hle/service/am/self_controller.cpp | 3 +-
 .../hle/service/am/system_buffer_manager.cpp | 18 +-
 .../hle/service/am/system_buffer_manager.h | 3 +-
 src/core/hle/service/nvdrv/core/container.cpp | 7 +-
 src/core/hle/service/nvdrv/core/container.h | 4 +-
 src/core/hle/service/nvdrv/core/nvmap.cpp | 8 +-
 .../service/nvdrv/devices/nvdisp_disp0.cpp | 17 +
 .../service/nvdrv/devices/nvhost_nvdec.cpp | 15 +-
 .../nvdrv/devices/nvhost_nvdec_common.cpp | 27 +-
 .../nvdrv/devices/nvhost_nvdec_common.h | 1 +
 .../hle/service/nvdrv/devices/nvhost_vic.cpp | 15 +-
 .../nvnflinger/fb_share_buffer_manager.cpp | 176 ++-
 .../nvnflinger/fb_share_buffer_manager.h | 24 +-
 .../service/nvnflinger/hardware_composer.cpp | 1 +
 src/core/hle/service/nvnflinger/hwc_layer.h | 13 +
 .../hle/service/nvnflinger/nvnflinger.cpp | 7 +-
 src/core/hle/service/nvnflinger/nvnflinger.h | 6 +-
 src/core/hle/service/vi/layer/vi_layer.cpp | 6 +-
 src/core/hle/service/vi/layer/vi_layer.h | 13 +
 src/core/memory.h | 2 +
 src/frontend_common/config.cpp | 18 +
 src/frontend_common/config.h | 2 +
 src/video_core/CMakeLists.txt | 8 +-
 src/video_core/capture.h | 36 +
 src/video_core/cdma_pusher.cpp | 206 ++-
 src/video_core/cdma_pusher.h | 87 +-
 src/video_core/framebuffer_config.h | 7 +
 src/video_core/gpu.cpp | 48 +-
 src/video_core/gpu.h | 11 +-
 src/video_core/gpu_thread.cpp | 1 +
 src/video_core/host1x/codecs/decoder.cpp | 18 +-
 src/video_core/host1x/codecs/decoder.h | 26 +-
 src/video_core/host1x/codecs/h264.cpp | 143 +-
 src/video_core/host1x/codecs/h264.h | 293 ++--
 src/video_core/host1x/codecs/vp8.cpp | 73 +-
 src/video_core/host1x/codecs/vp8.h | 44 +-
 src/video_core/host1x/codecs/vp9.cpp | 152 +-
 src/video_core/host1x/codecs/vp9.h | 46 +-
 src/video_core/host1x/codecs/vp9_types.h | 27 +-
 src/video_core/host1x/control.cpp | 1 +
 src/video_core/host1x/control.h | 10 +-
 src/video_core/host1x/ffmpeg/ffmpeg.cpp | 246 ++-
 src/video_core/host1x/ffmpeg/ffmpeg.h | 61 +-
 src/video_core/host1x/host1x.cpp | 26 +-
 src/video_core/host1x/host1x.h | 139 +-
 src/video_core/host1x/nvdec.cpp | 63 +-
 src/video_core/host1x/nvdec.h | 34 +-
 src/video_core/host1x/nvdec_common.h | 84 +-
 src/video_core/host1x/syncpoint_manager.cpp | 6 +-
 src/video_core/host1x/vic.cpp | 1331 ++++++++++++++---
 src/video_core/host1x/vic.h | 653 +++++++-
 .../host_shaders/fidelityfx_fsr.frag | 21 +-
 src/video_core/host_shaders/fxaa.frag | 2 +-
 .../host_shaders/opengl_fidelityfx_fsr.frag | 19 +-
 .../host_shaders/opengl_present.frag | 2 +-
 .../host_shaders/present_bicubic.frag | 2 +-
 .../host_shaders/present_gaussian.frag | 14 +-
 .../vulkan_fidelityfx_fsr_easu_fp16.frag | 1 +
 .../vulkan_fidelityfx_fsr_easu_fp32.frag | 1 +
 .../vulkan_fidelityfx_fsr_rcas_fp16.frag | 1 +
 .../vulkan_fidelityfx_fsr_rcas_fp32.frag | 1 +
 .../vulkan_present_scaleforce_fp16.frag | 2 +-
 .../vulkan_present_scaleforce_fp32.frag | 2 +-
 src/video_core/memory_manager.h | 2 +
 src/video_core/present.h | 37 +
 src/video_core/renderer_base.h | 3 +
 .../renderer_null/renderer_null.cpp | 5 +
 src/video_core/renderer_null/renderer_null.h | 2 +
 .../renderer_opengl/gl_blit_screen.cpp | 15 +-
 .../renderer_opengl/gl_blit_screen.h | 7 +-
 .../renderer_opengl/present/layer.cpp | 35 +-
 .../renderer_opengl/present/layer.h | 8 +-
 .../present/window_adapt_pass.cpp | 19 +-
 .../present/window_adapt_pass.h | 2 +-
 .../renderer_opengl/renderer_opengl.cpp | 89 +-
 .../renderer_opengl/renderer_opengl.h | 9 +
 .../renderer_vulkan/present/layer.cpp | 19 +-
 .../renderer_vulkan/present/layer.h | 6 +-
 .../renderer_vulkan/present/util.cpp | 92 +-
 src/video_core/renderer_vulkan/present/util.h | 9 +-
 .../present/window_adapt_pass.cpp | 29 +-
 .../present/window_adapt_pass.h | 6 +-
 .../renderer_vulkan/renderer_vulkan.cpp | 114 +-
 .../renderer_vulkan/renderer_vulkan.h | 11 +-
 .../renderer_vulkan/vk_blit_screen.cpp | 14 +-
 .../renderer_vulkan/vk_blit_screen.h | 5 +-
 .../renderer_vulkan/vk_pipeline_cache.cpp | 5 +-
 src/video_core/texture_cache/texture_cache.h | 8 +-
 .../vulkan_common/vulkan_device.cpp | 2 +
 src/yuzu/CMakeLists.txt | 3 +
 src/yuzu/configuration/configure_applets.cpp | 86 ++
 src/yuzu/configuration/configure_applets.h | 48 +
 src/yuzu/configuration/configure_applets.ui | 65 +
 src/yuzu/configuration/configure_dialog.cpp | 7 +-
 src/yuzu/configuration/configure_dialog.h | 2 +
 src/yuzu/configuration/shared_translation.cpp | 22 +
 109 files changed, 4082 insertions(+), 1228 deletions(-)
 create mode 100755 src/video_core/capture.h
 create mode 100755 src/video_core/present.h
 create mode 100755 src/yuzu/configuration/configure_applets.cpp
 create mode 100755 src/yuzu/configuration/configure_applets.h
 create mode 100755 src/yuzu/configuration/configure_applets.ui

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8d851522f..b2e23c379 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -307,7 +307,7 @@ find_package(ZLIB 1.2 REQUIRED)
 find_package(zstd 1.5 REQUIRED)

 if (NOT YUZU_USE_EXTERNAL_VULKAN_HEADERS)
-    find_package(Vulkan 1.3.274 REQUIRED)
+    find_package(VulkanHeaders 1.3.274 REQUIRED)
 endif()

 if (NOT YUZU_USE_EXTERNAL_VULKAN_UTILITY_LIBRARIES)
diff --git a/README.md b/README.md
index a6e4270e1..5a9980269 100755
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 yuzu emulator early access
 =============

-This is the source code for early-access 4107.
+This is the source code for early-access 4109.
## Legal Notice diff --git a/src/common/settings.cpp b/src/common/settings.cpp index 685ee1097..52a20ea61 100755 --- a/src/common/settings.cpp +++ b/src/common/settings.cpp @@ -30,6 +30,7 @@ namespace Settings { #define SETTING(TYPE, RANGED) template class Setting #define SWITCHABLE(TYPE, RANGED) template class SwitchableSetting +SETTING(AppletMode, false); SETTING(AudioEngine, false); SETTING(bool, false); SETTING(int, false); @@ -215,6 +216,8 @@ const char* TranslateCategory(Category category) { return "Debugging"; case Category::GpuDriver: return "GpuDriver"; + case Category::LibraryApplet: + return "LibraryApplet"; case Category::Miscellaneous: return "Miscellaneous"; case Category::Network: diff --git a/src/common/settings.h b/src/common/settings.h index ee5d42644..9ff40d118 100755 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -133,6 +133,38 @@ struct TouchFromButtonMap { struct Values { Linkage linkage{}; + // Applet + Setting cabinet_applet_mode{linkage, AppletMode::LLE, "cabinet_applet_mode", + Category::LibraryApplet}; + Setting controller_applet_mode{linkage, AppletMode::HLE, "controller_applet_mode", + Category::LibraryApplet}; + Setting data_erase_applet_mode{linkage, AppletMode::HLE, "data_erase_applet_mode", + Category::LibraryApplet}; + Setting error_applet_mode{linkage, AppletMode::HLE, "error_applet_mode", + Category::LibraryApplet}; + Setting net_connect_applet_mode{linkage, AppletMode::HLE, "net_connect_applet_mode", + Category::LibraryApplet}; + Setting player_select_applet_mode{ + linkage, AppletMode::HLE, "player_select_applet_mode", Category::LibraryApplet}; + Setting swkbd_applet_mode{linkage, AppletMode::LLE, "swkbd_applet_mode", + Category::LibraryApplet}; + Setting mii_edit_applet_mode{linkage, AppletMode::LLE, "mii_edit_applet_mode", + Category::LibraryApplet}; + Setting web_applet_mode{linkage, AppletMode::HLE, "web_applet_mode", + Category::LibraryApplet}; + Setting shop_applet_mode{linkage, AppletMode::HLE, "shop_applet_mode", + Category::LibraryApplet}; + Setting photo_viewer_applet_mode{ + linkage, AppletMode::LLE, "photo_viewer_applet_mode", Category::LibraryApplet}; + Setting offline_web_applet_mode{linkage, AppletMode::LLE, "offline_web_applet_mode", + Category::LibraryApplet}; + Setting login_share_applet_mode{linkage, AppletMode::HLE, "login_share_applet_mode", + Category::LibraryApplet}; + Setting wifi_web_auth_applet_mode{ + linkage, AppletMode::HLE, "wifi_web_auth_applet_mode", Category::LibraryApplet}; + Setting my_page_applet_mode{linkage, AppletMode::LLE, "my_page_applet_mode", + Category::LibraryApplet}; + // Audio SwitchableSetting sink_id{linkage, AudioEngine::Auto, "output_engine", Category::Audio, Specialization::RuntimeList}; diff --git a/src/common/settings_common.h b/src/common/settings_common.h index 987489e8a..2df3f0809 100755 --- a/src/common/settings_common.h +++ b/src/common/settings_common.h @@ -44,6 +44,7 @@ enum class Category : u32 { Services, Paths, Linux, + LibraryApplet, MaxEnum, }; diff --git a/src/common/settings_enums.h b/src/common/settings_enums.h index 617036588..f42367e67 100755 --- a/src/common/settings_enums.h +++ b/src/common/settings_enums.h @@ -151,6 +151,8 @@ ENUM(AspectRatio, R16_9, R4_3, R21_9, R16_10, Stretch); ENUM(ConsoleMode, Handheld, Docked); +ENUM(AppletMode, HLE, LLE); + template inline std::string CanonicalizeEnum(Type id) { const auto group = EnumMetadata::Canonicalizations(); diff --git a/src/core/device_memory_manager.h b/src/core/device_memory_manager.h index 0568a821b..6dcf7bb22 100755 
--- a/src/core/device_memory_manager.h +++ b/src/core/device_memory_manager.h @@ -43,6 +43,8 @@ public: DeviceMemoryManager(const DeviceMemory& device_memory); ~DeviceMemoryManager(); + static constexpr bool HAS_FLUSH_INVALIDATION = true; + void BindInterface(DeviceInterface* device_inter); DAddr Allocate(size_t size); diff --git a/src/core/guest_memory.h b/src/core/guest_memory.h index 7ee18c126..83292f702 100755 --- a/src/core/guest_memory.h +++ b/src/core/guest_memory.h @@ -44,15 +44,32 @@ public: GuestMemory() = delete; explicit GuestMemory(M& memory, u64 addr, std::size_t size, Common::ScratchBuffer* backup = nullptr) - : m_memory{memory}, m_addr{addr}, m_size{size} { + : m_memory{&memory}, m_addr{addr}, m_size{size} { static_assert(FLAGS & GuestMemoryFlags::Read || FLAGS & GuestMemoryFlags::Write); - if constexpr (FLAGS & GuestMemoryFlags::Read) { + if constexpr (!(FLAGS & GuestMemoryFlags::Read)) { + if (!this->TrySetSpan()) { + if (backup) { + backup->resize_destructive(this->size()); + m_data_span = *backup; + m_span_valid = true; + m_is_data_copy = true; + } else { + m_data_copy.resize(this->size()); + m_data_span = std::span(m_data_copy); + m_span_valid = true; + m_is_data_copy = true; + } + } + } else if constexpr (FLAGS & GuestMemoryFlags::Read) { Read(addr, size, backup); } } ~GuestMemory() = default; + GuestMemory(GuestMemory&& rhs) = default; + GuestMemory& operator=(GuestMemory&& rhs) = default; + T* data() noexcept { return m_data_span.data(); } @@ -109,8 +126,8 @@ public: } if (this->TrySetSpan()) { - if constexpr (FLAGS & GuestMemoryFlags::Safe) { - m_memory.FlushRegion(m_addr, this->size_bytes()); + if constexpr (FLAGS & GuestMemoryFlags::Safe && M::HAS_FLUSH_INVALIDATION) { + m_memory->FlushRegion(m_addr, this->size_bytes()); } } else { if (backup) { @@ -123,9 +140,9 @@ public: m_is_data_copy = true; m_span_valid = true; if constexpr (FLAGS & GuestMemoryFlags::Safe) { - m_memory.ReadBlock(m_addr, this->data(), this->size_bytes()); + m_memory->ReadBlock(m_addr, this->data(), this->size_bytes()); } else { - m_memory.ReadBlockUnsafe(m_addr, this->data(), this->size_bytes()); + m_memory->ReadBlockUnsafe(m_addr, this->data(), this->size_bytes()); } } return m_data_span; @@ -133,18 +150,19 @@ public: void Write(std::span write_data) noexcept { if constexpr (FLAGS & GuestMemoryFlags::Cached) { - m_memory.WriteBlockCached(m_addr, write_data.data(), this->size_bytes()); + m_memory->WriteBlockCached(m_addr, write_data.data(), this->size_bytes()); } else if constexpr (FLAGS & GuestMemoryFlags::Safe) { - m_memory.WriteBlock(m_addr, write_data.data(), this->size_bytes()); + m_memory->WriteBlock(m_addr, write_data.data(), this->size_bytes()); } else { - m_memory.WriteBlockUnsafe(m_addr, write_data.data(), this->size_bytes()); + m_memory->WriteBlockUnsafe(m_addr, write_data.data(), this->size_bytes()); } } bool TrySetSpan() noexcept { - if (u8* ptr = m_memory.GetSpan(m_addr, this->size_bytes()); ptr) { + if (u8* ptr = m_memory->GetSpan(m_addr, this->size_bytes()); ptr) { m_data_span = {reinterpret_cast(ptr), this->size()}; m_span_valid = true; + m_is_data_copy = false; return true; } return false; @@ -159,7 +177,7 @@ protected: return m_addr_changed; } - M& m_memory; + M* m_memory; u64 m_addr{}; size_t m_size{}; std::span m_data_span{}; @@ -175,17 +193,7 @@ public: GuestMemoryScoped() = delete; explicit GuestMemoryScoped(M& memory, u64 addr, std::size_t size, Common::ScratchBuffer* backup = nullptr) - : GuestMemory(memory, addr, size, backup) { - if constexpr (!(FLAGS & 
GuestMemoryFlags::Read)) { - if (!this->TrySetSpan()) { - if (backup) { - this->m_data_span = *backup; - this->m_span_valid = true; - this->m_is_data_copy = true; - } - } - } - } + : GuestMemory(memory, addr, size, backup) {} ~GuestMemoryScoped() { if constexpr (FLAGS & GuestMemoryFlags::Write) { @@ -196,15 +204,17 @@ public: if (this->AddressChanged() || this->IsDataCopy()) { ASSERT(this->m_span_valid); if constexpr (FLAGS & GuestMemoryFlags::Cached) { - this->m_memory.WriteBlockCached(this->m_addr, this->data(), this->size_bytes()); + this->m_memory->WriteBlockCached(this->m_addr, this->data(), + this->size_bytes()); } else if constexpr (FLAGS & GuestMemoryFlags::Safe) { - this->m_memory.WriteBlock(this->m_addr, this->data(), this->size_bytes()); + this->m_memory->WriteBlock(this->m_addr, this->data(), this->size_bytes()); } else { - this->m_memory.WriteBlockUnsafe(this->m_addr, this->data(), this->size_bytes()); + this->m_memory->WriteBlockUnsafe(this->m_addr, this->data(), + this->size_bytes()); } } else if constexpr ((FLAGS & GuestMemoryFlags::Safe) || (FLAGS & GuestMemoryFlags::Cached)) { - this->m_memory.InvalidateRegion(this->m_addr, this->size_bytes()); + this->m_memory->InvalidateRegion(this->m_addr, this->size_bytes()); } } } diff --git a/src/core/hle/kernel/k_process.cpp b/src/core/hle/kernel/k_process.cpp index 6cad999aa..4ad83f8a9 100755 --- a/src/core/hle/kernel/k_process.cpp +++ b/src/core/hle/kernel/k_process.cpp @@ -4,8 +4,9 @@ #include #include "common/scope_exit.h" #include "common/settings.h" +#include "core/arm/dynarmic/arm_dynarmic.h" +#include "core/arm/dynarmic/dynarmic_exclusive_monitor.h" #include "core/core.h" -#include "core/gpu_dirty_memory_manager.h" #include "core/hle/kernel/k_process.h" #include "core/hle/kernel/k_scoped_resource_reservation.h" #include "core/hle/kernel/k_shared_memory.h" @@ -1258,6 +1259,10 @@ void KProcess::InitializeInterfaces() { #ifdef HAS_NCE if (this->IsApplication() && Settings::IsNceEnabled()) { + // Register the scoped JIT handler before creating any NCE instances + // so that its signal handler will appear first in the signal chain. 
+ Core::ScopedJitExecution::RegisterHandler(); + for (size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) { m_arm_interfaces[i] = std::make_unique(m_kernel.System(), true, i); } diff --git a/src/core/hle/service/am/am_types.h b/src/core/hle/service/am/am_types.h index a2b852b12..8c33feb15 100755 --- a/src/core/hle/service/am/am_types.h +++ b/src/core/hle/service/am/am_types.h @@ -130,9 +130,9 @@ enum class AppletProgramId : u64 { enum class LibraryAppletMode : u32 { AllForeground = 0, - Background = 1, - NoUI = 2, - BackgroundIndirectDisplay = 3, + PartialForeground = 1, + NoUi = 2, + PartialForegroundIndirectDisplay = 3, AllForegroundInitiallyHidden = 4, }; diff --git a/src/core/hle/service/am/frontend/applet_software_keyboard.cpp b/src/core/hle/service/am/frontend/applet_software_keyboard.cpp index fbf75d379..034c62f32 100755 --- a/src/core/hle/service/am/frontend/applet_software_keyboard.cpp +++ b/src/core/hle/service/am/frontend/applet_software_keyboard.cpp @@ -68,9 +68,9 @@ void SoftwareKeyboard::Initialize() { case LibraryAppletMode::AllForeground: InitializeForeground(); break; - case LibraryAppletMode::Background: - case LibraryAppletMode::BackgroundIndirectDisplay: - InitializeBackground(applet_mode); + case LibraryAppletMode::PartialForeground: + case LibraryAppletMode::PartialForegroundIndirectDisplay: + InitializePartialForeground(applet_mode); break; default: ASSERT_MSG(false, "Invalid LibraryAppletMode={}", applet_mode); @@ -243,7 +243,7 @@ void SoftwareKeyboard::InitializeForeground() { InitializeFrontendNormalKeyboard(); } -void SoftwareKeyboard::InitializeBackground(LibraryAppletMode library_applet_mode) { +void SoftwareKeyboard::InitializePartialForeground(LibraryAppletMode library_applet_mode) { LOG_INFO(Service_AM, "Initializing Inline Software Keyboard Applet."); is_background = true; @@ -258,9 +258,9 @@ void SoftwareKeyboard::InitializeBackground(LibraryAppletMode library_applet_mod swkbd_inline_initialize_arg.size()); if (swkbd_initialize_arg.library_applet_mode_flag) { - ASSERT(library_applet_mode == LibraryAppletMode::Background); + ASSERT(library_applet_mode == LibraryAppletMode::PartialForeground); } else { - ASSERT(library_applet_mode == LibraryAppletMode::BackgroundIndirectDisplay); + ASSERT(library_applet_mode == LibraryAppletMode::PartialForegroundIndirectDisplay); } } diff --git a/src/core/hle/service/am/frontend/applet_software_keyboard.h b/src/core/hle/service/am/frontend/applet_software_keyboard.h index f464b7e15..2a7d01b96 100755 --- a/src/core/hle/service/am/frontend/applet_software_keyboard.h +++ b/src/core/hle/service/am/frontend/applet_software_keyboard.h @@ -62,7 +62,7 @@ private: void InitializeForeground(); /// Initializes the inline software keyboard. - void InitializeBackground(LibraryAppletMode library_applet_mode); + void InitializePartialForeground(LibraryAppletMode library_applet_mode); /// Processes the text check sent by the application. 
void ProcessTextCheck(); diff --git a/src/core/hle/service/am/library_applet_creator.cpp b/src/core/hle/service/am/library_applet_creator.cpp index 47bab7528..a883a021e 100755 --- a/src/core/hle/service/am/library_applet_creator.cpp +++ b/src/core/hle/service/am/library_applet_creator.cpp @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "common/settings.h" #include "core/hle/kernel/k_transfer_memory.h" #include "core/hle/service/am/applet_data_broker.h" #include "core/hle/service/am/applet_manager.h" @@ -16,6 +17,34 @@ namespace Service::AM { namespace { +bool ShouldCreateGuestApplet(AppletId applet_id) { +#define X(Name, name) \ + if (applet_id == AppletId::Name && \ + Settings::values.name##_applet_mode.GetValue() != Settings::AppletMode::LLE) { \ + return false; \ + } + + X(Cabinet, cabinet) + X(Controller, controller) + X(DataErase, data_erase) + X(Error, error) + X(NetConnect, net_connect) + X(ProfileSelect, player_select) + X(SoftwareKeyboard, swkbd) + X(MiiEdit, mii_edit) + X(Web, web) + X(Shop, shop) + X(PhotoViewer, photo_viewer) + X(OfflineWeb, offline_web) + X(LoginShare, login_share) + X(WebAuth, wifi_web_auth) + X(MyPage, my_page) + +#undef X + + return true; +} + AppletProgramId AppletIdToProgramId(AppletId applet_id) { switch (applet_id) { case AppletId::OverlayDisplay: @@ -63,9 +92,10 @@ AppletProgramId AppletIdToProgramId(AppletId applet_id) { } } -[[maybe_unused]] std::shared_ptr CreateGuestApplet( - Core::System& system, std::shared_ptr caller_applet, AppletId applet_id, - LibraryAppletMode mode) { +std::shared_ptr CreateGuestApplet(Core::System& system, + std::shared_ptr caller_applet, + AppletId applet_id, + LibraryAppletMode mode) { const auto program_id = static_cast(AppletIdToProgramId(applet_id)); if (program_id == 0) { // Unknown applet @@ -87,7 +117,7 @@ AppletProgramId AppletIdToProgramId(AppletId applet_id) { // Set focus state switch (mode) { case LibraryAppletMode::AllForeground: - case LibraryAppletMode::NoUI: + case LibraryAppletMode::NoUi: applet->focus_state = FocusState::InFocus; applet->hid_registration.EnableAppletToGetInput(true); applet->message_queue.PushMessage(AppletMessageQueue::AppletMessage::ChangeIntoForeground); @@ -99,8 +129,8 @@ AppletProgramId AppletIdToProgramId(AppletId applet_id) { applet->hid_registration.EnableAppletToGetInput(false); applet->message_queue.PushMessage(AppletMessageQueue::AppletMessage::FocusStateChanged); break; - case LibraryAppletMode::Background: - case LibraryAppletMode::BackgroundIndirectDisplay: + case LibraryAppletMode::PartialForeground: + case LibraryAppletMode::PartialForegroundIndirectDisplay: default: applet->focus_state = FocusState::Background; applet->hid_registration.EnableAppletToGetInput(true); @@ -117,9 +147,10 @@ AppletProgramId AppletIdToProgramId(AppletId applet_id) { return std::make_shared(system, broker, applet); } -[[maybe_unused]] std::shared_ptr CreateFrontendApplet( - Core::System& system, std::shared_ptr caller_applet, AppletId applet_id, - LibraryAppletMode mode) { +std::shared_ptr CreateFrontendApplet(Core::System& system, + std::shared_ptr caller_applet, + AppletId applet_id, + LibraryAppletMode mode) { const auto program_id = static_cast(AppletIdToProgramId(applet_id)); auto process = std::make_unique(system); @@ -163,7 +194,13 @@ void ILibraryAppletCreator::CreateLibraryApplet(HLERequestContext& ctx) { LOG_DEBUG(Service_AM, "called with applet_id={:08X}, applet_mode={:08X}", applet_id, applet_mode); 
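The ShouldCreateGuestApplet() helper added above is an X-macro over the new per-applet AppletMode settings; expanding X(SoftwareKeyboard, swkbd) by hand gives roughly the following (illustrative expansion, not part of the patch):

    if (applet_id == AppletId::SoftwareKeyboard &&
        Settings::values.swkbd_applet_mode.GetValue() != Settings::AppletMode::LLE) {
        return false;
    }

So the guest (LLE) applet is only attempted when the corresponding setting is explicitly set to LLE; otherwise CreateLibraryApplet falls back to the HLE frontend applet, as the hunk below shows.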
- auto library_applet = CreateFrontendApplet(system, applet, applet_id, applet_mode); + std::shared_ptr library_applet; + if (ShouldCreateGuestApplet(applet_id)) { + library_applet = CreateGuestApplet(system, applet, applet_id, applet_mode); + } + if (!library_applet) { + library_applet = CreateFrontendApplet(system, applet, applet_id, applet_mode); + } if (!library_applet) { LOG_ERROR(Service_AM, "Applet doesn't exist! applet_id={}", applet_id); diff --git a/src/core/hle/service/am/self_controller.cpp b/src/core/hle/service/am/self_controller.cpp index 0289f5cf1..b92663b2b 100755 --- a/src/core/hle/service/am/self_controller.cpp +++ b/src/core/hle/service/am/self_controller.cpp @@ -288,7 +288,8 @@ void ISelfController::GetSystemSharedBufferHandle(HLERequestContext& ctx) { } Result ISelfController::EnsureBufferSharingEnabled(Kernel::KProcess* process) { - if (applet->system_buffer_manager.Initialize(&nvnflinger, process, applet->applet_id)) { + if (applet->system_buffer_manager.Initialize(&nvnflinger, process, applet->applet_id, + applet->library_applet_mode)) { return ResultSuccess; } diff --git a/src/core/hle/service/am/system_buffer_manager.cpp b/src/core/hle/service/am/system_buffer_manager.cpp index 60a9afc9d..7fb9e3a75 100755 --- a/src/core/hle/service/am/system_buffer_manager.cpp +++ b/src/core/hle/service/am/system_buffer_manager.cpp @@ -17,11 +17,12 @@ SystemBufferManager::~SystemBufferManager() { // Clean up shared layers. if (m_buffer_sharing_enabled) { + m_nvnflinger->GetSystemBufferManager().Finalize(m_process); } } bool SystemBufferManager::Initialize(Nvnflinger::Nvnflinger* nvnflinger, Kernel::KProcess* process, - AppletId applet_id) { + AppletId applet_id, LibraryAppletMode mode) { if (m_nvnflinger) { return m_buffer_sharing_enabled; } @@ -36,9 +37,14 @@ bool SystemBufferManager::Initialize(Nvnflinger::Nvnflinger* nvnflinger, Kernel: return false; } + Nvnflinger::LayerBlending blending = Nvnflinger::LayerBlending::None; + if (mode == LibraryAppletMode::PartialForeground) { + blending = Nvnflinger::LayerBlending::Coverage; + } + const auto display_id = m_nvnflinger->OpenDisplay("Default").value(); const auto res = m_nvnflinger->GetSystemBufferManager().Initialize( - &m_system_shared_buffer_id, &m_system_shared_layer_id, display_id); + m_process, &m_system_shared_buffer_id, &m_system_shared_layer_id, display_id, blending); if (res.IsSuccess()) { m_buffer_sharing_enabled = true; @@ -62,8 +68,12 @@ void SystemBufferManager::SetWindowVisibility(bool visible) { Result SystemBufferManager::WriteAppletCaptureBuffer(bool* out_was_written, s32* out_fbshare_layer_index) { - // TODO - R_SUCCEED(); + if (!m_buffer_sharing_enabled) { + return VI::ResultPermissionDenied; + } + + return m_nvnflinger->GetSystemBufferManager().WriteAppletCaptureBuffer(out_was_written, + out_fbshare_layer_index); } } // namespace Service::AM diff --git a/src/core/hle/service/am/system_buffer_manager.h b/src/core/hle/service/am/system_buffer_manager.h index 98c3cf055..0690f68b6 100755 --- a/src/core/hle/service/am/system_buffer_manager.h +++ b/src/core/hle/service/am/system_buffer_manager.h @@ -27,7 +27,8 @@ public: SystemBufferManager(); ~SystemBufferManager(); - bool Initialize(Nvnflinger::Nvnflinger* flinger, Kernel::KProcess* process, AppletId applet_id); + bool Initialize(Nvnflinger::Nvnflinger* flinger, Kernel::KProcess* process, AppletId applet_id, + LibraryAppletMode mode); void GetSystemSharedLayerHandle(u64* out_system_shared_buffer_id, u64* out_system_shared_layer_id) { diff --git 
a/src/core/hle/service/nvdrv/core/container.cpp b/src/core/hle/service/nvdrv/core/container.cpp index 5a64576a4..18103504c 100755 --- a/src/core/hle/service/nvdrv/core/container.cpp +++ b/src/core/hle/service/nvdrv/core/container.cpp @@ -49,6 +49,7 @@ SessionId Container::OpenSession(Kernel::KProcess* process) { continue; } if (session.process == process) { + session.ref_count++; return session.id; } } @@ -66,6 +67,7 @@ SessionId Container::OpenSession(Kernel::KProcess* process) { } auto& session = impl->sessions[new_id]; session.is_active = true; + session.ref_count = 1; // Optimization if (process->IsApplication()) { auto& page_table = process->GetPageTable().GetBasePageTable(); @@ -114,8 +116,11 @@ SessionId Container::OpenSession(Kernel::KProcess* process) { void Container::CloseSession(SessionId session_id) { std::scoped_lock lk(impl->session_guard); - impl->file.UnmapAllHandles(session_id); auto& session = impl->sessions[session_id.id]; + if (--session.ref_count > 0) { + return; + } + impl->file.UnmapAllHandles(session_id); auto& smmu = impl->host1x.MemoryManager(); if (session.has_preallocated_area) { const DAddr region_start = session.mapper->GetRegionStart(); diff --git a/src/core/hle/service/nvdrv/core/container.h b/src/core/hle/service/nvdrv/core/container.h index c4aa9018f..e352d7e70 100755 --- a/src/core/hle/service/nvdrv/core/container.h +++ b/src/core/hle/service/nvdrv/core/container.h @@ -46,6 +46,7 @@ struct Session { bool has_preallocated_area{}; std::unique_ptr mapper{}; bool is_active{}; + s32 ref_count{}; }; class Container { @@ -67,10 +68,7 @@ public: const SyncpointManager& GetSyncpointManager() const; struct Host1xDeviceFileData { - std::unordered_map fd_to_id{}; std::deque syncpts_accumulated{}; - u32 nvdec_next_id{}; - u32 vic_next_id{}; }; Host1xDeviceFileData& Host1xDeviceFile(); diff --git a/src/core/hle/service/nvdrv/core/nvmap.cpp b/src/core/hle/service/nvdrv/core/nvmap.cpp index dc5455231..0e21bf6de 100755 --- a/src/core/hle/service/nvdrv/core/nvmap.cpp +++ b/src/core/hle/service/nvdrv/core/nvmap.cpp @@ -333,9 +333,13 @@ void NvMap::UnmapAllHandles(NvCore::SessionId session_id) { }(); for (auto& [id, handle] : handles_copy) { - if (handle->session_id.id == session_id.id) { - FreeHandle(id, false); + { + std::scoped_lock lk{handle->mutex}; + if (handle->session_id.id != session_id.id || handle->dupes <= 0) { + continue; + } } + FreeHandle(id, false); } } diff --git a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp index 62fefe4b9..e7285a8d8 100755 --- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp +++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp @@ -15,6 +15,22 @@ namespace Service::Nvidia::Devices { +namespace { + +Tegra::BlendMode ConvertBlending(Service::Nvnflinger::LayerBlending blending) { + switch (blending) { + case Service::Nvnflinger::LayerBlending::None: + default: + return Tegra::BlendMode::Opaque; + case Service::Nvnflinger::LayerBlending::Premultiplied: + return Tegra::BlendMode::Premultiplied; + case Service::Nvnflinger::LayerBlending::Coverage: + return Tegra::BlendMode::Coverage; + } +} + +} // namespace + nvdisp_disp0::nvdisp_disp0(Core::System& system_, NvCore::Container& core) : nvdevice{system_}, container{core}, nvmap{core.GetNvMapFile()} {} nvdisp_disp0::~nvdisp_disp0() = default; @@ -56,6 +72,7 @@ void nvdisp_disp0::Composite(std::span sorted_layers .pixel_format = layer.format, .transform_flags = layer.transform, .crop_rect = layer.crop_rect, + .blending = 
ConvertBlending(layer.blending), }); for (size_t i = 0; i < layer.acquire_fence.num_fences; i++) { diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp index 762abc560..0daa97c7f 100755 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp @@ -8,6 +8,7 @@ #include "core/hle/service/nvdrv/core/container.h" #include "core/hle/service/nvdrv/devices/ioctl_serialization.h" #include "core/hle/service/nvdrv/devices/nvhost_nvdec.h" +#include "video_core/host1x/host1x.h" #include "video_core/renderer_base.h" namespace Service::Nvidia::Devices { @@ -21,13 +22,8 @@ NvResult nvhost_nvdec::Ioctl1(DeviceFD fd, Ioctl command, std::span in switch (command.group) { case 0x0: switch (command.cmd) { - case 0x1: { - auto& host1x_file = core.Host1xDeviceFile(); - if (!host1x_file.fd_to_id.contains(fd)) { - host1x_file.fd_to_id[fd] = host1x_file.nvdec_next_id++; - } + case 0x1: return WrapFixedVariable(this, &nvhost_nvdec::Submit, input, output, fd); - } case 0x2: return WrapFixed(this, &nvhost_nvdec::GetSyncpoint, input, output); case 0x3: @@ -72,15 +68,12 @@ void nvhost_nvdec::OnOpen(NvCore::SessionId session_id, DeviceFD fd) { LOG_INFO(Service_NVDRV, "NVDEC video stream started"); system.SetNVDECActive(true); sessions[fd] = session_id; + host1x.StartDevice(fd, Tegra::Host1x::ChannelType::NvDec, channel_syncpoint); } void nvhost_nvdec::OnClose(DeviceFD fd) { LOG_INFO(Service_NVDRV, "NVDEC video stream ended"); - auto& host1x_file = core.Host1xDeviceFile(); - const auto iter = host1x_file.fd_to_id.find(fd); - if (iter != host1x_file.fd_to_id.end()) { - system.GPU().ClearCdmaInstance(iter->second); - } + host1x.StopDevice(fd, Tegra::Host1x::ChannelType::NvDec); system.SetNVDECActive(false); auto it = sessions.find(fd); if (it != sessions.end()) { diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp index 38778c714..da29f332c 100755 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp @@ -55,8 +55,9 @@ std::size_t WriteVectors(std::span dst, const std::vector& src, std::size nvhost_nvdec_common::nvhost_nvdec_common(Core::System& system_, NvCore::Container& core_, NvCore::ChannelType channel_type_) - : nvdevice{system_}, core{core_}, syncpoint_manager{core.GetSyncpointManager()}, - nvmap{core.GetNvMapFile()}, channel_type{channel_type_} { + : nvdevice{system_}, host1x{system_.Host1x()}, core{core_}, + syncpoint_manager{core.GetSyncpointManager()}, nvmap{core.GetNvMapFile()}, + channel_type{channel_type_} { auto& syncpts_accumulated = core.Host1xDeviceFile().syncpts_accumulated; if (syncpts_accumulated.empty()) { channel_syncpoint = syncpoint_manager.AllocateSyncpoint(false); @@ -95,24 +96,24 @@ NvResult nvhost_nvdec_common::Submit(IoctlSubmit& params, std::span data, De offset += SliceVectors(data, syncpt_increments, params.syncpoint_count, offset); offset += SliceVectors(data, fence_thresholds, params.fence_count, offset); - auto& gpu = system.GPU(); auto* session = core.GetSession(sessions[fd]); - if (gpu.UseNvdec()) { - for (std::size_t i = 0; i < syncpt_increments.size(); i++) { - const SyncptIncr& syncpt_incr = syncpt_increments[i]; - fence_thresholds[i] = - syncpoint_manager.IncrementSyncpointMaxExt(syncpt_incr.id, syncpt_incr.increments); - } + for (std::size_t i = 0; i < syncpt_increments.size(); i++) { + const SyncptIncr& 
syncpt_incr = syncpt_increments[i]; + fence_thresholds[i] = + syncpoint_manager.IncrementSyncpointMaxExt(syncpt_incr.id, syncpt_incr.increments); } + for (const auto& cmd_buffer : command_buffers) { const auto object = nvmap.GetHandle(cmd_buffer.memory_id); ASSERT_OR_EXECUTE(object, return NvResult::InvalidState;); - Tegra::ChCommandHeaderList cmdlist(cmd_buffer.word_count); - session->process->GetMemory().ReadBlock(object->address + cmd_buffer.offset, cmdlist.data(), - cmdlist.size() * sizeof(u32)); - gpu.PushCommandBuffer(core.Host1xDeviceFile().fd_to_id[fd], cmdlist); + Core::Memory::CpuGuestMemory + cmdlist(session->process->GetMemory(), object->address + cmd_buffer.offset, + cmd_buffer.word_count); + host1x.PushEntries(fd, std::move(cmdlist)); } + // Some games expect command_buffers to be written back offset = 0; offset += WriteVectors(data, command_buffers, offset); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h index 59ed38fa9..9e6eefb19 100755 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h @@ -119,6 +119,7 @@ protected: Kernel::KEvent* QueryEvent(u32 event_id) override; + Tegra::Host1x::Host1x& host1x; u32 channel_syncpoint; s32_le nvmap_fd{}; u32_le submit_timeout{}; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp index 12ffddeb8..9d5bb298e 100755 --- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp @@ -7,6 +7,7 @@ #include "core/hle/service/nvdrv/core/container.h" #include "core/hle/service/nvdrv/devices/ioctl_serialization.h" #include "core/hle/service/nvdrv/devices/nvhost_vic.h" +#include "video_core/host1x/host1x.h" #include "video_core/renderer_base.h" namespace Service::Nvidia::Devices { @@ -21,13 +22,8 @@ NvResult nvhost_vic::Ioctl1(DeviceFD fd, Ioctl command, std::span inpu switch (command.group) { case 0x0: switch (command.cmd) { - case 0x1: { - auto& host1x_file = core.Host1xDeviceFile(); - if (!host1x_file.fd_to_id.contains(fd)) { - host1x_file.fd_to_id[fd] = host1x_file.vic_next_id++; - } + case 0x1: return WrapFixedVariable(this, &nvhost_vic::Submit, input, output, fd); - } case 0x2: return WrapFixed(this, &nvhost_vic::GetSyncpoint, input, output); case 0x3: @@ -70,14 +66,11 @@ NvResult nvhost_vic::Ioctl3(DeviceFD fd, Ioctl command, std::span inpu void nvhost_vic::OnOpen(NvCore::SessionId session_id, DeviceFD fd) { sessions[fd] = session_id; + host1x.StartDevice(fd, Tegra::Host1x::ChannelType::VIC, channel_syncpoint); } void nvhost_vic::OnClose(DeviceFD fd) { - auto& host1x_file = core.Host1xDeviceFile(); - const auto iter = host1x_file.fd_to_id.find(fd); - if (iter != host1x_file.fd_to_id.end()) { - system.GPU().ClearCdmaInstance(iter->second); - } + host1x.StopDevice(fd, Tegra::Host1x::ChannelType::VIC); sessions.erase(fd); } diff --git a/src/core/hle/service/nvnflinger/fb_share_buffer_manager.cpp b/src/core/hle/service/nvnflinger/fb_share_buffer_manager.cpp index e71652cdf..90f7248a0 100755 --- a/src/core/hle/service/nvnflinger/fb_share_buffer_manager.cpp +++ b/src/core/hle/service/nvnflinger/fb_share_buffer_manager.cpp @@ -14,24 +14,20 @@ #include "core/hle/service/nvnflinger/ui/graphic_buffer.h" #include "core/hle/service/vi/layer/vi_layer.h" #include "core/hle/service/vi/vi_results.h" +#include "video_core/gpu.h" +#include "video_core/host1x/host1x.h" namespace 
Service::Nvnflinger { namespace { -Result AllocateIoForProcessAddressSpace(Common::ProcessAddress* out_map_address, - std::unique_ptr* out_page_group, - Core::System& system, u32 size) { +Result AllocateSharedBufferMemory(std::unique_ptr* out_page_group, + Core::System& system, u32 size) { using Core::Memory::YUZU_PAGESIZE; // Allocate memory for the system shared buffer. - // FIXME: Because the gmmu can only point to cpu addresses, we need - // to map this in the application space to allow it to be used. - // FIXME: Add proper smmu emulation. // FIXME: This memory belongs to vi's .data section. auto& kernel = system.Kernel(); - auto* process = system.ApplicationProcess(); - auto& page_table = process->GetPageTable(); // Hold a temporary page group reference while we try to map it. auto pg = std::make_unique( @@ -43,6 +39,30 @@ Result AllocateIoForProcessAddressSpace(Common::ProcessAddress* out_map_address, Kernel::KMemoryManager::EncodeOption(Kernel::KMemoryManager::Pool::Secure, Kernel::KMemoryManager::Direction::FromBack))); + // Fill the output data with red. + for (auto& block : *pg) { + u32* start = system.DeviceMemory().GetPointer(block.GetAddress()); + u32* end = system.DeviceMemory().GetPointer(block.GetAddress() + block.GetSize()); + + for (; start < end; start++) { + *start = 0xFF0000FF; + } + } + + // Return the mapped page group. + *out_page_group = std::move(pg); + + // We succeeded. + R_SUCCEED(); +} + +Result MapSharedBufferIntoProcessAddressSpace(Common::ProcessAddress* out_map_address, + std::unique_ptr& pg, + Kernel::KProcess* process, Core::System& system) { + using Core::Memory::YUZU_PAGESIZE; + + auto& page_table = process->GetPageTable(); + // Get bounds of where mapping is possible. const VAddr alias_code_begin = GetInteger(page_table.GetAliasCodeRegionStart()); const VAddr alias_code_size = page_table.GetAliasCodeRegionSize() / YUZU_PAGESIZE; @@ -64,9 +84,6 @@ Result AllocateIoForProcessAddressSpace(Common::ProcessAddress* out_map_address, // Return failure, if necessary R_UNLESS(i < 64, res); - // Return the mapped page group. - *out_page_group = std::move(pg); - // We succeeded. R_SUCCEED(); } @@ -135,6 +152,13 @@ Result AllocateHandleForBuffer(u32* out_handle, Nvidia::Module& nvdrv, Nvidia::D R_RETURN(AllocNvMapHandle(*nvmap, *out_handle, buffer, size, nvmap_fd)); } +void FreeHandle(u32 handle, Nvidia::Module& nvdrv, Nvidia::DeviceFD nvmap_fd) { + auto nvmap = nvdrv.GetDevice(nvmap_fd); + ASSERT(nvmap != nullptr); + + R_ASSERT(FreeNvMapHandle(*nvmap, handle, nvmap_fd)); +} + constexpr auto SharedBufferBlockLinearFormat = android::PixelFormat::Rgba8888; constexpr u32 SharedBufferBlockLinearBpp = 4; @@ -186,53 +210,97 @@ FbShareBufferManager::FbShareBufferManager(Core::System& system, Nvnflinger& fli FbShareBufferManager::~FbShareBufferManager() = default; -Result FbShareBufferManager::Initialize(u64* out_buffer_id, u64* out_layer_id, u64 display_id) { +Result FbShareBufferManager::Initialize(Kernel::KProcess* owner_process, u64* out_buffer_id, + u64* out_layer_handle, u64 display_id, + LayerBlending blending) { std::scoped_lock lk{m_guard}; - // Ensure we have not already created a buffer. - R_UNLESS(m_buffer_id == 0, VI::ResultOperationFailed); + // Ensure we haven't already created. + const u64 aruid = owner_process->GetProcessId(); + R_UNLESS(!m_sessions.contains(aruid), VI::ResultPermissionDenied); - // Allocate memory and space for the shared buffer. 
- Common::ProcessAddress map_address; - R_TRY(AllocateIoForProcessAddressSpace(std::addressof(map_address), - std::addressof(m_buffer_page_group), m_system, - SharedBufferSize)); + // Allocate memory for the shared buffer if needed. + if (!m_buffer_page_group) { + R_TRY(AllocateSharedBufferMemory(std::addressof(m_buffer_page_group), m_system, + SharedBufferSize)); + + // Record buffer id. + m_buffer_id = m_next_buffer_id++; + + // Record display id. + m_display_id = display_id; + } + + // Map into process. + Common::ProcessAddress map_address{}; + R_TRY(MapSharedBufferIntoProcessAddressSpace(std::addressof(map_address), m_buffer_page_group, + owner_process, m_system)); + + // Create new session. + auto [it, was_emplaced] = m_sessions.emplace(aruid, FbShareSession{}); + auto& session = it->second; auto& container = m_nvdrv->GetContainer(); - m_session_id = container.OpenSession(m_system.ApplicationProcess()); - m_nvmap_fd = m_nvdrv->Open("/dev/nvmap", m_session_id); + session.session_id = container.OpenSession(owner_process); + session.nvmap_fd = m_nvdrv->Open("/dev/nvmap", session.session_id); // Create an nvmap handle for the buffer and assign the memory to it. - R_TRY(AllocateHandleForBuffer(std::addressof(m_buffer_nvmap_handle), *m_nvdrv, m_nvmap_fd, - map_address, SharedBufferSize)); - - // Record the display id. - m_display_id = display_id; + R_TRY(AllocateHandleForBuffer(std::addressof(session.buffer_nvmap_handle), *m_nvdrv, + session.nvmap_fd, map_address, SharedBufferSize)); // Create and open a layer for the display. - m_layer_id = m_flinger.CreateLayer(m_display_id).value(); - m_flinger.OpenLayer(m_layer_id); - - // Set up the buffer. - m_buffer_id = m_next_buffer_id++; + session.layer_id = m_flinger.CreateLayer(m_display_id, blending).value(); + m_flinger.OpenLayer(session.layer_id); // Get the layer. - VI::Layer* layer = m_flinger.FindLayer(m_display_id, m_layer_id); + VI::Layer* layer = m_flinger.FindLayer(m_display_id, session.layer_id); ASSERT(layer != nullptr); // Get the producer and set preallocated buffers. auto& producer = layer->GetBufferQueue(); - MakeGraphicBuffer(producer, 0, m_buffer_nvmap_handle); - MakeGraphicBuffer(producer, 1, m_buffer_nvmap_handle); + MakeGraphicBuffer(producer, 0, session.buffer_nvmap_handle); + MakeGraphicBuffer(producer, 1, session.buffer_nvmap_handle); // Assign outputs. *out_buffer_id = m_buffer_id; - *out_layer_id = m_layer_id; + *out_layer_handle = session.layer_id; // We succeeded. R_SUCCEED(); } +void FbShareBufferManager::Finalize(Kernel::KProcess* owner_process) { + std::scoped_lock lk{m_guard}; + + if (m_buffer_id == 0) { + return; + } + + const u64 aruid = owner_process->GetProcessId(); + const auto it = m_sessions.find(aruid); + if (it == m_sessions.end()) { + return; + } + + auto& session = it->second; + + // Destroy the layer. + m_flinger.DestroyLayer(session.layer_id); + + // Close nvmap handle. + FreeHandle(session.buffer_nvmap_handle, *m_nvdrv, session.nvmap_fd); + + // Close nvmap device. + m_nvdrv->Close(session.nvmap_fd); + + // Close session. + auto& container = m_nvdrv->GetContainer(); + container.CloseSession(session.session_id); + + // Erase. 
+ m_sessions.erase(it); +} + Result FbShareBufferManager::GetSharedBufferMemoryHandleId(u64* out_buffer_size, s32* out_nvmap_handle, SharedMemoryPoolLayout* out_pool_layout, @@ -242,17 +310,18 @@ Result FbShareBufferManager::GetSharedBufferMemoryHandleId(u64* out_buffer_size, R_UNLESS(m_buffer_id > 0, VI::ResultNotFound); R_UNLESS(buffer_id == m_buffer_id, VI::ResultNotFound); + R_UNLESS(m_sessions.contains(applet_resource_user_id), VI::ResultNotFound); *out_pool_layout = SharedBufferPoolLayout; *out_buffer_size = SharedBufferSize; - *out_nvmap_handle = m_buffer_nvmap_handle; + *out_nvmap_handle = m_sessions[applet_resource_user_id].buffer_nvmap_handle; R_SUCCEED(); } Result FbShareBufferManager::GetLayerFromId(VI::Layer** out_layer, u64 layer_id) { // Ensure the layer id is valid. - R_UNLESS(m_layer_id > 0 && layer_id == m_layer_id, VI::ResultNotFound); + R_UNLESS(layer_id > 0, VI::ResultNotFound); // Get the layer. VI::Layer* layer = m_flinger.FindLayer(m_display_id, layer_id); @@ -309,6 +378,10 @@ Result FbShareBufferManager::PresentSharedFrameBuffer(android::Fence fence, android::Status::NoError, VI::ResultOperationFailed); + ON_RESULT_FAILURE { + producer.CancelBuffer(static_cast(slot), fence); + }; + // Queue the buffer to the producer. android::QueueBufferInput input{}; android::QueueBufferOutput output{}; @@ -342,4 +415,33 @@ Result FbShareBufferManager::GetSharedFrameBufferAcquirableEvent(Kernel::KReadab R_SUCCEED(); } +Result FbShareBufferManager::WriteAppletCaptureBuffer(bool* out_was_written, s32* out_layer_index) { + std::vector capture_buffer(m_system.GPU().GetAppletCaptureBuffer()); + Common::ScratchBuffer scratch; + + // TODO: this could be optimized + s64 e = -1280 * 768 * 4; + for (auto& block : *m_buffer_page_group) { + u8* start = m_system.DeviceMemory().GetPointer(block.GetAddress()); + u8* end = m_system.DeviceMemory().GetPointer(block.GetAddress() + block.GetSize()); + + for (; start < end; start++) { + *start = 0; + + if (e >= 0 && e < static_cast(capture_buffer.size())) { + *start = capture_buffer[e]; + } + e++; + } + + m_system.GPU().Host1x().MemoryManager().ApplyOpOnPointer(start, scratch, [&](DAddr addr) { + m_system.GPU().InvalidateRegion(addr, end - start); + }); + } + + *out_was_written = true; + *out_layer_index = 1; + R_SUCCEED(); +} + } // namespace Service::Nvnflinger diff --git a/src/core/hle/service/nvnflinger/fb_share_buffer_manager.h b/src/core/hle/service/nvnflinger/fb_share_buffer_manager.h index 033bf4bbe..b79a7d23a 100755 --- a/src/core/hle/service/nvnflinger/fb_share_buffer_manager.h +++ b/src/core/hle/service/nvnflinger/fb_share_buffer_manager.h @@ -3,9 +3,12 @@ #pragma once +#include + #include "common/math_util.h" #include "core/hle/service/nvdrv/core/container.h" #include "core/hle/service/nvdrv/nvdata.h" +#include "core/hle/service/nvnflinger/hwc_layer.h" #include "core/hle/service/nvnflinger/nvnflinger.h" #include "core/hle/service/nvnflinger/ui/fence.h" @@ -29,13 +32,18 @@ struct SharedMemoryPoolLayout { }; static_assert(sizeof(SharedMemoryPoolLayout) == 0x188, "SharedMemoryPoolLayout has wrong size"); +struct FbShareSession; + class FbShareBufferManager final { public: explicit FbShareBufferManager(Core::System& system, Nvnflinger& flinger, std::shared_ptr nvdrv); ~FbShareBufferManager(); - Result Initialize(u64* out_buffer_id, u64* out_layer_handle, u64 display_id); + Result Initialize(Kernel::KProcess* owner_process, u64* out_buffer_id, u64* out_layer_handle, + u64 display_id, LayerBlending blending); + void Finalize(Kernel::KProcess* 
owner_process); + Result GetSharedBufferMemoryHandleId(u64* out_buffer_size, s32* out_nvmap_handle, SharedMemoryPoolLayout* out_pool_layout, u64 buffer_id, u64 applet_resource_user_id); @@ -45,6 +53,8 @@ public: u32 transform, s32 swap_interval, u64 layer_id, s64 slot); Result GetSharedFrameBufferAcquirableEvent(Kernel::KReadableEvent** out_event, u64 layer_id); + Result WriteAppletCaptureBuffer(bool* out_was_written, s32* out_layer_index); + private: Result GetLayerFromId(VI::Layer** out_layer, u64 layer_id); @@ -52,11 +62,8 @@ private: u64 m_next_buffer_id = 1; u64 m_display_id = 0; u64 m_buffer_id = 0; - u64 m_layer_id = 0; - u32 m_buffer_nvmap_handle = 0; SharedMemoryPoolLayout m_pool_layout = {}; - Nvidia::DeviceFD m_nvmap_fd = {}; - Nvidia::NvCore::SessionId m_session_id = {}; + std::map m_sessions; std::unique_ptr m_buffer_page_group; std::mutex m_guard; @@ -65,4 +72,11 @@ private: std::shared_ptr m_nvdrv; }; +struct FbShareSession { + Nvidia::DeviceFD nvmap_fd = {}; + Nvidia::NvCore::SessionId session_id = {}; + u64 layer_id = {}; + u32 buffer_nvmap_handle = 0; +}; + } // namespace Service::Nvnflinger diff --git a/src/core/hle/service/nvnflinger/hardware_composer.cpp b/src/core/hle/service/nvnflinger/hardware_composer.cpp index c720dd1f8..b2e35ffd4 100755 --- a/src/core/hle/service/nvnflinger/hardware_composer.cpp +++ b/src/core/hle/service/nvnflinger/hardware_composer.cpp @@ -109,6 +109,7 @@ u32 HardwareComposer::ComposeLocked(f32* out_speed_scale, VI::Display& display, .height = igbp_buffer.Height(), .stride = igbp_buffer.Stride(), .z_index = 0, + .blending = layer.GetBlending(), .transform = static_cast(item.transform), .crop_rect = item.crop, .acquire_fence = item.fence, diff --git a/src/core/hle/service/nvnflinger/hwc_layer.h b/src/core/hle/service/nvnflinger/hwc_layer.h index 3af668a25..f71a5d822 100755 --- a/src/core/hle/service/nvnflinger/hwc_layer.h +++ b/src/core/hle/service/nvnflinger/hwc_layer.h @@ -11,6 +11,18 @@ namespace Service::Nvnflinger { +// hwc_layer_t::blending values +enum class LayerBlending : u32 { + // No blending + None = 0x100, + + // ONE / ONE_MINUS_SRC_ALPHA + Premultiplied = 0x105, + + // SRC_ALPHA / ONE_MINUS_SRC_ALPHA + Coverage = 0x405, +}; + struct HwcLayer { u32 buffer_handle; u32 offset; @@ -19,6 +31,7 @@ struct HwcLayer { u32 height; u32 stride; s32 z_index; + LayerBlending blending; android::BufferTransformFlags transform; Common::Rectangle crop_rect; android::Fence acquire_fence; diff --git a/src/core/hle/service/nvnflinger/nvnflinger.cpp b/src/core/hle/service/nvnflinger/nvnflinger.cpp index a4e848882..0ad3e099a 100755 --- a/src/core/hle/service/nvnflinger/nvnflinger.cpp +++ b/src/core/hle/service/nvnflinger/nvnflinger.cpp @@ -157,7 +157,7 @@ bool Nvnflinger::CloseDisplay(u64 display_id) { return true; } -std::optional Nvnflinger::CreateLayer(u64 display_id) { +std::optional Nvnflinger::CreateLayer(u64 display_id, LayerBlending blending) { const auto lock_guard = Lock(); auto* const display = FindDisplay(display_id); @@ -166,13 +166,14 @@ std::optional Nvnflinger::CreateLayer(u64 display_id) { } const u64 layer_id = next_layer_id++; - CreateLayerAtId(*display, layer_id); + CreateLayerAtId(*display, layer_id, blending); return layer_id; } -void Nvnflinger::CreateLayerAtId(VI::Display& display, u64 layer_id) { +void Nvnflinger::CreateLayerAtId(VI::Display& display, u64 layer_id, LayerBlending blending) { const auto buffer_id = next_buffer_queue_id++; display.CreateLayer(layer_id, buffer_id, nvdrv->container); + 
display.FindLayer(layer_id)->SetBlending(blending); } bool Nvnflinger::OpenLayer(u64 layer_id) { diff --git a/src/core/hle/service/nvnflinger/nvnflinger.h b/src/core/hle/service/nvnflinger/nvnflinger.h index c984d55a0..4cf4f069d 100755 --- a/src/core/hle/service/nvnflinger/nvnflinger.h +++ b/src/core/hle/service/nvnflinger/nvnflinger.h @@ -15,6 +15,7 @@ #include "common/thread.h" #include "core/hle/result.h" #include "core/hle/service/kernel_helpers.h" +#include "core/hle/service/nvnflinger/hwc_layer.h" namespace Common { class Event; @@ -72,7 +73,8 @@ public: /// Creates a layer on the specified display and returns the layer ID. /// /// If an invalid display ID is specified, then an empty optional is returned. - [[nodiscard]] std::optional CreateLayer(u64 display_id); + [[nodiscard]] std::optional CreateLayer(u64 display_id, + LayerBlending blending = LayerBlending::None); /// Opens a layer on all displays for the given layer ID. bool OpenLayer(u64 layer_id); @@ -128,7 +130,7 @@ private: [[nodiscard]] VI::Layer* FindLayer(u64 display_id, u64 layer_id); /// Creates a layer with the specified layer ID in the desired display. - void CreateLayerAtId(VI::Display& display, u64 layer_id); + void CreateLayerAtId(VI::Display& display, u64 layer_id, LayerBlending blending); void SplitVSync(std::stop_token stop_token); diff --git a/src/core/hle/service/vi/layer/vi_layer.cpp b/src/core/hle/service/vi/layer/vi_layer.cpp index 0185d187d..cb9af576c 100755 --- a/src/core/hle/service/vi/layer/vi_layer.cpp +++ b/src/core/hle/service/vi/layer/vi_layer.cpp @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "core/hle/service/nvnflinger/hwc_layer.h" #include "core/hle/service/vi/layer/vi_layer.h" namespace Service::VI { @@ -8,8 +9,9 @@ namespace Service::VI { Layer::Layer(u64 layer_id_, u32 binder_id_, android::BufferQueueCore& core_, android::BufferQueueProducer& binder_, std::shared_ptr&& consumer_) - : layer_id{layer_id_}, binder_id{binder_id_}, core{core_}, binder{binder_}, - consumer{std::move(consumer_)}, open{false}, visible{true} {} + : layer_id{layer_id_}, binder_id{binder_id_}, core{core_}, binder{binder_}, consumer{std::move( + consumer_)}, + blending{Nvnflinger::LayerBlending::None}, open{false}, visible{true} {} Layer::~Layer() = default; diff --git a/src/core/hle/service/vi/layer/vi_layer.h b/src/core/hle/service/vi/layer/vi_layer.h index 859346eab..dd5c84693 100755 --- a/src/core/hle/service/vi/layer/vi_layer.h +++ b/src/core/hle/service/vi/layer/vi_layer.h @@ -14,6 +14,10 @@ class BufferQueueCore; class BufferQueueProducer; } // namespace Service::android +namespace Service::Nvnflinger { +enum class LayerBlending : u32; +} + namespace Service::VI { /// Represents a single display layer. 
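The blending path wired up across these nvnflinger/vi changes can be summarized in one place. A minimal, self-contained sketch follows; the enum values and the conversion mirror hwc_layer.h and nvdisp_disp0.cpp above, while the stand-in types and the main() driver are illustrative only, not yuzu code:

#include <cstdint>
#include <iostream>

// Values match Service::Nvnflinger::LayerBlending (hwc_layer_t::blending).
enum class LayerBlending : std::uint32_t { None = 0x100, Premultiplied = 0x105, Coverage = 0x405 };
// Stand-in for the Tegra::BlendMode consumed by the display engine.
enum class BlendMode { Opaque, Premultiplied, Coverage };

// Mirrors ConvertBlending() in nvdisp_disp0.cpp.
BlendMode ConvertBlending(LayerBlending blending) {
    switch (blending) {
    case LayerBlending::Premultiplied:
        return BlendMode::Premultiplied;
    case LayerBlending::Coverage:
        return BlendMode::Coverage;
    case LayerBlending::None:
    default:
        return BlendMode::Opaque;
    }
}

int main() {
    // SystemBufferManager::Initialize selects Coverage blending only for
    // LibraryAppletMode::PartialForeground; all other applet modes keep an opaque layer.
    const bool partial_foreground = true; // illustrative input
    const LayerBlending blending =
        partial_foreground ? LayerBlending::Coverage : LayerBlending::None;

    // Nvnflinger::CreateLayer() stores the value on the VI::Layer (SetBlending), the hardware
    // composer reads it back (GetBlending), and nvdisp converts it when compositing the layer.
    std::cout << static_cast<int>(ConvertBlending(blending)) << '\n'; // prints 2 (Coverage)
}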
@@ -92,12 +96,21 @@ public: return !std::exchange(open, true); } + Nvnflinger::LayerBlending GetBlending() { + return blending; + } + + void SetBlending(Nvnflinger::LayerBlending b) { + blending = b; + } + private: const u64 layer_id; const u32 binder_id; android::BufferQueueCore& core; android::BufferQueueProducer& binder; std::shared_ptr consumer; + Service::Nvnflinger::LayerBlending blending; bool open; bool visible; }; diff --git a/src/core/memory.h b/src/core/memory.h index 80a93ef90..c8fd99c82 100755 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -64,6 +64,8 @@ public: Memory(Memory&&) = default; Memory& operator=(Memory&&) = delete; + static constexpr bool HAS_FLUSH_INVALIDATION = false; + /** * Resets the state of the Memory system. */ diff --git a/src/frontend_common/config.cpp b/src/frontend_common/config.cpp index d34624d28..cbbb07ac7 100755 --- a/src/frontend_common/config.cpp +++ b/src/frontend_common/config.cpp @@ -401,6 +401,14 @@ void Config::ReadNetworkValues() { EndGroup(); } +void Config::ReadLibraryAppletValues() { + BeginGroup(Settings::TranslateCategory(Settings::Category::LibraryApplet)); + + ReadCategory(Settings::Category::LibraryApplet); + + EndGroup(); +} + void Config::ReadValues() { if (global) { ReadDataStorageValues(); @@ -410,6 +418,7 @@ void Config::ReadValues() { ReadServiceValues(); ReadWebServiceValues(); ReadMiscellaneousValues(); + ReadLibraryAppletValues(); } ReadControlValues(); ReadCoreValues(); @@ -511,6 +520,7 @@ void Config::SaveValues() { SaveNetworkValues(); SaveWebServiceValues(); SaveMiscellaneousValues(); + SaveLibraryAppletValues(); } else { LOG_DEBUG(Config, "Saving only generic configuration values"); } @@ -691,6 +701,14 @@ void Config::SaveWebServiceValues() { EndGroup(); } +void Config::SaveLibraryAppletValues() { + BeginGroup(Settings::TranslateCategory(Settings::Category::LibraryApplet)); + + WriteCategory(Settings::Category::LibraryApplet); + + EndGroup(); +} + bool Config::ReadBooleanSetting(const std::string& key, const std::optional default_value) { std::string full_key = GetFullKey(key, false); if (!default_value.has_value()) { diff --git a/src/frontend_common/config.h b/src/frontend_common/config.h index 4ecb97044..8b0599cc3 100755 --- a/src/frontend_common/config.h +++ b/src/frontend_common/config.h @@ -88,6 +88,7 @@ protected: void ReadSystemValues(); void ReadWebServiceValues(); void ReadNetworkValues(); + void ReadLibraryAppletValues(); // Read platform specific sections virtual void ReadHidbusValues() = 0; @@ -121,6 +122,7 @@ protected: void SaveScreenshotValues(); void SaveSystemValues(); void SaveWebServiceValues(); + void SaveLibraryAppletValues(); // Save platform specific sections virtual void SaveHidbusValues() = 0; diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 1da7b9fbe..578a3f66b 100755 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -18,6 +18,7 @@ add_library(video_core STATIC buffer_cache/usage_tracker.h buffer_cache/word_manager.h cache_types.h + capture.h cdma_pusher.cpp cdma_pusher.h compatible_formats.cpp @@ -59,8 +60,8 @@ add_library(video_core STATIC framebuffer_config.h fsr.cpp fsr.h - host1x/codecs/codec.cpp - host1x/codecs/codec.h + host1x/codecs/decoder.cpp + host1x/codecs/decoder.h host1x/codecs/h264.cpp host1x/codecs/h264.h host1x/codecs/vp8.cpp @@ -79,8 +80,6 @@ add_library(video_core STATIC host1x/nvdec.cpp host1x/nvdec.h host1x/nvdec_common.h - host1x/sync_manager.cpp - host1x/sync_manager.h host1x/syncpoint_manager.cpp 
host1x/syncpoint_manager.h host1x/vic.cpp @@ -101,6 +100,7 @@ add_library(video_core STATIC memory_manager.cpp memory_manager.h precompiled_headers.h + present.h pte_kind.h query_cache/bank_base.h query_cache/query_base.h diff --git a/src/video_core/capture.h b/src/video_core/capture.h new file mode 100755 index 000000000..8db14a8ec --- /dev/null +++ b/src/video_core/capture.h @@ -0,0 +1,36 @@ +// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include "common/alignment.h" +#include "common/bit_util.h" +#include "common/common_types.h" +#include "core/frontend/framebuffer_layout.h" +#include "video_core/surface.h" + +namespace VideoCore::Capture { + +constexpr u32 BlockHeight = 4; +constexpr u32 BlockDepth = 0; +constexpr u32 BppLog2 = 2; + +constexpr auto PixelFormat = Surface::PixelFormat::B8G8R8A8_UNORM; + +constexpr auto LinearWidth = Layout::ScreenUndocked::Width; +constexpr auto LinearHeight = Layout::ScreenUndocked::Height; +constexpr auto LinearDepth = 1U; +constexpr auto BytesPerPixel = 4U; + +constexpr auto TiledWidth = LinearWidth; +constexpr auto TiledHeight = Common::AlignUpLog2(LinearHeight, BlockHeight + BlockDepth + BppLog2); +constexpr auto TiledSize = TiledWidth * TiledHeight * (1 << BppLog2); + +constexpr Layout::FramebufferLayout Layout{ + .width = LinearWidth, + .height = LinearHeight, + .screen = {0, 0, LinearWidth, LinearHeight}, + .is_srgb = false, +}; + +} // namespace VideoCore::Capture diff --git a/src/video_core/cdma_pusher.cpp b/src/video_core/cdma_pusher.cpp index 2e38410cf..6daca0f0a 100755 --- a/src/video_core/cdma_pusher.cpp +++ b/src/video_core/cdma_pusher.cpp @@ -2,136 +2,130 @@ // SPDX-License-Identifier: MIT #include + +#include "common/thread.h" +#include "core/core.h" #include "video_core/cdma_pusher.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/host1x/control.h" #include "video_core/host1x/host1x.h" #include "video_core/host1x/nvdec.h" #include "video_core/host1x/nvdec_common.h" -#include "video_core/host1x/sync_manager.h" #include "video_core/host1x/vic.h" #include "video_core/memory_manager.h" namespace Tegra { -CDmaPusher::CDmaPusher(Host1x::Host1x& host1x_) - : host1x{host1x_}, nvdec_processor(std::make_shared(host1x)), - vic_processor(std::make_unique(host1x, nvdec_processor)), - host1x_processor(std::make_unique(host1x)), - sync_manager(std::make_unique(host1x)) {} + +CDmaPusher::CDmaPusher(Host1x::Host1x& host1x_, s32 id) + : host1x{host1x_}, memory_manager{host1x.GMMU()}, + host_processor{std::make_unique(host1x_)}, current_class{ + static_cast(id)} { + thread = std::jthread([this](std::stop_token stop_token) { ProcessEntries(stop_token); }); +} CDmaPusher::~CDmaPusher() = default; -void CDmaPusher::ProcessEntries(ChCommandHeaderList&& entries) { - for (const auto& value : entries) { - if (mask != 0) { - const auto lbs = static_cast(std::countr_zero(mask)); - mask &= ~(1U << lbs); - ExecuteCommand(offset + lbs, value.raw); - continue; - } else if (count != 0) { - --count; - ExecuteCommand(offset, value.raw); - if (incrementing) { - ++offset; +void CDmaPusher::ProcessEntries(std::stop_token stop_token) { + Common::SetCurrentThreadPriority(Common::ThreadPriority::High); + ChCommandHeaderList command_list{host1x.System().ApplicationMemory(), 0, 0}; + u32 count{}; + u32 method_offset{}; + u32 mask{}; + bool incrementing{}; + + while (!stop_token.stop_requested()) { + { + std::unique_lock l{command_mutex}; + Common::CondvarWait(command_cv, l, 
stop_token, + [this]() { return command_lists.size() > 0; }); + if (stop_token.stop_requested()) { + return; } - continue; + + command_list = std::move(command_lists.front()); + command_lists.pop_front(); } - const auto mode = value.submission_mode.Value(); - switch (mode) { - case ChSubmissionMode::SetClass: { - mask = value.value & 0x3f; - offset = value.method_offset; - current_class = static_cast((value.value >> 6) & 0x3ff); - break; - } - case ChSubmissionMode::Incrementing: - case ChSubmissionMode::NonIncrementing: - count = value.value; - offset = value.method_offset; - incrementing = mode == ChSubmissionMode::Incrementing; - break; - case ChSubmissionMode::Mask: - mask = value.value; - offset = value.method_offset; - break; - case ChSubmissionMode::Immediate: { - const u32 data = value.value & 0xfff; - offset = value.method_offset; - ExecuteCommand(offset, data); - break; - } - default: - UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", static_cast(mode)); - break; + + size_t i = 0; + for (const auto value : command_list) { + i++; + if (mask != 0) { + const auto lbs = static_cast(std::countr_zero(mask)); + mask &= ~(1U << lbs); + ExecuteCommand(method_offset + lbs, value.raw); + continue; + } else if (count != 0) { + --count; + ExecuteCommand(method_offset, value.raw); + if (incrementing) { + ++method_offset; + } + continue; + } + const auto mode = value.submission_mode.Value(); + switch (mode) { + case ChSubmissionMode::SetClass: { + mask = value.value & 0x3f; + method_offset = value.method_offset; + current_class = static_cast((value.value >> 6) & 0x3ff); + break; + } + case ChSubmissionMode::Incrementing: + case ChSubmissionMode::NonIncrementing: + count = value.value; + method_offset = value.method_offset; + incrementing = mode == ChSubmissionMode::Incrementing; + break; + case ChSubmissionMode::Mask: + mask = value.value; + method_offset = value.method_offset; + break; + case ChSubmissionMode::Immediate: { + const u32 data = value.value & 0xfff; + method_offset = value.method_offset; + ExecuteCommand(method_offset, data); + break; + } + default: + LOG_ERROR(HW_GPU, "Bad command at index {} (bytes 0x{:X}), buffer size {}", i - 1, + (i - 1) * sizeof(u32), command_list.size()); + UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", + static_cast(mode)); + break; + } } } } -void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) { +void CDmaPusher::ExecuteCommand(u32 method, u32 arg) { switch (current_class) { - case ChClassId::NvDec: - ThiStateWrite(nvdec_thi_state, offset, data); - switch (static_cast(offset)) { - case ThiMethod::IncSyncpt: { - LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method"); - const auto syncpoint_id = static_cast(data & 0xFF); - const auto cond = static_cast((data >> 8) & 0xFF); - if (cond == 0) { - sync_manager->Increment(syncpoint_id); - } else { - sync_manager->SignalDone( - sync_manager->IncrementWhenDone(static_cast(current_class), syncpoint_id)); - } - break; - } - case ThiMethod::SetMethod1: - LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}", - static_cast(nvdec_thi_state.method_0)); - nvdec_processor->ProcessMethod(nvdec_thi_state.method_0, data); - break; - default: - break; - } - break; - case ChClassId::GraphicsVic: - ThiStateWrite(vic_thi_state, static_cast(state_offset), {data}); - switch (static_cast(state_offset)) { - case ThiMethod::IncSyncpt: { - LOG_DEBUG(Service_NVDRV, "VIC Class IncSyncpt Method"); - const auto syncpoint_id = static_cast(data & 0xFF); - const auto cond = static_cast((data >> 8) & 0xFF); - if 
(cond == 0) { - sync_manager->Increment(syncpoint_id); - } else { - sync_manager->SignalDone( - sync_manager->IncrementWhenDone(static_cast(current_class), syncpoint_id)); - } - break; - } - case ThiMethod::SetMethod1: - LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})", - static_cast(vic_thi_state.method_0), data); - vic_processor->ProcessMethod(static_cast(vic_thi_state.method_0), - data); - break; - default: - break; - } - break; case ChClassId::Control: - // This device is mainly for syncpoint synchronization - LOG_DEBUG(Service_NVDRV, "Host1X Class Method"); - host1x_processor->ProcessMethod(static_cast(offset), data); + LOG_TRACE(Service_NVDRV, "Class {} method 0x{:X} arg 0x{:X}", + static_cast(current_class), method, arg); + host_processor->ProcessMethod(static_cast(method), arg); break; default: - UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast(current_class)); - break; + thi_regs.reg_array[method] = arg; + switch (static_cast(method)) { + case ThiMethod::IncSyncpt: { + const auto syncpoint_id = static_cast(arg & 0xFF); + [[maybe_unused]] const auto cond = static_cast((arg >> 8) & 0xFF); + LOG_TRACE(Service_NVDRV, "Class {} IncSyncpt Method, syncpt {} cond {}", + static_cast(current_class), syncpoint_id, cond); + auto& syncpoint_manager = host1x.GetSyncpointManager(); + syncpoint_manager.IncrementGuest(syncpoint_id); + syncpoint_manager.IncrementHost(syncpoint_id); + break; + } + case ThiMethod::SetMethod1: + LOG_TRACE(Service_NVDRV, "Class {} method 0x{:X} arg 0x{:X}", + static_cast(current_class), static_cast(thi_regs.method_0), arg); + ProcessMethod(thi_regs.method_0, arg); + break; + default: + break; + } } } -void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 state_offset, u32 argument) { - u8* const offset_ptr = reinterpret_cast(&state) + sizeof(u32) * state_offset; - std::memcpy(offset_ptr, &argument, sizeof(u32)); -} - } // namespace Tegra diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h index 2d663cdaa..553654b8f 100755 --- a/src/video_core/cdma_pusher.h +++ b/src/video_core/cdma_pusher.h @@ -3,12 +3,18 @@ #pragma once +#include +#include #include +#include +#include #include #include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" +#include "common/polyfill_thread.h" +#include "core/memory.h" namespace Tegra { @@ -62,23 +68,31 @@ struct ChCommand { std::vector arguments; }; -using ChCommandHeaderList = std::vector; +using ChCommandHeaderList = + Core::Memory::CpuGuestMemory; struct ThiRegisters { - u32_le increment_syncpt{}; - INSERT_PADDING_WORDS(1); - u32_le increment_syncpt_error{}; - u32_le ctx_switch_incremement_syncpt{}; - INSERT_PADDING_WORDS(4); - u32_le ctx_switch{}; - INSERT_PADDING_WORDS(1); - u32_le ctx_syncpt_eof{}; - INSERT_PADDING_WORDS(5); - u32_le method_0{}; - u32_le method_1{}; - INSERT_PADDING_WORDS(12); - u32_le int_status{}; - u32_le int_mask{}; + static constexpr std::size_t NUM_REGS = 0x20; + + union { + struct { + u32_le increment_syncpt; + INSERT_PADDING_WORDS_NOINIT(1); + u32_le increment_syncpt_error; + u32_le ctx_switch_incremement_syncpt; + INSERT_PADDING_WORDS_NOINIT(4); + u32_le ctx_switch; + INSERT_PADDING_WORDS_NOINIT(1); + u32_le ctx_syncpt_eof; + INSERT_PADDING_WORDS_NOINIT(5); + u32_le method_0; + u32_le method_1; + INSERT_PADDING_WORDS_NOINIT(12); + u32_le int_status; + u32_le int_mask; + }; + std::array reg_array; + }; }; enum class ThiMethod : u32 { @@ -89,32 +103,39 @@ enum class ThiMethod : u32 { class CDmaPusher { public: - explicit 
CDmaPusher(Host1x::Host1x& host1x); - ~CDmaPusher(); + CDmaPusher() = delete; + virtual ~CDmaPusher(); - /// Process the command entry - void ProcessEntries(ChCommandHeaderList&& entries); + void PushEntries(ChCommandHeaderList&& entries) { + std::scoped_lock l{command_mutex}; + command_lists.push_back(std::move(entries)); + command_cv.notify_one(); + } + +protected: + explicit CDmaPusher(Host1x::Host1x& host1x, s32 id); + + virtual void ProcessMethod(u32 method, u32 arg) = 0; + + Host1x::Host1x& host1x; + Tegra::MemoryManager& memory_manager; private: + /// Process the command entry + void ProcessEntries(std::stop_token stop_token); + /// Invoke command class devices to execute the command based on the current state void ExecuteCommand(u32 state_offset, u32 data); - /// Write arguments value to the ThiRegisters member at the specified offset - void ThiStateWrite(ThiRegisters& state, u32 offset, u32 argument); + std::unique_ptr host_processor; - Host1x::Host1x& host1x; - std::shared_ptr nvdec_processor; - std::unique_ptr vic_processor; - std::unique_ptr host1x_processor; - std::unique_ptr sync_manager; - ChClassId current_class{}; - ThiRegisters vic_thi_state{}; - ThiRegisters nvdec_thi_state{}; + std::mutex command_mutex; + std::condition_variable_any command_cv; + std::deque command_lists; + std::jthread thread; - u32 count{}; - u32 offset{}; - u32 mask{}; - bool incrementing{}; + ThiRegisters thi_regs{}; + ChClassId current_class; }; } // namespace Tegra diff --git a/src/video_core/framebuffer_config.h b/src/video_core/framebuffer_config.h index 4855fdae3..d196dd96a 100755 --- a/src/video_core/framebuffer_config.h +++ b/src/video_core/framebuffer_config.h @@ -11,6 +11,12 @@ namespace Tegra { +enum class BlendMode { + Opaque, + Premultiplied, + Coverage, +}; + /** * Struct describing framebuffer configuration */ @@ -23,6 +29,7 @@ struct FramebufferConfig { Service::android::PixelFormat pixel_format{}; Service::android::BufferTransformFlags transform_flags{}; Common::Rectangle crop_rect{}; + BlendMode blending{}; }; Common::Rectangle NormalizeCrop(const FramebufferConfig& framebuffer, u32 texture_width, diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 03ad6e68b..59356015b 100755 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -250,30 +250,6 @@ struct GPU::Impl { gpu_thread.SubmitList(channel, std::move(entries)); } - /// Push GPU command buffer entries to be processed - void PushCommandBuffer(u32 id, Tegra::ChCommandHeaderList& entries) { - if (!use_nvdec) { - return; - } - - if (!cdma_pushers.contains(id)) { - cdma_pushers.insert_or_assign(id, std::make_unique(host1x)); - } - - // SubmitCommandBuffer would make the nvdec operations async, this is not currently working - // TODO(ameerj): RE proper async nvdec operation - // gpu_thread.SubmitCommandBuffer(std::move(entries)); - cdma_pushers[id]->ProcessEntries(std::move(entries)); - } - - /// Frees the CDMAPusher instance to free up resources - void ClearCdmaInstance(u32 id) { - const auto iter = cdma_pushers.find(id); - if (iter != cdma_pushers.end()) { - cdma_pushers.erase(iter); - } - } - /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory void FlushRegion(DAddr addr, u64 size) { gpu_thread.FlushRegion(addr, size); @@ -347,11 +323,21 @@ struct GPU::Impl { WaitForSyncOperation(wait_fence); } + std::vector GetAppletCaptureBuffer() { + std::vector out; + + const auto wait_fence = + RequestSyncOperation([&] { out = renderer->GetAppletCaptureBuffer(); }); + 
gpu_thread.TickGPU(); + WaitForSyncOperation(wait_fence); + + return out; + } + GPU& gpu; Core::System& system; Host1x::Host1x& host1x; - std::map> cdma_pushers; std::unique_ptr renderer; VideoCore::RasterizerInterface* rasterizer = nullptr; const bool use_nvdec; @@ -505,6 +491,10 @@ void GPU::RequestComposite(std::vector&& layers, impl->RequestComposite(std::move(layers), std::move(fences)); } +std::vector GPU::GetAppletCaptureBuffer() { + return impl->GetAppletCaptureBuffer(); +} + u64 GPU::GetTicks() const { return impl->GetTicks(); } @@ -541,14 +531,6 @@ void GPU::PushGPUEntries(s32 channel, Tegra::CommandList&& entries) { impl->PushGPUEntries(channel, std::move(entries)); } -void GPU::PushCommandBuffer(u32 id, Tegra::ChCommandHeaderList& entries) { - impl->PushCommandBuffer(id, entries); -} - -void GPU::ClearCdmaInstance(u32 id) { - impl->ClearCdmaInstance(id); -} - VideoCore::RasterizerDownloadArea GPU::OnCPURead(PAddr addr, u64 size) { return impl->OnCPURead(addr, size); } diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 86edf9aaa..25c75a109 100755 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -215,6 +215,8 @@ public: void RequestComposite(std::vector&& layers, std::vector&& fences); + std::vector GetAppletCaptureBuffer(); + /// Performs any additional setup necessary in order to begin GPU emulation. /// This can be used to launch any necessary threads and register any necessary /// core timing events. @@ -232,15 +234,6 @@ public: /// Push GPU command entries to be processed void PushGPUEntries(s32 channel, Tegra::CommandList&& entries); - /// Push GPU command buffer entries to be processed - void PushCommandBuffer(u32 id, Tegra::ChCommandHeaderList& entries); - - /// Frees the CDMAPusher instance to free up resources - void ClearCdmaInstance(u32 id); - - /// Swap buffers (render frame) - void SwapBuffers(const Tegra::FramebufferConfig* framebuffer); - /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory [[nodiscard]] VideoCore::RasterizerDownloadArea OnCPURead(DAddr addr, u64 size); diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index d477743ca..0832234af 100755 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -12,6 +12,7 @@ #include "video_core/dma_pusher.h" #include "video_core/gpu.h" #include "video_core/gpu_thread.h" +#include "video_core/host1x/host1x.h" #include "video_core/renderer_base.h" namespace VideoCommon::GPUThread { diff --git a/src/video_core/host1x/codecs/decoder.cpp b/src/video_core/host1x/codecs/decoder.cpp index 559166b51..8508cc172 100755 --- a/src/video_core/host1x/codecs/decoder.cpp +++ b/src/video_core/host1x/codecs/decoder.cpp @@ -9,8 +9,10 @@ namespace Tegra { -Decoder::Decoder(Host1x::Host1x& host1x_, s32 id_, const Host1x::NvdecCommon::NvdecRegisters& regs_) - : host1x(host1x_), memory_manager{host1x.GMMU()}, regs{regs_}, id{id_} {} +Decoder::Decoder(Host1x::Host1x& host1x_, s32 id_, const Host1x::NvdecCommon::NvdecRegisters& regs_, + Host1x::FrameQueue& frame_queue_) + : host1x(host1x_), memory_manager{host1x.GMMU()}, regs{regs_}, id{id_}, frame_queue{ + frame_queue_} {} Decoder::~Decoder() = default; @@ -43,11 +45,11 @@ void Decoder::Decode() { } if (UsingDecodeOrder()) { - decode_order_frames.insert_or_assign(luma_top, std::move(frame)); - decode_order_frames.insert_or_assign(luma_bottom, std::move(frame_copy)); + frame_queue.PushDecodeOrder(id, luma_top, std::move(frame)); + frame_queue.PushDecodeOrder(id, luma_bottom, 
std::move(frame_copy)); } else { - presentation_order_frames.push(std::move(frame)); - presentation_order_frames.push(std::move(frame_copy)); + frame_queue.PushPresentOrder(id, luma_top, std::move(frame)); + frame_queue.PushPresentOrder(id, luma_bottom, std::move(frame_copy)); } } else { auto [luma_offset, chroma_offset] = GetProgressiveOffsets(); @@ -57,9 +59,9 @@ void Decoder::Decode() { } if (UsingDecodeOrder()) { - decode_order_frames.insert_or_assign(luma_offset, std::move(frame)); + frame_queue.PushDecodeOrder(id, luma_offset, std::move(frame)); } else { - presentation_order_frames.push(std::move(frame)); + frame_queue.PushPresentOrder(id, luma_offset, std::move(frame)); } } } diff --git a/src/video_core/host1x/codecs/decoder.h b/src/video_core/host1x/codecs/decoder.h index 9245b6b8f..9b350ee3e 100755 --- a/src/video_core/host1x/codecs/decoder.h +++ b/src/video_core/host1x/codecs/decoder.h @@ -4,6 +4,7 @@ #pragma once #include +#include #include #include #include @@ -17,6 +18,7 @@ namespace Tegra { namespace Host1x { class Host1x; +class FrameQueue; } // namespace Host1x class Decoder { @@ -30,23 +32,6 @@ public: return decode_api.UsingDecodeOrder(); } - std::shared_ptr GetFrame(u64 luma_offset) { - if (UsingDecodeOrder()) { - auto it = decode_order_frames.find(luma_offset); - if (it == decode_order_frames.end()) { - return {}; - } - return decode_order_frames.extract(it).mapped(); - } - - if (presentation_order_frames.size() == 0) { - return {}; - } - auto frame = std::move(presentation_order_frames.front()); - presentation_order_frames.pop(); - return frame; - } - /// Returns the value of current_codec [[nodiscard]] Host1x::NvdecCommon::VideoCodec GetCurrentCodec() const { return codec; @@ -57,7 +42,8 @@ public: protected: explicit Decoder(Host1x::Host1x& host1x, s32 id, - const Host1x::NvdecCommon::NvdecRegisters& regs); + const Host1x::NvdecCommon::NvdecRegisters& regs, + Host1x::FrameQueue& frame_queue); virtual std::span ComposeFrame() = 0; virtual std::tuple GetProgressiveOffsets() = 0; @@ -68,12 +54,10 @@ protected: Tegra::MemoryManager& memory_manager; const Host1x::NvdecCommon::NvdecRegisters& regs; s32 id; + Host1x::FrameQueue& frame_queue; Host1x::NvdecCommon::VideoCodec codec; FFmpeg::DecodeApi decode_api; bool initialized{}; - std::queue> presentation_order_frames; - std::unordered_map> decode_order_frames; - bool vp9_hidden_frame{}; }; diff --git a/src/video_core/host1x/codecs/h264.cpp b/src/video_core/host1x/codecs/h264.cpp index 77eda05c2..d3b66c20c 100755 --- a/src/video_core/host1x/codecs/h264.cpp +++ b/src/video_core/host1x/codecs/h264.cpp @@ -10,7 +10,7 @@ #include "video_core/host1x/host1x.h" #include "video_core/memory_manager.h" -namespace Tegra::Decoder { +namespace Tegra::Decoders { namespace { // ZigZag LUTs from libavcodec. 
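// (These are libavcodec's zig-zag index tables: zig_zag_scan is the 4x4 pattern and
// zig_zag_direct the 8x8 one; they give the coded order used when the scaling lists
// are emitted further below.)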
constexpr std::array zig_zag_direct{ @@ -25,23 +25,56 @@ constexpr std::array zig_zag_scan{ }; } // Anonymous namespace -H264::H264(Host1x::Host1x& host1x_) : host1x{host1x_} {} +H264::H264(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_, + Host1x::FrameQueue& frame_queue_) + : Decoder{host1x_, id_, regs_, frame_queue_} { + codec = Host1x::NvdecCommon::VideoCodec::H264; + initialized = decode_api.Initialize(codec); +} H264::~H264() = default; -std::span H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state, - size_t* out_configuration_size, bool is_first_frame) { - H264DecoderContext context; - host1x.GMMU().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext)); +std::tuple H264::GetProgressiveOffsets() { + auto pic_idx{current_context.h264_parameter_set.curr_pic_idx}; + auto luma{regs.surface_luma_offsets[pic_idx].Address() + + current_context.h264_parameter_set.luma_frame_offset.Address()}; + auto chroma{regs.surface_chroma_offsets[pic_idx].Address() + + current_context.h264_parameter_set.chroma_frame_offset.Address()}; + return {luma, chroma}; +} - const s64 frame_number = context.h264_parameter_set.frame_number.Value(); +std::tuple H264::GetInterlacedOffsets() { + auto pic_idx{current_context.h264_parameter_set.curr_pic_idx}; + auto luma_top{regs.surface_luma_offsets[pic_idx].Address() + + current_context.h264_parameter_set.luma_top_offset.Address()}; + auto luma_bottom{regs.surface_luma_offsets[pic_idx].Address() + + current_context.h264_parameter_set.luma_bot_offset.Address()}; + auto chroma_top{regs.surface_chroma_offsets[pic_idx].Address() + + current_context.h264_parameter_set.chroma_top_offset.Address()}; + auto chroma_bottom{regs.surface_chroma_offsets[pic_idx].Address() + + current_context.h264_parameter_set.chroma_bot_offset.Address()}; + return {luma_top, luma_bottom, chroma_top, chroma_bottom}; +} + +bool H264::IsInterlaced() { + return current_context.h264_parameter_set.luma_top_offset.Address() != 0 || + current_context.h264_parameter_set.luma_bot_offset.Address() != 0; +} + +std::span H264::ComposeFrame() { + memory_manager.ReadBlock(regs.picture_info_offset.Address(), ¤t_context, + sizeof(H264DecoderContext)); + + const s64 frame_number = current_context.h264_parameter_set.frame_number.Value(); if (!is_first_frame && frame_number != 0) { - frame.resize_destructive(context.stream_len); - host1x.GMMU().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size()); - *out_configuration_size = 0; - return frame; + frame_scratch.resize_destructive(current_context.stream_len); + memory_manager.ReadBlock(regs.frame_bitstream_offset.Address(), frame_scratch.data(), + frame_scratch.size()); + return frame_scratch; } + is_first_frame = false; + // Encode header H264BitWriter writer{}; writer.WriteU(1, 24); @@ -53,7 +86,7 @@ std::span H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters writer.WriteU(31, 8); writer.WriteUe(0); const u32 chroma_format_idc = - static_cast(context.h264_parameter_set.chroma_format_idc.Value()); + static_cast(current_context.h264_parameter_set.chroma_format_idc.Value()); writer.WriteUe(chroma_format_idc); if (chroma_format_idc == 3) { writer.WriteBit(false); @@ -61,42 +94,44 @@ std::span H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters writer.WriteUe(0); writer.WriteUe(0); - writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag + writer.WriteBit(current_context.qpprime_y_zero_transform_bypass_flag.Value() != 0); writer.WriteBit(false); // Scaling matrix 
present flag - writer.WriteUe(static_cast(context.h264_parameter_set.log2_max_frame_num_minus4.Value())); + writer.WriteUe( + static_cast(current_context.h264_parameter_set.log2_max_frame_num_minus4.Value())); const auto order_cnt_type = - static_cast(context.h264_parameter_set.pic_order_cnt_type.Value()); + static_cast(current_context.h264_parameter_set.pic_order_cnt_type.Value()); writer.WriteUe(order_cnt_type); if (order_cnt_type == 0) { - writer.WriteUe(context.h264_parameter_set.log2_max_pic_order_cnt_lsb_minus4); + writer.WriteUe(current_context.h264_parameter_set.log2_max_pic_order_cnt_lsb_minus4); } else if (order_cnt_type == 1) { - writer.WriteBit(context.h264_parameter_set.delta_pic_order_always_zero_flag != 0); + writer.WriteBit(current_context.h264_parameter_set.delta_pic_order_always_zero_flag != 0); writer.WriteSe(0); writer.WriteSe(0); writer.WriteUe(0); } - const s32 pic_height = context.h264_parameter_set.frame_height_in_map_units / - (context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2); + const s32 pic_height = current_context.h264_parameter_set.frame_height_in_mbs / + (current_context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2); - // TODO (ameerj): Where do we get this number, it seems to be particular for each stream - const auto nvdec_decoding = Settings::values.nvdec_emulation.GetValue(); - const bool uses_gpu_decoding = nvdec_decoding == Settings::NvdecEmulation::Gpu; - const u32 max_num_ref_frames = uses_gpu_decoding ? 6u : 16u; + u32 max_num_ref_frames = + std::max(std::max(current_context.h264_parameter_set.num_refidx_l0_default_active, + current_context.h264_parameter_set.num_refidx_l1_default_active) + + 1, + 4); writer.WriteUe(max_num_ref_frames); writer.WriteBit(false); - writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1); + writer.WriteUe(current_context.h264_parameter_set.pic_width_in_mbs - 1); writer.WriteUe(pic_height - 1); - writer.WriteBit(context.h264_parameter_set.frame_mbs_only_flag != 0); + writer.WriteBit(current_context.h264_parameter_set.frame_mbs_only_flag != 0); - if (!context.h264_parameter_set.frame_mbs_only_flag) { - writer.WriteBit(context.h264_parameter_set.flags.mbaff_frame.Value() != 0); + if (!current_context.h264_parameter_set.frame_mbs_only_flag) { + writer.WriteBit(current_context.h264_parameter_set.flags.mbaff_frame.Value() != 0); } - writer.WriteBit(context.h264_parameter_set.flags.direct_8x8_inference.Value() != 0); + writer.WriteBit(current_context.h264_parameter_set.flags.direct_8x8_inference.Value() != 0); writer.WriteBit(false); // Frame cropping flag writer.WriteBit(false); // VUI parameter present flag @@ -111,57 +146,59 @@ std::span H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters writer.WriteUe(0); writer.WriteUe(0); - writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag != 0); - writer.WriteBit(context.h264_parameter_set.pic_order_present_flag != 0); + writer.WriteBit(current_context.h264_parameter_set.entropy_coding_mode_flag != 0); + writer.WriteBit(current_context.h264_parameter_set.pic_order_present_flag != 0); writer.WriteUe(0); - writer.WriteUe(context.h264_parameter_set.num_refidx_l0_default_active); - writer.WriteUe(context.h264_parameter_set.num_refidx_l1_default_active); - writer.WriteBit(context.h264_parameter_set.flags.weighted_pred.Value() != 0); - writer.WriteU(static_cast(context.h264_parameter_set.weighted_bipred_idc.Value()), 2); - s32 pic_init_qp = static_cast(context.h264_parameter_set.pic_init_qp_minus26.Value()); + 
writer.WriteUe(current_context.h264_parameter_set.num_refidx_l0_default_active); + writer.WriteUe(current_context.h264_parameter_set.num_refidx_l1_default_active); + writer.WriteBit(current_context.h264_parameter_set.flags.weighted_pred.Value() != 0); + writer.WriteU(static_cast(current_context.h264_parameter_set.weighted_bipred_idc.Value()), + 2); + s32 pic_init_qp = + static_cast(current_context.h264_parameter_set.pic_init_qp_minus26.Value()); writer.WriteSe(pic_init_qp); writer.WriteSe(0); s32 chroma_qp_index_offset = - static_cast(context.h264_parameter_set.chroma_qp_index_offset.Value()); + static_cast(current_context.h264_parameter_set.chroma_qp_index_offset.Value()); writer.WriteSe(chroma_qp_index_offset); - writer.WriteBit(context.h264_parameter_set.deblocking_filter_control_present_flag != 0); - writer.WriteBit(context.h264_parameter_set.flags.constrained_intra_pred.Value() != 0); - writer.WriteBit(context.h264_parameter_set.redundant_pic_cnt_present_flag != 0); - writer.WriteBit(context.h264_parameter_set.transform_8x8_mode_flag != 0); + writer.WriteBit(current_context.h264_parameter_set.deblocking_filter_control_present_flag != 0); + writer.WriteBit(current_context.h264_parameter_set.flags.constrained_intra_pred.Value() != 0); + writer.WriteBit(current_context.h264_parameter_set.redundant_pic_cnt_present_flag != 0); + writer.WriteBit(current_context.h264_parameter_set.transform_8x8_mode_flag != 0); writer.WriteBit(true); // pic_scaling_matrix_present_flag for (s32 index = 0; index < 6; index++) { writer.WriteBit(true); - std::span matrix{context.weight_scale}; - writer.WriteScalingList(scan, matrix, index * 16, 16); + std::span matrix{current_context.weight_scale_4x4}; + writer.WriteScalingList(scan_scratch, matrix, index * 16, 16); } - if (context.h264_parameter_set.transform_8x8_mode_flag) { + if (current_context.h264_parameter_set.transform_8x8_mode_flag) { for (s32 index = 0; index < 2; index++) { writer.WriteBit(true); - std::span matrix{context.weight_scale_8x8}; - writer.WriteScalingList(scan, matrix, index * 64, 64); + std::span matrix{current_context.weight_scale_8x8}; + writer.WriteScalingList(scan_scratch, matrix, index * 64, 64); } } s32 chroma_qp_index_offset2 = - static_cast(context.h264_parameter_set.second_chroma_qp_index_offset.Value()); + static_cast(current_context.h264_parameter_set.second_chroma_qp_index_offset.Value()); writer.WriteSe(chroma_qp_index_offset2); writer.End(); const auto& encoded_header = writer.GetByteArray(); - frame.resize(encoded_header.size() + context.stream_len); - std::memcpy(frame.data(), encoded_header.data(), encoded_header.size()); + frame_scratch.resize(encoded_header.size() + current_context.stream_len); + std::memcpy(frame_scratch.data(), encoded_header.data(), encoded_header.size()); - *out_configuration_size = encoded_header.size(); - host1x.GMMU().ReadBlock(state.frame_bitstream_offset, frame.data() + encoded_header.size(), - context.stream_len); + memory_manager.ReadBlock(regs.frame_bitstream_offset.Address(), + frame_scratch.data() + encoded_header.size(), + current_context.stream_len); - return frame; + return frame_scratch; } H264BitWriter::H264BitWriter() = default; @@ -278,4 +315,4 @@ void H264BitWriter::Flush() { buffer = 0; buffer_pos = 0; } -} // namespace Tegra::Decoder +} // namespace Tegra::Decoders diff --git a/src/video_core/host1x/codecs/h264.h b/src/video_core/host1x/codecs/h264.h index 80dbb4e4b..9e95c6c68 100755 --- a/src/video_core/host1x/codecs/h264.h +++ b/src/video_core/host1x/codecs/h264.h @@ -10,6 
+10,7 @@ #include "common/common_funcs.h" #include "common/common_types.h" #include "common/scratch_buffer.h" +#include "video_core/host1x/codecs/decoder.h" #include "video_core/host1x/nvdec_common.h" namespace Tegra { @@ -18,7 +19,7 @@ namespace Host1x { class Host1x; } // namespace Host1x -namespace Decoder { +namespace Decoders { class H264BitWriter { public: @@ -60,123 +61,213 @@ private: std::vector byte_array; }; -class H264 { -public: - explicit H264(Host1x::Host1x& host1x); - ~H264(); - - /// Compose the H264 frame for FFmpeg decoding - [[nodiscard]] std::span ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state, - size_t* out_configuration_size, - bool is_first_frame = false); +struct Offset { + constexpr u32 Address() const noexcept { + return offset << 8; + } private: - Common::ScratchBuffer frame; - Common::ScratchBuffer scan; - Host1x::Host1x& host1x; + u32 offset; +}; +static_assert(std::is_trivial_v, "Offset must be trivial"); +static_assert(sizeof(Offset) == 0x4, "Offset has the wrong size!"); - struct H264ParameterSet { - s32 log2_max_pic_order_cnt_lsb_minus4; ///< 0x00 - s32 delta_pic_order_always_zero_flag; ///< 0x04 - s32 frame_mbs_only_flag; ///< 0x08 - u32 pic_width_in_mbs; ///< 0x0C - u32 frame_height_in_map_units; ///< 0x10 - union { ///< 0x14 - BitField<0, 2, u32> tile_format; - BitField<2, 3, u32> gob_height; - }; - u32 entropy_coding_mode_flag; ///< 0x18 - s32 pic_order_present_flag; ///< 0x1C - s32 num_refidx_l0_default_active; ///< 0x20 - s32 num_refidx_l1_default_active; ///< 0x24 - s32 deblocking_filter_control_present_flag; ///< 0x28 - s32 redundant_pic_cnt_present_flag; ///< 0x2C - u32 transform_8x8_mode_flag; ///< 0x30 - u32 pitch_luma; ///< 0x34 - u32 pitch_chroma; ///< 0x38 - u32 luma_top_offset; ///< 0x3C - u32 luma_bot_offset; ///< 0x40 - u32 luma_frame_offset; ///< 0x44 - u32 chroma_top_offset; ///< 0x48 - u32 chroma_bot_offset; ///< 0x4C - u32 chroma_frame_offset; ///< 0x50 - u32 hist_buffer_size; ///< 0x54 - union { ///< 0x58 - union { - BitField<0, 1, u64> mbaff_frame; - BitField<1, 1, u64> direct_8x8_inference; - BitField<2, 1, u64> weighted_pred; - BitField<3, 1, u64> constrained_intra_pred; - BitField<4, 1, u64> ref_pic; - BitField<5, 1, u64> field_pic; - BitField<6, 1, u64> bottom_field; - BitField<7, 1, u64> second_field; - } flags; - BitField<8, 4, u64> log2_max_frame_num_minus4; - BitField<12, 2, u64> chroma_format_idc; - BitField<14, 2, u64> pic_order_cnt_type; - BitField<16, 6, s64> pic_init_qp_minus26; - BitField<22, 5, s64> chroma_qp_index_offset; - BitField<27, 5, s64> second_chroma_qp_index_offset; - BitField<32, 2, u64> weighted_bipred_idc; - BitField<34, 7, u64> curr_pic_idx; - BitField<41, 5, u64> curr_col_idx; - BitField<46, 16, u64> frame_number; - BitField<62, 1, u64> frame_surfaces; - BitField<63, 1, u64> output_memory_layout; - }; +struct H264ParameterSet { + s32 log2_max_pic_order_cnt_lsb_minus4; ///< 0x00 + s32 delta_pic_order_always_zero_flag; ///< 0x04 + s32 frame_mbs_only_flag; ///< 0x08 + u32 pic_width_in_mbs; ///< 0x0C + u32 frame_height_in_mbs; ///< 0x10 + union { ///< 0x14 + BitField<0, 2, u32> tile_format; + BitField<2, 3, u32> gob_height; + BitField<5, 27, u32> reserved_surface_format; }; - static_assert(sizeof(H264ParameterSet) == 0x60, "H264ParameterSet is an invalid size"); - - struct H264DecoderContext { - INSERT_PADDING_WORDS_NOINIT(18); ///< 0x0000 - u32 stream_len; ///< 0x0048 - INSERT_PADDING_WORDS_NOINIT(3); ///< 0x004C - H264ParameterSet h264_parameter_set; ///< 0x0058 - 
INSERT_PADDING_WORDS_NOINIT(66); ///< 0x00B8 - std::array weight_scale; ///< 0x01C0 - std::array weight_scale_8x8; ///< 0x0220 + u32 entropy_coding_mode_flag; ///< 0x18 + s32 pic_order_present_flag; ///< 0x1C + s32 num_refidx_l0_default_active; ///< 0x20 + s32 num_refidx_l1_default_active; ///< 0x24 + s32 deblocking_filter_control_present_flag; ///< 0x28 + s32 redundant_pic_cnt_present_flag; ///< 0x2C + u32 transform_8x8_mode_flag; ///< 0x30 + u32 pitch_luma; ///< 0x34 + u32 pitch_chroma; ///< 0x38 + Offset luma_top_offset; ///< 0x3C + Offset luma_bot_offset; ///< 0x40 + Offset luma_frame_offset; ///< 0x44 + Offset chroma_top_offset; ///< 0x48 + Offset chroma_bot_offset; ///< 0x4C + Offset chroma_frame_offset; ///< 0x50 + u32 hist_buffer_size; ///< 0x54 + union { ///< 0x58 + union { + BitField<0, 1, u64> mbaff_frame; + BitField<1, 1, u64> direct_8x8_inference; + BitField<2, 1, u64> weighted_pred; + BitField<3, 1, u64> constrained_intra_pred; + BitField<4, 1, u64> ref_pic; + BitField<5, 1, u64> field_pic; + BitField<6, 1, u64> bottom_field; + BitField<7, 1, u64> second_field; + } flags; + BitField<8, 4, u64> log2_max_frame_num_minus4; + BitField<12, 2, u64> chroma_format_idc; + BitField<14, 2, u64> pic_order_cnt_type; + BitField<16, 6, s64> pic_init_qp_minus26; + BitField<22, 5, s64> chroma_qp_index_offset; + BitField<27, 5, s64> second_chroma_qp_index_offset; + BitField<32, 2, u64> weighted_bipred_idc; + BitField<34, 7, u64> curr_pic_idx; + BitField<41, 5, u64> curr_col_idx; + BitField<46, 16, u64> frame_number; + BitField<62, 1, u64> frame_surfaces; + BitField<63, 1, u64> output_memory_layout; }; - static_assert(sizeof(H264DecoderContext) == 0x2A0, "H264DecoderContext is an invalid size"); +}; +static_assert(sizeof(H264ParameterSet) == 0x60, "H264ParameterSet is an invalid size"); #define ASSERT_POSITION(field_name, position) \ static_assert(offsetof(H264ParameterSet, field_name) == position, \ "Field " #field_name " has invalid position") - ASSERT_POSITION(log2_max_pic_order_cnt_lsb_minus4, 0x00); - ASSERT_POSITION(delta_pic_order_always_zero_flag, 0x04); - ASSERT_POSITION(frame_mbs_only_flag, 0x08); - ASSERT_POSITION(pic_width_in_mbs, 0x0C); - ASSERT_POSITION(frame_height_in_map_units, 0x10); - ASSERT_POSITION(tile_format, 0x14); - ASSERT_POSITION(entropy_coding_mode_flag, 0x18); - ASSERT_POSITION(pic_order_present_flag, 0x1C); - ASSERT_POSITION(num_refidx_l0_default_active, 0x20); - ASSERT_POSITION(num_refidx_l1_default_active, 0x24); - ASSERT_POSITION(deblocking_filter_control_present_flag, 0x28); - ASSERT_POSITION(redundant_pic_cnt_present_flag, 0x2C); - ASSERT_POSITION(transform_8x8_mode_flag, 0x30); - ASSERT_POSITION(pitch_luma, 0x34); - ASSERT_POSITION(pitch_chroma, 0x38); - ASSERT_POSITION(luma_top_offset, 0x3C); - ASSERT_POSITION(luma_bot_offset, 0x40); - ASSERT_POSITION(luma_frame_offset, 0x44); - ASSERT_POSITION(chroma_top_offset, 0x48); - ASSERT_POSITION(chroma_bot_offset, 0x4C); - ASSERT_POSITION(chroma_frame_offset, 0x50); - ASSERT_POSITION(hist_buffer_size, 0x54); - ASSERT_POSITION(flags, 0x58); +ASSERT_POSITION(log2_max_pic_order_cnt_lsb_minus4, 0x00); +ASSERT_POSITION(delta_pic_order_always_zero_flag, 0x04); +ASSERT_POSITION(frame_mbs_only_flag, 0x08); +ASSERT_POSITION(pic_width_in_mbs, 0x0C); +ASSERT_POSITION(frame_height_in_mbs, 0x10); +ASSERT_POSITION(tile_format, 0x14); +ASSERT_POSITION(entropy_coding_mode_flag, 0x18); +ASSERT_POSITION(pic_order_present_flag, 0x1C); +ASSERT_POSITION(num_refidx_l0_default_active, 0x20); +ASSERT_POSITION(num_refidx_l1_default_active, 
0x24); +ASSERT_POSITION(deblocking_filter_control_present_flag, 0x28); +ASSERT_POSITION(redundant_pic_cnt_present_flag, 0x2C); +ASSERT_POSITION(transform_8x8_mode_flag, 0x30); +ASSERT_POSITION(pitch_luma, 0x34); +ASSERT_POSITION(pitch_chroma, 0x38); +ASSERT_POSITION(luma_top_offset, 0x3C); +ASSERT_POSITION(luma_bot_offset, 0x40); +ASSERT_POSITION(luma_frame_offset, 0x44); +ASSERT_POSITION(chroma_top_offset, 0x48); +ASSERT_POSITION(chroma_bot_offset, 0x4C); +ASSERT_POSITION(chroma_frame_offset, 0x50); +ASSERT_POSITION(hist_buffer_size, 0x54); +ASSERT_POSITION(flags, 0x58); #undef ASSERT_POSITION +struct DpbEntry { + union { + BitField<0, 7, u32> index; + BitField<7, 5, u32> col_idx; + BitField<12, 2, u32> state; + BitField<14, 1, u32> is_long_term; + BitField<15, 1, u32> non_existing; + BitField<16, 1, u32> is_field; + BitField<17, 4, u32> top_field_marking; + BitField<21, 4, u32> bottom_field_marking; + BitField<25, 1, u32> output_memory_layout; + BitField<26, 6, u32> reserved; + } flags; + std::array field_order_cnt; + u32 frame_idx; +}; +static_assert(sizeof(DpbEntry) == 0x10, "DpbEntry has the wrong size!"); + +struct DisplayParam { + union { + BitField<0, 1, u32> enable_tf_output; + BitField<1, 1, u32> vc1_map_y_flag; + BitField<2, 3, u32> map_y_value; + BitField<5, 1, u32> vc1_map_uv_flag; + BitField<6, 3, u32> map_uv_value; + BitField<9, 8, u32> out_stride; + BitField<17, 3, u32> tiling_format; + BitField<20, 1, u32> output_structure; // 0=frame, 1=field + BitField<21, 11, u32> reserved0; + }; + std::array output_top; + std::array output_bottom; + union { + BitField<0, 1, u32> enable_histogram; + BitField<1, 12, u32> histogram_start_x; + BitField<13, 12, u32> histogram_start_y; + BitField<25, 7, u32> reserved1; + }; + union { + BitField<0, 12, u32> histogram_end_x; + BitField<12, 12, u32> histogram_end_y; + BitField<24, 8, u32> reserved2; + }; +}; +static_assert(sizeof(DisplayParam) == 0x1C, "DisplayParam has the wrong size!"); + +struct H264DecoderContext { + INSERT_PADDING_WORDS_NOINIT(13); ///< 0x0000 + std::array eos; ///< 0x0034 + u8 explicit_eos_present_flag; ///< 0x0044 + u8 hint_dump_en; ///< 0x0045 + INSERT_PADDING_BYTES_NOINIT(2); ///< 0x0046 + u32 stream_len; ///< 0x0048 + u32 slice_count; ///< 0x004C + u32 mbhist_buffer_size; ///< 0x0050 + u32 gptimer_timeout_value; ///< 0x0054 + H264ParameterSet h264_parameter_set; ///< 0x0058 + std::array curr_field_order_cnt; ///< 0x00B8 + std::array dpb; ///< 0x00C0 + std::array weight_scale_4x4; ///< 0x01C0 + std::array weight_scale_8x8; ///< 0x0220 + std::array num_inter_view_refs_lX; ///< 0x02A0 + std::array reserved2; ///< 0x02A2 + std::array, 2> inter_view_refidx_lX; ///< 0x02B0 + union { ///< 0x02D0 + BitField<0, 1, u32> lossless_ipred8x8_filter_enable; + BitField<1, 1, u32> qpprime_y_zero_transform_bypass_flag; + BitField<2, 30, u32> reserved3; + }; + DisplayParam display_param; ///< 0x02D4 + std::array reserved4; ///< 0x02F0 +}; +static_assert(sizeof(H264DecoderContext) == 0x2FC, "H264DecoderContext is an invalid size"); + #define ASSERT_POSITION(field_name, position) \ static_assert(offsetof(H264DecoderContext, field_name) == position, \ "Field " #field_name " has invalid position") - ASSERT_POSITION(stream_len, 0x48); - ASSERT_POSITION(h264_parameter_set, 0x58); - ASSERT_POSITION(weight_scale, 0x1C0); +ASSERT_POSITION(stream_len, 0x48); +ASSERT_POSITION(h264_parameter_set, 0x58); +ASSERT_POSITION(dpb, 0xC0); +ASSERT_POSITION(weight_scale_4x4, 0x1C0); #undef ASSERT_POSITION + +class H264 final : public Decoder { +public: + 
explicit H264(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id, + Host1x::FrameQueue& frame_queue); + ~H264() override; + + H264(const H264&) = delete; + H264& operator=(const H264&) = delete; + + H264(H264&&) = delete; + H264& operator=(H264&&) = delete; + + /// Compose the H264 frame for FFmpeg decoding + [[nodiscard]] std::span<const u8> ComposeFrame() override; + + std::tuple<u64, u64> GetProgressiveOffsets() override; + std::tuple<u64, u64, u64, u64> GetInterlacedOffsets() override; + bool IsInterlaced() override; + + std::string_view GetCurrentCodecName() const override { + return "H264"; + } + +private: + bool is_first_frame{true}; + Common::ScratchBuffer<u8> frame_scratch; + Common::ScratchBuffer<u8> scan_scratch; + H264DecoderContext current_context{}; }; -} // namespace Decoder +} // namespace Decoders } // namespace Tegra diff --git a/src/video_core/host1x/codecs/vp8.cpp b/src/video_core/host1x/codecs/vp8.cpp index e8b9bf6bf..94c6ab7f0 100755 --- a/src/video_core/host1x/codecs/vp8.cpp +++ b/src/video_core/host1x/codecs/vp8.cpp @@ -7,47 +7,70 @@ #include "video_core/host1x/host1x.h" #include "video_core/memory_manager.h" -namespace Tegra::Decoder { -VP8::VP8(Host1x::Host1x& host1x_) : host1x{host1x_} {} +namespace Tegra::Decoders { +VP8::VP8(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_, + Host1x::FrameQueue& frame_queue_) + : Decoder{host1x_, id_, regs_, frame_queue_} { + codec = Host1x::NvdecCommon::VideoCodec::VP8; + initialized = decode_api.Initialize(codec); +} VP8::~VP8() = default; -std::span<const u8> VP8::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) { - VP8PictureInfo info; - host1x.GMMU().ReadBlock(state.picture_info_offset, &info, sizeof(VP8PictureInfo)); +std::tuple<u64, u64> VP8::GetProgressiveOffsets() { + auto luma{regs.surface_luma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()}; + auto chroma{regs.surface_chroma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()}; + return {luma, chroma}; +} - const bool is_key_frame = info.key_frame == 1u; - const auto bitstream_size = static_cast<size_t>(info.vld_buffer_size); +std::tuple<u64, u64, u64, u64> VP8::GetInterlacedOffsets() { + auto luma_top{regs.surface_luma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()}; + auto luma_bottom{ + regs.surface_luma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()}; + auto chroma_top{ + regs.surface_chroma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()}; + auto chroma_bottom{ + regs.surface_chroma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()}; + return {luma_top, luma_bottom, chroma_top, chroma_bottom}; +} + +std::span<const u8> VP8::ComposeFrame() { + memory_manager.ReadBlock(regs.picture_info_offset.Address(), &current_context, + sizeof(VP8PictureInfo)); + + const bool is_key_frame = current_context.key_frame == 1u; + const auto bitstream_size = static_cast<size_t>(current_context.vld_buffer_size); const size_t header_size = is_key_frame ? 10u : 3u; - frame.resize(header_size + bitstream_size); + frame_scratch.resize(header_size + bitstream_size); // Based on page 30 of the VP8 specification. // https://datatracker.ietf.org/doc/rfc6386/ - frame[0] = is_key_frame ? 0u : 1u; // 1-bit frame type (0: keyframe, 1: interframes). - frame[0] |= static_cast<u8>((info.version & 7u) << 1u); // 3-bit version number - frame[0] |= static_cast<u8>(1u << 4u); // 1-bit show_frame flag + frame_scratch[0] = is_key_frame ? 0u : 1u; // 1-bit frame type (0: keyframe, 1: interframes). 
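// For reference, the 3-byte VP8 frame tag assembled here packs, LSB first:
//   bit 0      frame type (0 = key frame, 1 = inter frame)
//   bits 1-3   version
//   bit 4      show_frame
//   bits 5-23  first partition size (19 bits, spread over bytes 0-2)
// Key frames are then followed by the start code 9D 01 2A and two 16-bit fields,
// (2-bit horizontal scale << 14) | width and (2-bit vertical scale << 14) | height,
// which is what the key-frame branch below emits.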
+ frame_scratch[0] |= + static_cast((current_context.version & 7u) << 1u); // 3-bit version number + frame_scratch[0] |= static_cast(1u << 4u); // 1-bit show_frame flag // The next 19-bits are the first partition size - frame[0] |= static_cast((info.first_part_size & 7u) << 5u); - frame[1] = static_cast((info.first_part_size & 0x7f8u) >> 3u); - frame[2] = static_cast((info.first_part_size & 0x7f800u) >> 11u); + frame_scratch[0] |= static_cast((current_context.first_part_size & 7u) << 5u); + frame_scratch[1] = static_cast((current_context.first_part_size & 0x7f8u) >> 3u); + frame_scratch[2] = static_cast((current_context.first_part_size & 0x7f800u) >> 11u); if (is_key_frame) { - frame[3] = 0x9du; - frame[4] = 0x01u; - frame[5] = 0x2au; + frame_scratch[3] = 0x9du; + frame_scratch[4] = 0x01u; + frame_scratch[5] = 0x2au; // TODO(ameerj): Horizontal/Vertical Scale // 16 bits: (2 bits Horizontal Scale << 14) | Width (14 bits) - frame[6] = static_cast(info.frame_width & 0xff); - frame[7] = static_cast(((info.frame_width >> 8) & 0x3f)); + frame_scratch[6] = static_cast(current_context.frame_width & 0xff); + frame_scratch[7] = static_cast(((current_context.frame_width >> 8) & 0x3f)); // 16 bits:(2 bits Vertical Scale << 14) | Height (14 bits) - frame[8] = static_cast(info.frame_height & 0xff); - frame[9] = static_cast(((info.frame_height >> 8) & 0x3f)); + frame_scratch[8] = static_cast(current_context.frame_height & 0xff); + frame_scratch[9] = static_cast(((current_context.frame_height >> 8) & 0x3f)); } - const u64 bitstream_offset = state.frame_bitstream_offset; - host1x.GMMU().ReadBlock(bitstream_offset, frame.data() + header_size, bitstream_size); + const u64 bitstream_offset = regs.frame_bitstream_offset.Address(); + memory_manager.ReadBlock(bitstream_offset, frame_scratch.data() + header_size, bitstream_size); - return frame; + return frame_scratch; } -} // namespace Tegra::Decoder +} // namespace Tegra::Decoders diff --git a/src/video_core/host1x/codecs/vp8.h b/src/video_core/host1x/codecs/vp8.h index 36a7e08e0..0ab48ca37 100755 --- a/src/video_core/host1x/codecs/vp8.h +++ b/src/video_core/host1x/codecs/vp8.h @@ -9,6 +9,7 @@ #include "common/common_funcs.h" #include "common/common_types.h" #include "common/scratch_buffer.h" +#include "video_core/host1x/codecs/decoder.h" #include "video_core/host1x/nvdec_common.h" namespace Tegra { @@ -17,20 +18,41 @@ namespace Host1x { class Host1x; } // namespace Host1x -namespace Decoder { +namespace Decoders { +enum class Vp8SurfaceIndex : u32 { + Last = 0, + Golden = 1, + AltRef = 2, + Current = 3, +}; -class VP8 { +class VP8 final : public Decoder { public: - explicit VP8(Host1x::Host1x& host1x); - ~VP8(); + explicit VP8(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id, + Host1x::FrameQueue& frame_queue); + ~VP8() override; - /// Compose the VP8 frame for FFmpeg decoding - [[nodiscard]] std::span ComposeFrame( - const Host1x::NvdecCommon::NvdecRegisters& state); + VP8(const VP8&) = delete; + VP8& operator=(const VP8&) = delete; + + VP8(VP8&&) = delete; + VP8& operator=(VP8&&) = delete; + + [[nodiscard]] std::span ComposeFrame() override; + + std::tuple GetProgressiveOffsets() override; + std::tuple GetInterlacedOffsets() override; + + bool IsInterlaced() override { + return false; + } + + std::string_view GetCurrentCodecName() const override { + return "VP8"; + } private: - Common::ScratchBuffer frame; - Host1x::Host1x& host1x; + Common::ScratchBuffer frame_scratch; struct VP8PictureInfo { INSERT_PADDING_WORDS_NOINIT(14); 
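The decoder contexts in this change (H264DecoderContext, VP8PictureInfo, the VP9 PictureInfo and EntropyProbs) all follow the same pattern: a trivially-copyable struct that mirrors the hardware layout is read straight out of guest memory and size-checked with static_assert. A minimal, self-contained sketch of that pattern follows; the struct name, fields and ReadPictureInfo helper are illustrative assumptions, not part of the codebase:

    #include <cstdint>

    struct ExamplePictureInfo {
        std::uint32_t key_frame;       // fields mirror the guest ABI one-to-one
        std::uint32_t version;
        std::uint32_t first_part_size;
        std::uint32_t vld_buffer_size;
    };
    static_assert(sizeof(ExamplePictureInfo) == 0x10, "ExamplePictureInfo has the wrong size!");

    // MemoryManager is any type exposing ReadBlock(address, void*, size),
    // e.g. Tegra::MemoryManager in this codebase.
    template <typename MemoryManager>
    ExamplePictureInfo ReadPictureInfo(MemoryManager& memory_manager, std::uint64_t addr) {
        ExamplePictureInfo info{};
        memory_manager.ReadBlock(addr, &info, sizeof(info));
        return info;
    }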
@@ -73,7 +95,9 @@ private: INSERT_PADDING_WORDS_NOINIT(3); }; static_assert(sizeof(VP8PictureInfo) == 0xc0, "PictureInfo is an invalid size"); + + VP8PictureInfo current_context{}; }; -} // namespace Decoder +} // namespace Decoders } // namespace Tegra diff --git a/src/video_core/host1x/codecs/vp9.cpp b/src/video_core/host1x/codecs/vp9.cpp index 9701b867b..e286a199a 100755 --- a/src/video_core/host1x/codecs/vp9.cpp +++ b/src/video_core/host1x/codecs/vp9.cpp @@ -4,12 +4,13 @@ #include // for std::copy #include +#include "common/alignment.h" #include "common/assert.h" #include "video_core/host1x/codecs/vp9.h" #include "video_core/host1x/host1x.h" #include "video_core/memory_manager.h" -namespace Tegra::Decoder { +namespace Tegra::Decoders { namespace { constexpr u32 diff_update_probability = 252; constexpr u32 frame_sync_code = 0x498342; @@ -237,7 +238,12 @@ constexpr std::array map_lut{ } } // Anonymous namespace -VP9::VP9(Host1x::Host1x& host1x_) : host1x{host1x_} {} +VP9::VP9(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_, + Host1x::FrameQueue& frame_queue_) + : Decoder{host1x_, id_, regs_, frame_queue_} { + codec = Host1x::NvdecCommon::VideoCodec::VP9; + initialized = decode_api.Initialize(codec); +} VP9::~VP9() = default; @@ -356,35 +362,113 @@ void VP9::WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_ } } -Vp9PictureInfo VP9::GetVp9PictureInfo(const Host1x::NvdecCommon::NvdecRegisters& state) { - PictureInfo picture_info; - host1x.GMMU().ReadBlock(state.picture_info_offset, &picture_info, sizeof(PictureInfo)); - Vp9PictureInfo vp9_info = picture_info.Convert(); +void VP9::WriteSegmentation(VpxBitStreamWriter& writer) { + bool enabled = current_picture_info.segmentation.enabled != 0; + writer.WriteBit(enabled); + if (!enabled) { + return; + } - InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy); + auto update_map = current_picture_info.segmentation.update_map != 0; + writer.WriteBit(update_map); + + if (update_map) { + EntropyProbs entropy_probs{}; + memory_manager.ReadBlock(regs.vp9_prob_tab_buffer_offset.Address(), &entropy_probs, + sizeof(entropy_probs)); + + auto WriteProb = [&](u8 prob) { + bool coded = prob != 255; + writer.WriteBit(coded); + if (coded) { + writer.WriteU(prob, 8); + } + }; + + for (size_t i = 0; i < entropy_probs.mb_segment_tree_probs.size(); i++) { + WriteProb(entropy_probs.mb_segment_tree_probs[i]); + } + + auto temporal_update = current_picture_info.segmentation.temporal_update != 0; + writer.WriteBit(temporal_update); + + if (temporal_update) { + for (s32 i = 0; i < 3; i++) { + WriteProb(entropy_probs.segment_pred_probs[i]); + } + } + } + + if (last_segmentation == current_picture_info.segmentation) { + writer.WriteBit(false); + return; + } + + last_segmentation = current_picture_info.segmentation; + writer.WriteBit(true); + writer.WriteBit(current_picture_info.segmentation.abs_delta != 0); + + constexpr s32 MAX_SEGMENTS = 8; + constexpr std::array SegmentationFeatureBits = {8, 6, 2, 0}; + + for (s32 i = 0; i < MAX_SEGMENTS; i++) { + auto q_enabled = current_picture_info.segmentation.feature_enabled[i][0] != 0; + writer.WriteBit(q_enabled); + if (q_enabled) { + writer.WriteS(current_picture_info.segmentation.feature_data[i][0], + SegmentationFeatureBits[0]); + } + + auto lf_enabled = current_picture_info.segmentation.feature_enabled[i][1] != 0; + writer.WriteBit(lf_enabled); + if (lf_enabled) { + writer.WriteS(current_picture_info.segmentation.feature_data[i][1], + 
SegmentationFeatureBits[1]); + } + + auto ref_enabled = current_picture_info.segmentation.feature_enabled[i][2] != 0; + writer.WriteBit(ref_enabled); + if (ref_enabled) { + writer.WriteU(current_picture_info.segmentation.feature_data[i][2], + SegmentationFeatureBits[2]); + } + + auto skip_enabled = current_picture_info.segmentation.feature_enabled[i][3] != 0; + writer.WriteBit(skip_enabled); + } +} + +Vp9PictureInfo VP9::GetVp9PictureInfo() { + memory_manager.ReadBlock(regs.picture_info_offset.Address(), ¤t_picture_info, + sizeof(PictureInfo)); + Vp9PictureInfo vp9_info = current_picture_info.Convert(); + + InsertEntropy(regs.vp9_prob_tab_buffer_offset.Address(), vp9_info.entropy); // surface_luma_offset[0:3] contains the address of the reference frame offsets in the following // order: last, golden, altref, current. - std::copy(state.surface_luma_offset.begin(), state.surface_luma_offset.begin() + 4, - vp9_info.frame_offsets.begin()); + for (size_t i = 0; i < 4; i++) { + vp9_info.frame_offsets[i] = regs.surface_luma_offsets[i].Address(); + } return vp9_info; } void VP9::InsertEntropy(u64 offset, Vp9EntropyProbs& dst) { EntropyProbs entropy; - host1x.GMMU().ReadBlock(offset, &entropy, sizeof(EntropyProbs)); + memory_manager.ReadBlock(offset, &entropy, sizeof(EntropyProbs)); entropy.Convert(dst); } -Vp9FrameContainer VP9::GetCurrentFrame(const Host1x::NvdecCommon::NvdecRegisters& state) { +Vp9FrameContainer VP9::GetCurrentFrame() { Vp9FrameContainer current_frame{}; { // gpu.SyncGuestHost(); epic, why? - current_frame.info = GetVp9PictureInfo(state); + current_frame.info = GetVp9PictureInfo(); current_frame.bit_stream.resize(current_frame.info.bitstream_size); - host1x.GMMU().ReadBlock(state.frame_bitstream_offset, current_frame.bit_stream.data(), - current_frame.info.bitstream_size); + memory_manager.ReadBlock(regs.frame_bitstream_offset.Address(), + current_frame.bit_stream.data(), + current_frame.info.bitstream_size); } if (!next_frame.bit_stream.empty()) { Vp9FrameContainer temp{ @@ -742,8 +826,7 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() { uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q); uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q); - ASSERT(!current_frame_info.segment_enabled); - uncomp_writer.WriteBit(false); // Segmentation enabled (TODO). 
+ WriteSegmentation(uncomp_writer); const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width); const s32 max_tile_cols_log2 = CalcMaxLog2TileCols(current_frame_info.frame_size.width); @@ -770,10 +853,29 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() { return uncomp_writer; } -void VP9::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) { +std::tuple VP9::GetProgressiveOffsets() { + auto luma{regs.surface_luma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; + auto chroma{regs.surface_chroma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; + return {luma, chroma}; +} + +std::tuple VP9::GetInterlacedOffsets() { + auto luma_top{regs.surface_luma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; + auto luma_bottom{ + regs.surface_luma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; + auto chroma_top{ + regs.surface_chroma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; + auto chroma_bottom{ + regs.surface_chroma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; + return {luma_top, luma_bottom, chroma_top, chroma_bottom}; +} + +std::span VP9::ComposeFrame() { + vp9_hidden_frame = false; + std::vector bitstream; { - Vp9FrameContainer curr_frame = GetCurrentFrame(state); + Vp9FrameContainer curr_frame = GetCurrentFrame(); current_frame_info = curr_frame.info; bitstream = std::move(curr_frame.bit_stream); } @@ -786,12 +888,16 @@ void VP9::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) { std::vector uncompressed_header = uncomp_writer.GetByteArray(); // Write headers and frame to buffer - frame.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size()); - std::copy(uncompressed_header.begin(), uncompressed_header.end(), frame.begin()); + frame_scratch.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size()); + std::copy(uncompressed_header.begin(), uncompressed_header.end(), frame_scratch.begin()); std::copy(compressed_header.begin(), compressed_header.end(), - frame.begin() + uncompressed_header.size()); + frame_scratch.begin() + uncompressed_header.size()); std::copy(bitstream.begin(), bitstream.end(), - frame.begin() + uncompressed_header.size() + compressed_header.size()); + frame_scratch.begin() + uncompressed_header.size() + compressed_header.size()); + + vp9_hidden_frame = WasFrameHidden(); + + return GetFrameBytes(); } VpxRangeEncoder::VpxRangeEncoder() { @@ -944,4 +1050,4 @@ const std::vector& VpxBitStreamWriter::GetByteArray() const { return byte_array; } -} // namespace Tegra::Decoder +} // namespace Tegra::Decoders diff --git a/src/video_core/host1x/codecs/vp9.h b/src/video_core/host1x/codecs/vp9.h index 23abf4a33..b9f4081dc 100755 --- a/src/video_core/host1x/codecs/vp9.h +++ b/src/video_core/host1x/codecs/vp9.h @@ -10,6 +10,7 @@ #include "common/common_types.h" #include "common/scratch_buffer.h" #include "common/stream.h" +#include "video_core/host1x/codecs/decoder.h" #include "video_core/host1x/codecs/vp9_types.h" #include "video_core/host1x/nvdec_common.h" @@ -19,7 +20,7 @@ namespace Host1x { class Host1x; } // namespace Host1x -namespace Decoder { +namespace Decoders { /// The VpxRangeEncoder, and VpxBitStreamWriter classes are used to compose the /// VP9 header bitstreams. 
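The WriteBit/WriteU calls used above when composing the uncompressed header pack fixed-width fields most-significant-bit first, zero-padding the final byte. A stripped-down sketch of that write discipline (this is not the project's VpxBitStreamWriter, only an illustration of the packing):

    #include <cstdint>
    #include <vector>

    class TinyBitWriter {
    public:
        void WriteBit(bool bit) { WriteU(bit ? 1u : 0u, 1); }

        // Writes the low `bits` bits of `value`, MSB first.
        void WriteU(std::uint32_t value, int bits) {
            for (int i = bits - 1; i >= 0; --i) {
                Put((value >> i) & 1u);
            }
        }

        std::vector<std::uint8_t> Finish() {
            if (bit_pos != 0) {
                // Left-align the remaining bits and pad the last byte with zeros.
                bytes.push_back(static_cast<std::uint8_t>(current << (8 - bit_pos)));
            }
            return bytes;
        }

    private:
        void Put(std::uint32_t bit) {
            current = static_cast<std::uint8_t>((current << 1) | bit);
            if (++bit_pos == 8) {
                bytes.push_back(current);
                current = 0;
                bit_pos = 0;
            }
        }

        std::vector<std::uint8_t> bytes;
        std::uint8_t current = 0;
        int bit_pos = 0;
    };

    // Example: the VP9 two-bit frame marker (value 2) followed by two profile bits
    // would be emitted as writer.WriteU(2, 2); writer.WriteBit(false); writer.WriteBit(false);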
@@ -110,21 +111,32 @@ private: std::vector byte_array; }; -class VP9 { +class VP9 final : public Decoder { public: - explicit VP9(Host1x::Host1x& host1x); - ~VP9(); + explicit VP9(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id, + Host1x::FrameQueue& frame_queue); + ~VP9() override; VP9(const VP9&) = delete; VP9& operator=(const VP9&) = delete; - VP9(VP9&&) = default; + VP9(VP9&&) = delete; VP9& operator=(VP9&&) = delete; - /// Composes the VP9 frame from the GPU state information. - /// Based on the official VP9 spec documentation - void ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state); + [[nodiscard]] std::span ComposeFrame() override; + std::tuple GetProgressiveOffsets() override; + std::tuple GetInterlacedOffsets() override; + + bool IsInterlaced() override { + return false; + } + + std::string_view GetCurrentCodecName() const override { + return "VP9"; + } + +private: /// Returns true if the most recent frame was a hidden frame. [[nodiscard]] bool WasFrameHidden() const { return !current_frame_info.show_frame; @@ -132,10 +144,9 @@ public: /// Returns a const span to the composed frame data. [[nodiscard]] std::span GetFrameBytes() const { - return frame; + return frame_scratch; } -private: /// Generates compressed header probability updates in the bitstream writer template void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array& new_prob, @@ -167,23 +178,22 @@ private: /// Write motion vector probability updates. 6.3.17 in the spec void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); + void WriteSegmentation(VpxBitStreamWriter& writer); + /// Returns VP9 information from NVDEC provided offset and size - [[nodiscard]] Vp9PictureInfo GetVp9PictureInfo( - const Host1x::NvdecCommon::NvdecRegisters& state); + [[nodiscard]] Vp9PictureInfo GetVp9PictureInfo(); /// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct void InsertEntropy(u64 offset, Vp9EntropyProbs& dst); /// Returns frame to be decoded after buffering - [[nodiscard]] Vp9FrameContainer GetCurrentFrame( - const Host1x::NvdecCommon::NvdecRegisters& state); + [[nodiscard]] Vp9FrameContainer GetCurrentFrame(); /// Use NVDEC providied information to compose the headers for the current frame [[nodiscard]] std::vector ComposeCompressedHeader(); [[nodiscard]] VpxBitStreamWriter ComposeUncompressedHeader(); - Host1x::Host1x& host1x; - Common::ScratchBuffer frame; + Common::ScratchBuffer frame_scratch; std::array loop_filter_ref_deltas{}; std::array loop_filter_mode_deltas{}; @@ -192,9 +202,11 @@ private: std::array frame_ctxs{}; bool swap_ref_indices{}; + Segmentation last_segmentation{}; + PictureInfo current_picture_info{}; Vp9PictureInfo current_frame_info{}; Vp9EntropyProbs prev_frame_probs{}; }; -} // namespace Decoder +} // namespace Decoders } // namespace Tegra diff --git a/src/video_core/host1x/codecs/vp9_types.h b/src/video_core/host1x/codecs/vp9_types.h index 591fe73de..dad6e8437 100755 --- a/src/video_core/host1x/codecs/vp9_types.h +++ b/src/video_core/host1x/codecs/vp9_types.h @@ -11,7 +11,14 @@ namespace Tegra { -namespace Decoder { +namespace Decoders { +enum class Vp9SurfaceIndex : u32 { + Last = 0, + Golden = 1, + AltRef = 2, + Current = 3, +}; + struct Vp9FrameDimensions { s16 width; s16 height; @@ -48,11 +55,13 @@ enum class TxMode { }; struct Segmentation { + constexpr bool operator==(const Segmentation& rhs) const = default; + u8 enabled; u8 update_map; u8 temporal_update; u8 abs_delta; - std::array 
feature_mask; + std::array, 8> feature_enabled; std::array, 8> feature_data; }; static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size"); @@ -190,7 +199,17 @@ struct PictureInfo { static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size"); struct EntropyProbs { - INSERT_PADDING_BYTES_NOINIT(1024); ///< 0x0000 + std::array kf_bmode_prob; ///< 0x0000 + std::array kf_bmode_probB; ///< 0x0320 + std::array ref_pred_probs; ///< 0x0384 + std::array mb_segment_tree_probs; ///< 0x0387 + std::array segment_pred_probs; ///< 0x038E + std::array ref_scores; ///< 0x0391 + std::array prob_comppred; ///< 0x0395 + INSERT_PADDING_BYTES_NOINIT(9); ///< 0x0397 + std::array kf_uv_mode_prob; ///< 0x03A0 + std::array kf_uv_mode_probB; ///< 0x03F0 + INSERT_PADDING_BYTES_NOINIT(6); ///< 0x03FA std::array inter_mode_prob; ///< 0x0400 std::array intra_inter_prob; ///< 0x041C INSERT_PADDING_BYTES_NOINIT(80); ///< 0x0420 @@ -302,5 +321,5 @@ ASSERT_POSITION(class_0_fr, 0x560); ASSERT_POSITION(coef_probs, 0x5A0); #undef ASSERT_POSITION -}; // namespace Decoder +}; // namespace Decoders }; // namespace Tegra diff --git a/src/video_core/host1x/control.cpp b/src/video_core/host1x/control.cpp index 1406bd41e..b931e854d 100755 --- a/src/video_core/host1x/control.cpp +++ b/src/video_core/host1x/control.cpp @@ -27,6 +27,7 @@ void Control::ProcessMethod(Method method, u32 argument) { } void Control::Execute(u32 data) { + LOG_TRACE(Service_NVDRV, "Control wait syncpt {} value {}", data, syncpoint_value); host1x.GetSyncpointManager().WaitHost(data, syncpoint_value); } diff --git a/src/video_core/host1x/control.h b/src/video_core/host1x/control.h index d76da5ecf..41dfe8ca0 100755 --- a/src/video_core/host1x/control.h +++ b/src/video_core/host1x/control.h @@ -6,9 +6,7 @@ #include "common/common_types.h" -namespace Tegra { - -namespace Host1x { +namespace Tegra::Host1x { class Host1x; class Nvdec; @@ -31,10 +29,8 @@ private: /// For Host1x, execute is waiting on a syncpoint previously written into the state void Execute(u32 data); - u32 syncpoint_value{}; Host1x& host1x; + u32 syncpoint_value{}; }; -} // namespace Host1x - -} // namespace Tegra +} // namespace Tegra::Host1x diff --git a/src/video_core/host1x/ffmpeg/ffmpeg.cpp b/src/video_core/host1x/ffmpeg/ffmpeg.cpp index 96686da59..c80768ca3 100755 --- a/src/video_core/host1x/ffmpeg/ffmpeg.cpp +++ b/src/video_core/host1x/ffmpeg/ffmpeg.cpp @@ -5,7 +5,9 @@ #include "common/logging/log.h" #include "common/scope_exit.h" #include "common/settings.h" +#include "core/memory.h" #include "video_core/host1x/ffmpeg/ffmpeg.h" +#include "video_core/memory_manager.h" extern "C" { #ifdef LIBVA_FOUND @@ -132,7 +134,7 @@ bool HardwareContext::InitializeForDecoder(DecoderContext& decoder_context, const Decoder& decoder) { const auto supported_types = GetSupportedDeviceTypes(); for (const auto type : PreferredGpuDecoders) { - AVPixelFormat hw_pix_fmt; + // AVPixelFormat hw_pix_fmt; if (std::ranges::find(supported_types, type) == supported_types.end()) { LOG_DEBUG(HW_GPU, "{} explicitly unsupported", av_hwdevice_get_type_name(type)); @@ -143,12 +145,14 @@ bool HardwareContext::InitializeForDecoder(DecoderContext& decoder_context, continue; } - if (decoder.SupportsDecodingOnDevice(&hw_pix_fmt, type)) { - decoder_context.InitializeHardwareDecoder(*this, hw_pix_fmt); - return true; - } + // Disable GPU decoding as it cannot return decode frame ordering which breaks everything. 
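For context on the GetSupportedDeviceTypes() call above: libavutil exposes the hardware device types a given build supports through av_hwdevice_iterate_types. A small standalone sketch of that enumeration (illustration only, not yuzu code):

extern "C" {
#include <libavutil/hwcontext.h>
}

#include <cstdio>

int main() {
    // Walk every hardware device type this libavutil build was compiled with.
    for (AVHWDeviceType type = av_hwdevice_iterate_types(AV_HWDEVICE_TYPE_NONE);
         type != AV_HWDEVICE_TYPE_NONE; type = av_hwdevice_iterate_types(type)) {
        std::printf("supported hwdevice: %s\n", av_hwdevice_get_type_name(type));
    }
    return 0;
}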
+ // if (decoder.SupportsDecodingOnDevice(&hw_pix_fmt, type)) { + // decoder_context.InitializeHardwareDecoder(*this, hw_pix_fmt); + // return true; + //} } + LOG_INFO(HW_GPU, "Hardware decoding is disabled due to implementation issues, using CPU."); return false; } @@ -183,8 +187,8 @@ bool HardwareContext::InitializeWithType(AVHWDeviceType type) { return true; } -DecoderContext::DecoderContext(const Decoder& decoder) { - m_codec_context = avcodec_alloc_context3(decoder.GetCodec()); +DecoderContext::DecoderContext(const Decoder& decoder) : m_decoder{decoder} { + m_codec_context = avcodec_alloc_context3(m_decoder.GetCodec()); av_opt_set(m_codec_context->priv_data, "tune", "zerolatency", 0); m_codec_context->thread_count = 0; m_codec_context->thread_type &= ~FF_THREAD_FRAME; @@ -216,6 +220,25 @@ bool DecoderContext::OpenContext(const Decoder& decoder) { } bool DecoderContext::SendPacket(const Packet& packet) { + m_temp_frame = std::make_shared(); + m_got_frame = 0; + +// Android can randomly crash when calling decode directly, so skip. +// TODO update ffmpeg and hope that fixes it. +#ifndef ANDROID + if (!m_codec_context->hw_device_ctx && m_codec_context->codec_id == AV_CODEC_ID_H264) { + m_decode_order = true; + auto* codec{ffcodec(m_decoder.GetCodec())}; + if (const int ret = codec->cb.decode(m_codec_context, m_temp_frame->GetFrame(), + &m_got_frame, packet.GetPacket()); + ret < 0) { + LOG_DEBUG(Service_NVDRV, "avcodec_send_packet error {}", AVError(ret)); + return false; + } + return true; + } +#endif + if (const int ret = avcodec_send_packet(m_codec_context, packet.GetPacket()); ret < 0) { LOG_ERROR(HW_GPU, "avcodec_send_packet error: {}", AVError(ret)); return false; @@ -224,139 +247,73 @@ bool DecoderContext::SendPacket(const Packet& packet) { return true; } -std::unique_ptr DecoderContext::ReceiveFrame(bool* out_is_interlaced) { - auto dst_frame = std::make_unique(); +std::shared_ptr DecoderContext::ReceiveFrame() { + // Android can randomly crash when calling decode directly, so skip. + // TODO update ffmpeg and hope that fixes it. +#ifndef ANDROID + if (!m_codec_context->hw_device_ctx && m_codec_context->codec_id == AV_CODEC_ID_H264) { + m_decode_order = true; + auto* codec{ffcodec(m_decoder.GetCodec())}; + int ret{0}; - const auto ReceiveImpl = [&](AVFrame* frame) { - if (const int ret = avcodec_receive_frame(m_codec_context, frame); ret < 0) { - LOG_ERROR(HW_GPU, "avcodec_receive_frame error: {}", AVError(ret)); - return false; + if (m_got_frame == 0) { + Packet packet{{}}; + auto* pkt = packet.GetPacket(); + pkt->data = nullptr; + pkt->size = 0; + ret = codec->cb.decode(m_codec_context, m_temp_frame->GetFrame(), &m_got_frame, pkt); + m_codec_context->has_b_frames = 0; } - *out_is_interlaced = -#if defined(FF_API_INTERLACED_FRAME) || LIBAVUTIL_VERSION_MAJOR >= 59 - (frame->flags & AV_FRAME_FLAG_INTERLACED) != 0; -#else - frame->interlaced_frame != 0; + if (m_got_frame == 0 || ret < 0) { + LOG_ERROR(Service_NVDRV, "Failed to receive a frame! error {}", ret); + return {}; + } + } else #endif - return true; - }; + { - if (m_codec_context->hw_device_ctx) { - // If we have a hardware context, make a separate frame here to receive the - // hardware result before sending it to the output. 
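The non-Android fallback below still relies on the conventional libavcodec send/receive pair. As a reference, a minimal version of that loop, with a hypothetical on_frame callback standing in for whatever consumes the frames:

extern "C" {
#include <libavcodec/avcodec.h>
}

// ctx and pkt are assumed to be an opened decoder context and a demuxed
// packet; on_frame is a hypothetical callback that consumes each frame.
template <typename F>
int DecodePacket(AVCodecContext* ctx, const AVPacket* pkt, F&& on_frame) {
    int ret = avcodec_send_packet(ctx, pkt);
    if (ret < 0) {
        return ret; // packet rejected
    }
    AVFrame* frame = av_frame_alloc();
    while (true) {
        ret = avcodec_receive_frame(ctx, frame);
        if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
            ret = 0; // decoder needs more input or is fully drained
            break;
        }
        if (ret < 0) {
            break; // real decode error
        }
        on_frame(frame); // hand the decoded frame to the caller
    }
    av_frame_free(&frame);
    return ret;
}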
- Frame intermediate_frame; + const auto ReceiveImpl = [&](AVFrame* frame) { + if (const int ret = avcodec_receive_frame(m_codec_context, frame); ret < 0) { + LOG_ERROR(HW_GPU, "avcodec_receive_frame error: {}", AVError(ret)); + return false; + } - if (!ReceiveImpl(intermediate_frame.GetFrame())) { - return {}; - } + return true; + }; - dst_frame->SetFormat(PreferredGpuFormat); - if (const int ret = - av_hwframe_transfer_data(dst_frame->GetFrame(), intermediate_frame.GetFrame(), 0); - ret < 0) { - LOG_ERROR(HW_GPU, "av_hwframe_transfer_data error: {}", AVError(ret)); - return {}; - } - } else { - // Otherwise, decode the frame as normal. - if (!ReceiveImpl(dst_frame->GetFrame())) { - return {}; + if (m_codec_context->hw_device_ctx) { + // If we have a hardware context, make a separate frame here to receive the + // hardware result before sending it to the output. + Frame intermediate_frame; + + if (!ReceiveImpl(intermediate_frame.GetFrame())) { + return {}; + } + + m_temp_frame->SetFormat(PreferredGpuFormat); + if (const int ret = av_hwframe_transfer_data(m_temp_frame->GetFrame(), + intermediate_frame.GetFrame(), 0); + ret < 0) { + LOG_ERROR(HW_GPU, "av_hwframe_transfer_data error: {}", AVError(ret)); + return {}; + } + } else { + // Otherwise, decode the frame as normal. + if (!ReceiveImpl(m_temp_frame->GetFrame())) { + return {}; + } } } - return dst_frame; -} - -DeinterlaceFilter::DeinterlaceFilter(const Frame& frame) { - const AVFilter* buffer_src = avfilter_get_by_name("buffer"); - const AVFilter* buffer_sink = avfilter_get_by_name("buffersink"); - AVFilterInOut* inputs = avfilter_inout_alloc(); - AVFilterInOut* outputs = avfilter_inout_alloc(); - SCOPE_EXIT({ - avfilter_inout_free(&inputs); - avfilter_inout_free(&outputs); - }); - - // Don't know how to get the accurate time_base but it doesn't matter for yadif filter - // so just use 1/1 to make buffer filter happy - std::string args = fmt::format("video_size={}x{}:pix_fmt={}:time_base=1/1", frame.GetWidth(), - frame.GetHeight(), static_cast(frame.GetPixelFormat())); - - m_filter_graph = avfilter_graph_alloc(); - int ret = avfilter_graph_create_filter(&m_source_context, buffer_src, "in", args.c_str(), - nullptr, m_filter_graph); - if (ret < 0) { - LOG_ERROR(HW_GPU, "avfilter_graph_create_filter source error: {}", AVError(ret)); - return; - } - - ret = avfilter_graph_create_filter(&m_sink_context, buffer_sink, "out", nullptr, nullptr, - m_filter_graph); - if (ret < 0) { - LOG_ERROR(HW_GPU, "avfilter_graph_create_filter sink error: {}", AVError(ret)); - return; - } - - inputs->name = av_strdup("out"); - inputs->filter_ctx = m_sink_context; - inputs->pad_idx = 0; - inputs->next = nullptr; - - outputs->name = av_strdup("in"); - outputs->filter_ctx = m_source_context; - outputs->pad_idx = 0; - outputs->next = nullptr; - - const char* description = "yadif=1:-1:0"; - ret = avfilter_graph_parse_ptr(m_filter_graph, description, &inputs, &outputs, nullptr); - if (ret < 0) { - LOG_ERROR(HW_GPU, "avfilter_graph_parse_ptr error: {}", AVError(ret)); - return; - } - - ret = avfilter_graph_config(m_filter_graph, nullptr); - if (ret < 0) { - LOG_ERROR(HW_GPU, "avfilter_graph_config error: {}", AVError(ret)); - return; - } - - m_initialized = true; -} - -bool DeinterlaceFilter::AddSourceFrame(const Frame& frame) { - if (const int ret = av_buffersrc_add_frame_flags(m_source_context, frame.GetFrame(), - AV_BUFFERSRC_FLAG_KEEP_REF); - ret < 0) { - LOG_ERROR(HW_GPU, "av_buffersrc_add_frame_flags error: {}", AVError(ret)); - return false; - } - - return 
true; -} - -std::unique_ptr DeinterlaceFilter::DrainSinkFrame() { - auto dst_frame = std::make_unique(); - const int ret = av_buffersink_get_frame(m_sink_context, dst_frame->GetFrame()); - - if (ret == AVERROR(EAGAIN) || ret == AVERROR(AVERROR_EOF)) { - return {}; - } - - if (ret < 0) { - LOG_ERROR(HW_GPU, "av_buffersink_get_frame error: {}", AVError(ret)); - return {}; - } - - return dst_frame; -} - -DeinterlaceFilter::~DeinterlaceFilter() { - avfilter_graph_free(&m_filter_graph); +#if defined(FF_API_INTERLACED_FRAME) || LIBAVUTIL_VERSION_MAJOR >= 59 + m_temp_frame->GetFrame()->interlaced_frame = + (m_temp_frame->GetFrame()->flags & AV_FRAME_FLAG_INTERLACED) != 0; +#endif + return std::move(m_temp_frame); } void DecodeApi::Reset() { - m_deinterlace_filter.reset(); m_hardware_context.reset(); m_decoder_context.reset(); m_decoder.reset(); @@ -382,43 +339,14 @@ bool DecodeApi::Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec) { return true; } -bool DecodeApi::SendPacket(std::span packet_data, size_t configuration_size) { +bool DecodeApi::SendPacket(std::span packet_data) { FFmpeg::Packet packet(packet_data); return m_decoder_context->SendPacket(packet); } -void DecodeApi::ReceiveFrames(std::queue>& frame_queue) { +std::shared_ptr DecodeApi::ReceiveFrame() { // Receive raw frame from decoder. - bool is_interlaced; - auto frame = m_decoder_context->ReceiveFrame(&is_interlaced); - if (!frame) { - return; - } - - if (!is_interlaced) { - // If the frame is not interlaced, we can pend it now. - frame_queue.push(std::move(frame)); - } else { - // Create the deinterlacer if needed. - if (!m_deinterlace_filter) { - m_deinterlace_filter.emplace(*frame); - } - - // Add the frame we just received. - if (!m_deinterlace_filter->AddSourceFrame(*frame)) { - return; - } - - // Pend output fields. - while (true) { - auto filter_frame = m_deinterlace_filter->DrainSinkFrame(); - if (!filter_frame) { - break; - } - - frame_queue.push(std::move(filter_frame)); - } - } + return m_decoder_context->ReceiveFrame(); } } // namespace FFmpeg diff --git a/src/video_core/host1x/ffmpeg/ffmpeg.h b/src/video_core/host1x/ffmpeg/ffmpeg.h index 1de0bbd83..a74fcba80 100755 --- a/src/video_core/host1x/ffmpeg/ffmpeg.h +++ b/src/video_core/host1x/ffmpeg/ffmpeg.h @@ -20,17 +20,20 @@ extern "C" { #endif #include -#include -#include -#include -#include #include +#ifndef ANDROID +#include +#endif #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic pop #endif } +namespace Tegra { +class MemoryManager; +} + namespace FFmpeg { class Packet; @@ -90,6 +93,10 @@ public: return m_frame->data[plane]; } + const u8* GetPlane(int plane) const { + return m_frame->data[plane]; + } + u8** GetPlanes() const { return m_frame->data; } @@ -98,6 +105,14 @@ public: m_frame->format = format; } + bool IsInterlaced() const { + return m_frame->interlaced_frame != 0; + } + + bool IsHardwareDecoded() const { + return m_frame->hw_frames_ctx != nullptr; + } + AVFrame* GetFrame() const { return m_frame; } @@ -160,33 +175,22 @@ public: void InitializeHardwareDecoder(const HardwareContext& context, AVPixelFormat hw_pix_fmt); bool OpenContext(const Decoder& decoder); bool SendPacket(const Packet& packet); - std::unique_ptr ReceiveFrame(bool* out_is_interlaced); + std::shared_ptr ReceiveFrame(); AVCodecContext* GetCodecContext() const { return m_codec_context; } + bool UsingDecodeOrder() const { + return m_decode_order; + } + private: + const Decoder& m_decoder; AVCodecContext* m_codec_context{}; -}; - -// Wraps an AVFilterGraph. 
-class DeinterlaceFilter { -public: - YUZU_NON_COPYABLE(DeinterlaceFilter); - YUZU_NON_MOVEABLE(DeinterlaceFilter); - - explicit DeinterlaceFilter(const Frame& frame); - ~DeinterlaceFilter(); - - bool AddSourceFrame(const Frame& frame); - std::unique_ptr DrainSinkFrame(); - -private: - AVFilterGraph* m_filter_graph{}; - AVFilterContext* m_source_context{}; - AVFilterContext* m_sink_context{}; - bool m_initialized{}; + s32 m_got_frame{}; + std::shared_ptr m_temp_frame{}; + bool m_decode_order{}; }; class DecodeApi { @@ -200,14 +204,17 @@ public: bool Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec); void Reset(); - bool SendPacket(std::span packet_data, size_t configuration_size); - void ReceiveFrames(std::queue>& frame_queue); + bool UsingDecodeOrder() const { + return m_decoder_context->UsingDecodeOrder(); + } + + bool SendPacket(std::span packet_data); + std::shared_ptr ReceiveFrame(); private: std::optional m_decoder; std::optional m_decoder_context; std::optional m_hardware_context; - std::optional m_deinterlace_filter; }; } // namespace FFmpeg diff --git a/src/video_core/host1x/host1x.cpp b/src/video_core/host1x/host1x.cpp index 27c59640b..6bc85b9ed 100755 --- a/src/video_core/host1x/host1x.cpp +++ b/src/video_core/host1x/host1x.cpp @@ -3,10 +3,10 @@ #include "core/core.h" #include "video_core/host1x/host1x.h" +#include "video_core/host1x/nvdec.h" +#include "video_core/host1x/vic.h" -namespace Tegra { - -namespace Host1x { +namespace Tegra::Host1x { Host1x::Host1x(Core::System& system_) : system{system_}, syncpoint_manager{}, @@ -15,6 +15,22 @@ Host1x::Host1x(Core::System& system_) Host1x::~Host1x() = default; -} // namespace Host1x +void Host1x::StartDevice(s32 fd, ChannelType type, u32 syncpt) { + switch (type) { + case ChannelType::NvDec: + devices[fd] = std::make_unique(*this, fd, syncpt, frame_queue); + break; + case ChannelType::VIC: + devices[fd] = std::make_unique(*this, fd, syncpt, frame_queue); + break; + default: + LOG_ERROR(HW_GPU, "Unimplemented host1x device {}", static_cast(type)); + break; + } +} -} // namespace Tegra +void Host1x::StopDevice(s32 fd, ChannelType type) { + devices.erase(fd); +} + +} // namespace Tegra::Host1x diff --git a/src/video_core/host1x/host1x.h b/src/video_core/host1x/host1x.h index 958aeda4f..24a46e26c 100755 --- a/src/video_core/host1x/host1x.h +++ b/src/video_core/host1x/host1x.h @@ -3,9 +3,14 @@ #pragma once +#include +#include +#include + #include "common/common_types.h" #include "common/address_space.h" +#include "video_core/cdma_pusher.h" #include "video_core/host1x/gpu_device_memory_manager.h" #include "video_core/host1x/syncpoint_manager.h" #include "video_core/memory_manager.h" @@ -14,15 +19,128 @@ namespace Core { class System; } // namespace Core -namespace Tegra { +namespace FFmpeg { +class Frame; +} // namespace FFmpeg -namespace Host1x { +namespace Tegra::Host1x { +class Nvdec; + +class FrameQueue { +public: + void Open(s32 fd) { + std::scoped_lock l{m_mutex}; + m_presentation_order.insert({fd, {}}); + m_decode_order.insert({fd, {}}); + } + + void Close(s32 fd) { + std::scoped_lock l{m_mutex}; + m_presentation_order.erase(fd); + m_decode_order.erase(fd); + } + + s32 VicFindNvdecFdFromOffset(u64 search_offset) { + std::scoped_lock l{m_mutex}; + // Vic does not know which nvdec is producing frames for it, so search all the fds here for + // the given offset. 
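A stripped-down illustration of the lookup the FrameQueue comment below describes: per-fd queues guarded by one mutex, searched linearly by surface offset when the VIC does not know which nvdec produced a frame. The types here are simplified stand-ins, not the real FrameQueue:

#include <cstdint>
#include <deque>
#include <memory>
#include <mutex>
#include <unordered_map>
#include <utility>

struct DecodedFrame {}; // stand-in for FFmpeg::Frame

class MiniFrameQueue {
public:
    void Push(std::int32_t fd, std::uint64_t offset, std::shared_ptr<DecodedFrame> frame) {
        std::scoped_lock lock{mutex};
        queues[fd].emplace_back(offset, std::move(frame));
    }

    // Returns the owning fd for a surface offset, or -1 when no queue holds it.
    std::int32_t FindFdFromOffset(std::uint64_t offset) {
        std::scoped_lock lock{mutex};
        for (const auto& [fd, queue] : queues) {
            for (const auto& [frame_offset, frame] : queue) {
                if (frame_offset == offset) {
                    return fd;
                }
            }
        }
        return -1;
    }

private:
    std::mutex mutex;
    std::unordered_map<std::int32_t,
                       std::deque<std::pair<std::uint64_t, std::shared_ptr<DecodedFrame>>>>
        queues;
};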
+ for (auto& map : m_presentation_order) { + for (auto& [offset, frame] : map.second) { + if (offset == search_offset) { + return map.first; + } + } + } + + for (auto& map : m_decode_order) { + for (auto& [offset, frame] : map.second) { + if (offset == search_offset) { + return map.first; + } + } + } + + return -1; + } + + void PushPresentOrder(s32 fd, u64 offset, std::shared_ptr&& frame) { + std::scoped_lock l{m_mutex}; + auto map = m_presentation_order.find(fd); + map->second.emplace_back(offset, std::move(frame)); + } + + void PushDecodeOrder(s32 fd, u64 offset, std::shared_ptr&& frame) { + std::scoped_lock l{m_mutex}; + auto map = m_decode_order.find(fd); + map->second.insert_or_assign(offset, std::move(frame)); + } + + std::shared_ptr GetFrame(s32 fd, u64 offset) { + if (fd == -1) { + return {}; + } + + std::scoped_lock l{m_mutex}; + auto present_map = m_presentation_order.find(fd); + if (present_map->second.size() > 0) { + return GetPresentOrderLocked(fd); + } + + auto decode_map = m_decode_order.find(fd); + if (decode_map->second.size() > 0) { + return GetDecodeOrderLocked(fd, offset); + } + + return {}; + } + +private: + std::shared_ptr GetPresentOrderLocked(s32 fd) { + auto map = m_presentation_order.find(fd); + if (map->second.size() == 0) { + return {}; + } + auto frame = std::move(map->second.front().second); + map->second.pop_front(); + return frame; + } + + std::shared_ptr GetDecodeOrderLocked(s32 fd, u64 offset) { + auto map = m_decode_order.find(fd); + auto it = map->second.find(offset); + if (it == map->second.end()) { + return {}; + } + return std::move(map->second.extract(it).mapped()); + } + + using FramePtr = std::shared_ptr; + + std::mutex m_mutex{}; + std::unordered_map>> m_presentation_order; + std::unordered_map> m_decode_order; +}; + +enum class ChannelType : u32 { + MsEnc = 0, + VIC = 1, + GPU = 2, + NvDec = 3, + Display = 4, + NvJpg = 5, + TSec = 6, + Max = 7, +}; class Host1x { public: explicit Host1x(Core::System& system); ~Host1x(); + Core::System& System() { + return system; + } + SyncpointManager& GetSyncpointManager() { return syncpoint_manager; } @@ -55,14 +173,25 @@ public: return *allocator; } + void StartDevice(s32 fd, ChannelType type, u32 syncpt); + void StopDevice(s32 fd, ChannelType type); + + void PushEntries(s32 fd, ChCommandHeaderList&& entries) { + auto it = devices.find(fd); + if (it == devices.end()) { + return; + } + it->second->PushEntries(std::move(entries)); + } + private: Core::System& system; SyncpointManager syncpoint_manager; Tegra::MaxwellDeviceMemoryManager memory_manager; Tegra::MemoryManager gmmu_manager; std::unique_ptr> allocator; + FrameQueue frame_queue; + std::unordered_map> devices; }; -} // namespace Host1x - -} // namespace Tegra +} // namespace Tegra::Host1x diff --git a/src/video_core/host1x/nvdec.cpp b/src/video_core/host1x/nvdec.cpp index 4507a0d26..2972bbf60 100755 --- a/src/video_core/host1x/nvdec.cpp +++ b/src/video_core/host1x/nvdec.cpp @@ -2,6 +2,12 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "common/assert.h" + +#include "common/polyfill_thread.h" +#include "common/settings.h" +#include "video_core/host1x/codecs/h264.h" +#include "video_core/host1x/codecs/vp8.h" +#include "video_core/host1x/codecs/vp9.h" #include "video_core/host1x/host1x.h" #include "video_core/host1x/nvdec.h" @@ -10,37 +16,70 @@ namespace Tegra::Host1x { #define NVDEC_REG_INDEX(field_name) \ (offsetof(NvdecCommon::NvdecRegisters, field_name) / sizeof(u64)) -Nvdec::Nvdec(Host1x& host1x_) - : host1x(host1x_), state{}, 
codec(std::make_unique(host1x, state)) {} +Nvdec::Nvdec(Host1x& host1x_, s32 id_, u32 syncpt, FrameQueue& frame_queue_) + : CDmaPusher{host1x_, id_}, id{id_}, syncpoint{syncpt}, frame_queue{frame_queue_} { + LOG_INFO(HW_GPU, "Created nvdec {}", id); + frame_queue.Open(id); +} -Nvdec::~Nvdec() = default; +Nvdec::~Nvdec() { + LOG_INFO(HW_GPU, "Destroying nvdec {}", id); + frame_queue.Close(id); +} void Nvdec::ProcessMethod(u32 method, u32 argument) { - state.reg_array[method] = static_cast(argument) << 8; + regs.reg_array[method] = argument; switch (method) { case NVDEC_REG_INDEX(set_codec_id): - codec->SetTargetCodec(static_cast(argument)); + CreateDecoder(static_cast(argument)); break; - case NVDEC_REG_INDEX(execute): + case NVDEC_REG_INDEX(execute): { + if (wait_needed) { + std::this_thread::sleep_for(std::chrono::milliseconds(32)); + wait_needed = false; + } Execute(); - break; + } break; } } -std::unique_ptr Nvdec::GetFrame() { - return codec->GetCurrentFrame(); +void Nvdec::CreateDecoder(NvdecCommon::VideoCodec codec) { + if (decoder.get()) { + return; + } + switch (codec) { + case NvdecCommon::VideoCodec::H264: + decoder = std::make_unique(host1x, regs, id, frame_queue); + break; + case NvdecCommon::VideoCodec::VP8: + decoder = std::make_unique(host1x, regs, id, frame_queue); + break; + case NvdecCommon::VideoCodec::VP9: + decoder = std::make_unique(host1x, regs, id, frame_queue); + break; + default: + UNIMPLEMENTED_MSG("Codec {}", decoder->GetCurrentCodecName()); + break; + } + LOG_INFO(HW_GPU, "Created decoder {} for id {}", decoder->GetCurrentCodecName(), id); } void Nvdec::Execute() { - switch (codec->GetCurrentCodec()) { + if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Off) [[unlikely]] { + // Signalling syncpts too fast can cause games to get stuck as they don't expect a <1ms + // execution time. Sleep for half of a 60 fps frame just in case. 
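The NVDEC_REG_INDEX macro used by ProcessMethod above maps a register field to its method index by dividing the field's byte offset by sizeof(u64). A self-contained illustration with a made-up two-register layout (not the real NvdecRegisters):

#include <cstddef>
#include <cstdint>

// ExampleRegs is a made-up layout for illustration only.
struct ExampleRegs {
    std::uint64_t set_codec_id;
    std::uint64_t execute;
};

constexpr std::size_t RegIndex(std::size_t byte_offset) {
    return byte_offset / sizeof(std::uint64_t);
}

static_assert(RegIndex(offsetof(ExampleRegs, set_codec_id)) == 0);
static_assert(RegIndex(offsetof(ExampleRegs, execute)) == 1);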
+ std::this_thread::sleep_for(std::chrono::milliseconds(8)); + return; + } + switch (decoder->GetCurrentCodec()) { case NvdecCommon::VideoCodec::H264: case NvdecCommon::VideoCodec::VP8: case NvdecCommon::VideoCodec::VP9: - codec->Decode(); + decoder->Decode(); break; default: - UNIMPLEMENTED_MSG("Codec {}", codec->GetCurrentCodecName()); + UNIMPLEMENTED_MSG("Codec {}", decoder->GetCurrentCodecName()); break; } } diff --git a/src/video_core/host1x/nvdec.h b/src/video_core/host1x/nvdec.h index 806c30f2c..a1755696b 100755 --- a/src/video_core/host1x/nvdec.h +++ b/src/video_core/host1x/nvdec.h @@ -5,33 +5,47 @@ #include #include + #include "common/common_types.h" -#include "video_core/host1x/codecs/codec.h" +#include "video_core/cdma_pusher.h" +#include "video_core/host1x/codecs/decoder.h" namespace Tegra { namespace Host1x { - class Host1x; +class FrameQueue; -class Nvdec { +class Nvdec final : public CDmaPusher { public: - explicit Nvdec(Host1x& host1x); + explicit Nvdec(Host1x& host1x, s32 id, u32 syncpt, FrameQueue& frame_queue_); ~Nvdec(); /// Writes the method into the state, Invoke Execute() if encountered - void ProcessMethod(u32 method, u32 argument); + void ProcessMethod(u32 method, u32 arg) override; - /// Return most recently decoded frame - [[nodiscard]] std::unique_ptr GetFrame(); + u32 GetSyncpoint() const { + return syncpoint; + } + + void SetWait() { + wait_needed = true; + } private: + /// Create the decoder when the codec id is set + void CreateDecoder(NvdecCommon::VideoCodec codec); + /// Invoke codec to decode a frame void Execute(); - Host1x& host1x; - NvdecCommon::NvdecRegisters state; - std::unique_ptr codec; + s32 id; + u32 syncpoint; + FrameQueue& frame_queue; + + NvdecCommon::NvdecRegisters regs{}; + std::unique_ptr decoder; + bool wait_needed{false}; }; } // namespace Host1x diff --git a/src/video_core/host1x/nvdec_common.h b/src/video_core/host1x/nvdec_common.h index 0fd678269..13b96452f 100755 --- a/src/video_core/host1x/nvdec_common.h +++ b/src/video_core/host1x/nvdec_common.h @@ -17,6 +17,17 @@ enum class VideoCodec : u64 { VP9 = 0x9, }; +struct Offset { + constexpr u64 Address() const noexcept { + return offset << 8; + } + +private: + u64 offset; +}; +static_assert(std::is_trivial_v, "Offset must be trivial"); +static_assert(sizeof(Offset) == 0x8, "Offset has the wrong size!"); + // NVDEC should use a 32-bit address space, but is mapped to 64-bit, // doubling the sizes here is compensating for that. 
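The Offset wrapper introduced above reflects that these registers hold addresses in 256-byte units, so Address() expands the stored value with a left shift by 8. A simplified version with a public member, for illustration only (the real struct keeps the field private):

#include <cstdint>

// Simplified stand-in for the Offset register wrapper; only the shift
// behaviour is the point being illustrated.
struct ExampleOffset {
    constexpr std::uint64_t Address() const noexcept {
        return offset << 8; // 256-byte units -> byte address
    }
    std::uint64_t offset;
};

static_assert(ExampleOffset{0x12'3456}.Address() == 0x1234'5600);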
struct NvdecRegisters { @@ -38,29 +49,40 @@ struct NvdecRegisters { BitField<17, 1, u64> all_intra_frame; }; } control_params; - u64 picture_info_offset; ///< 0x0808 - u64 frame_bitstream_offset; ///< 0x0810 - u64 frame_number; ///< 0x0818 - u64 h264_slice_data_offsets; ///< 0x0820 - u64 h264_mv_dump_offset; ///< 0x0828 - INSERT_PADDING_WORDS_NOINIT(6); ///< 0x0830 - u64 frame_stats_offset; ///< 0x0848 - u64 h264_last_surface_luma_offset; ///< 0x0850 - u64 h264_last_surface_chroma_offset; ///< 0x0858 - std::array surface_luma_offset; ///< 0x0860 - std::array surface_chroma_offset; ///< 0x08E8 - INSERT_PADDING_WORDS_NOINIT(68); ///< 0x0970 - u64 vp8_prob_data_offset; ///< 0x0A80 - u64 vp8_header_partition_buf_offset; ///< 0x0A88 - INSERT_PADDING_WORDS_NOINIT(60); ///< 0x0A90 - u64 vp9_entropy_probs_offset; ///< 0x0B80 - u64 vp9_backward_updates_offset; ///< 0x0B88 - u64 vp9_last_frame_segmap_offset; ///< 0x0B90 - u64 vp9_curr_frame_segmap_offset; ///< 0x0B98 - INSERT_PADDING_WORDS_NOINIT(2); ///< 0x0BA0 - u64 vp9_last_frame_mvs_offset; ///< 0x0BA8 - u64 vp9_curr_frame_mvs_offset; ///< 0x0BB0 - INSERT_PADDING_WORDS_NOINIT(2); ///< 0x0BB8 + Offset picture_info_offset; ///< 0x0808 + Offset frame_bitstream_offset; ///< 0x0810 + u64 frame_number; ///< 0x0818 + Offset h264_slice_data_offsets; ///< 0x0820 + Offset h264_mv_dump_offset; ///< 0x0828 + INSERT_PADDING_WORDS_NOINIT(6); ///< 0x0830 + Offset frame_stats_offset; ///< 0x0848 + Offset h264_last_surface_luma_offset; ///< 0x0850 + Offset h264_last_surface_chroma_offset; ///< 0x0858 + std::array surface_luma_offsets; ///< 0x0860 + std::array surface_chroma_offsets; ///< 0x08E8 + Offset pic_scratch_buf_offset; ///< 0x0970 + Offset external_mvbuffer_offset; ///< 0x0978 + INSERT_PADDING_WORDS_NOINIT(32); ///< 0x0980 + Offset h264_mbhist_buffer_offset; ///< 0x0A00 + INSERT_PADDING_WORDS_NOINIT(30); ///< 0x0A08 + Offset vp8_prob_data_offset; ///< 0x0A80 + Offset vp8_header_partition_buf_offset; ///< 0x0A88 + INSERT_PADDING_WORDS_NOINIT(28); ///< 0x0A90 + Offset hvec_scalist_list_offset; ///< 0x0B00 + Offset hvec_tile_sizes_offset; ///< 0x0B08 + Offset hvec_filter_buffer_offset; ///< 0x0B10 + Offset hvec_sao_buffer_offset; ///< 0x0B18 + Offset hvec_slice_info_buffer_offset; ///< 0x0B20 + Offset hvec_slice_group_index_offset; ///< 0x0B28 + INSERT_PADDING_WORDS_NOINIT(20); ///< 0x0B30 + Offset vp9_prob_tab_buffer_offset; ///< 0x0B80 + Offset vp9_ctx_counter_buffer_offset; ///< 0x0B88 + Offset vp9_segment_read_buffer_offset; ///< 0x0B90 + Offset vp9_segment_write_buffer_offset; ///< 0x0B98 + Offset vp9_tile_size_buffer_offset; ///< 0x0BA0 + Offset vp9_col_mvwrite_buffer_offset; ///< 0x0BA8 + Offset vp9_col_mvread_buffer_offset; ///< 0x0BB0 + Offset vp9_filter_buffer_offset; ///< 0x0BB8 }; std::array reg_array; }; @@ -81,16 +103,16 @@ ASSERT_REG_POSITION(h264_slice_data_offsets, 0x104); ASSERT_REG_POSITION(frame_stats_offset, 0x109); ASSERT_REG_POSITION(h264_last_surface_luma_offset, 0x10A); ASSERT_REG_POSITION(h264_last_surface_chroma_offset, 0x10B); -ASSERT_REG_POSITION(surface_luma_offset, 0x10C); -ASSERT_REG_POSITION(surface_chroma_offset, 0x11D); +ASSERT_REG_POSITION(surface_luma_offsets, 0x10C); +ASSERT_REG_POSITION(surface_chroma_offsets, 0x11D); ASSERT_REG_POSITION(vp8_prob_data_offset, 0x150); ASSERT_REG_POSITION(vp8_header_partition_buf_offset, 0x151); -ASSERT_REG_POSITION(vp9_entropy_probs_offset, 0x170); -ASSERT_REG_POSITION(vp9_backward_updates_offset, 0x171); -ASSERT_REG_POSITION(vp9_last_frame_segmap_offset, 0x172); 
-ASSERT_REG_POSITION(vp9_curr_frame_segmap_offset, 0x173); -ASSERT_REG_POSITION(vp9_last_frame_mvs_offset, 0x175); -ASSERT_REG_POSITION(vp9_curr_frame_mvs_offset, 0x176); +ASSERT_REG_POSITION(vp9_prob_tab_buffer_offset, 0x170); +ASSERT_REG_POSITION(vp9_ctx_counter_buffer_offset, 0x171); +ASSERT_REG_POSITION(vp9_segment_read_buffer_offset, 0x172); +ASSERT_REG_POSITION(vp9_segment_write_buffer_offset, 0x173); +ASSERT_REG_POSITION(vp9_col_mvwrite_buffer_offset, 0x175); +ASSERT_REG_POSITION(vp9_col_mvread_buffer_offset, 0x176); #undef ASSERT_REG_POSITION diff --git a/src/video_core/host1x/syncpoint_manager.cpp b/src/video_core/host1x/syncpoint_manager.cpp index eb5dd8d70..1b5dd4ba6 100755 --- a/src/video_core/host1x/syncpoint_manager.cpp +++ b/src/video_core/host1x/syncpoint_manager.cpp @@ -18,7 +18,7 @@ SyncpointManager::ActionHandle SyncpointManager::RegisterAction( return {}; } - std::unique_lock lk(guard); + std::scoped_lock lk(guard); if (syncpoint.load(std::memory_order_relaxed) >= expected_value) { action(); return {}; @@ -35,7 +35,7 @@ SyncpointManager::ActionHandle SyncpointManager::RegisterAction( void SyncpointManager::DeregisterAction(std::list& action_storage, const ActionHandle& handle) { - std::unique_lock lk(guard); + std::scoped_lock lk(guard); // We want to ensure the iterator still exists prior to erasing it // Otherwise, if an invalid iterator was passed in then it could lead to UB @@ -78,7 +78,7 @@ void SyncpointManager::Increment(std::atomic& syncpoint, std::condition_var std::list& action_storage) { auto new_value{syncpoint.fetch_add(1, std::memory_order_acq_rel) + 1}; - std::unique_lock lk(guard); + std::scoped_lock lk(guard); auto it = action_storage.begin(); while (it != action_storage.end()) { if (it->expected_value > new_value) { diff --git a/src/video_core/host1x/vic.cpp b/src/video_core/host1x/vic.cpp index 34b365803..6c0127782 100755 --- a/src/video_core/host1x/vic.cpp +++ b/src/video_core/host1x/vic.cpp @@ -2,6 +2,18 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include +#include +#include + +#if defined(ARCHITECTURE_x86_64) +#if defined(_MSC_VER) +#include +#else +#include +#endif +#elif defined(ARCHITECTURE_arm64) +#include +#endif extern "C" { #if defined(__GNUC__) || defined(__clang__) @@ -14,228 +26,1181 @@ extern "C" { #endif } +#include "common/alignment.h" #include "common/assert.h" #include "common/bit_field.h" #include "common/logging/log.h" +#include "common/polyfill_thread.h" +#include "common/settings.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/guest_memory.h" #include "video_core/host1x/host1x.h" #include "video_core/host1x/nvdec.h" #include "video_core/host1x/vic.h" #include "video_core/memory_manager.h" #include "video_core/textures/decoders.h" -namespace Tegra { - -namespace Host1x { +#if defined(ARCHITECTURE_x86_64) +#include "common/x64/cpu_detect.h" +#elif defined(ARCHITECTURE_arm64) +// Some ARM64 detect +#endif +namespace Tegra::Host1x { namespace { -enum class VideoPixelFormat : u64_le { - RGBA8 = 0x1f, - BGRA8 = 0x20, - RGBX8 = 0x23, - YUV420 = 0x44, -}; -} // Anonymous namespace +static bool HasSSE41() { +#if defined(ARCHITECTURE_x86_64) + const auto& cpu_caps{Common::GetCPUCaps()}; + return cpu_caps.sse4_1; +#else + return false; +#endif +} -union VicConfig { - u64_le raw{}; - BitField<0, 7, VideoPixelFormat> pixel_format; - BitField<7, 2, u64_le> chroma_loc_horiz; - BitField<9, 2, u64_le> chroma_loc_vert; - BitField<11, 4, u64_le> block_linear_kind; - BitField<15, 4, u64_le> block_linear_height_log2; - 
BitField<32, 14, u64_le> surface_width_minus1; - BitField<46, 14, u64_le> surface_height_minus1; -}; +void SwizzleSurface(std::span output, u32 out_stride, std::span input, u32 in_stride, + u32 height) { + /* + * Taken from https://github.com/averne/FFmpeg/blob/nvtegra/libavutil/hwcontext_nvtegra.c#L949 + * Can only handle block height == 1. + */ + const uint32_t x_mask = 0xFFFFFFD2u; + const uint32_t y_mask = 0x2Cu; + uint32_t offs_x{}; + uint32_t offs_y{}; + uint32_t offs_line{}; -Vic::Vic(Host1x& host1x_, std::shared_ptr nvdec_processor_) - : host1x(host1x_), - nvdec_processor(std::move(nvdec_processor_)), converted_frame_buffer{nullptr, av_free} {} + for (u32 y = 0; y < height; y += 2) { + auto dst_line = output.data() + offs_y * 16; + const auto src_line = input.data() + y * (in_stride / 16) * 16; -Vic::~Vic() = default; + offs_line = offs_x; + for (u32 x = 0; x < in_stride; x += 16) { + std::memcpy(&dst_line[offs_line * 16], &src_line[x], 16); + std::memcpy(&dst_line[offs_line * 16 + 16], &src_line[x + in_stride], 16); + offs_line = (offs_line - x_mask) & x_mask; + } -void Vic::ProcessMethod(Method method, u32 argument) { - LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", static_cast(method)); - const u64 arg = static_cast(argument) << 8; - switch (method) { - case Method::Execute: + offs_y = (offs_y - y_mask) & y_mask; + + /* Wrap into next tile row */ + if (!offs_y) { + offs_x += out_stride; + } + } +} + +} // namespace + +Vic::Vic(Host1x& host1x_, s32 id_, u32 syncpt, FrameQueue& frame_queue_) + : CDmaPusher{host1x_, id_}, id{id_}, syncpoint{syncpt}, + frame_queue{frame_queue_}, has_sse41{HasSSE41()} { + LOG_INFO(HW_GPU, "Created vic {}", id); +} + +Vic::~Vic() { + LOG_INFO(HW_GPU, "Destroying vic {}", id); +} + +void Vic::ProcessMethod(u32 method, u32 arg) { + LOG_TRACE(HW_GPU, "Vic method 0x{:X}", static_cast(method)); + regs.reg_array[method] = arg; + + switch (static_cast(method * sizeof(u32))) { + case Method::Execute: { Execute(); - break; - case Method::SetConfigStructOffset: - config_struct_address = arg; - break; - case Method::SetOutputSurfaceLumaOffset: - output_surface_luma_address = arg; - break; - case Method::SetOutputSurfaceChromaOffset: - output_surface_chroma_address = arg; - break; + } break; default: break; } } void Vic::Execute() { - if (output_surface_luma_address == 0) { - LOG_ERROR(Service_NVDRV, "VIC Luma address not set."); - return; - } - const VicConfig config{host1x.GMMU().Read(config_struct_address + 0x20)}; - auto frame = nvdec_processor->GetFrame(); - if (!frame) { - return; - } - const u64 surface_width = config.surface_width_minus1 + 1; - const u64 surface_height = config.surface_height_minus1 + 1; - if (static_cast(frame->GetWidth()) != surface_width || - static_cast(frame->GetHeight()) != surface_height) { - // TODO: Properly support multiple video streams with differing frame dimensions - LOG_WARNING(Service_NVDRV, "Frame dimensions {}x{} don't match surface dimensions {}x{}", - frame->GetWidth(), frame->GetHeight(), surface_width, surface_height); - } - switch (config.pixel_format) { - case VideoPixelFormat::RGBA8: - case VideoPixelFormat::BGRA8: - case VideoPixelFormat::RGBX8: - WriteRGBFrame(std::move(frame), config); - break; - case VideoPixelFormat::YUV420: - WriteYUVFrame(std::move(frame), config); - break; - default: - UNIMPLEMENTED_MSG("Unknown video pixel format {:X}", config.pixel_format.Value()); - break; - } -} + ConfigStruct config{}; + memory_manager.ReadBlock(regs.config_struct_offset.Address(), &config, sizeof(ConfigStruct)); -void 
Vic::WriteRGBFrame(std::unique_ptr frame, const VicConfig& config) { - LOG_TRACE(Service_NVDRV, "Writing RGB Frame"); + auto output_width{config.output_surface_config.out_surface_width + 1}; + auto output_height{config.output_surface_config.out_surface_height + 1}; + output_surface.resize_destructive(output_width * output_height); - const auto frame_width = frame->GetWidth(); - const auto frame_height = frame->GetHeight(); - const auto frame_format = frame->GetPixelFormat(); - - if (!scaler_ctx || frame_width != scaler_width || frame_height != scaler_height) { - const AVPixelFormat target_format = [pixel_format = config.pixel_format]() { - switch (pixel_format) { - case VideoPixelFormat::RGBA8: - return AV_PIX_FMT_RGBA; - case VideoPixelFormat::BGRA8: - return AV_PIX_FMT_BGRA; - case VideoPixelFormat::RGBX8: - return AV_PIX_FMT_RGB0; - default: - return AV_PIX_FMT_RGBA; - } - }(); - - sws_freeContext(scaler_ctx); - // Frames are decoded into either YUV420 or NV12 formats. Convert to desired RGB format - scaler_ctx = sws_getContext(frame_width, frame_height, frame_format, frame_width, - frame_height, target_format, 0, nullptr, nullptr, nullptr); - scaler_width = frame_width; - scaler_height = frame_height; - converted_frame_buffer.reset(); - } - if (!converted_frame_buffer) { - const size_t frame_size = frame_width * frame_height * 4; - converted_frame_buffer = AVMallocPtr{static_cast(av_malloc(frame_size)), av_free}; - } - const std::array converted_stride{frame_width * 4, frame_height * 4, 0, 0}; - u8* const converted_frame_buf_addr{converted_frame_buffer.get()}; - sws_scale(scaler_ctx, frame->GetPlanes(), frame->GetStrides(), 0, frame_height, - &converted_frame_buf_addr, converted_stride.data()); - - // Use the minimum of surface/frame dimensions to avoid buffer overflow. - const u32 surface_width = static_cast(config.surface_width_minus1) + 1; - const u32 surface_height = static_cast(config.surface_height_minus1) + 1; - const u32 width = std::min(surface_width, static_cast(frame_width)); - const u32 height = std::min(surface_height, static_cast(frame_height)); - const u32 blk_kind = static_cast(config.block_linear_kind); - if (blk_kind != 0) { - // swizzle pitch linear to block linear - const u32 block_height = static_cast(config.block_linear_height_log2); - const auto size = Texture::CalculateSize(true, 4, width, height, 1, block_height, 0); - luma_buffer.resize_destructive(size); - std::span frame_buff(converted_frame_buf_addr, 4 * width * height); - Texture::SwizzleSubrect(luma_buffer, frame_buff, 4, width, height, 1, 0, 0, width, height, - block_height, 0, width * 4); - - host1x.GMMU().WriteBlock(output_surface_luma_address, luma_buffer.data(), size); + if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Off) [[unlikely]] { + // Fill the frame with black, as otherwise they can have random data and be very glitchy. 
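The ReadBlock call above copies raw guest bytes straight into config, which only works while ConfigStruct stays trivially copyable. A minimal sketch of that pattern, with a stand-in struct and a hypothetical guest pointer:

#include <cstdint>
#include <cstring>
#include <type_traits>

// ExampleConfig stands in for ConfigStruct; the point is that a block read
// is a plain byte copy, so the destination must be trivially copyable.
struct ExampleConfig {
    std::uint64_t raw[4];
};
static_assert(std::is_trivially_copyable_v<ExampleConfig>);

void ReadGuestConfig(const std::uint8_t* guest_bytes, ExampleConfig& out) {
    std::memcpy(&out, guest_bytes, sizeof(ExampleConfig)); // no constructors involved
}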
+ std::fill(output_surface.begin(), output_surface.end(), Pixel{}); } else { - // send pitch linear frame - const size_t linear_size = width * height * 4; - host1x.GMMU().WriteBlock(output_surface_luma_address, converted_frame_buf_addr, - linear_size); + for (size_t i = 0; i < config.slot_structs.size(); i++) { + auto& slot_config{config.slot_structs[i]}; + if (!slot_config.config.slot_enable) { + continue; + } + + auto luma_offset{regs.surfaces[i][SurfaceIndex::Current].luma.Address()}; + if (nvdec_id == -1) { + nvdec_id = frame_queue.VicFindNvdecFdFromOffset(luma_offset); + } + + auto frame = frame_queue.GetFrame(nvdec_id, luma_offset); + if (!frame.get()) { + LOG_ERROR(HW_GPU, "Vic failed to get frame with offset 0x{:X}", luma_offset); + continue; + } + + switch (frame->GetPixelFormat()) { + case AV_PIX_FMT_YUV420P: + ReadY8__V8U8_N420(slot_config, regs.surfaces[i], std::move(frame)); + break; + case AV_PIX_FMT_NV12: + ReadY8__V8U8_N420(slot_config, regs.surfaces[i], std::move(frame)); + break; + default: + UNIMPLEMENTED_MSG( + "Unimplemented slot pixel format {}", + static_cast(slot_config.surface_config.slot_pixel_format.Value())); + break; + } + + Blend(config, slot_config); + } + } + + switch (config.output_surface_config.out_pixel_format) { + case VideoPixelFormat::A8B8G8R8: + case VideoPixelFormat::X8B8G8R8: + WriteABGR(config.output_surface_config); + break; + case VideoPixelFormat::A8R8G8B8: + WriteABGR(config.output_surface_config); + break; + case VideoPixelFormat::Y8__V8U8_N420: + WriteY8__V8U8_N420(config.output_surface_config); + break; + default: + UNIMPLEMENTED_MSG("Unknown video pixel format {}", + config.output_surface_config.out_pixel_format.Value()); + break; } } -void Vic::WriteYUVFrame(std::unique_ptr frame, const VicConfig& config) { - LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame"); +template +void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, + std::span offsets, + std::shared_ptr frame) { + const auto out_luma_width{slot.surface_config.slot_surface_width + 1}; + auto out_luma_height{slot.surface_config.slot_surface_height + 1}; + const auto out_luma_stride{out_luma_width}; - const std::size_t surface_width = config.surface_width_minus1 + 1; - const std::size_t surface_height = config.surface_height_minus1 + 1; - const std::size_t aligned_width = (surface_width + 0xff) & ~0xffUL; - // Use the minimum of surface/frame dimensions to avoid buffer overflow. 
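The removed YUV path below padded the luma stride to 256 bytes with (surface_width + 0xff) & ~0xff; the same mask trick generalises to any power-of-two alignment:

#include <cstddef>

// Generic power-of-two align-up; alignment must be a power of two.
constexpr std::size_t AlignUp(std::size_t value, std::size_t alignment) {
    return (value + alignment - 1) & ~(alignment - 1);
}

static_assert(AlignUp(1280, 256) == 1280);
static_assert(AlignUp(1281, 256) == 1536);
static_assert(AlignUp(0, 256) == 0);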
- const auto frame_width = std::min(surface_width, static_cast(frame->GetWidth())); - const auto frame_height = std::min(surface_height, static_cast(frame->GetHeight())); - - const auto stride = static_cast(frame->GetStride(0)); - - luma_buffer.resize_destructive(aligned_width * surface_height); - chroma_buffer.resize_destructive(aligned_width * surface_height / 2); - - // Populate luma buffer - const u8* luma_src = frame->GetData(0); - for (std::size_t y = 0; y < frame_height; ++y) { - const std::size_t src = y * stride; - const std::size_t dst = y * aligned_width; - std::memcpy(luma_buffer.data() + dst, luma_src + src, frame_width); + if constexpr (Interlaced) { + out_luma_height *= 2; } - host1x.GMMU().WriteBlock(output_surface_luma_address, luma_buffer.data(), luma_buffer.size()); - // Chroma - const std::size_t half_height = frame_height / 2; - const auto half_stride = static_cast(frame->GetStride(1)); + slot_surface.resize_destructive(out_luma_width * out_luma_height); - switch (frame->GetPixelFormat()) { - case AV_PIX_FMT_YUV420P: { - // Frame from FFmpeg software - // Populate chroma buffer from both channels with interleaving. - const std::size_t half_width = frame_width / 2; - u8* chroma_buffer_data = chroma_buffer.data(); - const u8* chroma_b_src = frame->GetData(1); - const u8* chroma_r_src = frame->GetData(2); - for (std::size_t y = 0; y < half_height; ++y) { - const std::size_t src = y * half_stride; - const std::size_t dst = y * aligned_width; - for (std::size_t x = 0; x < half_width; ++x) { - chroma_buffer_data[dst + x * 2] = chroma_b_src[src + x]; - chroma_buffer_data[dst + x * 2 + 1] = chroma_r_src[src + x]; + const auto in_luma_width{std::min(frame->GetWidth(), static_cast(out_luma_width))}; + const auto in_luma_height{std::min(frame->GetHeight(), static_cast(out_luma_height))}; + const auto in_luma_stride{frame->GetStride(0)}; + + const auto in_chroma_stride{frame->GetStride(1)}; + + const auto* luma_buffer{frame->GetPlane(0)}; + const auto* chroma_u_buffer{frame->GetPlane(1)}; + const auto* chroma_v_buffer{frame->GetPlane(2)}; + + LOG_TRACE(HW_GPU, + "Reading frame" + "\ninput luma {}x{} stride {} chroma {}x{} stride {}\n" + "output luma {}x{} stride {} chroma {}x{} stride {}", + in_luma_width, in_luma_height, in_luma_stride, in_luma_width / 2, in_luma_height / 2, + in_chroma_stride, out_luma_width, out_luma_height, out_luma_stride, out_luma_width, + out_luma_height, out_luma_stride); + + [[maybe_unused]] auto DecodeLinear = [&]() { + const auto alpha{static_cast(slot.config.planar_alpha.Value())}; + + for (s32 y = 0; y < in_luma_height; y++) { + const auto src_luma{y * in_luma_stride}; + const auto src_chroma{(y / 2) * in_chroma_stride}; + const auto dst{y * out_luma_stride}; + for (s32 x = 0; x < in_luma_width; x++) { + slot_surface[dst + x].r = static_cast(luma_buffer[src_luma + x] << 2); + // Chroma samples are duplicated horizontally and vertically. 
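The surrounding loop samples 4:2:0 chroma, where each U/V pair covers a 2x2 block of luma. A scalar illustration of the planar (I420) and semiplanar (NV12) indexing it uses, with hypothetical plane pointers and strides:

#include <cstddef>
#include <cstdint>

struct ChromaPair {
    std::uint8_t u;
    std::uint8_t v;
};

// I420: U and V live in separate half-resolution planes.
ChromaPair SampleI420(const std::uint8_t* u_plane, const std::uint8_t* v_plane,
                      std::size_t chroma_stride, std::size_t x, std::size_t y) {
    const std::size_t idx = (y / 2) * chroma_stride + (x / 2);
    return {u_plane[idx], v_plane[idx]};
}

// NV12: U and V are interleaved in one half-resolution plane.
ChromaPair SampleNV12(const std::uint8_t* uv_plane, std::size_t chroma_stride,
                      std::size_t x, std::size_t y) {
    const std::size_t idx = (y / 2) * chroma_stride + (x & ~std::size_t{1});
    return {uv_plane[idx], uv_plane[idx + 1]};
}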
+ if constexpr (Planar) { + slot_surface[dst + x].g = + static_cast(chroma_u_buffer[src_chroma + x / 2] << 2); + slot_surface[dst + x].b = + static_cast(chroma_v_buffer[src_chroma + x / 2] << 2); + } else { + slot_surface[dst + x].g = + static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); + slot_surface[dst + x].b = + static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); + } + slot_surface[dst + x].a = alpha; } } - break; + }; + +#if defined(ARCHITECTURE_x86_64) + if (!has_sse41) { + DecodeLinear(); + return; } - case AV_PIX_FMT_NV12: { - // Frame from VA-API hardware - // This is already interleaved so just copy - const u8* chroma_src = frame->GetData(1); - for (std::size_t y = 0; y < half_height; ++y) { - const std::size_t src = y * stride; - const std::size_t dst = y * aligned_width; - std::memcpy(chroma_buffer.data() + dst, chroma_src + src, frame_width); + + const auto alpha = + _mm_slli_epi64(_mm_set1_epi64x(static_cast(slot.config.planar_alpha.Value())), 48); + + const auto shuffle_mask = _mm_set_epi8(13, 15, 14, 12, 9, 11, 10, 8, 5, 7, 6, 4, 1, 3, 2, 0); + + for (s32 y = 0; y < in_luma_height; y++) { + const auto src_luma{y * in_luma_stride}; + const auto src_chroma{(y / 2) * in_chroma_stride}; + const auto dst{y * out_luma_stride}; + for (s32 x = 0; x < in_luma_width; x += 16) { + // clang-format off + // Prefetch next iteration's memory + _mm_prefetch((const char*)&luma_buffer[src_luma + x + 16], _MM_HINT_T0); + + // Load 8 bytes * 2 of 8-bit luma samples + // luma0 = 00 00 00 00 00 00 00 00 LL LL LL LL LL LL LL LL + auto luma0 = _mm_loadl_epi64((__m128i*)&luma_buffer[src_luma + x + 0]); + auto luma1 = _mm_loadl_epi64((__m128i*)&luma_buffer[src_luma + x + 8]); + + __m128i chroma; + + if constexpr (Planar) { + _mm_prefetch((const char*)&chroma_u_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0); + _mm_prefetch((const char*)&chroma_v_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0); + + // If Chroma is planar, we have separate U and V planes, load 8 bytes of each + // chroma_u0 = 00 00 00 00 00 00 00 00 UU UU UU UU UU UU UU UU + // chroma_v0 = 00 00 00 00 00 00 00 00 VV VV VV VV VV VV VV VV + auto chroma_u0 = _mm_loadl_epi64((__m128i*)&chroma_u_buffer[src_chroma + x / 2]); + auto chroma_v0 = _mm_loadl_epi64((__m128i*)&chroma_v_buffer[src_chroma + x / 2]); + + // Interleave the 8 bytes of U and V into a single 16 byte reg + // chroma = VV UU VV UU VV UU VV UU VV UU VV UU VV UU VV UU + chroma = _mm_unpacklo_epi8(chroma_u0, chroma_v0); + } else { + _mm_prefetch((const char*)&chroma_u_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0); + + // Chroma is already interleaved in semiplanar format, just load 16 bytes + // chroma = VV UU VV UU VV UU VV UU VV UU VV UU VV UU VV UU + chroma = _mm_load_si128((__m128i*)&chroma_u_buffer[src_chroma + x]); + } + + // Convert the low 8 bytes of 8-bit luma into 16-bit luma + // luma0 = [00] [00] [00] [00] [00] [00] [00] [00] [LL] [LL] [LL] [LL] [LL] [LL] [LL] [LL] + // -> + // luma0 = [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] + luma0 = _mm_cvtepu8_epi16(luma0); + luma1 = _mm_cvtepu8_epi16(luma1); + + // Treat the 8 bytes of 8-bit chroma as 16-bit channels, this allows us to take both the + // U and V together as one element. Using chroma twice here duplicates the values, as we + // take element 0 from chroma, and then element 0 from chroma again, etc. We need to + // duplicate chroma horitonally as chroma is half the width of luma. 
+ // chroma = [VV8 UU8] [VV7 UU7] [VV6 UU6] [VV5 UU5] [VV4 UU4] [VV3 UU3] [VV2 UU2] [VV1 UU1] + // -> + // chroma00 = [VV4 UU4] [VV4 UU4] [VV3 UU3] [VV3 UU3] [VV2 UU2] [VV2 UU2] [VV1 UU1] [VV1 UU1] + // chroma01 = [VV8 UU8] [VV8 UU8] [VV7 UU7] [VV7 UU7] [VV6 UU6] [VV6 UU6] [VV5 UU5] [VV5 UU5] + auto chroma00 = _mm_unpacklo_epi16(chroma, chroma); + auto chroma01 = _mm_unpackhi_epi16(chroma, chroma); + + // Interleave the 16-bit luma and chroma. + // luma0 = [008 LL8] [007 LL7] [006 LL6] [005 LL5] [004 LL4] [003 LL3] [002 LL2] [001 LL1] + // chroma00 = [VV8 UU8] [VV7 UU7] [VV6 UU6] [VV5 UU5] [VV4 UU4] [VV3 UU3] [VV2 UU2] [VV1 UU1] + // -> + // yuv0 = [VV4 UU4 004 LL4] [VV3 UU3 003 LL3] [VV2 UU2 002 LL2] [VV1 UU1 001 LL1] + // yuv1 = [VV8 UU8 008 LL8] [VV7 UU7 007 LL7] [VV6 UU6 006 LL6] [VV5 UU5 005 LL5] + auto yuv0 = _mm_unpacklo_epi16(luma0, chroma00); + auto yuv1 = _mm_unpackhi_epi16(luma0, chroma00); + auto yuv2 = _mm_unpacklo_epi16(luma1, chroma01); + auto yuv3 = _mm_unpackhi_epi16(luma1, chroma01); + + // Shuffle the luma/chroma into the channel ordering we actually want. The high byte of + // the luma which is now a constant 0 after converting 8-bit -> 16-bit is used as the + // alpha. Luma -> R, U -> G, V -> B, 0 -> A + // yuv0 = [VV4 UU4 004 LL4] [VV3 UU3 003 LL3] [VV2 UU2 002 LL2] [VV1 UU1 001 LL1] + // -> + // yuv0 = [AA4 VV4 UU4 LL4] [AA3 VV3 UU3 LL3] [AA2 VV2 UU2 LL2] [AA1 VV1 UU1 LL1] + yuv0 = _mm_shuffle_epi8(yuv0, shuffle_mask); + yuv1 = _mm_shuffle_epi8(yuv1, shuffle_mask); + yuv2 = _mm_shuffle_epi8(yuv2, shuffle_mask); + yuv3 = _mm_shuffle_epi8(yuv3, shuffle_mask); + + // Extend the 8-bit channels we have into 16-bits, as that's the target surface format. + // Since this turns just the low 8 bytes into 16 bytes, the second of + // each operation here right shifts the register by 8 to get the high pixels. + // yuv0 = [AA4] [VV4] [UU4] [LL4] [AA3] [VV3] [UU3] [LL3] [AA2] [VV2] [UU2] [LL2] [AA1] [VV1] [UU1] [LL1] + // -> + // yuv01 = [002 AA2] [002 VV2] [002 UU2] [002 LL2] [001 AA1] [001 VV1] [001 UU1] [001 LL1] + // yuv23 = [004 AA4] [004 VV4] [004 UU4] [004 LL4] [003 AA3] [003 VV3] ]003 UU3] [003 LL3] + auto yuv01 = _mm_cvtepu8_epi16(yuv0); + auto yuv23 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv0, 8)); + auto yuv45 = _mm_cvtepu8_epi16(yuv1); + auto yuv67 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv1, 8)); + auto yuv89 = _mm_cvtepu8_epi16(yuv2); + auto yuv1011 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv2, 8)); + auto yuv1213 = _mm_cvtepu8_epi16(yuv3); + auto yuv1415 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv3, 8)); + + // Left-shift all 16-bit channels by 2, this is to get us into a 10-bit format instead + // of 8, which is the format alpha is in, as well as other blending values. + yuv01 = _mm_slli_epi16(yuv01, 2); + yuv23 = _mm_slli_epi16(yuv23, 2); + yuv45 = _mm_slli_epi16(yuv45, 2); + yuv67 = _mm_slli_epi16(yuv67, 2); + yuv89 = _mm_slli_epi16(yuv89, 2); + yuv1011 = _mm_slli_epi16(yuv1011, 2); + yuv1213 = _mm_slli_epi16(yuv1213, 2); + yuv1415 = _mm_slli_epi16(yuv1415, 2); + + // OR in the planar alpha, this has already been duplicated and shifted into position, + // and just fills in the AA channels with the actual alpha value. + yuv01 = _mm_or_si128(yuv01, alpha); + yuv23 = _mm_or_si128(yuv23, alpha); + yuv45 = _mm_or_si128(yuv45, alpha); + yuv67 = _mm_or_si128(yuv67, alpha); + yuv89 = _mm_or_si128(yuv89, alpha); + yuv1011 = _mm_or_si128(yuv1011, alpha); + yuv1213 = _mm_or_si128(yuv1213, alpha); + yuv1415 = _mm_or_si128(yuv1415, alpha); + + // Store out the pixels. 
One pixel is now 8 bytes, so each store is 2 pixels. + // [AA AA] [VV VV] [UU UU] [LL LL] [AA AA] [VV VV] [UU UU] [LL LL] + _mm_store_si128((__m128i*)&slot_surface[dst + x + 0], yuv01); + _mm_store_si128((__m128i*)&slot_surface[dst + x + 2], yuv23); + _mm_store_si128((__m128i*)&slot_surface[dst + x + 4], yuv45); + _mm_store_si128((__m128i*)&slot_surface[dst + x + 6], yuv67); + _mm_store_si128((__m128i*)&slot_surface[dst + x + 8], yuv89); + _mm_store_si128((__m128i*)&slot_surface[dst + x + 10], yuv1011); + _mm_store_si128((__m128i*)&slot_surface[dst + x + 12], yuv1213); + _mm_store_si128((__m128i*)&slot_surface[dst + x + 14], yuv1415); + + // clang-format on } - break; } - default: - ASSERT(false); - break; - } - host1x.GMMU().WriteBlock(output_surface_chroma_address, chroma_buffer.data(), - chroma_buffer.size()); +#elif defined(ARCHITECTURE_arm64) + DecodeLinear(); +#else + DecodeLinear(); +#endif } -} // namespace Host1x +template +void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span offsets, + std::shared_ptr frame) { + if constexpr (!Planar) { + ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame)); + return; + } + const auto out_luma_width{slot.surface_config.slot_surface_width + 1}; + const auto out_luma_height{(slot.surface_config.slot_surface_height + 1) * 2}; + const auto out_luma_stride{out_luma_width}; -} // namespace Tegra + slot_surface.resize_destructive(out_luma_width * out_luma_height); + + const auto in_luma_width{std::min(frame->GetWidth(), static_cast(out_luma_width))}; + [[maybe_unused]] const auto in_luma_height{ + std::min(frame->GetHeight(), static_cast(out_luma_height))}; + const auto in_luma_stride{frame->GetStride(0)}; + + [[maybe_unused]] const auto in_chroma_width{(frame->GetWidth() + 1) / 2}; + const auto in_chroma_height{(frame->GetHeight() + 1) / 2}; + const auto in_chroma_stride{frame->GetStride(1)}; + + const auto* luma_buffer{frame->GetPlane(0)}; + const auto* chroma_u_buffer{frame->GetPlane(1)}; + const auto* chroma_v_buffer{frame->GetPlane(2)}; + + LOG_TRACE(HW_GPU, + "Reading frame" + "\ninput luma {}x{} stride {} chroma {}x{} stride {}\n" + "output luma {}x{} stride {} chroma {}x{} stride {}", + in_luma_width, in_luma_height, in_luma_stride, in_chroma_width, in_chroma_height, + in_chroma_stride, out_luma_width, out_luma_height, out_luma_stride, + out_luma_width / 2, out_luma_height / 2, out_luma_stride); + + [[maybe_unused]] auto DecodeLinear = [&]() { + auto DecodeBobField = [&]() { + const auto alpha{static_cast(slot.config.planar_alpha.Value())}; + + for (s32 y = static_cast(TopField == false); y < in_chroma_height * 2; y += 2) { + const auto src_luma{y * in_luma_stride}; + const auto src_chroma{(y / 2) * in_chroma_stride}; + const auto dst{y * out_luma_stride}; + for (s32 x = 0; x < in_luma_width; x++) { + slot_surface[dst + x].r = static_cast(luma_buffer[src_luma + x] << 2); + if constexpr (Planar) { + slot_surface[dst + x].g = + static_cast(chroma_u_buffer[src_chroma + x / 2] << 2); + slot_surface[dst + x].b = + static_cast(chroma_v_buffer[src_chroma + x / 2] << 2); + } else { + slot_surface[dst + x].g = + static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); + slot_surface[dst + x].b = + static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); + } + slot_surface[dst + x].a = alpha; + } + + s32 other_line{}; + if constexpr (TopField) { + other_line = (y + 1) * out_luma_stride; + } else { + other_line = (y - 1) * out_luma_stride; + } + std::memcpy(&slot_surface[other_line], &slot_surface[dst], + 
out_luma_width * sizeof(Pixel)); + } + }; + + switch (slot.config.deinterlace_mode) { + case DXVAHD_DEINTERLACE_MODE_PRIVATE::WEAVE: + // Due to the fact that we do not write to memory in nvdec, we cannot use Weave as it + // relies on the previous frame. + DecodeBobField(); + break; + case DXVAHD_DEINTERLACE_MODE_PRIVATE::BOB_FIELD: + DecodeBobField(); + break; + case DXVAHD_DEINTERLACE_MODE_PRIVATE::DISI1: + // Due to the fact that we do not write to memory in nvdec, we cannot use DISI1 as it + // relies on previous/next frames. + DecodeBobField(); + break; + default: + UNIMPLEMENTED_MSG("Deinterlace mode {} not implemented!", + static_cast(slot.config.deinterlace_mode.Value())); + break; + } + }; + + DecodeLinear(); +} + +template +void Vic::ReadY8__V8U8_N420(const SlotStruct& slot, std::span offsets, + std::shared_ptr frame) { + switch (slot.config.frame_format) { + case DXVAHD_FRAME_FORMAT::PROGRESSIVE: + ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame)); + break; + case DXVAHD_FRAME_FORMAT::TOP_FIELD: + ReadInterlacedY8__V8U8_N420(slot, offsets, std::move(frame)); + break; + case DXVAHD_FRAME_FORMAT::BOTTOM_FIELD: + ReadInterlacedY8__V8U8_N420(slot, offsets, std::move(frame)); + break; + default: + LOG_ERROR(HW_GPU, "Unknown deinterlace format {}", + static_cast(slot.config.frame_format.Value())); + break; + } +} + +void Vic::Blend(const ConfigStruct& config, const SlotStruct& slot) { + constexpr auto add_one([](u32 v) -> u32 { return v != 0 ? v + 1 : 0; }); + + auto source_left{add_one(static_cast(slot.config.source_rect_left.Value()))}; + auto source_right{add_one(static_cast(slot.config.source_rect_right.Value()))}; + auto source_top{add_one(static_cast(slot.config.source_rect_top.Value()))}; + auto source_bottom{add_one(static_cast(slot.config.source_rect_bottom.Value()))}; + + const auto dest_left{add_one(static_cast(slot.config.dest_rect_left.Value()))}; + const auto dest_right{add_one(static_cast(slot.config.dest_rect_right.Value()))}; + const auto dest_top{add_one(static_cast(slot.config.dest_rect_top.Value()))}; + const auto dest_bottom{add_one(static_cast(slot.config.dest_rect_bottom.Value()))}; + + auto rect_left{add_one(config.output_config.target_rect_left.Value())}; + auto rect_right{add_one(config.output_config.target_rect_right.Value())}; + auto rect_top{add_one(config.output_config.target_rect_top.Value())}; + auto rect_bottom{add_one(config.output_config.target_rect_bottom.Value())}; + + rect_left = std::max(rect_left, dest_left); + rect_right = std::min(rect_right, dest_right); + rect_top = std::max(rect_top, dest_top); + rect_bottom = std::min(rect_bottom, dest_bottom); + + source_left = std::max(source_left, rect_left); + source_right = std::min(source_right, rect_right); + source_top = std::max(source_top, rect_top); + source_bottom = std::min(source_bottom, rect_bottom); + + if (source_left >= source_right || source_top >= source_bottom) { + return; + } + + const auto out_surface_width{config.output_surface_config.out_surface_width + 1}; + [[maybe_unused]] const auto out_surface_height{config.output_surface_config.out_surface_height + + 1}; + const auto in_surface_width{slot.surface_config.slot_surface_width + 1}; + + source_bottom = std::min(source_bottom, out_surface_height); + source_right = std::min(source_right, out_surface_width); + + // TODO Alpha blending. No games I've seen use more than a single surface or supply an alpha + // below max, so it's ignored for now. 
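The colour-matrix path below treats its intermediate results as fixed point with 8 fractional bits, bringing them back to integer range with a final shift by 8 (the "S12.8" note in the SIMD comments). A tiny illustration of that fixed-point convention, with made-up coefficient values:

#include <cstdint>

// Multiplying by a .8 fixed-point coefficient and shifting right by 8
// recovers an integer result; 256 represents 1.0, 128 represents 0.5.
constexpr std::int32_t FixedMul(std::int32_t value, std::int32_t coeff_dot8) {
    return (value * coeff_dot8) >> 8;
}

static_assert(FixedMul(100, 256) == 100); // x * 1.0
static_assert(FixedMul(100, 128) == 50);  // x * 0.5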
+ + if (!slot.color_matrix.matrix_enable) { + const auto copy_width = std::min(source_right - source_left, rect_right - rect_left); + + for (u32 y = source_top; y < source_bottom; y++) { + const auto dst_line = y * out_surface_width; + const auto src_line = y * in_surface_width; + std::memcpy(&output_surface[dst_line + rect_left], + &slot_surface[src_line + source_left], copy_width * sizeof(Pixel)); + } + } else { + // clang-format off + // Colour conversion is enabled, this is a 3x4 * 4x1 matrix multiplication, resulting in a 3x1 matrix. + // | r0c0 r0c1 r0c2 r0c3 | | R | | R | + // | r1c0 r1c1 r1c2 r1c3 | * | G | = | G | + // | r2c0 r2c1 r2c2 r2c3 | | B | | B | + // | 1 | + // clang-format on + + [[maybe_unused]] auto DecodeLinear = [&]() { + const auto r0c0 = static_cast(slot.color_matrix.matrix_coeff00.Value()); + const auto r0c1 = static_cast(slot.color_matrix.matrix_coeff01.Value()); + const auto r0c2 = static_cast(slot.color_matrix.matrix_coeff02.Value()); + const auto r0c3 = static_cast(slot.color_matrix.matrix_coeff03.Value()); + const auto r1c0 = static_cast(slot.color_matrix.matrix_coeff10.Value()); + const auto r1c1 = static_cast(slot.color_matrix.matrix_coeff11.Value()); + const auto r1c2 = static_cast(slot.color_matrix.matrix_coeff12.Value()); + const auto r1c3 = static_cast(slot.color_matrix.matrix_coeff13.Value()); + const auto r2c0 = static_cast(slot.color_matrix.matrix_coeff20.Value()); + const auto r2c1 = static_cast(slot.color_matrix.matrix_coeff21.Value()); + const auto r2c2 = static_cast(slot.color_matrix.matrix_coeff22.Value()); + const auto r2c3 = static_cast(slot.color_matrix.matrix_coeff23.Value()); + + const auto shift = static_cast(slot.color_matrix.matrix_r_shift.Value()); + const auto clamp_min = static_cast(slot.config.soft_clamp_low.Value()); + const auto clamp_max = static_cast(slot.config.soft_clamp_high.Value()); + + auto MatMul = [&](const Pixel& in_pixel) -> std::tuple { + auto r = static_cast(in_pixel.r); + auto g = static_cast(in_pixel.g); + auto b = static_cast(in_pixel.b); + + r = in_pixel.r * r0c0 + in_pixel.g * r0c1 + in_pixel.b * r0c2; + g = in_pixel.r * r1c0 + in_pixel.g * r1c1 + in_pixel.b * r1c2; + b = in_pixel.r * r2c0 + in_pixel.g * r2c1 + in_pixel.b * r2c2; + + r >>= shift; + g >>= shift; + b >>= shift; + + r += r0c3; + g += r1c3; + b += r2c3; + + r >>= 8; + g >>= 8; + b >>= 8; + + return {r, g, b, static_cast(in_pixel.a)}; + }; + + for (u32 y = source_top; y < source_bottom; y++) { + const auto src{y * in_surface_width + source_left}; + const auto dst{y * out_surface_width + rect_left}; + for (u32 x = source_left; x < source_right; x++) { + auto [r, g, b, a] = MatMul(slot_surface[src + x]); + + r = std::clamp(r, clamp_min, clamp_max); + g = std::clamp(g, clamp_min, clamp_max); + b = std::clamp(b, clamp_min, clamp_max); + a = std::clamp(a, clamp_min, clamp_max); + + output_surface[dst + x] = {static_cast(r), static_cast(g), + static_cast(b), static_cast(a)}; + } + } + }; + +#if defined(ARCHITECTURE_x86_64) + if (!has_sse41) { + DecodeLinear(); + return; + } + + // Fill the columns, e.g + // c0 = [00 00 00 00] [r2c0 r2c0 r2c0 r2c0] [r1c0 r1c0 r1c0 r1c0] [r0c0 r0c0 r0c0 r0c0] + + const auto c0 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff20.Value()), + static_cast(slot.color_matrix.matrix_coeff10.Value()), + static_cast(slot.color_matrix.matrix_coeff00.Value())); + const auto c1 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff21.Value()), + static_cast(slot.color_matrix.matrix_coeff11.Value()), + 
static_cast(slot.color_matrix.matrix_coeff01.Value())); + const auto c2 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff22.Value()), + static_cast(slot.color_matrix.matrix_coeff12.Value()), + static_cast(slot.color_matrix.matrix_coeff02.Value())); + const auto c3 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff23.Value()), + static_cast(slot.color_matrix.matrix_coeff13.Value()), + static_cast(slot.color_matrix.matrix_coeff03.Value())); + + // Set the matrix right-shift as a single element. + const auto shift = + _mm_set_epi32(0, 0, 0, static_cast(slot.color_matrix.matrix_r_shift.Value())); + + // Set every 16-bit value to the soft clamp values for clamping every 16-bit channel. + const auto clamp_min = _mm_set1_epi16(static_cast(slot.config.soft_clamp_low.Value())); + const auto clamp_max = + _mm_set1_epi16(static_cast(slot.config.soft_clamp_high.Value())); + + // clang-format off + + auto MatMul = [](__m128i& p, const __m128i& col0, const __m128i& col1, const __m128i& col2, + const __m128i& col3, const __m128i& trm_shift) -> __m128i { + // Duplicate the 32-bit channels, e.g + // p = [AA AA AA AA] [BB BB BB BB] [GG GG GG GG] [RR RR RR RR] + // -> + // r = [RR4 RR4 RR4 RR4] [RR3 RR3 RR3 RR3] [RR2 RR2 RR2 RR2] [RR1 RR1 RR1 RR1] + auto r = _mm_shuffle_epi32(p, 0x0); + auto g = _mm_shuffle_epi32(p, 0x55); + auto b = _mm_shuffle_epi32(p, 0xAA); + + // Multiply the rows and columns c0 * r, c1 * g, c2 * b, e.g + // r = [RR4 RR4 RR4 RR4] [ RR3 RR3 RR3 RR3] [ RR2 RR2 RR2 RR2] [ RR1 RR1 RR1 RR1] + // * + // c0 = [ 00 00 00 00] [r2c0 r2c0 r2c0 r2c0] [r1c0 r1c0 r1c0 r1c0] [r0c0 r0c0 r0c0 r0c0] + r = _mm_mullo_epi32(r, col0); + g = _mm_mullo_epi32(g, col1); + b = _mm_mullo_epi32(b, col2); + + // Add them all together vertically, such that the 32-bit element + // out[0] = (r[0] * c0[0]) + (g[0] * c1[0]) + (b[0] * c2[0]) + auto out = _mm_add_epi32(_mm_add_epi32(r, g), b); + + // Shift the result by r_shift, as the TRM says + out = _mm_sra_epi32(out, trm_shift); + + // Add the final column. Because the 4x1 matrix has this row as 1, there's no need to + // multiply by it, and as per the TRM this column ignores r_shift, so it's just added + // here after shifting. + out = _mm_add_epi32(out, col3); + + // Shift the result back from S12.8 to integer values + return _mm_srai_epi32(out, 8); + }; + + for (u32 y = source_top; y < source_bottom; y++) { + const auto src{y * in_surface_width + source_left}; + const auto dst{y * out_surface_width + rect_left}; + for (u32 x = source_left; x < source_right; x += 8) { + // clang-format off + // Prefetch the next iteration's memory + _mm_prefetch((const char*)&slot_surface[src + x + 8], _MM_HINT_T0); + + // Load in pixels + // p01 = [AA AA] [BB BB] [GG GG] [RR RR] [AA AA] [BB BB] [GG GG] [RR RR] + auto p01 = _mm_load_si128((__m128i*)&slot_surface[src + x + 0]); + auto p23 = _mm_load_si128((__m128i*)&slot_surface[src + x + 2]); + auto p45 = _mm_load_si128((__m128i*)&slot_surface[src + x + 4]); + auto p67 = _mm_load_si128((__m128i*)&slot_surface[src + x + 6]); + + // Convert the 16-bit channels into 32-bit (unsigned), as the matrix values are + // 32-bit and to avoid overflow. 
+ // p01 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] + // -> + // p01_lo = [001 001 AA1 AA1] [001 001 BB1 BB1] [001 001 GG1 GG1] [001 001 RR1 RR1] + // p01_hi = [002 002 AA2 AA2] [002 002 BB2 BB2] [002 002 GG2 GG2] [002 002 RR2 RR2] + auto p01_lo = _mm_cvtepu16_epi32(p01); + auto p01_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p01, 8)); + auto p23_lo = _mm_cvtepu16_epi32(p23); + auto p23_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p23, 8)); + auto p45_lo = _mm_cvtepu16_epi32(p45); + auto p45_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p45, 8)); + auto p67_lo = _mm_cvtepu16_epi32(p67); + auto p67_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p67, 8)); + + // Matrix multiply the pixel, doing the colour conversion. + auto out0 = MatMul(p01_lo, c0, c1, c2, c3, shift); + auto out1 = MatMul(p01_hi, c0, c1, c2, c3, shift); + auto out2 = MatMul(p23_lo, c0, c1, c2, c3, shift); + auto out3 = MatMul(p23_hi, c0, c1, c2, c3, shift); + auto out4 = MatMul(p45_lo, c0, c1, c2, c3, shift); + auto out5 = MatMul(p45_hi, c0, c1, c2, c3, shift); + auto out6 = MatMul(p67_lo, c0, c1, c2, c3, shift); + auto out7 = MatMul(p67_hi, c0, c1, c2, c3, shift); + + // Pack the 32-bit channel pixels back into 16-bit using unsigned saturation + // out0 = [001 001 AA1 AA1] [001 001 BB1 BB1] [001 001 GG1 GG1] [001 001 RR1 RR1] + // out1 = [002 002 AA2 AA2] [002 002 BB2 BB2] [002 002 GG2 GG2] [002 002 RR2 RR2] + // -> + // done0 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] + auto done0 = _mm_packus_epi32(out0, out1); + auto done1 = _mm_packus_epi32(out2, out3); + auto done2 = _mm_packus_epi32(out4, out5); + auto done3 = _mm_packus_epi32(out6, out7); + + // Blend the original alpha back into the pixel, as the matrix multiply gives us a + // 3-channel output, not 4. + // 0x88 = b10001000, taking RGB from the first argument, A from the second argument. + // done0 = [002 002] [BB2 BB2] [GG2 GG2] [RR2 RR2] [001 001] [BB1 BB1] [GG1 GG1] [RR1 RR1] + // -> + // done0 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] + done0 = _mm_blend_epi16(done0, p01, 0x88); + done1 = _mm_blend_epi16(done1, p23, 0x88); + done2 = _mm_blend_epi16(done2, p45, 0x88); + done3 = _mm_blend_epi16(done3, p67, 0x88); + + // Clamp the 16-bit channels to the soft-clamp min/max. + done0 = _mm_max_epu16(done0, clamp_min); + done1 = _mm_max_epu16(done1, clamp_min); + done2 = _mm_max_epu16(done2, clamp_min); + done3 = _mm_max_epu16(done3, clamp_min); + + done0 = _mm_min_epu16(done0, clamp_max); + done1 = _mm_min_epu16(done1, clamp_max); + done2 = _mm_min_epu16(done2, clamp_max); + done3 = _mm_min_epu16(done3, clamp_max); + + // Store the pixels to the output surface. 
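The MatMul helper above is easier to follow as scalar arithmetic; here is a minimal sketch (not part of the patch, with invented coefficients, and assuming the same 8-fractional-bit fixed point that the 4x1 luma key matrix in vic.h documents as S12.8):

#include <cstdint>
#include <cstdio>

int main() {
    // One output row of the 3x4 matrix, in fixed point with 8 fractional bits (1.0 == 256).
    const int32_t c0 = 256, c1 = 0, c2 = 0, c3 = 0;
    const int32_t r_shift = 0; // matrix_r_shift from the slot's colour matrix

    // A 10-bit input sample, as produced by the read functions (8-bit video shifted left by 2).
    const int32_t in_r = 940, in_g = 512, in_b = 512;

    int32_t out = in_r * c0 + in_g * c1 + in_b * c2; // integer * fixed-point products
    out >>= r_shift;                                 // per-matrix shift, applied before ...
    out += c3;                                       // ... the constant column, which is not shifted
    out >>= 8;                                       // drop the 8 fractional bits

    std::printf("converted channel = %d\n", out); // prints 940 for this identity-like row
}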
+ _mm_store_si128((__m128i*)&output_surface[dst + x + 0], done0); + _mm_store_si128((__m128i*)&output_surface[dst + x + 2], done1); + _mm_store_si128((__m128i*)&output_surface[dst + x + 4], done2); + _mm_store_si128((__m128i*)&output_surface[dst + x + 6], done3); + + } + } + // clang-format on +#elif defined(ARCHITECTURE_arm64) + DecodeLinear(); +#else + DecodeLinear(); +#endif + } +} + +void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { + constexpr u32 BytesPerPixel = 1; + + auto surface_width{output_surface_config.out_surface_width + 1}; + auto surface_height{output_surface_config.out_surface_height + 1}; + const auto surface_stride{surface_width}; + + const auto out_luma_width = output_surface_config.out_luma_width + 1; + const auto out_luma_height = output_surface_config.out_luma_height + 1; + const auto out_luma_stride = Common::AlignUp(out_luma_width * BytesPerPixel, 0x10); + const auto out_luma_size = out_luma_height * out_luma_stride; + + const auto out_chroma_width = output_surface_config.out_chroma_width + 1; + const auto out_chroma_height = output_surface_config.out_chroma_height + 1; + const auto out_chroma_stride = Common::AlignUp(out_chroma_width * BytesPerPixel * 2, 0x10); + const auto out_chroma_size = out_chroma_height * out_chroma_stride; + + surface_width = std::min(surface_width, out_luma_width); + surface_height = std::min(surface_height, out_luma_height); + + [[maybe_unused]] auto DecodeLinear = [&](std::span out_luma, std::span out_chroma) { + for (u32 y = 0; y < surface_height; ++y) { + const auto src_luma = y * surface_stride; + const auto dst_luma = y * out_luma_stride; + const auto src_chroma = y * surface_stride; + const auto dst_chroma = (y / 2) * out_chroma_stride; + for (u32 x = 0; x < surface_width; x += 2) { + out_luma[dst_luma + x + 0] = + static_cast(output_surface[src_luma + x + 0].r >> 2); + out_luma[dst_luma + x + 1] = + static_cast(output_surface[src_luma + x + 1].r >> 2); + out_chroma[dst_chroma + x + 0] = + static_cast(output_surface[src_chroma + x].g >> 2); + out_chroma[dst_chroma + x + 1] = + static_cast(output_surface[src_chroma + x].b >> 2); + } + } + }; + + auto Decode = [&](std::span out_luma, std::span out_chroma) { +#if defined(ARCHITECTURE_x86_64) + if (!has_sse41) { + DecodeLinear(out_luma, out_chroma); + return; + } + + // luma_mask = [00 00] [00 00] [00 00] [FF FF] [00 00] [00 00] [00 00] [FF FF] + const auto luma_mask = _mm_set_epi16(0, 0, 0, -1, 0, 0, 0, -1); + + for (u32 y = 0; y < surface_height; ++y) { + const auto src = y * surface_stride; + const auto dst_luma = y * out_luma_stride; + const auto dst_chroma = (y / 2) * out_chroma_stride; + for (u32 x = 0; x < surface_width; x += 16) { + // clang-format off + // Prefetch the next cache lines, 2 per iteration + _mm_prefetch((const char*)&output_surface[src + x + 16], _MM_HINT_T0); + _mm_prefetch((const char*)&output_surface[src + x + 24], _MM_HINT_T0); + + // Load the 64-bit pixels, 2 per variable. 
+ auto pixel01 = _mm_load_si128((__m128i*)&output_surface[src + x + 0]); + auto pixel23 = _mm_load_si128((__m128i*)&output_surface[src + x + 2]); + auto pixel45 = _mm_load_si128((__m128i*)&output_surface[src + x + 4]); + auto pixel67 = _mm_load_si128((__m128i*)&output_surface[src + x + 6]); + auto pixel89 = _mm_load_si128((__m128i*)&output_surface[src + x + 8]); + auto pixel1011 = _mm_load_si128((__m128i*)&output_surface[src + x + 10]); + auto pixel1213 = _mm_load_si128((__m128i*)&output_surface[src + x + 12]); + auto pixel1415 = _mm_load_si128((__m128i*)&output_surface[src + x + 14]); + + // Split out the luma of each pixel using the luma_mask above. + // pixel01 = [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] [LL1 LL1] + // -> + // l01 = [002 002] [002 002] [002 002] [LL2 LL2] [001 001] [001 001] [001 001] [LL1 LL1] + auto l01 = _mm_and_si128(pixel01, luma_mask); + auto l23 = _mm_and_si128(pixel23, luma_mask); + auto l45 = _mm_and_si128(pixel45, luma_mask); + auto l67 = _mm_and_si128(pixel67, luma_mask); + auto l89 = _mm_and_si128(pixel89, luma_mask); + auto l1011 = _mm_and_si128(pixel1011, luma_mask); + auto l1213 = _mm_and_si128(pixel1213, luma_mask); + auto l1415 = _mm_and_si128(pixel1415, luma_mask); + + // Pack 32-bit elements from 2 registers down into 16-bit elements in 1 register. + // l01 = [002 002 002 002] [002 002 LL2 LL2] [001 001 001 001] [001 001 LL1 LL1] + // l23 = [004 004 004 004] [004 004 LL4 LL4] [003 003 003 003] [003 003 LL3 LL3] + // -> + // l0123 = [004 004] [LL4 LL4] [003 003] [LL3 LL3] [002 002] [LL2 LL2] [001 001] [LL1 LL1] + auto l0123 = _mm_packus_epi32(l01, l23); + auto l4567 = _mm_packus_epi32(l45, l67); + auto l891011 = _mm_packus_epi32(l89, l1011); + auto l12131415 = _mm_packus_epi32(l1213, l1415); + + // Pack 32-bit elements from 2 registers down into 16-bit elements in 1 register. + // l0123 = [004 004 LL4 LL4] [003 003 LL3 LL3] [002 002 LL2 LL2] [001 001 LL1 LL1] + // l4567 = [008 008 LL8 LL8] [007 007 LL7 LL7] [006 006 LL6 LL6] [005 005 LL5 LL5] + // -> + // luma_lo = [LL8 LL8] [LL7 LL7] [LL6 LL6] [LL5 LL5] [LL4 LL4] [LL3 LL3] [LL2 LL2] [LL1 LL1] + auto luma_lo = _mm_packus_epi32(l0123, l4567); + auto luma_hi = _mm_packus_epi32(l891011, l12131415); + + // Right-shift the 16-bit elements by 2, un-doing the left shift by 2 on read + // and bringing the range back to 8-bit. + luma_lo = _mm_srli_epi16(luma_lo, 2); + luma_hi = _mm_srli_epi16(luma_hi, 2); + + // Pack with unsigned saturation the 16-bit values in 2 registers into 8-bit values in 1 register. + // luma_lo = [LL8 LL8] [LL7 LL7] [LL6 LL6] [LL5 LL5] [LL4 LL4] [LL3 LL3] [LL2 LL2] [LL1 LL1] + // luma_hi = [LL16 LL16] [LL15 LL15] [LL14 LL14] [LL13 LL13] [LL12 LL12] [LL11 LL11] [LL10 LL10] [LL9 LL9] + // -> + // luma = [LL16] [LL15] [LL14] [LL13] [LL12] [LL11] [LL10] [LL9] [LL8] [LL7] [LL6] [LL5] [LL4] [LL3] [LL2] [LL1] + auto luma = _mm_packus_epi16(luma_lo, luma_hi); + + // Store the 16 bytes of luma + _mm_store_si128((__m128i*)&out_luma[dst_luma + x], luma); + + if (y % 2 == 0) { + // Chroma, done every other line as it's half the height of luma. + + // Shift the register right by 2 bytes (not bits), to kick out the 16-bit luma. + // We can do this instead of &'ing a mask and then shifting. 
+                    // pixel01 = [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] [LL1 LL1]
+                    // ->
+                    // c01     = [ 00 00] [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1]
+                    auto c01 = _mm_srli_si128(pixel01, 2);
+                    auto c23 = _mm_srli_si128(pixel23, 2);
+                    auto c45 = _mm_srli_si128(pixel45, 2);
+                    auto c67 = _mm_srli_si128(pixel67, 2);
+                    auto c89 = _mm_srli_si128(pixel89, 2);
+                    auto c1011 = _mm_srli_si128(pixel1011, 2);
+                    auto c1213 = _mm_srli_si128(pixel1213, 2);
+                    auto c1415 = _mm_srli_si128(pixel1415, 2);
+
+                    // Interleave the lower 8 bytes as 32-bit elements from 2 registers into 1 register.
+                    // This has the effect of skipping every other chroma value horizontally;
+                    // notice the high pixels UU2/UU4 are skipped.
+                    // This is intended, as N420 chroma width is half the luma width.
+                    // c01   = [ 00 00 AA2 AA2] [VV2 VV2 UU2 UU2] [LL2 LL2 AA1 AA1] [VV1 VV1 UU1 UU1]
+                    // c23   = [ 00 00 AA4 AA4] [VV4 VV4 UU4 UU4] [LL4 LL4 AA3 AA3] [VV3 VV3 UU3 UU3]
+                    // ->
+                    // c0123 = [LL4 LL4 AA3 AA3] [LL2 LL2 AA1 AA1] [VV3 VV3 UU3 UU3] [VV1 VV1 UU1 UU1]
+                    auto c0123 = _mm_unpacklo_epi32(c01, c23);
+                    auto c4567 = _mm_unpacklo_epi32(c45, c67);
+                    auto c891011 = _mm_unpacklo_epi32(c89, c1011);
+                    auto c12131415 = _mm_unpacklo_epi32(c1213, c1415);
+
+                    // Interleave the low 64-bit elements from 2 registers into 1.
+                    // c0123     = [LL4 LL4 AA3 AA3 LL2 LL2 AA1 AA1] [VV3 VV3 UU3 UU3 VV1 VV1 UU1 UU1]
+                    // c4567     = [LL8 LL8 AA7 AA7 LL6 LL6 AA5 AA5] [VV7 VV7 UU7 UU7 VV5 VV5 UU5 UU5]
+                    // ->
+                    // chroma_lo = [VV7 VV7 UU7 UU7 VV5 VV5 UU5 UU5] [VV3 VV3 UU3 UU3 VV1 VV1 UU1 UU1]
+                    auto chroma_lo = _mm_unpacklo_epi64(c0123, c4567);
+                    auto chroma_hi = _mm_unpacklo_epi64(c891011, c12131415);
+
+                    // Right-shift the 16-bit elements by 2, un-doing the left shift by 2 on read
+                    // and bringing the range back to 8-bit.
+                    chroma_lo = _mm_srli_epi16(chroma_lo, 2);
+                    chroma_hi = _mm_srli_epi16(chroma_hi, 2);
+
+                    // Pack with unsigned saturation the 16-bit elements from 2 registers into 8-bit elements in 1 register.
+                    // chroma_lo = [ VV7 VV7] [ UU7 UU7] [ VV5 VV5] [ UU5 UU5] [ VV3 VV3] [ UU3 UU3] [VV1 VV1] [UU1 UU1]
+                    // chroma_hi = [VV15 VV15] [UU15 UU15] [VV13 VV13] [UU13 UU13] [VV11 VV11] [UU11 UU11] [VV9 VV9] [UU9 UU9]
+                    // ->
+                    // chroma    = [VV15] [UU15] [VV13] [UU13] [VV11] [UU11] [VV9] [UU9] [VV7] [UU7] [VV5] [UU5] [VV3] [UU3] [VV1] [UU1]
+                    auto chroma = _mm_packus_epi16(chroma_lo, chroma_hi);
+
+                    // Store the 16 bytes of chroma.
+                    _mm_store_si128((__m128i*)&out_chroma[dst_chroma + x + 0], chroma);
+                }
+
+                // clang-format on
+            }
+        }
+#elif defined(ARCHITECTURE_arm64)
+        DecodeLinear(out_luma, out_chroma);
+#else
+        DecodeLinear(out_luma, out_chroma);
+#endif
+    };
+
+    switch (output_surface_config.out_block_kind) {
+    case BLK_KIND::GENERIC_16Bx2: {
+        const u32 block_height = static_cast(output_surface_config.out_block_height);
+        const auto out_luma_swizzle_size = Texture::CalculateSize(
+            true, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0);
+        const auto out_chroma_swizzle_size = Texture::CalculateSize(
+            true, BytesPerPixel * 2, out_chroma_width, out_chroma_height, 1, block_height, 0);
+
+        LOG_TRACE(
+            HW_GPU,
+            "Writing Y8__V8U8_N420 swizzled frame\n"
+            "\tinput surface {}x{} stride {} size 0x{:X}\n"
+            "\toutput luma {}x{} stride {} size 0x{:X} block height {} swizzled size 0x{:X}\n"
+            "\toutput chroma {}x{} stride {} size 0x{:X} block height {} swizzled size 0x{:X}",
+            surface_width, surface_height, surface_stride * BytesPerPixel,
+            surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height,
+            out_luma_stride, out_luma_size, block_height, out_luma_swizzle_size, out_chroma_width,
+            out_chroma_height, out_chroma_stride, out_chroma_size, block_height,
+            out_chroma_swizzle_size);
+
+        luma_scratch.resize_destructive(out_luma_size);
+        chroma_scratch.resize_destructive(out_chroma_size);
+
+        Decode(luma_scratch, chroma_scratch);
+
+        Tegra::Memory::GpuGuestMemoryScoped out_luma(
+            memory_manager, regs.output_surface.luma.Address(), out_luma_swizzle_size,
+            &swizzle_scratch);
+
+        if (block_height == 1) {
+            SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride,
+                           out_luma_height);
+        } else {
+            Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width,
+                                    out_luma_height, 1, block_height, 0, 1);
+        }
+
+        Tegra::Memory::GpuGuestMemoryScoped
+            out_chroma(memory_manager, regs.output_surface.chroma_u.Address(),
+                       out_chroma_swizzle_size, &swizzle_scratch);
+
+        if (block_height == 1) {
+            SwizzleSurface(out_chroma, out_chroma_stride, chroma_scratch, out_chroma_stride,
+                           out_chroma_height);
+        } else {
+            Texture::SwizzleTexture(out_chroma, chroma_scratch, BytesPerPixel, out_chroma_width,
+                                    out_chroma_height, 1, block_height, 0, 1);
+        }
+    } break;
+    case BLK_KIND::PITCH: {
+        LOG_TRACE(
+            HW_GPU,
+            "Writing Y8__V8U8_N420 pitch frame\n"
+            "\tinput surface {}x{} stride {} size 0x{:X}\n"
+            "\toutput luma {}x{} stride {} size 0x{:X}\n"
+            "\toutput chroma {}x{} stride {} size 0x{:X}",
+            surface_width, surface_height, surface_stride * BytesPerPixel,
+            surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height,
+            out_luma_stride, out_luma_size, out_chroma_width, out_chroma_height, out_chroma_stride,
+            out_chroma_size);
+
+        // Unfortunately, due to a driver or game bug, the chroma address may not be spaced
+        // appropriately from the luma, so a luma plane of size out_stride * height runs into the
+        // top of the chroma buffer. That rules out an optimisation where we could create guest
+        // spans and decode directly into game memory, avoiding the copy from scratch buffers.
+        // Because of this bug we must write the luma first and the chroma afterwards, so the
+        // chroma overwrites the excess luma.
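As a concrete picture of the overlap described above (not part of the patch; the addresses and sizes are invented), the ordering only matters when the guest places the chroma plane inside the luma allocation:

#include <cstdint>
#include <cstdio>

int main() {
    // A pitch-linear 1280x720 Y8 luma plane; 1280 is already a multiple of 16.
    const uint64_t luma_addr = 0x0;
    const uint64_t luma_size = 1280ULL * 720;       // out_luma_stride * out_luma_height
    const uint64_t chroma_addr = luma_size - 0x800; // placed too close by the guest

    // Whichever plane is written last wins in the overlapping region, so the luma goes out
    // first and the chroma second, which is the order the WriteBlock calls below use.
    const bool overlaps = chroma_addr < luma_addr + luma_size;
    std::printf("planes %s\n", overlaps ? "overlap: write luma, then chroma"
                                        : "are disjoint: either order works");
}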
+        luma_scratch.resize_destructive(out_luma_size);
+        chroma_scratch.resize_destructive(out_chroma_size);
+
+        Decode(luma_scratch, chroma_scratch);
+
+        memory_manager.WriteBlock(regs.output_surface.luma.Address(), luma_scratch.data(),
+                                  out_luma_size);
+        memory_manager.WriteBlock(regs.output_surface.chroma_u.Address(), chroma_scratch.data(),
+                                  out_chroma_size);
+    } break;
+    default:
+        UNREACHABLE();
+        break;
+    }
+}
+
+template
+void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) {
+    constexpr u32 BytesPerPixel = 4;
+
+    auto surface_width{output_surface_config.out_surface_width + 1};
+    auto surface_height{output_surface_config.out_surface_height + 1};
+    const auto surface_stride{surface_width};
+
+    const auto out_luma_width = output_surface_config.out_luma_width + 1;
+    const auto out_luma_height = output_surface_config.out_luma_height + 1;
+    const auto out_luma_stride = Common::AlignUp(out_luma_width * BytesPerPixel, 0x10);
+    const auto out_luma_size = out_luma_height * out_luma_stride;
+
+    surface_width = std::min(surface_width, out_luma_width);
+    surface_height = std::min(surface_height, out_luma_height);
+
+    [[maybe_unused]] auto DecodeLinear = [&](std::span out_buffer) {
+        for (u32 y = 0; y < surface_height; y++) {
+            const auto src = y * surface_stride;
+            const auto dst = y * out_luma_stride;
+            for (u32 x = 0; x < surface_width; x++) {
+                if constexpr (Format == VideoPixelFormat::A8R8G8B8) {
+                    out_buffer[dst + x * 4 + 0] = static_cast(output_surface[src + x].b >> 2);
+                    out_buffer[dst + x * 4 + 1] = static_cast(output_surface[src + x].g >> 2);
+                    out_buffer[dst + x * 4 + 2] = static_cast(output_surface[src + x].r >> 2);
+                    out_buffer[dst + x * 4 + 3] = static_cast(output_surface[src + x].a >> 2);
+                } else {
+                    out_buffer[dst + x * 4 + 0] = static_cast(output_surface[src + x].r >> 2);
+                    out_buffer[dst + x * 4 + 1] = static_cast(output_surface[src + x].g >> 2);
+                    out_buffer[dst + x * 4 + 2] = static_cast(output_surface[src + x].b >> 2);
+                    out_buffer[dst + x * 4 + 3] = static_cast(output_surface[src + x].a >> 2);
+                }
+            }
+        }
+    };
+
+    auto Decode = [&](std::span out_buffer) {
+#if defined(ARCHITECTURE_x86_64)
+        if (!has_sse41) {
+            DecodeLinear(out_buffer);
+            return;
+        }
+
+        for (u32 y = 0; y < surface_height; y++) {
+            const auto src = y * surface_stride;
+            const auto dst = y * out_luma_stride;
+            for (u32 x = 0; x < surface_width; x += 16) {
+                // clang-format off
+                // Prefetch the next 2 cache lines
+                _mm_prefetch((const char*)&output_surface[src + x + 16], _MM_HINT_T0);
+                _mm_prefetch((const char*)&output_surface[src + x + 24], _MM_HINT_T0);
+
+                // Load the pixels, 16-bit channels, 8 bytes per pixel, e.g.
+                // pixel01 = [AA AA BB BB GG GG RR RR AA AA BB BB GG GG RR RR]
+                auto pixel01 = _mm_load_si128((__m128i*)&output_surface[src + x + 0]);
+                auto pixel23 = _mm_load_si128((__m128i*)&output_surface[src + x + 2]);
+                auto pixel45 = _mm_load_si128((__m128i*)&output_surface[src + x + 4]);
+                auto pixel67 = _mm_load_si128((__m128i*)&output_surface[src + x + 6]);
+                auto pixel89 = _mm_load_si128((__m128i*)&output_surface[src + x + 8]);
+                auto pixel1011 = _mm_load_si128((__m128i*)&output_surface[src + x + 10]);
+                auto pixel1213 = _mm_load_si128((__m128i*)&output_surface[src + x + 12]);
+                auto pixel1415 = _mm_load_si128((__m128i*)&output_surface[src + x + 14]);
+
+                // Right-shift the channels by 2 to un-do the left shift on read and bring the range
+                // back to 8-bit.
+ pixel01 = _mm_srli_epi16(pixel01, 2); + pixel23 = _mm_srli_epi16(pixel23, 2); + pixel45 = _mm_srli_epi16(pixel45, 2); + pixel67 = _mm_srli_epi16(pixel67, 2); + pixel89 = _mm_srli_epi16(pixel89, 2); + pixel1011 = _mm_srli_epi16(pixel1011, 2); + pixel1213 = _mm_srli_epi16(pixel1213, 2); + pixel1415 = _mm_srli_epi16(pixel1415, 2); + + // Pack with unsigned saturation 16-bit channels from 2 registers into 8-bit channels in 1 register. + // pixel01 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] + // pixel23 = [AA4 AA4] [BB4 BB4] [GG4 GG4] [RR4 RR4] [AA3 AA3] [BB3 BB3] [GG3 GG3] [RR3 RR3] + // -> + // pixels0_lo = [AA4] [BB4] [GG4] [RR4] [AA3] [BB3] [GG3] [RR3] [AA2] [BB2] [GG2] [RR2] [AA1] [BB1] [GG1] [RR1] + auto pixels0_lo = _mm_packus_epi16(pixel01, pixel23); + auto pixels0_hi = _mm_packus_epi16(pixel45, pixel67); + auto pixels1_lo = _mm_packus_epi16(pixel89, pixel1011); + auto pixels1_hi = _mm_packus_epi16(pixel1213, pixel1415); + + if constexpr (Format == VideoPixelFormat::A8R8G8B8) { + const auto shuffle = + _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2); + + // Our pixels are ABGR (big-endian) by default, if ARGB is needed, we need to shuffle. + // pixels0_lo = [AA4 BB4 GG4 RR4] [AA3 BB3 GG3 RR3] [AA2 BB2 GG2 RR2] [AA1 BB1 GG1 RR1] + // -> + // pixels0_lo = [AA4 RR4 GG4 BB4] [AA3 RR3 GG3 BB3] [AA2 RR2 GG2 BB2] [AA1 RR1 GG1 BB1] + pixels0_lo = _mm_shuffle_epi8(pixels0_lo, shuffle); + pixels0_hi = _mm_shuffle_epi8(pixels0_hi, shuffle); + pixels1_lo = _mm_shuffle_epi8(pixels1_lo, shuffle); + pixels1_hi = _mm_shuffle_epi8(pixels1_hi, shuffle); + } + + // Store the pixels + _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 0], pixels0_lo); + _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 16], pixels0_hi); + _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 32], pixels1_lo); + _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 48], pixels1_hi); + + // clang-format on + } + } +#elif defined(ARCHITECTURE_arm64) + DecodeLinear(out_buffer); +#else + DecodeLinear(out_buffer); +#endif + }; + + switch (output_surface_config.out_block_kind) { + case BLK_KIND::GENERIC_16Bx2: { + const u32 block_height = static_cast(output_surface_config.out_block_height); + const auto out_swizzle_size = Texture::CalculateSize(true, BytesPerPixel, out_luma_width, + out_luma_height, 1, block_height, 0); + + LOG_TRACE( + HW_GPU, + "Writing ABGR swizzled frame\n" + "\tinput surface {}x{} stride {} size 0x{:X}\n" + "\toutput surface {}x{} stride {} size 0x{:X} block height {} swizzled size 0x{:X}", + surface_width, surface_height, surface_stride * BytesPerPixel, + surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height, + out_luma_stride, out_luma_size, block_height, out_swizzle_size); + + luma_scratch.resize_destructive(out_luma_size); + + Decode(luma_scratch); + + Tegra::Memory::GpuGuestMemoryScoped out_luma( + memory_manager, regs.output_surface.luma.Address(), out_swizzle_size, &swizzle_scratch); + + if (block_height == 1) { + SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride, + out_luma_height); + } else { + Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width, + out_luma_height, 1, block_height, 0, 1); + } + + } break; + case BLK_KIND::PITCH: { + LOG_TRACE(HW_GPU, + "Writing ABGR pitch frame\n" + "\tinput surface {}x{} stride {} size 0x{:X}" + "\toutput surface {}x{} stride {} size 0x{:X}", + surface_width, surface_height, surface_stride, + surface_stride * surface_height * 
BytesPerPixel, out_luma_width, out_luma_height, + out_luma_stride, out_luma_size); + + luma_scratch.resize_destructive(out_luma_size); + + Tegra::Memory::GpuGuestMemoryScoped out_luma( + memory_manager, regs.output_surface.luma.Address(), out_luma_size, &luma_scratch); + + Decode(out_luma); + } break; + default: + UNREACHABLE(); + break; + } +} + +} // namespace Tegra::Host1x diff --git a/src/video_core/host1x/vic.h b/src/video_core/host1x/vic.h index 2de77e71e..3245b261c 100755 --- a/src/video_core/host1x/vic.h +++ b/src/video_core/host1x/vic.h @@ -3,65 +3,646 @@ #pragma once +#include +#include #include +#include +#include #include "common/common_types.h" #include "common/scratch_buffer.h" +#include "video_core/cdma_pusher.h" -struct SwsContext; - -namespace Tegra { - -namespace Host1x { - +namespace Tegra::Host1x { class Host1x; class Nvdec; -union VicConfig; -class Vic { +struct Pixel { + u16 r; + u16 g; + u16 b; + u16 a; +}; + +// One underscore represents separate pixels. +// Double underscore represents separate planes. +// _N represents chroma subsampling, not a separate pixel. +enum class VideoPixelFormat : u32 { + A8 = 0, + L8 = 1, + A4L4 = 2, + L4A4 = 3, + R8 = 4, + A8L8 = 5, + L8A8 = 6, + R8G8 = 7, + G8R8 = 8, + B5G6R5 = 9, + R5G6B5 = 10, + B6G5R5 = 11, + R5G5B6 = 12, + A1B5G5R5 = 13, + A1R5G5B5 = 14, + B5G5R5A1 = 15, + R5G5B5A1 = 16, + A5B5G5R1 = 17, + A5R1G5B5 = 18, + B5G5R1A5 = 19, + R1G5B5A5 = 20, + X1B5G5R5 = 21, + X1R5G5B5 = 22, + B5G5R5X1 = 23, + R5G5B5X1 = 24, + A4B4G5R4 = 25, + A4R4G4B4 = 26, + B4G4R4A4 = 27, + R4G4B4A4 = 28, + B8G8R8 = 29, + R8G8B8 = 30, + A8B8G8R8 = 31, + A8R8G8B8 = 32, + B8G8R8A8 = 33, + R8G8B8A8 = 34, + X8B8G8R8 = 35, + X8R8G8B8 = 36, + B8G8R8X8 = 37, + R8G8B8X8 = 38, + A8B10G10R10 = 39, + A2R10G10B10 = 40, + B10G10R10A2 = 41, + R10G10B10A2 = 42, + A4P4 = 43, + P4A4 = 44, + P8A8 = 45, + A8P8 = 46, + P8 = 47, + P1 = 48, + U8V8 = 49, + V8U8 = 50, + A8Y8U8V8 = 51, + V8U8Y8A8 = 52, + Y8U8V8 = 53, + Y8V8U8 = 54, + U8V8Y8 = 55, + V8U8Y8 = 56, + Y8U8_Y8V8 = 57, + Y8V8_Y8U8 = 58, + U8Y8_V8Y8 = 59, + V8Y8_U8Y8 = 60, + Y8__U8V8_N444 = 61, + Y8__V8U8_N444 = 62, + Y8__U8V8_N422 = 63, + Y8__V8U8_N422 = 64, + Y8__U8V8_N422R = 65, + Y8__V8U8_N422R = 66, + Y8__U8V8_N420 = 67, + Y8__V8U8_N420 = 68, + Y8__U8__V8_N444 = 69, + Y8__U8__V8_N422 = 70, + Y8__U8__V8_N422R = 71, + Y8__U8__V8_N420 = 72, + U8 = 73, + V8 = 74, +}; + +struct Offset { + constexpr u32 Address() const noexcept { + return offset << 8; + } + +private: + u32 offset; +}; +static_assert(std::is_trivial_v, "Offset must be trivial"); +static_assert(sizeof(Offset) == 0x4, "Offset has the wrong size!"); + +struct PlaneOffsets { + Offset luma; + Offset chroma_u; + Offset chroma_v; +}; +static_assert(sizeof(PlaneOffsets) == 0xC, "PlaneOffsets has the wrong size!"); + +enum SurfaceIndex : u32 { + Current = 0, + Previous = 1, + Next = 2, + NextNoiseReduced = 3, + CurrentMotion = 4, + PreviousMotion = 5, + PreviousPreviousMotion = 6, + CombinedMotion = 7, +}; + +enum class DXVAHD_ALPHA_FILL_MODE : u32 { + OPAQUE = 0, + BACKGROUND = 1, + DESTINATION = 2, + SOURCE_STREAM = 3, + COMPOSITED = 4, + SOURCE_ALPHA = 5, +}; + +enum class DXVAHD_FRAME_FORMAT : u64 { + PROGRESSIVE = 0, + INTERLACED_TOP_FIELD_FIRST = 1, + INTERLACED_BOTTOM_FIELD_FIRST = 2, + TOP_FIELD = 3, + BOTTOM_FIELD = 4, + SUBPIC_PROGRESSIVE = 5, + SUBPIC_INTERLACED_TOP_FIELD_FIRST = 6, + SUBPIC_INTERLACED_BOTTOM_FIELD_FIRST = 7, + SUBPIC_TOP_FIELD = 8, + SUBPIC_BOTTOM_FIELD = 9, + TOP_FIELD_CHROMA_BOTTOM = 10, + BOTTOM_FIELD_CHROMA_TOP = 11, + 
SUBPIC_TOP_FIELD_CHROMA_BOTTOM = 12, + SUBPIC_BOTTOM_FIELD_CHROMA_TOP = 13, +}; + +enum class DXVAHD_DEINTERLACE_MODE_PRIVATE : u64 { + WEAVE = 0, + BOB_FIELD = 1, + BOB = 2, + NEWBOB = 3, + DISI1 = 4, + WEAVE_LUMA_BOB_FIELD_CHROMA = 5, + MAX = 0xF, +}; + +enum class BLK_KIND { + PITCH = 0, + GENERIC_16Bx2 = 1, + // These are unsupported in the vic + BL_NAIVE = 2, + BL_KEPLER_XBAR_RAW = 3, + VP2_TILED = 15, +}; + +enum class BLEND_SRCFACTC : u32 { + K1 = 0, + K1_TIMES_DST = 1, + NEG_K1_TIMES_DST = 2, + K1_TIMES_SRC = 3, + ZERO = 4, +}; + +enum class BLEND_DSTFACTC : u32 { + K1 = 0, + K2 = 1, + K1_TIMES_DST = 2, + NEG_K1_TIMES_DST = 3, + NEG_K1_TIMES_SRC = 4, + ZERO = 5, + ONE = 6, +}; + +enum class BLEND_SRCFACTA : u32 { + K1 = 0, + K2 = 1, + NEG_K1_TIMES_DST = 2, + ZERO = 3, + MAX = 7, +}; + +enum class BLEND_DSTFACTA : u32 { + K2 = 0, + NEG_K1_TIMES_SRC = 1, + ZERO = 2, + ONE = 3, + MAX = 7, +}; + +struct PipeConfig { + union { + BitField<0, 11, u32> downsample_horiz; + BitField<11, 5, u32> reserved0; + BitField<16, 11, u32> downsample_vert; + BitField<27, 5, u32> reserved1; + }; + u32 reserved2; + u32 reserved3; + u32 reserved4; +}; +static_assert(sizeof(PipeConfig) == 0x10, "PipeConfig has the wrong size!"); + +struct OutputConfig { + union { + BitField<0, 3, DXVAHD_ALPHA_FILL_MODE> alpha_fill_mode; + BitField<3, 3, u64> alpha_fill_slot; + BitField<6, 10, u64> background_a; + BitField<16, 10, u64> background_r; + BitField<26, 10, u64> background_g; + BitField<36, 10, u64> background_b; + BitField<46, 2, u64> regamma_mode; + BitField<48, 1, u64> output_flip_x; + BitField<49, 1, u64> output_flip_y; + BitField<50, 1, u64> output_transpose; + BitField<51, 1, u64> reserved1; + BitField<52, 12, u64> reserved2; + }; + union { + BitField<0, 14, u32> target_rect_left; + BitField<14, 2, u32> reserved3; + BitField<16, 14, u32> target_rect_right; + BitField<30, 2, u32> reserved4; + }; + union { + BitField<0, 14, u32> target_rect_top; + BitField<14, 2, u32> reserved5; + BitField<16, 14, u32> target_rect_bottom; + BitField<30, 2, u32> reserved6; + }; +}; +static_assert(sizeof(OutputConfig) == 0x10, "OutputConfig has the wrong size!"); + +struct OutputSurfaceConfig { + union { + BitField<0, 7, VideoPixelFormat> out_pixel_format; + BitField<7, 2, u32> out_chroma_loc_horiz; + BitField<9, 2, u32> out_chroma_loc_vert; + BitField<11, 4, BLK_KIND> out_block_kind; + BitField<15, 4, u32> out_block_height; // in gobs, log2 + BitField<19, 3, u32> reserved0; + BitField<22, 10, u32> reserved1; + }; + union { + BitField<0, 14, u32> out_surface_width; // - 1 + BitField<14, 14, u32> out_surface_height; // - 1 + BitField<28, 4, u32> reserved2; + }; + union { + BitField<0, 14, u32> out_luma_width; // - 1 + BitField<14, 14, u32> out_luma_height; // - 1 + BitField<28, 4, u32> reserved3; + }; + union { + BitField<0, 14, u32> out_chroma_width; // - 1 + BitField<14, 14, u32> out_chroma_height; // - 1 + BitField<28, 4, u32> reserved4; + }; +}; +static_assert(sizeof(OutputSurfaceConfig) == 0x10, "OutputSurfaceConfig has the wrong size!"); + +struct MatrixStruct { + union { + BitField<0, 20, s64> matrix_coeff00; // (0,0) of 4x3 conversion matrix + BitField<20, 20, s64> matrix_coeff10; // (1,0) of 4x3 conversion matrix + BitField<40, 20, s64> matrix_coeff20; // (2,0) of 4x3 conversion matrix + BitField<60, 4, u64> matrix_r_shift; + }; + union { + BitField<0, 20, s64> matrix_coeff01; // (0,1) of 4x3 conversion matrix + BitField<20, 20, s64> matrix_coeff11; // (1,1) of 4x3 conversion matrix + BitField<40, 20, s64> 
matrix_coeff21; // (2,1) of 4x3 conversion matrix + BitField<60, 3, u64> reserved0; + BitField<63, 1, u64> matrix_enable; + }; + union { + BitField<0, 20, s64> matrix_coeff02; // (0,2) of 4x3 conversion matrix + BitField<20, 20, s64> matrix_coeff12; // (1,2) of 4x3 conversion matrix + BitField<40, 20, s64> matrix_coeff22; // (2,2) of 4x3 conversion matrix + BitField<60, 4, u64> reserved1; + }; + union { + BitField<0, 20, s64> matrix_coeff03; // (0,3) of 4x3 conversion matrix + BitField<20, 20, s64> matrix_coeff13; // (1,3) of 4x3 conversion matrix + BitField<40, 20, s64> matrix_coeff23; // (2,3) of 4x3 conversion matrix + BitField<60, 4, u64> reserved2; + }; +}; +static_assert(sizeof(MatrixStruct) == 0x20, "MatrixStruct has the wrong size!"); + +struct ClearRectStruct { + union { + BitField<0, 14, u32> clear_rect0_left; + BitField<14, 2, u32> reserved0; + BitField<16, 14, u32> clear_rect0_right; + BitField<30, 2, u32> reserved1; + }; + union { + BitField<0, 14, u32> clear_rect0_top; + BitField<14, 2, u32> reserved2; + BitField<16, 14, u32> clear_rect0_bottom; + BitField<30, 2, u32> reserved3; + }; + union { + BitField<0, 14, u32> clear_rect1_left; + BitField<14, 2, u32> reserved4; + BitField<16, 14, u32> clear_rect1_right; + BitField<30, 2, u32> reserved5; + }; + union { + BitField<0, 14, u32> clear_rect1_top; + BitField<14, 2, u32> reserved6; + BitField<16, 14, u32> clear_rect1_bottom; + BitField<30, 2, u32> reserved7; + }; +}; +static_assert(sizeof(ClearRectStruct) == 0x10, "ClearRectStruct has the wrong size!"); + +struct SlotConfig { + union { + BitField<0, 1, u64> slot_enable; + BitField<1, 1, u64> denoise; + BitField<2, 1, u64> advanced_denoise; + BitField<3, 1, u64> cadence_detect; + BitField<4, 1, u64> motion_map; + BitField<5, 1, u64> motion_map_capture; + BitField<6, 1, u64> is_even; + BitField<7, 1, u64> chroma_even; + // fetch control struct + BitField<8, 1, u64> current_field_enable; + BitField<9, 1, u64> prev_field_enable; + BitField<10, 1, u64> next_field_enable; + BitField<11, 1, u64> next_nr_field_enable; // noise reduction + BitField<12, 1, u64> current_motion_field_enable; + BitField<13, 1, u64> prev_motion_field_enable; + BitField<14, 1, u64> prev_prev_motion_field_enable; + BitField<15, 1, u64> combined_motion_field_enable; + + BitField<16, 4, DXVAHD_FRAME_FORMAT> frame_format; + BitField<20, 2, u64> filter_length_y; // 0: 1-tap, 1: 2-tap, 2: 5-tap, 3: 10-tap + BitField<22, 2, u64> filter_length_x; + BitField<24, 12, u64> panoramic; + BitField<36, 22, u64> reserved1; + BitField<58, 6, u64> detail_filter_clamp; + }; + union { + BitField<0, 10, u64> filter_noise; + BitField<10, 10, u64> filter_detail; + BitField<20, 10, u64> chroma_noise; + BitField<30, 10, u64> chroma_detail; + BitField<40, 4, DXVAHD_DEINTERLACE_MODE_PRIVATE> deinterlace_mode; + BitField<44, 3, u64> motion_accumulation_weight; + BitField<47, 11, u64> noise_iir; + BitField<58, 4, u64> light_level; + BitField<62, 2, u64> reserved4; + }; + union { + BitField<0, 10, u64> soft_clamp_low; + BitField<10, 10, u64> soft_clamp_high; + BitField<20, 3, u64> reserved5; + BitField<23, 9, u64> reserved6; + BitField<32, 10, u64> planar_alpha; + BitField<42, 1, u64> constant_alpha; + BitField<43, 3, u64> stereo_interleave; + BitField<46, 1, u64> clip_enabled; + BitField<47, 8, u64> clear_rect_mask; + BitField<55, 2, u64> degamma_mode; + BitField<57, 1, u64> reserved7; + BitField<58, 1, u64> decompress_enable; + BitField<59, 5, u64> reserved9; + }; + union { + BitField<0, 8, u64> decompress_ctb_count; + BitField<8, 32, 
u64> decompress_zbc_count; + BitField<40, 24, u64> reserved12; + }; + union { + BitField<0, 30, u64> source_rect_left; + BitField<30, 2, u64> reserved14; + BitField<32, 30, u64> source_rect_right; + BitField<62, 2, u64> reserved15; + }; + union { + BitField<0, 30, u64> source_rect_top; + BitField<30, 2, u64> reserved16; + BitField<32, 30, u64> source_rect_bottom; + BitField<62, 2, u64> reserved17; + }; + union { + BitField<0, 14, u64> dest_rect_left; + BitField<14, 2, u64> reserved18; + BitField<16, 14, u64> dest_rect_right; + BitField<30, 2, u64> reserved19; + BitField<32, 14, u64> dest_rect_top; + BitField<46, 2, u64> reserved20; + BitField<48, 14, u64> dest_rect_bottom; + BitField<62, 2, u64> reserved21; + }; + u32 reserved22; + u32 reserved23; +}; +static_assert(sizeof(SlotConfig) == 0x40, "SlotConfig has the wrong size!"); + +struct SlotSurfaceConfig { + union { + BitField<0, 7, VideoPixelFormat> slot_pixel_format; + BitField<7, 2, u32> slot_chroma_loc_horiz; + BitField<9, 2, u32> slot_chroma_loc_vert; + BitField<11, 4, u32> slot_block_kind; + BitField<15, 4, u32> slot_block_height; + BitField<19, 3, u32> slot_cache_width; + BitField<22, 10, u32> reserved0; + }; + union { + BitField<0, 14, u32> slot_surface_width; // - 1 + BitField<14, 14, u32> slot_surface_height; // - 1 + BitField<28, 4, u32> reserved1; + }; + union { + BitField<0, 14, u32> slot_luma_width; // padded, - 1 + BitField<14, 14, u32> slot_luma_height; // padded, - 1 + BitField<28, 4, u32> reserved2; + }; + union { + BitField<0, 14, u32> slot_chroma_width; // padded, - 1 + BitField<14, 14, u32> slot_chroma_height; // padded, - 1 + BitField<28, 4, u32> reserved3; + }; +}; +static_assert(sizeof(SlotSurfaceConfig) == 0x10, "SlotSurfaceConfig has the wrong size!"); + +struct LumaKeyStruct { + union { + BitField<0, 20, u64> luma_coeff0; // (0) of 4x1 conversion matrix, S12.8 format + BitField<20, 20, u64> luma_coeff1; // (1) of 4x1 conversion matrix, S12.8 format + BitField<40, 20, u64> luma_coeff2; // (2) of 4x1 conversion matrix, S12.8 format + BitField<60, 4, u64> luma_r_shift; + }; + union { + BitField<0, 20, u64> luma_coeff3; // (3) of 4x1 conversion matrix, S12.8 format + BitField<20, 10, u64> luma_key_lower; + BitField<30, 10, u64> luma_key_upper; + BitField<40, 1, u64> luma_key_enabled; + BitField<41, 2, u64> reserved0; + BitField<43, 21, u64> reserved1; + }; +}; +static_assert(sizeof(LumaKeyStruct) == 0x10, "LumaKeyStruct has the wrong size!"); + +struct BlendingSlotStruct { + union { + BitField<0, 10, u32> alpha_k1; + BitField<10, 6, u32> reserved0; + BitField<16, 10, u32> alpha_k2; + BitField<26, 6, u32> reserved1; + }; + union { + BitField<0, 3, BLEND_SRCFACTC> src_factor_color_match_select; + BitField<3, 1, u32> reserved2; + BitField<4, 3, BLEND_DSTFACTC> dst_factor_color_match_select; + BitField<7, 1, u32> reserved3; + BitField<8, 3, BLEND_SRCFACTA> src_factor_a_match_select; + BitField<11, 1, u32> reserved4; + BitField<12, 3, BLEND_DSTFACTA> dst_factor_a_match_select; + BitField<15, 1, u32> reserved5; + BitField<16, 4, u32> reserved6; + BitField<20, 4, u32> reserved7; + BitField<24, 4, u32> reserved8; + BitField<28, 4, u32> reserved9; + }; + union { + BitField<0, 2, u32> reserved10; + BitField<2, 10, u32> override_r; + BitField<12, 10, u32> override_g; + BitField<22, 10, u32> override_b; + }; + union { + BitField<0, 10, u32> override_a; + BitField<10, 2, u32> reserved11; + BitField<12, 1, u32> use_override_r; + BitField<13, 1, u32> use_override_g; + BitField<14, 1, u32> use_override_b; + BitField<15, 1, u32> 
use_override_a; + BitField<16, 1, u32> mask_r; + BitField<17, 1, u32> mask_g; + BitField<18, 1, u32> mask_b; + BitField<19, 1, u32> mask_a; + BitField<20, 12, u32> reserved12; + }; +}; +static_assert(sizeof(BlendingSlotStruct) == 0x10, "BlendingSlotStruct has the wrong size!"); + +struct SlotStruct { + SlotConfig config; + SlotSurfaceConfig surface_config; + LumaKeyStruct luma_key; + MatrixStruct color_matrix; + MatrixStruct gamut_matrix; + BlendingSlotStruct blending; +}; +static_assert(sizeof(SlotStruct) == 0xB0, "SlotStruct has the wrong size!"); + +struct ConfigStruct { + PipeConfig pipe_config; + OutputConfig output_config; + OutputSurfaceConfig output_surface_config; + MatrixStruct out_color_matrix; + std::array clear_rects; + std::array slot_structs; +}; +static_assert(offsetof(ConfigStruct, pipe_config) == 0x0, "pipe_config is in the wrong place!"); +static_assert(offsetof(ConfigStruct, output_config) == 0x10, + "output_config is in the wrong place!"); +static_assert(offsetof(ConfigStruct, output_surface_config) == 0x20, + "output_surface_config is in the wrong place!"); +static_assert(offsetof(ConfigStruct, out_color_matrix) == 0x30, + "out_color_matrix is in the wrong place!"); +static_assert(offsetof(ConfigStruct, clear_rects) == 0x50, "clear_rects is in the wrong place!"); +static_assert(offsetof(ConfigStruct, slot_structs) == 0x90, "slot_structs is in the wrong place!"); +static_assert(sizeof(ConfigStruct) == 0x610, "ConfigStruct has the wrong size!"); + +struct VicRegisters { + static constexpr std::size_t NUM_REGS = 0x446; + + union { + struct { + INSERT_PADDING_WORDS_NOINIT(0xC0); + u32 execute; + INSERT_PADDING_WORDS_NOINIT(0x3F); + std::array, 8> surfaces; + u32 picture_index; + u32 control_params; + Offset config_struct_offset; + Offset filter_struct_offset; + Offset palette_offset; + Offset hist_offset; + u32 context_id; + u32 fce_ucode_size; + PlaneOffsets output_surface; + Offset fce_ucode_offset; + INSERT_PADDING_WORDS_NOINIT(0x4); + std::array slot_context_ids; + std::array comp_tag_buffer_offsets; + std::array history_buffer_offset; + INSERT_PADDING_WORDS_NOINIT(0x25D); + u32 pm_trigger_end; + }; + std::array reg_array; + }; +}; +static_assert(offsetof(VicRegisters, execute) == 0x300, "execute is in the wrong place!"); +static_assert(offsetof(VicRegisters, surfaces) == 0x400, "surfaces is in the wrong place!"); +static_assert(offsetof(VicRegisters, picture_index) == 0x700, + "picture_index is in the wrong place!"); +static_assert(offsetof(VicRegisters, control_params) == 0x704, + "control_params is in the wrong place!"); +static_assert(offsetof(VicRegisters, config_struct_offset) == 0x708, + "config_struct_offset is in the wrong place!"); +static_assert(offsetof(VicRegisters, output_surface) == 0x720, + "output_surface is in the wrong place!"); +static_assert(offsetof(VicRegisters, slot_context_ids) == 0x740, + "slot_context_ids is in the wrong place!"); +static_assert(offsetof(VicRegisters, history_buffer_offset) == 0x780, + "history_buffer_offset is in the wrong place!"); +static_assert(offsetof(VicRegisters, pm_trigger_end) == 0x1114, + "pm_trigger_end is in the wrong place!"); +static_assert(sizeof(VicRegisters) == 0x1118, "VicRegisters has the wrong size!"); + +class Vic final : public CDmaPusher { public: enum class Method : u32 { - Execute = 0xc0, - SetControlParams = 0x1c1, - SetConfigStructOffset = 0x1c2, - SetOutputSurfaceLumaOffset = 0x1c8, - SetOutputSurfaceChromaOffset = 0x1c9, - SetOutputSurfaceChromaUnusedOffset = 0x1ca + Execute = 
offsetof(VicRegisters, execute), + SetControlParams = offsetof(VicRegisters, control_params), + SetConfigStructOffset = offsetof(VicRegisters, config_struct_offset), + SetOutputSurfaceLumaOffset = offsetof(VicRegisters, output_surface.luma), + SetOutputSurfaceChromaOffset = offsetof(VicRegisters, output_surface.chroma_u), + SetOutputSurfaceChromaUnusedOffset = offsetof(VicRegisters, output_surface.chroma_v) }; - explicit Vic(Host1x& host1x, std::shared_ptr nvdec_processor); - + explicit Vic(Host1x& host1x, s32 id, u32 syncpt, FrameQueue& frame_queue); ~Vic(); /// Write to the device state. - void ProcessMethod(Method method, u32 argument); + void ProcessMethod(u32 method, u32 arg) override; private: void Execute(); - void WriteRGBFrame(std::unique_ptr frame, const VicConfig& config); + void Blend(const ConfigStruct& config, const SlotStruct& slot); - void WriteYUVFrame(std::unique_ptr frame, const VicConfig& config); + template + void ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span offsets, + std::shared_ptr frame); + template + void ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span offsets, + std::shared_ptr frame); - Host1x& host1x; - std::shared_ptr nvdec_processor; + template + void ReadY8__V8U8_N420(const SlotStruct& slot, std::span offsets, + std::shared_ptr frame); - /// Avoid reallocation of the following buffers every frame, as their - /// size does not change during a stream - using AVMallocPtr = std::unique_ptr; - AVMallocPtr converted_frame_buffer; - Common::ScratchBuffer luma_buffer; - Common::ScratchBuffer chroma_buffer; + void WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config); - GPUVAddr config_struct_address{}; - GPUVAddr output_surface_luma_address{}; - GPUVAddr output_surface_chroma_address{}; + template + void WriteABGR(const OutputSurfaceConfig& output_surface_config); - SwsContext* scaler_ctx{}; - s32 scaler_width{}; - s32 scaler_height{}; + s32 id; + s32 nvdec_id{-1}; + u32 syncpoint; + + VicRegisters regs{}; + FrameQueue& frame_queue; + + const bool has_sse41{false}; + + Common::ScratchBuffer output_surface; + Common::ScratchBuffer slot_surface; + Common::ScratchBuffer luma_scratch; + Common::ScratchBuffer chroma_scratch; + Common::ScratchBuffer swizzle_scratch; }; -} // namespace Host1x - -} // namespace Tegra +} // namespace Tegra::Host1x diff --git a/src/video_core/host_shaders/fidelityfx_fsr.frag b/src/video_core/host_shaders/fidelityfx_fsr.frag index a266e1c4e..54eedb450 100755 --- a/src/video_core/host_shaders/fidelityfx_fsr.frag +++ b/src/video_core/host_shaders/fidelityfx_fsr.frag @@ -37,6 +37,7 @@ layout(set=0,binding=0) uniform sampler2D InputTexture; #define A_GPU 1 #define A_GLSL 1 +#define FSR_RCAS_PASSTHROUGH_ALPHA 1 #ifndef YUZU_USE_FP16 #include "ffx_a.h" @@ -71,9 +72,7 @@ layout(set=0,binding=0) uniform sampler2D InputTexture; #include "ffx_fsr1.h" -#if USE_RCAS - layout(location = 0) in vec2 frag_texcoord; -#endif +layout (location = 0) in vec2 frag_texcoord; layout (location = 0) out vec4 frag_color; void CurrFilter(AU2 pos) { @@ -81,22 +80,22 @@ void CurrFilter(AU2 pos) { #ifndef YUZU_USE_FP16 AF3 c; FsrEasuF(c, pos, Const0, Const1, Const2, Const3); - frag_color = AF4(c, 1.0); + frag_color = AF4(c, texture(InputTexture, frag_texcoord).a); #else AH3 c; FsrEasuH(c, pos, Const0, Const1, Const2, Const3); - frag_color = AH4(c, 1.0); + frag_color = AH4(c, texture(InputTexture, frag_texcoord).a); #endif #endif #if USE_RCAS #ifndef YUZU_USE_FP16 - AF3 c; - FsrRcasF(c.r, c.g, c.b, pos, Const0); - frag_color = 
AF4(c, 1.0); + AF4 c; + FsrRcasF(c.r, c.g, c.b, c.a, pos, Const0); + frag_color = c; #else - AH3 c; - FsrRcasH(c.r, c.g, c.b, pos, Const0); - frag_color = AH4(c, 1.0); + AH4 c; + FsrRcasH(c.r, c.g, c.b, c.a, pos, Const0); + frag_color = c; #endif #endif } diff --git a/src/video_core/host_shaders/fxaa.frag b/src/video_core/host_shaders/fxaa.frag index 5c03c3724..012c147c6 100755 --- a/src/video_core/host_shaders/fxaa.frag +++ b/src/video_core/host_shaders/fxaa.frag @@ -71,5 +71,5 @@ vec3 FxaaPixelShader(vec4 posPos, sampler2D tex) { } void main() { - frag_color = vec4(FxaaPixelShader(posPos, input_texture), 1.0); + frag_color = vec4(FxaaPixelShader(posPos, input_texture), texture(input_texture, posPos.xy).a); } diff --git a/src/video_core/host_shaders/opengl_fidelityfx_fsr.frag b/src/video_core/host_shaders/opengl_fidelityfx_fsr.frag index 16d22f58e..fc47d3810 100755 --- a/src/video_core/host_shaders/opengl_fidelityfx_fsr.frag +++ b/src/video_core/host_shaders/opengl_fidelityfx_fsr.frag @@ -31,6 +31,7 @@ layout (location = 0) uniform uvec4 constants[4]; #define A_GPU 1 #define A_GLSL 1 +#define FSR_RCAS_PASSTHROUGH_ALPHA 1 #ifdef YUZU_USE_FP16 #define A_HALF @@ -67,9 +68,7 @@ layout (location = 0) uniform uvec4 constants[4]; #include "ffx_fsr1.h" -#if USE_RCAS - layout(location = 0) in vec2 frag_texcoord; -#endif +layout (location = 0) in vec2 frag_texcoord; layout (location = 0) out vec4 frag_color; void CurrFilter(AU2 pos) @@ -78,22 +77,22 @@ void CurrFilter(AU2 pos) #ifndef YUZU_USE_FP16 AF3 c; FsrEasuF(c, pos, constants[0], constants[1], constants[2], constants[3]); - frag_color = AF4(c, 1.0); + frag_color = AF4(c, texture(InputTexture, frag_texcoord).a); #else AH3 c; FsrEasuH(c, pos, constants[0], constants[1], constants[2], constants[3]); - frag_color = AH4(c, 1.0); + frag_color = AH4(c, texture(InputTexture, frag_texcoord).a); #endif #endif #if USE_RCAS #ifndef YUZU_USE_FP16 - AF3 c; - FsrRcasF(c.r, c.g, c.b, pos, constants[0]); - frag_color = AF4(c, 1.0); + AF4 c; + FsrRcasF(c.r, c.g, c.b, c.a, pos, constants[0]); + frag_color = c; #else AH3 c; - FsrRcasH(c.r, c.g, c.b, pos, constants[0]); - frag_color = AH4(c, 1.0); + FsrRcasH(c.r, c.g, c.b, c.a, pos, constants[0]); + frag_color = c; #endif #endif } diff --git a/src/video_core/host_shaders/opengl_present.frag b/src/video_core/host_shaders/opengl_present.frag index 7644a47ae..cc134180e 100755 --- a/src/video_core/host_shaders/opengl_present.frag +++ b/src/video_core/host_shaders/opengl_present.frag @@ -9,5 +9,5 @@ layout (location = 0) out vec4 color; layout (binding = 0) uniform sampler2D color_texture; void main() { - color = vec4(texture(color_texture, frag_tex_coord).rgb, 1.0f); + color = vec4(texture(color_texture, frag_tex_coord)); } diff --git a/src/video_core/host_shaders/present_bicubic.frag b/src/video_core/host_shaders/present_bicubic.frag index 53d8bc761..a3aa9cb99 100755 --- a/src/video_core/host_shaders/present_bicubic.frag +++ b/src/video_core/host_shaders/present_bicubic.frag @@ -52,5 +52,5 @@ vec4 textureBicubic( sampler2D textureSampler, vec2 texCoords ) { } void main() { - color = vec4(textureBicubic(color_texture, frag_tex_coord).rgb, 1.0f); + color = textureBicubic(color_texture, frag_tex_coord); } diff --git a/src/video_core/host_shaders/present_gaussian.frag b/src/video_core/host_shaders/present_gaussian.frag index de848e386..73a6e1bb5 100755 --- a/src/video_core/host_shaders/present_gaussian.frag +++ b/src/video_core/host_shaders/present_gaussian.frag @@ -46,14 +46,14 @@ vec4 blurDiagonal(sampler2D 
textureSampler, vec2 coord, vec2 norm) {
}
void main() {
- vec3 base = texture(color_texture, vec2(frag_tex_coord)).rgb * weight[0];
+ vec4 base = texture(color_texture, vec2(frag_tex_coord)) * weight[0];
vec2 tex_offset = 1.0f / textureSize(color_texture, 0);
// TODO(Blinkhawk): This code can be optimized through shader group instructions.
- vec3 horizontal = blurHorizontal(color_texture, frag_tex_coord, tex_offset).rgb;
- vec3 vertical = blurVertical(color_texture, frag_tex_coord, tex_offset).rgb;
- vec3 diagonalA = blurDiagonal(color_texture, frag_tex_coord, tex_offset).rgb;
- vec3 diagonalB = blurDiagonal(color_texture, frag_tex_coord, tex_offset * vec2(1.0, -1.0)).rgb;
- vec3 combination = mix(mix(horizontal, vertical, 0.5f), mix(diagonalA, diagonalB, 0.5f), 0.5f);
- color = vec4(combination + base, 1.0f);
+ vec4 horizontal = blurHorizontal(color_texture, frag_tex_coord, tex_offset);
+ vec4 vertical = blurVertical(color_texture, frag_tex_coord, tex_offset);
+ vec4 diagonalA = blurDiagonal(color_texture, frag_tex_coord, tex_offset);
+ vec4 diagonalB = blurDiagonal(color_texture, frag_tex_coord, tex_offset * vec2(1.0, -1.0));
+ vec4 combination = mix(mix(horizontal, vertical, 0.5f), mix(diagonalA, diagonalB, 0.5f), 0.5f);
+ color = combination + base;
}
diff --git a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp16.frag b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp16.frag
index d369bef06..05d033310 100755
--- a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp16.frag
+++ b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp16.frag
@@ -6,5 +6,6 @@
#define YUZU_USE_FP16
#define USE_EASU 1
+#define VERSION 1
#include "fidelityfx_fsr.frag"
diff --git a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp32.frag b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp32.frag
index 6f25ef00f..7ae11dd66 100755
--- a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp32.frag
+++ b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp32.frag
@@ -5,5 +5,6 @@
#extension GL_GOOGLE_include_directive : enable
#define USE_EASU 1
+#define VERSION 1
#include "fidelityfx_fsr.frag"
diff --git a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp16.frag b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp16.frag
index 0c953a900..c017214a5 100755
--- a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp16.frag
+++ b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp16.frag
@@ -6,5 +6,6 @@
#define YUZU_USE_FP16
#define USE_RCAS 1
+#define VERSION 1
#include "fidelityfx_fsr.frag"
diff --git a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp32.frag b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp32.frag
index 02e9a27c6..976825f4b 100755
--- a/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp32.frag
+++ b/src/video_core/host_shaders/vulkan_fidelityfx_fsr_rcas_fp32.frag
@@ -5,5 +5,6 @@
#extension GL_GOOGLE_include_directive : enable
#define USE_RCAS 1
+#define VERSION 1
#include "fidelityfx_fsr.frag"
diff --git a/src/video_core/host_shaders/vulkan_present_scaleforce_fp16.frag b/src/video_core/host_shaders/vulkan_present_scaleforce_fp16.frag
index d9ee1111b..072394a6d 100755
--- a/src/video_core/host_shaders/vulkan_present_scaleforce_fp16.frag
+++ b/src/video_core/host_shaders/vulkan_present_scaleforce_fp16.frag
@@ -5,7 +5,7 @@
#extension GL_GOOGLE_include_directive : enable
-#define VERSION 1
+#define VERSION 2
#define YUZU_USE_FP16
#include "opengl_present_scaleforce.frag"
diff --git a/src/video_core/host_shaders/vulkan_present_scaleforce_fp32.frag b/src/video_core/host_shaders/vulkan_present_scaleforce_fp32.frag
index 4d64e340c..67a248b5f 100755
--- a/src/video_core/host_shaders/vulkan_present_scaleforce_fp32.frag
+++ b/src/video_core/host_shaders/vulkan_present_scaleforce_fp32.frag
@@ -5,6 +5,6 @@
#extension GL_GOOGLE_include_directive : enable
-#define VERSION 1
+#define VERSION 2
#include "opengl_present_scaleforce.frag"
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index 1b31cdcee..9b312b112 100755
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -42,6 +42,8 @@ public:
u64 page_bits_ = 12);
~MemoryManager();
+ static constexpr bool HAS_FLUSH_INVALIDATION = true;
+
size_t GetID() const {
return unique_identifier;
}
diff --git a/src/video_core/present.h b/src/video_core/present.h
new file mode 100755
index 000000000..4fdfcca68
--- /dev/null
+++ b/src/video_core/present.h
@@ -0,0 +1,37 @@
+// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include "common/settings.h"
+
+static inline Settings::ScalingFilter GetScalingFilter() {
+ return Settings::values.scaling_filter.GetValue();
+}
+
+static inline Settings::AntiAliasing GetAntiAliasing() {
+ return Settings::values.anti_aliasing.GetValue();
+}
+
+static inline Settings::ScalingFilter GetScalingFilterForAppletCapture() {
+ return Settings::ScalingFilter::Bilinear;
+}
+
+static inline Settings::AntiAliasing GetAntiAliasingForAppletCapture() {
+ return Settings::AntiAliasing::None;
+}
+
+struct PresentFilters {
+ Settings::ScalingFilter (*get_scaling_filter)();
+ Settings::AntiAliasing (*get_anti_aliasing)();
+};
+
+constexpr PresentFilters PresentFiltersForDisplay{
+ .get_scaling_filter = &GetScalingFilter,
+ .get_anti_aliasing = &GetAntiAliasing,
+};
+
+constexpr PresentFilters PresentFiltersForAppletCapture{
+ .get_scaling_filter = &GetScalingFilterForAppletCapture,
+ .get_anti_aliasing = &GetAntiAliasingForAppletCapture,
+};
diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h
index f617e800b..b72ab12fb 100755
--- a/src/video_core/renderer_base.h
+++ b/src/video_core/renderer_base.h
@@ -40,6 +40,9 @@ public:
/// Finalize rendering the guest frame and draw into the presentation texture
virtual void Composite(std::span layers) = 0;
+ /// Get the tiled applet layer capture buffer
+ virtual std::vector GetAppletCaptureBuffer() = 0;
+
[[nodiscard]] virtual RasterizerInterface* ReadRasterizer() = 0;
[[nodiscard]] virtual std::string GetDeviceVendor() const = 0;
diff --git a/src/video_core/renderer_null/renderer_null.cpp b/src/video_core/renderer_null/renderer_null.cpp
index c89daff53..e6147d66c 100755
--- a/src/video_core/renderer_null/renderer_null.cpp
+++ b/src/video_core/renderer_null/renderer_null.cpp
@@ -3,6 +3,7 @@
#include "core/frontend/emu_window.h"
#include "core/frontend/graphics_context.h"
+#include "video_core/capture.h"
#include "video_core/renderer_null/renderer_null.h"
namespace Null {
@@ -22,4 +23,8 @@ void RendererNull::Composite(std::span framebuff
render_window.OnFrameDisplayed();
}
+std::vector RendererNull::GetAppletCaptureBuffer() {
+ return std::vector(VideoCore::Capture::TiledSize);
+}
+
} // namespace Null
diff --git a/src/video_core/renderer_null/renderer_null.h b/src/video_core/renderer_null/renderer_null.h
index 063b476bb..34dbe1e4f 100755
--- a/src/video_core/renderer_null/renderer_null.h
+++
b/src/video_core/renderer_null/renderer_null.h @@ -19,6 +19,8 @@ public: void Composite(std::span framebuffer) override; + std::vector GetAppletCaptureBuffer() override; + VideoCore::RasterizerInterface* ReadRasterizer() override { return &m_rasterizer; } diff --git a/src/video_core/renderer_opengl/gl_blit_screen.cpp b/src/video_core/renderer_opengl/gl_blit_screen.cpp index 6ba8b214b..9260a4dc4 100755 --- a/src/video_core/renderer_opengl/gl_blit_screen.cpp +++ b/src/video_core/renderer_opengl/gl_blit_screen.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "common/settings.h" +#include "video_core/present.h" #include "video_core/renderer_opengl/gl_blit_screen.h" #include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/present/filters.h" @@ -13,14 +14,14 @@ namespace OpenGL { BlitScreen::BlitScreen(RasterizerOpenGL& rasterizer_, Tegra::MaxwellDeviceMemoryManager& device_memory_, StateTracker& state_tracker_, ProgramManager& program_manager_, - Device& device_) + Device& device_, const PresentFilters& filters_) : rasterizer(rasterizer_), device_memory(device_memory_), state_tracker(state_tracker_), - program_manager(program_manager_), device(device_) {} + program_manager(program_manager_), device(device_), filters(filters_) {} BlitScreen::~BlitScreen() = default; void BlitScreen::DrawScreen(std::span framebuffers, - const Layout::FramebufferLayout& layout) { + const Layout::FramebufferLayout& layout, bool invert_y) { // TODO: Signal state tracker about these changes state_tracker.NotifyScreenDrawVertexArray(); state_tracker.NotifyPolygonModes(); @@ -56,22 +57,22 @@ void BlitScreen::DrawScreen(std::span framebuffe glDepthRangeIndexed(0, 0.0, 0.0); while (layers.size() < framebuffers.size()) { - layers.emplace_back(rasterizer, device_memory); + layers.emplace_back(rasterizer, device_memory, filters); } CreateWindowAdapt(); - window_adapt->DrawToFramebuffer(program_manager, layers, framebuffers, layout); + window_adapt->DrawToFramebuffer(program_manager, layers, framebuffers, layout, invert_y); // TODO // program_manager.RestoreGuestPipeline(); } void BlitScreen::CreateWindowAdapt() { - if (window_adapt && Settings::values.scaling_filter.GetValue() == current_window_adapt) { + if (window_adapt && filters.get_scaling_filter() == current_window_adapt) { return; } - current_window_adapt = Settings::values.scaling_filter.GetValue(); + current_window_adapt = filters.get_scaling_filter(); switch (current_window_adapt) { case Settings::ScalingFilter::NearestNeighbor: window_adapt = MakeNearestNeighbor(device); diff --git a/src/video_core/renderer_opengl/gl_blit_screen.h b/src/video_core/renderer_opengl/gl_blit_screen.h index 0c3d838f1..df2da9424 100755 --- a/src/video_core/renderer_opengl/gl_blit_screen.h +++ b/src/video_core/renderer_opengl/gl_blit_screen.h @@ -15,6 +15,8 @@ namespace Layout { struct FramebufferLayout; } +struct PresentFilters; + namespace Tegra { struct FramebufferConfig; } @@ -46,12 +48,12 @@ public: explicit BlitScreen(RasterizerOpenGL& rasterizer, Tegra::MaxwellDeviceMemoryManager& device_memory, StateTracker& state_tracker, ProgramManager& program_manager, - Device& device); + Device& device, const PresentFilters& filters); ~BlitScreen(); /// Draws the emulated screens to the emulator window. 
void DrawScreen(std::span framebuffers, - const Layout::FramebufferLayout& layout); + const Layout::FramebufferLayout& layout, bool invert_y); private: void CreateWindowAdapt(); @@ -61,6 +63,7 @@ private: StateTracker& state_tracker; ProgramManager& program_manager; Device& device; + const PresentFilters& filters; Settings::ScalingFilter current_window_adapt{}; std::unique_ptr window_adapt; diff --git a/src/video_core/renderer_opengl/present/layer.cpp b/src/video_core/renderer_opengl/present/layer.cpp index 8643e07c6..6c7092d22 100755 --- a/src/video_core/renderer_opengl/present/layer.cpp +++ b/src/video_core/renderer_opengl/present/layer.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "video_core/framebuffer_config.h" +#include "video_core/present.h" #include "video_core/renderer_opengl/gl_blit_screen.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/present/fsr.h" @@ -14,8 +15,9 @@ namespace OpenGL { -Layer::Layer(RasterizerOpenGL& rasterizer_, Tegra::MaxwellDeviceMemoryManager& device_memory_) - : rasterizer(rasterizer_), device_memory(device_memory_) { +Layer::Layer(RasterizerOpenGL& rasterizer_, Tegra::MaxwellDeviceMemoryManager& device_memory_, + const PresentFilters& filters_) + : rasterizer(rasterizer_), device_memory(device_memory_), filters(filters_) { // Allocate textures for the screen framebuffer_texture.resource.Create(GL_TEXTURE_2D); @@ -34,12 +36,12 @@ GLuint Layer::ConfigureDraw(std::array& out_matrix, std::array& out_vertices, ProgramManager& program_manager, const Tegra::FramebufferConfig& framebuffer, - const Layout::FramebufferLayout& layout) { + const Layout::FramebufferLayout& layout, bool invert_y) { FramebufferTextureInfo info = PrepareRenderTarget(framebuffer); auto crop = Tegra::NormalizeCrop(framebuffer, info.width, info.height); GLuint texture = info.display_texture; - auto anti_aliasing = Settings::values.anti_aliasing.GetValue(); + auto anti_aliasing = filters.get_anti_aliasing(); if (anti_aliasing != Settings::AntiAliasing::None) { glEnablei(GL_SCISSOR_TEST, 0); auto viewport_width = Settings::values.resolution_info.ScaleUp(framebuffer_texture.width); @@ -64,7 +66,7 @@ GLuint Layer::ConfigureDraw(std::array& out_matrix, glDisablei(GL_SCISSOR_TEST, 0); - if (Settings::values.scaling_filter.GetValue() == Settings::ScalingFilter::Fsr) { + if (filters.get_scaling_filter() == Settings::ScalingFilter::Fsr) { if (!fsr || fsr->NeedsRecreation(layout.screen)) { fsr = std::make_unique(layout.screen.GetWidth(), layout.screen.GetHeight()); } @@ -83,10 +85,15 @@ GLuint Layer::ConfigureDraw(std::array& out_matrix, const auto w = screen.GetWidth(); const auto h = screen.GetHeight(); - out_vertices[0] = ScreenRectVertex(x, y, crop.left, crop.top); - out_vertices[1] = ScreenRectVertex(x + w, y, crop.right, crop.top); - out_vertices[2] = ScreenRectVertex(x, y + h, crop.left, crop.bottom); - out_vertices[3] = ScreenRectVertex(x + w, y + h, crop.right, crop.bottom); + const auto left = crop.left; + const auto right = crop.right; + const auto top = invert_y ? crop.bottom : crop.top; + const auto bottom = invert_y ? 
crop.top : crop.bottom; + + out_vertices[0] = ScreenRectVertex(x, y, left, top); + out_vertices[1] = ScreenRectVertex(x + w, y, right, top); + out_vertices[2] = ScreenRectVertex(x, y + h, left, bottom); + out_vertices[3] = ScreenRectVertex(x + w, y + h, right, bottom); return texture; } @@ -131,10 +138,12 @@ FramebufferTextureInfo Layer::LoadFBToScreenInfo(const Tegra::FramebufferConfig& const u64 size_in_bytes{Tegra::Texture::CalculateSize( true, bytes_per_pixel, framebuffer.stride, framebuffer.height, 1, block_height_log2, 0)}; const u8* const host_ptr{device_memory.GetPointer(framebuffer_addr)}; - const std::span input_data(host_ptr, size_in_bytes); - Tegra::Texture::UnswizzleTexture(gl_framebuffer_data, input_data, bytes_per_pixel, - framebuffer.width, framebuffer.height, 1, block_height_log2, - 0); + if (host_ptr) { + const std::span input_data(host_ptr, size_in_bytes); + Tegra::Texture::UnswizzleTexture(gl_framebuffer_data, input_data, bytes_per_pixel, + framebuffer.width, framebuffer.height, 1, + block_height_log2, 0); + } glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast(framebuffer.stride)); diff --git a/src/video_core/renderer_opengl/present/layer.h b/src/video_core/renderer_opengl/present/layer.h index ef1055abf..5b15b730f 100755 --- a/src/video_core/renderer_opengl/present/layer.h +++ b/src/video_core/renderer_opengl/present/layer.h @@ -13,6 +13,8 @@ namespace Layout { struct FramebufferLayout; } +struct PresentFilters; + namespace Service::android { enum class PixelFormat : u32; }; @@ -44,14 +46,15 @@ struct ScreenRectVertex; class Layer { public: - explicit Layer(RasterizerOpenGL& rasterizer, Tegra::MaxwellDeviceMemoryManager& device_memory); + explicit Layer(RasterizerOpenGL& rasterizer, Tegra::MaxwellDeviceMemoryManager& device_memory, + const PresentFilters& filters); ~Layer(); GLuint ConfigureDraw(std::array& out_matrix, std::array& out_vertices, ProgramManager& program_manager, const Tegra::FramebufferConfig& framebuffer, - const Layout::FramebufferLayout& layout); + const Layout::FramebufferLayout& layout, bool invert_y); private: /// Loads framebuffer from emulated memory into the active OpenGL texture. 
@@ -65,6 +68,7 @@ private: private: RasterizerOpenGL& rasterizer; Tegra::MaxwellDeviceMemoryManager& device_memory; + const PresentFilters& filters; /// OpenGL framebuffer data std::vector gl_framebuffer_data; diff --git a/src/video_core/renderer_opengl/present/window_adapt_pass.cpp b/src/video_core/renderer_opengl/present/window_adapt_pass.cpp index 4d681606b..d8b6a11cb 100755 --- a/src/video_core/renderer_opengl/present/window_adapt_pass.cpp +++ b/src/video_core/renderer_opengl/present/window_adapt_pass.cpp @@ -37,7 +37,7 @@ WindowAdaptPass::~WindowAdaptPass() = default; void WindowAdaptPass::DrawToFramebuffer(ProgramManager& program_manager, std::list& layers, std::span framebuffers, - const Layout::FramebufferLayout& layout) { + const Layout::FramebufferLayout& layout, bool invert_y) { GLint old_read_fb; GLint old_draw_fb; glGetIntegerv(GL_READ_FRAMEBUFFER_BINDING, &old_read_fb); @@ -51,7 +51,7 @@ void WindowAdaptPass::DrawToFramebuffer(ProgramManager& program_manager, std::li auto layer_it = layers.begin(); for (size_t i = 0; i < layer_count; i++) { textures[i] = layer_it->ConfigureDraw(matrices[i], vertices[i], program_manager, - framebuffers[i], layout); + framebuffers[i], layout, invert_y); layer_it++; } @@ -92,6 +92,21 @@ void WindowAdaptPass::DrawToFramebuffer(ProgramManager& program_manager, std::li glClear(GL_COLOR_BUFFER_BIT); for (size_t i = 0; i < layer_count; i++) { + switch (framebuffers[i].blending) { + case Tegra::BlendMode::Opaque: + default: + glDisablei(GL_BLEND, 0); + break; + case Tegra::BlendMode::Premultiplied: + glEnablei(GL_BLEND, 0); + glBlendFuncSeparatei(0, GL_ONE, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ZERO); + break; + case Tegra::BlendMode::Coverage: + glEnablei(GL_BLEND, 0); + glBlendFuncSeparatei(0, GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ZERO); + break; + } + glBindTextureUnit(0, textures[i]); glProgramUniformMatrix3x2fv(vert.handle, ModelViewMatrixLocation, 1, GL_FALSE, matrices[i].data()); diff --git a/src/video_core/renderer_opengl/present/window_adapt_pass.h b/src/video_core/renderer_opengl/present/window_adapt_pass.h index 00975a9c6..0a8bcef2f 100755 --- a/src/video_core/renderer_opengl/present/window_adapt_pass.h +++ b/src/video_core/renderer_opengl/present/window_adapt_pass.h @@ -31,7 +31,7 @@ public: void DrawToFramebuffer(ProgramManager& program_manager, std::list& layers, std::span framebuffers, - const Layout::FramebufferLayout& layout); + const Layout::FramebufferLayout& layout, bool invert_y); private: const Device& device; diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index a4a15a9e7..c5fa63229 100755 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -16,6 +16,8 @@ #include "core/core_timing.h" #include "core/frontend/emu_window.h" #include "core/telemetry_session.h" +#include "video_core/capture.h" +#include "video_core/present.h" #include "video_core/renderer_opengl/gl_blit_screen.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_manager.h" @@ -120,7 +122,15 @@ RendererOpenGL::RendererOpenGL(Core::TelemetrySession& telemetry_session_, glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV); } blit_screen = std::make_unique(rasterizer, device_memory, state_tracker, - program_manager, device); + program_manager, device, PresentFiltersForDisplay); + blit_applet = + std::make_unique(rasterizer, device_memory, state_tracker, program_manager, + device, 
PresentFiltersForAppletCapture); + capture_framebuffer.Create(); + capture_renderbuffer.Create(); + glBindRenderbuffer(GL_RENDERBUFFER, capture_renderbuffer.handle); + glRenderbufferStorage(GL_RENDERBUFFER, GL_SRGB8, VideoCore::Capture::LinearWidth, + VideoCore::Capture::LinearHeight); } RendererOpenGL::~RendererOpenGL() = default; @@ -130,10 +140,11 @@ void RendererOpenGL::Composite(std::span framebu return; } + RenderAppletCaptureLayer(framebuffers); RenderScreenshot(framebuffers); state_tracker.BindFramebuffer(0); - blit_screen->DrawScreen(framebuffers, emu_window.GetFramebufferLayout()); + blit_screen->DrawScreen(framebuffers, emu_window.GetFramebufferLayout(), false); ++m_current_frame; @@ -159,11 +170,8 @@ void RendererOpenGL::AddTelemetryFields() { telemetry_session.AddField(user_system, "GPU_OpenGL_Version", std::string(gl_version)); } -void RendererOpenGL::RenderScreenshot(std::span framebuffers) { - if (!renderer_settings.screenshot_requested) { - return; - } - +void RendererOpenGL::RenderToBuffer(std::span framebuffers, + const Layout::FramebufferLayout& layout, void* dst) { GLint old_read_fb; GLint old_draw_fb; glGetIntegerv(GL_READ_FRAMEBUFFER_BINDING, &old_read_fb); @@ -173,29 +181,86 @@ void RendererOpenGL::RenderScreenshot(std::span screenshot_framebuffer.Create(); glBindFramebuffer(GL_FRAMEBUFFER, screenshot_framebuffer.handle); - const Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; - GLuint renderbuffer; glGenRenderbuffers(1, &renderbuffer); glBindRenderbuffer(GL_RENDERBUFFER, renderbuffer); glRenderbufferStorage(GL_RENDERBUFFER, GL_SRGB8, layout.width, layout.height); glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, renderbuffer); - blit_screen->DrawScreen(framebuffers, layout); + blit_screen->DrawScreen(framebuffers, layout, false); glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); glPixelStorei(GL_PACK_ROW_LENGTH, 0); - glReadPixels(0, 0, layout.width, layout.height, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, - renderer_settings.screenshot_bits); + glReadPixels(0, 0, layout.width, layout.height, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, dst); screenshot_framebuffer.Release(); glDeleteRenderbuffers(1, &renderbuffer); glBindFramebuffer(GL_READ_FRAMEBUFFER, old_read_fb); glBindFramebuffer(GL_DRAW_FRAMEBUFFER, old_draw_fb); +} + +void RendererOpenGL::RenderScreenshot(std::span framebuffers) { + if (!renderer_settings.screenshot_requested) { + return; + } + + RenderToBuffer(framebuffers, renderer_settings.screenshot_framebuffer_layout, + renderer_settings.screenshot_bits); renderer_settings.screenshot_complete_callback(true); renderer_settings.screenshot_requested = false; } +void RendererOpenGL::RenderAppletCaptureLayer( + std::span framebuffers) { + GLint old_read_fb; + GLint old_draw_fb; + glGetIntegerv(GL_READ_FRAMEBUFFER_BINDING, &old_read_fb); + glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &old_draw_fb); + + glBindFramebuffer(GL_FRAMEBUFFER, capture_framebuffer.handle); + glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, + capture_renderbuffer.handle); + + blit_applet->DrawScreen(framebuffers, VideoCore::Capture::Layout, true); + + glBindFramebuffer(GL_READ_FRAMEBUFFER, old_read_fb); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, old_draw_fb); +} + +std::vector RendererOpenGL::GetAppletCaptureBuffer() { + using namespace VideoCore::Capture; + + std::vector linear(TiledSize); + std::vector out(TiledSize); + + GLint old_read_fb; + GLint old_draw_fb; + GLint old_pixel_pack_buffer; + GLint 
old_pack_row_length; + glGetIntegerv(GL_READ_FRAMEBUFFER_BINDING, &old_read_fb); + glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &old_draw_fb); + glGetIntegerv(GL_PIXEL_PACK_BUFFER_BINDING, &old_pixel_pack_buffer); + glGetIntegerv(GL_PACK_ROW_LENGTH, &old_pack_row_length); + + glBindFramebuffer(GL_FRAMEBUFFER, capture_framebuffer.handle); + glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, + capture_renderbuffer.handle); + glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); + glPixelStorei(GL_PACK_ROW_LENGTH, 0); + glReadPixels(0, 0, LinearWidth, LinearHeight, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, + linear.data()); + + glBindFramebuffer(GL_READ_FRAMEBUFFER, old_read_fb); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, old_draw_fb); + glBindBuffer(GL_PIXEL_PACK_BUFFER, old_pixel_pack_buffer); + glPixelStorei(GL_PACK_ROW_LENGTH, old_pack_row_length); + + Tegra::Texture::SwizzleTexture(out, linear, BytesPerPixel, LinearWidth, LinearHeight, + LinearDepth, BlockHeight, BlockDepth); + + return out; +} + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index 48ef6d377..38a24f84c 100755 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -42,6 +42,8 @@ public: void Composite(std::span framebuffers) override; + std::vector GetAppletCaptureBuffer() override; + VideoCore::RasterizerInterface* ReadRasterizer() override { return &rasterizer; } @@ -52,7 +54,11 @@ public: private: void AddTelemetryFields(); + + void RenderToBuffer(std::span framebuffers, + const Layout::FramebufferLayout& layout, void* dst); void RenderScreenshot(std::span framebuffers); + void RenderAppletCaptureLayer(std::span framebuffers); Core::TelemetrySession& telemetry_session; Core::Frontend::EmuWindow& emu_window; @@ -64,8 +70,11 @@ private: ProgramManager program_manager; RasterizerOpenGL rasterizer; OGLFramebuffer screenshot_framebuffer; + OGLFramebuffer capture_framebuffer; + OGLRenderbuffer capture_renderbuffer; std::unique_ptr blit_screen; + std::unique_ptr blit_applet; }; } // namespace OpenGL diff --git a/src/video_core/renderer_vulkan/present/layer.cpp b/src/video_core/renderer_vulkan/present/layer.cpp index cfc04be44..3847a9a13 100755 --- a/src/video_core/renderer_vulkan/present/layer.cpp +++ b/src/video_core/renderer_vulkan/present/layer.cpp @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "video_core/present.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" #include "common/settings.h" @@ -48,12 +49,12 @@ VkFormat GetFormat(const Tegra::FramebufferConfig& framebuffer) { Layer::Layer(const Device& device_, MemoryAllocator& memory_allocator_, Scheduler& scheduler_, Tegra::MaxwellDeviceMemoryManager& device_memory_, size_t image_count_, - VkExtent2D output_size, VkDescriptorSetLayout layout) + VkExtent2D output_size, VkDescriptorSetLayout layout, const PresentFilters& filters_) : device(device_), memory_allocator(memory_allocator_), scheduler(scheduler_), - device_memory(device_memory_), image_count(image_count_) { + device_memory(device_memory_), filters(filters_), image_count(image_count_) { CreateDescriptorPool(); CreateDescriptorSets(layout); - if (Settings::values.scaling_filter.GetValue() == Settings::ScalingFilter::Fsr) { + if (filters.get_scaling_filter() == Settings::ScalingFilter::Fsr) { CreateFSR(output_size); } } @@ -171,11 +172,11 @@ void 
Layer::RefreshResources(const Tegra::FramebufferConfig& framebuffer) { } void Layer::SetAntiAliasPass() { - if (anti_alias && anti_alias_setting == Settings::values.anti_aliasing.GetValue()) { + if (anti_alias && anti_alias_setting == filters.get_anti_aliasing()) { return; } - anti_alias_setting = Settings::values.anti_aliasing.GetValue(); + anti_alias_setting = filters.get_anti_aliasing(); const VkExtent2D render_area{ .width = Settings::values.resolution_info.ScaleUp(raw_width), @@ -270,9 +271,11 @@ void Layer::UpdateRawImage(const Tegra::FramebufferConfig& framebuffer, size_t i const u64 linear_size{GetSizeInBytes(framebuffer)}; const u64 tiled_size{Tegra::Texture::CalculateSize( true, bytes_per_pixel, framebuffer.stride, framebuffer.height, 1, block_height_log2, 0)}; - Tegra::Texture::UnswizzleTexture( - mapped_span.subspan(image_offset, linear_size), std::span(host_ptr, tiled_size), - bytes_per_pixel, framebuffer.width, framebuffer.height, 1, block_height_log2, 0); + if (host_ptr) { + Tegra::Texture::UnswizzleTexture( + mapped_span.subspan(image_offset, linear_size), std::span(host_ptr, tiled_size), + bytes_per_pixel, framebuffer.width, framebuffer.height, 1, block_height_log2, 0); + } const VkBufferImageCopy copy{ .bufferOffset = image_offset, diff --git a/src/video_core/renderer_vulkan/present/layer.h b/src/video_core/renderer_vulkan/present/layer.h index 88d43fc5f..f5effdcd7 100755 --- a/src/video_core/renderer_vulkan/present/layer.h +++ b/src/video_core/renderer_vulkan/present/layer.h @@ -11,6 +11,8 @@ namespace Layout { struct FramebufferLayout; } +struct PresentFilters; + namespace Tegra { struct FramebufferConfig; } @@ -37,7 +39,8 @@ class Layer final { public: explicit Layer(const Device& device, MemoryAllocator& memory_allocator, Scheduler& scheduler, Tegra::MaxwellDeviceMemoryManager& device_memory, size_t image_count, - VkExtent2D output_size, VkDescriptorSetLayout layout); + VkExtent2D output_size, VkDescriptorSetLayout layout, + const PresentFilters& filters); ~Layer(); void ConfigureDraw(PresentPushConstants* out_push_constants, @@ -71,6 +74,7 @@ private: MemoryAllocator& memory_allocator; Scheduler& scheduler; Tegra::MaxwellDeviceMemoryManager& device_memory; + const PresentFilters& filters; const size_t image_count{}; vk::DescriptorPool descriptor_pool{}; vk::DescriptorSets descriptor_sets{}; diff --git a/src/video_core/renderer_vulkan/present/util.cpp b/src/video_core/renderer_vulkan/present/util.cpp index 6ee16595d..7f27c7c1b 100755 --- a/src/video_core/renderer_vulkan/present/util.cpp +++ b/src/video_core/renderer_vulkan/present/util.cpp @@ -362,10 +362,10 @@ vk::PipelineLayout CreateWrappedPipelineLayout(const Device& device, }); } -vk::Pipeline CreateWrappedPipeline(const Device& device, vk::RenderPass& renderpass, - vk::PipelineLayout& layout, - std::tuple shaders, - bool enable_blending) { +static vk::Pipeline CreateWrappedPipelineImpl( + const Device& device, vk::RenderPass& renderpass, vk::PipelineLayout& layout, + std::tuple shaders, + VkPipelineColorBlendAttachmentState blending) { const std::array shader_stages{{ { .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, @@ -443,30 +443,6 @@ vk::Pipeline CreateWrappedPipeline(const Device& device, vk::RenderPass& renderp .alphaToOneEnable = VK_FALSE, }; - constexpr VkPipelineColorBlendAttachmentState color_blend_attachment_disabled{ - .blendEnable = VK_FALSE, - .srcColorBlendFactor = VK_BLEND_FACTOR_ZERO, - .dstColorBlendFactor = VK_BLEND_FACTOR_ZERO, - .colorBlendOp = VK_BLEND_OP_ADD, - 
.srcAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, - .dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, - .alphaBlendOp = VK_BLEND_OP_ADD, - .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | - VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT, - }; - - constexpr VkPipelineColorBlendAttachmentState color_blend_attachment_enabled{ - .blendEnable = VK_TRUE, - .srcColorBlendFactor = VK_BLEND_FACTOR_SRC_ALPHA, - .dstColorBlendFactor = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, - .colorBlendOp = VK_BLEND_OP_ADD, - .srcAlphaBlendFactor = VK_BLEND_FACTOR_ONE, - .dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, - .alphaBlendOp = VK_BLEND_OP_ADD, - .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | - VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT, - }; - const VkPipelineColorBlendStateCreateInfo color_blend_ci{ .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, .pNext = nullptr, @@ -474,8 +450,7 @@ vk::Pipeline CreateWrappedPipeline(const Device& device, vk::RenderPass& renderp .logicOpEnable = VK_FALSE, .logicOp = VK_LOGIC_OP_COPY, .attachmentCount = 1, - .pAttachments = - enable_blending ? &color_blend_attachment_enabled : &color_blend_attachment_disabled, + .pAttachments = &blending, .blendConstants = {0.0f, 0.0f, 0.0f, 0.0f}, }; @@ -515,6 +490,63 @@ vk::Pipeline CreateWrappedPipeline(const Device& device, vk::RenderPass& renderp }); } +vk::Pipeline CreateWrappedPipeline(const Device& device, vk::RenderPass& renderpass, + vk::PipelineLayout& layout, + std::tuple shaders) { + constexpr VkPipelineColorBlendAttachmentState color_blend_attachment_disabled{ + .blendEnable = VK_FALSE, + .srcColorBlendFactor = VK_BLEND_FACTOR_ZERO, + .dstColorBlendFactor = VK_BLEND_FACTOR_ZERO, + .colorBlendOp = VK_BLEND_OP_ADD, + .srcAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, + .dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, + .alphaBlendOp = VK_BLEND_OP_ADD, + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT, + }; + + return CreateWrappedPipelineImpl(device, renderpass, layout, shaders, + color_blend_attachment_disabled); +} + +vk::Pipeline CreateWrappedPremultipliedBlendingPipeline( + const Device& device, vk::RenderPass& renderpass, vk::PipelineLayout& layout, + std::tuple shaders) { + constexpr VkPipelineColorBlendAttachmentState color_blend_attachment_premultiplied{ + .blendEnable = VK_TRUE, + .srcColorBlendFactor = VK_BLEND_FACTOR_ONE, + .dstColorBlendFactor = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, + .colorBlendOp = VK_BLEND_OP_ADD, + .srcAlphaBlendFactor = VK_BLEND_FACTOR_ONE, + .dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, + .alphaBlendOp = VK_BLEND_OP_ADD, + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT, + }; + + return CreateWrappedPipelineImpl(device, renderpass, layout, shaders, + color_blend_attachment_premultiplied); +} + +vk::Pipeline CreateWrappedCoverageBlendingPipeline( + const Device& device, vk::RenderPass& renderpass, vk::PipelineLayout& layout, + std::tuple shaders) { + constexpr VkPipelineColorBlendAttachmentState color_blend_attachment_coverage{ + .blendEnable = VK_TRUE, + .srcColorBlendFactor = VK_BLEND_FACTOR_SRC_ALPHA, + .dstColorBlendFactor = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, + .colorBlendOp = VK_BLEND_OP_ADD, + .srcAlphaBlendFactor = VK_BLEND_FACTOR_ONE, + .dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, + .alphaBlendOp = VK_BLEND_OP_ADD, + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | 
VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT, + }; + + return CreateWrappedPipelineImpl(device, renderpass, layout, shaders, + color_blend_attachment_coverage); +} + VkWriteDescriptorSet CreateWriteDescriptorSet(std::vector& images, VkSampler sampler, VkImageView view, VkDescriptorSet set, u32 binding) { diff --git a/src/video_core/renderer_vulkan/present/util.h b/src/video_core/renderer_vulkan/present/util.h index 1104aaa15..5b22f0fa8 100755 --- a/src/video_core/renderer_vulkan/present/util.h +++ b/src/video_core/renderer_vulkan/present/util.h @@ -42,8 +42,13 @@ vk::PipelineLayout CreateWrappedPipelineLayout(const Device& device, vk::DescriptorSetLayout& layout); vk::Pipeline CreateWrappedPipeline(const Device& device, vk::RenderPass& renderpass, vk::PipelineLayout& layout, - std::tuple shaders, - bool enable_blending = false); + std::tuple shaders); +vk::Pipeline CreateWrappedPremultipliedBlendingPipeline( + const Device& device, vk::RenderPass& renderpass, vk::PipelineLayout& layout, + std::tuple shaders); +vk::Pipeline CreateWrappedCoverageBlendingPipeline( + const Device& device, vk::RenderPass& renderpass, vk::PipelineLayout& layout, + std::tuple shaders); VkWriteDescriptorSet CreateWriteDescriptorSet(std::vector& images, VkSampler sampler, VkImageView view, VkDescriptorSet set, u32 binding); diff --git a/src/video_core/renderer_vulkan/present/window_adapt_pass.cpp b/src/video_core/renderer_vulkan/present/window_adapt_pass.cpp index c5db0230d..22ffacf11 100755 --- a/src/video_core/renderer_vulkan/present/window_adapt_pass.cpp +++ b/src/video_core/renderer_vulkan/present/window_adapt_pass.cpp @@ -22,7 +22,7 @@ WindowAdaptPass::WindowAdaptPass(const Device& device_, VkFormat frame_format, CreatePipelineLayout(); CreateVertexShader(); CreateRenderPass(frame_format); - CreatePipeline(); + CreatePipelines(); } WindowAdaptPass::~WindowAdaptPass() = default; @@ -34,7 +34,6 @@ void WindowAdaptPass::Draw(RasterizerVulkan& rasterizer, Scheduler& scheduler, s const VkFramebuffer host_framebuffer{*dst->framebuffer}; const VkRenderPass renderpass{*render_pass}; - const VkPipeline graphics_pipeline{*pipeline}; const VkPipelineLayout graphics_pipeline_layout{*pipeline_layout}; const VkExtent2D render_area{ .width = dst->width, @@ -44,9 +43,23 @@ void WindowAdaptPass::Draw(RasterizerVulkan& rasterizer, Scheduler& scheduler, s const size_t layer_count = configs.size(); std::vector push_constants(layer_count); std::vector descriptor_sets(layer_count); + std::vector graphics_pipelines(layer_count); auto layer_it = layers.begin(); for (size_t i = 0; i < layer_count; i++) { + switch (configs[i].blending) { + case Tegra::BlendMode::Opaque: + default: + graphics_pipelines[i] = *opaque_pipeline; + break; + case Tegra::BlendMode::Premultiplied: + graphics_pipelines[i] = *premultiplied_pipeline; + break; + case Tegra::BlendMode::Coverage: + graphics_pipelines[i] = *coverage_pipeline; + break; + } + layer_it->ConfigureDraw(&push_constants[i], &descriptor_sets[i], rasterizer, *sampler, image_index, configs[i], layout); layer_it++; @@ -77,8 +90,8 @@ void WindowAdaptPass::Draw(RasterizerVulkan& rasterizer, Scheduler& scheduler, s BeginRenderPass(cmdbuf, renderpass, host_framebuffer, render_area); cmdbuf.ClearAttachments({clear_attachment}, {clear_rect}); - cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, graphics_pipeline); for (size_t i = 0; i < layer_count; i++) { + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, graphics_pipelines[i]); 
cmdbuf.PushConstants(graphics_pipeline_layout, VK_SHADER_STAGE_VERTEX_BIT, push_constants[i]); cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, graphics_pipeline_layout, 0, @@ -129,9 +142,13 @@ void WindowAdaptPass::CreateRenderPass(VkFormat frame_format) { render_pass = CreateWrappedRenderPass(device, frame_format, VK_IMAGE_LAYOUT_UNDEFINED); } -void WindowAdaptPass::CreatePipeline() { - pipeline = CreateWrappedPipeline(device, render_pass, pipeline_layout, - std::tie(vertex_shader, fragment_shader), false); +void WindowAdaptPass::CreatePipelines() { + opaque_pipeline = CreateWrappedPipeline(device, render_pass, pipeline_layout, + std::tie(vertex_shader, fragment_shader)); + premultiplied_pipeline = CreateWrappedPremultipliedBlendingPipeline( + device, render_pass, pipeline_layout, std::tie(vertex_shader, fragment_shader)); + coverage_pipeline = CreateWrappedCoverageBlendingPipeline( + device, render_pass, pipeline_layout, std::tie(vertex_shader, fragment_shader)); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/present/window_adapt_pass.h b/src/video_core/renderer_vulkan/present/window_adapt_pass.h index 0e2edfc31..cf667a4fc 100755 --- a/src/video_core/renderer_vulkan/present/window_adapt_pass.h +++ b/src/video_core/renderer_vulkan/present/window_adapt_pass.h @@ -42,7 +42,7 @@ private: void CreatePipelineLayout(); void CreateVertexShader(); void CreateRenderPass(VkFormat frame_format); - void CreatePipeline(); + void CreatePipelines(); private: const Device& device; @@ -52,7 +52,9 @@ private: vk::ShaderModule vertex_shader; vk::ShaderModule fragment_shader; vk::RenderPass render_pass; - vk::Pipeline pipeline; + vk::Pipeline opaque_pipeline; + vk::Pipeline premultiplied_pipeline; + vk::Pipeline coverage_pipeline; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 81e305bd3..111f5df3c 100755 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -19,7 +19,9 @@ #include "core/core_timing.h" #include "core/frontend/graphics_context.h" #include "core/telemetry_session.h" +#include "video_core/capture.h" #include "video_core/gpu.h" +#include "video_core/present.h" #include "video_core/renderer_vulkan/present/util.h" #include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/vk_blit_screen.h" @@ -38,6 +40,20 @@ namespace Vulkan { namespace { + +constexpr VkExtent2D CaptureImageSize{ + .width = VideoCore::Capture::LinearWidth, + .height = VideoCore::Capture::LinearHeight, +}; + +constexpr VkExtent3D CaptureImageExtent{ + .width = VideoCore::Capture::LinearWidth, + .height = VideoCore::Capture::LinearHeight, + .depth = VideoCore::Capture::LinearDepth, +}; + +constexpr VkFormat CaptureFormat = VK_FORMAT_A8B8G8R8_UNORM_PACK32; + std::string GetReadableVersion(u32 version) { return fmt::format("{}.{}.{}", VK_VERSION_MAJOR(version), VK_VERSION_MINOR(version), VK_VERSION_PATCH(version)); @@ -99,10 +115,15 @@ RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_, render_window.GetFramebufferLayout().height), present_manager(instance, render_window, device, memory_allocator, scheduler, swapchain, surface), - blit_swapchain(device_memory, device, memory_allocator, present_manager, scheduler), - blit_screenshot(device_memory, device, memory_allocator, present_manager, scheduler), + blit_swapchain(device_memory, device, memory_allocator, present_manager, scheduler, + 
PresentFiltersForDisplay), + blit_capture(device_memory, device, memory_allocator, present_manager, scheduler, + PresentFiltersForDisplay), + blit_applet(device_memory, device, memory_allocator, present_manager, scheduler, + PresentFiltersForAppletCapture), rasterizer(render_window, gpu, device_memory, device, memory_allocator, state_tracker, - scheduler) { + scheduler), + applet_frame() { if (Settings::values.renderer_force_max_clock.GetValue() && device.ShouldBoostClocks()) { turbo_mode.emplace(instance, dld); scheduler.RegisterOnSubmit([this] { turbo_mode->QueueSubmitted(); }); @@ -125,6 +146,8 @@ void RendererVulkan::Composite(std::span framebu SCOPE_EXIT({ render_window.OnFrameDisplayed(); }); + RenderAppletCaptureLayer(framebuffers); + if (!render_window.IsShown()) { return; } @@ -167,30 +190,20 @@ void RendererVulkan::Report() const { telemetry_session.AddField(field, "GPU_Vulkan_Extensions", extensions); } -void Vulkan::RendererVulkan::RenderScreenshot( - std::span framebuffers) { - if (!renderer_settings.screenshot_requested) { - return; - } - - constexpr VkFormat ScreenshotFormat{VK_FORMAT_B8G8R8A8_UNORM}; - const Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; - +vk::Buffer RendererVulkan::RenderToBuffer(std::span framebuffers, + const Layout::FramebufferLayout& layout, VkFormat format, + VkDeviceSize buffer_size) { auto frame = [&]() { Frame f{}; - f.image = CreateWrappedImage(memory_allocator, VkExtent2D{layout.width, layout.height}, - ScreenshotFormat); - f.image_view = CreateWrappedImageView(device, f.image, ScreenshotFormat); - f.framebuffer = blit_screenshot.CreateFramebuffer(layout, *f.image_view, ScreenshotFormat); + f.image = + CreateWrappedImage(memory_allocator, VkExtent2D{layout.width, layout.height}, format); + f.image_view = CreateWrappedImageView(device, f.image, format); + f.framebuffer = blit_capture.CreateFramebuffer(layout, *f.image_view, format); return f; }(); - blit_screenshot.DrawToFrame(rasterizer, &frame, framebuffers, layout, 1, - VK_FORMAT_B8G8R8A8_UNORM); - - const auto dst_buffer = CreateWrappedBuffer( - memory_allocator, static_cast(layout.width * layout.height * 4), - MemoryUsage::Download); + auto dst_buffer = CreateWrappedBuffer(memory_allocator, buffer_size, MemoryUsage::Download); + blit_capture.DrawToFrame(rasterizer, &frame, framebuffers, layout, 1, format); scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([&](vk::CommandBuffer cmdbuf) { @@ -198,15 +211,68 @@ void Vulkan::RendererVulkan::RenderScreenshot( VkExtent3D{layout.width, layout.height, 1}); }); - // Ensure the copy is fully completed before saving the screenshot + // Ensure the copy is fully completed before saving the capture scheduler.Finish(); - // Copy backing image data to the QImage screenshot buffer + // Copy backing image data to the capture buffer dst_buffer.Invalidate(); + return dst_buffer; +} + +void RendererVulkan::RenderScreenshot(std::span framebuffers) { + if (!renderer_settings.screenshot_requested) { + return; + } + + const auto& layout{renderer_settings.screenshot_framebuffer_layout}; + const auto dst_buffer = RenderToBuffer(framebuffers, layout, VK_FORMAT_B8G8R8A8_UNORM, + layout.width * layout.height * 4); + std::memcpy(renderer_settings.screenshot_bits, dst_buffer.Mapped().data(), dst_buffer.Mapped().size()); renderer_settings.screenshot_complete_callback(false); renderer_settings.screenshot_requested = false; } +std::vector RendererVulkan::GetAppletCaptureBuffer() { + using namespace VideoCore::Capture; + 
+ std::vector out(VideoCore::Capture::TiledSize); + + if (!applet_frame.image) { + return out; + } + + const auto dst_buffer = + CreateWrappedBuffer(memory_allocator, VideoCore::Capture::TiledSize, MemoryUsage::Download); + + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([&](vk::CommandBuffer cmdbuf) { + DownloadColorImage(cmdbuf, *applet_frame.image, *dst_buffer, CaptureImageExtent); + }); + + // Ensure the copy is fully completed before writing the capture + scheduler.Finish(); + + // Swizzle image data to the capture buffer + dst_buffer.Invalidate(); + Tegra::Texture::SwizzleTexture(out, dst_buffer.Mapped(), BytesPerPixel, LinearWidth, + LinearHeight, LinearDepth, BlockHeight, BlockDepth); + + return out; +} + +void RendererVulkan::RenderAppletCaptureLayer( + std::span framebuffers) { + if (!applet_frame.image) { + applet_frame.image = CreateWrappedImage(memory_allocator, CaptureImageSize, CaptureFormat); + applet_frame.image_view = CreateWrappedImageView(device, applet_frame.image, CaptureFormat); + applet_frame.framebuffer = blit_applet.CreateFramebuffer( + VideoCore::Capture::Layout, *applet_frame.image_view, CaptureFormat); + } + + blit_applet.DrawToFrame(rasterizer, &applet_frame, framebuffers, VideoCore::Capture::Layout, 1, + CaptureFormat); +} + } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index 505f9ddf6..81f97bb59 100755 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -48,6 +48,8 @@ public: void Composite(std::span framebuffers) override; + std::vector GetAppletCaptureBuffer() override; + VideoCore::RasterizerInterface* ReadRasterizer() override { return &rasterizer; } @@ -59,7 +61,11 @@ public: private: void Report() const; + vk::Buffer RenderToBuffer(std::span framebuffers, + const Layout::FramebufferLayout& layout, VkFormat format, + VkDeviceSize buffer_size); void RenderScreenshot(std::span framebuffers); + void RenderAppletCaptureLayer(std::span framebuffers); Core::TelemetrySession& telemetry_session; Tegra::MaxwellDeviceMemoryManager& device_memory; @@ -79,9 +85,12 @@ private: Swapchain swapchain; PresentManager present_manager; BlitScreen blit_swapchain; - BlitScreen blit_screenshot; + BlitScreen blit_capture; + BlitScreen blit_applet; RasterizerVulkan rasterizer; std::optional turbo_mode; + + Frame applet_frame; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index 14699f2b3..5118207bb 100755 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "video_core/framebuffer_config.h" +#include "video_core/present.h" #include "video_core/renderer_vulkan/present/filters.h" #include "video_core/renderer_vulkan/present/layer.h" #include "video_core/renderer_vulkan/vk_blit_screen.h" @@ -12,9 +13,9 @@ namespace Vulkan { BlitScreen::BlitScreen(Tegra::MaxwellDeviceMemoryManager& device_memory_, const Device& device_, MemoryAllocator& memory_allocator_, PresentManager& present_manager_, - Scheduler& scheduler_) + Scheduler& scheduler_, const PresentFilters& filters_) : device_memory{device_memory_}, device{device_}, memory_allocator{memory_allocator_}, - present_manager{present_manager_}, scheduler{scheduler_}, image_count{1}, + present_manager{present_manager_}, scheduler{scheduler_}, 
filters{filters_}, image_count{1}, swapchain_view_format{VK_FORMAT_B8G8R8A8_UNORM} {} BlitScreen::~BlitScreen() = default; @@ -27,7 +28,7 @@ void BlitScreen::WaitIdle() { void BlitScreen::SetWindowAdaptPass() { layers.clear(); - scaling_filter = Settings::values.scaling_filter.GetValue(); + scaling_filter = filters.get_scaling_filter(); switch (scaling_filter) { case Settings::ScalingFilter::NearestNeighbor: @@ -59,7 +60,7 @@ void BlitScreen::DrawToFrame(RasterizerVulkan& rasterizer, Frame* frame, bool presentation_recreate_required = false; // Recreate dynamic resources if the adapting filter changed - if (!window_adapt || scaling_filter != Settings::values.scaling_filter.GetValue()) { + if (!window_adapt || scaling_filter != filters.get_scaling_filter()) { resource_update_required = true; } @@ -102,7 +103,7 @@ void BlitScreen::DrawToFrame(RasterizerVulkan& rasterizer, Frame* frame, while (layers.size() < framebuffers.size()) { layers.emplace_back(device, memory_allocator, scheduler, device_memory, image_count, - window_size, window_adapt->GetDescriptorSetLayout()); + window_size, window_adapt->GetDescriptorSetLayout(), filters); } // Perform the draw @@ -119,8 +120,7 @@ vk::Framebuffer BlitScreen::CreateFramebuffer(const Layout::FramebufferLayout& l VkFormat current_view_format) { const bool format_updated = std::exchange(swapchain_view_format, current_view_format) != current_view_format; - if (!window_adapt || scaling_filter != Settings::values.scaling_filter.GetValue() || - format_updated) { + if (!window_adapt || scaling_filter != filters.get_scaling_filter() || format_updated) { WaitIdle(); SetWindowAdaptPass(); } diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h index 6ac55daa1..2543da1a1 100755 --- a/src/video_core/renderer_vulkan/vk_blit_screen.h +++ b/src/video_core/renderer_vulkan/vk_blit_screen.h @@ -16,6 +16,8 @@ namespace Core { class System; } +struct PresentFilters; + namespace Tegra { struct FramebufferConfig; } @@ -47,7 +49,7 @@ class BlitScreen { public: explicit BlitScreen(Tegra::MaxwellDeviceMemoryManager& device_memory, const Device& device, MemoryAllocator& memory_allocator, PresentManager& present_manager, - Scheduler& scheduler); + Scheduler& scheduler, const PresentFilters& filters); ~BlitScreen(); void DrawToFrame(RasterizerVulkan& rasterizer, Frame* frame, @@ -70,6 +72,7 @@ private: MemoryAllocator& memory_allocator; PresentManager& present_manager; Scheduler& scheduler; + const PresentFilters& filters; std::size_t image_count{}; std::size_t image_index{}; VkFormat swapchain_view_format{}; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index dd9496164..e5e90c069 100755 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -381,8 +381,9 @@ PipelineCache::PipelineCache(Tegra::MaxwellDeviceMemoryManager& device_memory_, .support_float64 = device.IsFloat64Supported(), .support_float16 = device.IsFloat16Supported(), .support_int64 = device.IsShaderInt64Supported(), - .needs_demote_reorder = - driver_id == VK_DRIVER_ID_AMD_PROPRIETARY || driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE, + .needs_demote_reorder = driver_id == VK_DRIVER_ID_AMD_PROPRIETARY || + driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE || + driver_id == VK_DRIVER_ID_SAMSUNG_PROPRIETARY, .support_snorm_render_buffer = true, .support_viewport_index_layer = device.IsExtShaderViewportIndexLayerSupported(), 
.min_ssbo_alignment = static_cast(device.GetStorageBufferAlignment()),
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index c0a2597fa..85046e708 100755
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -746,7 +746,13 @@ std::pair TextureCache
::TryFindFramebufferImage }(); const auto GetImageViewForFramebuffer = [&](ImageId image_id) { - const ImageViewInfo info{ImageViewType::e2D, view_format}; + ImageViewInfo info{ImageViewType::e2D, view_format}; + if (config.blending == Tegra::BlendMode::Opaque) { + info.x_source = static_cast(SwizzleSource::R); + info.y_source = static_cast(SwizzleSource::G); + info.z_source = static_cast(SwizzleSource::B); + info.w_source = static_cast(SwizzleSource::OneFloat); + } return std::make_pair(&slot_image_views[FindOrEmplaceImageView(image_id, info)], slot_images[image_id].IsRescaled()); }; diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 0a494af27..bd77057b0 100755 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -868,6 +868,8 @@ std::string Device::GetDriverName() const { return "Qualcomm"; case VK_DRIVER_ID_ARM_PROPRIETARY: return "Mali"; + case VK_DRIVER_ID_SAMSUNG_PROPRIETARY: + return "Xclipse"; case VK_DRIVER_ID_GOOGLE_SWIFTSHADER: return "SwiftShader"; case VK_DRIVER_ID_BROADCOM_PROPRIETARY: diff --git a/src/yuzu/CMakeLists.txt b/src/yuzu/CMakeLists.txt index ff712d5fd..7c092061e 100755 --- a/src/yuzu/CMakeLists.txt +++ b/src/yuzu/CMakeLists.txt @@ -41,6 +41,9 @@ add_executable(yuzu configuration/configuration_shared.cpp configuration/configuration_shared.h configuration/configure.ui + configuration/configure_applets.cpp + configuration/configure_applets.h + configuration/configure_applets.ui configuration/configure_audio.cpp configuration/configure_audio.h configuration/configure_audio.ui diff --git a/src/yuzu/configuration/configure_applets.cpp b/src/yuzu/configuration/configure_applets.cpp new file mode 100755 index 000000000..513ecb548 --- /dev/null +++ b/src/yuzu/configuration/configure_applets.cpp @@ -0,0 +1,86 @@ +// SPDX-FileCopyrightText: 2024 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "common/settings.h" +#include "core/core.h" +#include "ui_configure_applets.h" +#include "yuzu/configuration/configuration_shared.h" +#include "yuzu/configuration/configure_applets.h" +#include "yuzu/configuration/shared_widget.h" + +ConfigureApplets::ConfigureApplets(Core::System& system_, + std::shared_ptr> group_, + const ConfigurationShared::Builder& builder, QWidget* parent) + : Tab(group_, parent), ui{std::make_unique()}, system{system_} { + ui->setupUi(this); + + Setup(builder); + + SetConfiguration(); +} + +ConfigureApplets::~ConfigureApplets() = default; + +void ConfigureApplets::changeEvent(QEvent* event) { + if (event->type() == QEvent::LanguageChange) { + RetranslateUI(); + } + + QWidget::changeEvent(event); +} + +void ConfigureApplets::RetranslateUI() { + ui->retranslateUi(this); +} + +void ConfigureApplets::Setup(const ConfigurationShared::Builder& builder) { + auto& library_applets_layout = *ui->group_library_applet_modes->layout(); + std::map applets_hold{}; + + std::vector settings; + auto push = [&settings](auto& list) { + for (auto setting : list) { + settings.push_back(setting); + } + }; + + push(Settings::values.linkage.by_category[Settings::Category::LibraryApplet]); + + for (auto setting : settings) { + ConfigurationShared::Widget* widget = builder.BuildWidget(setting, apply_funcs); + + if (widget == nullptr) { + continue; + } + if (!widget->Valid()) { + widget->deleteLater(); + continue; + } + + // Untested applets + if (setting->Id() == Settings::values.data_erase_applet_mode.Id() || + setting->Id() == 
Settings::values.error_applet_mode.Id() || + setting->Id() == Settings::values.net_connect_applet_mode.Id() || + setting->Id() == Settings::values.web_applet_mode.Id() || + setting->Id() == Settings::values.shop_applet_mode.Id() || + setting->Id() == Settings::values.login_share_applet_mode.Id() || + setting->Id() == Settings::values.wifi_web_auth_applet_mode.Id() || + setting->Id() == Settings::values.my_page_applet_mode.Id()) { + widget->setHidden(true); + } + + applets_hold.emplace(setting->Id(), widget); + } + for (const auto& [label, widget] : applets_hold) { + library_applets_layout.addWidget(widget); + } +} + +void ConfigureApplets::SetConfiguration() {} + +void ConfigureApplets::ApplyConfiguration() { + const bool powered_on = system.IsPoweredOn(); + for (const auto& func : apply_funcs) { + func(powered_on); + } +} diff --git a/src/yuzu/configuration/configure_applets.h b/src/yuzu/configuration/configure_applets.h new file mode 100755 index 000000000..54f494d2f --- /dev/null +++ b/src/yuzu/configuration/configure_applets.h @@ -0,0 +1,48 @@ +// SPDX-FileCopyrightText: 2024 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include "yuzu/configuration/configuration_shared.h" + +class QCheckBox; +class QLineEdit; +class QComboBox; +class QDateTimeEdit; +namespace Core { +class System; +} + +namespace Ui { +class ConfigureApplets; +} + +namespace ConfigurationShared { +class Builder; +} + +class ConfigureApplets : public ConfigurationShared::Tab { +public: + explicit ConfigureApplets(Core::System& system_, + std::shared_ptr> group, + const ConfigurationShared::Builder& builder, + QWidget* parent = nullptr); + ~ConfigureApplets() override; + + void ApplyConfiguration() override; + void SetConfiguration() override; + +private: + void changeEvent(QEvent* event) override; + void RetranslateUI(); + + void Setup(const ConfigurationShared::Builder& builder); + + std::vector> apply_funcs{}; + + std::unique_ptr ui; + bool enabled = false; + + Core::System& system; +}; diff --git a/src/yuzu/configuration/configure_applets.ui b/src/yuzu/configuration/configure_applets.ui new file mode 100755 index 000000000..6f2ca66bd --- /dev/null +++ b/src/yuzu/configuration/configure_applets.ui @@ -0,0 +1,65 @@ + + + ConfigureApplets + + + + 0 + 0 + 605 + 300 + + + + Form + + + Applets + + + + + + + + Applet mode preference + + + + + + + 0 + + + 0 + + + 0 + + + + + + + + + + + + + Qt::Vertical + + + + 20 + 40 + + + + + + + + + diff --git a/src/yuzu/configuration/configure_dialog.cpp b/src/yuzu/configuration/configure_dialog.cpp index 14e3580a2..e9c92a0b9 100755 --- a/src/yuzu/configuration/configure_dialog.cpp +++ b/src/yuzu/configuration/configure_dialog.cpp @@ -8,6 +8,7 @@ #include "core/core.h" #include "ui_configure.h" #include "vk_device_info.h" +#include "yuzu/configuration/configure_applets.h" #include "yuzu/configuration/configure_audio.h" #include "yuzu/configuration/configure_cpu.h" #include "yuzu/configuration/configure_debug_tab.h" @@ -34,6 +35,7 @@ ConfigureDialog::ConfigureDialog(QWidget* parent, HotkeyRegistry& registry_, : QDialog(parent), ui{std::make_unique()}, registry(registry_), system{system_}, builder{std::make_unique( this, !system_.IsPoweredOn())}, + applets_tab{std::make_unique(system_, nullptr, *builder, this)}, audio_tab{std::make_unique(system_, nullptr, *builder, this)}, cpu_tab{std::make_unique(system_, nullptr, *builder, this)}, debug_tab_tab{std::make_unique(system_, this)}, @@ -58,6 +60,7 @@ 
ConfigureDialog::ConfigureDialog(QWidget* parent, HotkeyRegistry& registry_, ui->setupUi(this); + ui->tabWidget->addTab(applets_tab.get(), tr("Applets")); ui->tabWidget->addTab(audio_tab.get(), tr("Audio")); ui->tabWidget->addTab(cpu_tab.get(), tr("CPU")); ui->tabWidget->addTab(debug_tab_tab.get(), tr("Debug")); @@ -124,6 +127,7 @@ void ConfigureDialog::ApplyConfiguration() { debug_tab_tab->ApplyConfiguration(); web_tab->ApplyConfiguration(); network_tab->ApplyConfiguration(); + applets_tab->ApplyConfiguration(); system.ApplySettings(); Settings::LogSettings(); } @@ -161,7 +165,8 @@ void ConfigureDialog::PopulateSelectionList() { {{tr("General"), {general_tab.get(), hotkeys_tab.get(), ui_tab.get(), web_tab.get(), debug_tab_tab.get()}}, {tr("System"), - {system_tab.get(), profile_tab.get(), network_tab.get(), filesystem_tab.get()}}, + {system_tab.get(), profile_tab.get(), network_tab.get(), filesystem_tab.get(), + applets_tab.get()}}, {tr("CPU"), {cpu_tab.get()}}, {tr("Graphics"), {graphics_tab.get(), graphics_advanced_tab.get()}}, {tr("Audio"), {audio_tab.get()}}, diff --git a/src/yuzu/configuration/configure_dialog.h b/src/yuzu/configuration/configure_dialog.h index 8f56c9bca..1c507d5c2 100755 --- a/src/yuzu/configuration/configure_dialog.h +++ b/src/yuzu/configuration/configure_dialog.h @@ -15,6 +15,7 @@ namespace Core { class System; } +class ConfigureApplets; class ConfigureAudio; class ConfigureCpu; class ConfigureDebugTab; @@ -75,6 +76,7 @@ private: std::unique_ptr builder; std::vector tab_group; + std::unique_ptr applets_tab; std::unique_ptr audio_tab; std::unique_ptr cpu_tab; std::unique_ptr debug_tab_tab; diff --git a/src/yuzu/configuration/shared_translation.cpp b/src/yuzu/configuration/shared_translation.cpp index ed9c7d859..ce65b2bf1 100755 --- a/src/yuzu/configuration/shared_translation.cpp +++ b/src/yuzu/configuration/shared_translation.cpp @@ -26,6 +26,23 @@ std::unique_ptr InitializeTranslations(QWidget* parent) { // A setting can be ignored by giving it a blank name + // Applets + INSERT(Settings, cabinet_applet_mode, tr("Amiibo editor"), QStringLiteral()); + INSERT(Settings, controller_applet_mode, tr("Controller configuration"), QStringLiteral()); + INSERT(Settings, data_erase_applet_mode, tr("Data erase"), QStringLiteral()); + INSERT(Settings, error_applet_mode, tr("Error"), QStringLiteral()); + INSERT(Settings, net_connect_applet_mode, tr("Net connect"), QStringLiteral()); + INSERT(Settings, player_select_applet_mode, tr("Player select"), QStringLiteral()); + INSERT(Settings, swkbd_applet_mode, tr("Software keyboard"), QStringLiteral()); + INSERT(Settings, mii_edit_applet_mode, tr("Mii Edit"), QStringLiteral()); + INSERT(Settings, web_applet_mode, tr("Online web"), QStringLiteral()); + INSERT(Settings, shop_applet_mode, tr("Shop"), QStringLiteral()); + INSERT(Settings, photo_viewer_applet_mode, tr("Photo viewer"), QStringLiteral()); + INSERT(Settings, offline_web_applet_mode, tr("Offline web"), QStringLiteral()); + INSERT(Settings, login_share_applet_mode, tr("Login share"), QStringLiteral()); + INSERT(Settings, wifi_web_auth_applet_mode, tr("Wifi web auth"), QStringLiteral()); + INSERT(Settings, my_page_applet_mode, tr("My page"), QStringLiteral()); + // Audio INSERT(Settings, sink_id, tr("Output Engine:"), QStringLiteral()); INSERT(Settings, audio_output_device_id, tr("Output Device:"), QStringLiteral()); @@ -203,6 +220,11 @@ std::unique_ptr ComboboxEnumeration(QWidget* parent) { #define PAIR(ENUM, VALUE, TRANSLATION) {static_cast(Settings::ENUM::VALUE), 
(TRANSLATION)}
// Intentionally skipping VSyncMode to let the UI fill that one out
+ translations->insert({Settings::EnumMetadata::Index(),
+ {
+ PAIR(AppletMode, HLE, tr("Custom frontend")),
+ PAIR(AppletMode, LLE, tr("Real applet")),
+ }});
translations->insert({Settings::EnumMetadata::Index(),
{