From 660484e64b61db4f7ed22a1678417a48cd737b0c Mon Sep 17 00:00:00 2001
From: bnnm <bananaman255@gmail.com>
Date: Wed, 22 Jan 2025 01:43:38 +0100
Subject: [PATCH 01/17] api: tweak some internals for float decoders

---
 src/base/api_decode_play.c | 21 +++++++++++++--------
 src/base/decode.c          | 16 +++++++---------
 src/base/decode.h          |  2 +-
 src/base/info.c            | 16 ++++++++++++++++
 src/base/mixing.c          |  8 ++++++--
 src/base/render.c          |  7 +++----
 src/base/sbuf.c            |  9 +++++++--
 src/layout/blocked.c       | 15 +++++++--------
 src/layout/flat.c          | 15 +++++++--------
 src/layout/interleave.c    | 17 ++++++++---------
 src/layout/layout.h        |  6 +++---
 11 files changed, 78 insertions(+), 54 deletions(-)

diff --git a/src/base/api_decode_play.c b/src/base/api_decode_play.c
index bc1dad7c..31b7173b 100644
--- a/src/base/api_decode_play.c
+++ b/src/base/api_decode_play.c
@@ -1,5 +1,6 @@
 #include "api_internal.h"
 #include "mixing.h"
+#include "render.h"
 
 #if LIBVGMSTREAM_ENABLE
 
@@ -17,24 +18,24 @@ static bool reset_buf(libvgmstream_priv_t* priv) {
     int input_channels = 0, output_channels = 0;
     vgmstream_mixing_enable(priv->vgmstream, 0, &input_channels, &output_channels); //query
 
-    int min_channels = input_channels;
-    if (min_channels < output_channels)
-        min_channels = output_channels;
+    int max_channels = input_channels;
+    if (max_channels < output_channels)
+        max_channels = output_channels;
 
     sfmt_t input_sfmt = mixing_get_input_sample_type(priv->vgmstream);
     sfmt_t output_sfmt = mixing_get_output_sample_type(priv->vgmstream);
     int input_sample_size = sfmt_get_sample_size(input_sfmt);
     int output_sample_size = sfmt_get_sample_size(output_sfmt);
 
-    int min_sample_size = input_sample_size;
-    if (min_sample_size < output_sample_size)
-        min_sample_size = output_sample_size;
+    int max_sample_size = input_sample_size;
+    if (max_sample_size < output_sample_size)
+        max_sample_size = output_sample_size;
 
     priv->buf.max_samples = INTERNAL_BUF_SAMPLES;
     priv->buf.sample_size = output_sample_size;
     priv->buf.channels = output_channels;
 
-    int max_bytes = priv->buf.max_samples * min_sample_size * min_channels;
+    int max_bytes = priv->buf.max_samples * max_sample_size * max_channels;
     priv->buf.data = malloc(max_bytes);
     if (!priv->buf.data) return false;
 
@@ -79,7 +80,11 @@ LIBVGMSTREAM_API int libvgmstream_render(libvgmstream_t* lib) {
     if (!priv->pos.play_forever && to_get + priv->pos.current > priv->pos.play_samples)
         to_get = priv->pos.play_samples - priv->pos.current;
 
-    int decoded = render_vgmstream(priv->buf.data, to_get, priv->vgmstream);
+    sbuf_t ssrc;
+    sfmt_t sfmt = mixing_get_input_sample_type(priv->vgmstream);
+    sbuf_init(&ssrc, sfmt, priv->buf.data, to_get, priv->vgmstream->channels);
+
+    int decoded = render_main(&ssrc, priv->vgmstream);
     update_buf(priv, decoded);
     update_decoder_info(priv, decoded);
 
diff --git a/src/base/decode.c b/src/base/decode.c
index 0e914e59..157fec3c 100644
--- a/src/base/decode.c
+++ b/src/base/decode.c
@@ -935,10 +935,12 @@ decode_fail:
  * buffer already, and we have samples_to_do consecutive samples ahead of us (won't call
  * more than one frame if configured above to do so).
  * Called by layouts since they handle samples written/to_do */
-void decode_vgmstream(VGMSTREAM* vgmstream, int samples_filled, int samples_to_do, sample_t* buffer) {
+void decode_vgmstream(sbuf_t* sdst, VGMSTREAM* vgmstream, int samples_to_do) {
     int ch;
 
-    buffer += samples_filled * vgmstream->channels; /* passed externally to simplify I guess */
+    //TODO: this cast isn't correct for float sbuf-decoders but shouldn't be used/matter (for buffer+ch below)
+    int16_t* buffer = sdst->buf;
+    buffer += sdst->filled * vgmstream->channels; // passed externally to decoders to simplify I guess
     //samples_to_do -= samples_filled; /* pre-adjusted */
 
     switch (vgmstream->coding_type) {
@@ -1673,14 +1675,10 @@ void decode_vgmstream(VGMSTREAM* vgmstream, int samples_filled, int samples_to_d
             break;
 
         default: {
-            sbuf_t sbuf_tmp = {0};
-            sbuf_t* sbuf = &sbuf_tmp;
+            sbuf_t stmp = *sdst;
+            stmp.samples = stmp.filled + samples_to_do; //TODO improve 
 
-            // buffers already adjusted
-            sbuf_init_s16(sbuf, buffer, /*samples_filled +*/ samples_to_do, vgmstream->channels);
-            sbuf->filled = 0; // samples_filled;
-
-            decode_frames(sbuf, vgmstream);
+            decode_frames(&stmp, vgmstream);
             break;
         }
     }
diff --git a/src/base/decode.h b/src/base/decode.h
index 4556b272..deef7187 100644
--- a/src/base/decode.h
+++ b/src/base/decode.h
@@ -10,7 +10,7 @@ void decode_reset(VGMSTREAM* vgmstream);
 
 /* Decode samples into the buffer. Assume that we have written samples_filled into the
  * buffer already, and we have samples_to_do consecutive samples ahead of us. */
-void decode_vgmstream(VGMSTREAM* vgmstream, int samples_filled, int samples_to_do, sample_t* buffer);
+void decode_vgmstream(sbuf_t* sdst, VGMSTREAM* vgmstream, int samples_to_do);
 
 /* Detect loop start and save values, or detect loop end and restore (loop back). Returns true if loop was done. */
 bool decode_do_loop(VGMSTREAM* vgmstream);
diff --git a/src/base/info.c b/src/base/info.c
index b1b94b17..f6610ed6 100644
--- a/src/base/info.c
+++ b/src/base/info.c
@@ -171,6 +171,21 @@ void describe_vgmstream(VGMSTREAM* vgmstream, char* desc, int length) {
         concatn(length,desc,temp);
     }
 
+    sfmt_t sfmt = mixing_get_input_sample_type(vgmstream);
+    if (sfmt != SFMT_S16) {
+        const char* sfmt_desc;
+        switch(sfmt) {
+            case SFMT_FLT: sfmt_desc = "float"; break;
+            case SFMT_F32: sfmt_desc = "float32"; break;
+            case SFMT_S16: sfmt_desc = "pcm16"; break;
+            default: sfmt_desc = "???";
+        }
+
+        snprintf(temp,TEMPSIZE, "sample type: %s\n", sfmt_desc);
+        concatn(length,desc,temp);
+    }
+
+
     if (vgmstream->config_enabled) {
         int32_t samples = vgmstream->pstate.play_duration;
 
@@ -178,6 +193,7 @@ void describe_vgmstream(VGMSTREAM* vgmstream, char* desc, int length) {
         snprintf(temp,TEMPSIZE, "play duration: %d samples (%1.0f:%06.3f seconds)\n", samples, time_mm, time_ss);
         concatn(length,desc,temp);
     }
+
 }
 
 void describe_vgmstream_info(VGMSTREAM* vgmstream, vgmstream_info* info) {
diff --git a/src/base/mixing.c b/src/base/mixing.c
index 3db330d2..fd7e22ae 100644
--- a/src/base/mixing.c
+++ b/src/base/mixing.c
@@ -143,9 +143,13 @@ void mixing_info(VGMSTREAM* vgmstream, int* p_input_channels, int* p_output_chan
 }
 
 sfmt_t mixing_get_input_sample_type(VGMSTREAM* vgmstream) {
-    // TODO: check vgmstream
     // TODO: on layered/segments, detect biggest value and use that (ex. if one of the layers uses flt > flt)
-    return SFMT_S16;
+    switch(vgmstream->coding_type) {
+        case coding_KA1A:
+            return SFMT_FLT;
+        default:
+            return SFMT_S16;
+    }
 }
 
 sfmt_t mixing_get_output_sample_type(VGMSTREAM* vgmstream) {
diff --git a/src/base/render.c b/src/base/render.c
index de878d4f..bdabb6e1 100644
--- a/src/base/render.c
+++ b/src/base/render.c
@@ -73,7 +73,6 @@ void render_reset(VGMSTREAM* vgmstream) {
 }
 
 int render_layout(sbuf_t* sbuf, VGMSTREAM* vgmstream) {
-    void* buf = sbuf->buf;
     int sample_count = sbuf->samples;
 
     if (sample_count == 0)
@@ -90,10 +89,10 @@ int render_layout(sbuf_t* sbuf, VGMSTREAM* vgmstream) {
 
     switch (vgmstream->layout_type) {
         case layout_interleave:
-            render_vgmstream_interleave(buf, sample_count, vgmstream);
+            render_vgmstream_interleave(sbuf, vgmstream);
             break;
         case layout_none:
-            render_vgmstream_flat(buf, sample_count, vgmstream);
+            render_vgmstream_flat(sbuf, vgmstream);
             break;
         case layout_blocked_mxch:
         case layout_blocked_ast:
@@ -134,7 +133,7 @@ int render_layout(sbuf_t* sbuf, VGMSTREAM* vgmstream) {
         case layout_blocked_ubi_sce:
         case layout_blocked_tt_ad:
         case layout_blocked_vas:
-            render_vgmstream_blocked(buf, sample_count, vgmstream);
+            render_vgmstream_blocked(sbuf, vgmstream);
             break;
         case layout_segmented:
             render_vgmstream_segmented(sbuf, vgmstream);
diff --git a/src/base/sbuf.c b/src/base/sbuf.c
index 184c63e0..5f7538ce 100644
--- a/src/base/sbuf.c
+++ b/src/base/sbuf.c
@@ -112,8 +112,6 @@ void sbuf_copy_to_f32(float* dst, sbuf_t* sbuf) {
             }
             break;
         }
-
-        case SFMT_FLT:
         case SFMT_F32: {
             float* src = sbuf->buf;
             for (int s = 0; s < sbuf->filled * sbuf->channels; s++) {
@@ -121,6 +119,13 @@ void sbuf_copy_to_f32(float* dst, sbuf_t* sbuf) {
             }
             break;
         }
+        case SFMT_FLT: {
+            float* src = sbuf->buf;
+            for (int s = 0; s < sbuf->filled * sbuf->channels; s++) {
+                dst[s] = src[s] * 32768.0f;
+            }
+            break;
+        }
         default:
             break;
     }
diff --git a/src/layout/blocked.c b/src/layout/blocked.c
index ad3355ae..acea36ae 100644
--- a/src/layout/blocked.c
+++ b/src/layout/blocked.c
@@ -8,7 +8,7 @@
 /* Decodes samples for blocked streams.
  * Data is divided into headered blocks with a bunch of data. The layout calls external helper functions
  * when a block is decoded, and those must parse the new block and move offsets accordingly. */
-void render_vgmstream_blocked(sample_t* outbuf, int32_t sample_count, VGMSTREAM* vgmstream) {
+void render_vgmstream_blocked(sbuf_t* sdst, VGMSTREAM* vgmstream) {
 
     int frame_size = decode_get_frame_size(vgmstream);
     int samples_per_frame = decode_get_samples_per_frame(vgmstream);
@@ -25,8 +25,7 @@ void render_vgmstream_blocked(sample_t* outbuf, int32_t sample_count, VGMSTREAM*
         samples_this_block = vgmstream->current_block_size / frame_size * samples_per_frame;
     }
 
-    int samples_filled = 0;
-    while (samples_filled < sample_count) {
+    while (sdst->filled < sdst->samples) {
         int samples_to_do; 
 
         if (vgmstream->loop_flag && decode_do_loop(vgmstream)) {
@@ -54,15 +53,15 @@ void render_vgmstream_blocked(sample_t* outbuf, int32_t sample_count, VGMSTREAM*
         }
 
         samples_to_do = decode_get_samples_to_do(samples_this_block, samples_per_frame, vgmstream);
-        if (samples_to_do > sample_count - samples_filled)
-            samples_to_do = sample_count - samples_filled;
+        if (samples_to_do > sdst->samples - sdst->filled)
+            samples_to_do = sdst->samples - sdst->filled;
 
         if (samples_to_do > 0) {
             /* samples_this_block = 0 is allowed (empty block, do nothing then move to next block) */
-            decode_vgmstream(vgmstream, samples_filled, samples_to_do, outbuf);
+            decode_vgmstream(sdst, vgmstream, samples_to_do);
         }
 
-        samples_filled += samples_to_do;
+        sdst->filled += samples_to_do;
         vgmstream->current_sample += samples_to_do;
         vgmstream->samples_into_block += samples_to_do;
 
@@ -92,7 +91,7 @@ void render_vgmstream_blocked(sample_t* outbuf, int32_t sample_count, VGMSTREAM*
 
     return;
 decode_fail:
-    sbuf_silence_s16(outbuf, sample_count, vgmstream->channels, samples_filled);
+    sbuf_silence_rest(sdst);
 }
 
 /* helper functions to parse new block */
diff --git a/src/layout/flat.c b/src/layout/flat.c
index 5a58605c..83220af4 100644
--- a/src/layout/flat.c
+++ b/src/layout/flat.c
@@ -6,14 +6,13 @@
 
 /* Decodes samples for flat streams.
  * Data forms a single stream, and the decoder may internally skip chunks and move offsets as needed. */
-void render_vgmstream_flat(sample_t* outbuf, int32_t sample_count, VGMSTREAM* vgmstream) {
+void render_vgmstream_flat(sbuf_t* sdst, VGMSTREAM* vgmstream) {
 
     int samples_per_frame = decode_get_samples_per_frame(vgmstream);
     int samples_this_block = vgmstream->num_samples; /* do all samples if possible */
 
     /* write samples */
-    int samples_filled = 0;
-    while (samples_filled < sample_count) {
+    while (sdst->filled < sdst->samples) {
 
         if (vgmstream->loop_flag && decode_do_loop(vgmstream)) {
             /* handle looping */
@@ -21,22 +20,22 @@ void render_vgmstream_flat(sample_t* outbuf, int32_t sample_count, VGMSTREAM* vg
         }
 
         int samples_to_do = decode_get_samples_to_do(samples_this_block, samples_per_frame, vgmstream);
-        if (samples_to_do > sample_count - samples_filled)
-            samples_to_do = sample_count - samples_filled;
+        if (samples_to_do > sdst->samples - sdst->filled)
+            samples_to_do = sdst->samples - sdst->filled;
 
         if (samples_to_do <= 0) { /* when decoding more than num_samples */
             VGM_LOG_ONCE("FLAT: wrong samples_to_do\n"); 
             goto decode_fail;
         }
 
-        decode_vgmstream(vgmstream, samples_filled, samples_to_do, outbuf);
+        decode_vgmstream(sdst, vgmstream, samples_to_do);
 
-        samples_filled += samples_to_do;
+        sdst->filled += samples_to_do;
         vgmstream->current_sample += samples_to_do;
         vgmstream->samples_into_block += samples_to_do;
     }
 
     return;
 decode_fail:
-    sbuf_silence_s16(outbuf, sample_count, vgmstream->channels, samples_filled);
+    sbuf_silence_rest(sdst);
 }
diff --git a/src/layout/interleave.c b/src/layout/interleave.c
index 63bde81a..54d78aa9 100644
--- a/src/layout/interleave.c
+++ b/src/layout/interleave.c
@@ -143,11 +143,11 @@ static void update_offsets(layout_config_t* layout, VGMSTREAM* vgmstream, int* p
  * Data has interleaved chunks per channel, and once one is decoded the layout moves offsets,
  * skipping other chunks (essentially a simplified variety of blocked layout).
  * Incompatible with decoders that move offsets. */
-void render_vgmstream_interleave(sample_t* outbuf, int32_t sample_count, VGMSTREAM* vgmstream) {
+void render_vgmstream_interleave(sbuf_t* sdst, VGMSTREAM* vgmstream) {
     layout_config_t layout = {0};
     if (!setup_helper(&layout, vgmstream)) {
         VGM_LOG_ONCE("INTERLEAVE: wrong config found\n");
-        sbuf_silence_s16(outbuf, sample_count, vgmstream->channels, 0);
+        sbuf_silence_rest(sdst);
         return;
     }
 
@@ -160,8 +160,7 @@ void render_vgmstream_interleave(sample_t* outbuf, int32_t sample_count, VGMSTRE
     if (samples_this_block == 0 && vgmstream->channels == 1)
         samples_this_block = vgmstream->num_samples;
 
-    int samples_filled = 0;
-    while (samples_filled < sample_count) {
+    while (sdst->filled < sdst->samples) {
 
         if (vgmstream->loop_flag && decode_do_loop(vgmstream)) {
             /* handle looping, restore standard interleave sizes */
@@ -170,17 +169,17 @@ void render_vgmstream_interleave(sample_t* outbuf, int32_t sample_count, VGMSTRE
         }
 
         int samples_to_do = decode_get_samples_to_do(samples_this_block, samples_per_frame, vgmstream);
-        if (samples_to_do > sample_count - samples_filled)
-            samples_to_do = sample_count - samples_filled;
+        if (samples_to_do > sdst->samples - sdst->filled)
+            samples_to_do = sdst->samples - sdst->filled;
 
         if (samples_to_do <= 0) { /* happens when interleave is not set */
             VGM_LOG_ONCE("INTERLEAVE: wrong samples_to_do\n"); 
             goto decode_fail;
         }
 
-        decode_vgmstream(vgmstream, samples_filled, samples_to_do, outbuf);
+        decode_vgmstream(sdst, vgmstream, samples_to_do);
 
-        samples_filled += samples_to_do;
+        sdst->filled += samples_to_do;
         vgmstream->current_sample += samples_to_do;
         vgmstream->samples_into_block += samples_to_do;
 
@@ -193,5 +192,5 @@ void render_vgmstream_interleave(sample_t* outbuf, int32_t sample_count, VGMSTRE
 
     return;
 decode_fail:
-    sbuf_silence_s16(outbuf, sample_count, vgmstream->channels, samples_filled);
+    sbuf_silence_rest(sdst);
 }
diff --git a/src/layout/layout.h b/src/layout/layout.h
index 1c298c36..3c35ec6d 100644
--- a/src/layout/layout.h
+++ b/src/layout/layout.h
@@ -8,9 +8,9 @@
 #include "../base/sbuf.h"
 
 /* basic layouts */
-void render_vgmstream_flat(sample_t* buffer, int32_t sample_count, VGMSTREAM* vgmstream);
+void render_vgmstream_flat(sbuf_t* sbuf, VGMSTREAM* vgmstream);
 
-void render_vgmstream_interleave(sample_t* buffer, int32_t sample_count, VGMSTREAM* vgmstream);
+void render_vgmstream_interleave(sbuf_t* sbuf, VGMSTREAM* vgmstream);
 
 
 /* segmented layout */
@@ -56,7 +56,7 @@ void loop_layout_layered(VGMSTREAM* vgmstream, int32_t loop_sample);
 
 
 /* blocked layouts */
-void render_vgmstream_blocked(sample_t* buffer, int32_t sample_count, VGMSTREAM* vgmstream);
+void render_vgmstream_blocked(sbuf_t* sbuf, VGMSTREAM* vgmstream);
 void block_update(off_t block_offset, VGMSTREAM* vgmstream);
 
 void block_update_ast(off_t block_ofset, VGMSTREAM* vgmstream);

From 08971a05fbaf0b4f5bf7382e0bfc486d9d166bd2 Mon Sep 17 00:00:00 2001
From: bnnm <bananaman255@gmail.com>
Date: Wed, 22 Jan 2025 01:45:50 +0100
Subject: [PATCH 02/17] Fix some .nub again [We Ski (Wii)]

---
 src/meta/ngc_dsp_std.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/meta/ngc_dsp_std.c b/src/meta/ngc_dsp_std.c
index b527b8a0..c48e710c 100644
--- a/src/meta/ngc_dsp_std.c
+++ b/src/meta/ngc_dsp_std.c
@@ -717,13 +717,14 @@ VGMSTREAM* init_vgmstream_idsp_namco(STREAMFILE* sf) {
     dspm.header_spacing = read_u32be(0x24,sf);
     dspm.start_offset = read_u32be(0x28,sf);
 
-    /* Soul Calibur: Broken destiny (PSP), Taiko no Tatsujin: Atsumete Tomodachi Daisakusen (WiiU) */
+    /* SoulCalibur Legends (Wii), Taiko no Tatsujin: Atsumete Tomodachi Daisakusen (WiiU) */
     if (dspm.interleave == 0)  {
-        /* half interleave (happens sometimes), use channel size */
+        /* half interleave (uncommon), use channel size */
         dspm.interleave = read_u32be(0x2c,sf);
-        /* Rarely 2nd channel stars with a padding frame then real 2nd channel with initial_ps. Must be some
-         * NUS2 bug when importing DSP data as only happens for one subsong and offsets/sizes are fine [We Ski (Wii)] */
+        /* Rarely 2nd channel starts with a padding frame then real 2nd channel with initial_ps. Must be some NUS2 bug
+         * when importing DSP data as only happens for some subsongs and offsets/sizes are fine [We Ski (Wii), Go Vacation (Wii)] */
         dspm.ignore_initial_ps = true;
+        dspm.ignore_loop_ps = true;
     }
 
     // rare but valid IDSP [Super Smash Bros. Ultimate (Switch)-vc_kirby.nus3audio]

From 4ad40660db683682663597d533dd51025e1d0e87 Mon Sep 17 00:00:00 2001
From: bnnm <bananaman255@gmail.com>
Date: Wed, 22 Jan 2025 01:46:31 +0100
Subject: [PATCH 03/17] cli: minor tweaks

---
 cli/vgmstream_cli.c       | 64 +++++++++++++++++++++------------------
 cli/vgmstream_cli.h       |  1 +
 cli/vgmstream_cli_utils.c | 15 ++++-----
 3 files changed, 42 insertions(+), 38 deletions(-)

diff --git a/cli/vgmstream_cli.c b/cli/vgmstream_cli.c
index 39170a1f..e3a84a5a 100644
--- a/cli/vgmstream_cli.c
+++ b/cli/vgmstream_cli.c
@@ -65,7 +65,7 @@ static void print_usage(const char* progname, bool is_help) {
             "    -P: output to stdout even if stdout is a terminal\n"
             "    -c: loop forever (continuously) to stdout\n"
             "    -L: append a smpl chunk and create a looping wav\n"
-          //"    -w: allow .wav in original sample format rather than downmixing to PCM16\n"
+          //"    -w: allow .wav in original sample format rather than mixing to PCM16\n"
             "    -V: print version info and supported extensions as JSON\n"
             "    -I: print requested file info as JSON\n"
             "    -h: print all commands\n"
@@ -89,6 +89,7 @@ static void print_usage(const char* progname, bool is_help) {
             "    -T: print title (for title testing)\n"
             "    -D <max channels>: downmix to <max channels> (for plugin downmix testing)\n"
             "    -B <samples> force a sample buffer size (for api testing)\n"
+          //"    -W: force .wav to output in float sample format\n"
             "    -O: decode but don't write to file (for performance testing)\n"
     );
 
@@ -108,7 +109,7 @@ static bool parse_config(cli_config_t* cfg, int argc, char** argv) {
     optind = 1; /* reset getopt's ugly globals (needed in wasm that may call same main() multiple times) */
 
     /* read config */
-    while ((opt = getopt(argc, argv, "o:l:f:d:ipPcmxeLEFrgb2:s:tTk:K:hOvD:S:B:VI")) != -1) {
+    while ((opt = getopt(argc, argv, "o:l:f:d:ipPcmxeLEFrgb2:s:tTk:K:hOvD:S:B:VIwW")) != -1) {
         switch (opt) {
             case 'o':
                 cfg->outfilename = optarg;
@@ -216,6 +217,9 @@ static bool parse_config(cli_config_t* cfg, int argc, char** argv) {
                     goto fail;
                 }
                 break;
+            case 'W':
+                cfg->write_float_wav = true;
+                break;
             case '2':
                 cfg->stereo_track = atoi(optarg) + 1;
                 break;
@@ -330,29 +334,23 @@ static void apply_config(VGMSTREAM* vgmstream, cli_config_t* cfg) {
 
 static bool write_file(VGMSTREAM* vgmstream, cli_config_t* cfg) {
     FILE* outfile = NULL;
-    int32_t len_samples;
-    sample_t* buf = NULL;
-    int channels, input_channels;
 
+    int channels = vgmstream->channels;
 
-    channels = vgmstream->channels;
-    input_channels = vgmstream->channels;
-
+    int input_channels = vgmstream->channels;
     vgmstream_mixing_enable(vgmstream, 0, &input_channels, &channels);
-
-    /* last init */
-    buf = malloc(cfg->sample_buffer_size * sizeof(sample_t) * input_channels);
+    sample_t* buf = malloc(cfg->sample_buffer_size * sizeof(sample_t) * input_channels);
     if (!buf) {
         fprintf(stderr, "failed allocating output buffer\n");
-        goto fail;
+        return false;
     }
 
     /* simulate seek */
-    len_samples = vgmstream_get_samples(vgmstream);
+    int32_t play_samples = vgmstream_get_samples(vgmstream);
     if (cfg->seek_samples2 >= 0)
-        len_samples -= cfg->seek_samples2;
+        play_samples -= cfg->seek_samples2;
     else if (cfg->seek_samples1 >= 0)
-        len_samples -= cfg->seek_samples1;
+        play_samples -= cfg->seek_samples1;
 
     if (cfg->seek_samples1 >= 0)
         seek_vgmstream(vgmstream, cfg->seek_samples1);
@@ -385,7 +383,7 @@ static bool write_file(VGMSTREAM* vgmstream, cli_config_t* cfg) {
         size_t bytes_done;
 
         wav_header_t wav = {
-            .sample_count = len_samples,
+            .sample_count = play_samples,
             .sample_rate = vgmstream->sample_rate,
             .channels = channels,
             .write_smpl_chunk = cfg->write_lwav,
@@ -394,31 +392,39 @@ static bool write_file(VGMSTREAM* vgmstream, cli_config_t* cfg) {
         };
 
         bytes_done = wav_make_header(wav_buf, 0x100, &wav);
+        if (bytes_done == 0) goto fail;
         fwrite(wav_buf, sizeof(uint8_t), bytes_done, outfile);
     }
 
-    /* decode forever */
+    /* decode forever */ // TODO: improve logic of play forever + normal play
     while (cfg->play_forever && !cfg->decode_only) {
         int to_get = cfg->sample_buffer_size;
-
         render_vgmstream(buf, to_get, vgmstream);
 
-        wav_swap_samples_le(buf, channels * to_get, 0);
-        fwrite(buf, sizeof(sample_t), to_get * channels, outfile);
+        int buf_bytes = to_get * channels * sizeof(sample_t);
+        int buf_samples = to_get;
+        int sample_size = 0;
+
+        wav_swap_samples_le(buf, channels * buf_samples, sample_size);
+        fwrite(buf, sizeof(uint8_t), buf_bytes, outfile);
         /* should write infinitely until program kill */
     }
 
     /* decode */
-    for (int i = 0; i < len_samples; i += cfg->sample_buffer_size) {
+    for (int i = 0; i < play_samples; i += cfg->sample_buffer_size) {
         int to_get = cfg->sample_buffer_size;
-        if (i + cfg->sample_buffer_size > len_samples)
-            to_get = len_samples - i;
+        if (i + cfg->sample_buffer_size > play_samples)
+            to_get = play_samples - i;
 
         render_vgmstream(buf, to_get, vgmstream);
 
+        int buf_bytes = to_get * channels * sizeof(sample_t);
+        int buf_samples = to_get;
+        int sample_size = 0;
+
         if (!cfg->decode_only) {
-            wav_swap_samples_le(buf, channels * to_get, 0);
-            fwrite(buf, sizeof(sample_t), to_get * channels, outfile);
+            wav_swap_samples_le(buf, channels * buf_samples, sample_size);
+            fwrite(buf, sizeof(uint8_t), buf_bytes, outfile);
         }
     }
 
@@ -490,7 +496,7 @@ fail:
 static bool convert_file(cli_config_t* cfg) {
     VGMSTREAM* vgmstream = NULL;
     char outfilename_temp[CLI_PATH_LIMIT];
-    int32_t len_samples;
+    int32_t play_samples;
 
 
     /* for plugin testing */
@@ -510,8 +516,8 @@ static bool convert_file(cli_config_t* cfg) {
 
 
     /* get final play config */
-    len_samples = vgmstream_get_samples(vgmstream);
-    if (len_samples <= 0) {
+    play_samples = vgmstream_get_samples(vgmstream);
+    if (play_samples <= 0) {
         fprintf(stderr, "wrong time config\n");
         goto fail;
     }
@@ -525,7 +531,7 @@ static bool convert_file(cli_config_t* cfg) {
     }
 
     /* would be ignored by seek code though (allowed for seek_samples2 to test this) */
-    if (cfg->seek_samples1 < -1 || cfg->seek_samples1 >= len_samples) {
+    if (cfg->seek_samples1 < -1 || cfg->seek_samples1 >= play_samples) {
         fprintf(stderr, "wrong seek config\n");
         goto fail;
     }
diff --git a/cli/vgmstream_cli.h b/cli/vgmstream_cli.h
index 24e1e9e1..49b1b603 100644
--- a/cli/vgmstream_cli.h
+++ b/cli/vgmstream_cli.h
@@ -36,6 +36,7 @@ typedef struct {
     // wav config
     bool write_lwav;
     bool write_original_wav;
+    bool write_float_wav;
 
     // print flags
     bool print_metaonly;
diff --git a/cli/vgmstream_cli_utils.c b/cli/vgmstream_cli_utils.c
index 8742b00b..25a35084 100644
--- a/cli/vgmstream_cli_utils.c
+++ b/cli/vgmstream_cli_utils.c
@@ -1,14 +1,11 @@
 #include <string.h>
 #include <inttypes.h>
 #include <stdio.h>
-
 #include "vgmstream_cli.h"
+#include "vjson.h"
 #include "../src/api.h"
 #include "../src/vgmstream.h"
 
-#include "vjson.h"
-
-
 static void clean_filename(char* dst, int clean_paths) {
     for (int i = 0; i < strlen(dst); i++) {
         char c = dst[i];
@@ -79,7 +76,7 @@ void replace_filename(char* dst, size_t dstsize, cli_config_t* cfg, VGMSTREAM* v
         }
         else {
             /* not recognized */
-            // TO-DO should move buf or swap "?" with "_"? may happen with non-ascii on Windows; for now break to avoid infinite loops
+            // TO-DO: should move buf or swap "?" with "_"? may happen with non-ascii on Windows; for now break to avoid infinite loops
             break;
         }
 
@@ -101,8 +98,8 @@ void replace_filename(char* dst, size_t dstsize, cli_config_t* cfg, VGMSTREAM* v
 
 void print_info(VGMSTREAM* vgmstream, cli_config_t* cfg) {
     int channels = vgmstream->channels;
-    int64_t num_samples = vgmstream->num_samples;
     bool loop_flag = vgmstream->loop_flag;
+    int64_t num_samples = vgmstream->num_samples;
     int64_t loop_start = vgmstream->loop_start_sample;
     int64_t loop_end = vgmstream->loop_start_sample;
 
@@ -180,9 +177,9 @@ void print_title(VGMSTREAM* vgmstream, cli_config_t* cfg) {
     if (!cfg->print_title)
         return;
 
-    tcfg.force_title = 0;
-    tcfg.subsong_range = 0;
-    tcfg.remove_extension = 0;
+    tcfg.force_title = false;
+    tcfg.subsong_range = false;
+    tcfg.remove_extension = true;
 
     vgmstream_get_title(title, sizeof(title), cfg->infilename, vgmstream, &tcfg);
 

From 2928d402a07c4ce4d94efa819155fdc51d5f4e0b Mon Sep 17 00:00:00 2001
From: bnnm <bananaman255@gmail.com>
Date: Wed, 22 Jan 2025 01:46:46 +0100
Subject: [PATCH 04/17] foo: minor tweaks

---
 fb2k/foo_streamfile.cpp | 91 +++++++++++++++++++++--------------------
 fb2k/foo_vgmstream.cpp  |  8 +---
 fb2k/foo_vgmstream.h    |  6 +--
 3 files changed, 52 insertions(+), 53 deletions(-)

diff --git a/fb2k/foo_streamfile.cpp b/fb2k/foo_streamfile.cpp
index 574fba55..0af38093 100644
--- a/fb2k/foo_streamfile.cpp
+++ b/fb2k/foo_streamfile.cpp
@@ -14,6 +14,9 @@ extern "C" {
 }
 #include "foo_vgmstream.h"
 
+/* Value can be adjusted freely, but 32 KiB (0x8000) is a good enough compromise. */
+#define FOO_STREAMFILE_DEFAULT_BUFFER_SIZE 0x8000
+
 
 /* a STREAMFILE that operates via foobar's file service using a buffer */
 typedef struct {
@@ -30,8 +33,8 @@ typedef struct {
     int archpath_end;           /* where the last \ ends before archive name */
     int archfile_end;           /* where the last | ends before file name */
 
-    offv_t offset;              /* last read offset (info) */
-    offv_t buf_offset;          /* current buffer data start */
+    int64_t offset;             /* last read offset (info) */
+    int64_t buf_offset;         /* current buffer data start */
     uint8_t* buf;               /* data buffer */
     size_t buf_size;            /* max buffer size */
     size_t valid_size;          /* current buffer size */
@@ -41,35 +44,36 @@ typedef struct {
 static STREAMFILE* open_foo_streamfile_buffer(const char* const filename, size_t buf_size, abort_callback* p_abort, t_filestats* stats);
 static STREAMFILE* open_foo_streamfile_buffer_by_file(service_ptr_t<file> m_file, bool m_file_opened, const char* const filename, size_t buf_size, abort_callback* p_abort);
 
-static size_t foo_read(FOO_STREAMFILE* sf, uint8_t* dst, offv_t offset, size_t length) {
+static size_t foo_read(FOO_STREAMFILE* sf, uint8_t* dst, offv_t offset, size_t dst_size) {
     size_t read_total = 0;
-
-    if (!sf || !sf->m_file_opened || !dst || length <= 0 || offset < 0)
+    if (!sf || !sf->m_file_opened || !dst || dst_size <= 0 || offset < 0)
         return 0;
 
+    sf->offset = offset; /* current offset */
+
     /* is the part of the requested length in the buffer? */
-    if (offset >= sf->buf_offset && offset < sf->buf_offset + sf->valid_size) {
+    if (sf->offset >= sf->buf_offset && sf->offset < sf->buf_offset + sf->valid_size) {
         size_t buf_limit;
-        int buf_into = (int)(offset - sf->buf_offset);
+        int buf_into = (int)(sf->offset - sf->buf_offset);
 
         buf_limit = sf->valid_size - buf_into;
-        if (buf_limit > length)
-            buf_limit = length;
+        if (buf_limit > dst_size)
+            buf_limit = dst_size;
 
         memcpy(dst, sf->buf + buf_into, buf_limit);
         read_total += buf_limit;
-        length -= buf_limit;
-        offset += buf_limit;
+        dst_size -= buf_limit;
+        sf->offset += buf_limit;
         dst += buf_limit;
     }
 
 
     /* read the rest of the requested length */
-    while (length > 0) {
+    while (dst_size > 0) {
         size_t buf_limit;
 
         /* ignore requests at EOF */
-        if (offset >= sf->file_size) {
+        if (sf->offset >= sf->file_size) {
             //offset = sf->file_size; /* seems fseek doesn't clamp offset */
             //VGM_ASSERT_ONCE(offset > sf->file_size, "STDIO: reading over file_size 0x%x @ 0x%lx + 0x%x\n", sf->file_size, offset, length);
             break;
@@ -77,42 +81,41 @@ static size_t foo_read(FOO_STREAMFILE* sf, uint8_t* dst, offv_t offset, size_t l
 
         /* position to new offset */
         try {
-            sf->m_file->seek(offset, *sf->p_abort);
+            sf->m_file->seek(sf->offset, *sf->p_abort);
         } catch (...) {
             break; /* this shouldn't happen in our code */
         }
 
         /* fill the buffer (offset now is beyond buf_offset) */
         try {
-            sf->buf_offset = offset;
+            sf->buf_offset = sf->offset;
             sf->valid_size = sf->m_file->read(sf->buf, sf->buf_size, *sf->p_abort);
         } catch(...) {
             break; /* improbable? */
         }
 
         /* decide how much must be read this time */
-        if (length > sf->buf_size)
+        if (dst_size > sf->buf_size)
             buf_limit = sf->buf_size;
         else
-            buf_limit = length;
+            buf_limit = dst_size;
 
         /* give up on partial reads (EOF) */
         if (sf->valid_size < buf_limit) {
             memcpy(dst, sf->buf, sf->valid_size);
-            offset += sf->valid_size;
+            sf->offset += sf->valid_size;
             read_total += sf->valid_size;
             break;
         }
 
         /* use the new buffer */
         memcpy(dst, sf->buf, buf_limit);
-        offset += buf_limit;
+        sf->offset += buf_limit;
         read_total += buf_limit;
-        length -= buf_limit;
+        dst_size -= buf_limit;
         dst += buf_limit;
     }
 
-    sf->offset = offset; /* last fread offset */
     return read_total;
 }
 
@@ -151,9 +154,8 @@ static void foo_close(FOO_STREAMFILE* sf) {
 }
 
 static STREAMFILE* foo_open(FOO_STREAMFILE* sf, const char* const filename, size_t buf_size) {
-    service_ptr_t<file> m_file;
 
-    if (!filename)
+    if (!sf || !filename)
         return NULL;
 
     // vgmstream may need to open "files based on another" (like a changing extension) and "files in the same subdir" (like .txth)
@@ -166,29 +168,32 @@ static STREAMFILE* foo_open(FOO_STREAMFILE* sf, const char* const filename, size
     // > opens:         "unpack://zip|23|file://C:\file.zip|.txth
     // (assumes archives won't need to open files outside archives, and goes before filedup trick)
     if (sf->archname) {
-        char finalname[PATH_LIMIT];
-        const char* dirsep = NULL; 
+        char finalname[FOO_PATH_LIMIT];
+        const char* filepart = NULL; 
 
         // newly open files should be "(current-path)\newfile" or "(current-path)\folder\newfile", so we need to make
         // (archive-path = current-path)\(rest = newfile plus new folders)
-        int filename_len = strlen(filename);
 
+        int filename_len = strlen(filename);
         if (filename_len > sf->archpath_end) {
-            dirsep = &filename[sf->archpath_end];
+            filepart = &filename[sf->archpath_end];
         } else  {
-            dirsep = strrchr(filename, '\\'); // vgmstream shouldn't remove paths though
-            if (!dirsep)
-                dirsep = filename;
+            filepart = strrchr(filename, '\\'); // vgmstream shouldn't remove paths though
+            if (!filepart)
+                filepart = filename;
             else
-                dirsep += 1;
+                filepart += 1;
         }
 
-        //TODO improve strops
-        memcpy(finalname, sf->archname, sf->archfile_end); //copy current path+archive
-        finalname[sf->archfile_end] = '\0';
-        concatn(sizeof(finalname), finalname, dirsep); //paste possible extra dirs and filename
+        //TODO improve str ops
 
-        // subfolders inside archives use "/" (path\archive.ext|subfolder/file.ext)
+        // copy current path+archive ("unpack://zip|23|file://C:\file.zip|")
+        memcpy(finalname, sf->archname, sf->archfile_end);
+        finalname[sf->archfile_end] = '\0';
+        // concat possible extra dirs and filename ("unpack://zip|23|file://C:\file.zip|" + "folder/bgm01.vag")
+        concatn(sizeof(finalname), finalname, filepart);
+
+        // normalize subfolders inside archives to use "/" (path\archive.ext|subfolder/file.ext)
         for (int i = sf->archfile_end; i < sizeof(finalname); i++) {
             if (finalname[i] == '\0')
                 break;
@@ -202,14 +207,12 @@ static STREAMFILE* foo_open(FOO_STREAMFILE* sf, const char* const filename, size
 
     // if same name, duplicate the file pointer we already have open
     if (sf->m_file_opened && !strcmp(sf->name, filename)) {
-        m_file = sf->m_file; //copy?
-        {
-            STREAMFILE* new_sf = open_foo_streamfile_buffer_by_file(m_file, sf->m_file_opened, filename, buf_size, sf->p_abort);
-            if (new_sf) {
-                return new_sf;
-            }
-            // failure, close it and try the default path (which will probably fail a second time)
+        service_ptr_t<file> m_file = sf->m_file; //copy?
+        STREAMFILE* new_sf = open_foo_streamfile_buffer_by_file(m_file, sf->m_file_opened, filename, buf_size, sf->p_abort);
+        if (new_sf) {
+            return new_sf;
         }
+        // failure, close it and try the default path (which will probably fail a second time)
     }
 
     // a normal open, open a new file
@@ -324,5 +327,5 @@ static STREAMFILE* open_foo_streamfile_buffer(const char* const filename, size_t
 }
 
 STREAMFILE* open_foo_streamfile(const char* const filename, abort_callback* p_abort, t_filestats* stats) {
-    return open_foo_streamfile_buffer(filename, STREAMFILE_DEFAULT_BUFFER_SIZE, p_abort, stats);
+    return open_foo_streamfile_buffer(filename, FOO_STREAMFILE_DEFAULT_BUFFER_SIZE, p_abort, stats);
 }
diff --git a/fb2k/foo_vgmstream.cpp b/fb2k/foo_vgmstream.cpp
index b30ef60d..03656424 100644
--- a/fb2k/foo_vgmstream.cpp
+++ b/fb2k/foo_vgmstream.cpp
@@ -11,10 +11,6 @@
 
 #include <foobar2000/SDK/foobar2000.h>
 
-extern "C" {
-#include "../src/vgmstream.h"
-#include "../src/api.h"
-}
 #include "foo_vgmstream.h"
 #include "foo_filetypes.h"
 
@@ -50,7 +46,7 @@ input_vgmstream::input_vgmstream() {
     output_channels = 0;
 
     decoding = false;
-    paused = 0;
+
     decode_pos_ms = 0;
     decode_pos_samples = 0;
     length_samples = 0;
@@ -199,6 +195,7 @@ void input_vgmstream::put_into_tagfile(file_info& p_info, abort_callback& p_abor
         strcpy(tagfile_path, tagfile_name);
     }
 
+
     STREAMFILE* sf_tags = open_foo_streamfile(tagfile_path, &p_abort, NULL);
     if (sf_tags == NULL)
         return;
@@ -445,7 +442,6 @@ void input_vgmstream::setup_vgmstream(abort_callback & p_abort) {
 
     decode_pos_ms = 0;
     decode_pos_samples = 0;
-    paused = 0;
     length_samples = vgmstream_get_samples(vgmstream);
 }
 
diff --git a/fb2k/foo_vgmstream.h b/fb2k/foo_vgmstream.h
index e59feb51..8571c0b8 100644
--- a/fb2k/foo_vgmstream.h
+++ b/fb2k/foo_vgmstream.h
@@ -6,6 +6,7 @@
 
 extern "C" {
 #include "../src/vgmstream.h"
+#include "../src/api.h"
 }
 
 typedef struct {
@@ -83,7 +84,7 @@ class input_vgmstream : public input_stubs {
         int output_channels;
 
         bool decoding;
-        int paused;
+
         int decode_pos_ms;
         int decode_pos_samples;
         int length_samples;
@@ -124,5 +125,4 @@ class input_vgmstream : public input_stubs {
 /* foo_streamfile.cpp */
 STREAMFILE* open_foo_streamfile(const char* const filename, abort_callback* p_abort, t_filestats* stats);
 
-
-#endif /*_FOO_VGMSTREAM_*/
+#endif

From e3f714b818cfed5334e381eaaa6eacc2f3b1bb01 Mon Sep 17 00:00:00 2001
From: bnnm <bananaman255@gmail.com>
Date: Wed, 22 Jan 2025 01:46:53 +0100
Subject: [PATCH 05/17] doc

---
 src/coding/libs/ka1a_dec.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/coding/libs/ka1a_dec.c b/src/coding/libs/ka1a_dec.c
index af253534..794f4997 100644
--- a/src/coding/libs/ka1a_dec.c
+++ b/src/coding/libs/ka1a_dec.c
@@ -15,7 +15,7 @@
  * OG code isn't too optimized though.
  *
  * Reverse engineered from exes, thanks to Kelebek1 and AceKombat for help and debugging.
- * Output has been compared to memdumps and should be accurate with minor +-diffs.
+ * Output has been compared to memdumps and should be accurate with minor +-diffs (vs MSVC 22 /O2).
  * 
  * Even though some parts can be simplified/optimized code tries to emulate what source code
  * may look like, undoing unrolled/vectorized parts. Functions marked as 'inline' don't exist in
@@ -147,7 +147,7 @@ static inline int unpack_get_bits(uint8_t* src, int* p_byte_pos, int* p_bit_pos,
 //
 // Bands encode less coefs than dst may hold, so 'positions' are used to put coefs
 // non-linearly, where unset indexes are 0 (dst must be memset before calling unpack frame).
-// dst should be 1024, though usually only lower 512 (max step is 390 + ((1<<7) - 1)).
+// dst should be 1024, though usually only lower 512 are used (max step is 390 + ((1<<7) - 1)).
 static void unpack_frame(uint8_t* src, float* dst, int steps_size, void* unused, int bitrate_index) {
 
     // copy coefs counts as they may be modified below
@@ -461,7 +461,7 @@ void transform_frame(void* unused1, float* src, float* dst, void* unused2, float
     }
 
     // Reorder output (input buf may be reused as output here as there is no overlap).
-    // Note that input is 512 coefs but output is 1024 samples (externally combined with samples)
+    // Note that input is 512 coefs but output is 1024 samples (externally combined with prev samples)
     int pos = 0;
     for (int i = 0; i < 128; i++) {
         dst[pos++] = real[128 + i];
@@ -522,13 +522,13 @@ static void decode_frame(unsigned char* src, int tracks, int channels, float* ds
         }
     }
 
-    if (setup_flag) // MOD: expect only 1 block per call
+    if (setup_flag) // OG MOD: changed to expect only 1 block per call
         return;
 
     // decode 'current block of frames' (writes 512 samples, plus setups 'prev' buf)
     {
         //uint8_t* src_block = &src[channels * tracks * frame_size]; // 2nd block in src in OG code
-        uint8_t* src_block = &src[0]; // MOD: expect only 1 block  per call
+        uint8_t* src_block = &src[0]; // OG MOD: changed to expect only 1 block  per call
 
         for (int track = 0; track < tracks; track++) {
             int frame_num = channels * track;
@@ -568,7 +568,7 @@ struct ka1a_handle_t {
 
     // state
     bool setup_flag;        // next frame will be used as setup and won't output samples
-    float temp[1024 * 2];   // fft + coef buf
+    float temp[1024 * 2];   // fft + spectrum coefs buf
     float* prev;            // at least samples * channels * tracks
 };
 

From 311aad295e070752548b310568ae12a272e9951c Mon Sep 17 00:00:00 2001
From: bnnm <bananaman255@gmail.com>
Date: Wed, 22 Jan 2025 01:47:12 +0100
Subject: [PATCH 06/17] awc: fix compiling without ffmpeg

---
 src/meta/awc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/meta/awc.c b/src/meta/awc.c
index bf84bcfd..af2e9bee 100644
--- a/src/meta/awc.c
+++ b/src/meta/awc.c
@@ -205,7 +205,7 @@ VGMSTREAM* init_vgmstream_awc(STREAMFILE* sf) {
             } 
             break;
 
-#ifdef VGM_USE_ATRAC9
+#ifdef VGM_USE_FFMPEG
         case 0x0D: {    /* OPUS (PC) [Red Dead Redemption (PC)] */
             if (awc.is_streamed) {
                 vgmstream->layout_data = build_layered_awc(sf_body, &awc);

From 279e0e28626db93faef74b4fe81cb718e5da09bb Mon Sep 17 00:00:00 2001
From: bnnm <bananaman255@gmail.com>
Date: Wed, 22 Jan 2025 22:49:06 +0100
Subject: [PATCH 07/17] mixing: internal cleanup

---
 src/base/mixer.c            |  64 +++++++++++------
 src/base/mixer_ops_common.c | 139 +++++++++++++++++++-----------------
 src/base/mixer_ops_fade.c   |  20 +++---
 src/base/mixer_priv.h       |  24 +++----
 4 files changed, 137 insertions(+), 110 deletions(-)

diff --git a/src/base/mixer.c b/src/base/mixer.c
index 7ce06043..272fca0c 100644
--- a/src/base/mixer.c
+++ b/src/base/mixer.c
@@ -70,6 +70,34 @@ bool mixer_is_active(mixer_t* mixer) {
     return false;
 }
 
+// TODO: probably could be pre-initialized
+static void setup_mixbuf(mixer_t* mixer, sbuf_t* sbuf) {
+    sbuf_t* smix = &mixer->smix;
+
+    // mixbuf can be interpreted as FLT or F32; try to use src's format to keep buf as-is (fewer rounding errors)
+    if (sbuf->fmt == SFMT_F32 || sbuf->fmt == SFMT_FLT)
+        sbuf_init(smix, sbuf->fmt, mixer->mixbuf, sbuf->filled, sbuf->channels); //mixer->input_channels
+    else
+        sbuf_init(smix, SFMT_F32, mixer->mixbuf, sbuf->filled, sbuf->channels);
+
+    // remix to temp buf (somehow using float buf rather than int32 is faster?)
+    sbuf_copy_segments(smix, sbuf, sbuf->filled);
+}
+
+static void setup_outbuf(mixer_t* mixer, sbuf_t* sbuf) {
+    sbuf_t* smix = &mixer->smix; //TODO: probably could be pre-initialized
+
+    // setup + remix to output buf (buf is expected to be big enough to handle config)
+    sbuf->channels = mixer->output_channels;
+    sbuf->filled = 0;
+    smix->channels = mixer->output_channels;
+    if (mixer->force_type) {
+        sbuf->fmt = mixer->force_type;
+    }
+
+    sbuf_copy_segments(sbuf, smix, smix->filled);
+}
+
 void mixer_process(mixer_t* mixer, sbuf_t* sbuf, int32_t current_pos) {
 
     /* external */
@@ -78,46 +106,36 @@ void mixer_process(mixer_t* mixer, sbuf_t* sbuf, int32_t current_pos) {
 
     /* try to skip if no fades apply (set but does nothing yet) + only has fades 
      * (could be done in mix op but avoids upgrading bufs in some cases) */
-    mixer->current_subpos = 0;
     if (mixer->has_fade) {
         //;VGM_LOG("MIX: fade test %i, %i\n", data->has_non_fade, mixer_op_fade_is_active(data, current_pos, current_pos + sample_count));
         if (!mixer->has_non_fade && !mixer_op_fade_is_active(mixer, current_pos, current_pos + sbuf->filled))
             return;
-
-        //;VGM_LOG("MIX: fade pos=%i\n", current_pos);
-        mixer->current_subpos = current_pos;
     }
 
-    // remix to temp buf for mixing (somehow using float buf rather than int32 is faster?)
-    sbuf_copy_to_f32(mixer->mixbuf, sbuf);
+    mixer->current_subpos = current_pos;
 
-    // apply mixing ops in order. current_channels may increase or decrease per op
+    setup_mixbuf(mixer, sbuf);
+
+    // apply mixing ops in order. channels in mixer->smix may increase or decrease per op
     // - 2ch w/ "1+2,1u" = ch1+ch2, ch1(add and push rest) = 3ch: ch1' ch1+ch2 ch2
     // - 2ch w/ "1u"     = downmix to 1ch (current_channels decreases once)
-    mixer->current_channels = mixer->input_channels;
     for (int m = 0; m < mixer->chain_count; m++) {
         mix_op_t* mix = &mixer->chain[m];
 
         //TO-DO: set callback
         switch(mix->type) {
-            case MIX_SWAP:      mixer_op_swap(mixer, sbuf->filled, mix); break;
-            case MIX_ADD:       mixer_op_add(mixer, sbuf->filled, mix); break;
-            case MIX_VOLUME:    mixer_op_volume(mixer, sbuf->filled, mix); break;
-            case MIX_LIMIT:     mixer_op_limit(mixer, sbuf->filled, mix); break;
-            case MIX_UPMIX:     mixer_op_upmix(mixer, sbuf->filled, mix); break;
-            case MIX_DOWNMIX:   mixer_op_downmix(mixer, sbuf->filled, mix); break;
-            case MIX_KILLMIX:   mixer_op_killmix(mixer, sbuf->filled, mix); break;
-            case MIX_FADE:      mixer_op_fade(mixer, sbuf->filled, mix);
+            case MIX_SWAP:      mixer_op_swap(mixer, mix); break;
+            case MIX_ADD:       mixer_op_add(mixer, mix); break;
+            case MIX_VOLUME:    mixer_op_volume(mixer, mix); break;
+            case MIX_LIMIT:     mixer_op_limit(mixer, mix); break;
+            case MIX_UPMIX:     mixer_op_upmix(mixer, mix); break;
+            case MIX_DOWNMIX:   mixer_op_downmix(mixer, mix); break;
+            case MIX_KILLMIX:   mixer_op_killmix(mixer, mix); break;
+            case MIX_FADE:      mixer_op_fade(mixer, mix);
             default:
                 break;
         }
     }
 
-    // setup + remix to output buf (buf is expected to be big enough to handle config)
-    sbuf->channels = mixer->output_channels;
-    if (mixer->force_type) {
-        sbuf->fmt = mixer->force_type;
-    }
-
-    sbuf_copy_from_f32(sbuf, mixer->mixbuf);
+    setup_outbuf(mixer, sbuf);
 }
diff --git a/src/base/mixer_ops_common.c b/src/base/mixer_ops_common.c
index cabf27de..fc0b1556 100644
--- a/src/base/mixer_ops_common.c
+++ b/src/base/mixer_ops_common.c
@@ -5,138 +5,147 @@
 // when there are no actual float ops (ex. 'swap', if no ' volume' )
 // Performance gain is probably fairly small, though.
 
-void mixer_op_swap(mixer_t* mixer, int32_t sample_count, mix_op_t* op) {
-    float* sbuf = mixer->mixbuf;
+void mixer_op_swap(mixer_t* mixer, mix_op_t* op) {
+    sbuf_t* smix = &mixer->smix;
+    float* dst = smix->buf;
 
-    for (int s = 0; s < sample_count; s++) {
-        float temp_f = sbuf[op->ch_dst];
-        sbuf[op->ch_dst] = sbuf[op->ch_src];
-        sbuf[op->ch_src] = temp_f;
+    for (int s = 0; s < smix->filled; s++) {
+        float temp_f = dst[op->ch_dst];
+        dst[op->ch_dst] = dst[op->ch_src];
+        dst[op->ch_src] = temp_f;
 
-        sbuf += mixer->current_channels;
+        dst += smix->channels;
     }
 }
 
-void mixer_op_add(mixer_t* mixer, int32_t sample_count, mix_op_t* op) {
-    float* sbuf = mixer->mixbuf;
+void mixer_op_add(mixer_t* mixer, mix_op_t* op) {
+    sbuf_t* smix = &mixer->smix;
+    float* dst = smix->buf;
 
     /* could optimize when vol == 1 to avoid one multiplication but whatevs (not common) */
-    for (int s = 0; s < sample_count; s++) {
-        sbuf[op->ch_dst] = sbuf[op->ch_dst] + sbuf[op->ch_src] * op->vol;
+    for (int s = 0; s < smix->filled; s++) {
+        dst[op->ch_dst] = dst[op->ch_dst] + dst[op->ch_src] * op->vol;
 
-        sbuf += mixer->current_channels;
+        dst += smix->channels;
     }
 }
 
-void mixer_op_volume(mixer_t* mixer, int32_t sample_count, mix_op_t* op) {
-    float* sbuf = mixer->mixbuf;
+void mixer_op_volume(mixer_t* mixer, mix_op_t* op) {
+    sbuf_t* smix = &mixer->smix;
+    float* dst = smix->buf;
     
     if (op->ch_dst < 0) {
         /* "all channels", most common case */
-        for (int s = 0; s < sample_count * mixer->current_channels; s++) {
-            sbuf[s] = sbuf[s] * op->vol;
+        for (int s = 0; s < smix->filled * smix->channels; s++) {
+            dst[s] = dst[s] * op->vol;
         }
     }
     else {
-        for (int s = 0; s < sample_count; s++) {
-            sbuf[op->ch_dst] = sbuf[op->ch_dst] * op->vol;
+        for (int s = 0; s < smix->filled; s++) {
+            dst[op->ch_dst] = dst[op->ch_dst] * op->vol;
 
-            sbuf += mixer->current_channels;
+            dst += smix->channels;
         }
     }
 }
 
-void mixer_op_limit(mixer_t* mixer, int32_t sample_count, mix_op_t* op) {
-    float* sbuf = mixer->mixbuf;
+void mixer_op_limit(mixer_t* mixer, mix_op_t* op) {
+    sbuf_t* smix = &mixer->smix;
+    float* dst = smix->buf;
 
-    const float limiter_max = 32767.0f;
-    const float limiter_min = -32768.0f;
+    const float limiter_max = smix->fmt == SFMT_FLT ? 1.0f : 32767.0f;
+    const float limiter_min = smix->fmt == SFMT_FLT ? -1.0f : -32768.0f;
 
     const float temp_max = limiter_max * op->vol;
     const float temp_min = limiter_min * op->vol;
 
     /* could optimize when vol == 1 to avoid one multiplication but whatevs (not common) */
-    for (int s = 0; s < sample_count; s++) {
+    for (int s = 0; s < smix->filled; s++) {
 
         if (op->ch_dst < 0) {
-            for (int ch = 0; ch < mixer->current_channels; ch++) {
-                if (sbuf[ch] > temp_max)
-                    sbuf[ch] = temp_max;
-                else if (sbuf[ch] < temp_min)
-                    sbuf[ch] = temp_min;
+            for (int ch = 0; ch < smix->channels; ch++) {
+                if (dst[ch] > temp_max)
+                    dst[ch] = temp_max;
+                else if (dst[ch] < temp_min)
+                    dst[ch] = temp_min;
             }
         }
         else {
-            if (sbuf[op->ch_dst] > temp_max)
-                sbuf[op->ch_dst] = temp_max;
-            else if (sbuf[op->ch_dst] < temp_min)
-                sbuf[op->ch_dst] = temp_min;
+            if (dst[op->ch_dst] > temp_max)
+                dst[op->ch_dst] = temp_max;
+            else if (dst[op->ch_dst] < temp_min)
+                dst[op->ch_dst] = temp_min;
         }
 
-        sbuf += mixer->current_channels;
+        dst += smix->channels;
     }
 }
 
-void mixer_op_upmix(mixer_t* mixer, int32_t sample_count, mix_op_t* op) {
-    int max_channels = mixer->current_channels;
-    mixer->current_channels += 1;
+void mixer_op_upmix(mixer_t* mixer, mix_op_t* op) {
+    sbuf_t* smix = &mixer->smix;
+    float* sbuf = smix->buf;
 
-    float* sbuf_tmp = mixer->mixbuf + sample_count * mixer->current_channels;
-    float* sbuf = mixer->mixbuf + sample_count * max_channels;
+    int max_channels = smix->channels;
+    smix->channels += 1;
+
+    float* dst = sbuf + smix->filled * smix->channels;
+    float* src = sbuf + smix->filled * max_channels;
 
     /* copy 'backwards' as otherwise would overwrite samples before moving them forward */
-    for (int s = 0; s < sample_count; s++) {
-        sbuf_tmp -= mixer->current_channels;
-        sbuf -= max_channels;
+    for (int s = 0; s < smix->filled; s++) {
+        dst -= smix->channels;
+        src -= max_channels;
 
         int sbuf_ch = max_channels - 1;
-        for (int ch = mixer->current_channels - 1; ch >= 0; ch--) {
+        for (int ch = smix->channels - 1; ch >= 0; ch--) {
             if (ch == op->ch_dst) {
-                sbuf_tmp[ch] = 0; /* inserted as silent */
+                dst[ch] = 0; // inserted as silent
             }
             else {
-                sbuf_tmp[ch] = sbuf[sbuf_ch]; /* 'pull' channels backward */
+                dst[ch] = src[sbuf_ch]; // 'pull' channels backward
                 sbuf_ch--;
             }
         }
     }
 }
 
-void mixer_op_downmix(mixer_t* mixer, int32_t sample_count, mix_op_t* op) {
-    int max_channels = mixer->current_channels;
-    mixer->current_channels -= 1;
+void mixer_op_downmix(mixer_t* mixer, mix_op_t* op) {
+    sbuf_t* smix = &mixer->smix;
+    float* src = smix->buf;
+    float* dst = smix->buf;
 
-    float* sbuf = mixer->mixbuf;
-    float* sbuf_tmp = sbuf;
+    int max_channels = smix->channels;
+    smix->channels -= 1;
 
-    for (int s = 0; s < sample_count; s++) {
+    for (int s = 0; s < smix->filled; s++) {
 
         for (int ch = 0; ch < op->ch_dst; ch++) {
-            sbuf_tmp[ch] = sbuf[ch]; /* copy untouched channels */
+            dst[ch] = src[ch]; // copy untouched channels
         }
 
         for (int ch = op->ch_dst; ch < max_channels - 1; ch++) {
-            sbuf_tmp[ch] = sbuf[ch + 1]; /* 'pull' dropped channels back */
+            dst[ch] = src[ch + 1]; // 'pull' dropped channels back
         }
 
-        sbuf_tmp += mixer->current_channels;
-        sbuf += max_channels;
+        dst += smix->channels;
+        src += max_channels;
     }
 }
 
-void mixer_op_killmix(mixer_t* mixer, int32_t sample_count, mix_op_t* op) {
-    int max_channels = mixer->current_channels;
-    mixer->current_channels = op->ch_dst; /* clamp channels */
+void mixer_op_killmix(mixer_t* mixer, mix_op_t* op) {
+    sbuf_t* smix = &mixer->smix;
+    float* src = smix->buf;
+    float* dst = smix->buf;
 
-    float* sbuf = mixer->mixbuf;
-    float* sbuf_tmp = sbuf;
+    int max_channels = smix->channels;
+    smix->channels = op->ch_dst; // clamp channels
 
-    for (int s = 0; s < sample_count; s++) {
-        for (int ch = 0; ch < mixer->current_channels; ch++) {
-            sbuf_tmp[ch] = sbuf[ch];
+    for (int s = 0; s < smix->filled; s++) {
+        for (int ch = 0; ch < smix->channels; ch++) {
+            dst[ch] = src[ch];
         }
 
-        sbuf_tmp += mixer->current_channels;
-        sbuf += max_channels;
+        dst += smix->channels;
+        src += max_channels;
     }
 }
diff --git a/src/base/mixer_ops_fade.c b/src/base/mixer_ops_fade.c
index f0670da0..0591c07e 100644
--- a/src/base/mixer_ops_fade.c
+++ b/src/base/mixer_ops_fade.c
@@ -2,6 +2,8 @@
 #include <limits.h>
 #include <math.h>
 
+//TODO: could precalculate tables + interpolate for some performance gain
+
 #define MIXING_PI   3.14159265358979323846f
 
 static inline float get_fade_gain_curve(char shape, float index) {
@@ -112,36 +114,34 @@ static bool get_fade_gain(mix_op_t* op, float* out_cur_vol, int32_t current_subp
     return true;
 }
 
-void mixer_op_fade(mixer_t* mixer, int32_t sample_count, mix_op_t* mix) {
-    float* sbuf = mixer->mixbuf;
+void mixer_op_fade(mixer_t* mixer, mix_op_t* mix) {
+    sbuf_t* smix = &mixer->smix;
+    float* dst = smix->buf;
     float new_gain = 0.0f;
 
-    int channels = mixer->current_channels;
+    int channels = smix->channels;
     int32_t current_subpos = mixer->current_subpos;
 
     //TODO optimize for case 0?
-    for (int s = 0; s < sample_count; s++) {
+    for (int s = 0; s < smix->filled; s++) {
         bool fade_applies = get_fade_gain(mix, &new_gain, current_subpos);
         if (!fade_applies) //TODO optimize?
             continue;
 
         if (mix->ch_dst < 0) {
             for (int ch = 0; ch < channels; ch++) {
-                sbuf[ch] = sbuf[ch] * new_gain;
+                dst[ch] = dst[ch] * new_gain;
             }
         }
         else {
-            sbuf[mix->ch_dst] = sbuf[mix->ch_dst] * new_gain;
+            dst[mix->ch_dst] = dst[mix->ch_dst] * new_gain;
         }
 
-        sbuf += channels;
+        dst += channels;
         current_subpos++;
     }
-
-    mixer->current_subpos = current_subpos;
 }
 
-
 bool mixer_op_fade_is_active(mixer_t* mixer, int32_t current_start, int32_t current_end) {
 
     for (int i = 0; i < mixer->chain_count; i++) {
diff --git a/src/base/mixer_priv.h b/src/base/mixer_priv.h
index 73fe757e..5a7cd6fd 100644
--- a/src/base/mixer_priv.h
+++ b/src/base/mixer_priv.h
@@ -49,20 +49,20 @@ struct mixer_t {
     bool has_non_fade;
     bool has_fade;
 
-    float* mixbuf;          /* internal mixing buffer */
-    int current_channels;   /* state: channels may increase/decrease during ops */
-    int32_t current_subpos; /* state: current sample pos in the stream */
+    float* mixbuf;          // internal mixing buffer
+    sbuf_t smix;            // temp sbuf
+    int32_t current_subpos; // state: current sample pos in the stream
 
-    sfmt_t force_type;
+    sfmt_t force_type;      // mixer output is original buffer's by default, unless forced
 };
 
-void mixer_op_swap(mixer_t* mixer, int32_t sample_count, mix_op_t* op);
-void mixer_op_add(mixer_t* mixer, int32_t sample_count, mix_op_t* op);
-void mixer_op_volume(mixer_t* mixer, int32_t sample_count, mix_op_t* op);
-void mixer_op_limit(mixer_t* mixer, int32_t sample_count, mix_op_t* op);
-void mixer_op_upmix(mixer_t* mixer, int32_t sample_count, mix_op_t* op);
-void mixer_op_downmix(mixer_t* mixer, int32_t sample_count, mix_op_t* op);
-void mixer_op_killmix(mixer_t* mixer, int32_t sample_count, mix_op_t* op);
-void mixer_op_fade(mixer_t* mixer, int32_t sample_count, mix_op_t* op);
+void mixer_op_swap(mixer_t* mixer, mix_op_t* op);
+void mixer_op_add(mixer_t* mixer, mix_op_t* op);
+void mixer_op_volume(mixer_t* mixer, mix_op_t* op);
+void mixer_op_limit(mixer_t* mixer, mix_op_t* op);
+void mixer_op_upmix(mixer_t* mixer, mix_op_t* op);
+void mixer_op_downmix(mixer_t* mixer, mix_op_t* op);
+void mixer_op_killmix(mixer_t* mixer, mix_op_t* op);
+void mixer_op_fade(mixer_t* mixer, mix_op_t* op);
 bool mixer_op_fade_is_active(mixer_t* mixer, int32_t current_start, int32_t current_end);
 #endif

From 36545010737a9646037626d4026c4a18acb8f597 Mon Sep 17 00:00:00 2001
From: bnnm <bananaman255@gmail.com>
Date: Wed, 22 Jan 2025 22:49:46 +0100
Subject: [PATCH 08/17] Fix some .msf loops being slightly off

---
 src/meta/msf.c | 69 +++++++++++++++++++++++++++-----------------------
 1 file changed, 38 insertions(+), 31 deletions(-)

diff --git a/src/meta/msf.c b/src/meta/msf.c
index 247f56cd..f51ad0e0 100644
--- a/src/meta/msf.c
+++ b/src/meta/msf.c
@@ -11,6 +11,15 @@ VGMSTREAM* init_vgmstream_msf(STREAMFILE* sf) {
 
 
     /* checks */
+    if ((read_u32be(0x00,sf) & 0xffffff00) != get_id32be("MSF\0"))
+        return NULL;
+    // "MSF" + n.n version:
+    // - 0x01: Megazone 23: Aoi Garland (PS3)
+    // - 0x02: Switchball (PS3)
+    // - 0x30 ('0'): ?
+    // - 0x35 ('5'): SDKs
+    // - 0x43 ('C'): latest/most common
+
     /* .msf: standard
      * .msa: Sonic & Sega All-Stars Racing (PS3)
      * .at3: Silent Hill HD Collection (PS3), Z/X Zekkai no Crusade (PS3)
@@ -18,12 +27,7 @@ VGMSTREAM* init_vgmstream_msf(STREAMFILE* sf) {
      * .str: Pac-Man and the Ghostly Adventures (PS3)
      * .snd: HamsterBall (PS3) */
     if (!check_extensions(sf,"msf,msa,at3,mp3,str,snd"))
-        goto fail;
-
-    /* check header "MSF" + version-char, usually:
-     *  0x01, 0x02, 0x30="0", 0x35="5", 0x43="C" (last/most common version) */
-    if ((read_u32be(0x00,sf) & 0xffffff00) != 0x4D534600) /* "MSF\0" */
-        goto fail;
+        return NULL;
 
     start_offset = 0x40;
 
@@ -45,13 +49,11 @@ VGMSTREAM* init_vgmstream_msf(STREAMFILE* sf) {
      * 0x10 often goes with 0x01 but not always (Castlevania HoD); Malicious PS3 uses flag 0x2 instead */
     loop_flag = (flags != 0xffffffff) && ((flags & 0x01) || (flags & 0x02));
 
-    /* loop markers (marker N @ 0x18 + N*(4+4), but in practice only marker 0 is used) */
+    /* loop offset markers (marker N @ 0x18 + N*(4+4), but in practice only marker 0 is used) */
     if (loop_flag) {
         loop_start = read_u32be(0x18,sf);
         loop_end = read_u32be(0x1C,sf); /* loop duration */
         loop_end = loop_start + loop_end; /* usually equals data_size but not always */
-        if (loop_end > data_size) /* not seen */
-            loop_end = data_size;
     }
 
 
@@ -71,12 +73,11 @@ VGMSTREAM* init_vgmstream_msf(STREAMFILE* sf) {
             vgmstream->layout_type = layout_interleave;
             vgmstream->interleave_block_size = 0x02;
 
-            vgmstream->num_samples = pcm_bytes_to_samples(data_size, channels, 16);
+            vgmstream->num_samples = pcm16_bytes_to_samples(data_size, channels);
             if (loop_flag){
-                vgmstream->loop_start_sample = pcm_bytes_to_samples(loop_start, channels, 16);
-                vgmstream->loop_end_sample = pcm_bytes_to_samples(loop_end, channels, 16);
+                vgmstream->loop_start_sample = pcm16_bytes_to_samples(loop_start, channels);
+                vgmstream->loop_end_sample = pcm16_bytes_to_samples(loop_end, channels);
             }
-
             break;
         }
 
@@ -102,29 +103,35 @@ VGMSTREAM* init_vgmstream_msf(STREAMFILE* sf) {
         case 0x04:   /* ATRAC3 low (66 kbps, frame size 96, Joint Stereo) [Silent Hill HD (PS3)] */
         case 0x05:   /* ATRAC3 mid (105 kbps, frame size 152) [Atelier Rorona (PS3)] */
         case 0x06: { /* ATRAC3 high (132 kbps, frame size 192) [Tekken Tag Tournament HD (PS3)] */
-            int block_align, encoder_delay;
 
-            /* MSF skip samples: from tests with MSEnc and real files (ex. TTT2 eddy.msf v43, v01 demos) seems like 1162 is consistent.
-             * Atelier Rorona bt_normal01 needs it to properly skip the beginning garbage but usually doesn't matter.
-             * (note that encoder may add a fade-in with looping/resampling enabled but should be skipped) */
-            encoder_delay = 1024 + 69*2;
-            block_align   = (codec==4 ? 0x60 : (codec==5 ? 0x98 : 0xC0)) * vgmstream->channels;
-            vgmstream->num_samples = atrac3_bytes_to_samples(data_size, block_align) - encoder_delay;
-            if (vgmstream->sample_rate == -1) /* some MSFv1 (Digi World SP) */
-                vgmstream->sample_rate = 44100; /* voice tracks seems to use 44khz, not sure about other tracks */
+            /* some MSFv1 voices [Digi World SP (PS3)] */
+            if (vgmstream->sample_rate == -1)
+                vgmstream->sample_rate = 44100;
 
-            vgmstream->codec_data = init_ffmpeg_atrac3_raw(sf, start_offset,data_size, vgmstream->num_samples,vgmstream->channels,vgmstream->sample_rate, block_align, encoder_delay);
+            int block_align = (codec==4 ? 0x60 : (codec==5 ? 0x98 : 0xC0)) * vgmstream->channels;
+            vgmstream->num_samples = atrac3_bytes_to_samples(data_size, block_align);
+            vgmstream->loop_start_sample = atrac3_bytes_to_samples(loop_start, block_align);
+            vgmstream->loop_end_sample   = atrac3_bytes_to_samples(loop_end, block_align);
+
+            /* MSF skip samples: from MSEnc tests and real files (ex. TTT2 eddy.msf v43, v01 demos) seems like 1162 is consistent.
+             * Often doesn't matter but sometimes there is audible garbage [Atelier Rorona (PS3)-bt_normal01]
+             * However full loops use offset 0 to file end, so maybe decoder doesn't actually skip samples (like in MPEG).
+             * MSEnc accepts samples and will adjust loops somewhat to closest frame but is not accurate enough.
+             * Comparing vs other platforms loop start+end need to be in sync [Mamoru-kun wa Norowarette Shimatta! (PS3)]
+             * For now only remove samples if it wouldn't mess up loops. */
+            int encoder_delay = 1024 + 69*2;
+            if (vgmstream->loop_flag && encoder_delay > vgmstream->loop_start_sample) {
+                encoder_delay = 0;
+            }
+            vgmstream->num_samples -= encoder_delay;
+            vgmstream->loop_start_sample -= encoder_delay;
+            vgmstream->loop_end_sample -= encoder_delay;
+
+            vgmstream->codec_data = init_ffmpeg_atrac3_raw(sf, start_offset,data_size, vgmstream->num_samples,vgmstream->channels, vgmstream->sample_rate, block_align, encoder_delay);
             if (!vgmstream->codec_data) goto fail;
             vgmstream->coding_type = coding_FFmpeg;
             vgmstream->layout_type = layout_none;
 
-            /* MSF loop/sample values are offsets so trickier to adjust but this seems correct */
-            if (loop_flag) {
-                /* set offset samples (offset 0 jumps to sample 0 > pre-applied delay, and offset end loops after sample end > adjusted delay) */
-                vgmstream->loop_start_sample = atrac3_bytes_to_samples(loop_start, block_align); //- encoder_delay
-                vgmstream->loop_end_sample   = atrac3_bytes_to_samples(loop_end, block_align) - encoder_delay;
-            }
-
             break;
         }
 #endif
@@ -139,7 +146,7 @@ VGMSTREAM* init_vgmstream_msf(STREAMFILE* sf) {
             vgmstream->num_samples = mpeg_get_samples_clean(sf, start_offset, data_size, &loop_start, &loop_end, is_vbr);
             vgmstream->loop_start_sample = loop_start;
             vgmstream->loop_end_sample = loop_end;
-            /* MPEG here seems stripped from ID3/Xing headers, loops are frame offsets */
+            /* MSEnc seems to strip ID3/Xing headers, loops are frame offsets */
 
             /* encoder delay varies between 1152 (1f), 528, 576, etc; probably not actually skipped */
             break;

From cd49311d8efb40c1376183cc5983b2839c8328a4 Mon Sep 17 00:00:00 2001
From: bnnm <bananaman255@gmail.com>
Date: Wed, 22 Jan 2025 22:51:58 +0100
Subject: [PATCH 09/17] Fix some .srsa+srst KTAC [Shin Hokuto Musou (Android)]

---
 src/meta/ktsr.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/src/meta/ktsr.c b/src/meta/ktsr.c
index d41a8c3e..a11a09d8 100644
--- a/src/meta/ktsr.c
+++ b/src/meta/ktsr.c
@@ -28,6 +28,7 @@ typedef struct {
     uint32_t audio_id;
     int platform;
     int format;
+    uint32_t codec_value;
     uint32_t sound_id;
     uint32_t sound_flags;
     uint32_t config_flags;
@@ -54,7 +55,7 @@ static bool parse_ktsr(ktsr_header_t* ktsr, STREAMFILE* sf);
 static layered_layout_data* build_layered_atrac9(ktsr_header_t* ktsr, STREAMFILE *sf, uint32_t config_data);
 static VGMSTREAM* init_vgmstream_ktsr_sub(STREAMFILE* sf_b, uint32_t st_offset, ktsr_header_t* ktsr, VGMSTREAM* (*init_vgmstream)(STREAMFILE* sf), const char* ext);
 
-/* KTSR - Koei Tecmo sound resource container */
+/* KTSR - Koei Tecmo sound resource container (KTSL2 sound lib) */
 VGMSTREAM* init_vgmstream_ktsr(STREAMFILE* sf) {
 
     /* checks */
@@ -204,6 +205,7 @@ static VGMSTREAM* init_vgmstream_ktsr_internal(STREAMFILE* sf, ktsr_meta_t* info
 
 
     sf_b = setup_sf_body(sf, &ktsr, info);
+    if (!sf_b) goto fail;
 
     /* subfiles */
     {
@@ -340,7 +342,7 @@ static VGMSTREAM* init_vgmstream_ktsr_sub(STREAMFILE* sf_b, uint32_t st_offset,
     sub_vgmstream = init_vgmstream(temp_sf);
     close_streamfile(temp_sf);
     if (!sub_vgmstream) {
-        VGM_LOG("ktsr: can't open subfile at %x (size %x)\n", ktsr->stream_offsets[0], ktsr->stream_sizes[0]);
+        VGM_LOG("ktsr: can't open subfile %s at %x (size %x)\n", ext, ktsr->stream_offsets[0], ktsr->stream_sizes[0]);
         return NULL;
     }
 
@@ -414,11 +416,13 @@ static int parse_codec(ktsr_header_t* ktsr) {
     /* platform + format to codec, simplified until more codec combos are found */
     switch(ktsr->platform) {
         case 0x01: /* PC */
-        case 0x05: /* PC/Steam [Fate/Samurai Remnant (PC)] */
+        case 0x05: /* PC/Steam, Android [Fate/Samurai Remnant (PC)] */
             if (ktsr->format == 0x0000 && !ktsr->is_external)
                 ktsr->codec = MSADPCM; // Warrior Orochi 4 (PC)
             else if (ktsr->format == 0x0001)
                 ktsr->codec = KA1A_INTERNAL; // Dynasty Warriors Origins (PC)
+            else if (ktsr->format == 0x0005 && ktsr->is_external && ktsr->codec_value == 0x0840)
+                ktsr->codec = KTAC; // Shin Hokuto Musou (Android)
             else if (ktsr->format == 0x0005 && ktsr->is_external)
                 ktsr->codec = KOVS; // Atelier Ryza (PC)
             else if (ktsr->format == 0x1001 && ktsr->is_external)
@@ -481,7 +485,7 @@ static bool parse_ktsr_subfile(ktsr_header_t* ktsr, STREAMFILE* sf, uint32_t off
              * 14 external codec
              * 18 sample rate
              * 1c num samples
-             * 20 null / 0x1000?
+             * 20 null or codec-related value (RIFF_AT9/KM9=0x100, KTAC=0x840)
              * 24 loop start or -1 (loop end is num samples)
              * 28 channel layout (or null?)
              * 2c null
@@ -492,8 +496,9 @@ static bool parse_ktsr_subfile(ktsr_header_t* ktsr, STREAMFILE* sf, uint32_t off
              */
             //;VGM_LOG("header %08x at %x\n", type, offset);
 
-            ktsr->channels  = read_u32le(offset + 0x0c, sf);
-            ktsr->format    = read_u32le(offset + 0x14, sf);
+            ktsr->channels      = read_u32le(offset + 0x0c, sf);
+            ktsr->format        = read_u32le(offset + 0x14, sf);
+            ktsr->codec_value   = read_u32le(offset + 0x20, sf);
             /* other fields will be read in the external stream */
 
             ktsr->channel_layout = read_u32le(offset + 0x28, sf);
@@ -507,7 +512,7 @@ static bool parse_ktsr_subfile(ktsr_header_t* ktsr, STREAMFILE* sf, uint32_t off
                 ktsr->stream_sizes[0]   = read_u32le(offset + 0x38, sf);
             }
             ktsr->is_external = true;
-
+VGM_LOG("k=%x\n", ktsr->codec_value);
             break;
 
         case 0x41FDBD4E: /* internal [Attack on Titan: Wings of Freedom (Vita)] */
@@ -517,14 +522,14 @@ static bool parse_ktsr_subfile(ktsr_header_t* ktsr, STREAMFILE* sf, uint32_t off
         case 0x10250527: /* internal [Fire Emblem: Three Houses DLC (Switch)] */
             /* 08 subtype? (0x6029DBD2, 0xD20A92F90, 0xDC6FF709)
              * 0c channels
-             * 10 format? (00=platform's ADPCM? 01=ATRAC9?)
-             * 11 bps? (always 16)
+             * 10 format
+             * 11 null or sometimes 16
              * 12 null
              * 14 sample rate
              * 18 num samples
-             * 1c null or 0x100?
+             * 1c null or codec-related value?
              * 20 loop start or -1 (loop end is num samples)
-             * 24 null or channel layout (for 1 track in case of multi-track streams))
+             * 24 null or channel layout (for 1 track in case of multi-track streams)
              * 28 header offset (within subfile)
              * 2c header size [B, C]
              * 30 offset to data start offset [A, C] or to data start+size [B]
@@ -679,7 +684,7 @@ static bool parse_ktsr(ktsr_header_t* ktsr, STREAMFILE* sf) {
      * 04: type
      * 08: version?
      * 0a: unknown (usually 00, 02/03 seen in Vita)
-     * 0b: platform (01=PC, 03=Vita, 04=Switch)
+     * 0b: platform
      * 0c: audio id? (seen in multiple files/games and used as Ogg stream IDs)
      * 10: null
      * 14: null

From 5e112dc34ed5ad62e88730390a578714bfd241ae Mon Sep 17 00:00:00 2001
From: bnnm <bananaman255@gmail.com>
Date: Wed, 22 Jan 2025 22:52:27 +0100
Subject: [PATCH 10/17] cleanup: icelib misc

---
 src/coding/libs/icelib.c | 29 ++---------------------------
 1 file changed, 2 insertions(+), 27 deletions(-)

diff --git a/src/coding/libs/icelib.c b/src/coding/libs/icelib.c
index e44bdd4b..b2d130c7 100644
--- a/src/coding/libs/icelib.c
+++ b/src/coding/libs/icelib.c
@@ -9,7 +9,7 @@
 
 //TODO change to streaming decoder
 // Currently lib expects most data in memory. Due to how format is designed it's not the
-// easiest thing to change, to be fixed it later:
+// easiest thing to change, to be fixed later:
 // - data is divided into 2 blocks (intro+body) that are decoded separatedly
 //   (streaming should read up to block max)
 // - code data isn't divided into frames, just keeps reading from the file buf
@@ -30,36 +30,11 @@
 
 //#include "zlib.h"
 #include "../../util/zlib_vgmstream.h" 
+#include "../../util/reader_get.h"
 
 #define ICESND_MAX_CHANNELS    2
 
 
-/* ************************************************************ */
-/* COMMON */
-/* ************************************************************ */
-
-static inline uint8_t get_u8(const uint8_t* p) {
-    uint8_t ret;
-    ret  = ((uint16_t)(const uint8_t)p[0]) << 0;
-    return ret;
-}
-
-static inline uint16_t get_u16le(const uint8_t* p) {
-    uint16_t ret;
-    ret  = ((uint16_t)(const uint8_t)p[0]) << 0;
-    ret |= ((uint16_t)(const uint8_t)p[1]) << 8;
-    return ret;
-}
-
-static inline uint32_t get_u32le(const uint8_t* p) {
-    uint32_t ret;
-    ret  = ((uint32_t)(const uint8_t)p[0]) << 0;
-    ret |= ((uint32_t)(const uint8_t)p[1]) << 8;
-    ret |= ((uint32_t)(const uint8_t)p[2]) << 16;
-    ret |= ((uint32_t)(const uint8_t)p[3]) << 24;
-    return ret;
-}
-
 /* bigrp entry info as read from header */
 typedef struct {
     uint32_t hash1; /* usually matches filename, different files vary on bytes, seems internally used to identify files */

From b7856554ed1ecc59303583c19cbed984dd4256e3 Mon Sep 17 00:00:00 2001
From: bnnm <bananaman255@gmail.com>
Date: Fri, 24 Jan 2025 01:00:17 +0100
Subject: [PATCH 11/17] Fix .bcwav IMA waveforms [3DS eShop (3DS)]

---
 src/meta/bfwav.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/meta/bfwav.c b/src/meta/bfwav.c
index c7923038..7c8b2864 100644
--- a/src/meta/bfwav.c
+++ b/src/meta/bfwav.c
@@ -211,7 +211,7 @@ static VGMSTREAM* init_vgmstream_bxwav(STREAMFILE* sf, bxwav_type_t type) {
     
     vgmstream->layout_type = layout_none;
 
-    /* only 0x02 is known, others can be made with SDK tools */
+    /* only 0x02/03 are known, others can be made with SDK tools */
     switch (codec) {
         case 0x00:
             vgmstream->coding_type = coding_PCM8;
@@ -227,7 +227,7 @@ static VGMSTREAM* init_vgmstream_bxwav(STREAMFILE* sf, bxwav_type_t type) {
             break;
 
         case 0x03:
-            vgmstream->coding_type = coding_NW_IMA;
+            vgmstream->coding_type = coding_IMA; // 3DS eShop applet (3DS) 
             /* hist is read below */
             break;
 

From e459db00f57a910ffd52c52be1e98eec08aba328 Mon Sep 17 00:00:00 2001
From: bnnm <bananaman255@gmail.com>
Date: Fri, 24 Jan 2025 01:01:30 +0100
Subject: [PATCH 12/17] Add .skx+.tbl [Syphon Filter: Dark Mirror (PS2/PSP),
 MLB 14 (Vita)]

---
 src/formats.c                    |   1 +
 src/libvgmstream.vcxproj         |   1 +
 src/libvgmstream.vcxproj.filters |   3 +
 src/meta/meta.h                  |   2 +
 src/meta/skex.c                  | 272 +++++++++++++++++++++++++++++++
 src/util.h                       |   3 +
 src/vgmstream_init.c             |   1 +
 7 files changed, 283 insertions(+)
 create mode 100644 src/meta/skex.c

diff --git a/src/formats.c b/src/formats.c
index b0b5b3d2..79c7d922 100644
--- a/src/formats.c
+++ b/src/formats.c
@@ -522,6 +522,7 @@ static const char* extension_list[] = {
     "sgb",
     "sgd",
     "sgt",
+    "skx",
     "slb", //txth/reserved [THE Nekomura no Hitobito (PS2)]
     "sli",
     "smc",
diff --git a/src/libvgmstream.vcxproj b/src/libvgmstream.vcxproj
index a411b9ca..327d70ba 100644
--- a/src/libvgmstream.vcxproj
+++ b/src/libvgmstream.vcxproj
@@ -685,6 +685,7 @@
     <ClCompile Include="meta\sfl.c" />
     <ClCompile Include="meta\sgxd.c" />
     <ClCompile Include="meta\silence.c" />
+    <ClCompile Include="meta\skex.c" />
     <ClCompile Include="meta\sk_aud.c" />
     <ClCompile Include="meta\sl3.c" />
     <ClCompile Include="meta\sli.c" />
diff --git a/src/libvgmstream.vcxproj.filters b/src/libvgmstream.vcxproj.filters
index 470db619..e9eabb11 100644
--- a/src/libvgmstream.vcxproj.filters
+++ b/src/libvgmstream.vcxproj.filters
@@ -1885,6 +1885,9 @@
     <ClCompile Include="meta\silence.c">
       <Filter>meta\Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="meta\skex.c">
+      <Filter>meta\Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="meta\sk_aud.c">
       <Filter>meta\Source Files</Filter>
     </ClCompile>
diff --git a/src/meta/meta.h b/src/meta/meta.h
index 08898d4a..194a0052 100644
--- a/src/meta/meta.h
+++ b/src/meta/meta.h
@@ -1024,4 +1024,6 @@ VGMSTREAM* init_vgmstream_xabp(STREAMFILE* sf);
 
 VGMSTREAM* init_vgmstream_i3ds(STREAMFILE* sf);
 
+VGMSTREAM* init_vgmstream_skex(STREAMFILE* sf);
+
 #endif
diff --git a/src/meta/skex.c b/src/meta/skex.c
new file mode 100644
index 00000000..29d7069b
--- /dev/null
+++ b/src/meta/skex.c
@@ -0,0 +1,272 @@
+#include "meta.h"
+#include "../coding/coding.h"
+#include "../util.h"
+
+
+/* SKEX - from SCE America second party devs [Syphon Filter: Dark Mirror (PS2/PSP), MLB 2004 (PS2), MLB 15 (Vita)] */
+VGMSTREAM* init_vgmstream_skex(STREAMFILE* sf) {
+    VGMSTREAM* vgmstream = NULL;
+    STREAMFILE* temp_sf = NULL;
+    STREAMFILE* sf_h = NULL;
+
+
+    /* checks */
+    if (!is_id32be(0x00,sf, "SKEX"))
+        return NULL;
+    if (!check_extensions(sf,"skx"))
+        return NULL;
+
+    // bank-like format with helper files typically found inside memory/bigfiles (.SWD, .DAT, etc)
+    // - .skx: external streams (pack of full formats)
+    // - .tbl: main stream info
+    // - .ctl: cues?
+    // - .mrk: text script related to .tbl
+    // usually .tbl is the header and .skx its body, but rarely may be combined so use .skx as a base
+
+    uint16_t version         = read_u16le(0x04, sf); // in hex NN.NN form
+    // 06: low number, seems related to file (id?)
+    // 08: null
+    // 0c: null
+    uint32_t head_offset    = read_u32le(0x10, sf);
+    uint32_t head_size      = read_u32le(0x14, sf);
+    int entries             = read_u16le(0x18, sf); // even with no head_offset/size
+
+    // micro optimization (empty banks do exist)
+    if (get_streamfile_size(sf) <= 0x100) {
+        vgm_logi("SKEX: bank has no subsongs\n");
+        return NULL;
+    }
+
+    // setup header
+    if (head_offset && head_size) {
+        // rare [MLB 2004 (PS2), NBA 06 (PS2)]
+        sf_h = sf; // header table is read from the .skx itself (not closed separately below)
+    }
+    else {
+        // note that .skx may be uppercase and companion file lowercase (meaning Linux won't open this)
+        sf_h = open_streamfile_by_ext(sf, "tbl");
+        if (!sf_h) {
+            vgm_logi("SKEX: companion file .tbl not found (put together)\n");
+            return NULL;
+        }
+    }
+
+
+    uint32_t subfile_offset = 0, subfile_size = 0, prev_offset = 0, subfile_type = 0;
+
+    int total_subsongs = 0;
+    int target_subsong = sf->stream_index;
+    if (target_subsong == 0) target_subsong = 1; // index 0 selects the first subsong
+
+    // Entries have many repeats so calculate totals.
+    // After last entry there is a fake entry with .skx size (meaning next_offset is always valid).
+    // With flags = 0x1000, after all is another table with increasing low number per entry
+    switch(version) {
+        case 0x1070: {  // MLB 2003 (PS2), MLB 2004 (PS2)
+            uint32_t offset = head_offset;
+
+            // entries go after files
+            for (int i = 0; i < entries; i++) {
+                uint32_t curr_offset = read_u32le(offset + 0x00, sf_h);
+                uint32_t curr_type   = read_u32le(offset + 0x04, sf_h);
+                // 08: null?
+
+                offset += 0x0c;
+
+                switch(curr_type) {
+                    case 0x05: // .vag (mono)
+                    case 0x0c: // .vag (stereo)
+                        break;
+                    default:
+                        vgm_logi("SKEX: unknown format %x (report)\n", curr_type);
+                        goto fail;
+                }
+
+                if (prev_offset == curr_offset)
+                    continue; // same offset as previous entry: repeat, not counted as a new subsong
+                prev_offset = curr_offset;
+
+                total_subsongs++;
+
+                if (target_subsong == total_subsongs && !subfile_offset) {
+                    uint32_t next_offset = read_u32le(offset, sf_h); // offset was already advanced to the next entry
+                    subfile_offset = curr_offset;
+                    subfile_size = next_offset - curr_offset;
+                    subfile_type = curr_type;
+                }
+            }
+            break;
+        }
+
+        case 0x2040:    // MLB 2005 (PS2)
+        case 0x2070:    // MLB 2006 (PS2), NBA 06 (PS2), MLB (PSP)
+        case 0x3000: {  // Syphon Filter: Dark Mirror (PS2/PSP), Syphon Filter: Logan's Shadow (PSP)
+            uint32_t offset = head_offset;
+
+            // 00: header id
+            // 04: version
+            // 06: low number, seems related to file
+            // 08: entries (same as .skx)
+            // 0a: flags
+            // 0c: null?
+            // 10: entries again?
+            if (!is_id32be(offset + 0x00,sf_h, "STBL")) {
+                VGM_LOG("SKEX: incorrect .tbl\n");
+                goto fail;
+            }
+            offset += 0x50;
+
+            for (int i = 0; i < entries; i++) {
+                uint32_t curr_offset = read_u32le(offset + 0x00, sf_h);
+                // 04: 0 or 1 (doesn't seem to be related to loops, companion files or such)
+                // 05: null?
+                // 06: null?
+                uint8_t  curr_type   = read_u8   (offset + 0x07, sf_h);
+
+                offset += 0x08;
+
+                switch(curr_type) {
+                    case 0x00: // dummy/config?
+                    case 0x01: // dummy/config?
+                    case 0x0e: // "Names" in .skx (empty?)
+                        continue;
+                    case 0x05: // .vag (mono)
+                    case 0x09: // .at3
+                    case 0x0b: // .vpk
+                    case 0x0c: // .vag (stereo)
+                        break;
+                    default:
+                        vgm_logi("SKEX: unknown format %x (report)\n", curr_type);
+                        goto fail;
+                }
+
+                if (prev_offset == curr_offset)
+                    continue; // same offset as previous entry: repeat, not counted as a new subsong
+                prev_offset = curr_offset;
+
+                total_subsongs++;
+
+                if (target_subsong == total_subsongs && !subfile_offset) {
+                    uint32_t next_offset = read_u32le(offset, sf_h); // offset was already advanced to the next entry
+                    subfile_offset = curr_offset;
+                    subfile_size = next_offset - curr_offset;
+                    subfile_type = curr_type;
+                }
+            }
+            break;
+        }
+
+        case 0x5100: {   // MLB 14 (Vita), MLB 15 (Vita)
+            uint32_t offset = head_offset;
+            uint16_t multiplier, align = 0;
+
+            // 00: header id
+            // 04: version
+            // 06: low number, seems related to file
+            // 08: entries (same as .skx)
+            // 0a: null?
+            // 0c: file size (without padding)
+            // 10: offset to 2nd table
+            // 14: null?
+            // 18: offset multiplier (0x800/0x400/0x01)
+            // 1a: flags? (rarely 0x08)
+            // 1c: some entries?
+            // 20: null
+            // 24: entries again?
+            if (!is_id32be(offset + 0x00,sf_h, "STBL")) {
+                VGM_LOG("SKEX: incorrect .tbl\n");
+                goto fail;
+            }
+            multiplier = read_u16le(offset + 0x18, sf_h);
+
+            offset += 0x64;
+            for (int i = 0; i < entries; i++) {
+                uint32_t curr_offset = read_u32le(offset + 0x00, sf_h) * multiplier; // stored value is scaled by the header multiplier
+                // 04: null?
+                uint8_t  curr_type   = read_u8   (offset + 0x05, sf_h);
+
+                offset += 0x06;
+
+                switch(curr_type) {
+                    case 0x00: // dummy?
+                    case 0x01: // some config?
+                    case 0x0e: // "<HR_EMITTER>"
+                    case 0x0f: // MIDX (maybe some instrument/midi definition, but data doesn't look midi-like)
+                        continue;
+                    case 0x02: // .at9
+                    case 0x42: // .at9 (no diffs?)
+                        break;
+                    default:
+                        vgm_logi("SKEX: unknown format %x (report)\n", curr_type);
+                        goto fail;
+                }
+
+                // oddly misaligned by 1, no apparent flags [MLB 15 (Vita)-FEPXP.SKX]
+                if (curr_offset == 0x00 && multiplier == 0x800 && !align) {
+                    align = multiplier; // once set, applied to this and all following entries
+                }
+                curr_offset += align;
+
+                if (prev_offset == curr_offset)
+                    continue; // same offset as previous entry: repeat, not counted as a new subsong
+                prev_offset = curr_offset;
+
+                total_subsongs++;
+
+                if (target_subsong == total_subsongs && !subfile_offset) {
+                    uint32_t next_offset = read_u32le(offset, sf_h) * multiplier + align; // next entry, adjusted like curr_offset
+                    subfile_offset = curr_offset;
+                    subfile_size = next_offset - curr_offset;
+                    subfile_type = curr_type;
+                }
+            }
+            break;
+        }
+        default:
+            goto fail;
+    }
+
+    if (total_subsongs == 0) {
+        vgm_logi("SKEX: bank has no subsongs\n"); //sometimes
+        goto fail;
+    }
+
+    if (!check_subsongs(&target_subsong, total_subsongs))
+        goto fail;
+
+    ;VGM_LOG("subfile=%x, %x, %x, %i\n", subfile_offset, subfile_size, subfile_type, total_subsongs); // trace of the selected subfile
+    {
+        init_vgmstream_t init_vgmstream = NULL;
+        const char* ext;
+        switch(subfile_type) {
+            case 0x05:
+            case 0x0c:  init_vgmstream = init_vgmstream_vag; ext = "vag"; break;
+            case 0x09:  init_vgmstream = init_vgmstream_riff; ext = "at3"; break;
+            case 0x0b:  init_vgmstream = init_vgmstream_vpk; ext = "vpk"; break;
+            case 0x02:
+            case 0x42:  init_vgmstream = init_vgmstream_riff; ext = "at9"; break;
+            default: goto fail;
+        }
+
+        if (subfile_type == 0x09 || subfile_type == 0x02) { // use RIFF's size field + 0x08; NOTE(review): 0x42 (.at9) keeps the table size - confirm
+            subfile_size = read_u32le(subfile_offset + 0x04, sf) + 0x08;
+        }
+
+        temp_sf = setup_subfile_streamfile(sf, subfile_offset, subfile_size, ext);
+        if (!temp_sf) goto fail;
+
+        vgmstream = init_vgmstream(temp_sf);
+        if (!vgmstream) goto fail;
+    }
+
+    vgmstream->num_streams = total_subsongs;
+
+    if (sf_h != sf) close_streamfile(sf_h); // only close the header file when it was opened separately
+    close_streamfile(temp_sf);
+    return vgmstream;
+fail:
+    if (sf_h != sf) close_streamfile(sf_h); // only close the header file when it was opened separately
+    close_streamfile(temp_sf);
+    close_vgmstream(vgmstream);
+    return NULL;
+}
diff --git a/src/util.h b/src/util.h
index d0148743..f9cc48ba 100644
--- a/src/util.h
+++ b/src/util.h
@@ -47,6 +47,9 @@ uint32_t clamp_u32(uint32_t v, uint32_t min, uint32_t max);
 
 int round10(int val);
 
+#define align_size align_size_to_block
+
+// returns size with padding, ex. value=0x560, block=0x100 > 0x600
 size_t align_size_to_block(size_t value, size_t block_align);
 
 /* return a file's extension (a pointer to the first character of the
diff --git a/src/vgmstream_init.c b/src/vgmstream_init.c
index 60b01442..fd514a1c 100644
--- a/src/vgmstream_init.c
+++ b/src/vgmstream_init.c
@@ -516,6 +516,7 @@ init_vgmstream_t init_vgmstream_functions[] = {
     init_vgmstream_xabp,
     init_vgmstream_i3ds,
     init_vgmstream_sdbs,
+    init_vgmstream_skex,
 
     /* lower priority metas (no clean header identity, somewhat ambiguous, or need extension/companion file to identify) */
     init_vgmstream_agsc,

From 78b6bd8c265393a5d77c391f98ec63dbf0b69141 Mon Sep 17 00:00:00 2001
From: bnnm <bananaman255@gmail.com>
Date: Fri, 24 Jan 2025 01:01:38 +0100
Subject: [PATCH 13/17] Fix some stereo .vag [NBA 06 (PS2)]

---
 src/meta/vag.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/meta/vag.c b/src/meta/vag.c
index d2b9b07e..cd179e36 100644
--- a/src/meta/vag.c
+++ b/src/meta/vag.c
@@ -307,6 +307,15 @@ VGMSTREAM* init_vgmstream_vag(STREAMFILE* sf) {
                 channel_size -= 0x40;
                 loop_flag = ps_find_loop_offsets(sf, start_offset, channel_size, channels, interleave, &loop_start_sample, &loop_end_sample);
             }
+            else if (version == 0x00000020 && is_id64be(0x20, sf, "KAudioDL") &&  ( (channel_size + 0x30) * 2 == file_size 
+                || align_size(channel_size + 0x30, 0x800) * 2 == file_size ||  align_size(channel_size + 0x30, 0x400) * 2 == file_size) ) {
+                /* .SKX stereo vag (name is always KAudioDLL and streams are padded unlike memory audio) [NBA 06 (PS2)] */
+                start_offset = 0x30;
+                interleave = file_size / 2;
+                channels = 2; // mono KAudioDLL streams also exist
+
+                loop_flag = ps_find_loop_offsets(sf, start_offset, channel_size, channels, interleave, &loop_start_sample, &loop_end_sample);
+            }
             else {
                 /* standard PS1/PS2/PS3 .vag [Ecco the Dolphin (PS2), Legasista (PS3)] */
                 start_offset = 0x30;

From 3f898174c87a2d2eea4e0d91be9aad561409f9be Mon Sep 17 00:00:00 2001
From: bnnm <bananaman255@gmail.com>
Date: Fri, 24 Jan 2025 01:02:04 +0100
Subject: [PATCH 14/17] doc/cleanup

---
 src/meta/hd_bd.c      |  2 +-
 src/meta/ktac.c       | 11 ++++++-----
 src/meta/ktsr.c       |  6 +++---
 src/meta/ogg_vorbis.c |  2 +-
 4 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/meta/hd_bd.c b/src/meta/hd_bd.c
index 5ef8d180..259ec47b 100644
--- a/src/meta/hd_bd.c
+++ b/src/meta/hd_bd.c
@@ -31,7 +31,7 @@ VGMSTREAM* init_vgmstream_hd_bd(STREAMFILE* sf) {
     // 0x1c: Smpl offset
     uint32_t vagi_offset = read_u32le(head_offset + 0x20, sf);
     // 0x24: Setb offset
-    // rest: reserved (-1)
+    // rest: reserved (-1, or rarely 0 [Midnight Club 2 (PS2)])
 
     meta_header_t h = {
         .meta = meta_HD_BD,
diff --git a/src/meta/ktac.c b/src/meta/ktac.c
index 568a67a9..c075dd90 100644
--- a/src/meta/ktac.c
+++ b/src/meta/ktac.c
@@ -20,16 +20,17 @@ VGMSTREAM* init_vgmstream_ktac(STREAMFILE* sf) {
     ktac_header_t ktac = {0};
 
     /* checks */
-    /* .ktac: header id */
-    if (!check_extensions(sf,"ktac"))
-        goto fail;
     if (!is_id32be(0x00,sf, "KTAC"))
-        goto fail;
+        return NULL;
+
+    /* .ktac: header id (probable extension from debug strings is "kac") */
+    if (!check_extensions(sf,"ktac"))
+        return NULL;
 
     /* 0x04: version? (always 1) */
     ktac.file_size = read_u32le(0x08,sf);
     if (ktac.file_size != get_streamfile_size(sf))
-        goto fail;
+        return NULL;
     ktac.mp4.stream_offset  = read_u32le(0x0c,sf);
     ktac.mp4.stream_size    = read_u32le(0x10,sf);
     ktac.type               = read_u32le(0x14,sf);
diff --git a/src/meta/ktsr.c b/src/meta/ktsr.c
index a11a09d8..1517d8b9 100644
--- a/src/meta/ktsr.c
+++ b/src/meta/ktsr.c
@@ -53,7 +53,7 @@ typedef struct {
 static VGMSTREAM* init_vgmstream_ktsr_internal(STREAMFILE* sf, ktsr_meta_t* info);
 static bool parse_ktsr(ktsr_header_t* ktsr, STREAMFILE* sf);
 static layered_layout_data* build_layered_atrac9(ktsr_header_t* ktsr, STREAMFILE *sf, uint32_t config_data);
-static VGMSTREAM* init_vgmstream_ktsr_sub(STREAMFILE* sf_b, uint32_t st_offset, ktsr_header_t* ktsr, VGMSTREAM* (*init_vgmstream)(STREAMFILE* sf), const char* ext);
+static VGMSTREAM* init_vgmstream_ktsr_sub(STREAMFILE* sf_b, uint32_t st_offset, ktsr_header_t* ktsr, init_vgmstream_t init_vgmstream, const char* ext);
 
 /* KTSR - Koei Tecmo sound resource container (KTSL2 sound lib) */
 VGMSTREAM* init_vgmstream_ktsr(STREAMFILE* sf) {
@@ -217,7 +217,7 @@ static VGMSTREAM* init_vgmstream_ktsr_internal(STREAMFILE* sf, ktsr_meta_t* info
                 ktsr.codec = RIFF_ATRAC9;
         }
 
-        VGMSTREAM* (*init_vgmstream)(STREAMFILE* sf) = NULL;
+        init_vgmstream_t init_vgmstream = NULL;
         const char* ext;
         switch(ktsr.codec) {
             case RIFF_ATRAC9:   init_vgmstream = init_vgmstream_riff; ext = "at9"; break;       // Nioh (PS4)
@@ -332,7 +332,7 @@ fail:
 }
 
 // TODO improve, unify with other metas that do similar stuff
-static VGMSTREAM* init_vgmstream_ktsr_sub(STREAMFILE* sf_b, uint32_t st_offset, ktsr_header_t* ktsr, VGMSTREAM* (*init_vgmstream)(STREAMFILE* sf), const char* ext) {
+static VGMSTREAM* init_vgmstream_ktsr_sub(STREAMFILE* sf_b, uint32_t st_offset, ktsr_header_t* ktsr, init_vgmstream_t init_vgmstream, const char* ext) {
     VGMSTREAM* sub_vgmstream = NULL;
     STREAMFILE* temp_sf = NULL;
 
diff --git a/src/meta/ogg_vorbis.c b/src/meta/ogg_vorbis.c
index 8f3188d6..eed5379c 100644
--- a/src/meta/ogg_vorbis.c
+++ b/src/meta/ogg_vorbis.c
@@ -186,7 +186,7 @@ static int _init_vgmstream_ogg_vorbis_tests(STREAMFILE* sf, ogg_vorbis_io_config
 
         cfg->start = 0x20;
 
-        /* .kvs: Atelier Sophie (PC)
+        /* .kvs: Atelier Sophie (PC), debug strings
          * .kovs: header id only? */
         if (!check_extensions(sf,"kvs,kovs"))
             goto fail;

From a3f549929fba6304e98a43e93e0012cf16629f7a Mon Sep 17 00:00:00 2001
From: bnnm <bananaman255@gmail.com>
Date: Fri, 24 Jan 2025 01:09:07 +0100
Subject: [PATCH 15/17] cleanup: rename nw_ima to camelot_ima

---
 src/base/decode.c        | 8 ++++----
 src/coding/coding.h      | 2 +-
 src/coding/ima_decoder.c | 8 ++++----
 src/formats.c            | 2 +-
 src/meta/bcstm.c         | 4 ++--
 src/vgmstream_types.h    | 2 +-
 6 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/base/decode.c b/src/base/decode.c
index 157fec3c..94de7d6f 100644
--- a/src/base/decode.c
+++ b/src/base/decode.c
@@ -469,7 +469,7 @@ int decode_get_samples_per_frame(VGMSTREAM* vgmstream) {
         case coding_PCM4_U:
         case coding_IMA_int:
         case coding_DVI_IMA_int:
-        case coding_NW_IMA:
+        case coding_CAMELOT_IMA:
         case coding_WV6_IMA:
         case coding_HV_IMA:
         case coding_FFTA2_IMA:
@@ -685,7 +685,7 @@ int decode_get_frame_size(VGMSTREAM* vgmstream) {
         case coding_IMA_int:
         case coding_DVI_IMA:
         case coding_DVI_IMA_int:
-        case coding_NW_IMA:
+        case coding_CAMELOT_IMA:
         case coding_WV6_IMA:
         case coding_HV_IMA:
         case coding_FFTA2_IMA:
@@ -1336,9 +1336,9 @@ void decode_vgmstream(sbuf_t* sdst, VGMSTREAM* vgmstream, int samples_to_do) {
             }
             break;
         }
-        case coding_NW_IMA:
+        case coding_CAMELOT_IMA:
             for (ch = 0; ch < vgmstream->channels; ch++) {
-                decode_nw_ima(&vgmstream->ch[ch], buffer+ch,
+                decode_camelot_ima(&vgmstream->ch[ch], buffer+ch,
                         vgmstream->channels, vgmstream->samples_into_block, samples_to_do);
             }
             break;
diff --git a/src/coding/coding.h b/src/coding/coding.h
index 4f8abeac..7d2600a6 100644
--- a/src/coding/coding.h
+++ b/src/coding/coding.h
@@ -20,7 +20,7 @@ void g72x_init_state(struct g72x_state* state_ptr);
 
 /* ima_decoder */
 void decode_standard_ima(VGMSTREAMCHANNEL* stream, sample_t* outbuf, int channelspacing, int32_t first_sample, int32_t samples_to_do, int channel, int is_stereo, int is_high_first);
-void decode_nw_ima(VGMSTREAMCHANNEL* stream, sample_t* outbuf, int channelspacing, int32_t first_sample, int32_t samples_to_do);
+void decode_camelot_ima(VGMSTREAMCHANNEL* stream, sample_t* outbuf, int channelspacing, int32_t first_sample, int32_t samples_to_do);
 void decode_snds_ima(VGMSTREAMCHANNEL* stream, sample_t* outbuf, int channelspacing, int32_t first_sample, int32_t samples_to_do, int channel);
 void decode_otns_ima(VGMSTREAM* vgmstream, VGMSTREAMCHANNEL* stream, sample_t* outbuf, int channelspacing, int32_t first_sample, int32_t samples_to_do, int channel);
 void decode_wv6_ima(VGMSTREAMCHANNEL* stream, sample_t* outbuf, int channelspacing, int32_t first_sample, int32_t samples_to_do);
diff --git a/src/coding/ima_decoder.c b/src/coding/ima_decoder.c
index 6bff9c2c..69f83db3 100644
--- a/src/coding/ima_decoder.c
+++ b/src/coding/ima_decoder.c
@@ -124,8 +124,8 @@ static void std_ima_expand_nibble_mul(VGMSTREAMCHANNEL * stream, off_t byte_offs
     if (*step_index > 88) *step_index=88;
 }
 
-/* NintendoWare IMA (Mario Golf, Mario Tennis; maybe other Camelot games) */
-static void nw_ima_expand_nibble(VGMSTREAMCHANNEL * stream, off_t byte_offset, int nibble_shift, int32_t * hist1, int32_t * step_index) {
+/* Camelot IMA (Mario Golf, Mario Tennis; maybe other Camelot games) */
+static void camelot_ima_expand_nibble(VGMSTREAMCHANNEL * stream, off_t byte_offset, int nibble_shift, int32_t * hist1, int32_t * step_index) {
     int sample_nibble, sample_decoded, step, delta;
 
     sample_nibble = (read_8bit(byte_offset,stream->streamfile) >> nibble_shift)&0xf;
@@ -418,7 +418,7 @@ void decode_mtf_ima(VGMSTREAMCHANNEL * stream, sample_t * outbuf, int channelspa
     stream->adpcm_step_index = step_index;
 }
 
-void decode_nw_ima(VGMSTREAMCHANNEL * stream, sample_t * outbuf, int channelspacing, int32_t first_sample, int32_t samples_to_do) {
+void decode_camelot_ima(VGMSTREAMCHANNEL * stream, sample_t * outbuf, int channelspacing, int32_t first_sample, int32_t samples_to_do) {
     int i, sample_count;
     int32_t hist1 = stream->adpcm_history1_32;
     int step_index = stream->adpcm_step_index;
@@ -431,7 +431,7 @@ void decode_nw_ima(VGMSTREAMCHANNEL * stream, sample_t * outbuf, int channelspac
         off_t byte_offset = stream->offset + i/2;
         int nibble_shift = (i&1?4:0); //low nibble order
 
-        nw_ima_expand_nibble(stream, byte_offset,nibble_shift, &hist1, &step_index);
+        camelot_ima_expand_nibble(stream, byte_offset,nibble_shift, &hist1, &step_index);
         outbuf[sample_count] = (short)(hist1);
     }
 
diff --git a/src/formats.c b/src/formats.c
index 79c7d922..8f6ec98f 100644
--- a/src/formats.c
+++ b/src/formats.c
@@ -836,7 +836,7 @@ static const coding_info coding_info_list[] = {
         {coding_IMA_int,            "IMA 4-bit ADPCM (mono/interleave)"},
         {coding_DVI_IMA,            "Intel DVI 4-bit IMA ADPCM"},
         {coding_DVI_IMA_int,        "Intel DVI 4-bit IMA ADPCM (mono/interleave)"},
-        {coding_NW_IMA,             "NintendoWare IMA 4-bit ADPCM"},
+        {coding_CAMELOT_IMA,        "Camelot IMA 4-bit ADPCM"},
         {coding_SNDS_IMA,           "Heavy Iron .snds 4-bit IMA ADPCM"},
         {coding_QD_IMA,             "Quantic Dream 4-bit IMA ADPCM"},
         {coding_WV6_IMA,            "Gorilla Systems WV6 4-bit IMA ADPCM"},
diff --git a/src/meta/bcstm.c b/src/meta/bcstm.c
index d6eca09d..4557f619 100644
--- a/src/meta/bcstm.c
+++ b/src/meta/bcstm.c
@@ -96,7 +96,7 @@ VGMSTREAM* init_vgmstream_bcstm(STREAMFILE* sf) {
             vgmstream->coding_type = coding_NGC_DSP;
 
             if (is_camelot_ima) {
-                vgmstream->coding_type = coding_NW_IMA;
+                vgmstream->coding_type = coding_CAMELOT_IMA;
             }
             else {
                 off_t channel_indexes, channel_info_offset, coefs_offset;
@@ -113,7 +113,7 @@ VGMSTREAM* init_vgmstream_bcstm(STREAMFILE* sf) {
             }
             break;
 
-        default: /* 0x03: IMA? */
+        default: /* 0x03: regular IMA? (like .bcwav) */
             goto fail;
     }
 
diff --git a/src/vgmstream_types.h b/src/vgmstream_types.h
index 11363bda..24ade741 100644
--- a/src/vgmstream_types.h
+++ b/src/vgmstream_types.h
@@ -63,7 +63,7 @@ typedef enum {
     coding_IMA_int,         /* IMA ADPCM (mono/interleave, low nibble first) */
     coding_DVI_IMA,         /* DVI IMA ADPCM (stereo or mono, high nibble first) */
     coding_DVI_IMA_int,     /* DVI IMA ADPCM (mono/interleave, high nibble first) */
-    coding_NW_IMA,
+    coding_CAMELOT_IMA,
     coding_SNDS_IMA,        /* Heavy Iron Studios .snds IMA ADPCM */
     coding_QD_IMA,
     coding_WV6_IMA,         /* Gorilla Systems WV6 4-bit IMA ADPCM */

From ec6d9186646244ee2643b339b699451370c8c161 Mon Sep 17 00:00:00 2001
From: bnnm <bananaman255@gmail.com>
Date: Fri, 24 Jan 2025 01:09:19 +0100
Subject: [PATCH 16/17] Add HCA key

---
 src/meta/hca_keys.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/meta/hca_keys.h b/src/meta/hca_keys.h
index b23783aa..0d97d3dd 100644
--- a/src/meta/hca_keys.h
+++ b/src/meta/hca_keys.h
@@ -1515,6 +1515,9 @@ static const hcakey_info hcakey_list[] = {
 
     // Muv-Luv Dimensions (Android)
     {8848},                     // 0000000000002290
+
+    // Tales of Graces f Remastered (PC)
+    {51485416730473395},        // 00B6E9B6B75533B3
 };
 
 #endif

From 6802b636f2ff4f9bd34122adf2a3c247e4c408e7 Mon Sep 17 00:00:00 2001
From: bnnm <bananaman255@gmail.com>
Date: Fri, 24 Jan 2025 01:09:24 +0100
Subject: [PATCH 17/17] doc: update FORMATS.md (IMA codec names, add skex)

---
 doc/FORMATS.md | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/doc/FORMATS.md b/doc/FORMATS.md
index fe3e1390..f01f4d29 100644
--- a/doc/FORMATS.md
+++ b/doc/FORMATS.md
@@ -45,7 +45,7 @@ different internally (encrypted, different versions, etc) and not always can be
   - *bcwav*: `.bcwav .adpcm .bms .sfx .str .zic`
   - *brwar*: `.rwar`
     - Subfiles: *brwav*
-  - Codecs: PCM8 PCM16BE PCM16LE NGC_DSP NW_IMA
+  - Codecs: PCM8 PCM16BE PCM16LE NGC_DSP IMA
 - **nds_strm.c**
   - Nintendo STRM header [*STRM*]
   - *nds_strm*: `.strm`
@@ -918,7 +918,7 @@ different internally (encrypted, different versions, etc) and not always can be
 - **bcstm.c**
   - Nintendo CSTM Header [*CSTM*]
   - *bcstm*: `.bcstm`
-  - Codecs: PCM8 PCM16LE NGC_DSP NW_IMA
+  - Codecs: PCM8 PCM16LE NGC_DSP CAMELOT_IMA
 - **g1l.c**
   - Koei Tecmo WiiBGM Header [*KT_WIIBGM*]
   - *kt_g1l*: `.g1l`
@@ -1846,6 +1846,10 @@ different internally (encrypted, different versions, etc) and not always can be
   - Codemasters i3DS header [*I3DS*]
   - *i3ds*: `.3ds`
   - Codecs: NGC_DSP
+- **skex.c**
+  - (container)
+  - *skex*: `.skx + .tbl`
+    - Subfiles: *vag riff vpk*
 - **agsc.c**
   - Retro Studios AGSC header [*AGSC*]
   - *agsc*: `.agsc`