From 9973b3fb72338c9eb6506af613251082553a3ff0 Mon Sep 17 00:00:00 2001
From: bnnm <bananaman255@gmail.com>
Date: Fri, 17 Jan 2025 15:54:26 +0100
Subject: [PATCH] Add .srsa/srst KA1A [Dynasty Warriors Origins (PC)]

---
 src/base/decode.c                | 114 +++---
 src/base/decode.h                |   2 -
 src/base/decode_state.h          |   2 -
 src/base/sbuf.c                  |  55 ++-
 src/base/sbuf.h                  |   5 +-
 src/coding/coding.h              |  13 +-
 src/coding/ka1a_decoder.c        | 146 +++++++
 src/coding/libs/ka1a_dec.c       | 636 +++++++++++++++++++++++++++++++
 src/coding/libs/ka1a_dec.h       |  42 ++
 src/coding/libs/ka1a_dec_data.h  | 260 +++++++++++++
 src/formats.c                    |   3 +
 src/layout/segmented.c           |  14 +-
 src/libvgmstream.vcxproj         |   5 +
 src/libvgmstream.vcxproj.filters |  15 +
 src/meta/ka1a.c                  |  56 +++
 src/meta/ktsr.c                  |  40 +-
 src/meta/meta.h                  |   2 +
 src/vgmstream.c                  |   4 -
 src/vgmstream.h                  |   4 +-
 src/vgmstream_init.c             |   1 +
 src/vgmstream_types.h            |   2 +
 21 files changed, 1322 insertions(+), 99 deletions(-)
 create mode 100644 src/coding/ka1a_decoder.c
 create mode 100644 src/coding/libs/ka1a_dec.c
 create mode 100644 src/coding/libs/ka1a_dec.h
 create mode 100644 src/coding/libs/ka1a_dec_data.h
 create mode 100644 src/meta/ka1a.c

diff --git a/src/base/decode.c b/src/base/decode.c
index 43938b2e..0e914e59 100644
--- a/src/base/decode.c
+++ b/src/base/decode.c
@@ -6,7 +6,6 @@
 #include "plugins.h"
 #include "sbuf.h"
 
-#if VGM_TEST_DECODER
 #include "../util/log.h"
 #include "decode_state.h"
 
@@ -16,23 +15,26 @@ static void* decode_state_init() {
 }
 
 static void decode_state_reset(VGMSTREAM* vgmstream) {
+    if (!vgmstream->decode_state)
+        return;
     memset(vgmstream->decode_state, 0, sizeof(decode_state_t));
 }
 
+static void decode_state_free(VGMSTREAM* vgmstream) {
+    free(vgmstream->decode_state);
+}
+
 // this could be part of the VGMSTREAM but for now keep separate as it simplifies 
 // some loop-related stuff
 void* decode_init() {
     return decode_state_init();
 }
-#endif
 
 
 /* custom codec handling, not exactly "decode" stuff but here to simplify adding new codecs */
 
 void decode_free(VGMSTREAM* vgmstream) {
-#if VGM_TEST_DECODER
-    free(vgmstream->decode_state);
-#endif
+    decode_state_free(vgmstream);
 
     if (!vgmstream->codec_data)
         return;
@@ -88,6 +90,10 @@ void decode_free(VGMSTREAM* vgmstream) {
         free_ea_mt(vgmstream->codec_data, vgmstream->channels);
     }
 
+    if (vgmstream->coding_type == coding_KA1A) {
+        free_ka1a(vgmstream->codec_data);
+    }
+
 #ifdef VGM_USE_FFMPEG
     if (vgmstream->coding_type == coding_FFmpeg) {
         free_ffmpeg(vgmstream->codec_data);
@@ -151,9 +157,7 @@ void decode_free(VGMSTREAM* vgmstream) {
 
 
 void decode_seek(VGMSTREAM* vgmstream) {
-#if VGM_TEST_DECODER
     decode_state_reset(vgmstream);
-#endif
 
     if (!vgmstream->codec_data)
         return;
@@ -199,6 +203,10 @@ void decode_seek(VGMSTREAM* vgmstream) {
         seek_ea_mt(vgmstream, vgmstream->loop_current_sample);
     }
 
+    if (vgmstream->coding_type == coding_KA1A) {
+        seek_ka1a(vgmstream, vgmstream->loop_current_sample);
+    }
+
 #ifdef VGM_USE_VORBIS
     if (vgmstream->coding_type == coding_OGG_VORBIS) {
         seek_ogg_vorbis(vgmstream->codec_data, vgmstream->loop_current_sample);
@@ -256,9 +264,7 @@ void decode_seek(VGMSTREAM* vgmstream) {
 
 
 void decode_reset(VGMSTREAM* vgmstream) {
-#if VGM_TEST_DECODER
     decode_state_reset(vgmstream);
-#endif
 
     if (!vgmstream->codec_data)
         return;
@@ -314,6 +320,10 @@ void decode_reset(VGMSTREAM* vgmstream) {
         reset_ea_mt(vgmstream);
     }
 
+    if (vgmstream->coding_type == coding_KA1A) {
+        reset_ka1a(vgmstream->codec_data);
+    }
+
 #if defined(VGM_USE_MP4V2) && defined(VGM_USE_FDKAAC)
     if (vgmstream->coding_type == coding_MP4_AAC) {
         reset_mp4_aac(vgmstream);
@@ -857,74 +867,75 @@ bool decode_uses_internal_offset_updates(VGMSTREAM* vgmstream) {
     return vgmstream->coding_type == coding_MS_IMA || vgmstream->coding_type == coding_MS_IMA_mono;
 }
 
-#if VGM_TEST_DECODER
-// decode frames for decoders which have their own sample buffer
-static void decode_frames(sbuf_t* sbuf, VGMSTREAM* vgmstream) {
-    const int max_empty = 10000;
+
+// decode frames for decoders which decode frame by frame and have their own sample buffer
+static void decode_frames(sbuf_t* sdst, VGMSTREAM* vgmstream) {
+    const int max_empty = 1000;
     int num_empty = 0;
-
     decode_state_t* ds = vgmstream->decode_state;
+    sbuf_t* ssrc = &ds->sbuf;
 
-    while (sbuf->filled < sbuf->samples) {
 
-        // decode new frame if all was consumed
-        if (ds->sbuf.filled == 0) {
+    // fill the external buf by decoding N times; the decoder's own buf may be consumed only partially per pass
+    while (sdst->filled < sdst->samples) {
+
+        // decode new frame if prev one was consumed
+        if (ssrc->filled == 0) {
             bool ok = false;
             switch (vgmstream->coding_type) {
-                case coding_TAC:
-                    ok = decode_tac_frame(vgmstream);
+                case coding_KA1A:
+                    ok = decode_ka1a_frame(vgmstream);
                     break;
                 default:
-                    break;
+                    goto decode_fail;
             }
 
             if (!ok)
                 goto decode_fail;
         }
 
+        // some codecs may legitimately output no samples for a few calls, but too many in a row is probably a bug
+        if (ssrc->filled == 0) {
+            num_empty++;
+            if (num_empty > max_empty) {
+                VGM_LOG("VGMSTREAM: deadlock?\n");
+                goto decode_fail;
+            }
+        }
+    
         if (ds->discard) {
-            // decode may signal that decoded samples need to be discarded, because of encoder delay
-            // (first samples of a file need to be ignored) or a loop
-            int current_discard = ds->discard;
-            if (current_discard > ds->sbuf.filled)
-                current_discard = ds->sbuf.filled;
+            // decoder may signal that samples need to be discarded (ex. encoder delay or during loops)
+            int samples_discard = ds->discard;
+            if (samples_discard > ssrc->filled)
+                samples_discard = ssrc->filled;
 
-            sbuf_consume(&ds->sbuf, current_discard);
-
-            ds->discard -= current_discard;
+            sbuf_consume(ssrc, samples_discard);
+            ds->discard -= samples_discard;
+            // there may be more discard in next loop
         }
         else {
             // copy + consume
-            int samples_copy = ds->sbuf.filled;
-            if (samples_copy > sbuf->samples - sbuf->filled)
-                samples_copy = sbuf->samples - sbuf->filled;
+            int samples_copy = sbuf_get_copy_max(sdst, ssrc);
 
-            sbuf_copy_segments(sbuf, &ds->sbuf);
-            sbuf_consume(&ds->sbuf, samples_copy);
-
-            sbuf->filled += samples_copy;
+            sbuf_copy_segments(sdst, ssrc, samples_copy);
+            sbuf_consume(ssrc, samples_copy);
         }
     }
 
     return;
 decode_fail:
-    /* on error just put some 0 samples */
-    VGM_LOG("VGMSTREAM: decode fail, missing %i samples\n", sbuf->samples - sbuf->filled);
-    sbuf_silence_rest(sbuf);
+    //TODO clean ssrc?
+    // on error just put some 0 samples
+    VGM_LOG("VGMSTREAM: decode fail, missing %i samples\n", sdst->samples - sdst->filled);
+    sbuf_silence_rest(sdst);
 }
-#endif
+
 
 /* Decode samples into the buffer. Assume that we have written samples_filled into the
  * buffer already, and we have samples_to_do consecutive samples ahead of us (won't call
  * more than one frame if configured above to do so).
  * Called by layouts since they handle samples written/to_do */
 void decode_vgmstream(VGMSTREAM* vgmstream, int samples_filled, int samples_to_do, sample_t* buffer) {
-#if VGM_TEST_DECODER
-    sbuf_t sbuf_tmp = {0};
-    sbuf_t* sbuf = &sbuf_tmp;
-    sbuf_init_s16(sbuf, buffer,  samples_filled + samples_to_do, vgmstream->channels);
-    sbuf->filled = samples_filled;
-#endif
     int ch;
 
     buffer += samples_filled * vgmstream->channels; /* passed externally to simplify I guess */
@@ -1660,11 +1671,18 @@ void decode_vgmstream(VGMSTREAM* vgmstream, int samples_filled, int samples_to_d
                 decode_ea_mt(vgmstream, buffer+ch, vgmstream->channels, samples_to_do, ch);
             }
             break;
-        default:
-#if VGM_TEST_DECODER
+
+        default: {
+            sbuf_t sbuf_tmp = {0};
+            sbuf_t* sbuf = &sbuf_tmp;
+
+            // buffers already adjusted
+            sbuf_init_s16(sbuf, buffer, /*samples_filled +*/ samples_to_do, vgmstream->channels);
+            sbuf->filled = 0; // samples_filled;
+
             decode_frames(sbuf, vgmstream);
-#endif
             break;
+        }
     }
 }
 
diff --git a/src/base/decode.h b/src/base/decode.h
index 4731eab4..4556b272 100644
--- a/src/base/decode.h
+++ b/src/base/decode.h
@@ -3,9 +3,7 @@
 
 #include "../vgmstream.h"
 
-#if VGM_TEST_DECODER
 void* decode_init();
-#endif
 void decode_free(VGMSTREAM* vgmstream);
 void decode_seek(VGMSTREAM* vgmstream);
 void decode_reset(VGMSTREAM* vgmstream);
diff --git a/src/base/decode_state.h b/src/base/decode_state.h
index 64bf7267..3ad71274 100644
--- a/src/base/decode_state.h
+++ b/src/base/decode_state.h
@@ -1,13 +1,11 @@
 #ifndef _DECODE_STATE_H
 #define _DECODE_STATE_H
 
-#if VGM_TEST_DECODER
 #include "sbuf.h"
 
 typedef struct {
     int discard;
     sbuf_t sbuf;
 } decode_state_t;
-#endif
 
 #endif
diff --git a/src/base/sbuf.c b/src/base/sbuf.c
index e6d49c51..184c63e0 100644
--- a/src/base/sbuf.c
+++ b/src/base/sbuf.c
@@ -3,6 +3,7 @@
 //#include <math.h>
 #include "../util.h"
 #include "sbuf.h"
+#include "../util/log.h"
 
 
 void sbuf_init(sbuf_t* sbuf, sfmt_t format, void* buf, int samples, int channels) {
@@ -14,19 +15,15 @@ void sbuf_init(sbuf_t* sbuf, sfmt_t format, void* buf, int samples, int channels
 }
 
 void sbuf_init_s16(sbuf_t* sbuf, int16_t* buf, int samples, int channels) {
-    memset(sbuf, 0, sizeof(sbuf_t));
-    sbuf->buf = buf;
-    sbuf->samples = samples;
-    sbuf->channels = channels;
-    sbuf->fmt = SFMT_S16;
+    sbuf_init(sbuf, SFMT_S16, buf, samples, channels);
 }
 
 void sbuf_init_f32(sbuf_t* sbuf, float* buf, int samples, int channels) {
-    memset(sbuf, 0, sizeof(sbuf_t));
-    sbuf->buf = buf;
-    sbuf->samples = samples;
-    sbuf->channels = channels;
-    sbuf->fmt = SFMT_F32;
+    sbuf_init(sbuf, SFMT_F32, buf, samples, channels);
+}
+
+void sbuf_init_flt(sbuf_t* sbuf, float* buf, int samples, int channels) {
+    sbuf_init(sbuf, SFMT_FLT, buf, samples, channels);
 }
 
 
@@ -50,19 +47,19 @@ void* sbuf_get_filled_buf(sbuf_t* sbuf) {
     return buf;
 }
 
-void sbuf_consume(sbuf_t* sbuf, int count) {
+void sbuf_consume(sbuf_t* sbuf, int samples) {
     int sample_size = sfmt_get_sample_size(sbuf->fmt);
-    if (sample_size <= 0)
+    if (sample_size <= 0) //???
         return;
-    if (count > sbuf->samples || count > sbuf->filled) //TODO?
+    if (samples > sbuf->samples || samples > sbuf->filled) //???
         return;
 
     uint8_t* buf = sbuf->buf;
-    buf += count * sbuf->channels * sample_size;
+    buf += samples * sbuf->channels * sample_size;
 
     sbuf->buf = buf;
-    sbuf->filled -= count;
-    sbuf->samples -= count;
+    sbuf->filled -= samples;
+    sbuf->samples -= samples;
 }
 
 /* when casting float to int, value is simply truncated:
@@ -157,6 +154,15 @@ void sbuf_copy_from_f32(sbuf_t* sbuf, float* src) {
     }
 }
 
+// max samples to copy from ssrc to sdst, considering that dst may be partially filled
+int sbuf_get_copy_max(sbuf_t* sdst, sbuf_t* ssrc) {
+    int sdst_max = sdst->samples - sdst->filled;
+    int samples_copy = ssrc->filled;
+    if (samples_copy > sdst_max)
+        samples_copy = sdst_max;
+    return samples_copy;
+}
+
 
 /* ugly thing to avoid repeating functions */
 #define sbuf_copy_segments_internal(dst, src, src_pos, dst_pos, src_max) \
@@ -174,25 +180,29 @@ void sbuf_copy_from_f32(sbuf_t* sbuf, float* src) {
         dst[dst_pos++] = float_to_int(src[src_pos++] * value); \
     }
 
-void sbuf_copy_segments(sbuf_t* sdst, sbuf_t* ssrc) {
-    /* uncommon so probably fine albeit slower-ish, 0'd other channels first */
+// copy N samples from ssrc into sdst (samples_copy should be clamped externally, see sbuf_get_copy_max)
+void sbuf_copy_segments(sbuf_t* sdst, sbuf_t* ssrc, int samples_copy) {
+    
     if (ssrc->channels != sdst->channels) {
-        sbuf_silence_part(sdst, sdst->filled, ssrc->filled);
+        // 0'd other channels first (uncommon so probably fine albeit slower-ish)
+        sbuf_silence_part(sdst, sdst->filled, samples_copy);
         sbuf_copy_layers(sdst, ssrc, 0, ssrc->filled);
 #if 0
-        // "faster" but lots of extra ifs, not worth it
+        // "faster" but lots of extra ifs per sample format, not worth it
         while (src_pos < src_max) {
             for (int ch = 0; ch < dst_channels; ch++) {
                 dst[dst_pos++] = ch >= src_channels ? 0 : src[src_pos++];
             }
         }
 #endif
+        //TODO: may want to handle externally?
+        sdst->filled += samples_copy;
         return;
     }
 
     int src_pos = 0;
     int dst_pos = sdst->filled * sdst->channels;
-    int src_max = ssrc->filled * ssrc->channels;
+    int src_max = samples_copy * ssrc->channels;
 
     // define all posible combos, probably there is a better way to handle this but...
 
@@ -239,6 +249,9 @@ void sbuf_copy_segments(sbuf_t* sdst, sbuf_t* ssrc) {
         float* src = ssrc->buf;
         sbuf_copy_segments_internal_flt(dst, src, src_pos, dst_pos, src_max, (1/32768.0f));
     }
+
+    //TODO: may want to handle externally?
+    sdst->filled += samples_copy;
 }
 
 
diff --git a/src/base/sbuf.h b/src/base/sbuf.h
index 226390cb..f5792eb7 100644
--- a/src/base/sbuf.h
+++ b/src/base/sbuf.h
@@ -30,6 +30,7 @@ typedef struct {
 void sbuf_init(sbuf_t* sbuf, sfmt_t format, void* buf, int samples, int channels);
 void sbuf_init_s16(sbuf_t* sbuf, int16_t* buf, int samples, int channels);
 void sbuf_init_f32(sbuf_t* sbuf, float* buf, int samples, int channels);
+void sbuf_init_flt(sbuf_t* sbuf, float* buf, int samples, int channels);
 
 int sfmt_get_sample_size(sfmt_t fmt);
 
@@ -39,9 +40,11 @@ void* sbuf_get_filled_buf(sbuf_t* sbuf);
 void sbuf_consume(sbuf_t* sbuf, int count);
 
 /* helpers to copy between buffers; note they assume dst and src aren't the same buf */
+int sbuf_get_copy_max(sbuf_t* sdst, sbuf_t* ssrc);
+
 void sbuf_copy_to_f32(float* dst, sbuf_t* sbuf);
 void sbuf_copy_from_f32(sbuf_t* sbuf, float* src);
-void sbuf_copy_segments(sbuf_t* sdst, sbuf_t* ssrc);
+void sbuf_copy_segments(sbuf_t* sdst, sbuf_t* ssrc, int samples_copy);
 void sbuf_copy_layers(sbuf_t* sdst, sbuf_t* ssrc, int dst_ch_start, int expected);
 
 void sbuf_silence_s16(sample_t* dst, int samples, int channels, int filled);
diff --git a/src/coding/coding.h b/src/coding/coding.h
index 6e052438..85f5ebb6 100644
--- a/src/coding/coding.h
+++ b/src/coding/coding.h
@@ -372,9 +372,6 @@ typedef struct tac_codec_data tac_codec_data;
 
 tac_codec_data* init_tac(STREAMFILE* sf);
 void decode_tac(VGMSTREAM* vgmstream, sample_t* outbuf, int32_t samples_to_do);
-#if VGM_TEST_DECODER
-bool decode_tac_frame(VGMSTREAM* vgmstream);
-#endif
 void reset_tac(tac_codec_data* data);
 void seek_tac(tac_codec_data* data, int32_t num_sample);
 void free_tac(tac_codec_data* data);
@@ -390,6 +387,16 @@ void seek_ice(ice_codec_data* data, int32_t num_sample);
 void free_ice(ice_codec_data* data);
 
 
+/* ka1a_decoder */
+typedef struct ka1a_codec_data ka1a_codec_data;
+
+ka1a_codec_data* init_ka1a(int bitrate_mode, int channels_tracks);
+void free_ka1a(ka1a_codec_data* data);
+void reset_ka1a(ka1a_codec_data* data);
+bool decode_ka1a_frame(VGMSTREAM* vgmstream);
+void seek_ka1a(VGMSTREAM* v, int32_t num_sample);
+
+
 #ifdef VGM_USE_VORBIS
 /* ogg_vorbis_decoder */
 typedef struct ogg_vorbis_codec_data ogg_vorbis_codec_data;
diff --git a/src/coding/ka1a_decoder.c b/src/coding/ka1a_decoder.c
new file mode 100644
index 00000000..17d23a95
--- /dev/null
+++ b/src/coding/ka1a_decoder.c
@@ -0,0 +1,146 @@
+#include "coding.h"
+#include "../base/decode_state.h"
+#include "libs/ka1a_dec.h"
+
+
+/* opaque struct */
+struct ka1a_codec_data {
+    uint8_t* buf;
+    float* fbuf;
+
+    int frame_size;
+    void* handle;
+};
+
+
+ka1a_codec_data* init_ka1a(int bitrate_mode, int channels_tracks) {
+    ka1a_codec_data* data = NULL;
+    int buf_size;
+
+    data = calloc(1, sizeof(ka1a_codec_data));
+    if (!data) goto fail;
+
+    data->handle = ka1a_init(bitrate_mode, channels_tracks, 1);
+    if (!data->handle) goto fail;
+
+    data->frame_size = ka1a_get_frame_size(data->handle);
+    if (data->frame_size <= 0) goto fail;
+
+    buf_size = data->frame_size * channels_tracks;
+    data->buf = calloc(buf_size, sizeof(uint8_t));
+    if (!data->buf) goto fail;
+
+    data->fbuf = calloc(KA1A_FRAME_SAMPLES * channels_tracks, sizeof(float));
+    if (!data->fbuf) goto fail;
+
+    return data;
+fail:
+    free_ka1a(data);
+    return NULL;
+}
+
+static bool read_ka1a_frame(VGMSTREAM* v) {
+    ka1a_codec_data* data = v->codec_data;
+    int bytes;
+
+    if (v->codec_config) {
+        int block = data->frame_size;
+
+        // interleaved mode: read from each channel separately and mix in buf
+        for (int ch = 0; ch < v->channels; ch++) {
+            VGMSTREAMCHANNEL* vs = &v->ch[ch];
+
+            bytes = read_streamfile(data->buf + block * ch, vs->offset, block, vs->streamfile);
+            if (bytes != block)
+                return false;
+
+            vs->offset += bytes;
+        }
+    }
+    else {
+        // single block of frames
+        int block = data->frame_size * v->channels;
+        VGMSTREAMCHANNEL* vs = &v->ch[0];
+
+        bytes = read_streamfile(data->buf, vs->offset, block, vs->streamfile);
+        if (bytes != block)
+            return false;
+
+        vs->offset += bytes;
+    }
+
+    return true;
+}
+
+bool decode_ka1a_frame(VGMSTREAM* v) {
+    bool ok = read_ka1a_frame(v);
+    if (!ok)
+        return false;
+
+    decode_state_t* ds = v->decode_state;
+    ka1a_codec_data* data = v->codec_data;
+
+    int samples = ka1a_decode(data->handle, data->buf, data->fbuf);
+    if (samples < 0)
+        return false;
+
+    sbuf_init_flt(&ds->sbuf, data->fbuf, KA1A_FRAME_SAMPLES, v->channels);
+    ds->sbuf.filled = samples;
+
+    return true;
+}
+
+void reset_ka1a(ka1a_codec_data* data) {
+    if (!data || !data->handle) return;
+    
+    ka1a_reset(data->handle);
+}
+
+void seek_ka1a(VGMSTREAM* v, int32_t num_sample) {
+    ka1a_codec_data* data = v->codec_data;
+    decode_state_t* ds = v->decode_state;
+    if (!data) return;
+
+    reset_ka1a(data);
+
+    // find closest offset to desired sample
+    int32_t seek_frame = num_sample / KA1A_FRAME_SAMPLES;
+    int32_t seek_sample = num_sample % KA1A_FRAME_SAMPLES;
+
+    ds->discard = seek_sample;
+
+    if (v->codec_config) {
+        uint32_t seek_offset = seek_frame * data->frame_size;
+
+        if (v->loop_ch) {
+            for (int ch = 0; ch < v->channels; ch++) {
+                v->loop_ch[ch].offset = v->loop_ch[ch].channel_start_offset + seek_offset;
+            }
+        }
+    }
+    else {
+        uint32_t seek_offset = seek_frame * data->frame_size * v->channels;
+
+        if (v->loop_ch) {
+            v->loop_ch[0].offset = v->loop_ch[0].channel_start_offset + seek_offset;
+        }
+    }
+
+    // (due to implicit encode delay the above is byte-exact equivalent vs a discard loop)
+    #if 0
+    ds->discard = num_sample;
+    if (v->loop_ch) {
+        v->loop_ch[0].offset = v->loop_ch[0].channel_start_offset;
+    }
+    #endif
+}
+
+void free_ka1a(ka1a_codec_data* data) {
+    if (!data) return;
+
+    if (data->handle)
+        ka1a_free(data->handle);
+    free(data->buf);
+    free(data->fbuf);
+    free(data);
+}
diff --git a/src/coding/libs/ka1a_dec.c b/src/coding/libs/ka1a_dec.c
new file mode 100644
index 00000000..af253534
--- /dev/null
+++ b/src/coding/libs/ka1a_dec.c
@@ -0,0 +1,636 @@
+#include <math.h>
+#include <string.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <stdbool.h>
+
+#include "ka1a_dec.h"
+#include "ka1a_dec_data.h"
+#include "../../util/reader_get.h"
+
+/* Decodes Koei Tecmo's KA1A, a fairly simple transform-based (FFT) mono codec.
+ *
+ * The codec seems nameless (it has a "_CODECNAME" string) so this is named after streamed files'
+ * fourCC. It's somewhat inefficient (not very packed) but simple so maybe designed for speed.
+ * OG code isn't too optimized though.
+ *
+ * Reverse engineered from exes, thanks to Kelebek1 and AceKombat for help and debugging.
+ * Output has been compared to memdumps and should be accurate with minor +-diffs.
+ * 
+ * Even though some parts could be simplified/optimized, the code tries to emulate what the source code
+ * may look like, undoing unrolled/vectorized parts. Functions marked as 'inline' don't exist in the
+ * decomp but surely were part of the source code, while 'unused' args may be remnants/compilation details.
+ * 
+ * If you are going to use this info/code elsewhere kindly credit your sources. It's the right thing to do.
+ */
+
+
+// Gets frame info based on bitrate mode, to unpack 1 frame.
+// OG code calls this per frame but codec is CBR (single bitrate index) plus values 
+// could be precalculated per bitrate index (remnant of VBR or more complex modes?)
+static void get_frame_info(int bitrate_index, int* p_steps_size, int* p_coefs_size) {
+    int coefs_bits = 0;
+    int steps_bits = 0;
+
+    // first 8 bands use 8-bit codes and step is implicit
+    for (int i = 0; i < 8; i++) {
+        int codes = BAND_CODES[bitrate_index][i];
+        coefs_bits += 8 * codes;
+    }
+
+    if (bitrate_index <= 5) {
+        // lower bitrate modes have one 8-bit code, rest is 4-bit
+        coefs_bits += (MAX_BANDS - 8) * 8;
+        for (int i = 8; i < MAX_BANDS; i++) {
+            int step_bits = BAND_STEP_BITS[i];
+            int codes = BAND_CODES[bitrate_index][i];
+            steps_bits += step_bits * codes;
+            coefs_bits += 4 * (codes - 1);
+        }
+    }
+    else {
+        // higher bitrate modes use 8-bit codes
+        for (int i = 8; i < MAX_BANDS; i++) {
+            int step_bits = BAND_STEP_BITS[i];
+            int codes = BAND_CODES[bitrate_index][i];
+            steps_bits += step_bits * codes;
+            coefs_bits += 8 * codes;
+        }
+    }
+
+    // bits to bytes + padding
+    *p_steps_size = (steps_bits + 7) >> 3;
+    *p_coefs_size = (coefs_bits + 7) >> 3;
+}
+
+// Helper used in related functions, but not during decode. Note that 'mode' must be validated externally (-5..5).
+// In practice values are: 0x60, 0x68, 0x73, 0x7d, 0x8c, 0x9b, 0xad, 0xc2, 0xd7, 0xed, 0x102.
+static int get_frame_size(int bitrate_mode) {
+    int scalefactor_size = 0x04;
+    int steps_size = 0;
+    int coefs_size = 0;
+    get_frame_info(bitrate_mode + BITRATE_INDEX_MODIFIER, &steps_size, &coefs_size);
+    return scalefactor_size + steps_size + coefs_size;
+}
+
+
+// Convert 8-bit signed code to a coef: sign * scalefactor * exp((|code| - 127) * 0.086643398), or 0 if code is 0
+// (note that 0.086643398 (~ln(2)/8) being float is important to get results closer to memdumps)
+static inline float unpack_convert_code(uint8_t code, float scalefactor) {
+    float coef;
+    if (code) {
+        float code_f = (int8_t)code;
+        if (code & 0x80) {
+            code_f = -code_f;
+            scalefactor = -scalefactor;
+        }
+
+        coef = expf((code_f - 127.0f) * 0.086643398f) * scalefactor;
+    }
+    else {
+        coef = 0.0;
+    }
+
+    return coef;
+}
+
+// Adjust current coef by -1.0..1.0 (4-bit subcode values 0..14 * 1/7 to -1.0..1.0; code 15 seems unused).
+// (note that 0.14285715f being float is important to get results closer to memdumps)
+static inline float unpack_convert_subcode(uint8_t code, float coef) {
+    return ((code * 0.14285715f) - 1.0f) * coef;
+}
+
+// Get N bits (max 8) from data, LSB-first order (lower bits of each byte are read first).
+// Doesn't check boundaries, but should never read past src as bit counts come from fixed tables.
+static inline int unpack_get_bits(uint8_t* src, int* p_byte_pos, int* p_bit_pos, int bits) {
+    int value = 0;
+    int byte_pos = *p_byte_pos;
+    int bit_pos = *p_bit_pos;
+
+    int next_bitpos = bit_pos + bits;
+    if (next_bitpos > 8) {
+        // read between 2 bytes
+        if (next_bitpos <= 16) { // more shouldn't happen
+            uint32_t mask_lo = (1 << (8 - bit_pos)) - 1;
+            uint32_t mask_hi = (1 << (next_bitpos - 8)) - 1;
+            uint8_t code_lo = src[byte_pos+0];
+            uint8_t code_hi = src[byte_pos+1];
+            value = ((code_hi & mask_hi) << (8 - bit_pos)) + ((code_lo >> bit_pos) & mask_lo);
+        }
+    }
+    else {
+        // read in current byte
+        uint32_t mask = (1 << bits) - 1;
+        uint8_t code = src[byte_pos];
+        value = (code >> bit_pos) & mask;
+    }
+
+    bit_pos += bits;
+    if (next_bitpos >= 8) {
+        bit_pos = next_bitpos - 8;
+        byte_pos++;
+    }
+
+    *p_byte_pos = byte_pos;
+    *p_bit_pos = bit_pos;
+    return value;
+}
+
+// Unpack a single frame into quantized spectrum coefficients, packed like this:
+// - 1 scalefactor (32-bit float)
+// - N coef sub-positions aka steps (4-7 bits) per higher bands (8..21)
+// - N codes (8-bit) per lower bands (0..7), of implicit positions
+// - 1 main code (8-bit) per higher bands 8..21 then (N-1) coefs (8 or 4-bit) per bands
+//
+// Each code is converted to a coef then saved to certain position to dst buf.
+// Lower bitrate modes use 4-bit codes that are relative to main coef (* +-1.0).
+//
+// Bands encode less coefs than dst may hold, so 'positions' are used to put coefs
+// non-linearly, where unset indexes are 0 (dst must be memset before calling unpack frame).
+// dst should be 1024 entries, though usually only the lower 512 are written (max step is 390 + ((1<<7) - 1)).
+static void unpack_frame(uint8_t* src, float* dst, int steps_size, void* unused, int bitrate_index) {
+
+    // copy coefs counts as they may be modified below
+    int band_codes_tmp[MAX_BANDS];
+    for (int i = 0; i < MAX_BANDS; i++) {
+        band_codes_tmp[i] = BAND_CODES[bitrate_index][i];
+    }
+
+    // read base scalefactor (first 4 bytes) and setup buffers
+    float scalefactor = get_f32le(src);
+    uint8_t* src_steps = &src[0x04];
+    uint8_t* src_codes = &src[0x04 + steps_size];
+
+    // negative scalefactor signals more/less codes for some bands (total doesn't change though)
+    if (scalefactor < 0.0f) {
+        scalefactor = -scalefactor;
+
+        int mod = BITRATE_SUBMODE[bitrate_index];
+        for (int i = 8; i < 12; i++) {
+            band_codes_tmp[i] += mod;
+        }
+        for (int i = 17; i < 21; i++) {
+            band_codes_tmp[i] -= mod;
+        }
+    }
+
+    // coefs from lower bands (in practice fixed to 5 * 8)
+    int code_pos = 0;
+    for (int band = 0; band < 8; band++) {
+        int band_codes = band_codes_tmp[band];
+        for (int i = 0; i < band_codes; i++) {
+            uint8_t code = src_codes[code_pos];
+            dst[code_pos] = unpack_convert_code(code, scalefactor);
+            code_pos++;
+        }
+    }
+
+    // simple bitreading helpers (struct?)
+    int br_bytepos = 0;
+    int br_bitpos = 0; // in current byte
+
+    int subcode_pos = code_pos + (MAX_BANDS - 8); // position after bands 8..21 main coef
+
+    uint8_t code;
+    float coef;
+    int substep;
+
+    if (bitrate_index <= 5) {
+        // lower bitrates encode 1 main 8-bit coef per band and rest is main * +-1.0, position info in a bitstream
+        bool high_flag = false;
+        for (int band = 8; band < MAX_BANDS; band++) {
+            int band_codes = band_codes_tmp[band];
+            int band_step = BAND_STEPS[band];
+            int step_bits = BAND_STEP_BITS[band];
+
+            substep = unpack_get_bits(src_steps, &br_bytepos, &br_bitpos, step_bits);
+
+            code = src_codes[code_pos];
+            code_pos++;
+
+            coef = unpack_convert_code(code, scalefactor);
+            dst[band_step + substep] = coef;
+
+            for (int i = 1; i < band_codes; i++) {
+                substep = unpack_get_bits(src_steps, &br_bytepos, &br_bitpos, step_bits);
+
+                code = src_codes[subcode_pos];
+                if (high_flag)
+                    subcode_pos++;
+
+                uint8_t subcode = high_flag ? 
+                    (code >> 4) & 0x0F : 
+                    (code >> 0) & 0x0F;
+
+                high_flag = !high_flag;
+
+                dst[band_step + substep] = unpack_convert_subcode(subcode, coef);
+            }
+        }
+    }
+    else {
+        // higher bitrates encode all coefs normally, but still use lower bitrates' ordering scheme (see above)
+        for (int band = 8; band < MAX_BANDS; band++) {
+            int band_codes = band_codes_tmp[band];
+            int band_step = BAND_STEPS[band];
+            int step_bits = BAND_STEP_BITS[band];
+
+            substep = unpack_get_bits(src_steps, &br_bytepos, &br_bitpos, step_bits);
+
+            code = src_codes[code_pos];
+            code_pos++;
+
+            coef = unpack_convert_code(code, scalefactor);
+            dst[band_step + substep] = coef;
+
+            for (int i = 1; i < band_codes; i++) {
+                substep = unpack_get_bits(src_steps, &br_bytepos, &br_bitpos, step_bits);
+
+                code = src_codes[subcode_pos];
+                subcode_pos++;
+
+                coef = unpack_convert_code(code, scalefactor);
+                dst[band_step + substep] = coef;
+            }
+        }
+    }
+}
+
+
+static void transform_twiddles(int points, float* real, float* imag, const float* tw_real, const float* tw_imag) {
+    for (int i = 0; i < points; i++) {
+        float coef_real = real[i];
+        float coef_imag = imag[i];
+        float twid_real = tw_real[i];
+        float twid_imag = tw_imag[i];
+
+        real[i] = (twid_real * coef_real) - (twid_imag * coef_imag);
+        imag[i] = (twid_imag * coef_real) + (twid_real * coef_imag);
+    }
+}
+
+static inline void transform_bit_reversal_permutation(int points, float* real, float* imag) {
+    const int half = points >> 1;
+
+    int j = 0;
+    for (int i = 1; i < points; i++) {
+
+        // j is typically calculated via subs of m, unsure if manual or compiler optimization
+        j = half ^ j;
+        int m = half;
+        while (m > j) {
+            m >>= 1;
+            j = m ^ j;
+        }
+
+        if (i < j) {
+            float coef_real = real[i];
+            float coef_imag = imag[i];
+            real[i] = real[j];
+            imag[i] = imag[j];
+            real[j] = coef_real;
+            imag[j] = coef_imag;
+        }
+    }
+}
+
+// In-place FFT of 'points' complex values kept as split real[]/imag[] arrays ('unused' is never read).
+static void transform_fft(int points, void* unused, float* real, float* imag, const float* cos_table, const float* sin_table) {
+    const int half = points >> 1;
+    transform_bit_reversal_permutation(points, real, imag);
+
+    // these are actually the same value (index 32 of both tables for 256 points), so OG compilation only uses the cos_table one; added both for completeness
+    float w_real_base = cos_table[points >> 3];
+    float w_imag_base = sin_table[points >> 3];
+
+    // FFT computation using twiddle factors and sub-ffts, probably some known optimization (NOTE(review): resembles split-radix, confirm)
+    for (int m = 4; m <= points; m <<= 1) { // m = 4, 8, 16... up to points
+        int m4 = m >> 2;
+
+        for (int j = m4; j > 0; j >>= 2) { // 1st block (offset 0): results are stored without twiddle multiply
+            int min = m4 - j;
+            int max = m4 - (j >> 1);
+            int i_md = min + 2 * m4;
+
+            for (int k = min; k < max; k++) { // k, i_lo, i_md, i_hi: 4 entries spaced m4 apart (i_md advances with k)
+                int i_lo = i_md - m4;
+                int i_hi = i_md + m4;
+
+                float coef_im_a = imag[k] - imag[i_lo];
+                float coef_re_a = real[k] - real[i_lo];
+                real[k] = real[i_lo] + real[k];
+                imag[k] = imag[i_lo] + imag[k];
+
+                float coef_re_b = real[i_hi] - real[i_md];
+                float coef_im_b = imag[i_hi] - imag[i_md];
+                float tmp_ra_ib = coef_re_a - coef_im_b;
+                float tmp_rb_ia = coef_re_b + coef_im_a;
+                float tmp_ib_ra = coef_im_b + coef_re_a;
+                float tmp_ia_rb = coef_im_a - coef_re_b;
+
+                real[i_md] = real[i_hi] + real[i_md];
+                imag[i_md] = imag[i_hi] + imag[i_md];
+                real[i_lo] = tmp_ra_ib;
+                imag[i_lo] = tmp_rb_ia;
+                real[i_hi] = tmp_ib_ra;
+                imag[i_hi] = tmp_ia_rb;
+
+                i_md++;
+            }
+        }
+
+        if (m >= points) // the last pass (m == points) only has that 1st block
+            continue;
+
+        for (int j = m4; j > 0; j >>= 2) { // 2nd block (offset m): same butterflies, outputs scaled by the base weights
+            int min = m + m4 - j;
+            int max = m + m4 - (j >> 1);
+            int i_md = min + 2 * m4;
+
+            for (int k = min; k < max; k++) {
+                int i_lo = i_md - m4;
+                int i_hi = i_md + m4;
+
+                float coef_im_a = imag[k] - imag[i_lo];
+                float coef_re_a = real[k] - real[i_lo];
+                real[k] = real[i_lo] + real[k];
+                imag[k] = imag[i_lo] + imag[k];
+
+                float coef_re_b = real[i_hi] - real[i_md];
+                float coef_im_b = imag[i_hi] - imag[i_md];
+                float tmp_ra_ib = coef_re_a - coef_im_b;
+                float tmp_rb_ia = coef_re_b + coef_im_a;
+                float tmp_ib_ra = coef_im_b + coef_re_a;
+                float tmp_ia_rb = coef_im_a - coef_re_b;
+
+                real[i_md] = real[i_hi] + real[i_md];
+                imag[i_md] = imag[i_hi] + imag[i_md];
+                real[i_lo] = (tmp_rb_ia + tmp_ra_ib) * w_real_base;
+                imag[i_lo] = (tmp_rb_ia - tmp_ra_ib) * w_real_base;
+                real[i_hi] = (tmp_ia_rb - tmp_ib_ra) * w_imag_base;
+                imag[i_hi] = (-tmp_ia_rb - tmp_ib_ra) * w_imag_base;
+
+                i_md++;
+            }
+        }
+
+        int tmp_j = half;
+        for (int m2 = m * 2; m2 < points; m2 += m) { // remaining blocks (offsets 2m, 3m...): per-block twiddles
+            // steps tmp_j like the reversed counter in transform_bit_reversal_permutation; tmp_j / 4 selects this block's twiddles
+            int tmp_m = half;
+            for (tmp_j ^= tmp_m; tmp_m > tmp_j; tmp_j ^= tmp_m) {
+                tmp_m = tmp_m >> 1;
+            }
+
+            int table_index = tmp_j >> 2;
+            float w_real1 = cos_table[table_index];
+            float w_imag1 = -sin_table[table_index];
+            float w_real3 = cos_table[table_index * 3]; 
+            float w_imag3 = -sin_table[table_index * 3];
+
+            for (int j = m4; j > 0; j >>= 2) {
+                int min = m2 + m4 - j;
+                int max = m2 + m4 - (j >> 1);
+                int i_md = min + 2 * m4;
+
+                for (int k = min; k < max; k++) {
+                    int i_lo = i_md - m4;
+                    int i_hi = i_md + m4;
+
+                    float coef_im_a = imag[k] - imag[i_lo];
+                    float coef_re_a = real[k] - real[i_lo];
+                    real[k] = real[i_lo] + real[k];
+                    imag[k] = imag[i_lo] + imag[k];
+
+                    float coef_im_b = imag[i_hi] - imag[i_md];
+                    float coef_re_b = real[i_hi] - real[i_md];
+                    float tmp_ra_ib = coef_re_a - coef_im_b;
+                    float tmp_rb_ia = coef_re_b + coef_im_a;
+                    float tmp_ib_ra = coef_im_b + coef_re_a;
+                    float tmp_ia_rb = coef_im_a - coef_re_b;
+
+                    real[i_md] = real[i_hi] + real[i_md];
+                    imag[i_md] = imag[i_hi] + imag[i_md];
+                    real[i_lo] = (tmp_ra_ib * w_real1) - (tmp_rb_ia * w_imag1);
+                    imag[i_lo] = (tmp_ra_ib * w_imag1) + (tmp_rb_ia * w_real1);
+                    real[i_hi] = (tmp_ib_ra * w_real3) - (tmp_ia_rb * w_imag3);
+                    imag[i_hi] = (tmp_ib_ra * w_imag3) + (tmp_ia_rb * w_real3);
+
+                    i_md++;
+                }
+            }
+        }
+    }
+
+    // final swapping: sum/difference of entries k and k + half, only for k in [half - m, half - m / 2)
+    for (int m = half; m > 0; m >>= 2) {
+        int min = half - m;
+        int max = half - (m >> 1);
+
+        for (int k = min; k < max; k++) {
+            float coef_im = imag[k] - imag[k + half];
+            float coef_re = real[k] - real[k + half];
+            real[k] = real[k + half] + real[k];
+            imag[k] = imag[k + half] + imag[k];
+            real[k + half] = coef_re;
+            imag[k + half] = coef_im;
+        }
+    }
+}
+
+// Converts 1 frame of unpacked spectrum coefficients into samples via pre-twiddle + FFT + post-twiddle:
+// reads 512 floats from src and writes 1024 to dst (unused1/unused2 are never read).
+void transform_frame(void* unused1, float* src, float* dst, void* unused2, float* fft_buf) {
+    const int points = 256;
+    const int half = points / 2;
+    const float scale = 0.001953125f; // 1 / 512
+    float* re = fft_buf;
+    float* im = fft_buf + points;
+    // split src pairs: 1st value of each pair fills re[] forwards, 2nd one fills im[] backwards
+    for (int n = 0; n < points; n++) {
+        re[n] = src[2 * n];
+        im[points - 1 - n] = src[2 * n + 1];
+    }
+
+    transform_twiddles(points, re, im, TWIDDLES_REAL, TWIDDLES_IMAG);
+    transform_fft(points, NULL, re, im, COS_TABLE, SIN_TABLE);
+    transform_twiddles(points, re, im, TWIDDLES_REAL, TWIDDLES_IMAG);
+
+    for (int n = 0; n < points; n++) {
+        re[n] *= scale;
+        im[n] *= scale;
+    }
+
+    // reorder into dst in 3 runs; dst may be the same buffer as src since src was fully copied above
+    float* out = dst;
+    for (int n = 0; n < half; n++) {
+        *out++ = re[half + n];
+        *out++ = -im[half - 1 - n];
+    }
+    for (int n = 0; n < points; n++) {
+        *out++ = im[n];
+        *out++ = -re[points - 1 - n];
+    }
+    for (int n = 0; n < half; n++) {
+        *out++ = -re[n];
+        *out++ = im[points - 1 - n];
+    }
+}
+
+// Unpacks 1 encoded frame and transforms it into 1024 values in coefs (fft_buf is a work area).
+static void frame_to_coefs(uint8_t* frame, float* coefs, float* fft_buf, int steps_size, int bitrate_index) {
+    memset(coefs, 0, FRAME_SAMPLES * sizeof(float));
+    unpack_frame(frame, coefs, steps_size, NULL, bitrate_index);
+    transform_frame(NULL, coefs, coefs, NULL, fft_buf);
+}
+
+// Setup pass: keeps the windowed 2nd half of a transformed frame as overlap, without output.
+static void frame_save_overlap(float* tail, const float* coefs) {
+    for (int i = 0; i < FRAME_SAMPLES; i++) {
+        tail[i] = coefs[512 + i] * OVERLAP_WINDOW[511 - i];
+    }
+}
+
+// Regular pass: windows both halves, mixes the 1st one with the saved overlap into 'out' (1 sample
+// every 'stride' floats), then keeps the 2nd one as overlap for the next block.
+static void frame_mix_overlap(float* out, int stride, float* tail, float* coefs) {
+    for (int i = 0; i < FRAME_SAMPLES; i++) {
+        coefs[i] *= OVERLAP_WINDOW[i];
+        coefs[512 + i] *= OVERLAP_WINDOW[511 - i];
+        out[i * stride] = coefs[i] + tail[i];
+    }
+
+    memcpy(tail, &coefs[512], FRAME_SAMPLES * sizeof(float));
+}
+
+// Decodes a block of frames (see .h)
+//
+// To get 512 samples the decoder combines the current frame with overlap saved from the previous
+// one (MP3 granule-style?), and decoding the current frame also saves the overlap for the next.
+// With 'setup_flag' the block is only used to save that overlap and nothing is written to dst,
+// which is only needed on init or after seeking.
+//
+// Original decoder expects 2 blocks in src (1 frame * channels * tracks): src[0] = prev,
+// src[block-size] = curr (even if prev isn't used), probably due to how data is read/handled in
+// KT's engine. This isn't very flexible, so this decoder (MOD) expects only 1 block per call.
+//
+// - src: 1 block, that is 'tracks * channels' consecutive frames of the bitrate's frame size
+// - dst: FRAME_SAMPLES * channels floats per track, channel-interleaved (untouched on setup)
+// - bitrate_mode: header value (BITRATE_INDEX_MODIFIER is added here)
+// - prev: overlap storage, FRAME_SAMPLES floats per frame
+// - temp: work area of 1024 * 2 floats (fft buf + coefs)
+static void decode_frame(unsigned char* src, int tracks, int channels, float* dst, int bitrate_mode, int setup_flag, float* prev, float* temp) {
+    float* fft_buf = temp;          // first 512 * 2 floats
+    float* coefs = temp + 512 * 2;  // last 512 * 2 floats
+
+    const int bitrate_index = bitrate_mode + BITRATE_INDEX_MODIFIER;
+    int steps_size = 0;
+    int coefs_size = 0;
+    get_frame_info(bitrate_index, &steps_size, &coefs_size);
+    const int frame_size = 0x04 + steps_size + coefs_size;
+
+    // setup call: the block acts as OG's 'prev block of frames' and only fills 'prev'
+    if (setup_flag) {
+        for (int track = 0; track < tracks; track++) {
+            for (int ch = 0; ch < channels; ch++) {
+                const int frame_num = channels * track + ch;
+
+                frame_to_coefs(&src[frame_num * frame_size], coefs, fft_buf, steps_size, bitrate_index);
+                frame_save_overlap(&prev[frame_num * FRAME_SAMPLES], coefs);
+            }
+        }
+        return; // MOD: expect only 1 block per call
+    }
+
+    // regular call: the block acts as OG's 'current block of frames' (which OG reads from
+    // src[channels * tracks * frame_size], right after the prev block)
+    for (int track = 0; track < tracks; track++) {
+        float* dst_track = &dst[channels * track * FRAME_SAMPLES];
+
+        for (int ch = 0; ch < channels; ch++) {
+            const int frame_num = channels * track + ch;
+            float* tail = &prev[frame_num * FRAME_SAMPLES];
+
+            frame_to_coefs(&src[frame_num * frame_size], coefs, fft_buf, steps_size, bitrate_index);
+            frame_mix_overlap(&dst_track[ch], channels, tail, coefs);
+        }
+    }
+}
+
+//-----------------------------------------------------------------------------
+// API (not part of original code)
+
+// Decoder instance: config given to ka1a_init plus the state carried between ka1a_decode calls.
+struct ka1a_handle_t {
+    // config
+    int bitrate_mode;       // header value as passed to ka1a_init (BITRATE_INDEX_MODIFIER not applied)
+    int channels;
+    int tracks;
+    // state
+    bool setup_flag;        // next frame will be used as setup and won't output samples
+    float temp[1024 * 2];   // work area for decode_frame: fft buf (1st half) + coef buf (2nd half)
+    float* prev;            // overlap kept between frames, FRAME_SAMPLES * channels * tracks floats
+};
+
+// Creates a decoder for a header bitrate mode (-5..5) and channel/track layout (see .h).
+// Returns NULL on invalid config or failed allocation; release the handle with ka1a_free.
+ka1a_handle_t* ka1a_init(int bitrate_mode, int channels, int tracks) {
+    int bitrate_index = bitrate_mode + BITRATE_INDEX_MODIFIER;
+    if (bitrate_index < 0 || bitrate_index >= MAX_BITRATES)
+        return NULL;
+
+    // validate each value on its own: the product alone accepts 2 negatives and may overflow
+    if (channels <= 0 || tracks <= 0 || channels > MAX_CHANNELS_TRACKS / tracks)
+        return NULL;
+
+    ka1a_handle_t* ctx = calloc(1, sizeof(ka1a_handle_t));
+    if (!ctx) goto fail;
+    // overlap buffer: 1 frame of samples per channel and track
+    ctx->prev = calloc(FRAME_SAMPLES * channels * tracks, sizeof(float));
+    if (!ctx->prev) goto fail; // must test the new buffer, not ctx again
+
+    ctx->bitrate_mode = bitrate_mode;
+    ctx->channels = channels;
+    ctx->tracks = tracks;
+    ka1a_reset(ctx);
+    return ctx;
+fail:
+    ka1a_free(ctx);
+    return NULL;
+}
+
+// Releases a handle from ka1a_init together with its overlap buffer (NULL is ignored).
+void ka1a_free(ka1a_handle_t* ctx) {
+    if (ctx) {
+        free(ctx->prev);
+        free(ctx);
+    }
+}
+
+// Restarts decoding (ex. after seeking): the next ka1a_decode call is only used as setup.
+// Buffers are left as-is, since that setup frame rewrites the overlap data.
+void ka1a_reset(ka1a_handle_t* ctx) {
+    if (ctx != NULL) {
+        ctx->setup_flag = true;
+    }
+}
+
+// Decodes 1 block of frames from src. Returns -1 without a handle, 0 when the block was only
+// used as setup (first call after init/reset, dst untouched), else FRAME_SAMPLES.
+int ka1a_decode(ka1a_handle_t* ctx, unsigned char* src, float* dst) {
+    if (!ctx)
+        return -1;
+
+    const bool setup_only = ctx->setup_flag;
+    decode_frame(src, ctx->tracks, ctx->channels, dst, ctx->bitrate_mode, setup_only, ctx->prev, ctx->temp);
+
+    // setup only applies to the first block after init/reset
+    ctx->setup_flag = false;
+    return setup_only ? 0 : FRAME_SAMPLES;
+}
+
+// Size of 1 encoded frame for this handle's bitrate mode (from get_frame_size),
+// or 0 without a handle.
+int ka1a_get_frame_size(ka1a_handle_t* ctx) {
+    return ctx ? get_frame_size(ctx->bitrate_mode) : 0;
+}
diff --git a/src/coding/libs/ka1a_dec.h b/src/coding/libs/ka1a_dec.h
new file mode 100644
index 00000000..4c3fe373
--- /dev/null
+++ b/src/coding/libs/ka1a_dec.h
@@ -0,0 +1,42 @@
+#ifndef _KA1A_DEC_
+#define _KA1A_DEC_
+
+/* Decodes Koei Tecmo's KA1A, a fairly simple transform-based (FFT) mono codec. */
+
+
+//#define KA1A_FRAME_SIZE_MAX 0x200
+#define KA1A_FRAME_SAMPLES 512
+
+
+typedef struct ka1a_handle_t ka1a_handle_t;
+
+/* Inits decoder.
+ * - bitrate_mode: value from header (-5..5)
+ * - channels: Nch-interleaved tracks
+ * - tracks: number of parts of N-ch
+ *
+ * Channel/tracks define final interleaved output per ka1a_decode:
+ *    [track0 ch0 ch1 ch0 ch1... x512][track1 ch0 ch1 ch0 ch1... x512]...
+ * Codec is mono though, so this can be safely reinterpreted, ex. channels = tracks * channels, tracks = 1:
+ *    [track0 ch0 ch1 ch2 ch3 ch4 ch5... x512]
+ * or even make N single decoders per track/channel and pass single frames.
+ */
+ka1a_handle_t* ka1a_init(int bitrate_mode, int channels, int tracks);
+
+void ka1a_free(ka1a_handle_t* handle);
+
+void ka1a_reset(ka1a_handle_t* handle);
+
+/* Decodes one block of data.
+ * Returns samples done, 0 on setup or negative on error.
+ * After init/reset next decode won't output samples (similar to encoder delay).
+ *
+ * src should have frame_size * channels * tracks.
+ * dst should have KA1A_FRAME_SAMPLES * channels * tracks (see init for interleave info).
+ */
+int ka1a_decode(ka1a_handle_t* handle, unsigned char* src, float* dst);
+
+// Get current frame size for one single frame.
+int ka1a_get_frame_size(ka1a_handle_t* handle);
+
+#endif
diff --git a/src/coding/libs/ka1a_dec_data.h b/src/coding/libs/ka1a_dec_data.h
new file mode 100644
index 00000000..fb7271b3
--- /dev/null
+++ b/src/coding/libs/ka1a_dec_data.h
@@ -0,0 +1,260 @@
+#ifndef _KA1A_DEC_DATA_
+#define _KA1A_DEC_DATA_
+
+#define MAX_CHANNELS_TRACKS   32 //arbitrary max
+
+#define FRAME_SAMPLES 512
+#define MAX_BANDS   21
+#define FFT_POINTS  256
+#define MAX_BITRATES  11
+
+// bitrate mode in header is defined from -5 to 5, where negative are lower bitrate modes which use
+// less resolution for some codes. Related functions need to add +5 to index so it's pretty pointless.
+#define BITRATE_INDEX_MODIFIER  5
+
+// default number of quantized coefficients encoded per band, one row per bitrate index (mode + BITRATE_INDEX_MODIFIER)
+static const int BAND_CODES[MAX_BITRATES][MAX_BANDS] = {
+	{5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, },
+	{5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, },
+	{5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 3, },
+	{5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, },
+	{5, 5, 5, 5, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, },
+	{5, 5, 5, 5, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, },
+	{5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, },
+	{5, 5, 5, 5, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, },
+	{5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, },
+	{5, 5, 5, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, },
+	{5, 5, 5, 5, 5, 5, 5, 5, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, },
+};
+
+// Number of modified coefs to be added/subtracted to some bands, for each bitrate mode (varies per frame)
+// Total per 1 band shouldn't go over 10.
+static const int BITRATE_SUBMODE[MAX_BITRATES] = {
+    0, 0, 0, 2, 2, 2, 4, 3, 2, 1, 0,
+};
+
+// base positions in dst buffer for coefs in frame. A sub-position (implicit or from a bitstream) sets
+// the final index, which doesn't need to be linear.
+// ex. band 14 (0-based) may write 6 coefs to dst[120 + step], where step may be 0, 11, 6, 2, 8, 13
+//     (max 19; unset indexes are implicitly 0)
+static const int BAND_STEPS[MAX_BANDS] = {
+    0, 5, 10, 15, 20, 25, 30, 35, 40, 50, 60, 70, 80, 100, 120, 140, 170, 200, 240, 300, 390,
+};
+
+// lower bands are 0: presumably their sub-positions are implicit (first 8 bands always have 5 codes in 5-wide slots), TODO confirm
+static const int BAND_STEP_BITS[MAX_BANDS] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 7, 7,
+};
+
+// 360 cosine, close to:  for (0..256) t[i] = cos(2 * PI * i / points) with some rounding?
+static const float COS_TABLE[FFT_POINTS] = {
+    1.0, 0.99969882, 0.99879545, 0.99729043, 0.99518472, 0.99247956, 0.98917651, 0.98527765,
+    0.98078525, 0.97570211, 0.97003126, 0.96377605, 0.95694035, 0.94952816, 0.94154406, 0.93299282,
+    0.9238795, 0.91420972, 0.90398932, 0.8932243, 0.88192123, 0.87008697, 0.8577286, 0.84485358,
+    0.8314696, 0.81758481, 0.80320752, 0.78834641, 0.77301043, 0.75720882, 0.74095112, 0.7242471,
+    0.70710677, 0.68954051, 0.67155892, 0.65317279, 0.63439327, 0.61523157, 0.59569931, 0.57580817,
+    0.55557019, 0.53499764, 0.5141027, 0.4928982, 0.47139665, 0.44961131, 0.42755511, 0.40524128,
+    0.38268343, 0.35989496, 0.33688983, 0.31368166, 0.29028463, 0.26671275, 0.24298012, 0.21910122,
+    0.19509023, 0.17096186, 0.1467305, 0.12241063, 0.098017134, 0.073564492, 0.04906765, 0.024541136,
+    -0.0000000437, -0.024541223, -0.049067739, -0.073564574, -0.098017223, -0.12241071, -0.14673057, -0.17096195,
+    -0.19509032, -0.21910131, -0.2429802, -0.26671284, -0.29028472, -0.31368172, -0.33688992, -0.35989505,
+    -0.38268352, -0.40524134, -0.42755508, -0.44961137, -0.47139683, -0.49289817, -0.51410276, -0.5349977,
+    -0.55557036, -0.57580817, -0.59569937, -0.61523169, -0.63439327, -0.65317285, -0.67155904, -0.68954068,
+    -0.70710677, -0.72424716, -0.74095124, -0.75720882, -0.77301049, -0.78834647, -0.80320764, -0.81758481,
+    -0.83146966, -0.84485364, -0.8577286, -0.87008703, -0.88192135, -0.8932243, -0.90398932, -0.91420978,
+    -0.92387962, -0.93299282, -0.94154412, -0.94952822, -0.95694035, -0.96377605, -0.97003126, -0.97570217,
+    -0.98078531, -0.98527765, -0.98917651, -0.9924795, -0.99518472, -0.99729049, -0.99879545, -0.99969882,
+    -1.0, -0.99969882, -0.99879545, -0.99729043, -0.99518472, -0.9924795, -0.98917651, -0.98527765,
+    -0.98078525, -0.97570211, -0.97003126, -0.96377605, -0.95694029, -0.94952816, -0.94154406, -0.93299276,
+    -0.9238795, -0.91420972, -0.90398926, -0.89322418, -0.88192123, -0.87008691, -0.85772854, -0.84485358,
+    -0.83146954, -0.81758469, -0.80320752, -0.78834641, -0.77301037, -0.7572087, -0.74095112, -0.72424704,
+    -0.70710665, -0.68954057, -0.67155892, -0.65317291, -0.63439333, -0.61523157, -0.59569919, -0.57580805,
+    -0.55557001, -0.53499734, -0.51410282, -0.4928982, -0.47139668, -0.44961122, -0.42755494, -0.40524107,
+    -0.38268313, -0.35989511, -0.33688986, -0.31368169, -0.29028454, -0.26671258, -0.24297991, -0.21910091,
+    -0.19509038, -0.17096189, -0.14673041, -0.12241054, -0.098016933, -0.073564284, -0.049067326, -0.024541287,
+    0.0000000119, 0.024541309, 0.049067825, 0.073564783, 0.098017432, 0.12241104, 0.14673042, 0.17096192,
+    0.19509041, 0.2191014, 0.24298041, 0.26671305, 0.29028502, 0.31368169, 0.33688989, 0.35989514,
+    0.3826836, 0.40524155, 0.42755538, 0.44961166, 0.47139671, 0.49289823, 0.51410282, 0.53499776,
+    0.55557042, 0.57580847, 0.59569925, 0.61523157, 0.63439333, 0.65317291, 0.6715591, 0.68954074,
+    0.70710701, 0.72424704, 0.74095112, 0.75720888, 0.77301055, 0.78834653, 0.8032077, 0.81758499,
+    0.8314696, 0.84485358, 0.85772866, 0.87008709, 0.88192135, 0.89322442, 0.90398943, 0.91420972,
+    0.92387956, 0.93299282, 0.94154412, 0.94952828, 0.95694041, 0.96377617, 0.97003126, 0.97570211,
+    0.98078531, 0.98527765, 0.98917657, 0.99247956, 0.99518478, 0.99729043, 0.99879545, 0.99969882,
+};
+
+// 360 sine, close to:  for (0..256) t[i] = sin(2 * PI * i / points) with some rounding? (only 248 values are listed, last 8 are implicitly 0)
+static const float SIN_TABLE[FFT_POINTS] = {
+    0.0, 0.024541229, 0.049067676, 0.073564567, 0.098017141, 0.12241068, 0.14673047, 0.1709619,
+    0.19509032, 0.21910124, 0.2429802, 0.26671278, 0.29028466, 0.31368175, 0.33688986, 0.35989505,
+    0.38268346, 0.40524134, 0.42755508, 0.44961134, 0.47139674, 0.49289823, 0.51410276, 0.53499764,
+    0.55557024, 0.57580823, 0.59569931, 0.61523163, 0.63439333, 0.65317285, 0.67155898, 0.68954057,
+    0.70710677, 0.7242471, 0.74095118, 0.75720888, 0.77301043, 0.78834641, 0.80320752, 0.81758481,
+    0.83146966, 0.84485358, 0.85772866, 0.87008697, 0.88192129, 0.8932243, 0.90398932, 0.91420978,
+    0.9238795, 0.93299282, 0.94154406, 0.94952822, 0.95694035, 0.96377605, 0.97003126, 0.97570211,
+    0.98078531, 0.98527765, 0.98917651, 0.99247956, 0.99518472, 0.99729043, 0.99879545, 0.99969882,
+    1.0, 0.99969882, 0.99879545, 0.99729043, 0.99518472, 0.9924795, 0.98917651, 0.98527765,
+    0.98078525, 0.97570211, 0.97003126, 0.96377605, 0.95694029, 0.94952816, 0.94154406, 0.93299282,
+    0.9238795, 0.91420972, 0.90398932, 0.8932243, 0.88192123, 0.87008703, 0.8577286, 0.84485352,
+    0.83146954, 0.81758481, 0.80320752, 0.78834635, 0.77301049, 0.75720882, 0.74095106, 0.72424698,
+    0.70710677, 0.68954051, 0.67155886, 0.65317285, 0.63439327, 0.61523151, 0.59569913, 0.57580817,
+    0.55557019, 0.53499746, 0.51410276, 0.49289814, 0.47139663, 0.44961137, 0.42755505, 0.40524122,
+    0.38268328, 0.35989505, 0.3368898, 0.3136816, 0.29028472, 0.26671273, 0.24298008, 0.21910107,
+    0.19509031, 0.17096181, 0.14673033, 0.1224107, 0.098017097, 0.073564447, 0.049067486, 0.02454121,
+    -0.000000087399997, -0.024541385, -0.049067661, -0.073564619, -0.098017268, -0.12241087, -0.1467305, -0.17096199,
+    -0.19509049, -0.21910124, -0.24298024, -0.2667129, -0.29028487, -0.31368178, -0.33688995, -0.3598952,
+    -0.38268343, -0.4052414, -0.42755523, -0.44961151, -0.47139677, -0.49289829, -0.51410288, -0.53499764,
+    -0.5555703, -0.57580835, -0.59569931, -0.61523163, -0.63439339, -0.65317297, -0.67155898, -0.68954062,
+    -0.70710689, -0.7242471, -0.74095118, -0.75720876, -0.77301043, -0.78834647, -0.80320758, -0.81758493,
+    -0.83146977, -0.84485376, -0.85772854, -0.87008697, -0.88192129, -0.89322436, -0.90398937, -0.91420984,
+    -0.92387968, -0.93299276, -0.94154406, -0.94952822, -0.95694035, -0.96377611, -0.97003132, -0.97570223,
+    -0.98078525, -0.98527765, -0.98917651, -0.99247956, -0.99518472, -0.99729049, -0.99879545, -0.99969882,
+    -1.0, -0.99969882, -0.99879545, -0.99729043, -0.99518472, -0.9924795, -0.98917651, -0.98527765,
+    -0.98078525, -0.97570211, -0.9700312, -0.96377599, -0.95694023, -0.94952822, -0.94154406, -0.93299276,
+    -0.92387944, -0.91420966, -0.90398914, -0.89322412, -0.88192129, -0.87008697, -0.85772854, -0.84485346,
+    -0.83146948, -0.81758463, -0.80320758, -0.78834641, -0.77301043, -0.75720876, -0.740951, -0.72424692,
+    -0.70710653, -0.68954062, -0.67155898, -0.65317279, -0.63439316, -0.61523145, -0.59569907, -0.57580793,
+    -0.5555703, -0.53499764, -0.5141027, -0.49289808, -0.47139654, -0.44961107, -0.42755479, -0.40524137,
+    -0.38268343, -0.35989496, -0.33688971, -0.31368154, -0.2902844, -0.2667124, -0.24298023, -0.21910122,
+};
+
+// similar but not quite:  for (0..256) t[i] = cos(2 * PI * i / points); values look closer to cos(2 * PI * (i + 1/8) / 1024)
+static const float TWIDDLES_REAL[FFT_POINTS] = {
+    0.9999997, 0.99997616, 0.999915, 0.99981618, 0.99967968, 0.99950558, 0.99929386, 0.99904448,
+    0.99875754, 0.99843293, 0.99807078, 0.99767107, 0.99723375, 0.99675888, 0.99624652, 0.9956966,
+    0.99510926, 0.99448442, 0.99382216, 0.99312246, 0.99238533, 0.99161088, 0.99079913, 0.98995006,
+    0.98906368, 0.98814011, 0.98717928, 0.98618132, 0.98514622, 0.98407406, 0.98296481, 0.98181856,
+    0.98063534, 0.97941524, 0.97815824, 0.9768644, 0.97553378, 0.97416645, 0.97276247, 0.97132182,
+    0.96984458, 0.96833086, 0.96678072, 0.96519411, 0.96357119, 0.96191204, 0.96021664, 0.95848507,
+    0.95671743, 0.95491374, 0.9530741, 0.95119864, 0.9492873, 0.94734025, 0.94535756, 0.94333923,
+    0.94128537, 0.93919611, 0.9370715, 0.93491161, 0.93271649, 0.93048626, 0.92822099, 0.92592078,
+    0.92358571, 0.92121589, 0.91881138, 0.9163723, 0.91389865, 0.91139066, 0.90884835, 0.90627176,
+    0.90366107, 0.90101641, 0.89833778, 0.89562535, 0.89287919, 0.89009941, 0.88728613, 0.88443941,
+    0.88155943, 0.87864625, 0.8757, 0.87272078, 0.86970866, 0.86666387, 0.86358637, 0.86047643,
+    0.85733402, 0.85415941, 0.85095257, 0.84771371, 0.84444296, 0.84114039, 0.83780617, 0.83444041,
+    0.83104324, 0.82761478, 0.82415515, 0.82066447, 0.8171429, 0.81359059, 0.81000769, 0.80639422,
+    0.80275041, 0.79907632, 0.79537225, 0.7916382, 0.78787428, 0.7840808, 0.78025776, 0.77640527,
+    0.77252364, 0.76861292, 0.76467323, 0.76070476, 0.75670767, 0.75268203, 0.74862808, 0.74454594,
+    0.74043584, 0.73629779, 0.73213202, 0.72793871, 0.72371799, 0.71947002, 0.71519494, 0.71089298,
+    0.70656419, 0.70220888, 0.6978271, 0.69341904, 0.68898481, 0.68452471, 0.68003887, 0.67552733,
+    0.67099041, 0.66642827, 0.66184098, 0.65722877, 0.65259188, 0.64793038, 0.64324445, 0.63853431,
+    0.63380021, 0.62904215, 0.62426049, 0.61945528, 0.61462677, 0.60977507, 0.60490042, 0.60000306,
+    0.59508306, 0.59014064, 0.58517605, 0.58018941, 0.57518089, 0.57015073, 0.56509918, 0.56002629,
+    0.5549323, 0.5498175, 0.54468191, 0.53952587, 0.5343495, 0.52915293, 0.52393651, 0.51870036,
+    0.51344466, 0.50816965, 0.50287557, 0.4975625, 0.49223068, 0.48688033, 0.48151165, 0.47612482,
+    0.47072011, 0.46529773, 0.45985776, 0.45440048, 0.44892606, 0.44343477, 0.43792677, 0.43240228,
+    0.42686164, 0.42130479, 0.41573209, 0.41014373, 0.40453994, 0.39892092, 0.39328688, 0.38763815,
+    0.3819747, 0.37629688, 0.3706049, 0.36489895, 0.35917926, 0.35344607, 0.34769964, 0.34194005,
+    0.33616757, 0.33038244, 0.32458487, 0.31877509, 0.31295338, 0.30711982, 0.30127469, 0.2954182,
+    0.28955057, 0.28367206, 0.27778289, 0.27188337, 0.26597348, 0.26005358, 0.2541239, 0.24818464,
+    0.24223605, 0.23627833, 0.23031183, 0.22433653, 0.21835281, 0.21236086, 0.20636091, 0.20035319,
+    0.19433793, 0.18831547, 0.1822858, 0.17624927, 0.1702061, 0.16415653, 0.15810078, 0.15203907,
+    0.14597176, 0.13989884, 0.13382064, 0.1277374, 0.12164936, 0.11555674, 0.10945977, 0.10335879,
+    0.097253807, 0.091145165, 0.085033081, 0.078917801, 0.072799556, 0.066678561, 0.060555179, 0.054429397,
+    0.048301566, 0.042171918, 0.036040682, 0.029908087, 0.023774367, 0.01763987, 0.011504591, 0.0053688786,
+};
+
+// similar but not quite:  for (0..256) t[i] = -sin(2 * PI * i / points); values look closer to -sin(2 * PI * (i + 1/8) / 1024)
+static const float TWIDDLES_IMAG[] = {
+    -0.00076699042, -0.0069028586, -0.013038468, -0.019173585, -0.025307981, -0.031441424, -0.037573684, -0.043704528,
+    -0.049833726, -0.05596105, -0.062086266, -0.068209141, -0.074329458, -0.080446973, -0.086561449, -0.092672676,
+    -0.098780416, -0.10488442, -0.1109845, -0.11708038, -0.12317186, -0.12925872, -0.13534068, -0.14141756,
+    -0.14748912, -0.15355512, -0.15961535, -0.16566958, -0.17171754, -0.17775905, -0.18379387, -0.18982176,
+    -0.19584252, -0.20185591, -0.20786169, -0.21385963, -0.21984953, -0.22583117, -0.23180428, -0.23776868,
+    -0.24372412, -0.24967039, -0.25560728, -0.26153448, -0.26745188, -0.27335921, -0.27925625, -0.28514278,
+    -0.29101855, -0.2968834, -0.30273706, -0.3085793, -0.31440994, -0.32022873, -0.3260355, -0.33182994,
+    -0.33761194, -0.3433812, -0.34913751, -0.35488072, -0.36061054, -0.36632681, -0.37202924, -0.3777177,
+    -0.38339195, -0.38905174, -0.39469689, -0.40032718, -0.40594241, -0.41154233, -0.41712674, -0.42269552,
+    -0.42824832, -0.43378502, -0.43930539, -0.44480923, -0.45029631, -0.45576641, -0.4612194, -0.46665499,
+    -0.47207305, -0.47747329, -0.48285556, -0.48821968, -0.49356541, -0.49889252, -0.50420088, -0.50949031,
+    -0.51476049, -0.52001131, -0.52524251, -0.53045398, -0.53564543, -0.54081678, -0.54596776, -0.55109817,
+    -0.55620778, -0.56129652, -0.56636411, -0.57141036, -0.57643509, -0.58143818, -0.58641928, -0.59137839,
+    -0.59631521, -0.60122955, -0.60612124, -0.61099017, -0.61583608, -0.62065876, -0.62545812, -0.63023394,
+    -0.63498604, -0.63971418, -0.64441824, -0.6490981, -0.6537534, -0.6583842, -0.66299021, -0.66757119,
+    -0.67212707, -0.67665768, -0.68116277, -0.68564218, -0.69009584, -0.69452351, -0.69892502, -0.70330018,
+    -0.70764893, -0.71197104, -0.71626627, -0.72053456, -0.72477579, -0.72898966, -0.73317605, -0.73733491,
+    -0.74146605, -0.74556917, -0.74964428, -0.75369114, -0.75770962, -0.76169956, -0.76566088, -0.76959336,
+    -0.77349681, -0.77737117, -0.78121626, -0.78503191, -0.78881806, -0.79257452, -0.79630113, -0.79999769,
+    -0.80366421, -0.80730045, -0.81090623, -0.81448156, -0.81802624, -0.82154006, -0.825023, -0.82847482,
+    -0.83189553, -0.83528483, -0.83864272, -0.84196901, -0.84526366, -0.84852648, -0.85175729, -0.85495609,
+    -0.85812271, -0.86125702, -0.86435878, -0.86742812, -0.8704648, -0.8734687, -0.87643969, -0.87937772,
+    -0.88228261, -0.88515425, -0.88799256, -0.8907975, -0.89356887, -0.89630663, -0.89901066, -0.90168083,
+    -0.90431696, -0.90691912, -0.90948713, -0.91202086, -0.91452032, -0.91698533, -0.91941583, -0.92181164,
+    -0.92417276, -0.92649913, -0.92879063, -0.93104714, -0.93326861, -0.93545491, -0.93760598, -0.93972176,
+    -0.9418022, -0.94384718, -0.94585657, -0.94783038, -0.94976848, -0.95167089, -0.9535374, -0.95536804,
+    -0.95716274, -0.95892137, -0.96064389, -0.96233022, -0.96398038, -0.96559417, -0.96717167, -0.96871275,
+    -0.97021735, -0.97168541, -0.97311687, -0.97451174, -0.97586989, -0.97719133, -0.97847593, -0.97972375,
+    -0.98093462, -0.98210859, -0.98324561, -0.98434556, -0.98540848, -0.98643428, -0.987423, -0.98837447,
+    -0.98928875, -0.99016583, -0.99100554, -0.991808, -0.99257314, -0.99330086, -0.99399126, -0.99464417,
+    -0.99525958, -0.99583763, -0.99637812, -0.99688113, -0.99734658, -0.99777448, -0.99816483, -0.99851763,
+    -0.99883282, -0.99911034, -0.99935031, -0.99955267, -0.99971735, -0.99984443, -0.99993384, -0.99998558,
+};
+
+// seems custom, perhaps based on some common one with some alpha?
+static const float OVERLAP_WINDOW[FRAME_SAMPLES] = {
+    0.00041374451, 0.00063187029, 0.00083242479, 0.0010303947, 0.0012312527, 0.0014377162, 0.0016513923, 0.001873354,
+    0.0021043862, 0.0023451056, 0.0025960256, 0.0028575913, 0.0031302026, 0.0034142293, 0.003710018, 0.0040178993,
+    0.0043381932, 0.00467121, 0.0050172545, 0.0053766258, 0.00574962, 0.0061365301, 0.0065376465, 0.0069532581,
+    0.007383653, 0.0078291167, 0.008289936, 0.0087663941, 0.0092587769, 0.0097673666, 0.010292448, 0.010834301,
+    0.01139321, 0.011969455, 0.012563316, 0.013175075, 0.01380501, 0.0144534, 0.015120523, 0.015806656,
+    0.016512074, 0.017237054, 0.017981868, 0.018746791, 0.019532094, 0.020338045, 0.021164915, 0.022012968,
+    0.022882473, 0.023773693, 0.02468689, 0.025622323, 0.026580252, 0.027560933, 0.028564619, 0.02959156,
+    0.030642008, 0.031716209, 0.032814406, 0.033936843, 0.035083756, 0.036255382, 0.037451953, 0.038673703,
+    0.039920855, 0.041193634, 0.042492259, 0.043816946, 0.045167912, 0.046545364, 0.047949508, 0.049380545,
+    0.050838675, 0.05232409, 0.053836983, 0.055377539, 0.056945939, 0.058542356, 0.06016697, 0.061819945,
+    0.063501447, 0.065211624, 0.066950649, 0.068718657, 0.070515797, 0.07234221, 0.074198022, 0.07608337,
+    0.07799837, 0.07994315, 0.081917815, 0.083922468, 0.085957222, 0.088022165, 0.090117387, 0.092242986,
+    0.094399013, 0.096585557, 0.098802686, 0.10105046, 0.10332893, 0.10563815, 0.10797815, 0.11034897,
+    0.11275065, 0.1151832, 0.11764663, 0.12014097, 0.12266621, 0.12522236, 0.12780938, 0.13042729,
+    0.13307604, 0.13575561, 0.13846597, 0.14120705, 0.14397883, 0.14678125, 0.14961423, 0.15247771,
+    0.15537159, 0.15829581, 0.16125028, 0.16423489, 0.16724953, 0.17029409, 0.17336844, 0.17647249,
+    0.17960605, 0.18276905, 0.18596126, 0.18918259, 0.19243285, 0.19571187, 0.19901948, 0.2023555,
+    0.20571974, 0.20911199, 0.21253204, 0.21597971, 0.21945477, 0.22295699, 0.22648615, 0.23004198,
+    0.23362428, 0.23723276, 0.2408672, 0.2445273, 0.24821278, 0.25192341, 0.25565886, 0.25941887,
+    0.26320317, 0.26701137, 0.27084324, 0.27469841, 0.27857658, 0.28247747, 0.28640065, 0.29034585,
+    0.29431269, 0.29830083, 0.30230993, 0.30633962, 0.31038952, 0.31445926, 0.31854844, 0.32265672,
+    0.32678369, 0.33092892, 0.33509207, 0.33927271, 0.34347042, 0.3476848, 0.35191545, 0.35616189,
+    0.36042371, 0.36470053, 0.36899185, 0.37329727, 0.37761635, 0.38194862, 0.38629359, 0.3906509,
+    0.39502001, 0.3994005, 0.4037919, 0.40819371, 0.41260549, 0.41702676, 0.42145702, 0.42589581,
+    0.43034267, 0.43479711, 0.43925858, 0.44372663, 0.44820082, 0.45268059, 0.45716542, 0.4616549,
+    0.4661485, 0.47064567, 0.47514597, 0.47964889, 0.4841539, 0.48866051, 0.49316826, 0.49767655,
+    0.50218499, 0.50669295, 0.51120001, 0.5157057, 0.52020943, 0.52471071, 0.52920908, 0.53370398,
+    0.53819495, 0.54268152, 0.54716307, 0.5516392, 0.55610937, 0.56057316, 0.56502998, 0.56947935,
+    0.57392085, 0.57835394, 0.5827781, 0.58719289, 0.59159786, 0.59599245, 0.60037625, 0.60474873,
+    0.6091094, 0.61345792, 0.61779374, 0.62211639, 0.62642545, 0.63072038, 0.63500077, 0.63926625,
+    0.6435163, 0.64775056, 0.65196848, 0.65616965, 0.66035372, 0.66452026, 0.66866881, 0.67279899,
+    0.67691034, 0.6810025, 0.6850751, 0.68912768, 0.69315994, 0.69717139, 0.7011618, 0.70513064,
+    0.70907766, 0.71300244, 0.7169047, 0.72078407, 0.72464013, 0.72847265, 0.73228133, 0.73606575,
+    0.73982555, 0.74356061, 0.74727046, 0.75095487, 0.75461364, 0.7582463, 0.76185274, 0.76543266,
+    0.76898569, 0.77251172, 0.77601039, 0.77948159, 0.78292501, 0.78634042, 0.78972763, 0.79308641,
+    0.79641658, 0.79971796, 0.80299026, 0.80623347, 0.80944729, 0.81263155, 0.81578618, 0.81891102,
+    0.82200587, 0.82507062, 0.82810515, 0.83110934, 0.83408308, 0.83702624, 0.83993882, 0.84282064,
+    0.84567159, 0.84849167, 0.85128081, 0.85403895, 0.85676599, 0.85946196, 0.86212677, 0.86476046,
+    0.86736292, 0.86993414, 0.87247425, 0.87498307, 0.87746072, 0.87990719, 0.88232255, 0.88470674,
+    0.88705987, 0.88938189, 0.89167297, 0.89393318, 0.89616245, 0.89836091, 0.90052873, 0.90266585,
+    0.90477246, 0.90684867, 0.90889448, 0.91091013, 0.91289562, 0.91485113, 0.91677684, 0.91867274,
+    0.92053914, 0.9223761, 0.92418379, 0.92596233, 0.92771196, 0.92943287, 0.93112504, 0.93278885,
+    0.9344244, 0.936032, 0.93761164, 0.93916368, 0.94068825, 0.94218558, 0.94365591, 0.94509935,
+    0.94651628, 0.94790679, 0.9492712, 0.95060962, 0.95192248, 0.95320988, 0.95447206, 0.95570934,
+    0.95692199, 0.95811015, 0.95927411, 0.96041423, 0.96153063, 0.96262366, 0.96369362, 0.96474063,
+    0.96576512, 0.96676731, 0.96774739, 0.96870577, 0.96964264, 0.97055829, 0.97145301, 0.97232717,
+    0.97318095, 0.97401464, 0.97482848, 0.97562289, 0.97639805, 0.97715431, 0.97789192, 0.97861117,
+    0.97931236, 0.97999579, 0.98066169, 0.98131043, 0.98194218, 0.98255736, 0.98315614, 0.98373884,
+    0.9843058, 0.98485726, 0.98539352, 0.98591483, 0.98642153, 0.9869138, 0.98739201, 0.98785633,
+    0.98830712, 0.98874468, 0.98916918, 0.98958093, 0.98998028, 0.99036741, 0.99074256, 0.99110597,
+    0.991458, 0.99179888, 0.99212885, 0.99244815, 0.99275702, 0.9930557, 0.99334443, 0.9936235,
+    0.99389309, 0.99415344, 0.99440479, 0.99464744, 0.99488151, 0.99510723, 0.99532485, 0.9955346,
+    0.99573666, 0.99593133, 0.99611866, 0.99629897, 0.99647242, 0.99663919, 0.99679953, 0.99695361,
+    0.9971016, 0.99724364, 0.99737996, 0.99751073, 0.99763614, 0.9977563, 0.99787146, 0.99798179,
+    0.99808735, 0.99818838, 0.99828494, 0.99837732, 0.99846554, 0.99854976, 0.99863017, 0.99870688,
+    0.99878007, 0.99884975, 0.99891615, 0.99897939, 0.99903959, 0.99909681, 0.99915117, 0.99920285,
+    0.9992519, 0.99929845, 0.99934256, 0.9993844, 0.99942398, 0.99946147, 0.99949694, 0.99953043,
+    0.99956208, 0.99959201, 0.99962014, 0.99964666, 0.9996717, 0.99969524, 0.99971735, 0.99973816,
+    0.99975771, 0.99977601, 0.99979317, 0.99980927, 0.99982429, 0.99983829, 0.99985141, 0.99986362,
+    0.99987501, 0.99988562, 0.99989551, 0.99990469, 0.99991322, 0.99992108, 0.99992836, 0.99993503,
+    0.99994129, 0.99994701, 0.99995226, 0.99995708, 0.99996156, 0.99996561, 0.99996936, 0.9999727,
+    0.9999758, 0.9999786, 0.99998116, 0.99998349, 0.99998552, 0.99998742, 0.99998909, 0.99999058,
+    0.99999189, 0.99999309, 0.99999416, 0.99999511, 0.99999589, 0.99999666, 0.99999726, 0.99999779,
+    0.99999827, 0.99999863, 0.99999899, 0.99999923, 0.99999946, 0.99999964, 0.99999976, 0.99999988,
+};
+
+#endif
diff --git a/src/formats.c b/src/formats.c
index c3dfe85a..3e32c8c9 100644
--- a/src/formats.c
+++ b/src/formats.c
@@ -271,6 +271,7 @@ static const char* extension_list[] = {
     "joe",
     "jstm",
 
+    "ka1a",
     "kat",
     "kces",
     "kcey", //fake extension/header id for .pcm (renamed, to be removed)
@@ -907,6 +908,7 @@ static const coding_info coding_info_list[] = {
         {coding_TAC,                "tri-Ace Codec"},
         {coding_ICE_RANGE,          "Inti Creates Range Codec"},
         {coding_ICE_DCT,            "Inti Creates DCT Codec"},
+        {coding_KA1A,               "Koei Tecmo KA1A Codec"},
 
 #ifdef VGM_USE_VORBIS
         {coding_OGG_VORBIS,         "Ogg Vorbis"},
@@ -1449,6 +1451,7 @@ static const meta_info meta_info_list[] = {
         {meta_DSP_ASURA,            "Rebellion DSP header"},
         {meta_ONGAKUKAN_RIFF_ADP,   "Ongakukan RIFF WAVE header"},
         {meta_SDD,                  "Doki Denki DSBH header"},
+        {meta_KA1A,                 "Koei Tecmo KA1A header"},
 };
 
 void get_vgmstream_coding_description(VGMSTREAM* vgmstream, char* out, size_t out_size) {
diff --git a/src/layout/segmented.c b/src/layout/segmented.c
index 02b14f6f..32214927 100644
--- a/src/layout/segmented.c
+++ b/src/layout/segmented.c
@@ -80,16 +80,18 @@ void render_vgmstream_segmented(sbuf_t* sbuf, VGMSTREAM* vgmstream) {
             ssrc->buf = buf_filled;
         }
 
-        render_main(ssrc, data->segments[data->current_segment]);
-
+        int samples_done = render_main(ssrc, data->segments[data->current_segment]);
+        samples_done = samples_to_do;
         // returned buf may have changed
         if (ssrc->buf != buf_filled) {
-            sbuf_copy_segments(sbuf, ssrc);
+            sbuf_copy_segments(sbuf, ssrc, samples_done);
+        } else {
+            //TODO ???
+            sbuf->filled += samples_done;
         }
 
-        sbuf->filled += samples_to_do;
-        vgmstream->current_sample += samples_to_do;
-        vgmstream->samples_into_block += samples_to_do;
+        vgmstream->current_sample += samples_done;
+        vgmstream->samples_into_block += samples_done;
     }
 
     return;
diff --git a/src/libvgmstream.vcxproj b/src/libvgmstream.vcxproj
index 5fbf4e3a..fc012222 100644
--- a/src/libvgmstream.vcxproj
+++ b/src/libvgmstream.vcxproj
@@ -114,6 +114,8 @@
     <ClInclude Include="coding\libs\g7221_data.h" />
     <ClInclude Include="coding\libs\g7221_lib.h" />
     <ClInclude Include="coding\libs\icelib.h" />
+    <ClInclude Include="coding\libs\ka1a_dec.h" />
+    <ClInclude Include="coding\libs\ka1a_dec_data.h" />
     <ClInclude Include="coding\libs\libacm.h" />
     <ClInclude Include="coding\libs\nwa_lib.h" />
     <ClInclude Include="coding\libs\ongakukan_adp_lib.h" />
@@ -269,6 +271,7 @@
     <ClCompile Include="coding\ice_decoder.c" />
     <ClCompile Include="coding\ima_decoder.c" />
     <ClCompile Include="coding\imuse_decoder.c" />
+    <ClCompile Include="coding\ka1a_decoder.c" />
     <ClCompile Include="coding\l5_555_decoder.c" />
     <ClCompile Include="coding\lsf_decoder.c" />
     <ClCompile Include="coding\mc3_decoder.c" />
@@ -321,6 +324,7 @@
     <ClCompile Include="coding\libs\g7221_aes.c" />
     <ClCompile Include="coding\libs\g7221_lib.c" />
     <ClCompile Include="coding\libs\icelib.c" />
+    <ClCompile Include="coding\libs\ka1a_dec.c" />
     <ClCompile Include="coding\libs\libacm_decode.c" />
     <ClCompile Include="coding\libs\libacm_util.c" />
     <ClCompile Include="coding\libs\nwa_lib.c" />
@@ -525,6 +529,7 @@
     <ClCompile Include="meta\ish_isd.c" />
     <ClCompile Include="meta\ivag.c" />
     <ClCompile Include="meta\jstm.c" />
+    <ClCompile Include="meta\ka1a.c" />
     <ClCompile Include="meta\kat.c" />
     <ClCompile Include="meta\kma9.c" />
     <ClCompile Include="meta\knon.c" />
diff --git a/src/libvgmstream.vcxproj.filters b/src/libvgmstream.vcxproj.filters
index 4f79a75d..ccdd6fef 100644
--- a/src/libvgmstream.vcxproj.filters
+++ b/src/libvgmstream.vcxproj.filters
@@ -176,6 +176,12 @@
     <ClInclude Include="coding\libs\icelib.h">
       <Filter>coding\libs\Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="coding\libs\ka1a_dec.h">
+      <Filter>coding\libs\Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="coding\libs\ka1a_dec_data.h">
+      <Filter>coding\libs\Header Files</Filter>
+    </ClInclude>
     <ClInclude Include="coding\libs\libacm.h">
       <Filter>coding\libs\Header Files</Filter>
     </ClInclude>
@@ -637,6 +643,9 @@
     <ClCompile Include="coding\imuse_decoder.c">
       <Filter>coding\Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="coding\ka1a_decoder.c">
+      <Filter>coding\Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="coding\l5_555_decoder.c">
       <Filter>coding\Source Files</Filter>
     </ClCompile>
@@ -793,6 +802,9 @@
     <ClCompile Include="coding\libs\icelib.c">
       <Filter>coding\libs\Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="coding\libs\ka1a_dec.c">
+      <Filter>coding\libs\Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="coding\libs\libacm_decode.c">
       <Filter>coding\libs\Source Files</Filter>
     </ClCompile>
@@ -1405,6 +1417,9 @@
     <ClCompile Include="meta\jstm.c">
       <Filter>meta\Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="meta\ka1a.c">
+      <Filter>meta\Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="meta\kat.c">
       <Filter>meta\Source Files</Filter>
     </ClCompile>
diff --git a/src/meta/ka1a.c b/src/meta/ka1a.c
new file mode 100644
index 00000000..cb8190bc
--- /dev/null
+++ b/src/meta/ka1a.c
@@ -0,0 +1,68 @@
+#include "meta.h"
+#include "../coding/coding.h"
+
+
+/* KA1A - Koei Tecmo's custom codec streams [Dynasty Warriors Origins (PC)]
+ *
+ * Reads the header fields at 0x08..0x20, then sets up a coding_KA1A stream opened at
+ * offset 0x28. Returns NULL if the id or extension doesn't match, if the channel/track
+ * counts are unusable, or if any allocation/open step fails. */
+VGMSTREAM* init_vgmstream_ka1a(STREAMFILE* sf) {
+    VGMSTREAM* vgmstream = NULL;
+    uint32_t start_offset;
+
+    /* checks */
+    if (!is_id32be(0x00,sf, "KA1A"))
+        return NULL;
+    /* .ka1a: header id */
+    if (!check_extensions(sf,"ka1a"))
+        return NULL;
+    // KA1A doesn't seem to be found outside SRST so far, but probably will be (like KOVS)
+
+    //uint32_t data_size = read_u32le(0x04,sf);
+    int channels = read_s32le(0x08,sf);
+    int tracks = read_s32le(0x0c,sf);
+    int sample_rate = read_s32le(0x10,sf);
+    int32_t num_samples = read_s32le(0x14,sf);
+    int32_t loop_start = read_s32le(0x18,sf);
+    int32_t loop_region = read_s32le(0x1c,sf);
+    int bitrate_mode = read_s32le(0x20,sf); // signed! (may be negative)
+    // 0x24: reserved?
+
+    // Counts come straight from the file: reject non-positive values (two negatives would
+    // multiply into a plausible total) and take the product in 64-bit so it can't overflow.
+    if (channels <= 0 || tracks <= 0)
+        return NULL;
+    int64_t total_channels = (int64_t)channels * tracks;
+    if (total_channels > INT32_MAX)
+        return NULL;
+
+    bool loop_flag = (loop_region > 0);
+
+    start_offset = 0x28;
+
+    /* build the VGMSTREAM */
+    vgmstream = allocate_vgmstream((int)total_channels, loop_flag);
+    if (!vgmstream) goto fail;
+
+    vgmstream->meta_type = meta_KA1A;
+    vgmstream->sample_rate = sample_rate;
+    vgmstream->num_samples = num_samples;
+    vgmstream->loop_start_sample = loop_start;
+    vgmstream->loop_end_sample = loop_start + loop_region; //typically num_samples
+
+    // KA1A interleaves tracks (ex. 2ch and 2 tracks = 512 stereo samples + 512 stereo samples).
+    // For vgmstream this is reinterpreted as plain channels like other KT formats do (codec handles
+    // this fine). Encoder delay is implicit.
+    vgmstream->codec_data = init_ka1a(bitrate_mode, (int)total_channels);
+    if (!vgmstream->codec_data) goto fail;
+    vgmstream->coding_type = coding_KA1A;
+    vgmstream->layout_type = layout_none;
+
+    if (!vgmstream_open_stream(vgmstream, sf, start_offset))
+        goto fail;
+    return vgmstream;
+fail:
+    close_vgmstream(vgmstream);
+    return NULL;
+}
diff --git a/src/meta/ktsr.c b/src/meta/ktsr.c
index eb37ecb4..8aabdbc5 100644
--- a/src/meta/ktsr.c
+++ b/src/meta/ktsr.c
@@ -4,7 +4,7 @@
 #include "../util/companion_files.h"
 #include "ktsr_streamfile.h"
 
-typedef enum { NONE, MSADPCM, DSP, GCADPCM, ATRAC9, RIFF_ATRAC9, KOVS, KTSS, KTAC } ktsr_codec;
+typedef enum { NONE, MSADPCM, DSP, GCADPCM, ATRAC9, RIFF_ATRAC9, KOVS, KTSS, KTAC, KA1A, KA1A_INTERNAL } ktsr_codec;
 
 #define MAX_CHANNELS 8
 
@@ -87,7 +87,7 @@ static VGMSTREAM* init_vgmstream_ktsr_internal(STREAMFILE* sf, bool is_srsa) {
     STREAMFILE* sf_b = NULL;
     ktsr_header ktsr = {0};
     int target_subsong = sf->stream_index;
-    int separate_offsets = 0;
+    bool separate_offsets = false;
 
     ktsr.is_srsa = is_srsa;
     if (ktsr.is_srsa) {
@@ -152,6 +152,7 @@ static VGMSTREAM* init_vgmstream_ktsr_internal(STREAMFILE* sf, bool is_srsa) {
             case KOVS:          init_vgmstream = init_vgmstream_ogg_vorbis; ext = "kvs"; break;
             case KTSS:          init_vgmstream = init_vgmstream_ktss; ext = "ktss"; break;
             case KTAC:          init_vgmstream = init_vgmstream_ktac; ext = "ktac"; break;
+            case KA1A:          init_vgmstream = init_vgmstream_ka1a; ext = "ka1a"; break;
             default: break;
         }
 
@@ -183,16 +184,36 @@ static VGMSTREAM* init_vgmstream_ktsr_internal(STREAMFILE* sf, bool is_srsa) {
         case MSADPCM:
             vgmstream->coding_type = coding_MSADPCM_mono;
             vgmstream->layout_type = layout_none;
-            separate_offsets = 1;
+            separate_offsets = true;
 
             /* 0x00: samples per frame */
             vgmstream->frame_size = read_u16le(ktsr.extra_offset + 0x02, sf_b);
             break;
 
+        case KA1A_INTERNAL: {
+            // 00: bitrate mode
+            // XX: start offsets per channel (from hash-id start aka extra_offset - 0x48)
+            // XX: size per channel
+            // XX: padding
+
+            int bitrate_mode = read_s32le(ktsr.extra_offset + 0x00, sf); // signed! (may be negative)
+
+            vgmstream->codec_data = init_ka1a(bitrate_mode, ktsr.channels);
+            if (!vgmstream->codec_data) goto fail;
+            vgmstream->coding_type = coding_KA1A;
+            vgmstream->layout_type = layout_none;
+
+            // mono streams handled in decoder, though needs channel offsets + flag
+            vgmstream->codec_config = 1;
+            separate_offsets = true;
+
+            break;
+        }
+
         case DSP:
             vgmstream->coding_type = coding_NGC_DSP;
             vgmstream->layout_type = layout_none;
-            separate_offsets = 1;
+            separate_offsets = true;
 
             dsp_read_coefs_le(vgmstream, sf, ktsr.extra_offset + 0x1c, 0x60);
             dsp_read_hist_le (vgmstream, sf, ktsr.extra_offset + 0x40, 0x60);
@@ -327,12 +348,12 @@ static int parse_codec(ktsr_header* ktsr) {
         case 0x05: /* PC/Steam [Fate/Samurai Remnant (PC)] */
             if (ktsr->format == 0x0000 && !ktsr->is_external)
                 ktsr->codec = MSADPCM; // Warrior Orochi 4 (PC)
-            //else if (ktsr->format == 0x0001)
-            //    ktsr->codec = KA1A; // Dynasty Warriors Origins (PC)
+            else if (ktsr->format == 0x0001)
+                ktsr->codec = KA1A_INTERNAL; // Dynasty Warriors Origins (PC)
             else if (ktsr->format == 0x0005 && ktsr->is_external)
                 ktsr->codec = KOVS; // Atelier Ryza (PC)
-            //else if (ktsr->format == 0x1001 && ktsr->is_external)
-            //    ktsr->codec = KA1A; // Dynasty Warriors Origins (PC)
+            else if (ktsr->format == 0x1001 && ktsr->is_external)
+                ktsr->codec = KA1A; // Dynasty Warriors Origins (PC)
             else
                 goto fail;
             break;
@@ -377,7 +398,8 @@ static bool parse_ktsr_subfile(ktsr_header* ktsr, STREAMFILE* sf, uint32_t offse
     type = read_u32be(offset + 0x00, sf); /* hash-id? */
   //size = read_u32le(offset + 0x04, sf);
 
-    /* probably could check the flag in sound header, but the format is kinda messy */
+    // probably could check the flags in sound header, but the format is kinda messy
+    // (all these numbers are surely LE hashes of something)
     switch(type) {
 
         case 0x38D0437D: /* external [Nioh (PC/PS4), Atelier Ryza (PC)] */
diff --git a/src/meta/meta.h b/src/meta/meta.h
index 71593e20..9515ec48 100644
--- a/src/meta/meta.h
+++ b/src/meta/meta.h
@@ -1013,4 +1013,6 @@ VGMSTREAM* init_vgmstream_adp_ongakukan(STREAMFILE* sf);
 
 VGMSTREAM* init_vgmstream_sdd(STREAMFILE* sf);
 
+VGMSTREAM* init_vgmstream_ka1a(STREAMFILE* sf);
+
 #endif /*_META_H*/
diff --git a/src/vgmstream.c b/src/vgmstream.c
index 9780c359..6006dafe 100644
--- a/src/vgmstream.c
+++ b/src/vgmstream.c
@@ -225,10 +225,8 @@ VGMSTREAM* allocate_vgmstream(int channels, int loop_flag) {
     vgmstream->mixer = mixer_init(vgmstream->channels); /* pre-init */
     if (!vgmstream->mixer) goto fail;
 
-#if VGM_TEST_DECODER
     vgmstream->decode_state = decode_init();
     if (!vgmstream->decode_state) goto fail;
-#endif
 
     //TODO: improve/init later to minimize memory
     /* garbage buffer for seeking/discarding (local bufs may cause stack overflows with segments/layers)
@@ -420,9 +418,7 @@ static bool merge_vgmstream(VGMSTREAM* opened_vgmstream, VGMSTREAM* new_vgmstrea
         opened_vgmstream->layout_type = layout_none; /* fixes some odd cases */
 
     /* discard the second VGMSTREAM */
-#if VGM_TEST_DECODER
     decode_free(new_vgmstream);
-#endif
     mixer_free(new_vgmstream->mixer);
     free(new_vgmstream->tmpbuf);
     free(new_vgmstream->start_vgmstream);
diff --git a/src/vgmstream.h b/src/vgmstream.h
index 10d522fc..5b2eb4f1 100644
--- a/src/vgmstream.h
+++ b/src/vgmstream.h
@@ -242,9 +242,7 @@ typedef struct {
     void* tmpbuf;                   /* garbage buffer used for seeking/trimming */
     size_t tmpbuf_size;             /* for all channels (samples = tmpbuf_size / channels / sample_size) */
 
-#if VGM_TEST_DECODER
-    void* decode_state;             /* for some decoders (TO-DO: to be mover around) */
-#endif
+    void* decode_state;             /* for some decoders (TO-DO: to be moved around) */
 } VGMSTREAM;
 
 
diff --git a/src/vgmstream_init.c b/src/vgmstream_init.c
index ceef04b0..4ae37ddd 100644
--- a/src/vgmstream_init.c
+++ b/src/vgmstream_init.c
@@ -510,6 +510,7 @@ init_vgmstream_t init_vgmstream_functions[] = {
     init_vgmstream_dsp_asura_sfx,
     init_vgmstream_adp_ongakukan,
     init_vgmstream_sdd,
+    init_vgmstream_ka1a,
 
     /* lower priority metas (no clean header identity, somewhat ambiguous, or need extension/companion file to identify) */
     init_vgmstream_agsc,
diff --git a/src/vgmstream_types.h b/src/vgmstream_types.h
index ec66d657..fde1d254 100644
--- a/src/vgmstream_types.h
+++ b/src/vgmstream_types.h
@@ -145,6 +145,7 @@ typedef enum {
     coding_TAC,             /* tri-Ace Codec (MDCT-based) */
     coding_ICE_RANGE,       /* Inti Creates "range" codec */
     coding_ICE_DCT,         /* Inti Creates "DCT" codec */
+    coding_KA1A,            /* Koei Tecmo codec (transform-based) */
 
 #ifdef VGM_USE_VORBIS
     coding_OGG_VORBIS,      /* Xiph Vorbis with Ogg layer (MDCT-based) */
@@ -710,6 +711,7 @@ typedef enum {
     meta_DSP_ASURA,
     meta_ONGAKUKAN_RIFF_ADP,
     meta_SDD,
+    meta_KA1A,
 
 } meta_t;