#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "g7221_decoder_lib.h"
#include "g7221_decoder_aes.h"


/* Decodes Siren14 from Namco's BNSF, a mono MLT/DCT-based codec for speech/sound (low bandwidth).
 * Reverse engineered for various exes with info from Polycom's reference int decoder.
 * - Reference decoder and codec info: https://www.itu.int/rec/T-REC-G.722.1/en
 *
 * Technically the name is "ITU-T G.722.1 Annex C" (official ITU-T spec), while "Siren14"
 * was Polycom's original format with slightly different frames, though Namco calls it
 * "Siren14 Version 3.02 For Products" while using G.722.1's frame format.
 * Siren7 (7000hz bandwidth) isn't supported, only Siren14 (14000hz).
 *
 * Very roughly the encoder works like this:
 * - Apply a Modulated Lapped Transform (MLT) function over 640*2 samples to get spectrum
 *   coefficients (can be decomposed into a window, overlap and add with a DCT-IV, of samples
 *   from a current frame and samples from a prev frame).
 * - resulting coefs are divided into 28 bands called "regions" of 500hz.
 * - Each region contains 20 MLT spectrum coefs, total 28 regions * 500hz = 14000hz.
 * - Bands above 14khz are ignored (overall output quality isn't high).
 * - Pack amplitude envelope bits, defined as the RMS (Root-Mean-Square) of the coefs in
 *   the region. First region sets all bits, rest is differentially and huffman coded. 
 *   Remaining bits are left to quantize coefs.
 * - Regions are given a "category" to quantize, that define the number of quantization bits
 *   and other coding parameters. Results are combined into vector indices,
 *   and Huffman-coded (frequent vectors require less bits).
 * Decoding thus unpacks amplitudes, region coefs and does inverse MLT.
 *
 * Namco roughly follows the reference decoder ('refdec') with some differences:
 * - uses mostly int32, while refdec has int16 with exact rounding/overflow handling (no output diffs)
 * - modified random number generator (minor output diffs)
 * - very minor change in bit unpacking (minor output diffs)
 * - modified DCT-IV optimizations, scaling and window functions (minor output diffs)
 * - internally PCM16 bufs, but converts to float (sample/32768.0) afterwards if the platform needs it
 * - less error control (on error decoder is supposed to repeat last coefs)
 * - can't decode Siren7, and given output diffs it's not actually ITU-compliant
 * - minor optimizations here and there but otherwise very similar
 * This decoder generally uses Polycom's terminology, and while some parts like the bitreader could be
 * reimplemented they are mostly untouched for documentation purposes.
 *
 * TODO: missing some validations (may segfault on bad data), 
 *       access indexes with (idx & max) and clamp buffer reads
 */

#include "g7221_decoder_lib_data.h"

/*****************************************************************************
 * IMLT
 *****************************************************************************/

/* Windowed overlap of one IMLT frame.
 * - new_samples: 640 time-domain samples of the current frame; the first 320 are read
 *   in reverse and the last 320 are stored for the next call
 * - old_samples: 320 samples stored by the previous call (overwritten on exit)
 * - out_samples: receives 640 PCM16 samples, clamped to the int16 range
 * Always returns 0. */
static int imlt_window(int16_t* new_samples, int16_t* old_samples, int16_t* out_samples) {
    int pos;

    /* each step produces one output sample from the front and its mirror from the back,
     * so both halves of the window table are walked towards the middle */
    for (pos = 0; pos < 320; pos++) {
        const int16_t win_front = imlt_samples_window[pos];
        const int16_t win_back = imlt_samples_window[639 - pos];
        const int16_t cur = new_samples[319 - pos];
        const int16_t prev = old_samples[pos];
        int front, back;

        front = (cur * win_front + prev * win_back + 32768) >> 13;
        if (front < -32768)
            front = -32768;
        else if (front > 32767)
            front = 32767;
        out_samples[pos] = front;

        back = (cur * win_back - prev * win_front + 32768) >> 13;
        if (back < -32768)
            back = -32768;
        else if (back > 32767)
            back = 32767;
        out_samples[639 - pos] = back;
    }

    /* keep the 2nd half of this frame, it's the "old" input of the next frame */
    for (pos = 0; pos < 320; pos++) {
        old_samples[pos] = new_samples[320 + pos];
    }

    return 0;
}

/* "dct4_x640_int" */
/* Fixed-point 640-point transform of the MLT coefs into time-domain samples.
 * - mlt_coefs: 640 input coefs; used as scratch and overwritten by every stage.
 *   The first stage writes mlt_coefs[560..639] without reading them, so whatever the
 *   caller left there is ignored (presumably always 0, as only 28 regions * 20 coefs
 *   are coded -- confirm against caller).
 * - new_samples: 640 output samples; also used as scratch by the reordering stages.
 * - mag_shift: scaling applied to the output (see mod_shift/sub_shift below).
 * Stages run strictly in order and share set1_ptr, which keeps advancing through
 * imlt_set1_table from the third stage up to the "dct core".
 * Always returns 0. */
static int imlt_dct4(int16_t* mlt_coefs, int16_t* new_samples, int mag_shift) {
    int i, j, k, n;
    const uint8_t *set1_ptr;
    int mod_shift, sub_shift;


    /* vs refdec: very optimized, output is slightly different (louder) but it's massively
     * faster (around 20% vs float refdec, int refdec was very slow to begin with).
     * Can't quite clean this due to the complex math simplifications.
     * Should correspond to: cos(PI*(t+0.5)*(k+0.5)/block_length) */

    /* rotation butterflies? (cos/sin 640 groups) */
    {
        int cos_val, sin_val;
        const uint16_t *cos_ptr, *sin_ptr;
        int16_t mlt_val_lo, mlt_val_hi;
        int16_t *mlt_ptr_lo, *mlt_ptr_hi;

        mlt_ptr_lo = mlt_coefs + 0;
        mlt_ptr_hi = mlt_coefs + 640;
        cos_ptr = &imlt_cos_tables[0]; /* cos_table_64 */
        sin_ptr = &imlt_sin_tables[0]; /* sin_table_64 */

        /* first 80 low coefs: the mirrored high coefs (560..639) are only written, their
         * old value doesn't take part in the rotation. Inputs are halved before use. */
        for (i = 40; i > 0; --i) {
            cos_val = *cos_ptr++;
            sin_val = *sin_ptr++;
            mlt_val_lo = *mlt_ptr_lo >> 1;
            *mlt_ptr_lo++ = (cos_val * mlt_val_lo + 32768) >> 16;
            *--mlt_ptr_hi = (sin_val * -mlt_val_lo + 32768) >> 16;

            cos_val = *cos_ptr++;
            sin_val = *sin_ptr++;
            mlt_val_lo = *mlt_ptr_lo >> 1;
            *mlt_ptr_lo++ = (cos_val * mlt_val_lo + 32768) >> 16;
            *--mlt_ptr_hi = (sin_val * mlt_val_lo + 32768) >> 16;
        }

        /* remaining 240 lo/hi pairs: full rotation of both (halved) values, two pairs per
         * iteration with the sign of the 'hi' result alternating */
        for (i = 120; i > 0; --i) {
            cos_val = *cos_ptr++;
            sin_val = *sin_ptr++;
            mlt_val_lo = *mlt_ptr_lo >> 1;
            mlt_val_hi = *--mlt_ptr_hi >> 1;
            *mlt_ptr_lo++ = (cos_val * mlt_val_lo + sin_val * mlt_val_hi + 32768) >> 16;
            *mlt_ptr_hi   = (sin_val * -mlt_val_lo + cos_val * mlt_val_hi + 32768) >> 16;

            cos_val = *cos_ptr++;
            sin_val = *sin_ptr++;
            mlt_val_lo = *mlt_ptr_lo >> 1;
            mlt_val_hi = *--mlt_ptr_hi >> 1;
            *mlt_ptr_lo++ = (cos_val * mlt_val_lo + sin_val * mlt_val_hi + 32768) >> 16;
            *mlt_ptr_hi   = (sin_val * mlt_val_lo - cos_val * mlt_val_hi + 32768) >> 16;
        }
    }

    /* sum/diff butterflies? */
    {
        int16_t mlt_val_lo, mlt_val_mlo, mlt_val_mhi, mlt_val_hi;
        int16_t *mlt_ptr, *mlt_ptr_lo, *mlt_ptr_mlo, *mlt_ptr_mhi, *mlt_ptr_hi;

        /* done once per 320-coef half; all results are halved */
        mlt_ptr = mlt_coefs + 0;
        for (i = 2; i > 0; --i) {
            mlt_ptr_lo = mlt_ptr + 0;
            mlt_ptr_hi = mlt_ptr + 320;
            mlt_ptr_mlo = mlt_ptr + 160;
            mlt_ptr_mhi = mlt_ptr + 160;
            for (j = 80; j > 0; --j) {
                /* read the 4 values first, as they are overwritten in place below */
                mlt_val_lo = *mlt_ptr_lo;
                mlt_val_hi = *--mlt_ptr_hi;
                mlt_val_mhi = *--mlt_ptr_mhi;
                mlt_val_mlo = *mlt_ptr_mlo;
                *mlt_ptr_lo++  = (mlt_val_hi + mlt_val_lo) >> 1;
                *mlt_ptr_mlo++ = (mlt_val_lo - mlt_val_hi) >> 1;
                *mlt_ptr_mhi   = (mlt_val_mlo + mlt_val_mhi) >> 1;
                *mlt_ptr_hi    = (mlt_val_mhi - mlt_val_mlo) >> 1;
            }
            mlt_ptr += 320;
        }
    }

    /* helper table used in next 3 sections */
    set1_ptr = imlt_set1_table;

    /* rotation butterflies? (cos/sin 160/80/40/20/10 groups) */
    {
        int cos_val, sin_val;
        const uint16_t *cos_ptr, *sin_ptr, *cos_ptr_lo, *sin_ptr_lo;
        int16_t mlt_val_lo, mlt_val_hi, mlt_val_mlo, mlt_val_mhi;
        int16_t *mlt_ptr, *mlt_ptr_lo, *mlt_ptr_hi, *mlt_ptr_mlo, *mlt_ptr_mhi;

        cos_ptr = &imlt_cos_tables[320+160]; /* cos_table_16 > 8 > 4 > 2 */
        sin_ptr = &imlt_sin_tables[320+160]; /* sin_table_16 > 8 > 4 > 2 */

        /* block size halves on each pass (160, 80, 40, 20). The set1 table is read in
         * pairs: how many blocks get sum/diffs, then how many blocks get rotations. */
        for (n = 160; n >= 20; n /= 2) {
            mlt_ptr = mlt_coefs + 0;
            while (mlt_ptr < mlt_coefs + 640) {
                /* sum/diff blocks (same as the previous section but without halving) */
                for (j = *set1_ptr; j > 0; --j) {
                    mlt_ptr_lo = mlt_ptr + 0;
                    mlt_ptr_hi = mlt_ptr + n;
                    mlt_ptr_mlo = mlt_ptr + (n / 2);
                    mlt_ptr_mhi = mlt_ptr + (n / 2);
                    for (k = n / 4; k > 0; --k) {
                        mlt_val_lo = *mlt_ptr_lo;
                        mlt_val_hi = *--mlt_ptr_hi;
                        mlt_val_mhi = *--mlt_ptr_mhi;
                        mlt_val_mlo = *mlt_ptr_mlo;
                        *mlt_ptr_lo++ = mlt_val_lo + mlt_val_hi;
                        *mlt_ptr_mlo++ = mlt_val_lo - mlt_val_hi;
                        *mlt_ptr_mhi = mlt_val_mlo + mlt_val_mhi;
                        *mlt_ptr_hi = mlt_val_mhi - mlt_val_mlo;
                    }
                    mlt_ptr += n;
                }
                set1_ptr++;

                /* rotation blocks: every block restarts from the same n/2 cos/sin values */
                for (j = *set1_ptr; j > 0; --j) {
                    mlt_ptr_lo = mlt_ptr + 0;
                    mlt_ptr_hi = mlt_ptr + n;
                    cos_ptr_lo = cos_ptr + 0;
                    sin_ptr_lo = sin_ptr + 0;
                    for (k = n / 4; k > 0; --k) {
                        cos_val = *cos_ptr_lo++;
                        sin_val = *sin_ptr_lo++;
                        mlt_val_lo = *mlt_ptr_lo;
                        mlt_val_hi = *--mlt_ptr_hi;
                        *mlt_ptr_lo++ = (cos_val * mlt_val_lo + sin_val * mlt_val_hi + 32768) >> 16;
                        *mlt_ptr_hi   = (sin_val * -mlt_val_lo + cos_val * mlt_val_hi + 32768) >> 16;

                        cos_val = *cos_ptr_lo++;
                        sin_val = *sin_ptr_lo++;
                        mlt_val_lo = *mlt_ptr_lo;
                        mlt_val_hi = *--mlt_ptr_hi;
                        *mlt_ptr_lo++ = (cos_val * mlt_val_lo + sin_val * mlt_val_hi + 32768) >> 16;
                        *mlt_ptr_hi   = (sin_val * mlt_val_lo - cos_val * mlt_val_hi + 32768) >> 16;
                    }
                    mlt_ptr += n;
                }
                set1_ptr++;
            }

            /* next sub-tables */
            cos_ptr += n / 2;
            sin_ptr += n / 2;
        }
    }

    /* rotation butterflies? (cos/sin 5 groups) */
    {
        int cos_val, sin_val;
        const uint16_t *cos_ptr, *sin_ptr;
        int16_t mlt_val_lo, mlt_val_hi, mlt_val_mlo, mlt_val_mhi;
        int16_t *mlt_ptr;

        /* n/cos-sin would continue from above but for clarity: */
        cos_ptr = &imlt_cos_tables[320+160+80+40+20+10]; /* cos_table_1 */
        sin_ptr = &imlt_sin_tables[320+160+80+40+20+10]; /* sin_table_1 */

        {
            /* same scheme as above, unrolled for blocks of 10: N sum/diff blocks (count from
             * the set1 table) are always followed by exactly 1 rotation block */
            n = 10;
            mlt_ptr = mlt_coefs + 0;
            while (mlt_ptr < mlt_coefs + 640) {
                for (j = *set1_ptr; j > 0; --j) {
                    mlt_val_lo = mlt_ptr[0];
                    mlt_val_hi = mlt_ptr[n - 1];
                    mlt_val_mlo = mlt_ptr[n / 2 - 1];
                    mlt_val_mhi = mlt_ptr[n / 2];
                    mlt_ptr[0] = mlt_val_lo + mlt_val_hi;
                    mlt_ptr[n / 2] = mlt_val_lo - mlt_val_hi;
                    mlt_ptr[n / 2 - 1] = mlt_val_mhi + mlt_val_mlo;
                    mlt_ptr[n - 1] = mlt_val_mlo - mlt_val_mhi;

                    mlt_val_lo = mlt_ptr[1];
                    mlt_val_hi = mlt_ptr[n - 2];
                    mlt_val_mlo = mlt_ptr[n / 2 - 2];
                    mlt_val_mhi = mlt_ptr[n / 2 + 1];
                    mlt_ptr[1] = mlt_val_hi + mlt_val_lo;
                    mlt_ptr[n / 2 + 1] = mlt_val_lo - mlt_val_hi;
                    mlt_ptr[n / 2 - 2] = mlt_val_mhi + mlt_val_mlo;
                    mlt_ptr[n - 2] = mlt_val_mlo - mlt_val_mhi;

                    /* middle pair only has a single sum/diff */
                    mlt_val_lo = mlt_ptr[2];
                    mlt_val_hi = mlt_ptr[n - 3];
                    mlt_ptr[2] = mlt_val_hi + mlt_val_lo;
                    mlt_ptr[n / 2 + 2] = mlt_val_lo - mlt_val_hi;

                    mlt_ptr += n;
                }

                cos_val = cos_ptr[0];
                sin_val = sin_ptr[0];
                mlt_val_lo = mlt_ptr[0];
                mlt_val_hi = mlt_ptr[n - 1];
                mlt_ptr[0] = (cos_val * mlt_val_lo + sin_val * mlt_val_hi + 32768) >> 16;
                mlt_ptr[n - 1] = (sin_val * mlt_val_lo - cos_val * mlt_val_hi + 32768) >> 16;

                cos_val = cos_ptr[1];
                sin_val = sin_ptr[1];
                mlt_val_lo = mlt_ptr[1];
                mlt_val_hi = mlt_ptr[n - 2];
                mlt_ptr[1] = (cos_val * mlt_val_lo + sin_val * mlt_val_hi + 32768) >> 16;
                mlt_ptr[n - 2] = (sin_val * -mlt_val_lo + cos_val * mlt_val_hi + 32768) >> 16;

                cos_val = cos_ptr[2];
                sin_val = sin_ptr[2];
                mlt_val_lo = mlt_ptr[2];
                mlt_val_hi = mlt_ptr[n - 3];
                mlt_ptr[2] = (cos_val * mlt_val_lo + sin_val * mlt_val_hi + 32768) >> 16;
                mlt_ptr[n - 3]= (sin_val * mlt_val_lo - cos_val * mlt_val_hi + 32768) >> 16;

                cos_val = cos_ptr[3];
                sin_val = sin_ptr[3];
                mlt_val_lo = mlt_ptr[3];
                mlt_val_hi = mlt_ptr[n - 4];
                mlt_ptr[3] = (cos_val * mlt_val_lo + sin_val * mlt_val_hi + 32768) >> 16;
                mlt_ptr[n - 4] = (sin_val * -mlt_val_lo + cos_val * mlt_val_hi + 32768) >> 16;

                cos_val = cos_ptr[4];
                sin_val = sin_ptr[4];
                mlt_val_lo = mlt_ptr[4];
                mlt_val_hi = mlt_ptr[n - 5];
                mlt_ptr[4] = (cos_val * mlt_val_lo + sin_val * mlt_val_hi + 32768) >> 16;
                mlt_ptr[n - 5] = (sin_val * mlt_val_lo - cos_val * mlt_val_hi + 32768) >> 16;

                mlt_ptr += n;
                set1_ptr += 2;
            }
        }
    }


    /* split the output scaling in two: sub_shift (1, or 2 when mag_shift - 1 >= 8) is a right
     * shift applied inside the dct core, and mod_shift (mag_shift - 1 - sub_shift) is what
     * remains for the final section, where it may be zero or negative */
    mod_shift = mag_shift - 1;
    sub_shift = 1;
    if (mod_shift >= 8)
        sub_shift = 2;
    mod_shift -= sub_shift;


    /* dct core? */
    {
        const int16_t *mlt_ptr;
        int16_t *new_ptr;

        /* 5-point transforms over consecutive groups of 5 coefs, mlt_coefs > new_samples.
         * The set1 table gives how many groups use the first set of weights, then a single
         * group uses the second set, until all 640 coefs are consumed.
         * Weights are Q15 (32768 = 1.0), hence the extra 15 in the shifts. */
        mlt_ptr = mlt_coefs + 0;
        new_ptr = new_samples + 0;
        while (1) {
            for (i = *set1_ptr; i; --i) {
                new_ptr[0] = (mlt_ptr[4] + mlt_ptr[3] + mlt_ptr[2] + mlt_ptr[1] + mlt_ptr[0]) >> sub_shift;
                new_ptr[1] = (19261 * mlt_ptr[1] + 31164 * mlt_ptr[0] - 19261 * mlt_ptr[3] - 31164 * mlt_ptr[4]) >> (sub_shift + 15);
                new_ptr[2] = (26510 * mlt_ptr[4] + 26510 * mlt_ptr[0] - 10126 * mlt_ptr[1] - 32768 * mlt_ptr[2] - 10126 * mlt_ptr[3]) >> (sub_shift + 15);
                new_ptr[3] = (31164 * mlt_ptr[3] + 19261 * mlt_ptr[0] - 31164 * mlt_ptr[1] - 19261 * mlt_ptr[4]) >> (sub_shift + 15);
                new_ptr[4] = (10126 * mlt_ptr[4] + 32768 * mlt_ptr[2] + 10126 * mlt_ptr[0] - 26510 * mlt_ptr[1] - 26510 * mlt_ptr[3]) >> (sub_shift + 15);
                mlt_ptr += 5;
                new_ptr += 5;
            }
            set1_ptr += 2;

            /* the last run of groups isn't followed by a second-set group */
            if (mlt_ptr >= mlt_coefs + 640)
                break;

            new_ptr[0] = (  5126 * mlt_ptr[4] +  14876 * mlt_ptr[3] +  23170 * mlt_ptr[2] + 32365 * mlt_ptr[0] + 29197 * mlt_ptr[1]) >> (sub_shift + 15);
            new_ptr[1] = (-14876 * mlt_ptr[4] + -32365 * mlt_ptr[3] +   5126 * mlt_ptr[1] + 29197 * mlt_ptr[0] - 23170 * mlt_ptr[2]) >> (sub_shift + 15);
            new_ptr[2] = ( 23170 * mlt_ptr[4] +  23170 * mlt_ptr[3] + -23170 * mlt_ptr[1] + 23170 * mlt_ptr[0] - 23170 * mlt_ptr[2]) >> (sub_shift + 15);
            new_ptr[3] = (-29197 * mlt_ptr[4] +   5126 * mlt_ptr[3] +  23170 * mlt_ptr[2] + 14876 * mlt_ptr[0] - 32365 * mlt_ptr[1]) >> (sub_shift + 15);
            new_ptr[4] = ( 32365 * mlt_ptr[4] + -29197 * mlt_ptr[3] +  23170 * mlt_ptr[2] +  5126 * mlt_ptr[0] - 14876 * mlt_ptr[1]) >> (sub_shift + 15);
            mlt_ptr += 5;
            new_ptr += 5;
        }
    }

    /* swapping and sum/diffs? */
    {
        const uint8_t *set2_ptr;
        int16_t *mlt_ptr, *new_ptr;
        int16_t tmp1_val_a, tmp1_val_b;
        int16_t *tmp0_ptr, *tmp1_ptr, *tmp1_ptr_lo, *tmp1_ptr_mlo, *tmp1_ptr_mhi, *tmp2_ptr;

        set2_ptr = imlt_set2_table;

        /* blocks of 10, new_samples > mlt_coefs: N blocks (count from the set2 table) get
         * their two halves of 5 interleaved, then 1 block gets mirrored sum/diffs */
        mlt_ptr = mlt_coefs + 0;
        new_ptr = new_samples + 0;
        while (new_ptr < new_samples + 640) {
            for (i = *set2_ptr; i; --i) {
                *mlt_ptr++ = new_ptr[0];
                *mlt_ptr++ = new_ptr[5];
                *mlt_ptr++ = new_ptr[1];
                *mlt_ptr++ = new_ptr[6];
                *mlt_ptr++ = new_ptr[2];
                *mlt_ptr++ = new_ptr[7];
                *mlt_ptr++ = new_ptr[3];
                *mlt_ptr++ = new_ptr[8];
                *mlt_ptr++ = new_ptr[4];
                *mlt_ptr++ = new_ptr[9];
                new_ptr += 10;
            }
            set2_ptr++;

            *mlt_ptr++ = new_ptr[0];
            *mlt_ptr++ = new_ptr[9] + new_ptr[1];
            *mlt_ptr++ = new_ptr[1] - new_ptr[9];
            *mlt_ptr++ = new_ptr[2] - new_ptr[8];
            *mlt_ptr++ = new_ptr[8] + new_ptr[2];
            *mlt_ptr++ = new_ptr[7] + new_ptr[3];
            *mlt_ptr++ = new_ptr[3] - new_ptr[7];
            *mlt_ptr++ = new_ptr[4] - new_ptr[6];
            *mlt_ptr++ = new_ptr[6] + new_ptr[4];
            *mlt_ptr++ = new_ptr[5];
            new_ptr += 10;
        }

        /* below is some three way swapping, tmp ptrs change between mlt<>new */
        /* (each pass rewinds both ptrs by 640 and swaps source/destination, so passes
         * n=20/80 go mlt_coefs > new_samples and n=40/160 go new_samples > mlt_coefs,
         * leaving the result in mlt_coefs) */
        tmp0_ptr = mlt_coefs + 640;
        tmp1_ptr = new_samples + 640;
        for (n = 20; n <= 160; n *= 2) {
            tmp2_ptr = tmp0_ptr + 0;
            tmp0_ptr = tmp1_ptr - 640;
            tmp1_ptr = tmp2_ptr - 640;
            do {
                /* interleave both halves of the block, 2 values at a time */
                for (j = *set2_ptr; j > 0; --j) {
                    tmp1_ptr_mhi = tmp1_ptr + (n / 2);
                    for (k = n / 4; k > 0; --k) {
                        *tmp0_ptr++ = *tmp1_ptr++;
                        *tmp0_ptr++ = *tmp1_ptr_mhi++;
                        *tmp0_ptr++ = *tmp1_ptr++;
                        *tmp0_ptr++ = *tmp1_ptr_mhi++;
                    }
                    tmp1_ptr += n / 2;
                }
                set2_ptr++;

                if (tmp1_ptr >= tmp2_ptr)
                    break;

                /* mirrored sum/diffs block: first value is copied, middle value is negated */
                tmp1_ptr_lo = tmp1_ptr + 0;
                tmp1_ptr_mlo = tmp1_ptr + (n - 1);

                *tmp0_ptr++ = *tmp1_ptr_lo++;

                tmp1_val_a = *tmp1_ptr_lo++;
                tmp1_val_b = *tmp1_ptr_mlo;
                *tmp0_ptr++ = tmp1_val_b + tmp1_val_a;
                *tmp0_ptr++ = tmp1_val_a - tmp1_val_b;
                for (j = (n / 2 - 2) / 2; j > 0; --j) {
                    tmp1_val_a = *tmp1_ptr_lo++;
                    tmp1_val_b = *--tmp1_ptr_mlo;
                    *tmp0_ptr++ = tmp1_val_a - tmp1_val_b;
                    *tmp0_ptr++ = tmp1_val_b + tmp1_val_a;

                    tmp1_val_a = *tmp1_ptr_lo++;
                    tmp1_val_b = *--tmp1_ptr_mlo;
                    *tmp0_ptr++ = tmp1_val_b + tmp1_val_a;
                    *tmp0_ptr++ = tmp1_val_a - tmp1_val_b;
                }
                *tmp0_ptr++ = -*tmp1_ptr_lo;
                tmp1_ptr += n;
            }
            while (tmp1_ptr < tmp2_ptr);
        }
    }

    /* final modifications and post scaling? */
    {
        int16_t mlt_val_lo, mlt_val_mhi, mlt_val_mlo, mlt_val_hi;
        const int16_t *mlt_ptr_lo, *mlt_ptr_hi, *mlt_ptr_mlo, *mlt_ptr_mhi;
        int16_t *new_ptr;

        /* Both branches do the same last sum/diff pass, mlt_coefs > new_samples, reading
         * mlt_coefs[0..159] and [160..320] forwards and [321..479] and [480..639] backwards;
         * they only differ in the direction of the remaining scaling. */
        if (mod_shift <= 0) {
            /* zero or negative shift: negate it and scale up (left shift) */
            mod_shift = -mod_shift;

            mlt_ptr_lo = mlt_coefs + 0;
            mlt_ptr_mlo = mlt_coefs + 160;
            mlt_ptr_mhi = mlt_coefs + 480;
            mlt_ptr_hi = mlt_coefs + 640;
            new_ptr = new_samples + 0;

            mlt_val_lo = *mlt_ptr_lo++;
            *new_ptr++ = mlt_val_lo << mod_shift;

            mlt_val_mlo = *mlt_ptr_mlo++;
            mlt_val_hi = *--mlt_ptr_hi;
            *new_ptr++ = (mlt_val_hi + mlt_val_mlo) << mod_shift;
            *new_ptr++ = (mlt_val_mlo - mlt_val_hi) << mod_shift;

            for (i = 159; i > 0; --i) {
                mlt_val_lo = *mlt_ptr_lo++;
                mlt_val_mhi = *--mlt_ptr_mhi;
                *new_ptr++ = (mlt_val_lo - mlt_val_mhi) << mod_shift;
                *new_ptr++ = (mlt_val_mhi + mlt_val_lo) << mod_shift;

                mlt_val_mlo = *mlt_ptr_mlo++;
                mlt_val_hi = *--mlt_ptr_hi;
                *new_ptr++ = (mlt_val_hi + mlt_val_mlo) << mod_shift;
                *new_ptr++ = (mlt_val_mlo - mlt_val_hi) << mod_shift;
            }

            /* last output is the negated middle coef (mlt_coefs[320]) */
            *new_ptr = -*mlt_ptr_mlo << mod_shift;
        }
        else {
            /* same but with a positive shift: scale down (right shift) */

            mlt_ptr_lo = mlt_coefs + 0;
            mlt_ptr_mlo = mlt_coefs + 160;
            mlt_ptr_mhi = mlt_coefs + 480;
            mlt_ptr_hi = mlt_coefs + 640;
            new_ptr = new_samples + 0;

            mlt_val_lo = *mlt_ptr_lo++;
            *new_ptr++ = mlt_val_lo >> mod_shift;

            mlt_val_mlo = *mlt_ptr_mlo++;
            mlt_val_hi = *--mlt_ptr_hi;
            *new_ptr++ = (mlt_val_hi + mlt_val_mlo) >> mod_shift;
            *new_ptr++ = (mlt_val_mlo - mlt_val_hi) >> mod_shift;

            for (i = 159; i > 0; --i) {
                mlt_val_lo = *mlt_ptr_lo++;
                mlt_val_mhi = *--mlt_ptr_mhi;
                *new_ptr++ = (mlt_val_lo - mlt_val_mhi) >> mod_shift;
                *new_ptr++ = (mlt_val_mhi + mlt_val_lo) >> mod_shift;

                mlt_val_mlo = *mlt_ptr_mlo++;
                mlt_val_hi = *--mlt_ptr_hi;
                *new_ptr++ = (mlt_val_hi + mlt_val_mlo) >> mod_shift;
                *new_ptr++ = (mlt_val_mlo - mlt_val_hi) >> mod_shift;
            }

            /* last output is the negated middle coef (mlt_coefs[320]) */
            *new_ptr = -*mlt_ptr_mlo >> mod_shift;
        }
    }

    return 0;
}

/* "inverse_MLT" */
/* Converts one frame of MLT spectrum coefs into 640 PCM16 samples.
 * - mag_shift: output scaling, passed as-is to imlt_dct4
 * - mlt_coefs: input coefs (handed to imlt_dct4, which takes them as non-const)
 * - old_samples: overlap state kept between frames (handed to imlt_window)
 * - out_samples: decoded output
 * Returns 0 on success, or the first negative result of either step. */
static int rmlt_coefs_to_samples(int mag_shift, int16_t* mlt_coefs, int16_t* old_samples, int16_t* out_samples /*, int p_samples_done*/) {
    int16_t time_samples[640];
    int err;

    /* inverse block transform (DCT-IV): MLT spectrum coefs to time domain samples */
    err = imlt_dct4(mlt_coefs, time_samples, mag_shift);

    /* IMLT window function, overlapping with the previous frame (640 samples) */
    if (err >= 0) {
        err = imlt_window(time_samples, old_samples, out_samples);
    }

    /* Namco's code also sets "*p_samples_done = 640" here, but it's actually ignored */

    return (err < 0) ? err : 0;
}

/*****************************************************************************
 * UNPACKING
 *****************************************************************************/

static inline int calc_offset(const int* absolute_region_power_index, int available_bits) {
    int region, cat_index;
    int offset, delta;

    offset = -32;
    delta = 32;
    do {
        int test_offset = offset + delta;
        int bits = 0;

        /* obtain a category for each region using the test offset */
        for (region = 0; region < NUMBER_OF_REGIONS; region++)  {
            cat_index = (test_offset - absolute_region_power_index[region]) / 2;
            if (cat_index < 0)
                cat_index = 0;
            else if (cat_index > NUM_CATEGORIES - 1)
                cat_index = NUM_CATEGORIES - 1;

            /* compute the number of bits that will be used given the cat assignments */
            bits += expected_bits_table[cat_index];
        }

        /* if (bits > available_bits - 32) then divide the offset region for the bin search */
        if (bits >= available_bits - 32) {
            offset = test_offset;
        }
        delta /= 2;
    }
    while (delta > 0);

    return offset;
}

/* Sets the initial category of every region: half the distance between offset and the
 * region's power index, clamped to 0..NUM_CATEGORIES-1.
 * - power_categories: output, one category per region
 * - absolute_region_power_index: power index of each region
 * - offset: value selected by calc_offset */
static inline void compute_raw_power_categories(int* power_categories, const int* absolute_region_power_index, int offset) {
    const int top_category = NUM_CATEGORIES - 1;
    int r;

    for (r = 0; r < NUMBER_OF_REGIONS; r++) {
        const int raw = (offset - absolute_region_power_index[r]) / 2;

        power_categories[r] = (raw < 0) ? 0 : ((raw > top_category) ? top_category : raw);
    }
}

/* Builds the list of category changes ("balances") around the initial categorization.
 * Starting from power_categories, two working copies are evolved over
 * NUM_CATEGORIZATION_CONTROL_POSSIBILITIES - 1 steps: one only lowers categories and the
 * other only raises them, picking on each step the side that keeps the sum of both
 * expected bit counts closest to twice available_bits.
 * - power_categories: in: initial categories; out: the copy whose categories were lowered
 * - category_balances: out: region changed on each step, ordered from that copy onwards
 * - absolute_region_power_index: power index of each region
 * - available_bits: bit budget for the MLT coefs
 * - offset: value selected by calc_offset */
static inline void comp_powercat_and_catbalance(int* power_categories, int* category_balances, const int* absolute_region_power_index, int available_bits, int offset) {
    int lowered_cats[NUMBER_OF_REGIONS];    /* refdec's max_rate_categories */
    int raised_cats[NUMBER_OF_REGIONS];     /* refdec's min_rate_categories */
    int balance_log[2*NUM_CATEGORIZATION_CONTROL_POSSIBILITIES];
    int log_head = NUM_CATEGORIZATION_CONTROL_POSSIBILITIES; /* grows downwards */
    int log_tail = NUM_CATEGORIZATION_CONTROL_POSSIBILITIES; /* grows upwards */
    int lowered_bits = 0;
    int raised_bits;
    int r, step;


    /* Namco works over power_categories directly rather than a separate copy, and fills the
     * other copy plus the expected bits in the previous region loop; split here for clarity. */
    for (r = 0; r < NUMBER_OF_REGIONS; r++) {
        const int initial = power_categories[r];

        lowered_cats[r] = initial;
        raised_cats[r] = initial;
        lowered_bits += expected_bits_table[initial];
    }
    raised_bits = lowered_bits;

    for (step = 1; step < NUM_CATEGORIZATION_CONTROL_POSSIBILITIES; step++) {
        int pick, score;

        if (lowered_bits + raised_bits > available_bits * 2) {
            /* raise a category: scan from highest to lowest region, first best wins */
            pick = NUMBER_OF_REGIONS - 1;
            score = -10000;
            for (r = NUMBER_OF_REGIONS - 1; r >= 0; r--) {
                int candidate;

                if (raised_cats[r] >= NUM_CATEGORIES - 1)
                    continue;

                candidate = (offset - absolute_region_power_index[r]) - (raised_cats[r] * 2);
                if (candidate > score) {
                    score = candidate;
                    pick = r;
                }
            }

            balance_log[log_tail] = pick;
            log_tail++;

            raised_bits -= expected_bits_table[raised_cats[pick]];
            raised_cats[pick]++;
            raised_bits += expected_bits_table[raised_cats[pick]];
        }
        else {
            /* lower a category: scan from lowest to highest region, first best wins */
            pick = 0;
            score = 10000;
            for (r = 0; r < NUMBER_OF_REGIONS; r++) {
                int candidate;

                if (lowered_cats[r] <= 0)
                    continue;

                candidate = (offset - absolute_region_power_index[r]) - (lowered_cats[r] * 2);
                if (candidate < score) {
                    score = candidate;
                    pick = r;
                }
            }

            log_head--;
            balance_log[log_head] = pick;

            lowered_bits -= expected_bits_table[lowered_cats[pick]];
            lowered_cats[pick]--;
            lowered_bits += expected_bits_table[lowered_cats[pick]];
        }
    }

    /* results: the lowered copy, plus every logged change starting from its end of the log */
    for (r = 0; r < NUMBER_OF_REGIONS; r++) {
        power_categories[r] = lowered_cats[r];
    }

    for (step = 0; step < NUM_CATEGORIZATION_CONTROL_POSSIBILITIES - 1; step++) {
        category_balances[step] = balance_log[log_head + step];
    }
}

/* Computes the categorization of all regions for a frame.
 * - available_bits: bits left for the MLT coefs
 * - absolute_region_power_index: power index of each region
 * - power_categories: output, category of each region
 * - category_balances: output, list of category changes
 * Always returns 0. */
static int categorize(int available_bits, const int* absolute_region_power_index, int* power_categories, int* category_balances) {
    int budget = available_bits;
    int offset;

    /* compensate increased bit usage for higher bitrates (used?):
     * only 5/8 of the bits over MAX_DCT_LENGTH are counted */
    if (budget > MAX_DCT_LENGTH) {
        budget = MAX_DCT_LENGTH + 5 * (budget - MAX_DCT_LENGTH) / 8;
    }

    /* the 3 steps below were originally inline'd */
    offset = calc_offset(absolute_region_power_index, budget);
    compute_raw_power_categories(power_categories, absolute_region_power_index, offset);
    comp_powercat_and_catbalance(power_categories, category_balances, absolute_region_power_index, budget, offset);

    return 0;
}

static inline void index_to_array(int index, int* array_cv, int category) {
    int q, p;
    int max_bin_plus_one = max_bin_plus1[category];
    int inverse_of_max_bin_plus_one_scaled = max_bin_plus_one_inverse_scaled[category];

    /* vs refdec: unrolled, inline'd version of the inverted loop, with some ops simplified
     * (depending on pre-scaled tables), since this is called many times.
     * From tests it's not too noticeable though. */

    p = index;
    /* fills array (vector_dimension[category] - 1) times inversely */
    switch (category) {
        case 0:
        case 1:
        case 2:
            q = (p * inverse_of_max_bin_plus_one_scaled) >> 16;
            array_cv[1] = p - (q * max_bin_plus_one);
            p = q;

            q = (q * inverse_of_max_bin_plus_one_scaled) >> 16;
            array_cv[0] = p - (q * max_bin_plus_one);
            //p = q;
            break;

        case 3:
            q = (p * inverse_of_max_bin_plus_one_scaled) >> 16;
            array_cv[3] = p - (q * 5); //max_bin_plus_one = 5
            p = q;

            q = (q * inverse_of_max_bin_plus_one_scaled) >> 16;
            array_cv[2] = p - (q * 5); //max_bin_plus_one = 5
            p = q;

            q = (q * inverse_of_max_bin_plus_one_scaled) >> 16;
            array_cv[1] = p - (q * 5); //max_bin_plus_one = 5
            p = q;

            q = (q * inverse_of_max_bin_plus_one_scaled) >> 16;
            array_cv[0] = p - (q * 5); //max_bin_plus_one = 5
            //p = q;
            break;

        case 4:
            array_cv[3] = p & 3;
            p >>= 2;

            array_cv[2] = p & 3;
            p >>= 2;

            array_cv[1] = p & 3;
            p >>= 2;

            array_cv[0] = p & 3;
            /* not sure how this case is optimized */
            break;

        case 5:
            q = (p * inverse_of_max_bin_plus_one_scaled) >> 16;
            array_cv[4] = p - (q * 3); //max_bin_plus_one = 3
            p = q;

            q = (q * inverse_of_max_bin_plus_one_scaled) >> 16;
            array_cv[3] = p - (q * 3); //max_bin_plus_one = 3
            p = q;

            q = (q * inverse_of_max_bin_plus_one_scaled) >> 16;
            array_cv[2] = p - (q * 3); //max_bin_plus_one = 3
            p = q;

            q = (q * inverse_of_max_bin_plus_one_scaled) >> 16;
            array_cv[1] = p - (q * 3); //max_bin_plus_one = 3
            p = q;

            q = (q * inverse_of_max_bin_plus_one_scaled) >> 16;
            array_cv[0] = p - (q * 3); //max_bin_plus_one = 3
            //p = q;
            break;

        case 6:
            array_cv[4] = p & 1;
            p >>= 1;

            array_cv[3] = p & 1;
            p >>= 1;

            array_cv[2] = p & 1;
            p >>= 1;

            array_cv[1] = p & 1;
            p >>= 1;

            array_cv[0] = p & 1;
            //p >>= 1;
            /* not sure how this case is optimized */
            break;

        default:
            break;
    }
}

/* Decodes the vector-quantized MLT coefs of all regions from the frame's bitstream.
 *
 * data_u32:        frame data as 32-bit words, consumed from the most significant bit down
 * p_bitpos:        in: bit position where coef data starts; out: advanced by every bit consumed
 * bit_count:       total number of bits in the frame, used to detect running out of data
 * p_random_value:  noise generator state, updated once per noise-filled region
 * decoder_region_standard_deviation: per-region scale applied to centroids and noise
 * power_categories: per-region category (selects decoder tree, vector layout and noise level)
 * mlt_coefs:       output, REGION_SIZE coefs per region
 *
 * Always returns 0. */
static int decode_vector_quantized_mlt_indices(uint32_t* data_u32, int* p_bitpos, int bit_count, uint32_t* p_random_value, int* decoder_region_standard_deviation, int* power_categories, int16_t* mlt_coefs) {
    int16_t standard_deviation;
    int array_cv[MAX_VECTOR_DIMENSION];
    int i, v, region, category, index;
    uint32_t cur_u32, bitmask;
    uint32_t* ptr_u32;

    /* bitreading setup: current word is cached, ptr_u32 always points to the next one.
     * The mask is built with an unsigned shift, as a signed '1 << 31' (word-aligned bitpos)
     * is undefined behavior. */
    ptr_u32 = &data_u32[(*p_bitpos >> 5)];
    bitmask = (uint32_t)1 << (31 - (*p_bitpos & 0x1F));
    cur_u32 = *ptr_u32;
    ptr_u32++;


    /* read MLT coefs per region, differently depending on the category config */
    for (region = 0; region < NUMBER_OF_REGIONS; region++)  {
        standard_deviation = decoder_region_standard_deviation[region];
        category = power_categories[region];

        /* lower categories encode MLT coefs based on vector indices + huffman (?) */
        if (category < 7) {
            const int16_t* decoder_tree_ptr = table_of_decoder_tables[category];
            int16_t* decoder_mlt_ptr = &mlt_coefs[region * REGION_SIZE];

            for (v = 0; v < number_of_vectors[category]; v++)  {
                index = 0;

                /* walk the decoder tree one bit at a time until reaching a leaf (value <= 0) */
                do {
                    int bit = (bitmask & cur_u32) != 0;
                    bitmask >>= 1;
                    (*p_bitpos)++;
                    if (bitmask == 0) {
                        bitmask = 0x80000000;
                        cur_u32 = *ptr_u32;
                        ptr_u32++;
                    }

                    index = *(decoder_tree_ptr + (index*2) + bit);
                }
                while (index > 0);

                /* ran out of bits */
                if (ptr_u32 > &data_u32[bit_count >> 5]) {
                    category = 7; /* this category doesn't bitread and only noise fills */

                    /* Namco doesn't set remaining regions to category 7 like the spec
                     * (power_categories[region + 1 ..] = 7), nor checks when reading sign bits
                     * below, but doesn't seem to cause any problems */
                    break;
                }

                /* leaves are stored negated */
                index = -index;

                /* convert index into array of sign bits used to access the centroid table */
                index_to_array(index, array_cv, category);

                /* vs refdec: sign reading slightly simplified */

                for (i = 0; i < vector_dimension[category]; i++) {
                    int decoder_mlt_value = 0;
                    int negative;

                    /* non-zero array  = results in non-zero coef and encodes bit sign */
                    if (array_cv[i] != 0) {
                        decoder_mlt_value = standard_deviation * mlt_quant_centroid[category][array_cv[i]];
                        decoder_mlt_value = decoder_mlt_value >> 12;

                        negative = (bitmask & cur_u32) != 0;
                        bitmask >>= 1;
                        (*p_bitpos)++;
                        if (bitmask == 0) {
                            bitmask = 0x80000000;
                            cur_u32 = *ptr_u32;
                            ptr_u32++;
                        }

                        /* despite the name, a cleared bit is what flips the sign */
                        if (negative == 0)
                            decoder_mlt_value = -decoder_mlt_value;
                    }

                    *decoder_mlt_ptr = decoder_mlt_value;
                    decoder_mlt_ptr++;
                }
            }
        }

        /* higher categories don't encode all 20 MLT coefs, so rest are filled with
         * noise to pretend we have something */
        if (category >= 5) {
            static const int noise_fill_factor[3] = {5793, 8192, 23170}; /* 0.176777, 0.25, 0.707107 */
            uint32_t random_value;

            int16_t* decoder_mlt_ptr = &mlt_coefs[region * REGION_SIZE];
            int16_t noise_fill_pos = (standard_deviation * noise_fill_factor[category - 5]) >> 15; /* should be int16 */
            int16_t noise_fill_neg = -noise_fill_pos;

            /* vs refdec: updated differently (with hist state), and reupdated after 10 coefs */
            *p_random_value *= 69069;
            random_value = *p_random_value;

            /* in some versions of Namco's decoder this is unrolled too */

            if (category >= 7) {
                /* all coefs are noise-filled, one random bit picks the sign of each */
                for (i = 0; i < REGION_SIZE; i++) {
                    if (random_value & 1)
                        *decoder_mlt_ptr = noise_fill_pos;
                    else
                        *decoder_mlt_ptr = noise_fill_neg;
                    random_value = (random_value >> 1);

                    decoder_mlt_ptr++;
                }
            }
            else {
                /* only coefs left at zero are noise-filled (a random bit is spent only on those) */
                for (i = 0; i < REGION_SIZE; i++)  {
                    if (*decoder_mlt_ptr == 0) {
                        if (random_value & 1)
                            *decoder_mlt_ptr = noise_fill_pos;
                        else
                            *decoder_mlt_ptr = noise_fill_neg;
                        random_value = (random_value >> 1);
                    }
                    decoder_mlt_ptr++;
                }
            }
        }
    }

    return 0;
}

/* Unpacks input buffer into MLT coefs.
 *
 * bit_rate:       stream bit rate; frame bytes are derived as bit_rate / 8 / 50
 * data:           packed frame bytes
 * frame_size:     bytes available in data, must be at least the derived frame size
 * p_mag_shift:    output, magnitude shift later passed to the inverse MLT
 * mlt_coefs:      output, REGION_SIZE coefs per region
 * p_random_value: noise-fill generator state (in/out)
 * test_errors:    non-zero to run extra sanity checks over the unpacked frame
 *
 * Returns 0 on success or a negative value on error: -1 bad frame size or a zero bit inside
 * the padding, -8 too much padding, -2 ran past the end of the frame, -4 amplitude index
 * out of range, or whatever negative code categorize/coef decoding reports. */
static int unpack_frame(int bit_rate, const uint8_t* data, int frame_size, /*int* p_frame_size, */ int* p_mag_shift, int16_t* mlt_coefs, uint32_t* p_random_value, int test_errors) {
    uint32_t data_u32[0x78/4 + 2];
    int bitpos, expected_frame_size;
    int power_categories[NUMBER_OF_REGIONS];
    int category_balances[NUM_CATEGORIZATION_CONTROL_POSSIBILITIES-1];
    int absolute_region_power_index[NUMBER_OF_REGIONS]; /* a.k.a. RMS_index */
    int decoder_region_standard_deviation[NUMBER_OF_REGIONS];
    uint16_t categorization_control;
    int i;
    int res;


    /* setup bitreading */
    {
        expected_frame_size = bit_rate / 8 / 50;

        /* temp array only holds up to 0x78 bytes (plus guard words), reject anything else
         * instead of indexing outside of it */
        if (expected_frame_size < 0 || expected_frame_size > 0x78)
            return -1;

        /* must be negative: callers only treat res < 0 as an error, and outputs
         * (mag shift, coefs) aren't written on this path */
        if (frame_size < expected_frame_size)
            return -1;
        //p_frame_size = expected_frame_size; /* Namco returns this, for some reason */

        /* Siren14 data is packed into U16 LE, but Namco reads and stores them in a U32 LE temp array for their bitreading.
         * Bytes are widened before shifting, as 'uint8_t << 24' is done in signed int and overflows for bytes >= 0x80 */
        for (i = 0; i < (expected_frame_size >> 2); i++) {
            const uint8_t* src = &data[0x04*i];
            data_u32[i] = ((uint32_t)src[2] << 0) | ((uint32_t)src[3] << 8) | ((uint32_t)src[0] << 16) | ((uint32_t)src[1] << 24);
        }
        /* data32 also has extra ints probably against outside reads, which wasn't originally
         * memset'ed but we'll do just in case (doesn't seem to matter) */
        for (i = (expected_frame_size >> 2); i < 0x78/4 + 2; i++) {
            data_u32[i] = 0;
        }

        bitpos = 0;
    }

    /* decode amplitude envelope scales */
    {
        int rms_index = 0; /* amplitudes are root-mean-square */
        int region;

        /* get amplitude envelope (5b) for region 0 */
        for (i = 0; i < 5; i++)  {
            int bit = (data_u32[bitpos >> 5] >> (31 - (bitpos & 0x1F))) & 1;
            bitpos++;

            rms_index = (rms_index << 1) | bit;
        }
        absolute_region_power_index[0] = rms_index - ESF_ADJUSTMENT_TO_RMS_INDEX;

        /* get amplitudes for other regions, coded differentially based on region 0 (done with a temp array in refdec) */
        for (region = 1; region < NUMBER_OF_REGIONS; region++) {
            int diff_index = 0;
            /* regions past the 13th all use the 13th decoder tree */
            int region_index = region > 13 ? 13 - 1 : region - 1;

            /* walk the tree bit by bit until reaching a leaf (value <= 0) */
            do {
                int bit = (data_u32[bitpos >> 5] >> (31 - (bitpos & 0x1F))) & 1;
                bitpos++;

                diff_index = differential_region_power_decoder_tree[region_index][diff_index][bit];
            }
            while (diff_index > 0);

            absolute_region_power_index[region] = absolute_region_power_index[region-1] - diff_index - DRP_DIFF_MIN;
        }
    }

    /* read categorization info bits */
    {
        categorization_control = 0;
        for (i = 0; i < NUM_CATEGORIZATION_CONTROL_BITS; i++) {
            int bit = (data_u32[bitpos >> 5] >> (31 - (bitpos & 0x1F))) & 1;
            bitpos++;

            categorization_control = (categorization_control << 1) | bit;
        }
    }

    /* determine categorization config per region, given the bits left for coefs */
    res = categorize(
       8 * expected_frame_size - bitpos,
       absolute_region_power_index, power_categories, category_balances);
    if (res < 0) return res;

    /* adjust power categories (rate_adjust_categories) */
    {
        for (i = 0; i < categorization_control; i++) {
            int region = category_balances[i];
            power_categories[region]++;
        }
    }

    /* recover amplitude envelope deviation (done in decode_envelope in refdec) */
    {
        int region, region_index, max_index;

        /* vs refdec: Namco *doesn't* calc test_index here (a running sum of table values that
         * also drives the loop below), so resulting region_index can be +-1 vs refdec, and
         * final samples around +-10 (usually quieter).
         * Also reuses and mods absolute_region_power_index but we have decoder_region_standard_deviation for clarity */

        max_index = 0;
        for (region = 0; region < NUMBER_OF_REGIONS; region++) {
            region_index = absolute_region_power_index[region];
            if (max_index < region_index)
                max_index = region_index;
        }

        /* lower the shift until the loudest region fits */
        max_index += REGION_POWER_TABLE_NUM_NEGATIVES;
        region_index = 9;
        while ((region_index >= 0) && (max_index > 28)) {
            max_index -= 2;
            region_index--;
        }

        for (region = 0; region < NUMBER_OF_REGIONS; region++) {
            int rsd_index = absolute_region_power_index[region] + REGION_POWER_TABLE_NUM_NEGATIVES + region_index * 2;
            decoder_region_standard_deviation[region] = region_standard_deviation_table[rsd_index];
        }

        *p_mag_shift = region_index;
    }

    /* decode the quantized bits into MLT coefs */
    res = decode_vector_quantized_mlt_indices(
        data_u32, &bitpos, 8 * expected_frame_size,
        p_random_value,
        decoder_region_standard_deviation, power_categories, mlt_coefs);
    if (res < 0) return res;


    /* test for errors (in refdec but not Namco's, useful to detect decryption) */
    if (test_errors) {
        int max_pad_bytes = 0x7; /* usually 0x04 and rarely ~0x07 */
        int bits_left = 8 * expected_frame_size - bitpos;
        int endpos, test_bits;

        if (bits_left > 0) {

            /* frame must be padded with 1s after regular data */
            endpos = bitpos;
            for (i = 0; i < bits_left; i++) {
                int bit = (data_u32[endpos >> 5] >> (31 - (endpos & 0x1F))) & 1;
                endpos++;

                if (bit == 0)
                    return -1;
            }

            /* extra: test we aren't in the middle of padding (happens with bad keys)
             * After reading the whole frame, last bit position should land near last useful
             * data, a few bytes into padding, so check there aren't too many padding bits. */
            endpos = bitpos;
            test_bits = 8 * max_pad_bytes;
            if (test_bits > bitpos)
                test_bits = bitpos;
            for (i = 0; i < test_bits; i++) {
                int bit = (data_u32[endpos >> 5] >> (31 - (endpos & 0x1F))) & 1;
                endpos--; /* from last position towards valid data */

                if (bit != 1)
                    break;
            }

            /* never found a 0: everything before the end position was padding too */
            if (i == test_bits)
                return -8;

        }
        else {
            /* ? */
            /* NOTE(review): this compares the control value against a bit count; refdec's
             * equivalent test presumably uses NUM_CATEGORIZATION_CONTROL_POSSIBILITIES - 1,
             * confirm which one is intended */
            if (categorization_control < NUM_CATEGORIZATION_CONTROL_BITS - 1 && bits_left < 0)
                return -2;
        }

        /* undo the region 0 adjustment and check indexes are within expected range */
        for (i = 0; i < NUMBER_OF_REGIONS; i++) {
            if ((absolute_region_power_index[i] + ESF_ADJUSTMENT_TO_RMS_INDEX > 31) ||
                (absolute_region_power_index[i] + ESF_ADJUSTMENT_TO_RMS_INDEX < -8))
              return -4;
        }
    }

    return 0;
}


/*****************************************************************************
 * API
 *****************************************************************************/

/* Decoder instance, created by g7221_init and released by g7221_free */
struct g7221_handle {
    /* control */
    int bit_rate;       /* bits per second, derived from the frame size in g7221_init */
    int frame_size;     /* bytes per frame, as passed to g7221_init */
    /* AES setup/state */
    s14aes_handle* aes; /* NULL unless a key was set; frames are only decrypted when non-NULL */
    /* state */
    int16_t mlt_coefs[MAX_DCT_LENGTH];          /* coefs unpacked from the current frame */
    int16_t old_samples[MAX_DCT_LENGTH >> 1];   /* carried between frames by the inverse MLT, zeroed on reset */
    uint32_t random_value;                      /* noise-fill generator state, reseeded on reset */
};

/* Creates a decoder for frames of the given size.
 *
 * bytes_per_frame: 0x3c, 0x50 or 0x78 (24000, 32000 or 48000 bps at 50 frames per second)
 *
 * Returns a new handle (release with g7221_free), or NULL on unsupported size or if
 * memory can't be allocated. */
g7221_handle* g7221_init(int bytes_per_frame) {
    g7221_handle* handle;

    /* validate the size itself rather than the derived bit rate, so the multiplication
     * below can't overflow with bogus (huge) values */
    if (bytes_per_frame != 0x3c && bytes_per_frame != 0x50 && bytes_per_frame != 0x78)
        return NULL;

    handle = calloc(1, sizeof(*handle));
    if (!handle)
        return NULL;

    handle->bit_rate = bytes_per_frame * 8 * 50;
    handle->frame_size = bytes_per_frame;

    g7221_reset(handle);

    return handle;
}


/* Decodes one frame into PCM samples.
 *
 * handle:      decoder state
 * data:        frame bytes; modified in place when a key is set (decrypted before unpacking)
 * out_samples: output buffer filled by the inverse MLT
 *
 * Returns 0 on success, or the negative code reported by the failing step. */
int g7221_decode_frame(g7221_handle* handle, uint8_t* data, int16_t* out_samples) {
    int mag_shift;
    int status;
    int use_aes = (handle->aes != NULL);

    /* First 0x10 bytes may be AES-encrypted. Original code also keeps the encrypted bytes and
     * re-crypts after unpacking, presumably to guard against memdumps. */
    if (use_aes)
        s14aes_decrypt(handle->aes, data);

    /* Namco's decoder allows out_samples to double as mlt_coefs (saving one buffer),
     * but a separate buffer is kept here for clarity. */

    /* step 1: frame bytes to MLT spectrum coefs (extra error tests only with a key set) */
    status = unpack_frame(handle->bit_rate, data, handle->frame_size, &mag_shift, handle->mlt_coefs, &handle->random_value, use_aes);
    if (status < 0)
        return status;

    /* step 2: coefs to samples via reverse (inverse) MLT */
    status = rmlt_coefs_to_samples(mag_shift, handle->mlt_coefs, handle->old_samples, out_samples);
    if (status < 0)
        return status;

    /* Namco also reports number of codes/samples done in both steps (ptr arg),
     * but they seem unused */

    return 0;
}

#if 0
/* Disabled: decodes a canned 24000bps "empty" frame into out_samples.
 * Unlike the rest of the API this returns 1 on success and 0 on failure. */
int g7221_decode_empty(g7221_handle* handle, int16_t* out_samples) {
    static const uint8_t empty_frame[0x3c] = {
         0x1E,0x0B,0x89,0x40,0x02,0x4F,0x51,0x35, 0x10,0xA1,0xFE,0xDF,0x52,0x51,0x10,0x0B,
         0xF0,0x69,0x7B,0xAE,0x18,0x17,0x00,0x52, 0x07,0x74,0xF4,0x65,0xA2,0x58,0xD8,0x3F,
         0xD9,0xAA,0x65,0x35,0x2A,0x14,0xE3,0x58, 0xD7,0xC0,0xD2,0x02,0x5B,0x0E,0x2A,0x98,
         0xA3,0x04,0x5E,0x51,0xE5,0xC5,0xB2,0x14, 0xBF,0x58,0xFF,0xFF
    };
    int res;
    int mag_shift;

    /* This only seems to exist in older exes. Namco's samples don't reach EOF, so this
     * wouldn't need to be called. Doesn't seem to use encoder delay either. */

    /* frame above is plain data, so extra error tests (last arg) aren't needed */
    res = unpack_frame(24000, empty_frame, 0x3c, &mag_shift, handle->mlt_coefs, &handle->random_value, 0);
    if (res) goto fail;

    /* convert coefs to samples using reverse (inverse) MLT */
    res = rmlt_coefs_to_samples(mag_shift, handle->mlt_coefs, handle->old_samples, out_samples);
    if (res) goto fail;

    return 1;
fail:
    return 0;
}
#endif

/* Puts the decoder back into its initial state: noise generator reseeded and overlap
 * history cleared. Bit rate, frame size and AES key are left untouched. */
void g7221_reset(g7221_handle* handle) {

    /* seed of the random number generator.
     * vs refdec: different default random. Namco used a global, so maybe multiple
     * bnsf playing at the same time would get slightly different results */
    handle->random_value = 0x10001;

    /* only old values need clearing (others get overwritten) */
    memset(handle->old_samples, 0, sizeof(handle->old_samples));
}

/* Releases a handle created by g7221_init along with its AES state. NULL is ignored. */
void g7221_free(g7221_handle* handle) {
    if (handle != NULL) {
        s14aes_close(handle->aes);
        free(handle);
    }
}

/* Sets or clears the AES key used to decrypt frames.
 *
 * handle: decoder state
 * key:    192-bit (24 bytes) base key, or NULL to disable decryption
 *
 * Returns 0 on success (including disabling) and -1 on failure (NULL handle or AES
 * state couldn't be created). */
int g7221_set_key(g7221_handle* handle, const uint8_t* key) {
    const int key_size = 192 / 8; /* only 192 bit mode */
    uint8_t temp_key[192 / 8];
    const char* mod_key = "Ua#oK3P94vdxX,ft*k-mnjoO"; /* constant for all platform/games */
    int i;

    if (!handle)
        goto fail;

    /* disable, useful for testing? (reports plain success, same as setting a key) */
    if (key == NULL) {
        s14aes_close(handle->aes);
        handle->aes = NULL;
        return 0;
    }

    /* init AES state (tables) or reuse if already exists */
    if (handle->aes == NULL) {
        handle->aes = s14aes_init();
        if (!handle->aes) goto fail;
    }

    /* Base key is XORed probably against memdumps, as plain key would be part of the final AES key. However
     * roundkey is still in memdumps near AES state (~0x1310 from sbox table, that starts with 0x63,0x7c,0x77,0x7b...)
     * so it isn't too effective. XORing was originally done inside aes_expand_key during S14/S22 init. */
    for (i = 0; i < key_size; i++) {
        temp_key[i] = key[i] ^ mod_key[i];
    }

    /* reset new key */
    s14aes_set_key(handle->aes, temp_key);

    return 0;
fail:
    return -1;
}