vgmstream/src/coding/circus_decoder_lib.c

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

/* Decodes Circus's audio codec, reverse engineered from various .exe.
 *
 * Some sources identify this codec as VQ (vector quantization), though vector(?)
 * data isn't actually bitpacked and just compressed using custom LZ or standard zlib.
 * Channels aren't divided either so decoding results in N-ch interleaved PCM.
 * It does seem to be using LPC/speech stuff from VQ codecs though.
 *
 * Some info from Japanese libpcm.c found in foo_adpcm
 * https://bitbucket.org/losnoco/foo_adpcm/src/master/foo_oki/source/libpcm/libpcm.cpp
 */

#include "circus_decoder_lib.h"
#include "circus_decoder_lib_data.h"

#include "circus_decoder_lzxpcm.h"

/* use miniz (API-compatible) to avoid adding external zlib just for this codec
 * - https://github.com/richgel999/miniz */
#include "circus_decoder_miniz.h" 
//#include "zlib.h"


//#define XPCM_CODEC_PCM       0
#define XPCM_CODEC_VQ_LZXPCM   1
//#define XPCM_CODEC_ADPCM     2
#define XPCM_CODEC_VQ_DEFLATE  3

/* frame encodes 4096 PCM samples (all channels) = 4096*2 = 0x2000 bytes, re-interleaved then compressed */
#define XPCM_FRAME_SIZE         (4096 * 2)
#define XPCM_FRAME_CODES        4096
#define XPCM_FRAME_SAMPLES_ALL  4064
#define XPCM_FRAME_OVERLAP_ALL  32
#define XPCM_INPUT_SIZE         0x8000

/* ************************************************************************* */
/* DECODE */
/* ************************************************************************* */

/* Decoder state for one stream: configuration, one scratch buffer per decoding
 * stage, output filter history and decompressor state.
 * Allocated by circus_init() and released by circus_free(). */
struct circus_handle_t {
    /* config */
    off_t start;                        /* offset that circus_reset() rewinds 'offset' to */
    uint8_t codec;                      /* XPCM_CODEC_* value selecting the decompressor */
    uint8_t flags;                      /* low nibble selects the scale table row, bit 0x10 adds gain in convert() */
    const int* scales;                  /* scale_table row picked from 'flags' in circus_init() */

    /* temp buffers */
    uint8_t srcbuf[XPCM_INPUT_SIZE];    /* compressed input data (arbitrary size) */
    uint8_t decbuf[XPCM_FRAME_SIZE];    /* single decompressed frame */
    uint8_t intbuf[XPCM_FRAME_SIZE];    /* re-interleaved frame */
    int32_t invbuf[XPCM_FRAME_CODES];   /* main LPC data (may need less) */
    int32_t tmpbuf[XPCM_FRAME_CODES];   /* temp LPC data (may need less) */

    /* output samples (original code reuses decbuf though) */
    /* the first XPCM_FRAME_SAMPLES_ALL entries are handed to the caller; the last
     * XPCM_FRAME_OVERLAP_ALL entries are kept and blended into the next frame by convert() */
    int16_t pcmbuf[XPCM_FRAME_SAMPLES_ALL + XPCM_FRAME_OVERLAP_ALL]; /* final output samples and extra overlap samples */

    /* sample filter state */
    int hist1;                          /* previous filtered value (before overlap/clamp) */
    int hist2;                          /* filtered value before 'hist1' */
    int frame;                          /* frames decoded since the last reset; 0 disables the overlap blend */

    /* lz/deflate decompression state */
    lzxpcm_stream_t lstrm;              /* used when codec is XPCM_CODEC_VQ_LZXPCM */
    z_stream dstrm;                     /* used when codec is XPCM_CODEC_VQ_DEFLATE */
    off_t offset;                       /* next file position to read compressed data from */
};


/* Final filtering stage: turns one frame of inverse-transformed values into
 * clamped 16-bit PCM.
 *
 * flags   - codec flags; bit 0x10 applies an extra 3/2 gain
 * invbuf  - XPCM_FRAME_SAMPLES_ALL + XPCM_FRAME_OVERLAP_ALL input values
 * pcmbuf  - output buffer of the same length. When frame > 0 its last
 *           XPCM_FRAME_OVERLAP_ALL entries must still hold the previous
 *           frame's tail on entry: they are blended into the first
 *           XPCM_FRAME_OVERLAP_ALL outputs before being overwritten.
 * p_hist1,
 * p_hist2 - filter history, read on entry and written back on exit
 * frame   - frames decoded so far; 0 disables the blend with the previous tail
 */
static void convert(uint8_t flags, int32_t* invbuf, int16_t* pcmbuf, int* p_hist1, int* p_hist2, int frame) {
    int i;
    int sample, hist1, hist2;

    hist1 = *p_hist1;
    hist2 = *p_hist2;

    /* some ops below would use SHRs (>>), but there is some rounding in the
     * original ASM that decompiles and I think corresponds to DIVs
     * (right shift and div of negative values aren't equivalent) */

    /* do final filtering and conversion to PCM */
    for (i = 0; i < XPCM_FRAME_SAMPLES_ALL + XPCM_FRAME_OVERLAP_ALL; i++) {
        sample = *invbuf++;
        if (flags & 0x10)
            sample = 3 * sample / 2; /* optional 3/2 gain, truncating toward zero */
        sample /= 1024; //sample >>= 10;

        /* weighted sum of the current value (x27) and the two previous results.
         * The 2^11 scaling is done on uint32_t: left-shifting a negative int is
         * undefined behavior in C, while the unsigned shift is well defined and
         * produces the same bit pattern. */
        sample = 3 * sample + 8 * 3 * sample + 4 * hist1 + hist2;
        sample = (int32_t)((uint32_t)sample << 11);
        sample /= 65536; // sample >>= 16;

        /* history keeps the value before overlap and clamping */
        hist2 = hist1;
        hist1 = sample;

        /* last 32 decoded samples aren't output, but are used next frame to overlap
         * with beginning samples (filters(?) windowing, not too noticeable though) */
        if (i < XPCM_FRAME_OVERLAP_ALL && frame > 0) {
            sample = (i * sample) + ((XPCM_FRAME_OVERLAP_ALL - i) * pcmbuf[XPCM_FRAME_SAMPLES_ALL + i]);
            sample /= 32; //sample >>= 5
        }

        if (sample > 32767)
            sample = 32767;
        else if (sample < -32768)
            sample = -32768;

        pcmbuf[i] = sample;
    }

    *p_hist1 = hist1;
    *p_hist2 = hist2;
}

/* In-place inverse transform over the 4096-entry invbuf/tmpbuf pair.
 *
 * Runs 10 butterfly passes whose group size (step1) halves every pass, rotating
 * differences with sincos_table values, then two table-free passes (element
 * distance 2, then 1), and finally reorders both buffers with an index swap.
 *
 * NOTE(review): the structure looks like a fixed-point FFT-style transform with
 * invbuf/tmpbuf acting as real/imaginary parts and a final bit-reversal
 * reordering; that reading is not confirmed by anything in this file.
 * Products are shifted right by 12, so sincos_table entries presumably carry
 * 12 fractional bits - confirm against circus_decoder_lib_data.h.
 *
 * Statement order matters throughout: each butterfly reads both operands
 * before overwriting them.
 */
static void transform(int32_t* invbuf, int32_t* tmpbuf) {
    int lpc1, lpc2, lpc3, lpc4;
    int step1, step2, step3;
    int sc1, sc2;

    /* bits were originally configurable (passed arg), but actually called with const 12,
     * and removed in later games along with superfluous ifs (coefs > 0, bits >= 3, etc) */
    //const int frame_bits = 12;

    step1 = 4096; /* 1 << 12 */
    step2 = step1 >> 1;
    step3 = step2 >> 1;
    sc1 = 1;

    /* inverse transform of LPC(?) coefs */
    for (lpc1 = 0; lpc1 < 12 - 2; lpc1++) {
        int sub1, sub2;
        int i1, i2, i3, i4;
        int64_t cos1, sin1, cos2, sin2; /* needs i64 to force 64b ops (avoid overflows) */

        /* table is read at [n] and at [n + 1024]; the two reads are used as a sin/cos pair */
        cos1 = (int64_t)sincos_table[sc1 + 1024];
        sin1 = (int64_t)sincos_table[sc1 + 0];

        /* the four starting positions inside each group of step1 entries */
        i1 = 0;
        i2 = step2;
        i3 = step3;
        i4 = step2 + step3;

        /* offsets +0 and +1 of every group: +0 needs no table value, +1 uses cos1/sin1 */
        for (lpc2 = 0; lpc2 < 4096; lpc2 += step1) {
            sub1 = invbuf[i1 + 0] - invbuf[i2 + 0];
            sub2 = tmpbuf[i1 + 0] - tmpbuf[i2 + 0];
            invbuf[i1 + 0] += invbuf[i2 + 0];
            tmpbuf[i1 + 0] += tmpbuf[i2 + 0];
            invbuf[i2 + 0] = sub1;
            tmpbuf[i2 + 0] = sub2;

            sub1 = invbuf[i1 + 1] - invbuf[i2 + 1];
            sub2 = tmpbuf[i1 + 1] - tmpbuf[i2 + 1];
            invbuf[i1 + 1] += invbuf[i2 + 1];
            tmpbuf[i1 + 1] += tmpbuf[i2 + 1];
            invbuf[i2 + 1] = ((sub1 * cos1) >> 12) + ((sub2 * sin1) >> 12);
            tmpbuf[i2 + 1] = ((sub2 * cos1) >> 12) - ((sub1 * sin1) >> 12);

            /* second half of the group: differences are additionally swapped/negated */
            sub1 = invbuf[i3 + 0] - invbuf[i4 + 0];
            sub2 = tmpbuf[i3 + 0] - tmpbuf[i4 + 0];
            invbuf[i3 + 0] += invbuf[i4 + 0];
            tmpbuf[i3 + 0] += tmpbuf[i4 + 0];
            invbuf[i4 + 0] = sub2;
            tmpbuf[i4 + 0] = -sub1;

            sub1 = invbuf[i3 + 1] - invbuf[i4 + 1];
            sub2 = tmpbuf[i3 + 1] - tmpbuf[i4 + 1];
            invbuf[i3 + 1] += invbuf[i4 + 1];
            tmpbuf[i3 + 1] += tmpbuf[i4 + 1];
            invbuf[i4 + 1] =   ((sub2 * cos1) >> 12) - ((sub1 * sin1) >> 12);
            tmpbuf[i4 + 1] = -(((sub1 * cos1) >> 12) + ((sub2 * sin1) >> 12));

            i1 += step1;
            i2 += step1;
            i3 += step1;
            i4 += step1;
        }

        /* remaining offsets 2..step3-1 of every group; the table index starts at
         * 2 * sc1 and advances by sc1 per offset */
        if (step3 > 2) {
            sc2 = sc1 * 2;

            for (lpc3 = 2; lpc3 < step3; lpc3++) {
                cos2 = (int64_t)sincos_table[sc2 + 1024];
                sin2 = (int64_t)sincos_table[sc2 + 0];
                sc2 += sc1;

                i1 = 0 + lpc3;
                i2 = step2 + lpc3;
                i3 = step3 + lpc3;
                i4 = step2 + step3 + lpc3;

                for (lpc4 = 0; lpc4 < 4096; lpc4 += step1) {
                    sub1 = invbuf[i1] - invbuf[i2];
                    sub2 = tmpbuf[i1] - tmpbuf[i2];
                    invbuf[i1] += invbuf[i2];
                    tmpbuf[i1] += tmpbuf[i2];
                    invbuf[i2] = ((sub1 * cos2) >> 12) + ((sub2 * sin2) >> 12);
                    tmpbuf[i2] = ((sub2 * cos2) >> 12) - ((sub1 * sin2) >> 12);

                    sub1 = invbuf[i3] - invbuf[i4];
                    sub2 = tmpbuf[i3] - tmpbuf[i4];
                    invbuf[i3] += invbuf[i4];
                    tmpbuf[i3] += tmpbuf[i4];
                    invbuf[i4] =   ((sub2 * cos2) >> 12) - ((sub1 * sin2) >> 12);
                    tmpbuf[i4] = -(((sub1 * cos2) >> 12) + ((sub2 * sin2) >> 12));

                    i1 += step1;
                    i2 += step1;
                    i3 += step1;
                    i4 += step1;
                }
            }
        }

        /* next pass: half the group size, double the table stride */
        step1 = step2; // step1 >>= 1;
        step2 = step3; // step2 >>= 1;
        step3 >>= 1;
        sc1 *= 2;
    }

    {
        int i, j;
        int sub1, sub2, pow;

        /* pass over groups of 4: pairs (i+0, i+2) and (i+1, i+3); the second pair
         * crosses the two buffers when storing the differences */
        for (i = 0; i < 4096; i += 4) {
            sub1 = invbuf[i + 0] - invbuf[i + 2];
            invbuf[i + 0] += invbuf[i + 2];
            invbuf[i + 2] = sub1;

            sub2 = tmpbuf[i + 0] - tmpbuf[i + 2];
            tmpbuf[i + 0] += tmpbuf[i + 2];
            tmpbuf[i + 2] = sub2;

            sub1 = invbuf[i + 3] - invbuf[i + 1];
            sub2 = tmpbuf[i + 1] - tmpbuf[i + 3];
            invbuf[i + 1] += invbuf[i + 3];
            invbuf[i + 3] = sub2;
            tmpbuf[i + 1] += tmpbuf[i + 3];
            tmpbuf[i + 3] = sub1;
        }

        /* pass over adjacent pairs: sum and difference */
        for (i = 0; i < 4096; i += 2) {
            sub1 = invbuf[i + 0] - invbuf[i + 1];
            invbuf[i + 0] += invbuf[i + 1];
            invbuf[i + 1] = sub1;

            sub2 = tmpbuf[i + 0] - tmpbuf[i + 1];
            tmpbuf[i + 0] += tmpbuf[i + 1];
            tmpbuf[i + 1] = sub2;
        }

        /* reorder entries: j is advanced by a high-to-low carry each step and entries
         * i/j are swapped once per pair (only when i < j), in both buffers */
        for (i = 1, j = 0; i < 4096 - 1; i++) {
            for (pow = 4096 / 2; pow <= j; pow /= 2) {
                j -= pow;
            }
            j += pow;

            if (i < j) {
                sub1 = invbuf[j];
                invbuf[j] = invbuf[i];
                invbuf[i] = sub1;

                sub2 = tmpbuf[j];
                tmpbuf[j] = tmpbuf[i];
                tmpbuf[i] = sub2;
            }
        }
    }
}

/* Maps a 16-bit code to a signed value: the lowest bit selects the sign side,
 * the remaining bits the magnitude (odd -> 1..32768, even -> 0..-32767). */
static int scale_unpack_code(uint8_t lo, uint8_t hi) {
    const int code = lo | (hi << 8); /* little-endian u16 */
    const int magnitude = code >> 1;

    if (code & 1)
        return magnitude + 1;
    return -magnitude;
}

/* Expands the re-interleaved frame into scaled values.
 *
 * intbuf holds 2048 groups of two little-endian 16-bit codes. The first code
 * of group n goes to invbuf[n] and the second to tmpbuf[n], each multiplied
 * by scales[n / 256] (one multiplier per run of 256 groups, 8 in total).
 * The upper halves (2048..4095) of invbuf and tmpbuf are cleared.
 */
static void scale(const uint8_t* intbuf, const int* scales, int32_t* invbuf, int32_t* tmpbuf) {
    const int groups = 4096 / 2;
    const uint8_t* src = intbuf;
    int n;

    for (n = 0; n < groups; n++, src += 4) {
        /* same band as (n * 16) / 4096 */
        const int multiplier = scales[n / 256];

        invbuf[n] = multiplier * scale_unpack_code(src[0], src[1]);
        tmpbuf[n] = multiplier * scale_unpack_code(src[2], src[3]);
    }

    /* nothing is decoded into the upper half */
    for (n = groups; n < 4096; n++) {
        invbuf[n] = 0;
        tmpbuf[n] = 0;
    }
}

/* Rebuilds the 0x2000-byte interleaved frame from the decompressed layout.
 *
 * decbuf is read as three regions:
 *   0x0000..0x0FFF  bytes copied to the odd positions of intbuf
 *   0x1000..0x17FF  and
 *   0x1800..0x1FFF  one byte from each region is split into nibbles and
 *                   recombined into two bytes for the even positions
 */
static void interleave(const uint8_t* decbuf, uint8_t* intbuf) {
    const uint8_t* odd_src = decbuf;
    const uint8_t* nib_a = decbuf + 0x1000;
    const uint8_t* nib_b = decbuf + 0x1800;
    int n;

    /* odd output bytes: straight copy */
    for (n = 0; n < 0x1000; n++) {
        intbuf[2 * n + 1] = odd_src[n];
    }

    /* even output bytes: each input pair fills positions 4n and 4n+2 */
    for (n = 0; n < 0x800; n++) {
        const uint8_t a = nib_a[n];
        const uint8_t b = nib_b[n];
        uint8_t* dst = intbuf + 4 * n;

        dst[0] = (a & 0xF0) | (b >> 4);   /* high nibbles of both */
        dst[2] = (a << 4) | (b & 0x0F);   /* low nibbles of both (truncated to 8 bits) */
    }
}

/* ************************************************************ */
/* API */
/* ************************************************************ */

/* Creates a decoder handle.
 *
 * start - offset of the compressed data (decoding and resets begin there)
 * codec - XPCM_CODEC_* value; only the deflate codec needs extra setup here
 * flags - low nibble selects the scale_table row (values above 5 are rejected),
 *         the remaining bits are kept for convert()
 *
 * Returns the new handle (release with circus_free()), or NULL when the flags
 * are invalid, memory runs out or the inflate state can't be created.
 */
circus_handle_t* circus_init(off_t start, uint8_t codec, uint8_t flags) {
    circus_handle_t* handle = NULL;
    int scale_index, err;

    /* validate before allocating anything */
    scale_index = (flags & 0xF);
    if (scale_index > 5) goto fail;

    /* zero-initialized so that circus_free() on a partially built handle (and
     * any field not explicitly set below) never sees indeterminate data; in
     * particular inflateEnd() must not run on a garbage z_stream */
    handle = calloc(1, sizeof(*handle));
    if (!handle) goto fail;

    handle->start = start;
    handle->codec = codec; //(config >> 0) & 0xFF;
    handle->flags = flags; //(config >> 8) & 0xFF;
    handle->scales = scale_table[scale_index];

    if (handle->codec == XPCM_CODEC_VQ_DEFLATE) {
        /* dstrm is already all-zero from calloc, as inflateInit expects */
        err = inflateInit(&handle->dstrm);
        if (err < 0) goto fail;
    }

    circus_reset(handle);

    return handle;
fail:
    circus_free(handle);
    return NULL;
}

/* Releases a handle created by circus_init(); NULL is accepted and ignored. */
void circus_free(circus_handle_t* handle) {
    if (handle == NULL)
        return;

    switch (handle->codec) {
        case XPCM_CODEC_VQ_DEFLATE:
            /* only the deflate path holds decompressor state that needs tearing down here */
            inflateEnd(&handle->dstrm);
            break;
        default:
            break;
    }

    free(handle);
}

/* Rewinds the decoder to the beginning of the stream: clears the output filter
 * history and frame counter, resets the active decompressor and moves the read
 * offset back to 'start'. NULL is accepted and ignored. */
void circus_reset(circus_handle_t* handle) {
    if (!handle)
        return;
    handle->hist1 = 0;
    handle->hist2 = 0;
    handle->frame = 0;

    if (handle->codec == XPCM_CODEC_VQ_LZXPCM) {
        /* NOTE(review): assumes lzxpcm_reset() also drops any pending input
         * (avail_in) - confirm in circus_decoder_lzxpcm.h */
        lzxpcm_reset(&handle->lstrm);
    } else if (handle->codec == XPCM_CODEC_VQ_DEFLATE) {
        inflateReset(&handle->dstrm);
        /* inflateReset() leaves next_in/avail_in untouched, so bytes buffered from
         * the old position would be fed to the fresh stream; drop them to force
         * a re-read from 'start' on the next frame */
        handle->dstrm.next_in = handle->srcbuf;
        handle->dstrm.avail_in = 0;
    }
    handle->offset = handle->start;
}

/* Fills handle->decbuf with one frame of LZXPCM-decompressed data.
 *
 * Compressed bytes are pulled from 'sf' into srcbuf in chunks whenever the
 * stream runs out of input. If the file ends before the frame is complete the
 * rest of decbuf is zero-filled and the frame is still reported as decoded.
 *
 * Returns 1 on success, 0 if lzxpcm_decompress() reports an error.
 */
static int decompress_frame_lzxpcm(circus_handle_t* handle, STREAMFILE* sf) {
    int res;

    handle->lstrm.next_out = handle->decbuf;
    handle->lstrm.avail_out = sizeof(handle->decbuf);
    handle->lstrm.total_out = 0; /* counted per frame, used for the EOF padding below */
    do {
        if (handle->lstrm.avail_in == 0) {
            handle->lstrm.next_in = handle->srcbuf;
            handle->lstrm.avail_in = read_streamfile(handle->srcbuf, handle->offset, sizeof(handle->srcbuf), sf);
            handle->offset += handle->lstrm.avail_in;

            /* EOF (game reserves some extra buf so memset'ing is probably equivalent) */
            if (handle->lstrm.avail_in == 0) {
                /* pad using this stream's own counter (not the deflate stream's) */
                memset(handle->decbuf + handle->lstrm.total_out, 0, sizeof(handle->decbuf) - handle->lstrm.total_out);
                break;
            }
        }

        res = lzxpcm_decompress(&handle->lstrm);
        if (res != LZXPCM_OK)
            goto fail;
    }
    while(handle->lstrm.avail_out != 0);

    return 1;
fail:
    return 0;
}

/* Fills handle->decbuf with one frame of inflated data.
 *
 * Compressed bytes are pulled from 'sf' into srcbuf in chunks whenever the
 * stream runs out of input. If the file or the deflate stream ends before the
 * frame is complete, the rest of decbuf is zero-filled and the frame is still
 * reported as decoded.
 *
 * Returns 1 on success, 0 if inflate() reports an error.
 */
static int decompress_frame_deflate(circus_handle_t* handle, STREAMFILE* sf) {
    int res;

    handle->dstrm.next_out = handle->decbuf;
    handle->dstrm.avail_out = sizeof(handle->decbuf);
    handle->dstrm.total_out = 0; /* counted per frame, used for the padding below */
    do {
        if (handle->dstrm.avail_in == 0) {
            handle->dstrm.next_in = handle->srcbuf;
            handle->dstrm.avail_in = read_streamfile(handle->srcbuf, handle->offset, sizeof(handle->srcbuf), sf);
            handle->offset += handle->dstrm.avail_in;

            /* EOF (game reserves some extra buf so memset'ing is probably equivalent) */
            if (handle->dstrm.avail_in == 0) {
                memset(handle->decbuf + handle->dstrm.total_out, 0, sizeof(handle->decbuf) - handle->dstrm.total_out);
                break;
            }
        }

        res = inflate(&handle->dstrm, Z_NO_FLUSH);
        if (res == Z_STREAM_END) {
            /* the compressed stream is over: further inflate() calls make no
             * progress, so with trailing input left this loop would never exit.
             * Treat it like EOF (size is 0 when the frame is already full) */
            memset(handle->decbuf + handle->dstrm.total_out, 0, sizeof(handle->decbuf) - handle->dstrm.total_out);
            break;
        }
        if (res != Z_OK)
            goto fail;
    }
    while(handle->dstrm.avail_out != 0);

    return 1;
fail:
    return 0;
}

#ifdef XPCM_ALT
/* original code uses zlib 1.2.1 to decompress the full stream into memory */
/* Inflates 'src' (src_size bytes) into 'dst' (room for dst_size bytes) in one call.
 *
 * Errors from inflate() itself are deliberately tolerated (partial output is
 * kept); only failing to set up or tear down the stream counts as failure.
 *
 * Returns 1 on success and 0 on failure, like the other helpers in this file
 * (the success path used to return 0 as well, making both indistinguishable).
 */
static int deflate_decompress_full(uint8_t* dst, size_t dst_size, const uint8_t* src, size_t src_size) {
    int err;
    z_stream strm = {0};
    strm.next_in  = src;
    strm.avail_in = src_size;
    strm.next_out = dst;
    strm.avail_out = dst_size;

    err = inflateInit(&strm);
    if (err < 0) {
        //printf("inflateInit error: %i\n", err);
        return 0;
    }

    err = inflate(&strm, Z_FINISH);
    if (err < 0) {
        /* intentionally not fatal */
        //printf("inflate error: %i\n", err);
        //return 0;
    }

    err = inflateEnd(&strm);
    if (err < 0) {
        //printf("inflateEnd error: %i\n", err);
        return 0;
    }

    return 1;
}
#endif

/* Decodes the next frame.
 *
 * Decompresses one frame with the handle's codec, then runs it through the
 * interleave -> scale -> transform -> convert stages.
 *
 * On success returns 1, points *p_buf at the handle's internal PCM buffer and
 * sets *p_buf_samples_all to XPCM_FRAME_SAMPLES_ALL. Returns 0 (outputs
 * untouched) when the codec is unsupported or decompression fails.
 */
int circus_decode_frame(circus_handle_t* handle, STREAMFILE* sf, int16_t** p_buf, int* p_buf_samples_all) {
    int decompressed = 0;

    switch (handle->codec) {
        case XPCM_CODEC_VQ_LZXPCM:
            decompressed = decompress_frame_lzxpcm(handle, sf);
            break;
        case XPCM_CODEC_VQ_DEFLATE:
            decompressed = decompress_frame_deflate(handle, sf);
            break;
        default:
            /* other codec ids aren't handled by this decoder */
            break;
    }
    if (!decompressed)
        return 0;

    /* decbuf -> intbuf -> invbuf/tmpbuf -> pcmbuf */
    interleave(handle->decbuf, handle->intbuf);
    scale(handle->intbuf, handle->scales, handle->invbuf, handle->tmpbuf);
    transform(handle->invbuf, handle->tmpbuf);
    convert(handle->flags, handle->invbuf, handle->pcmbuf, &handle->hist1, &handle->hist2, handle->frame);
    handle->frame += 1;

    /* the overlap tail stays in pcmbuf but isn't reported to the caller */
    *p_buf = handle->pcmbuf;
    *p_buf_samples_all = XPCM_FRAME_SAMPLES_ALL;
    return 1;
}
Add .xpcm VQ + LZ/deflate codec [Eternal Fantasy (PC)] 2020-03-07 23:14:56 +01:00			`#include <stdio.h>`
			`#include <stdint.h>`
			`#include <string.h>`
			`#include <stdlib.h>`

			`/* Decodes Circus's audio codec, reverse engineered from various .exe.`
			`*`
			`* Some sources identify this codec as VQ (vector quantization), though vector(?)`
			`* data isn't actually bitpacked and just compressed using custom LZ or standard zlib.`
			`* Channels aren't divided either so decoding results in N-ch interleaved PCM.`
			`* It does seem to be using LPC/speech stuff from VQ codecs though.`
			`*`
			`* Some info from Japanese libpcm.c found in foo_adpcm`
			`* https://bitbucket.org/losnoco/foo_adpcm/src/master/foo_oki/source/libpcm/libpcm.cpp`
			`*/`

			`#include "circus_decoder_lib.h"`
			`#include "circus_decoder_lib_data.h"`

			`#include "circus_decoder_lzxpcm.h"`

			`/* use miniz (API-compatible) to avoid adding external zlib just for this codec`
			`* - https://github.com/richgel999/miniz */`
			`#include "circus_decoder_miniz.h"`
			`//#include "zlib.h"`


			`//#define XPCM_CODEC_PCM 0`
			`#define XPCM_CODEC_VQ_LZXPCM 1`
			`//#define XPCM_CODEC_ADPCM 2`
			`#define XPCM_CODEC_VQ_DEFLATE 3`

			`/* frame encodes 4096 PCM samples (all channels) = 4096*2 = 0x2000 bytes, re-interleaved then compressed */`
			`#define XPCM_FRAME_SIZE (4096 * 2)`
			`#define XPCM_FRAME_CODES 4096`
			`#define XPCM_FRAME_SAMPLES_ALL 4064`
			`#define XPCM_FRAME_OVERLAP_ALL 32`
			`#define XPCM_INPUT_SIZE 0x8000`

			`/* ************************************************************************* */`
			`/* DECODE */`
			`/* ************************************************************************* */`

			`struct circus_handle_t {`
			`/* config */`
			`off_t start;`
			`uint8_t codec;`
			`uint8_t flags;`
			`const int* scales;`

			`/* temp buffers */`
			`uint8_t srcbuf[XPCM_INPUT_SIZE]; /* compressed input data (arbitrary size) */`
			`uint8_t decbuf[XPCM_FRAME_SIZE]; /* single decompressed frame */`
			`uint8_t intbuf[XPCM_FRAME_SIZE]; /* re-interleaved frame */`
			`int32_t invbuf[XPCM_FRAME_CODES]; /* main LPC data (may need less) */`
			`int32_t tmpbuf[XPCM_FRAME_CODES]; /* temp LPC data (may need less) */`

			`/* output samples (original code reuses decbuf though) */`
			`int16_t pcmbuf[XPCM_FRAME_SAMPLES_ALL + XPCM_FRAME_OVERLAP_ALL]; /* final output samples and extra overlap samples */`

			`/* sample filter state */`
			`int hist1;`
			`int hist2;`
			`int frame;`

			`/* lz/deflate decompression state */`
			`lzxpcm_stream_t lstrm;`
			`z_stream dstrm;`
			`off_t offset;`
			`};`


			`static void convert(uint8_t flags, int32_t* invbuf, int16_t* pcmbuf, int* p_hist1, int* p_hist2, int frame) {`
			`int i;`
			`int sample, hist1, hist2;`

			`hist1 = *p_hist1;`
			`hist2 = *p_hist2;`

			`/* some ops below would use SHRs (>>), but there is some rounding in the`
			`* original ASM that decompiles and I think corresponds do DIVs`
			`* (right shift and divs of negative values isn't equivalent) */`

			`/* do final filtering and conversion to PCM */`
			`for (i = 0; i < XPCM_FRAME_SAMPLES_ALL + XPCM_FRAME_OVERLAP_ALL; i++) {`
			`sample = *invbuf++;`
			`if (flags & 0x10)`
			`sample = 3 * sample / 2; // (3 * sample) >> 2`
			`sample /= 1024; //sample >>= 10;`

			`sample = ((3 * sample + 8 * 3 * sample + 4 * hist1 + hist2) << 11);`
			`sample /= 65536; // sample >>= 16;`

			`hist2 = hist1;`
			`hist1 = sample;`

			`/* last 32 decoded samples aren't output, but are used next frame to overlap`
			`* with beginning samples (filters(?) windowing, not too noticeable though) */`
			`if (i < XPCM_FRAME_OVERLAP_ALL && frame > 0) {`
			`sample = (i * sample) + ((XPCM_FRAME_OVERLAP_ALL - i) * pcmbuf[XPCM_FRAME_SAMPLES_ALL + i]);`
			`sample /= 32; //sample >>= 5`
			`}`

			`if (sample > 32767)`
			`sample = 32767;`
			`else if (sample < -32768)`
			`sample = -32768;`

			`pcmbuf[i] = sample;`
			`}`

			`*p_hist1 = hist1;`
			`*p_hist2 = hist2;`
			`}`

			`static void transform(int32_t* invbuf, int32_t* tmpbuf) {`
			`int lpc1, lpc2, lpc3, lpc4;`
			`int step1, step2, step3;`
			`int sc1, sc2;`

			`/* bits were originally configurable (passed arg), but actually called with const 12,`
			`* and removed in later games along with superfluous ifs (coefs > 0, bits >= 3, etc) */`
			`//const int frame_bits = 12;`

			`step1 = 4096; /* 1 << 12 */`
			`step2 = step1 >> 1;`
			`step3 = step2 >> 1;`
			`sc1 = 1;`

			`/* inverse transform of LPC(?) coefs */`
			`for (lpc1 = 0; lpc1 < 12 - 2; lpc1++) {`
			`int sub1, sub2;`
			`int i1, i2, i3, i4;`
			`int64_t cos1, sin1, cos2, sin2; /* needs i64 to force 64b ops (avoid overflows) */`

			`cos1 = (int64_t)sincos_table[sc1 + 1024];`
			`sin1 = (int64_t)sincos_table[sc1 + 0];`

			`i1 = 0;`
			`i2 = step2;`
			`i3 = step3;`
			`i4 = step2 + step3;`

			`for (lpc2 = 0; lpc2 < 4096; lpc2 += step1) {`
			`sub1 = invbuf[i1 + 0] - invbuf[i2 + 0];`
			`sub2 = tmpbuf[i1 + 0] - tmpbuf[i2 + 0];`
			`invbuf[i1 + 0] += invbuf[i2 + 0];`
			`tmpbuf[i1 + 0] += tmpbuf[i2 + 0];`
			`invbuf[i2 + 0] = sub1;`
			`tmpbuf[i2 + 0] = sub2;`

			`sub1 = invbuf[i1 + 1] - invbuf[i2 + 1];`
			`sub2 = tmpbuf[i1 + 1] - tmpbuf[i2 + 1];`
			`invbuf[i1 + 1] += invbuf[i2 + 1];`
			`tmpbuf[i1 + 1] += tmpbuf[i2 + 1];`
			`invbuf[i2 + 1] = ((sub1 * cos1) >> 12) + ((sub2 * sin1) >> 12);`
			`tmpbuf[i2 + 1] = ((sub2 * cos1) >> 12) - ((sub1 * sin1) >> 12);`

			`sub1 = invbuf[i3 + 0] - invbuf[i4 + 0];`
			`sub2 = tmpbuf[i3 + 0] - tmpbuf[i4 + 0];`
			`invbuf[i3 + 0] += invbuf[i4 + 0];`
			`tmpbuf[i3 + 0] += tmpbuf[i4 + 0];`
			`invbuf[i4 + 0] = sub2;`
			`tmpbuf[i4 + 0] = -sub1;`

			`sub1 = invbuf[i3 + 1] - invbuf[i4 + 1];`
			`sub2 = tmpbuf[i3 + 1] - tmpbuf[i4 + 1];`
			`invbuf[i3 + 1] += invbuf[i4 + 1];`
			`tmpbuf[i3 + 1] += tmpbuf[i4 + 1];`
			`invbuf[i4 + 1] = ((sub2 * cos1) >> 12) - ((sub1 * sin1) >> 12);`
			`tmpbuf[i4 + 1] = -(((sub1 * cos1) >> 12) + ((sub2 * sin1) >> 12));`

			`i1 += step1;`
			`i2 += step1;`
			`i3 += step1;`
			`i4 += step1;`
			`}`

			`if (step3 > 2) {`
			`sc2 = sc1 * 2;`

			`for (lpc3 = 2; lpc3 < step3; lpc3++) {`
			`cos2 = (int64_t)sincos_table[sc2 + 1024];`
			`sin2 = (int64_t)sincos_table[sc2 + 0];`
			`sc2 += sc1;`

			`i1 = 0 + lpc3;`
			`i2 = step2 + lpc3;`
			`i3 = step3 + lpc3;`
			`i4 = step2 + step3 + lpc3;`

			`for (lpc4 = 0; lpc4 < 4096; lpc4 += step1) {`
			`sub1 = invbuf[i1] - invbuf[i2];`
			`sub2 = tmpbuf[i1] - tmpbuf[i2];`
			`invbuf[i1] += invbuf[i2];`
			`tmpbuf[i1] += tmpbuf[i2];`
			`invbuf[i2] = ((sub1 * cos2) >> 12) + ((sub2 * sin2) >> 12);`
			`tmpbuf[i2] = ((sub2 * cos2) >> 12) - ((sub1 * sin2) >> 12);`

			`sub1 = invbuf[i3] - invbuf[i4];`
			`sub2 = tmpbuf[i3] - tmpbuf[i4];`
			`invbuf[i3] += invbuf[i4];`
			`tmpbuf[i3] += tmpbuf[i4];`
			`invbuf[i4] = ((sub2 * cos2) >> 12) - ((sub1 * sin2) >> 12);`
			`tmpbuf[i4] = -(((sub1 * cos2) >> 12) + ((sub2 * sin2) >> 12));`

			`i1 += step1;`
			`i2 += step1;`
			`i3 += step1;`
			`i4 += step1;`
			`}`
			`}`
			`}`

			`step1 = step2; // step1 >>= 1;`
			`step2 = step3; // step2 >>= 1;`
			`step3 >>= 1;`
			`sc1 *= 2;`
			`}`

			`{`
			`int i, j;`
			`int sub1, sub2, pow;`

			`for (i = 0; i < 4096; i += 4) {`
			`sub1 = invbuf[i + 0] - invbuf[i + 2];`
			`invbuf[i + 0] += invbuf[i + 2];`
			`invbuf[i + 2] = sub1;`

			`sub2 = tmpbuf[i + 0] - tmpbuf[i + 2];`
			`tmpbuf[i + 0] += tmpbuf[i + 2];`
			`tmpbuf[i + 2] = sub2;`

			`sub1 = invbuf[i + 3] - invbuf[i + 1];`
			`sub2 = tmpbuf[i + 1] - tmpbuf[i + 3];`
			`invbuf[i + 1] += invbuf[i + 3];`
			`invbuf[i + 3] = sub2;`
			`tmpbuf[i + 1] += tmpbuf[i + 3];`
			`tmpbuf[i + 3] = sub1;`
			`}`

			`for (i = 0; i < 4096; i += 2) {`
			`sub1 = invbuf[i + 0] - invbuf[i + 1];`
			`invbuf[i + 0] += invbuf[i + 1];`
			`invbuf[i + 1] = sub1;`

			`sub2 = tmpbuf[i + 0] - tmpbuf[i + 1];`
			`tmpbuf[i + 0] += tmpbuf[i + 1];`
			`tmpbuf[i + 1] = sub2;`
			`}`

			`for (i = 1, j = 0; i < 4096 - 1; i++) {`
			`for (pow = 4096 / 2; pow <= j; pow /= 2) {`
			`j -= pow;`
			`}`
			`j += pow;`

			`if (i < j) {`
			`sub1 = invbuf[j];`
			`invbuf[j] = invbuf[i];`
			`invbuf[i] = sub1;`

			`sub2 = tmpbuf[j];`
			`tmpbuf[j] = tmpbuf[i];`
			`tmpbuf[i] = sub2;`
			`}`
			`}`
			`}`
			`}`

			`static void scale(const uint8_t* intbuf, const int* scales, int32_t* invbuf, int32_t* tmpbuf) {`
			`int i, j;`

			`/* reinterleave and scale intbuf into invbuf and tmpbuf */`
			`for (i = 0, j = 0; i < 4096 / 2; i++, j += 16) {`
			`int scale, qv1, qv2;`

			`scale = scales[j / 4096];`

			`qv1 = (intbuf[i*4 + 0] << 0) \| (intbuf[i*4 + 1] << 8); /* get_u16le */`
			`qv2 = (intbuf[i*4 + 2] << 0) \| (intbuf[i*4 + 3] << 8); /* get_u16le */`

			`/* lowest bit is short of "positive" flag, or rather: even=0..-32767, odd=1..32768`
			`* (originally done through a LUT init at runtime with all 65536 indexes) */`
			`qv1 = (qv1 & 1) ? (qv1 >> 1) + 1 : -(qv1 >> 1);`
			`qv2 = (qv2 & 1) ? (qv2 >> 1) + 1 : -(qv2 >> 1);`

			`invbuf[i] = scale * qv1;`
			`tmpbuf[i] = scale * qv2;`
			`}`

			`/* reset rest of invbuf/tmpbuf */`
			`for (i = 4096 / 2; i < 4096; i++) {`
			`invbuf[i] = 0;`
			`tmpbuf[i] = 0;`
			`}`
			`}`

			`static void interleave(const uint8_t* decbuf, uint8_t* intbuf) {`
			`int i, j;`

			`/* reorder odd decbuf bytes into intbuf */`
			`for (i = 0, j = 1; i < 0x1000; i++, j += 2) {`
			`intbuf[j] = decbuf[i];`
			`}`

			`/* reorder even decbuf bytes into intbuf */`
			`for (i = 0x1000, j = 0; i < 0x1800; i++, j += 4) {`
			`uint8_t lo = decbuf[i + 0x800];`
			`uint8_t hi = decbuf[i];`

			`intbuf[j + 0] = (hi & 0xF0) \| (lo >> 4);`
			`intbuf[j + 2] = (hi << 4) \| (lo & 0x0F);`
			`}`
			`}`

			`/* ************************************************************ */`
			`/* API */`
			`/* ************************************************************ */`

			`circus_handle_t* circus_init(off_t start, uint8_t codec, uint8_t flags) {`
			`circus_handle_t* handle = NULL;`
			`int scale_index, err;`

			`handle = malloc(sizeof(circus_handle_t));`
			`if (!handle) goto fail;`

			`handle->start = start;`
			`handle->codec = codec; //(config >> 0) & 0xFF;`
			`handle->flags = flags; //(config >> 8) & 0xFF;`

			`scale_index = (handle->flags & 0xF);`
			`if (scale_index > 5) goto fail;`
			`handle->scales = scale_table[scale_index];`

			`if (handle->codec == XPCM_CODEC_VQ_DEFLATE) {`
			`memset(&handle->dstrm, 0, sizeof(z_stream));`
			`err = inflateInit(&handle->dstrm);`
			`if (err < 0) goto fail;`
			`}`

			`circus_reset(handle);`

			`return handle;`
			`fail:`
			`circus_free(handle);`
			`return NULL;`
			`}`

			`void circus_free(circus_handle_t* handle) {`
			`if (!handle)`
			`return;`

			`if (handle->codec == XPCM_CODEC_VQ_DEFLATE) {`
			`inflateEnd(&handle->dstrm);`
			`}`

			`free(handle);`
			`}`

			`void circus_reset(circus_handle_t* handle) {`
			`if (!handle)`
			`return;`
			`handle->hist1 = 0;`
			`handle->hist2 = 0;`
			`handle->frame = 0;`

			`if (handle->codec == XPCM_CODEC_VQ_LZXPCM) {`
			`lzxpcm_reset(&handle->lstrm);`
			`} else if (handle->codec == XPCM_CODEC_VQ_DEFLATE) {`
			`inflateReset(&handle->dstrm);`
			`}`
			`handle->offset = handle->start;`
			`}`

			`static int decompress_frame_lzxpcm(circus_handle_t* handle, STREAMFILE* sf) {`
			`int res;`

			`handle->lstrm.next_out = handle->decbuf;`
			`handle->lstrm.avail_out = sizeof(handle->decbuf);`
			`handle->lstrm.total_out = 0;`
			`do {`
			`if (handle->lstrm.avail_in == 0) {`
			`handle->lstrm.next_in = handle->srcbuf;`
			`handle->lstrm.avail_in = read_streamfile(handle->srcbuf, handle->offset, sizeof(handle->srcbuf), sf);`
			`handle->offset += handle->lstrm.avail_in;`

			`/* EOF (game reserves some extra buf so memset'ing is probably equivalent) */`
			`if (handle->lstrm.avail_in == 0) {`
			`memset(handle->decbuf + handle->lstrm.total_out, 0, sizeof(handle->decbuf) - handle->dstrm.total_out);`
			`break;`
			`}`
			`}`

			`res = lzxpcm_decompress(&handle->lstrm);`
			`if (res != LZXPCM_OK)`
			`goto fail;`
			`}`
			`while(handle->lstrm.avail_out != 0);`

			`return 1;`
			`fail:`
			`return 0;`
			`}`

			`static int decompress_frame_deflate(circus_handle_t* handle, STREAMFILE* sf) {`
			`int res;`

			`handle->dstrm.next_out = handle->decbuf;`
			`handle->dstrm.avail_out = sizeof(handle->decbuf);`
			`handle->dstrm.total_out = 0;`
			`do {`
			`if (handle->dstrm.avail_in == 0) {`
			`handle->dstrm.next_in = handle->srcbuf;`
			`handle->dstrm.avail_in = read_streamfile(handle->srcbuf, handle->offset, sizeof(handle->srcbuf), sf);`
			`handle->offset += handle->dstrm.avail_in;`

			`/* EOF (game reserves some extra buf so memset'ing is probably equivalent) */`
			`if (handle->dstrm.avail_in == 0) {`
			`memset(handle->decbuf + handle->dstrm.total_out, 0, sizeof(handle->decbuf) - handle->dstrm.total_out);`
			`break;`
			`}`
			`}`

			`res = inflate(&handle->dstrm, Z_NO_FLUSH);`
			`if (res != Z_OK && res != Z_STREAM_END)`
			`goto fail;`
			`}`
			`while(handle->dstrm.avail_out != 0);`

			`return 1;`
			`fail:`
			`return 0;`
			`}`

			`#ifdef XPCM_ALT`
			`/* original code uses zlib 1.2.1 to decompress the full stream into memory */`
			`static int deflate_decompress_full(uint8_t* dst, size_t dst_size, const uint8_t* src, size_t src_size) {`
			`int err;`
			`z_stream strm = {0};`
			`strm.next_in = src;`
			`strm.avail_in = src_size;`
			`strm.next_out = dst;`
			`strm.avail_out = dst_size;`

			`err = inflateInit(&strm);`
			`if (err < 0) {`
			`//printf("inflateInit error: %i\n", err);`
			`return 0;`
			`}`

			`err = inflate(&strm, Z_FINISH);`
			`if (err < 0) {`
			`//printf("inflate error: %i\n", err);`
			`//return 0;`
			`}`

			`err = inflateEnd(&strm);`
			`if (err < 0) {`
			`//printf("inflateEnd error: %i\n", err);`
			`return 0;`
			`}`

			`return 0;`
			`}`
			`#endif`

			`int circus_decode_frame(circus_handle_t* handle, STREAMFILE* sf, int16_t** p_buf, int* p_buf_samples_all) {`
			`int ok;`

			`if (handle->codec == XPCM_CODEC_VQ_LZXPCM) {`
			`ok = decompress_frame_lzxpcm(handle, sf);`
			`} else if (handle->codec == XPCM_CODEC_VQ_DEFLATE) {`
			`ok = decompress_frame_deflate(handle, sf);`
			`} else {`
			`ok = 0;`
			`}`
			`if (!ok)`
			`goto fail;`

			`interleave(handle->decbuf, handle->intbuf);`
			`scale(handle->intbuf, handle->scales, handle->invbuf, handle->tmpbuf);`
			`transform(handle->invbuf, handle->tmpbuf);`
			`convert(handle->flags, handle->invbuf, handle->pcmbuf, &handle->hist1, &handle->hist2, handle->frame);`
			`handle->frame++;`

			`*p_buf = handle->pcmbuf;`
			`*p_buf_samples_all = XPCM_FRAME_SAMPLES_ALL;`
			`return 1;`
			`fail:`
			`return 0;`
			`}`