vgmstream/src/coding/circus_decoder_lib.c

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

/* Decodes Circus's audio codec, reverse engineered from various .exe.
 *
 * Some sources identify this codec as VQ (vector quantization), though vector(?)
 * data isn't actually bitpacked and just compressed using custom LZ or standard zlib.
 * Channels aren't divided either so decoding results in N-ch interleaved PCM.
 * It does seem to be using LPC/speech stuff from VQ codecs though.
 *
 * Some info from Japanese libpcm.c found in foo_adpcm
 * https://bitbucket.org/losnoco/foo_adpcm/src/master/foo_oki/source/libpcm/libpcm.cpp
 */

#include "circus_decoder_lib.h"
#include "circus_decoder_lib_data.h"

#include "circus_decoder_lzxpcm.h"

/* use miniz (API-compatible) to avoid adding external zlib just for this codec
 * - https://github.com/richgel999/miniz */
#include "circus_decoder_miniz.h"
//#include "zlib.h"


//#define XPCM_CODEC_PCM       0
#define XPCM_CODEC_VQ_LZXPCM   1
//#define XPCM_CODEC_ADPCM     2
#define XPCM_CODEC_VQ_DEFLATE  3

/* frame encodes 4096 PCM samples (all channels) = 4096*2 = 0x2000 bytes, re-interleaved then compressed */
#define XPCM_FRAME_SIZE         (4096 * 2)
#define XPCM_FRAME_CODES        4096
#define XPCM_FRAME_SAMPLES_ALL  4064
#define XPCM_FRAME_OVERLAP_ALL  32
#define XPCM_INPUT_SIZE         0x8000

/* ************************************************************************* */
/* DECODE */
/* ************************************************************************* */

struct circus_handle_t {
    /* config */
    off_t start;
    uint8_t codec;
    uint8_t flags;
    const int* scales;

    /* temp buffers */
    uint8_t srcbuf[XPCM_INPUT_SIZE];    /* compressed input data (arbitrary size) */
    uint8_t decbuf[XPCM_FRAME_SIZE];    /* single decompressed frame */
    uint8_t intbuf[XPCM_FRAME_SIZE];    /* re-interleaved frame */
    int32_t invbuf[XPCM_FRAME_CODES];   /* main LPC data (may need less) */
    int32_t tmpbuf[XPCM_FRAME_CODES];   /* temp LPC data (may need less) */

    /* output samples (original code reuses decbuf though) */
    int16_t pcmbuf[XPCM_FRAME_SAMPLES_ALL + XPCM_FRAME_OVERLAP_ALL]; /* final output samples and extra overlap samples */

    /* sample filter state */
    int hist1;
    int hist2;
    int frame;

    /* lz/deflate decompression state */
    lzxpcm_stream_t lstrm;
    z_stream dstrm;
    off_t offset;
};


static void convert(uint8_t flags, int32_t* invbuf, int16_t* pcmbuf, int* p_hist1, int* p_hist2, int frame) {
    int i;
    int sample, hist1, hist2;

    hist1 = *p_hist1;
    hist2 = *p_hist2;

    /* some ops below would use SHRs (>>), but there is some rounding in the original
     * ASM that decompiles and I think corresponds do DIVs (right shift and divs of
     * negative values isn't equivalent). Similarly the filters seem to use CDQ tricks
     * to simulate s64 ops, but I'm not sure casting is 100% equivalent (sounds ok tho). */

    /* do final filtering and conversion to PCM */
    for (i = 0; i < XPCM_FRAME_SAMPLES_ALL + XPCM_FRAME_OVERLAP_ALL; i++) {
        sample = *invbuf++;
        if (flags & 0x10)
            sample = (3 * (int64_t)sample / 2) / 1024; //>> 10;
        else
            sample = sample / 1024; //>> 10;

        sample = ((27 * (int64_t)sample + 4 * hist1 + hist2) << 11) / 65536; //>> 16

        hist2 = hist1;
        hist1 = sample;

        /* last 32 decoded samples aren't output, but are used next frame to overlap
         * with beginning samples (filters(?) windowing, not too noticeable though) */
        if (i < XPCM_FRAME_OVERLAP_ALL && frame > 0) {
            sample = ((i * (int64_t)sample) + ((XPCM_FRAME_OVERLAP_ALL - i) * pcmbuf[XPCM_FRAME_SAMPLES_ALL + i])) / 32; //>> 5
        }

        if (sample > 32767)
            sample = 32767;
        else if (sample < -32768)
            sample = -32768;

        pcmbuf[i] = sample;
    }

    *p_hist1 = hist1;
    *p_hist2 = hist2;
}

static void transform(int32_t* invbuf, int32_t* tmpbuf) {
    int lpc1, lpc2, lpc3, lpc4;
    int step1, step2, step3;
    int sc1, sc2;

    /* bits were originally configurable (passed arg), but actually called with const 12,
     * and removed in later games along with superfluous ifs (coefs > 0, bits >= 3, etc) */
    //const int frame_bits = 12;

    step1 = 4096; /* 1 << 12 */
    step2 = step1 >> 1;
    step3 = step2 >> 1;
    sc1 = 1;

    /* inverse transform of LPC(?) coefs */
    for (lpc1 = 0; lpc1 < 12 - 2; lpc1++) {
        int sub1, sub2;
        int i1, i2, i3, i4;
        int64_t cos1, sin1, cos2, sin2; /* needs i64 to force 64b ops (avoid overflows) then converted to i32 */

        cos1 = (int64_t)sincos_table[sc1 + 1024];
        sin1 = (int64_t)sincos_table[sc1 + 0];

        i1 = 0;
        i2 = step2;
        i3 = step3;
        i4 = step2 + step3;

        for (lpc2 = 0; lpc2 < 4096; lpc2 += step1) {
            sub1 = invbuf[i1 + 0] - invbuf[i2 + 0];
            sub2 = tmpbuf[i1 + 0] - tmpbuf[i2 + 0];
            invbuf[i1 + 0] += invbuf[i2 + 0];
            tmpbuf[i1 + 0] += tmpbuf[i2 + 0];
            invbuf[i2 + 0] = sub1;
            tmpbuf[i2 + 0] = sub2;

            sub1 = invbuf[i1 + 1] - invbuf[i2 + 1];
            sub2 = tmpbuf[i1 + 1] - tmpbuf[i2 + 1];
            invbuf[i1 + 1] += invbuf[i2 + 1];
            tmpbuf[i1 + 1] += tmpbuf[i2 + 1];
            invbuf[i2 + 1] = (int32_t)( ((sub1 * cos1) >> 12) + ((sub2 * sin1) >> 12) );
            tmpbuf[i2 + 1] = (int32_t)( ((sub2 * cos1) >> 12) - ((sub1 * sin1) >> 12) );

            sub1 = invbuf[i3 + 0] - invbuf[i4 + 0];
            sub2 = tmpbuf[i3 + 0] - tmpbuf[i4 + 0];
            invbuf[i3 + 0] += invbuf[i4 + 0];
            tmpbuf[i3 + 0] += tmpbuf[i4 + 0];
            invbuf[i4 + 0] = sub2;
            tmpbuf[i4 + 0] = -sub1;

            sub1 = invbuf[i3 + 1] - invbuf[i4 + 1];
            sub2 = tmpbuf[i3 + 1] - tmpbuf[i4 + 1];
            invbuf[i3 + 1] += invbuf[i4 + 1];
            tmpbuf[i3 + 1] += tmpbuf[i4 + 1];
            invbuf[i4 + 1] = (int32_t)(   ((sub2 * cos1) >> 12) - ((sub1 * sin1) >> 12) );
            tmpbuf[i4 + 1] = (int32_t)( -(((sub1 * cos1) >> 12) + ((sub2 * sin1) >> 12)) );

            i1 += step1;
            i2 += step1;
            i3 += step1;
            i4 += step1;
        }

        if (step3 > 2) {
            sc2 = sc1 * 2;

            for (lpc3 = 2; lpc3 < step3; lpc3++) {
                cos2 = (int64_t)sincos_table[sc2 + 1024];
                sin2 = (int64_t)sincos_table[sc2 + 0];
                sc2 += sc1;

                i1 = 0 + lpc3;
                i2 = step2 + lpc3;
                i3 = step3 + lpc3;
                i4 = step2 + step3 + lpc3;

                for (lpc4 = 0; lpc4 < 4096; lpc4 += step1) {
                    sub1 = invbuf[i1] - invbuf[i2];
                    sub2 = tmpbuf[i1] - tmpbuf[i2];
                    invbuf[i1] += invbuf[i2];
                    tmpbuf[i1] += tmpbuf[i2];
                    invbuf[i2] = (int32_t)( ((sub1 * cos2) >> 12) + ((sub2 * sin2) >> 12) );
                    tmpbuf[i2] = (int32_t)( ((sub2 * cos2) >> 12) - ((sub1 * sin2) >> 12) );

                    sub1 = invbuf[i3] - invbuf[i4];
                    sub2 = tmpbuf[i3] - tmpbuf[i4];
                    invbuf[i3] += invbuf[i4];
                    tmpbuf[i3] += tmpbuf[i4];
                    invbuf[i4] = (int32_t)(  ((sub2 * cos2) >> 12) - ((sub1 * sin2) >> 12) );
                    tmpbuf[i4] = (int32_t)( -(((sub1 * cos2) >> 12) + ((sub2 * sin2) >> 12)) );

                    i1 += step1;
                    i2 += step1;
                    i3 += step1;
                    i4 += step1;
                }
            }
        }

        step1 = step2; // step1 >>= 1;
        step2 = step3; // step2 >>= 1;
        step3 >>= 1;
        sc1 *= 2;
    }

    {
        int i, j;
        int sub1, sub2, pow;

        for (i = 0; i < 4096; i += 4) {
            sub1 = invbuf[i + 0] - invbuf[i + 2];
            invbuf[i + 0] += invbuf[i + 2];
            invbuf[i + 2] = sub1;

            sub2 = tmpbuf[i + 0] - tmpbuf[i + 2];
            tmpbuf[i + 0] += tmpbuf[i + 2];
            tmpbuf[i + 2] = sub2;

            sub1 = invbuf[i + 3] - invbuf[i + 1];
            sub2 = tmpbuf[i + 1] - tmpbuf[i + 3];
            invbuf[i + 1] += invbuf[i + 3];
            invbuf[i + 3] = sub2;
            tmpbuf[i + 1] += tmpbuf[i + 3];
            tmpbuf[i + 3] = sub1;
        }

        for (i = 0; i < 4096; i += 2) {
            sub1 = invbuf[i + 0] - invbuf[i + 1];
            invbuf[i + 0] += invbuf[i + 1];
            invbuf[i + 1] = sub1;

            sub2 = tmpbuf[i + 0] - tmpbuf[i + 1];
            tmpbuf[i + 0] += tmpbuf[i + 1];
            tmpbuf[i + 1] = sub2;
        }

        for (i = 1, j = 0; i < 4096 - 1; i++) {
            for (pow = 4096 / 2; pow <= j; pow /= 2) {
                j -= pow;
            }
            j += pow;

            if (i < j) {
                sub1 = invbuf[j];
                invbuf[j] = invbuf[i];
                invbuf[i] = sub1;

                sub2 = tmpbuf[j];
                tmpbuf[j] = tmpbuf[i];
                tmpbuf[i] = sub2;
            }
        }
    }
}

static void scale(const uint8_t* intbuf, const int* scales, int32_t* invbuf, int32_t* tmpbuf) {
    int i, j;

    /* reinterleave and scale intbuf into invbuf and tmpbuf */
    for (i = 0, j = 0; i < 4096 / 2; i++, j += 16) {
        int scale, qv1, qv2;

        scale = scales[j / 4096];

        qv1 = (intbuf[i*4 + 0] << 0) | (intbuf[i*4 + 1] << 8); /* get_u16le */
        qv2 = (intbuf[i*4 + 2] << 0) | (intbuf[i*4 + 3] << 8); /* get_u16le */

        /* lowest bit is short of "positive" flag, or rather: even=0..-32767, odd=1..32768
         * (originally done through a LUT init at runtime with all 65536 indexes) */
        qv1 = (qv1 & 1) ? (qv1 >> 1) + 1 : -(qv1 >> 1);
        qv2 = (qv2 & 1) ? (qv2 >> 1) + 1 : -(qv2 >> 1);

        invbuf[i] = scale * qv1;
        tmpbuf[i] = scale * qv2;
    }

    /* reset rest of invbuf/tmpbuf */
    for (i = 4096 / 2; i < 4096; i++) {
        invbuf[i] = 0;
        tmpbuf[i] = 0;
    }
}

static void interleave(const uint8_t* decbuf, uint8_t* intbuf) {
    int i, j;

    /* reorder odd decbuf bytes into intbuf */
    for (i = 0, j = 1; i < 0x1000; i++, j += 2) {
        intbuf[j] = decbuf[i];
    }

    /* reorder even decbuf bytes into intbuf */
    for (i = 0x1000, j = 0; i < 0x1800; i++, j += 4) {
        uint8_t lo = decbuf[i + 0x800];
        uint8_t hi = decbuf[i];

        intbuf[j + 0] = (hi & 0xF0) | (lo >> 4);
        intbuf[j + 2] = (hi << 4) | (lo & 0x0F);
    }
}

/* ************************************************************ */
/* API */
/* ************************************************************ */

circus_handle_t* circus_init(off_t start, uint8_t codec, uint8_t flags) {
    circus_handle_t* handle = NULL;
    int scale_index, err;

    handle = malloc(sizeof(circus_handle_t));
    if (!handle) goto fail;

    handle->start = start;
    handle->codec = codec; //(config >> 0) & 0xFF;
    handle->flags = flags; //(config >> 8) & 0xFF;

    scale_index = (handle->flags & 0xF);
    if (scale_index > 5) goto fail;
    handle->scales = scale_table[scale_index];

    if (handle->codec == XPCM_CODEC_VQ_DEFLATE) {
        memset(&handle->dstrm, 0, sizeof(z_stream));
        err = inflateInit(&handle->dstrm);
        if (err < 0) goto fail;
    }

    circus_reset(handle);

    return handle;
fail:
    circus_free(handle);
    return NULL;
}

void circus_free(circus_handle_t* handle) {
    if (!handle)
        return;

    if (handle->codec == XPCM_CODEC_VQ_DEFLATE) {
        inflateEnd(&handle->dstrm);
    }

    free(handle);
}

void circus_reset(circus_handle_t* handle) {
    if (!handle)
        return;
    handle->hist1 = 0;
    handle->hist2 = 0;
    handle->frame = 0;

    if (handle->codec == XPCM_CODEC_VQ_LZXPCM) {
        lzxpcm_reset(&handle->lstrm);
    } else if (handle->codec == XPCM_CODEC_VQ_DEFLATE) {
        inflateReset(&handle->dstrm);
    }
    handle->offset = handle->start;
}

static int decompress_frame_lzxpcm(circus_handle_t* handle, STREAMFILE* sf) {
    int res;

    handle->lstrm.next_out = handle->decbuf;
    handle->lstrm.avail_out = sizeof(handle->decbuf);
    handle->lstrm.total_out = 0;
    do {
        if (handle->lstrm.avail_in == 0) {
            handle->lstrm.next_in = handle->srcbuf;
            handle->lstrm.avail_in = read_streamfile(handle->srcbuf, handle->offset, sizeof(handle->srcbuf), sf);
            handle->offset += handle->lstrm.avail_in;

            /* EOF (game reserves some extra buf so memset'ing is probably equivalent) */
            if (handle->lstrm.avail_in == 0) {
                memset(handle->decbuf + handle->lstrm.total_out, 0, sizeof(handle->decbuf) - handle->dstrm.total_out);
                break;
            }
        }

        res = lzxpcm_decompress(&handle->lstrm);
        if (res != LZXPCM_OK)
            goto fail;
    }
    while(handle->lstrm.avail_out != 0);

    return 1;
fail:
    return 0;
}

static int decompress_frame_deflate(circus_handle_t* handle, STREAMFILE* sf) {
    int res;

    handle->dstrm.next_out = handle->decbuf;
    handle->dstrm.avail_out = sizeof(handle->decbuf);
    handle->dstrm.total_out = 0;
    do {
        if (handle->dstrm.avail_in == 0) {
            handle->dstrm.next_in = handle->srcbuf;
            handle->dstrm.avail_in = read_streamfile(handle->srcbuf, handle->offset, sizeof(handle->srcbuf), sf);
            handle->offset += handle->dstrm.avail_in;

            /* EOF (game reserves some extra buf so memset'ing is probably equivalent) */
            if (handle->dstrm.avail_in == 0) {
                memset(handle->decbuf + handle->dstrm.total_out, 0, sizeof(handle->decbuf) - handle->dstrm.total_out);
                break;
            }
        }

        res = inflate(&handle->dstrm, Z_NO_FLUSH);
        if (res != Z_OK && res != Z_STREAM_END)
            goto fail;
    }
    while(handle->dstrm.avail_out != 0);

    return 1;
fail:
    return 0;
}

#ifdef XPCM_ALT
/* original code uses zlib 1.2.1 to decompress the full stream into memory */
static int deflate_decompress_full(uint8_t* dst, size_t dst_size, const uint8_t* src, size_t src_size) {
    int err;
    z_stream strm = {0};
    strm.next_in  = src;
    strm.avail_in = src_size;
    strm.next_out = dst;
    strm.avail_out = dst_size;

    err = inflateInit(&strm);
    if (err < 0) {
        //printf("inflateInit error: %i\n", err);
        return 0;
    }

    err = inflate(&strm, Z_FINISH);
    if (err < 0) {
        //printf("inflate error: %i\n", err);
        //return 0;
    }

    err = inflateEnd(&strm);
    if (err < 0) {
        //printf("inflateEnd error: %i\n", err);
        return 0;
    }

    return 0;
}
#endif

int circus_decode_frame(circus_handle_t* handle, STREAMFILE* sf, int16_t** p_buf, int* p_buf_samples_all) {
    int ok;

    if (handle->codec == XPCM_CODEC_VQ_LZXPCM) {
        ok = decompress_frame_lzxpcm(handle, sf);
    } else if (handle->codec == XPCM_CODEC_VQ_DEFLATE) {
        ok = decompress_frame_deflate(handle, sf);
    } else {
        ok = 0;
    }
    if (!ok)
        goto fail;

    interleave(handle->decbuf, handle->intbuf);
    scale(handle->intbuf, handle->scales, handle->invbuf, handle->tmpbuf);
    transform(handle->invbuf, handle->tmpbuf);
    convert(handle->flags, handle->invbuf, handle->pcmbuf, &handle->hist1, &handle->hist2, handle->frame);
    handle->frame++;

    *p_buf = handle->pcmbuf;
    *p_buf_samples_all = XPCM_FRAME_SAMPLES_ALL;
    return 1;
fail:
    return 0;
}