#include #include #include #include /* Decodes Circus's audio codec, reverse engineered from various .exe. * * Some sources identify this codec as VQ (vector quantization), though vector(?) * data isn't actually bitpacked and just compressed using custom LZ or standard zlib. * Channels aren't divided either so decoding results in N-ch interleaved PCM. * It does seem to be using LPC/speech stuff from VQ codecs though. * * Some info from Japanese libpcm.c found in foo_adpcm * https://bitbucket.org/losnoco/foo_adpcm/src/master/foo_oki/source/libpcm/libpcm.cpp */ #include "circus_decoder_lib.h" #include "circus_decoder_lib_data.h" #include "circus_decoder_lzxpcm.h" /* use miniz (API-compatible) to avoid adding external zlib just for this codec * - https://github.com/richgel999/miniz */ #include "circus_decoder_miniz.h" //#include "zlib.h" //#define XPCM_CODEC_PCM 0 #define XPCM_CODEC_VQ_LZXPCM 1 //#define XPCM_CODEC_ADPCM 2 #define XPCM_CODEC_VQ_DEFLATE 3 /* frame encodes 4096 PCM samples (all channels) = 4096*2 = 0x2000 bytes, re-interleaved then compressed */ #define XPCM_FRAME_SIZE (4096 * 2) #define XPCM_FRAME_CODES 4096 #define XPCM_FRAME_SAMPLES_ALL 4064 #define XPCM_FRAME_OVERLAP_ALL 32 #define XPCM_INPUT_SIZE 0x8000 /* ************************************************************************* */ /* DECODE */ /* ************************************************************************* */ struct circus_handle_t { /* config */ off_t start; uint8_t codec; uint8_t flags; const int* scales; /* temp buffers */ uint8_t srcbuf[XPCM_INPUT_SIZE]; /* compressed input data (arbitrary size) */ uint8_t decbuf[XPCM_FRAME_SIZE]; /* single decompressed frame */ uint8_t intbuf[XPCM_FRAME_SIZE]; /* re-interleaved frame */ int32_t invbuf[XPCM_FRAME_CODES]; /* main LPC data (may need less) */ int32_t tmpbuf[XPCM_FRAME_CODES]; /* temp LPC data (may need less) */ /* output samples (original code reuses decbuf though) */ int16_t pcmbuf[XPCM_FRAME_SAMPLES_ALL + XPCM_FRAME_OVERLAP_ALL]; /* final output samples and extra overlap samples */ /* sample filter state */ int hist1; int hist2; int frame; /* lz/deflate decompression state */ lzxpcm_stream_t lstrm; z_stream dstrm; off_t offset; }; static void convert(uint8_t flags, int32_t* invbuf, int16_t* pcmbuf, int* p_hist1, int* p_hist2, int frame) { int i; int sample, hist1, hist2; hist1 = *p_hist1; hist2 = *p_hist2; /* some ops below would use SHRs (>>), but there is some rounding in the * original ASM that decompiles and I think corresponds do DIVs * (right shift and divs of negative values isn't equivalent) */ /* do final filtering and conversion to PCM */ for (i = 0; i < XPCM_FRAME_SAMPLES_ALL + XPCM_FRAME_OVERLAP_ALL; i++) { sample = *invbuf++; if (flags & 0x10) sample = 3 * sample / 2; // (3 * sample) >> 2 sample /= 1024; //sample >>= 10; sample = ((3 * sample + 8 * 3 * sample + 4 * hist1 + hist2) << 11); sample /= 65536; // sample >>= 16; hist2 = hist1; hist1 = sample; /* last 32 decoded samples aren't output, but are used next frame to overlap * with beginning samples (filters(?) windowing, not too noticeable though) */ if (i < XPCM_FRAME_OVERLAP_ALL && frame > 0) { sample = (i * sample) + ((XPCM_FRAME_OVERLAP_ALL - i) * pcmbuf[XPCM_FRAME_SAMPLES_ALL + i]); sample /= 32; //sample >>= 5 } if (sample > 32767) sample = 32767; else if (sample < -32768) sample = -32768; pcmbuf[i] = sample; } *p_hist1 = hist1; *p_hist2 = hist2; } static void transform(int32_t* invbuf, int32_t* tmpbuf) { int lpc1, lpc2, lpc3, lpc4; int step1, step2, step3; int sc1, sc2; /* bits were originally configurable (passed arg), but actually called with const 12, * and removed in later games along with superfluous ifs (coefs > 0, bits >= 3, etc) */ //const int frame_bits = 12; step1 = 4096; /* 1 << 12 */ step2 = step1 >> 1; step3 = step2 >> 1; sc1 = 1; /* inverse transform of LPC(?) coefs */ for (lpc1 = 0; lpc1 < 12 - 2; lpc1++) { int sub1, sub2; int i1, i2, i3, i4; int64_t cos1, sin1, cos2, sin2; /* needs i64 to force 64b ops (avoid overflows) */ cos1 = (int64_t)sincos_table[sc1 + 1024]; sin1 = (int64_t)sincos_table[sc1 + 0]; i1 = 0; i2 = step2; i3 = step3; i4 = step2 + step3; for (lpc2 = 0; lpc2 < 4096; lpc2 += step1) { sub1 = invbuf[i1 + 0] - invbuf[i2 + 0]; sub2 = tmpbuf[i1 + 0] - tmpbuf[i2 + 0]; invbuf[i1 + 0] += invbuf[i2 + 0]; tmpbuf[i1 + 0] += tmpbuf[i2 + 0]; invbuf[i2 + 0] = sub1; tmpbuf[i2 + 0] = sub2; sub1 = invbuf[i1 + 1] - invbuf[i2 + 1]; sub2 = tmpbuf[i1 + 1] - tmpbuf[i2 + 1]; invbuf[i1 + 1] += invbuf[i2 + 1]; tmpbuf[i1 + 1] += tmpbuf[i2 + 1]; invbuf[i2 + 1] = ((sub1 * cos1) >> 12) + ((sub2 * sin1) >> 12); tmpbuf[i2 + 1] = ((sub2 * cos1) >> 12) - ((sub1 * sin1) >> 12); sub1 = invbuf[i3 + 0] - invbuf[i4 + 0]; sub2 = tmpbuf[i3 + 0] - tmpbuf[i4 + 0]; invbuf[i3 + 0] += invbuf[i4 + 0]; tmpbuf[i3 + 0] += tmpbuf[i4 + 0]; invbuf[i4 + 0] = sub2; tmpbuf[i4 + 0] = -sub1; sub1 = invbuf[i3 + 1] - invbuf[i4 + 1]; sub2 = tmpbuf[i3 + 1] - tmpbuf[i4 + 1]; invbuf[i3 + 1] += invbuf[i4 + 1]; tmpbuf[i3 + 1] += tmpbuf[i4 + 1]; invbuf[i4 + 1] = ((sub2 * cos1) >> 12) - ((sub1 * sin1) >> 12); tmpbuf[i4 + 1] = -(((sub1 * cos1) >> 12) + ((sub2 * sin1) >> 12)); i1 += step1; i2 += step1; i3 += step1; i4 += step1; } if (step3 > 2) { sc2 = sc1 * 2; for (lpc3 = 2; lpc3 < step3; lpc3++) { cos2 = (int64_t)sincos_table[sc2 + 1024]; sin2 = (int64_t)sincos_table[sc2 + 0]; sc2 += sc1; i1 = 0 + lpc3; i2 = step2 + lpc3; i3 = step3 + lpc3; i4 = step2 + step3 + lpc3; for (lpc4 = 0; lpc4 < 4096; lpc4 += step1) { sub1 = invbuf[i1] - invbuf[i2]; sub2 = tmpbuf[i1] - tmpbuf[i2]; invbuf[i1] += invbuf[i2]; tmpbuf[i1] += tmpbuf[i2]; invbuf[i2] = ((sub1 * cos2) >> 12) + ((sub2 * sin2) >> 12); tmpbuf[i2] = ((sub2 * cos2) >> 12) - ((sub1 * sin2) >> 12); sub1 = invbuf[i3] - invbuf[i4]; sub2 = tmpbuf[i3] - tmpbuf[i4]; invbuf[i3] += invbuf[i4]; tmpbuf[i3] += tmpbuf[i4]; invbuf[i4] = ((sub2 * cos2) >> 12) - ((sub1 * sin2) >> 12); tmpbuf[i4] = -(((sub1 * cos2) >> 12) + ((sub2 * sin2) >> 12)); i1 += step1; i2 += step1; i3 += step1; i4 += step1; } } } step1 = step2; // step1 >>= 1; step2 = step3; // step2 >>= 1; step3 >>= 1; sc1 *= 2; } { int i, j; int sub1, sub2, pow; for (i = 0; i < 4096; i += 4) { sub1 = invbuf[i + 0] - invbuf[i + 2]; invbuf[i + 0] += invbuf[i + 2]; invbuf[i + 2] = sub1; sub2 = tmpbuf[i + 0] - tmpbuf[i + 2]; tmpbuf[i + 0] += tmpbuf[i + 2]; tmpbuf[i + 2] = sub2; sub1 = invbuf[i + 3] - invbuf[i + 1]; sub2 = tmpbuf[i + 1] - tmpbuf[i + 3]; invbuf[i + 1] += invbuf[i + 3]; invbuf[i + 3] = sub2; tmpbuf[i + 1] += tmpbuf[i + 3]; tmpbuf[i + 3] = sub1; } for (i = 0; i < 4096; i += 2) { sub1 = invbuf[i + 0] - invbuf[i + 1]; invbuf[i + 0] += invbuf[i + 1]; invbuf[i + 1] = sub1; sub2 = tmpbuf[i + 0] - tmpbuf[i + 1]; tmpbuf[i + 0] += tmpbuf[i + 1]; tmpbuf[i + 1] = sub2; } for (i = 1, j = 0; i < 4096 - 1; i++) { for (pow = 4096 / 2; pow <= j; pow /= 2) { j -= pow; } j += pow; if (i < j) { sub1 = invbuf[j]; invbuf[j] = invbuf[i]; invbuf[i] = sub1; sub2 = tmpbuf[j]; tmpbuf[j] = tmpbuf[i]; tmpbuf[i] = sub2; } } } } static void scale(const uint8_t* intbuf, const int* scales, int32_t* invbuf, int32_t* tmpbuf) { int i, j; /* reinterleave and scale intbuf into invbuf and tmpbuf */ for (i = 0, j = 0; i < 4096 / 2; i++, j += 16) { int scale, qv1, qv2; scale = scales[j / 4096]; qv1 = (intbuf[i*4 + 0] << 0) | (intbuf[i*4 + 1] << 8); /* get_u16le */ qv2 = (intbuf[i*4 + 2] << 0) | (intbuf[i*4 + 3] << 8); /* get_u16le */ /* lowest bit is short of "positive" flag, or rather: even=0..-32767, odd=1..32768 * (originally done through a LUT init at runtime with all 65536 indexes) */ qv1 = (qv1 & 1) ? (qv1 >> 1) + 1 : -(qv1 >> 1); qv2 = (qv2 & 1) ? (qv2 >> 1) + 1 : -(qv2 >> 1); invbuf[i] = scale * qv1; tmpbuf[i] = scale * qv2; } /* reset rest of invbuf/tmpbuf */ for (i = 4096 / 2; i < 4096; i++) { invbuf[i] = 0; tmpbuf[i] = 0; } } static void interleave(const uint8_t* decbuf, uint8_t* intbuf) { int i, j; /* reorder odd decbuf bytes into intbuf */ for (i = 0, j = 1; i < 0x1000; i++, j += 2) { intbuf[j] = decbuf[i]; } /* reorder even decbuf bytes into intbuf */ for (i = 0x1000, j = 0; i < 0x1800; i++, j += 4) { uint8_t lo = decbuf[i + 0x800]; uint8_t hi = decbuf[i]; intbuf[j + 0] = (hi & 0xF0) | (lo >> 4); intbuf[j + 2] = (hi << 4) | (lo & 0x0F); } } /* ************************************************************ */ /* API */ /* ************************************************************ */ circus_handle_t* circus_init(off_t start, uint8_t codec, uint8_t flags) { circus_handle_t* handle = NULL; int scale_index, err; handle = malloc(sizeof(circus_handle_t)); if (!handle) goto fail; handle->start = start; handle->codec = codec; //(config >> 0) & 0xFF; handle->flags = flags; //(config >> 8) & 0xFF; scale_index = (handle->flags & 0xF); if (scale_index > 5) goto fail; handle->scales = scale_table[scale_index]; if (handle->codec == XPCM_CODEC_VQ_DEFLATE) { memset(&handle->dstrm, 0, sizeof(z_stream)); err = inflateInit(&handle->dstrm); if (err < 0) goto fail; } circus_reset(handle); return handle; fail: circus_free(handle); return NULL; } void circus_free(circus_handle_t* handle) { if (!handle) return; if (handle->codec == XPCM_CODEC_VQ_DEFLATE) { inflateEnd(&handle->dstrm); } free(handle); } void circus_reset(circus_handle_t* handle) { if (!handle) return; handle->hist1 = 0; handle->hist2 = 0; handle->frame = 0; if (handle->codec == XPCM_CODEC_VQ_LZXPCM) { lzxpcm_reset(&handle->lstrm); } else if (handle->codec == XPCM_CODEC_VQ_DEFLATE) { inflateReset(&handle->dstrm); } handle->offset = handle->start; } static int decompress_frame_lzxpcm(circus_handle_t* handle, STREAMFILE* sf) { int res; handle->lstrm.next_out = handle->decbuf; handle->lstrm.avail_out = sizeof(handle->decbuf); handle->lstrm.total_out = 0; do { if (handle->lstrm.avail_in == 0) { handle->lstrm.next_in = handle->srcbuf; handle->lstrm.avail_in = read_streamfile(handle->srcbuf, handle->offset, sizeof(handle->srcbuf), sf); handle->offset += handle->lstrm.avail_in; /* EOF (game reserves some extra buf so memset'ing is probably equivalent) */ if (handle->lstrm.avail_in == 0) { memset(handle->decbuf + handle->lstrm.total_out, 0, sizeof(handle->decbuf) - handle->dstrm.total_out); break; } } res = lzxpcm_decompress(&handle->lstrm); if (res != LZXPCM_OK) goto fail; } while(handle->lstrm.avail_out != 0); return 1; fail: return 0; } static int decompress_frame_deflate(circus_handle_t* handle, STREAMFILE* sf) { int res; handle->dstrm.next_out = handle->decbuf; handle->dstrm.avail_out = sizeof(handle->decbuf); handle->dstrm.total_out = 0; do { if (handle->dstrm.avail_in == 0) { handle->dstrm.next_in = handle->srcbuf; handle->dstrm.avail_in = read_streamfile(handle->srcbuf, handle->offset, sizeof(handle->srcbuf), sf); handle->offset += handle->dstrm.avail_in; /* EOF (game reserves some extra buf so memset'ing is probably equivalent) */ if (handle->dstrm.avail_in == 0) { memset(handle->decbuf + handle->dstrm.total_out, 0, sizeof(handle->decbuf) - handle->dstrm.total_out); break; } } res = inflate(&handle->dstrm, Z_NO_FLUSH); if (res != Z_OK && res != Z_STREAM_END) goto fail; } while(handle->dstrm.avail_out != 0); return 1; fail: return 0; } #ifdef XPCM_ALT /* original code uses zlib 1.2.1 to decompress the full stream into memory */ static int deflate_decompress_full(uint8_t* dst, size_t dst_size, const uint8_t* src, size_t src_size) { int err; z_stream strm = {0}; strm.next_in = src; strm.avail_in = src_size; strm.next_out = dst; strm.avail_out = dst_size; err = inflateInit(&strm); if (err < 0) { //printf("inflateInit error: %i\n", err); return 0; } err = inflate(&strm, Z_FINISH); if (err < 0) { //printf("inflate error: %i\n", err); //return 0; } err = inflateEnd(&strm); if (err < 0) { //printf("inflateEnd error: %i\n", err); return 0; } return 0; } #endif int circus_decode_frame(circus_handle_t* handle, STREAMFILE* sf, int16_t** p_buf, int* p_buf_samples_all) { int ok; if (handle->codec == XPCM_CODEC_VQ_LZXPCM) { ok = decompress_frame_lzxpcm(handle, sf); } else if (handle->codec == XPCM_CODEC_VQ_DEFLATE) { ok = decompress_frame_deflate(handle, sf); } else { ok = 0; } if (!ok) goto fail; interleave(handle->decbuf, handle->intbuf); scale(handle->intbuf, handle->scales, handle->invbuf, handle->tmpbuf); transform(handle->invbuf, handle->tmpbuf); convert(handle->flags, handle->invbuf, handle->pcmbuf, &handle->hist1, &handle->hist2, handle->frame); handle->frame++; *p_buf = handle->pcmbuf; *p_buf_samples_all = XPCM_FRAME_SAMPLES_ALL; return 1; fail: return 0; }