#include "coding.h"

/* Decodes EA MicroTalk (speech codec) using a copied utkencode lib.
 * EA distinguishes MT10:1 and MT5:1 (bigger frames), but they are apparently the same
 * codec with different encoding parameters. Later revisions may have PCM blocks (rare).
 *
 * Decoder by Andrew D'Addesio: https://github.com/daddesio/utkencode
 * Info: http://wiki.niotso.org/UTK
 *
 * The following tries to follow the original code as closely as possible, with minimal changes for vgmstream.
 */

/* ************************************************************************************************* */
/* Size in bytes of the read buffer embedded in each UTKContext. */
#define UTK_BUFFER_SIZE 0x4000

//#define UTK_MAKE_U32(a,b,c,d) ((a)|((b)<<8)|((c)<<16)|((d)<<24))
/* Moves x by 0.5 away from zero, so that truncating the result to an integer
 * rounds x to the nearest integer. Evaluates x more than once. */
#define UTK_ROUND(x) ((x) >= 0.0f ? ((x)+0.5f) : ((x)-0.5f))
/* Smaller / larger of two values. Each argument may be evaluated twice,
 * so only pass expressions without side effects. */
#define UTK_MIN(x,y) ((x)<(y)?(x):(y))
#define UTK_MAX(x,y) ((x)>(y)?(x):(y))
/* Limits x to the [min, max] range (same multiple-evaluation caveat). */
#define UTK_CLAMP(x,min,max) UTK_MIN(UTK_MAX(x,min),max)


/* Decoder state for one MicroTalk bitstream.
**
** Note: This struct assumes a member alignment of 4 bytes.
** This matters when pitch_lag > 216 on the first subframe of any given frame.
**
** utk_decode_frame reads adapt_cb[108*i+216-pitch_lag+j], which can be a negative
** index (running into the members declared before adapt_cb) or an index >= 324
** (running into decompressed_frame). The order of the float arrays below is
** therefore part of the decoder's behavior and must not be changed. */
typedef struct UTKContext {
    uint8_t buffer[UTK_BUFFER_SIZE]; //vgmstream extra /* raw bytes read from streamfile, consumed through ptr/end */
    STREAMFILE * streamfile; //vgmstream extra /* source used by utk_read_byte to refill buffer */
    off_t offset; //vgmstream extra /* streamfile offset of the next refill */
    int samples_filled; //vgmstream extra /* nonzero while decompressed_frame holds a frame not yet fully output */
    //FILE *fp; //vgmstream extra
    const uint8_t *ptr, *end; /* next byte to read / end of the valid data */
    int parsed_header; /* nonzero once utk_parse_header has run */
    unsigned int bits_value; /* bit reservoir, consumed from the least significant bit */
    int bits_count; /* number of valid bits in bits_value (0 = reader needs priming) */
    int reduced_bw; /* header flag: excitation is coded for every other sample only */
    int multipulse_thresh; /* frames whose first rc index is below this use the multi-pulse model */
    float fixed_gains[64]; /* gain table built by utk_parse_header, indexed by a 6-bit code */
    float rc[12]; /* current reflection coefficients, moved towards the frame's target in 4 steps */
    float synth_history[12]; /* last 12 outputs of the LP synthesis filter */
    float adapt_cb[324]; /* copy of decompressed_frame[108..431] of the previous frame, taken before LP synthesis */
    float decompressed_frame[432]; /* current frame: 4 subframes of 108 samples */
} UTKContext;

/* Codebook models of the multi-pulse decoder: used as the first index of
 * utk_codebooks and as the next_model value of utk_commands entries. */
enum {
    MDL_NORMAL = 0,
    MDL_LARGEPULSE = 1
};

/* Reflection coefficient values selected by the indices read in utk_decode_frame
 * (a 6-bit index for the first four coefficients, 16 + a 5-bit index for the rest).
 * Entries 1..31 are negative and mirrored by entries 33..63; entries 0 and 32 are zero.
 * Every literal carries the 'f' suffix so the whole table consists of float constants. */
static const float utk_rc_table[64] = {
    0.0f,
    -.99677598476409912109375f, -.99032700061798095703125f, -.983879029750823974609375f, -.977430999279022216796875f,
    -.970982015132904052734375f, -.964533984661102294921875f, -.958085000514984130859375f, -.9516370296478271484375f,
    -.930754005908966064453125f, -.904959976673126220703125f, -.879167020320892333984375f, -.853372991085052490234375f,
    -.827579021453857421875f, -.801786005496978759765625f, -.775991976261138916015625f, -.75019800662994384765625f,
    -.724404990673065185546875f, -.6986110210418701171875f, -.6706349849700927734375f, -.61904799938201904296875f,
    -.567460000514984130859375f, -.515873014926910400390625f, -.4642859995365142822265625f, -.4126980006694793701171875f,
    -.361110985279083251953125f, -.309523999691009521484375f, -.257937014102935791015625f, -.20634900033473968505859375f,
    -.1547619998455047607421875f, -.10317499935626983642578125f, -.05158700048923492431640625f,
    0.0f,
    +.05158700048923492431640625f, +.10317499935626983642578125f, +.1547619998455047607421875f, +.20634900033473968505859375f,
    +.257937014102935791015625f, +.309523999691009521484375f, +.361110985279083251953125f, +.4126980006694793701171875f,
    +.4642859995365142822265625f, +.515873014926910400390625f, +.567460000514984130859375f, +.61904799938201904296875f,
    +.6706349849700927734375f, +.6986110210418701171875f, +.724404990673065185546875f, +.75019800662994384765625f,
    +.775991976261138916015625f, +.801786005496978759765625f, +.827579021453857421875f, +.853372991085052490234375f,
    +.879167020320892333984375f, +.904959976673126220703125f, +.930754005908966064453125f, +.9516370296478271484375f,
    +.958085000514984130859375f, +.964533984661102294921875f, +.970982015132904052734375f, +.977430999279022216796875f,
    +.983879029750823974609375f, +.99032700061798095703125f, +.99677598476409912109375f
};

/* Command lookup for the multi-pulse excitation decoder, indexed as
 * [current model][low 8 bits of the bit reservoir]. Each entry is an index
 * into utk_commands, whose code_size tells how many of those bits the
 * command actually consumes. */
static const uint8_t utk_codebooks[2][256] = {
    { /* normal model */
        4,  6,  5,  9,  4,  6,  5, 13,  4,  6,  5, 10,  4,  6,  5, 17,
        4,  6,  5,  9,  4,  6,  5, 14,  4,  6,  5, 10,  4,  6,  5, 21,
        4,  6,  5,  9,  4,  6,  5, 13,  4,  6,  5, 10,  4,  6,  5, 18,
        4,  6,  5,  9,  4,  6,  5, 14,  4,  6,  5, 10,  4,  6,  5, 25,
        4,  6,  5,  9,  4,  6,  5, 13,  4,  6,  5, 10,  4,  6,  5, 17,
        4,  6,  5,  9,  4,  6,  5, 14,  4,  6,  5, 10,  4,  6,  5, 22,
        4,  6,  5,  9,  4,  6,  5, 13,  4,  6,  5, 10,  4,  6,  5, 18,
        4,  6,  5,  9,  4,  6,  5, 14,  4,  6,  5, 10,  4,  6,  5,  0,
        4,  6,  5,  9,  4,  6,  5, 13,  4,  6,  5, 10,  4,  6,  5, 17,
        4,  6,  5,  9,  4,  6,  5, 14,  4,  6,  5, 10,  4,  6,  5, 21,
        4,  6,  5,  9,  4,  6,  5, 13,  4,  6,  5, 10,  4,  6,  5, 18,
        4,  6,  5,  9,  4,  6,  5, 14,  4,  6,  5, 10,  4,  6,  5, 26,
        4,  6,  5,  9,  4,  6,  5, 13,  4,  6,  5, 10,  4,  6,  5, 17,
        4,  6,  5,  9,  4,  6,  5, 14,  4,  6,  5, 10,  4,  6,  5, 22,
        4,  6,  5,  9,  4,  6,  5, 13,  4,  6,  5, 10,  4,  6,  5, 18,
        4,  6,  5,  9,  4,  6,  5, 14,  4,  6,  5, 10,  4,  6,  5,  2
    }, { /* large-pulse model */
        4, 11,  7, 15,  4, 12,  8, 19,  4, 11,  7, 16,  4, 12,  8, 23,
        4, 11,  7, 15,  4, 12,  8, 20,  4, 11,  7, 16,  4, 12,  8, 27,
        4, 11,  7, 15,  4, 12,  8, 19,  4, 11,  7, 16,  4, 12,  8, 24,
        4, 11,  7, 15,  4, 12,  8, 20,  4, 11,  7, 16,  4, 12,  8,  1,
        4, 11,  7, 15,  4, 12,  8, 19,  4, 11,  7, 16,  4, 12,  8, 23,
        4, 11,  7, 15,  4, 12,  8, 20,  4, 11,  7, 16,  4, 12,  8, 28,
        4, 11,  7, 15,  4, 12,  8, 19,  4, 11,  7, 16,  4, 12,  8, 24,
        4, 11,  7, 15,  4, 12,  8, 20,  4, 11,  7, 16,  4, 12,  8,  3,
        4, 11,  7, 15,  4, 12,  8, 19,  4, 11,  7, 16,  4, 12,  8, 23,
        4, 11,  7, 15,  4, 12,  8, 20,  4, 11,  7, 16,  4, 12,  8, 27,
        4, 11,  7, 15,  4, 12,  8, 19,  4, 11,  7, 16,  4, 12,  8, 24,
        4, 11,  7, 15,  4, 12,  8, 20,  4, 11,  7, 16,  4, 12,  8,  1,
        4, 11,  7, 15,  4, 12,  8, 19,  4, 11,  7, 16,  4, 12,  8, 23,
        4, 11,  7, 15,  4, 12,  8, 20,  4, 11,  7, 16,  4, 12,  8, 28,
        4, 11,  7, 15,  4, 12,  8, 19,  4, 11,  7, 16,  4, 12,  8, 24,
        4, 11,  7, 15,  4, 12,  8, 20,  4, 11,  7, 16,  4, 12,  8,  3
    }
};

/* Commands of the multi-pulse excitation decoder, selected through utk_codebooks.
 * As used by utk_decode_excitation:
 *  - commands 0-1 introduce a pulse of magnitude >= 7 (coded with extra bits),
 *  - commands 2-3 introduce a run of zeros (length coded with 6 extra bits),
 *  - commands 4+ write pulse_value directly. */
static const struct {
    int next_model;    /* codebook model (MDL_*) to use for the following command */
    int code_size;     /* number of bits the command consumes from the bitstream */
    float pulse_value; /* sample value written by commands 4+ (unused by commands 0-3) */
} utk_commands[29] = {
    {MDL_LARGEPULSE, 8,  0.0f},
    {MDL_LARGEPULSE, 7,  0.0f},
    {MDL_NORMAL,     8,  0.0f},
    {MDL_NORMAL,     7,  0.0f},
    {MDL_NORMAL,     2,  0.0f},
    {MDL_NORMAL,     2, -1.0f},
    {MDL_NORMAL,     2, +1.0f},
    {MDL_NORMAL,     3, -1.0f},
    {MDL_NORMAL,     3, +1.0f},
    {MDL_LARGEPULSE, 4, -2.0f},
    {MDL_LARGEPULSE, 4, +2.0f},
    {MDL_LARGEPULSE, 3, -2.0f},
    {MDL_LARGEPULSE, 3, +2.0f},
    {MDL_LARGEPULSE, 5, -3.0f},
    {MDL_LARGEPULSE, 5, +3.0f},
    {MDL_LARGEPULSE, 4, -3.0f},
    {MDL_LARGEPULSE, 4, +3.0f},
    {MDL_LARGEPULSE, 6, -4.0f},
    {MDL_LARGEPULSE, 6, +4.0f},
    {MDL_LARGEPULSE, 5, -4.0f},
    {MDL_LARGEPULSE, 5, +4.0f},
    {MDL_LARGEPULSE, 7, -5.0f},
    {MDL_LARGEPULSE, 7, +5.0f},
    {MDL_LARGEPULSE, 6, -5.0f},
    {MDL_LARGEPULSE, 6, +5.0f},
    {MDL_LARGEPULSE, 8, -6.0f},
    {MDL_LARGEPULSE, 8, +6.0f},
    {MDL_LARGEPULSE, 7, -6.0f},
    {MDL_LARGEPULSE, 7, +6.0f}
};

/* Returns the next input byte (0..255), refilling the context buffer from the
 * STREAMFILE when it runs dry. Returns 0 when no more data can be read. */
static int utk_read_byte(UTKContext *ctx)
{
    size_t got;

    /* common case: data still buffered */
    if (ctx->ptr < ctx->end)
        return *ctx->ptr++;

    //vgmstream extra: the original refilled a static buffer with fread() from a FILE;
    // here the per-context buffer is refilled from the STREAMFILE instead
    if (!ctx->streamfile)
        return 0;

    got = read_streamfile(ctx->buffer, ctx->offset, sizeof(ctx->buffer), ctx->streamfile);
    ctx->offset += got;

    /* nothing read (or a nonsensical size): behave as end of data */
    if (got == 0 || got > sizeof(ctx->buffer))
        return 0;

    ctx->ptr = ctx->buffer;
    ctx->end = ctx->buffer + got;
    return *ctx->ptr++;
}

/* Reads two bytes and returns them as a big-endian 16-bit value
 * (first byte is the high byte). */
static int16_t utk_read_i16(UTKContext *ctx)
{
    int high = utk_read_byte(ctx);
    int low = utk_read_byte(ctx);

    return (high << 8) | low;
}

/* Takes `count` bits from the low end of the bit reservoir and returns them.
 * Whenever fewer than 8 bits remain afterwards, one more byte is appended
 * above the remaining bits. */
static int utk_read_bits(UTKContext *ctx, int count)
{
    int value = ctx->bits_value & ((1 << count) - 1);
    int next_byte;

    ctx->bits_value >>= count;
    ctx->bits_count -= count;

    if (ctx->bits_count >= 8)
        return value;

    /* top up the reservoir with another byte */
    next_byte = utk_read_byte(ctx);
    ctx->bits_value |= next_byte << ctx->bits_count;
    ctx->bits_count += 8;

    return value;
}

/* Reads the stream header (1 + 4 + 4 + 6 bits): the reduced-bandwidth flag,
 * the multi-pulse threshold, the first fixed gain and the ratio between
 * consecutive fixed gains, then fills the 64-entry gain table. */
static void utk_parse_header(UTKContext *ctx)
{
    float ratio;
    int thresh_bits, gain_bits, ratio_bits;
    int n;

    ctx->reduced_bw = utk_read_bits(ctx, 1);

    thresh_bits = utk_read_bits(ctx, 4);
    ctx->multipulse_thresh = 32 - thresh_bits;

    gain_bits = utk_read_bits(ctx, 4);
    ctx->fixed_gains[0] = 8.0f * (1 + gain_bits);

    ratio_bits = utk_read_bits(ctx, 6);
    ratio = 1.04f + ratio_bits*0.001f;

    /* each gain is the previous one scaled by the ratio */
    n = 1;
    while (n < 64) {
        ctx->fixed_gains[n] = ctx->fixed_gains[n-1] * ratio;
        n++;
    }
}

/* Decodes one subframe of excitation into out[0], out[stride], out[2*stride]...
 * for all positions below 108 (108 samples with stride 1, 54 with stride 2).
 *  use_multipulse: nonzero selects the multi-pulse model, zero the RELP model. */
static void utk_decode_excitation(UTKContext *ctx, int use_multipulse, float *out, int stride)
{
    int pos = 0;
    int model = 0;

    if (!use_multipulse) {
        /* RELP model: entire residual (excitation) signal is coded explicitly,
         * as "0" => 0.0, "1 0" => -2.0, "1 1" => +2.0 */
        for (pos = 0; pos < 108; pos += stride) {
            float value;

            if (!utk_read_bits(ctx, 1))
                value = 0.0f;
            else if (!utk_read_bits(ctx, 1))
                value = -2.0f;
            else
                value = 2.0f;

            out[pos] = value;
        }
        return;
    }

    /* multi-pulse model: n pulses are coded explicitly; the rest are zero */
    while (pos < 108) {
        /* peek 8 bits to find the command, then consume only its real size */
        const int cmd = utk_codebooks[model][ctx->bits_value & 0xff];

        model = utk_commands[cmd].next_model;
        utk_read_bits(ctx, utk_commands[cmd].code_size);

        if (cmd <= 1) {
            /* insert a pulse with magnitude >= 7.0f:
             * a run of 1-bits extends the magnitude, then a sign bit (0 = negative) */
            int magnitude = 7;

            while (utk_read_bits(ctx, 1))
                magnitude++;

            if (!utk_read_bits(ctx, 1))
                magnitude = -magnitude;

            out[pos] = (float)magnitude;
            pos += stride;
        } else if (cmd <= 3) {
            /* insert between 7 and 70 zeros, without running past the subframe */
            int zeros = 7 + utk_read_bits(ctx, 6);

            if (pos + zeros * stride > 108)
                zeros = (108 - pos)/stride;

            for (; zeros > 0; zeros--) {
                out[pos] = 0.0f;
                pos += stride;
            }
        } else {
            /* insert a pulse with magnitude <= 6.0f */
            out[pos] = utk_commands[cmd].pulse_value;
            pos += stride;
        }
    }
}

/* Computes 12 direct-form coefficients (lpc) from 12 reflection coefficients (rc).
 *  rc:  input, 12 floats.
 *  lpc: output, 12 floats; every entry is written.
 * Each of the 12 iterations runs the lattice recursion once over `state`,
 * stores the resulting value in `response`, and then subtracts the
 * contribution of the coefficients found so far. */
static void rc_to_lpc(const float *rc, float *lpc)
{
    float response[12];
    float state[12];
    int n, k;

    /* initial state: 1 followed by the first 11 reflection coefficients */
    state[0] = 1.0f;
    for (k = 0; k < 11; k++)
        state[k+1] = rc[k];

    for (n = 0; n < 12; n++) {
        float acc = -state[11] * rc[11];

        /* must run from the last stage down: state[k+1] is overwritten as we go */
        for (k = 10; k >= 0; k--) {
            acc -= state[k] * rc[k];
            state[k+1] = acc * rc[k] + state[k];
        }

        state[0] = acc;
        response[n] = acc;

        for (k = 0; k < n; k++)
            acc -= response[n-1-k] * lpc[k];

        lpc[n] = acc;
    }
}

/* Runs the 12-tap LP synthesis filter in place over num_blocks blocks of 12
 * samples of decompressed_frame, starting at `offset`. Coefficients are
 * derived from the context's current reflection coefficients, and
 * synth_history is used as a 12-entry circular history of filter outputs. */
static void utk_lp_synthesis_filter(UTKContext *ctx, int offset, int num_blocks)
{
    float coeffs[12];
    float *sample = &ctx->decompressed_frame[offset];
    int block, phase, tap;

    rc_to_lpc(ctx->rc, coeffs);

    for (block = 0; block < num_blocks; block++) {
        for (phase = 0; phase < 12; phase++) {
            float acc = *sample;

            for (tap = 0; tap < 12; tap++) {
                /* history slot for this tap, wrapped into 0..11 */
                int slot = tap - phase;
                if (slot < 0)
                    slot += 12;

                acc += coeffs[tap] * ctx->synth_history[slot];
            }

            ctx->synth_history[11-phase] = acc;
            *sample++ = acc;
        }
    }
}

/*
** Public functions.
*/

/* Decodes one frame (432 samples, as 4 subframes of 108) from the bitstream
 * into ctx->decompressed_frame, updating the adaptive codebook, the reflection
 * coefficients and the synthesis filter history for the next frame.
 * The stream header is parsed on the first call after utk_init. */
static void utk_decode_frame(UTKContext *ctx)
{
    int i, j;
    int use_multipulse = 0;
    /* 108 excitation samples with 5 zeroed guard samples on each side,
     * needed by the 6-tap interpolation below (reads up to 5 samples away) */
    float excitation[5+108+5];
    float rc_delta[12];

    /* prime the bit reader if it was reset (see flush_ea_mt_internal / utk_rev3_decode_frame) */
    if (!ctx->bits_count) {
        ctx->bits_value = utk_read_byte(ctx);
        ctx->bits_count = 8;
    }

    if (!ctx->parsed_header) {
        utk_parse_header(ctx);
        ctx->parsed_header = 1;
    }

    memset(&excitation[0], 0, 5*sizeof(float));
    memset(&excitation[5+108], 0, 5*sizeof(float));

    /* read the reflection coefficients */
    for (i = 0; i < 12; i++) {
        int idx;
        if (i == 0) {
            /* the first index also selects the excitation model for the whole frame */
            idx = utk_read_bits(ctx, 6);
            if (idx < ctx->multipulse_thresh)
                use_multipulse = 1;
        } else if (i < 4) {
            idx = utk_read_bits(ctx, 6);
        } else {
            /* remaining coefficients use 5 bits, mapped to table entries 16..47 */
            idx = 16 + utk_read_bits(ctx, 5);
        }

        /* the target value is approached in 4 equal steps (applied at the end of this function) */
        rc_delta[i] = (utk_rc_table[idx] - ctx->rc[i])*0.25f;
    }

    /* decode four subframes */
    for (i = 0; i < 4; i++) {
        int pitch_lag = utk_read_bits(ctx, 8);
        float pitch_gain = (float)utk_read_bits(ctx, 4)/15.0f;
        float fixed_gain = ctx->fixed_gains[utk_read_bits(ctx, 6)];

        if (!ctx->reduced_bw) {
            utk_decode_excitation(ctx, use_multipulse, &excitation[5], 1);
        } else {
            /* residual (excitation) signal is encoded at reduced bandwidth */
            int align = utk_read_bits(ctx, 1);
            int zero = utk_read_bits(ctx, 1);

            /* decodes 54 samples into every other slot, starting at slot `align` */
            utk_decode_excitation(ctx, use_multipulse, &excitation[5+align], 2);

            if (zero) {
                /* fill the remaining samples with zero
                ** (spectrum is duplicated into high frequencies) */
                for (j = 0; j < 54; j++)
                    excitation[5+(1-align)+2*j] = 0.0f;
            } else {
                /* interpolate the remaining samples
                ** (spectrum is low-pass filtered) */
                float *ptr = &excitation[5+(1-align)];
                for (j = 0; j < 108; j += 2)
                    ptr[j] =   ptr[j-5] * 0.01803267933428287506103515625f
                             - ptr[j-3] * 0.114591561257839202880859375f
                             + ptr[j-1] * 0.597385942935943603515625f
                             + ptr[j+1] * 0.597385942935943603515625f
                             - ptr[j+3] * 0.114591561257839202880859375f
                             + ptr[j+5] * 0.01803267933428287506103515625f;

                /* scale by 0.5f to give the sinc impulse response unit energy */
                fixed_gain *= 0.5f;
            }
        }

        /* mix the scaled excitation with the adaptive codebook contribution.
         * The adapt_cb index can be negative (first subframe, pitch_lag > 216) or
         * >= 324, i.e. it deliberately reaches into the neighbouring UTKContext members
         * (see the note on the struct); do not "fix" this without changing the output. */
        for (j = 0; j < 108; j++)
            ctx->decompressed_frame[108*i+j] =   fixed_gain * excitation[5+j]
                                               + pitch_gain * ctx->adapt_cb[108*i+216-pitch_lag+j];
    }

    /* keep the last 324 samples (before LP synthesis) as the next frame's adaptive codebook */
    for (i = 0; i < 324; i++)
        ctx->adapt_cb[i] = ctx->decompressed_frame[108+i];

    /* LP synthesis in 4 passes, moving the reflection coefficients a quarter of the way
     * before each one: 12 samples each for the first three passes, the remaining 396
     * (33 blocks of 12) with the final coefficients */
    for (i = 0; i < 4; i++) {
        for (j = 0; j < 12; j++)
            ctx->rc[j] += rc_delta[j];

        utk_lp_synthesis_filter(ctx, 12*i, i < 3 ? 1 : 33);
    }
}

/* Puts a context into its initial state by clearing every member to zero
 * bytes (header not parsed, empty bit reader, no input attached). */
static void utk_init(UTKContext *ctx)
{
    memset(ctx, 0, sizeof(UTKContext));
}

#if 0 //vgmstream extra: see flush_ea_mt
/* Disabled (not compiled): input setters of the original decoder, kept for reference.
 * UTKContext has no 'fp' member in this version. */

/* Selects a FILE as the data source and resets the bit reader. */
static void utk_set_fp(UTKContext *ctx, FILE *fp)
{
    ctx->fp = fp;

    /* reset the bit reader */
    ctx->bits_count = 0;
}

/* Selects a memory range [ptr, end) as the data source and resets the bit reader. */
static void utk_set_ptr(UTKContext *ctx, const uint8_t *ptr, const uint8_t *end)
{
    ctx->ptr = ptr;
    ctx->end = end;

    /* reset the bit reader */
    ctx->bits_count = 0;
}
#endif

/*
** MicroTalk Revision 3 decoding function.
*/

/* Decodes one revision 3 frame: a marker byte (0xee = PCM data follows the
 * frame), a regular MicroTalk frame, and optionally a block of raw 16-bit PCM
 * samples that replaces part of the decoded frame. */
static void utk_rev3_decode_frame(UTKContext *ctx)
{
    const int has_pcm = (utk_read_byte(ctx) == 0xee);
    int start, length, n;
    float *dst;

    utk_decode_frame(ctx);

    /* unread the last 8 bits and reset the bit reader */
    ctx->ptr--;
    ctx->bits_count = 0;

    if (!has_pcm)
        return;

    /* Overwrite n samples at a given offset in the decoded frame with
    ** raw PCM data. */
    start = utk_read_i16(ctx);
    length = utk_read_i16(ctx);

    /* sx.exe does not do any bounds checking or clamping of these two
    ** fields (see 004274D1 in sx.exe v3.01.01), which means a specially
    ** crafted MT5:1 file can crash sx.exe.
    ** The original decoder aborts with an error here; vgmstream just
    ** leaves the frame as decoded. */
    if (start < 0 || start > 432)
        return; //vgmstream extra
    if (length < 0 || length > 432 - start)
        return; //vgmstream extra

    dst = &ctx->decompressed_frame[start];
    for (n = 0; n < length; n++)
        dst[n] = (float)utk_read_i16(ctx);
}

/* ************************************************************************************************* */

/* Allocates the EA-MT codec state with one zero-initialized UTKContext per channel.
 *  channel_count: number of channels (must be > 0).
 *  pcm_blocks: nonzero if frames use the revision 3 layout (optional PCM blocks).
 * Returns NULL on invalid channel_count or allocation failure; otherwise the
 * caller owns the result and releases it with free_ea_mt. */
ea_mt_codec_data *init_ea_mt(int channel_count, int pcm_blocks) {
    ea_mt_codec_data *data = NULL;
    int i;

    if (channel_count <= 0) goto fail;

    /* a single codec struct holds the contexts of all channels */
    data = calloc(1, sizeof(*data));
    if (!data) goto fail;

    data->pcm_blocks = pcm_blocks;

    data->utk_context = calloc(channel_count, sizeof(UTKContext*));
    if (!data->utk_context) goto fail;
    /* only publish the count once the array exists, so cleanup never walks a NULL array */
    data->utk_context_size = channel_count;

    for (i = 0; i < channel_count; i++) {
        data->utk_context[i] = calloc(1, sizeof(UTKContext));
        if (!data->utk_context[i]) goto fail; /* remaining entries are NULL, which free() accepts */
        utk_init(data->utk_context[i]);
    }

    return data;

fail:
    free_ea_mt(data);
    return NULL;
}

/* Writes samples_to_do samples of `channel` to outbuf (one every channelspacing
 * entries), starting at first_sample within the current 432-sample frame.
 * A new frame is decoded only when the previous one was fully consumed. */
void decode_ea_mt(VGMSTREAM * vgmstream, sample * outbuf, int channelspacing, int32_t first_sample, int32_t samples_to_do, int channel) {
    ea_mt_codec_data *data = vgmstream->codec_data;
    UTKContext* ctx = data->utk_context[channel];
    const int frame_samples = 432;
    int pos, out_pos = 0;
    int32_t last;

    /* The decoder above expects pointers to read data. Since EA-MT frames aren't
     * byte-aligned, the decoder itself decides when to read new buffer data. When
     * decoding starts or a SCHl block changes, flush_ea_mt must be called to reset
     * the state. A bit hacky but would need some restructuring otherwise. */

    first_sample %= frame_samples;

    /* UTKContext keeps the decoded frame and can't re-decode it, so only decode
     * when there is no pending frame */
    if (!ctx->samples_filled) {
        if (data->pcm_blocks)
            utk_rev3_decode_frame(ctx);
        else
            utk_decode_frame(ctx);
        ctx->samples_filled = 1;
    }

    /* round and clamp the float samples to 16-bit PCM */
    last = first_sample + samples_to_do;
    for (pos = first_sample; pos < last; pos++) {
        int rounded = UTK_ROUND(ctx->decompressed_frame[pos]);
        outbuf[out_pos] = (int16_t)UTK_CLAMP(rounded, -32768, 32767);
        out_pos += channelspacing;
    }

    /* reached the end of the frame: the next call decodes a new one */
    if (pos == frame_samples)
        ctx->samples_filled = 0;
}

/* Re-attaches every channel's decoder to its stream position: sets the
 * STREAMFILE and offset, discards any pending frame, preloads the read buffer
 * and resets the bit reader.
 *  is_start: nonzero to use the channel's start offset, zero to use its
 *            current offset. */
static void flush_ea_mt_internal(VGMSTREAM *vgmstream, int is_start) {
    ea_mt_codec_data *data = vgmstream->codec_data;
    int i;
    size_t bytes;

    /* the decoder needs to be notified when offsets change */
    for (i = 0; i < vgmstream->channels; i++) {
        UTKContext *ctx = data->utk_context[i];

        ctx->streamfile = vgmstream->ch[i].streamfile;
        ctx->offset = is_start ? vgmstream->ch[i].channel_start_offset : vgmstream->ch[i].offset;
        ctx->samples_filled = 0;

        bytes = read_streamfile(ctx->buffer,ctx->offset,sizeof(ctx->buffer),ctx->streamfile);
        /* advance by what was actually read (as utk_read_byte does), so a short
         * read doesn't make the next refill skip data */
        ctx->offset += bytes;
        ctx->ptr = ctx->buffer;
        ctx->end = ctx->buffer + bytes;
        ctx->bits_count = 0;
    }
}

/* Re-syncs all channel decoders to their current channel offsets
 * (to be called when the stream position changes, e.g. on a new block). */
void flush_ea_mt(VGMSTREAM *vgmstream) {
    flush_ea_mt_internal(vgmstream, 0);
}

/* Re-syncs all channel decoders to their channel start offsets. */
void reset_ea_mt(VGMSTREAM *vgmstream) {
    flush_ea_mt_internal(vgmstream, 1);
}

/* Seeks by restarting all channel decoders from their start offsets.
 * num_sample is currently ignored: no samples are skipped after the restart. */
void seek_ea_mt(VGMSTREAM * vgmstream, int32_t num_sample) {
    flush_ea_mt_internal(vgmstream, 1);
    //todo discard loop (though this should be adequate as probably only uses full loops, if at all)
}

/* Releases codec data created by init_ea_mt. Accepts NULL and partially
 * built objects (missing context array, or NULL context entries). */
void free_ea_mt(ea_mt_codec_data *data) {
    int i;

    if (!data)
        return;

    /* the array may be missing if allocation failed midway through init */
    if (data->utk_context) {
        for (i = 0; i < data->utk_context_size; i++) {
            free(data->utk_context[i]);
        }
        free(data->utk_context);
    }
    free(data);
}