vgmstream/src/coding/ffmpeg_decoder_custom_mp4.c

#include "coding.h"
#include "../streamfile.h"
#include "../meta/deblock_streamfile.h"

#ifdef VGM_USE_FFMPEG

/* flavor of the custom container: MP4_STD is regular raw data, MP4_LYN has every frame
 * preceded by a 0x02 size header (plus a seek table) */
typedef enum { MP4_STD, MP4_LYN } mp4_type_t;

/**
 * Makes an MP4 header for raw MP4 data that comes with a separate frame table, simulating
 * a real MP4, which also has such a table embedded in its own chunks.
 */
//TODO: segfaults with certain audio files (ffmpeg?)

/* *********************************************************** */

/* Helpers to make a M4A header, an insane soup of chunks (AKA "atoms").
 * Needs *A LOT* of atoms and fields so this is more elaborate than usual.
 * - https://developer.apple.com/library/archive/documentation/QuickTime/QTFF/QTFFPreface/qtffPreface.html
 */

/* generic additions */
/* snapshot of the writer state (see save_atom), used to patch a value at that spot later */
typedef struct {
    uint8_t* out;   /* output position at the time of the snapshot */
    int bytes;      /* written byte count at the time of the snapshot */
} m4a_state_t;

/* state of the header writer, passed to every add_* helper */
typedef struct {
    STREAMFILE* sf;         /* source file, read for the frame table */
    mp4_custom_t* mp4;      /* config */
    mp4_type_t type;        /* selects how the frame table is interpreted */

    uint8_t* out;           /* current position */
    int bytes;              /* written bytes */
    m4a_state_t chunks;     /* chunks offsets are absolute, save position until we know header size */
} m4a_header_t;

/* appends a 32-bit big-endian value and moves the write cursor past it */
static void add_u32b(m4a_header_t* h, uint32_t value) {
    const int len = 0x04;
    uint8_t* dst = h->out;

    put_u32be(dst, value);

    h->out = dst + len;
    h->bytes = h->bytes + len;
}

/* appends the low 24 bits of value in big-endian order (16-bit upper part + 8-bit lower part) */
static void add_u24b(m4a_header_t* h, uint32_t value) {
    const int len = 0x03;
    uint8_t* dst = h->out;
    const uint32_t upper = (value >> 8u) & 0xFFFF;
    const uint32_t lower = (value >> 0u) & 0xFF;

    put_u16be(dst + 0x00, upper);
    put_u8   (dst + 0x02, lower);

    h->out = dst + len;
    h->bytes = h->bytes + len;
}

/* appends a 16-bit big-endian value and moves the write cursor past it */
static void add_u16b(m4a_header_t* h, uint16_t value) {
    const int len = 0x02;
    uint8_t* dst = h->out;

    put_u16be(dst, value);

    h->out = dst + len;
    h->bytes = h->bytes + len;
}

/* appends a single byte and moves the write cursor past it */
static void add_u8(m4a_header_t* h, uint32_t value) {
    const int len = 0x01;
    uint8_t* dst = h->out;

    put_u8(dst, value);

    h->out = dst + len;
    h->bytes = h->bytes + len;
}

/* appends the first 4 bytes of name (a fourcc, no terminator) and moves the write cursor past them */
static void add_name(m4a_header_t* h, const char* name) {
    const int len = 0x04;
    uint8_t* dst = h->out;

    memcpy(dst, name, 0x4);

    h->out = dst + len;
    h->bytes = h->bytes + len;
}

/* appends an atom header: 32-bit big-endian size followed by the 4-byte atom name */
static void add_atom(m4a_header_t* h, const char* name, uint32_t size) {
    add_u32b(h, size);  /* total atom size, header included */
    add_name(h, name);  /* fourcc */
}

/* remembers the current write position and byte count in s, so that a value
 * (such as an atom size of variable/complex atoms) can be written there afterwards */
static void save_atom(m4a_header_t* h, m4a_state_t* s) {
    uint8_t* const here = h->out;
    const int written = h->bytes;

    s->bytes = written;
    s->out = here;
}

/* writes, at the position saved in s, how many bytes were added since that save
 * (which is the final atom size when s was saved right before the atom header) */
static void load_atom(m4a_header_t* h, m4a_state_t* s) {
    const int atom_size = h->bytes - s->bytes;

    put_u32be(s->out, atom_size);
}

/* common atoms */

/* writes the 'ftyp' atom: major brand, minor version and two compatible brands */
static void add_ftyp(m4a_header_t* h) {
    static const char* const compatible_brands[] = { "isom", "iso2" };
    const int brand_count = sizeof(compatible_brands) / sizeof(compatible_brands[0]);
    int i;

    add_atom(h, "ftyp", 0x18);
    add_name(h, "M4A ");                    /* major brand */
    add_u32b(h, 512);                       /* minor version */
    for (i = 0; i < brand_count; i++) {
        add_name(h, compatible_brands[i]);  /* compatible brands */
    }
}

/* writes a 'free' atom made of just its header (no payload) */
static void add_free(m4a_header_t* h) {
    const uint32_t atom_size = 0x08; /* size + name only */

    add_atom(h, "free", atom_size);
}

/* writes only the 'mdat' atom header; the declared size covers the header plus the
 * configured stream size (no data is copied here) */
static void add_mdat(m4a_header_t* h) {
    const uint32_t atom_size = 0x08 + h->mp4->stream_size;

    add_atom(h, "mdat", atom_size);
}

/* variable atoms */

/* writes the 'stco' (chunk offset) atom with a single entry; the entry is left as 0 and
 * its position is saved in h->chunks so the absolute offset can be filled in later */
static void add_stco(m4a_header_t* h) {
    const uint32_t entry_count = 1; /* there may be an entry per frame, but only first seems needed */
    const uint32_t atom_size = 0x10 + entry_count * 0x04;

    add_atom(h, "stco", atom_size);
    add_u32b(h, 0);                         /* Version (1 byte) + Flags (3 byte) */
    add_u32b(h, entry_count);               /* Number of entries */

    save_atom(h, &h->chunks);               /* placeholder below gets patched afterwards */
    add_u32b(h, 0);                         /* Absolute offset N */
}

/* writes the 'stsz' (sample size) atom: one 32-bit size per frame, taken from the
 * table at h->mp4->table_offset in h->sf.
 * - MP4_LYN: the table holds frame offsets; a frame's size is the distance to the next offset
 *   (or to the end of the stream for the last frame) minus the 0x02 frame header
 * - otherwise: the table holds the frame sizes directly
 * Exactly table_entries sizes are written, so an empty table writes none. (The LyN path used
 * to always emit a trailing entry and looped up to 'entries - 1', which misbehaves when the
 * count is 0.) */
static void add_stsz(m4a_header_t* h) {
    /* signed copy so an empty (or absurdly large, seen as negative) count writes no entries */
    const int entries = (int)h->mp4->table_entries;
    int i;

    add_atom(h, "stsz", 0x14 + h->mp4->table_entries * 0x04);
    add_u32b(h, 0);                         /* Version (1 byte) + Flags (3 byte) */
    add_u32b(h, 0);                         /* Sample size (CBR) */
    add_u32b(h, h->mp4->table_entries);     /* Number of entries (VBR) */

    switch(h->type) {
        case MP4_LYN: {
            /* LyN has a seek table with every frame, and frames are prepended by a 0x02
             * frame header with frame size, so we can reconstruct a frame table */
            for (i = 0; i < entries; i++) {
                uint32_t curr_offset, next_offset;

                curr_offset = read_u32le(h->mp4->table_offset + (i + 0) * 0x04, h->sf);
                if (i + 1 < entries)
                    next_offset = read_u32le(h->mp4->table_offset + (i + 1) * 0x04, h->sf);
                else
                    next_offset = h->mp4->stream_size; /* no last offset */

                add_u32b(h, next_offset - curr_offset - 0x02); /* Sample N */
            }
            break;
        }

        default: {
            for (i = 0; i < entries; i++) {
                uint32_t size = read_u32le(h->mp4->table_offset + i * 0x04, h->sf);
                add_u32b(h, size);          /* Sample N */
            }
            break;
        }
    }
}

/* writes the 'stsc' (sample-to-chunk) atom: a single entry that puts every frame in chunk 1 */
static void add_stsc(m4a_header_t* h) {
    const uint32_t entry_count = 1;
    const uint32_t first_chunk = 1;
    const uint32_t samples_per_chunk = h->mp4->table_entries;
    const uint32_t description_id = 1;

    add_atom(h, "stsc", 0x1c);
    add_u32b(h, 0);                         /* Version (1 byte) + Flags (3 byte) */
    add_u32b(h, entry_count);               /* Number of entries */
    add_u32b(h, first_chunk);               /* First chunk */
    add_u32b(h, samples_per_chunk);         /* Samples per chunk */
    add_u32b(h, description_id);            /* Sample description ID */
}

/* writes the 'stts' (time-to-sample) atom: a single entry giving all frames the same duration */
static void add_stts(m4a_header_t* h) {
    const uint32_t entry_count = 1;
    const uint32_t sample_count = h->mp4->table_entries;
    const uint32_t sample_duration = h->mp4->frame_samples;

    add_atom(h, "stts", 0x18);
    add_u32b(h, 0);                         /* Version (1 byte) + Flags (3 byte) */
    add_u32b(h, entry_count);               /* Number of entries */
    add_u32b(h, sample_count);              /* Sample count */
    add_u32b(h, sample_duration);           /* Sample duration */
}

/* from mpeg4audio.c (also see ff_mp4_read_dec_config_descr) */
/* sample rate table: add_esds looks up the configured rate here and stores its index in the
 * decoder config. Only 13 of the 16 slots are listed, so the remaining ones are 0. */
static const int m4a_sample_rates[16] = {
    96000, 88200, 64000, 48000, 44100, 32000, 24000, 22050, 16000, 12000, 11025, 8000, 7350
};
/* channel count table: add_esds searches the first 8 slots for the configured channel count
 * and stores the matching index in the decoder config. Declared with 14 slots but only 8
 * are initialized (the rest are commented out below, so they are 0). */
static const uint8_t m4a_channels[14] = {
    0,
    1, // mono (1/0)
    2, // stereo (2/0)
    3, // 3/0
    4, // 3/1
    5, // 3/2
    6, // 3/2.1
    8, // 5/2.1
    //0,
    //0,
    //0,
    //7, // 3/3.1
    //8, // 3/2/2.1
    //24 // 3/3/3 - 5/2/3 - 3/0/0.2
};

/* writes the 'esds' (elementary stream descriptor) atom. The 16-bit value inside the
 * DecSpecificInfo descriptor packs object type (5b), sample rate index (4b), channel
 * index (4b) and 3 unknown bits, and is what defines actual decoding. */
static void add_esds(m4a_header_t* h) {
    /* ES_descriptor (TLV format see ISO 14496-1) and DecSpecificInfoTag define actual decoding
     * config (channels/rate/etc), other atoms with the same stuff is just info
     * - http://ecee.colorado.edu/~ecen5653/ecen5653/papers/ISO%2014496-1%202004.PDF */
    const uint8_t object_type = 0x02; /* 0x00=none, 0x01=AAC main, 0x02=AAC LC */
    const uint8_t unknown = 0;
    uint8_t sr_index = 0;
    uint8_t ch_index = 0;
    uint16_t config;
    int pos;

    /* first slot matching the configured sample rate (index stays 0 if none does) */
    for (pos = 0; pos < 16; pos++) {
        if (m4a_sample_rates[pos] != h->mp4->sample_rate)
            continue;
        sr_index = pos;
        break;
    }

    /* first slot matching the configured channel count (index stays 0 if none does) */
    for (pos = 0; pos < 8; pos++) {
        if (m4a_channels[pos] != h->mp4->channels)
            continue;
        ch_index = pos;
        break;
    }

    config = (uint16_t)(
            ((object_type & 0x1F) << 11) |  /* 5b */
            ((sr_index & 0x0F) << 7) |      /* 4b */
            ((ch_index & 0x0F) << 3) |      /* 4b */
            ((unknown & 0x07) << 0));       /* 3b */

    add_atom(h, "esds", 0x33);
    add_u32b(h, 0);                         /* Version (1 byte) + Flags (3 byte) */

    /* ES_DescrTag */
    add_u8  (h, 0x03);
    add_u32b(h, 0x80808022);                /* size 0x22 */
    add_u16b(h, 0x0000);                    /* stream Id */
    add_u8  (h, 0x00);                      /* flags */

    /* DecoderConfigDescrTag */
    add_u8  (h, 0x04);
    add_u32b(h, 0x80808014);                /* size 0x14 */
    add_u8  (h, 0x40);                      /* object type (0x40=audio) */
    add_u8  (h, 0x15);                      /* stream type (6b: 0x5=audio) + upstream (1b) + reserved (1b: const 1) */
    add_u24b(h, 0x000000);                  /* buffer size */
    add_u32b(h, 0);                         /* max bitrate (256000?)*/
    add_u32b(h, 0);                         /* average bitrate (256000?) */

    /* DecSpecificInfoTag */
    add_u8  (h, 0x05);
    add_u32b(h, 0x80808002);                /* size 0x02 */
    add_u16b(h, config);                    /* actual decoder info */

    /* SLConfigDescrTag */
    add_u8  (h, 0x06);
    add_u32b(h, 0x80808001);                /* size 0x01 */
    add_u8  (h, 0x02);                      /* predefined (2=default) */
}

/* writes the 'mp4a' sample description entry (channels, sample size, sample rate),
 * followed by its nested 'esds' atom */
static void add_mp4a(m4a_header_t* h) {
    const uint16_t channel_count = h->mp4->channels;
    const uint16_t sample_rate = h->mp4->sample_rate; /* field is 16-bit wide */
    const uint16_t sample_size = 16;
    int i;

    add_atom(h, "mp4a", 0x57);
    add_u32b(h, 0);                         /* ? */
    add_u32b(h, 1);                         /* Data reference index */
    for (i = 0; i < 2; i++) {
        add_u32b(h, 0);                     /* Reserved, Reserved 2 */
    }
    add_u16b(h, channel_count);             /* Channel count */
    add_u16b(h, sample_size);               /* Sample size */
    add_u32b(h, 0);                         /* Pre-defined */
    add_u16b(h, sample_rate);               /* Sample rate */
    add_u16b(h, 0);                         /* ? */

    add_esds(h); /* elementary stream descriptor */
}

/* writes the 'stsd' (sample description) atom holding a single 'mp4a' entry */
static void add_stsd(m4a_header_t* h) {
    const uint32_t entry_count = 1;

    add_atom(h, "stsd", 0x67);
    add_u32b(h, 0);                         /* Version (1 byte) + Flags (3 byte) */
    add_u32b(h, entry_count);               /* Number of entries */

    add_mp4a(h);
}

/* writes the 'stbl' (sample table) container and its children; the size field is
 * written as 0 first and patched once all children are in place */
static void add_stbl(m4a_header_t* h) {
    m4a_state_t atom_start;

    save_atom(h, &atom_start);
    add_atom(h, "stbl", 0x00);  /* size patched below */

    add_stsd(h);                /* Sample description */
    add_stts(h);                /* Time-to-sample  */
    add_stsc(h);                /* Sample-to-chunk */
    add_stsz(h);                /* Sample size */
    add_stco(h);                /* Chunk offset */

    load_atom(h, &atom_start);
}

/* writes the 'dinf' atom, containing a 'dref' atom with a single 'url ' entry */
static void add_dinf(m4a_header_t* h) {
    const uint32_t url_size  = 0x0c;
    const uint32_t dref_size = 0x10 + url_size;     /* 0x1c */
    const uint32_t dinf_size = 0x08 + dref_size;    /* 0x24 */

    add_atom(h, "dinf", dinf_size);

    add_atom(h, "dref", dref_size);
    add_u32b(h, 0);                         /* Version (1 byte) + Flags (3 byte) */
    add_u32b(h, 1);                         /* Number of entries */

    add_atom(h, "url ", url_size);
    add_u32b(h, 1);                         /* Version (1 byte) + Flags (3 byte) */
}

/* writes the 'smhd' (sound media header) atom with all fields zeroed */
static void add_smhd(m4a_header_t* h) {
    int i;

    add_atom(h, "smhd", 0x10);
    add_u32b(h, 0);                         /* Version (1 byte) + Flags (3 byte) */
    for (i = 0; i < 2; i++) {
        add_u16b(h, 0);                     /* Balance, then Reserved */
    }
}

/* writes the 'minf' (media information) container and its children; the size field is
 * written as 0 first and patched once all children are in place */
static void add_minf(m4a_header_t* h) {
    m4a_state_t atom_start;

    save_atom(h, &atom_start);
    add_atom(h, "minf", 0x00);  /* size patched below */

    add_smhd(h);
    add_dinf(h);
    add_stbl(h);

    load_atom(h, &atom_start);
}

/* writes the 'hdlr' (handler reference) atom, declaring a "soun" handler */
static void add_hdlr(m4a_header_t* h) {
    int i;

    add_atom(h, "hdlr", 0x22);
    add_u32b(h, 0);                         /* version (1 byte) + flags (3 byte) */
    add_u32b(h, 0);                         /* Component type */
    add_name(h, "soun");                    /* Component subtype */
    for (i = 0; i < 3; i++) {
        add_u32b(h, 0);                     /* Component manufacturer, flags, flags mask */
    }
    add_u16b(h, 0);                         /* Component name */
}

/* writes the 'mdhd' (media header) atom: time scale is the sample rate and duration
 * is the total sample count */
static void add_mdhd(m4a_header_t* h) {
    const uint32_t time_scale = h->mp4->sample_rate;
    const uint32_t duration = h->mp4->num_samples;
    int i;

    add_atom(h, "mdhd", 0x20);
    add_u32b(h, 0);                         /* Version (1 byte) + Flags (3 byte) */
    for (i = 0; i < 2; i++) {
        add_u32b(h, 0);                     /* Creation time, then Modification time */
    }
    add_u32b(h, time_scale);                /* Time scale */
    add_u32b(h, duration);                  /* Duration */
    add_u16b(h, 0);                         /* Language (0xC455=eng?) */
    add_u16b(h, 0);                         /* Quality */
}

/* writes the 'mdia' (media) container and its children; the size field is
 * written as 0 first and patched once all children are in place */
static void add_mdia(m4a_header_t* h) {
    m4a_state_t atom_start;

    save_atom(h, &atom_start);
    add_atom(h, "mdia", 0x00);  /* size patched below */

    add_mdhd(h);
    add_hdlr(h);
    add_minf(h);

    load_atom(h, &atom_start);
}

/* writes the 'tkhd' (track header) atom for track 1, with the total sample count as duration */
static void add_tkhd(m4a_header_t* h) {
    /* matrix written in this order: A, B, U, C, D, V, X, Y, W */
    static const uint32_t matrix[9] = {
        0x00010000, 0, 0,
        0, 0x00010000, 0,
        0, 0, 0x40000000,
    };
    const uint32_t duration = h->mp4->num_samples;
    int i;

    add_atom(h, "tkhd", 0x5C);
    add_u32b(h, 0x00000001);            /* Version (1 byte) + Flags (3 byte), 1=track enabled */
    add_u32b(h, 0);                     /* Creation time */
    add_u32b(h, 0);                     /* Modification time */
    add_u32b(h, 1);                     /* Track ID */
    add_u32b(h, 0);                     /* Reserved 1 */
    add_u32b(h, duration);              /* Duration */
    for (i = 0; i < 2; i++) {
        add_u32b(h, 0);                 /* Reserved 1, then Reserved 2 */
    }
    add_u16b(h, 0);                     /* Layer */
    add_u16b(h, 0);                     /* Alternate group (1?) */
    add_u16b(h, 0x0100);                /* Volume */
    add_u16b(h, 0);                     /* Reserved */
    for (i = 0; i < 9; i++) {
        add_u32b(h, matrix[i]);         /* matrix_N */
    }
    add_u32b(h, 0);                     /* Width */
    add_u32b(h, 0);                     /* Height */
}

/* writes the 'trak' (track) container and its children; the size field is
 * written as 0 first and patched once all children are in place */
static void add_trak(m4a_header_t* h) {
    m4a_state_t atom_start;

    save_atom(h, &atom_start);
    add_atom(h, "trak", 0x00);  /* size patched below */

    add_tkhd(h);
    add_mdia(h);

    load_atom(h, &atom_start);
}

/* writes the 'mvhd' (movie header) atom: time scale is the sample rate and duration
 * is the total sample count */
static void add_mvhd(m4a_header_t* h) {
    /* matrix written in this order: A, B, U, C, D, V, X, Y, W */
    static const uint32_t matrix[9] = {
        0x00010000, 0, 0,
        0, 0x00010000, 0,
        0, 0, 0x40000000,
    };
    const uint32_t time_scale = h->mp4->sample_rate;
    const uint32_t duration = h->mp4->num_samples;
    int i;

    add_atom(h, "mvhd", 0x6c);
    add_u32b(h, 0);                     /* Version (1 byte) + Flags (3 byte) */
    add_u32b(h, 0);                     /* Creation time */
    add_u32b(h, 0);                     /* Modification time */
    add_u32b(h, time_scale);            /* Time scale */
    add_u32b(h, duration);              /* Duration */
    add_u32b(h, 0x00010000);            /* Preferred rate */
    add_u16b(h, 0x0100);                /* Preferred volume */
    add_u32b(h, 0);                     /* Reserved 1 */
    add_u32b(h, 0);                     /* Reserved 2 */
    add_u16b(h, 0);                     /* Reserved 3 */
    for (i = 0; i < 9; i++) {
        add_u32b(h, matrix[i]);         /* matrix_N */
    }
    for (i = 0; i < 6; i++) {
        /* Preview time, Preview duration, Poster time,
         * Selection time, Selection duration, Current time */
        add_u32b(h, 0);
    }
    add_u32b(h, 2);                     /* Next track ID */
}

/* writes the 'moov' (movie) container with its header and single track; the size field
 * is written as 0 first and patched once all children are in place */
static void add_moov(m4a_header_t* h) {
    m4a_state_t atom_start;

    save_atom(h, &atom_start);
    add_atom(h, "moov", 0x00);  /* size patched below */

    add_mvhd(h);
    add_trak(h);
    /* no 'udta' atom is written */

    load_atom(h, &atom_start);
}

/* *** */

/* Builds the whole fake M4A header (ftyp, free, moov and the mdat atom header) into buf.
 * - buf/buf_len: output buffer and its capacity
 * - mp4: stream config (rates, sizes, frame table location)
 * - sf: file the frame table is read from
 * - type: how the frame table is interpreted
 * Returns the number of header bytes written, or 0 when buf can't hold the fixed atoms
 * plus one 32-bit size per table entry. */
static int make_m4a_header(uint8_t* buf, int buf_len, mp4_custom_t* mp4, STREAMFILE* sf, mp4_type_t type) {
    const int fixed_atoms_max = 0x400; /* approx */
    m4a_header_t h = {0};

    /* the entry count may come straight from a file, so compare by division rather than
     * computing 'entries * 0x4', which could overflow/wrap and wrongly pass the check */
    if (buf_len < fixed_atoms_max)
        goto fail;
    if ((uint32_t)mp4->table_entries > (uint32_t)(buf_len - fixed_atoms_max) / 0x4)
        goto fail;

    h.sf = sf;
    h.mp4 = mp4;
    h.type = type;
    h.out = buf;

    add_ftyp(&h);
    add_free(&h);
    add_moov(&h);
    add_mdat(&h);

    /* define absolute chunk offset after all calcs: data starts right after
     * the header, and the stco entry position was saved while writing it */
    put_u32be(h.chunks.out, h.bytes);

    return h.bytes;
fail:
    return 0;
}

/* ************************************************************************* */

/* deblock callback: each block is a 16-bit big-endian data size (read at the current
 * physical offset) followed by that many bytes of data; the 0x02 size field is skipped */
static void block_callback(STREAMFILE* sf, deblock_io_data* data) {
    data->skip_size = 0x02;
    data->data_size = read_u16be(data->physical_offset, sf);

    data->block_size = data->skip_size + data->data_size;
}

/* Wraps sf in a deblocking streamfile configured with block_callback (which skips each
 * frame's 0x02 header). Only MP4_LYN is handled; any other type returns NULL.
 * Returns the new streamfile, or whatever open_io_deblock_streamfile_f returns on failure. */
static STREAMFILE* setup_mp4_streamfile(STREAMFILE* sf, mp4_custom_t* mp4, mp4_type_t type) {
    deblock_config_t cfg = {0};
    STREAMFILE* wrapped_sf = NULL;

    if (type != MP4_LYN)
        return NULL;

    cfg.stream_start = mp4->stream_offset;
    cfg.stream_size = mp4->stream_size;
    /* each frame has a 0x02 header, which isn't part of the logical data */
    cfg.logical_size = mp4->stream_size - (mp4->table_entries * 0x02);
    cfg.block_callback = block_callback;

    /* setup sf */
    wrapped_sf = open_wrap_streamfile(sf);
    wrapped_sf = open_io_deblock_streamfile_f(wrapped_sf, &cfg);
    return wrapped_sf;
}

/* ************************************************************************* */

/* Creates an FFmpeg decoder for raw MP4 audio data by generating a fake M4A header.
 * - sf: source file (holds the frame table and the raw data)
 * - mp4: stream config; for MP4_LYN its stream_offset/stream_size are overwritten to
 *   describe the deblocked stream
 * - type: MP4_STD (raw frames) or MP4_LYN (frames prefixed by a 0x02 size header)
 * Returns the decoder data, or NULL on any failure (bad table size, out of memory,
 * header/streamfile/FFmpeg setup errors). */
static ffmpeg_codec_data* init_ffmpeg_mp4_custom(STREAMFILE* sf, mp4_custom_t* mp4, mp4_type_t type) {
    /* largest entry count whose header buffer stays within 0x100000 bytes (???) */
    const uint32_t max_entries = (0x100000 - 0x800) / 0x4;
    ffmpeg_codec_data* ffmpeg_data = NULL;
    STREAMFILE* temp_sf = NULL;
    uint8_t* buf = NULL;
    int buf_len;
    int bytes;

    /* validate the count before multiplying: a huge value could otherwise overflow/wrap
     * into a small buf_len, pass the size limit and overrun the buffer while writing */
    if ((uint32_t)mp4->table_entries > max_entries)
        goto fail;
    buf_len = 0x800 + (int)mp4->table_entries * 0x4; /* approx max sum of atom chunks is ~0x400 */

    buf = malloc(buf_len);
    if (!buf) goto fail;

    bytes = make_m4a_header(buf, buf_len, mp4, sf, type); /* before changing stream_offset/size */
    if (bytes <= 0) goto fail; /* header didn't fit */

    switch(type) {
        case MP4_STD: /* regular raw data */
            temp_sf = sf;
            break;
        case MP4_LYN: /* frames have size before them, but also a seek table */
            temp_sf = setup_mp4_streamfile(sf, mp4, type);
            if (!temp_sf) goto fail; /* must check before querying its size below */
            mp4->stream_offset = 0;
            mp4->stream_size = get_streamfile_size(temp_sf);
            break;
        default:
            goto fail;
    }
    if (!temp_sf) goto fail;

    ffmpeg_data = init_ffmpeg_header_offset(temp_sf, buf, bytes, mp4->stream_offset, mp4->stream_size);
    if (!ffmpeg_data) goto fail;

    /* not part of fake header since it's kinda complex to add (iTunes string comment) */
    ffmpeg_set_skip_samples(ffmpeg_data, mp4->encoder_delay);

    free(buf);
    if (temp_sf != sf) close_streamfile(temp_sf); /* only close the wrapper we created */
    return ffmpeg_data;
fail:
    free(buf);
    if (temp_sf && temp_sf != sf) close_streamfile(temp_sf);
    free_ffmpeg(ffmpeg_data);
    return NULL;
}

/* public entry point for regular raw MP4 data (MP4_STD); returns NULL on failure */
ffmpeg_codec_data* init_ffmpeg_mp4_custom_std(STREAMFILE* sf, mp4_custom_t* mp4) {
    const mp4_type_t type = MP4_STD;

    return init_ffmpeg_mp4_custom(sf, mp4, type);
}

/* public entry point for LyN data (MP4_LYN, frames prefixed by a size header); returns NULL on failure */
ffmpeg_codec_data* init_ffmpeg_mp4_custom_lyn(STREAMFILE* sf, mp4_custom_t* mp4) {
    const mp4_type_t type = MP4_LYN;

    //TODO: most LyN files seem to give an FFmpeg error in some frame (both mono and stereo files),
    // even though the seek table is correct and complete, with no observed frame size/format/etc
    // oddities. No audible issues though, so maybe it's just some FFmpeg issue to be fixed there.
    // (ex. frame 272 of 1162 in VO_ACT2_M12_FD_54_GILLI_PLS_0008479.Cafe_00000006.son)
    return init_ffmpeg_mp4_custom(sf, mp4, type);
}

#endif