Improve EA-XAS decoding/performance

This commit is contained in:
bnnm 2019-10-06 17:54:26 +02:00
parent 35f5da2ac3
commit 4af3f6bad6

View File

@ -1,6 +1,12 @@
#include "coding.h"
#include "../util.h"
#if 0
/* known game code/platforms use float buffer and coefs, but some approximations around use this int math:
* ...
* coef1 = table[index + 0]
* coef2 = table[index + 4]
* sample = clamp16(((signed_nibble << (20 - shift)) + hist1 * coef1 + hist2 * coef2 + 128) >> 8); */
static const int EA_XA_TABLE[20] = {
0, 240, 460, 392,
0, 0, -208, -220,
@ -8,33 +14,58 @@ static const int EA_XA_TABLE[20] = {
7, 8, 10, 11,
0, -1, -3, -4
};
#endif
/* EA-XAS v1, evolution of EA-XA/XAS and cousin of MTA2. From FFmpeg (general info) + MTA2 (layout) + EA-XA (decoding)
/* standard CD-XA's K0/K1 filter pairs */
static const float xa_coefs[16][2] = {
{ 0.0, 0.0 },
{ 0.9375, 0.0 },
{ 1.796875, -0.8125 },
{ 1.53125, -0.859375 },
/* only 4 pairs exist, assume 0s for bad indexes */
};
/* EA-XAS v1, evolution of EA-XA/XAS and cousin of MTA2. Reverse engineered from various .exes/.so
*
* Layout: blocks of 0x4c per channel (128 samples), divided into 4 headers + 4 vertical groups of 15 bytes (for parallelism?).
* Layout: blocks of 0x4c per channel (128 samples), divided into 4 headers + 4 vertical groups of 15 bytes.
* Original code reads all headers first then processes all nibbles (for CPU cache/parallelism/SIMD optimizations).
* To simplify, always decodes the block and discards unneeded samples, so doesn't use external hist. */
void decode_ea_xas_v1(VGMSTREAMCHANNEL * stream, sample * outbuf, int channelspacing, int32_t first_sample, int32_t samples_to_do, int channel) {
int group, row, i;
int samples_done = 0, sample_count = 0;
void decode_ea_xas_v1(VGMSTREAMCHANNEL * stream, sample_t * outbuf, int channelspacing, int32_t first_sample, int32_t samples_to_do, int channel) {
uint8_t frame[0x4c] = {0};
off_t frame_offset;
int group, row, i, samples_done = 0, sample_count = 0;
size_t bytes_per_frame, samples_per_frame;
/* internal interleave */
int block_samples = 128;
first_sample = first_sample % block_samples;
bytes_per_frame = 0x4c;
samples_per_frame = 128;
first_sample = first_sample % samples_per_frame;
frame_offset = stream->offset + bytes_per_frame * channel;
read_streamfile(frame, frame_offset, bytes_per_frame, stream->streamfile); /* ignore EOF errors */
//todo: original code uses float sample buffer:
//- header pcm-hist to float-hist: hist * (1/32768)
//- nibble to signed to float: (int32_t)(pnibble << 28) * SHIFT_MUL_LUT[shift_index]
// look-up table just simplifies ((nibble << 12 << 12) >> 12 + shift) * (1/32768)
// though maybe introduces rounding errors?
//- coefs apply normally, though hists are already floats
//- final float sample isn't clamped
/* process groups */
/* parse group headers */
for (group = 0; group < 4; group++) {
int coef1, coef2;
float coef1, coef2;
int16_t hist1, hist2;
uint8_t shift;
uint32_t group_header = (uint32_t)read_32bitLE(stream->offset + channel*0x4c + group*0x4, stream->streamfile); /* always LE */
uint32_t group_header = (uint32_t)get_32bitLE(frame + group*0x4); /* always LE */
coef1 = EA_XA_TABLE[(uint8_t)(group_header & 0x0F) + 0];
coef2 = EA_XA_TABLE[(uint8_t)(group_header & 0x0F) + 4];
hist2 = (int16_t)(group_header & 0xFFF0);
coef1 = xa_coefs[group_header & 0x0F][0];
coef2 = xa_coefs[group_header & 0x0F][1];
hist2 = (int16_t)((group_header >> 0) & 0xFFF0);
hist1 = (int16_t)((group_header >> 16) & 0xFFF0);
shift = 20 - ((group_header >> 16) & 0x0F);
shift = (group_header >> 16) & 0x0F;
/* write header samples (needed) */
if (sample_count >= first_sample && samples_done < samples_to_do) {
@ -51,12 +82,14 @@ void decode_ea_xas_v1(VGMSTREAMCHANNEL * stream, sample * outbuf, int channelspa
/* process nibbles per group */
for (row = 0; row < 15; row++) {
for (i = 0; i < 1*2; i++) {
uint8_t sample_byte = (uint8_t)read_8bit(stream->offset + channel*0x4c + 4*4 + row*0x04 + group + i/2, stream->streamfile);
uint8_t nibbles = frame[4*4 + row*0x04 + group + i/2];
int sample;
sample = get_nibble_signed(sample_byte, !(i&1)); /* upper first */
sample = sample << shift;
sample = (sample + hist1 * coef1 + hist2 * coef2 + 128) >> 8;
sample = i&1 ? /* high nibble first */
(nibbles >> 0) & 0x0f :
(nibbles >> 4) & 0x0f;
sample = (int16_t)(sample << 12) >> shift; /* 16b sign extend + scale */
sample = sample + hist1 * coef1 + hist2 * coef2;
sample = clamp16(sample);
if (sample_count >= first_sample && samples_done < samples_to_do) {
@ -73,37 +106,43 @@ void decode_ea_xas_v1(VGMSTREAMCHANNEL * stream, sample * outbuf, int channelspa
/* internal interleave (interleaved channels, but manually advances to co-exist with ea blocks) */
if (first_sample + samples_done == block_samples) {
stream->offset += 0x4c * channelspacing;
if (first_sample + samples_done == samples_per_frame) {
stream->offset += bytes_per_frame * channelspacing;
}
}
/* EA-XAS v0, without complex layouts and closer to EA-XA. Somewhat based on daemon1's decoder */
void decode_ea_xas_v0(VGMSTREAMCHANNEL * stream, sample * outbuf, int channelspacing, int32_t first_sample, int32_t samples_to_do, int channel) {
uint8_t frame[0x13] = {0};
off_t frame_offset;
int i;
int block_samples, frames_in, samples_done = 0, sample_count = 0;
int i, frames_in, samples_done = 0, sample_count = 0;
size_t bytes_per_frame, samples_per_frame;
/* external interleave (fixed size), mono */
block_samples = 32;
frames_in = first_sample / block_samples;
first_sample = first_sample % block_samples;
bytes_per_frame = 0x02 + 0x02 + 0x0f;
samples_per_frame = 1 + 1 + 0x0f*2;
frames_in = first_sample / samples_per_frame;
first_sample = first_sample % samples_per_frame;
frame_offset = stream->offset + (0x0f+0x02+0x02)*frames_in;
frame_offset = stream->offset + bytes_per_frame * frames_in;
read_streamfile(frame, frame_offset, bytes_per_frame, stream->streamfile); /* ignore EOF errors */
/* process frames */
//todo see above
/* process frame */
{
int coef1, coef2;
float coef1, coef2;
int16_t hist1, hist2;
uint8_t shift;
uint32_t frame_header = (uint32_t)read_32bitLE(frame_offset, stream->streamfile); /* always LE */
uint32_t frame_header = (uint32_t)get_32bitLE(frame); /* always LE */
coef1 = EA_XA_TABLE[(uint8_t)(frame_header & 0x0F) + 0];
coef2 = EA_XA_TABLE[(uint8_t)(frame_header & 0x0F) + 4];
hist2 = (int16_t)(frame_header & 0xFFF0);
coef1 = xa_coefs[frame_header & 0x0F][0];
coef2 = xa_coefs[frame_header & 0x0F][1];
hist2 = (int16_t)((frame_header >> 0) & 0xFFF0);
hist1 = (int16_t)((frame_header >> 16) & 0xFFF0);
shift = 20 - ((frame_header >> 16) & 0x0F);
shift = (frame_header >> 16) & 0x0F;
/* write header samples (needed) */
if (sample_count >= first_sample && samples_done < samples_to_do) {
@ -119,12 +158,14 @@ void decode_ea_xas_v0(VGMSTREAMCHANNEL * stream, sample * outbuf, int channelspa
/* process nibbles */
for (i = 0; i < 0x0f*2; i++) {
uint8_t sample_byte = (uint8_t)read_8bit(frame_offset + 0x02 + 0x02 + i/2, stream->streamfile);
uint8_t nibbles = frame[0x02 + 0x02 + i/2];
int sample;
sample = get_nibble_signed(sample_byte, !(i&1)); /* upper first */
sample = sample << shift;
sample = (sample + hist1 * coef1 + hist2 * coef2 + 128) >> 8;
sample = i&1 ? /* high nibble first */
(nibbles >> 0) & 0x0f :
(nibbles >> 4) & 0x0f;
sample = (int16_t)(sample << 12) >> shift; /* 16b sign extend + scale */
sample = sample + hist1 * coef1 + hist2 * coef2;
sample = clamp16(sample);
if (sample_count >= first_sample && samples_done < samples_to_do) {