Improve EA-XAS decoding/performance

2025-01-17 23:36:41 +01:00 · 2019-10-06 17:54:26 +02:00 · 2019-10-06 17:54:26 +02:00 · 4af3f6bad6
commit 4af3f6bad6
parent 35f5da2ac3
1 changed files with 78 additions and 37 deletions
--- a/src/coding/ea_xas_decoder.c
+++ b/src/coding/ea_xas_decoder.c
@ -1,6 +1,12 @@
 #include "coding.h"
 #include "../util.h"

+#if 0
+/* Known game code/platforms use a float sample buffer and float coefs, but some approximations found elsewhere use this int math instead:
+ * ...
+ * coef1 = table[index + 0]
+ * coef2 = table[index + 4]
+ * sample = clamp16(((signed_nibble << (20 - shift)) + hist1 * coef1 + hist2 * coef2 + 128) >> 8); */
 static const int EA_XA_TABLE[20] = {
    0,  240,  460,  392,
    0,    0, -208, -220,
@ -8,33 +14,58 @@ static const int EA_XA_TABLE[20] = {
    7,    8,   10,   11,
    0,   -1,   -3,   -4
 };
+#endif

-/* EA-XAS v1, evolution of EA-XA/XAS and cousin of MTA2. From FFmpeg (general info) + MTA2 (layout) + EA-XA (decoding)
+/* standard CD-XA K0/K1 filter coefficient pairs */
+static const float xa_coefs[16][2] = {
+    { 0.0,       0.0      },
+    { 0.9375,    0.0      },
+    { 1.796875, -0.8125   },
+    { 1.53125,  -0.859375 },
+    /* only 4 pairs exist, assume 0s for bad indexes */
+};
+
+/* EA-XAS v1, evolution of EA-XA/XAS and cousin of MTA2. Reverse engineered from various .exes/.so
 *
- * Layout: blocks of 0x4c per channel (128 samples), divided into 4 headers + 4 vertical groups of 15 bytes (for parallelism?).
+ * Layout: blocks of 0x4c per channel (128 samples), divided into 4 headers + 4 vertical groups of 15 bytes.
+ * Original code reads all headers first then processes all nibbles (for CPU cache/parallelism/SIMD optimizations).
 * To simplify, always decodes the block and discards unneeded samples, so doesn't use external hist. */
-void decode_ea_xas_v1(VGMSTREAMCHANNEL * stream, sample * outbuf, int channelspacing, int32_t first_sample, int32_t samples_to_do, int channel) {
-    int group, row, i;
-    int samples_done = 0, sample_count = 0;
+void decode_ea_xas_v1(VGMSTREAMCHANNEL * stream, sample_t * outbuf, int channelspacing, int32_t first_sample, int32_t samples_to_do, int channel) {
+    uint8_t frame[0x4c] = {0};
+    off_t frame_offset;
+    int group, row, i, samples_done = 0, sample_count = 0;
+    size_t bytes_per_frame, samples_per_frame;


    /* internal interleave */
-    int block_samples = 128;
-    first_sample = first_sample % block_samples;
+    bytes_per_frame = 0x4c;
+    samples_per_frame = 128;
+    first_sample = first_sample % samples_per_frame;
+
+    frame_offset = stream->offset + bytes_per_frame * channel;
+    read_streamfile(frame, frame_offset, bytes_per_frame, stream->streamfile); /* ignore EOF errors */
+
+    //todo: original code uses float sample buffer:
+    //- header pcm-hist to float-hist:  hist * (1/32768)
+    //- nibble to signed to float: (int32_t)(pnibble << 28) * SHIFT_MUL_LUT[shift_index]
+    //  look-up table just simplifies ((nibble << 12 << 12) >> 12 + shift) * (1/32768)
+    //  though maybe introduces rounding errors?
+    //- coefs apply normally, though hists are already floats
+    //- final float sample isn't clamped


-    /* process groups */
+    /* parse group headers */
    for (group = 0; group < 4; group++) {
-        int coef1, coef2;
+        float coef1, coef2;
        int16_t hist1, hist2;
        uint8_t shift;
-        uint32_t group_header = (uint32_t)read_32bitLE(stream->offset + channel*0x4c + group*0x4, stream->streamfile); /* always LE */
+        uint32_t group_header = (uint32_t)get_32bitLE(frame + group*0x4); /* always LE */

-        coef1 = EA_XA_TABLE[(uint8_t)(group_header & 0x0F) + 0];
-        coef2 = EA_XA_TABLE[(uint8_t)(group_header & 0x0F) + 4];
-        hist2 = (int16_t)(group_header & 0xFFF0);
+        coef1 = xa_coefs[group_header & 0x0F][0];
+        coef2 = xa_coefs[group_header & 0x0F][1];
+        hist2 = (int16_t)((group_header >>  0) & 0xFFF0);
        hist1 = (int16_t)((group_header >> 16) & 0xFFF0);
-        shift = 20 - ((group_header >> 16) & 0x0F);
+        shift = (group_header >> 16) & 0x0F;

        /* write header samples (needed) */
        if (sample_count >= first_sample && samples_done < samples_to_do) {
@ -51,12 +82,14 @@ void decode_ea_xas_v1(VGMSTREAMCHANNEL * stream, sample * outbuf, int channelspa
        /* process nibbles per group */
        for (row = 0; row < 15; row++) {
            for (i = 0; i < 1*2; i++) {
-                uint8_t sample_byte = (uint8_t)read_8bit(stream->offset + channel*0x4c + 4*4 + row*0x04 + group + i/2, stream->streamfile);
+                uint8_t nibbles = frame[4*4 + row*0x04 + group + i/2];
                int sample;

-                sample = get_nibble_signed(sample_byte, !(i&1)); /* upper first */
-                sample = sample << shift;
-                sample = (sample + hist1 * coef1 + hist2 * coef2 + 128) >> 8;
+                sample = i&1 ? /* high nibble first */
+                        (nibbles >> 0) & 0x0f :
+                        (nibbles >> 4) & 0x0f;
+                sample = (int16_t)(sample << 12) >> shift; /* 16b sign extend + scale */
+                sample = sample + hist1 * coef1 + hist2 * coef2;
                sample = clamp16(sample);

                if (sample_count >= first_sample && samples_done < samples_to_do) {
@ -73,37 +106,43 @@ void decode_ea_xas_v1(VGMSTREAMCHANNEL * stream, sample * outbuf, int channelspa


    /* internal interleave (interleaved channels, but manually advances to co-exist with ea blocks) */
-    if (first_sample + samples_done == block_samples)  {
-        stream->offset += 0x4c * channelspacing;
+    if (first_sample + samples_done == samples_per_frame)  {
+        stream->offset += bytes_per_frame * channelspacing;
    }
 }


 /* EA-XAS v0, without complex layouts and closer to EA-XA. Somewhat based on daemon1's decoder */
 void decode_ea_xas_v0(VGMSTREAMCHANNEL * stream, sample * outbuf, int channelspacing, int32_t first_sample, int32_t samples_to_do, int channel) {
+    uint8_t frame[0x13] = {0};
    off_t frame_offset;
-    int i;
-    int block_samples, frames_in, samples_done = 0, sample_count = 0;
+    int i, frames_in, samples_done = 0, sample_count = 0;
+    size_t bytes_per_frame, samples_per_frame;
+

    /* external interleave (fixed size), mono */
-    block_samples = 32;
-    frames_in = first_sample / block_samples;
-    first_sample = first_sample % block_samples;
+    bytes_per_frame = 0x02 + 0x02 + 0x0f;
+    samples_per_frame = 1 + 1 + 0x0f*2;
+    frames_in = first_sample / samples_per_frame;
+    first_sample = first_sample % samples_per_frame;

-    frame_offset = stream->offset + (0x0f+0x02+0x02)*frames_in;
+    frame_offset = stream->offset + bytes_per_frame * frames_in;
+    read_streamfile(frame, frame_offset, bytes_per_frame, stream->streamfile); /* ignore EOF errors */

-    /* process frames */
+    //todo see above
+
+    /* process frame */
    {
-        int coef1, coef2;
+        float coef1, coef2;
        int16_t hist1, hist2;
        uint8_t shift;
-        uint32_t frame_header = (uint32_t)read_32bitLE(frame_offset, stream->streamfile); /* always LE */
+        uint32_t frame_header = (uint32_t)get_32bitLE(frame); /* always LE */

-        coef1 = EA_XA_TABLE[(uint8_t)(frame_header & 0x0F) + 0];
-        coef2 = EA_XA_TABLE[(uint8_t)(frame_header & 0x0F) + 4];
-        hist2 = (int16_t)(frame_header & 0xFFF0);
+        coef1 = xa_coefs[frame_header & 0x0F][0];
+        coef2 = xa_coefs[frame_header & 0x0F][1];
+        hist2 = (int16_t)((frame_header >>  0) & 0xFFF0);
        hist1 = (int16_t)((frame_header >> 16) & 0xFFF0);
-        shift = 20 - ((frame_header >> 16) & 0x0F);
+        shift = (frame_header >> 16) & 0x0F;

        /* write header samples (needed) */
        if (sample_count >= first_sample && samples_done < samples_to_do) {
@ -119,12 +158,14 @@ void decode_ea_xas_v0(VGMSTREAMCHANNEL * stream, sample * outbuf, int channelspa

        /* process nibbles */
        for (i = 0; i < 0x0f*2; i++) {
-            uint8_t sample_byte = (uint8_t)read_8bit(frame_offset + 0x02 + 0x02 + i/2, stream->streamfile);
+            uint8_t nibbles = frame[0x02 + 0x02 + i/2];
            int sample;

-            sample = get_nibble_signed(sample_byte, !(i&1)); /* upper first */
-            sample = sample << shift;
-            sample = (sample + hist1 * coef1 + hist2 * coef2 + 128) >> 8;
+            sample = i&1 ? /* high nibble first */
+                    (nibbles >> 0) & 0x0f :
+                    (nibbles >> 4) & 0x0f;
+            sample = (int16_t)(sample << 12) >> shift; /* 16b sign extend + scale */
+            sample = sample + hist1 * coef1 + hist2 * coef2;
            sample = clamp16(sample);

            if (sample_count >= first_sample && samples_done < samples_to_do) {