hca: optimize key test

2025-02-21 04:48:21 +01:00 · 2021-10-19 00:35:29 +02:00 · 2021-10-19 00:35:29 +02:00 · f4c3009a00
commit f4c3009a00
parent 99eb1c328f
1 changed files with 82 additions and 49 deletions
--- a/src/coding/hca_decoder_clhca.c
+++ b/src/coding/hca_decoder_clhca.c
@ -57,9 +57,9 @@
 #define HCA_MAX_FRAME_SIZE 0xFFFF       /* lib max */

 #define HCA_MASK  0x7F7F7F7F            /* chunk obfuscation when the HCA is encrypted with key */
-#define HCA_SUBFRAMES_PER_FRAME  8
+#define HCA_SUBFRAMES  8
 #define HCA_SAMPLES_PER_SUBFRAME  128   /* also spectrum points/etc */
-#define HCA_SAMPLES_PER_FRAME  (HCA_SUBFRAMES_PER_FRAME*HCA_SAMPLES_PER_SUBFRAME)
+#define HCA_SAMPLES_PER_FRAME  (HCA_SUBFRAMES*HCA_SAMPLES_PER_SUBFRAME)
 #define HCA_MDCT_BITS  7                /* (1<<7) = 128 */

 #define HCA_MIN_CHANNELS  1
@ -88,7 +88,7 @@ typedef struct stChannel {
    unsigned int coded_count;                               /* encoded scales/resolutions/coefs */

    /* subframe state */
-    unsigned char intensity[HCA_SUBFRAMES_PER_FRAME];       /* intensity indexes for joins stereo (value max: 15 / 4b) */
+    unsigned char intensity[HCA_SUBFRAMES];                 /* intensity indexes for joint stereo (value max: 15 / 4b) */
    unsigned char scalefactors[HCA_SAMPLES_PER_SUBFRAME];   /* scale indexes (value max: 64 / 6b)*/
    unsigned char resolution[HCA_SAMPLES_PER_SUBFRAME];     /* resolution indexes (value max: 15 / 4b) */
    unsigned char noises[HCA_SAMPLES_PER_SUBFRAME];         /* indexes to coefs that need noise fill + coefs that don't (value max: 128 / 8b) */
@ -96,13 +96,14 @@ typedef struct stChannel {
    unsigned int valid_count;                               /* resolutions with valid values saved in 'noises' */

    float gain[HCA_SAMPLES_PER_SUBFRAME];                   /* gain to apply to quantized spectral data */
-    float spectra[HCA_SAMPLES_PER_SUBFRAME];                /* resulting dequantized data */
+    float spectra[HCA_SUBFRAMES][HCA_SAMPLES_PER_SUBFRAME]; /* resulting dequantized data */
+
    float temp[HCA_SAMPLES_PER_SUBFRAME];                   /* temp for DCT-IV */
    float dct[HCA_SAMPLES_PER_SUBFRAME];                    /* result of DCT-IV */
    float imdct_previous[HCA_SAMPLES_PER_SUBFRAME];         /* IMDCT */

    /* frame state */
-    float wave[HCA_SUBFRAMES_PER_FRAME][HCA_SAMPLES_PER_SUBFRAME];  /* resulting samples */
+    float wave[HCA_SUBFRAMES][HCA_SAMPLES_PER_SUBFRAME];  /* resulting samples */
 } stChannel;

 typedef struct clHCA {
@ -333,7 +334,7 @@ void clHCA_ReadSamples16(clHCA* hca, signed short *samples) {
    unsigned int i, j, k;

    /* PCM output is generally unused, but lib functions seem to use SIMD for f32 to s32 + round to zero */
-    for (i = 0; i < HCA_SUBFRAMES_PER_FRAME; i++) {
+    for (i = 0; i < HCA_SUBFRAMES; i++) {
        for (j = 0; j < HCA_SAMPLES_PER_SUBFRAME; j++) {
            for (k = 0; k < hca->channels; k++) {
                f = hca->channel[k].wave[i][j];
@ -989,8 +990,12 @@ void clHCA_SetKey(clHCA* hca, unsigned long long keycode) {
    }
 }

+static int clHCA_DecodeBlock_unpack(clHCA* hca, void *data, unsigned int size);
+static void clHCA_DecodeBlock_transform(clHCA* hca);
+
+
 int clHCA_TestBlock(clHCA* hca, void *data, unsigned int size) {
-    const int frame_samples = HCA_SUBFRAMES_PER_FRAME * HCA_SAMPLES_PER_SUBFRAME;
+    const int frame_samples = HCA_SUBFRAMES * HCA_SAMPLES_PER_SUBFRAME;
    const float scale = 32768.0f;
    unsigned int i, ch, sf, s;
    int status;
@ -1014,7 +1019,7 @@ int clHCA_TestBlock(clHCA* hca, void *data, unsigned int size) {
    }

    /* return if decode fails (happens often with wrong keys due to bad bitstream values) */
-    status = clHCA_DecodeBlock(hca, data, size);
+    status = clHCA_DecodeBlock_unpack(hca, data, size);
    if (status < 0)
        return -1;

@ -1042,8 +1047,9 @@ int clHCA_TestBlock(clHCA* hca, void *data, unsigned int size) {
    }

    /* check decode results as (rarely) bad keys may still get here */
+    clHCA_DecodeBlock_transform(hca);
    for (ch = 0; ch < hca->channels; ch++) {
-        for (sf = 0; sf < HCA_SUBFRAMES_PER_FRAME; sf++) {
+        for (sf = 0; sf < HCA_SUBFRAMES; sf++) {
            for (s = 0; s < HCA_SAMPLES_PER_SUBFRAME; s++) {
                float fsample = hca->channel[ch].wave[sf][s];

@ -1095,15 +1101,15 @@ void clHCA_DecodeReset(clHCA * hca) {
        stChannel* ch = &hca->channel[i];

        /* most values get overwritten during decode */
-        //memset(ch->intensity, 0, sizeof(ch->intensity[0]) * HCA_SUBFRAMES_PER_FRAME);
+        //memset(ch->intensity, 0, sizeof(ch->intensity[0]) * HCA_SUBFRAMES);
        //memset(ch->scalefactors, 0, sizeof(ch->scalefactors[0]) * HCA_SAMPLES_PER_SUBFRAME);
        //memset(ch->resolution, 0, sizeof(ch->resolution[0]) * HCA_SAMPLES_PER_SUBFRAME);
        //memset(ch->gain, 0, sizeof(ch->gain[0]) * HCA_SAMPLES_PER_SUBFRAME);
-        //memset(ch->spectra, 0, sizeof(ch->spectra[0]) * HCA_SAMPLES_PER_SUBFRAME);
+        //memset(ch->spectra, 0, sizeof(ch->spectra[0][0]) * HCA_SUBFRAMES * HCA_SAMPLES_PER_SUBFRAME);
        //memset(ch->temp, 0, sizeof(ch->temp[0]) * HCA_SAMPLES_PER_SUBFRAME);
        //memset(ch->dct, 0, sizeof(ch->dct[0]) * HCA_SAMPLES_PER_SUBFRAME);
        memset(ch->imdct_previous, 0, sizeof(ch->imdct_previous[0]) * HCA_SAMPLES_PER_SUBFRAME);
-        //memset(ch->wave, 0, sizeof(ch->wave[0][0]) * HCA_SUBFRAMES_PER_FRAME * HCA_SUBFRAMES_PER_FRAME);
+        //memset(ch->wave, 0, sizeof(ch->wave[0][0]) * HCA_SUBFRAMES * HCA_SAMPLES_PER_SUBFRAME);
    }
 }

@ -1119,23 +1125,21 @@ static void calculate_resolution(stChannel* ch, unsigned int packed_noise_level,

 static void calculate_gain(stChannel* ch);

-static void dequantize_coefficients(stChannel* ch, clData* br);
+static void dequantize_coefficients(stChannel* ch, clData* br, int subframe);

-static void reconstruct_noise(stChannel* ch, unsigned int min_resolution, unsigned int ms_stereo, unsigned int* random_p);
+static void reconstruct_noise(stChannel* ch, unsigned int min_resolution, unsigned int ms_stereo, unsigned int* random_p, int subframe);

 static void reconstruct_high_frequency(stChannel* ch, unsigned int hfr_group_count, unsigned int bands_per_hfr_group,
-        unsigned int stereo_band_count, unsigned int base_band_count, unsigned int total_band_count, unsigned int version);
+        unsigned int stereo_band_count, unsigned int base_band_count, unsigned int total_band_count, unsigned int version, int subframe);

 static void apply_intensity_stereo(stChannel* ch_pair, int subframe, unsigned int base_band_count, unsigned int total_band_count);

-static void apply_ms_stereo(stChannel* ch_pair, unsigned int ms_stereo, unsigned int base_band_count, unsigned int total_band_count);
+static void apply_ms_stereo(stChannel* ch_pair, unsigned int ms_stereo, unsigned int base_band_count, unsigned int total_band_count, int subframe);

 static void imdct_transform(stChannel* ch, int subframe);


-/* takes HCA data and decodes all of a frame's samples */
-//hcadecoder_decode_block
-int clHCA_DecodeBlock(clHCA* hca, void *data, unsigned int size) {
+static int clHCA_DecodeBlock_unpack(clHCA* hca, void *data, unsigned int size) {
    clData br;
    unsigned short sync;
    unsigned int subframe, ch;
@ -1180,19 +1184,29 @@ int clHCA_DecodeBlock(clHCA* hca, void *data, unsigned int size) {
    }

    /* lib seems to use a state value to skip parts (unpacking/subframe N/etc) as needed */
-    for (subframe = 0; subframe < HCA_SUBFRAMES_PER_FRAME; subframe++) {
+    for (subframe = 0; subframe < HCA_SUBFRAMES; subframe++) {

        /* unpack channel data and get dequantized spectra */
        for (ch = 0; ch < hca->channels; ch++){
-            dequantize_coefficients(&hca->channel[ch], &br);
+            dequantize_coefficients(&hca->channel[ch], &br, subframe);
        }

+        /* original code transforms the subframe here, but we defer that to clHCA_DecodeBlock_transform */
+    }
+
+    return br.bit; /* number of bits read, for validation */
+}
+
+static void clHCA_DecodeBlock_transform(clHCA* hca) {
+    unsigned int subframe, ch;
+
+    for (subframe = 0; subframe < HCA_SUBFRAMES; subframe++) {
        /* restore missing bands from spectra */
        for (ch = 0; ch < hca->channels; ch++) {
-            reconstruct_noise(&hca->channel[ch], hca->min_resolution, hca->ms_stereo, &hca->random);
+            reconstruct_noise(&hca->channel[ch], hca->min_resolution, hca->ms_stereo, &hca->random, subframe);

            reconstruct_high_frequency(&hca->channel[ch], hca->hfr_group_count, hca->bands_per_hfr_group,
-                    hca->stereo_band_count, hca->base_band_count, hca->total_band_count, hca->version);
+                    hca->stereo_band_count, hca->base_band_count, hca->total_band_count, hca->version, subframe);
        }

        /* restore missing joint stereo bands */
@ -1200,7 +1214,7 @@ int clHCA_DecodeBlock(clHCA* hca, void *data, unsigned int size) {
            for (ch = 0; ch < hca->channels - 1; ch++) {
                apply_intensity_stereo(&hca->channel[ch], subframe, hca->base_band_count, hca->total_band_count);

-                apply_ms_stereo(&hca->channel[ch], hca->ms_stereo, hca->base_band_count, hca->total_band_count);
+                apply_ms_stereo(&hca->channel[ch], hca->ms_stereo, hca->base_band_count, hca->total_band_count, subframe);
            }
        }

@ -1209,9 +1223,27 @@ int clHCA_DecodeBlock(clHCA* hca, void *data, unsigned int size) {
            imdct_transform(&hca->channel[ch], subframe);
        }
    }
+}


-    return br.bit; /* numbers of read bits for validations */
+/* takes HCA data and decodes all of a frame's samples (unpack, then transform); returns the unpack result */
+//hcadecoder_decode_block
+int clHCA_DecodeBlock(clHCA* hca, void *data, unsigned int size) {
+    int res;
+
+    /* Original HCA code doesn't separate unpack + transform (it unpacks most data, then
+     * for each subframe reads that subframe's spectra and transforms it right away).
+     *
+     * Unpacking first takes a bit more memory (1 spectra per subframe) but tests keys faster
+     * (since unpack may fail with bad keys we can skip transform). For regular decoding, this
+     * way somehow is slightly faster?  (~3-5%, extra compiler optimizations with reduced scope?) */
+
+    res = clHCA_DecodeBlock_unpack(hca, data, size);
+    if (res < 0) /* unpack failed: pass the error through and skip the transform */
+        return res;
+    clHCA_DecodeBlock_transform(hca);
+
+    return res; /* non-negative value from unpack (br.bit there: bits read) */
 }

 //--------------------------------------------------
@ -1330,7 +1362,7 @@ static int unpack_intensity(stChannel* ch, clData* br, unsigned int hfr_group_co
            ch->intensity[0] = value;
            if (value < 15) {
                bitreader_skip(br, 4);
-                for (i = 1; i < HCA_SUBFRAMES_PER_FRAME; i++) {
+                for (i = 1; i < HCA_SUBFRAMES; i++) {
                    ch->intensity[i] = bitreader_read(br, 4);
                }
            }
@ -1352,7 +1384,7 @@ static int unpack_intensity(stChannel* ch, clData* br, unsigned int hfr_group_co
                ch->intensity[0] = value;
                if (delta_bits == 3) { /* 3+1 = 4b */
                    /* fixed intensities */
-                    for (i = 1; i < HCA_SUBFRAMES_PER_FRAME; i++) {
+                    for (i = 1; i < HCA_SUBFRAMES; i++) {
                        ch->intensity[i] = bitreader_read(br, 4);
                    }
                }
@ -1361,7 +1393,7 @@ static int unpack_intensity(stChannel* ch, clData* br, unsigned int hfr_group_co
                    unsigned char bmax = (2 << delta_bits) - 1;
                    unsigned char bits = delta_bits + 1;

-                    for (i = 1; i < HCA_SUBFRAMES_PER_FRAME; i++) {
+                    for (i = 1; i < HCA_SUBFRAMES; i++) {
                        unsigned char delta = bitreader_read(br, bits);
                        if (delta == bmax) {
                            value = bitreader_read(br, 4); /* encoded */
@ -1378,7 +1410,7 @@ static int unpack_intensity(stChannel* ch, clData* br, unsigned int hfr_group_co
            }
            else {
                bitreader_skip(br, 4);
-                for (i = 0; i < HCA_SUBFRAMES_PER_FRAME; i++) {
+                for (i = 0; i < HCA_SUBFRAMES; i++) {
                    ch->intensity[i] = 7;
                }
            }
@ -1498,7 +1530,7 @@ static const float hcatbdecoder_read_val_table[128] = {
 };

 /* read spectral coefficients in the bitstream */
-static void dequantize_coefficients(stChannel* ch, clData* br) {
+static void dequantize_coefficients(stChannel* ch, clData* br, int subframe) {
    int i;
    unsigned int cc_count = ch->coded_count;

@ -1524,11 +1556,11 @@ static void dequantize_coefficients(stChannel* ch, clData* br) {
        }

        /* dequantize coef with gain */
-        ch->spectra[i] = ch->gain[i] * qc;
+        ch->spectra[subframe][i] = ch->gain[i] * qc;
    }

    /* clean rest of spectra */
-    memset(&ch->spectra[cc_count], 0, sizeof(ch->spectra[0]) * (HCA_SAMPLES_PER_SUBFRAME - cc_count));
+    memset(&ch->spectra[subframe][cc_count], 0, sizeof(ch->spectra[subframe][0]) * (HCA_SAMPLES_PER_SUBFRAME - cc_count));
 }


@ -1560,7 +1592,7 @@ static const float* hcadecoder_scale_conversion_table = (const float*)hcadecoder

 /* recreate resolution 0 coefs (not encoded) with pseudo-random noise based on
 * other coefs/scales (probably similar to AAC's perceptual noise substitution) */
-static void reconstruct_noise(stChannel* ch, unsigned int min_resolution, unsigned int ms_stereo, unsigned int* random_p) {
+static void reconstruct_noise(stChannel* ch, unsigned int min_resolution, unsigned int ms_stereo, unsigned int* random_p, int subframe) {
    if (min_resolution > 0) /* added in v3.0 */
        return;
    if (ch->valid_count <= 0 || ch->noise_count <= 0)
@ -1587,7 +1619,8 @@ static void reconstruct_noise(stChannel* ch, unsigned int min_resolution, unsign
            sf_valid = ch->scalefactors[valid_index];
            sc_index = (sf_noise - sf_valid + 62) & ~((sf_noise - sf_valid + 62) >> 31);

-            ch->spectra[noise_index] = hcadecoder_scale_conversion_table[sc_index] * ch->spectra[valid_index];
+            ch->spectra[subframe][noise_index] = 
+                hcadecoder_scale_conversion_table[sc_index] * ch->spectra[subframe][valid_index];
        }

        *random_p = random; /* lib saves this in the bitreader, maybe for simplified passing around */
@ -1596,7 +1629,7 @@ static void reconstruct_noise(stChannel* ch, unsigned int min_resolution, unsign

 /* recreate missing coefs in high bands based on lower bands (probably similar to AAC's spectral band replication) */
 static void reconstruct_high_frequency(stChannel* ch, unsigned int hfr_group_count, unsigned int bands_per_hfr_group,
-        unsigned int stereo_band_count, unsigned int base_band_count, unsigned int total_band_count, unsigned int version) {
+        unsigned int stereo_band_count, unsigned int base_band_count, unsigned int total_band_count, unsigned int version, int subframe) {
    if (bands_per_hfr_group == 0) /* added in v2.0, skipped in v2.0 files with 0 bands too */
        return;
    if (ch->type == STEREO_SECONDARY)
@ -1630,7 +1663,7 @@ static void reconstruct_high_frequency(stChannel* ch, unsigned int hfr_group_cou
                sc_index = hfr_scales[group] - ch->scalefactors[lowband] + 63;
                sc_index = sc_index & ~(sc_index >> 31); /* clamped in v3.0 lib (in theory 6b sf are 0..128) */

-                ch->spectra[highband] = hcadecoder_scale_conversion_table[sc_index] * ch->spectra[lowband];
+                ch->spectra[subframe][highband] = hcadecoder_scale_conversion_table[sc_index] * ch->spectra[subframe][lowband];

                highband += 1;
                lowband -= lowband_sub;
@ -1638,7 +1671,7 @@ static void reconstruct_high_frequency(stChannel* ch, unsigned int hfr_group_cou
        }

        /* last spectrum coefficient is 0 (normally highband = 128, but perhaps could 'break' before) */
-        ch->spectra[highband - 1] = 0.0f;
+        ch->spectra[subframe][highband - 1] = 0.0f;
    }
 }

@ -1661,8 +1694,8 @@ static void apply_intensity_stereo(stChannel* ch_pair, int subframe, unsigned in
        int band;
        float ratio_l = hcadecoder_intensity_ratio_table[ ch_pair[1].intensity[subframe] ];
        float ratio_r = 2.0f - ratio_l; /* correct, though other decoders substract 2.0 (it does use 'fsubr 2.0' and such) */
-        float* sp_l = ch_pair[0].spectra;
-        float* sp_r = ch_pair[1].spectra;
+        float* sp_l = &ch_pair[0].spectra[subframe][0];
+        float* sp_r = &ch_pair[1].spectra[subframe][0];

        for (band = base_band_count; band < total_band_count; band++) {
            float coef_l = sp_l[band] * ratio_l;
@ -1674,7 +1707,7 @@ static void apply_intensity_stereo(stChannel* ch_pair, int subframe, unsigned in
 }

 /* restore L/R bands based on mid channel + side differences */
-static void apply_ms_stereo(stChannel* ch_pair, unsigned int ms_stereo, unsigned int base_band_count, unsigned int total_band_count) {
+static void apply_ms_stereo(stChannel* ch_pair, unsigned int ms_stereo, unsigned int base_band_count, unsigned int total_band_count, int subframe) {
    if (!ms_stereo) /* added in v3.0 */
        return;
    if (ch_pair[0].type != STEREO_PRIMARY)
@ -1683,8 +1716,8 @@ static void apply_ms_stereo(stChannel* ch_pair, unsigned int ms_stereo, unsigned
    {
        int band;
        const float ratio = 0.70710676908493; /* 0x3F3504F3 */
-        float* sp_l = ch_pair[0].spectra;
-        float* sp_r = ch_pair[1].spectra;
+        float* sp_l = &ch_pair[0].spectra[subframe][0];
+        float* sp_r = &ch_pair[1].spectra[subframe][0];

        for (band = base_band_count; band < total_band_count; band++) {
            float coef_l = (sp_l[band] + sp_r[band]) * ratio;
@ -1867,8 +1900,8 @@ static void imdct_transform(stChannel* ch, int subframe) {
    {
        unsigned int count1 = 1;
        unsigned int count2 = half;
-        float* temp1 = ch->spectra;
-        float* temp2 = ch->temp;
+        float* temp1 = &ch->spectra[subframe][0];
+        float* temp2 = &ch->temp[0];

        for (i = 0; i < mdct_bits; i++) {
            float* swap;
@ -1897,8 +1930,8 @@ static void imdct_transform(stChannel* ch, int subframe) {
    {
        unsigned int count1 = half;
        unsigned int count2 = 1;
-        float* temp1 = ch->temp;
-        float* temp2 = ch->spectra;
+        float* temp1 = &ch->temp[0];
+        float* temp2 = &ch->spectra[subframe][0];

        for (i = 0; i < mdct_bits; i++) {
            const float* sin_table = (const float*) sin_tables_hex[i];//todo cleanup
@ -1934,15 +1967,15 @@ static void imdct_transform(stChannel* ch, int subframe) {
        /* copy dct */
        /* (with the above optimization spectra is already modified, so this is redundant) */
        for (i = 0; i < size; i++) {
-            ch->dct[i] = ch->spectra[i];
+            ch->dct[i] = ch->spectra[subframe][i];
        }
 #endif
    }

    /* update output/imdct with overlapped window (lib fuses this with the above) */
    {
-        const float* dct = ch->spectra; //ch->dct;
-        const float* prev = ch->imdct_previous;
+        const float* dct = &ch->spectra[subframe][0]; //ch->dct;
+        const float* prev = &ch->imdct_previous[0];

        for (i = 0; i < half; i++) {
            ch->wave[subframe][i] = hcaimdct_window_float[i] * dct[i + half] + prev[i];