Small HCA optimization

This commit is contained in:
bnnm 2021-03-21 00:51:24 +01:00
parent 35a6c5c2f9
commit 9ec4877ee7

View File

@@ -325,26 +325,25 @@ int clHCA_getInfo(clHCA* hca, clHCA_stInfo *info) {
return 0; return 0;
} }
//HCADecoder_DecodeBlockInt32
void clHCA_ReadSamples16(clHCA* hca, signed short *samples) { void clHCA_ReadSamples16(clHCA* hca, signed short *samples) {
const float scale = 32768.0f; const float scale_f = 32768.0f;
float f; float f;
signed int s; signed int s;
unsigned int i, j, k; unsigned int i, j, k;
/* PCM output is generally unused, but lib functions seem to use SIMD for f32 to s32 + round to zero */
for (i = 0; i < HCA_SUBFRAMES_PER_FRAME; i++) { for (i = 0; i < HCA_SUBFRAMES_PER_FRAME; i++) {
for (j = 0; j < HCA_SAMPLES_PER_SUBFRAME; j++) { for (j = 0; j < HCA_SAMPLES_PER_SUBFRAME; j++) {
for (k = 0; k < hca->channels; k++) { for (k = 0; k < hca->channels; k++) {
f = hca->channel[k].wave[i][j]; f = hca->channel[k].wave[i][j];
//f = f * hca->rva_volume; /* rare, won't apply for now */ //f = f * hca->rva_volume; /* rare, won't apply for now */
if (f > 1.0f) { s = (signed int)(f * scale_f);
f = 1.0f; if (s > 32767)
} else if (f < -1.0f) { s = 32767;
f = -1.0f; else if (s < -32768)
} s = -32768;
s = (signed int) (f * scale); *samples++ = (signed short)s;
if ((unsigned) (s + 0x8000) & 0xFFFF0000)
s = (s >> 31) ^ 0x7FFF;
*samples++ = (signed short) s;
} }
} }
} }
@@ -1860,57 +1859,58 @@ static void imdct_transform(stChannel* ch, int subframe) {
static const unsigned int size = HCA_SAMPLES_PER_SUBFRAME; static const unsigned int size = HCA_SAMPLES_PER_SUBFRAME;
static const unsigned int half = HCA_SAMPLES_PER_SUBFRAME / 2; static const unsigned int half = HCA_SAMPLES_PER_SUBFRAME / 2;
static const unsigned int mdct_bits = HCA_MDCT_BITS; static const unsigned int mdct_bits = HCA_MDCT_BITS;
unsigned int i, j, k;
/* This IMDCT (supposedly standard) is all too crafty for me to simplify, see VGAudio (Mdct.Dct4). */
/* pre-pre-rotation(?) */
{ {
unsigned int i, j, k; unsigned int count1 = 1;
unsigned int count1a, count2a, count1b, count2b; unsigned int count2 = half;
const float *temp1a, *temp1b; float* temp1 = ch->spectra;
float *temp2a, *temp2b; float* temp2 = ch->temp;
/* this is all too crafty for me to simplify, see VGAudio (Mdct.Dct4) */
temp1a = ch->spectra;
temp2a = ch->temp;
count1a = 1;
count2a = half;
for (i = 0; i < mdct_bits; i++) { for (i = 0; i < mdct_bits; i++) {
float* swap; float* swap;
float* d1 = &temp2a[0]; float* d1 = &temp2[0];
float* d2 = &temp2a[count2a]; float* d2 = &temp2[count2];
for (j = 0; j < count1a; j++) { for (j = 0; j < count1; j++) {
for (k = 0; k < count2a; k++) { for (k = 0; k < count2; k++) {
float a = *(temp1a++); float a = *(temp1++);
float b = *(temp1a++); float b = *(temp1++);
*(d1++) = b + a; *(d1++) = a + b;
*(d2++) = a - b; *(d2++) = a - b;
} }
d1 += count2a; d1 += count2;
d2 += count2a; d2 += count2;
} }
swap = (float*) temp1a - HCA_SAMPLES_PER_SUBFRAME; /* move spectra/temp to beginning */ swap = temp1 - HCA_SAMPLES_PER_SUBFRAME; /* move spectra or temp to beginning */
temp1a = temp2a; temp1 = temp2;
temp2a = swap; temp2 = swap;
count1a = count1a << 1; count1 = count1 << 1;
count2a = count2a >> 1; count2 = count2 >> 1;
} }
}
{
unsigned int count1 = half;
unsigned int count2 = 1;
float* temp1 = ch->temp;
float* temp2 = ch->spectra;
temp1b = ch->temp;
temp2b = ch->spectra;
count1b = half;
count2b = 1;
for (i = 0; i < mdct_bits; i++) { for (i = 0; i < mdct_bits; i++) {
const float* sin_table = (const float*) sin_tables_hex[i];//todo cleanup const float* sin_table = (const float*) sin_tables_hex[i];//todo cleanup
const float* cos_table = (const float*) cos_tables_hex[i]; const float* cos_table = (const float*) cos_tables_hex[i];
float* swap; float* swap;
float* d1 = temp2b; float* d1 = &temp2[0];
float* d2 = &temp2b[count2b * 2 - 1]; float* d2 = &temp2[count2 * 2 - 1];
const float* s1 = &temp1b[0]; const float* s1 = &temp1[0];
const float* s2 = &temp1b[count2b]; const float* s2 = &temp1[count2];
for (j = 0; j < count1b; j++) { for (j = 0; j < count1; j++) {
for (k = 0; k < count2b; k++) { for (k = 0; k < count2; k++) {
float a = *(s1++); float a = *(s1++);
float b = *(s2++); float b = *(s2++);
float sin = *(sin_table++); float sin = *(sin_table++);
@@ -1918,38 +1918,41 @@ static void imdct_transform(stChannel* ch, int subframe) {
*(d1++) = a * sin - b * cos; *(d1++) = a * sin - b * cos;
*(d2--) = a * cos + b * sin; *(d2--) = a * cos + b * sin;
} }
s1 += count2b; s1 += count2;
s2 += count2b; s2 += count2;
d1 += count2b; d1 += count2;
d2 += count2b * 3; d2 += count2 * 3;
} }
swap = (float*) temp1b; swap = temp1;
temp1b = temp2b; temp1 = temp2;
temp2b = swap; temp2 = swap;
count1b = count1b >> 1; count1 = count1 >> 1;
count2b = count2b << 1; count2 = count2 << 1;
} }
#if 0
/* copy dct */ /* copy dct */
/* (with the above optimization spectra is already modified, so this is redundant) */ /* (with the above optimization spectra is already modified, so this is redundant) */
for (i = 0; i < size; i++) { for (i = 0; i < size; i++) {
ch->dct[i] = ch->spectra[i]; ch->dct[i] = ch->spectra[i];
} }
#endif
} }
/* update output/imdct (lib fuses this with the above) */ /* update output/imdct with overlapped window (lib fuses this with the above) */
{ {
unsigned int i; unsigned int i;
const float* dct = ch->spectra; //ch->dct;
const float* prev = ch->imdct_previous;
for (i = 0; i < half; i++) { for (i = 0; i < half; i++) {
ch->wave[subframe][i] = hcaimdct_window_float[i] * ch->dct[i + half] + ch->imdct_previous[i]; ch->wave[subframe][i] = hcaimdct_window_float[i] * dct[i + half] + prev[i];
ch->wave[subframe][i + half] = hcaimdct_window_float[i + half] * ch->dct[size - 1 - i] - ch->imdct_previous[i + half]; ch->wave[subframe][i + half] = hcaimdct_window_float[i + half] * dct[size - 1 - i] - prev[i + half];
ch->imdct_previous[i] = hcaimdct_window_float[size - 1 - i] * ch->dct[half - i - 1]; ch->imdct_previous[i] = hcaimdct_window_float[size - 1 - i] * dct[half - i - 1];
ch->imdct_previous[i + half] = hcaimdct_window_float[half - i - 1] * ch->dct[i]; ch->imdct_previous[i + half] = hcaimdct_window_float[half - i - 1] * dct[i];
} }
#if 0 #if 0
/* over-optimized IMDCT (for reference), barely noticeable even when decoding hundred of files */ /* over-optimized IMDCT window (for reference), barely noticeable even when decoding hundred of files */
const float* imdct_window = hcaimdct_window_float; const float* imdct_window = hcaimdct_window_float;
const float* dct; const float* dct;
float* imdct_previous; float* imdct_previous;