Improve precision of audio processing (#94)

* Improve precision of audio processing
* Fix clipping problem in resampling: resampling in general (not just librosa.resample) sometimes causes signal clipping
* Fix error
2024-11-24 07:30:16 +01:00 · 2023-04-22 20:39:47 +09:00 · 2023-04-22 20:39:47 +09:00 · 297d92bf5d
commit 297d92bf5d
parent c423f77a16
4 changed files with 32 additions and 9 deletions
--- a/extract_f0_print.py
+++ b/extract_f0_print.py
@ -33,7 +33,9 @@ class FeatureInput(object):
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
    def compute_f0(self, path, f0_method):
-        x, sr = librosa.load(path, self.fs)
+        # default resample type of librosa.resample is "soxr_hq".
        # Quality: soxr_vhq > soxr_hq
        x, sr = librosa.load(path, self.fs, res_type='soxr_vhq')
        p_len = x.shape[0] // self.hop
        assert sr == self.fs
        if f0_method == "pm":
--- a/my_utils.py
+++ b/my_utils.py
@ -12,10 +12,10 @@ def load_audio(file, sr):
        )  # 防止小白拷路径头尾带了空格和"和回车
        out, _ = (
            ffmpeg.input(file, threads=0)
-            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
+            .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
        )
    except Exception as e:
        raise RuntimeError(f"Failed to load audio: {e}")
-    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+    return np.frombuffer(out, np.float32).flatten()
--- a/train/data_utils.py
+++ b/train/data_utils.py
@ -98,7 +98,10 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
                    sampling_rate, self.sampling_rate
                )
            )
-        audio_norm = audio / self.max_wav_value
+        audio_norm = audio
 #        audio_norm = audio / self.max_wav_value
 #        audio_norm = audio / np.abs(audio).max()
        audio_norm = audio_norm.unsqueeze(0)
        spec_filename = filename.replace(".wav", ".spec.pt")
        if os.path.exists(spec_filename):
@ -287,7 +290,10 @@ class TextAudioLoader(torch.utils.data.Dataset):
                    sampling_rate, self.sampling_rate
                )
            )
-        audio_norm = audio / self.max_wav_value
+        audio_norm = audio
 #        audio_norm = audio / self.max_wav_value
 #        audio_norm = audio / np.abs(audio).max()
        audio_norm = audio_norm.unsqueeze(0)
        spec_filename = filename.replace(".wav", ".spec.pt")
        if os.path.exists(spec_filename):
--- a/trainset_preprocess_pipeline_print.py
+++ b/trainset_preprocess_pipeline_print.py
@ -59,19 +59,34 @@ class PreProcess:
        wavfile.write(
            "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
            self.sr,
-            (tmp_audio * 32768).astype(np.int16),
+            (tmp_audio * 1).astype(np.float32),
        )
-        tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000)
+
        # default resample type of librosa.resample is "soxr_hq".
        # Quality: soxr_vhq > soxr_hq
        tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000, res_type="soxr_vhq")
        tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (self.max * self.alpha)) + (
            1 - self.alpha
        ) * tmp_audio
        wavfile.write(
            "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
            self.sr,
            (tmp_audio * 1).astype(np.float32),
        )
        wavfile.write(
            "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
            16000,
-            (tmp_audio * 32768).astype(np.int16),
+            (tmp_audio * 1).astype(np.float32),
        )
    def pipeline(self, path, idx0):
        try:
            audio = load_audio(path, self.sr)
-            audio = signal.filtfilt(self.bh, self.ah, audio)
+            # A zero-phase digital filter (filtfilt) causes pre-ringing noise, so a causal filter (lfilter) is used instead.
            # audio = signal.filtfilt(self.bh, self.ah, audio) 
            audio = signal.lfilter(self.bh, self.ah, audio)
            idx1 = 0
            for audio in self.slicer.slice(audio):
                i = 0