1
0
mirror of synced 2024-11-24 07:30:16 +01:00

Some changes to the precision of audio processing (#94)

* Some changes to the precision of audio processing

* fix clipping problem in resample

Resampling in general, not just librosa.resample, sometimes causes signal clipping.

* fix error
This commit is contained in:
autumnmotor 2023-04-22 20:39:47 +09:00 committed by GitHub
parent c423f77a16
commit 297d92bf5d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 32 additions and 9 deletions

View File

@ -33,7 +33,9 @@ class FeatureInput(object):
self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
def compute_f0(self, path, f0_method): def compute_f0(self, path, f0_method):
x, sr = librosa.load(path, self.fs) # default resample type of librosa.resample is "soxr_hq".
# Quality: soxr_vhq > soxr_hq
x, sr = librosa.load(path, self.fs, res_type='soxr_vhq')
p_len = x.shape[0] // self.hop p_len = x.shape[0] // self.hop
assert sr == self.fs assert sr == self.fs
if f0_method == "pm": if f0_method == "pm":

View File

@ -12,10 +12,10 @@ def load_audio(file, sr):
) # 防止小白拷路径头尾带了空格和"和回车 ) # 防止小白拷路径头尾带了空格和"和回车
out, _ = ( out, _ = (
ffmpeg.input(file, threads=0) ffmpeg.input(file, threads=0)
.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr) .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
) )
except Exception as e: except Exception as e:
raise RuntimeError(f"Failed to load audio: {e}") raise RuntimeError(f"Failed to load audio: {e}")
return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0 return np.frombuffer(out, np.float32).flatten()

View File

@ -98,7 +98,10 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
sampling_rate, self.sampling_rate sampling_rate, self.sampling_rate
) )
) )
audio_norm = audio / self.max_wav_value audio_norm = audio
# audio_norm = audio / self.max_wav_value
# audio_norm = audio / np.abs(audio).max()
audio_norm = audio_norm.unsqueeze(0) audio_norm = audio_norm.unsqueeze(0)
spec_filename = filename.replace(".wav", ".spec.pt") spec_filename = filename.replace(".wav", ".spec.pt")
if os.path.exists(spec_filename): if os.path.exists(spec_filename):
@ -287,7 +290,10 @@ class TextAudioLoader(torch.utils.data.Dataset):
sampling_rate, self.sampling_rate sampling_rate, self.sampling_rate
) )
) )
audio_norm = audio / self.max_wav_value audio_norm = audio
# audio_norm = audio / self.max_wav_value
# audio_norm = audio / np.abs(audio).max()
audio_norm = audio_norm.unsqueeze(0) audio_norm = audio_norm.unsqueeze(0)
spec_filename = filename.replace(".wav", ".spec.pt") spec_filename = filename.replace(".wav", ".spec.pt")
if os.path.exists(spec_filename): if os.path.exists(spec_filename):

View File

@ -59,19 +59,34 @@ class PreProcess:
wavfile.write( wavfile.write(
"%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
self.sr, self.sr,
(tmp_audio * 32768).astype(np.int16), (tmp_audio * 1).astype(np.float32),
) )
tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000)
# default resample type of librosa.resample is "soxr_hq".
# Quality: soxr_vhq > soxr_hq
tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000, res_type="soxr_vhq")
tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (self.max * self.alpha)) + (
1 - self.alpha
) * tmp_audio
wavfile.write(
"%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
self.sr,
(tmp_audio * 1).astype(np.float32),
)
wavfile.write( wavfile.write(
"%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
16000, 16000,
(tmp_audio * 32768).astype(np.int16), (tmp_audio * 1).astype(np.float32),
) )
def pipeline(self, path, idx0): def pipeline(self, path, idx0):
try: try:
audio = load_audio(path, self.sr) audio = load_audio(path, self.sr)
audio = signal.filtfilt(self.bh, self.ah, audio) # a zero-phase digital filter causes pre-ringing noise...
# audio = signal.filtfilt(self.bh, self.ah, audio)
audio = signal.lfilter(self.bh, self.ah, audio)
idx1 = 0 idx1 = 0
for audio in self.slicer.slice(audio): for audio in self.slicer.slice(audio):
i = 0 i = 0