diff --git a/gui_v1.py b/gui_v1.py index e5c6757..86b52d3 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -38,10 +38,14 @@ def phase_vocoder(a, b, fade_out, fade_in): deltaphase = deltaphase - 2 * np.pi * torch.floor(deltaphase / 2 / np.pi + 0.5) w = 2 * np.pi * torch.arange(n // 2 + 1).to(a) + deltaphase t = torch.arange(n).unsqueeze(-1).to(a) / n - result = a * (fade_out ** 2) + b * (fade_in ** 2) + torch.sum(absab * torch.cos(w * t + phia), -1) * window / n + result = ( + a * (fade_out**2) + + b * (fade_in**2) + + torch.sum(absab * torch.cos(w * t + phia), -1) * window / n + ) return result - + class Harvest(multiprocessing.Process): def __init__(self, inp_q, opt_q): multiprocessing.Process.__init__(self) @@ -592,11 +596,11 @@ if __name__ == "__main__": self.gui_config.pth_path = values["pth_path"] self.gui_config.index_path = values["index_path"] self.gui_config.sr_type = ["sr_model", "sr_device"][ - [ - values["sr_model"], - values["sr_device"], - ].index(True) - ] + [ + values["sr_model"], + values["sr_device"], + ].index(True) + ] self.gui_config.threhold = values["threhold"] self.gui_config.pitch = values["pitch"] self.gui_config.block_time = values["block_time"] @@ -633,7 +637,11 @@ if __name__ == "__main__": self.config, self.rvc if hasattr(self, "rvc") else None, ) - self.gui_config.samplerate = self.rvc.tgt_sr if self.gui_config.sr_type == "sr_model" else self.get_device_samplerate() + self.gui_config.samplerate = ( + self.rvc.tgt_sr + if self.gui_config.sr_type == "sr_model" + else self.get_device_samplerate() + ) self.zc = self.gui_config.samplerate // 100 self.block_frame = ( int( @@ -690,7 +698,9 @@ if __name__ == "__main__": 2 * self.zc, device=self.config.device, dtype=torch.float32 ) self.skip_head = self.extra_frame // self.zc - self.return_length = (self.block_frame + self.sola_buffer_frame + self.sola_search_frame) // self.zc + self.return_length = ( + self.block_frame + self.sola_buffer_frame + self.sola_search_frame + ) // self.zc self.fade_in_window: torch.Tensor = ( torch.sin( 0.5 @@ -824,7 +834,11 @@ if __name__ == "__main__": # volume envelop mixing if self.gui_config.rms_mix_rate < 1 and self.function == "vc": rms1 = librosa.feature.rms( - y=self.input_wav_res[160 * self.skip_head : 160 * (self.skip_head + self.return_length)] + y=self.input_wav_res[ + 160 + * self.skip_head : 160 + * (self.skip_head + self.return_length) + ] .cpu() .numpy(), frame_length=640, @@ -871,21 +885,24 @@ if __name__ == "__main__": else: sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0]) printt("sola_offset = %d", int(sola_offset)) - infer_wav = infer_wav[sola_offset :] + infer_wav = infer_wav[sola_offset:] if "privateuseone" in str(self.config.device) or not self.gui_config.use_pv: infer_wav[: self.sola_buffer_frame] *= self.fade_in_window - infer_wav[: self.sola_buffer_frame] += self.sola_buffer * self.fade_out_window + infer_wav[: self.sola_buffer_frame] += ( + self.sola_buffer * self.fade_out_window + ) else: infer_wav[: self.sola_buffer_frame] = phase_vocoder( - self.sola_buffer, - infer_wav[: self.sola_buffer_frame], - self.fade_out_window, - self.fade_in_window) - self.sola_buffer[:] = infer_wav[self.block_frame : self.block_frame + self.sola_buffer_frame] - if sys.platform == "darwin": - outdata[:] = ( - infer_wav[: self.block_frame].cpu().numpy()[:, np.newaxis] + self.sola_buffer, + infer_wav[: self.sola_buffer_frame], + self.fade_out_window, + self.fade_in_window, ) + self.sola_buffer[:] = infer_wav[ + self.block_frame : self.block_frame + self.sola_buffer_frame + ] + if sys.platform == "darwin": + outdata[:] = infer_wav[: self.block_frame].cpu().numpy()[:, np.newaxis] else: outdata[:] = ( infer_wav[: self.block_frame].repeat(2, 1).t().cpu().numpy() @@ -930,7 +947,7 @@ if __name__ == "__main__": input_devices_indices, output_devices_indices, ) - + def set_devices(self, input_device, output_device): """设置输出设备""" ( @@ -947,8 +964,10 @@ if __name__ == "__main__": ] printt("Input device: %s:%s", str(sd.default.device[0]), input_device) printt("Output device: %s:%s", str(sd.default.device[1]), output_device) - + def get_device_samplerate(self): - return int(sd.query_devices(device=sd.default.device[0])['default_samplerate']) - + return int( + sd.query_devices(device=sd.default.device[0])["default_samplerate"] + ) + gui = GUI() diff --git a/infer/lib/infer_pack/models.py b/infer/lib/infer_pack/models.py index a81c1de..e489634 100644 --- a/infer/lib/infer_pack/models.py +++ b/infer/lib/infer_pack/models.py @@ -795,9 +795,9 @@ class SynthesizerTrnMs256NSFsid(nn.Module): assert isinstance(return_length, torch.Tensor) head = int(skip_head.item()) length = int(return_length.item()) - z_p = z_p[:, :, head: head + length] - x_mask = x_mask[:, :, head: head + length] - nsff0 = nsff0[:, head: head + length] + z_p = z_p[:, :, head : head + length] + x_mask = x_mask[:, :, head : head + length] + nsff0 = nsff0[:, head : head + length] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, nsff0, g=g) return o, x_mask, (z, z_p, m_p, logs_p) @@ -957,9 +957,9 @@ class SynthesizerTrnMs768NSFsid(nn.Module): assert isinstance(return_length, torch.Tensor) head = int(skip_head.item()) length = int(return_length.item()) - z_p = z_p[:, :, head: head + length] - x_mask = x_mask[:, :, head: head + length] - nsff0 = nsff0[:, head: head + length] + z_p = z_p[:, :, head : head + length] + x_mask = x_mask[:, :, head : head + length] + nsff0 = nsff0[:, head : head + length] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, nsff0, g=g) return o, x_mask, (z, z_p, m_p, logs_p) @@ -1108,8 +1108,8 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module): assert isinstance(return_length, torch.Tensor) head = int(skip_head.item()) length = int(return_length.item()) - z_p = z_p[:, :, head: head + length] - x_mask = x_mask[:, :, head: head + length] + z_p = z_p[:, :, head : head + length] + x_mask = x_mask[:, :, head : head + length] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, g=g) return o, x_mask, (z, z_p, m_p, logs_p) @@ -1258,8 +1258,8 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module): assert isinstance(return_length, torch.Tensor) head = int(skip_head.item()) length = int(return_length.item()) - z_p = z_p[:, :, head: head + length] - x_mask = x_mask[:, :, head: head + length] + z_p = z_p[:, :, head : head + length] + x_mask = x_mask[:, :, head : head + length] z = self.flow(z_p, x_mask, g=g, reverse=True) o = self.dec(z * x_mask, g=g) return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/infer/lib/train/mel_processing.py b/infer/lib/train/mel_processing.py index 14a960f..3751f1e 100644 --- a/infer/lib/train/mel_processing.py +++ b/infer/lib/train/mel_processing.py @@ -38,6 +38,7 @@ def spectral_de_normalize_torch(magnitudes): mel_basis = {} hann_window = {} + def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): """Convert waveform into Linear-frequency Linear-amplitude spectrogram. @@ -51,7 +52,7 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) Returns: :: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram """ - + # Window - Cache if needed global hann_window dtype_device = str(y.dtype) + "_" + str(y.device) @@ -60,7 +61,7 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( dtype=y.dtype, device=y.device ) - + # Padding y = torch.nn.functional.pad( y.unsqueeze(1), @@ -68,7 +69,7 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) mode="reflect", ) y = y.squeeze(1) - + # Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2) spec = torch.stft( y, @@ -82,11 +83,12 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) onesided=True, return_complex=True, ) - + # Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame) spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6) return spec + def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): # MelBasis - Cache if needed global mel_basis diff --git a/tools/rvc_for_realtime.py b/tools/rvc_for_realtime.py index 257c44d..ff1ea88 100644 --- a/tools/rvc_for_realtime.py +++ b/tools/rvc_for_realtime.py @@ -46,22 +46,23 @@ def printt(strr, *args): # config.is_half=False########强制cpu测试 class RVC: def __init__( - self, - key, - pth_path, - index_path, - index_rate, - n_cpu, - inp_q, - opt_q, - config: Config, - last_rvc=None, + self, + key, + pth_path, + index_path, + index_rate, + n_cpu, + inp_q, + opt_q, + config: Config, + last_rvc=None, ) -> None: """ 初始化 """ try: if config.dml == True: + def forward_dml(ctx, x, scale): ctx.scale = scale res = x.clone().detach() @@ -92,7 +93,7 @@ class RVC: self.index_rate = index_rate self.cache_pitch: np.ndarray = np.zeros(1024, dtype="int32") self.cache_pitchf = np.zeros(1024, dtype="float32") - + if last_rvc is None: models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( ["assets/hubert/hubert_base.pt"], @@ -201,7 +202,7 @@ class RVC: f0bak = f0.copy() f0_mel = 1127 * np.log(1 + f0 / 700) f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / ( - self.f0_mel_max - self.f0_mel_min + self.f0_mel_max - self.f0_mel_min ) + 1 f0_mel[f0_mel <= 1] = 1 f0_mel[f0_mel > 255] = 255 @@ -258,7 +259,7 @@ class RVC: self.inp_q.put((idx, x[:tail], res_f0, n_cpu, ts)) else: self.inp_q.put( - (idx, x[part_length * idx - 320: tail], res_f0, n_cpu, ts) + (idx, x[part_length * idx - 320 : tail], res_f0, n_cpu, ts) ) while 1: res_ts = self.opt_q.get() @@ -273,7 +274,7 @@ class RVC: else: f0 = f0[2:] f0bak[ - part_length * idx // 160: part_length * idx // 160 + f0.shape[0] + part_length * idx // 160 : part_length * idx // 160 + f0.shape[0] ] = f0 f0bak = signal.medfilt(f0bak, 3) f0bak *= pow(2, f0_up_key / 12) @@ -320,6 +321,7 @@ class RVC: def get_f0_fcpe(self, x, f0_up_key): if hasattr(self, "model_fcpe") == False: from torchfcpe import spawn_bundled_infer_model + printt("Loading fcpe model") if "privateuseone" in str(self.device): self.device_fcpe = "cpu" @@ -329,7 +331,7 @@ class RVC: f0 = self.model_fcpe.infer( x.to(self.device_fcpe).unsqueeze(0).float(), sr=16000, - decoder_mode='local_argmax', + decoder_mode="local_argmax", threshold=0.006, ) f0 *= pow(2, f0_up_key / 12) @@ -337,12 +339,12 @@ class RVC: return self.get_f0_post(f0) def infer( - self, - input_wav: torch.Tensor, - block_frame_16k, - skip_head, - return_length, - f0method, + self, + input_wav: torch.Tensor, + block_frame_16k, + skip_head, + return_length, + f0method, ) -> np.ndarray: t1 = ttime() with torch.no_grad(): @@ -364,16 +366,16 @@ class RVC: t2 = ttime() try: if hasattr(self, "index") and self.index_rate != 0: - npy = feats[0][skip_head // 2:].cpu().numpy().astype("float32") + npy = feats[0][skip_head // 2 :].cpu().numpy().astype("float32") score, ix = self.index.search(npy, k=8) weight = np.square(1 / score) weight /= weight.sum(axis=1, keepdims=True) npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) if self.config.is_half: npy = npy.astype("float16") - feats[0][skip_head // 2:] = ( - torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate - + (1 - self.index_rate) * feats[0][skip_head // 2:] + feats[0][skip_head // 2 :] = ( + torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate + + (1 - self.index_rate) * feats[0][skip_head // 2 :] ) else: printt("Index search FAILED or disabled") @@ -384,21 +386,29 @@ class RVC: if self.if_f0 == 1: f0_extractor_frame = block_frame_16k + 800 if f0method == "rmvpe": - f0_extractor_frame = ( - 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160 - ) - pitch, pitchf = self.get_f0(input_wav[-f0_extractor_frame: ], self.f0_up_key, self.n_cpu, f0method) + f0_extractor_frame = 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160 + pitch, pitchf = self.get_f0( + input_wav[-f0_extractor_frame:], self.f0_up_key, self.n_cpu, f0method + ) start_frame = block_frame_16k // 160 end_frame = len(self.cache_pitch) - (pitch.shape[0] - 4) + start_frame - self.cache_pitch[:] = np.append(self.cache_pitch[start_frame: end_frame], pitch[3:-1]) + self.cache_pitch[:] = np.append( + self.cache_pitch[start_frame:end_frame], pitch[3:-1] + ) self.cache_pitchf[:] = np.append( - self.cache_pitchf[start_frame: end_frame], pitchf[3:-1] + self.cache_pitchf[start_frame:end_frame], pitchf[3:-1] ) t4 = ttime() p_len = input_wav.shape[0] // 160 if self.if_f0 == 1: - cache_pitch = torch.LongTensor(self.cache_pitch[-p_len: ]).to(self.device).unsqueeze(0) - cache_pitchf = torch.FloatTensor(self.cache_pitchf[-p_len: ]).to(self.device).unsqueeze(0) + cache_pitch = ( + torch.LongTensor(self.cache_pitch[-p_len:]).to(self.device).unsqueeze(0) + ) + cache_pitchf = ( + torch.FloatTensor(self.cache_pitchf[-p_len:]) + .to(self.device) + .unsqueeze(0) + ) feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) feats = feats[:, :p_len, :] p_len = torch.LongTensor([p_len]).to(self.device)