optimize real-time vc
This commit is contained in:
parent
78f03e7dc0
commit
3dec36568c
@ -1 +1 @@
|
||||
{"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_input_device": "VoiceMeeter Output (VB-Audio Vo (MME)", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi (MME)", "threhold": -45.0, "pitch": 2.0, "rms_mix_rate": 0.0, "index_rate": 0.0, "block_time": 0.52, "crossfade_length": 0.15, "extra_time": 2.46, "n_cpu": 6.0, "use_jit": false, "f0method": "rmvpe"}
|
||||
{"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_input_device": "VoiceMeeter Output (VB-Audio Vo (MME)", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi (MME)", "sr_type": "sr_model", "threhold": -60.0, "pitch": 12.0, "rms_mix_rate": 0.5, "index_rate": 0.0, "block_time": 0.2, "crossfade_length": 0.08, "extra_time": 2.00, "n_cpu": 4.0, "use_jit": false, "use_pv": false, "f0method": "fcpe"}
|
159
gui_v1.py
159
gui_v1.py
@ -22,6 +22,26 @@ def printt(strr, *args):
|
||||
print(strr % args)
|
||||
|
||||
|
||||
def phase_vocoder(a, b, fade_out, fade_in):
    """Blend the tail ``a`` into the head ``b`` using a phase-aware cross-fade.

    The result is the sum of three terms: ``a`` weighted by ``fade_out ** 2``,
    ``b`` weighted by ``fade_in ** 2``, and a resynthesised spectral term built
    from the summed magnitudes of both signals with a phase that starts at the
    phase of ``a`` and advances by the wrapped phase difference to ``b``.

    Args:
        a: Outgoing signal. Assumed to be a 1-D tensor; the broadcasting below
            only lines up for that case.
        b: Incoming signal, same length as ``a``.
        fade_out: Fade-out window, same length as ``a``.
        fade_in: Fade-in window, same length as ``a``.

    Returns:
        A tensor with the same length as ``a`` holding the blended signal.
    """
    length = a.shape[0]
    mix_window = torch.sqrt(fade_out * fade_in)

    spec_a = torch.fft.rfft(a * mix_window)
    spec_b = torch.fft.rfft(b * mix_window)

    # One-sided spectrum: every bin except DC (and the last bin when the
    # length is even) stands for two mirrored bins, so its magnitude is doubled.
    magnitude = torch.abs(spec_a) + torch.abs(spec_b)
    last_doubled = -1 if length % 2 == 0 else None
    magnitude[1:last_doubled] *= 2

    phase_a = torch.angle(spec_a)
    phase_step = torch.angle(spec_b) - phase_a
    # Wrap the phase difference into [-pi, pi).
    phase_step = phase_step - 2 * np.pi * torch.floor(phase_step / 2 / np.pi + 0.5)

    # Per-bin angular advance over the whole segment, and the relative
    # position of each output sample as a column vector.
    bin_advance = 2 * np.pi * torch.arange(length // 2 + 1).to(a) + phase_step
    rel_time = torch.arange(length).unsqueeze(-1).to(a) / length
    spectral_term = torch.sum(
        magnitude * torch.cos(bin_advance * rel_time + phase_a), -1
    )

    return (
        a * (fade_out ** 2)
        + b * (fade_in ** 2)
        + spectral_term * mix_window / length
    )
|
||||
|
||||
|
||||
class Harvest(multiprocessing.Process):
|
||||
def __init__(self, inp_q, opt_q):
|
||||
multiprocessing.Process.__init__(self)
|
||||
@ -118,6 +138,8 @@ if __name__ == "__main__":
|
||||
try:
|
||||
with open("configs/config.json", "r") as j:
|
||||
data = json.load(j)
|
||||
data["sr_model"] = data["sr_type"] == "sr_model"
|
||||
data["sr_device"] = data["sr_type"] == "sr_device"
|
||||
data["pm"] = data["f0method"] == "pm"
|
||||
data["harvest"] = data["f0method"] == "harvest"
|
||||
data["crepe"] = data["f0method"] == "crepe"
|
||||
@ -134,6 +156,7 @@ if __name__ == "__main__":
|
||||
"index_path": " ",
|
||||
"sg_input_device": input_devices[sd.default.device[0]],
|
||||
"sg_output_device": output_devices[sd.default.device[1]],
|
||||
"sr_type": "sr_model",
|
||||
"threhold": "-60",
|
||||
"pitch": "0",
|
||||
"index_rate": "0",
|
||||
@ -143,7 +166,10 @@ if __name__ == "__main__":
|
||||
"extra_time": "2.5",
|
||||
"f0method": "rmvpe",
|
||||
"use_jit": False,
|
||||
"use_pv": False,
|
||||
}
|
||||
data["sr_model"] = data["sr_type"] == "sr_model"
|
||||
data["sr_device"] = data["sr_type"] == "sr_device"
|
||||
data["pm"] = data["f0method"] == "pm"
|
||||
data["harvest"] = data["f0method"] == "harvest"
|
||||
data["crepe"] = data["f0method"] == "crepe"
|
||||
@ -207,7 +233,25 @@ if __name__ == "__main__":
|
||||
default_value=data.get("sg_output_device", ""),
|
||||
),
|
||||
],
|
||||
[sg.Button(i18n("重载设备列表"), key="reload_devices")],
|
||||
[
|
||||
sg.Button(i18n("重载设备列表"), key="reload_devices"),
|
||||
sg.Radio(
|
||||
i18n("使用模型采样率"),
|
||||
"sr_type",
|
||||
key="sr_model",
|
||||
default=data.get("sr_model", True),
|
||||
enable_events=True,
|
||||
),
|
||||
sg.Radio(
|
||||
i18n("使用设备采样率"),
|
||||
"sr_type",
|
||||
key="sr_device",
|
||||
default=data.get("sr_device", False),
|
||||
enable_events=True,
|
||||
),
|
||||
sg.Text(i18n("采样率:")),
|
||||
sg.Text("", key="sr_stream"),
|
||||
],
|
||||
],
|
||||
title=i18n("音频设备(请使用同种类驱动)"),
|
||||
)
|
||||
@ -222,7 +266,7 @@ if __name__ == "__main__":
|
||||
key="threhold",
|
||||
resolution=1,
|
||||
orientation="h",
|
||||
default_value=data.get("threhold", "-60"),
|
||||
default_value=data.get("threhold", -60),
|
||||
enable_events=True,
|
||||
),
|
||||
],
|
||||
@ -233,7 +277,7 @@ if __name__ == "__main__":
|
||||
key="pitch",
|
||||
resolution=1,
|
||||
orientation="h",
|
||||
default_value=data.get("pitch", "0"),
|
||||
default_value=data.get("pitch", 0),
|
||||
enable_events=True,
|
||||
),
|
||||
],
|
||||
@ -244,7 +288,7 @@ if __name__ == "__main__":
|
||||
key="index_rate",
|
||||
resolution=0.01,
|
||||
orientation="h",
|
||||
default_value=data.get("index_rate", "0"),
|
||||
default_value=data.get("index_rate", 0),
|
||||
enable_events=True,
|
||||
),
|
||||
],
|
||||
@ -255,7 +299,7 @@ if __name__ == "__main__":
|
||||
key="rms_mix_rate",
|
||||
resolution=0.01,
|
||||
orientation="h",
|
||||
default_value=data.get("rms_mix_rate", "0"),
|
||||
default_value=data.get("rms_mix_rate", 0),
|
||||
enable_events=True,
|
||||
),
|
||||
],
|
||||
@ -265,35 +309,35 @@ if __name__ == "__main__":
|
||||
"pm",
|
||||
"f0method",
|
||||
key="pm",
|
||||
default=data.get("pm", "") == True,
|
||||
default=data.get("pm", False),
|
||||
enable_events=True,
|
||||
),
|
||||
sg.Radio(
|
||||
"harvest",
|
||||
"f0method",
|
||||
key="harvest",
|
||||
default=data.get("harvest", "") == True,
|
||||
default=data.get("harvest", False),
|
||||
enable_events=True,
|
||||
),
|
||||
sg.Radio(
|
||||
"crepe",
|
||||
"f0method",
|
||||
key="crepe",
|
||||
default=data.get("crepe", "") == True,
|
||||
default=data.get("crepe", False),
|
||||
enable_events=True,
|
||||
),
|
||||
sg.Radio(
|
||||
"rmvpe",
|
||||
"f0method",
|
||||
key="rmvpe",
|
||||
default=data.get("rmvpe", "") == True,
|
||||
default=data.get("rmvpe", False),
|
||||
enable_events=True,
|
||||
),
|
||||
sg.Radio(
|
||||
"fcpe",
|
||||
"f0method",
|
||||
key="fcpe",
|
||||
default=data.get("fcpe", "") == True,
|
||||
default=data.get("fcpe", True),
|
||||
enable_events=True,
|
||||
),
|
||||
],
|
||||
@ -305,11 +349,11 @@ if __name__ == "__main__":
|
||||
[
|
||||
sg.Text(i18n("采样长度")),
|
||||
sg.Slider(
|
||||
range=(0.05, 2.4),
|
||||
range=(0.02, 2.4),
|
||||
key="block_time",
|
||||
resolution=0.01,
|
||||
orientation="h",
|
||||
default_value=data.get("block_time", "0.25"),
|
||||
default_value=data.get("block_time", 0.25),
|
||||
enable_events=True,
|
||||
),
|
||||
],
|
||||
@ -320,7 +364,7 @@ if __name__ == "__main__":
|
||||
# key="device_latency",
|
||||
# resolution=0.001,
|
||||
# orientation="h",
|
||||
# default_value=data.get("device_latency", "0.1"),
|
||||
# default_value=data.get("device_latency", 0.1),
|
||||
# enable_events=True,
|
||||
# ),
|
||||
# ],
|
||||
@ -344,7 +388,7 @@ if __name__ == "__main__":
|
||||
key="crossfade_length",
|
||||
resolution=0.01,
|
||||
orientation="h",
|
||||
default_value=data.get("crossfade_length", "0.05"),
|
||||
default_value=data.get("crossfade_length", 0.05),
|
||||
enable_events=True,
|
||||
),
|
||||
],
|
||||
@ -355,7 +399,7 @@ if __name__ == "__main__":
|
||||
key="extra_time",
|
||||
resolution=0.01,
|
||||
orientation="h",
|
||||
default_value=data.get("extra_time", "2.5"),
|
||||
default_value=data.get("extra_time", 2.5),
|
||||
enable_events=True,
|
||||
),
|
||||
],
|
||||
@ -370,6 +414,12 @@ if __name__ == "__main__":
|
||||
key="O_noise_reduce",
|
||||
enable_events=True,
|
||||
),
|
||||
sg.Checkbox(
|
||||
i18n("启用相位声码器"),
|
||||
key="use_pv",
|
||||
default=data.get("use_pv", False),
|
||||
enable_events=True,
|
||||
),
|
||||
# sg.Checkbox(
|
||||
# "JIT加速",
|
||||
# default=self.config.use_jit,
|
||||
@ -443,6 +493,12 @@ if __name__ == "__main__":
|
||||
"index_path": values["index_path"],
|
||||
"sg_input_device": values["sg_input_device"],
|
||||
"sg_output_device": values["sg_output_device"],
|
||||
"sr_type": ["sr_model", "sr_device"][
|
||||
[
|
||||
values["sr_model"],
|
||||
values["sr_device"],
|
||||
].index(True)
|
||||
],
|
||||
"threhold": values["threhold"],
|
||||
"pitch": values["pitch"],
|
||||
"rms_mix_rate": values["rms_mix_rate"],
|
||||
@ -454,6 +510,7 @@ if __name__ == "__main__":
|
||||
"n_cpu": values["n_cpu"],
|
||||
# "use_jit": values["use_jit"],
|
||||
"use_jit": False,
|
||||
"use_pv": values["use_pv"],
|
||||
"f0method": ["pm", "harvest", "crepe", "rmvpe", "fcpe"][
|
||||
[
|
||||
values["pm"],
|
||||
@ -477,6 +534,7 @@ if __name__ == "__main__":
|
||||
)
|
||||
if values["I_noise_reduce"]:
|
||||
self.delay_time += values["crossfade_length"]
|
||||
self.window["sr_stream"].update(self.gui_config.samplerate)
|
||||
self.window["delay_time"].update(int(self.delay_time * 1000))
|
||||
if event == "stop_vc" and self.flag_vc == True:
|
||||
self.flag_vc = False
|
||||
@ -505,6 +563,8 @@ if __name__ == "__main__":
|
||||
self.window["delay_time"].update(int(self.delay_time * 1000))
|
||||
elif event == "O_noise_reduce":
|
||||
self.gui_config.O_noise_reduce = values["O_noise_reduce"]
|
||||
elif event == "use_pv":
|
||||
self.gui_config.use_pv = values["use_pv"]
|
||||
elif event in ["vc", "im"]:
|
||||
self.function = event
|
||||
elif event != "start_vc" and self.flag_vc == True:
|
||||
@ -531,6 +591,12 @@ if __name__ == "__main__":
|
||||
# self.device_latency = values["device_latency"]
|
||||
self.gui_config.pth_path = values["pth_path"]
|
||||
self.gui_config.index_path = values["index_path"]
|
||||
self.gui_config.sr_type = ["sr_model", "sr_device"][
|
||||
[
|
||||
values["sr_model"],
|
||||
values["sr_device"],
|
||||
].index(True)
|
||||
]
|
||||
self.gui_config.threhold = values["threhold"]
|
||||
self.gui_config.pitch = values["pitch"]
|
||||
self.gui_config.block_time = values["block_time"]
|
||||
@ -538,6 +604,7 @@ if __name__ == "__main__":
|
||||
self.gui_config.extra_time = values["extra_time"]
|
||||
self.gui_config.I_noise_reduce = values["I_noise_reduce"]
|
||||
self.gui_config.O_noise_reduce = values["O_noise_reduce"]
|
||||
self.gui_config.use_pv = values["use_pv"]
|
||||
self.gui_config.rms_mix_rate = values["rms_mix_rate"]
|
||||
self.gui_config.index_rate = values["index_rate"]
|
||||
self.gui_config.n_cpu = values["n_cpu"]
|
||||
@ -566,8 +633,8 @@ if __name__ == "__main__":
|
||||
self.config,
|
||||
self.rvc if hasattr(self, "rvc") else None,
|
||||
)
|
||||
self.gui_config.samplerate = self.rvc.tgt_sr
|
||||
self.zc = self.rvc.tgt_sr // 100
|
||||
self.gui_config.samplerate = self.rvc.tgt_sr if self.gui_config.sr_type == "sr_model" else self.get_device_samplerate()
|
||||
self.zc = self.gui_config.samplerate // 100
|
||||
self.block_frame = (
|
||||
int(
|
||||
np.round(
|
||||
@ -589,6 +656,7 @@ if __name__ == "__main__":
|
||||
)
|
||||
* self.zc
|
||||
)
|
||||
self.sola_buffer_frame = min(self.crossfade_frame, 4 * self.zc)
|
||||
self.sola_search_frame = self.zc
|
||||
self.extra_frame = (
|
||||
int(
|
||||
@ -622,14 +690,14 @@ if __name__ == "__main__":
|
||||
dtype="float64",
|
||||
)
|
||||
self.sola_buffer: torch.Tensor = torch.zeros(
|
||||
self.crossfade_frame, device=self.config.device, dtype=torch.float32
|
||||
self.sola_buffer_frame, device=self.config.device, dtype=torch.float32
|
||||
)
|
||||
self.nr_buffer: torch.Tensor = self.sola_buffer.clone()
|
||||
self.output_buffer: torch.Tensor = self.input_wav.clone()
|
||||
self.res_buffer: torch.Tensor = torch.zeros(
|
||||
2 * self.zc, device=self.config.device, dtype=torch.float32
|
||||
)
|
||||
self.valid_rate = 1 - (self.extra_frame - 1) / self.input_wav.shape[0]
|
||||
self.skip_head = self.extra_frame // self.zc
|
||||
self.fade_in_window: torch.Tensor = (
|
||||
torch.sin(
|
||||
0.5
|
||||
@ -637,7 +705,7 @@ if __name__ == "__main__":
|
||||
* torch.linspace(
|
||||
0.0,
|
||||
1.0,
|
||||
steps=self.crossfade_frame,
|
||||
steps=self.sola_buffer_frame,
|
||||
device=self.config.device,
|
||||
dtype=torch.float32,
|
||||
)
|
||||
@ -650,6 +718,14 @@ if __name__ == "__main__":
|
||||
new_freq=16000,
|
||||
dtype=torch.float32,
|
||||
).to(self.config.device)
|
||||
if self.rvc.tgt_sr != self.gui_config.samplerate:
|
||||
self.resampler2 = tat.Resample(
|
||||
orig_freq=self.rvc.tgt_sr,
|
||||
new_freq=self.gui_config.samplerate,
|
||||
dtype=torch.float32,
|
||||
).to(self.config.device)
|
||||
else:
|
||||
self.resampler2 = None
|
||||
self.tg = TorchGate(
|
||||
sr=self.gui_config.samplerate, n_fft=4 * self.zc, prop_decrease=0.9
|
||||
).to(self.config.device)
|
||||
@ -710,11 +786,11 @@ if __name__ == "__main__":
|
||||
input_wav = self.tg(
|
||||
input_wav.unsqueeze(0), self.input_wav.unsqueeze(0)
|
||||
)[0, 2 * self.zc :]
|
||||
input_wav[: self.crossfade_frame] *= self.fade_in_window
|
||||
input_wav[: self.crossfade_frame] += (
|
||||
input_wav[: self.sola_buffer_frame] *= self.fade_in_window
|
||||
input_wav[: self.sola_buffer_frame] += (
|
||||
self.nr_buffer * self.fade_out_window
|
||||
)
|
||||
self.nr_buffer[:] = input_wav[-self.crossfade_frame :]
|
||||
self.nr_buffer[:] = input_wav[self.block_frame : self.block_frame + self.sola_buffer_frame]
|
||||
input_wav = torch.cat(
|
||||
(self.res_buffer[:], input_wav[: self.block_frame])
|
||||
)
|
||||
@ -728,23 +804,16 @@ if __name__ == "__main__":
|
||||
)[160:]
|
||||
# infer
|
||||
if self.function == "vc":
|
||||
f0_extractor_frame = self.block_frame_16k + 800
|
||||
if self.gui_config.f0method == "rmvpe":
|
||||
f0_extractor_frame = (
|
||||
5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160
|
||||
)
|
||||
infer_wav = self.rvc.infer(
|
||||
self.input_wav_res,
|
||||
self.input_wav_res[-f0_extractor_frame:].cpu().numpy(),
|
||||
self.block_frame_16k,
|
||||
self.valid_rate,
|
||||
self.skip_head,
|
||||
self.pitch,
|
||||
self.pitchf,
|
||||
self.gui_config.f0method,
|
||||
)
|
||||
infer_wav = infer_wav[
|
||||
-self.crossfade_frame - self.sola_search_frame - self.block_frame :
|
||||
]
|
||||
if self.resampler2 is not None:
|
||||
infer_wav = self.resampler2(infer_wav)
|
||||
else:
|
||||
infer_wav = self.input_wav[
|
||||
-self.crossfade_frame - self.sola_search_frame - self.block_frame :
|
||||
@ -794,13 +863,13 @@ if __name__ == "__main__":
|
||||
)
|
||||
# SOLA algorithm from https://github.com/yxlllc/DDSP-SVC
|
||||
conv_input = infer_wav[
|
||||
None, None, : self.crossfade_frame + self.sola_search_frame
|
||||
None, None, : self.sola_buffer_frame + self.sola_search_frame
|
||||
]
|
||||
cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :])
|
||||
cor_den = torch.sqrt(
|
||||
F.conv1d(
|
||||
conv_input**2,
|
||||
torch.ones(1, 1, self.crossfade_frame, device=self.config.device),
|
||||
torch.ones(1, 1, self.sola_buffer_frame, device=self.config.device),
|
||||
)
|
||||
+ 1e-8
|
||||
)
|
||||
@ -813,9 +882,16 @@ if __name__ == "__main__":
|
||||
infer_wav = infer_wav[
|
||||
sola_offset : sola_offset + self.block_frame + self.crossfade_frame
|
||||
]
|
||||
infer_wav[: self.crossfade_frame] *= self.fade_in_window
|
||||
infer_wav[: self.crossfade_frame] += self.sola_buffer * self.fade_out_window
|
||||
self.sola_buffer[:] = infer_wav[-self.crossfade_frame :]
|
||||
if "privateuseone" in str(self.config.device) or not self.gui_config.use_pv:
|
||||
infer_wav[: self.sola_buffer_frame] *= self.fade_in_window
|
||||
infer_wav[: self.sola_buffer_frame] += self.sola_buffer * self.fade_out_window
|
||||
else:
|
||||
infer_wav[: self.sola_buffer_frame] = phase_vocoder(
|
||||
self.sola_buffer,
|
||||
infer_wav[: self.sola_buffer_frame],
|
||||
self.fade_out_window,
|
||||
self.fade_in_window)
|
||||
self.sola_buffer[:] = infer_wav[self.block_frame : self.block_frame + self.sola_buffer_frame]
|
||||
if sys.platform == "darwin":
|
||||
outdata[:] = (
|
||||
infer_wav[: -self.crossfade_frame].cpu().numpy()[:, np.newaxis]
|
||||
@ -864,7 +940,7 @@ if __name__ == "__main__":
|
||||
input_devices_indices,
|
||||
output_devices_indices,
|
||||
)
|
||||
|
||||
|
||||
def set_devices(self, input_device, output_device):
|
||||
"""设置输出设备"""
|
||||
(
|
||||
@ -881,5 +957,8 @@ if __name__ == "__main__":
|
||||
]
|
||||
printt("Input device: %s:%s", str(sd.default.device[0]), input_device)
|
||||
printt("Output device: %s:%s", str(sd.default.device[1]), output_device)
|
||||
|
||||
|
||||
def get_device_samplerate(self):
    """Return the default sample rate of the default input device as an int."""
    input_device = sd.default.device[0]
    device_info = sd.query_devices(device=input_device)
    return int(device_info['default_samplerate'])
|
||||
|
||||
gui = GUI()
|
||||
|
@ -722,7 +722,8 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
|
||||
def remove_weight_norm(self):
    """Remove weight normalization from the decoder, flow and, if present, enc_q.

    ``enc_q`` is only touched when the attribute exists, so the call also
    works on instances that no longer carry it (presumably inference-only
    models with the posterior encoder dropped -- confirm against the loader).
    """
    self.dec.remove_weight_norm()
    self.flow.remove_weight_norm()
    # Guarded only: an unconditional call here would defeat the check and
    # strip enc_q twice when it is present.
    if hasattr(self, "enc_q"):
        self.enc_q.remove_weight_norm()
|
||||
|
||||
def __prepare_scriptable__(self):
|
||||
for hook in self.dec._forward_pre_hooks.values():
|
||||
@ -783,14 +784,14 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
|
||||
pitch: torch.Tensor,
|
||||
nsff0: torch.Tensor,
|
||||
sid: torch.Tensor,
|
||||
rate: Optional[torch.Tensor] = None,
|
||||
skip_head: Optional[torch.Tensor] = None,
|
||||
):
|
||||
g = self.emb_g(sid).unsqueeze(-1)
|
||||
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
|
||||
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
|
||||
if rate is not None:
|
||||
assert isinstance(rate, torch.Tensor)
|
||||
head = int(z_p.shape[2] * (1 - rate.item()))
|
||||
if skip_head is not None:
|
||||
assert isinstance(skip_head, torch.Tensor)
|
||||
head = int(skip_head.item())
|
||||
z_p = z_p[:, :, head:]
|
||||
x_mask = x_mask[:, :, head:]
|
||||
nsff0 = nsff0[:, head:]
|
||||
@ -887,7 +888,8 @@ class SynthesizerTrnMs768NSFsid(nn.Module):
|
||||
def remove_weight_norm(self):
    """Remove weight normalization from the decoder, flow and, if present, enc_q.

    ``enc_q`` is only touched when the attribute exists, so the call also
    works on instances that no longer carry it (presumably inference-only
    models with the posterior encoder dropped -- confirm against the loader).
    """
    self.dec.remove_weight_norm()
    self.flow.remove_weight_norm()
    # Guarded only: an unconditional call here would defeat the check and
    # strip enc_q twice when it is present.
    if hasattr(self, "enc_q"):
        self.enc_q.remove_weight_norm()
|
||||
|
||||
def __prepare_scriptable__(self):
|
||||
for hook in self.dec._forward_pre_hooks.values():
|
||||
@ -941,13 +943,14 @@ class SynthesizerTrnMs768NSFsid(nn.Module):
|
||||
pitch: torch.Tensor,
|
||||
nsff0: torch.Tensor,
|
||||
sid: torch.Tensor,
|
||||
rate: Optional[torch.Tensor] = None,
|
||||
skip_head: Optional[torch.Tensor] = None,
|
||||
):
|
||||
g = self.emb_g(sid).unsqueeze(-1)
|
||||
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
|
||||
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
|
||||
if rate is not None:
|
||||
head = int(z_p.shape[2] * (1.0 - rate.item()))
|
||||
if skip_head is not None:
|
||||
assert isinstance(skip_head, torch.Tensor)
|
||||
head = int(skip_head.item())
|
||||
z_p = z_p[:, :, head:]
|
||||
x_mask = x_mask[:, :, head:]
|
||||
nsff0 = nsff0[:, head:]
|
||||
@ -1041,7 +1044,8 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
|
||||
def remove_weight_norm(self):
    """Remove weight normalization from the decoder, flow and, if present, enc_q.

    ``enc_q`` is only touched when the attribute exists, so the call also
    works on instances that no longer carry it (presumably inference-only
    models with the posterior encoder dropped -- confirm against the loader).
    """
    self.dec.remove_weight_norm()
    self.flow.remove_weight_norm()
    # Guarded only: an unconditional call here would defeat the check and
    # strip enc_q twice when it is present.
    if hasattr(self, "enc_q"):
        self.enc_q.remove_weight_norm()
|
||||
|
||||
def __prepare_scriptable__(self):
|
||||
for hook in self.dec._forward_pre_hooks.values():
|
||||
@ -1087,13 +1091,14 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
|
||||
phone: torch.Tensor,
|
||||
phone_lengths: torch.Tensor,
|
||||
sid: torch.Tensor,
|
||||
rate: Optional[torch.Tensor] = None,
|
||||
skip_head: Optional[torch.Tensor] = None,
|
||||
):
|
||||
g = self.emb_g(sid).unsqueeze(-1)
|
||||
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
|
||||
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
|
||||
if rate is not None:
|
||||
head = int(z_p.shape[2] * (1.0 - rate.item()))
|
||||
if skip_head is not None:
|
||||
assert isinstance(skip_head, torch.Tensor)
|
||||
head = int(skip_head.item())
|
||||
z_p = z_p[:, :, head:]
|
||||
x_mask = x_mask[:, :, head:]
|
||||
z = self.flow(z_p, x_mask, g=g, reverse=True)
|
||||
@ -1186,7 +1191,8 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
|
||||
def remove_weight_norm(self):
    """Remove weight normalization from the decoder, flow and, if present, enc_q.

    ``enc_q`` is only touched when the attribute exists, so the call also
    works on instances that no longer carry it (presumably inference-only
    models with the posterior encoder dropped -- confirm against the loader).
    """
    self.dec.remove_weight_norm()
    self.flow.remove_weight_norm()
    # Guarded only: an unconditional call here would defeat the check and
    # strip enc_q twice when it is present.
    if hasattr(self, "enc_q"):
        self.enc_q.remove_weight_norm()
|
||||
|
||||
def __prepare_scriptable__(self):
|
||||
for hook in self.dec._forward_pre_hooks.values():
|
||||
@ -1232,13 +1238,14 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
|
||||
phone: torch.Tensor,
|
||||
phone_lengths: torch.Tensor,
|
||||
sid: torch.Tensor,
|
||||
rate: Optional[torch.Tensor] = None,
|
||||
skip_head: Optional[torch.Tensor] = None,
|
||||
):
|
||||
g = self.emb_g(sid).unsqueeze(-1)
|
||||
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
|
||||
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
|
||||
if rate is not None:
|
||||
head = int(z_p.shape[2] * (1.0 - rate.item()))
|
||||
if skip_head is not None:
|
||||
assert isinstance(skip_head, torch.Tensor)
|
||||
head = int(skip_head.item())
|
||||
z_p = z_p[:, :, head:]
|
||||
x_mask = x_mask[:, :, head:]
|
||||
z = self.flow(z_p, x_mask, g=g, reverse=True)
|
||||
|
@ -34,4 +34,5 @@ def get_synthesizer(pth_path, device=torch.device("cpu")):
|
||||
net_g.load_state_dict(cpt["weight"], strict=False)
|
||||
net_g = net_g.float()
|
||||
net_g.eval().to(device)
|
||||
net_g.remove_weight_norm()
|
||||
return net_g, cpt
|
||||
|
@ -593,16 +593,18 @@ class RMVPE:
|
||||
|
||||
def infer_from_audio(self, audio, thred=0.03):
|
||||
# torch.cuda.synchronize()
|
||||
t0 = ttime()
|
||||
# t0 = ttime()
|
||||
if not torch.is_tensor(audio):
|
||||
audio = torch.from_numpy(audio)
|
||||
mel = self.mel_extractor(
|
||||
torch.from_numpy(audio).float().to(self.device).unsqueeze(0), center=True
|
||||
audio.float().to(self.device).unsqueeze(0), center=True
|
||||
)
|
||||
# print(123123123,mel.device.type)
|
||||
# torch.cuda.synchronize()
|
||||
t1 = ttime()
|
||||
# t1 = ttime()
|
||||
hidden = self.mel2hidden(mel)
|
||||
# torch.cuda.synchronize()
|
||||
t2 = ttime()
|
||||
# t2 = ttime()
|
||||
# print(234234,hidden.device.type)
|
||||
if "privateuseone" not in str(self.device):
|
||||
hidden = hidden.squeeze(0).cpu().numpy()
|
||||
@ -613,7 +615,7 @@ class RMVPE:
|
||||
|
||||
f0 = self.decode(hidden, thred=thred)
|
||||
# torch.cuda.synchronize()
|
||||
t3 = ttime()
|
||||
# t3 = ttime()
|
||||
# print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0))
|
||||
return f0
|
||||
|
||||
|
@ -46,23 +46,22 @@ def printt(strr, *args):
|
||||
# config.is_half=False########强制cpu测试
|
||||
class RVC:
|
||||
def __init__(
|
||||
self,
|
||||
key,
|
||||
pth_path,
|
||||
index_path,
|
||||
index_rate,
|
||||
n_cpu,
|
||||
inp_q,
|
||||
opt_q,
|
||||
config: Config,
|
||||
last_rvc=None,
|
||||
self,
|
||||
key,
|
||||
pth_path,
|
||||
index_path,
|
||||
index_rate,
|
||||
n_cpu,
|
||||
inp_q,
|
||||
opt_q,
|
||||
config: Config,
|
||||
last_rvc=None,
|
||||
) -> None:
|
||||
"""
|
||||
初始化
|
||||
"""
|
||||
try:
|
||||
if config.dml == True:
|
||||
|
||||
def forward_dml(ctx, x, scale):
|
||||
ctx.scale = scale
|
||||
res = x.clone().detach()
|
||||
@ -76,13 +75,10 @@ class RVC:
|
||||
# device="cpu"########强制cpu测试
|
||||
self.device = config.device
|
||||
self.f0_up_key = key
|
||||
self.time_step = 160 / 16000 * 1000
|
||||
self.f0_min = 50
|
||||
self.f0_max = 1100
|
||||
self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
|
||||
self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
|
||||
self.sr = 16000
|
||||
self.window = 160
|
||||
self.n_cpu = n_cpu
|
||||
self.use_jit = self.config.use_jit
|
||||
self.is_half = config.is_half
|
||||
@ -184,6 +180,7 @@ class RVC:
|
||||
if last_rvc is not None and hasattr(last_rvc, "model_rmvpe"):
|
||||
self.model_rmvpe = last_rvc.model_rmvpe
|
||||
if last_rvc is not None and hasattr(last_rvc, "model_fcpe"):
|
||||
self.device_fcpe = last_rvc.device_fcpe
|
||||
self.model_fcpe = last_rvc.model_fcpe
|
||||
except:
|
||||
printt(traceback.format_exc())
|
||||
@ -199,14 +196,10 @@ class RVC:
|
||||
self.index_rate = new_index_rate
|
||||
|
||||
def get_f0_post(self, f0):
|
||||
f0_min = self.f0_min
|
||||
f0_max = self.f0_max
|
||||
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
|
||||
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
|
||||
f0bak = f0.copy()
|
||||
f0_mel = 1127 * np.log(1 + f0 / 700)
|
||||
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
|
||||
f0_mel_max - f0_mel_min
|
||||
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (
|
||||
self.f0_mel_max - self.f0_mel_min
|
||||
) + 1
|
||||
f0_mel[f0_mel <= 1] = 1
|
||||
f0_mel[f0_mel > 255] = 255
|
||||
@ -221,6 +214,7 @@ class RVC:
|
||||
return self.get_f0_rmvpe(x, f0_up_key)
|
||||
if method == "fcpe":
|
||||
return self.get_f0_fcpe(x, f0_up_key)
|
||||
x = x.cpu().numpy()
|
||||
if method == "pm":
|
||||
p_len = x.shape[0] // 160 + 1
|
||||
f0_min = 65
|
||||
@ -262,7 +256,7 @@ class RVC:
|
||||
self.inp_q.put((idx, x[:tail], res_f0, n_cpu, ts))
|
||||
else:
|
||||
self.inp_q.put(
|
||||
(idx, x[part_length * idx - 320 : tail], res_f0, n_cpu, ts)
|
||||
(idx, x[part_length * idx - 320: tail], res_f0, n_cpu, ts)
|
||||
)
|
||||
while 1:
|
||||
res_ts = self.opt_q.get()
|
||||
@ -277,20 +271,19 @@ class RVC:
|
||||
else:
|
||||
f0 = f0[2:]
|
||||
f0bak[
|
||||
part_length * idx // 160 : part_length * idx // 160 + f0.shape[0]
|
||||
part_length * idx // 160: part_length * idx // 160 + f0.shape[0]
|
||||
] = f0
|
||||
f0bak = signal.medfilt(f0bak, 3)
|
||||
f0bak *= pow(2, f0_up_key / 12)
|
||||
return self.get_f0_post(f0bak)
|
||||
|
||||
def get_f0_crepe(self, x, f0_up_key):
|
||||
if "privateuseone" in str(self.device): ###不支持dml,cpu又太慢用不成,拿pm顶替
|
||||
return self.get_f0(x, f0_up_key, 1, "pm")
|
||||
audio = torch.tensor(np.copy(x))[None].float()
|
||||
if "privateuseone" in str(self.device): ###不支持dml,cpu又太慢用不成,拿fcpe顶替
|
||||
return self.get_f0(x, f0_up_key, 1, "fcpe")
|
||||
# printt("using crepe,device:%s"%self.device)
|
||||
f0, pd = torchcrepe.predict(
|
||||
audio,
|
||||
self.sr,
|
||||
x.unsqueeze(0).float(),
|
||||
16000,
|
||||
160,
|
||||
self.f0_min,
|
||||
self.f0_max,
|
||||
@ -313,15 +306,11 @@ class RVC:
|
||||
|
||||
printt("Loading rmvpe model")
|
||||
self.model_rmvpe = RMVPE(
|
||||
# "rmvpe.pt", is_half=self.is_half if self.device.type!="privateuseone" else False, device=self.device if self.device.type!="privateuseone"else "cpu"####dml时强制对rmvpe用cpu跑
|
||||
# "rmvpe.pt", is_half=False, device=self.device####dml配置
|
||||
# "rmvpe.pt", is_half=False, device="cpu"####锁定cpu配置
|
||||
"assets/rmvpe/rmvpe.pt",
|
||||
is_half=self.is_half,
|
||||
device=self.device, ####正常逻辑
|
||||
device=self.device,
|
||||
use_jit=self.config.use_jit,
|
||||
)
|
||||
# self.model_rmvpe = RMVPE("aug2_58000_half.pt", is_half=self.is_half, device=self.device)
|
||||
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
|
||||
f0 *= pow(2, f0_up_key / 12)
|
||||
return self.get_f0_post(f0)
|
||||
@ -329,41 +318,36 @@ class RVC:
|
||||
def get_f0_fcpe(self, x, f0_up_key):
    """Estimate f0 with the FCPE model and post-process it.

    Args:
        x: Input audio as a torch tensor (it is moved with ``.to`` and
            batched with ``unsqueeze(0)``); assumed to be 16 kHz mono to
            match the ``sr=16000`` passed to the model -- confirm at caller.
        f0_up_key: Pitch shift in semitones; f0 is scaled by
            ``2 ** (f0_up_key / 12)``.

    Returns:
        Whatever ``self.get_f0_post`` returns for the shifted f0 array.
    """
    if not hasattr(self, "model_fcpe"):
        # Imported lazily so torchfcpe is only needed when FCPE is selected.
        from torchfcpe import spawn_bundled_infer_model

        printt("Loading fcpe model")
        # On "privateuseone" devices the FCPE model is placed on the CPU.
        if "privateuseone" in str(self.device):
            self.device_fcpe = "cpu"
        else:
            self.device_fcpe = self.device
        self.model_fcpe = spawn_bundled_infer_model(self.device_fcpe)
    # Single inference pass on the device the model was spawned on.
    f0 = self.model_fcpe.infer(
        x.to(self.device_fcpe).unsqueeze(0).float(),
        sr=16000,
        decoder_mode='local_argmax',
        threshold=0.006,
    ).squeeze().cpu().numpy()
    f0 *= pow(2, f0_up_key / 12)
    return self.get_f0_post(f0)
|
||||
|
||||
def infer(
|
||||
self,
|
||||
feats: torch.Tensor,
|
||||
indata: np.ndarray,
|
||||
block_frame_16k,
|
||||
rate,
|
||||
cache_pitch,
|
||||
cache_pitchf,
|
||||
f0method,
|
||||
self,
|
||||
input_wav: torch.Tensor,
|
||||
block_frame_16k,
|
||||
skip_head,
|
||||
cache_pitch,
|
||||
cache_pitchf,
|
||||
f0method,
|
||||
) -> np.ndarray:
|
||||
feats = feats.view(1, -1)
|
||||
if self.config.is_half:
|
||||
feats = feats.half()
|
||||
else:
|
||||
feats = feats.float()
|
||||
feats = feats.to(self.device)
|
||||
t1 = ttime()
|
||||
with torch.no_grad():
|
||||
if self.config.is_half:
|
||||
feats = input_wav.half().view(1, -1)
|
||||
else:
|
||||
feats = input_wav.float().view(1, -1)
|
||||
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
|
||||
inputs = {
|
||||
"source": feats,
|
||||
@ -387,8 +371,8 @@ class RVC:
|
||||
if self.config.is_half:
|
||||
npy = npy.astype("float16")
|
||||
feats[0][-leng_replace_head:] = (
|
||||
torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate
|
||||
+ (1 - self.index_rate) * feats[0][-leng_replace_head:]
|
||||
torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate
|
||||
+ (1 - self.index_rate) * feats[0][-leng_replace_head:]
|
||||
)
|
||||
else:
|
||||
printt("Index search FAILED or disabled")
|
||||
@ -398,7 +382,13 @@ class RVC:
|
||||
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
|
||||
t3 = ttime()
|
||||
if self.if_f0 == 1:
|
||||
pitch, pitchf = self.get_f0(indata, self.f0_up_key, self.n_cpu, f0method)
|
||||
f0_extractor_frame = block_frame_16k + 800
|
||||
if f0method == "rmvpe":
|
||||
f0_extractor_frame = (
|
||||
5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160
|
||||
)
|
||||
input_wav = input_wav[-f0_extractor_frame:]
|
||||
pitch, pitchf = self.get_f0(input_wav, self.f0_up_key, self.n_cpu, f0method)
|
||||
start_frame = block_frame_16k // 160
|
||||
end_frame = len(cache_pitch) - (pitch.shape[0] - 4) + start_frame
|
||||
cache_pitch[:] = np.append(cache_pitch[start_frame:end_frame], pitch[3:-1])
|
||||
@ -412,31 +402,28 @@ class RVC:
|
||||
t4 = ttime()
|
||||
feats = feats[:, :p_len, :]
|
||||
if self.if_f0 == 1:
|
||||
cache_pitch = cache_pitch[:p_len]
|
||||
cache_pitchf = cache_pitchf[:p_len]
|
||||
cache_pitch = torch.LongTensor(cache_pitch).unsqueeze(0).to(self.device)
|
||||
cache_pitchf = torch.FloatTensor(cache_pitchf).unsqueeze(0).to(self.device)
|
||||
cache_pitch = torch.LongTensor(cache_pitch[:p_len]).to(self.device).unsqueeze(0)
|
||||
cache_pitchf = torch.FloatTensor(cache_pitchf[:p_len]).to(self.device).unsqueeze(0)
|
||||
p_len = torch.LongTensor([p_len]).to(self.device)
|
||||
ii = 0 # sid
|
||||
sid = torch.LongTensor([ii]).to(self.device)
|
||||
sid = torch.LongTensor([0]).to(self.device)
|
||||
skip_head = torch.LongTensor([skip_head])
|
||||
with torch.no_grad():
|
||||
if self.if_f0 == 1:
|
||||
# printt(12222222222,feats.device,p_len.device,cache_pitch.device,cache_pitchf.device,sid.device,rate2)
|
||||
infered_audio = self.net_g.infer(
|
||||
feats,
|
||||
p_len,
|
||||
cache_pitch,
|
||||
cache_pitchf,
|
||||
sid,
|
||||
torch.FloatTensor([rate]),
|
||||
skip_head,
|
||||
)[0][0, 0].data.float()
|
||||
else:
|
||||
infered_audio = self.net_g.infer(
|
||||
feats, p_len, sid, torch.FloatTensor([rate])
|
||||
feats, p_len, sid, skip_head
|
||||
)[0][0, 0].data.float()
|
||||
t5 = ttime()
|
||||
printt(
|
||||
"Spent time: fea = %.2fs, index = %.2fs, f0 = %.2fs, model = %.2fs",
|
||||
"Spent time: fea = %.3fs, index = %.3fs, f0 = %.3fs, model = %.3fs",
|
||||
t2 - t1,
|
||||
t3 - t2,
|
||||
t4 - t3,
|
||||
|
Loading…
x
Reference in New Issue
Block a user