diff --git a/README.md b/README.md
index f221877..5385a04 100644
--- a/README.md
+++ b/README.md
@@ -68,12 +68,16 @@ poetry install
You can also install the dependencies via pip:
```bash
NVIDIA GPUs:
-
-pip install -r requirements.txt
+ pip install -r requirements.txt
AMD/Intel GPUs:
-pip install -r requirements-dml.txt
+ pip install -r requirements-dml.txt
+AMD GPUs via ROCm (Linux):
+ pip install -r requirements-amd.txt
+
+Intel GPUs via IPEX (Linux):
+ pip install -r requirements-ipex.txt
```
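+
+To verify that PyTorch sees your accelerator after installing (a minimal sketch; ROCm builds report through the CUDA API, and `torch.version.hip` is non-None only on ROCm):
+```python
+import torch
+
+if torch.cuda.is_available():  # True for both NVIDIA CUDA and AMD ROCm builds
+    print("device:", torch.cuda.get_device_name(0), "| HIP:", torch.version.hip)
+else:
+    print("no CUDA/ROCm device visible; DirectML and IPEX installs expose devices differently")
+```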
------
@@ -122,11 +126,34 @@ https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/rmvpe.pt
```bash
python infer-web.py
```
-
If you are using Windows or macOS, you can download and extract `RVC-beta.7z` directly: on Windows, run `go-web.bat` to start the WebUI; on macOS, run `sh ./run.sh`.
+Intel GPU users who need IPEX acceleration should first run `source /opt/intel/oneapi/setvars.sh` in a terminal (Linux only).
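+To confirm the XPU is visible after sourcing the script (a minimal sketch, assuming the requirements-ipex install):
+```python
+import torch
+import intel_extension_for_pytorch as ipex  # noqa: F401 — registers the xpu backend
+
+print("xpu available:", torch.xpu.is_available())
+```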
+
The repository also includes a beginner-friendly tutorial, `小白简易教程.doc`, for reference.
+## AMD GPUs with ROCm (Linux only)
+To run RVC on Linux with AMD's ROCm stack, first install the required drivers as described [here](https://rocm.docs.amd.com/en/latest/deploy/linux/os-native/install.html).
+
+On Arch Linux you can install them with pacman:
+```bash
+pacman -S rocm-hip-sdk rocm-opencl-sdk
+```
+For some GPU models you may need to set extra environment variables (e.g. the RX6700XT, whose gfx1031 ISA is unsupported and has to report itself as gfx1030):
+```bash
+export ROCM_PATH=/opt/rocm
+export HSA_OVERRIDE_GFX_VERSION=10.3.0
+```
+Also make sure your current user is in the `render` and `video` groups, which own the device nodes ROCm needs:
+```bash
+sudo usermod -aG render $USER
+sudo usermod -aG video $USER
+```
+Then start the WebUI:
+```bash
+python infer-web.py
+```
+
## Reference projects
+ [ContentVec](https://github.com/auspicious3000/contentvec/)
+ [VITS](https://github.com/jaywalnut310/vits)
diff --git a/assets/Synthesizer_inputs.pth b/assets/Synthesizer_inputs.pth
new file mode 100644
index 0000000..faa509e
Binary files /dev/null and b/assets/Synthesizer_inputs.pth differ
diff --git a/assets/hubert/.gitignore b/assets/hubert/.gitignore
index d6b7ef3..03dfb38 100644
--- a/assets/hubert/.gitignore
+++ b/assets/hubert/.gitignore
@@ -1,2 +1,3 @@
*
!.gitignore
+!hubert_inputs.pth
\ No newline at end of file
diff --git a/assets/hubert/hubert_inputs.pth b/assets/hubert/hubert_inputs.pth
new file mode 100644
index 0000000..46d2886
Binary files /dev/null and b/assets/hubert/hubert_inputs.pth differ
diff --git a/assets/rmvpe/.gitignore b/assets/rmvpe/.gitignore
index d6b7ef3..dbb24a6 100644
--- a/assets/rmvpe/.gitignore
+++ b/assets/rmvpe/.gitignore
@@ -1,2 +1,3 @@
*
!.gitignore
+!rmvpe_inputs.pth
\ No newline at end of file
diff --git a/assets/rmvpe/rmvpe_inputs.pth b/assets/rmvpe/rmvpe_inputs.pth
new file mode 100644
index 0000000..a4cfb86
Binary files /dev/null and b/assets/rmvpe/rmvpe_inputs.pth differ
diff --git a/configs/config.json b/configs/config.json
index 8e9c176..0861200 100644
--- a/configs/config.json
+++ b/configs/config.json
@@ -1,15 +1 @@
-{
- "pth_path": "assets/weights/kikiV1.pth",
- "index_path": "logs/kikiV1.index",
- "sg_input_device": "VoiceMeeter Output (VB-Audio Vo (MME)",
- "sg_output_device": "VoiceMeeter Aux Input (VB-Audio (MME)",
- "threhold": -45.0,
- "pitch": 12.0,
- "index_rate": 0.0,
- "rms_mix_rate": 0.0,
- "block_time": 0.25,
- "crossfade_length": 0.04,
- "extra_time": 2.0,
- "n_cpu": 6.0,
- "f0method": "rmvpe"
-}
+{"pth_path": "assets/weights/kikiV1.pth", "index_path": "logs/kikiV1.index", "sg_input_device": "VoiceMeeter Output (VB-Audio Vo (MME)", "sg_output_device": "VoiceMeeter Input (VB-Audio Voi (MME)", "threhold": -45.0, "pitch": 2.0, "rms_mix_rate": 0.0, "index_rate": 0.0, "block_time": 0.52, "crossfade_length": 0.15, "extra_time": 2.46, "n_cpu": 6.0, "use_jit": false, "f0method": "rmvpe"}
\ No newline at end of file
diff --git a/configs/config.py b/configs/config.py
index 20bbb36..af2350f 100644
--- a/configs/config.py
+++ b/configs/config.py
@@ -13,7 +13,7 @@ try:
from infer.modules.ipex import ipex_init
ipex_init()
-except Exception:
+except Exception: # pylint: disable=broad-exception-caught
pass
import logging
@@ -44,6 +44,7 @@ class Config:
def __init__(self):
self.device = "cuda:0"
self.is_half = True
+ self.use_jit = False
self.n_cpu = 0
self.gpu_name = None
self.json_config = self.load_config_json()
@@ -122,6 +123,15 @@ class Config:
def use_fp32_config(self):
for config_file in version_config_list:
self.json_config[config_file]["train"]["fp16_run"] = False
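+            # note: this naive text patch flips every literal "true" in the file, not just fp16_run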
+ with open(f"configs/{config_file}", "r") as f:
+ strr = f.read().replace("true", "false")
+ with open(f"configs/{config_file}", "w") as f:
+ f.write(strr)
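+        # also patch the hard-coded 3.7 (seconds per training slice) in preprocess.py down to 3.0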
+ with open("infer/modules/train/preprocess.py", "r") as f:
+ strr = f.read().replace("3.7", "3.0")
+ with open("infer/modules/train/preprocess.py", "w") as f:
+ f.write(strr)
+ print("overwrite preprocess and configs.json")
def device_config(self) -> tuple:
if torch.cuda.is_available():
@@ -237,4 +247,5 @@ class Config:
)
except:
pass
+ print("is_half:%s, device:%s" % (self.is_half, self.device))
return x_pad, x_query, x_center, x_max
diff --git a/docs/en/README.en.md b/docs/en/README.en.md
index 1e52b81..f880869 100644
--- a/docs/en/README.en.md
+++ b/docs/en/README.en.md
@@ -97,7 +97,12 @@ sh ./run.sh
## Preparation of other Pre-models
RVC requires other pre-models for inference and training.
-You need to download them from our [Huggingface space](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/).
+```bash
+#Download all needed models from https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/
+python tools/download_models.py
+```
+
+Or just download them by yourself from our [Huggingface space](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/).
Here's a list of Pre-models and other files that RVC needs:
```bash
diff --git a/gui_v1.py b/gui_v1.py
index fee9e20..3254892 100644
--- a/gui_v1.py
+++ b/gui_v1.py
@@ -1,5 +1,4 @@
import os
-import logging
import sys
from dotenv import load_dotenv
@@ -13,10 +12,16 @@ now_dir = os.getcwd()
sys.path.append(now_dir)
import multiprocessing
-logger = logging.getLogger(__name__)
stream_latency = -1
+def printt(strr, *args):
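+    """print-based stand-in for the old logger calls: printt("x: %s", v) prints the %-formatted message."""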
+ if len(args) == 0:
+ print(strr)
+ else:
+ print(strr % args)
+
+
class Harvest(multiprocessing.Process):
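+    """Background worker that pulls audio chunks from inp_q and pushes pyworld-harvest F0 results to opt_q."""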
def __init__(self, inp_q, opt_q):
multiprocessing.Process.__init__(self)
@@ -62,9 +67,11 @@ if __name__ == "__main__":
import tools.rvc_for_realtime as rvc_for_realtime
from i18n.i18n import I18nAuto
+ from configs.config import Config
i18n = I18nAuto()
- device = rvc_for_realtime.config.device
+
+ # device = rvc_for_realtime.config.device
# device = torch.device(
# "cuda"
# if torch.cuda.is_available()
@@ -86,8 +93,8 @@ if __name__ == "__main__":
self.block_time: float = 1.0 # s
self.buffer_num: int = 1
self.threhold: int = -60
- self.crossfade_time: float = 0.04
- self.extra_time: float = 2.0
+ self.crossfade_time: float = 0.05
+ self.extra_time: float = 2.5
self.I_noise_reduce = False
self.O_noise_reduce = False
self.rms_mix_rate = 0.0
@@ -99,7 +106,8 @@ if __name__ == "__main__":
class GUI:
def __init__(self) -> None:
- self.config = GUIConfig()
+ self.gui_config = GUIConfig()
+ self.config = Config()
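+        # gui_config holds the window/stream state; config (configs.config.Config) supplies device, is_half and use_jit for inference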
self.flag_vc = False
self.function = "vc"
self.delay_time = 0
@@ -130,9 +138,10 @@ if __name__ == "__main__":
"index_rate": "0",
"rms_mix_rate": "0",
"block_time": "0.25",
- "crossfade_length": "0.04",
- "extra_time": "2",
+ "crossfade_length": "0.05",
+ "extra_time": "2.5",
"f0method": "rmvpe",
+ "use_jit": False,
}
data["pm"] = data["f0method"] == "pm"
data["harvest"] = data["f0method"] == "harvest"
@@ -142,6 +151,7 @@ if __name__ == "__main__":
def launcher(self):
data = self.load()
+ self.config.use_jit = False # data.get("use_jit", self.config.use_jit)
sg.theme("LightBlue3")
input_devices, output_devices, _, _ = self.get_devices()
layout = [
@@ -294,6 +304,17 @@ if __name__ == "__main__":
enable_events=True,
),
],
+ # [
+ # sg.Text("设备延迟"),
+ # sg.Slider(
+ # range=(0, 1),
+ # key="device_latency",
+ # resolution=0.001,
+ # orientation="h",
+ # default_value=data.get("device_latency", "0.1"),
+ # enable_events=True,
+ # ),
+ # ],
[
sg.Text(i18n("harvest进程数")),
sg.Slider(
@@ -302,7 +323,7 @@ if __name__ == "__main__":
resolution=1,
orientation="h",
default_value=data.get(
- "n_cpu", min(self.config.n_cpu, n_cpu)
+ "n_cpu", min(self.gui_config.n_cpu, n_cpu)
),
enable_events=True,
),
@@ -314,7 +335,7 @@ if __name__ == "__main__":
key="crossfade_length",
resolution=0.01,
orientation="h",
- default_value=data.get("crossfade_length", "0.04"),
+ default_value=data.get("crossfade_length", "0.05"),
enable_events=True,
),
],
@@ -325,7 +346,7 @@ if __name__ == "__main__":
key="extra_time",
resolution=0.01,
orientation="h",
- default_value=data.get("extra_time", "2.0"),
+ default_value=data.get("extra_time", "2.5"),
enable_events=True,
),
],
@@ -340,7 +361,14 @@ if __name__ == "__main__":
key="O_noise_reduce",
enable_events=True,
),
+ # sg.Checkbox(
+ # "JIT加速",
+ # default=self.config.use_jit,
+ # key="use_jit",
+ # enable_events=False,
+ # ),
],
+ # [sg.Text("注:首次使用JIT加速时,会出现卡顿,\n 并伴随一些噪音,但这是正常现象!")],
],
title=i18n("性能设置"),
),
@@ -382,24 +410,24 @@ if __name__ == "__main__":
prev_output = self.window["sg_output_device"].get()
input_devices, output_devices, _, _ = self.get_devices(update=True)
if prev_input not in input_devices:
- self.config.sg_input_device = input_devices[0]
+ self.gui_config.sg_input_device = input_devices[0]
else:
- self.config.sg_input_device = prev_input
+ self.gui_config.sg_input_device = prev_input
self.window["sg_input_device"].Update(values=input_devices)
self.window["sg_input_device"].Update(
- value=self.config.sg_input_device
+ value=self.gui_config.sg_input_device
)
if prev_output not in output_devices:
- self.config.sg_output_device = output_devices[0]
+ self.gui_config.sg_output_device = output_devices[0]
else:
- self.config.sg_output_device = prev_output
+ self.gui_config.sg_output_device = prev_output
self.window["sg_output_device"].Update(values=output_devices)
self.window["sg_output_device"].Update(
- value=self.config.sg_output_device
+ value=self.gui_config.sg_output_device
)
if event == "start_vc" and self.flag_vc == False:
if self.set_values(values) == True:
- logger.info("cuda_is_available: %s", torch.cuda.is_available())
+ printt("cuda_is_available: %s", torch.cuda.is_available())
self.start_vc()
settings = {
"pth_path": values["pth_path"],
@@ -410,10 +438,13 @@ if __name__ == "__main__":
"pitch": values["pitch"],
"rms_mix_rate": values["rms_mix_rate"],
"index_rate": values["index_rate"],
+ # "device_latency": values["device_latency"],
"block_time": values["block_time"],
"crossfade_length": values["crossfade_length"],
"extra_time": values["extra_time"],
"n_cpu": values["n_cpu"],
+ # "use_jit": values["use_jit"],
+ "use_jit": False,
"f0method": ["pm", "harvest", "crepe", "rmvpe"][
[
values["pm"],
@@ -442,28 +473,28 @@ if __name__ == "__main__":
stream_latency = -1
# Parameter hot update
if event == "threhold":
- self.config.threhold = values["threhold"]
+ self.gui_config.threhold = values["threhold"]
elif event == "pitch":
- self.config.pitch = values["pitch"]
+ self.gui_config.pitch = values["pitch"]
if hasattr(self, "rvc"):
self.rvc.change_key(values["pitch"])
elif event == "index_rate":
- self.config.index_rate = values["index_rate"]
+ self.gui_config.index_rate = values["index_rate"]
if hasattr(self, "rvc"):
self.rvc.change_index_rate(values["index_rate"])
elif event == "rms_mix_rate":
- self.config.rms_mix_rate = values["rms_mix_rate"]
+ self.gui_config.rms_mix_rate = values["rms_mix_rate"]
elif event in ["pm", "harvest", "crepe", "rmvpe"]:
- self.config.f0method = event
+ self.gui_config.f0method = event
elif event == "I_noise_reduce":
- self.config.I_noise_reduce = values["I_noise_reduce"]
+ self.gui_config.I_noise_reduce = values["I_noise_reduce"]
if stream_latency > 0:
self.delay_time += (
1 if values["I_noise_reduce"] else -1
) * values["crossfade_length"]
self.window["delay_time"].update(int(self.delay_time * 1000))
elif event == "O_noise_reduce":
- self.config.O_noise_reduce = values["O_noise_reduce"]
+ self.gui_config.O_noise_reduce = values["O_noise_reduce"]
elif event in ["vc", "im"]:
self.function = event
elif event != "start_vc" and self.flag_vc == True:
@@ -486,19 +517,21 @@ if __name__ == "__main__":
sg.popup(i18n("index文件路径不可包含中文"))
return False
self.set_devices(values["sg_input_device"], values["sg_output_device"])
- self.config.pth_path = values["pth_path"]
- self.config.index_path = values["index_path"]
- self.config.threhold = values["threhold"]
- self.config.pitch = values["pitch"]
- self.config.block_time = values["block_time"]
- self.config.crossfade_time = values["crossfade_length"]
- self.config.extra_time = values["extra_time"]
- self.config.I_noise_reduce = values["I_noise_reduce"]
- self.config.O_noise_reduce = values["O_noise_reduce"]
- self.config.rms_mix_rate = values["rms_mix_rate"]
- self.config.index_rate = values["index_rate"]
- self.config.n_cpu = values["n_cpu"]
- self.config.f0method = ["pm", "harvest", "crepe", "rmvpe"][
+ self.config.use_jit = False # values["use_jit"]
+ # self.device_latency = values["device_latency"]
+ self.gui_config.pth_path = values["pth_path"]
+ self.gui_config.index_path = values["index_path"]
+ self.gui_config.threhold = values["threhold"]
+ self.gui_config.pitch = values["pitch"]
+ self.gui_config.block_time = values["block_time"]
+ self.gui_config.crossfade_time = values["crossfade_length"]
+ self.gui_config.extra_time = values["extra_time"]
+ self.gui_config.I_noise_reduce = values["I_noise_reduce"]
+ self.gui_config.O_noise_reduce = values["O_noise_reduce"]
+ self.gui_config.rms_mix_rate = values["rms_mix_rate"]
+ self.gui_config.index_rate = values["index_rate"]
+ self.gui_config.n_cpu = values["n_cpu"]
+ self.gui_config.f0method = ["pm", "harvest", "crepe", "rmvpe"][
[
values["pm"],
values["harvest"],
@@ -512,34 +545,48 @@ if __name__ == "__main__":
torch.cuda.empty_cache()
self.flag_vc = True
self.rvc = rvc_for_realtime.RVC(
- self.config.pitch,
- self.config.pth_path,
- self.config.index_path,
- self.config.index_rate,
- self.config.n_cpu,
+ self.gui_config.pitch,
+ self.gui_config.pth_path,
+ self.gui_config.index_path,
+ self.gui_config.index_rate,
+ self.gui_config.n_cpu,
inp_q,
opt_q,
- device,
+ self.config,
self.rvc if hasattr(self, "rvc") else None,
)
- self.config.samplerate = self.rvc.tgt_sr
+ self.gui_config.samplerate = self.rvc.tgt_sr
self.zc = self.rvc.tgt_sr // 100
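+            # zc: samples per 10 ms at the target sample rate; all buffer sizes below are rounded to whole zc frames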
self.block_frame = (
- int(np.round(self.config.block_time * self.config.samplerate / self.zc))
+ int(
+ np.round(
+ self.gui_config.block_time
+ * self.gui_config.samplerate
+ / self.zc
+ )
+ )
* self.zc
)
self.block_frame_16k = 160 * self.block_frame // self.zc
self.crossfade_frame = (
int(
np.round(
- self.config.crossfade_time * self.config.samplerate / self.zc
+ self.gui_config.crossfade_time
+ * self.gui_config.samplerate
+ / self.zc
)
)
* self.zc
)
self.sola_search_frame = self.zc
self.extra_frame = (
- int(np.round(self.config.extra_time * self.config.samplerate / self.zc))
+ int(
+ np.round(
+ self.gui_config.extra_time
+ * self.gui_config.samplerate
+ / self.zc
+ )
+ )
* self.zc
)
self.input_wav: torch.Tensor = torch.zeros(
@@ -547,12 +594,12 @@ if __name__ == "__main__":
+ self.crossfade_frame
+ self.sola_search_frame
+ self.block_frame,
- device=device,
+ device=self.config.device,
dtype=torch.float32,
)
self.input_wav_res: torch.Tensor = torch.zeros(
160 * self.input_wav.shape[0] // self.zc,
- device=device,
+ device=self.config.device,
dtype=torch.float32,
)
self.pitch: np.ndarray = np.zeros(
@@ -564,12 +611,12 @@ if __name__ == "__main__":
dtype="float64",
)
self.sola_buffer: torch.Tensor = torch.zeros(
- self.crossfade_frame, device=device, dtype=torch.float32
+ self.crossfade_frame, device=self.config.device, dtype=torch.float32
)
self.nr_buffer: torch.Tensor = self.sola_buffer.clone()
self.output_buffer: torch.Tensor = self.input_wav.clone()
self.res_buffer: torch.Tensor = torch.zeros(
- 2 * self.zc, device=device, dtype=torch.float32
+ 2 * self.zc, device=self.config.device, dtype=torch.float32
)
self.valid_rate = 1 - (self.extra_frame - 1) / self.input_wav.shape[0]
self.fade_in_window: torch.Tensor = (
@@ -580,7 +627,7 @@ if __name__ == "__main__":
0.0,
1.0,
steps=self.crossfade_frame,
- device=device,
+ device=self.config.device,
dtype=torch.float32,
)
)
@@ -588,11 +635,13 @@ if __name__ == "__main__":
)
self.fade_out_window: torch.Tensor = 1 - self.fade_in_window
self.resampler = tat.Resample(
- orig_freq=self.config.samplerate, new_freq=16000, dtype=torch.float32
- ).to(device)
+ orig_freq=self.gui_config.samplerate,
+ new_freq=16000,
+ dtype=torch.float32,
+ ).to(self.config.device)
self.tg = TorchGate(
- sr=self.config.samplerate, n_fft=4 * self.zc, prop_decrease=0.9
- ).to(device)
+ sr=self.gui_config.samplerate, n_fft=4 * self.zc, prop_decrease=0.9
+ ).to(self.config.device)
thread_vc = threading.Thread(target=self.soundinput)
thread_vc.start()
@@ -605,15 +654,15 @@ if __name__ == "__main__":
channels=channels,
callback=self.audio_callback,
blocksize=self.block_frame,
- samplerate=self.config.samplerate,
+ samplerate=self.gui_config.samplerate,
dtype="float32",
) as stream:
global stream_latency
stream_latency = stream.latency[-1]
while self.flag_vc:
- time.sleep(self.config.block_time)
- logger.debug("Audio block passed.")
- logger.debug("ENDing VC")
+ time.sleep(self.gui_config.block_time)
+ printt("Audio block passed.")
+ printt("ENDing VC")
def audio_callback(
self, indata: np.ndarray, outdata: np.ndarray, frames, times, status
@@ -623,12 +672,12 @@ if __name__ == "__main__":
"""
start_time = time.perf_counter()
indata = librosa.to_mono(indata.T)
- if self.config.threhold > -60:
+ if self.gui_config.threhold > -60:
rms = librosa.feature.rms(
y=indata, frame_length=4 * self.zc, hop_length=self.zc
)
db_threhold = (
- librosa.amplitude_to_db(rms, ref=1.0)[0] < self.config.threhold
+ librosa.amplitude_to_db(rms, ref=1.0)[0] < self.gui_config.threhold
)
for i in range(db_threhold.shape[0]):
if db_threhold[i]:
@@ -636,12 +685,14 @@ if __name__ == "__main__":
self.input_wav[: -self.block_frame] = self.input_wav[
self.block_frame :
].clone()
- self.input_wav[-self.block_frame :] = torch.from_numpy(indata).to(device)
+ self.input_wav[-self.block_frame :] = torch.from_numpy(indata).to(
+ self.config.device
+ )
self.input_wav_res[: -self.block_frame_16k] = self.input_wav_res[
self.block_frame_16k :
].clone()
# input noise reduction and resampling
- if self.config.I_noise_reduce and self.function == "vc":
+ if self.gui_config.I_noise_reduce and self.function == "vc":
input_wav = self.input_wav[
-self.crossfade_frame - self.block_frame - 2 * self.zc :
]
@@ -667,7 +718,7 @@ if __name__ == "__main__":
# infer
if self.function == "vc":
f0_extractor_frame = self.block_frame_16k + 800
- if self.config.f0method == "rmvpe":
+ if self.gui_config.f0method == "rmvpe":
f0_extractor_frame = (
5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160
)
@@ -678,7 +729,7 @@ if __name__ == "__main__":
self.valid_rate,
self.pitch,
self.pitchf,
- self.config.f0method,
+ self.gui_config.f0method,
)
infer_wav = infer_wav[
-self.crossfade_frame - self.sola_search_frame - self.block_frame :
@@ -688,8 +739,8 @@ if __name__ == "__main__":
-self.crossfade_frame - self.sola_search_frame - self.block_frame :
].clone()
# output noise reduction
- if (self.config.O_noise_reduce and self.function == "vc") or (
- self.config.I_noise_reduce and self.function == "im"
+ if (self.gui_config.O_noise_reduce and self.function == "vc") or (
+ self.gui_config.I_noise_reduce and self.function == "im"
):
self.output_buffer[: -self.block_frame] = self.output_buffer[
self.block_frame :
@@ -699,7 +750,7 @@ if __name__ == "__main__":
infer_wav.unsqueeze(0), self.output_buffer.unsqueeze(0)
).squeeze(0)
# volume envelope mixing
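+            # scale the output by (input_rms / output_rms) ** (1 - rms_mix_rate); rate 1 keeps the converted voice's own envelope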
- if self.config.rms_mix_rate < 1 and self.function == "vc":
+ if self.gui_config.rms_mix_rate < 1 and self.function == "vc":
rms1 = librosa.feature.rms(
y=self.input_wav_res[-160 * infer_wav.shape[0] // self.zc :]
.cpu()
@@ -707,7 +758,7 @@ if __name__ == "__main__":
frame_length=640,
hop_length=160,
)
- rms1 = torch.from_numpy(rms1).to(device)
+ rms1 = torch.from_numpy(rms1).to(self.config.device)
rms1 = F.interpolate(
rms1.unsqueeze(0),
size=infer_wav.shape[0] + 1,
@@ -719,7 +770,7 @@ if __name__ == "__main__":
frame_length=4 * self.zc,
hop_length=self.zc,
)
- rms2 = torch.from_numpy(rms2).to(device)
+ rms2 = torch.from_numpy(rms2).to(self.config.device)
rms2 = F.interpolate(
rms2.unsqueeze(0),
size=infer_wav.shape[0] + 1,
@@ -728,7 +779,7 @@ if __name__ == "__main__":
)[0, 0, :-1]
rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-3)
infer_wav *= torch.pow(
- rms1 / rms2, torch.tensor(1 - self.config.rms_mix_rate)
+ rms1 / rms2, torch.tensor(1 - self.gui_config.rms_mix_rate)
)
# SOLA algorithm from https://github.com/yxlllc/DDSP-SVC
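+            # pick the offset in the search window whose normalized cross-correlation with the previous output tail is highest, then crossfade there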
conv_input = infer_wav[
@@ -738,7 +789,7 @@ if __name__ == "__main__":
cor_den = torch.sqrt(
F.conv1d(
conv_input**2,
- torch.ones(1, 1, self.crossfade_frame, device=device),
+ torch.ones(1, 1, self.crossfade_frame, device=self.config.device),
)
+ 1e-8
)
@@ -747,7 +798,7 @@ if __name__ == "__main__":
sola_offset = sola_offset.item()
else:
sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0])
- logger.debug("sola_offset = %d", int(sola_offset))
+ printt("sola_offset = %d", int(sola_offset))
infer_wav = infer_wav[
sola_offset : sola_offset + self.block_frame + self.crossfade_frame
]
@@ -764,7 +815,7 @@ if __name__ == "__main__":
)
total_time = time.perf_counter() - start_time
self.window["infer_time"].update(int(total_time * 1000))
- logger.info("Infer time: %.2f", total_time)
+ printt("Infer time: %.2f", total_time)
def get_devices(self, update: bool = True):
"""获取设备列表"""
@@ -817,9 +868,7 @@ if __name__ == "__main__":
sd.default.device[1] = output_device_indices[
output_devices.index(output_device)
]
- logger.info("Input device: %s:%s", str(sd.default.device[0]), input_device)
- logger.info(
- "Output device: %s:%s", str(sd.default.device[1]), output_device
- )
+ printt("Input device: %s:%s", str(sd.default.device[0]), input_device)
+ printt("Output device: %s:%s", str(sd.default.device[1]), output_device)
gui = GUI()
diff --git a/i18n/locale/en_US.json b/i18n/locale/en_US.json
index 9fa744b..dba5ec3 100644
--- a/i18n/locale/en_US.json
+++ b/i18n/locale/en_US.json
@@ -38,6 +38,7 @@
"加载模型": "Load model",
"加载预训练底模D路径": "Load pre-trained base model D path:",
"加载预训练底模G路径": "Load pre-trained base model G path:",
+ "单次推理": "单次推理",
"卸载音色省显存": "Unload voice to save GPU memory:",
"变调(整数, 半音数量, 升八度12降八度-12)": "Transpose (integer, number of semitones, raise by an octave: 12, lower by an octave: -12):",
"后处理重采样至最终采样率,0为不进行重采样": "Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling:",
@@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "Unfortunately, there is no compatible GPU available to support your training.",
"性能设置": "Performance settings",
"总训练轮数total_epoch": "Total training epochs (total_epoch):",
+ "批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Batch conversion. Enter the folder containing the audio files to be converted or upload multiple audio files. The converted audio will be output in the specified folder (default: 'opt').",
"指定输出主人声文件夹": "Specify the output folder for vocals:",
"指定输出文件夹": "Specify output folder:",
@@ -86,7 +88,7 @@
"特征检索库文件路径,为空则使用下拉的选择结果": "Path to the feature index file. Leave blank to use the selected result from the dropdown:",
"男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ": "Recommended +12 key for male to female conversion, and -12 key for female to male conversion. If the sound range goes too far and the voice is distorted, you can also adjust it to the appropriate range by yourself.",
"目标采样率": "Target sample rate:",
- "算法延迟(ms):": "算法延迟(ms):",
+ "算法延迟(ms):": "Algorithmic delays(ms):",
"自动检测index路径,下拉式选择(dropdown)": "Auto-detect index path and select from the dropdown:",
"融合": "Fusion",
"要改的模型信息": "Model information to be modified:",
@@ -96,8 +98,8 @@
"训练特征索引": "Train feature index",
"训练结束, 您可查看控制台训练日志或实验文件夹下的train.log": "Training complete. You can check the training logs in the console or the 'train.log' file under the experiment folder.",
"请指定说话人id": "Please specify the speaker/singer ID:",
- "请选择index文件": "请选择index文件",
- "请选择pth文件": "请选择pth文件",
+ "请选择index文件": "Please choose the .index file",
+ "请选择pth文件": "Please choose the .pth file",
"请选择说话人id": "Select Speaker/Singer ID:",
"转换": "Convert",
"输入实验名": "Enter the experiment name:",
@@ -105,12 +107,12 @@
"输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "Enter the path of the audio folder to be processed (copy it from the address bar of the file manager):",
"输入待处理音频文件路径(默认是正确格式示例)": "Enter the path of the audio file to be processed (default is the correct format example):",
"输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络": "Adjust the volume envelope scaling. Closer to 0, the more it mimicks the volume of the original vocals. Can help mask noise and make volume sound more natural when set relatively low. Closer to 1 will be more of a consistently loud volume:",
- "输入监听": "输入监听",
+ "输入监听": "Input voice monitor",
"输入训练文件夹路径": "Enter the path of the training folder:",
"输入设备": "Input device",
"输入降噪": "Input noise reduction",
"输出信息": "Output information",
- "输出变声": "输出变声",
+ "输出变声": "Output converted voice",
"输出设备": "Output device",
"输出降噪": "Output noise reduction",
"输出音频(右下角三个点,点了可以下载)": "Export audio (click on the three dots in the lower right corner to download)",
diff --git a/i18n/locale/es_ES.json b/i18n/locale/es_ES.json
index 961cb3a..fdd17f0 100644
--- a/i18n/locale/es_ES.json
+++ b/i18n/locale/es_ES.json
@@ -38,6 +38,7 @@
"加载模型": "Cargar modelo",
"加载预训练底模D路径": "Cargue la ruta del modelo D base pre-entrenada.",
"加载预训练底模G路径": "Cargue la ruta del modelo G base pre-entrenada.",
+ "单次推理": "单次推理",
"卸载音色省显存": "Descargue la voz para ahorrar memoria GPU",
"变调(整数, 半音数量, 升八度12降八度-12)": "Cambio de tono (entero, número de semitonos, subir una octava +12 o bajar una octava -12)",
"后处理重采样至最终采样率,0为不进行重采样": "Remuestreo posterior al proceso a la tasa de muestreo final, 0 significa no remuestrear",
@@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "Lamentablemente, no tiene una tarjeta gráfica adecuada para soportar su entrenamiento",
"性能设置": "Configuración de rendimiento",
"总训练轮数total_epoch": "Total de épocas de entrenamiento (total_epoch)",
+ "批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversión por lotes, ingrese la carpeta que contiene los archivos de audio para convertir o cargue varios archivos de audio. El audio convertido se emitirá en la carpeta especificada (opción predeterminada).",
"指定输出主人声文件夹": "Especifique la carpeta de salida para la voz principal",
"指定输出文件夹": "Especificar carpeta de salida",
diff --git a/i18n/locale/fr_FR.json b/i18n/locale/fr_FR.json
index d12078e..64bb37a 100644
--- a/i18n/locale/fr_FR.json
+++ b/i18n/locale/fr_FR.json
@@ -38,6 +38,7 @@
"加载模型": "Charger le modèle.",
"加载预训练底模D路径": "Charger le chemin du modèle de base pré-entraîné D :",
"加载预训练底模G路径": "Charger le chemin du modèle de base pré-entraîné G :",
+ "单次推理": "单次推理",
"卸载音色省显存": "Décharger la voix pour économiser la mémoire GPU.",
"变调(整数, 半音数量, 升八度12降八度-12)": "Transposer (entier, nombre de demi-tons, monter d'une octave : 12, descendre d'une octave : -12) :",
"后处理重采样至最终采样率,0为不进行重采样": "Rééchantillonner l'audio de sortie en post-traitement à la fréquence d'échantillonnage finale. Réglez sur 0 pour ne pas effectuer de rééchantillonnage :",
@@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "Malheureusement, il n'y a pas de GPU compatible disponible pour prendre en charge votre entrainement.",
"性能设置": "Paramètres de performance",
"总训练轮数total_epoch": "Nombre total d'époques d'entraînement (total_epoch) :",
+ "批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversion en lot. Entrez le dossier contenant les fichiers audio à convertir ou téléchargez plusieurs fichiers audio. Les fichiers audio convertis seront enregistrés dans le dossier spécifié (par défaut : 'opt').",
"指定输出主人声文件夹": "Spécifiez le dossier de sortie pour les fichiers de voix :",
"指定输出文件夹": "Spécifiez le dossier de sortie :",
diff --git a/i18n/locale/it_IT.json b/i18n/locale/it_IT.json
index 38fdef8..02eac59 100644
--- a/i18n/locale/it_IT.json
+++ b/i18n/locale/it_IT.json
@@ -38,6 +38,7 @@
"加载模型": "Carica modello",
"加载预训练底模D路径": "Carica il percorso D del modello base pre-addestrato:",
"加载预训练底模G路径": "Carica il percorso G del modello base pre-addestrato:",
+ "单次推理": "单次推理",
"卸载音色省显存": "Scarica la voce per risparmiare memoria della GPU:",
"变调(整数, 半音数量, 升八度12降八度-12)": "Trasposizione (numero intero, numero di semitoni, alza di un'ottava: 12, abbassa di un'ottava: -12):",
"后处理重采样至最终采样率,0为不进行重采样": "Ricampiona l'audio di output in post-elaborazione alla frequenza di campionamento finale. ",
@@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "Sfortunatamente, non è disponibile alcuna GPU compatibile per supportare l'addestramento.",
"性能设置": "Impostazioni delle prestazioni",
"总训练轮数total_epoch": "Epoch totali di addestramento (total_epoch):",
+ "批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Conversione massiva. Inserisci il percorso della cartella che contiene i file da convertire o carica più file audio. I file convertiti finiranno nella cartella specificata. (default: opt) ",
"指定输出主人声文件夹": "Specifica la cartella di output per le voci:",
"指定输出文件夹": "Specifica la cartella di output:",
diff --git a/i18n/locale/ja_JP.json b/i18n/locale/ja_JP.json
index 903ed87..d02f331 100644
--- a/i18n/locale/ja_JP.json
+++ b/i18n/locale/ja_JP.json
@@ -38,6 +38,7 @@
"加载模型": "モデルをロード",
"加载预训练底模D路径": "事前学習済みのDモデルのパス",
"加载预训练底模G路径": "事前学習済みのGモデルのパス",
+ "单次推理": "单次推理",
"卸载音色省显存": "音源を削除してメモリを節約",
"变调(整数, 半音数量, 升八度12降八度-12)": "ピッチ変更(整数、半音数、上下オクターブ12-12)",
"后处理重采样至最终采样率,0为不进行重采样": "最終的なサンプリングレートへのポストプロセッシングのリサンプリング リサンプリングしない場合は0",
@@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "トレーニングに対応したGPUが動作しないのは残念です。",
"性能设置": "パフォーマンス設定",
"总训练轮数total_epoch": "総エポック数",
+ "批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "一括変換、変換する音声フォルダを入力、または複数の音声ファイルをアップロードし、指定したフォルダ(デフォルトのopt)に変換した音声を出力します。",
"指定输出主人声文件夹": "マスターの出力音声フォルダーを指定する",
"指定输出文件夹": "出力フォルダを指定してください",
diff --git a/i18n/locale/ru_RU.json b/i18n/locale/ru_RU.json
index b6530ac..9d7ef8e 100644
--- a/i18n/locale/ru_RU.json
+++ b/i18n/locale/ru_RU.json
@@ -38,6 +38,7 @@
"加载模型": "Загрузить модель",
"加载预训练底模D路径": "Путь к предварительно обученной базовой модели D:",
"加载预训练底模G路径": "Путь к предварительно обученной базовой модели G:",
+ "单次推理": "单次推理",
"卸载音色省显存": "Выгрузить модель из памяти GPU для освобождения ресурсов",
"变调(整数, 半音数量, 升八度12降八度-12)": "Изменить высоту голоса (укажите количество полутонов; чтобы поднять голос на октаву, выберите 12, понизить на октаву — -12):",
"后处理重采样至最终采样率,0为不进行重采样": "Изменить частоту дискретизации в выходном файле на финальную. Поставьте 0, чтобы ничего не изменялось:",
@@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "К сожалению, у вас нету графического процессора, который поддерживает обучение моделей.",
"性能设置": "Настройки быстроты",
"总训练轮数total_epoch": "Полное количество эпох (total_epoch):",
+ "批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Массовое преобразование. Введите путь к папке, в которой находятся файлы для преобразования голоса или выгрузите несколько аудиофайлов. Сконвертированные файлы будут сохранены в указанной папке (по умолчанию: 'opt').",
"指定输出主人声文件夹": "Путь к папке для сохранения вокала:",
"指定输出文件夹": "Папка для результатов:",
diff --git a/i18n/locale/tr_TR.json b/i18n/locale/tr_TR.json
index efd921b..04c6102 100644
--- a/i18n/locale/tr_TR.json
+++ b/i18n/locale/tr_TR.json
@@ -38,6 +38,7 @@
"加载模型": "Model yükle",
"加载预训练底模D路径": "Önceden eğitilmiş temel D modelini yükleme yolu:",
"加载预训练底模G路径": "Önceden eğitilmiş temel G modelini yükleme yolu:",
+ "单次推理": "单次推理",
"卸载音色省显存": "GPU bellek kullanımını azaltmak için sesi kaldır",
"变调(整数, 半音数量, 升八度12降八度-12)": "Transpoze et (tamsayı, yarıton sayısıyla; bir oktav yükseltmek için: 12, bir oktav düşürmek için: -12):",
"后处理重采样至最终采样率,0为不进行重采样": "Son işleme aşamasında çıktı sesini son örnekleme hızına yeniden örnekle. 0 değeri için yeniden örnekleme yapılmaz:",
@@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "Maalesef, eğitiminizi desteklemek için uyumlu bir GPU bulunmamaktadır.",
"性能设置": "Performans ayarları",
"总训练轮数total_epoch": "Toplam eğitim turu (total_epoch):",
+ "批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "Toplu dönüştür. Dönüştürülecek ses dosyalarının bulunduğu klasörü girin veya birden çok ses dosyasını yükleyin. Dönüştürülen ses dosyaları belirtilen klasöre ('opt' varsayılan olarak) dönüştürülecektir",
"指定输出主人声文件夹": "Vokal için çıkış klasörünü belirtin:",
"指定输出文件夹": "Çıkış klasörünü belirt:",
diff --git a/i18n/locale/zh_CN.json b/i18n/locale/zh_CN.json
index b14e5f0..2c77001 100644
--- a/i18n/locale/zh_CN.json
+++ b/i18n/locale/zh_CN.json
@@ -38,6 +38,7 @@
"加载模型": "加载模型",
"加载预训练底模D路径": "加载预训练底模D路径",
"加载预训练底模G路径": "加载预训练底模G路径",
+ "单次推理": "单次推理",
"卸载音色省显存": "卸载音色省显存",
"变调(整数, 半音数量, 升八度12降八度-12)": "变调(整数, 半音数量, 升八度12降八度-12)",
"后处理重采样至最终采样率,0为不进行重采样": "后处理重采样至最终采样率,0为不进行重采样",
@@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练",
"性能设置": "性能设置",
"总训练轮数total_epoch": "总训练轮数total_epoch",
+ "批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ",
"指定输出主人声文件夹": "指定输出主人声文件夹",
"指定输出文件夹": "指定输出文件夹",
diff --git a/i18n/locale/zh_HK.json b/i18n/locale/zh_HK.json
index fa2fbad..b7f6171 100644
--- a/i18n/locale/zh_HK.json
+++ b/i18n/locale/zh_HK.json
@@ -38,6 +38,7 @@
"加载模型": "載入模型",
"加载预训练底模D路径": "加載預訓練底模D路徑",
"加载预训练底模G路径": "加載預訓練底模G路徑",
+ "单次推理": "单次推理",
"卸载音色省显存": "卸載音色節省 VRAM",
"变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)",
"后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣",
@@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练",
"性能设置": "效能設定",
"总训练轮数total_epoch": "總訓練輪數total_epoch",
+ "批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。",
"指定输出主人声文件夹": "指定输出主人声文件夹",
"指定输出文件夹": "指定輸出資料夾",
diff --git a/i18n/locale/zh_SG.json b/i18n/locale/zh_SG.json
index fa2fbad..b7f6171 100644
--- a/i18n/locale/zh_SG.json
+++ b/i18n/locale/zh_SG.json
@@ -38,6 +38,7 @@
"加载模型": "載入模型",
"加载预训练底模D路径": "加載預訓練底模D路徑",
"加载预训练底模G路径": "加載預訓練底模G路徑",
+ "单次推理": "单次推理",
"卸载音色省显存": "卸載音色節省 VRAM",
"变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)",
"后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣",
@@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练",
"性能设置": "效能設定",
"总训练轮数total_epoch": "總訓練輪數total_epoch",
+ "批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。",
"指定输出主人声文件夹": "指定输出主人声文件夹",
"指定输出文件夹": "指定輸出資料夾",
diff --git a/i18n/locale/zh_TW.json b/i18n/locale/zh_TW.json
index fa2fbad..b7f6171 100644
--- a/i18n/locale/zh_TW.json
+++ b/i18n/locale/zh_TW.json
@@ -38,6 +38,7 @@
"加载模型": "載入模型",
"加载预训练底模D路径": "加載預訓練底模D路徑",
"加载预训练底模G路径": "加載預訓練底模G路徑",
+ "单次推理": "单次推理",
"卸载音色省显存": "卸載音色節省 VRAM",
"变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)",
"后处理重采样至最终采样率,0为不进行重采样": "後處理重採樣至最終採樣率,0為不進行重採樣",
@@ -53,6 +54,7 @@
"很遗憾您这没有能用的显卡来支持您训练": "很遗憾您这没有能用的显卡来支持您训练",
"性能设置": "效能設定",
"总训练轮数total_epoch": "總訓練輪數total_epoch",
+ "批量推理": "批量推理",
"批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ": "批量轉換,輸入待轉換音頻資料夾,或上傳多個音頻檔案,在指定資料夾(默認opt)下輸出轉換的音頻。",
"指定输出主人声文件夹": "指定输出主人声文件夹",
"指定输出文件夹": "指定輸出資料夾",
diff --git a/infer-web.py b/infer-web.py
index 542b42b..57c8845 100644
--- a/infer-web.py
+++ b/infer-web.py
@@ -1,36 +1,46 @@
-import os, sys
+import os
+import sys
now_dir = os.getcwd()
sys.path.append(now_dir)
-import logging
-import shutil
-import threading
-import traceback
-import warnings
-from random import shuffle
-from subprocess import Popen
-from time import sleep
-import json
-import pathlib
-
-import fairseq
-import faiss
-import gradio as gr
-import numpy as np
-import torch
-from dotenv import load_dotenv
-from sklearn.cluster import MiniBatchKMeans
-
-from configs.config import Config
-from i18n.i18n import I18nAuto
+from infer.modules.vc.modules import VC
+from infer.modules.uvr5.modules import uvr
from infer.lib.train.process_ckpt import (
change_info,
extract_small_model,
merge,
show_info,
)
-from infer.modules.uvr5.modules import uvr
-from infer.modules.vc.modules import VC
+from i18n.i18n import I18nAuto
+from configs.config import Config
+from sklearn.cluster import MiniBatchKMeans
+from dotenv import load_dotenv
+import torch
+
+try:
+ import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
+
+ if torch.xpu.is_available():
+ from infer.modules.ipex import ipex_init
+
+ ipex_init()
+except Exception: # pylint: disable=broad-exception-caught
+ pass
+import numpy as np
+import gradio as gr
+import faiss
+import fairseq
+import pathlib
+import json
+from time import sleep
+from subprocess import Popen
+from random import shuffle
+import warnings
+import traceback
+import threading
+import shutil
+import logging
+
logging.getLogger("numba").setLevel(logging.WARNING)
@@ -165,10 +175,10 @@ def clean():
return {"value": "", "__type__": "update"}
-def export_onnx():
+def export_onnx(ModelPath, ExportedPath):
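+    # ModelPath: source .pth voice model; ExportedPath: destination .onnx file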
from infer.modules.onnx.export import export_onnx as eo
- eo()
+ eo(ModelPath, ExportedPath)
sr_dict = {
@@ -219,8 +229,9 @@ def preprocess_dataset(trainset_dir, exp_dir, sr, n_p):
per,
)
logger.info(cmd)
- p = Popen(cmd, shell=True) # , stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir
-    ### gradio quirk: Popen output is only readable in one chunk after the process exits (plain runs stream line by line); the workaround is a separate log file polled on a timer
+ # , stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir
+ p = Popen(cmd, shell=True)
+    # gradio quirk: Popen output is only readable in one chunk after the process exits (plain runs stream line by line); the workaround is a separate log file polled on a timer
done = [False]
threading.Thread(
target=if_done,
@@ -263,7 +274,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp
p = Popen(
cmd, shell=True, cwd=now_dir
) # , stdin=PIPE, stdout=PIPE,stderr=PIPE
-    ###gradio quirk: Popen output is only readable in one chunk after the process exits (plain runs stream line by line); the workaround is a separate log file polled on a timer
+    # gradio quirk: Popen output is only readable in one chunk after the process exits (plain runs stream line by line); the workaround is a separate log file polled on a timer
done = [False]
threading.Thread(
target=if_done,
@@ -295,7 +306,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp
cmd, shell=True, cwd=now_dir
) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
ps.append(p)
-    ###gradio quirk: Popen output is only readable in one chunk after the process exits (plain runs stream line by line); the workaround is a separate log file polled on a timer
+    # gradio quirk: Popen output is only readable in one chunk after the process exits (plain runs stream line by line); the workaround is a separate log file polled on a timer
done = [False]
threading.Thread(
target=if_done_multi, #
@@ -331,7 +342,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp
log = f.read()
logger.info(log)
yield log
-    ####spawn a separate process for each part
+    # spawn a separate process for each part
"""
n_part=int(sys.argv[1])
i_part=int(sys.argv[2])
@@ -360,7 +371,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp
cmd, shell=True, cwd=now_dir
) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
ps.append(p)
-    ###gradio quirk: Popen output is only readable in one chunk after the process exits (plain runs stream line by line); the workaround is a separate log file polled on a timer
+    # gradio quirk: Popen output is only readable in one chunk after the process exits (plain runs stream line by line); the workaround is a separate log file polled on a timer
done = [False]
threading.Thread(
target=if_done_multi,
@@ -701,11 +712,11 @@ def train1key(
infos.append(strr)
return "\n".join(infos)
-    ####### step1: preprocess the data
+    # step 1: preprocess the data
yield get_info_str(i18n("step1:正在处理数据"))
[get_info_str(_) for _ in preprocess_dataset(trainset_dir4, exp_dir1, sr2, np7)]
-    ####### step2a: extract pitch
+    # step 2a: extract pitch
yield get_info_str(i18n("step2:正在提取音高&正在提取特征"))
[
get_info_str(_)
@@ -714,7 +725,7 @@ def train1key(
)
]
-    ####### step3a: train the model
+    # step 3a: train the model
yield get_info_str(i18n("step3a:正在训练模型"))
click_train(
exp_dir1,
@@ -734,7 +745,7 @@ def train1key(
)
yield get_info_str(i18n("训练结束, 您可查看控制台训练日志或实验文件夹下的train.log"))
-    ####### step3b: train the feature index
+    # step 3b: train the feature index
[get_info_str(_) for _ in train_index(exp_dir1, version19)]
yield get_info_str(i18n("全流程结束!"))
@@ -768,6 +779,7 @@ def change_f0_method(f0method8):
with gr.Blocks(title="RVC WebUI") as app:
+ gr.Markdown("## RVC WebUI")
gr.Markdown(
value=i18n(
"本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE."
@@ -777,8 +789,9 @@ with gr.Blocks(title="RVC WebUI") as app:
with gr.TabItem(i18n("模型推理")):
with gr.Row():
sid0 = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names))
- refresh_button = gr.Button(i18n("刷新音色列表和索引路径"), variant="primary")
- clean_button = gr.Button(i18n("卸载音色省显存"), variant="primary")
+ with gr.Column():
+ refresh_button = gr.Button(i18n("刷新音色列表和索引路径"), variant="primary")
+ clean_button = gr.Button(i18n("卸载音色省显存"), variant="primary")
spk_item = gr.Slider(
minimum=0,
maximum=2333,
@@ -791,118 +804,125 @@ with gr.Blocks(title="RVC WebUI") as app:
clean_button.click(
fn=clean, inputs=[], outputs=[sid0], api_name="infer_clean"
)
- with gr.Group():
- gr.Markdown(
- value=i18n("男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ")
- )
- with gr.Row():
- with gr.Column():
- vc_transform0 = gr.Number(
- label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0
- )
- input_audio0 = gr.Textbox(
- label=i18n("输入待处理音频文件路径(默认是正确格式示例)"),
- value="E:\\codes\\py39\\test-20230416b\\todo-songs\\冬之花clip1.wav",
- )
- f0method0 = gr.Radio(
- label=i18n(
- "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU"
- ),
- choices=["pm", "harvest", "crepe", "rmvpe"]
- if config.dml == False
- else ["pm", "harvest", "rmvpe"],
- value="pm",
- interactive=True,
- )
- filter_radius0 = gr.Slider(
- minimum=0,
- maximum=7,
- label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"),
- value=3,
- step=1,
- interactive=True,
- )
- with gr.Column():
- file_index1 = gr.Textbox(
- label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
- value="",
- interactive=True,
- )
- file_index2 = gr.Dropdown(
- label=i18n("自动检测index路径,下拉式选择(dropdown)"),
- choices=sorted(index_paths),
- interactive=True,
- )
- refresh_button.click(
- fn=change_choices,
- inputs=[],
- outputs=[sid0, file_index2],
- api_name="infer_refresh",
- )
- # file_big_npy1 = gr.Textbox(
- # label=i18n("特征文件路径"),
- # value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
- # interactive=True,
- # )
- index_rate1 = gr.Slider(
- minimum=0,
- maximum=1,
- label=i18n("检索特征占比"),
- value=0.75,
- interactive=True,
- )
- with gr.Column():
- resample_sr0 = gr.Slider(
- minimum=0,
- maximum=48000,
- label=i18n("后处理重采样至最终采样率,0为不进行重采样"),
- value=0,
- step=1,
- interactive=True,
- )
- rms_mix_rate0 = gr.Slider(
- minimum=0,
- maximum=1,
- label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"),
- value=0.25,
- interactive=True,
- )
- protect0 = gr.Slider(
- minimum=0,
- maximum=0.5,
- label=i18n(
- "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"
- ),
- value=0.33,
- step=0.01,
- interactive=True,
- )
- f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"))
- but0 = gr.Button(i18n("转换"), variant="primary")
+ with gr.TabItem(i18n("单次推理")):
+ with gr.Group():
with gr.Row():
- vc_output1 = gr.Textbox(label=i18n("输出信息"))
- vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)"))
- but0.click(
- vc.vc_single,
- [
- spk_item,
- input_audio0,
- vc_transform0,
- f0_file,
- f0method0,
- file_index1,
- file_index2,
- # file_big_npy1,
- index_rate1,
- filter_radius0,
- resample_sr0,
- rms_mix_rate0,
- protect0,
- ],
- [vc_output1, vc_output2],
- api_name="infer_convert",
- )
- with gr.Group():
+ with gr.Column():
+ vc_transform0 = gr.Number(
+ label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0
+ )
+ input_audio0 = gr.Textbox(
+ label=i18n("输入待处理音频文件路径(默认是正确格式示例)"),
+ placeholder="C:\\Users\\Desktop\\audio_example.wav",
+ )
+ file_index1 = gr.Textbox(
+ label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
+ placeholder="C:\\Users\\Desktop\\model_example.index",
+ interactive=True,
+ )
+ file_index2 = gr.Dropdown(
+ label=i18n("自动检测index路径,下拉式选择(dropdown)"),
+ choices=sorted(index_paths),
+ interactive=True,
+ )
+ f0method0 = gr.Radio(
+ label=i18n(
+ "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU"
+ ),
+ choices=["pm", "harvest", "crepe", "rmvpe"]
+                            if not config.dml
+ else ["pm", "harvest", "rmvpe"],
+ value="rmvpe",
+ interactive=True,
+ )
+
+ with gr.Column():
+ resample_sr0 = gr.Slider(
+ minimum=0,
+ maximum=48000,
+ label=i18n("后处理重采样至最终采样率,0为不进行重采样"),
+ value=0,
+ step=1,
+ interactive=True,
+ )
+ rms_mix_rate0 = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"),
+ value=0.25,
+ interactive=True,
+ )
+ protect0 = gr.Slider(
+ minimum=0,
+ maximum=0.5,
+ label=i18n(
+ "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"
+ ),
+ value=0.33,
+ step=0.01,
+ interactive=True,
+ )
+ filter_radius0 = gr.Slider(
+ minimum=0,
+ maximum=7,
+ label=i18n(
+ ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"
+ ),
+ value=3,
+ step=1,
+ interactive=True,
+ )
+ index_rate1 = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("检索特征占比"),
+ value=0.75,
+ interactive=True,
+ )
+ f0_file = gr.File(
+ label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"),
+ visible=False,
+ )
+
+ refresh_button.click(
+ fn=change_choices,
+ inputs=[],
+ outputs=[sid0, file_index2],
+ api_name="infer_refresh",
+ )
+ # file_big_npy1 = gr.Textbox(
+ # label=i18n("特征文件路径"),
+ # value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
+ # interactive=True,
+ # )
+ with gr.Group():
+ with gr.Column():
+ but0 = gr.Button(i18n("转换"), variant="primary")
+ with gr.Row():
+ vc_output1 = gr.Textbox(label=i18n("输出信息"))
+ vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)"))
+
+ but0.click(
+ vc.vc_single,
+ [
+ spk_item,
+ input_audio0,
+ vc_transform0,
+ f0_file,
+ f0method0,
+ file_index1,
+ file_index2,
+ # file_big_npy1,
+ index_rate1,
+ filter_radius0,
+ resample_sr0,
+ rms_mix_rate0,
+ protect0,
+ ],
+ [vc_output1, vc_output2],
+ api_name="infer_convert",
+ )
+ with gr.TabItem(i18n("批量推理")):
gr.Markdown(
value=i18n("批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ")
)
@@ -912,25 +932,6 @@ with gr.Blocks(title="RVC WebUI") as app:
label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0
)
opt_input = gr.Textbox(label=i18n("指定输出文件夹"), value="opt")
- f0method1 = gr.Radio(
- label=i18n(
- "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU"
- ),
- choices=["pm", "harvest", "crepe", "rmvpe"]
- if config.dml == False
- else ["pm", "harvest", "rmvpe"],
- value="pm",
- interactive=True,
- )
- filter_radius1 = gr.Slider(
- minimum=0,
- maximum=7,
- label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"),
- value=3,
- step=1,
- interactive=True,
- )
- with gr.Column():
file_index3 = gr.Textbox(
label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
value="",
@@ -941,6 +942,23 @@ with gr.Blocks(title="RVC WebUI") as app:
choices=sorted(index_paths),
interactive=True,
)
+ f0method1 = gr.Radio(
+ label=i18n(
+ "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU,rmvpe效果最好且微吃GPU"
+ ),
+ choices=["pm", "harvest", "crepe", "rmvpe"]
+                            if not config.dml
+ else ["pm", "harvest", "rmvpe"],
+ value="rmvpe",
+ interactive=True,
+ )
+ format1 = gr.Radio(
+ label=i18n("导出文件格式"),
+ choices=["wav", "flac", "mp3", "m4a"],
+ value="wav",
+ interactive=True,
+ )
+
refresh_button.click(
fn=lambda: change_choices()[1],
inputs=[],
@@ -952,13 +970,7 @@ with gr.Blocks(title="RVC WebUI") as app:
# value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
# interactive=True,
# )
- index_rate2 = gr.Slider(
- minimum=0,
- maximum=1,
- label=i18n("检索特征占比"),
- value=1,
- interactive=True,
- )
+
with gr.Column():
resample_sr1 = gr.Slider(
minimum=0,
@@ -985,23 +997,34 @@ with gr.Blocks(title="RVC WebUI") as app:
step=0.01,
interactive=True,
)
- with gr.Column():
- dir_input = gr.Textbox(
- label=i18n("输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)"),
- value="E:\codes\py39\\test-20230416b\\todo-songs",
- )
- inputs = gr.File(
- file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹")
- )
- with gr.Row():
- format1 = gr.Radio(
- label=i18n("导出文件格式"),
- choices=["wav", "flac", "mp3", "m4a"],
- value="flac",
+ filter_radius1 = gr.Slider(
+ minimum=0,
+ maximum=7,
+ label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"),
+ value=3,
+ step=1,
interactive=True,
)
- but1 = gr.Button(i18n("转换"), variant="primary")
- vc_output3 = gr.Textbox(label=i18n("输出信息"))
+ index_rate2 = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("检索特征占比"),
+ value=1,
+ interactive=True,
+ )
+ with gr.Row():
+ dir_input = gr.Textbox(
+ label=i18n("输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)"),
+ placeholder="C:\\Users\\Desktop\\input_vocal_dir",
+ )
+ inputs = gr.File(
+ file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹")
+ )
+
+ with gr.Row():
+ but1 = gr.Button(i18n("转换"), variant="primary")
+ vc_output3 = gr.Textbox(label=i18n("输出信息"))
+
but1.click(
vc.vc_multi,
[
@@ -1024,12 +1047,12 @@ with gr.Blocks(title="RVC WebUI") as app:
[vc_output3],
api_name="infer_convert_batch",
)
- sid0.change(
- fn=vc.get_vc,
- inputs=[sid0, protect0, protect1],
- outputs=[spk_item, protect0, protect1, file_index2, file_index4],
- api_name="infer_change_voice",
- )
+ sid0.change(
+ fn=vc.get_vc,
+ inputs=[sid0, protect0, protect1],
+ outputs=[spk_item, protect0, protect1, file_index2, file_index4],
+ api_name="infer_change_voice",
+ )
with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")):
with gr.Group():
gr.Markdown(
@@ -1041,7 +1064,7 @@ with gr.Blocks(title="RVC WebUI") as app:
with gr.Column():
dir_wav_input = gr.Textbox(
label=i18n("输入待处理音频文件夹路径"),
- value="E:\\codes\\py39\\test-20230416b\\todo-songs\\todo-songs",
+ placeholder="C:\\Users\\Desktop\\todo-songs",
)
wav_inputs = gr.File(
file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹")
diff --git a/infer/lib/infer_pack/attentions.py b/infer/lib/infer_pack/attentions.py
index 2b6060c..2cc745a 100644
--- a/infer/lib/infer_pack/attentions.py
+++ b/infer/lib/infer_pack/attentions.py
@@ -1,5 +1,6 @@
import copy
import math
+from typing import Optional
import numpy as np
import torch
@@ -22,11 +23,11 @@ class Encoder(nn.Module):
window_size=10,
**kwargs
):
- super().__init__()
+ super(Encoder, self).__init__()
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
- self.n_layers = n_layers
+ self.n_layers = int(n_layers)
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.window_size = window_size
@@ -61,14 +62,17 @@ class Encoder(nn.Module):
def forward(self, x, x_mask):
attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
x = x * x_mask
- for i in range(self.n_layers):
- y = self.attn_layers[i](x, x, attn_mask)
+ zippep = zip(
+ self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2
+ )
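+        # iterate the ModuleLists directly; indexing them with a loop variable is not torch.jit.script-friendly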
+ for attn_layers, norm_layers_1, ffn_layers, norm_layers_2 in zippep:
+ y = attn_layers(x, x, attn_mask)
y = self.drop(y)
- x = self.norm_layers_1[i](x + y)
+ x = norm_layers_1(x + y)
- y = self.ffn_layers[i](x, x_mask)
+ y = ffn_layers(x, x_mask)
y = self.drop(y)
- x = self.norm_layers_2[i](x + y)
+ x = norm_layers_2(x + y)
x = x * x_mask
return x
@@ -86,7 +90,7 @@ class Decoder(nn.Module):
proximal_init=True,
**kwargs
):
- super().__init__()
+ super(Decoder, self).__init__()
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
@@ -172,7 +176,7 @@ class MultiHeadAttention(nn.Module):
proximal_bias=False,
proximal_init=False,
):
- super().__init__()
+ super(MultiHeadAttention, self).__init__()
assert channels % n_heads == 0
self.channels = channels
@@ -213,19 +217,28 @@ class MultiHeadAttention(nn.Module):
self.conv_k.weight.copy_(self.conv_q.weight)
self.conv_k.bias.copy_(self.conv_q.bias)
- def forward(self, x, c, attn_mask=None):
+ def forward(
+ self, x: torch.Tensor, c: torch.Tensor, attn_mask: Optional[torch.Tensor] = None
+ ):
q = self.conv_q(x)
k = self.conv_k(c)
v = self.conv_v(c)
- x, self.attn = self.attention(q, k, v, mask=attn_mask)
+ x, _ = self.attention(q, k, v, mask=attn_mask)
x = self.conv_o(x)
return x
- def attention(self, query, key, value, mask=None):
+ def attention(
+ self,
+ query: torch.Tensor,
+ key: torch.Tensor,
+ value: torch.Tensor,
+ mask: Optional[torch.Tensor] = None,
+ ):
# reshape [b, d, t] -> [b, n_h, t, d_k]
- b, d, t_s, t_t = (*key.size(), query.size(2))
+ b, d, t_s = key.size()
+ t_t = query.size(2)
query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
@@ -292,16 +305,17 @@ class MultiHeadAttention(nn.Module):
ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
return ret
- def _get_relative_embeddings(self, relative_embeddings, length):
+ def _get_relative_embeddings(self, relative_embeddings, length: int):
max_relative_position = 2 * self.window_size + 1
# Pad first before slice to avoid using cond ops.
- pad_length = max(length - (self.window_size + 1), 0)
+ pad_length: int = max(length - (self.window_size + 1), 0)
slice_start_position = max((self.window_size + 1) - length, 0)
slice_end_position = slice_start_position + 2 * length - 1
if pad_length > 0:
padded_relative_embeddings = F.pad(
relative_embeddings,
- commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
+ # commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
+ [0, 0, pad_length, pad_length, 0, 0],
)
else:
padded_relative_embeddings = relative_embeddings
@@ -317,12 +331,18 @@ class MultiHeadAttention(nn.Module):
"""
batch, heads, length, _ = x.size()
# Concat columns of pad to shift from relative to absolute indexing.
- x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
+ x = F.pad(
+ x,
+ # commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])
+ [0, 1, 0, 0, 0, 0, 0, 0],
+ )
# Concat extra elements so to add up to shape (len+1, 2*len-1).
x_flat = x.view([batch, heads, length * 2 * length])
x_flat = F.pad(
- x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
+ x_flat,
+ # commons.convert_pad_shape([[0, 0], [0, 0], [0, int(length) - 1]])
+ [0, int(length) - 1, 0, 0, 0, 0],
)
# Reshape and slice out the padded elements.
@@ -339,15 +359,21 @@ class MultiHeadAttention(nn.Module):
batch, heads, length, _ = x.size()
# padd along column
x = F.pad(
- x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
+ x,
+ # commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, int(length) - 1]])
+ [0, int(length) - 1, 0, 0, 0, 0, 0, 0],
)
- x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
+ x_flat = x.view([batch, heads, int(length**2) + int(length * (length - 1))])
# add 0's in the beginning that will skew the elements after reshape
- x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
+ x_flat = F.pad(
+ x_flat,
+ # commons.convert_pad_shape([[0, 0], [0, 0], [int(length), 0]])
+ [length, 0, 0, 0, 0, 0],
+ )
x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
return x_final
- def _attention_bias_proximal(self, length):
+ def _attention_bias_proximal(self, length: int):
"""Bias for self-attention to encourage attention to close positions.
Args:
length: an integer scalar.
@@ -367,10 +393,10 @@ class FFN(nn.Module):
filter_channels,
kernel_size,
p_dropout=0.0,
- activation=None,
+        activation: Optional[str] = None,
causal=False,
):
- super().__init__()
+ super(FFN, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.filter_channels = filter_channels
@@ -378,40 +404,56 @@ class FFN(nn.Module):
self.p_dropout = p_dropout
self.activation = activation
self.causal = causal
-
- if causal:
- self.padding = self._causal_padding
- else:
- self.padding = self._same_padding
+        self.is_activation = activation == "gelu"
+ # if causal:
+ # self.padding = self._causal_padding
+ # else:
+ # self.padding = self._same_padding
self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
self.drop = nn.Dropout(p_dropout)
- def forward(self, x, x_mask):
- x = self.conv_1(self.padding(x * x_mask))
- if self.activation == "gelu":
+ def padding(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor:
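+        # choose the padding mode at call time; the old pattern of binding self.padding to a method in __init__ breaks torch.jit.script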
+ if self.causal:
+ padding = self._causal_padding(x * x_mask)
+ else:
+ padding = self._same_padding(x * x_mask)
+ return padding
+
+ def forward(self, x: torch.Tensor, x_mask: torch.Tensor):
+ x = self.conv_1(self.padding(x, x_mask))
+ if self.is_activation:
x = x * torch.sigmoid(1.702 * x)
else:
x = torch.relu(x)
x = self.drop(x)
- x = self.conv_2(self.padding(x * x_mask))
+
+ x = self.conv_2(self.padding(x, x_mask))
return x * x_mask
def _causal_padding(self, x):
if self.kernel_size == 1:
return x
- pad_l = self.kernel_size - 1
- pad_r = 0
- padding = [[0, 0], [0, 0], [pad_l, pad_r]]
- x = F.pad(x, commons.convert_pad_shape(padding))
+ pad_l: int = self.kernel_size - 1
+ pad_r: int = 0
+ # padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+ x = F.pad(
+ x,
+ # commons.convert_pad_shape(padding)
+ [pad_l, pad_r, 0, 0, 0, 0],
+ )
return x
def _same_padding(self, x):
if self.kernel_size == 1:
return x
- pad_l = (self.kernel_size - 1) // 2
- pad_r = self.kernel_size // 2
- padding = [[0, 0], [0, 0], [pad_l, pad_r]]
- x = F.pad(x, commons.convert_pad_shape(padding))
+ pad_l: int = (self.kernel_size - 1) // 2
+ pad_r: int = self.kernel_size // 2
+ # padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+ x = F.pad(
+ x,
+ # commons.convert_pad_shape(padding)
+ [pad_l, pad_r, 0, 0, 0, 0],
+ )
return x
diff --git a/infer/lib/infer_pack/commons.py b/infer/lib/infer_pack/commons.py
index 7ba7d21..4ec6c24 100644
--- a/infer/lib/infer_pack/commons.py
+++ b/infer/lib/infer_pack/commons.py
@@ -1,3 +1,4 @@
+from typing import List, Optional
import math
import numpy as np
@@ -16,10 +17,10 @@ def get_padding(kernel_size, dilation=1):
return int((kernel_size * dilation - dilation) / 2)
-def convert_pad_shape(pad_shape):
- l = pad_shape[::-1]
- pad_shape = [item for sublist in l for item in sublist]
- return pad_shape
+# def convert_pad_shape(pad_shape):
+# l = pad_shape[::-1]
+# pad_shape = [item for sublist in l for item in sublist]
+# return pad_shape
def kl_divergence(m_p, logs_p, m_q, logs_q):
@@ -113,10 +114,14 @@ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
return acts
-def convert_pad_shape(pad_shape):
- l = pad_shape[::-1]
- pad_shape = [item for sublist in l for item in sublist]
- return pad_shape
+# def convert_pad_shape(pad_shape):
+# l = pad_shape[::-1]
+# pad_shape = [item for sublist in l for item in sublist]
+# return pad_shape
+
+
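+# TorchScript-friendly replacement for the old list-flattening helper: F.pad
+# expects a flat pad list ordered from the last dimension inward, i.e. the
+# reversed per-dimension pairs, flattened.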
+def convert_pad_shape(pad_shape: List[List[int]]) -> List[int]:
+ return torch.tensor(pad_shape).flip(0).reshape(-1).int().tolist()
def shift_1d(x):
@@ -124,7 +129,7 @@ def shift_1d(x):
return x
-def sequence_mask(length, max_length=None):
+def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None):
if max_length is None:
max_length = length.max()
x = torch.arange(max_length, dtype=length.dtype, device=length.device)
diff --git a/infer/lib/infer_pack/models.py b/infer/lib/infer_pack/models.py
index 711db22..a60ced6 100644
--- a/infer/lib/infer_pack/models.py
+++ b/infer/lib/infer_pack/models.py
@@ -1,5 +1,6 @@
import math
import logging
+from typing import Optional
logger = logging.getLogger(__name__)
@@ -28,25 +29,32 @@ class TextEncoder256(nn.Module):
p_dropout,
f0=True,
):
- super().__init__()
+ super(TextEncoder256, self).__init__()
self.out_channels = out_channels
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
- self.p_dropout = p_dropout
+ self.p_dropout = float(p_dropout)
self.emb_phone = nn.Linear(256, hidden_channels)
self.lrelu = nn.LeakyReLU(0.1, inplace=True)
if f0 == True:
self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
self.encoder = attentions.Encoder(
- hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ float(p_dropout),
)
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
- def forward(self, phone, pitch, lengths):
- if pitch == None:
+ def forward(
+ self, phone: torch.Tensor, pitch: Optional[torch.Tensor], lengths: torch.Tensor
+ ):
+ if pitch is None:
x = self.emb_phone(phone)
else:
x = self.emb_phone(phone) + self.emb_pitch(pitch)
@@ -75,25 +83,30 @@ class TextEncoder768(nn.Module):
p_dropout,
f0=True,
):
- super().__init__()
+ super(TextEncoder768, self).__init__()
self.out_channels = out_channels
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
- self.p_dropout = p_dropout
+ self.p_dropout = float(p_dropout)
self.emb_phone = nn.Linear(768, hidden_channels)
self.lrelu = nn.LeakyReLU(0.1, inplace=True)
if f0 == True:
self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
self.encoder = attentions.Encoder(
- hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ float(p_dropout),
)
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
- def forward(self, phone, pitch, lengths):
- if pitch == None:
+    def forward(
+        self, phone: torch.Tensor, pitch: Optional[torch.Tensor], lengths: torch.Tensor
+    ):
+ if pitch is None:
x = self.emb_phone(phone)
else:
x = self.emb_phone(phone) + self.emb_pitch(pitch)
@@ -121,7 +134,7 @@ class ResidualCouplingBlock(nn.Module):
n_flows=4,
gin_channels=0,
):
- super().__init__()
+ super(ResidualCouplingBlock, self).__init__()
self.channels = channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
@@ -145,19 +158,36 @@ class ResidualCouplingBlock(nn.Module):
)
self.flows.append(modules.Flip())
- def forward(self, x, x_mask, g=None, reverse=False):
+ def forward(
+ self,
+ x: torch.Tensor,
+ x_mask: torch.Tensor,
+ g: Optional[torch.Tensor] = None,
+ reverse: bool = False,
+ ):
if not reverse:
for flow in self.flows:
x, _ = flow(x, x_mask, g=g, reverse=reverse)
else:
- for flow in reversed(self.flows):
- x = flow(x, x_mask, g=g, reverse=reverse)
+ for flow in self.flows[::-1]:
+ x, _ = flow.forward(x, x_mask, g=g, reverse=reverse)
return x
def remove_weight_norm(self):
for i in range(self.n_flows):
self.flows[i * 2].remove_weight_norm()
+ def __prepare_scriptable__(self):
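+        # torch.jit.script() invokes __prepare_scriptable__ before compiling;
+        # weight_norm's forward-pre-hooks are not scriptable, so fold the
+        # reparametrized weights back into the modules here.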
+ for i in range(self.n_flows):
+ for hook in self.flows[i * 2]._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(self.flows[i * 2])
+
+ return self
+
class PosteriorEncoder(nn.Module):
def __init__(
@@ -170,7 +200,7 @@ class PosteriorEncoder(nn.Module):
n_layers,
gin_channels=0,
):
- super().__init__()
+ super(PosteriorEncoder, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.hidden_channels = hidden_channels
@@ -189,7 +219,9 @@ class PosteriorEncoder(nn.Module):
)
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
- def forward(self, x, x_lengths, g=None):
+ def forward(
+ self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None
+ ):
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
x.dtype
)
@@ -203,6 +235,15 @@ class PosteriorEncoder(nn.Module):
def remove_weight_norm(self):
self.enc.remove_weight_norm()
+ def __prepare_scriptable__(self):
+ for hook in self.enc._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(self.enc)
+ return self
+
class Generator(torch.nn.Module):
def __init__(
@@ -252,7 +293,7 @@ class Generator(torch.nn.Module):
if gin_channels != 0:
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
- def forward(self, x, g=None):
+ def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None):
x = self.conv_pre(x)
if g is not None:
x = x + self.cond(g)
@@ -273,6 +314,28 @@ class Generator(torch.nn.Module):
return x
+ def __prepare_scriptable__(self):
+ for l in self.ups:
+ for hook in l._forward_pre_hooks.values():
+ # The hook we want to remove is an instance of WeightNorm class, so
+ # normally we would do `if isinstance(...)` but this class is not accessible
+ # because of shadowing, so we check the module name directly.
+ # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(l)
+
+ for l in self.resblocks:
+ for hook in l._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(l)
+ return self
+
def remove_weight_norm(self):
for l in self.ups:
remove_weight_norm(l)
@@ -293,7 +356,7 @@ class SineGen(torch.nn.Module):
voiced_thoreshold: F0 threshold for U/V classification (default 0)
flag_for_pulse: this SinGen is used inside PulseGen (default False)
Note: when flag_for_pulse is True, the first time step of a voiced
- segment is always sin(np.pi) or cos(0)
+ segment is always sin(torch.pi) or cos(0)
"""
def __init__(
@@ -321,7 +384,7 @@ class SineGen(torch.nn.Module):
uv = uv.float()
return uv
- def forward(self, f0, upp):
+ def forward(self, f0: torch.Tensor, upp: int):
"""sine_tensor, uv = forward(f0)
input F0: tensor(batchsize=1, length, dim=1)
f0 for unvoiced steps should be 0
@@ -333,7 +396,7 @@ class SineGen(torch.nn.Module):
f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
# fundamental component
f0_buf[:, :, 0] = f0[:, :, 0]
- for idx in np.arange(self.harmonic_num):
+ for idx in range(self.harmonic_num):
f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
idx + 2
) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
@@ -347,12 +410,12 @@ class SineGen(torch.nn.Module):
tmp_over_one *= upp
tmp_over_one = F.interpolate(
tmp_over_one.transpose(2, 1),
- scale_factor=upp,
+ scale_factor=float(upp),
mode="linear",
align_corners=True,
).transpose(2, 1)
rad_values = F.interpolate(
- rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
+ rad_values.transpose(2, 1), scale_factor=float(upp), mode="nearest"
).transpose(
2, 1
) #######
@@ -361,12 +424,12 @@ class SineGen(torch.nn.Module):
cumsum_shift = torch.zeros_like(rad_values)
cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
sine_waves = torch.sin(
- torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
+ torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * torch.pi
)
sine_waves = sine_waves * self.sine_amp
uv = self._f02uv(f0)
uv = F.interpolate(
- uv.transpose(2, 1), scale_factor=upp, mode="nearest"
+ uv.transpose(2, 1), scale_factor=float(upp), mode="nearest"
).transpose(2, 1)
noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
noise = noise_amp * torch.randn_like(sine_waves)
@@ -414,18 +477,19 @@ class SourceModuleHnNSF(torch.nn.Module):
# to merge source harmonics into a single excitation
self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
self.l_tanh = torch.nn.Tanh()
+ # self.ddtype:int = -1
- def forward(self, x, upp=None):
- if hasattr(self, "ddtype") == False:
- self.ddtype = self.l_linear.weight.dtype
+ def forward(self, x: torch.Tensor, upp: int = 1):
+ # if self.ddtype ==-1:
+ # self.ddtype = self.l_linear.weight.dtype
sine_wavs, uv, _ = self.l_sin_gen(x, upp)
# print(x.dtype,sine_wavs.dtype,self.l_linear.weight.dtype)
# if self.is_half:
# sine_wavs = sine_wavs.half()
# sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(x)))
# print(sine_wavs.dtype,self.ddtype)
- if sine_wavs.dtype != self.ddtype:
- sine_wavs = sine_wavs.to(self.ddtype)
+ # if sine_wavs.dtype != self.l_linear.weight.dtype:
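+        # Cast unconditionally to the linear layer's dtype; the old
+        # data-dependent branch does not compile cleanly under TorchScript.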
+ sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype)
sine_merge = self.l_tanh(self.l_linear(sine_wavs))
return sine_merge, None, None # noise, uv
@@ -448,7 +512,7 @@ class GeneratorNSF(torch.nn.Module):
self.num_kernels = len(resblock_kernel_sizes)
self.num_upsamples = len(upsample_rates)
- self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates))
self.m_source = SourceModuleHnNSF(
sampling_rate=sr, harmonic_num=0, is_half=is_half
)
@@ -473,7 +537,7 @@ class GeneratorNSF(torch.nn.Module):
)
)
if i + 1 < len(upsample_rates):
- stride_f0 = np.prod(upsample_rates[i + 1 :])
+ stride_f0 = math.prod(upsample_rates[i + 1 :])
self.noise_convs.append(
Conv1d(
1,
@@ -500,27 +564,36 @@ class GeneratorNSF(torch.nn.Module):
if gin_channels != 0:
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
- self.upp = np.prod(upsample_rates)
+ self.upp = math.prod(upsample_rates)
- def forward(self, x, f0, g=None):
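+        # Stored as an instance attribute so the scripted forward reads a
+        # typed module attribute instead of a Python-level global.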
+ self.lrelu_slope = modules.LRELU_SLOPE
+
+ def forward(self, x, f0, g: Optional[torch.Tensor] = None):
har_source, noi_source, uv = self.m_source(f0, self.upp)
har_source = har_source.transpose(1, 2)
x = self.conv_pre(x)
if g is not None:
x = x + self.cond(g)
-
- for i in range(self.num_upsamples):
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
- x = self.ups[i](x)
- x_source = self.noise_convs[i](har_source)
- x = x + x_source
- xs = None
- for j in range(self.num_kernels):
- if xs is None:
- xs = self.resblocks[i * self.num_kernels + j](x)
- else:
- xs += self.resblocks[i * self.num_kernels + j](x)
- x = xs / self.num_kernels
+        # torch.jit.script() does not support integer indexing into an
+        # nn.ModuleList, so iterate the modules with zip/enumerate instead.
+ for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)):
+ if i < self.num_upsamples:
+ x = F.leaky_relu(x, self.lrelu_slope)
+ x = ups(x)
+ x_source = noise_convs(har_source)
+ x = x + x_source
+ xs: Optional[torch.Tensor] = None
+ l = [i * self.num_kernels + j for j in range(self.num_kernels)]
+ for j, resblock in enumerate(self.resblocks):
+ if j in l:
+ if xs is None:
+ xs = resblock(x)
+ else:
+ xs += resblock(x)
+                # Keep this assertion: it narrows Optional[Tensor] to Tensor,
+                # which torch.jit.script() needs to compile the division below.
+                assert isinstance(xs, torch.Tensor)
+ x = xs / self.num_kernels
x = F.leaky_relu(x)
x = self.conv_post(x)
x = torch.tanh(x)
@@ -532,6 +605,27 @@ class GeneratorNSF(torch.nn.Module):
for l in self.resblocks:
l.remove_weight_norm()
+ def __prepare_scriptable__(self):
+ for l in self.ups:
+ for hook in l._forward_pre_hooks.values():
+ # The hook we want to remove is an instance of WeightNorm class, so
+ # normally we would do `if isinstance(...)` but this class is not accessible
+ # because of shadowing, so we check the module name directly.
+ # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(l)
+ for l in self.resblocks:
+            for hook in l._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(l)
+ return self
+
sr2sr = {
"32k": 32000,
@@ -563,8 +657,8 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
sr,
**kwargs
):
- super().__init__()
- if type(sr) == type("strr"):
+ super(SynthesizerTrnMs256NSFsid, self).__init__()
+ if isinstance(sr, str):
sr = sr2sr[sr]
self.spec_channels = spec_channels
self.inter_channels = inter_channels
@@ -573,7 +667,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
- self.p_dropout = p_dropout
+ self.p_dropout = float(p_dropout)
self.resblock = resblock
self.resblock_kernel_sizes = resblock_kernel_sizes
self.resblock_dilation_sizes = resblock_dilation_sizes
@@ -591,7 +685,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
n_heads,
n_layers,
kernel_size,
- p_dropout,
+ float(p_dropout),
)
self.dec = GeneratorNSF(
inter_channels,
@@ -630,8 +724,42 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm()
+ def __prepare_scriptable__(self):
+ for hook in self.dec._forward_pre_hooks.values():
+ # The hook we want to remove is an instance of WeightNorm class, so
+ # normally we would do `if isinstance(...)` but this class is not accessible
+ # because of shadowing, so we check the module name directly.
+ # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(self.dec)
+ for hook in self.flow._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(self.flow)
+ if hasattr(self, "enc_q"):
+ for hook in self.enc_q._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(self.enc_q)
+ return self
+
+ @torch.jit.ignore
def forward(
- self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
+ self,
+ phone: torch.Tensor,
+ phone_lengths: torch.Tensor,
+ pitch: torch.Tensor,
+ pitchf: torch.Tensor,
+ y: torch.Tensor,
+ y_lengths: torch.Tensor,
+ ds: Optional[torch.Tensor] = None,
): # 这里ds是id,[bs,1]
# print(1,pitch.shape)#[bs,t]
g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的
@@ -647,15 +775,25 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
o = self.dec(z_slice, pitchf, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
- def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
+ @torch.jit.export
+ def infer(
+ self,
+ phone: torch.Tensor,
+ phone_lengths: torch.Tensor,
+ pitch: torch.Tensor,
+ nsff0: torch.Tensor,
+ sid: torch.Tensor,
+ rate: Optional[torch.Tensor] = None,
+ ):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
- if rate:
- head = int(z_p.shape[2] * rate)
- z_p = z_p[:, :, -head:]
- x_mask = x_mask[:, :, -head:]
- nsff0 = nsff0[:, -head:]
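+        # rate is now an Optional[Tensor]: compare against None (tensor
+        # truthiness is ambiguous) and slice from a non-negative offset so
+        # torch.jit.script() can compile this branch. Keeping the trailing
+        # `rate` fraction matches the old negative-index behaviour.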
+ if rate is not None:
+ assert isinstance(rate, torch.Tensor)
+ head = int(z_p.shape[2] * (1 - rate.item()))
+ z_p = z_p[:, :, head:]
+ x_mask = x_mask[:, :, head:]
+ nsff0 = nsff0[:, head:]
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec(z * x_mask, nsff0, g=g)
return o, x_mask, (z, z_p, m_p, logs_p)
@@ -684,8 +822,8 @@ class SynthesizerTrnMs768NSFsid(nn.Module):
sr,
**kwargs
):
- super().__init__()
- if type(sr) == type("strr"):
+ super(SynthesizerTrnMs768NSFsid, self).__init__()
+ if isinstance(sr, str):
sr = sr2sr[sr]
self.spec_channels = spec_channels
self.inter_channels = inter_channels
@@ -694,7 +832,7 @@ class SynthesizerTrnMs768NSFsid(nn.Module):
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
- self.p_dropout = p_dropout
+ self.p_dropout = float(p_dropout)
self.resblock = resblock
self.resblock_kernel_sizes = resblock_kernel_sizes
self.resblock_dilation_sizes = resblock_dilation_sizes
@@ -712,7 +850,7 @@ class SynthesizerTrnMs768NSFsid(nn.Module):
n_heads,
n_layers,
kernel_size,
- p_dropout,
+ float(p_dropout),
)
self.dec = GeneratorNSF(
inter_channels,
@@ -751,6 +889,33 @@ class SynthesizerTrnMs768NSFsid(nn.Module):
self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm()
+ def __prepare_scriptable__(self):
+ for hook in self.dec._forward_pre_hooks.values():
+ # The hook we want to remove is an instance of WeightNorm class, so
+ # normally we would do `if isinstance(...)` but this class is not accessible
+ # because of shadowing, so we check the module name directly.
+ # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(self.dec)
+ for hook in self.flow._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(self.flow)
+ if hasattr(self, "enc_q"):
+ for hook in self.enc_q._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(self.enc_q)
+ return self
+
+ @torch.jit.ignore
def forward(
self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
): # 这里ds是id,[bs,1]
@@ -768,15 +933,24 @@ class SynthesizerTrnMs768NSFsid(nn.Module):
o = self.dec(z_slice, pitchf, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
- def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
+ @torch.jit.export
+ def infer(
+ self,
+ phone: torch.Tensor,
+ phone_lengths: torch.Tensor,
+ pitch: torch.Tensor,
+ nsff0: torch.Tensor,
+ sid: torch.Tensor,
+ rate: Optional[torch.Tensor] = None,
+ ):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
- if rate:
- head = int(z_p.shape[2] * rate)
- z_p = z_p[:, :, -head:]
- x_mask = x_mask[:, :, -head:]
- nsff0 = nsff0[:, -head:]
+ if rate is not None:
+ head = int(z_p.shape[2] * (1.0 - rate.item()))
+ z_p = z_p[:, :, head:]
+ x_mask = x_mask[:, :, head:]
+ nsff0 = nsff0[:, head:]
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec(z * x_mask, nsff0, g=g)
return o, x_mask, (z, z_p, m_p, logs_p)
@@ -805,7 +979,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
sr=None,
**kwargs
):
- super().__init__()
+ super(SynthesizerTrnMs256NSFsid_nono, self).__init__()
self.spec_channels = spec_channels
self.inter_channels = inter_channels
self.hidden_channels = hidden_channels
@@ -813,7 +987,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
- self.p_dropout = p_dropout
+ self.p_dropout = float(p_dropout)
self.resblock = resblock
self.resblock_kernel_sizes = resblock_kernel_sizes
self.resblock_dilation_sizes = resblock_dilation_sizes
@@ -831,7 +1005,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
n_heads,
n_layers,
kernel_size,
- p_dropout,
+ float(p_dropout),
f0=False,
)
self.dec = Generator(
@@ -869,6 +1043,33 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm()
+ def __prepare_scriptable__(self):
+ for hook in self.dec._forward_pre_hooks.values():
+ # The hook we want to remove is an instance of WeightNorm class, so
+ # normally we would do `if isinstance(...)` but this class is not accessible
+ # because of shadowing, so we check the module name directly.
+ # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(self.dec)
+ for hook in self.flow._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(self.flow)
+ if hasattr(self, "enc_q"):
+ for hook in self.enc_q._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(self.enc_q)
+ return self
+
+ @torch.jit.ignore
def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1]
g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
@@ -880,14 +1081,22 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
o = self.dec(z_slice, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
- def infer(self, phone, phone_lengths, sid, rate=None):
+ @torch.jit.export
+ def infer(
+ self,
+ phone: torch.Tensor,
+ phone_lengths: torch.Tensor,
+ sid: torch.Tensor,
+ rate: Optional[torch.Tensor] = None,
+ ):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
- if rate:
- head = int(z_p.shape[2] * rate)
- z_p = z_p[:, :, -head:]
- x_mask = x_mask[:, :, -head:]
+ if rate is not None:
+ head = int(z_p.shape[2] * (1.0 - rate.item()))
+ z_p = z_p[:, :, head:]
+ x_mask = x_mask[:, :, head:]
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec(z * x_mask, g=g)
return o, x_mask, (z, z_p, m_p, logs_p)
@@ -916,7 +1125,7 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
sr=None,
**kwargs
):
- super().__init__()
+        super(SynthesizerTrnMs768NSFsid_nono, self).__init__()
self.spec_channels = spec_channels
self.inter_channels = inter_channels
self.hidden_channels = hidden_channels
@@ -924,7 +1133,7 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
- self.p_dropout = p_dropout
+ self.p_dropout = float(p_dropout)
self.resblock = resblock
self.resblock_kernel_sizes = resblock_kernel_sizes
self.resblock_dilation_sizes = resblock_dilation_sizes
@@ -942,7 +1151,7 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
n_heads,
n_layers,
kernel_size,
- p_dropout,
+ float(p_dropout),
f0=False,
)
self.dec = Generator(
@@ -980,6 +1189,33 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm()
+ def __prepare_scriptable__(self):
+ for hook in self.dec._forward_pre_hooks.values():
+ # The hook we want to remove is an instance of WeightNorm class, so
+ # normally we would do `if isinstance(...)` but this class is not accessible
+ # because of shadowing, so we check the module name directly.
+ # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(self.dec)
+ for hook in self.flow._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(self.flow)
+ if hasattr(self, "enc_q"):
+ for hook in self.enc_q._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(self.enc_q)
+ return self
+
+ @torch.jit.ignore
def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1]
g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
@@ -991,14 +1227,22 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
o = self.dec(z_slice, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
- def infer(self, phone, phone_lengths, sid, rate=None):
+ @torch.jit.export
+ def infer(
+ self,
+ phone: torch.Tensor,
+ phone_lengths: torch.Tensor,
+ sid: torch.Tensor,
+ rate: Optional[torch.Tensor] = None,
+ ):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
- if rate:
- head = int(z_p.shape[2] * rate)
- z_p = z_p[:, :, -head:]
- x_mask = x_mask[:, :, -head:]
+ if rate is not None:
+ head = int(z_p.shape[2] * (1.0 - rate.item()))
+ z_p = z_p[:, :, head:]
+ x_mask = x_mask[:, :, head:]
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec(z * x_mask, g=g)
return o, x_mask, (z, z_p, m_p, logs_p)
diff --git a/infer/lib/infer_pack/models_onnx.py b/infer/lib/infer_pack/models_onnx.py
index 3e99763..ff60414 100644
--- a/infer/lib/infer_pack/models_onnx.py
+++ b/infer/lib/infer_pack/models_onnx.py
@@ -551,7 +551,7 @@ class SynthesizerTrnMsNSFsidM(nn.Module):
gin_channels,
sr,
version,
- **kwargs
+ **kwargs,
):
super().__init__()
if type(sr) == type("strr"):
@@ -621,10 +621,7 @@ class SynthesizerTrnMsNSFsidM(nn.Module):
self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
self.speaker_map = None
logger.debug(
- "gin_channels: "
- + gin_channels
- + ", self.spk_embed_dim: "
- + self.spk_embed_dim
+ f"gin_channels: {gin_channels}, self.spk_embed_dim: {self.spk_embed_dim}"
)
def remove_weight_norm(self):
diff --git a/infer/lib/infer_pack/modules.py b/infer/lib/infer_pack/modules.py
index edf2207..51aeaf0 100644
--- a/infer/lib/infer_pack/modules.py
+++ b/infer/lib/infer_pack/modules.py
@@ -1,5 +1,6 @@
import copy
import math
+from typing import Optional, Tuple
import numpy as np
import scipy
@@ -18,7 +19,7 @@ LRELU_SLOPE = 0.1
class LayerNorm(nn.Module):
def __init__(self, channels, eps=1e-5):
- super().__init__()
+ super(LayerNorm, self).__init__()
self.channels = channels
self.eps = eps
@@ -41,13 +42,13 @@ class ConvReluNorm(nn.Module):
n_layers,
p_dropout,
):
- super().__init__()
+ super(ConvReluNorm, self).__init__()
self.in_channels = in_channels
self.hidden_channels = hidden_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.n_layers = n_layers
- self.p_dropout = p_dropout
+ self.p_dropout = float(p_dropout)
assert n_layers > 1, "Number of layers should be larger than 0."
self.conv_layers = nn.ModuleList()
@@ -58,7 +59,7 @@ class ConvReluNorm(nn.Module):
)
)
self.norm_layers.append(LayerNorm(hidden_channels))
- self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
+ self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(float(p_dropout)))
for _ in range(n_layers - 1):
self.conv_layers.append(
nn.Conv1d(
@@ -89,13 +90,13 @@ class DDSConv(nn.Module):
"""
def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
- super().__init__()
+ super(DDSConv, self).__init__()
self.channels = channels
self.kernel_size = kernel_size
self.n_layers = n_layers
- self.p_dropout = p_dropout
+ self.p_dropout = float(p_dropout)
- self.drop = nn.Dropout(p_dropout)
+ self.drop = nn.Dropout(float(p_dropout))
self.convs_sep = nn.ModuleList()
self.convs_1x1 = nn.ModuleList()
self.norms_1 = nn.ModuleList()
@@ -117,7 +118,7 @@ class DDSConv(nn.Module):
self.norms_1.append(LayerNorm(channels))
self.norms_2.append(LayerNorm(channels))
- def forward(self, x, x_mask, g=None):
+ def forward(self, x, x_mask, g: Optional[torch.Tensor] = None):
if g is not None:
x = x + g
for i in range(self.n_layers):
@@ -149,11 +150,11 @@ class WN(torch.nn.Module):
self.dilation_rate = dilation_rate
self.n_layers = n_layers
self.gin_channels = gin_channels
- self.p_dropout = p_dropout
+ self.p_dropout = float(p_dropout)
self.in_layers = torch.nn.ModuleList()
self.res_skip_layers = torch.nn.ModuleList()
- self.drop = nn.Dropout(p_dropout)
+ self.drop = nn.Dropout(float(p_dropout))
if gin_channels != 0:
cond_layer = torch.nn.Conv1d(
@@ -184,15 +185,19 @@ class WN(torch.nn.Module):
res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
self.res_skip_layers.append(res_skip_layer)
- def forward(self, x, x_mask, g=None, **kwargs):
+ def forward(
+ self, x: torch.Tensor, x_mask: torch.Tensor, g: Optional[torch.Tensor] = None
+ ):
output = torch.zeros_like(x)
n_channels_tensor = torch.IntTensor([self.hidden_channels])
if g is not None:
g = self.cond_layer(g)
- for i in range(self.n_layers):
- x_in = self.in_layers[i](x)
+ for i, (in_layer, res_skip_layer) in enumerate(
+ zip(self.in_layers, self.res_skip_layers)
+ ):
+ x_in = in_layer(x)
if g is not None:
cond_offset = i * 2 * self.hidden_channels
g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
@@ -202,7 +207,7 @@ class WN(torch.nn.Module):
acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
acts = self.drop(acts)
- res_skip_acts = self.res_skip_layers[i](acts)
+ res_skip_acts = res_skip_layer(acts)
if i < self.n_layers - 1:
res_acts = res_skip_acts[:, : self.hidden_channels, :]
x = (x + res_acts) * x_mask
@@ -219,6 +224,30 @@ class WN(torch.nn.Module):
for l in self.res_skip_layers:
torch.nn.utils.remove_weight_norm(l)
+ def __prepare_scriptable__(self):
+ if self.gin_channels != 0:
+ for hook in self.cond_layer._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
+ for l in self.in_layers:
+ for hook in l._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(l)
+ for l in self.res_skip_layers:
+ for hook in l._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(l)
+ return self
+
class ResBlock1(torch.nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
@@ -294,14 +323,15 @@ class ResBlock1(torch.nn.Module):
]
)
self.convs2.apply(init_weights)
+ self.lrelu_slope = LRELU_SLOPE
- def forward(self, x, x_mask=None):
+ def forward(self, x: torch.Tensor, x_mask: Optional[torch.Tensor] = None):
for c1, c2 in zip(self.convs1, self.convs2):
- xt = F.leaky_relu(x, LRELU_SLOPE)
+ xt = F.leaky_relu(x, self.lrelu_slope)
if x_mask is not None:
xt = xt * x_mask
xt = c1(xt)
- xt = F.leaky_relu(xt, LRELU_SLOPE)
+ xt = F.leaky_relu(xt, self.lrelu_slope)
if x_mask is not None:
xt = xt * x_mask
xt = c2(xt)
@@ -316,6 +346,23 @@ class ResBlock1(torch.nn.Module):
for l in self.convs2:
remove_weight_norm(l)
+ def __prepare_scriptable__(self):
+ for l in self.convs1:
+ for hook in l._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(l)
+ for l in self.convs2:
+ for hook in l._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(l)
+ return self
+
class ResBlock2(torch.nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
@@ -345,10 +392,11 @@ class ResBlock2(torch.nn.Module):
]
)
self.convs.apply(init_weights)
+ self.lrelu_slope = LRELU_SLOPE
- def forward(self, x, x_mask=None):
+ def forward(self, x, x_mask: Optional[torch.Tensor] = None):
for c in self.convs:
- xt = F.leaky_relu(x, LRELU_SLOPE)
+ xt = F.leaky_relu(x, self.lrelu_slope)
if x_mask is not None:
xt = xt * x_mask
xt = c(xt)
@@ -361,9 +409,25 @@ class ResBlock2(torch.nn.Module):
for l in self.convs:
remove_weight_norm(l)
+ def __prepare_scriptable__(self):
+ for l in self.convs:
+ for hook in l._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(l)
+ return self
+
class Log(nn.Module):
- def forward(self, x, x_mask, reverse=False, **kwargs):
+ def forward(
+ self,
+ x: torch.Tensor,
+ x_mask: torch.Tensor,
+ g: Optional[torch.Tensor] = None,
+ reverse: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
if not reverse:
y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
logdet = torch.sum(-y, [1, 2])
@@ -374,18 +438,27 @@ class Log(nn.Module):
class Flip(nn.Module):
- def forward(self, x, *args, reverse=False, **kwargs):
+    # torch.jit.script() cannot compile functions that take a variable number
+    # of arguments or keyword-only arguments with defaults, so the full
+    # signature is spelled out explicitly.
+ def forward(
+ self,
+ x: torch.Tensor,
+ x_mask: torch.Tensor,
+ g: Optional[torch.Tensor] = None,
+ reverse: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
x = torch.flip(x, [1])
if not reverse:
logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
return x, logdet
else:
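+            # Return a dummy logdet so both branches share the same
+            # Tuple[Tensor, Optional[Tensor]] return type under TorchScript.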
- return x
+ return x, torch.zeros([1], device=x.device)
class ElementwiseAffine(nn.Module):
def __init__(self, channels):
- super().__init__()
+ super(ElementwiseAffine, self).__init__()
self.channels = channels
self.m = nn.Parameter(torch.zeros(channels, 1))
self.logs = nn.Parameter(torch.zeros(channels, 1))
@@ -414,7 +487,7 @@ class ResidualCouplingLayer(nn.Module):
mean_only=False,
):
assert channels % 2 == 0, "channels should be divisible by 2"
- super().__init__()
+ super(ResidualCouplingLayer, self).__init__()
self.channels = channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
@@ -429,14 +502,20 @@ class ResidualCouplingLayer(nn.Module):
kernel_size,
dilation_rate,
n_layers,
- p_dropout=p_dropout,
+ p_dropout=float(p_dropout),
gin_channels=gin_channels,
)
self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
self.post.weight.data.zero_()
self.post.bias.data.zero_()
- def forward(self, x, x_mask, g=None, reverse=False):
+ def forward(
+ self,
+ x: torch.Tensor,
+ x_mask: torch.Tensor,
+ g: Optional[torch.Tensor] = None,
+ reverse: bool = False,
+ ):
x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
h = self.pre(x0) * x_mask
h = self.enc(h, x_mask, g=g)
@@ -455,11 +534,20 @@ class ResidualCouplingLayer(nn.Module):
else:
x1 = (x1 - m) * torch.exp(-logs) * x_mask
x = torch.cat([x0, x1], 1)
- return x
+ return x, torch.zeros([1])
def remove_weight_norm(self):
self.enc.remove_weight_norm()
+ def __prepare_scriptable__(self):
+ for hook in self.enc._forward_pre_hooks.values():
+ if (
+ hook.__module__ == "torch.nn.utils.weight_norm"
+ and hook.__class__.__name__ == "WeightNorm"
+ ):
+ torch.nn.utils.remove_weight_norm(self.enc)
+ return self
+
class ConvFlow(nn.Module):
def __init__(
@@ -471,7 +559,7 @@ class ConvFlow(nn.Module):
num_bins=10,
tail_bound=5.0,
):
- super().__init__()
+ super(ConvFlow, self).__init__()
self.in_channels = in_channels
self.filter_channels = filter_channels
self.kernel_size = kernel_size
@@ -488,7 +576,13 @@ class ConvFlow(nn.Module):
self.proj.weight.data.zero_()
self.proj.bias.data.zero_()
- def forward(self, x, x_mask, g=None, reverse=False):
+ def forward(
+ self,
+ x: torch.Tensor,
+ x_mask: torch.Tensor,
+ g: Optional[torch.Tensor] = None,
+        reverse: bool = False,
+ ):
x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
h = self.pre(x0)
h = self.convs(h, x_mask, g=g)
diff --git a/infer/lib/jit/__init__.py b/infer/lib/jit/__init__.py
new file mode 100644
index 0000000..d7f41dd
--- /dev/null
+++ b/infer/lib/jit/__init__.py
@@ -0,0 +1,163 @@
+from io import BytesIO
+import pickle
+import time
+import torch
+from tqdm import tqdm
+from collections import OrderedDict
+
+
+def load_inputs(path, device, is_half=False):
+ parm = torch.load(path, map_location=torch.device("cpu"))
+ for key in parm.keys():
+ parm[key] = parm[key].to(device)
+ if is_half and parm[key].dtype == torch.float32:
+ parm[key] = parm[key].half()
+ elif not is_half and parm[key].dtype == torch.float16:
+ parm[key] = parm[key].float()
+ return parm
+
+
+def benchmark(
+ model, inputs_path, device=torch.device("cpu"), epoch=1000, is_half=False
+):
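+    # Reports average wall-clock latency over `epoch` forward passes using
+    # pre-recorded inputs.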
+ parm = load_inputs(inputs_path, device, is_half)
+ total_ts = 0.0
+ bar = tqdm(range(epoch))
+ for i in bar:
+ start_time = time.perf_counter()
+ o = model(**parm)
+ total_ts += time.perf_counter() - start_time
+ print(f"num_epoch: {epoch} | avg time(ms): {(total_ts*1000)/epoch}")
+
+
+def jit_warm_up(model, inputs_path, device=torch.device("cpu"), epoch=5, is_half=False):
+ benchmark(model, inputs_path, device, epoch=epoch, is_half=is_half)
+
+
+def to_jit_model(
+ model_path,
+ model_type: str,
+ mode: str = "trace",
+ inputs_path: str = None,
+ device=torch.device("cpu"),
+ is_half=False,
+):
+ model = None
+ if model_type.lower() == "synthesizer":
+ from .get_synthesizer import get_synthesizer
+
+ model, _ = get_synthesizer(model_path, device)
+ model.forward = model.infer
+ elif model_type.lower() == "rmvpe":
+ from .get_rmvpe import get_rmvpe
+
+ model = get_rmvpe(model_path, device)
+ elif model_type.lower() == "hubert":
+ from .get_hubert import get_hubert_model
+
+ model = get_hubert_model(model_path, device)
+ model.forward = model.infer
+ else:
+ raise ValueError(f"No model type named {model_type}")
+ model = model.eval()
+ model = model.half() if is_half else model.float()
+ if mode == "trace":
+        assert inputs_path is not None, "trace mode requires an inputs file"
+ inputs = load_inputs(inputs_path, device, is_half)
+ model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs)
+ elif mode == "script":
+ model_jit = torch.jit.script(model)
+ model_jit.to(device)
+ model_jit = model_jit.half() if is_half else model_jit.float()
+ # model = model.half() if is_half else model.float()
+ return (model, model_jit)
+
+
+def export(
+ model: torch.nn.Module,
+ mode: str = "trace",
+ inputs: dict = None,
+ device=torch.device("cpu"),
+ is_half: bool = False,
+) -> dict:
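+    # Script/trace the module and serialize it into an in-memory buffer so it
+    # can be embedded in a plain pickled checkpoint dict.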
+ model = model.half() if is_half else model.float()
+ model.eval()
+ if mode == "trace":
+ assert inputs is not None
+ model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs)
+ elif mode == "script":
+ model_jit = torch.jit.script(model)
+ model_jit.to(device)
+ model_jit = model_jit.half() if is_half else model_jit.float()
+ buffer = BytesIO()
+ # model_jit=model_jit.cpu()
+ torch.jit.save(model_jit, buffer)
+ del model_jit
+ cpt = OrderedDict()
+ cpt["model"] = buffer.getvalue()
+ cpt["is_half"] = is_half
+ return cpt
+
+
+def load(path: str):
+ with open(path, "rb") as f:
+ return pickle.load(f)
+
+
+def save(ckpt: dict, save_path: str):
+ with open(save_path, "wb") as f:
+ pickle.dump(ckpt, f)
+
+
+def rmvpe_jit_export(
+ model_path: str,
+ mode: str = "script",
+ inputs_path: str = None,
+ save_path: str = None,
+ device=torch.device("cpu"),
+ is_half=False,
+):
+ if not save_path:
+        # str.rstrip strips a character set, not a suffix, so trim ".pth" explicitly
+        save_path = model_path[:-4] if model_path.endswith(".pth") else model_path
+ save_path += ".half.jit" if is_half else ".jit"
+ if "cuda" in str(device) and ":" not in str(device):
+ device = torch.device("cuda:0")
+ from .get_rmvpe import get_rmvpe
+
+ model = get_rmvpe(model_path, device)
+ inputs = None
+ if mode == "trace":
+ inputs = load_inputs(inputs_path, device, is_half)
+ ckpt = export(model, mode, inputs, device, is_half)
+ ckpt["device"] = str(device)
+ save(ckpt, save_path)
+ return ckpt
+
+
+def synthesizer_jit_export(
+ model_path: str,
+ mode: str = "script",
+ inputs_path: str = None,
+ save_path: str = None,
+ device=torch.device("cpu"),
+ is_half=False,
+):
+ if not save_path:
+        # str.rstrip strips a character set, not a suffix, so trim ".pth" explicitly
+        save_path = model_path[:-4] if model_path.endswith(".pth") else model_path
+ save_path += ".half.jit" if is_half else ".jit"
+ if "cuda" in str(device) and ":" not in str(device):
+ device = torch.device("cuda:0")
+ from .get_synthesizer import get_synthesizer
+
+ model, cpt = get_synthesizer(model_path, device)
+ assert isinstance(cpt, dict)
+ model.forward = model.infer
+ inputs = None
+ if mode == "trace":
+ inputs = load_inputs(inputs_path, device, is_half)
+ ckpt = export(model, mode, inputs, device, is_half)
+ cpt.pop("weight")
+ cpt["model"] = ckpt["model"]
+ cpt["device"] = device
+ save(cpt, save_path)
+ return cpt
diff --git a/infer/lib/jit/get_hubert.py b/infer/lib/jit/get_hubert.py
new file mode 100644
index 0000000..aec7132
--- /dev/null
+++ b/infer/lib/jit/get_hubert.py
@@ -0,0 +1,342 @@
+import math
+import random
+from typing import Optional, Tuple
+from fairseq.checkpoint_utils import load_model_ensemble_and_task
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+# from fairseq.data.data_utils import compute_mask_indices
+from fairseq.utils import index_put
+
+
+# @torch.jit.script
+def pad_to_multiple(x, multiple, dim=-1, value=0):
+ # Inspired from https://github.com/lucidrains/local-attention/blob/master/local_attention/local_attention.py#L41
+ if x is None:
+ return None, 0
+ tsz = x.size(dim)
+ m = tsz / multiple
+ remainder = math.ceil(m) * multiple - tsz
+ if int(tsz % multiple) == 0:
+ return x, 0
+ pad_offset = (0,) * (-1 - dim) * 2
+
+ return F.pad(x, (*pad_offset, 0, remainder), value=value), remainder
+
+
+def extract_features(
+ self,
+ x,
+ padding_mask=None,
+ tgt_layer=None,
+ min_layer=0,
+):
+ if padding_mask is not None:
+ x = index_put(x, padding_mask, 0)
+
+ x_conv = self.pos_conv(x.transpose(1, 2))
+ x_conv = x_conv.transpose(1, 2)
+ x = x + x_conv
+
+ if not self.layer_norm_first:
+ x = self.layer_norm(x)
+
+ # pad to the sequence length dimension
+ x, pad_length = pad_to_multiple(x, self.required_seq_len_multiple, dim=-2, value=0)
+ if pad_length > 0 and padding_mask is None:
+ padding_mask = x.new_zeros((x.size(0), x.size(1)), dtype=torch.bool)
+ padding_mask[:, -pad_length:] = True
+ else:
+ padding_mask, _ = pad_to_multiple(
+ padding_mask, self.required_seq_len_multiple, dim=-1, value=True
+ )
+ x = F.dropout(x, p=self.dropout, training=self.training)
+
+ # B x T x C -> T x B x C
+ x = x.transpose(0, 1)
+
+ layer_results = []
+ r = None
+ for i, layer in enumerate(self.layers):
+ dropout_probability = np.random.random() if self.layerdrop > 0 else 1
+ if not self.training or (dropout_probability > self.layerdrop):
+ x, (z, lr) = layer(
+ x, self_attn_padding_mask=padding_mask, need_weights=False
+ )
+ if i >= min_layer:
+ layer_results.append((x, z, lr))
+ if i == tgt_layer:
+ r = x
+ break
+
+ if r is not None:
+ x = r
+
+ # T x B x C -> B x T x C
+ x = x.transpose(0, 1)
+
+    # undo padding
+ if pad_length > 0:
+ x = x[:, :-pad_length]
+
+ def undo_pad(a, b, c):
+ return (
+ a[:-pad_length],
+ b[:-pad_length] if b is not None else b,
+ c[:-pad_length],
+ )
+
+ layer_results = [undo_pad(*u) for u in layer_results]
+
+ return x, layer_results
+
+
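+# Port of fairseq's numpy-based compute_mask_indices, rewritten with torch ops
+# (plus random.sample) so it avoids most of the numpy dependency.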
+def compute_mask_indices(
+ shape: Tuple[int, int],
+ padding_mask: Optional[torch.Tensor],
+ mask_prob: float,
+ mask_length: int,
+ mask_type: str = "static",
+ mask_other: float = 0.0,
+ min_masks: int = 0,
+ no_overlap: bool = False,
+ min_space: int = 0,
+ require_same_masks: bool = True,
+ mask_dropout: float = 0.0,
+) -> torch.Tensor:
+ """
+ Computes random mask spans for a given shape
+
+ Args:
+        shape: the shape for which to compute masks.
+ should be of size 2 where first element is batch size and 2nd is timesteps
+ padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
+ mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
+ number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
+ however due to overlaps, the actual number will be smaller (unless no_overlap is True)
+ mask_type: how to compute mask lengths
+ static = fixed size
+ uniform = sample from uniform distribution [mask_other, mask_length*2]
+ normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
+            poisson = sample from Poisson distribution with lambda = mask_length
+ min_masks: minimum number of masked spans
+        no_overlap: if true, switch to an alternative recursive algorithm that prevents spans from overlapping
+ min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
+ require_same_masks: if true, will randomly drop out masks until same amount of masks remains in each sample
+ mask_dropout: randomly dropout this percentage of masks in each example
+ """
+
+ bsz, all_sz = shape
+ mask = torch.full((bsz, all_sz), False)
+
+ all_num_mask = int(
+ # add a random number for probabilistic rounding
+ mask_prob * all_sz / float(mask_length)
+ + torch.rand([1]).item()
+ )
+
+ all_num_mask = max(min_masks, all_num_mask)
+
+ mask_idcs = []
+ for i in range(bsz):
+ if padding_mask is not None:
+ sz = all_sz - padding_mask[i].long().sum().item()
+ num_mask = int(mask_prob * sz / float(mask_length) + np.random.rand())
+ num_mask = max(min_masks, num_mask)
+ else:
+ sz = all_sz
+ num_mask = all_num_mask
+
+ if mask_type == "static":
+ lengths = torch.full([num_mask], mask_length)
+ elif mask_type == "uniform":
+            lengths = torch.randint(int(mask_other), mask_length * 2 + 1, size=[num_mask])
+ elif mask_type == "normal":
+ lengths = torch.normal(mask_length, mask_other, size=[num_mask])
+ lengths = [max(1, int(round(x))) for x in lengths]
+ else:
+ raise Exception("unknown mask selection " + mask_type)
+
+ if sum(lengths) == 0:
+ lengths[0] = min(mask_length, sz - 1)
+
+ if no_overlap:
+ mask_idc = []
+
+ def arrange(s, e, length, keep_length):
+ span_start = torch.randint(low=s, high=e - length, size=[1]).item()
+ mask_idc.extend(span_start + i for i in range(length))
+
+ new_parts = []
+ if span_start - s - min_space >= keep_length:
+ new_parts.append((s, span_start - min_space + 1))
+ if e - span_start - length - min_space > keep_length:
+ new_parts.append((span_start + length + min_space, e))
+ return new_parts
+
+ parts = [(0, sz)]
+ min_length = min(lengths)
+ for length in sorted(lengths, reverse=True):
+ t = [e - s if e - s >= length + min_space else 0 for s, e in parts]
+ lens = torch.asarray(t, dtype=torch.int)
+ l_sum = torch.sum(lens)
+ if l_sum == 0:
+ break
+ probs = lens / torch.sum(lens)
+ c = torch.multinomial(probs.float(), len(parts)).item()
+ s, e = parts.pop(c)
+ parts.extend(arrange(s, e, length, min_length))
+ mask_idc = torch.asarray(mask_idc)
+ else:
+ min_len = min(lengths)
+ if sz - min_len <= num_mask:
+ min_len = sz - num_mask - 1
+ mask_idc = torch.asarray(
+ random.sample([i for i in range(sz - min_len)], num_mask)
+ )
+ mask_idc = torch.asarray(
+ [
+ mask_idc[j] + offset
+ for j in range(len(mask_idc))
+ for offset in range(lengths[j])
+ ]
+ )
+
+ mask_idcs.append(torch.unique(mask_idc[mask_idc < sz]))
+
+ min_len = min([len(m) for m in mask_idcs])
+ for i, mask_idc in enumerate(mask_idcs):
+ if isinstance(mask_idc, torch.Tensor):
+ mask_idc = torch.asarray(mask_idc, dtype=torch.float)
+ if len(mask_idc) > min_len and require_same_masks:
+            # sample min_len of the mask positions themselves (range() over a
+            # tensor would raise a TypeError)
+            mask_idc = torch.asarray(random.sample(mask_idc.tolist(), min_len))
+ if mask_dropout > 0:
+ num_holes = int(round(len(mask_idc) * mask_dropout))
+            # likewise, drop num_holes of the mask positions at random
+            mask_idc = torch.asarray(
+                random.sample(mask_idc.tolist(), len(mask_idc) - num_holes)
+            )
+
+ mask[i, mask_idc.int()] = True
+
+ return mask
+
+
+def apply_mask(self, x, padding_mask, target_list):
+ B, T, C = x.shape
+ torch.zeros_like(x)
+ if self.mask_prob > 0:
+ mask_indices = compute_mask_indices(
+ (B, T),
+ padding_mask,
+ self.mask_prob,
+ self.mask_length,
+ self.mask_selection,
+ self.mask_other,
+ min_masks=2,
+ no_overlap=self.no_mask_overlap,
+ min_space=self.mask_min_space,
+ )
+ mask_indices = mask_indices.to(x.device)
+ x[mask_indices] = self.mask_emb
+ else:
+ mask_indices = None
+
+ if self.mask_channel_prob > 0:
+ mask_channel_indices = compute_mask_indices(
+ (B, C),
+ None,
+ self.mask_channel_prob,
+ self.mask_channel_length,
+ self.mask_channel_selection,
+ self.mask_channel_other,
+ no_overlap=self.no_mask_channel_overlap,
+ min_space=self.mask_channel_min_space,
+ )
+ mask_channel_indices = (
+ mask_channel_indices.to(x.device).unsqueeze(1).expand(-1, T, -1)
+ )
+ x[mask_channel_indices] = 0
+
+ return x, mask_indices
+
+
+def get_hubert_model(
+ model_path="assets/hubert/hubert_base.pt", device=torch.device("cpu")
+):
+ models, _, _ = load_model_ensemble_and_task(
+ [model_path],
+ suffix="",
+ )
+ hubert_model = models[0]
+ hubert_model = hubert_model.to(device)
+
+ def _apply_mask(x, padding_mask, target_list):
+ return apply_mask(hubert_model, x, padding_mask, target_list)
+
+ hubert_model.apply_mask = _apply_mask
+
+ def _extract_features(
+ x,
+ padding_mask=None,
+ tgt_layer=None,
+ min_layer=0,
+ ):
+ return extract_features(
+ hubert_model.encoder,
+ x,
+ padding_mask=padding_mask,
+ tgt_layer=tgt_layer,
+ min_layer=min_layer,
+ )
+
+ hubert_model.encoder.extract_features = _extract_features
+
+ hubert_model._forward = hubert_model.forward
+
+ def hubert_extract_features(
+ self,
+ source: torch.Tensor,
+ padding_mask: Optional[torch.Tensor] = None,
+ mask: bool = False,
+ ret_conv: bool = False,
+ output_layer: Optional[int] = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ res = self._forward(
+ source,
+ padding_mask=padding_mask,
+ mask=mask,
+ features_only=True,
+ output_layer=output_layer,
+ )
+ feature = res["features"] if ret_conv else res["x"]
+ return feature, res["padding_mask"]
+
+ def _hubert_extract_features(
+ source: torch.Tensor,
+ padding_mask: Optional[torch.Tensor] = None,
+ mask: bool = False,
+ ret_conv: bool = False,
+ output_layer: Optional[int] = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ return hubert_extract_features(
+ hubert_model, source, padding_mask, mask, ret_conv, output_layer
+ )
+
+ hubert_model.extract_features = _hubert_extract_features
+
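+    # Assumption from RVC usage: v1 models take the layer-9 output through
+    # final_proj, while v2 models use the layer-12 features directly, so only
+    # output_layer == 9 goes through the projection.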
+ def infer(source, padding_mask, output_layer: torch.Tensor):
+ output_layer = output_layer.item()
+ logits = hubert_model.extract_features(
+ source=source, padding_mask=padding_mask, output_layer=output_layer
+ )
+ feats = hubert_model.final_proj(logits[0]) if output_layer == 9 else logits[0]
+ return feats
+
+ hubert_model.infer = infer
+ # hubert_model.forward=infer
+ # hubert_model.forward
+
+ return hubert_model
diff --git a/infer/lib/jit/get_rmvpe.py b/infer/lib/jit/get_rmvpe.py
new file mode 100644
index 0000000..e71c39f
--- /dev/null
+++ b/infer/lib/jit/get_rmvpe.py
@@ -0,0 +1,12 @@
+import torch
+
+
+def get_rmvpe(model_path="assets/rmvpe/rmvpe.pt", device=torch.device("cpu")):
+ from infer.lib.rmvpe import E2E
+
+ model = E2E(4, 1, (2, 2))
+ ckpt = torch.load(model_path, map_location=device)
+ model.load_state_dict(ckpt)
+ model.eval()
+ model = model.to(device)
+ return model
diff --git a/infer/lib/jit/get_synthesizer.py b/infer/lib/jit/get_synthesizer.py
new file mode 100644
index 0000000..ef5fe58
--- /dev/null
+++ b/infer/lib/jit/get_synthesizer.py
@@ -0,0 +1,37 @@
+import torch
+
+
+def get_synthesizer(pth_path, device=torch.device("cpu")):
+ from infer.lib.infer_pack.models import (
+ SynthesizerTrnMs256NSFsid,
+ SynthesizerTrnMs256NSFsid_nono,
+ SynthesizerTrnMs768NSFsid,
+ SynthesizerTrnMs768NSFsid_nono,
+ )
+
+ cpt = torch.load(pth_path, map_location=torch.device("cpu"))
+ # tgt_sr = cpt["config"][-1]
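+    # The stored config can be stale, so recover the real speaker count from
+    # the embedding weight's shape.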
+ cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
+ if_f0 = cpt.get("f0", 1)
+ version = cpt.get("version", "v1")
+ if version == "v1":
+ if if_f0 == 1:
+ net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=False)
+ else:
+ net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+ elif version == "v2":
+ if if_f0 == 1:
+ net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=False)
+ else:
+ net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+ del net_g.enc_q
+ # net_g.forward = net_g.infer
+ # ckpt = {}
+ # ckpt["config"] = cpt["config"]
+ # ckpt["f0"] = if_f0
+ # ckpt["version"] = version
+ # ckpt["info"] = cpt.get("info", "0epoch")
+ net_g.load_state_dict(cpt["weight"], strict=False)
+ net_g = net_g.float()
+ net_g.eval().to(device)
+ return net_g, cpt
diff --git a/infer/lib/rmvpe.py b/infer/lib/rmvpe.py
index d305b53..9010d28 100644
--- a/infer/lib/rmvpe.py
+++ b/infer/lib/rmvpe.py
@@ -1,8 +1,11 @@
-import pdb, os
-
+from io import BytesIO
+import os
+from typing import List, Optional, Tuple
import numpy as np
import torch
+from infer.lib import jit
+
try:
# Fix "Torch not compiled with CUDA enabled"
import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
@@ -11,7 +14,7 @@ try:
from infer.modules.ipex import ipex_init
ipex_init()
-except Exception:
+except Exception: # pylint: disable=broad-exception-caught
pass
import torch.nn as nn
import torch.nn.functional as F
@@ -23,58 +26,6 @@ import logging
logger = logging.getLogger(__name__)
-###stft codes from https://github.com/pseeth/torch-stft/blob/master/torch_stft/util.py
-def window_sumsquare(
- window,
- n_frames,
- hop_length=200,
- win_length=800,
- n_fft=800,
- dtype=np.float32,
- norm=None,
-):
- """
- # from librosa 0.6
- Compute the sum-square envelope of a window function at a given hop length.
- This is used to estimate modulation effects induced by windowing
- observations in short-time fourier transforms.
- Parameters
- ----------
- window : string, tuple, number, callable, or list-like
- Window specification, as in `get_window`
- n_frames : int > 0
- The number of analysis frames
- hop_length : int > 0
- The number of samples to advance between frames
- win_length : [optional]
- The length of the window function. By default, this matches `n_fft`.
- n_fft : int > 0
- The length of each analysis frame.
- dtype : np.dtype
- The data type of the output
- Returns
- -------
- wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
- The sum-squared envelope of the window function
- """
- if win_length is None:
- win_length = n_fft
-
- n = n_fft + hop_length * (n_frames - 1)
- x = np.zeros(n, dtype=dtype)
-
- # Compute the squared window at the desired length
- win_sq = get_window(window, win_length, fftbins=True)
- win_sq = normalize(win_sq, norm=norm) ** 2
- win_sq = pad_center(win_sq, n_fft)
-
- # Fill the envelope
- for i in range(n_frames):
- sample = i * hop_length
- x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
- return x
-
-
class STFT(torch.nn.Module):
def __init__(
self, filter_length=1024, hop_length=512, win_length=None, window="hann"
@@ -101,17 +52,14 @@ class STFT(torch.nn.Module):
self.window = window
self.forward_transform = None
self.pad_amount = int(self.filter_length / 2)
- scale = self.filter_length / self.hop_length
fourier_basis = np.fft.fft(np.eye(self.filter_length))
cutoff = int((self.filter_length / 2 + 1))
fourier_basis = np.vstack(
[np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
)
- forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
- inverse_basis = torch.FloatTensor(
- np.linalg.pinv(scale * fourier_basis).T[:, None, :]
- )
+ forward_basis = torch.FloatTensor(fourier_basis)
+ inverse_basis = torch.FloatTensor(np.linalg.pinv(fourier_basis))
assert filter_length >= self.win_length
# get window and zero center pad it to filter_length
@@ -121,12 +69,13 @@ class STFT(torch.nn.Module):
# window the bases
forward_basis *= fft_window
- inverse_basis *= fft_window
+ inverse_basis = (inverse_basis.T * fft_window).T
self.register_buffer("forward_basis", forward_basis.float())
self.register_buffer("inverse_basis", inverse_basis.float())
+ self.register_buffer("fft_window", fft_window.float())
- def transform(self, input_data):
+ def transform(self, input_data, return_phase=False):
"""Take input data (audio) to STFT domain.
Arguments:
@@ -138,33 +87,24 @@ class STFT(torch.nn.Module):
phase {tensor} -- Phase of STFT with shape (num_batch,
num_frequencies, num_frames)
"""
- num_batches = input_data.shape[0]
- num_samples = input_data.shape[-1]
-
- self.num_samples = num_samples
-
- # similar to librosa, reflect-pad the input
- input_data = input_data.view(num_batches, 1, num_samples)
- # print(1234,input_data.shape)
input_data = F.pad(
- input_data.unsqueeze(1),
- (self.pad_amount, self.pad_amount, 0, 0, 0, 0),
+ input_data,
+ (self.pad_amount, self.pad_amount),
mode="reflect",
- ).squeeze(1)
- # print(2333,input_data.shape,self.forward_basis.shape,self.hop_length)
- # pdb.set_trace()
- forward_transform = F.conv1d(
- input_data, self.forward_basis, stride=self.hop_length, padding=0
)
-
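+        # Frame the padded signal with unfold and apply the Fourier basis via matmul;
+        # this sidesteps F.conv1d, which some backends (e.g. DirectML) do not support.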
+ forward_transform = input_data.unfold(
+ 1, self.filter_length, self.hop_length
+ ).permute(0, 2, 1)
+ forward_transform = torch.matmul(self.forward_basis, forward_transform)
cutoff = int((self.filter_length / 2) + 1)
real_part = forward_transform[:, :cutoff, :]
imag_part = forward_transform[:, cutoff:, :]
-
magnitude = torch.sqrt(real_part**2 + imag_part**2)
- # phase = torch.atan2(imag_part.data, real_part.data)
-
- return magnitude # , phase
+ if return_phase:
+ phase = torch.atan2(imag_part.data, real_part.data)
+ return magnitude, phase
+ else:
+ return magnitude
def inverse(self, magnitude, phase):
"""Call the inverse STFT (iSTFT), given magnitude and phase tensors produced
@@ -180,42 +120,25 @@ class STFT(torch.nn.Module):
inverse_transform {tensor} -- Reconstructed audio given magnitude and phase. Of
shape (num_batch, num_samples)
"""
- recombine_magnitude_phase = torch.cat(
+ cat = torch.cat(
[magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
)
-
- inverse_transform = F.conv_transpose1d(
- recombine_magnitude_phase,
- self.inverse_basis,
- stride=self.hop_length,
- padding=0,
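+        # Rebuild the waveform by overlap-adding the windowed frames with nn.Fold instead of conv_transpose1d.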
+ fold = torch.nn.Fold(
+ output_size=(1, (cat.size(-1) - 1) * self.hop_length + self.filter_length),
+ kernel_size=(1, self.filter_length),
+ stride=(1, self.hop_length),
)
-
- if self.window is not None:
- window_sum = window_sumsquare(
- self.window,
- magnitude.size(-1),
- hop_length=self.hop_length,
- win_length=self.win_length,
- n_fft=self.filter_length,
- dtype=np.float32,
- )
- # remove modulation effects
- approx_nonzero_indices = torch.from_numpy(
- np.where(window_sum > tiny(window_sum))[0]
- )
- window_sum = torch.from_numpy(window_sum).to(inverse_transform.device)
- inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
- approx_nonzero_indices
- ]
-
- # scale by hop ratio
- inverse_transform *= float(self.filter_length) / self.hop_length
-
- inverse_transform = inverse_transform[..., self.pad_amount :]
- inverse_transform = inverse_transform[..., : self.num_samples]
- inverse_transform = inverse_transform.squeeze(1)
-
+ inverse_transform = torch.matmul(self.inverse_basis, cat)
+ inverse_transform = fold(inverse_transform)[
+ :, 0, 0, self.pad_amount : -self.pad_amount
+ ]
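+        # Divide out the folded squared window, replacing the removed window_sumsquare helper.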
+ window_square_sum = (
+ self.fft_window.pow(2).repeat(cat.size(-1), 1).T.unsqueeze(0)
+ )
+ window_square_sum = fold(window_square_sum)[
+ :, 0, 0, self.pad_amount : -self.pad_amount
+ ]
+ inverse_transform /= window_square_sum
return inverse_transform
def forward(self, input_data):
@@ -228,7 +151,7 @@ class STFT(torch.nn.Module):
reconstruction {tensor} -- Reconstructed audio given magnitude and phase. Of
shape (num_batch, num_samples)
"""
- self.magnitude, self.phase = self.transform(input_data)
+ self.magnitude, self.phase = self.transform(input_data, return_phase=True)
reconstruction = self.inverse(self.magnitude, self.phase)
return reconstruction
@@ -276,17 +199,15 @@ class ConvBlockRes(nn.Module):
nn.BatchNorm2d(out_channels, momentum=momentum),
nn.ReLU(),
)
+ # self.shortcut:Optional[nn.Module] = None
if in_channels != out_channels:
self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
- self.is_shortcut = True
- else:
- self.is_shortcut = False
- def forward(self, x):
- if self.is_shortcut:
- return self.conv(x) + self.shortcut(x)
- else:
+ def forward(self, x: torch.Tensor):
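+        # self.shortcut is only created when in/out channel counts differ, so probe for it instead of keeping a flag.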
+ if not hasattr(self, "shortcut"):
return self.conv(x) + x
+ else:
+ return self.conv(x) + self.shortcut(x)
class Encoder(nn.Module):
@@ -318,12 +239,12 @@ class Encoder(nn.Module):
self.out_size = in_size
self.out_channel = out_channels
- def forward(self, x):
- concat_tensors = []
+ def forward(self, x: torch.Tensor):
+ concat_tensors: List[torch.Tensor] = []
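+        # The explicit List[torch.Tensor] annotation gives TorchScript the element type of the empty list.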
x = self.bn(x)
- for i in range(self.n_encoders):
- _, x = self.layers[i](x)
- concat_tensors.append(_)
+ for i, layer in enumerate(self.layers):
+ t, x = layer(x)
+ concat_tensors.append(t)
return x, concat_tensors
@@ -342,8 +263,8 @@ class ResEncoderBlock(nn.Module):
self.pool = nn.AvgPool2d(kernel_size=kernel_size)
def forward(self, x):
- for i in range(self.n_blocks):
- x = self.conv[i](x)
+ for i, conv in enumerate(self.conv):
+ x = conv(x)
if self.kernel_size is not None:
return x, self.pool(x)
else:
@@ -364,8 +285,8 @@ class Intermediate(nn.Module): #
)
def forward(self, x):
- for i in range(self.n_inters):
- x = self.layers[i](x)
+ for i, layer in enumerate(self.layers):
+ x = layer(x)
return x
@@ -395,8 +316,8 @@ class ResDecoderBlock(nn.Module):
def forward(self, x, concat_tensor):
x = self.conv1(x)
x = torch.cat((x, concat_tensor), dim=1)
- for i in range(self.n_blocks):
- x = self.conv2[i](x)
+ for i, conv2 in enumerate(self.conv2):
+ x = conv2(x)
return x
@@ -412,9 +333,9 @@ class Decoder(nn.Module):
)
in_channels = out_channels
- def forward(self, x, concat_tensors):
- for i in range(self.n_decoders):
- x = self.layers[i](x, concat_tensors[-1 - i])
+ def forward(self, x: torch.Tensor, concat_tensors: List[torch.Tensor]):
+ for i, layer in enumerate(self.layers):
+ x = layer(x, concat_tensors[-1 - i])
return x
@@ -442,7 +363,7 @@ class DeepUnet(nn.Module):
self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
)
- def forward(self, x):
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
x, concat_tensors = self.encoder(x)
x = self.intermediate(x)
x = self.decoder(x, concat_tensors)
@@ -536,33 +457,28 @@ class MelSpectrogram(torch.nn.Module):
keyshift_key = str(keyshift) + "_" + str(audio.device)
if keyshift_key not in self.hann_window:
self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
- # "cpu"if(audio.device.type=="privateuseone") else audio.device
audio.device
)
- # fft = torch.stft(#doesn't support pytorch_dml
- # # audio.cpu() if(audio.device.type=="privateuseone")else audio,
- # audio,
- # n_fft=n_fft_new,
- # hop_length=hop_length_new,
- # win_length=win_length_new,
- # window=self.hann_window[keyshift_key],
- # center=center,
- # return_complex=True,
- # )
- # magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
- # print(1111111111)
- # print(222222222222222,audio.device,self.is_half)
- if hasattr(self, "stft") == False:
- # print(n_fft_new,hop_length_new,win_length_new,audio.shape)
- self.stft = STFT(
- filter_length=n_fft_new,
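+        # torch.stft is unavailable on DirectML ("privateuseone") devices, so fall back to the matmul-based STFT there.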
+ if "privateuseone" in str(audio.device):
+ if not hasattr(self, "stft"):
+ self.stft = STFT(
+ filter_length=n_fft_new,
+ hop_length=hop_length_new,
+ win_length=win_length_new,
+ window="hann",
+ ).to(audio.device)
+ magnitude = self.stft.transform(audio)
+ else:
+ fft = torch.stft(
+ audio,
+ n_fft=n_fft_new,
hop_length=hop_length_new,
win_length=win_length_new,
- window="hann",
- ).to(audio.device)
- magnitude = self.stft.transform(audio) # phase
- # if (audio.device.type == "privateuseone"):
- # magnitude=magnitude.to(audio.device)
+ window=self.hann_window[keyshift_key],
+ center=center,
+ return_complex=True,
+ )
+ magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
if keyshift != 0:
size = self.n_fft // 2 + 1
resize = magnitude.size(1)
@@ -573,17 +489,16 @@ class MelSpectrogram(torch.nn.Module):
if self.is_half == True:
mel_output = mel_output.half()
log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
- # print(log_mel_spec.device.type)
return log_mel_spec
class RMVPE:
- def __init__(self, model_path, is_half, device=None):
+ def __init__(self, model_path: str, is_half, device=None, use_jit=False):
self.resample_kernel = {}
self.resample_kernel = {}
self.is_half = is_half
if device is None:
- device = "cuda" if torch.cuda.is_available() else "cpu"
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
self.device = device
self.mel_extractor = MelSpectrogram(
is_half, 128, 16000, 1024, 160, None, 30, 8000
@@ -597,13 +512,56 @@ class RMVPE:
)
self.model = ort_session
else:
- model = E2E(4, 1, (2, 2))
- ckpt = torch.load(model_path, map_location="cpu")
- model.load_state_dict(ckpt)
- model.eval()
- if is_half == True:
- model = model.half()
- self.model = model
+ if str(self.device) == "cuda":
+ self.device = torch.device("cuda:0")
+
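+            # Load a TorchScript build cached next to the checkpoint, re-exporting it when missing or built for another device.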
+ def get_jit_model():
+                jit_model_path = os.path.splitext(model_path)[0]  # drop the .pt/.pth extension
+ jit_model_path += ".half.jit" if is_half else ".jit"
+ reload = False
+ if os.path.exists(jit_model_path):
+ ckpt = jit.load(jit_model_path)
+ model_device = ckpt["device"]
+ if model_device != str(self.device):
+ reload = True
+ else:
+ reload = True
+
+ if reload:
+ ckpt = jit.rmvpe_jit_export(
+ model_path=model_path,
+ mode="script",
+ inputs_path=None,
+ save_path=jit_model_path,
+                        device=self.device,  # self.device is the normalized device (e.g. "cuda" -> "cuda:0")
+                        is_half=is_half,
+                    )
+                model = torch.jit.load(BytesIO(ckpt["model"]), map_location=self.device)
+ return model
+
+ def get_default_model():
+ model = E2E(4, 1, (2, 2))
+ ckpt = torch.load(model_path, map_location="cpu")
+ model.load_state_dict(ckpt)
+ model.eval()
+ if is_half:
+ model = model.half()
+ else:
+ model = model.float()
+ return model
+
+ if use_jit:
+ if is_half and "cpu" in str(self.device):
+ logger.warning(
+                        "Use default rmvpe model. "
+                        "Jit is not supported on the CPU for half floating point"
+ )
+ self.model = get_default_model()
+ else:
+ self.model = get_jit_model()
+ else:
+ self.model = get_default_model()
+
self.model = self.model.to(device)
cents_mapping = 20 * np.arange(360) + 1997.3794084376191
self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368
@@ -611,9 +569,9 @@ class RMVPE:
def mel2hidden(self, mel):
with torch.no_grad():
n_frames = mel.shape[-1]
- mel = F.pad(
- mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="constant"
- )
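+            # Round the frame count up to a multiple of 32, padding only when actually needed.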
+ n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
+ if n_pad > 0:
+ mel = F.pad(mel, (0, n_pad), mode="constant")
if "privateuseone" in str(self.device):
onnx_input_name = self.model.get_inputs()[0].name
onnx_outputs_names = self.model.get_outputs()[0].name
@@ -622,6 +580,7 @@ class RMVPE:
input_feed={onnx_input_name: mel.cpu().numpy()},
)[0]
else:
+ mel = mel.half() if self.is_half else mel.float()
hidden = self.model(mel)
return hidden[:, :n_frames]
diff --git a/infer/modules/ipex/__init__.py b/infer/modules/ipex/__init__.py
index f8ad98a..cd27bc1 100644
--- a/infer/modules/ipex/__init__.py
+++ b/infer/modules/ipex/__init__.py
@@ -17,7 +17,6 @@ def ipex_init(): # pylint: disable=too-many-statements
torch.cuda.device = torch.xpu.device
torch.cuda.device_count = torch.xpu.device_count
torch.cuda.device_of = torch.xpu.device_of
- torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard
torch.cuda.get_device_name = torch.xpu.get_device_name
torch.cuda.get_device_properties = torch.xpu.get_device_properties
torch.cuda.init = torch.xpu.init
@@ -169,9 +168,23 @@ def ipex_init(): # pylint: disable=too-many-statements
torch.cuda.get_device_properties.minor = 7
torch.cuda.ipc_collect = lambda *args, **kwargs: None
torch.cuda.utilization = lambda *args, **kwargs: 0
+ if hasattr(torch.xpu, "getDeviceIdListForCard"):
+ torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard
+ torch.cuda.get_device_id_list_per_card = torch.xpu.getDeviceIdListForCard
+ else:
+ torch.cuda.getDeviceIdListForCard = torch.xpu.get_device_id_list_per_card
+ torch.cuda.get_device_id_list_per_card = (
+ torch.xpu.get_device_id_list_per_card
+ )
ipex_hijacks()
attention_init()
+ try:
+ from .diffusers import ipex_diffusers
+
+ ipex_diffusers()
+ except Exception: # pylint: disable=broad-exception-caught
+ pass
except Exception as e:
return False, e
return True, None
diff --git a/infer/modules/ipex/attention.py b/infer/modules/ipex/attention.py
index be17f7a..0cc2803 100644
--- a/infer/modules/ipex/attention.py
+++ b/infer/modules/ipex/attention.py
@@ -16,17 +16,15 @@ def torch_bmm(input, mat2, *, out=None):
input.shape[1],
mat2.shape[2],
)
- block_multiply = 2.4 if input.dtype == torch.float32 else 1.2
- block_size = (
- (batch_size_attention * input_tokens * mat2_shape) / 1024 * block_multiply
- ) # MB
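+    # Derive the slice size from the dtype's byte width (element_size) so the 4 GB single-block limit holds for any precision.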
+ block_multiply = input.element_size()
+ slice_block_size = input_tokens * mat2_shape / 1024 / 1024 * block_multiply
+ block_size = batch_size_attention * slice_block_size
+
split_slice_size = batch_size_attention
- if block_size >= 4000:
+ if block_size > 4:
do_split = True
# Find something divisible with the input_tokens
- while (
- (split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply
- ) > 4000:
+ while (split_slice_size * slice_block_size) > 4:
split_slice_size = split_slice_size // 2
if split_slice_size <= 1:
split_slice_size = 1
@@ -34,16 +32,12 @@ def torch_bmm(input, mat2, *, out=None):
else:
do_split = False
- split_block_size = (
- (split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply
- ) # MB
split_2_slice_size = input_tokens
- if split_block_size >= 4000:
+ if split_slice_size * slice_block_size > 4:
+ slice_block_size2 = split_slice_size * mat2_shape / 1024 / 1024 * block_multiply
do_split_2 = True
# Find something divisible with the input_tokens
- while (
- (split_slice_size * split_2_slice_size * mat2_shape) / 1024 * block_multiply
- ) > 4000:
+ while (split_2_slice_size * slice_block_size2) > 4:
split_2_slice_size = split_2_slice_size // 2
if split_2_slice_size <= 1:
split_2_slice_size = 1
@@ -91,22 +85,25 @@ def scaled_dot_product_attention(
query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False
):
# ARC GPUs can't allocate more than 4GB to a single block, Slice it:
- shape_one, batch_size_attention, query_tokens, shape_four = query.shape
- block_multiply = 2.4 if query.dtype == torch.float32 else 1.2
- block_size = (
- (shape_one * batch_size_attention * query_tokens * shape_four)
- / 1024
- * block_multiply
- ) # MB
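+    # Accept both 3-D (batch, tokens, head_dim) and 4-D (extra leading dim) attention inputs.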
+ if len(query.shape) == 3:
+ batch_size_attention, query_tokens, shape_four = query.shape
+ shape_one = 1
+ no_shape_one = True
+ else:
+ shape_one, batch_size_attention, query_tokens, shape_four = query.shape
+ no_shape_one = False
+
+ block_multiply = query.element_size()
+ slice_block_size = (
+ shape_one * query_tokens * shape_four / 1024 / 1024 * block_multiply
+ )
+ block_size = batch_size_attention * slice_block_size
+
split_slice_size = batch_size_attention
- if block_size >= 4000:
+ if block_size > 4:
do_split = True
# Find something divisible with the shape_one
- while (
- (shape_one * split_slice_size * query_tokens * shape_four)
- / 1024
- * block_multiply
- ) > 4000:
+ while (split_slice_size * slice_block_size) > 4:
split_slice_size = split_slice_size // 2
if split_slice_size <= 1:
split_slice_size = 1
@@ -114,20 +111,14 @@ def scaled_dot_product_attention(
else:
do_split = False
- split_block_size = (
- (shape_one * split_slice_size * query_tokens * shape_four)
- / 1024
- * block_multiply
- ) # MB
split_2_slice_size = query_tokens
- if split_block_size >= 4000:
+ if split_slice_size * slice_block_size > 4:
+ slice_block_size2 = (
+ shape_one * split_slice_size * shape_four / 1024 / 1024 * block_multiply
+ )
do_split_2 = True
# Find something divisible with the batch_size_attention
- while (
- (shape_one * split_slice_size * split_2_slice_size * shape_four)
- / 1024
- * block_multiply
- ) > 4000:
+ while (split_2_slice_size * slice_block_size2) > 4:
split_2_slice_size = split_2_slice_size // 2
if split_2_slice_size <= 1:
split_2_slice_size = 1
@@ -146,31 +137,63 @@ def scaled_dot_product_attention(
): # pylint: disable=invalid-name
start_idx_2 = i2 * split_2_slice_size
end_idx_2 = (i2 + 1) * split_2_slice_size
+ if no_shape_one:
+ hidden_states[
+ start_idx:end_idx, start_idx_2:end_idx_2
+ ] = original_scaled_dot_product_attention(
+ query[start_idx:end_idx, start_idx_2:end_idx_2],
+ key[start_idx:end_idx, start_idx_2:end_idx_2],
+ value[start_idx:end_idx, start_idx_2:end_idx_2],
+ attn_mask=attn_mask[
+ start_idx:end_idx, start_idx_2:end_idx_2
+ ]
+ if attn_mask is not None
+ else attn_mask,
+ dropout_p=dropout_p,
+ is_causal=is_causal,
+ )
+ else:
+ hidden_states[
+ :, start_idx:end_idx, start_idx_2:end_idx_2
+ ] = original_scaled_dot_product_attention(
+ query[:, start_idx:end_idx, start_idx_2:end_idx_2],
+ key[:, start_idx:end_idx, start_idx_2:end_idx_2],
+ value[:, start_idx:end_idx, start_idx_2:end_idx_2],
+ attn_mask=attn_mask[
+ :, start_idx:end_idx, start_idx_2:end_idx_2
+ ]
+ if attn_mask is not None
+ else attn_mask,
+ dropout_p=dropout_p,
+ is_causal=is_causal,
+ )
+ else:
+ if no_shape_one:
hidden_states[
- :, start_idx:end_idx, start_idx_2:end_idx_2
+ start_idx:end_idx
] = original_scaled_dot_product_attention(
- query[:, start_idx:end_idx, start_idx_2:end_idx_2],
- key[:, start_idx:end_idx, start_idx_2:end_idx_2],
- value[:, start_idx:end_idx, start_idx_2:end_idx_2],
- attn_mask=attn_mask[:, start_idx:end_idx, start_idx_2:end_idx_2]
+ query[start_idx:end_idx],
+ key[start_idx:end_idx],
+ value[start_idx:end_idx],
+ attn_mask=attn_mask[start_idx:end_idx]
+ if attn_mask is not None
+ else attn_mask,
+ dropout_p=dropout_p,
+ is_causal=is_causal,
+ )
+ else:
+ hidden_states[
+ :, start_idx:end_idx
+ ] = original_scaled_dot_product_attention(
+ query[:, start_idx:end_idx],
+ key[:, start_idx:end_idx],
+ value[:, start_idx:end_idx],
+ attn_mask=attn_mask[:, start_idx:end_idx]
if attn_mask is not None
else attn_mask,
dropout_p=dropout_p,
is_causal=is_causal,
)
- else:
- hidden_states[
- :, start_idx:end_idx
- ] = original_scaled_dot_product_attention(
- query[:, start_idx:end_idx],
- key[:, start_idx:end_idx],
- value[:, start_idx:end_idx],
- attn_mask=attn_mask[:, start_idx:end_idx]
- if attn_mask is not None
- else attn_mask,
- dropout_p=dropout_p,
- is_causal=is_causal,
- )
else:
return original_scaled_dot_product_attention(
query,
diff --git a/infer/modules/train/train.py b/infer/modules/train/train.py
index 763ad06..ad9a5b5 100644
--- a/infer/modules/train/train.py
+++ b/infer/modules/train/train.py
@@ -23,14 +23,16 @@ try:
if torch.xpu.is_available():
from infer.modules.ipex import ipex_init
- from infer.modules.ipex.gradscaler import gradscaler_init
+
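+        # Apply the IPEX hijacks before importing the xpu autocast/GradScaler helpers.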
+ ipex_init()
+
from torch.xpu.amp import autocast
+ from infer.modules.ipex.gradscaler import gradscaler_init
GradScaler = gradscaler_init()
- ipex_init()
else:
from torch.cuda.amp import GradScaler, autocast
-except Exception:
+except Exception: # pylint: disable=broad-exception-caught
from torch.cuda.amp import GradScaler, autocast
torch.backends.cudnn.deterministic = False
@@ -104,14 +106,11 @@ def main():
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = str(randint(20000, 55555))
children = []
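+    # Create the logger once in the parent process and pass it to every worker.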
+ logger = utils.get_logger(hps.model_dir)
for i in range(n_gpus):
subproc = mp.Process(
target=run,
- args=(
- i,
- n_gpus,
- hps,
- ),
+ args=(i, n_gpus, hps, logger),
)
children.append(subproc)
subproc.start()
@@ -120,10 +119,10 @@ def main():
children[i].join()
-def run(rank, n_gpus, hps):
+def run(rank, n_gpus, hps, logger: logging.Logger):
global global_step
if rank == 0:
- logger = utils.get_logger(hps.model_dir)
+ # logger = utils.get_logger(hps.model_dir)
logger.info(hps)
# utils.check_git_hash(hps.model_dir)
writer = SummaryWriter(log_dir=hps.model_dir)
diff --git a/infer/modules/uvr5/preprocess.py b/infer/modules/uvr5/preprocess.py
index 19f1111..c22b291 100644
--- a/infer/modules/uvr5/preprocess.py
+++ b/infer/modules/uvr5/preprocess.py
@@ -16,13 +16,13 @@ from infer.lib.uvr5_pack.utils import inference
class AudioPre:
- def __init__(self, agg, model_path, device, is_half):
+ def __init__(self, agg, model_path, device, is_half, tta=False):
self.model_path = model_path
self.device = device
self.data = {
# Processing Options
"postprocess": False,
- "tta": False,
+ "tta": tta,
# Constants
"window_size": 512,
"agg": agg,
@@ -180,13 +180,13 @@ class AudioPre:
class AudioPreDeEcho:
- def __init__(self, agg, model_path, device, is_half):
+ def __init__(self, agg, model_path, device, is_half, tta=False):
self.model_path = model_path
self.device = device
self.data = {
# Processing Options
"postprocess": False,
- "tta": False,
+ "tta": tta,
# Constants
"window_size": 512,
"agg": agg,
diff --git a/infer/modules/vc/modules.py b/infer/modules/vc/modules.py
index d90379b..3e7cdbb 100644
--- a/infer/modules/vc/modules.py
+++ b/infer/modules/vc/modules.py
@@ -54,16 +54,10 @@ class VC:
if sid == "" or sid == []:
if self.hubert_model is not None: # 考虑到轮询, 需要加个判断看是否 sid 是由有模型切换到无模型的
logger.info("Clean model cache")
- del (
- self.net_g,
- self.n_spk,
- self.vc,
- self.hubert_model,
- self.tgt_sr,
- ) # ,cpt
+ del (self.net_g, self.n_spk, self.hubert_model, self.tgt_sr) # ,cpt
self.hubert_model = (
self.net_g
- ) = self.n_spk = self.vc = self.hubert_model = self.tgt_sr = None
+            ) = self.n_spk = self.tgt_sr = None
if torch.cuda.is_available():
torch.cuda.empty_cache()
###楼下不这么折腾清理不干净
diff --git a/modules.py b/modules.py
deleted file mode 100644
index d90379b..0000000
--- a/modules.py
+++ /dev/null
@@ -1,307 +0,0 @@
-import traceback
-import logging
-
-logger = logging.getLogger(__name__)
-
-import numpy as np
-import soundfile as sf
-import torch
-from io import BytesIO
-
-from infer.lib.audio import load_audio, wav2
-from infer.lib.infer_pack.models import (
- SynthesizerTrnMs256NSFsid,
- SynthesizerTrnMs256NSFsid_nono,
- SynthesizerTrnMs768NSFsid,
- SynthesizerTrnMs768NSFsid_nono,
-)
-from infer.modules.vc.pipeline import Pipeline
-from infer.modules.vc.utils import *
-
-
-class VC:
- def __init__(self, config):
- self.n_spk = None
- self.tgt_sr = None
- self.net_g = None
- self.pipeline = None
- self.cpt = None
- self.version = None
- self.if_f0 = None
- self.version = None
- self.hubert_model = None
-
- self.config = config
-
- def get_vc(self, sid, *to_return_protect):
- logger.info("Get sid: " + sid)
-
- to_return_protect0 = {
- "visible": self.if_f0 != 0,
- "value": to_return_protect[0]
- if self.if_f0 != 0 and to_return_protect
- else 0.5,
- "__type__": "update",
- }
- to_return_protect1 = {
- "visible": self.if_f0 != 0,
- "value": to_return_protect[1]
- if self.if_f0 != 0 and to_return_protect
- else 0.33,
- "__type__": "update",
- }
-
- if sid == "" or sid == []:
- if self.hubert_model is not None: # 考虑到轮询, 需要加个判断看是否 sid 是由有模型切换到无模型的
- logger.info("Clean model cache")
- del (
- self.net_g,
- self.n_spk,
- self.vc,
- self.hubert_model,
- self.tgt_sr,
- ) # ,cpt
- self.hubert_model = (
- self.net_g
- ) = self.n_spk = self.vc = self.hubert_model = self.tgt_sr = None
- if torch.cuda.is_available():
- torch.cuda.empty_cache()
- ###楼下不这么折腾清理不干净
- self.if_f0 = self.cpt.get("f0", 1)
- self.version = self.cpt.get("version", "v1")
- if self.version == "v1":
- if self.if_f0 == 1:
- self.net_g = SynthesizerTrnMs256NSFsid(
- *self.cpt["config"], is_half=self.config.is_half
- )
- else:
- self.net_g = SynthesizerTrnMs256NSFsid_nono(*self.cpt["config"])
- elif self.version == "v2":
- if self.if_f0 == 1:
- self.net_g = SynthesizerTrnMs768NSFsid(
- *self.cpt["config"], is_half=self.config.is_half
- )
- else:
- self.net_g = SynthesizerTrnMs768NSFsid_nono(*self.cpt["config"])
- del self.net_g, self.cpt
- if torch.cuda.is_available():
- torch.cuda.empty_cache()
- return (
- {"visible": False, "__type__": "update"},
- {
- "visible": True,
- "value": to_return_protect0,
- "__type__": "update",
- },
- {
- "visible": True,
- "value": to_return_protect1,
- "__type__": "update",
- },
- "",
- "",
- )
- person = f'{os.getenv("weight_root")}/{sid}'
- logger.info(f"Loading: {person}")
-
- self.cpt = torch.load(person, map_location="cpu")
- self.tgt_sr = self.cpt["config"][-1]
- self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0] # n_spk
- self.if_f0 = self.cpt.get("f0", 1)
- self.version = self.cpt.get("version", "v1")
-
- synthesizer_class = {
- ("v1", 1): SynthesizerTrnMs256NSFsid,
- ("v1", 0): SynthesizerTrnMs256NSFsid_nono,
- ("v2", 1): SynthesizerTrnMs768NSFsid,
- ("v2", 0): SynthesizerTrnMs768NSFsid_nono,
- }
-
- self.net_g = synthesizer_class.get(
- (self.version, self.if_f0), SynthesizerTrnMs256NSFsid
- )(*self.cpt["config"], is_half=self.config.is_half)
-
- del self.net_g.enc_q
-
- self.net_g.load_state_dict(self.cpt["weight"], strict=False)
- self.net_g.eval().to(self.config.device)
- if self.config.is_half:
- self.net_g = self.net_g.half()
- else:
- self.net_g = self.net_g.float()
-
- self.pipeline = Pipeline(self.tgt_sr, self.config)
- n_spk = self.cpt["config"][-3]
- index = {"value": get_index_path_from_model(sid), "__type__": "update"}
- logger.info("Select index: " + index["value"])
-
- return (
- (
- {"visible": True, "maximum": n_spk, "__type__": "update"},
- to_return_protect0,
- to_return_protect1,
- index,
- index,
- )
- if to_return_protect
- else {"visible": True, "maximum": n_spk, "__type__": "update"}
- )
-
- def vc_single(
- self,
- sid,
- input_audio_path,
- f0_up_key,
- f0_file,
- f0_method,
- file_index,
- file_index2,
- index_rate,
- filter_radius,
- resample_sr,
- rms_mix_rate,
- protect,
- ):
- if input_audio_path is None:
- return "You need to upload an audio", None
- f0_up_key = int(f0_up_key)
- try:
- audio = load_audio(input_audio_path, 16000)
- audio_max = np.abs(audio).max() / 0.95
- if audio_max > 1:
- audio /= audio_max
- times = [0, 0, 0]
-
- if self.hubert_model is None:
- self.hubert_model = load_hubert(self.config)
-
- file_index = (
- (
- file_index.strip(" ")
- .strip('"')
- .strip("\n")
- .strip('"')
- .strip(" ")
- .replace("trained", "added")
- )
- if file_index != ""
- else file_index2
- ) # 防止小白写错,自动帮他替换掉
-
- audio_opt = self.pipeline.pipeline(
- self.hubert_model,
- self.net_g,
- sid,
- audio,
- input_audio_path,
- times,
- f0_up_key,
- f0_method,
- file_index,
- index_rate,
- self.if_f0,
- filter_radius,
- self.tgt_sr,
- resample_sr,
- rms_mix_rate,
- self.version,
- protect,
- f0_file,
- )
- if self.tgt_sr != resample_sr >= 16000:
- tgt_sr = resample_sr
- else:
- tgt_sr = self.tgt_sr
- index_info = (
- "Index:\n%s." % file_index
- if os.path.exists(file_index)
- else "Index not used."
- )
- return (
- "Success.\n%s\nTime:\nnpy: %.2fs, f0: %.2fs, infer: %.2fs."
- % (index_info, *times),
- (tgt_sr, audio_opt),
- )
- except:
- info = traceback.format_exc()
- logger.warning(info)
- return info, (None, None)
-
- def vc_multi(
- self,
- sid,
- dir_path,
- opt_root,
- paths,
- f0_up_key,
- f0_method,
- file_index,
- file_index2,
- index_rate,
- filter_radius,
- resample_sr,
- rms_mix_rate,
- protect,
- format1,
- ):
- try:
- dir_path = (
- dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
- ) # 防止小白拷路径头尾带了空格和"和回车
- opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
- os.makedirs(opt_root, exist_ok=True)
- try:
- if dir_path != "":
- paths = [
- os.path.join(dir_path, name) for name in os.listdir(dir_path)
- ]
- else:
- paths = [path.name for path in paths]
- except:
- traceback.print_exc()
- paths = [path.name for path in paths]
- infos = []
- for path in paths:
- info, opt = self.vc_single(
- sid,
- path,
- f0_up_key,
- None,
- f0_method,
- file_index,
- file_index2,
- # file_big_npy,
- index_rate,
- filter_radius,
- resample_sr,
- rms_mix_rate,
- protect,
- )
- if "Success" in info:
- try:
- tgt_sr, audio_opt = opt
- if format1 in ["wav", "flac"]:
- sf.write(
- "%s/%s.%s"
- % (opt_root, os.path.basename(path), format1),
- audio_opt,
- tgt_sr,
- )
- else:
- path = "%s/%s.%s" % (
- opt_root,
- os.path.basename(path),
- format1,
- )
- with BytesIO() as wavf:
- sf.write(wavf, audio_opt, tgt_sr, format="wav")
- wavf.seek(0, 0)
- with open(path, "wb") as outf:
- wav2(wavf, outf, format1)
- except:
- info += traceback.format_exc()
- infos.append("%s->%s" % (os.path.basename(path), info))
- yield "\n".join(infos)
- yield "\n".join(infos)
- except:
- yield traceback.format_exc()
diff --git a/requirements-ipex.txt b/requirements-ipex.txt
index 1a96cf0..610a0ce 100644
--- a/requirements-ipex.txt
+++ b/requirements-ipex.txt
@@ -2,7 +2,7 @@ torch==2.0.1a0
intel_extension_for_pytorch==2.0.110+xpu
torchvision==0.15.2a0
https://github.com/Disty0/Retrieval-based-Voice-Conversion-WebUI/releases/download/torchaudio_wheels_for_ipex/torchaudio-2.0.2+31de77d-cp310-cp310-linux_x86_64.whl
--f https://developer.intel.com/ipex-whl-stable-xpu
+--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
joblib>=1.1.0
numba==0.56.4
numpy==1.23.5
diff --git a/tools/download_models.py b/tools/download_models.py
new file mode 100644
index 0000000..94e0389
--- /dev/null
+++ b/tools/download_models.py
@@ -0,0 +1,79 @@
+import os
+from pathlib import Path
+import requests
+
+RVC_DOWNLOAD_LINK = "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/"
+
+BASE_DIR = Path(__file__).resolve().parent.parent
+
+
+def dl_model(link, model_name, dir_name):
+    with requests.get(f"{link}{model_name}", stream=True) as r:  # stream so large checkpoints are not buffered in memory
+ r.raise_for_status()
+ os.makedirs(os.path.dirname(dir_name / model_name), exist_ok=True)
+ with open(dir_name / model_name, "wb") as f:
+ for chunk in r.iter_content(chunk_size=8192):
+ f.write(chunk)
+
+
+if __name__ == "__main__":
+ print("Downloading hubert_base.pt...")
+ dl_model(RVC_DOWNLOAD_LINK, "hubert_base.pt", BASE_DIR / "assets/hubert")
+ print("Downloading rmvpe.pt...")
+ dl_model(RVC_DOWNLOAD_LINK, "rmvpe.pt", BASE_DIR / "assets/rmvpe")
+ print("Downloading vocals.onnx...")
+ dl_model(
+ RVC_DOWNLOAD_LINK + "uvr5_weights/onnx_dereverb_By_FoxJoy/",
+ "vocals.onnx",
+ BASE_DIR / "assets/uvr5_weights/onnx_dereverb_By_FoxJoy",
+ )
+
+ rvc_models_dir = BASE_DIR / "assets/pretrained"
+
+ print("Downloading pretrained models:")
+
+ model_names = [
+ "D32k.pth",
+ "D40k.pth",
+ "D48k.pth",
+ "G32k.pth",
+ "G40k.pth",
+ "G48k.pth",
+ "f0D32k.pth",
+ "f0D40k.pth",
+ "f0D48k.pth",
+ "f0G32k.pth",
+ "f0G40k.pth",
+ "f0G48k.pth",
+ ]
+ for model in model_names:
+ print(f"Downloading {model}...")
+ dl_model(RVC_DOWNLOAD_LINK + "pretrained/", model, rvc_models_dir)
+
+ rvc_models_dir = BASE_DIR / "assets/pretrained_v2"
+
+ print("Downloading pretrained models v2:")
+
+ for model in model_names:
+ print(f"Downloading {model}...")
+ dl_model(RVC_DOWNLOAD_LINK + "pretrained_v2/", model, rvc_models_dir)
+
+ print("Downloading uvr5_weights:")
+
+ rvc_models_dir = BASE_DIR / "assets/uvr5_weights"
+
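+    # These filenames are percent-encoded to match their exact paths on HuggingFace.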
+ model_names = [
+ "HP2-%E4%BA%BA%E5%A3%B0vocals%2B%E9%9D%9E%E4%BA%BA%E5%A3%B0instrumentals.pth",
+ "HP2_all_vocals.pth",
+ "HP3_all_vocals.pth",
+ "HP5-%E4%B8%BB%E6%97%8B%E5%BE%8B%E4%BA%BA%E5%A3%B0vocals%2B%E5%85%B6%E4%BB%96instrumentals.pth",
+ "HP5_only_main_vocal.pth",
+ "VR-DeEchoAggressive.pth",
+ "VR-DeEchoDeReverb.pth",
+ "VR-DeEchoNormal.pth",
+ ]
+ for model in model_names:
+ print(f"Downloading {model}...")
+ dl_model(RVC_DOWNLOAD_LINK + "uvr5_weights/", model, rvc_models_dir)
+
+ print("All models downloaded!")
diff --git a/tools/rvc_for_realtime.py b/tools/rvc_for_realtime.py
index 094e307..378c40b 100644
--- a/tools/rvc_for_realtime.py
+++ b/tools/rvc_for_realtime.py
@@ -1,12 +1,11 @@
+from io import BytesIO
import os
+import pickle
import sys
import traceback
-import logging
-
-logger = logging.getLogger(__name__)
-
+from infer.lib import jit
+from infer.lib.jit.get_synthesizer import get_synthesizer
from time import time as ttime
-
import fairseq
import faiss
import numpy as np
@@ -31,17 +30,16 @@ from multiprocessing import Manager as M
from configs.config import Config
-config = Config()
+# config = Config()
mm = M()
-if config.dml == True:
- def forward_dml(ctx, x, scale):
- ctx.scale = scale
- res = x.clone().detach()
- return res
- fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml
+def printt(strr, *args):
+ if len(args) == 0:
+ print(strr)
+ else:
+ print(strr % args)
# config.device=torch.device("cpu")########强制cpu测试
@@ -56,18 +54,27 @@ class RVC:
n_cpu,
inp_q,
opt_q,
- device,
+ config: Config,
last_rvc=None,
) -> None:
"""
初始化
"""
try:
- global config
+ if config.dml == True:
+
+ def forward_dml(ctx, x, scale):
+ ctx.scale = scale
+ res = x.clone().detach()
+ return res
+
+ fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml
+ # global config
+ self.config = config
self.inp_q = inp_q
self.opt_q = opt_q
# device="cpu"########强制cpu测试
- self.device = device
+ self.device = config.device
self.f0_up_key = key
self.time_step = 160 / 16000 * 1000
self.f0_min = 50
@@ -77,11 +84,14 @@ class RVC:
self.sr = 16000
self.window = 160
self.n_cpu = n_cpu
+ self.use_jit = self.config.use_jit
+ self.is_half = config.is_half
+
if index_rate != 0:
self.index = faiss.read_index(index_path)
self.big_npy = self.index.reconstruct_n(0, self.index.ntotal)
- logger.info("Index search enabled")
- self.pth_path = pth_path
+ printt("Index search enabled")
+ self.pth_path: str = pth_path
self.index_path = index_path
self.index_rate = index_rate
@@ -91,8 +101,8 @@ class RVC:
suffix="",
)
hubert_model = models[0]
- hubert_model = hubert_model.to(device)
- if config.is_half:
+ hubert_model = hubert_model.to(self.device)
+ if self.is_half:
hubert_model = hubert_model.half()
else:
hubert_model = hubert_model.float()
@@ -101,46 +111,80 @@ class RVC:
else:
self.model = last_rvc.model
- if last_rvc is None or last_rvc.pth_path != self.pth_path:
- cpt = torch.load(self.pth_path, map_location="cpu")
+ self.net_g: nn.Module = None
+
+ def set_default_model():
+ self.net_g, cpt = get_synthesizer(self.pth_path, self.device)
self.tgt_sr = cpt["config"][-1]
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
self.if_f0 = cpt.get("f0", 1)
self.version = cpt.get("version", "v1")
- if self.version == "v1":
- if self.if_f0 == 1:
- self.net_g = SynthesizerTrnMs256NSFsid(
- *cpt["config"], is_half=config.is_half
- )
- else:
- self.net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
- elif self.version == "v2":
- if self.if_f0 == 1:
- self.net_g = SynthesizerTrnMs768NSFsid(
- *cpt["config"], is_half=config.is_half
- )
- else:
- self.net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
- del self.net_g.enc_q
- logger.debug(self.net_g.load_state_dict(cpt["weight"], strict=False))
- self.net_g.eval().to(device)
- # print(2333333333,device,config.device,self.device)#net_g是device,hubert是config.device
- if config.is_half:
+ if self.is_half:
self.net_g = self.net_g.half()
else:
self.net_g = self.net_g.float()
- self.is_half = config.is_half
+
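+            # Load a TorchScript synthesizer cached next to the .pth, re-exporting when missing or built for another device.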
+ def set_jit_model():
+            jit_pth_path = os.path.splitext(self.pth_path)[0]  # drop the .pth extension
+ jit_pth_path += ".half.jit" if self.is_half else ".jit"
+ reload = False
+ if str(self.device) == "cuda":
+ self.device = torch.device("cuda:0")
+ if os.path.exists(jit_pth_path):
+ cpt = jit.load(jit_pth_path)
+ model_device = cpt["device"]
+ if model_device != str(self.device):
+ reload = True
+ else:
+ reload = True
+
+ if reload:
+ cpt = jit.synthesizer_jit_export(
+ self.pth_path,
+ "script",
+ None,
+ device=self.device,
+ is_half=self.is_half,
+ )
+
+ self.tgt_sr = cpt["config"][-1]
+ self.if_f0 = cpt.get("f0", 1)
+ self.version = cpt.get("version", "v1")
+ self.net_g = torch.jit.load(
+ BytesIO(cpt["model"]), map_location=self.device
+ )
+ self.net_g.infer = self.net_g.forward
+ self.net_g.eval().to(self.device)
+
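+            # Pick the synthesizer flavor: JIT is skipped on DML, and on CPU when running half precision.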
+ def set_synthesizer():
+ if self.use_jit and not config.dml:
+ if self.is_half and "cpu" in str(self.device):
+ printt(
+                        "Use default Synthesizer model. "
+                        "Jit is not supported on the CPU for half floating point"
+ )
+ set_default_model()
+ else:
+ set_jit_model()
+ else:
+ set_default_model()
+
+ if last_rvc is None or last_rvc.pth_path != self.pth_path:
+ set_synthesizer()
else:
self.tgt_sr = last_rvc.tgt_sr
self.if_f0 = last_rvc.if_f0
self.version = last_rvc.version
- self.net_g = last_rvc.net_g
self.is_half = last_rvc.is_half
+ if last_rvc.use_jit != self.use_jit:
+ set_synthesizer()
+ else:
+ self.net_g = last_rvc.net_g
if last_rvc is not None and hasattr(last_rvc, "model_rmvpe"):
self.model_rmvpe = last_rvc.model_rmvpe
except:
- logger.warning(traceback.format_exc())
+ printt(traceback.format_exc())
def change_key(self, new_key):
self.f0_up_key = new_key
@@ -149,7 +193,7 @@ class RVC:
if new_index_rate != 0 and self.index_rate == 0:
self.index = faiss.read_index(self.index_path)
self.big_npy = self.index.reconstruct_n(0, self.index.ntotal)
- logger.info("Index search enabled")
+ printt("Index search enabled")
self.index_rate = new_index_rate
def get_f0_post(self, f0):
@@ -188,7 +232,7 @@ class RVC:
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
- # print(pad_size, p_len - len(f0) - pad_size)
+ # printt(pad_size, p_len - len(f0) - pad_size)
f0 = np.pad(
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
)
@@ -243,7 +287,7 @@ class RVC:
if "privateuseone" in str(self.device): ###不支持dml,cpu又太慢用不成,拿pm顶替
return self.get_f0(x, f0_up_key, 1, "pm")
audio = torch.tensor(np.copy(x))[None].float()
- # print("using crepe,device:%s"%self.device)
+ # printt("using crepe,device:%s"%self.device)
f0, pd = torchcrepe.predict(
audio,
self.sr,
@@ -267,7 +311,7 @@ class RVC:
if hasattr(self, "model_rmvpe") == False:
from infer.lib.rmvpe import RMVPE
- logger.info("Loading rmvpe model")
+ printt("Loading rmvpe model")
self.model_rmvpe = RMVPE(
# "rmvpe.pt", is_half=self.is_half if self.device.type!="privateuseone" else False, device=self.device if self.device.type!="privateuseone"else "cpu"####dml时强制对rmvpe用cpu跑
# "rmvpe.pt", is_half=False, device=self.device####dml配置
@@ -275,6 +319,7 @@ class RVC:
"assets/rmvpe/rmvpe.pt",
is_half=self.is_half,
device=self.device, ####正常逻辑
+ use_jit=self.config.use_jit,
)
# self.model_rmvpe = RMVPE("aug2_58000_half.pt", is_half=self.is_half, device=self.device)
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
@@ -292,7 +337,7 @@ class RVC:
f0method,
) -> np.ndarray:
feats = feats.view(1, -1)
- if config.is_half:
+ if self.config.is_half:
feats = feats.half()
else:
feats = feats.float()
@@ -319,17 +364,17 @@ class RVC:
weight = np.square(1 / score)
weight /= weight.sum(axis=1, keepdims=True)
npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
- if config.is_half:
+ if self.config.is_half:
npy = npy.astype("float16")
feats[0][-leng_replace_head:] = (
torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate
+ (1 - self.index_rate) * feats[0][-leng_replace_head:]
)
else:
- logger.warning("Index search FAILED or disabled")
+ printt("Index search FAILED or disabled")
except:
- traceback.print_exc()
- logger.warning("Index search FAILED")
+            traceback.print_exc()
+ printt("Index search FAILED")
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
t3 = ttime()
if self.if_f0 == 1:
@@ -356,16 +401,21 @@ class RVC:
sid = torch.LongTensor([ii]).to(self.device)
with torch.no_grad():
if self.if_f0 == 1:
- # print(12222222222,feats.device,p_len.device,cache_pitch.device,cache_pitchf.device,sid.device,rate2)
+ # printt(12222222222,feats.device,p_len.device,cache_pitch.device,cache_pitchf.device,sid.device,rate2)
infered_audio = self.net_g.infer(
- feats, p_len, cache_pitch, cache_pitchf, sid, rate
+ feats,
+ p_len,
+ cache_pitch,
+ cache_pitchf,
+ sid,
+ torch.FloatTensor([rate]),
)[0][0, 0].data.float()
else:
- infered_audio = self.net_g.infer(feats, p_len, sid, rate)[0][
- 0, 0
- ].data.float()
+ infered_audio = self.net_g.infer(
+ feats, p_len, sid, torch.FloatTensor([rate])
+ )[0][0, 0].data.float()
t5 = ttime()
- logger.info(
+ printt(
"Spent time: fea = %.2fs, index = %.2fs, f0 = %.2fs, model = %.2fs",
t2 - t1,
t3 - t2,
diff --git a/tools/torchgate/torchgate.py b/tools/torchgate/torchgate.py
index f95ffef..e4b80c4 100644
--- a/tools/torchgate/torchgate.py
+++ b/tools/torchgate/torchgate.py
@@ -1,4 +1,5 @@
import torch
+from infer.lib.rmvpe import STFT
from torch.nn.functional import conv1d, conv2d
from typing import Union, Optional
from .utils import linspace, temperature_sigmoid, amp_to_db
@@ -139,17 +140,26 @@ class TorchGate(torch.nn.Module):
are set to 1, and the rest are set to 0.
"""
if xn is not None:
- XN = torch.stft(
- xn,
- n_fft=self.n_fft,
- hop_length=self.hop_length,
- win_length=self.win_length,
- return_complex=True,
- pad_mode="constant",
- center=True,
- window=torch.hann_window(self.win_length).to(xn.device),
- )
-
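+            # DirectML lacks torch.stft, so reuse the matmul-based STFT from infer.lib.rmvpe there.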
+ if "privateuseone" in str(xn.device):
+ if not hasattr(self, "stft"):
+ self.stft = STFT(
+ filter_length=self.n_fft,
+ hop_length=self.hop_length,
+ win_length=self.win_length,
+ window="hann",
+ ).to(xn.device)
+ XN = self.stft.transform(xn)
+ else:
+ XN = torch.stft(
+ xn,
+ n_fft=self.n_fft,
+ hop_length=self.hop_length,
+ win_length=self.win_length,
+ return_complex=True,
+ pad_mode="constant",
+ center=True,
+ window=torch.hann_window(self.win_length).to(xn.device),
+ )
XN_db = amp_to_db(XN).to(dtype=X_db.dtype)
else:
XN_db = X_db
@@ -213,16 +223,26 @@ class TorchGate(torch.nn.Module):
"""
# Compute short-time Fourier transform (STFT)
- X = torch.stft(
- x,
- n_fft=self.n_fft,
- hop_length=self.hop_length,
- win_length=self.win_length,
- return_complex=True,
- pad_mode="constant",
- center=True,
- window=torch.hann_window(self.win_length).to(x.device),
- )
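+        # Same DirectML fallback as above, but keep the phase so the waveform can be resynthesized after masking.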
+ if "privateuseone" in str(x.device):
+ if not hasattr(self, "stft"):
+ self.stft = STFT(
+ filter_length=self.n_fft,
+ hop_length=self.hop_length,
+ win_length=self.win_length,
+ window="hann",
+ ).to(x.device)
+ X, phase = self.stft.transform(x, return_phase=True)
+ else:
+ X = torch.stft(
+ x,
+ n_fft=self.n_fft,
+ hop_length=self.hop_length,
+ win_length=self.win_length,
+ return_complex=True,
+ pad_mode="constant",
+ center=True,
+ window=torch.hann_window(self.win_length).to(x.device),
+ )
# Compute signal mask based on stationary or nonstationary assumptions
if self.nonstationary:
@@ -231,7 +251,7 @@ class TorchGate(torch.nn.Module):
sig_mask = self._stationary_mask(amp_to_db(X), xn)
# Propagate decrease in signal power
- sig_mask = self.prop_decrease * (sig_mask * 1.0 - 1.0) + 1.0
+ sig_mask = self.prop_decrease * (sig_mask.float() - 1.0) + 1.0
# Smooth signal mask with 2D convolution
if self.smoothing_filter is not None:
@@ -245,13 +265,16 @@ class TorchGate(torch.nn.Module):
Y = X * sig_mask.squeeze(1)
# Inverse STFT to obtain time-domain signal
- y = torch.istft(
- Y,
- n_fft=self.n_fft,
- hop_length=self.hop_length,
- win_length=self.win_length,
- center=True,
- window=torch.hann_window(self.win_length).to(Y.device),
- )
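+        # On DirectML, invert with the saved phase via the custom STFT; otherwise use torch.istft.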
+ if "privateuseone" in str(Y.device):
+ y = self.stft.inverse(Y, phase)
+ else:
+ y = torch.istft(
+ Y,
+ n_fft=self.n_fft,
+ hop_length=self.hop_length,
+ win_length=self.win_length,
+ center=True,
+ window=torch.hann_window(self.win_length).to(Y.device),
+ )
return y.to(dtype=x.dtype)