Format code (#142)
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
This commit is contained in:
parent
376bd31c19
commit
b4c653142d
@ -2,27 +2,29 @@ from infer_pack.models_onnx_moess import SynthesizerTrnMs256NSFsidM
|
|||||||
from infer_pack.models_onnx import SynthesizerTrnMs256NSFsidO
|
from infer_pack.models_onnx import SynthesizerTrnMs256NSFsidO
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
MoeVS = True #模型是否为MoeVoiceStudio(原MoeSS)使用
|
MoeVS = True # 模型是否为MoeVoiceStudio(原MoeSS)使用
|
||||||
|
|
||||||
ModelPath = "Shiroha/shiroha.pth" #模型路径
|
ModelPath = "Shiroha/shiroha.pth" # 模型路径
|
||||||
ExportedPath = "model.onnx" #输出路径
|
ExportedPath = "model.onnx" # 输出路径
|
||||||
hidden_channels = 256 # hidden_channels,为768Vec做准备
|
hidden_channels = 256 # hidden_channels,为768Vec做准备
|
||||||
cpt = torch.load(ModelPath, map_location="cpu")
|
cpt = torch.load(ModelPath, map_location="cpu")
|
||||||
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
|
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
|
||||||
print(*cpt["config"])
|
print(*cpt["config"])
|
||||||
|
|
||||||
test_phone = torch.rand(1, 200, hidden_channels) # hidden unit
|
test_phone = torch.rand(1, 200, hidden_channels) # hidden unit
|
||||||
test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用)
|
test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用)
|
||||||
test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹)
|
test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹)
|
||||||
test_pitchf = torch.rand(1, 200) # nsf基频
|
test_pitchf = torch.rand(1, 200) # nsf基频
|
||||||
test_ds = torch.LongTensor([0]) # 说话人ID
|
test_ds = torch.LongTensor([0]) # 说话人ID
|
||||||
test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子)
|
test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子)
|
||||||
|
|
||||||
device = "cpu" #导出时设备(不影响使用模型)
|
device = "cpu" # 导出时设备(不影响使用模型)
|
||||||
|
|
||||||
if MoeVS:
|
if MoeVS:
|
||||||
net_g = SynthesizerTrnMs256NSFsidM(*cpt["config"], is_half=False) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16)
|
net_g = SynthesizerTrnMs256NSFsidM(
|
||||||
|
*cpt["config"], is_half=False
|
||||||
|
) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16)
|
||||||
net_g.load_state_dict(cpt["weight"], strict=False)
|
net_g.load_state_dict(cpt["weight"], strict=False)
|
||||||
input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
|
input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
|
||||||
output_names = [
|
output_names = [
|
||||||
@ -52,7 +54,9 @@ if __name__ == '__main__':
|
|||||||
output_names=output_names,
|
output_names=output_names,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
net_g = SynthesizerTrnMs256NSFsidO(*cpt["config"], is_half=False) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16)
|
net_g = SynthesizerTrnMs256NSFsidO(
|
||||||
|
*cpt["config"], is_half=False
|
||||||
|
) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16)
|
||||||
net_g.load_state_dict(cpt["weight"], strict=False)
|
net_g.load_state_dict(cpt["weight"], strict=False)
|
||||||
input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds"]
|
input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds"]
|
||||||
output_names = [
|
output_names = [
|
||||||
@ -78,4 +82,4 @@ if __name__ == '__main__':
|
|||||||
verbose=False,
|
verbose=False,
|
||||||
input_names=input_names,
|
input_names=input_names,
|
||||||
output_names=output_names,
|
output_names=output_names,
|
||||||
)
|
)
|
||||||
|
@ -35,7 +35,7 @@ class FeatureInput(object):
|
|||||||
def compute_f0(self, path, f0_method):
|
def compute_f0(self, path, f0_method):
|
||||||
# default resample type of librosa.resample is "soxr_hq".
|
# default resample type of librosa.resample is "soxr_hq".
|
||||||
# Quality: soxr_vhq > soxr_hq
|
# Quality: soxr_vhq > soxr_hq
|
||||||
x, sr = librosa.load(path, self.fs)#, res_type='soxr_vhq'
|
x, sr = librosa.load(path, self.fs) # , res_type='soxr_vhq'
|
||||||
p_len = x.shape[0] // self.hop
|
p_len = x.shape[0] // self.hop
|
||||||
assert sr == self.fs
|
assert sr == self.fs
|
||||||
if f0_method == "pm":
|
if f0_method == "pm":
|
||||||
|
17
gui.py
17
gui.py
@ -67,7 +67,7 @@ class RVC:
|
|||||||
print(e)
|
print(e)
|
||||||
|
|
||||||
def get_f0(self, x, f0_up_key, inp_f0=None):
|
def get_f0(self, x, f0_up_key, inp_f0=None):
|
||||||
x_pad=1
|
x_pad = 1
|
||||||
f0_min = 50
|
f0_min = 50
|
||||||
f0_max = 1100
|
f0_max = 1100
|
||||||
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
|
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
|
||||||
@ -137,7 +137,7 @@ class RVC:
|
|||||||
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
|
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
|
||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
print(feats.shape)
|
print(feats.shape)
|
||||||
if(self.if_f0==1):
|
if self.if_f0 == 1:
|
||||||
pitch, pitchf = self.get_f0(audio, self.f0_up_key)
|
pitch, pitchf = self.get_f0(audio, self.f0_up_key)
|
||||||
p_len = min(feats.shape[1], 13000, pitch.shape[0]) # 太大了爆显存
|
p_len = min(feats.shape[1], 13000, pitch.shape[0]) # 太大了爆显存
|
||||||
else:
|
else:
|
||||||
@ -146,7 +146,7 @@ class RVC:
|
|||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
# print(feats.shape,pitch.shape)
|
# print(feats.shape,pitch.shape)
|
||||||
feats = feats[:, :p_len, :]
|
feats = feats[:, :p_len, :]
|
||||||
if(self.if_f0==1):
|
if self.if_f0 == 1:
|
||||||
pitch = pitch[:p_len]
|
pitch = pitch[:p_len]
|
||||||
pitchf = pitchf[:p_len]
|
pitchf = pitchf[:p_len]
|
||||||
pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
|
pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
|
||||||
@ -155,17 +155,15 @@ class RVC:
|
|||||||
ii = 0 # sid
|
ii = 0 # sid
|
||||||
sid = torch.LongTensor([ii]).to(device)
|
sid = torch.LongTensor([ii]).to(device)
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
if(self.if_f0==1):
|
if self.if_f0 == 1:
|
||||||
infered_audio = (
|
infered_audio = (
|
||||||
self.net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
|
self.net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
|
||||||
.data.cpu()
|
.data.cpu()
|
||||||
.float()
|
.float()
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
infered_audio = (
|
infered_audio = (
|
||||||
self.net_g.infer(feats, p_len, sid)[0][0, 0]
|
self.net_g.infer(feats, p_len, sid)[0][0, 0].data.cpu().float()
|
||||||
.data.cpu()
|
|
||||||
.float()
|
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
return infered_audio
|
return infered_audio
|
||||||
@ -387,7 +385,7 @@ class GUI:
|
|||||||
self.config.pth_path,
|
self.config.pth_path,
|
||||||
self.config.index_path,
|
self.config.index_path,
|
||||||
self.config.npy_path,
|
self.config.npy_path,
|
||||||
self.config.index_rate
|
self.config.index_rate,
|
||||||
)
|
)
|
||||||
self.input_wav: np.ndarray = np.zeros(
|
self.input_wav: np.ndarray = np.zeros(
|
||||||
self.extra_frame
|
self.extra_frame
|
||||||
@ -511,7 +509,6 @@ class GUI:
|
|||||||
total_time = time.perf_counter() - start_time
|
total_time = time.perf_counter() - start_time
|
||||||
self.window["infer_time"].update(int(total_time * 1000))
|
self.window["infer_time"].update(int(total_time * 1000))
|
||||||
print("infer time:" + str(total_time))
|
print("infer time:" + str(total_time))
|
||||||
|
|
||||||
|
|
||||||
def get_devices(self, update: bool = True):
|
def get_devices(self, update: bool = True):
|
||||||
"""获取设备列表"""
|
"""获取设备列表"""
|
||||||
|
6
i18n.py
6
i18n.py
@ -11,8 +11,10 @@ def load_language_list(language):
|
|||||||
|
|
||||||
class I18nAuto:
|
class I18nAuto:
|
||||||
def __init__(self, language=None):
|
def __init__(self, language=None):
|
||||||
if language in ['auto', None]:
|
if language in ["auto", None]:
|
||||||
language = locale.getdefaultlocale()[0]#getlocale can't identify the system's language ((None, None))
|
language = locale.getdefaultlocale()[
|
||||||
|
0
|
||||||
|
] # getlocale can't identify the system's language ((None, None))
|
||||||
if not os.path.exists(f"./i18n/{language}.json"):
|
if not os.path.exists(f"./i18n/{language}.json"):
|
||||||
language = "en_US"
|
language = "en_US"
|
||||||
self.language = language
|
self.language = language
|
||||||
|
36
infer-web.py
36
infer-web.py
@ -119,7 +119,6 @@ for name in os.listdir(weight_uvr5_root):
|
|||||||
uvr5_names.append(name.replace(".pth", ""))
|
uvr5_names.append(name.replace(".pth", ""))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def vc_single(
|
def vc_single(
|
||||||
sid,
|
sid,
|
||||||
input_audio,
|
input_audio,
|
||||||
@ -888,23 +887,27 @@ def change_info_(ckpt_path):
|
|||||||
|
|
||||||
from infer_pack.models_onnx_moess import SynthesizerTrnMs256NSFsidM
|
from infer_pack.models_onnx_moess import SynthesizerTrnMs256NSFsidM
|
||||||
from infer_pack.models_onnx import SynthesizerTrnMs256NSFsidO
|
from infer_pack.models_onnx import SynthesizerTrnMs256NSFsidO
|
||||||
|
|
||||||
|
|
||||||
def export_onnx(ModelPath, ExportedPath, MoeVS=True):
|
def export_onnx(ModelPath, ExportedPath, MoeVS=True):
|
||||||
hidden_channels = 256 # hidden_channels,为768Vec做准备
|
hidden_channels = 256 # hidden_channels,为768Vec做准备
|
||||||
cpt = torch.load(ModelPath, map_location="cpu")
|
cpt = torch.load(ModelPath, map_location="cpu")
|
||||||
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
|
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
|
||||||
print(*cpt["config"])
|
print(*cpt["config"])
|
||||||
|
|
||||||
test_phone = torch.rand(1, 200, hidden_channels) # hidden unit
|
test_phone = torch.rand(1, 200, hidden_channels) # hidden unit
|
||||||
test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用)
|
test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用)
|
||||||
test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹)
|
test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹)
|
||||||
test_pitchf = torch.rand(1, 200) # nsf基频
|
test_pitchf = torch.rand(1, 200) # nsf基频
|
||||||
test_ds = torch.LongTensor([0]) # 说话人ID
|
test_ds = torch.LongTensor([0]) # 说话人ID
|
||||||
test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子)
|
test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子)
|
||||||
|
|
||||||
device = "cpu" #导出时设备(不影响使用模型)
|
device = "cpu" # 导出时设备(不影响使用模型)
|
||||||
|
|
||||||
if MoeVS:
|
if MoeVS:
|
||||||
net_g = SynthesizerTrnMs256NSFsidM(*cpt["config"], is_half=False) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16)
|
net_g = SynthesizerTrnMs256NSFsidM(
|
||||||
|
*cpt["config"], is_half=False
|
||||||
|
) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16)
|
||||||
net_g.load_state_dict(cpt["weight"], strict=False)
|
net_g.load_state_dict(cpt["weight"], strict=False)
|
||||||
input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
|
input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
|
||||||
output_names = [
|
output_names = [
|
||||||
@ -934,7 +937,9 @@ def export_onnx(ModelPath, ExportedPath, MoeVS=True):
|
|||||||
output_names=output_names,
|
output_names=output_names,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
net_g = SynthesizerTrnMs256NSFsidO(*cpt["config"], is_half=False) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16)
|
net_g = SynthesizerTrnMs256NSFsidO(
|
||||||
|
*cpt["config"], is_half=False
|
||||||
|
) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16)
|
||||||
net_g.load_state_dict(cpt["weight"], strict=False)
|
net_g.load_state_dict(cpt["weight"], strict=False)
|
||||||
input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds"]
|
input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds"]
|
||||||
output_names = [
|
output_names = [
|
||||||
@ -963,6 +968,7 @@ def export_onnx(ModelPath, ExportedPath, MoeVS=True):
|
|||||||
)
|
)
|
||||||
return "Finished"
|
return "Finished"
|
||||||
|
|
||||||
|
|
||||||
with gr.Blocks() as app:
|
with gr.Blocks() as app:
|
||||||
gr.Markdown(
|
gr.Markdown(
|
||||||
value=i18n(
|
value=i18n(
|
||||||
@ -1443,7 +1449,9 @@ with gr.Blocks() as app:
|
|||||||
with gr.Row():
|
with gr.Row():
|
||||||
ckpt_dir = gr.Textbox(label=i18n("RVC模型路径"), value="", interactive=True)
|
ckpt_dir = gr.Textbox(label=i18n("RVC模型路径"), value="", interactive=True)
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
onnx_dir = gr.Textbox(label=i18n("Onnx输出路径"), value="", interactive=True)
|
onnx_dir = gr.Textbox(
|
||||||
|
label=i18n("Onnx输出路径"), value="", interactive=True
|
||||||
|
)
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
moevs = gr.Checkbox(label=i18n("MoeVS模型"), value=True)
|
moevs = gr.Checkbox(label=i18n("MoeVS模型"), value=True)
|
||||||
infoOnnx = gr.Label(label="Null")
|
infoOnnx = gr.Label(label="Null")
|
||||||
|
@ -18,4 +18,4 @@ def load_audio(file, sr):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(f"Failed to load audio: {e}")
|
raise RuntimeError(f"Failed to load audio: {e}")
|
||||||
|
|
||||||
return np.frombuffer(out, np.float32).flatten()
|
return np.frombuffer(out, np.float32).flatten()
|
||||||
|
@ -99,8 +99,8 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
audio_norm = audio
|
audio_norm = audio
|
||||||
# audio_norm = audio / self.max_wav_value
|
# audio_norm = audio / self.max_wav_value
|
||||||
# audio_norm = audio / np.abs(audio).max()
|
# audio_norm = audio / np.abs(audio).max()
|
||||||
|
|
||||||
audio_norm = audio_norm.unsqueeze(0)
|
audio_norm = audio_norm.unsqueeze(0)
|
||||||
spec_filename = filename.replace(".wav", ".spec.pt")
|
spec_filename = filename.replace(".wav", ".spec.pt")
|
||||||
@ -291,8 +291,8 @@ class TextAudioLoader(torch.utils.data.Dataset):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
audio_norm = audio
|
audio_norm = audio
|
||||||
# audio_norm = audio / self.max_wav_value
|
# audio_norm = audio / self.max_wav_value
|
||||||
# audio_norm = audio / np.abs(audio).max()
|
# audio_norm = audio / np.abs(audio).max()
|
||||||
|
|
||||||
audio_norm = audio_norm.unsqueeze(0)
|
audio_norm = audio_norm.unsqueeze(0)
|
||||||
spec_filename = filename.replace(".wav", ".spec.pt")
|
spec_filename = filename.replace(".wav", ".spec.pt")
|
||||||
|
@ -61,7 +61,9 @@ class PreProcess:
|
|||||||
self.sr,
|
self.sr,
|
||||||
tmp_audio.astype(np.float32),
|
tmp_audio.astype(np.float32),
|
||||||
)
|
)
|
||||||
tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000)#, res_type="soxr_vhq"
|
tmp_audio = librosa.resample(
|
||||||
|
tmp_audio, orig_sr=self.sr, target_sr=16000
|
||||||
|
) # , res_type="soxr_vhq"
|
||||||
wavfile.write(
|
wavfile.write(
|
||||||
"%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
|
"%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
|
||||||
16000,
|
16000,
|
||||||
@ -72,7 +74,7 @@ class PreProcess:
|
|||||||
try:
|
try:
|
||||||
audio = load_audio(path, self.sr)
|
audio = load_audio(path, self.sr)
|
||||||
# zero phased digital filter cause pre-ringing noise...
|
# zero phased digital filter cause pre-ringing noise...
|
||||||
# audio = signal.filtfilt(self.bh, self.ah, audio)
|
# audio = signal.filtfilt(self.bh, self.ah, audio)
|
||||||
audio = signal.lfilter(self.bh, self.ah, audio)
|
audio = signal.lfilter(self.bh, self.ah, audio)
|
||||||
|
|
||||||
idx1 = 0
|
idx1 = 0
|
||||||
|
Loading…
Reference in New Issue
Block a user