Format code (#142)
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
This commit is contained in:
parent
376bd31c19
commit
b4c653142d
@ -2,27 +2,29 @@ from infer_pack.models_onnx_moess import SynthesizerTrnMs256NSFsidM
|
||||
from infer_pack.models_onnx import SynthesizerTrnMs256NSFsidO
|
||||
import torch
|
||||
|
||||
if __name__ == '__main__':
|
||||
MoeVS = True #模型是否为MoeVoiceStudio(原MoeSS)使用
|
||||
if __name__ == "__main__":
|
||||
MoeVS = True # 模型是否为MoeVoiceStudio(原MoeSS)使用
|
||||
|
||||
ModelPath = "Shiroha/shiroha.pth" #模型路径
|
||||
ExportedPath = "model.onnx" #输出路径
|
||||
hidden_channels = 256 # hidden_channels,为768Vec做准备
|
||||
cpt = torch.load(ModelPath, map_location="cpu")
|
||||
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
|
||||
ModelPath = "Shiroha/shiroha.pth" # 模型路径
|
||||
ExportedPath = "model.onnx" # 输出路径
|
||||
hidden_channels = 256 # hidden_channels,为768Vec做准备
|
||||
cpt = torch.load(ModelPath, map_location="cpu")
|
||||
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
|
||||
print(*cpt["config"])
|
||||
|
||||
test_phone = torch.rand(1, 200, hidden_channels) # hidden unit
|
||||
test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用)
|
||||
test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹)
|
||||
test_pitchf = torch.rand(1, 200) # nsf基频
|
||||
test_ds = torch.LongTensor([0]) # 说话人ID
|
||||
test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子)
|
||||
test_phone = torch.rand(1, 200, hidden_channels) # hidden unit
|
||||
test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用)
|
||||
test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹)
|
||||
test_pitchf = torch.rand(1, 200) # nsf基频
|
||||
test_ds = torch.LongTensor([0]) # 说话人ID
|
||||
test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子)
|
||||
|
||||
device = "cpu" #导出时设备(不影响使用模型)
|
||||
device = "cpu" # 导出时设备(不影响使用模型)
|
||||
|
||||
if MoeVS:
|
||||
net_g = SynthesizerTrnMs256NSFsidM(*cpt["config"], is_half=False) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16)
|
||||
net_g = SynthesizerTrnMs256NSFsidM(
|
||||
*cpt["config"], is_half=False
|
||||
) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16)
|
||||
net_g.load_state_dict(cpt["weight"], strict=False)
|
||||
input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
|
||||
output_names = [
|
||||
@ -52,7 +54,9 @@ if __name__ == '__main__':
|
||||
output_names=output_names,
|
||||
)
|
||||
else:
|
||||
net_g = SynthesizerTrnMs256NSFsidO(*cpt["config"], is_half=False) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16)
|
||||
net_g = SynthesizerTrnMs256NSFsidO(
|
||||
*cpt["config"], is_half=False
|
||||
) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16)
|
||||
net_g.load_state_dict(cpt["weight"], strict=False)
|
||||
input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds"]
|
||||
output_names = [
|
||||
@ -78,4 +82,4 @@ if __name__ == '__main__':
|
||||
verbose=False,
|
||||
input_names=input_names,
|
||||
output_names=output_names,
|
||||
)
|
||||
)
|
||||
|
@ -35,7 +35,7 @@ class FeatureInput(object):
|
||||
def compute_f0(self, path, f0_method):
|
||||
# default resample type of librosa.resample is "soxr_hq".
|
||||
# Quality: soxr_vhq > soxr_hq
|
||||
x, sr = librosa.load(path, self.fs)#, res_type='soxr_vhq'
|
||||
x, sr = librosa.load(path, self.fs) # , res_type='soxr_vhq'
|
||||
p_len = x.shape[0] // self.hop
|
||||
assert sr == self.fs
|
||||
if f0_method == "pm":
|
||||
|
17
gui.py
17
gui.py
@ -67,7 +67,7 @@ class RVC:
|
||||
print(e)
|
||||
|
||||
def get_f0(self, x, f0_up_key, inp_f0=None):
|
||||
x_pad=1
|
||||
x_pad = 1
|
||||
f0_min = 50
|
||||
f0_max = 1100
|
||||
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
|
||||
@ -137,7 +137,7 @@ class RVC:
|
||||
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
|
||||
torch.cuda.synchronize()
|
||||
print(feats.shape)
|
||||
if(self.if_f0==1):
|
||||
if self.if_f0 == 1:
|
||||
pitch, pitchf = self.get_f0(audio, self.f0_up_key)
|
||||
p_len = min(feats.shape[1], 13000, pitch.shape[0]) # 太大了爆显存
|
||||
else:
|
||||
@ -146,7 +146,7 @@ class RVC:
|
||||
torch.cuda.synchronize()
|
||||
# print(feats.shape,pitch.shape)
|
||||
feats = feats[:, :p_len, :]
|
||||
if(self.if_f0==1):
|
||||
if self.if_f0 == 1:
|
||||
pitch = pitch[:p_len]
|
||||
pitchf = pitchf[:p_len]
|
||||
pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
|
||||
@ -155,17 +155,15 @@ class RVC:
|
||||
ii = 0 # sid
|
||||
sid = torch.LongTensor([ii]).to(device)
|
||||
with torch.no_grad():
|
||||
if(self.if_f0==1):
|
||||
if self.if_f0 == 1:
|
||||
infered_audio = (
|
||||
self.net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
|
||||
.data.cpu()
|
||||
.float()
|
||||
)
|
||||
else:
|
||||
infered_audio = (
|
||||
self.net_g.infer(feats, p_len, sid)[0][0, 0]
|
||||
.data.cpu()
|
||||
.float()
|
||||
infered_audio = (
|
||||
self.net_g.infer(feats, p_len, sid)[0][0, 0].data.cpu().float()
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
return infered_audio
|
||||
@ -387,7 +385,7 @@ class GUI:
|
||||
self.config.pth_path,
|
||||
self.config.index_path,
|
||||
self.config.npy_path,
|
||||
self.config.index_rate
|
||||
self.config.index_rate,
|
||||
)
|
||||
self.input_wav: np.ndarray = np.zeros(
|
||||
self.extra_frame
|
||||
@ -511,7 +509,6 @@ class GUI:
|
||||
total_time = time.perf_counter() - start_time
|
||||
self.window["infer_time"].update(int(total_time * 1000))
|
||||
print("infer time:" + str(total_time))
|
||||
|
||||
|
||||
def get_devices(self, update: bool = True):
|
||||
"""获取设备列表"""
|
||||
|
6
i18n.py
6
i18n.py
@ -11,8 +11,10 @@ def load_language_list(language):
|
||||
|
||||
class I18nAuto:
|
||||
def __init__(self, language=None):
|
||||
if language in ['auto', None]:
|
||||
language = locale.getdefaultlocale()[0]#getlocale can't identify the system's language ((None, None))
|
||||
if language in ["auto", None]:
|
||||
language = locale.getdefaultlocale()[
|
||||
0
|
||||
] # getlocale can't identify the system's language ((None, None))
|
||||
if not os.path.exists(f"./i18n/{language}.json"):
|
||||
language = "en_US"
|
||||
self.language = language
|
||||
|
36
infer-web.py
36
infer-web.py
@ -119,7 +119,6 @@ for name in os.listdir(weight_uvr5_root):
|
||||
uvr5_names.append(name.replace(".pth", ""))
|
||||
|
||||
|
||||
|
||||
def vc_single(
|
||||
sid,
|
||||
input_audio,
|
||||
@ -888,23 +887,27 @@ def change_info_(ckpt_path):
|
||||
|
||||
from infer_pack.models_onnx_moess import SynthesizerTrnMs256NSFsidM
|
||||
from infer_pack.models_onnx import SynthesizerTrnMs256NSFsidO
|
||||
|
||||
|
||||
def export_onnx(ModelPath, ExportedPath, MoeVS=True):
|
||||
hidden_channels = 256 # hidden_channels,为768Vec做准备
|
||||
cpt = torch.load(ModelPath, map_location="cpu")
|
||||
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
|
||||
hidden_channels = 256 # hidden_channels,为768Vec做准备
|
||||
cpt = torch.load(ModelPath, map_location="cpu")
|
||||
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
|
||||
print(*cpt["config"])
|
||||
|
||||
test_phone = torch.rand(1, 200, hidden_channels) # hidden unit
|
||||
test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用)
|
||||
test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹)
|
||||
test_pitchf = torch.rand(1, 200) # nsf基频
|
||||
test_ds = torch.LongTensor([0]) # 说话人ID
|
||||
test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子)
|
||||
test_phone = torch.rand(1, 200, hidden_channels) # hidden unit
|
||||
test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用)
|
||||
test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹)
|
||||
test_pitchf = torch.rand(1, 200) # nsf基频
|
||||
test_ds = torch.LongTensor([0]) # 说话人ID
|
||||
test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子)
|
||||
|
||||
device = "cpu" #导出时设备(不影响使用模型)
|
||||
device = "cpu" # 导出时设备(不影响使用模型)
|
||||
|
||||
if MoeVS:
|
||||
net_g = SynthesizerTrnMs256NSFsidM(*cpt["config"], is_half=False) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16)
|
||||
net_g = SynthesizerTrnMs256NSFsidM(
|
||||
*cpt["config"], is_half=False
|
||||
) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16)
|
||||
net_g.load_state_dict(cpt["weight"], strict=False)
|
||||
input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
|
||||
output_names = [
|
||||
@ -934,7 +937,9 @@ def export_onnx(ModelPath, ExportedPath, MoeVS=True):
|
||||
output_names=output_names,
|
||||
)
|
||||
else:
|
||||
net_g = SynthesizerTrnMs256NSFsidO(*cpt["config"], is_half=False) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16)
|
||||
net_g = SynthesizerTrnMs256NSFsidO(
|
||||
*cpt["config"], is_half=False
|
||||
) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16)
|
||||
net_g.load_state_dict(cpt["weight"], strict=False)
|
||||
input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds"]
|
||||
output_names = [
|
||||
@ -963,6 +968,7 @@ def export_onnx(ModelPath, ExportedPath, MoeVS=True):
|
||||
)
|
||||
return "Finished"
|
||||
|
||||
|
||||
with gr.Blocks() as app:
|
||||
gr.Markdown(
|
||||
value=i18n(
|
||||
@ -1443,7 +1449,9 @@ with gr.Blocks() as app:
|
||||
with gr.Row():
|
||||
ckpt_dir = gr.Textbox(label=i18n("RVC模型路径"), value="", interactive=True)
|
||||
with gr.Row():
|
||||
onnx_dir = gr.Textbox(label=i18n("Onnx输出路径"), value="", interactive=True)
|
||||
onnx_dir = gr.Textbox(
|
||||
label=i18n("Onnx输出路径"), value="", interactive=True
|
||||
)
|
||||
with gr.Row():
|
||||
moevs = gr.Checkbox(label=i18n("MoeVS模型"), value=True)
|
||||
infoOnnx = gr.Label(label="Null")
|
||||
|
@ -18,4 +18,4 @@ def load_audio(file, sr):
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to load audio: {e}")
|
||||
|
||||
return np.frombuffer(out, np.float32).flatten()
|
||||
return np.frombuffer(out, np.float32).flatten()
|
||||
|
@ -99,8 +99,8 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
|
||||
)
|
||||
)
|
||||
audio_norm = audio
|
||||
# audio_norm = audio / self.max_wav_value
|
||||
# audio_norm = audio / np.abs(audio).max()
|
||||
# audio_norm = audio / self.max_wav_value
|
||||
# audio_norm = audio / np.abs(audio).max()
|
||||
|
||||
audio_norm = audio_norm.unsqueeze(0)
|
||||
spec_filename = filename.replace(".wav", ".spec.pt")
|
||||
@ -291,8 +291,8 @@ class TextAudioLoader(torch.utils.data.Dataset):
|
||||
)
|
||||
)
|
||||
audio_norm = audio
|
||||
# audio_norm = audio / self.max_wav_value
|
||||
# audio_norm = audio / np.abs(audio).max()
|
||||
# audio_norm = audio / self.max_wav_value
|
||||
# audio_norm = audio / np.abs(audio).max()
|
||||
|
||||
audio_norm = audio_norm.unsqueeze(0)
|
||||
spec_filename = filename.replace(".wav", ".spec.pt")
|
||||
|
@ -61,7 +61,9 @@ class PreProcess:
|
||||
self.sr,
|
||||
tmp_audio.astype(np.float32),
|
||||
)
|
||||
tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000)#, res_type="soxr_vhq"
|
||||
tmp_audio = librosa.resample(
|
||||
tmp_audio, orig_sr=self.sr, target_sr=16000
|
||||
) # , res_type="soxr_vhq"
|
||||
wavfile.write(
|
||||
"%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
|
||||
16000,
|
||||
@ -72,7 +74,7 @@ class PreProcess:
|
||||
try:
|
||||
audio = load_audio(path, self.sr)
|
||||
# zero phased digital filter cause pre-ringing noise...
|
||||
# audio = signal.filtfilt(self.bh, self.ah, audio)
|
||||
# audio = signal.filtfilt(self.bh, self.ah, audio)
|
||||
audio = signal.lfilter(self.bh, self.ah, audio)
|
||||
|
||||
idx1 = 0
|
||||
|
Loading…
Reference in New Issue
Block a user