1
0
mirror of synced 2024-11-27 17:00:54 +01:00

Format code (#142)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
This commit is contained in:
github-actions[bot] 2023-04-24 20:35:56 +08:00 committed by GitHub
parent 376bd31c19
commit b4c653142d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 64 additions and 51 deletions

View File

@ -2,27 +2,29 @@ from infer_pack.models_onnx_moess import SynthesizerTrnMs256NSFsidM
from infer_pack.models_onnx import SynthesizerTrnMs256NSFsidO from infer_pack.models_onnx import SynthesizerTrnMs256NSFsidO
import torch import torch
if __name__ == '__main__': if __name__ == "__main__":
MoeVS = True #模型是否为MoeVoiceStudio原MoeSS使用 MoeVS = True # 模型是否为MoeVoiceStudio原MoeSS使用
ModelPath = "Shiroha/shiroha.pth" #模型路径 ModelPath = "Shiroha/shiroha.pth" # 模型路径
ExportedPath = "model.onnx" #输出路径 ExportedPath = "model.onnx" # 输出路径
hidden_channels = 256 # hidden_channels为768Vec做准备 hidden_channels = 256 # hidden_channels为768Vec做准备
cpt = torch.load(ModelPath, map_location="cpu") cpt = torch.load(ModelPath, map_location="cpu")
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
print(*cpt["config"]) print(*cpt["config"])
test_phone = torch.rand(1, 200, hidden_channels) # hidden unit test_phone = torch.rand(1, 200, hidden_channels) # hidden unit
test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用) test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用)
test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹) test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹)
test_pitchf = torch.rand(1, 200) # nsf基频 test_pitchf = torch.rand(1, 200) # nsf基频
test_ds = torch.LongTensor([0]) # 说话人ID test_ds = torch.LongTensor([0]) # 说话人ID
test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子) test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子)
device = "cpu" #导出时设备(不影响使用模型) device = "cpu" # 导出时设备(不影响使用模型)
if MoeVS: if MoeVS:
net_g = SynthesizerTrnMs256NSFsidM(*cpt["config"], is_half=False) # fp32导出C++要支持fp16必须手动将内存重新排列所以暂时不用fp16 net_g = SynthesizerTrnMs256NSFsidM(
*cpt["config"], is_half=False
) # fp32导出C++要支持fp16必须手动将内存重新排列所以暂时不用fp16
net_g.load_state_dict(cpt["weight"], strict=False) net_g.load_state_dict(cpt["weight"], strict=False)
input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"] input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
output_names = [ output_names = [
@ -52,7 +54,9 @@ if __name__ == '__main__':
output_names=output_names, output_names=output_names,
) )
else: else:
net_g = SynthesizerTrnMs256NSFsidO(*cpt["config"], is_half=False) # fp32导出C++要支持fp16必须手动将内存重新排列所以暂时不用fp16 net_g = SynthesizerTrnMs256NSFsidO(
*cpt["config"], is_half=False
) # fp32导出C++要支持fp16必须手动将内存重新排列所以暂时不用fp16
net_g.load_state_dict(cpt["weight"], strict=False) net_g.load_state_dict(cpt["weight"], strict=False)
input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds"] input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds"]
output_names = [ output_names = [
@ -78,4 +82,4 @@ if __name__ == '__main__':
verbose=False, verbose=False,
input_names=input_names, input_names=input_names,
output_names=output_names, output_names=output_names,
) )

View File

@ -35,7 +35,7 @@ class FeatureInput(object):
def compute_f0(self, path, f0_method): def compute_f0(self, path, f0_method):
# default resample type of librosa.resample is "soxr_hq". # default resample type of librosa.resample is "soxr_hq".
# Quality: soxr_vhq > soxr_hq # Quality: soxr_vhq > soxr_hq
x, sr = librosa.load(path, self.fs)#, res_type='soxr_vhq' x, sr = librosa.load(path, self.fs) # , res_type='soxr_vhq'
p_len = x.shape[0] // self.hop p_len = x.shape[0] // self.hop
assert sr == self.fs assert sr == self.fs
if f0_method == "pm": if f0_method == "pm":

17
gui.py
View File

@ -67,7 +67,7 @@ class RVC:
print(e) print(e)
def get_f0(self, x, f0_up_key, inp_f0=None): def get_f0(self, x, f0_up_key, inp_f0=None):
x_pad=1 x_pad = 1
f0_min = 50 f0_min = 50
f0_max = 1100 f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700) f0_mel_min = 1127 * np.log(1 + f0_min / 700)
@ -137,7 +137,7 @@ class RVC:
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
torch.cuda.synchronize() torch.cuda.synchronize()
print(feats.shape) print(feats.shape)
if(self.if_f0==1): if self.if_f0 == 1:
pitch, pitchf = self.get_f0(audio, self.f0_up_key) pitch, pitchf = self.get_f0(audio, self.f0_up_key)
p_len = min(feats.shape[1], 13000, pitch.shape[0]) # 太大了爆显存 p_len = min(feats.shape[1], 13000, pitch.shape[0]) # 太大了爆显存
else: else:
@ -146,7 +146,7 @@ class RVC:
torch.cuda.synchronize() torch.cuda.synchronize()
# print(feats.shape,pitch.shape) # print(feats.shape,pitch.shape)
feats = feats[:, :p_len, :] feats = feats[:, :p_len, :]
if(self.if_f0==1): if self.if_f0 == 1:
pitch = pitch[:p_len] pitch = pitch[:p_len]
pitchf = pitchf[:p_len] pitchf = pitchf[:p_len]
pitch = torch.LongTensor(pitch).unsqueeze(0).to(device) pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
@ -155,17 +155,15 @@ class RVC:
ii = 0 # sid ii = 0 # sid
sid = torch.LongTensor([ii]).to(device) sid = torch.LongTensor([ii]).to(device)
with torch.no_grad(): with torch.no_grad():
if(self.if_f0==1): if self.if_f0 == 1:
infered_audio = ( infered_audio = (
self.net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] self.net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
.data.cpu() .data.cpu()
.float() .float()
) )
else: else:
infered_audio = ( infered_audio = (
self.net_g.infer(feats, p_len, sid)[0][0, 0] self.net_g.infer(feats, p_len, sid)[0][0, 0].data.cpu().float()
.data.cpu()
.float()
) )
torch.cuda.synchronize() torch.cuda.synchronize()
return infered_audio return infered_audio
@ -387,7 +385,7 @@ class GUI:
self.config.pth_path, self.config.pth_path,
self.config.index_path, self.config.index_path,
self.config.npy_path, self.config.npy_path,
self.config.index_rate self.config.index_rate,
) )
self.input_wav: np.ndarray = np.zeros( self.input_wav: np.ndarray = np.zeros(
self.extra_frame self.extra_frame
@ -511,7 +509,6 @@ class GUI:
total_time = time.perf_counter() - start_time total_time = time.perf_counter() - start_time
self.window["infer_time"].update(int(total_time * 1000)) self.window["infer_time"].update(int(total_time * 1000))
print("infer time:" + str(total_time)) print("infer time:" + str(total_time))
def get_devices(self, update: bool = True): def get_devices(self, update: bool = True):
"""获取设备列表""" """获取设备列表"""

View File

@ -11,8 +11,10 @@ def load_language_list(language):
class I18nAuto: class I18nAuto:
def __init__(self, language=None): def __init__(self, language=None):
if language in ['auto', None]: if language in ["auto", None]:
language = locale.getdefaultlocale()[0]#getlocale can't identify the system's language ((None, None)) language = locale.getdefaultlocale()[
0
] # getlocale can't identify the system's language ((None, None))
if not os.path.exists(f"./i18n/{language}.json"): if not os.path.exists(f"./i18n/{language}.json"):
language = "en_US" language = "en_US"
self.language = language self.language = language

View File

@ -119,7 +119,6 @@ for name in os.listdir(weight_uvr5_root):
uvr5_names.append(name.replace(".pth", "")) uvr5_names.append(name.replace(".pth", ""))
def vc_single( def vc_single(
sid, sid,
input_audio, input_audio,
@ -888,23 +887,27 @@ def change_info_(ckpt_path):
from infer_pack.models_onnx_moess import SynthesizerTrnMs256NSFsidM from infer_pack.models_onnx_moess import SynthesizerTrnMs256NSFsidM
from infer_pack.models_onnx import SynthesizerTrnMs256NSFsidO from infer_pack.models_onnx import SynthesizerTrnMs256NSFsidO
def export_onnx(ModelPath, ExportedPath, MoeVS=True): def export_onnx(ModelPath, ExportedPath, MoeVS=True):
hidden_channels = 256 # hidden_channels为768Vec做准备 hidden_channels = 256 # hidden_channels为768Vec做准备
cpt = torch.load(ModelPath, map_location="cpu") cpt = torch.load(ModelPath, map_location="cpu")
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
print(*cpt["config"]) print(*cpt["config"])
test_phone = torch.rand(1, 200, hidden_channels) # hidden unit test_phone = torch.rand(1, 200, hidden_channels) # hidden unit
test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用) test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用)
test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹) test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹)
test_pitchf = torch.rand(1, 200) # nsf基频 test_pitchf = torch.rand(1, 200) # nsf基频
test_ds = torch.LongTensor([0]) # 说话人ID test_ds = torch.LongTensor([0]) # 说话人ID
test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子) test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子)
device = "cpu" #导出时设备(不影响使用模型) device = "cpu" # 导出时设备(不影响使用模型)
if MoeVS: if MoeVS:
net_g = SynthesizerTrnMs256NSFsidM(*cpt["config"], is_half=False) # fp32导出C++要支持fp16必须手动将内存重新排列所以暂时不用fp16 net_g = SynthesizerTrnMs256NSFsidM(
*cpt["config"], is_half=False
) # fp32导出C++要支持fp16必须手动将内存重新排列所以暂时不用fp16
net_g.load_state_dict(cpt["weight"], strict=False) net_g.load_state_dict(cpt["weight"], strict=False)
input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"] input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
output_names = [ output_names = [
@ -934,7 +937,9 @@ def export_onnx(ModelPath, ExportedPath, MoeVS=True):
output_names=output_names, output_names=output_names,
) )
else: else:
net_g = SynthesizerTrnMs256NSFsidO(*cpt["config"], is_half=False) # fp32导出C++要支持fp16必须手动将内存重新排列所以暂时不用fp16 net_g = SynthesizerTrnMs256NSFsidO(
*cpt["config"], is_half=False
) # fp32导出C++要支持fp16必须手动将内存重新排列所以暂时不用fp16
net_g.load_state_dict(cpt["weight"], strict=False) net_g.load_state_dict(cpt["weight"], strict=False)
input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds"] input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds"]
output_names = [ output_names = [
@ -963,6 +968,7 @@ def export_onnx(ModelPath, ExportedPath, MoeVS=True):
) )
return "Finished" return "Finished"
with gr.Blocks() as app: with gr.Blocks() as app:
gr.Markdown( gr.Markdown(
value=i18n( value=i18n(
@ -1443,7 +1449,9 @@ with gr.Blocks() as app:
with gr.Row(): with gr.Row():
ckpt_dir = gr.Textbox(label=i18n("RVC模型路径"), value="", interactive=True) ckpt_dir = gr.Textbox(label=i18n("RVC模型路径"), value="", interactive=True)
with gr.Row(): with gr.Row():
onnx_dir = gr.Textbox(label=i18n("Onnx输出路径"), value="", interactive=True) onnx_dir = gr.Textbox(
label=i18n("Onnx输出路径"), value="", interactive=True
)
with gr.Row(): with gr.Row():
moevs = gr.Checkbox(label=i18n("MoeVS模型"), value=True) moevs = gr.Checkbox(label=i18n("MoeVS模型"), value=True)
infoOnnx = gr.Label(label="Null") infoOnnx = gr.Label(label="Null")

View File

@ -18,4 +18,4 @@ def load_audio(file, sr):
except Exception as e: except Exception as e:
raise RuntimeError(f"Failed to load audio: {e}") raise RuntimeError(f"Failed to load audio: {e}")
return np.frombuffer(out, np.float32).flatten() return np.frombuffer(out, np.float32).flatten()

View File

@ -99,8 +99,8 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
) )
) )
audio_norm = audio audio_norm = audio
# audio_norm = audio / self.max_wav_value # audio_norm = audio / self.max_wav_value
# audio_norm = audio / np.abs(audio).max() # audio_norm = audio / np.abs(audio).max()
audio_norm = audio_norm.unsqueeze(0) audio_norm = audio_norm.unsqueeze(0)
spec_filename = filename.replace(".wav", ".spec.pt") spec_filename = filename.replace(".wav", ".spec.pt")
@ -291,8 +291,8 @@ class TextAudioLoader(torch.utils.data.Dataset):
) )
) )
audio_norm = audio audio_norm = audio
# audio_norm = audio / self.max_wav_value # audio_norm = audio / self.max_wav_value
# audio_norm = audio / np.abs(audio).max() # audio_norm = audio / np.abs(audio).max()
audio_norm = audio_norm.unsqueeze(0) audio_norm = audio_norm.unsqueeze(0)
spec_filename = filename.replace(".wav", ".spec.pt") spec_filename = filename.replace(".wav", ".spec.pt")

View File

@ -61,7 +61,9 @@ class PreProcess:
self.sr, self.sr,
tmp_audio.astype(np.float32), tmp_audio.astype(np.float32),
) )
tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000)#, res_type="soxr_vhq" tmp_audio = librosa.resample(
tmp_audio, orig_sr=self.sr, target_sr=16000
) # , res_type="soxr_vhq"
wavfile.write( wavfile.write(
"%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
16000, 16000,
@ -72,7 +74,7 @@ class PreProcess:
try: try:
audio = load_audio(path, self.sr) audio = load_audio(path, self.sr)
# zero phased digital filter cause pre-ringing noise... # zero phased digital filter cause pre-ringing noise...
# audio = signal.filtfilt(self.bh, self.ah, audio) # audio = signal.filtfilt(self.bh, self.ah, audio)
audio = signal.lfilter(self.bh, self.ah, audio) audio = signal.lfilter(self.bh, self.ah, audio)
idx1 = 0 idx1 = 0