diff --git a/extract_f0_print.py b/extract_f0_print.py
index d2fef0f..fc689f7 100644
--- a/extract_f0_print.py
+++ b/extract_f0_print.py
@@ -1,5 +1,7 @@
 import os, traceback, sys, parselmouth
-import librosa
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+from my_utils import load_audio
 import pyworld
 from scipy.io import wavfile
 import numpy as np, logging
@@ -33,17 +35,14 @@ class FeatureInput(object):
         self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
 
     def compute_f0(self, path, f0_method):
-        # default resample type of librosa.resample is "soxr_hq".
-        # Quality: soxr_vhq > soxr_hq
-        x, sr = librosa.load(path, self.fs)  # , res_type='soxr_vhq'
+        x=load_audio(path,self.fs)
         p_len = x.shape[0] // self.hop
-        assert sr == self.fs
         if f0_method == "pm":
             time_step = 160 / 16000 * 1000
             f0_min = 50
             f0_max = 1100
             f0 = (
-                parselmouth.Sound(x, sr)
+                parselmouth.Sound(x, self.fs)
                 .to_pitch_ac(
                     time_step=time_step / 1000,
                     voicing_threshold=0.6,
@@ -60,19 +59,19 @@ class FeatureInput(object):
         elif f0_method == "harvest":
             f0, t = pyworld.harvest(
                 x.astype(np.double),
-                fs=sr,
+                fs=self.fs,
                 f0_ceil=self.f0_max,
                 f0_floor=self.f0_min,
-                frame_period=1000 * self.hop / sr,
+                frame_period=1000 * self.hop / self.fs,
             )
             f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
         elif f0_method == "dio":
             f0, t = pyworld.dio(
                 x.astype(np.double),
-                fs=sr,
+                fs=self.fs,
                 f0_ceil=self.f0_max,
                 f0_floor=self.f0_min,
-                frame_period=1000 * self.hop / sr,
+                frame_period=1000 * self.hop / self.fs,
             )
             f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
         return f0
diff --git a/extract_feature_print.py b/extract_feature_print.py
index f1c4f9a..37cc4e2 100644
--- a/extract_feature_print.py
+++ b/extract_feature_print.py
@@ -9,7 +9,7 @@ else:
     i_gpu = sys.argv[4]
     exp_dir = sys.argv[5]
     os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu)
-
+version = sys.argv[6]
 import torch
 import torch.nn.functional as F
 import soundfile as sf
@@ -18,12 +18,9 @@ from fairseq import checkpoint_utils
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-if torch.cuda.is_available():
-    device = "cuda"
-elif torch.backends.mps.is_available():
-    device = "mps"
-else:
-    device = "cpu"
+if torch.cuda.is_available():device="cuda"
+elif torch.backends.mps.is_available():device="mps"
+else:device="cpu"
 
 f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
 
@@ -39,7 +36,7 @@ model_path = "hubert_base.pt"
 
 printt(exp_dir)
 wavPath = "%s/1_16k_wavs" % exp_dir
-outPath = "%s/3_feature256" % exp_dir
+outPath = "%s/3_feature256" % exp_dir if version=="v1"else "%s/3_feature768" % exp_dir
 os.makedirs(outPath, exist_ok=True)
 
 
@@ -67,7 +64,7 @@ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
 model = models[0]
 model = model.to(device)
 printt("move model to %s" % device)
-if device not in ["mps", "cpu"]:
+if device not in ["mps","cpu"]:
     model = model.half()
 model.eval()
 
@@ -93,11 +90,11 @@ else:
             if device not in ["mps", "cpu"]
             else feats.to(device),
             "padding_mask": padding_mask.to(device),
-            "output_layer": 9,  # layer 9
+            "output_layer": 9 if version=="v1"else 12,  # layer 9
         }
         with torch.no_grad():
             logits = model.extract_features(**inputs)
-            feats = model.final_proj(logits[0])
+            feats = model.final_proj(logits[0])if version=="v1"else logits[0]
 
         feats = feats.squeeze(0).float().cpu().numpy()
         if np.isnan(feats).sum() == 0:
diff --git a/infer-web.py b/infer-web.py
index 089eab4..200f2c1 100644
--- a/infer-web.py
+++ b/infer-web.py
@@ -1,9 +1,9 @@
+import torch, os, traceback, sys, warnings, shutil, numpy as np
+os.environ["no_proxy"]="localhost, 127.0.0.1, ::1"
 from multiprocessing import cpu_count
 import threading
 from time import sleep
 from subprocess import Popen
-from time import sleep
-import torch, os, traceback, sys, warnings, shutil, numpy as np
 import faiss
 from random import shuffle
 
@@ -11,8 +11,8 @@ now_dir = os.getcwd()
 sys.path.append(now_dir)
 tmp = os.path.join(now_dir, "TEMP")
 shutil.rmtree(tmp, ignore_errors=True)
-shutil.rmtree("%s/runtime/Lib/site-packages/infer_pack" % (now_dir), ignore_errors=True)
-shutil.rmtree("%s/runtime/Lib/site-packages/uvr5_pack" % (now_dir), ignore_errors=True)
+shutil.rmtree("%s/runtime/Lib/site-packages/infer_pack"%(now_dir), ignore_errors=True)
+shutil.rmtree("%s/runtime/Lib/site-packages/uvr5_pack"%(now_dir) , ignore_errors=True)
 os.makedirs(tmp, exist_ok=True)
 os.makedirs(os.path.join(now_dir, "logs"), exist_ok=True)
 os.makedirs(os.path.join(now_dir, "weights"), exist_ok=True)
@@ -70,7 +70,7 @@ else:
     gpu_info = i18n("很遗憾您这没有能用的显卡来支持您训练")
     default_batch_size = 1
 gpus = "-".join([i[0] for i in gpu_infos])
-from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
+from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono,SynthesizerTrnMs768NSFsid, SynthesizerTrnMs768NSFsid_nono
 from scipy.io import wavfile
 from fairseq import checkpoint_utils
 import gradio as gr
@@ -121,11 +121,11 @@ names = []
 for name in os.listdir(weight_root):
     if name.endswith(".pth"):
         names.append(name)
-index_paths = []
+index_paths=[]
 for root, dirs, files in os.walk(index_root, topdown=False):
     for name in files:
         if name.endswith(".index") and "trained" not in name:
-            index_paths.append("%s/%s" % (root, name))
+            index_paths.append("%s/%s"%(root,name))
 uvr5_names = []
 for name in os.listdir(weight_uvr5_root):
     if name.endswith(".pth"):
@@ -144,29 +144,29 @@ def vc_single(
     index_rate,
     filter_radius,
     resample_sr,
+    rms_mix_rate
 ):  # spk_item, input_audio0, vc_transform0,f0_file,f0method0
-    global tgt_sr, net_g, vc, hubert_model
+    global tgt_sr, net_g, vc, hubert_model,version
     if input_audio_path is None:
         return "You need to upload an audio", None
     f0_up_key = int(f0_up_key)
     try:
         audio = load_audio(input_audio_path, 16000)
+        audio_max=np.abs(audio).max()/0.95
+        if(audio_max>1):
+            audio/=audio_max
         times = [0, 0, 0]
         if hubert_model == None:
             load_hubert()
         if_f0 = cpt.get("f0", 1)
         file_index = (
-            (
-                file_index.strip(" ")
-                .strip('"')
-                .strip("\n")
-                .strip('"')
-                .strip(" ")
-                .replace("trained", "added")
-            )
-            if file_index != ""
-            else file_index2
-        )  # 防止小白写错,自动帮他替换掉
+            file_index.strip(" ")
+            .strip('"')
+            .strip("\n")
+            .strip('"')
+            .strip(" ")
+            .replace("trained", "added")
+        )if file_index!=""else file_index2  # 防止小白写错,自动帮他替换掉
         # file_big_npy = (
         #     file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
         # )
@@ -186,21 +186,14 @@ def vc_single(
             filter_radius,
             tgt_sr,
             resample_sr,
+            rms_mix_rate,
+            version,
             f0_file=f0_file,
         )
-        if resample_sr >= 16000 and tgt_sr != resample_sr:
-            tgt_sr = resample_sr
-        index_info = (
-            "Using index:%s." % file_index
-            if os.path.exists(file_index)
-            else "Index not used."
-        )
-        return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
-            index_info,
-            times[0],
-            times[1],
-            times[2],
-        ), (tgt_sr, audio_opt)
+        if(resample_sr>=16000 and tgt_sr!=resample_sr):
+            tgt_sr=resample_sr
+        index_info="Using index:%s."%file_index if os.path.exists(file_index)else"Index not used."
+        return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss"%(index_info,times[0],times[1],times[2]), (tgt_sr, audio_opt)
     except:
         info = traceback.format_exc()
         print(info)
@@ -220,6 +213,7 @@ def vc_multi(
     index_rate,
     filter_radius,
     resample_sr,
+    rms_mix_rate
 ):
     try:
         dir_path = (
@@ -249,8 +243,9 @@ def vc_multi(
                 index_rate,
                 filter_radius,
                 resample_sr,
+                rms_mix_rate
             )
-            if "Success" in info:
+            if "Success"in info:
                 try:
                     tgt_sr, audio_opt = opt
                     wavfile.write(
@@ -335,8 +330,8 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg):
 
 
 # 一个选项卡全局只能有一个音色
 def get_vc(sid):
-    global n_spk, tgt_sr, net_g, vc, cpt
-    if sid == "" or sid == []:
+    global n_spk, tgt_sr, net_g, vc, cpt,version
+    if sid == ""or sid==[]:
         global hubert_model
         if hubert_model != None:  # 考虑到轮询, 需要加个判断看是否 sid 是由有模型切换到无模型的
             print("clean_empty_cache")
@@ -346,12 +341,17 @@ def get_vc(sid):
                 torch.cuda.empty_cache()
             ###楼下不这么折腾清理不干净
             if_f0 = cpt.get("f0", 1)
-            if if_f0 == 1:
-                net_g = SynthesizerTrnMs256NSFsid(
-                    *cpt["config"], is_half=config.is_half
-                )
-            else:
-                net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+            version = cpt.get("version", "v1")
+            if (version == "v1"):
+                if if_f0 == 1:
+                    net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
+                else:
+                    net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+            elif (version == "v2"):
+                if if_f0 == 1:
+                    net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
+                else:
+                    net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
             del net_g, cpt
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
@@ -363,12 +363,19 @@ def get_vc(sid):
     tgt_sr = cpt["config"][-1]
     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
     if_f0 = cpt.get("f0", 1)
-    if if_f0 == 1:
-        net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
-    else:
-        net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+    version = cpt.get("version", "v1")
+    if(version=="v1"):
+        if if_f0 == 1:
+            net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
+        else:
+            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+    elif(version=="v2"):
+        if if_f0 == 1:
+            net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
+        else:
+            net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
     del net_g.enc_q
-    print(net_g.load_state_dict(cpt["weight"], strict=False))  # 不加这一行清不干净, 真奇葩
+    print(net_g.load_state_dict(cpt["weight"], strict=False))
     net_g.eval().to(config.device)
     if config.is_half:
         net_g = net_g.half()
@@ -384,37 +391,17 @@ def change_choices():
     for name in os.listdir(weight_root):
         if name.endswith(".pth"):
             names.append(name)
-    index_paths = []
+    index_paths=[]
     for root, dirs, files in os.walk(index_root, topdown=False):
         for name in files:
             if name.endswith(".index") and "trained" not in name:
                 index_paths.append("%s/%s" % (root, name))
-    return {"choices": sorted(names), "__type__": "update"}, {
-        "choices": sorted(index_paths),
-        "__type__": "update",
-    }
+    return {"choices": sorted(names), "__type__": "update"},{"choices": sorted(index_paths), "__type__": "update"}
 
 
 def clean():
     return {"value": "", "__type__": "update"}
 
-
-def change_f0(if_f0_3, sr2):  # np7, f0method8,pretrained_G14,pretrained_D15
-    if if_f0_3:
-        return (
-            {"visible": True, "__type__": "update"},
-            {"visible": True, "__type__": "update"},
-            "pretrained/f0G%s.pth" % sr2,
-            "pretrained/f0D%s.pth" % sr2,
-        )
-    return (
-        {"visible": False, "__type__": "update"},
-        {"visible": False, "__type__": "update"},
-        "pretrained/G%s.pth" % sr2,
-        "pretrained/D%s.pth" % sr2,
-    )
-
-
 sr_dict = {
     "32k": 32000,
     "40k": 40000,
@@ -481,7 +468,7 @@ def preprocess_dataset(trainset_dir, exp_dir, sr, n_p):
 
 
 # but2.click(extract_f0,[gpus6,np7,f0method8,if_f0_3,trainset_dir4],[info2])
-def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir):
+def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir,version19):
     gpus = gpus.split("-")
     os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True)
     f = open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "w")
@@ -527,13 +514,14 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir):
     leng = len(gpus)
     ps = []
     for idx, n_g in enumerate(gpus):
-        cmd = config.python_cmd + " extract_feature_print.py %s %s %s %s %s/logs/%s" % (
+        cmd = config.python_cmd + " extract_feature_print.py %s %s %s %s %s/logs/%s %s" % (
             config.device,
             leng,
            idx,
            n_g,
            now_dir,
            exp_dir,
+            version19,
        )
        print(cmd)
        p = Popen(
@@ -561,12 +549,33 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir):
         yield log
 
 
-def change_sr2(sr2, if_f0_3):
-    if if_f0_3:
-        return "pretrained/f0G%s.pth" % sr2, "pretrained/f0D%s.pth" % sr2
-    else:
-        return "pretrained/G%s.pth" % sr2, "pretrained/D%s.pth" % sr2
+def change_sr2(sr2, if_f0_3,version19):
+    vis_v=True if sr2=="40k"else False
+    if(sr2!="40k"):version19="v1"
+    path_str=""if version19=="v1"else "_v2"
+    version_state={"visible": vis_v, "__type__": "update"}
+    if(vis_v==False):version_state["value"]="v1"
+    f0_str="f0"if if_f0_3 else""
+    return "pretrained%s/%sG%s.pth" % (path_str,f0_str,sr2), "pretrained%s/%sD%s.pth" % (path_str,f0_str,sr2),version_state
+def change_version19(sr2,if_f0_3,version19):
+    path_str=""if version19=="v1"else "_v2"
+    f0_str="f0"if if_f0_3 else""
+    return "pretrained%s/%sG%s.pth" % (path_str,f0_str,sr2), "pretrained%s/%sD%s.pth" % (path_str,f0_str,sr2)
+
+def change_f0(if_f0_3, sr2,version19):  # f0method8,pretrained_G14,pretrained_D15
+    path_str=""if version19=="v1"else "_v2"
+    if if_f0_3:
+        return (
+            {"visible": True, "__type__": "update"},
+            "pretrained%s/f0G%s.pth" % (path_str,sr2),
+            "pretrained%s/f0D%s.pth" % (path_str,sr2),
+        )
+    return (
+        {"visible": False, "__type__": "update"},
+        "pretrained%s/G%s.pth" % (path_str,sr2),
+        "pretrained%s/D%s.pth" % (path_str,sr2),
+    )
 
 
 # but3.click(click_train,[exp_dir1,sr2,if_f0_3,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16])
 def click_train(
@@ -582,24 +591,26 @@ def click_train(
     pretrained_D15,
     gpus16,
     if_cache_gpu17,
+    if_save_every_weights18,
+    version19,
 ):
     # 生成filelist
     exp_dir = "%s/logs/%s" % (now_dir, exp_dir1)
     os.makedirs(exp_dir, exist_ok=True)
     gt_wavs_dir = "%s/0_gt_wavs" % (exp_dir)
-    co256_dir = "%s/3_feature256" % (exp_dir)
+    feature_dir = "%s/3_feature256" % (exp_dir)if version19=="v1"else "%s/3_feature768" % (exp_dir)
     if if_f0_3:
         f0_dir = "%s/2a_f0" % (exp_dir)
         f0nsf_dir = "%s/2b-f0nsf" % (exp_dir)
         names = (
             set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)])
-            & set([name.split(".")[0] for name in os.listdir(co256_dir)])
+            & set([name.split(".")[0] for name in os.listdir(feature_dir)])
             & set([name.split(".")[0] for name in os.listdir(f0_dir)])
             & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)])
         )
     else:
         names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set(
-            [name.split(".")[0] for name in os.listdir(co256_dir)]
+            [name.split(".")[0] for name in os.listdir(feature_dir)]
         )
     opt = []
     for name in names:
@@ -609,7 +620,7 @@ def click_train(
                 % (
                     gt_wavs_dir.replace("\\", "\\\\"),
                     name,
-                    co256_dir.replace("\\", "\\\\"),
+                    feature_dir.replace("\\", "\\\\"),
                     name,
                     f0_dir.replace("\\", "\\\\"),
                     name,
@@ -624,22 +635,23 @@ def click_train(
                 % (
                     gt_wavs_dir.replace("\\", "\\\\"),
                     name,
-                    co256_dir.replace("\\", "\\\\"),
+                    feature_dir.replace("\\", "\\\\"),
                     name,
                     spk_id5,
                 )
             )
+    fea_dim = 256 if version19 == "v1"else 768
     if if_f0_3:
         for _ in range(2):
             opt.append(
-                "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature256/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s"
-                % (now_dir, sr2, now_dir, now_dir, now_dir, spk_id5)
+                "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s"
+                % (now_dir, sr2, now_dir,fea_dim, now_dir, now_dir, spk_id5)
             )
     else:
         for _ in range(2):
             opt.append(
-                "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature256/mute.npy|%s"
-                % (now_dir, sr2, now_dir, spk_id5)
+                "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s"
+                % (now_dir, sr2, now_dir,fea_dim, spk_id5)
             )
     shuffle(opt)
     with open("%s/filelist.txt" % exp_dir, "w") as f:
@@ -651,7 +663,7 @@ def click_train(
     if gpus16:
         cmd = (
             config.python_cmd
-            + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s -pg %s -pd %s -l %s -c %s"
+            + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s -pg %s -pd %s -l %s -c %s -sw %s -v %s"
             % (
                 exp_dir1,
                 sr2,
@@ -664,12 +676,14 @@ def click_train(
                 pretrained_D15,
                 1 if if_save_latest13 == i18n("是") else 0,
                 1 if if_cache_gpu17 == i18n("是") else 0,
+                1 if if_save_every_weights18 == i18n("是") else 0,
+                version19,
             )
         )
     else:
         cmd = (
             config.python_cmd
-            + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -te %s -se %s -pg %s -pd %s -l %s -c %s"
+            + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -te %s -se %s -pg %s -pd %s -l %s -c %s -sw %s -v %s"
             % (
                 exp_dir1,
                 sr2,
@@ -681,6 +695,8 @@ def click_train(
                 pretrained_D15,
                 1 if if_save_latest13 == i18n("是") else 0,
                 1 if if_cache_gpu17 == i18n("是") else 0,
+                1 if if_save_every_weights18 == i18n("是") else 0,
+                version19,
             )
         )
     print(cmd)
@@ -690,10 +706,10 @@ def click_train(
 
 
 # but4.click(train_index, [exp_dir1], info3)
-def train_index(exp_dir1):
+def train_index(exp_dir1,version19):
     exp_dir = "%s/logs/%s" % (now_dir, exp_dir1)
     os.makedirs(exp_dir, exist_ok=True)
-    feature_dir = "%s/3_feature256" % (exp_dir)
+    feature_dir = "%s/3_feature256" % (exp_dir)if version19=="v1"else "%s/3_feature768" % (exp_dir)
     if os.path.exists(feature_dir) == False:
         return "请先进行特征提取!"
     listdir_res = list(os.listdir(feature_dir))
@@ -713,8 +729,8 @@ def train_index(exp_dir1):
     infos = []
     infos.append("%s,%s" % (big_npy.shape, n_ivf))
     yield "\n".join(infos)
-    index = faiss.index_factory(256, "IVF%s,Flat" % n_ivf)
-    # index = faiss.index_factory(256, "IVF%s,PQ128x4fs,RFlat"%n_ivf)
+    index = faiss.index_factory(256if version19=="v1"else 768, "IVF%s,Flat" % n_ivf)
+    # index = faiss.index_factory(256if version19=="v1"else 768, "IVF%s,PQ128x4fs,RFlat"%n_ivf)
     infos.append("training")
     yield "\n".join(infos)
     index_ivf = faiss.extract_index_ivf(index)  #
@@ -722,9 +738,9 @@ def train_index(exp_dir1):
     index.train(big_npy)
     faiss.write_index(
         index,
-        "%s/trained_IVF%s_Flat_nprobe_%s.index" % (exp_dir, n_ivf, index_ivf.nprobe),
+        "%s/trained_IVF%s_Flat_nprobe_%s_%s.index" % (exp_dir, n_ivf, index_ivf.nprobe,version19),
     )
-    # faiss.write_index(index, '%s/trained_IVF%s_Flat_FastScan.index'%(exp_dir,n_ivf))
+    # faiss.write_index(index, '%s/trained_IVF%s_Flat_FastScan_%s.index'%(exp_dir,n_ivf,version19))
     infos.append("adding")
     yield "\n".join(infos)
     batch_size_add = 8192
@@ -734,9 +750,9 @@ def train_index(exp_dir1):
         index,
         "%s/added_IVF%s_Flat_nprobe_%s.index" % (exp_dir, n_ivf, index_ivf.nprobe),
     )
-    infos.append("成功构建索引,added_IVF%s_Flat_nprobe_%s.index" % (n_ivf, index_ivf.nprobe))
-    # faiss.write_index(index, '%s/added_IVF%s_Flat_FastScan.index'%(exp_dir,n_ivf))
-    # infos.append("成功构建索引,added_IVF%s_Flat_FastScan.index"%(n_ivf))
+    infos.append("成功构建索引,added_IVF%s_Flat_nprobe_%s_%s.index" % (n_ivf, index_ivf.nprobe,version19))
+    # faiss.write_index(index, '%s/added_IVF%s_Flat_FastScan_%s.index'%(exp_dir,n_ivf,version19))
+    # infos.append("成功构建索引,added_IVF%s_Flat_FastScan_%s.index"%(n_ivf,version19))
     yield "\n".join(infos)
 
 
@@ -757,6 +773,8 @@ def train1key(
     pretrained_D15,
     gpus16,
     if_cache_gpu17,
+    if_save_every_weights18,
+    version19,
 ):
     infos = []
 
@@ -768,7 +786,7 @@ def train1key(
     preprocess_log_path = "%s/preprocess.log" % model_log_dir
     extract_f0_feature_log_path = "%s/extract_f0_feature.log" % model_log_dir
     gt_wavs_dir = "%s/0_gt_wavs" % model_log_dir
-    feature256_dir = "%s/3_feature256" % model_log_dir
+    feature_dir = "%s/3_feature256" % model_log_dir if version19=="v1"else "%s/3_feature768" % model_log_dir
     os.makedirs(model_log_dir, exist_ok=True)
 
     #########step1:处理数据
@@ -807,12 +825,12 @@ def train1key(
         leng = len(gpus)
         ps = []
         for idx, n_g in enumerate(gpus):
-            cmd = config.python_cmd + " extract_feature_print.py %s %s %s %s %s" % (
+            cmd = config.python_cmd + " extract_feature_print.py %s %s %s %s %s %s" % (
                 config.device,
                 leng,
                 idx,
                 n_g,
-                model_log_dir,
+                model_log_dir,version19,
             )
             yield get_info_str(cmd)
             p = Popen(
@@ -831,13 +849,13 @@ def train1key(
         f0nsf_dir = "%s/2b-f0nsf" % model_log_dir
         names = (
             set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)])
-            & set([name.split(".")[0] for name in os.listdir(feature256_dir)])
+            & set([name.split(".")[0] for name in os.listdir(feature_dir)])
            & set([name.split(".")[0] for name in os.listdir(f0_dir)])
            & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)])
        )
     else:
         names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set(
-            [name.split(".")[0] for name in os.listdir(feature256_dir)]
+            [name.split(".")[0] for name in os.listdir(feature_dir)]
         )
     opt = []
     for name in names:
@@ -847,7 +865,7 @@ def train1key(
                 % (
                     gt_wavs_dir.replace("\\", "\\\\"),
                     name,
-                    feature256_dir.replace("\\", "\\\\"),
+                    feature_dir.replace("\\", "\\\\"),
                     name,
                     f0_dir.replace("\\", "\\\\"),
                     name,
@@ -862,22 +880,23 @@ def train1key(
                 % (
                     gt_wavs_dir.replace("\\", "\\\\"),
                     name,
-                    feature256_dir.replace("\\", "\\\\"),
+                    feature_dir.replace("\\", "\\\\"),
                     name,
                     spk_id5,
                 )
             )
+    fea_dim=256 if version19=="v1"else 768
     if if_f0_3:
         for _ in range(2):
             opt.append(
-                "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature256/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s"
-                % (now_dir, sr2, now_dir, now_dir, now_dir, spk_id5)
+                "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s"
+                % (now_dir, sr2, now_dir,fea_dim, now_dir, now_dir, spk_id5)
             )
     else:
         for _ in range(2):
             opt.append(
-                "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature256/mute.npy|%s"
-                % (now_dir, sr2, now_dir, spk_id5)
+                "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s"
+                % (now_dir, sr2, now_dir,fea_dim, spk_id5)
             )
     shuffle(opt)
     with open("%s/filelist.txt" % model_log_dir, "w") as f:
@@ -886,7 +905,7 @@ def train1key(
     if gpus16:
         cmd = (
             config.python_cmd
-            + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s -pg %s -pd %s -l %s -c %s"
+            + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s -pg %s -pd %s -l %s -c %s -sw %s -v %s"
             % (
                 exp_dir1,
                 sr2,
@@ -899,12 +918,14 @@ def train1key(
                 pretrained_D15,
                 1 if if_save_latest13 == i18n("是") else 0,
                 1 if if_cache_gpu17 == i18n("是") else 0,
+                1 if if_save_every_weights18 == i18n("是") else 0,
+                version19,
             )
         )
     else:
         cmd = (
             config.python_cmd
-            + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -te %s -se %s -pg %s -pd %s -l %s -c %s"
+            + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -te %s -se %s -pg %s -pd %s -l %s -c %s -sw %s -v %s"
             % (
                 exp_dir1,
                 sr2,
@@ -916,6 +937,8 @@ def train1key(
                 pretrained_D15,
                 1 if if_save_latest13 == i18n("是") else 0,
                 1 if if_cache_gpu17 == i18n("是") else 0,
+                1 if if_save_every_weights18 == i18n("是") else 0,
+                version19,
             )
         )
     yield get_info_str(cmd)
@@ -924,9 +947,9 @@ def train1key(
     yield get_info_str(i18n("训练结束, 您可查看控制台训练日志或实验文件夹下的train.log"))
     #######step3b:训练索引
     npys = []
-    listdir_res = list(os.listdir(feature256_dir))
+    listdir_res = list(os.listdir(feature_dir))
     for name in sorted(listdir_res):
-        phone = np.load("%s/%s" % (feature256_dir, name))
+        phone = np.load("%s/%s" % (feature_dir, name))
         npys.append(phone)
     big_npy = np.concatenate(npys, 0)
 
@@ -938,15 +961,15 @@ def train1key(
     # n_ivf =  big_npy.shape[0] // 39
     n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
     yield get_info_str("%s,%s" % (big_npy.shape, n_ivf))
-    index = faiss.index_factory(256, "IVF%s,Flat" % n_ivf)
+    index = faiss.index_factory(256 if version19=="v1"else 768, "IVF%s,Flat" % n_ivf)
     yield get_info_str("training index")
     index_ivf = faiss.extract_index_ivf(index)  #
     index_ivf.nprobe = 1
     index.train(big_npy)
     faiss.write_index(
         index,
-        "%s/trained_IVF%s_Flat_nprobe_%s.index"
-        % (model_log_dir, n_ivf, index_ivf.nprobe),
+        "%s/trained_IVF%s_Flat_nprobe_%s_%s.index"
+        % (model_log_dir, n_ivf, index_ivf.nprobe,version19),
     )
     yield get_info_str("adding index")
     batch_size_add = 8192
@@ -954,11 +977,11 @@ def train1key(
         index.add(big_npy[i : i + batch_size_add])
     faiss.write_index(
         index,
-        "%s/added_IVF%s_Flat_nprobe_%s.index"
-        % (model_log_dir, n_ivf, index_ivf.nprobe),
+        "%s/added_IVF%s_Flat_nprobe_%s_%s.index"
+        % (model_log_dir, n_ivf, index_ivf.nprobe,version19),
     )
     yield get_info_str(
-        "成功构建索引, added_IVF%s_Flat_nprobe_%s.index" % (n_ivf, index_ivf.nprobe)
+        "成功构建索引, added_IVF%s_Flat_nprobe_%s_%s.index" % (n_ivf, index_ivf.nprobe,version19)
     )
     yield get_info_str(i18n("全流程结束!"))
 
 
@@ -969,17 +992,18 @@ def change_info_(ckpt_path):
         os.path.exists(ckpt_path.replace(os.path.basename(ckpt_path), "train.log"))
         == False
     ):
-        return {"__type__": "update"}, {"__type__": "update"}
+        return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"}
     try:
         with open(
             ckpt_path.replace(os.path.basename(ckpt_path), "train.log"), "r"
         ) as f:
             info = eval(f.read().strip("\n").split("\n")[0].split("\t")[-1])
             sr, f0 = info["sample_rate"], info["if_f0"]
-            return sr, str(f0)
+            version="v2"if("version"in info and info["version"]=="v2")else"v1"
+            return sr, str(f0),version
     except:
         traceback.print_exc()
-        return {"__type__": "update"}, {"__type__": "update"}
+        return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"}
 
 
 from infer_pack.models_onnx_moess import SynthesizerTrnMs256NSFsidM
@@ -1112,7 +1136,7 @@ with gr.Blocks() as app:
                         value="pm",
                         interactive=True,
                     )
-                    filter_radius0 = gr.Slider(
+                    filter_radius0=gr.Slider(
                         minimum=0,
                        maximum=7,
                        label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"),
@@ -1131,9 +1155,7 @@ with gr.Blocks() as app:
                         choices=sorted(index_paths),
                         interactive=True,
                     )
-                    refresh_button.click(
-                        fn=change_choices, inputs=[], outputs=[sid0, file_index2]
-                    )
+                    refresh_button.click(fn=change_choices, inputs=[], outputs=[sid0, file_index2])
                     # file_big_npy1 = gr.Textbox(
                     #     label=i18n("特征文件路径"),
                     #     value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
@@ -1146,7 +1168,7 @@ with gr.Blocks() as app:
                         value=0.76,
                         interactive=True,
                     )
-                    resample_sr0 = gr.Slider(
+                    resample_sr0=gr.Slider(
                         minimum=0,
                         maximum=48000,
                         label=i18n("后处理重采样至最终采样率,0为不进行重采样"),
@@ -1154,6 +1176,13 @@ with gr.Blocks() as app:
                         step=1,
                         interactive=True,
                     )
+                    rms_mix_rate0 = gr.Slider(
+                        minimum=0,
+                        maximum=1,
+                        label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"),
+                        value=1,
+                        interactive=True,
+                    )
                     f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"))
                     but0 = gr.Button(i18n("转换"), variant="primary")
                 with gr.Column():
@@ -1173,6 +1202,7 @@ with gr.Blocks() as app:
                         index_rate1,
                         filter_radius0,
                         resample_sr0,
+                        rms_mix_rate0
                     ],
                     [vc_output1, vc_output2],
                 )
@@ -1192,7 +1222,7 @@ with gr.Blocks() as app:
                         value="pm",
                         interactive=True,
                     )
-                    filter_radius1 = gr.Slider(
+                    filter_radius1=gr.Slider(
                         minimum=0,
                         maximum=7,
                         label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"),
@@ -1223,7 +1253,7 @@ with gr.Blocks() as app:
                         value=1,
                         interactive=True,
                     )
-                    resample_sr1 = gr.Slider(
+                    resample_sr1=gr.Slider(
                         minimum=0,
                         maximum=48000,
                         label=i18n("后处理重采样至最终采样率,0为不进行重采样"),
@@ -1231,6 +1261,13 @@ with gr.Blocks() as app:
                         step=1,
                         interactive=True,
                     )
+                    rms_mix_rate1 = gr.Slider(
+                        minimum=0,
+                        maximum=1,
+                        label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"),
+                        value=1,
+                        interactive=True,
+                    )
                 with gr.Column():
                     dir_input = gr.Textbox(
                         label=i18n("输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)"),
@@ -1256,6 +1293,7 @@ with gr.Blocks() as app:
                         index_rate2,
                         filter_radius1,
                        resample_sr1,
+                        rms_mix_rate1
                    ],
                    [vc_output3],
                )
@@ -1324,6 +1362,13 @@ with gr.Blocks() as app:
                     value=True,
                     interactive=True,
                 )
+                version19 = gr.Radio(
+                    label=i18n("版本(目前仅40k支持了v2)"),
+                    choices=["v1", "v2"],
+                    value="v1",
+                    interactive=True,
+                    visible=True,
+                )
                 np7 = gr.Slider(
                     minimum=0,
                     maximum=ncpu,
@@ -1353,7 +1398,7 @@ with gr.Blocks() as app:
                 but1 = gr.Button(i18n("处理数据"), variant="primary")
                 info1 = gr.Textbox(label=i18n("输出信息"), value="")
                 but1.click(
-                    preprocess_dataset, [trainset_dir4, exp_dir1, sr2, np7], [info1]
+                    preprocess_dataset, [trainset_dir4, exp_dir1, sr2,np7], [info1]
                 )
             with gr.Group():
                 gr.Markdown(value=i18n("step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)"))
@@ -1378,7 +1423,7 @@ with gr.Blocks() as app:
                     info2 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
                     but2.click(
                         extract_f0_feature,
-                        [gpus6, np7, f0method8, if_f0_3, exp_dir1],
+                        [gpus6, np7, f0method8, if_f0_3, exp_dir1,version19],
                         [info2],
                     )
             with gr.Group():
@@ -1422,6 +1467,14 @@ with gr.Blocks() as app:
                         value=i18n("否"),
                         interactive=True,
                     )
+                    if_save_every_weights18 = gr.Radio(
+                        label=i18n(
+                            "是否在每次保存时间点将最终小模型保存至weights文件夹"
+                        ),
+                        choices=[i18n("是"), i18n("否")],
+                        value=i18n("否"),
+                        interactive=True,
+                    )
                 with gr.Row():
                     pretrained_G14 = gr.Textbox(
                         label=i18n("加载预训练底模G路径"),
@@ -1434,12 +1487,15 @@ with gr.Blocks() as app:
                         interactive=True,
                     )
                     sr2.change(
-                        change_sr2, [sr2, if_f0_3], [pretrained_G14, pretrained_D15]
+                        change_sr2, [sr2, if_f0_3,version19], [pretrained_G14, pretrained_D15,version19]
+                    )
+                    version19.change(
+                        change_version19, [sr2, if_f0_3,version19], [pretrained_G14, pretrained_D15]
                     )
                     if_f0_3.change(
                         change_f0,
-                        [if_f0_3, sr2],
-                        [np7, f0method8, pretrained_G14, pretrained_D15],
+                        [if_f0_3, sr2,version19],
+                        [f0method8, pretrained_G14, pretrained_D15],
                     )
                     gpus16 = gr.Textbox(
                         label=i18n("以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2"),
@@ -1465,10 +1521,12 @@ with gr.Blocks() as app:
                         pretrained_D15,
                         gpus16,
                         if_cache_gpu17,
+                        if_save_every_weights18,
+                        version19,
                     ],
                     info3,
                 )
-                but4.click(train_index, [exp_dir1], info3)
+                but4.click(train_index, [exp_dir1,version19], info3)
                 but5.click(
                     train1key,
                     [
@@ -1487,6 +1545,8 @@ with gr.Blocks() as app:
                         pretrained_D15,
                         gpus16,
                         if_cache_gpu17,
+                        if_save_every_weights18,
+                        version19,
                     ],
                     info3,
                )
@@ -1526,12 +1586,18 @@ with gr.Blocks() as app:
                         max_lines=1,
                         interactive=True,
                     )
+                    version_2=gr.Radio(
+                        label=i18n("模型版本型号"),
+                        choices=["v1", "v2"],
+                        value="v1",
+                        interactive=True,
+                    )
                 with gr.Row():
                     but6 = gr.Button(i18n("融合"), variant="primary")
                     info4 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
                 but6.click(
                     merge,
-                    [ckpt_a, ckpt_b, alpha_a, sr_, if_f0_, info__, name_to_save0],
+                    [ckpt_a, ckpt_b, alpha_a, sr_, if_f0_, info__, name_to_save0,version_2],
                     info4,
                 )  # def merge(path1,path2,alpha1,sr,f0,info):
             with gr.Group():
@@ -1589,15 +1655,21 @@ with gr.Blocks() as app:
                         value="1",
                         interactive=True,
                     )
+                    version_1=gr.Radio(
+                        label=i18n("模型版本型号"),
+                        choices=["v1", "v2"],
+                        value="v1",
+                        interactive=True,
+                    )
                     info___ = gr.Textbox(
                         label=i18n("要置入的模型信息"), value="", max_lines=8, interactive=True
                     )
                     but9 = gr.Button(i18n("提取"), variant="primary")
                     info7 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
-                    ckpt_path2.change(change_info_, [ckpt_path2], [sr__, if_f0__])
+                    ckpt_path2.change(change_info_, [ckpt_path2], [sr__, if_f0__,version_1])
                 but9.click(
                     extract_small_model,
-                    [ckpt_path2, save_name, sr__, if_f0__, info___],
+                    [ckpt_path2, save_name, sr__, if_f0__, info___,version_1],
                     info7,
                 )
 
@@ -1615,16 +1687,16 @@ with gr.Blocks() as app:
             butOnnx = gr.Button(i18n("导出Onnx模型"), variant="primary")
             butOnnx.click(export_onnx, [ckpt_dir, onnx_dir, moevs], infoOnnx)
 
-        tab_faq = i18n("常见问题解答")
+        tab_faq=i18n("常见问题解答")
         with gr.TabItem(tab_faq):
             try:
-                if tab_faq == "常见问题解答":
-                    with open("docs/faq.md", "r", encoding="utf8") as f:
-                        info = f.read()
+                if(tab_faq=="常见问题解答"):
+                    with open("docs/faq.md","r",encoding="utf8")as f:info=f.read()
                 else:
-                    with open("docs/faq_en.md", "r") as f:
-                        info = f.read()
-                gr.Markdown(value=info)
+                    with open("docs/faq_en.md", "r")as f:info = f.read()
+                gr.Markdown(
+                    value=info
+                )
             except:
                 gr.Markdown(traceback.format_exc())
diff --git a/train_nsf_sim_cache_sid_load_pretrain.py b/train_nsf_sim_cache_sid_load_pretrain.py
index 6078490..494c561 100644
--- a/train_nsf_sim_cache_sid_load_pretrain.py
+++ b/train_nsf_sim_cache_sid_load_pretrain.py
@@ -31,14 +31,21 @@ from data_utils import (
     TextAudioCollate,
     DistributedBucketSampler,
 )
-from infer_pack.models import (
-    SynthesizerTrnMs256NSFsid,
-    SynthesizerTrnMs256NSFsid_nono,
-    MultiPeriodDiscriminator,
-)
+if(hps.version=="v1"):
+    from infer_pack.models import (
+        SynthesizerTrnMs256NSFsid as RVC_Model_f0,
+        SynthesizerTrnMs256NSFsid_nono as RVC_Model_nof0,
+        MultiPeriodDiscriminator,
+    )
+else:
+    from infer_pack.models import (
+        SynthesizerTrnMs768NSFsid as RVC_Model_f0,
+        SynthesizerTrnMs768NSFsid_nono as RVC_Model_nof0,
+        MultiPeriodDiscriminatorV2 as MultiPeriodDiscriminator,
+    )
 from losses import generator_loss, discriminator_loss, feature_loss, kl_loss
 from mel_processing import mel_spectrogram_torch, spec_to_mel_torch
-
+from process_ckpt import savee
 
 global_step = 0
 
@@ -63,7 +70,7 @@ def run(rank, n_gpus, hps):
     if rank == 0:
         logger = utils.get_logger(hps.model_dir)
         logger.info(hps)
-        utils.check_git_hash(hps.model_dir)
+        # utils.check_git_hash(hps.model_dir)
         writer = SummaryWriter(log_dir=hps.model_dir)
         writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
 
@@ -104,7 +111,7 @@ def run(rank, n_gpus, hps):
         prefetch_factor=8,
     )
     if hps.if_f0 == 1:
-        net_g = SynthesizerTrnMs256NSFsid(
+        net_g = RVC_Model_f0(
             hps.data.filter_length // 2 + 1,
             hps.train.segment_size // hps.data.hop_length,
             **hps.model,
@@ -112,7 +119,7 @@ def run(rank, n_gpus, hps):
             sr=hps.sample_rate,
         )
     else:
-        net_g = SynthesizerTrnMs256NSFsid_nono(
+        net_g = RVC_Model_nof0(
             hps.data.filter_length // 2 + 1,
             hps.train.segment_size // hps.data.hop_length,
             **hps.model,
@@ -343,7 +350,7 @@ def train_and_evaluate(
             spec = spec.cuda(rank, non_blocking=True)
             spec_lengths = spec_lengths.cuda(rank, non_blocking=True)
             wave = wave.cuda(rank, non_blocking=True)
-            wave_lengths = wave_lengths.cuda(rank, non_blocking=True)
+            # wave_lengths = wave_lengths.cuda(rank, non_blocking=True)
 
         # Calculate
         with autocast(enabled=hps.train.fp16_run):
@@ -428,10 +435,10 @@ def train_and_evaluate(
                     )
                 )
                 # Amor For Tensorboard display
-                if loss_mel > 50:
-                    loss_mel = 50
-                if loss_kl > 5:
-                    loss_kl = 5
+                if loss_mel > 75:
+                    loss_mel = 75
+                if loss_kl > 9:
+                    loss_kl = 9
 
                 logger.info([global_step, lr])
                 logger.info(
@@ -512,12 +519,20 @@ def train_and_evaluate(
                     epoch,
                     os.path.join(hps.model_dir, "D_{}.pth".format(2333333)),
                 )
+        if(rank==0 and hps.save_every_weights=="1"):
+            if hasattr(net_g, "module"):
+                ckpt = net_g.module.state_dict()
+            else:
+                ckpt = net_g.state_dict()
+            logger.info(
+                "saving ckpt %s_e%s:%s"
+                % (hps.name,epoch,savee(ckpt, hps.sample_rate, hps.if_f0, hps.name+"_e%s"%epoch, epoch,hps.version))
+            )
 
     if rank == 0:
         logger.info("====> Epoch: {}".format(epoch))
     if epoch >= hps.total_epoch and rank == 0:
         logger.info("Training is done. The program is closed.")
-        from process_ckpt import savee  # def savee(ckpt,sr,if_f0,name,epoch):
 
         if hasattr(net_g, "module"):
             ckpt = net_g.module.state_dict()
@@ -525,7 +540,7 @@ def train_and_evaluate(
             ckpt = net_g.state_dict()
         logger.info(
             "saving final ckpt:%s"
-            % (savee(ckpt, hps.sample_rate, hps.if_f0, hps.name, epoch))
+            % (savee(ckpt, hps.sample_rate, hps.if_f0, hps.name, epoch,hps.version))
         )
         sleep(1)
         os._exit(2333333)
diff --git a/trainset_preprocess_pipeline_print.py b/trainset_preprocess_pipeline_print.py
index 1f4039c..5c87061 100644
--- a/trainset_preprocess_pipeline_print.py
+++ b/trainset_preprocess_pipeline_print.py
@@ -32,19 +32,19 @@ class PreProcess:
     def __init__(self, sr, exp_dir):
         self.slicer = Slicer(
             sr=sr,
-            threshold=-40,
-            min_length=800,
+            threshold=-42,
+            min_length=1500,
             min_interval=400,
             hop_size=15,
-            max_sil_kept=150,
+            max_sil_kept=500,
         )
         self.sr = sr
         self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr)
         self.per = 3.7
         self.overlap = 0.3
         self.tail = self.per + self.overlap
-        self.max = 0.95
-        self.alpha = 0.8
+        self.max = 0.9
+        self.alpha = 0.75
         self.exp_dir = exp_dir
         self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir
         self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir
diff --git a/vc_infer_pipeline.py b/vc_infer_pipeline.py
index 9564493..7470a91 100644
--- a/vc_infer_pipeline.py
+++ b/vc_infer_pipeline.py
@@ -2,18 +2,16 @@ import numpy as np, parselmouth, torch, pdb
 from time import time as ttime
 import torch.nn.functional as F
 import scipy.signal as signal
-import pyworld, os, traceback, faiss, librosa
+import pyworld, os, traceback, faiss,librosa
 from scipy import signal
 from functools import lru_cache
 
 bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
 
-input_audio_path2wav = {}
-
-
+input_audio_path2wav={}
 @lru_cache
-def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
-    audio = input_audio_path2wav[input_audio_path]
+def cache_harvest_f0(input_audio_path,fs,f0max,f0min,frame_period):
+    audio=input_audio_path2wav[input_audio_path]
     f0, t = pyworld.harvest(
         audio,
         fs=fs,
@@ -24,6 +22,17 @@ def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
     f0 = pyworld.stonemask(audio, f0, t, fs)
     return f0
 
+def change_rms(data1,sr1,data2,sr2,rate):#1是输入音频,2是输出音频,rate是2的占比
+    # print(data1.max(),data2.max())
+    rms1 = librosa.feature.rms(y=data1, frame_length=sr1//2*2, hop_length=sr1//2)#每半秒一个点
+    rms2 = librosa.feature.rms(y=data2, frame_length=sr2//2*2, hop_length=sr2//2)
+    rms1=torch.from_numpy(rms1)
+    rms1=F.interpolate(rms1.unsqueeze(0), size=data2.shape[0],mode='linear').squeeze()
+    rms2=torch.from_numpy(rms2)
+    rms2=F.interpolate(rms2.unsqueeze(0), size=data2.shape[0],mode='linear').squeeze()
+    rms2=torch.max(rms2,torch.zeros_like(rms2)+1e-6)
+    data2*=(torch.pow(rms1,torch.tensor(1-rate))*torch.pow(rms2,torch.tensor(rate-1))).numpy()
+    return data2
 
 class VC(object):
     def __init__(self, tgt_sr, config):
@@ -44,16 +53,7 @@ class VC(object):
         self.t_max = self.sr * self.x_max  # 免查询时长阈值
         self.device = config.device
 
-    def get_f0(
-        self,
-        input_audio_path,
-        x,
-        p_len,
-        f0_up_key,
-        f0_method,
-        filter_radius,
-        inp_f0=None,
-    ):
+    def get_f0(self, input_audio_path,x, p_len, f0_up_key, f0_method,filter_radius, inp_f0=None):
         global input_audio_path2wav
         time_step = self.window / self.sr * 1000
         f0_min = 50
@@ -77,9 +77,9 @@ class VC(object):
                 f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
             )
         elif f0_method == "harvest":
-            input_audio_path2wav[input_audio_path] = x.astype(np.double)
-            f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
-            if filter_radius > 2:
+            input_audio_path2wav[input_audio_path]=x.astype(np.double)
+            f0=cache_harvest_f0(input_audio_path,self.sr,f0_max,f0_min,10)
+            if(filter_radius>2):
                 f0 = signal.medfilt(f0, 3)
         f0 *= pow(2, f0_up_key / 12)
         # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
@@ -118,6 +118,7 @@ class VC(object):
         index,
         big_npy,
         index_rate,
+        version,
     ):  # ,file_index,file_big_npy
         feats = torch.from_numpy(audio0)
         if self.is_half:
@@ -133,12 +134,12 @@ class VC(object):
         inputs = {
             "source": feats.to(self.device),
             "padding_mask": padding_mask,
-            "output_layer": 9,  # layer 9
+            "output_layer": 9if version=="v1"else 12,
         }
         t0 = ttime()
         with torch.no_grad():
             logits = model.extract_features(**inputs)
-            feats = model.final_proj(logits[0])
+            feats = model.final_proj(logits[0])if version=="v1"else logits[0]
 
         if (
             isinstance(index, type(None)) == False
@@ -176,14 +177,14 @@ class VC(object):
         with torch.no_grad():
             if pitch != None and pitchf != None:
                 audio1 = (
-                    (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768)
+                    (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
                     .data.cpu()
                     .float()
                     .numpy()
                )
            else:
                audio1 = (
-                    (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768)
+                    (net_g.infer(feats, p_len, sid)[0][0, 0])
                    .data.cpu()
                    .float()
                    .numpy()
                )
@@ -213,6 +214,8 @@ class VC(object):
         filter_radius,
         tgt_sr,
         resample_sr,
+        rms_mix_rate,
+        version,
         f0_file=None,
     ):
         if (
@@ -267,15 +270,7 @@ class VC(object):
         sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
         pitch, pitchf = None, None
         if if_f0 == 1:
-            pitch, pitchf = self.get_f0(
-                input_audio_path,
-                audio_pad,
-                p_len,
-                f0_up_key,
-                f0_method,
-                filter_radius,
-                inp_f0,
-            )
+            pitch, pitchf = self.get_f0(input_audio_path,audio_pad, p_len, f0_up_key, f0_method,filter_radius, inp_f0)
             pitch = pitch[:p_len]
             pitchf = pitchf[:p_len]
             if self.device == "mps":
@@ -299,6 +294,7 @@ class VC(object):
                         index,
                         big_npy,
                         index_rate,
+                        version,
                     )[self.t_pad_tgt : -self.t_pad_tgt]
                 )
             else:
@@ -314,6 +310,7 @@ class VC(object):
                         index,
                         big_npy,
                         index_rate,
+                        version,
                     )[self.t_pad_tgt : -self.t_pad_tgt]
                 )
             s = t
@@ -330,6 +327,7 @@ class VC(object):
                     index,
                     big_npy,
                     index_rate,
+                    version,
                 )[self.t_pad_tgt : -self.t_pad_tgt]
             )
         else:
@@ -345,14 +343,20 @@ class VC(object):
                     index,
                     big_npy,
                     index_rate,
+                    version,
                 )[self.t_pad_tgt : -self.t_pad_tgt]
             )
         audio_opt = np.concatenate(audio_opt)
-        if resample_sr >= 16000 and tgt_sr != resample_sr:
+        if(rms_mix_rate!=1):
+            audio_opt=change_rms(audio,16000,audio_opt,tgt_sr,rms_mix_rate)
+        if(resample_sr>=16000 and tgt_sr!=resample_sr):
             audio_opt = librosa.resample(
                 audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
             )
-        audio_opt = audio_opt.astype(np.int16)
+        audio_max=np.abs(audio_opt).max()/0.99
+        max_int16=32768
+        if(audio_max>1):max_int16/=audio_max
+        audio_opt=(audio_opt * max_int16).astype(np.int16)
         del pitch, pitchf, sid
         if torch.cuda.is_available():
             torch.cuda.empty_cache()