diff --git a/config.py b/config.py new file mode 100644 index 0000000..f379ea7 --- /dev/null +++ b/config.py @@ -0,0 +1,38 @@ +############离线VC参数 +inp_root=r"白鹭霜华长条"#对输入目录下所有音频进行转换,别放非音频文件 +opt_root=r"opt"#输出目录 +f0_up_key=0#升降调,整数,男转女12,女转男-12 +person=r"weights\洛天依v3.pt"#目前只有洛天依v3 +############硬件参数 +device = "cuda:0"#填写cuda:x或cpu,x指代第几张卡,只支持N卡加速 +is_half=True#9-10-20-30-40系显卡无脑True,不影响质量,>=20显卡开启有加速 +n_cpu=0#默认0用上所有线程,写数字限制CPU资源使用 +############下头别动 +import torch +if(torch.cuda.is_available()==False): + print("没有发现支持的N卡,使用CPU进行推理") + device="cpu" + is_half=False +if(device!="cpu"): + gpu_name=torch.cuda.get_device_name(int(device.split(":")[-1])) + if("16"in gpu_name or "MX"in gpu_name): + print("16系显卡/MX系显卡强制单精度") + is_half=False +from multiprocessing import cpu_count +if(n_cpu==0):n_cpu=cpu_count() +if(is_half==True): + #6G显存配置 + x_pad = 3 + x_query = 10 + x_center = 60 + x_max = 65 +else: + #5G显存配置 + x_pad = 1 + # x_query = 6 + # x_center = 30 + # x_max = 32 + #6G显存配置 + x_query = 6 + x_center = 38 + x_max = 41 diff --git a/extract_f0_print.py b/extract_f0_print.py new file mode 100644 index 0000000..9631b8d --- /dev/null +++ b/extract_f0_print.py @@ -0,0 +1,120 @@ +import os,traceback,sys,parselmouth +import librosa +import pyworld +from scipy.io import wavfile +import numpy as np,logging +logging.getLogger('numba').setLevel(logging.WARNING) +from multiprocessing import Process + +exp_dir = sys.argv[1] +f = open("%s/extract_f0_feature.log"%exp_dir, "a+") +def printt(strr): + print(strr) + f.write("%s\n" % strr) + f.flush() + +n_p = int(sys.argv[2]) +f0method = sys.argv[3] + +class FeatureInput(object): + def __init__(self, samplerate=16000, hop_size=160): + self.fs = samplerate + self.hop = hop_size + + self.f0_bin = 256 + self.f0_max = 1100.0 + self.f0_min = 50.0 + self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) + self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) + + def compute_f0(self, path,f0_method): + x, sr = librosa.load(path, self.fs) + p_len=x.shape[0]//self.hop + assert sr == self.fs + if(f0_method=="pm"): + time_step = 160 / 16000 * 1000 + f0_min = 50 + f0_max = 1100 + f0 = parselmouth.Sound(x, sr).to_pitch_ac( + time_step=time_step / 1000, voicing_threshold=0.6, + pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency'] + pad_size=(p_len - len(f0) + 1) // 2 + if(pad_size>0 or p_len - len(f0) - pad_size>0): + f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant') + elif(f0_method=="harvest"): + f0, t = pyworld.harvest( + x.astype(np.double), + fs=sr, + f0_ceil=1100, + frame_period=1000 * self.hop / sr, + ) + f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs) + elif(f0_method=="dio"): + f0, t = pyworld.dio( + x.astype(np.double), + fs=sr, + f0_ceil=1100, + frame_period=1000 * self.hop / sr, + ) + f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs) + return f0 + + def coarse_f0(self, f0): + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * ( + self.f0_bin - 2 + ) / (self.f0_mel_max - self.f0_mel_min) + 1 + + # use 0 or 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1 + f0_coarse = np.rint(f0_mel).astype(np.int) + assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( + f0_coarse.max(), + f0_coarse.min(), + ) + return f0_coarse + + def go(self,paths,f0_method): + if (len(paths) == 0): printt("no-f0-todo") + else: + printt("todo-f0-%s"%len(paths)) + n=max(len(paths)//5,1)#每个进程最多打印5条 + for idx,(inp_path,opt_path1,opt_path2) in 
enumerate(paths): + try: + if(idx%n==0):printt("f0ing,now-%s,all-%s,-%s"%(idx,len(paths),inp_path)) + if(os.path.exists(opt_path1+".npy")==True and os.path.exists(opt_path2+".npy")==True):continue + featur_pit = self.compute_f0(inp_path,f0_method) + np.save(opt_path2,featur_pit,allow_pickle=False,)#nsf + coarse_pit = self.coarse_f0(featur_pit) + np.save(opt_path1,coarse_pit,allow_pickle=False,)#ori + except: + printt("f0fail-%s-%s-%s" % (idx, inp_path,traceback.format_exc())) + +if __name__=='__main__': + # exp_dir=r"E:\codes\py39\dataset\mi-test" + # n_p=16 + # f = open("%s/log_extract_f0.log"%exp_dir, "w") + printt(sys.argv) + featureInput = FeatureInput() + paths=[] + inp_root= "%s/1_16k_wavs"%(exp_dir) + opt_root1="%s/2a_f0"%(exp_dir) + opt_root2="%s/2b-f0nsf"%(exp_dir) + + os.makedirs(opt_root1,exist_ok=True) + os.makedirs(opt_root2,exist_ok=True) + for name in sorted(list(os.listdir(inp_root))): + inp_path="%s/%s"%(inp_root,name) + if ("spec" in inp_path): continue + opt_path1="%s/%s"%(opt_root1,name) + opt_path2="%s/%s"%(opt_root2,name) + paths.append([inp_path,opt_path1,opt_path2]) + + ps=[] + for i in range(n_p): + p=Process(target=featureInput.go,args=(paths[i::n_p],f0method,)) + p.start() + ps.append(p) + for p in ps: + p.join() diff --git a/extract_feature_print.py b/extract_feature_print.py new file mode 100644 index 0000000..7a0ff4d --- /dev/null +++ b/extract_feature_print.py @@ -0,0 +1,84 @@ +import os,sys,traceback +n_part=int(sys.argv[1]) +i_part=int(sys.argv[2]) +i_gpu=sys.argv[3] +exp_dir=sys.argv[4] +os.environ["CUDA_VISIBLE_DEVICES"]=str(i_gpu) + +import torch +import torch.nn.functional as F +import soundfile as sf +import numpy as np +import joblib +from fairseq import checkpoint_utils +import pdb +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +f = open("%s/extract_f0_feature.log"%exp_dir, "a+") +def printt(strr): + print(strr) + f.write("%s\n" % strr) + f.flush() +printt(sys.argv) +# model_path = "/bili-coeus/jupyter/jupyterhub-liujing04/speech/pretrain/ContentVec_legacy500.pt" +model_path = "hubert_base.pt" + +printt(exp_dir) +wavPath = "%s/1_16k_wavs"%exp_dir +outPath = "%s/3_feature256"%exp_dir +os.makedirs(outPath,exist_ok=True) +# wave must be 16k, hop_size=320 +def readwave(wav_path, normalize=False): + wav, sr = sf.read(wav_path) + assert sr == 16000 + feats = torch.from_numpy(wav).float() + if feats.dim() == 2: # double channels + feats = feats.mean(-1) + assert feats.dim() == 1, feats.dim() + if normalize: + with torch.no_grad(): + feats = F.layer_norm(feats, feats.shape) + feats = feats.view(1, -1) + return feats +# HuBERT model +printt("load model(s) from {}".format(model_path)) +models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( + [model_path], + suffix="", +) +model = models[0] +model = model.to(device) +model = model.half() +model.eval() + +todo=sorted(list(os.listdir(wavPath)))[i_part::n_part] +n = max(1,len(todo) // 10) # 最多打印十条 +if(len(todo)==0):printt("no-feature-todo") +else: + printt("all-feature-%s"%len(todo)) + for idx,file in enumerate(todo): + try: + if file.endswith(".wav"): + wav_path = "%s/%s"%(wavPath,file) + out_path = "%s/%s"%(outPath,file.replace("wav","npy")) + + if(os.path.exists(out_path)):continue + + feats = readwave(wav_path, normalize=saved_cfg.task.normalize) + padding_mask = torch.BoolTensor(feats.shape).fill_(False) + inputs = { + "source": feats.half().to(device), + "padding_mask": padding_mask.to(device), + "output_layer": 9, # layer 9 + } + with torch.no_grad(): + logits = 
model.extract_features(**inputs) + feats = model.final_proj(logits[0]) + + feats = feats.squeeze(0).float().cpu().numpy() + # feats = np.repeat(feats, 2,0) # 20ms -> 10ms + np.save(out_path, feats, allow_pickle=False) + if (idx % n == 0):printt("now-%s,all-%s,%s,%s"%(len(todo),idx,file,feats.shape)) + except: + printt(traceback.format_exc()) + printt("all-feature-done") \ No newline at end of file diff --git a/infer-web.py b/infer-web.py new file mode 100644 index 0000000..cf0f242 --- /dev/null +++ b/infer-web.py @@ -0,0 +1,630 @@ +from multiprocessing import cpu_count +import threading +from time import sleep +from subprocess import Popen,PIPE,run as runn +from time import sleep +import torch, pdb, os,traceback,sys,warnings,shutil,numpy as np,faiss +#判断是否有能用来训练和加速推理的N卡 +ncpu=cpu_count() +ngpu=torch.cuda.device_count() +gpu_infos=[] +if(torch.cuda.is_available()==False or ngpu==0):if_gpu_ok=False +else: + if_gpu_ok = False + for i in range(ngpu): + gpu_name=torch.cuda.get_device_name(i) + if("16"in gpu_name or "MX"in gpu_name):continue + if("10"in gpu_name or "20"in gpu_name or "30"in gpu_name or "40"in gpu_name or "A50"in gpu_name.upper() or "70"in gpu_name or "80"in gpu_name or "90"in gpu_name or "M4"in gpu_name or "T4"in gpu_name or "TITAN"in gpu_name.upper()):#A10#A100#V100#A40#P40#M40#K80 + if_gpu_ok=True#至少有一张能用的N卡 + gpu_infos.append("%s\t%s"%(i,gpu_name)) +gpu_info="\n".join(gpu_infos)if if_gpu_ok==True and len(gpu_infos)>0 else "很遗憾您这没有能用的显卡来支持您训练" +gpus="-".join([i[0]for i in gpu_infos]) +now_dir=os.getcwd() +sys.path.append(now_dir) +tmp=os.path.join(now_dir,"TEMP") +shutil.rmtree(tmp,ignore_errors=True) +os.makedirs(tmp,exist_ok=True) +os.makedirs(os.path.join(now_dir,"logs"),exist_ok=True) +os.makedirs(os.path.join(now_dir,"weights"),exist_ok=True) +os.environ["TEMP"]=tmp +warnings.filterwarnings("ignore") +torch.manual_seed(114514) +from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono +from scipy.io import wavfile +from fairseq import checkpoint_utils +import gradio as gr +import librosa +import logging +from vc_infer_pipeline import VC +import soundfile as sf +from config import is_half,device,is_half +from infer_uvr5 import _audio_pre_ +from my_utils import load_audio +from train.process_ckpt import show_info,change_info,merge,extract_small_model +# from trainset_preprocess_pipeline import PreProcess +logging.getLogger('numba').setLevel(logging.WARNING) + +class ToolButton(gr.Button, gr.components.FormComponent): + """Small button with single emoji as text, fits inside gradio forms""" + def __init__(self, **kwargs): + super().__init__(variant="tool", **kwargs) + def get_block_name(self): + return "button" + +hubert_model=None +def load_hubert(): + global hubert_model + models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(["hubert_base.pt"],suffix="",) + hubert_model = models[0] + hubert_model = hubert_model.to(device) + if(is_half):hubert_model = hubert_model.half() + else:hubert_model = hubert_model.float() + hubert_model.eval() + +weight_root="weights" +weight_uvr5_root="uvr5_weights" +names=[] +for name in os.listdir(weight_root):names.append(name) +uvr5_names=[] +for name in os.listdir(weight_uvr5_root):uvr5_names.append(name.replace(".pth","")) + +def vc_single(sid,input_audio,f0_up_key,f0_file,f0_method,file_index,file_big_npy,index_rate):#spk_item, input_audio0, vc_transform0,f0_file,f0method0 + global tgt_sr,net_g,vc,hubert_model + if input_audio is None:return "You need to upload an audio", None + f0_up_key = 
int(f0_up_key) + try: + audio=load_audio(input_audio,16000) + times = [0, 0, 0] + if(hubert_model==None):load_hubert() + if_f0 = cpt.get("f0", 1) + audio_opt=vc.pipeline(hubert_model,net_g,sid,audio,times,f0_up_key,f0_method,file_index,file_big_npy,index_rate,if_f0,f0_file=f0_file) + print(times) + return "Success", (tgt_sr, audio_opt) + except: + info=traceback.format_exc() + print(info) + return info,(None,None) + +def vc_multi(sid,dir_path,opt_root,paths,f0_up_key,f0_method,file_index,file_big_npy,index_rate): + try: + dir_path=dir_path.strip(" ")#防止小白拷路径头尾带了空格 + opt_root=opt_root.strip(" ") + os.makedirs(opt_root, exist_ok=True) + try: + if(dir_path!=""):paths=[os.path.join(dir_path,name)for name in os.listdir(dir_path)] + else:paths=[path.name for path in paths] + except: + traceback.print_exc() + paths = [path.name for path in paths] + infos=[] + for path in paths: + info,opt=vc_single(sid,path,f0_up_key,None,f0_method,file_index,file_big_npy,index_rate) + if(info=="Success"): + try: + tgt_sr,audio_opt=opt + wavfile.write("%s/%s" % (opt_root, os.path.basename(path)), tgt_sr, audio_opt) + except: + info=traceback.format_exc() + infos.append("%s->%s"%(os.path.basename(path),info)) + yield "\n".join(infos) + yield "\n".join(infos) + except: + yield traceback.format_exc() + +def uvr(model_name,inp_root,save_root_vocal,paths,save_root_ins): + infos = [] + try: + inp_root = inp_root.strip(" ").strip("\n") + save_root_vocal = save_root_vocal.strip(" ").strip("\n") + save_root_ins = save_root_ins.strip(" ").strip("\n") + pre_fun = _audio_pre_(model_path=os.path.join(weight_uvr5_root,model_name+".pth"), device=device, is_half=is_half) + if (inp_root != ""):paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)] + else:paths = [path.name for path in paths] + for name in paths: + inp_path=os.path.join(inp_root,name) + try: + pre_fun._path_audio_(inp_path , save_root_ins,save_root_vocal) + infos.append("%s->Success"%(os.path.basename(inp_path))) + yield "\n".join(infos) + except: + infos.append("%s->%s" % (os.path.basename(inp_path),traceback.format_exc())) + yield "\n".join(infos) + except: + infos.append(traceback.format_exc()) + yield "\n".join(infos) + finally: + try: + del pre_fun.model + del pre_fun + except: + traceback.print_exc() + print("clean_empty_cache") + torch.cuda.empty_cache() + yield "\n".join(infos) + +#一个选项卡全局只能有一个音色 +def get_vc(sid): + global n_spk,tgt_sr,net_g,vc,cpt + if(sid==""): + global hubert_model + print("clean_empty_cache") + del net_g, n_spk, vc, hubert_model,tgt_sr#,cpt + hubert_model = net_g=n_spk=vc=hubert_model=tgt_sr=None + torch.cuda.empty_cache() + ###楼下不这么折腾清理不干净 + if_f0 = cpt.get("f0", 1) + if (if_f0 == 1): + net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half) + else: + net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) + del net_g,cpt + torch.cuda.empty_cache() + cpt=None + return {"visible": False, "__type__": "update"} + person = "%s/%s" % (weight_root, sid) + print("loading %s"%person) + cpt = torch.load(person, map_location="cpu") + tgt_sr = cpt["config"][-1] + cpt["config"][-3]=cpt["weight"]["emb_g.weight"].shape[0]#n_spk + if_f0=cpt.get("f0",1) + if(if_f0==1): + net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half) + else: + net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) + del net_g.enc_q + print(net_g.load_state_dict(cpt["weight"], strict=False)) # 不加这一行清不干净,真奇葩 + net_g.eval().to(device) + if (is_half):net_g = net_g.half() + else:net_g = net_g.float() + vc = VC(tgt_sr, device, is_half) + 
n_spk=cpt["config"][-3] + return {"visible": True,"maximum": n_spk, "__type__": "update"} + +def change_choices():return {"choices": sorted(list(os.listdir(weight_root))), "__type__": "update"} +def clean():return {"value": "", "__type__": "update"} +def change_f0(if_f0_3,sr2):#np7, f0method8,pretrained_G14,pretrained_D15 + if(if_f0_3=="是"):return {"visible": True, "__type__": "update"},{"visible": True, "__type__": "update"},"pretrained/f0G%s.pth"%sr2,"pretrained/f0D%s.pth"%sr2 + return {"visible": False, "__type__": "update"}, {"visible": False, "__type__": "update"},"pretrained/G%s.pth"%sr2,"pretrained/D%s.pth"%sr2 + +sr_dict={ + "32k":32000, + "40k":40000, + "48k":48000, +} + +def if_done(done,p): + while 1: + if(p.poll()==None):sleep(0.5) + else:break + done[0]=True + + +def if_done_multi(done,ps): + while 1: + #poll==None代表进程未结束 + #只要有一个进程未结束都不停 + flag=1 + for p in ps: + if(p.poll()==None): + flag = 0 + sleep(0.5) + break + if(flag==1):break + done[0]=True + +def preprocess_dataset(trainset_dir,exp_dir,sr,n_p=ncpu): + sr=sr_dict[sr] + os.makedirs("%s/logs/%s"%(now_dir,exp_dir),exist_ok=True) + f = open("%s/logs/%s/preprocess.log"%(now_dir,exp_dir), "w") + f.close() + cmd="python trainset_preprocess_pipeline_print.py %s %s %s %s/logs/%s"%(trainset_dir,sr,n_p,now_dir,exp_dir) + print(cmd) + p = Popen(cmd, shell=True)#, stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir + ###煞笔gr,popen read都非得全跑完了再一次性读取,不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 + done=[False] + threading.Thread(target=if_done,args=(done,p,)).start() + while(1): + with open("%s/logs/%s/preprocess.log"%(now_dir,exp_dir),"r")as f:yield(f.read()) + sleep(1) + if(done[0]==True):break + with open("%s/logs/%s/preprocess.log"%(now_dir,exp_dir), "r")as f:log = f.read() + print(log) + yield log +#but2.click(extract_f0,[gpus6,np7,f0method8,if_f0_3,trainset_dir4],[info2]) +def extract_f0_feature(gpus,n_p,f0method,if_f0,exp_dir): + gpus=gpus.split("-") + os.makedirs("%s/logs/%s"%(now_dir,exp_dir),exist_ok=True) + f = open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir), "w") + f.close() + if(if_f0=="是"): + cmd="python extract_f0_print.py %s/logs/%s %s %s"%(now_dir,exp_dir,n_p,f0method) + print(cmd) + p = Popen(cmd, shell=True,cwd=now_dir)#, stdin=PIPE, stdout=PIPE,stderr=PIPE + ###煞笔gr,popen read都非得全跑完了再一次性读取,不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 + done=[False] + threading.Thread(target=if_done,args=(done,p,)).start() + while(1): + with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir),"r")as f:yield(f.read()) + sleep(1) + if(done[0]==True):break + with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir), "r")as f:log = f.read() + print(log) + yield log + ####对不同part分别开多进程 + ''' + n_part=int(sys.argv[1]) + i_part=int(sys.argv[2]) + i_gpu=sys.argv[3] + exp_dir=sys.argv[4] + os.environ["CUDA_VISIBLE_DEVICES"]=str(i_gpu) + ''' + leng=len(gpus) + ps=[] + for idx,n_g in enumerate(gpus): + cmd="python extract_feature_print.py %s %s %s %s/logs/%s"%(leng,idx,n_g,now_dir,exp_dir) + print(cmd) + p = Popen(cmd, shell=True, cwd=now_dir)#, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir + ps.append(p) + ###煞笔gr,popen read都非得全跑完了再一次性读取,不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 + done = [False] + threading.Thread(target=if_done_multi, args=(done, ps,)).start() + while (1): + with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir), "r")as f:yield (f.read()) + sleep(1) + if (done[0] == True): break + with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir), "r")as f:log = f.read() + print(log) + yield log +def 
change_sr2(sr2,if_f0_3): + if(if_f0_3=="是"):return "pretrained/f0G%s.pth"%sr2,"pretrained/f0D%s.pth"%sr2 + else:return "pretrained/G%s.pth"%sr2,"pretrained/D%s.pth"%sr2 +#but3.click(click_train,[exp_dir1,sr2,if_f0_3,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16]) +def click_train(exp_dir1,sr2,if_f0_3,spk_id5,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16,if_cache_gpu17): + #生成filelist + exp_dir="%s/logs/%s"%(now_dir,exp_dir1) + os.makedirs(exp_dir,exist_ok=True) + gt_wavs_dir="%s/0_gt_wavs"%(exp_dir) + co256_dir="%s/3_feature256"%(exp_dir) + if(if_f0_3=="是"): + f0_dir = "%s/2a_f0" % (exp_dir) + f0nsf_dir="%s/2b-f0nsf"%(exp_dir) + names=set([name.split(".")[0]for name in os.listdir(gt_wavs_dir)])&set([name.split(".")[0]for name in os.listdir(co256_dir)])&set([name.split(".")[0]for name in os.listdir(f0_dir)])&set([name.split(".")[0]for name in os.listdir(f0nsf_dir)]) + else: + names=set([name.split(".")[0]for name in os.listdir(gt_wavs_dir)])&set([name.split(".")[0]for name in os.listdir(co256_dir)]) + opt=[] + for name in names: + if (if_f0_3 == "是"): + opt.append("%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s"%(gt_wavs_dir.replace("\\","\\\\"),name,co256_dir.replace("\\","\\\\"),name,f0_dir.replace("\\","\\\\"),name,f0nsf_dir.replace("\\","\\\\"),name,spk_id5)) + else: + opt.append("%s/%s.wav|%s/%s.npy|%s"%(gt_wavs_dir.replace("\\","\\\\"),name,co256_dir.replace("\\","\\\\"),name,spk_id5)) + with open("%s/filelist.txt"%exp_dir,"w")as f:f.write("\n".join(opt)) + print("write filelist done") + #生成config#无需生成config + # cmd = "python train_nsf_sim_cache_sid_load_pretrain.py -e mi-test -sr 40k -f0 1 -bs 4 -g 0 -te 10 -se 5 -pg pretrained/f0G40k.pth -pd pretrained/f0D40k.pth -l 1 -c 0" + cmd = "python train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s -pg %s -pd %s -l %s -c %s" % (exp_dir1,sr2,1 if if_f0_3=="是"else 0,batch_size12,gpus16,total_epoch11,save_epoch10,pretrained_G14,pretrained_D15,1 if if_save_latest13=="是"else 0,1 if if_cache_gpu17=="是"else 0) + print(cmd) + p = Popen(cmd, shell=True, cwd=now_dir) + p.wait() + return "训练结束,您可查看控制台训练日志或实验文件夹下的train.log" +# but4.click(train_index, [exp_dir1], info3) +def train_index(exp_dir1): + exp_dir="%s/logs/%s"%(now_dir,exp_dir1) + os.makedirs(exp_dir,exist_ok=True) + feature_dir="%s/3_feature256"%(exp_dir) + if(os.path.exists(feature_dir)==False):return "请先进行特征提取!" + listdir_res=list(os.listdir(feature_dir)) + if(len(listdir_res)==0):return "请先进行特征提取!" 
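The rest of train_index below concatenates the 3_feature256 .npy files into total_fea.npy and trains an IVF,Flat FAISS index over them; vc.pipeline is later handed that index, total_fea.npy and index_rate (see vc_single above). The retrieval itself lives in vc_infer_pipeline.py, which this diff does not touch, so the following is only a rough sketch of how such an index is typically queried, with random arrays standing in for real features and the blend rule assumed:

import faiss
import numpy as np

# Stand-ins: in the real flow big_npy is loaded from total_fea.npy and the index is
# read back from the added_IVF*_Flat_nprobe_*.index file written by this function.
big_npy = np.random.rand(2000, 256).astype("float32")   # training-set HuBERT features
feats = np.random.rand(200, 256).astype("float32")      # features of the audio being converted
index_rate = 0.75                                        # UI slider 检索特征占比

n_ivf = max(big_npy.shape[0] // 39, 1)
index = faiss.index_factory(256, "IVF%s,Flat" % n_ivf)   # same factory string as below
faiss.extract_index_ivf(index).nprobe = int(np.power(n_ivf, 0.3))
index.train(big_npy)
index.add(big_npy)

_, nn = index.search(feats, 1)                           # nearest training frame per input frame
retrieved = big_npy[nn[:, 0]]
feats = index_rate * retrieved + (1 - index_rate) * feats  # assumed blend of retrieved and original features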
+ npys = [] + for name in sorted(listdir_res): + phone = np.load("%s/%s" % (feature_dir, name)) + npys.append(phone) + big_npy = np.concatenate(npys, 0) + np.save("%s/total_fea.npy"%exp_dir, big_npy) + n_ivf = big_npy.shape[0] // 39 + infos=[] + infos.append("%s,%s"%(big_npy.shape,n_ivf)) + yield "\n".join(infos) + index = faiss.index_factory(256, "IVF%s,Flat"%n_ivf) + infos.append("training") + yield "\n".join(infos) + index_ivf = faiss.extract_index_ivf(index) # + index_ivf.nprobe = int(np.power(n_ivf,0.3)) + index.train(big_npy) + faiss.write_index(index, '%s/trained_IVF%s_Flat_nprobe_%s.index'%(exp_dir,n_ivf,index_ivf.nprobe)) + infos.append("adding") + yield "\n".join(infos) + index.add(big_npy) + faiss.write_index(index, '%s/added_IVF%s_Flat_nprobe_%s.index'%(exp_dir,n_ivf,index_ivf.nprobe)) + infos.append("成功构建索引,added_IVF%s_Flat_nprobe_%s.index"%(n_ivf,index_ivf.nprobe)) + yield "\n".join(infos) +#but5.click(train1key, [exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17], info3) +def train1key(exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17): + infos=[] + def get_info_str(strr): + infos.append(strr) + return "\n".join(infos) + os.makedirs("%s/logs/%s"%(now_dir,exp_dir1),exist_ok=True) + #########step1:处理数据 + open("%s/logs/%s/preprocess.log"%(now_dir,exp_dir1), "w").close() + cmd="python trainset_preprocess_pipeline_print.py %s %s %s %s/logs/%s"%(trainset_dir4,sr_dict[sr2],ncpu,now_dir,exp_dir1) + yield get_info_str("step1:正在处理数据") + yield get_info_str(cmd) + p = Popen(cmd, shell=True) + p.wait() + with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir1), "r")as f: print(f.read()) + #########step2a:提取音高 + open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir1), "w") + if(if_f0_3=="是"): + yield get_info_str("step2a:正在提取音高") + cmd="python extract_f0_print.py %s/logs/%s %s %s"%(now_dir,exp_dir1,np7,f0method8) + yield get_info_str(cmd) + p = Popen(cmd, shell=True,cwd=now_dir) + p.wait() + with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir1), "r")as f:print(f.read()) + else:yield get_info_str("step2a:无需提取音高") + #######step2b:提取特征 + yield get_info_str("step2b:正在提取特征") + gpus=gpus16.split("-") + leng=len(gpus) + ps=[] + for idx,n_g in enumerate(gpus): + cmd="python extract_feature_print.py %s %s %s %s/logs/%s"%(leng,idx,n_g,now_dir,exp_dir1) + yield get_info_str(cmd) + p = Popen(cmd, shell=True, cwd=now_dir)#, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir + ps.append(p) + for p in ps:p.wait() + with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir1), "r")as f:print(f.read()) + #######step3a:训练模型 + yield get_info_str("step3a:正在训练模型") + #生成filelist + exp_dir="%s/logs/%s"%(now_dir,exp_dir1) + gt_wavs_dir="%s/0_gt_wavs"%(exp_dir) + co256_dir="%s/3_feature256"%(exp_dir) + if(if_f0_3=="是"): + f0_dir = "%s/2a_f0" % (exp_dir) + f0nsf_dir="%s/2b-f0nsf"%(exp_dir) + names=set([name.split(".")[0]for name in os.listdir(gt_wavs_dir)])&set([name.split(".")[0]for name in os.listdir(co256_dir)])&set([name.split(".")[0]for name in os.listdir(f0_dir)])&set([name.split(".")[0]for name in os.listdir(f0nsf_dir)]) + else: + names=set([name.split(".")[0]for name in os.listdir(gt_wavs_dir)])&set([name.split(".")[0]for name in os.listdir(co256_dir)]) + opt=[] + for name in names: + if (if_f0_3 == 
"是"): + opt.append("%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s"%(gt_wavs_dir.replace("\\","\\\\"),name,co256_dir.replace("\\","\\\\"),name,f0_dir.replace("\\","\\\\"),name,f0nsf_dir.replace("\\","\\\\"),name,spk_id5)) + else: + opt.append("%s/%s.wav|%s/%s.npy|%s"%(gt_wavs_dir.replace("\\","\\\\"),name,co256_dir.replace("\\","\\\\"),name,spk_id5)) + with open("%s/filelist.txt"%exp_dir,"w")as f:f.write("\n".join(opt)) + yield get_info_str("write filelist done") + cmd = "python train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s -pg %s -pd %s -l %s -c %s" % (exp_dir1,sr2,1 if if_f0_3=="是"else 0,batch_size12,gpus16,total_epoch11,save_epoch10,pretrained_G14,pretrained_D15,1 if if_save_latest13=="是"else 0,1 if if_cache_gpu17=="是"else 0) + yield get_info_str(cmd) + p = Popen(cmd, shell=True, cwd=now_dir) + p.wait() + yield get_info_str("训练结束,您可查看控制台训练日志或实验文件夹下的train.log") + #######step3b:训练索引 + feature_dir="%s/3_feature256"%(exp_dir) + npys = [] + listdir_res=list(os.listdir(feature_dir)) + for name in sorted(listdir_res): + phone = np.load("%s/%s" % (feature_dir, name)) + npys.append(phone) + big_npy = np.concatenate(npys, 0) + np.save("%s/total_fea.npy"%exp_dir, big_npy) + n_ivf = big_npy.shape[0] // 39 + yield get_info_str("%s,%s"%(big_npy.shape,n_ivf)) + index = faiss.index_factory(256, "IVF%s,Flat"%n_ivf) + yield get_info_str("training index") + index_ivf = faiss.extract_index_ivf(index) # + index_ivf.nprobe = int(np.power(n_ivf,0.3)) + index.train(big_npy) + faiss.write_index(index, '%s/trained_IVF%s_Flat_nprobe_%s.index'%(exp_dir,n_ivf,index_ivf.nprobe)) + yield get_info_str("adding index") + index.add(big_npy) + faiss.write_index(index, '%s/added_IVF%s_Flat_nprobe_%s.index'%(exp_dir,n_ivf,index_ivf.nprobe)) + yield get_info_str("成功构建索引,added_IVF%s_Flat_nprobe_%s.index"%(n_ivf,index_ivf.nprobe)) + yield get_info_str("全流程结束!") + +# ckpt_path2.change(change_info_,[ckpt_path2],[sr__,if_f0__]) +def change_info_(ckpt_path): + if(os.path.exists(ckpt_path.replace(os.path.basename(ckpt_path),"train.log"))==False):return {"__type__": "update"},{"__type__": "update"} + try: + with open(ckpt_path.replace(os.path.basename(ckpt_path),"train.log"),"r")as f: + info=eval(f.read().strip("\n").split("\n")[0].split("\t")[-1]) + sr,f0=info["sample_rate"],info["if_f0"] + return sr,str(f0) + except: + traceback.print_exc() + return {"__type__": "update"}, {"__type__": "update"} + + +with gr.Blocks() as app: + gr.Markdown(value=""" + 本软件以MIT协议开源,作者不对软件具备任何控制力,使用软件者、传播软件导出的声音者自负全责。
+ 如不认可该条款,则不能使用或引用软件包内任何代码和文件。详见根目录"使用需遵守的协议-LICENSE.txt"。 + """) + with gr.Tabs(): + with gr.TabItem("模型推理"): + with gr.Row(): + sid0 = gr.Dropdown(label="推理音色", choices=names) + refresh_button = gr.Button("刷新音色列表", variant="primary") + refresh_button.click( + fn=change_choices, + inputs=[], + outputs=[sid0] + ) + clean_button = gr.Button("卸载音色省显存", variant="primary") + spk_item = gr.Slider(minimum=0, maximum=2333, step=1, label='请选择说话人id', value=0, visible=False, interactive=True) + clean_button.click( + fn=clean, + inputs=[], + outputs=[sid0] + ) + sid0.change( + fn=get_vc, + inputs=[sid0], + outputs=[spk_item], + ) + with gr.Group(): + gr.Markdown(value=""" + 男转女推荐+12key,女转男推荐-12key,如果音域爆炸导致音色失真也可以自己调整到合适音域。 + """) + with gr.Row(): + with gr.Column(): + vc_transform0 = gr.Number(label="变调(整数,半音数量,升八度12降八度-12)", value=0) + input_audio0 = gr.Textbox(label="输入待处理音频文件路径(默认是正确格式示例)",value="E:\codes\py39\\vits_vc_gpu_train\\todo-songs\冬之花clip1.wav") + f0method0=gr.Radio(label="选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比", choices=["pm","harvest"],value="pm", interactive=True) + with gr.Column(): + file_index1 = gr.Textbox(label="特征检索库文件路径",value="E:\codes\py39\\vits_vc_gpu_train\logs\mi-test-1key\\added_IVF677_Flat_nprobe_7.index", interactive=True) + file_big_npy1 = gr.Textbox(label="特征文件路径",value="E:\codes\py39\\vits_vc_gpu_train\logs\mi-test-1key\\total_fea.npy", interactive=True) + index_rate1 = gr.Slider(minimum=0, maximum=1,label='检索特征占比', value=1,interactive=True) + f0_file = gr.File(label="F0曲线文件,可选,一行一个音高,代替默认F0及升降调") + but0=gr.Button("转换", variant="primary") + with gr.Column(): + vc_output1 = gr.Textbox(label="输出信息") + vc_output2 = gr.Audio(label="输出音频(右下角三个点,点了可以下载)") + but0.click(vc_single, [spk_item, input_audio0, vc_transform0,f0_file,f0method0,file_index1,file_big_npy1,index_rate1], [vc_output1, vc_output2]) + with gr.Group(): + gr.Markdown(value=""" + 批量转换,输入待转换音频文件夹,或上传多个音频文件,在指定文件夹(默认opt)下输出转换的音频。 + """) + with gr.Row(): + with gr.Column(): + vc_transform1 = gr.Number(label="变调(整数,半音数量,升八度12降八度-12)", value=0) + opt_input = gr.Textbox(label="指定输出文件夹",value="opt") + f0method1=gr.Radio(label="选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比", choices=["pm","harvest"],value="pm", interactive=True) + with gr.Column(): + file_index2 = gr.Textbox(label="特征检索库文件路径",value="E:\codes\py39\\vits_vc_gpu_train\logs\mi-test-1key\\added_IVF677_Flat_nprobe_7.index", interactive=True) + file_big_npy2 = gr.Textbox(label="特征文件路径",value="E:\codes\py39\\vits_vc_gpu_train\logs\mi-test-1key\\total_fea.npy", interactive=True) + index_rate2 = gr.Slider(minimum=0, maximum=1,label='检索特征占比', value=1,interactive=True) + with gr.Column(): + dir_input = gr.Textbox(label="输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)",value="E:\codes\py39\\vits_vc_gpu_train\\todo-songs") + inputs = gr.File(file_count="multiple", label="也可批量输入音频文件,二选一,优先读文件夹") + but1=gr.Button("转换", variant="primary") + vc_output3 = gr.Textbox(label="输出信息") + but1.click(vc_multi, [spk_item, dir_input,opt_input,inputs, vc_transform1,f0method1,file_index2,file_big_npy2,index_rate2], [vc_output3]) + with gr.TabItem("伴奏人声分离"): + with gr.Group(): + gr.Markdown(value=""" + 人声伴奏分离批量处理,使用UVR5模型。
+                        不带和声的音频用HP2;带和声、且希望提取出的人声不含和声时用HP5
+ 合格的文件夹路径格式举例:E:\codes\py39\\vits_vc_gpu\白鹭霜华测试样例(去文件管理器地址栏拷就行了) + """) + with gr.Row(): + with gr.Column(): + dir_wav_input = gr.Textbox(label="输入待处理音频文件夹路径",value="E:\codes\py39\\vits_vc_gpu_train\\todo-songs") + wav_inputs = gr.File(file_count="multiple", label="也可批量输入音频文件,二选一,优先读文件夹") + with gr.Column(): + model_choose = gr.Dropdown(label="模型", choices=uvr5_names) + opt_vocal_root = gr.Textbox(label="指定输出人声文件夹",value="opt") + opt_ins_root = gr.Textbox(label="指定输出乐器文件夹",value="opt") + but2=gr.Button("转换", variant="primary") + vc_output4 = gr.Textbox(label="输出信息") + but2.click(uvr, [model_choose, dir_wav_input,opt_vocal_root,wav_inputs,opt_ins_root], [vc_output4]) + with gr.TabItem("训练"): + gr.Markdown(value=""" + step1:填写实验配置。实验数据放在logs下,每个实验一个文件夹,需手工输入实验名路径,内含实验配置,日志,训练得到的模型文件。 + """) + with gr.Row(): + exp_dir1 = gr.Textbox(label="输入实验名",value="mi-test") + sr2 = gr.Radio(label="目标采样率", choices=["32k","40k","48k"],value="40k", interactive=True) + if_f0_3 = gr.Radio(label="模型是否带音高指导(唱歌一定要,语音可以不要)", choices=["是","否"],value="是", interactive=True) + with gr.Group():#暂时单人的,后面支持最多4人的#数据处理 + gr.Markdown(value=""" + step2a:自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化,在实验目录下生成2个wav文件夹;暂时只支持单人训练。 + """) + with gr.Row(): + trainset_dir4 = gr.Textbox(label="输入训练文件夹路径",value="E:\语音音频+标注\米津玄师\src") + spk_id5 = gr.Slider(minimum=0, maximum=4, step=1, label='请指定说话人id', value=0,interactive=True) + but1=gr.Button("处理数据", variant="primary") + info1=gr.Textbox(label="输出信息",value="") + but1.click(preprocess_dataset,[trainset_dir4,exp_dir1,sr2],[info1]) + with gr.Group(): + gr.Markdown(value=""" + step2b:使用CPU提取音高(如果模型带音高),使用GPU提取特征(选择卡号) + """) + with gr.Row(): + with gr.Column(): + gpus6 = gr.Textbox(label="以-分隔输入使用的卡号,例如 0-1-2 使用卡0和卡1和卡2",value=gpus,interactive=True) + gpu_info9 = gr.Textbox(label="显卡信息",value=gpu_info) + with gr.Column(): + np7 = gr.Slider(minimum=0, maximum=ncpu, step=1, label='提取音高使用的CPU进程数', value=ncpu,interactive=True) + f0method8 = gr.Radio(label="选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢", choices=["pm", "harvest","dio"], value="harvest", interactive=True) + but2=gr.Button("特征提取", variant="primary") + info2=gr.Textbox(label="输出信息",value="",max_lines=8) + but2.click(extract_f0_feature,[gpus6,np7,f0method8,if_f0_3,exp_dir1],[info2]) + with gr.Group(): + gr.Markdown(value=""" + step3:填写训练设置,开始训练模型和索引 + """) + with gr.Row(): + save_epoch10 = gr.Slider(minimum=0, maximum=50, step=1, label='保存频率save_every_epoch', value=5,interactive=True) + total_epoch11 = gr.Slider(minimum=0, maximum=100, step=1, label='总训练轮数total_epoch', value=10,interactive=True) + batch_size12 = gr.Slider(minimum=0, maximum=32, step=1, label='batch_size', value=4,interactive=True) + if_save_latest13 = gr.Radio(label="是否仅保存最新的ckpt文件以节省硬盘空间", choices=["是", "否"], value="否", interactive=True) + if_cache_gpu17 = gr.Radio(label="是否缓存所有训练集至显存。10min以下小数据可缓存以加速训练,大数据缓存会炸显存也加不了多少速", choices=["是", "否"], value="否", interactive=True) + with gr.Row(): + pretrained_G14 = gr.Textbox(label="加载预训练底模G路径", value="pretrained/f0G40k.pth",interactive=True) + pretrained_D15 = gr.Textbox(label="加载预训练底模D路径", value="pretrained/f0D40k.pth",interactive=True) + sr2.change(change_sr2, [sr2,if_f0_3], [pretrained_G14,pretrained_D15]) + if_f0_3.change(change_f0, [if_f0_3, sr2], [np7, f0method8, pretrained_G14, pretrained_D15]) + gpus16 = gr.Textbox(label="以-分隔输入使用的卡号,例如 0-1-2 使用卡0和卡1和卡2", value=gpus,interactive=True) + but3 = gr.Button("训练模型", variant="primary") + but4 = gr.Button("训练特征索引", variant="primary") + but5 = gr.Button("一键训练", 
variant="primary") + info3 = gr.Textbox(label="输出信息", value="",max_lines=10) + but3.click(click_train,[exp_dir1,sr2,if_f0_3,spk_id5,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16,if_cache_gpu17],info3) + but4.click(train_index,[exp_dir1],info3) + but5.click(train1key,[exp_dir1,sr2,if_f0_3,trainset_dir4,spk_id5,gpus6,np7,f0method8,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16,if_cache_gpu17],info3) + + with gr.TabItem("ckpt处理"): + with gr.Group(): + gr.Markdown(value="""模型融合,可用于测试音色融合""") + with gr.Row(): + ckpt_a = gr.Textbox(label="A模型路径", value="", interactive=True) + ckpt_b = gr.Textbox(label="B模型路径", value="", interactive=True) + alpha_a = gr.Slider(minimum=0, maximum=1, label='A模型权重', value=0.5, interactive=True) + with gr.Row(): + sr_ = gr.Radio(label="目标采样率", choices=["32k","40k","48k"],value="40k", interactive=True) + if_f0_ = gr.Radio(label="模型是否带音高指导", choices=["是","否"],value="是", interactive=True) + info__ = gr.Textbox(label="要置入的模型信息", value="", max_lines=8, interactive=True) + name_to_save0=gr.Textbox(label="保存的模型名不带后缀", value="", max_lines=1, interactive=True) + with gr.Row(): + but6 = gr.Button("融合", variant="primary") + info4 = gr.Textbox(label="输出信息", value="", max_lines=8) + but6.click(merge, [ckpt_a,ckpt_b,alpha_a,sr_,if_f0_,info__,name_to_save0], info4)#def merge(path1,path2,alpha1,sr,f0,info): + with gr.Group(): + gr.Markdown(value="修改模型信息(仅支持weights文件夹下提取的小模型文件)") + with gr.Row(): + ckpt_path0 = gr.Textbox(label="模型路径", value="", interactive=True) + info_=gr.Textbox(label="要改的模型信息", value="", max_lines=8, interactive=True) + name_to_save1=gr.Textbox(label="保存的文件名,默认空为和源文件同名", value="", max_lines=8, interactive=True) + with gr.Row(): + but7 = gr.Button("修改", variant="primary") + info5 = gr.Textbox(label="输出信息", value="", max_lines=8) + but7.click(change_info, [ckpt_path0,info_,name_to_save1], info5) + with gr.Group(): + gr.Markdown(value="查看模型信息(仅支持weights文件夹下提取的小模型文件)") + with gr.Row(): + ckpt_path1 = gr.Textbox(label="模型路径", value="", interactive=True) + but8 = gr.Button("查看", variant="primary") + info6 = gr.Textbox(label="输出信息", value="", max_lines=8) + but8.click(show_info, [ckpt_path1], info6) + with gr.Group(): + gr.Markdown(value="模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况") + with gr.Row(): + ckpt_path2 = gr.Textbox(label="模型路径", value="E:\codes\py39\logs\mi-test_f0_48k\\G_23333.pth", interactive=True) + save_name = gr.Textbox(label="保存名", value="", interactive=True) + sr__ = gr.Radio(label="目标采样率", choices=["32k","40k","48k"],value="40k", interactive=True) + if_f0__ = gr.Radio(label="模型是否带音高指导,1是0否", choices=["1","0"],value="1", interactive=True) + info___ = gr.Textbox(label="要置入的模型信息", value="", max_lines=8, interactive=True) + but9 = gr.Button("提取", variant="primary") + info7 = gr.Textbox(label="输出信息", value="", max_lines=8) + ckpt_path2.change(change_info_,[ckpt_path2],[sr__,if_f0__]) + but9.click(extract_small_model, [ckpt_path2,save_name,sr__,if_f0__,info___], info7) + + with gr.TabItem("招募音高曲线前端编辑器"): + gr.Markdown(value="""加开发群联系我xxxxx""") + with gr.TabItem("点击查看交流、问题反馈群号"): + gr.Markdown(value="""xxxxx""") + + # app.launch(server_name="0.0.0.0",server_port=7860) + # app.queue(concurrency_count=511, max_size=1022).launch(server_name="127.0.0.1",inbrowser=True,server_port=7861,quiet=True) + app.queue(concurrency_count=511, max_size=1022).launch(server_name="0.0.0.0",inbrowser=True,server_port=7865,quiet=True) \ No newline at end of file diff 
--git a/infer_uvr5.py b/infer_uvr5.py new file mode 100644 index 0000000..b38dd73 --- /dev/null +++ b/infer_uvr5.py @@ -0,0 +1,108 @@ +import os,sys,torch,warnings,pdb +warnings.filterwarnings("ignore") +import librosa +import importlib +import numpy as np +import hashlib , math +from tqdm import tqdm +from uvr5_pack.lib_v5 import spec_utils +from uvr5_pack.utils import _get_name_params,inference +from uvr5_pack.lib_v5.model_param_init import ModelParameters +from scipy.io import wavfile + +class _audio_pre_(): + def __init__(self, model_path,device,is_half): + self.model_path = model_path + self.device = device + self.data = { + # Processing Options + 'postprocess': False, + 'tta': False, + # Constants + 'window_size': 512, + 'agg': 10, + 'high_end_process': 'mirroring', + } + nn_arch_sizes = [ + 31191, # default + 33966,61968, 123821, 123812, 537238 # custom + ] + self.nn_architecture = list('{}KB'.format(s) for s in nn_arch_sizes) + model_size = math.ceil(os.stat(model_path ).st_size / 1024) + nn_architecture = '{}KB'.format(min(nn_arch_sizes, key=lambda x:abs(x-model_size))) + nets = importlib.import_module('uvr5_pack.lib_v5.nets' + f'_{nn_architecture}'.replace('_{}KB'.format(nn_arch_sizes[0]), ''), package=None) + model_hash = hashlib.md5(open(model_path,'rb').read()).hexdigest() + param_name ,model_params_d = _get_name_params(model_path , model_hash) + + mp = ModelParameters(model_params_d) + model = nets.CascadedASPPNet(mp.param['bins'] * 2) + cpk = torch.load( model_path , map_location='cpu') + model.load_state_dict(cpk) + model.eval() + if(is_half==True):model = model.half().to(device) + else:model = model.to(device) + + self.mp = mp + self.model = model + + def _path_audio_(self, music_file ,ins_root=None,vocal_root=None): + if(ins_root is None and vocal_root is None):return "No save root." 
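The wavfile.write calls later in this method convert the float stems with (np.array(wav)*32768).astype("int16"), which wraps around for samples at or beyond full scale. A minimal sketch of a clipping variant, using the same scipy dependency as the rest of the file (clip_int16 is an illustrative name, not something the repository defines):

import numpy as np
from scipy.io import wavfile

def clip_int16(wav):
    # float waveform (roughly in [-1, 1]) -> int16, clipping peaks instead of wrapping
    return (np.clip(np.asarray(wav), -1.0, 1.0) * 32767).astype(np.int16)

# usage sketch with a synthetic 1-second 440 Hz tone
sr = 44100
t = np.arange(sr) / sr
wavfile.write("instrument_demo.wav", sr, clip_int16(0.9 * np.sin(2 * np.pi * 440 * t)))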
+ name=os.path.basename(music_file) + if(ins_root is not None):os.makedirs(ins_root, exist_ok=True) + if(vocal_root is not None):os.makedirs(vocal_root , exist_ok=True) + X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} + bands_n = len(self.mp.param['band']) + # print(bands_n) + for d in range(bands_n, 0, -1): + bp = self.mp.param['band'][d] + if d == bands_n: # high-end band + X_wave[d], _ = librosa.core.load(#理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 + music_file, bp['sr'], False, dtype=np.float32, res_type=bp['res_type']) + if X_wave[d].ndim == 1: + X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) + else: # lower bands + X_wave[d] = librosa.core.resample(X_wave[d+1], self.mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type']) + # Stft of wave source + X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(X_wave[d], bp['hl'], bp['n_fft'], self.mp.param['mid_side'], self.mp.param['mid_side_b2'], self.mp.param['reverse']) + # pdb.set_trace() + if d == bands_n and self.data['high_end_process'] != 'none': + input_high_end_h = (bp['n_fft']//2 - bp['crop_stop']) + ( self.mp.param['pre_filter_stop'] - self.mp.param['pre_filter_start']) + input_high_end = X_spec_s[d][:, bp['n_fft']//2-input_high_end_h:bp['n_fft']//2, :] + + X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) + aggresive_set = float(self.data['agg']/100) + aggressiveness = {'value': aggresive_set, 'split_bin': self.mp.param['band'][1]['crop_stop']} + with torch.no_grad(): + pred, X_mag, X_phase = inference(X_spec_m,self.device,self.model, aggressiveness,self.data) + # Postprocess + if self.data['postprocess']: + pred_inv = np.clip(X_mag - pred, 0, np.inf) + pred = spec_utils.mask_silence(pred, pred_inv) + y_spec_m = pred * X_phase + v_spec_m = X_spec_m - y_spec_m + + if (ins_root is not None): + if self.data['high_end_process'].startswith('mirroring'): + input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], y_spec_m, input_high_end, self.mp) + wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp,input_high_end_h, input_high_end_) + else: + wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) + print ('%s instruments done'%name) + wavfile.write(os.path.join(ins_root, 'instrument_{}.wav'.format(name) ), self.mp.param['sr'], (np.array(wav_instrument)*32768).astype("int16")) # + if (vocal_root is not None): + if self.data['high_end_process'].startswith('mirroring'): + input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], v_spec_m, input_high_end, self.mp) + wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp, input_high_end_h, input_high_end_) + else: + wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) + print ('%s vocals done'%name) + wavfile.write(os.path.join(vocal_root , 'vocal_{}.wav'.format(name) ), self.mp.param['sr'], (np.array(wav_vocals)*32768).astype("int16")) + +if __name__ == '__main__': + device = 'cuda' + is_half=True + model_path='uvr5_weights/2_HP-UVR.pth' + pre_fun = _audio_pre_(model_path=model_path,device=device,is_half=True) + audio_path = '神女劈观.aac' + save_path = 'opt' + pre_fun._path_audio_(audio_path , save_path,save_path) diff --git a/my_utils.py b/my_utils.py new file mode 100644 index 0000000..48a93b6 --- /dev/null +++ b/my_utils.py @@ -0,0 +1,18 @@ +import ffmpeg,numpy as np +def load_audio(file,sr): + try: + # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 + # This launches a subprocess to decode audio while down-mixing and resampling as necessary. 
+ # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. + out, _ = ( + ffmpeg.input(file, threads=0) + .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr) + .run(cmd=["./ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) + ) + except ffmpeg.Error as e: + raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e + + return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0 + +if __name__=='__main__' : + print(load_audio(r"C:\CloudMusic\宮野幸子,森下唯 - 月夜に謳う君 -LUNA-.mp3",16000).shape) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..3163bcd --- /dev/null +++ b/requirements.txt @@ -0,0 +1,196 @@ +absl-py==1.1.0 +aiofiles==23.1.0 +aiohttp==3.8.4 +aiosignal==1.3.1 +altair==4.2.0 +antlr4-python3-runtime==4.8 +anyio==3.6.1 +appdirs==1.4.4 +argon2-cffi==21.3.0 +argon2-cffi-bindings==21.2.0 +asttokens==2.0.5 +async-timeout==4.0.2 +attrs==21.4.0 +audioread==3.0.0 +Babel==2.10.3 +backcall==0.2.0 +beautifulsoup4==4.11.1 +bitarray==2.7.3 +bleach==5.0.0 +brotlipy==0.7.0 +cachetools==5.2.0 +certifi==2021.5.30 +cffi +chardet +charset-normalizer==3.0.1 +click==8.1.3 +cmake==3.25.0 +colorama==0.4.5 +cryptography +cycler==0.11.0 +Cython==0.29.32 +debugpy==1.6.0 +decorator==5.1.1 +defusedxml==0.7.1 +entrypoints==0.4 +executing==0.8.3 +fairseq==0.12.2 +faiss-gpu==1.7.2 +fastapi==0.92.0 +fastjsonschema==2.15.3 +ffmpeg==1.4 +ffmpy==0.3.0 +filelock==3.9.0 +fonttools==4.33.3 +frozenlist==1.3.3 +fsspec==2022.11.0 +functorch==2.0.0 +future==0.18.3 +google==3.0.0 +google-auth==2.8.0 +google-auth-oauthlib==0.4.6 +googleads==3.8.0 +gradio==3.19.1 +grpcio==1.46.3 +h11==0.13.0 +httpcore==0.16.3 +httplib2==0.21.0 +httpx==0.23.1 +Hydra==2.5 +hydra-core==1.0.7 +idna +importlib-metadata==4.11.4 +importlib-resources==5.8.0 +ipykernel==6.15.0 +ipython==8.4.0 +ipython-genutils==0.2.0 +ipywidgets==7.7.0 +jedi==0.18.1 +Jinja2==3.1.2 +joblib==1.1.0 +json5==0.9.8 +jsonschema==4.6.0 +jupyter-client==7.3.4 +jupyter-core==4.10.0 +jupyter-server==1.17.1 +jupyterlab==3.4.3 +jupyterlab-language-pack-zh-CN==3.4.post1 +jupyterlab-pygments==0.2.2 +jupyterlab-server==2.14.0 +jupyterlab-widgets==1.1.0 +kiwisolver==1.4.3 +lazy-loader==0.1 +librosa==0.9.2 +linkify-it-py==2.0.0 +lit==15.0.7 +llvmlite==0.39.0 +lxml==4.8.0 +Markdown==3.3.7 +markdown-it-py==2.2.0 +MarkupSafe==2.1.1 +matplotlib==3.5.2 +matplotlib-inline==0.1.3 +mdit-py-plugins==0.3.3 +mdurl==0.1.1 +mistune==0.8.4 +mpmath==1.2.1 +msgpack==1.0.4 +multidict==6.0.2 +nbclassic==0.3.7 +nbclient==0.6.4 +nbconvert==6.5.0 +nbformat==5.4.0 +nest-asyncio==1.5.5 +networkx==2.8.8 +notebook==6.4.12 +notebook-shim==0.1.0 +numba==0.56.4 +numpy==1.23.5 +oauth2client==4.1.3 +oauthlib==3.2.0 +omegaconf==2.0.6 +orjson==3.8.6 +packaging==21.3 +pandas==1.5.2 +pandocfilters==1.5.0 +parso==0.8.3 +pexpect==4.8.0 +pickleshare==0.7.5 +Pillow==9.1.1 +pooch==1.6.0 +portalocker==2.5.1 +praat-parselmouth==0.4.2 +prometheus-client==0.14.1 +prompt-toolkit==3.0.29 +protobuf==3.19.4 +psutil==5.9.1 +ptyprocess==0.7.0 +pure-eval==0.2.2 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pycosat==0.6.3 +pycparser +pycryptodome==3.16.0 +pydantic==1.10.5 +pydub==0.25.1 +Pygments==2.12.0 +pyOpenSSL +pyparsing==3.0.9 +pyrsistent==0.18.1 +PySocks +python-dateutil==2.8.2 +python-multipart==0.0.5 +pytz==2022.6 +pyworld==0.3.2 +PyYAML==6.0 +pyzmq==23.2.0 +regex==2022.10.31 +requests +requests-oauthlib==1.3.1 +resampy==0.4.2 +rfc3986==1.5.0 +rsa==4.8 +ruamel-yaml-conda +sacrebleu==2.3.1 +scikit-learn==1.1.3 
+scipy==1.9.3 +Send2Trash==1.8.0 +six +sniffio==1.2.0 +soundfile==0.12.1 +soupsieve==2.3.2.post1 +soxr==0.3.3 +stack-data==0.3.0 +starlette==0.25.0 +stopit==1.1.1 +suds-jurko==0.6 +supervisor==4.2.4 +sympy==1.11.1 +tabulate==0.8.10 +tensorboard==2.9.1 +tensorboard-data-server==0.6.1 +tensorboard-plugin-wit==1.8.1 +terminado==0.15.0 +threadpoolctl==3.1.0 +tinycss2==1.1.1 +toolz==0.12.0 +torch==2.0.0+cu117 +torchaudio==2.0.1+cu117 +torchgen==0.0.1 +torchvision==0.15.1+cu117 +tornado==6.1 +tqdm +traitlets==5.3.0 +triton==2.0.0 +typing-extensions==4.2.0 +uc-micro-py==1.0.1 +urllib3==1.26.13 +uvicorn==0.21.1 +wcwidth==0.2.5 +webencodings==0.5.1 +websocket-client==1.3.3 +websockets==10.3 +Werkzeug==2.1.2 +widgetsnbextension==3.6.0 +yarl==1.8.1 +zipp==3.8.0 diff --git a/slicer2.py b/slicer2.py new file mode 100644 index 0000000..84ea78c --- /dev/null +++ b/slicer2.py @@ -0,0 +1,186 @@ +import numpy as np + + +# This function is obtained from librosa. +def get_rms( + y, + *, + frame_length=2048, + hop_length=512, + pad_mode="constant", +): + padding = (int(frame_length // 2), int(frame_length // 2)) + y = np.pad(y, padding, mode=pad_mode) + + axis = -1 + # put our new within-frame axis at the end for now + out_strides = y.strides + tuple([y.strides[axis]]) + # Reduce the shape on the framing axis + x_shape_trimmed = list(y.shape) + x_shape_trimmed[axis] -= frame_length - 1 + out_shape = tuple(x_shape_trimmed) + tuple([frame_length]) + xw = np.lib.stride_tricks.as_strided( + y, shape=out_shape, strides=out_strides + ) + if axis < 0: + target_axis = axis - 1 + else: + target_axis = axis + 1 + xw = np.moveaxis(xw, -1, target_axis) + # Downsample along the target axis + slices = [slice(None)] * xw.ndim + slices[axis] = slice(0, None, hop_length) + x = xw[tuple(slices)] + + # Calculate power + power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True) + + return np.sqrt(power) + + +class Slicer: + def __init__(self, + sr: int, + threshold: float = -40., + min_length: int = 5000, + min_interval: int = 300, + hop_size: int = 20, + max_sil_kept: int = 5000): + if not min_length >= min_interval >= hop_size: + raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size') + if not max_sil_kept >= hop_size: + raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size') + min_interval = sr * min_interval / 1000 + self.threshold = 10 ** (threshold / 20.) + self.hop_size = round(sr * hop_size / 1000) + self.win_size = min(round(min_interval), 4 * self.hop_size) + self.min_length = round(sr * min_length / 1000 / self.hop_size) + self.min_interval = round(min_interval / self.hop_size) + self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size) + + def _apply_slice(self, waveform, begin, end): + if len(waveform.shape) > 1: + return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)] + else: + return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)] + + # @timeit + def slice(self, waveform): + if len(waveform.shape) > 1: + samples = waveform.mean(axis=0) + else: + samples = waveform + if samples.shape[0] <= self.min_length: + return [waveform] + rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0) + sil_tags = [] + silence_start = None + clip_start = 0 + for i, rms in enumerate(rms_list): + # Keep looping while frame is silent. + if rms < self.threshold: + # Record start of silent frames. 
+ if silence_start is None: + silence_start = i + continue + # Keep looping while frame is not silent and silence start has not been recorded. + if silence_start is None: + continue + # Clear recorded silence start if interval is not enough or clip is too short + is_leading_silence = silence_start == 0 and i > self.max_sil_kept + need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length + if not is_leading_silence and not need_slice_middle: + silence_start = None + continue + # Need slicing. Record the range of silent frames to be removed. + if i - silence_start <= self.max_sil_kept: + pos = rms_list[silence_start: i + 1].argmin() + silence_start + if silence_start == 0: + sil_tags.append((0, pos)) + else: + sil_tags.append((pos, pos)) + clip_start = pos + elif i - silence_start <= self.max_sil_kept * 2: + pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin() + pos += i - self.max_sil_kept + pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start + pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept + if silence_start == 0: + sil_tags.append((0, pos_r)) + clip_start = pos_r + else: + sil_tags.append((min(pos_l, pos), max(pos_r, pos))) + clip_start = max(pos_r, pos) + else: + pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start + pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept + if silence_start == 0: + sil_tags.append((0, pos_r)) + else: + sil_tags.append((pos_l, pos_r)) + clip_start = pos_r + silence_start = None + # Deal with trailing silence. + total_frames = rms_list.shape[0] + if silence_start is not None and total_frames - silence_start >= self.min_interval: + silence_end = min(total_frames, silence_start + self.max_sil_kept) + pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start + sil_tags.append((pos, total_frames + 1)) + # Apply and return slices. 
+ if len(sil_tags) == 0: + return [waveform] + else: + chunks = [] + if sil_tags[0][0] > 0: + chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0])) + for i in range(len(sil_tags) - 1): + chunks.append(self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0])) + if sil_tags[-1][1] < total_frames: + chunks.append(self._apply_slice(waveform, sil_tags[-1][1], total_frames)) + return chunks + + +def main(): + import os.path + from argparse import ArgumentParser + + import librosa + import soundfile + + parser = ArgumentParser() + parser.add_argument('audio', type=str, help='The audio to be sliced') + parser.add_argument('--out', type=str, help='Output directory of the sliced audio clips') + parser.add_argument('--db_thresh', type=float, required=False, default=-40, + help='The dB threshold for silence detection') + parser.add_argument('--min_length', type=int, required=False, default=5000, + help='The minimum milliseconds required for each sliced audio clip') + parser.add_argument('--min_interval', type=int, required=False, default=300, + help='The minimum milliseconds for a silence part to be sliced') + parser.add_argument('--hop_size', type=int, required=False, default=10, + help='Frame length in milliseconds') + parser.add_argument('--max_sil_kept', type=int, required=False, default=500, + help='The maximum silence length kept around the sliced clip, presented in milliseconds') + args = parser.parse_args() + out = args.out + if out is None: + out = os.path.dirname(os.path.abspath(args.audio)) + audio, sr = librosa.load(args.audio, sr=None, mono=False) + slicer = Slicer( + sr=sr, + threshold=args.db_thresh, + min_length=args.min_length, + min_interval=args.min_interval, + hop_size=args.hop_size, + max_sil_kept=args.max_sil_kept + ) + chunks = slicer.slice(audio) + if not os.path.exists(out): + os.makedirs(out) + for i, chunk in enumerate(chunks): + if len(chunk.shape) > 1: + chunk = chunk.T + soundfile.write(os.path.join(out, f'%s_%d.wav' % (os.path.basename(args.audio).rsplit('.', maxsplit=1)[0], i)), chunk, sr) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/train_nsf_sim_cache_sid_load_pretrain.py b/train_nsf_sim_cache_sid_load_pretrain.py new file mode 100644 index 0000000..3d84d8c --- /dev/null +++ b/train_nsf_sim_cache_sid_load_pretrain.py @@ -0,0 +1,509 @@ +import sys,os +now_dir=os.getcwd() +sys.path.append(os.path.join(now_dir,"train")) +import utils +hps = utils.get_hparams() +os.environ["CUDA_VISIBLE_DEVICES"]=hps.gpus.replace("-",",") +n_gpus=len(hps.gpus.split("-")) +from random import shuffle +import traceback,json,argparse,itertools,math,torch,pdb +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = False +from torch import nn, optim +from torch.nn import functional as F +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter +import torch.multiprocessing as mp +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.cuda.amp import autocast, GradScaler +from infer_pack import commons + +from time import time as ttime +from data_utils import TextAudioLoaderMultiNSFsid,TextAudioLoader, TextAudioCollateMultiNSFsid,TextAudioCollate, DistributedBucketSampler +from infer_pack.models import ( + SynthesizerTrnMs256NSFsid,SynthesizerTrnMs256NSFsid_nono, + MultiPeriodDiscriminator, +) +from losses import generator_loss, discriminator_loss, feature_loss, kl_loss +from mel_processing import mel_spectrogram_torch, spec_to_mel_torch + 
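TextAudioLoaderMultiNSFsid and TextAudioLoader, imported above, read the filelist.txt that infer-web.py's click_train and train1key write out. Judging from the strings those functions build, each line is pipe-separated; a hedged sketch of that layout (parse_filelist_line is an illustrative helper, and the real loaders live in data_utils.py, which this diff does not include):

def parse_filelist_line(line, if_f0):
    # with f0:    gt_wav|feature256.npy|coarse_f0.npy|f0nsf.npy|speaker_id
    # without f0: gt_wav|feature256.npy|speaker_id
    fields = line.rstrip("\n").split("|")
    if if_f0:
        wav, phone, pitch, pitchf, sid = fields
        return wav, phone, pitch, pitchf, int(sid)
    wav, phone, sid = fields
    return wav, phone, int(sid)

print(parse_filelist_line("logs/mi-test/0_gt_wavs/a.wav|logs/mi-test/3_feature256/a.npy|0", False))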
+ +global_step = 0 + + + +def main(): + """Assume Single Node Multi GPUs Training Only""" + assert torch.cuda.is_available(), "CPU training is not allowed." + + # n_gpus = torch.cuda.device_count() + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "5555" + + + mp.spawn( + run, + nprocs=n_gpus, + args=( + n_gpus, + hps, + ), + ) + + +def run(rank, n_gpus, hps): + global global_step + if rank == 0: + logger = utils.get_logger(hps.model_dir) + logger.info(hps) + utils.check_git_hash(hps.model_dir) + writer = SummaryWriter(log_dir=hps.model_dir) + writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval")) + + dist.init_process_group( + backend="gloo", init_method="env://", world_size=n_gpus, rank=rank + ) + torch.manual_seed(hps.train.seed) + torch.cuda.set_device(rank) + + if (hps.if_f0 == 1):train_dataset = TextAudioLoaderMultiNSFsid(hps.data.training_files, hps.data) + else:train_dataset = TextAudioLoader(hps.data.training_files, hps.data) + train_sampler = DistributedBucketSampler( + train_dataset, + hps.train.batch_size, + # [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200,1400], # 16s + [100, 200, 300, 400, 500, 600, 700, 800, 900], # 16s + num_replicas=n_gpus, + rank=rank, + shuffle=True, + ) + # It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit. + # num_workers=8 -> num_workers=4 + if (hps.if_f0 == 1):collate_fn = TextAudioCollateMultiNSFsid() + else:collate_fn = TextAudioCollate() + train_loader = DataLoader( + train_dataset, + num_workers=4, + shuffle=False, + pin_memory=True, + collate_fn=collate_fn, + batch_sampler=train_sampler, + persistent_workers=True, + prefetch_factor=8, + ) + if(hps.if_f0==1):net_g = SynthesizerTrnMs256NSFsid(hps.data.filter_length // 2 + 1,hps.train.segment_size // hps.data.hop_length,**hps.model,is_half=hps.train.fp16_run,sr=hps.sample_rate).cuda(rank) + else:net_g = SynthesizerTrnMs256NSFsid_nono(hps.data.filter_length // 2 + 1,hps.train.segment_size // hps.data.hop_length,**hps.model,is_half=hps.train.fp16_run).cuda(rank) + net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) + optim_g = torch.optim.AdamW( + net_g.parameters(), + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps, + ) + optim_d = torch.optim.AdamW( + net_d.parameters(), + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps, + ) + # net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True) + # net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True) + net_g = DDP(net_g, device_ids=[rank]) + net_d = DDP(net_d, device_ids=[rank]) + + try:#如果能加载自动resume + _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d) # D多半加载没事 + if rank == 0: + logger.info("loaded D") + # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0) + _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g) + global_step = (epoch_str - 1) * len(train_loader) + # epoch_str = 1 + # global_step = 0 + except:#如果首次不能加载,加载pretrain + traceback.print_exc() + epoch_str = 1 + global_step = 0 + if rank == 0: + logger.info("loaded pretrained %s %s"%(hps.pretrainG,hps.pretrainD)) + print(net_g.module.load_state_dict(torch.load(hps.pretrainG,map_location="cpu")["model"]))##测试不加载优化器 + print(net_d.module.load_state_dict(torch.load(hps.pretrainD,map_location="cpu")["model"])) 
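train_and_evaluate below runs mixed precision with the usual torch.cuda.amp recipe: autocast around the forward passes, scaler.scale(loss).backward(), then scaler.unscale_ before gradient clipping. For reference, a generic self-contained sketch of that recipe, assuming a CUDA device as the script itself asserts (this is not the repository's actual loop):

import torch
from torch.cuda.amp import GradScaler, autocast

model = torch.nn.Linear(16, 1).cuda()
opt = torch.optim.AdamW(model.parameters(), lr=1e-3)
scaler = GradScaler(enabled=True)

x = torch.randn(8, 16, device="cuda")
y = torch.randn(8, 1, device="cuda")

with autocast(enabled=True):
    loss = torch.nn.functional.mse_loss(model(x), y)
opt.zero_grad()
scaler.scale(loss).backward()      # backward on the scaled loss
scaler.unscale_(opt)               # unscale so gradient clipping sees true magnitudes
torch.nn.utils.clip_grad_value_(model.parameters(), 1.0)
scaler.step(opt)                   # skips the step if inf/NaN gradients were found
scaler.update()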
+    scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
+        optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
+    )
+    scheduler_d = torch.optim.lr_scheduler.ExponentialLR(
+        optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
+    )
+
+    scaler = GradScaler(enabled=hps.train.fp16_run)
+
+    cache = []
+    for epoch in range(epoch_str, hps.train.epochs + 1):
+        if rank == 0:
+            train_and_evaluate(
+                rank,
+                epoch,
+                hps,
+                [net_g, net_d],
+                [optim_g, optim_d],
+                [scheduler_g, scheduler_d],
+                scaler,
+                [train_loader, None],
+                logger,
+                [writer, writer_eval],
+                cache,
+            )
+        else:
+            train_and_evaluate(
+                rank,
+                epoch,
+                hps,
+                [net_g, net_d],
+                [optim_g, optim_d],
+                [scheduler_g, scheduler_d],
+                scaler,
+                [train_loader, None],
+                None,
+                None,
+                cache,
+            )
+        scheduler_g.step()
+        scheduler_d.step()
+
+
+def train_and_evaluate(
+    rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers, cache
+):
+    net_g, net_d = nets
+    optim_g, optim_d = optims
+    train_loader, eval_loader = loaders
+    if writers is not None:
+        writer, writer_eval = writers
+
+    train_loader.batch_sampler.set_epoch(epoch)
+    global global_step
+
+    net_g.train()
+    net_d.train()
+    # The first epoch streams from the DataLoader and, when if_cache_data_in_gpu is set,
+    # fills the cache with the whole training set; this branch is also taken every epoch
+    # when GPU caching is disabled.
+    if cache == [] or hps.if_cache_data_in_gpu == False:
+        # print("caching")
+        for batch_idx, info in enumerate(train_loader):
+            if hps.if_f0 == 1:
+                phone, phone_lengths, pitch, pitchf, spec, spec_lengths, wave, wave_lengths, sid = info
+            else:
+                phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info
+            phone, phone_lengths = phone.cuda(rank, non_blocking=True), phone_lengths.cuda(rank, non_blocking=True)
+            if hps.if_f0 == 1:
+                pitch, pitchf = pitch.cuda(rank, non_blocking=True), pitchf.cuda(rank, non_blocking=True)
+            sid = sid.cuda(rank, non_blocking=True)
+            spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True)
+            wave, wave_lengths = wave.cuda(rank, non_blocking=True), wave_lengths.cuda(rank, non_blocking=True)
+            if hps.if_cache_data_in_gpu == True:
+                if hps.if_f0 == 1:
+                    cache.append((batch_idx, (phone, phone_lengths, pitch, pitchf, spec, spec_lengths, wave, wave_lengths, sid)))
+                else:
+                    cache.append((batch_idx, (phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid)))
+            with autocast(enabled=hps.train.fp16_run):
+                if hps.if_f0 == 1:
+                    y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(
+                        phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid
+                    )
+                else:
+                    y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(
+                        phone, phone_lengths, spec, spec_lengths, sid
+                    )
+                mel = spec_to_mel_torch(
+                    spec,
+                    hps.data.filter_length,
+                    hps.data.n_mel_channels,
+                    hps.data.sampling_rate,
+                    hps.data.mel_fmin,
+                    hps.data.mel_fmax,
+                )
+                y_mel = commons.slice_segments(
+                    mel, ids_slice, hps.train.segment_size // hps.data.hop_length
+                )
+                with autocast(enabled=False):
+                    y_hat_mel = mel_spectrogram_torch(
+                        y_hat.float().squeeze(1),
+                        hps.data.filter_length,
+                        hps.data.n_mel_channels,
+                        hps.data.sampling_rate,
+                        hps.data.hop_length,
+                        hps.data.win_length,
+                        hps.data.mel_fmin,
+                        hps.data.mel_fmax,
+                    )
+                if hps.train.fp16_run == True:
+                    y_hat_mel = y_hat_mel.half()
+                wave = commons.slice_segments(
+                    wave, ids_slice * hps.data.hop_length, hps.train.segment_size
+                )  # slice
+
+                # Discriminator
+                y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
+                with autocast(enabled=False):
+                    loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(
+                        y_d_hat_r, y_d_hat_g
+                    )
+            optim_d.zero_grad()
+            scaler.scale(loss_disc).backward()
+            scaler.unscale_(optim_d)
+            grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
+            scaler.step(optim_d)
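+
+            # Generator update: adversarial, feature-matching, mel-reconstruction (L1)
+            # and KL losses are summed into loss_gen_all.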
+            with autocast(enabled=hps.train.fp16_run):
+                # Generator
+                y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
+                with autocast(enabled=False):
+                    loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
+                    loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
+                    loss_fm = feature_loss(fmap_r, fmap_g)
+                    loss_gen, losses_gen = generator_loss(y_d_hat_g)
+                    loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
+            optim_g.zero_grad()
+            scaler.scale(loss_gen_all).backward()
+            scaler.unscale_(optim_g)
+            grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
+            scaler.step(optim_g)
+            scaler.update()
+
+            if rank == 0:
+                if global_step % hps.train.log_interval == 0:
+                    lr = optim_g.param_groups[0]["lr"]
+                    logger.info(
+                        "Train Epoch: {} [{:.0f}%]".format(
+                            epoch, 100.0 * batch_idx / len(train_loader)
+                        )
+                    )
+                    # Clamp outliers so the TensorBoard curves stay readable
+                    if loss_mel > 50:
+                        loss_mel = 50
+                    if loss_kl > 5:
+                        loss_kl = 5
+
+                    logger.info([global_step, lr])
+                    logger.info(
+                        f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f}, loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}"
+                    )
+                    scalar_dict = {
+                        "loss/g/total": loss_gen_all,
+                        "loss/d/total": loss_disc,
+                        "learning_rate": lr,
+                        "grad_norm_d": grad_norm_d,
+                        "grad_norm_g": grad_norm_g,
+                    }
+                    scalar_dict.update(
+                        {"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl}
+                    )
+
+                    scalar_dict.update(
+                        {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
+                    )
+                    scalar_dict.update(
+                        {"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}
+                    )
+                    scalar_dict.update(
+                        {"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}
+                    )
+                    image_dict = {
+                        "slice/mel_org": utils.plot_spectrogram_to_numpy(
+                            y_mel[0].data.cpu().numpy()
+                        ),
+                        "slice/mel_gen": utils.plot_spectrogram_to_numpy(
+                            y_hat_mel[0].data.cpu().numpy()
+                        ),
+                        "all/mel": utils.plot_spectrogram_to_numpy(
+                            mel[0].data.cpu().numpy()
+                        ),
+                    }
+                    utils.summarize(
+                        writer=writer,
+                        global_step=global_step,
+                        images=image_dict,
+                        scalars=scalar_dict,
+                    )
+            global_step += 1
+        # if global_step % hps.train.eval_interval == 0:
+        if epoch % hps.save_every_epoch == 0:
+            if hps.if_latest == 0:
+                utils.save_checkpoint(
+                    net_g,
+                    optim_g,
+                    hps.train.learning_rate,
+                    epoch,
+                    os.path.join(hps.model_dir, "G_{}.pth".format(global_step)),
+                )
+                utils.save_checkpoint(
+                    net_d,
+                    optim_d,
+                    hps.train.learning_rate,
+                    epoch,
+                    os.path.join(hps.model_dir, "D_{}.pth".format(global_step)),
+                )
+            else:
+                utils.save_checkpoint(
+                    net_g,
+                    optim_g,
+                    hps.train.learning_rate,
+                    epoch,
+                    os.path.join(hps.model_dir, "G_{}.pth".format(2333333)),
+                )
+                utils.save_checkpoint(
+                    net_d,
+                    optim_d,
+                    hps.train.learning_rate,
+                    epoch,
+                    os.path.join(hps.model_dir, "D_{}.pth".format(2333333)),
+                )
+
+    else:  # subsequent epochs train directly from the shuffled GPU cache
+        shuffle(cache)
+        # print("using cache")
+        for batch_idx, info in cache:
+            if hps.if_f0 == 1:
+                phone, phone_lengths, pitch, pitchf, spec, spec_lengths, wave, wave_lengths, sid = info
+            else:
+                phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info
+            with autocast(enabled=hps.train.fp16_run):
+                if hps.if_f0 == 1:
+                    y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(
+                        phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid
+                    )
+                else:
+                    y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(
+                        phone, phone_lengths, spec, spec_lengths, sid
+                    )
+                mel = spec_to_mel_torch(
+                    spec,
+                    hps.data.filter_length,
+                    hps.data.n_mel_channels,
+                    hps.data.sampling_rate,
+                    hps.data.mel_fmin,
+                    hps.data.mel_fmax,
+                )
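+                # Slice the reference mel to the same random segment the generator
+                # decoded (ids_slice) before computing the mel-reconstruction loss.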
+                y_mel = commons.slice_segments(
+                    mel, ids_slice, hps.train.segment_size // hps.data.hop_length
+                )
+                with autocast(enabled=False):
+                    y_hat_mel = mel_spectrogram_torch(
+                        y_hat.float().squeeze(1),
+                        hps.data.filter_length,
+                        hps.data.n_mel_channels,
+                        hps.data.sampling_rate,
+                        hps.data.hop_length,
+                        hps.data.win_length,
+                        hps.data.mel_fmin,
+                        hps.data.mel_fmax,
+                    )
+                if hps.train.fp16_run == True:
+                    y_hat_mel = y_hat_mel.half()
+                wave = commons.slice_segments(
+                    wave, ids_slice * hps.data.hop_length, hps.train.segment_size
+                )  # slice
+
+                # Discriminator
+                y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
+                with autocast(enabled=False):
+                    loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(
+                        y_d_hat_r, y_d_hat_g
+                    )
+            optim_d.zero_grad()
+            scaler.scale(loss_disc).backward()
+            scaler.unscale_(optim_d)
+            grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
+            scaler.step(optim_d)
+
+            with autocast(enabled=hps.train.fp16_run):
+                # Generator
+                y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
+                with autocast(enabled=False):
+                    loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
+                    loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
+                    loss_fm = feature_loss(fmap_r, fmap_g)
+                    loss_gen, losses_gen = generator_loss(y_d_hat_g)
+                    loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
+            optim_g.zero_grad()
+            scaler.scale(loss_gen_all).backward()
+            scaler.unscale_(optim_g)
+            grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
+            scaler.step(optim_g)
+            scaler.update()
+
+            if rank == 0:
+                if global_step % hps.train.log_interval == 0:
+                    lr = optim_g.param_groups[0]["lr"]
+                    logger.info(
+                        "Train Epoch: {} [{:.0f}%]".format(
+                            epoch, 100.0 * batch_idx / len(train_loader)
+                        )
+                    )
+                    # Clamp outliers so the TensorBoard curves stay readable
+                    if loss_mel > 50:
+                        loss_mel = 50
+                    if loss_kl > 5:
+                        loss_kl = 5
+
+                    logger.info([global_step, lr])
+                    logger.info(
+                        f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f}, loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}"
+                    )
+                    scalar_dict = {
+                        "loss/g/total": loss_gen_all,
+                        "loss/d/total": loss_disc,
+                        "learning_rate": lr,
+                        "grad_norm_d": grad_norm_d,
+                        "grad_norm_g": grad_norm_g,
+                    }
+                    scalar_dict.update(
+                        {"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl}
+                    )
+
+                    scalar_dict.update(
+                        {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
+                    )
+                    scalar_dict.update(
+                        {"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}
+                    )
+                    scalar_dict.update(
+                        {"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}
+                    )
+                    image_dict = {
+                        "slice/mel_org": utils.plot_spectrogram_to_numpy(
+                            y_mel[0].data.cpu().numpy()
+                        ),
+                        "slice/mel_gen": utils.plot_spectrogram_to_numpy(
+                            y_hat_mel[0].data.cpu().numpy()
+                        ),
+                        "all/mel": utils.plot_spectrogram_to_numpy(
+                            mel[0].data.cpu().numpy()
+                        ),
+                    }
+                    utils.summarize(
+                        writer=writer,
+                        global_step=global_step,
+                        images=image_dict,
+                        scalars=scalar_dict,
+                    )
+            global_step += 1
+        # if global_step % hps.train.eval_interval == 0:
+        if epoch % hps.save_every_epoch == 0:
+            if hps.if_latest == 0:
+                utils.save_checkpoint(
+                    net_g,
+                    optim_g,
+                    hps.train.learning_rate,
+                    epoch,
+                    os.path.join(hps.model_dir, "G_{}.pth".format(global_step)),
+                )
+                utils.save_checkpoint(
+                    net_d,
+                    optim_d,
+                    hps.train.learning_rate,
+                    epoch,
+                    os.path.join(hps.model_dir, "D_{}.pth".format(global_step)),
+                )
+            else:
+                utils.save_checkpoint(
+                    net_g,
+                    optim_g,
+                    hps.train.learning_rate,
+                    epoch,
+                    os.path.join(hps.model_dir, "G_{}.pth".format(2333333)),
+                )
+                utils.save_checkpoint(
+                    net_d,
+                    optim_d,
+                    hps.train.learning_rate,
+                    epoch,
+                    os.path.join(hps.model_dir, "D_{}.pth".format(2333333)),
+                )
+
+    # End-of-epoch bookkeeping: log progress and, once hps.total_epoch is reached,
+    # export the final generator weights via process_ckpt.savee and exit.
+    if rank == 0:
+        logger.info("====> Epoch: {}".format(epoch))
+    if epoch >= hps.total_epoch:
+        if rank == 0:
+            logger.info("Training is done. The program is closed.")
+            from process_ckpt import savee  # signature: savee(ckpt, sr, if_f0, name, epoch)
+            if hasattr(net_g, "module"):
+                ckpt = net_g.module.state_dict()
+            else:
+                ckpt = net_g.state_dict()
+            print("saving final ckpt:", savee(ckpt, hps.sample_rate, hps.if_f0, hps.name, epoch))
+        os._exit(2333333)
+
+
+if __name__ == "__main__":
+    main()