diff --git a/config.py b/config.py
new file mode 100644
index 0000000..f379ea7
--- /dev/null
+++ b/config.py
@@ -0,0 +1,38 @@
+############ Offline VC parameters
+inp_root=r"白鹭霜华长条"# every audio file under this input directory gets converted; do not put non-audio files here
+opt_root=r"opt"# output directory
+f0_up_key=0# pitch shift in semitones (integer); +12 for male-to-female, -12 for female-to-male
+person=r"weights\洛天依v3.pt"# only 洛天依v3 is available for now
+############ Hardware parameters
+device = "cuda:0"# "cuda:x" or "cpu"; x is the GPU index; only NVIDIA GPUs are supported for acceleration
+is_half=True# just keep True on 9/10/20/30/40-series GPUs; quality is unaffected, and GPUs from the 20 series on get a speedup
+n_cpu=0# 0 (default) uses every thread; set a number to cap CPU usage
+############ Do not edit anything below this line
+import torch
+if(torch.cuda.is_available()==False):
+ print("没有发现支持的N卡,使用CPU进行推理")
+ device="cpu"
+ is_half=False
+if(device!="cpu"):
+ gpu_name=torch.cuda.get_device_name(int(device.split(":")[-1]))
+ if("16"in gpu_name or "MX"in gpu_name):
+ print("16系显卡/MX系显卡强制单精度")
+ is_half=False
+from multiprocessing import cpu_count
+if(n_cpu==0):n_cpu=cpu_count()
+if(is_half==True):
+    # settings for 6GB VRAM
+ x_pad = 3
+ x_query = 10
+ x_center = 60
+ x_max = 65
+else:
+    # settings for 5GB VRAM
+ x_pad = 1
+ # x_query = 6
+ # x_center = 30
+ # x_max = 32
+    # settings for 6GB VRAM
+ x_query = 6
+ x_center = 38
+ x_max = 41
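+# The x_* values above are presumably the chunked-inference window parameters (in
+# seconds) used during conversion: x_pad = padding around each chunk, x_query = query
+# window length, x_center = query window center, x_max = maximum chunk length.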
diff --git a/extract_f0_print.py b/extract_f0_print.py
new file mode 100644
index 0000000..9631b8d
--- /dev/null
+++ b/extract_f0_print.py
@@ -0,0 +1,120 @@
+import os,traceback,sys,parselmouth
+import librosa
+import pyworld
+from scipy.io import wavfile
+import numpy as np,logging
+logging.getLogger('numba').setLevel(logging.WARNING)
+from multiprocessing import Process
+
+exp_dir = sys.argv[1]
+f = open("%s/extract_f0_feature.log"%exp_dir, "a+")
+def printt(strr):
+ print(strr)
+ f.write("%s\n" % strr)
+ f.flush()
+
+n_p = int(sys.argv[2])
+f0method = sys.argv[3]
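+# Example invocation (this mirrors how infer-web.py calls the script):
+#   python extract_f0_print.py logs/mi-test 4 harvest
+# i.e. <experiment log dir> <number of processes> <f0 method: pm|harvest|dio>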
+
+class FeatureInput(object):
+ def __init__(self, samplerate=16000, hop_size=160):
+ self.fs = samplerate
+ self.hop = hop_size
+
+ self.f0_bin = 256
+ self.f0_max = 1100.0
+ self.f0_min = 50.0
+ self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
+ self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
+
+ def compute_f0(self, path,f0_method):
+        x, sr = librosa.load(path, sr=self.fs)
+ p_len=x.shape[0]//self.hop
+ assert sr == self.fs
+ if(f0_method=="pm"):
+ time_step = 160 / 16000 * 1000
+ f0_min = 50
+ f0_max = 1100
+ f0 = parselmouth.Sound(x, sr).to_pitch_ac(
+ time_step=time_step / 1000, voicing_threshold=0.6,
+ pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
+ pad_size=(p_len - len(f0) + 1) // 2
+ if(pad_size>0 or p_len - len(f0) - pad_size>0):
+ f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
+ elif(f0_method=="harvest"):
+ f0, t = pyworld.harvest(
+ x.astype(np.double),
+ fs=sr,
+ f0_ceil=1100,
+ frame_period=1000 * self.hop / sr,
+ )
+ f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
+ elif(f0_method=="dio"):
+ f0, t = pyworld.dio(
+ x.astype(np.double),
+ fs=sr,
+ f0_ceil=1100,
+ frame_period=1000 * self.hop / sr,
+ )
+ f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
+ return f0
+
+ def coarse_f0(self, f0):
+ f0_mel = 1127 * np.log(1 + f0 / 700)
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
+ self.f0_bin - 2
+ ) / (self.f0_mel_max - self.f0_mel_min) + 1
+
+ # use 0 or 1
+ f0_mel[f0_mel <= 1] = 1
+ f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
+        f0_coarse = np.rint(f0_mel).astype(int)
+ assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
+ f0_coarse.max(),
+ f0_coarse.min(),
+ )
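+        # Rough worked example (approximate values): f0 = 440 Hz gives
+        # f0_mel = 1127*ln(1+440/700) ~ 550, which maps to roughly bin 122
+        # of the 1..255 coarse range.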
+ return f0_coarse
+
+ def go(self,paths,f0_method):
+ if (len(paths) == 0): printt("no-f0-todo")
+ else:
+ printt("todo-f0-%s"%len(paths))
+            n=max(len(paths)//5,1)# print at most 5 progress lines per process
+ for idx,(inp_path,opt_path1,opt_path2) in enumerate(paths):
+ try:
+ if(idx%n==0):printt("f0ing,now-%s,all-%s,-%s"%(idx,len(paths),inp_path))
+ if(os.path.exists(opt_path1+".npy")==True and os.path.exists(opt_path2+".npy")==True):continue
+ featur_pit = self.compute_f0(inp_path,f0_method)
+ np.save(opt_path2,featur_pit,allow_pickle=False,)#nsf
+ coarse_pit = self.coarse_f0(featur_pit)
+ np.save(opt_path1,coarse_pit,allow_pickle=False,)#ori
+ except:
+ printt("f0fail-%s-%s-%s" % (idx, inp_path,traceback.format_exc()))
+
+if __name__=='__main__':
+ # exp_dir=r"E:\codes\py39\dataset\mi-test"
+ # n_p=16
+ # f = open("%s/log_extract_f0.log"%exp_dir, "w")
+ printt(sys.argv)
+ featureInput = FeatureInput()
+ paths=[]
+ inp_root= "%s/1_16k_wavs"%(exp_dir)
+ opt_root1="%s/2a_f0"%(exp_dir)
+ opt_root2="%s/2b-f0nsf"%(exp_dir)
+
+ os.makedirs(opt_root1,exist_ok=True)
+ os.makedirs(opt_root2,exist_ok=True)
+ for name in sorted(list(os.listdir(inp_root))):
+ inp_path="%s/%s"%(inp_root,name)
+ if ("spec" in inp_path): continue
+ opt_path1="%s/%s"%(opt_root1,name)
+ opt_path2="%s/%s"%(opt_root2,name)
+ paths.append([inp_path,opt_path1,opt_path2])
+
+ ps=[]
+ for i in range(n_p):
+ p=Process(target=featureInput.go,args=(paths[i::n_p],f0method,))
+ p.start()
+ ps.append(p)
+ for p in ps:
+ p.join()
diff --git a/extract_feature_print.py b/extract_feature_print.py
new file mode 100644
index 0000000..7a0ff4d
--- /dev/null
+++ b/extract_feature_print.py
@@ -0,0 +1,84 @@
+import os,sys,traceback
+n_part=int(sys.argv[1])
+i_part=int(sys.argv[2])
+i_gpu=sys.argv[3]
+exp_dir=sys.argv[4]
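+# Example invocation (this mirrors how infer-web.py calls the script):
+#   python extract_feature_print.py 2 0 0 logs/mi-test
+# i.e. <total parts> <this part's index> <GPU id> <experiment log dir>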
+os.environ["CUDA_VISIBLE_DEVICES"]=str(i_gpu)
+
+import torch
+import torch.nn.functional as F
+import soundfile as sf
+import numpy as np
+import joblib
+from fairseq import checkpoint_utils
+import pdb
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+f = open("%s/extract_f0_feature.log"%exp_dir, "a+")
+def printt(strr):
+ print(strr)
+ f.write("%s\n" % strr)
+ f.flush()
+printt(sys.argv)
+# model_path = "/bili-coeus/jupyter/jupyterhub-liujing04/speech/pretrain/ContentVec_legacy500.pt"
+model_path = "hubert_base.pt"
+
+printt(exp_dir)
+wavPath = "%s/1_16k_wavs"%exp_dir
+outPath = "%s/3_feature256"%exp_dir
+os.makedirs(outPath,exist_ok=True)
+# wave must be 16k, hop_size=320
+def readwave(wav_path, normalize=False):
+ wav, sr = sf.read(wav_path)
+ assert sr == 16000
+ feats = torch.from_numpy(wav).float()
+ if feats.dim() == 2: # double channels
+ feats = feats.mean(-1)
+ assert feats.dim() == 1, feats.dim()
+ if normalize:
+ with torch.no_grad():
+ feats = F.layer_norm(feats, feats.shape)
+ feats = feats.view(1, -1)
+ return feats
+# HuBERT model
+printt("load model(s) from {}".format(model_path))
+models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+ [model_path],
+ suffix="",
+)
+model = models[0]
+model = model.to(device)
+model = model.half()
+model.eval()
+
+todo=sorted(list(os.listdir(wavPath)))[i_part::n_part]
+n = max(1,len(todo) // 10) # print at most ten progress lines
+if(len(todo)==0):printt("no-feature-todo")
+else:
+ printt("all-feature-%s"%len(todo))
+ for idx,file in enumerate(todo):
+ try:
+ if file.endswith(".wav"):
+ wav_path = "%s/%s"%(wavPath,file)
+                out_path = "%s/%s"%(outPath,file.replace(".wav",".npy"))
+
+ if(os.path.exists(out_path)):continue
+
+ feats = readwave(wav_path, normalize=saved_cfg.task.normalize)
+ padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+ inputs = {
+ "source": feats.half().to(device),
+ "padding_mask": padding_mask.to(device),
+ "output_layer": 9, # layer 9
+ }
+ with torch.no_grad():
+ logits = model.extract_features(**inputs)
+ feats = model.final_proj(logits[0])
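+                    # final_proj maps the layer-9 HuBERT hidden states down to the
+                    # 256-dim features that are saved under 3_feature256.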
+
+ feats = feats.squeeze(0).float().cpu().numpy()
+ # feats = np.repeat(feats, 2,0) # 20ms -> 10ms
+ np.save(out_path, feats, allow_pickle=False)
+                if (idx % n == 0):printt("now-%s,all-%s,%s,%s"%(idx,len(todo),file,feats.shape))
+ except:
+ printt(traceback.format_exc())
+ printt("all-feature-done")
\ No newline at end of file
diff --git a/infer-web.py b/infer-web.py
new file mode 100644
index 0000000..cf0f242
--- /dev/null
+++ b/infer-web.py
@@ -0,0 +1,630 @@
+from multiprocessing import cpu_count
+import threading
+from time import sleep
+from subprocess import Popen,PIPE,run as runn
+import torch, pdb, os,traceback,sys,warnings,shutil,numpy as np,faiss
+# check whether there is an NVIDIA GPU that can be used for training and accelerated inference
+ncpu=cpu_count()
+ngpu=torch.cuda.device_count()
+gpu_infos=[]
+if(torch.cuda.is_available()==False or ngpu==0):if_gpu_ok=False
+else:
+ if_gpu_ok = False
+ for i in range(ngpu):
+ gpu_name=torch.cuda.get_device_name(i)
+ if("16"in gpu_name or "MX"in gpu_name):continue
+ if("10"in gpu_name or "20"in gpu_name or "30"in gpu_name or "40"in gpu_name or "A50"in gpu_name.upper() or "70"in gpu_name or "80"in gpu_name or "90"in gpu_name or "M4"in gpu_name or "T4"in gpu_name or "TITAN"in gpu_name.upper()):#A10#A100#V100#A40#P40#M40#K80
+            if_gpu_ok=True# at least one usable NVIDIA GPU
+ gpu_infos.append("%s\t%s"%(i,gpu_name))
+gpu_info="\n".join(gpu_infos)if if_gpu_ok==True and len(gpu_infos)>0 else "很遗憾您这没有能用的显卡来支持您训练"
+gpus="-".join([i.split("\t")[0]for i in gpu_infos])
+now_dir=os.getcwd()
+sys.path.append(now_dir)
+tmp=os.path.join(now_dir,"TEMP")
+shutil.rmtree(tmp,ignore_errors=True)
+os.makedirs(tmp,exist_ok=True)
+os.makedirs(os.path.join(now_dir,"logs"),exist_ok=True)
+os.makedirs(os.path.join(now_dir,"weights"),exist_ok=True)
+os.environ["TEMP"]=tmp
+warnings.filterwarnings("ignore")
+torch.manual_seed(114514)
+from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
+from scipy.io import wavfile
+from fairseq import checkpoint_utils
+import gradio as gr
+import librosa
+import logging
+from vc_infer_pipeline import VC
+import soundfile as sf
+from config import is_half,device
+from infer_uvr5 import _audio_pre_
+from my_utils import load_audio
+from train.process_ckpt import show_info,change_info,merge,extract_small_model
+# from trainset_preprocess_pipeline import PreProcess
+logging.getLogger('numba').setLevel(logging.WARNING)
+
+class ToolButton(gr.Button, gr.components.FormComponent):
+ """Small button with single emoji as text, fits inside gradio forms"""
+ def __init__(self, **kwargs):
+ super().__init__(variant="tool", **kwargs)
+ def get_block_name(self):
+ return "button"
+
+hubert_model=None
+def load_hubert():
+ global hubert_model
+ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(["hubert_base.pt"],suffix="",)
+ hubert_model = models[0]
+ hubert_model = hubert_model.to(device)
+ if(is_half):hubert_model = hubert_model.half()
+ else:hubert_model = hubert_model.float()
+ hubert_model.eval()
+
+weight_root="weights"
+weight_uvr5_root="uvr5_weights"
+names=[]
+for name in os.listdir(weight_root):names.append(name)
+uvr5_names=[]
+for name in os.listdir(weight_uvr5_root):uvr5_names.append(name.replace(".pth",""))
+
+def vc_single(sid,input_audio,f0_up_key,f0_file,f0_method,file_index,file_big_npy,index_rate):#spk_item, input_audio0, vc_transform0,f0_file,f0method0
+ global tgt_sr,net_g,vc,hubert_model
+ if input_audio is None:return "You need to upload an audio", None
+ f0_up_key = int(f0_up_key)
+ try:
+ audio=load_audio(input_audio,16000)
+ times = [0, 0, 0]
+ if(hubert_model==None):load_hubert()
+ if_f0 = cpt.get("f0", 1)
+ audio_opt=vc.pipeline(hubert_model,net_g,sid,audio,times,f0_up_key,f0_method,file_index,file_big_npy,index_rate,if_f0,f0_file=f0_file)
+ print(times)
+ return "Success", (tgt_sr, audio_opt)
+ except:
+ info=traceback.format_exc()
+ print(info)
+ return info,(None,None)
+
+def vc_multi(sid,dir_path,opt_root,paths,f0_up_key,f0_method,file_index,file_big_npy,index_rate):
+ try:
+        dir_path=dir_path.strip(" ")# strip spaces that users may copy in along with the path
+ opt_root=opt_root.strip(" ")
+ os.makedirs(opt_root, exist_ok=True)
+ try:
+ if(dir_path!=""):paths=[os.path.join(dir_path,name)for name in os.listdir(dir_path)]
+ else:paths=[path.name for path in paths]
+ except:
+ traceback.print_exc()
+ paths = [path.name for path in paths]
+ infos=[]
+ for path in paths:
+ info,opt=vc_single(sid,path,f0_up_key,None,f0_method,file_index,file_big_npy,index_rate)
+ if(info=="Success"):
+ try:
+ tgt_sr,audio_opt=opt
+ wavfile.write("%s/%s" % (opt_root, os.path.basename(path)), tgt_sr, audio_opt)
+ except:
+ info=traceback.format_exc()
+ infos.append("%s->%s"%(os.path.basename(path),info))
+ yield "\n".join(infos)
+ yield "\n".join(infos)
+ except:
+ yield traceback.format_exc()
+
+def uvr(model_name,inp_root,save_root_vocal,paths,save_root_ins):
+ infos = []
+ try:
+ inp_root = inp_root.strip(" ").strip("\n")
+ save_root_vocal = save_root_vocal.strip(" ").strip("\n")
+ save_root_ins = save_root_ins.strip(" ").strip("\n")
+ pre_fun = _audio_pre_(model_path=os.path.join(weight_uvr5_root,model_name+".pth"), device=device, is_half=is_half)
+ if (inp_root != ""):paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)]
+ else:paths = [path.name for path in paths]
+        for inp_path in paths:
+ try:
+ pre_fun._path_audio_(inp_path , save_root_ins,save_root_vocal)
+ infos.append("%s->Success"%(os.path.basename(inp_path)))
+ yield "\n".join(infos)
+ except:
+ infos.append("%s->%s" % (os.path.basename(inp_path),traceback.format_exc()))
+ yield "\n".join(infos)
+ except:
+ infos.append(traceback.format_exc())
+ yield "\n".join(infos)
+ finally:
+ try:
+ del pre_fun.model
+ del pre_fun
+ except:
+ traceback.print_exc()
+ print("clean_empty_cache")
+ torch.cuda.empty_cache()
+ yield "\n".join(infos)
+
+# only one voice model can be loaded globally per tab
+def get_vc(sid):
+ global n_spk,tgt_sr,net_g,vc,cpt
+ if(sid==""):
+ global hubert_model
+ print("clean_empty_cache")
+ del net_g, n_spk, vc, hubert_model,tgt_sr#,cpt
+        hubert_model=net_g=n_spk=vc=tgt_sr=None
+ torch.cuda.empty_cache()
+        ### without the juggling below, GPU memory is not fully released
+ if_f0 = cpt.get("f0", 1)
+ if (if_f0 == 1):
+ net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
+ else:
+ net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+ del net_g,cpt
+ torch.cuda.empty_cache()
+ cpt=None
+ return {"visible": False, "__type__": "update"}
+ person = "%s/%s" % (weight_root, sid)
+ print("loading %s"%person)
+ cpt = torch.load(person, map_location="cpu")
+ tgt_sr = cpt["config"][-1]
+ cpt["config"][-3]=cpt["weight"]["emb_g.weight"].shape[0]#n_spk
+ if_f0=cpt.get("f0",1)
+ if(if_f0==1):
+ net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
+ else:
+ net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+ del net_g.enc_q
+    print(net_g.load_state_dict(cpt["weight"], strict=False)) # oddly, memory cannot be fully released without this line
+ net_g.eval().to(device)
+ if (is_half):net_g = net_g.half()
+ else:net_g = net_g.float()
+ vc = VC(tgt_sr, device, is_half)
+ n_spk=cpt["config"][-3]
+ return {"visible": True,"maximum": n_spk, "__type__": "update"}
+
+def change_choices():return {"choices": sorted(list(os.listdir(weight_root))), "__type__": "update"}
+def clean():return {"value": "", "__type__": "update"}
+def change_f0(if_f0_3,sr2):#np7, f0method8,pretrained_G14,pretrained_D15
+ if(if_f0_3=="是"):return {"visible": True, "__type__": "update"},{"visible": True, "__type__": "update"},"pretrained/f0G%s.pth"%sr2,"pretrained/f0D%s.pth"%sr2
+ return {"visible": False, "__type__": "update"}, {"visible": False, "__type__": "update"},"pretrained/G%s.pth"%sr2,"pretrained/D%s.pth"%sr2
+
+sr_dict={
+ "32k":32000,
+ "40k":40000,
+ "48k":48000,
+}
+
+def if_done(done,p):
+ while 1:
+ if(p.poll()==None):sleep(0.5)
+ else:break
+ done[0]=True
+
+
+def if_done_multi(done,ps):
+ while 1:
+        # poll()==None means the process has not finished yet
+        # keep waiting as long as any one process is still running
+ flag=1
+ for p in ps:
+ if(p.poll()==None):
+ flag = 0
+ sleep(0.5)
+ break
+ if(flag==1):break
+ done[0]=True
+
+def preprocess_dataset(trainset_dir,exp_dir,sr,n_p=ncpu):
+ sr=sr_dict[sr]
+ os.makedirs("%s/logs/%s"%(now_dir,exp_dir),exist_ok=True)
+ f = open("%s/logs/%s/preprocess.log"%(now_dir,exp_dir), "w")
+ f.close()
+ cmd="python trainset_preprocess_pipeline_print.py %s %s %s %s/logs/%s"%(trainset_dir,sr,n_p,now_dir,exp_dir)
+ print(cmd)
+ p = Popen(cmd, shell=True)#, stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir
+    ### gradio only surfaces Popen output once the process has fully finished (outside gradio it streams line by line), so we periodically read a log file instead
+ done=[False]
+ threading.Thread(target=if_done,args=(done,p,)).start()
+ while(1):
+ with open("%s/logs/%s/preprocess.log"%(now_dir,exp_dir),"r")as f:yield(f.read())
+ sleep(1)
+ if(done[0]==True):break
+ with open("%s/logs/%s/preprocess.log"%(now_dir,exp_dir), "r")as f:log = f.read()
+ print(log)
+ yield log
+#but2.click(extract_f0,[gpus6,np7,f0method8,if_f0_3,trainset_dir4],[info2])
+def extract_f0_feature(gpus,n_p,f0method,if_f0,exp_dir):
+ gpus=gpus.split("-")
+ os.makedirs("%s/logs/%s"%(now_dir,exp_dir),exist_ok=True)
+ f = open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir), "w")
+ f.close()
+ if(if_f0=="是"):
+ cmd="python extract_f0_print.py %s/logs/%s %s %s"%(now_dir,exp_dir,n_p,f0method)
+ print(cmd)
+ p = Popen(cmd, shell=True,cwd=now_dir)#, stdin=PIPE, stdout=PIPE,stderr=PIPE
+        ### gradio only surfaces Popen output once the process has fully finished (outside gradio it streams line by line), so we periodically read a log file instead
+ done=[False]
+ threading.Thread(target=if_done,args=(done,p,)).start()
+ while(1):
+ with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir),"r")as f:yield(f.read())
+ sleep(1)
+ if(done[0]==True):break
+ with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir), "r")as f:log = f.read()
+ print(log)
+ yield log
+    #### spawn a separate process for each part (one per GPU)
+ '''
+ n_part=int(sys.argv[1])
+ i_part=int(sys.argv[2])
+ i_gpu=sys.argv[3]
+ exp_dir=sys.argv[4]
+ os.environ["CUDA_VISIBLE_DEVICES"]=str(i_gpu)
+ '''
+ leng=len(gpus)
+ ps=[]
+ for idx,n_g in enumerate(gpus):
+ cmd="python extract_feature_print.py %s %s %s %s/logs/%s"%(leng,idx,n_g,now_dir,exp_dir)
+ print(cmd)
+ p = Popen(cmd, shell=True, cwd=now_dir)#, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
+ ps.append(p)
+    ### gradio only surfaces Popen output once the process has fully finished (outside gradio it streams line by line), so we periodically read a log file instead
+ done = [False]
+ threading.Thread(target=if_done_multi, args=(done, ps,)).start()
+ while (1):
+ with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir), "r")as f:yield (f.read())
+ sleep(1)
+ if (done[0] == True): break
+ with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir), "r")as f:log = f.read()
+ print(log)
+ yield log
+def change_sr2(sr2,if_f0_3):
+ if(if_f0_3=="是"):return "pretrained/f0G%s.pth"%sr2,"pretrained/f0D%s.pth"%sr2
+ else:return "pretrained/G%s.pth"%sr2,"pretrained/D%s.pth"%sr2
+#but3.click(click_train,[exp_dir1,sr2,if_f0_3,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16])
+def click_train(exp_dir1,sr2,if_f0_3,spk_id5,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16,if_cache_gpu17):
+    # build the filelist
+ exp_dir="%s/logs/%s"%(now_dir,exp_dir1)
+ os.makedirs(exp_dir,exist_ok=True)
+ gt_wavs_dir="%s/0_gt_wavs"%(exp_dir)
+ co256_dir="%s/3_feature256"%(exp_dir)
+ if(if_f0_3=="是"):
+ f0_dir = "%s/2a_f0" % (exp_dir)
+ f0nsf_dir="%s/2b-f0nsf"%(exp_dir)
+ names=set([name.split(".")[0]for name in os.listdir(gt_wavs_dir)])&set([name.split(".")[0]for name in os.listdir(co256_dir)])&set([name.split(".")[0]for name in os.listdir(f0_dir)])&set([name.split(".")[0]for name in os.listdir(f0nsf_dir)])
+ else:
+ names=set([name.split(".")[0]for name in os.listdir(gt_wavs_dir)])&set([name.split(".")[0]for name in os.listdir(co256_dir)])
+ opt=[]
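+    # Each filelist line has the form (with f0):
+    #   0_gt_wavs/<name>.wav|3_feature256/<name>.npy|2a_f0/<name>.wav.npy|2b-f0nsf/<name>.wav.npy|<speaker_id>
+    # and without f0: 0_gt_wavs/<name>.wav|3_feature256/<name>.npy|<speaker_id>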
+ for name in names:
+ if (if_f0_3 == "是"):
+ opt.append("%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s"%(gt_wavs_dir.replace("\\","\\\\"),name,co256_dir.replace("\\","\\\\"),name,f0_dir.replace("\\","\\\\"),name,f0nsf_dir.replace("\\","\\\\"),name,spk_id5))
+ else:
+ opt.append("%s/%s.wav|%s/%s.npy|%s"%(gt_wavs_dir.replace("\\","\\\\"),name,co256_dir.replace("\\","\\\\"),name,spk_id5))
+ with open("%s/filelist.txt"%exp_dir,"w")as f:f.write("\n".join(opt))
+ print("write filelist done")
+    # no config file needs to be generated
+ # cmd = "python train_nsf_sim_cache_sid_load_pretrain.py -e mi-test -sr 40k -f0 1 -bs 4 -g 0 -te 10 -se 5 -pg pretrained/f0G40k.pth -pd pretrained/f0D40k.pth -l 1 -c 0"
+ cmd = "python train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s -pg %s -pd %s -l %s -c %s" % (exp_dir1,sr2,1 if if_f0_3=="是"else 0,batch_size12,gpus16,total_epoch11,save_epoch10,pretrained_G14,pretrained_D15,1 if if_save_latest13=="是"else 0,1 if if_cache_gpu17=="是"else 0)
+ print(cmd)
+ p = Popen(cmd, shell=True, cwd=now_dir)
+ p.wait()
+ return "训练结束,您可查看控制台训练日志或实验文件夹下的train.log"
+# but4.click(train_index, [exp_dir1], info3)
+def train_index(exp_dir1):
+ exp_dir="%s/logs/%s"%(now_dir,exp_dir1)
+ os.makedirs(exp_dir,exist_ok=True)
+ feature_dir="%s/3_feature256"%(exp_dir)
+ if(os.path.exists(feature_dir)==False):return "请先进行特征提取!"
+ listdir_res=list(os.listdir(feature_dir))
+ if(len(listdir_res)==0):return "请先进行特征提取!"
+ npys = []
+ for name in sorted(listdir_res):
+ phone = np.load("%s/%s" % (feature_dir, name))
+ npys.append(phone)
+ big_npy = np.concatenate(npys, 0)
+ np.save("%s/total_fea.npy"%exp_dir, big_npy)
+ n_ivf = big_npy.shape[0] // 39
+ infos=[]
+ infos.append("%s,%s"%(big_npy.shape,n_ivf))
+ yield "\n".join(infos)
+ index = faiss.index_factory(256, "IVF%s,Flat"%n_ivf)
+ infos.append("training")
+ yield "\n".join(infos)
+ index_ivf = faiss.extract_index_ivf(index) #
+ index_ivf.nprobe = int(np.power(n_ivf,0.3))
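+    # Heuristic used above: roughly one IVF list per 39 feature frames, and
+    # nprobe grows sub-linearly with the list count (n_ivf**0.3).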
+ index.train(big_npy)
+ faiss.write_index(index, '%s/trained_IVF%s_Flat_nprobe_%s.index'%(exp_dir,n_ivf,index_ivf.nprobe))
+ infos.append("adding")
+ yield "\n".join(infos)
+ index.add(big_npy)
+ faiss.write_index(index, '%s/added_IVF%s_Flat_nprobe_%s.index'%(exp_dir,n_ivf,index_ivf.nprobe))
+ infos.append("成功构建索引,added_IVF%s_Flat_nprobe_%s.index"%(n_ivf,index_ivf.nprobe))
+ yield "\n".join(infos)
+#but5.click(train1key, [exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17], info3)
+def train1key(exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17):
+ infos=[]
+ def get_info_str(strr):
+ infos.append(strr)
+ return "\n".join(infos)
+ os.makedirs("%s/logs/%s"%(now_dir,exp_dir1),exist_ok=True)
+    ######### step 1: preprocess the data
+ open("%s/logs/%s/preprocess.log"%(now_dir,exp_dir1), "w").close()
+ cmd="python trainset_preprocess_pipeline_print.py %s %s %s %s/logs/%s"%(trainset_dir4,sr_dict[sr2],ncpu,now_dir,exp_dir1)
+ yield get_info_str("step1:正在处理数据")
+ yield get_info_str(cmd)
+ p = Popen(cmd, shell=True)
+ p.wait()
+ with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir1), "r")as f: print(f.read())
+    ######### step 2a: extract pitch (f0)
+    open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir1), "w").close()
+ if(if_f0_3=="是"):
+ yield get_info_str("step2a:正在提取音高")
+ cmd="python extract_f0_print.py %s/logs/%s %s %s"%(now_dir,exp_dir1,np7,f0method8)
+ yield get_info_str(cmd)
+ p = Popen(cmd, shell=True,cwd=now_dir)
+ p.wait()
+ with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir1), "r")as f:print(f.read())
+ else:yield get_info_str("step2a:无需提取音高")
+    ####### step 2b: extract features
+ yield get_info_str("step2b:正在提取特征")
+ gpus=gpus16.split("-")
+ leng=len(gpus)
+ ps=[]
+ for idx,n_g in enumerate(gpus):
+ cmd="python extract_feature_print.py %s %s %s %s/logs/%s"%(leng,idx,n_g,now_dir,exp_dir1)
+ yield get_info_str(cmd)
+ p = Popen(cmd, shell=True, cwd=now_dir)#, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
+ ps.append(p)
+ for p in ps:p.wait()
+ with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir1), "r")as f:print(f.read())
+    ####### step 3a: train the model
+ yield get_info_str("step3a:正在训练模型")
+    # build the filelist
+ exp_dir="%s/logs/%s"%(now_dir,exp_dir1)
+ gt_wavs_dir="%s/0_gt_wavs"%(exp_dir)
+ co256_dir="%s/3_feature256"%(exp_dir)
+ if(if_f0_3=="是"):
+ f0_dir = "%s/2a_f0" % (exp_dir)
+ f0nsf_dir="%s/2b-f0nsf"%(exp_dir)
+ names=set([name.split(".")[0]for name in os.listdir(gt_wavs_dir)])&set([name.split(".")[0]for name in os.listdir(co256_dir)])&set([name.split(".")[0]for name in os.listdir(f0_dir)])&set([name.split(".")[0]for name in os.listdir(f0nsf_dir)])
+ else:
+ names=set([name.split(".")[0]for name in os.listdir(gt_wavs_dir)])&set([name.split(".")[0]for name in os.listdir(co256_dir)])
+ opt=[]
+ for name in names:
+ if (if_f0_3 == "是"):
+ opt.append("%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s"%(gt_wavs_dir.replace("\\","\\\\"),name,co256_dir.replace("\\","\\\\"),name,f0_dir.replace("\\","\\\\"),name,f0nsf_dir.replace("\\","\\\\"),name,spk_id5))
+ else:
+ opt.append("%s/%s.wav|%s/%s.npy|%s"%(gt_wavs_dir.replace("\\","\\\\"),name,co256_dir.replace("\\","\\\\"),name,spk_id5))
+ with open("%s/filelist.txt"%exp_dir,"w")as f:f.write("\n".join(opt))
+ yield get_info_str("write filelist done")
+ cmd = "python train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s -pg %s -pd %s -l %s -c %s" % (exp_dir1,sr2,1 if if_f0_3=="是"else 0,batch_size12,gpus16,total_epoch11,save_epoch10,pretrained_G14,pretrained_D15,1 if if_save_latest13=="是"else 0,1 if if_cache_gpu17=="是"else 0)
+ yield get_info_str(cmd)
+ p = Popen(cmd, shell=True, cwd=now_dir)
+ p.wait()
+ yield get_info_str("训练结束,您可查看控制台训练日志或实验文件夹下的train.log")
+    ####### step 3b: train the feature index
+ feature_dir="%s/3_feature256"%(exp_dir)
+ npys = []
+ listdir_res=list(os.listdir(feature_dir))
+ for name in sorted(listdir_res):
+ phone = np.load("%s/%s" % (feature_dir, name))
+ npys.append(phone)
+ big_npy = np.concatenate(npys, 0)
+ np.save("%s/total_fea.npy"%exp_dir, big_npy)
+ n_ivf = big_npy.shape[0] // 39
+ yield get_info_str("%s,%s"%(big_npy.shape,n_ivf))
+ index = faiss.index_factory(256, "IVF%s,Flat"%n_ivf)
+ yield get_info_str("training index")
+ index_ivf = faiss.extract_index_ivf(index) #
+ index_ivf.nprobe = int(np.power(n_ivf,0.3))
+ index.train(big_npy)
+ faiss.write_index(index, '%s/trained_IVF%s_Flat_nprobe_%s.index'%(exp_dir,n_ivf,index_ivf.nprobe))
+ yield get_info_str("adding index")
+ index.add(big_npy)
+ faiss.write_index(index, '%s/added_IVF%s_Flat_nprobe_%s.index'%(exp_dir,n_ivf,index_ivf.nprobe))
+ yield get_info_str("成功构建索引,added_IVF%s_Flat_nprobe_%s.index"%(n_ivf,index_ivf.nprobe))
+ yield get_info_str("全流程结束!")
+
+# ckpt_path2.change(change_info_,[ckpt_path2],[sr__,if_f0__])
+def change_info_(ckpt_path):
+ if(os.path.exists(ckpt_path.replace(os.path.basename(ckpt_path),"train.log"))==False):return {"__type__": "update"},{"__type__": "update"}
+ try:
+ with open(ckpt_path.replace(os.path.basename(ckpt_path),"train.log"),"r")as f:
+ info=eval(f.read().strip("\n").split("\n")[0].split("\t")[-1])
+ sr,f0=info["sample_rate"],info["if_f0"]
+ return sr,str(f0)
+ except:
+ traceback.print_exc()
+ return {"__type__": "update"}, {"__type__": "update"}
+
+
+with gr.Blocks() as app:
+ gr.Markdown(value="""
+    This software is open source under the MIT license. The author has no control over the software; those who use it, or who distribute audio exported by it, bear full responsibility.
+    If you do not accept these terms, you may not use or reference any code or file inside this package. See "使用需遵守的协议-LICENSE.txt" in the root directory for details.
+ """)
+ with gr.Tabs():
+ with gr.TabItem("模型推理"):
+ with gr.Row():
+ sid0 = gr.Dropdown(label="推理音色", choices=names)
+ refresh_button = gr.Button("刷新音色列表", variant="primary")
+ refresh_button.click(
+ fn=change_choices,
+ inputs=[],
+ outputs=[sid0]
+ )
+ clean_button = gr.Button("卸载音色省显存", variant="primary")
+ spk_item = gr.Slider(minimum=0, maximum=2333, step=1, label='请选择说话人id', value=0, visible=False, interactive=True)
+ clean_button.click(
+ fn=clean,
+ inputs=[],
+ outputs=[sid0]
+ )
+ sid0.change(
+ fn=get_vc,
+ inputs=[sid0],
+ outputs=[spk_item],
+ )
+ with gr.Group():
+ gr.Markdown(value="""
+            Male-to-female conversion: +12 semitones recommended; female-to-male: -12. If the vocal range explodes and the timbre distorts, you can also tune the value to a suitable range yourself.
+ """)
+ with gr.Row():
+ with gr.Column():
+ vc_transform0 = gr.Number(label="变调(整数,半音数量,升八度12降八度-12)", value=0)
+ input_audio0 = gr.Textbox(label="输入待处理音频文件路径(默认是正确格式示例)",value="E:\codes\py39\\vits_vc_gpu_train\\todo-songs\冬之花clip1.wav")
+ f0method0=gr.Radio(label="选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比", choices=["pm","harvest"],value="pm", interactive=True)
+ with gr.Column():
+ file_index1 = gr.Textbox(label="特征检索库文件路径",value="E:\codes\py39\\vits_vc_gpu_train\logs\mi-test-1key\\added_IVF677_Flat_nprobe_7.index", interactive=True)
+ file_big_npy1 = gr.Textbox(label="特征文件路径",value="E:\codes\py39\\vits_vc_gpu_train\logs\mi-test-1key\\total_fea.npy", interactive=True)
+ index_rate1 = gr.Slider(minimum=0, maximum=1,label='检索特征占比', value=1,interactive=True)
+ f0_file = gr.File(label="F0曲线文件,可选,一行一个音高,代替默认F0及升降调")
+ but0=gr.Button("转换", variant="primary")
+ with gr.Column():
+ vc_output1 = gr.Textbox(label="输出信息")
+ vc_output2 = gr.Audio(label="输出音频(右下角三个点,点了可以下载)")
+ but0.click(vc_single, [spk_item, input_audio0, vc_transform0,f0_file,f0method0,file_index1,file_big_npy1,index_rate1], [vc_output1, vc_output2])
+ with gr.Group():
+ gr.Markdown(value="""
+            Batch conversion: enter a folder of audio files to convert, or upload several audio files; converted audio is written to the specified folder (default: opt).
+ """)
+ with gr.Row():
+ with gr.Column():
+ vc_transform1 = gr.Number(label="变调(整数,半音数量,升八度12降八度-12)", value=0)
+ opt_input = gr.Textbox(label="指定输出文件夹",value="opt")
+ f0method1=gr.Radio(label="选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比", choices=["pm","harvest"],value="pm", interactive=True)
+ with gr.Column():
+ file_index2 = gr.Textbox(label="特征检索库文件路径",value="E:\codes\py39\\vits_vc_gpu_train\logs\mi-test-1key\\added_IVF677_Flat_nprobe_7.index", interactive=True)
+ file_big_npy2 = gr.Textbox(label="特征文件路径",value="E:\codes\py39\\vits_vc_gpu_train\logs\mi-test-1key\\total_fea.npy", interactive=True)
+ index_rate2 = gr.Slider(minimum=0, maximum=1,label='检索特征占比', value=1,interactive=True)
+ with gr.Column():
+ dir_input = gr.Textbox(label="输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)",value="E:\codes\py39\\vits_vc_gpu_train\\todo-songs")
+ inputs = gr.File(file_count="multiple", label="也可批量输入音频文件,二选一,优先读文件夹")
+ but1=gr.Button("转换", variant="primary")
+ vc_output3 = gr.Textbox(label="输出信息")
+ but1.click(vc_multi, [spk_item, dir_input,opt_input,inputs, vc_transform1,f0method1,file_index2,file_big_npy2,index_rate2], [vc_output3])
+ with gr.TabItem("伴奏人声分离"):
+ with gr.Group():
+ gr.Markdown(value="""
+            Batch vocal/accompaniment separation using the UVR5 models.
+            Use HP2 for tracks without harmonies; use HP5 for tracks with harmonies when the extracted vocals should not keep them.
+            Example of a valid folder path: E:\codes\py39\\vits_vc_gpu\白鹭霜华测试样例 (just copy it from the file manager address bar).
+ """)
+ with gr.Row():
+ with gr.Column():
+ dir_wav_input = gr.Textbox(label="输入待处理音频文件夹路径",value="E:\codes\py39\\vits_vc_gpu_train\\todo-songs")
+ wav_inputs = gr.File(file_count="multiple", label="也可批量输入音频文件,二选一,优先读文件夹")
+ with gr.Column():
+ model_choose = gr.Dropdown(label="模型", choices=uvr5_names)
+ opt_vocal_root = gr.Textbox(label="指定输出人声文件夹",value="opt")
+ opt_ins_root = gr.Textbox(label="指定输出乐器文件夹",value="opt")
+ but2=gr.Button("转换", variant="primary")
+ vc_output4 = gr.Textbox(label="输出信息")
+ but2.click(uvr, [model_choose, dir_wav_input,opt_vocal_root,wav_inputs,opt_ins_root], [vc_output4])
+ with gr.TabItem("训练"):
+ gr.Markdown(value="""
+    step1: fill in the experiment configuration. Experiment data is stored under logs, one folder per experiment; enter the experiment name by hand. The folder holds the experiment configuration, logs, and the trained model files.
+ """)
+ with gr.Row():
+ exp_dir1 = gr.Textbox(label="输入实验名",value="mi-test")
+ sr2 = gr.Radio(label="目标采样率", choices=["32k","40k","48k"],value="40k", interactive=True)
+ if_f0_3 = gr.Radio(label="模型是否带音高指导(唱歌一定要,语音可以不要)", choices=["是","否"],value="是", interactive=True)
+        with gr.Group():# single speaker only for now; support for up to 4 speakers planned # data processing
+ gr.Markdown(value="""
+            step2a: automatically walk every file in the training folder that can be decoded into audio, slice and normalize it, and generate 2 wav folders under the experiment directory; only single-speaker training is supported for now.
+ """)
+ with gr.Row():
+ trainset_dir4 = gr.Textbox(label="输入训练文件夹路径",value="E:\语音音频+标注\米津玄师\src")
+ spk_id5 = gr.Slider(minimum=0, maximum=4, step=1, label='请指定说话人id', value=0,interactive=True)
+ but1=gr.Button("处理数据", variant="primary")
+ info1=gr.Textbox(label="输出信息",value="")
+ but1.click(preprocess_dataset,[trainset_dir4,exp_dir1,sr2],[info1])
+ with gr.Group():
+ gr.Markdown(value="""
+            step2b: extract pitch on the CPU (if the model uses pitch guidance) and extract features on the GPU (choose the card numbers)
+ """)
+ with gr.Row():
+ with gr.Column():
+ gpus6 = gr.Textbox(label="以-分隔输入使用的卡号,例如 0-1-2 使用卡0和卡1和卡2",value=gpus,interactive=True)
+ gpu_info9 = gr.Textbox(label="显卡信息",value=gpu_info)
+ with gr.Column():
+ np7 = gr.Slider(minimum=0, maximum=ncpu, step=1, label='提取音高使用的CPU进程数', value=ncpu,interactive=True)
+ f0method8 = gr.Radio(label="选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢", choices=["pm", "harvest","dio"], value="harvest", interactive=True)
+ but2=gr.Button("特征提取", variant="primary")
+ info2=gr.Textbox(label="输出信息",value="",max_lines=8)
+ but2.click(extract_f0_feature,[gpus6,np7,f0method8,if_f0_3,exp_dir1],[info2])
+ with gr.Group():
+ gr.Markdown(value="""
+            step3: fill in the training settings and start training the model and the index
+ """)
+ with gr.Row():
+ save_epoch10 = gr.Slider(minimum=0, maximum=50, step=1, label='保存频率save_every_epoch', value=5,interactive=True)
+ total_epoch11 = gr.Slider(minimum=0, maximum=100, step=1, label='总训练轮数total_epoch', value=10,interactive=True)
+ batch_size12 = gr.Slider(minimum=0, maximum=32, step=1, label='batch_size', value=4,interactive=True)
+ if_save_latest13 = gr.Radio(label="是否仅保存最新的ckpt文件以节省硬盘空间", choices=["是", "否"], value="否", interactive=True)
+ if_cache_gpu17 = gr.Radio(label="是否缓存所有训练集至显存。10min以下小数据可缓存以加速训练,大数据缓存会炸显存也加不了多少速", choices=["是", "否"], value="否", interactive=True)
+ with gr.Row():
+ pretrained_G14 = gr.Textbox(label="加载预训练底模G路径", value="pretrained/f0G40k.pth",interactive=True)
+ pretrained_D15 = gr.Textbox(label="加载预训练底模D路径", value="pretrained/f0D40k.pth",interactive=True)
+ sr2.change(change_sr2, [sr2,if_f0_3], [pretrained_G14,pretrained_D15])
+ if_f0_3.change(change_f0, [if_f0_3, sr2], [np7, f0method8, pretrained_G14, pretrained_D15])
+ gpus16 = gr.Textbox(label="以-分隔输入使用的卡号,例如 0-1-2 使用卡0和卡1和卡2", value=gpus,interactive=True)
+ but3 = gr.Button("训练模型", variant="primary")
+ but4 = gr.Button("训练特征索引", variant="primary")
+ but5 = gr.Button("一键训练", variant="primary")
+ info3 = gr.Textbox(label="输出信息", value="",max_lines=10)
+ but3.click(click_train,[exp_dir1,sr2,if_f0_3,spk_id5,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16,if_cache_gpu17],info3)
+ but4.click(train_index,[exp_dir1],info3)
+ but5.click(train1key,[exp_dir1,sr2,if_f0_3,trainset_dir4,spk_id5,gpus6,np7,f0method8,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16,if_cache_gpu17],info3)
+
+ with gr.TabItem("ckpt处理"):
+ with gr.Group():
+ gr.Markdown(value="""模型融合,可用于测试音色融合""")
+ with gr.Row():
+ ckpt_a = gr.Textbox(label="A模型路径", value="", interactive=True)
+ ckpt_b = gr.Textbox(label="B模型路径", value="", interactive=True)
+ alpha_a = gr.Slider(minimum=0, maximum=1, label='A模型权重', value=0.5, interactive=True)
+ with gr.Row():
+ sr_ = gr.Radio(label="目标采样率", choices=["32k","40k","48k"],value="40k", interactive=True)
+ if_f0_ = gr.Radio(label="模型是否带音高指导", choices=["是","否"],value="是", interactive=True)
+ info__ = gr.Textbox(label="要置入的模型信息", value="", max_lines=8, interactive=True)
+ name_to_save0=gr.Textbox(label="保存的模型名不带后缀", value="", max_lines=1, interactive=True)
+ with gr.Row():
+ but6 = gr.Button("融合", variant="primary")
+ info4 = gr.Textbox(label="输出信息", value="", max_lines=8)
+ but6.click(merge, [ckpt_a,ckpt_b,alpha_a,sr_,if_f0_,info__,name_to_save0], info4)#def merge(path1,path2,alpha1,sr,f0,info):
+ with gr.Group():
+ gr.Markdown(value="修改模型信息(仅支持weights文件夹下提取的小模型文件)")
+ with gr.Row():
+ ckpt_path0 = gr.Textbox(label="模型路径", value="", interactive=True)
+ info_=gr.Textbox(label="要改的模型信息", value="", max_lines=8, interactive=True)
+ name_to_save1=gr.Textbox(label="保存的文件名,默认空为和源文件同名", value="", max_lines=8, interactive=True)
+ with gr.Row():
+ but7 = gr.Button("修改", variant="primary")
+ info5 = gr.Textbox(label="输出信息", value="", max_lines=8)
+ but7.click(change_info, [ckpt_path0,info_,name_to_save1], info5)
+ with gr.Group():
+ gr.Markdown(value="查看模型信息(仅支持weights文件夹下提取的小模型文件)")
+ with gr.Row():
+ ckpt_path1 = gr.Textbox(label="模型路径", value="", interactive=True)
+ but8 = gr.Button("查看", variant="primary")
+ info6 = gr.Textbox(label="输出信息", value="", max_lines=8)
+ but8.click(show_info, [ckpt_path1], info6)
+ with gr.Group():
+ gr.Markdown(value="模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况")
+ with gr.Row():
+ ckpt_path2 = gr.Textbox(label="模型路径", value="E:\codes\py39\logs\mi-test_f0_48k\\G_23333.pth", interactive=True)
+ save_name = gr.Textbox(label="保存名", value="", interactive=True)
+ sr__ = gr.Radio(label="目标采样率", choices=["32k","40k","48k"],value="40k", interactive=True)
+ if_f0__ = gr.Radio(label="模型是否带音高指导,1是0否", choices=["1","0"],value="1", interactive=True)
+ info___ = gr.Textbox(label="要置入的模型信息", value="", max_lines=8, interactive=True)
+ but9 = gr.Button("提取", variant="primary")
+ info7 = gr.Textbox(label="输出信息", value="", max_lines=8)
+ ckpt_path2.change(change_info_,[ckpt_path2],[sr__,if_f0__])
+ but9.click(extract_small_model, [ckpt_path2,save_name,sr__,if_f0__,info___], info7)
+
+ with gr.TabItem("招募音高曲线前端编辑器"):
+ gr.Markdown(value="""加开发群联系我xxxxx""")
+ with gr.TabItem("点击查看交流、问题反馈群号"):
+ gr.Markdown(value="""xxxxx""")
+
+ # app.launch(server_name="0.0.0.0",server_port=7860)
+ # app.queue(concurrency_count=511, max_size=1022).launch(server_name="127.0.0.1",inbrowser=True,server_port=7861,quiet=True)
+ app.queue(concurrency_count=511, max_size=1022).launch(server_name="0.0.0.0",inbrowser=True,server_port=7865,quiet=True)
\ No newline at end of file
diff --git a/infer_uvr5.py b/infer_uvr5.py
new file mode 100644
index 0000000..b38dd73
--- /dev/null
+++ b/infer_uvr5.py
@@ -0,0 +1,108 @@
+import os,sys,torch,warnings,pdb
+warnings.filterwarnings("ignore")
+import librosa
+import importlib
+import numpy as np
+import hashlib , math
+from tqdm import tqdm
+from uvr5_pack.lib_v5 import spec_utils
+from uvr5_pack.utils import _get_name_params,inference
+from uvr5_pack.lib_v5.model_param_init import ModelParameters
+from scipy.io import wavfile
+
+class _audio_pre_():
+ def __init__(self, model_path,device,is_half):
+ self.model_path = model_path
+ self.device = device
+ self.data = {
+ # Processing Options
+ 'postprocess': False,
+ 'tta': False,
+ # Constants
+ 'window_size': 512,
+ 'agg': 10,
+ 'high_end_process': 'mirroring',
+ }
+ nn_arch_sizes = [
+ 31191, # default
+ 33966,61968, 123821, 123812, 537238 # custom
+ ]
+ self.nn_architecture = list('{}KB'.format(s) for s in nn_arch_sizes)
+ model_size = math.ceil(os.stat(model_path ).st_size / 1024)
+ nn_architecture = '{}KB'.format(min(nn_arch_sizes, key=lambda x:abs(x-model_size)))
+ nets = importlib.import_module('uvr5_pack.lib_v5.nets' + f'_{nn_architecture}'.replace('_{}KB'.format(nn_arch_sizes[0]), ''), package=None)
+ model_hash = hashlib.md5(open(model_path,'rb').read()).hexdigest()
+ param_name ,model_params_d = _get_name_params(model_path , model_hash)
+
+ mp = ModelParameters(model_params_d)
+ model = nets.CascadedASPPNet(mp.param['bins'] * 2)
+ cpk = torch.load( model_path , map_location='cpu')
+ model.load_state_dict(cpk)
+ model.eval()
+ if(is_half==True):model = model.half().to(device)
+ else:model = model.to(device)
+
+ self.mp = mp
+ self.model = model
+
+ def _path_audio_(self, music_file ,ins_root=None,vocal_root=None):
+ if(ins_root is None and vocal_root is None):return "No save root."
+ name=os.path.basename(music_file)
+ if(ins_root is not None):os.makedirs(ins_root, exist_ok=True)
+ if(vocal_root is not None):os.makedirs(vocal_root , exist_ok=True)
+ X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
+ bands_n = len(self.mp.param['band'])
+ # print(bands_n)
+ for d in range(bands_n, 0, -1):
+ bp = self.mp.param['band'][d]
+ if d == bands_n: # high-end band
+                X_wave[d], _ = librosa.core.load(# in theory librosa may mis-read some audio files and ffmpeg would be more robust, but that is left as-is for now
+ music_file, bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
+ if X_wave[d].ndim == 1:
+ X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
+ else: # lower bands
+ X_wave[d] = librosa.core.resample(X_wave[d+1], self.mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])
+ # Stft of wave source
+ X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(X_wave[d], bp['hl'], bp['n_fft'], self.mp.param['mid_side'], self.mp.param['mid_side_b2'], self.mp.param['reverse'])
+ # pdb.set_trace()
+ if d == bands_n and self.data['high_end_process'] != 'none':
+ input_high_end_h = (bp['n_fft']//2 - bp['crop_stop']) + ( self.mp.param['pre_filter_stop'] - self.mp.param['pre_filter_start'])
+ input_high_end = X_spec_s[d][:, bp['n_fft']//2-input_high_end_h:bp['n_fft']//2, :]
+
+ X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
+        aggressive_set = float(self.data['agg']/100)
+        aggressiveness = {'value': aggressive_set, 'split_bin': self.mp.param['band'][1]['crop_stop']}
+ with torch.no_grad():
+ pred, X_mag, X_phase = inference(X_spec_m,self.device,self.model, aggressiveness,self.data)
+ # Postprocess
+ if self.data['postprocess']:
+ pred_inv = np.clip(X_mag - pred, 0, np.inf)
+ pred = spec_utils.mask_silence(pred, pred_inv)
+ y_spec_m = pred * X_phase
+ v_spec_m = X_spec_m - y_spec_m
+
+ if (ins_root is not None):
+ if self.data['high_end_process'].startswith('mirroring'):
+ input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], y_spec_m, input_high_end, self.mp)
+ wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp,input_high_end_h, input_high_end_)
+ else:
+ wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
+ print ('%s instruments done'%name)
+ wavfile.write(os.path.join(ins_root, 'instrument_{}.wav'.format(name) ), self.mp.param['sr'], (np.array(wav_instrument)*32768).astype("int16")) #
+ if (vocal_root is not None):
+ if self.data['high_end_process'].startswith('mirroring'):
+ input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], v_spec_m, input_high_end, self.mp)
+ wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp, input_high_end_h, input_high_end_)
+ else:
+ wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
+ print ('%s vocals done'%name)
+ wavfile.write(os.path.join(vocal_root , 'vocal_{}.wav'.format(name) ), self.mp.param['sr'], (np.array(wav_vocals)*32768).astype("int16"))
+
+if __name__ == '__main__':
+ device = 'cuda'
+ is_half=True
+ model_path='uvr5_weights/2_HP-UVR.pth'
+    pre_fun = _audio_pre_(model_path=model_path,device=device,is_half=is_half)
+ audio_path = '神女劈观.aac'
+ save_path = 'opt'
+ pre_fun._path_audio_(audio_path , save_path,save_path)
diff --git a/my_utils.py b/my_utils.py
new file mode 100644
index 0000000..48a93b6
--- /dev/null
+++ b/my_utils.py
@@ -0,0 +1,18 @@
+import ffmpeg,numpy as np
+def load_audio(file,sr):
+ try:
+ # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
+ # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
+ # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+ out, _ = (
+ ffmpeg.input(file, threads=0)
+ .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
+ .run(cmd=["./ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
+ )
+ except ffmpeg.Error as e:
+ raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+
+ return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+
+if __name__=='__main__' :
+ print(load_audio(r"C:\CloudMusic\宮野幸子,森下唯 - 月夜に謳う君 -LUNA-.mp3",16000).shape)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3163bcd
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,196 @@
+absl-py==1.1.0
+aiofiles==23.1.0
+aiohttp==3.8.4
+aiosignal==1.3.1
+altair==4.2.0
+antlr4-python3-runtime==4.8
+anyio==3.6.1
+appdirs==1.4.4
+argon2-cffi==21.3.0
+argon2-cffi-bindings==21.2.0
+asttokens==2.0.5
+async-timeout==4.0.2
+attrs==21.4.0
+audioread==3.0.0
+Babel==2.10.3
+backcall==0.2.0
+beautifulsoup4==4.11.1
+bitarray==2.7.3
+bleach==5.0.0
+brotlipy==0.7.0
+cachetools==5.2.0
+certifi==2021.5.30
+cffi
+chardet
+charset-normalizer==3.0.1
+click==8.1.3
+cmake==3.25.0
+colorama==0.4.5
+cryptography
+cycler==0.11.0
+Cython==0.29.32
+debugpy==1.6.0
+decorator==5.1.1
+defusedxml==0.7.1
+entrypoints==0.4
+executing==0.8.3
+fairseq==0.12.2
+faiss-gpu==1.7.2
+fastapi==0.92.0
+fastjsonschema==2.15.3
+ffmpeg==1.4
+ffmpy==0.3.0
+filelock==3.9.0
+fonttools==4.33.3
+frozenlist==1.3.3
+fsspec==2022.11.0
+functorch==2.0.0
+future==0.18.3
+google==3.0.0
+google-auth==2.8.0
+google-auth-oauthlib==0.4.6
+googleads==3.8.0
+gradio==3.19.1
+grpcio==1.46.3
+h11==0.13.0
+httpcore==0.16.3
+httplib2==0.21.0
+httpx==0.23.1
+Hydra==2.5
+hydra-core==1.0.7
+idna
+importlib-metadata==4.11.4
+importlib-resources==5.8.0
+ipykernel==6.15.0
+ipython==8.4.0
+ipython-genutils==0.2.0
+ipywidgets==7.7.0
+jedi==0.18.1
+Jinja2==3.1.2
+joblib==1.1.0
+json5==0.9.8
+jsonschema==4.6.0
+jupyter-client==7.3.4
+jupyter-core==4.10.0
+jupyter-server==1.17.1
+jupyterlab==3.4.3
+jupyterlab-language-pack-zh-CN==3.4.post1
+jupyterlab-pygments==0.2.2
+jupyterlab-server==2.14.0
+jupyterlab-widgets==1.1.0
+kiwisolver==1.4.3
+lazy-loader==0.1
+librosa==0.9.2
+linkify-it-py==2.0.0
+lit==15.0.7
+llvmlite==0.39.0
+lxml==4.8.0
+Markdown==3.3.7
+markdown-it-py==2.2.0
+MarkupSafe==2.1.1
+matplotlib==3.5.2
+matplotlib-inline==0.1.3
+mdit-py-plugins==0.3.3
+mdurl==0.1.1
+mistune==0.8.4
+mpmath==1.2.1
+msgpack==1.0.4
+multidict==6.0.2
+nbclassic==0.3.7
+nbclient==0.6.4
+nbconvert==6.5.0
+nbformat==5.4.0
+nest-asyncio==1.5.5
+networkx==2.8.8
+notebook==6.4.12
+notebook-shim==0.1.0
+numba==0.56.4
+numpy==1.23.5
+oauth2client==4.1.3
+oauthlib==3.2.0
+omegaconf==2.0.6
+orjson==3.8.6
+packaging==21.3
+pandas==1.5.2
+pandocfilters==1.5.0
+parso==0.8.3
+pexpect==4.8.0
+pickleshare==0.7.5
+Pillow==9.1.1
+pooch==1.6.0
+portalocker==2.5.1
+praat-parselmouth==0.4.2
+prometheus-client==0.14.1
+prompt-toolkit==3.0.29
+protobuf==3.19.4
+psutil==5.9.1
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pyasn1==0.4.8
+pyasn1-modules==0.2.8
+pycosat==0.6.3
+pycparser
+pycryptodome==3.16.0
+pydantic==1.10.5
+pydub==0.25.1
+Pygments==2.12.0
+pyOpenSSL
+pyparsing==3.0.9
+pyrsistent==0.18.1
+PySocks
+python-dateutil==2.8.2
+python-multipart==0.0.5
+pytz==2022.6
+pyworld==0.3.2
+PyYAML==6.0
+pyzmq==23.2.0
+regex==2022.10.31
+requests
+requests-oauthlib==1.3.1
+resampy==0.4.2
+rfc3986==1.5.0
+rsa==4.8
+ruamel-yaml-conda
+sacrebleu==2.3.1
+scikit-learn==1.1.3
+scipy==1.9.3
+Send2Trash==1.8.0
+six
+sniffio==1.2.0
+soundfile==0.12.1
+soupsieve==2.3.2.post1
+soxr==0.3.3
+stack-data==0.3.0
+starlette==0.25.0
+stopit==1.1.1
+suds-jurko==0.6
+supervisor==4.2.4
+sympy==1.11.1
+tabulate==0.8.10
+tensorboard==2.9.1
+tensorboard-data-server==0.6.1
+tensorboard-plugin-wit==1.8.1
+terminado==0.15.0
+threadpoolctl==3.1.0
+tinycss2==1.1.1
+toolz==0.12.0
+torch==2.0.0+cu117
+torchaudio==2.0.1+cu117
+torchgen==0.0.1
+torchvision==0.15.1+cu117
+tornado==6.1
+tqdm
+traitlets==5.3.0
+triton==2.0.0
+typing-extensions==4.2.0
+uc-micro-py==1.0.1
+urllib3==1.26.13
+uvicorn==0.21.1
+wcwidth==0.2.5
+webencodings==0.5.1
+websocket-client==1.3.3
+websockets==10.3
+Werkzeug==2.1.2
+widgetsnbextension==3.6.0
+yarl==1.8.1
+zipp==3.8.0
diff --git a/slicer2.py b/slicer2.py
new file mode 100644
index 0000000..84ea78c
--- /dev/null
+++ b/slicer2.py
@@ -0,0 +1,186 @@
+import numpy as np
+
+
+# This function is obtained from librosa.
+def get_rms(
+ y,
+ *,
+ frame_length=2048,
+ hop_length=512,
+ pad_mode="constant",
+):
+ padding = (int(frame_length // 2), int(frame_length // 2))
+ y = np.pad(y, padding, mode=pad_mode)
+
+ axis = -1
+ # put our new within-frame axis at the end for now
+ out_strides = y.strides + tuple([y.strides[axis]])
+ # Reduce the shape on the framing axis
+ x_shape_trimmed = list(y.shape)
+ x_shape_trimmed[axis] -= frame_length - 1
+ out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
+ xw = np.lib.stride_tricks.as_strided(
+ y, shape=out_shape, strides=out_strides
+ )
+ if axis < 0:
+ target_axis = axis - 1
+ else:
+ target_axis = axis + 1
+ xw = np.moveaxis(xw, -1, target_axis)
+ # Downsample along the target axis
+ slices = [slice(None)] * xw.ndim
+ slices[axis] = slice(0, None, hop_length)
+ x = xw[tuple(slices)]
+
+ # Calculate power
+ power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
+
+ return np.sqrt(power)
+
+
+class Slicer:
+ def __init__(self,
+ sr: int,
+ threshold: float = -40.,
+ min_length: int = 5000,
+ min_interval: int = 300,
+ hop_size: int = 20,
+ max_sil_kept: int = 5000):
+ if not min_length >= min_interval >= hop_size:
+ raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
+ if not max_sil_kept >= hop_size:
+ raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
+ min_interval = sr * min_interval / 1000
+ self.threshold = 10 ** (threshold / 20.)
+ self.hop_size = round(sr * hop_size / 1000)
+ self.win_size = min(round(min_interval), 4 * self.hop_size)
+ self.min_length = round(sr * min_length / 1000 / self.hop_size)
+ self.min_interval = round(min_interval / self.hop_size)
+ self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
+
+ def _apply_slice(self, waveform, begin, end):
+ if len(waveform.shape) > 1:
+ return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
+ else:
+ return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]
+
+ # @timeit
+ def slice(self, waveform):
+ if len(waveform.shape) > 1:
+ samples = waveform.mean(axis=0)
+ else:
+ samples = waveform
+ if samples.shape[0] <= self.min_length:
+ return [waveform]
+ rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
+ sil_tags = []
+ silence_start = None
+ clip_start = 0
+ for i, rms in enumerate(rms_list):
+ # Keep looping while frame is silent.
+ if rms < self.threshold:
+ # Record start of silent frames.
+ if silence_start is None:
+ silence_start = i
+ continue
+ # Keep looping while frame is not silent and silence start has not been recorded.
+ if silence_start is None:
+ continue
+ # Clear recorded silence start if interval is not enough or clip is too short
+ is_leading_silence = silence_start == 0 and i > self.max_sil_kept
+ need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
+ if not is_leading_silence and not need_slice_middle:
+ silence_start = None
+ continue
+ # Need slicing. Record the range of silent frames to be removed.
+ if i - silence_start <= self.max_sil_kept:
+ pos = rms_list[silence_start: i + 1].argmin() + silence_start
+ if silence_start == 0:
+ sil_tags.append((0, pos))
+ else:
+ sil_tags.append((pos, pos))
+ clip_start = pos
+ elif i - silence_start <= self.max_sil_kept * 2:
+ pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
+ pos += i - self.max_sil_kept
+ pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
+ pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
+ if silence_start == 0:
+ sil_tags.append((0, pos_r))
+ clip_start = pos_r
+ else:
+ sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
+ clip_start = max(pos_r, pos)
+ else:
+ pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
+ pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
+ if silence_start == 0:
+ sil_tags.append((0, pos_r))
+ else:
+ sil_tags.append((pos_l, pos_r))
+ clip_start = pos_r
+ silence_start = None
+ # Deal with trailing silence.
+ total_frames = rms_list.shape[0]
+ if silence_start is not None and total_frames - silence_start >= self.min_interval:
+ silence_end = min(total_frames, silence_start + self.max_sil_kept)
+ pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
+ sil_tags.append((pos, total_frames + 1))
+ # Apply and return slices.
+ if len(sil_tags) == 0:
+ return [waveform]
+ else:
+ chunks = []
+ if sil_tags[0][0] > 0:
+ chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
+ for i in range(len(sil_tags) - 1):
+ chunks.append(self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]))
+ if sil_tags[-1][1] < total_frames:
+ chunks.append(self._apply_slice(waveform, sil_tags[-1][1], total_frames))
+ return chunks
+
+
+def main():
+ import os.path
+ from argparse import ArgumentParser
+
+ import librosa
+ import soundfile
+
+ parser = ArgumentParser()
+ parser.add_argument('audio', type=str, help='The audio to be sliced')
+ parser.add_argument('--out', type=str, help='Output directory of the sliced audio clips')
+ parser.add_argument('--db_thresh', type=float, required=False, default=-40,
+ help='The dB threshold for silence detection')
+ parser.add_argument('--min_length', type=int, required=False, default=5000,
+ help='The minimum milliseconds required for each sliced audio clip')
+ parser.add_argument('--min_interval', type=int, required=False, default=300,
+ help='The minimum milliseconds for a silence part to be sliced')
+ parser.add_argument('--hop_size', type=int, required=False, default=10,
+ help='Frame length in milliseconds')
+ parser.add_argument('--max_sil_kept', type=int, required=False, default=500,
+ help='The maximum silence length kept around the sliced clip, presented in milliseconds')
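+    # Example usage (hypothetical paths):
+    #   python slicer2.py vocals.wav --out ./clips --min_length 5000 --min_interval 300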
+ args = parser.parse_args()
+ out = args.out
+ if out is None:
+ out = os.path.dirname(os.path.abspath(args.audio))
+ audio, sr = librosa.load(args.audio, sr=None, mono=False)
+ slicer = Slicer(
+ sr=sr,
+ threshold=args.db_thresh,
+ min_length=args.min_length,
+ min_interval=args.min_interval,
+ hop_size=args.hop_size,
+ max_sil_kept=args.max_sil_kept
+ )
+ chunks = slicer.slice(audio)
+ if not os.path.exists(out):
+ os.makedirs(out)
+ for i, chunk in enumerate(chunks):
+ if len(chunk.shape) > 1:
+ chunk = chunk.T
+        soundfile.write(os.path.join(out, '%s_%d.wav' % (os.path.basename(args.audio).rsplit('.', maxsplit=1)[0], i)), chunk, sr)
+
+
+if __name__ == '__main__':
+ main()
\ No newline at end of file
diff --git a/train_nsf_sim_cache_sid_load_pretrain.py b/train_nsf_sim_cache_sid_load_pretrain.py
new file mode 100644
index 0000000..3d84d8c
--- /dev/null
+++ b/train_nsf_sim_cache_sid_load_pretrain.py
@@ -0,0 +1,509 @@
+import sys,os
+now_dir=os.getcwd()
+sys.path.append(os.path.join(now_dir,"train"))
+import utils
+hps = utils.get_hparams()
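+# Example invocation (the same command infer-web.py builds in click_train):
+#   python train_nsf_sim_cache_sid_load_pretrain.py -e mi-test -sr 40k -f0 1 -bs 4 -g 0 -te 10 -se 5 -pg pretrained/f0G40k.pth -pd pretrained/f0D40k.pth -l 1 -c 0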
+os.environ["CUDA_VISIBLE_DEVICES"]=hps.gpus.replace("-",",")
+n_gpus=len(hps.gpus.split("-"))
+from random import shuffle
+import traceback,json,argparse,itertools,math,torch,pdb
+torch.backends.cudnn.deterministic = True
+torch.backends.cudnn.benchmark = False
+from torch import nn, optim
+from torch.nn import functional as F
+from torch.utils.data import DataLoader
+from torch.utils.tensorboard import SummaryWriter
+import torch.multiprocessing as mp
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.cuda.amp import autocast, GradScaler
+from infer_pack import commons
+
+from time import time as ttime
+from data_utils import TextAudioLoaderMultiNSFsid,TextAudioLoader, TextAudioCollateMultiNSFsid,TextAudioCollate, DistributedBucketSampler
+from infer_pack.models import (
+ SynthesizerTrnMs256NSFsid,SynthesizerTrnMs256NSFsid_nono,
+ MultiPeriodDiscriminator,
+)
+from losses import generator_loss, discriminator_loss, feature_loss, kl_loss
+from mel_processing import mel_spectrogram_torch, spec_to_mel_torch
+
+
+global_step = 0
+
+
+
+def main():
+ """Assume Single Node Multi GPUs Training Only"""
+ assert torch.cuda.is_available(), "CPU training is not allowed."
+
+ # n_gpus = torch.cuda.device_count()
+ os.environ["MASTER_ADDR"] = "localhost"
+ os.environ["MASTER_PORT"] = "5555"
+
+
+ mp.spawn(
+ run,
+ nprocs=n_gpus,
+ args=(
+ n_gpus,
+ hps,
+ ),
+ )
+
+
+def run(rank, n_gpus, hps):
+ global global_step
+ if rank == 0:
+ logger = utils.get_logger(hps.model_dir)
+ logger.info(hps)
+ utils.check_git_hash(hps.model_dir)
+ writer = SummaryWriter(log_dir=hps.model_dir)
+ writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
+
+ dist.init_process_group(
+ backend="gloo", init_method="env://", world_size=n_gpus, rank=rank
+ )
+ torch.manual_seed(hps.train.seed)
+ torch.cuda.set_device(rank)
+
+ if (hps.if_f0 == 1):train_dataset = TextAudioLoaderMultiNSFsid(hps.data.training_files, hps.data)
+ else:train_dataset = TextAudioLoader(hps.data.training_files, hps.data)
+ train_sampler = DistributedBucketSampler(
+ train_dataset,
+ hps.train.batch_size,
+ # [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200,1400], # 16s
+ [100, 200, 300, 400, 500, 600, 700, 800, 900], # 16s
+ num_replicas=n_gpus,
+ rank=rank,
+ shuffle=True,
+ )
+ # It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit.
+ # num_workers=8 -> num_workers=4
+ if (hps.if_f0 == 1):collate_fn = TextAudioCollateMultiNSFsid()
+ else:collate_fn = TextAudioCollate()
+ train_loader = DataLoader(
+ train_dataset,
+ num_workers=4,
+ shuffle=False,
+ pin_memory=True,
+ collate_fn=collate_fn,
+ batch_sampler=train_sampler,
+ persistent_workers=True,
+ prefetch_factor=8,
+ )
+ if(hps.if_f0==1):net_g = SynthesizerTrnMs256NSFsid(hps.data.filter_length // 2 + 1,hps.train.segment_size // hps.data.hop_length,**hps.model,is_half=hps.train.fp16_run,sr=hps.sample_rate).cuda(rank)
+ else:net_g = SynthesizerTrnMs256NSFsid_nono(hps.data.filter_length // 2 + 1,hps.train.segment_size // hps.data.hop_length,**hps.model,is_half=hps.train.fp16_run).cuda(rank)
+ net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)
+ optim_g = torch.optim.AdamW(
+ net_g.parameters(),
+ hps.train.learning_rate,
+ betas=hps.train.betas,
+ eps=hps.train.eps,
+ )
+ optim_d = torch.optim.AdamW(
+ net_d.parameters(),
+ hps.train.learning_rate,
+ betas=hps.train.betas,
+ eps=hps.train.eps,
+ )
+ # net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
+ # net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
+ net_g = DDP(net_g, device_ids=[rank])
+ net_d = DDP(net_d, device_ids=[rank])
+
+    try:  # resume automatically if a checkpoint can be loaded
+ _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d) # D多半加载没事
+ if rank == 0:
+ logger.info("loaded D")
+ # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0)
+ _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g)
+ global_step = (epoch_str - 1) * len(train_loader)
+ # epoch_str = 1
+ # global_step = 0
+    except:  # nothing to resume on the first run, so load the pretrained weights instead
+ traceback.print_exc()
+ epoch_str = 1
+ global_step = 0
+ if rank == 0:
+ logger.info("loaded pretrained %s %s"%(hps.pretrainG,hps.pretrainD))
+        print(net_g.module.load_state_dict(torch.load(hps.pretrainG, map_location="cpu")["model"]))  # experiment: do not load the optimizer state
+ print(net_d.module.load_state_dict(torch.load(hps.pretrainD,map_location="cpu")["model"]))
+
+ scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
+ optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
+ )
+ scheduler_d = torch.optim.lr_scheduler.ExponentialLR(
+ optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
+ )
+
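+    # GradScaler applies dynamic loss scaling when fp16_run is enabled; otherwise it passes gradients through unchanged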
+ scaler = GradScaler(enabled=hps.train.fp16_run)
+
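+    # when if_cache_data_in_gpu is enabled, the first epoch stores every GPU-resident batch here so later epochs can skip the DataLoader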
+ cache=[]
+ for epoch in range(epoch_str, hps.train.epochs + 1):
+ if rank == 0:
+ train_and_evaluate(
+ rank,
+ epoch,
+ hps,
+ [net_g, net_d],
+ [optim_g, optim_d],
+ [scheduler_g, scheduler_d],
+ scaler,
+ [train_loader, None],
+ logger,
+ [writer, writer_eval],cache
+ )
+ else:
+ train_and_evaluate(
+ rank,
+ epoch,
+ hps,
+ [net_g, net_d],
+ [optim_g, optim_d],
+ [scheduler_g, scheduler_d],
+ scaler,
+ [train_loader, None],
+ None,
+ None,cache
+ )
+ scheduler_g.step()
+ scheduler_d.step()
+
+
+def train_and_evaluate(
+ rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers,cache
+):
+ net_g, net_d = nets
+ optim_g, optim_d = optims
+ train_loader, eval_loader = loaders
+ if writers is not None:
+ writer, writer_eval = writers
+
+ train_loader.batch_sampler.set_epoch(epoch)
+ global global_step
+
+ net_g.train()
+ net_d.train()
+    if cache == [] or hps.if_cache_data_in_gpu == False:  # first epoch (or caching disabled): iterate the DataLoader and fill the cache with the whole training set
+ # print("caching")
+ for batch_idx, info in enumerate(train_loader):
+            if (hps.if_f0 == 1):
+                phone, phone_lengths, pitch, pitchf, spec, spec_lengths, wave, wave_lengths, sid = info
+            else:
+                phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info
+            # move the whole batch onto this rank's GPU
+            phone, phone_lengths = phone.cuda(rank, non_blocking=True), phone_lengths.cuda(rank, non_blocking=True)
+            if (hps.if_f0 == 1):
+                pitch, pitchf = pitch.cuda(rank, non_blocking=True), pitchf.cuda(rank, non_blocking=True)
+            sid = sid.cuda(rank, non_blocking=True)
+            spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True)
+            wave, wave_lengths = wave.cuda(rank, non_blocking=True), wave_lengths.cuda(rank, non_blocking=True)
+            if (hps.if_cache_data_in_gpu == True):
+                # keep the GPU-resident batch so later epochs can skip the DataLoader
+                if (hps.if_f0 == 1):
+                    cache.append((batch_idx, (phone, phone_lengths, pitch, pitchf, spec, spec_lengths, wave, wave_lengths, sid)))
+                else:
+                    cache.append((batch_idx, (phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid)))
+ with autocast(enabled=hps.train.fp16_run):
+                if (hps.if_f0 == 1):
+                    y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid)
+                else:
+                    y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, spec, spec_lengths, sid)
+ mel = spec_to_mel_torch(spec,hps.data.filter_length,hps.data.n_mel_channels,hps.data.sampling_rate,hps.data.mel_fmin,hps.data.mel_fmax,)
+ y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length)
+ with autocast(enabled=False):
+ y_hat_mel = mel_spectrogram_torch(
+ y_hat.float().squeeze(1),
+ hps.data.filter_length,
+ hps.data.n_mel_channels,
+ hps.data.sampling_rate,
+ hps.data.hop_length,
+ hps.data.win_length,
+ hps.data.mel_fmin,
+ hps.data.mel_fmax,
+ )
+ if(hps.train.fp16_run==True):
+ y_hat_mel=y_hat_mel.half()
+ wave = commons.slice_segments(
+ wave, ids_slice * hps.data.hop_length, hps.train.segment_size
+ ) # slice
+
+ # Discriminator
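+            # y_hat is detached here so the discriminator update does not backpropagate into the generator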
+ y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
+ with autocast(enabled=False):
+ loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(
+ y_d_hat_r, y_d_hat_g
+ )
+ optim_d.zero_grad()
+ scaler.scale(loss_disc).backward()
+ scaler.unscale_(optim_d)
+ grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
+ scaler.step(optim_d)
+
+ with autocast(enabled=hps.train.fp16_run):
+ # Generator
+ y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
+ with autocast(enabled=False):
+ loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
+ loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
+ loss_fm = feature_loss(fmap_r, fmap_g)
+ loss_gen, losses_gen = generator_loss(y_d_hat_g)
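+                # total generator objective: adversarial + feature-matching + mel L1 + KL terms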
+ loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
+ optim_g.zero_grad()
+ scaler.scale(loss_gen_all).backward()
+ scaler.unscale_(optim_g)
+ grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
+ scaler.step(optim_g)
+ scaler.update()
+
+ if rank == 0:
+ if global_step % hps.train.log_interval == 0:
+ lr = optim_g.param_groups[0]["lr"]
+ logger.info(
+ "Train Epoch: {} [{:.0f}%]".format(
+ epoch, 100.0 * batch_idx / len(train_loader)
+ )
+ )
+                # clamp outliers so the TensorBoard curves stay readable (affects logging only)
+ if loss_mel > 50:
+ loss_mel = 50
+ if loss_kl > 5:
+ loss_kl = 5
+
+ logger.info([global_step, lr])
+ logger.info(
+ f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f},loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}"
+ )
+ scalar_dict = {
+ "loss/g/total": loss_gen_all,
+ "loss/d/total": loss_disc,
+ "learning_rate": lr,
+ "grad_norm_d": grad_norm_d,
+ "grad_norm_g": grad_norm_g,
+ }
+ scalar_dict.update(
+ {"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl}
+ )
+
+ scalar_dict.update(
+ {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
+ )
+ scalar_dict.update(
+ {"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}
+ )
+ scalar_dict.update(
+ {"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}
+ )
+ image_dict = {
+ "slice/mel_org": utils.plot_spectrogram_to_numpy(
+ y_mel[0].data.cpu().numpy()
+ ),
+ "slice/mel_gen": utils.plot_spectrogram_to_numpy(
+ y_hat_mel[0].data.cpu().numpy()
+ ),
+ "all/mel": utils.plot_spectrogram_to_numpy(
+ mel[0].data.cpu().numpy()
+ ),
+ }
+ utils.summarize(
+ writer=writer,
+ global_step=global_step,
+ images=image_dict,
+ scalars=scalar_dict,
+ )
+ global_step += 1
+ # if global_step % hps.train.eval_interval == 0:
+ if epoch % hps.save_every_epoch == 0:
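+            # if_latest == 0 keeps a separate checkpoint per save (named by global_step); otherwise a single rolling G_2333333/D_2333333 pair is overwritten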
+ if(hps.if_latest==0):
+ utils.save_checkpoint(
+ net_g,
+ optim_g,
+ hps.train.learning_rate,
+ epoch,
+ os.path.join(hps.model_dir, "G_{}.pth".format(global_step)),
+ )
+ utils.save_checkpoint(
+ net_d,
+ optim_d,
+ hps.train.learning_rate,
+ epoch,
+ os.path.join(hps.model_dir, "D_{}.pth".format(global_step)),
+ )
+ else:
+ utils.save_checkpoint(
+ net_g,
+ optim_g,
+ hps.train.learning_rate,
+ epoch,
+ os.path.join(hps.model_dir, "G_{}.pth".format(2333333)),
+ )
+ utils.save_checkpoint(
+ net_d,
+ optim_d,
+ hps.train.learning_rate,
+ epoch,
+ os.path.join(hps.model_dir, "D_{}.pth".format(2333333)),
+ )
+
+    else:  # later epochs iterate over the shuffled cache directly
+ shuffle(cache)
+ # print("using cache")
+ for batch_idx, info in cache:
+            if (hps.if_f0 == 1):
+                phone, phone_lengths, pitch, pitchf, spec, spec_lengths, wave, wave_lengths, sid = info
+            else:
+                phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info
+ with autocast(enabled=hps.train.fp16_run):
+                if (hps.if_f0 == 1):
+                    y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid)
+                else:
+                    y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, spec, spec_lengths, sid)
+ mel = spec_to_mel_torch(
+ spec,
+ hps.data.filter_length,
+ hps.data.n_mel_channels,
+ hps.data.sampling_rate,
+ hps.data.mel_fmin,
+ hps.data.mel_fmax,
+ )
+ y_mel = commons.slice_segments(
+ mel, ids_slice, hps.train.segment_size // hps.data.hop_length
+ )
+ with autocast(enabled=False):
+ y_hat_mel = mel_spectrogram_torch(
+ y_hat.float().squeeze(1),
+ hps.data.filter_length,
+ hps.data.n_mel_channels,
+ hps.data.sampling_rate,
+ hps.data.hop_length,
+ hps.data.win_length,
+ hps.data.mel_fmin,
+ hps.data.mel_fmax,
+ )
+ if(hps.train.fp16_run==True):
+ y_hat_mel=y_hat_mel.half()
+ wave = commons.slice_segments(
+ wave, ids_slice * hps.data.hop_length, hps.train.segment_size
+ ) # slice
+
+ # Discriminator
+ y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
+ with autocast(enabled=False):
+ loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(
+ y_d_hat_r, y_d_hat_g
+ )
+ optim_d.zero_grad()
+ scaler.scale(loss_disc).backward()
+ scaler.unscale_(optim_d)
+ grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
+ scaler.step(optim_d)
+
+ with autocast(enabled=hps.train.fp16_run):
+ # Generator
+ y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
+ with autocast(enabled=False):
+ loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
+ loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
+
+ loss_fm = feature_loss(fmap_r, fmap_g)
+ loss_gen, losses_gen = generator_loss(y_d_hat_g)
+ loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
+ optim_g.zero_grad()
+ scaler.scale(loss_gen_all).backward()
+ scaler.unscale_(optim_g)
+ grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
+ scaler.step(optim_g)
+ scaler.update()
+
+ if rank == 0:
+ if global_step % hps.train.log_interval == 0:
+ lr = optim_g.param_groups[0]["lr"]
+ logger.info(
+ "Train Epoch: {} [{:.0f}%]".format(
+ epoch, 100.0 * batch_idx / len(train_loader)
+ )
+ )
+                # clamp outliers so the TensorBoard curves stay readable (affects logging only)
+ if loss_mel > 50:
+ loss_mel = 50
+ if loss_kl > 5:
+ loss_kl = 5
+
+ logger.info([global_step, lr])
+ logger.info(
+ f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f},loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}"
+ )
+ scalar_dict = {
+ "loss/g/total": loss_gen_all,
+ "loss/d/total": loss_disc,
+ "learning_rate": lr,
+ "grad_norm_d": grad_norm_d,
+ "grad_norm_g": grad_norm_g,
+ }
+ scalar_dict.update(
+ {"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl}
+ )
+
+ scalar_dict.update(
+ {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
+ )
+ scalar_dict.update(
+ {"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}
+ )
+ scalar_dict.update(
+ {"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}
+ )
+ image_dict = {
+ "slice/mel_org": utils.plot_spectrogram_to_numpy(
+ y_mel[0].data.cpu().numpy()
+ ),
+ "slice/mel_gen": utils.plot_spectrogram_to_numpy(
+ y_hat_mel[0].data.cpu().numpy()
+ ),
+ "all/mel": utils.plot_spectrogram_to_numpy(
+ mel[0].data.cpu().numpy()
+ ),
+ }
+ utils.summarize(
+ writer=writer,
+ global_step=global_step,
+ images=image_dict,
+ scalars=scalar_dict,
+ )
+ global_step += 1
+ # if global_step % hps.train.eval_interval == 0:
+ if epoch % hps.save_every_epoch == 0:
+ if(hps.if_latest==0):
+ utils.save_checkpoint(
+ net_g,
+ optim_g,
+ hps.train.learning_rate,
+ epoch,
+ os.path.join(hps.model_dir, "G_{}.pth".format(global_step)),
+ )
+ utils.save_checkpoint(
+ net_d,
+ optim_d,
+ hps.train.learning_rate,
+ epoch,
+ os.path.join(hps.model_dir, "D_{}.pth".format(global_step)),
+ )
+ else:
+ utils.save_checkpoint(
+ net_g,
+ optim_g,
+ hps.train.learning_rate,
+ epoch,
+ os.path.join(hps.model_dir, "G_{}.pth".format(2333333)),
+ )
+ utils.save_checkpoint(
+ net_d,
+ optim_d,
+ hps.train.learning_rate,
+ epoch,
+ os.path.join(hps.model_dir, "D_{}.pth".format(2333333)),
+ )
+
+
+ if rank == 0:
+ logger.info("====> Epoch: {}".format(epoch))
+ if(epoch>=hps.total_epoch):
+ if rank == 0:
+ logger.info("Training is done. The program is closed.")
+ from process_ckpt import savee#def savee(ckpt,sr,if_f0,name,epoch):
+ if hasattr(net_g, 'module'):ckpt = net_g.module.state_dict()
+ else:ckpt = net_g.state_dict()
+ print("saving final ckpt:",savee(ckpt,hps.sample_rate,hps.if_f0,hps.name,epoch))
+ os._exit(2333333)
+
+
+if __name__ == "__main__":
+ main()