Add files via upload
This commit is contained in:
parent a7857f12ee
commit 2207b9647e
38
config.py
Normal file
@@ -0,0 +1,38 @@
############ Offline VC parameters
inp_root = r"白鹭霜华长条"  # every audio file under this input directory is converted; don't put non-audio files here
opt_root = r"opt"  # output directory
f0_up_key = 0  # pitch shift in semitones (integer); +12 for male-to-female, -12 for female-to-male
person = r"weights\洛天依v3.pt"  # currently only 洛天依v3 is available

############ Hardware parameters
device = "cuda:0"  # "cuda:x" or "cpu"; x is the GPU index; only NVIDIA GPUs are accelerated
is_half = True  # safe to leave True on 9/10/20/30/40-series GPUs; no quality loss, and 20-series or newer get a speedup
n_cpu = 0  # 0 (default) uses all threads; set a number to cap CPU usage

############ Don't touch anything below
import torch

if not torch.cuda.is_available():
    print("No supported NVIDIA GPU found, falling back to CPU inference")
    device = "cpu"
    is_half = False
if device != "cpu":
    gpu_name = torch.cuda.get_device_name(int(device.split(":")[-1]))
    if "16" in gpu_name or "MX" in gpu_name:
        print("16-series/MX-series GPUs are forced to single precision")
        is_half = False

from multiprocessing import cpu_count

if n_cpu == 0:
    n_cpu = cpu_count()
if is_half:
    # 6GB-VRAM configuration
    x_pad = 3
    x_query = 10
    x_center = 60
    x_max = 65
else:
    # 5GB-VRAM configuration
    x_pad = 1
    # x_query = 6
    # x_center = 30
    # x_max = 32
    # 6GB-VRAM configuration
    x_query = 6
    x_center = 38
    x_max = 41
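For reference, the other scripts in this commit consume these values by importing the module directly (infer-web.py does `from config import is_half, device`). A minimal illustrative consumer — only the imported names come from config.py, the prints are ours:

    # Illustrative sketch: how downstream code reads config.py.
    from config import device, is_half, n_cpu, x_pad, x_query, x_center, x_max

    print("inference device:", device)   # "cuda:0", or "cpu" after the fallback above
    print("fp16 enabled:", is_half)      # forced to False on CPU and on 16-series/MX GPUs
    print("cpu threads:", n_cpu)
    print("window params:", x_pad, x_query, x_center, x_max)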
120
extract_f0_print.py
Normal file
@@ -0,0 +1,120 @@
import os, traceback, sys, parselmouth
import librosa
import pyworld
from scipy.io import wavfile
import numpy as np, logging

logging.getLogger('numba').setLevel(logging.WARNING)
from multiprocessing import Process

exp_dir = sys.argv[1]
f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
def printt(strr):
    print(strr)
    f.write("%s\n" % strr)
    f.flush()

n_p = int(sys.argv[2])
f0method = sys.argv[3]

class FeatureInput(object):
    def __init__(self, samplerate=16000, hop_size=160):
        self.fs = samplerate
        self.hop = hop_size

        self.f0_bin = 256
        self.f0_max = 1100.0
        self.f0_min = 50.0
        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)

    def compute_f0(self, path, f0_method):
        x, sr = librosa.load(path, sr=self.fs)
        p_len = x.shape[0] // self.hop
        assert sr == self.fs
        if f0_method == "pm":
            time_step = 160 / 16000 * 1000
            f0_min = 50
            f0_max = 1100
            f0 = parselmouth.Sound(x, sr).to_pitch_ac(
                time_step=time_step / 1000, voicing_threshold=0.6,
                pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
            pad_size = (p_len - len(f0) + 1) // 2
            if pad_size > 0 or p_len - len(f0) - pad_size > 0:
                f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')
        elif f0_method == "harvest":
            f0, t = pyworld.harvest(
                x.astype(np.double),
                fs=sr,
                f0_ceil=1100,
                frame_period=1000 * self.hop / sr,
            )
            f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
        elif f0_method == "dio":
            f0, t = pyworld.dio(
                x.astype(np.double),
                fs=sr,
                f0_ceil=1100,
                frame_period=1000 * self.hop / sr,
            )
            f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
        return f0

    def coarse_f0(self, f0):
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
            self.f0_bin - 2
        ) / (self.f0_mel_max - self.f0_mel_min) + 1

        # use 0 or 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
        f0_coarse = np.rint(f0_mel).astype(int)
        assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
            f0_coarse.max(),
            f0_coarse.min(),
        )
        return f0_coarse

    def go(self, paths, f0_method):
        if len(paths) == 0:
            printt("no-f0-todo")
        else:
            printt("todo-f0-%s" % len(paths))
            n = max(len(paths) // 5, 1)  # each process prints at most 5 progress lines
            for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
                try:
                    if idx % n == 0:
                        printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path))
                    if os.path.exists(opt_path1 + ".npy") and os.path.exists(opt_path2 + ".npy"):
                        continue
                    featur_pit = self.compute_f0(inp_path, f0_method)
                    np.save(opt_path2, featur_pit, allow_pickle=False)  # nsf
                    coarse_pit = self.coarse_f0(featur_pit)
                    np.save(opt_path1, coarse_pit, allow_pickle=False)  # ori
                except:
                    printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc()))

if __name__ == '__main__':
    # exp_dir = r"E:\codes\py39\dataset\mi-test"
    # n_p = 16
    # f = open("%s/log_extract_f0.log" % exp_dir, "w")
    printt(sys.argv)
    featureInput = FeatureInput()
    paths = []
    inp_root = "%s/1_16k_wavs" % (exp_dir)
    opt_root1 = "%s/2a_f0" % (exp_dir)
    opt_root2 = "%s/2b-f0nsf" % (exp_dir)

    os.makedirs(opt_root1, exist_ok=True)
    os.makedirs(opt_root2, exist_ok=True)
    for name in sorted(list(os.listdir(inp_root))):
        inp_path = "%s/%s" % (inp_root, name)
        if "spec" in inp_path:
            continue
        opt_path1 = "%s/%s" % (opt_root1, name)
        opt_path2 = "%s/%s" % (opt_root2, name)
        paths.append([inp_path, opt_path1, opt_path2])

    ps = []
    for i in range(n_p):
        p = Process(target=featureInput.go, args=(paths[i::n_p], f0method))
        p.start()
        ps.append(p)
    for p in ps:
        p.join()
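infer-web.py launches this script as `python extract_f0_print.py <exp_dir> <n_processes> <pm|harvest|dio>`. The coarse_f0() step compresses F0 in Hz onto mel-spaced bins via f0_mel = 1127·ln(1 + f0/700), rescaled so f0_min lands on bin 1 and f0_max on bin 255, with unvoiced frames (f0 = 0) clamped to bin 1. A standalone sanity check of that mapping (illustrative, not part of the commit):

    # Standalone sketch: verify the mel-bin quantization used by coarse_f0().
    import numpy as np

    f0_min, f0_max, f0_bin = 50.0, 1100.0, 256
    mel_min = 1127 * np.log(1 + f0_min / 700)
    mel_max = 1127 * np.log(1 + f0_max / 700)

    f0 = np.array([0.0, 50.0, 220.0, 440.0, 1100.0])  # Hz; 0 = unvoiced
    mel = 1127 * np.log(1 + f0 / 700)
    mel[mel > 0] = (mel[mel > 0] - mel_min) * (f0_bin - 2) / (mel_max - mel_min) + 1
    mel[mel <= 1] = 1
    mel[mel > f0_bin - 1] = f0_bin - 1
    print(np.rint(mel).astype(int))  # unvoiced -> 1, f0_min -> 1, f0_max -> 255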
84
extract_feature_print.py
Normal file
@@ -0,0 +1,84 @@
import os, sys, traceback

n_part = int(sys.argv[1])
i_part = int(sys.argv[2])
i_gpu = sys.argv[3]
exp_dir = sys.argv[4]
os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu)

import torch
import torch.nn.functional as F
import soundfile as sf
import numpy as np
import joblib
from fairseq import checkpoint_utils
import pdb

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
def printt(strr):
    print(strr)
    f.write("%s\n" % strr)
    f.flush()
printt(sys.argv)

# model_path = "/bili-coeus/jupyter/jupyterhub-liujing04/speech/pretrain/ContentVec_legacy500.pt"
model_path = "hubert_base.pt"

printt(exp_dir)
wavPath = "%s/1_16k_wavs" % exp_dir
outPath = "%s/3_feature256" % exp_dir
os.makedirs(outPath, exist_ok=True)

# wave must be 16k, hop_size=320
def readwave(wav_path, normalize=False):
    wav, sr = sf.read(wav_path)
    assert sr == 16000
    feats = torch.from_numpy(wav).float()
    if feats.dim() == 2:  # stereo: average the two channels
        feats = feats.mean(-1)
    assert feats.dim() == 1, feats.dim()
    if normalize:
        with torch.no_grad():
            feats = F.layer_norm(feats, feats.shape)
    feats = feats.view(1, -1)
    return feats

# HuBERT model
printt("load model(s) from {}".format(model_path))
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
    [model_path],
    suffix="",
)
model = models[0]
model = model.to(device)
model = model.half()
model.eval()

todo = sorted(list(os.listdir(wavPath)))[i_part::n_part]
n = max(1, len(todo) // 10)  # print at most ten progress lines
if len(todo) == 0:
    printt("no-feature-todo")
else:
    printt("all-feature-%s" % len(todo))
    for idx, file in enumerate(todo):
        try:
            if file.endswith(".wav"):
                wav_path = "%s/%s" % (wavPath, file)
                out_path = "%s/%s" % (outPath, file.replace("wav", "npy"))

                if os.path.exists(out_path):
                    continue

                feats = readwave(wav_path, normalize=saved_cfg.task.normalize)
                padding_mask = torch.BoolTensor(feats.shape).fill_(False)
                inputs = {
                    "source": feats.half().to(device),
                    "padding_mask": padding_mask.to(device),
                    "output_layer": 9,  # layer 9
                }
                with torch.no_grad():
                    logits = model.extract_features(**inputs)
                    feats = model.final_proj(logits[0])

                feats = feats.squeeze(0).float().cpu().numpy()
                # feats = np.repeat(feats, 2, 0)  # 20ms -> 10ms
                np.save(out_path, feats, allow_pickle=False)
                if idx % n == 0:
                    printt("now-%s,all-%s,%s,%s" % (idx, len(todo), file, feats.shape))
        except:
            printt(traceback.format_exc())
    printt("all-feature-done")
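Work is sharded by slicing the sorted file list with todo[i_part::n_part], so each worker processes an interleaved subset of the wavs while CUDA_VISIBLE_DEVICES pins it to one GPU. infer-web.py spawns one such process per GPU; an equivalent manual launch for two GPUs could look like this (the experiment path is illustrative):

    python extract_feature_print.py 2 0 0 logs/mi-test
    python extract_feature_print.py 2 1 1 logs/mi-test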
630
infer-web.py
Normal file
@@ -0,0 +1,630 @@
from multiprocessing import cpu_count
import threading
from time import sleep
from subprocess import Popen
import torch, pdb, os, traceback, sys, warnings, shutil, numpy as np, faiss

# Check for NVIDIA GPUs usable for training and accelerated inference
ncpu = cpu_count()
ngpu = torch.cuda.device_count()
gpu_infos = []
if not torch.cuda.is_available() or ngpu == 0:
    if_gpu_ok = False
else:
    if_gpu_ok = False
    for i in range(ngpu):
        gpu_name = torch.cuda.get_device_name(i)
        if "16" in gpu_name or "MX" in gpu_name:
            continue
        if "10" in gpu_name or "20" in gpu_name or "30" in gpu_name or "40" in gpu_name or "A50" in gpu_name.upper() or "70" in gpu_name or "80" in gpu_name or "90" in gpu_name or "M4" in gpu_name or "T4" in gpu_name or "TITAN" in gpu_name.upper():  # A10#A100#V100#A40#P40#M40#K80
            if_gpu_ok = True  # at least one usable NVIDIA GPU
            gpu_infos.append("%s\t%s" % (i, gpu_name))
gpu_info = "\n".join(gpu_infos) if if_gpu_ok and len(gpu_infos) > 0 else "Unfortunately, no GPU usable for training was found"
gpus = "-".join([i[0] for i in gpu_infos])
now_dir = os.getcwd()
sys.path.append(now_dir)
tmp = os.path.join(now_dir, "TEMP")
shutil.rmtree(tmp, ignore_errors=True)
os.makedirs(tmp, exist_ok=True)
os.makedirs(os.path.join(now_dir, "logs"), exist_ok=True)
os.makedirs(os.path.join(now_dir, "weights"), exist_ok=True)
os.environ["TEMP"] = tmp
warnings.filterwarnings("ignore")
torch.manual_seed(114514)
from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
from scipy.io import wavfile
from fairseq import checkpoint_utils
import gradio as gr
import librosa
import logging
from vc_infer_pipeline import VC
import soundfile as sf
from config import is_half, device
from infer_uvr5 import _audio_pre_
from my_utils import load_audio
from train.process_ckpt import show_info, change_info, merge, extract_small_model
# from trainset_preprocess_pipeline import PreProcess
logging.getLogger('numba').setLevel(logging.WARNING)


class ToolButton(gr.Button, gr.components.FormComponent):
    """Small button with single emoji as text, fits inside gradio forms"""
    def __init__(self, **kwargs):
        super().__init__(variant="tool", **kwargs)

    def get_block_name(self):
        return "button"


hubert_model = None
def load_hubert():
    global hubert_model
    models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(["hubert_base.pt"], suffix="")
    hubert_model = models[0]
    hubert_model = hubert_model.to(device)
    if is_half:
        hubert_model = hubert_model.half()
    else:
        hubert_model = hubert_model.float()
    hubert_model.eval()


weight_root = "weights"
weight_uvr5_root = "uvr5_weights"
names = []
for name in os.listdir(weight_root):
    names.append(name)
uvr5_names = []
for name in os.listdir(weight_uvr5_root):
    uvr5_names.append(name.replace(".pth", ""))


def vc_single(sid, input_audio, f0_up_key, f0_file, f0_method, file_index, file_big_npy, index_rate):  # spk_item, input_audio0, vc_transform0, f0_file, f0method0
    global tgt_sr, net_g, vc, hubert_model
    if input_audio is None:
        return "You need to upload an audio", None
    f0_up_key = int(f0_up_key)
    try:
        audio = load_audio(input_audio, 16000)
        times = [0, 0, 0]
        if hubert_model is None:
            load_hubert()
        if_f0 = cpt.get("f0", 1)
        audio_opt = vc.pipeline(hubert_model, net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file)
        print(times)
        return "Success", (tgt_sr, audio_opt)
    except:
        info = traceback.format_exc()
        print(info)
        return info, (None, None)


def vc_multi(sid, dir_path, opt_root, paths, f0_up_key, f0_method, file_index, file_big_npy, index_rate):
    try:
        dir_path = dir_path.strip(" ")  # guard against pasted paths with leading/trailing spaces
        opt_root = opt_root.strip(" ")
        os.makedirs(opt_root, exist_ok=True)
        try:
            if dir_path != "":
                paths = [os.path.join(dir_path, name) for name in os.listdir(dir_path)]
            else:
                paths = [path.name for path in paths]
        except:
            traceback.print_exc()
            paths = [path.name for path in paths]
        infos = []
        for path in paths:
            info, opt = vc_single(sid, path, f0_up_key, None, f0_method, file_index, file_big_npy, index_rate)
            if info == "Success":
                try:
                    tgt_sr, audio_opt = opt
                    wavfile.write("%s/%s" % (opt_root, os.path.basename(path)), tgt_sr, audio_opt)
                except:
                    info = traceback.format_exc()
            infos.append("%s->%s" % (os.path.basename(path), info))
            yield "\n".join(infos)
        yield "\n".join(infos)
    except:
        yield traceback.format_exc()


def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins):
    infos = []
    try:
        inp_root = inp_root.strip(" ").strip("\n")
        save_root_vocal = save_root_vocal.strip(" ").strip("\n")
        save_root_ins = save_root_ins.strip(" ").strip("\n")
        pre_fun = _audio_pre_(model_path=os.path.join(weight_uvr5_root, model_name + ".pth"), device=device, is_half=is_half)
        if inp_root != "":
            paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)]
        else:
            paths = [path.name for path in paths]
        for name in paths:
            inp_path = os.path.join(inp_root, name)
            try:
                pre_fun._path_audio_(inp_path, save_root_ins, save_root_vocal)
                infos.append("%s->Success" % (os.path.basename(inp_path)))
                yield "\n".join(infos)
            except:
                infos.append("%s->%s" % (os.path.basename(inp_path), traceback.format_exc()))
                yield "\n".join(infos)
    except:
        infos.append(traceback.format_exc())
        yield "\n".join(infos)
    finally:
        try:
            del pre_fun.model
            del pre_fun
        except:
            traceback.print_exc()
        print("clean_empty_cache")
        torch.cuda.empty_cache()
    yield "\n".join(infos)


# Only one voice can be loaded globally per tab
def get_vc(sid):
    global n_spk, tgt_sr, net_g, vc, cpt
    if sid == "":
        global hubert_model
        print("clean_empty_cache")
        del net_g, n_spk, vc, hubert_model, tgt_sr  # ,cpt
        hubert_model = net_g = n_spk = vc = tgt_sr = None
        torch.cuda.empty_cache()
        ### without the convoluted block below, the memory is not fully released
        if_f0 = cpt.get("f0", 1)
        if if_f0 == 1:
            net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
        else:
            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
        del net_g, cpt
        torch.cuda.empty_cache()
        cpt = None
        return {"visible": False, "__type__": "update"}
    person = "%s/%s" % (weight_root, sid)
    print("loading %s" % person)
    cpt = torch.load(person, map_location="cpu")
    tgt_sr = cpt["config"][-1]
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
    if_f0 = cpt.get("f0", 1)
    if if_f0 == 1:
        net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
    else:
        net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
    del net_g.enc_q
    print(net_g.load_state_dict(cpt["weight"], strict=False))  # oddly, this print is required for a clean load
    net_g.eval().to(device)
    if is_half:
        net_g = net_g.half()
    else:
        net_g = net_g.float()
    vc = VC(tgt_sr, device, is_half)
    n_spk = cpt["config"][-3]
    return {"visible": True, "maximum": n_spk, "__type__": "update"}


def change_choices():
    return {"choices": sorted(list(os.listdir(weight_root))), "__type__": "update"}

def clean():
    return {"value": "", "__type__": "update"}

def change_f0(if_f0_3, sr2):  # np7, f0method8, pretrained_G14, pretrained_D15
    if if_f0_3 == "是":  # "是"/"否" (yes/no) are kept as literal values; the UI radios use them
        return {"visible": True, "__type__": "update"}, {"visible": True, "__type__": "update"}, "pretrained/f0G%s.pth" % sr2, "pretrained/f0D%s.pth" % sr2
    return {"visible": False, "__type__": "update"}, {"visible": False, "__type__": "update"}, "pretrained/G%s.pth" % sr2, "pretrained/D%s.pth" % sr2


sr_dict = {
    "32k": 32000,
    "40k": 40000,
    "48k": 48000,
}


def if_done(done, p):
    while 1:
        if p.poll() is None:
            sleep(0.5)
        else:
            break
    done[0] = True


def if_done_multi(done, ps):
    while 1:
        # poll() returning None means a process has not finished yet;
        # keep waiting as long as any process is still running
        flag = 1
        for p in ps:
            if p.poll() is None:
                flag = 0
                sleep(0.5)
                break
        if flag == 1:
            break
    done[0] = True


def preprocess_dataset(trainset_dir, exp_dir, sr, n_p=ncpu):
    sr = sr_dict[sr]
    os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True)
    f = open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "w")
    f.close()
    cmd = "python trainset_preprocess_pipeline_print.py %s %s %s %s/logs/%s" % (trainset_dir, sr, n_p, now_dir, exp_dir)
    print(cmd)
    p = Popen(cmd, shell=True)  # , stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
    ### Under gradio, reading the subprocess stdout blocks until the process exits
    ### instead of streaming line by line, so a log file is polled on a timer instead
    done = [False]
    threading.Thread(target=if_done, args=(done, p)).start()
    while 1:
        with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f:
            yield (f.read())
        sleep(1)
        if done[0]:
            break
    with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f:
        log = f.read()
    print(log)
    yield log

# but2.click(extract_f0, [gpus6, np7, f0method8, if_f0_3, trainset_dir4], [info2])
def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir):
    gpus = gpus.split("-")
    os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True)
    f = open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "w")
    f.close()
    if if_f0 == "是":
        cmd = "python extract_f0_print.py %s/logs/%s %s %s" % (now_dir, exp_dir, n_p, f0method)
        print(cmd)
        p = Popen(cmd, shell=True, cwd=now_dir)  # , stdin=PIPE, stdout=PIPE, stderr=PIPE
        ### poll the log file on a timer (see the note in preprocess_dataset)
        done = [False]
        threading.Thread(target=if_done, args=(done, p)).start()
        while 1:
            with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
                yield (f.read())
            sleep(1)
            if done[0]:
                break
        with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
            log = f.read()
        print(log)
        yield log
    #### launch one feature-extraction process per GPU part
    '''
    n_part=int(sys.argv[1])
    i_part=int(sys.argv[2])
    i_gpu=sys.argv[3]
    exp_dir=sys.argv[4]
    os.environ["CUDA_VISIBLE_DEVICES"]=str(i_gpu)
    '''
    leng = len(gpus)
    ps = []
    for idx, n_g in enumerate(gpus):
        cmd = "python extract_feature_print.py %s %s %s %s/logs/%s" % (leng, idx, n_g, now_dir, exp_dir)
        print(cmd)
        p = Popen(cmd, shell=True, cwd=now_dir)  # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
        ps.append(p)
    ### poll the log file on a timer (see the note in preprocess_dataset)
    done = [False]
    threading.Thread(target=if_done_multi, args=(done, ps)).start()
    while 1:
        with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
            yield (f.read())
        sleep(1)
        if done[0]:
            break
    with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
        log = f.read()
    print(log)
    yield log

def change_sr2(sr2, if_f0_3):
    if if_f0_3 == "是":
        return "pretrained/f0G%s.pth" % sr2, "pretrained/f0D%s.pth" % sr2
    else:
        return "pretrained/G%s.pth" % sr2, "pretrained/D%s.pth" % sr2

# but3.click(click_train, [exp_dir1, sr2, if_f0_3, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16])
def click_train(exp_dir1, sr2, if_f0_3, spk_id5, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17):
    # generate the filelist
    exp_dir = "%s/logs/%s" % (now_dir, exp_dir1)
    os.makedirs(exp_dir, exist_ok=True)
    gt_wavs_dir = "%s/0_gt_wavs" % (exp_dir)
    co256_dir = "%s/3_feature256" % (exp_dir)
    if if_f0_3 == "是":
        f0_dir = "%s/2a_f0" % (exp_dir)
        f0nsf_dir = "%s/2b-f0nsf" % (exp_dir)
        names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set([name.split(".")[0] for name in os.listdir(co256_dir)]) & set([name.split(".")[0] for name in os.listdir(f0_dir)]) & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)])
    else:
        names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set([name.split(".")[0] for name in os.listdir(co256_dir)])
    opt = []
    for name in names:
        if if_f0_3 == "是":
            opt.append("%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s" % (gt_wavs_dir.replace("\\", "\\\\"), name, co256_dir.replace("\\", "\\\\"), name, f0_dir.replace("\\", "\\\\"), name, f0nsf_dir.replace("\\", "\\\\"), name, spk_id5))
        else:
            opt.append("%s/%s.wav|%s/%s.npy|%s" % (gt_wavs_dir.replace("\\", "\\\\"), name, co256_dir.replace("\\", "\\\\"), name, spk_id5))
    with open("%s/filelist.txt" % exp_dir, "w") as f:
        f.write("\n".join(opt))
    print("write filelist done")
    # no config file needs to be generated
    # cmd = "python train_nsf_sim_cache_sid_load_pretrain.py -e mi-test -sr 40k -f0 1 -bs 4 -g 0 -te 10 -se 5 -pg pretrained/f0G40k.pth -pd pretrained/f0D40k.pth -l 1 -c 0"
    cmd = "python train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s -pg %s -pd %s -l %s -c %s" % (exp_dir1, sr2, 1 if if_f0_3 == "是" else 0, batch_size12, gpus16, total_epoch11, save_epoch10, pretrained_G14, pretrained_D15, 1 if if_save_latest13 == "是" else 0, 1 if if_cache_gpu17 == "是" else 0)
    print(cmd)
    p = Popen(cmd, shell=True, cwd=now_dir)
    p.wait()
    return "Training finished. Check the console log or train.log in the experiment folder."

# but4.click(train_index, [exp_dir1], info3)
def train_index(exp_dir1):
    exp_dir = "%s/logs/%s" % (now_dir, exp_dir1)
    os.makedirs(exp_dir, exist_ok=True)
    feature_dir = "%s/3_feature256" % (exp_dir)
    if not os.path.exists(feature_dir):
        return "Please run feature extraction first!"
    listdir_res = list(os.listdir(feature_dir))
    if len(listdir_res) == 0:
        return "Please run feature extraction first!"
    npys = []
    for name in sorted(listdir_res):
        phone = np.load("%s/%s" % (feature_dir, name))
        npys.append(phone)
    big_npy = np.concatenate(npys, 0)
    np.save("%s/total_fea.npy" % exp_dir, big_npy)
    n_ivf = big_npy.shape[0] // 39
    infos = []
    infos.append("%s,%s" % (big_npy.shape, n_ivf))
    yield "\n".join(infos)
    index = faiss.index_factory(256, "IVF%s,Flat" % n_ivf)
    infos.append("training")
    yield "\n".join(infos)
    index_ivf = faiss.extract_index_ivf(index)  #
    index_ivf.nprobe = int(np.power(n_ivf, 0.3))
    index.train(big_npy)
    faiss.write_index(index, '%s/trained_IVF%s_Flat_nprobe_%s.index' % (exp_dir, n_ivf, index_ivf.nprobe))
    infos.append("adding")
    yield "\n".join(infos)
    index.add(big_npy)
    faiss.write_index(index, '%s/added_IVF%s_Flat_nprobe_%s.index' % (exp_dir, n_ivf, index_ivf.nprobe))
    infos.append("Index built successfully: added_IVF%s_Flat_nprobe_%s.index" % (n_ivf, index_ivf.nprobe))
    yield "\n".join(infos)

# but5.click(train1key, [exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17], info3)
def train1key(exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17):
    infos = []
    def get_info_str(strr):
        infos.append(strr)
        return "\n".join(infos)
    os.makedirs("%s/logs/%s" % (now_dir, exp_dir1), exist_ok=True)
    ######### step1: process the data
    open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir1), "w").close()
    cmd = "python trainset_preprocess_pipeline_print.py %s %s %s %s/logs/%s" % (trainset_dir4, sr_dict[sr2], ncpu, now_dir, exp_dir1)
    yield get_info_str("step1: processing data")
    yield get_info_str(cmd)
    p = Popen(cmd, shell=True)
    p.wait()
    with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir1), "r") as f:
        print(f.read())
    ######### step2a: extract pitch
    open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir1), "w")
    if if_f0_3 == "是":
        yield get_info_str("step2a: extracting pitch")
        cmd = "python extract_f0_print.py %s/logs/%s %s %s" % (now_dir, exp_dir1, np7, f0method8)
        yield get_info_str(cmd)
        p = Popen(cmd, shell=True, cwd=now_dir)
        p.wait()
        with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir1), "r") as f:
            print(f.read())
    else:
        yield get_info_str("step2a: no pitch extraction needed")
    ####### step2b: extract features
    yield get_info_str("step2b: extracting features")
    gpus = gpus16.split("-")
    leng = len(gpus)
    ps = []
    for idx, n_g in enumerate(gpus):
        cmd = "python extract_feature_print.py %s %s %s %s/logs/%s" % (leng, idx, n_g, now_dir, exp_dir1)
        yield get_info_str(cmd)
        p = Popen(cmd, shell=True, cwd=now_dir)  # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
        ps.append(p)
    for p in ps:
        p.wait()
    with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir1), "r") as f:
        print(f.read())
    ####### step3a: train the model
    yield get_info_str("step3a: training the model")
    # generate the filelist
    exp_dir = "%s/logs/%s" % (now_dir, exp_dir1)
    gt_wavs_dir = "%s/0_gt_wavs" % (exp_dir)
    co256_dir = "%s/3_feature256" % (exp_dir)
    if if_f0_3 == "是":
        f0_dir = "%s/2a_f0" % (exp_dir)
        f0nsf_dir = "%s/2b-f0nsf" % (exp_dir)
        names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set([name.split(".")[0] for name in os.listdir(co256_dir)]) & set([name.split(".")[0] for name in os.listdir(f0_dir)]) & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)])
    else:
        names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set([name.split(".")[0] for name in os.listdir(co256_dir)])
    opt = []
    for name in names:
        if if_f0_3 == "是":
            opt.append("%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s" % (gt_wavs_dir.replace("\\", "\\\\"), name, co256_dir.replace("\\", "\\\\"), name, f0_dir.replace("\\", "\\\\"), name, f0nsf_dir.replace("\\", "\\\\"), name, spk_id5))
        else:
            opt.append("%s/%s.wav|%s/%s.npy|%s" % (gt_wavs_dir.replace("\\", "\\\\"), name, co256_dir.replace("\\", "\\\\"), name, spk_id5))
    with open("%s/filelist.txt" % exp_dir, "w") as f:
        f.write("\n".join(opt))
    yield get_info_str("write filelist done")
    cmd = "python train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s -pg %s -pd %s -l %s -c %s" % (exp_dir1, sr2, 1 if if_f0_3 == "是" else 0, batch_size12, gpus16, total_epoch11, save_epoch10, pretrained_G14, pretrained_D15, 1 if if_save_latest13 == "是" else 0, 1 if if_cache_gpu17 == "是" else 0)
    yield get_info_str(cmd)
    p = Popen(cmd, shell=True, cwd=now_dir)
    p.wait()
    yield get_info_str("Training finished. Check the console log or train.log in the experiment folder.")
    ####### step3b: train the index
    feature_dir = "%s/3_feature256" % (exp_dir)
    npys = []
    listdir_res = list(os.listdir(feature_dir))
    for name in sorted(listdir_res):
        phone = np.load("%s/%s" % (feature_dir, name))
        npys.append(phone)
    big_npy = np.concatenate(npys, 0)
    np.save("%s/total_fea.npy" % exp_dir, big_npy)
    n_ivf = big_npy.shape[0] // 39
    yield get_info_str("%s,%s" % (big_npy.shape, n_ivf))
    index = faiss.index_factory(256, "IVF%s,Flat" % n_ivf)
    yield get_info_str("training index")
    index_ivf = faiss.extract_index_ivf(index)  #
    index_ivf.nprobe = int(np.power(n_ivf, 0.3))
    index.train(big_npy)
    faiss.write_index(index, '%s/trained_IVF%s_Flat_nprobe_%s.index' % (exp_dir, n_ivf, index_ivf.nprobe))
    yield get_info_str("adding index")
    index.add(big_npy)
    faiss.write_index(index, '%s/added_IVF%s_Flat_nprobe_%s.index' % (exp_dir, n_ivf, index_ivf.nprobe))
    yield get_info_str("Index built successfully: added_IVF%s_Flat_nprobe_%s.index" % (n_ivf, index_ivf.nprobe))
    yield get_info_str("Full pipeline finished!")


# ckpt_path2.change(change_info_, [ckpt_path2], [sr__, if_f0__])
def change_info_(ckpt_path):
    if not os.path.exists(ckpt_path.replace(os.path.basename(ckpt_path), "train.log")):
        return {"__type__": "update"}, {"__type__": "update"}
    try:
        with open(ckpt_path.replace(os.path.basename(ckpt_path), "train.log"), "r") as f:
            info = eval(f.read().strip("\n").split("\n")[0].split("\t")[-1])
            sr, f0 = info["sample_rate"], info["if_f0"]
            return sr, str(f0)
    except:
        traceback.print_exc()
        return {"__type__": "update"}, {"__type__": "update"}


with gr.Blocks() as app:
    gr.Markdown(value="""
        This software is open source under the MIT license. The author has no control over the software; those who use it, or distribute audio exported with it, bear full responsibility. <br>
        If you do not accept these terms, you may not use or reference any code or file inside the package. See "使用需遵守的协议-LICENSE.txt" in the root directory for details.
        """)
    with gr.Tabs():
        with gr.TabItem("Model inference"):
            with gr.Row():
                sid0 = gr.Dropdown(label="Inference voice", choices=names)
                refresh_button = gr.Button("Refresh voice list", variant="primary")
                refresh_button.click(
                    fn=change_choices,
                    inputs=[],
                    outputs=[sid0]
                )
                clean_button = gr.Button("Unload voice to save VRAM", variant="primary")
                spk_item = gr.Slider(minimum=0, maximum=2333, step=1, label="Select speaker id", value=0, visible=False, interactive=True)
                clean_button.click(
                    fn=clean,
                    inputs=[],
                    outputs=[sid0]
                )
                sid0.change(
                    fn=get_vc,
                    inputs=[sid0],
                    outputs=[spk_item],
                )
            with gr.Group():
                gr.Markdown(value="""
                    +12 semitones is recommended for male-to-female conversion and -12 for female-to-male. If the pitch range explodes and the timbre distorts, adjust it to a suitable range yourself.
                    """)
                with gr.Row():
                    with gr.Column():
                        vc_transform0 = gr.Number(label="Transpose (integer, semitones; +12 = up one octave, -12 = down one octave)", value=0)
                        input_audio0 = gr.Textbox(label="Path of the audio file to process (the default shows a correctly formatted example)", value="E:\codes\py39\\vits_vc_gpu_train\\todo-songs\冬之花clip1.wav")
                        f0method0 = gr.Radio(label="Pitch extraction algorithm: pm is faster for singing input; harvest handles low pitch better but is extremely slow", choices=["pm", "harvest"], value="pm", interactive=True)
                    with gr.Column():
                        file_index1 = gr.Textbox(label="Path of the feature index file", value="E:\codes\py39\\vits_vc_gpu_train\logs\mi-test-1key\\added_IVF677_Flat_nprobe_7.index", interactive=True)
                        file_big_npy1 = gr.Textbox(label="Path of the feature file", value="E:\codes\py39\\vits_vc_gpu_train\logs\mi-test-1key\\total_fea.npy", interactive=True)
                        index_rate1 = gr.Slider(minimum=0, maximum=1, label="Retrieval feature ratio", value=1, interactive=True)
                    f0_file = gr.File(label="Optional F0 curve file, one pitch per line; replaces the default F0 and transposition")
                    but0 = gr.Button("Convert", variant="primary")
                    with gr.Column():
                        vc_output1 = gr.Textbox(label="Output information")
                        vc_output2 = gr.Audio(label="Output audio (click the three dots at the bottom right to download)")
                but0.click(vc_single, [spk_item, input_audio0, vc_transform0, f0_file, f0method0, file_index1, file_big_npy1, index_rate1], [vc_output1, vc_output2])
            with gr.Group():
                gr.Markdown(value="""
                    Batch conversion: enter a folder of audio files to convert, or upload multiple audio files; the converted audio is written to the specified folder (default: opt).
                    """)
                with gr.Row():
                    with gr.Column():
                        vc_transform1 = gr.Number(label="Transpose (integer, semitones; +12 = up one octave, -12 = down one octave)", value=0)
                        opt_input = gr.Textbox(label="Output folder", value="opt")
                        f0method1 = gr.Radio(label="Pitch extraction algorithm: pm is faster for singing input; harvest handles low pitch better but is extremely slow", choices=["pm", "harvest"], value="pm", interactive=True)
                    with gr.Column():
                        file_index2 = gr.Textbox(label="Path of the feature index file", value="E:\codes\py39\\vits_vc_gpu_train\logs\mi-test-1key\\added_IVF677_Flat_nprobe_7.index", interactive=True)
                        file_big_npy2 = gr.Textbox(label="Path of the feature file", value="E:\codes\py39\\vits_vc_gpu_train\logs\mi-test-1key\\total_fea.npy", interactive=True)
                        index_rate2 = gr.Slider(minimum=0, maximum=1, label="Retrieval feature ratio", value=1, interactive=True)
                    with gr.Column():
                        dir_input = gr.Textbox(label="Path of the folder with audio to process (just copy it from the file manager address bar)", value="E:\codes\py39\\vits_vc_gpu_train\\todo-songs")
                        inputs = gr.File(file_count="multiple", label="Alternatively, batch-upload audio files; choose one of the two, the folder takes priority")
                    but1 = gr.Button("Convert", variant="primary")
                    vc_output3 = gr.Textbox(label="Output information")
                but1.click(vc_multi, [spk_item, dir_input, opt_input, inputs, vc_transform1, f0method1, file_index2, file_big_npy2, index_rate2], [vc_output3])
        with gr.TabItem("Vocal/instrumental separation"):
            with gr.Group():
                gr.Markdown(value="""
                    Batch vocal/instrumental separation using UVR5 models.<br>
                    Use HP2 for tracks without harmonies; use HP5 when harmonies are present and the extracted vocal should not keep them.<br>
                    Example of a valid folder path: E:\codes\py39\\vits_vc_gpu\白鹭霜华测试样例 (just copy it from the file manager address bar)
                    """)
                with gr.Row():
                    with gr.Column():
                        dir_wav_input = gr.Textbox(label="Path of the folder with audio to process", value="E:\codes\py39\\vits_vc_gpu_train\\todo-songs")
                        wav_inputs = gr.File(file_count="multiple", label="Alternatively, batch-upload audio files; choose one of the two, the folder takes priority")
                    with gr.Column():
                        model_choose = gr.Dropdown(label="Model", choices=uvr5_names)
                        opt_vocal_root = gr.Textbox(label="Output folder for vocals", value="opt")
                        opt_ins_root = gr.Textbox(label="Output folder for instrumentals", value="opt")
                    but2 = gr.Button("Convert", variant="primary")
                    vc_output4 = gr.Textbox(label="Output information")
                but2.click(uvr, [model_choose, dir_wav_input, opt_vocal_root, wav_inputs, opt_ins_root], [vc_output4])
        with gr.TabItem("Training"):
            gr.Markdown(value="""
                step1: fill in the experiment configuration. Experiment data lives under logs, one folder per experiment; enter the experiment name manually. Each folder holds the experiment config, logs, and trained model files.
                """)
            with gr.Row():
                exp_dir1 = gr.Textbox(label="Experiment name", value="mi-test")
                sr2 = gr.Radio(label="Target sample rate", choices=["32k", "40k", "48k"], value="40k", interactive=True)
                if_f0_3 = gr.Radio(label="Whether the model uses pitch guidance (required for singing; optional for speech)", choices=["是", "否"], value="是", interactive=True)
            with gr.Group():  # single-speaker for now; up to 4 speakers planned  # data processing
                gr.Markdown(value="""
                    step2a: automatically walk the training folder, slice and normalize every file that can be decoded as audio, and generate 2 wav folders in the experiment directory; only single-speaker training is supported for now.
                    """)
                with gr.Row():
                    trainset_dir4 = gr.Textbox(label="Training folder path", value="E:\语音音频+标注\米津玄师\src")
                    spk_id5 = gr.Slider(minimum=0, maximum=4, step=1, label="Speaker id", value=0, interactive=True)
                    but1 = gr.Button("Process data", variant="primary")
                    info1 = gr.Textbox(label="Output information", value="")
                    but1.click(preprocess_dataset, [trainset_dir4, exp_dir1, sr2], [info1])
            with gr.Group():
                gr.Markdown(value="""
                    step2b: extract pitch on the CPU (if the model uses pitch) and extract features on the GPU (choose card indices)
                    """)
                with gr.Row():
                    with gr.Column():
                        gpus6 = gr.Textbox(label="GPU indices separated by '-', e.g. 0-1-2 to use cards 0, 1 and 2", value=gpus, interactive=True)
                        gpu_info9 = gr.Textbox(label="GPU information", value=gpu_info)
                    with gr.Column():
                        np7 = gr.Slider(minimum=0, maximum=ncpu, step=1, label="Number of CPU processes for pitch extraction", value=ncpu, interactive=True)
                        f0method8 = gr.Radio(label="Pitch extraction algorithm: pm is faster for singing input; dio is faster for high-quality speech on a weak CPU; harvest gives better quality but is slower", choices=["pm", "harvest", "dio"], value="harvest", interactive=True)
                    but2 = gr.Button("Extract features", variant="primary")
                    info2 = gr.Textbox(label="Output information", value="", max_lines=8)
                    but2.click(extract_f0_feature, [gpus6, np7, f0method8, if_f0_3, exp_dir1], [info2])
            with gr.Group():
                gr.Markdown(value="""
                    step3: fill in the training settings, then train the model and the index
                    """)
                with gr.Row():
                    save_epoch10 = gr.Slider(minimum=0, maximum=50, step=1, label="Save frequency (save_every_epoch)", value=5, interactive=True)
                    total_epoch11 = gr.Slider(minimum=0, maximum=100, step=1, label="Total training epochs (total_epoch)", value=10, interactive=True)
                    batch_size12 = gr.Slider(minimum=0, maximum=32, step=1, label="batch_size", value=4, interactive=True)
                    if_save_latest13 = gr.Radio(label="Save only the latest ckpt file to conserve disk space", choices=["是", "否"], value="否", interactive=True)
                    if_cache_gpu17 = gr.Radio(label="Cache the whole training set in VRAM. Small datasets (under 10 minutes) can be cached to speed up training; caching large ones will exhaust VRAM for little speed gain", choices=["是", "否"], value="否", interactive=True)
                with gr.Row():
                    pretrained_G14 = gr.Textbox(label="Path of the pretrained base model G", value="pretrained/f0G40k.pth", interactive=True)
                    pretrained_D15 = gr.Textbox(label="Path of the pretrained base model D", value="pretrained/f0D40k.pth", interactive=True)
                    sr2.change(change_sr2, [sr2, if_f0_3], [pretrained_G14, pretrained_D15])
                    if_f0_3.change(change_f0, [if_f0_3, sr2], [np7, f0method8, pretrained_G14, pretrained_D15])
                    gpus16 = gr.Textbox(label="GPU indices separated by '-', e.g. 0-1-2 to use cards 0, 1 and 2", value=gpus, interactive=True)
                    but3 = gr.Button("Train model", variant="primary")
                    but4 = gr.Button("Train feature index", variant="primary")
                    but5 = gr.Button("One-click training", variant="primary")
                    info3 = gr.Textbox(label="Output information", value="", max_lines=10)
                    but3.click(click_train, [exp_dir1, sr2, if_f0_3, spk_id5, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17], info3)
                    but4.click(train_index, [exp_dir1], info3)
                    but5.click(train1key, [exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17], info3)

        with gr.TabItem("Checkpoint processing"):
            with gr.Group():
                gr.Markdown(value="""Model fusion; can be used to test timbre blending""")
                with gr.Row():
                    ckpt_a = gr.Textbox(label="Path of model A", value="", interactive=True)
                    ckpt_b = gr.Textbox(label="Path of model B", value="", interactive=True)
                    alpha_a = gr.Slider(minimum=0, maximum=1, label="Weight of model A", value=0.5, interactive=True)
                with gr.Row():
                    sr_ = gr.Radio(label="Target sample rate", choices=["32k", "40k", "48k"], value="40k", interactive=True)
                    if_f0_ = gr.Radio(label="Whether the model uses pitch guidance", choices=["是", "否"], value="是", interactive=True)
                    info__ = gr.Textbox(label="Model info to embed", value="", max_lines=8, interactive=True)
                    name_to_save0 = gr.Textbox(label="Name to save the model under (no extension)", value="", max_lines=1, interactive=True)
                with gr.Row():
                    but6 = gr.Button("Fuse", variant="primary")
                    info4 = gr.Textbox(label="Output information", value="", max_lines=8)
                but6.click(merge, [ckpt_a, ckpt_b, alpha_a, sr_, if_f0_, info__, name_to_save0], info4)  # def merge(path1, path2, alpha1, sr, f0, info):
            with gr.Group():
                gr.Markdown(value="Edit model info (only small model files extracted into the weights folder are supported)")
                with gr.Row():
                    ckpt_path0 = gr.Textbox(label="Model path", value="", interactive=True)
                    info_ = gr.Textbox(label="Model info to modify", value="", max_lines=8, interactive=True)
                    name_to_save1 = gr.Textbox(label="File name to save as; empty defaults to the source file name", value="", max_lines=8, interactive=True)
                with gr.Row():
                    but7 = gr.Button("Modify", variant="primary")
                    info5 = gr.Textbox(label="Output information", value="", max_lines=8)
                but7.click(change_info, [ckpt_path0, info_, name_to_save1], info5)
            with gr.Group():
                gr.Markdown(value="View model info (only small model files extracted into the weights folder are supported)")
                with gr.Row():
                    ckpt_path1 = gr.Textbox(label="Model path", value="", interactive=True)
                    but8 = gr.Button("View", variant="primary")
                    info6 = gr.Textbox(label="Output information", value="", max_lines=8)
                but8.click(show_info, [ckpt_path1], info6)
            with gr.Group():
                gr.Markdown(value="Model extraction (enter the path of a large checkpoint under the logs folder). Useful when training was stopped halfway and no small model was auto-extracted to weights, or when you want to test an intermediate model")
                with gr.Row():
                    ckpt_path2 = gr.Textbox(label="Model path", value="E:\codes\py39\logs\mi-test_f0_48k\\G_23333.pth", interactive=True)
                    save_name = gr.Textbox(label="Save name", value="", interactive=True)
                    sr__ = gr.Radio(label="Target sample rate", choices=["32k", "40k", "48k"], value="40k", interactive=True)
                    if_f0__ = gr.Radio(label="Whether the model uses pitch guidance, 1 = yes, 0 = no", choices=["1", "0"], value="1", interactive=True)
                    info___ = gr.Textbox(label="Model info to embed", value="", max_lines=8, interactive=True)
                    but9 = gr.Button("Extract", variant="primary")
                    info7 = gr.Textbox(label="Output information", value="", max_lines=8)
                    ckpt_path2.change(change_info_, [ckpt_path2], [sr__, if_f0__])
                but9.click(extract_small_model, [ckpt_path2, save_name, sr__, if_f0__, info___], info7)

        with gr.TabItem("Recruiting: pitch-curve front-end editor"):
            gr.Markdown(value="""Join the development group and contact me: xxxxx""")
        with gr.TabItem("Click to view the discussion / feedback group number"):
            gr.Markdown(value="""xxxxx""")

    # app.launch(server_name="0.0.0.0", server_port=7860)
    # app.queue(concurrency_count=511, max_size=1022).launch(server_name="127.0.0.1", inbrowser=True, server_port=7861, quiet=True)
    app.queue(concurrency_count=511, max_size=1022).launch(server_name="0.0.0.0", inbrowser=True, server_port=7865, quiet=True)
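train_index() builds an IVF-Flat faiss index over the 256-dim HuBERT features and saves the raw matrix as total_fea.npy; at inference time the index is queried and the result blended with the live features according to index_rate (that blending lives in vc_infer_pipeline.py, which is not part of this diff). An illustrative sketch of querying such an index — the file names follow the defaults shown in the UI and the query is a stand-in:

    # Illustrative only; not part of this commit.
    import faiss, numpy as np

    index = faiss.read_index("logs/mi-test/added_IVF677_Flat_nprobe_7.index")
    big_npy = np.load("logs/mi-test/total_fea.npy")       # raw 256-dim features saved by train_index()
    query = np.random.rand(1, 256).astype("float32")      # stand-in for one HuBERT frame
    _, ids = index.search(query, 1)                       # nearest stored feature
    retrieved = big_npy[ids[0]]                           # what the pipeline would blend in via index_rate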
108
infer_uvr5.py
Normal file
@@ -0,0 +1,108 @@
import os, sys, torch, warnings, pdb
warnings.filterwarnings("ignore")
import librosa
import importlib
import numpy as np
import hashlib, math
from tqdm import tqdm
from uvr5_pack.lib_v5 import spec_utils
from uvr5_pack.utils import _get_name_params, inference
from uvr5_pack.lib_v5.model_param_init import ModelParameters
from scipy.io import wavfile


class _audio_pre_():
    def __init__(self, model_path, device, is_half):
        self.model_path = model_path
        self.device = device
        self.data = {
            # Processing Options
            'postprocess': False,
            'tta': False,
            # Constants
            'window_size': 512,
            'agg': 10,
            'high_end_process': 'mirroring',
        }
        nn_arch_sizes = [
            31191,  # default
            33966, 61968, 123821, 123812, 537238  # custom
        ]
        self.nn_architecture = list('{}KB'.format(s) for s in nn_arch_sizes)
        model_size = math.ceil(os.stat(model_path).st_size / 1024)
        nn_architecture = '{}KB'.format(min(nn_arch_sizes, key=lambda x: abs(x - model_size)))
        nets = importlib.import_module('uvr5_pack.lib_v5.nets' + f'_{nn_architecture}'.replace('_{}KB'.format(nn_arch_sizes[0]), ''), package=None)
        model_hash = hashlib.md5(open(model_path, 'rb').read()).hexdigest()
        param_name, model_params_d = _get_name_params(model_path, model_hash)

        mp = ModelParameters(model_params_d)
        model = nets.CascadedASPPNet(mp.param['bins'] * 2)
        cpk = torch.load(model_path, map_location='cpu')
        model.load_state_dict(cpk)
        model.eval()
        if is_half:
            model = model.half().to(device)
        else:
            model = model.to(device)

        self.mp = mp
        self.model = model

    def _path_audio_(self, music_file, ins_root=None, vocal_root=None):
        if ins_root is None and vocal_root is None:
            return "No save root."
        name = os.path.basename(music_file)
        if ins_root is not None:
            os.makedirs(ins_root, exist_ok=True)
        if vocal_root is not None:
            os.makedirs(vocal_root, exist_ok=True)
        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
        bands_n = len(self.mp.param['band'])
        # print(bands_n)
        for d in range(bands_n, 0, -1):
            bp = self.mp.param['band'][d]
            if d == bands_n:  # high-end band
                # In theory librosa may mis-decode some audio and ffmpeg would be
                # more robust here, but that was dropped as too much hassle.
                X_wave[d], _ = librosa.core.load(
                    music_file, bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
                if X_wave[d].ndim == 1:
                    X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
            else:  # lower bands
                X_wave[d] = librosa.core.resample(X_wave[d + 1], self.mp.param['band'][d + 1]['sr'], bp['sr'], res_type=bp['res_type'])
            # STFT of the wave source
            X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(X_wave[d], bp['hl'], bp['n_fft'], self.mp.param['mid_side'], self.mp.param['mid_side_b2'], self.mp.param['reverse'])
            # pdb.set_trace()
            if d == bands_n and self.data['high_end_process'] != 'none':
                input_high_end_h = (bp['n_fft'] // 2 - bp['crop_stop']) + (self.mp.param['pre_filter_stop'] - self.mp.param['pre_filter_start'])
                input_high_end = X_spec_s[d][:, bp['n_fft'] // 2 - input_high_end_h:bp['n_fft'] // 2, :]

        X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
        aggresive_set = float(self.data['agg'] / 100)
        aggressiveness = {'value': aggresive_set, 'split_bin': self.mp.param['band'][1]['crop_stop']}
        with torch.no_grad():
            pred, X_mag, X_phase = inference(X_spec_m, self.device, self.model, aggressiveness, self.data)
        # Postprocess
        if self.data['postprocess']:
            pred_inv = np.clip(X_mag - pred, 0, np.inf)
            pred = spec_utils.mask_silence(pred, pred_inv)
        y_spec_m = pred * X_phase
        v_spec_m = X_spec_m - y_spec_m

        if ins_root is not None:
            if self.data['high_end_process'].startswith('mirroring'):
                input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], y_spec_m, input_high_end, self.mp)
                wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp, input_high_end_h, input_high_end_)
            else:
                wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
            print('%s instruments done' % name)
            wavfile.write(os.path.join(ins_root, 'instrument_{}.wav'.format(name)), self.mp.param['sr'], (np.array(wav_instrument) * 32768).astype("int16"))  #
        if vocal_root is not None:
            if self.data['high_end_process'].startswith('mirroring'):
                input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], v_spec_m, input_high_end, self.mp)
                wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp, input_high_end_h, input_high_end_)
            else:
                wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
            print('%s vocals done' % name)
            wavfile.write(os.path.join(vocal_root, 'vocal_{}.wav'.format(name)), self.mp.param['sr'], (np.array(wav_vocals) * 32768).astype("int16"))


if __name__ == '__main__':
    device = 'cuda'
    is_half = True
    model_path = 'uvr5_weights/2_HP-UVR.pth'
    pre_fun = _audio_pre_(model_path=model_path, device=device, is_half=True)
    audio_path = '神女劈观.aac'
    save_path = 'opt'
    pre_fun._path_audio_(audio_path, save_path, save_path)
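The separation itself is complex spectral subtraction: the network predicts an instrumental magnitude `pred`; multiplying it by the mixture phase gives the instrumental spectrum y_spec_m, and the vocal spectrum is the residual v_spec_m = X_spec_m - y_spec_m. A toy numpy sketch of just that step (stand-in values, illustrative only):

    # Illustrative only; random stand-ins for the STFT quantities used above.
    import numpy as np

    X_mag = np.abs(np.random.randn(4, 4) + 1j * np.random.randn(4, 4))   # mixture magnitude
    X_phase = np.exp(1j * np.random.uniform(-np.pi, np.pi, (4, 4)))      # mixture phase
    pred = 0.7 * X_mag                                                   # pretend instrumental magnitude
    X_spec_m = X_mag * X_phase
    y_spec_m = pred * X_phase          # instrumental estimate
    v_spec_m = X_spec_m - y_spec_m     # vocal residual (here 0.3 * X_mag * X_phase)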
18
my_utils.py
Normal file
@@ -0,0 +1,18 @@
import ffmpeg, numpy as np

def load_audio(file, sr):
    try:
        # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
        out, _ = (
            ffmpeg.input(file, threads=0)
            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
            .run(cmd=["./ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
        )
    except ffmpeg.Error as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0

if __name__ == '__main__':
    print(load_audio(r"C:\CloudMusic\宮野幸子,森下唯 - 月夜に謳う君 -LUNA-.mp3", 16000).shape)
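Note that `cmd=["./ffmpeg", "-nostdin"]` pins the decoder to an ffmpeg binary sitting in the current working directory, not one on PATH. A minimal illustrative caller (file names are placeholders; soundfile comes from requirements.txt):

    # Decode any container to 16 kHz mono float32 and save as wav (illustrative).
    import soundfile as sf
    from my_utils import load_audio

    audio = load_audio("input.m4a", 16000)   # float32 samples in [-1, 1]
    sf.write("input_16k.wav", audio, 16000)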
196
requirements.txt
Normal file
@@ -0,0 +1,196 @@
absl-py==1.1.0
aiofiles==23.1.0
aiohttp==3.8.4
aiosignal==1.3.1
altair==4.2.0
antlr4-python3-runtime==4.8
anyio==3.6.1
appdirs==1.4.4
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
asttokens==2.0.5
async-timeout==4.0.2
attrs==21.4.0
audioread==3.0.0
Babel==2.10.3
backcall==0.2.0
beautifulsoup4==4.11.1
bitarray==2.7.3
bleach==5.0.0
brotlipy==0.7.0
cachetools==5.2.0
certifi==2021.5.30
cffi
chardet
charset-normalizer==3.0.1
click==8.1.3
cmake==3.25.0
colorama==0.4.5
cryptography
cycler==0.11.0
Cython==0.29.32
debugpy==1.6.0
decorator==5.1.1
defusedxml==0.7.1
entrypoints==0.4
executing==0.8.3
fairseq==0.12.2
faiss-gpu==1.7.2
fastapi==0.92.0
fastjsonschema==2.15.3
ffmpeg==1.4
ffmpy==0.3.0
filelock==3.9.0
fonttools==4.33.3
frozenlist==1.3.3
fsspec==2022.11.0
functorch==2.0.0
future==0.18.3
google==3.0.0
google-auth==2.8.0
google-auth-oauthlib==0.4.6
googleads==3.8.0
gradio==3.19.1
grpcio==1.46.3
h11==0.13.0
httpcore==0.16.3
httplib2==0.21.0
httpx==0.23.1
Hydra==2.5
hydra-core==1.0.7
idna
importlib-metadata==4.11.4
importlib-resources==5.8.0
ipykernel==6.15.0
ipython==8.4.0
ipython-genutils==0.2.0
ipywidgets==7.7.0
jedi==0.18.1
Jinja2==3.1.2
joblib==1.1.0
json5==0.9.8
jsonschema==4.6.0
jupyter-client==7.3.4
jupyter-core==4.10.0
jupyter-server==1.17.1
jupyterlab==3.4.3
jupyterlab-language-pack-zh-CN==3.4.post1
jupyterlab-pygments==0.2.2
jupyterlab-server==2.14.0
jupyterlab-widgets==1.1.0
kiwisolver==1.4.3
lazy-loader==0.1
librosa==0.9.2
linkify-it-py==2.0.0
lit==15.0.7
llvmlite==0.39.0
lxml==4.8.0
Markdown==3.3.7
markdown-it-py==2.2.0
MarkupSafe==2.1.1
matplotlib==3.5.2
matplotlib-inline==0.1.3
mdit-py-plugins==0.3.3
mdurl==0.1.1
mistune==0.8.4
mpmath==1.2.1
msgpack==1.0.4
multidict==6.0.2
nbclassic==0.3.7
nbclient==0.6.4
nbconvert==6.5.0
nbformat==5.4.0
nest-asyncio==1.5.5
networkx==2.8.8
notebook==6.4.12
notebook-shim==0.1.0
numba==0.56.4
numpy==1.23.5
oauth2client==4.1.3
oauthlib==3.2.0
omegaconf==2.0.6
orjson==3.8.6
packaging==21.3
pandas==1.5.2
pandocfilters==1.5.0
parso==0.8.3
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.1.1
pooch==1.6.0
portalocker==2.5.1
praat-parselmouth==0.4.2
prometheus-client==0.14.1
prompt-toolkit==3.0.29
protobuf==3.19.4
psutil==5.9.1
ptyprocess==0.7.0
pure-eval==0.2.2
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycosat==0.6.3
pycparser
pycryptodome==3.16.0
pydantic==1.10.5
pydub==0.25.1
Pygments==2.12.0
pyOpenSSL
pyparsing==3.0.9
pyrsistent==0.18.1
PySocks
python-dateutil==2.8.2
python-multipart==0.0.5
pytz==2022.6
pyworld==0.3.2
PyYAML==6.0
pyzmq==23.2.0
regex==2022.10.31
requests
requests-oauthlib==1.3.1
resampy==0.4.2
rfc3986==1.5.0
rsa==4.8
ruamel-yaml-conda
sacrebleu==2.3.1
scikit-learn==1.1.3
scipy==1.9.3
Send2Trash==1.8.0
six
sniffio==1.2.0
soundfile==0.12.1
soupsieve==2.3.2.post1
soxr==0.3.3
stack-data==0.3.0
starlette==0.25.0
stopit==1.1.1
suds-jurko==0.6
supervisor==4.2.4
sympy==1.11.1
tabulate==0.8.10
tensorboard==2.9.1
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.1
terminado==0.15.0
threadpoolctl==3.1.0
tinycss2==1.1.1
toolz==0.12.0
torch==2.0.0+cu117
torchaudio==2.0.1+cu117
torchgen==0.0.1
torchvision==0.15.1+cu117
tornado==6.1
tqdm
traitlets==5.3.0
triton==2.0.0
typing-extensions==4.2.0
uc-micro-py==1.0.1
urllib3==1.26.13
uvicorn==0.21.1
wcwidth==0.2.5
webencodings==0.5.1
websocket-client==1.3.3
websockets==10.3
Werkzeug==2.1.2
widgetsnbextension==3.6.0
yarl==1.8.1
zipp==3.8.0
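The `+cu117` torch/torchaudio/torchvision pins are served from PyTorch's own wheel index rather than PyPI, so installation presumably needs an extra index URL — a minimal sketch:

    pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu117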
186
slicer2.py
Normal file
@@ -0,0 +1,186 @@
|
||||
import numpy as np
|
||||
|
||||
|
||||
# This function is obtained from librosa.
|
||||
def get_rms(
|
||||
y,
|
||||
*,
|
||||
frame_length=2048,
|
||||
hop_length=512,
|
||||
pad_mode="constant",
|
||||
):
|
||||
padding = (int(frame_length // 2), int(frame_length // 2))
|
||||
y = np.pad(y, padding, mode=pad_mode)
|
||||
|
||||
axis = -1
|
||||
# put our new within-frame axis at the end for now
|
||||
out_strides = y.strides + tuple([y.strides[axis]])
|
||||
# Reduce the shape on the framing axis
|
||||
x_shape_trimmed = list(y.shape)
|
||||
x_shape_trimmed[axis] -= frame_length - 1
|
||||
out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
|
||||
xw = np.lib.stride_tricks.as_strided(
|
||||
y, shape=out_shape, strides=out_strides
|
||||
)
|
||||
if axis < 0:
|
||||
target_axis = axis - 1
|
||||
else:
|
||||
target_axis = axis + 1
|
||||
xw = np.moveaxis(xw, -1, target_axis)
|
||||
# Downsample along the target axis
|
||||
slices = [slice(None)] * xw.ndim
|
||||
slices[axis] = slice(0, None, hop_length)
|
||||
x = xw[tuple(slices)]
|
||||
|
||||
# Calculate power
|
||||
power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
|
||||
|
||||
return np.sqrt(power)
|
||||
|
||||
|
||||
class Slicer:
|
||||
def __init__(self,
|
||||
sr: int,
|
||||
threshold: float = -40.,
|
||||
min_length: int = 5000,
|
||||
min_interval: int = 300,
|
||||
hop_size: int = 20,
|
||||
max_sil_kept: int = 5000):
|
||||
if not min_length >= min_interval >= hop_size:
|
||||
raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
|
||||
if not max_sil_kept >= hop_size:
|
||||
raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
|
||||
min_interval = sr * min_interval / 1000
|
||||
self.threshold = 10 ** (threshold / 20.)
|
||||
self.hop_size = round(sr * hop_size / 1000)
|
||||
self.win_size = min(round(min_interval), 4 * self.hop_size)
|
||||
self.min_length = round(sr * min_length / 1000 / self.hop_size)
|
||||
self.min_interval = round(min_interval / self.hop_size)
|
||||
self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
|
||||
|
||||
def _apply_slice(self, waveform, begin, end):
|
||||
if len(waveform.shape) > 1:
|
||||
return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
|
||||
else:
|
||||
return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]
|
||||
|
||||
# @timeit
|
||||
    def slice(self, waveform):
        if len(waveform.shape) > 1:
            samples = waveform.mean(axis=0)
        else:
            samples = waveform
        if samples.shape[0] <= self.min_length:
            return [waveform]
        rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
        sil_tags = []
        silence_start = None
        clip_start = 0
        for i, rms in enumerate(rms_list):
            # Keep looping while frame is silent.
            if rms < self.threshold:
                # Record start of silent frames.
                if silence_start is None:
                    silence_start = i
                continue
            # Keep looping while frame is not silent and silence start has not been recorded.
            if silence_start is None:
                continue
            # Clear recorded silence start if interval is not enough or clip is too short.
            is_leading_silence = silence_start == 0 and i > self.max_sil_kept
            need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
            if not is_leading_silence and not need_slice_middle:
                silence_start = None
                continue
            # Need slicing. Record the range of silent frames to be removed.
            if i - silence_start <= self.max_sil_kept:
                pos = rms_list[silence_start: i + 1].argmin() + silence_start
                if silence_start == 0:
                    sil_tags.append((0, pos))
                else:
                    sil_tags.append((pos, pos))
                clip_start = pos
            elif i - silence_start <= self.max_sil_kept * 2:
                pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
                pos += i - self.max_sil_kept
                pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
                pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                    clip_start = pos_r
                else:
                    sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
                    clip_start = max(pos_r, pos)
            else:
                pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
                pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                else:
                    sil_tags.append((pos_l, pos_r))
                clip_start = pos_r
            silence_start = None
        # Deal with trailing silence.
        total_frames = rms_list.shape[0]
        if silence_start is not None and total_frames - silence_start >= self.min_interval:
            silence_end = min(total_frames, silence_start + self.max_sil_kept)
            pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
            sil_tags.append((pos, total_frames + 1))
        # Apply and return slices.
        if len(sil_tags) == 0:
            return [waveform]
        else:
            chunks = []
            if sil_tags[0][0] > 0:
                chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
            for i in range(len(sil_tags) - 1):
                chunks.append(self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]))
            if sil_tags[-1][1] < total_frames:
                chunks.append(self._apply_slice(waveform, sil_tags[-1][1], total_frames))
            return chunks
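

# Summary of how slice() picks cut points: a silence span no longer than
# max_sil_kept frames is merely split at its minimum-RMS frame (no audio removed);
# longer spans are trimmed so that at most max_sil_kept frames of silence remain on
# each side of the cut, and the low-RMS middle of the span is discarded.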


def main():
    import os.path
    from argparse import ArgumentParser

    import librosa
    import soundfile

    parser = ArgumentParser()
    parser.add_argument('audio', type=str, help='The audio to be sliced')
    parser.add_argument('--out', type=str, help='Output directory of the sliced audio clips')
    parser.add_argument('--db_thresh', type=float, required=False, default=-40,
                        help='The dB threshold for silence detection')
    parser.add_argument('--min_length', type=int, required=False, default=5000,
                        help='The minimum milliseconds required for each sliced audio clip')
    parser.add_argument('--min_interval', type=int, required=False, default=300,
                        help='The minimum milliseconds for a silence part to be sliced')
    parser.add_argument('--hop_size', type=int, required=False, default=10,
                        help='Frame length in milliseconds')
    parser.add_argument('--max_sil_kept', type=int, required=False, default=500,
                        help='The maximum silence length kept around the sliced clip, presented in milliseconds')
    args = parser.parse_args()
    out = args.out
    if out is None:
        out = os.path.dirname(os.path.abspath(args.audio))
    audio, sr = librosa.load(args.audio, sr=None, mono=False)
    slicer = Slicer(
        sr=sr,
        threshold=args.db_thresh,
        min_length=args.min_length,
        min_interval=args.min_interval,
        hop_size=args.hop_size,
        max_sil_kept=args.max_sil_kept
    )
    chunks = slicer.slice(audio)
    if not os.path.exists(out):
        os.makedirs(out)
    for i, chunk in enumerate(chunks):
        if len(chunk.shape) > 1:
            # soundfile expects (frames, channels); librosa loads (channels, frames).
            chunk = chunk.T
        soundfile.write(os.path.join(out, '%s_%d.wav' % (os.path.basename(args.audio).rsplit('.', maxsplit=1)[0], i)), chunk, sr)


if __name__ == '__main__':
    main()
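

# Illustrative CLI usage (script and file names are placeholders, not from the
# original source):
#   python slicer.py song.wav --out sliced --db_thresh -40 --min_length 5000 \
#       --min_interval 300 --hop_size 10 --max_sil_kept 500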
509
train_nsf_sim_cache_sid_load_pretrain.py
Normal file
@ -0,0 +1,509 @@
import sys, os
now_dir = os.getcwd()
sys.path.append(os.path.join(now_dir, "train"))
import utils
hps = utils.get_hparams()
os.environ["CUDA_VISIBLE_DEVICES"] = hps.gpus.replace("-", ",")
n_gpus = len(hps.gpus.split("-"))
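# hps.gpus is a "-"-separated device list (e.g. "0-1-2" selects three GPUs); it is
# rewritten into the comma-separated form CUDA_VISIBLE_DEVICES expects, and its
# length sets the number of processes spawned below.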
from random import shuffle
import traceback, json, argparse, itertools, math, torch, pdb
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.cuda.amp import autocast, GradScaler
from infer_pack import commons

from time import time as ttime
from data_utils import TextAudioLoaderMultiNSFsid, TextAudioLoader, TextAudioCollateMultiNSFsid, TextAudioCollate, DistributedBucketSampler
from infer_pack.models import (
    SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono,
    MultiPeriodDiscriminator,
)
from losses import generator_loss, discriminator_loss, feature_loss, kl_loss
from mel_processing import mel_spectrogram_torch, spec_to_mel_torch


global_step = 0


def main():
    """Assume Single Node Multi GPUs Training Only"""
    assert torch.cuda.is_available(), "CPU training is not allowed."

    # n_gpus = torch.cuda.device_count()
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "5555"

    mp.spawn(
        run,
        nprocs=n_gpus,
        args=(
            n_gpus,
            hps,
        ),
    )

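# mp.spawn launches one training process per GPU; each process calls
# run(rank, n_gpus, hps), with its rank supplied automatically as the first argument.
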
def run(rank, n_gpus, hps):
    global global_step
    if rank == 0:
        logger = utils.get_logger(hps.model_dir)
        logger.info(hps)
        utils.check_git_hash(hps.model_dir)
        writer = SummaryWriter(log_dir=hps.model_dir)
        writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))

    dist.init_process_group(
        backend="gloo", init_method="env://", world_size=n_gpus, rank=rank
    )
    torch.manual_seed(hps.train.seed)
    torch.cuda.set_device(rank)

    if hps.if_f0 == 1:
        train_dataset = TextAudioLoaderMultiNSFsid(hps.data.training_files, hps.data)
    else:
        train_dataset = TextAudioLoader(hps.data.training_files, hps.data)
    train_sampler = DistributedBucketSampler(
        train_dataset,
        hps.train.batch_size,
        # [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200, 1400],  # 16s
        [100, 200, 300, 400, 500, 600, 700, 800, 900],  # 16s
        num_replicas=n_gpus,
        rank=rank,
        shuffle=True,
    )
    # It is possible that the dataloader's workers run out of shared memory. Try raising your shared memory limit.
    # num_workers=8 -> num_workers=4
    if hps.if_f0 == 1:
        collate_fn = TextAudioCollateMultiNSFsid()
    else:
        collate_fn = TextAudioCollate()
    train_loader = DataLoader(
        train_dataset,
        num_workers=4,
        shuffle=False,
        pin_memory=True,
        collate_fn=collate_fn,
        batch_sampler=train_sampler,
        persistent_workers=True,
        prefetch_factor=8,
    )
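    # The bucket boundaries passed to DistributedBucketSampler group utterances of
    # similar length into the same batch, which cuts padding waste; shuffling happens
    # within buckets in the sampler, so the DataLoader itself uses shuffle=False.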
    if hps.if_f0 == 1:
        net_g = SynthesizerTrnMs256NSFsid(
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            **hps.model,
            is_half=hps.train.fp16_run,
            sr=hps.sample_rate,
        ).cuda(rank)
    else:
        net_g = SynthesizerTrnMs256NSFsid_nono(
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            **hps.model,
            is_half=hps.train.fp16_run,
        ).cuda(rank)
    net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)
    optim_g = torch.optim.AdamW(
        net_g.parameters(),
        hps.train.learning_rate,
        betas=hps.train.betas,
        eps=hps.train.eps,
    )
    optim_d = torch.optim.AdamW(
        net_d.parameters(),
        hps.train.learning_rate,
        betas=hps.train.betas,
        eps=hps.train.eps,
    )
    # net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
    # net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
    net_g = DDP(net_g, device_ids=[rank])
    net_d = DDP(net_d, device_ids=[rank])

    try:  # Resume automatically if a checkpoint can be loaded.
        _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d)  # Loading D rarely causes problems.
        if rank == 0:
            logger.info("loaded D")
        # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g, load_opt=0)
        _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g)
        global_step = (epoch_str - 1) * len(train_loader)
        # epoch_str = 1
        # global_step = 0
    except:  # If loading fails on the first run, fall back to the pretrained weights.
        traceback.print_exc()
        epoch_str = 1
        global_step = 0
        if rank == 0:
            logger.info("loaded pretrained %s %s" % (hps.pretrainG, hps.pretrainD))
        print(net_g.module.load_state_dict(torch.load(hps.pretrainG, map_location="cpu")["model"]))  # Testing: do not load the optimizer state.
        print(net_d.module.load_state_dict(torch.load(hps.pretrainD, map_location="cpu")["model"]))

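    # last_epoch=epoch_str - 2 keeps the exponential decay aligned with the epoch
    # being resumed from: for a fresh run epoch_str is 1, giving last_epoch=-1,
    # which PyTorch treats as "no epochs completed yet".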
    scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
        optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
    )
    scheduler_d = torch.optim.lr_scheduler.ExponentialLR(
        optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
    )

    scaler = GradScaler(enabled=hps.train.fp16_run)

    cache = []
    for epoch in range(epoch_str, hps.train.epochs + 1):
        if rank == 0:
            train_and_evaluate(
                rank,
                epoch,
                hps,
                [net_g, net_d],
                [optim_g, optim_d],
                [scheduler_g, scheduler_d],
                scaler,
                [train_loader, None],
                logger,
                [writer, writer_eval],
                cache,
            )
        else:
            train_and_evaluate(
                rank,
                epoch,
                hps,
                [net_g, net_d],
                [optim_g, optim_d],
                [scheduler_g, scheduler_d],
                scaler,
                [train_loader, None],
                None,
                None,
                cache,
            )
        scheduler_g.step()
        scheduler_d.step()


def train_and_evaluate(
    rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers, cache
):
    net_g, net_d = nets
    optim_g, optim_d = optims
    train_loader, eval_loader = loaders
    if writers is not None:
        writer, writer_eval = writers

    train_loader.batch_sampler.set_epoch(epoch)
    global global_step

    net_g.train()
    net_d.train()
    if not cache or not hps.if_cache_data_in_gpu:  # During the first epoch, fill the cache with the entire training set.
        # print("caching")
        for batch_idx, info in enumerate(train_loader):
            if hps.if_f0 == 1:
                phone, phone_lengths, pitch, pitchf, spec, spec_lengths, wave, wave_lengths, sid = info
            else:
                phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info
            phone, phone_lengths = phone.cuda(rank, non_blocking=True), phone_lengths.cuda(rank, non_blocking=True)
            if hps.if_f0 == 1:
                pitch, pitchf = pitch.cuda(rank, non_blocking=True), pitchf.cuda(rank, non_blocking=True)
            sid = sid.cuda(rank, non_blocking=True)
            spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True)
            wave, wave_lengths = wave.cuda(rank, non_blocking=True), wave_lengths.cuda(rank, non_blocking=True)
            if hps.if_cache_data_in_gpu:
                if hps.if_f0 == 1:
                    cache.append((batch_idx, (phone, phone_lengths, pitch, pitchf, spec, spec_lengths, wave, wave_lengths, sid)))
                else:
                    cache.append((batch_idx, (phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid)))
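            # Caching batches on the GPU trades VRAM for the per-epoch cost of
            # host-to-device copies; later epochs then iterate over the shuffled
            # cache instead of the DataLoader (see the else branch below).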
            with autocast(enabled=hps.train.fp16_run):
                if hps.if_f0 == 1:
                    y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid)
                else:
                    y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, spec, spec_lengths, sid)
                mel = spec_to_mel_torch(spec, hps.data.filter_length, hps.data.n_mel_channels, hps.data.sampling_rate, hps.data.mel_fmin, hps.data.mel_fmax)
                y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length)
                with autocast(enabled=False):
                    y_hat_mel = mel_spectrogram_torch(
                        y_hat.float().squeeze(1),
                        hps.data.filter_length,
                        hps.data.n_mel_channels,
                        hps.data.sampling_rate,
                        hps.data.hop_length,
                        hps.data.win_length,
                        hps.data.mel_fmin,
                        hps.data.mel_fmax,
                    )
                if hps.train.fp16_run:
                    y_hat_mel = y_hat_mel.half()
                wave = commons.slice_segments(
                    wave, ids_slice * hps.data.hop_length, hps.train.segment_size
                )  # slice

                # Discriminator
                y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
                with autocast(enabled=False):
                    loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(
                        y_d_hat_r, y_d_hat_g
                    )
            optim_d.zero_grad()
            scaler.scale(loss_disc).backward()
            scaler.unscale_(optim_d)
            grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
            scaler.step(optim_d)

            with autocast(enabled=hps.train.fp16_run):
                # Generator
                y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
                with autocast(enabled=False):
                    loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
                    loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
                    loss_fm = feature_loss(fmap_r, fmap_g)
                    loss_gen, losses_gen = generator_loss(y_d_hat_g)
                    loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
            optim_g.zero_grad()
            scaler.scale(loss_gen_all).backward()
            scaler.unscale_(optim_g)
            grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
            scaler.step(optim_g)
            scaler.update()
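            # Both optimizers step through the shared GradScaler; scaler.update()
            # runs once per iteration, after the second step, so the loss scale
            # adapts to both backward passes.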
            if rank == 0:
                if global_step % hps.train.log_interval == 0:
                    lr = optim_g.param_groups[0]["lr"]
                    logger.info(
                        "Train Epoch: {} [{:.0f}%]".format(
                            epoch, 100.0 * batch_idx / len(train_loader)
                        )
                    )
                    # Clamp outliers so the TensorBoard curves stay readable.
                    if loss_mel > 50:
                        loss_mel = 50
                    if loss_kl > 5:
                        loss_kl = 5

                    logger.info([global_step, lr])
                    logger.info(
                        f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f}, loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}"
                    )
                    scalar_dict = {
                        "loss/g/total": loss_gen_all,
                        "loss/d/total": loss_disc,
                        "learning_rate": lr,
                        "grad_norm_d": grad_norm_d,
                        "grad_norm_g": grad_norm_g,
                    }
                    scalar_dict.update(
                        {"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl}
                    )
                    scalar_dict.update(
                        {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
                    )
                    scalar_dict.update(
                        {"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}
                    )
                    scalar_dict.update(
                        {"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}
                    )
                    image_dict = {
                        "slice/mel_org": utils.plot_spectrogram_to_numpy(
                            y_mel[0].data.cpu().numpy()
                        ),
                        "slice/mel_gen": utils.plot_spectrogram_to_numpy(
                            y_hat_mel[0].data.cpu().numpy()
                        ),
                        "all/mel": utils.plot_spectrogram_to_numpy(
                            mel[0].data.cpu().numpy()
                        ),
                    }
                    utils.summarize(
                        writer=writer,
                        global_step=global_step,
                        images=image_dict,
                        scalars=scalar_dict,
                    )
            global_step += 1
        # if global_step % hps.train.eval_interval == 0:
        if epoch % hps.save_every_epoch == 0:
            if hps.if_latest == 0:
                utils.save_checkpoint(
                    net_g,
                    optim_g,
                    hps.train.learning_rate,
                    epoch,
                    os.path.join(hps.model_dir, "G_{}.pth".format(global_step)),
                )
                utils.save_checkpoint(
                    net_d,
                    optim_d,
                    hps.train.learning_rate,
                    epoch,
                    os.path.join(hps.model_dir, "D_{}.pth".format(global_step)),
                )
            else:
                utils.save_checkpoint(
                    net_g,
                    optim_g,
                    hps.train.learning_rate,
                    epoch,
                    os.path.join(hps.model_dir, "G_{}.pth".format(2333333)),
                )
                utils.save_checkpoint(
                    net_d,
                    optim_d,
                    hps.train.learning_rate,
                    epoch,
                    os.path.join(hps.model_dir, "D_{}.pth".format(2333333)),
                )

    else:  # Later epochs iterate directly over the shuffled cache.
        shuffle(cache)
        # print("using cache")
        for batch_idx, info in cache:
            if hps.if_f0 == 1:
                phone, phone_lengths, pitch, pitchf, spec, spec_lengths, wave, wave_lengths, sid = info
            else:
                phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info
            with autocast(enabled=hps.train.fp16_run):
                if hps.if_f0 == 1:
                    y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid)
                else:
                    y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, spec, spec_lengths, sid)
                mel = spec_to_mel_torch(
                    spec,
                    hps.data.filter_length,
                    hps.data.n_mel_channels,
                    hps.data.sampling_rate,
                    hps.data.mel_fmin,
                    hps.data.mel_fmax,
                )
                y_mel = commons.slice_segments(
                    mel, ids_slice, hps.train.segment_size // hps.data.hop_length
                )
                with autocast(enabled=False):
                    y_hat_mel = mel_spectrogram_torch(
                        y_hat.float().squeeze(1),
                        hps.data.filter_length,
                        hps.data.n_mel_channels,
                        hps.data.sampling_rate,
                        hps.data.hop_length,
                        hps.data.win_length,
                        hps.data.mel_fmin,
                        hps.data.mel_fmax,
                    )
                if hps.train.fp16_run:
                    y_hat_mel = y_hat_mel.half()
                wave = commons.slice_segments(
                    wave, ids_slice * hps.data.hop_length, hps.train.segment_size
                )  # slice

                # Discriminator
                y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
                with autocast(enabled=False):
                    loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(
                        y_d_hat_r, y_d_hat_g
                    )
            optim_d.zero_grad()
            scaler.scale(loss_disc).backward()
            scaler.unscale_(optim_d)
            grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
            scaler.step(optim_d)

            with autocast(enabled=hps.train.fp16_run):
                # Generator
                y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
                with autocast(enabled=False):
                    loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
                    loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
                    loss_fm = feature_loss(fmap_r, fmap_g)
                    loss_gen, losses_gen = generator_loss(y_d_hat_g)
                    loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
            optim_g.zero_grad()
            scaler.scale(loss_gen_all).backward()
            scaler.unscale_(optim_g)
            grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
            scaler.step(optim_g)
            scaler.update()
            if rank == 0:
                if global_step % hps.train.log_interval == 0:
                    lr = optim_g.param_groups[0]["lr"]
                    logger.info(
                        "Train Epoch: {} [{:.0f}%]".format(
                            epoch, 100.0 * batch_idx / len(train_loader)
                        )
                    )
                    # Clamp outliers so the TensorBoard curves stay readable.
                    if loss_mel > 50:
                        loss_mel = 50
                    if loss_kl > 5:
                        loss_kl = 5

                    logger.info([global_step, lr])
                    logger.info(
                        f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f}, loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}"
                    )
                    scalar_dict = {
                        "loss/g/total": loss_gen_all,
                        "loss/d/total": loss_disc,
                        "learning_rate": lr,
                        "grad_norm_d": grad_norm_d,
                        "grad_norm_g": grad_norm_g,
                    }
                    scalar_dict.update(
                        {"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl}
                    )
                    scalar_dict.update(
                        {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
                    )
                    scalar_dict.update(
                        {"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}
                    )
                    scalar_dict.update(
                        {"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}
                    )
                    image_dict = {
                        "slice/mel_org": utils.plot_spectrogram_to_numpy(
                            y_mel[0].data.cpu().numpy()
                        ),
                        "slice/mel_gen": utils.plot_spectrogram_to_numpy(
                            y_hat_mel[0].data.cpu().numpy()
                        ),
                        "all/mel": utils.plot_spectrogram_to_numpy(
                            mel[0].data.cpu().numpy()
                        ),
                    }
                    utils.summarize(
                        writer=writer,
                        global_step=global_step,
                        images=image_dict,
                        scalars=scalar_dict,
                    )
            global_step += 1
        # if global_step % hps.train.eval_interval == 0:
        if epoch % hps.save_every_epoch == 0:
            if hps.if_latest == 0:
                utils.save_checkpoint(
                    net_g,
                    optim_g,
                    hps.train.learning_rate,
                    epoch,
                    os.path.join(hps.model_dir, "G_{}.pth".format(global_step)),
                )
                utils.save_checkpoint(
                    net_d,
                    optim_d,
                    hps.train.learning_rate,
                    epoch,
                    os.path.join(hps.model_dir, "D_{}.pth".format(global_step)),
                )
            else:
                utils.save_checkpoint(
                    net_g,
                    optim_g,
                    hps.train.learning_rate,
                    epoch,
                    os.path.join(hps.model_dir, "G_{}.pth".format(2333333)),
                )
                utils.save_checkpoint(
                    net_d,
                    optim_d,
                    hps.train.learning_rate,
                    epoch,
                    os.path.join(hps.model_dir, "D_{}.pth".format(2333333)),
                )

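        # With if_latest != 0, the fixed filename (2333333) is overwritten on every
        # save, so only the most recent G/D checkpoint pair is kept on disk.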
    if rank == 0:
        logger.info("====> Epoch: {}".format(epoch))
    if epoch >= hps.total_epoch:
        if rank == 0:
            logger.info("Training is done. The program is closed.")
            from process_ckpt import savee  # def savee(ckpt, sr, if_f0, name, epoch)
            if hasattr(net_g, 'module'):
                ckpt = net_g.module.state_dict()
            else:
                ckpt = net_g.state_dict()
            print("saving final ckpt:", savee(ckpt, hps.sample_rate, hps.if_f0, hps.name, epoch))
        os._exit(2333333)
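    # savee (imported from process_ckpt) presumably writes a compact, inference-only
    # weight file from the generator state_dict; os._exit tears down all spawned DDP
    # workers immediately instead of returning through mp.spawn.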


if __name__ == "__main__":
    main()