
Add files via upload

liujing04 2023-03-31 17:54:38 +08:00 committed by GitHub
parent a7857f12ee
commit 2207b9647e
9 changed files with 1889 additions and 0 deletions

38
config.py Normal file

@@ -0,0 +1,38 @@
############ Offline VC parameters
inp_root=r"白鹭霜华长条"# every audio file under this input directory is converted; do not put non-audio files here
opt_root=r"opt"# output directory
f0_up_key=0# pitch shift in integer semitones (male-to-female: 12, female-to-male: -12)
person=r"weights\洛天依v3.pt"# currently only 洛天依v3 is available
############ Hardware parameters
device = "cuda:0"# "cuda:x" or "cpu"; x is the GPU index; only NVIDIA GPUs are accelerated
is_half=True# just use True on 9/10/20/30/40-series GPUs; it does not affect quality, and 20-series or newer get a speedup
n_cpu=0# 0 (default) uses all threads; set a number to limit CPU usage
############ Do not edit anything below
import torch
if(torch.cuda.is_available()==False):
print("没有发现支持的N卡使用CPU进行推理")
device="cpu"
is_half=False
if(device!="cpu"):
gpu_name=torch.cuda.get_device_name(int(device.split(":")[-1]))
if("16"in gpu_name or "MX"in gpu_name):
print("16系显卡/MX系显卡强制单精度")
is_half=False
from multiprocessing import cpu_count
if(n_cpu==0):n_cpu=cpu_count()
if(is_half==True):
# settings for 6 GB VRAM
x_pad = 3
x_query = 10
x_center = 60
x_max = 65
else:
# settings for 5 GB VRAM
x_pad = 1
# x_query = 6
# x_center = 30
# x_max = 32
# settings for 6 GB VRAM
x_query = 6
x_center = 38
x_max = 41

120
extract_f0_print.py Normal file

@@ -0,0 +1,120 @@
import os,traceback,sys,parselmouth
import librosa
import pyworld
from scipy.io import wavfile
import numpy as np,logging
logging.getLogger('numba').setLevel(logging.WARNING)
from multiprocessing import Process
exp_dir = sys.argv[1]
f = open("%s/extract_f0_feature.log"%exp_dir, "a+")
def printt(strr):
print(strr)
f.write("%s\n" % strr)
f.flush()
n_p = int(sys.argv[2])
f0method = sys.argv[3]
class FeatureInput(object):
def __init__(self, samplerate=16000, hop_size=160):
self.fs = samplerate
self.hop = hop_size
self.f0_bin = 256
self.f0_max = 1100.0
self.f0_min = 50.0
self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
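# 1127 * ln(1 + f/700) is the Hz-to-mel conversion; these mel bounds are used by coarse_f0 to quantize f0 into f0_bin buckets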
def compute_f0(self, path,f0_method):
x, sr = librosa.load(path, self.fs)
p_len=x.shape[0]//self.hop
assert sr == self.fs
if(f0_method=="pm"):
time_step = 160 / 16000 * 1000
f0_min = 50
f0_max = 1100
f0 = parselmouth.Sound(x, sr).to_pitch_ac(
time_step=time_step / 1000, voicing_threshold=0.6,
pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
pad_size=(p_len - len(f0) + 1) // 2
if(pad_size>0 or p_len - len(f0) - pad_size>0):
f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
elif(f0_method=="harvest"):
f0, t = pyworld.harvest(
x.astype(np.double),
fs=sr,
f0_ceil=1100,
frame_period=1000 * self.hop / sr,
)
f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
elif(f0_method=="dio"):
f0, t = pyworld.dio(
x.astype(np.double),
fs=sr,
f0_ceil=1100,
frame_period=1000 * self.hop / sr,
)
f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
return f0
def coarse_f0(self, f0):
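# convert f0 (Hz) to the mel scale, then linearly quantize voiced frames into integer bins 1..f0_bin-1 (unvoiced frames stay at bin 1)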
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
self.f0_bin - 2
) / (self.f0_mel_max - self.f0_mel_min) + 1
# use 0 or 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
f0_coarse = np.rint(f0_mel).astype(int)  # builtin int; np.int is deprecated in recent NumPy
assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
f0_coarse.max(),
f0_coarse.min(),
)
return f0_coarse
def go(self,paths,f0_method):
if (len(paths) == 0): printt("no-f0-todo")
else:
printt("todo-f0-%s"%len(paths))
n=max(len(paths)//5,1)# each process prints at most 5 progress messages
for idx,(inp_path,opt_path1,opt_path2) in enumerate(paths):
try:
if(idx%n==0):printt("f0ing,now-%s,all-%s,-%s"%(idx,len(paths),inp_path))
if(os.path.exists(opt_path1+".npy")==True and os.path.exists(opt_path2+".npy")==True):continue
featur_pit = self.compute_f0(inp_path,f0_method)
np.save(opt_path2,featur_pit,allow_pickle=False,)#nsf
coarse_pit = self.coarse_f0(featur_pit)
np.save(opt_path1,coarse_pit,allow_pickle=False,)#ori
except:
printt("f0fail-%s-%s-%s" % (idx, inp_path,traceback.format_exc()))
if __name__=='__main__':
# exp_dir=r"E:\codes\py39\dataset\mi-test"
# n_p=16
# f = open("%s/log_extract_f0.log"%exp_dir, "w")
printt(sys.argv)
featureInput = FeatureInput()
paths=[]
inp_root= "%s/1_16k_wavs"%(exp_dir)
opt_root1="%s/2a_f0"%(exp_dir)
opt_root2="%s/2b-f0nsf"%(exp_dir)
os.makedirs(opt_root1,exist_ok=True)
os.makedirs(opt_root2,exist_ok=True)
for name in sorted(list(os.listdir(inp_root))):
inp_path="%s/%s"%(inp_root,name)
if ("spec" in inp_path): continue
opt_path1="%s/%s"%(opt_root1,name)
opt_path2="%s/%s"%(opt_root2,name)
paths.append([inp_path,opt_path1,opt_path2])
ps=[]
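# round-robin split of the work: process i handles paths[i::n_p]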
for i in range(n_p):
p=Process(target=featureInput.go,args=(paths[i::n_p],f0method,))
p.start()
ps.append(p)
for p in ps:
p.join()

84
extract_feature_print.py Normal file

@@ -0,0 +1,84 @@
import os,sys,traceback
n_part=int(sys.argv[1])
i_part=int(sys.argv[2])
i_gpu=sys.argv[3]
exp_dir=sys.argv[4]
os.environ["CUDA_VISIBLE_DEVICES"]=str(i_gpu)
import torch
import torch.nn.functional as F
import soundfile as sf
import numpy as np
import joblib
from fairseq import checkpoint_utils
import pdb
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
f = open("%s/extract_f0_feature.log"%exp_dir, "a+")
def printt(strr):
print(strr)
f.write("%s\n" % strr)
f.flush()
printt(sys.argv)
# model_path = "/bili-coeus/jupyter/jupyterhub-liujing04/speech/pretrain/ContentVec_legacy500.pt"
model_path = "hubert_base.pt"
printt(exp_dir)
wavPath = "%s/1_16k_wavs"%exp_dir
outPath = "%s/3_feature256"%exp_dir
os.makedirs(outPath,exist_ok=True)
# wave must be 16k, hop_size=320
def readwave(wav_path, normalize=False):
wav, sr = sf.read(wav_path)
assert sr == 16000
feats = torch.from_numpy(wav).float()
if feats.dim() == 2: # double channels
feats = feats.mean(-1)
assert feats.dim() == 1, feats.dim()
if normalize:
with torch.no_grad():
feats = F.layer_norm(feats, feats.shape)
feats = feats.view(1, -1)
return feats
# HuBERT model
printt("load model(s) from {}".format(model_path))
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
[model_path],
suffix="",
)
model = models[0]
model = model.to(device)
model = model.half()
model.eval()
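# shard the file list: this process handles every n_part-th file, offset by i_part (one shard per spawned worker/GPU)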
todo=sorted(list(os.listdir(wavPath)))[i_part::n_part]
n = max(1,len(todo) // 10) # print at most ten progress messages
if(len(todo)==0):printt("no-feature-todo")
else:
printt("all-feature-%s"%len(todo))
for idx,file in enumerate(todo):
try:
if file.endswith(".wav"):
wav_path = "%s/%s"%(wavPath,file)
out_path = "%s/%s"%(outPath,file.replace("wav","npy"))
if(os.path.exists(out_path)):continue
feats = readwave(wav_path, normalize=saved_cfg.task.normalize)
padding_mask = torch.BoolTensor(feats.shape).fill_(False)
inputs = {
"source": feats.half().to(device),
"padding_mask": padding_mask.to(device),
"output_layer": 9, # layer 9
}
with torch.no_grad():
logits = model.extract_features(**inputs)
feats = model.final_proj(logits[0])
feats = feats.squeeze(0).float().cpu().numpy()
# feats = np.repeat(feats, 2,0) # 20ms -> 10ms
np.save(out_path, feats, allow_pickle=False)
if (idx % n == 0):printt("now-%s,all-%s,%s,%s"%(idx,len(todo),file,feats.shape))
except:
printt(traceback.format_exc())
printt("all-feature-done")

630
infer-web.py Normal file

@@ -0,0 +1,630 @@
from multiprocessing import cpu_count
import threading
from time import sleep
from subprocess import Popen,PIPE,run as runn
import torch, pdb, os,traceback,sys,warnings,shutil,numpy as np,faiss
# check whether there is an NVIDIA GPU that can be used for training and accelerated inference
ncpu=cpu_count()
ngpu=torch.cuda.device_count()
gpu_infos=[]
if(torch.cuda.is_available()==False or ngpu==0):if_gpu_ok=False
else:
if_gpu_ok = False
for i in range(ngpu):
gpu_name=torch.cuda.get_device_name(i)
if("16"in gpu_name or "MX"in gpu_name):continue
if("10"in gpu_name or "20"in gpu_name or "30"in gpu_name or "40"in gpu_name or "A50"in gpu_name.upper() or "70"in gpu_name or "80"in gpu_name or "90"in gpu_name or "M4"in gpu_name or "T4"in gpu_name or "TITAN"in gpu_name.upper()):#A10#A100#V100#A40#P40#M40#K80
if_gpu_ok=True# at least one usable NVIDIA GPU
gpu_infos.append("%s\t%s"%(i,gpu_name))
gpu_info="\n".join(gpu_infos)if if_gpu_ok==True and len(gpu_infos)>0 else "很遗憾您这没有能用的显卡来支持您训练"
gpus="-".join([i[0]for i in gpu_infos])
now_dir=os.getcwd()
sys.path.append(now_dir)
tmp=os.path.join(now_dir,"TEMP")
shutil.rmtree(tmp,ignore_errors=True)
os.makedirs(tmp,exist_ok=True)
os.makedirs(os.path.join(now_dir,"logs"),exist_ok=True)
os.makedirs(os.path.join(now_dir,"weights"),exist_ok=True)
os.environ["TEMP"]=tmp
warnings.filterwarnings("ignore")
torch.manual_seed(114514)
from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
from scipy.io import wavfile
from fairseq import checkpoint_utils
import gradio as gr
import librosa
import logging
from vc_infer_pipeline import VC
import soundfile as sf
from config import is_half,device
from infer_uvr5 import _audio_pre_
from my_utils import load_audio
from train.process_ckpt import show_info,change_info,merge,extract_small_model
# from trainset_preprocess_pipeline import PreProcess
logging.getLogger('numba').setLevel(logging.WARNING)
class ToolButton(gr.Button, gr.components.FormComponent):
"""Small button with single emoji as text, fits inside gradio forms"""
def __init__(self, **kwargs):
super().__init__(variant="tool", **kwargs)
def get_block_name(self):
return "button"
hubert_model=None
def load_hubert():
global hubert_model
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(["hubert_base.pt"],suffix="",)
hubert_model = models[0]
hubert_model = hubert_model.to(device)
if(is_half):hubert_model = hubert_model.half()
else:hubert_model = hubert_model.float()
hubert_model.eval()
weight_root="weights"
weight_uvr5_root="uvr5_weights"
names=[]
for name in os.listdir(weight_root):names.append(name)
uvr5_names=[]
for name in os.listdir(weight_uvr5_root):uvr5_names.append(name.replace(".pth",""))
def vc_single(sid,input_audio,f0_up_key,f0_file,f0_method,file_index,file_big_npy,index_rate):#spk_item, input_audio0, vc_transform0,f0_file,f0method0
global tgt_sr,net_g,vc,hubert_model
if input_audio is None:return "You need to upload an audio", None
f0_up_key = int(f0_up_key)
try:
audio=load_audio(input_audio,16000)
times = [0, 0, 0]
if(hubert_model==None):load_hubert()
if_f0 = cpt.get("f0", 1)
audio_opt=vc.pipeline(hubert_model,net_g,sid,audio,times,f0_up_key,f0_method,file_index,file_big_npy,index_rate,if_f0,f0_file=f0_file)
print(times)
return "Success", (tgt_sr, audio_opt)
except:
info=traceback.format_exc()
print(info)
return info,(None,None)
def vc_multi(sid,dir_path,opt_root,paths,f0_up_key,f0_method,file_index,file_big_npy,index_rate):
try:
dir_path=dir_path.strip(" ")# guard against users copying a path with leading/trailing spaces
opt_root=opt_root.strip(" ")
os.makedirs(opt_root, exist_ok=True)
try:
if(dir_path!=""):paths=[os.path.join(dir_path,name)for name in os.listdir(dir_path)]
else:paths=[path.name for path in paths]
except:
traceback.print_exc()
paths = [path.name for path in paths]
infos=[]
for path in paths:
info,opt=vc_single(sid,path,f0_up_key,None,f0_method,file_index,file_big_npy,index_rate)
if(info=="Success"):
try:
tgt_sr,audio_opt=opt
wavfile.write("%s/%s" % (opt_root, os.path.basename(path)), tgt_sr, audio_opt)
except:
info=traceback.format_exc()
infos.append("%s->%s"%(os.path.basename(path),info))
yield "\n".join(infos)
yield "\n".join(infos)
except:
yield traceback.format_exc()
def uvr(model_name,inp_root,save_root_vocal,paths,save_root_ins):
infos = []
try:
inp_root = inp_root.strip(" ").strip("\n")
save_root_vocal = save_root_vocal.strip(" ").strip("\n")
save_root_ins = save_root_ins.strip(" ").strip("\n")
pre_fun = _audio_pre_(model_path=os.path.join(weight_uvr5_root,model_name+".pth"), device=device, is_half=is_half)
if (inp_root != ""):paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)]
else:paths = [path.name for path in paths]
for name in paths:
inp_path=os.path.join(inp_root,name)
try:
pre_fun._path_audio_(inp_path , save_root_ins,save_root_vocal)
infos.append("%s->Success"%(os.path.basename(inp_path)))
yield "\n".join(infos)
except:
infos.append("%s->%s" % (os.path.basename(inp_path),traceback.format_exc()))
yield "\n".join(infos)
except:
infos.append(traceback.format_exc())
yield "\n".join(infos)
finally:
try:
del pre_fun.model
del pre_fun
except:
traceback.print_exc()
print("clean_empty_cache")
torch.cuda.empty_cache()
yield "\n".join(infos)
# only one voice model can be loaded globally per tab
def get_vc(sid):
global n_spk,tgt_sr,net_g,vc,cpt
if(sid==""):
global hubert_model
print("clean_empty_cache")
del net_g, n_spk, vc, hubert_model,tgt_sr#,cpt
hubert_model = net_g=n_spk=vc=hubert_model=tgt_sr=None
torch.cuda.empty_cache()
### without the juggling below, the memory is not cleaned up completely
if_f0 = cpt.get("f0", 1)
if (if_f0 == 1):
net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
else:
net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
del net_g,cpt
torch.cuda.empty_cache()
cpt=None
return {"visible": False, "__type__": "update"}
person = "%s/%s" % (weight_root, sid)
print("loading %s"%person)
cpt = torch.load(person, map_location="cpu")
tgt_sr = cpt["config"][-1]
cpt["config"][-3]=cpt["weight"]["emb_g.weight"].shape[0]#n_spk
if_f0=cpt.get("f0",1)
if(if_f0==1):
net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
else:
net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
del net_g.enc_q
print(net_g.load_state_dict(cpt["weight"], strict=False)) # without this line the cleanup is incomplete, oddly enough
net_g.eval().to(device)
if (is_half):net_g = net_g.half()
else:net_g = net_g.float()
vc = VC(tgt_sr, device, is_half)
n_spk=cpt["config"][-3]
return {"visible": True,"maximum": n_spk, "__type__": "update"}
def change_choices():return {"choices": sorted(list(os.listdir(weight_root))), "__type__": "update"}
def clean():return {"value": "", "__type__": "update"}
def change_f0(if_f0_3,sr2):#np7, f0method8,pretrained_G14,pretrained_D15
if(if_f0_3==""):return {"visible": True, "__type__": "update"},{"visible": True, "__type__": "update"},"pretrained/f0G%s.pth"%sr2,"pretrained/f0D%s.pth"%sr2
return {"visible": False, "__type__": "update"}, {"visible": False, "__type__": "update"},"pretrained/G%s.pth"%sr2,"pretrained/D%s.pth"%sr2
sr_dict={
"32k":32000,
"40k":40000,
"48k":48000,
}
def if_done(done,p):
while 1:
if(p.poll()==None):sleep(0.5)
else:break
done[0]=True
def if_done_multi(done,ps):
while 1:
# poll()==None means the process has not finished yet
# keep waiting as long as any process is still running
flag=1
for p in ps:
if(p.poll()==None):
flag = 0
sleep(0.5)
break
if(flag==1):break
done[0]=True
def preprocess_dataset(trainset_dir,exp_dir,sr,n_p=ncpu):
sr=sr_dict[sr]
os.makedirs("%s/logs/%s"%(now_dir,exp_dir),exist_ok=True)
f = open("%s/logs/%s/preprocess.log"%(now_dir,exp_dir), "w")
f.close()
cmd="python trainset_preprocess_pipeline_print.py %s %s %s %s/logs/%s"%(trainset_dir,sr,n_p,now_dir,exp_dir)
print(cmd)
p = Popen(cmd, shell=True)#, stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir
### under gradio, reading from Popen only returns everything at once after the whole run finishes (outside gradio it streams line by line), so a log file is polled on a timer instead
done=[False]
threading.Thread(target=if_done,args=(done,p,)).start()
while(1):
with open("%s/logs/%s/preprocess.log"%(now_dir,exp_dir),"r")as f:yield(f.read())
sleep(1)
if(done[0]==True):break
with open("%s/logs/%s/preprocess.log"%(now_dir,exp_dir), "r")as f:log = f.read()
print(log)
yield log
#but2.click(extract_f0,[gpus6,np7,f0method8,if_f0_3,trainset_dir4],[info2])
def extract_f0_feature(gpus,n_p,f0method,if_f0,exp_dir):
gpus=gpus.split("-")
os.makedirs("%s/logs/%s"%(now_dir,exp_dir),exist_ok=True)
f = open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir), "w")
f.close()
if(if_f0==""):
cmd="python extract_f0_print.py %s/logs/%s %s %s"%(now_dir,exp_dir,n_p,f0method)
print(cmd)
p = Popen(cmd, shell=True,cwd=now_dir)#, stdin=PIPE, stdout=PIPE,stderr=PIPE
### under gradio, reading from Popen only returns everything at once after the whole run finishes (outside gradio it streams line by line), so a log file is polled on a timer instead
done=[False]
threading.Thread(target=if_done,args=(done,p,)).start()
while(1):
with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir),"r")as f:yield(f.read())
sleep(1)
if(done[0]==True):break
with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir), "r")as f:log = f.read()
print(log)
yield log
#### spawn a separate process for each part
'''
n_part=int(sys.argv[1])
i_part=int(sys.argv[2])
i_gpu=sys.argv[3]
exp_dir=sys.argv[4]
os.environ["CUDA_VISIBLE_DEVICES"]=str(i_gpu)
'''
leng=len(gpus)
ps=[]
for idx,n_g in enumerate(gpus):
cmd="python extract_feature_print.py %s %s %s %s/logs/%s"%(leng,idx,n_g,now_dir,exp_dir)
print(cmd)
p = Popen(cmd, shell=True, cwd=now_dir)#, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
ps.append(p)
### under gradio, reading from Popen only returns everything at once after the whole run finishes (outside gradio it streams line by line), so a log file is polled on a timer instead
done = [False]
threading.Thread(target=if_done_multi, args=(done, ps,)).start()
while (1):
with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir), "r")as f:yield (f.read())
sleep(1)
if (done[0] == True): break
with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir), "r")as f:log = f.read()
print(log)
yield log
def change_sr2(sr2,if_f0_3):
if(if_f0_3==""):return "pretrained/f0G%s.pth"%sr2,"pretrained/f0D%s.pth"%sr2
else:return "pretrained/G%s.pth"%sr2,"pretrained/D%s.pth"%sr2
#but3.click(click_train,[exp_dir1,sr2,if_f0_3,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16])
def click_train(exp_dir1,sr2,if_f0_3,spk_id5,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16,if_cache_gpu17):
# build the filelist
exp_dir="%s/logs/%s"%(now_dir,exp_dir1)
os.makedirs(exp_dir,exist_ok=True)
gt_wavs_dir="%s/0_gt_wavs"%(exp_dir)
co256_dir="%s/3_feature256"%(exp_dir)
if(if_f0_3==""):
f0_dir = "%s/2a_f0" % (exp_dir)
f0nsf_dir="%s/2b-f0nsf"%(exp_dir)
names=set([name.split(".")[0]for name in os.listdir(gt_wavs_dir)])&set([name.split(".")[0]for name in os.listdir(co256_dir)])&set([name.split(".")[0]for name in os.listdir(f0_dir)])&set([name.split(".")[0]for name in os.listdir(f0nsf_dir)])
else:
names=set([name.split(".")[0]for name in os.listdir(gt_wavs_dir)])&set([name.split(".")[0]for name in os.listdir(co256_dir)])
opt=[]
for name in names:
if (if_f0_3 == ""):
opt.append("%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s"%(gt_wavs_dir.replace("\\","\\\\"),name,co256_dir.replace("\\","\\\\"),name,f0_dir.replace("\\","\\\\"),name,f0nsf_dir.replace("\\","\\\\"),name,spk_id5))
else:
opt.append("%s/%s.wav|%s/%s.npy|%s"%(gt_wavs_dir.replace("\\","\\\\"),name,co256_dir.replace("\\","\\\\"),name,spk_id5))
with open("%s/filelist.txt"%exp_dir,"w")as f:f.write("\n".join(opt))
print("write filelist done")
# generate config  # (no config generation is actually needed)
# cmd = "python train_nsf_sim_cache_sid_load_pretrain.py -e mi-test -sr 40k -f0 1 -bs 4 -g 0 -te 10 -se 5 -pg pretrained/f0G40k.pth -pd pretrained/f0D40k.pth -l 1 -c 0"
cmd = "python train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s -pg %s -pd %s -l %s -c %s" % (exp_dir1,sr2,1 if if_f0_3==""else 0,batch_size12,gpus16,total_epoch11,save_epoch10,pretrained_G14,pretrained_D15,1 if if_save_latest13==""else 0,1 if if_cache_gpu17==""else 0)
print(cmd)
p = Popen(cmd, shell=True, cwd=now_dir)
p.wait()
return "训练结束您可查看控制台训练日志或实验文件夹下的train.log"
# but4.click(train_index, [exp_dir1], info3)
def train_index(exp_dir1):
exp_dir="%s/logs/%s"%(now_dir,exp_dir1)
os.makedirs(exp_dir,exist_ok=True)
feature_dir="%s/3_feature256"%(exp_dir)
if(os.path.exists(feature_dir)==False):return "请先进行特征提取!"
listdir_res=list(os.listdir(feature_dir))
if(len(listdir_res)==0):return "请先进行特征提取!"
npys = []
for name in sorted(listdir_res):
phone = np.load("%s/%s" % (feature_dir, name))
npys.append(phone)
big_npy = np.concatenate(npys, 0)
np.save("%s/total_fea.npy"%exp_dir, big_npy)
n_ivf = big_npy.shape[0] // 39
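# heuristic: roughly one IVF centroid per 39 feature vectors; nprobe is then set to n_ivf**0.3 below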
infos=[]
infos.append("%s,%s"%(big_npy.shape,n_ivf))
yield "\n".join(infos)
index = faiss.index_factory(256, "IVF%s,Flat"%n_ivf)
infos.append("training")
yield "\n".join(infos)
index_ivf = faiss.extract_index_ivf(index) #
index_ivf.nprobe = int(np.power(n_ivf,0.3))
index.train(big_npy)
faiss.write_index(index, '%s/trained_IVF%s_Flat_nprobe_%s.index'%(exp_dir,n_ivf,index_ivf.nprobe))
infos.append("adding")
yield "\n".join(infos)
index.add(big_npy)
faiss.write_index(index, '%s/added_IVF%s_Flat_nprobe_%s.index'%(exp_dir,n_ivf,index_ivf.nprobe))
infos.append("成功构建索引added_IVF%s_Flat_nprobe_%s.index"%(n_ivf,index_ivf.nprobe))
yield "\n".join(infos)
#but5.click(train1key, [exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17], info3)
def train1key(exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17):
infos=[]
def get_info_str(strr):
infos.append(strr)
return "\n".join(infos)
os.makedirs("%s/logs/%s"%(now_dir,exp_dir1),exist_ok=True)
######### step 1: preprocess the data
open("%s/logs/%s/preprocess.log"%(now_dir,exp_dir1), "w").close()
cmd="python trainset_preprocess_pipeline_print.py %s %s %s %s/logs/%s"%(trainset_dir4,sr_dict[sr2],ncpu,now_dir,exp_dir1)
yield get_info_str("step1:正在处理数据")
yield get_info_str(cmd)
p = Popen(cmd, shell=True)
p.wait()
with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir1), "r")as f: print(f.read())
######### step 2a: extract pitch (f0)
open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir1), "w")
if(if_f0_3==""):
yield get_info_str("step2a:正在提取音高")
cmd="python extract_f0_print.py %s/logs/%s %s %s"%(now_dir,exp_dir1,np7,f0method8)
yield get_info_str(cmd)
p = Popen(cmd, shell=True,cwd=now_dir)
p.wait()
with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir1), "r")as f:print(f.read())
else:yield get_info_str("step2a:无需提取音高")
####### step 2b: extract features
yield get_info_str("step2b:正在提取特征")
gpus=gpus16.split("-")
leng=len(gpus)
ps=[]
for idx,n_g in enumerate(gpus):
cmd="python extract_feature_print.py %s %s %s %s/logs/%s"%(leng,idx,n_g,now_dir,exp_dir1)
yield get_info_str(cmd)
p = Popen(cmd, shell=True, cwd=now_dir)#, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
ps.append(p)
for p in ps:p.wait()
with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir1), "r")as f:print(f.read())
####### step 3a: train the model
yield get_info_str("step3a:正在训练模型")
# build the filelist
exp_dir="%s/logs/%s"%(now_dir,exp_dir1)
gt_wavs_dir="%s/0_gt_wavs"%(exp_dir)
co256_dir="%s/3_feature256"%(exp_dir)
if(if_f0_3==""):
f0_dir = "%s/2a_f0" % (exp_dir)
f0nsf_dir="%s/2b-f0nsf"%(exp_dir)
names=set([name.split(".")[0]for name in os.listdir(gt_wavs_dir)])&set([name.split(".")[0]for name in os.listdir(co256_dir)])&set([name.split(".")[0]for name in os.listdir(f0_dir)])&set([name.split(".")[0]for name in os.listdir(f0nsf_dir)])
else:
names=set([name.split(".")[0]for name in os.listdir(gt_wavs_dir)])&set([name.split(".")[0]for name in os.listdir(co256_dir)])
opt=[]
for name in names:
if (if_f0_3 == ""):
opt.append("%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s"%(gt_wavs_dir.replace("\\","\\\\"),name,co256_dir.replace("\\","\\\\"),name,f0_dir.replace("\\","\\\\"),name,f0nsf_dir.replace("\\","\\\\"),name,spk_id5))
else:
opt.append("%s/%s.wav|%s/%s.npy|%s"%(gt_wavs_dir.replace("\\","\\\\"),name,co256_dir.replace("\\","\\\\"),name,spk_id5))
with open("%s/filelist.txt"%exp_dir,"w")as f:f.write("\n".join(opt))
yield get_info_str("write filelist done")
cmd = "python train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s -pg %s -pd %s -l %s -c %s" % (exp_dir1,sr2,1 if if_f0_3==""else 0,batch_size12,gpus16,total_epoch11,save_epoch10,pretrained_G14,pretrained_D15,1 if if_save_latest13==""else 0,1 if if_cache_gpu17==""else 0)
yield get_info_str(cmd)
p = Popen(cmd, shell=True, cwd=now_dir)
p.wait()
yield get_info_str("训练结束您可查看控制台训练日志或实验文件夹下的train.log")
####### step 3b: train the index
feature_dir="%s/3_feature256"%(exp_dir)
npys = []
listdir_res=list(os.listdir(feature_dir))
for name in sorted(listdir_res):
phone = np.load("%s/%s" % (feature_dir, name))
npys.append(phone)
big_npy = np.concatenate(npys, 0)
np.save("%s/total_fea.npy"%exp_dir, big_npy)
n_ivf = big_npy.shape[0] // 39
yield get_info_str("%s,%s"%(big_npy.shape,n_ivf))
index = faiss.index_factory(256, "IVF%s,Flat"%n_ivf)
yield get_info_str("training index")
index_ivf = faiss.extract_index_ivf(index) #
index_ivf.nprobe = int(np.power(n_ivf,0.3))
index.train(big_npy)
faiss.write_index(index, '%s/trained_IVF%s_Flat_nprobe_%s.index'%(exp_dir,n_ivf,index_ivf.nprobe))
yield get_info_str("adding index")
index.add(big_npy)
faiss.write_index(index, '%s/added_IVF%s_Flat_nprobe_%s.index'%(exp_dir,n_ivf,index_ivf.nprobe))
yield get_info_str("成功构建索引added_IVF%s_Flat_nprobe_%s.index"%(n_ivf,index_ivf.nprobe))
yield get_info_str("全流程结束!")
# ckpt_path2.change(change_info_,[ckpt_path2],[sr__,if_f0__])
def change_info_(ckpt_path):
if(os.path.exists(ckpt_path.replace(os.path.basename(ckpt_path),"train.log"))==False):return {"__type__": "update"},{"__type__": "update"}
try:
with open(ckpt_path.replace(os.path.basename(ckpt_path),"train.log"),"r")as f:
info=eval(f.read().strip("\n").split("\n")[0].split("\t")[-1])
sr,f0=info["sample_rate"],info["if_f0"]
return sr,str(f0)
except:
traceback.print_exc()
return {"__type__": "update"}, {"__type__": "update"}
with gr.Blocks() as app:
gr.Markdown(value="""
This software is open-sourced under the MIT license. The author has no control over the software; users of the software and anyone distributing audio exported with it bear full responsibility. <br>
If you do not accept these terms, you may not use or reference any code or files in this package. See "使用需遵守的协议-LICENSE.txt" in the root directory for details.
""")
with gr.Tabs():
with gr.TabItem("模型推理"):
with gr.Row():
sid0 = gr.Dropdown(label="推理音色", choices=names)
refresh_button = gr.Button("刷新音色列表", variant="primary")
refresh_button.click(
fn=change_choices,
inputs=[],
outputs=[sid0]
)
clean_button = gr.Button("卸载音色省显存", variant="primary")
spk_item = gr.Slider(minimum=0, maximum=2333, step=1, label='请选择说话人id', value=0, visible=False, interactive=True)
clean_button.click(
fn=clean,
inputs=[],
outputs=[sid0]
)
sid0.change(
fn=get_vc,
inputs=[sid0],
outputs=[spk_item],
)
with gr.Group():
gr.Markdown(value="""
Recommended: +12 key for male-to-female conversion and -12 key for female-to-male. If the pitch range overflows and the timbre distorts, you can also adjust it to a suitable range yourself.
""")
with gr.Row():
with gr.Column():
vc_transform0 = gr.Number(label="变调整数半音数量升八度12降八度-12", value=0)
input_audio0 = gr.Textbox(label="输入待处理音频文件路径(默认是正确格式示例)",value="E:\codes\py39\\vits_vc_gpu_train\\todo-songs\冬之花clip1.wav")
f0method0=gr.Radio(label="选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比", choices=["pm","harvest"],value="pm", interactive=True)
with gr.Column():
file_index1 = gr.Textbox(label="特征检索库文件路径",value="E:\codes\py39\\vits_vc_gpu_train\logs\mi-test-1key\\added_IVF677_Flat_nprobe_7.index", interactive=True)
file_big_npy1 = gr.Textbox(label="特征文件路径",value="E:\codes\py39\\vits_vc_gpu_train\logs\mi-test-1key\\total_fea.npy", interactive=True)
index_rate1 = gr.Slider(minimum=0, maximum=1,label='检索特征占比', value=1,interactive=True)
f0_file = gr.File(label="F0曲线文件可选一行一个音高代替默认F0及升降调")
but0=gr.Button("转换", variant="primary")
with gr.Column():
vc_output1 = gr.Textbox(label="输出信息")
vc_output2 = gr.Audio(label="输出音频(右下角三个点,点了可以下载)")
but0.click(vc_single, [spk_item, input_audio0, vc_transform0,f0_file,f0method0,file_index1,file_big_npy1,index_rate1], [vc_output1, vc_output2])
with gr.Group():
gr.Markdown(value="""
Batch conversion: enter the folder of audio files to convert, or upload multiple audio files; converted audio is written to the specified output folder (opt by default).
""")
with gr.Row():
with gr.Column():
vc_transform1 = gr.Number(label="变调整数半音数量升八度12降八度-12", value=0)
opt_input = gr.Textbox(label="指定输出文件夹",value="opt")
f0method1=gr.Radio(label="选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比", choices=["pm","harvest"],value="pm", interactive=True)
with gr.Column():
file_index2 = gr.Textbox(label="特征检索库文件路径",value="E:\codes\py39\\vits_vc_gpu_train\logs\mi-test-1key\\added_IVF677_Flat_nprobe_7.index", interactive=True)
file_big_npy2 = gr.Textbox(label="特征文件路径",value="E:\codes\py39\\vits_vc_gpu_train\logs\mi-test-1key\\total_fea.npy", interactive=True)
index_rate2 = gr.Slider(minimum=0, maximum=1,label='检索特征占比', value=1,interactive=True)
with gr.Column():
dir_input = gr.Textbox(label="输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)",value="E:\codes\py39\\vits_vc_gpu_train\\todo-songs")
inputs = gr.File(file_count="multiple", label="也可批量输入音频文件,二选一,优先读文件夹")
but1=gr.Button("转换", variant="primary")
vc_output3 = gr.Textbox(label="输出信息")
but1.click(vc_multi, [spk_item, dir_input,opt_input,inputs, vc_transform1,f0method1,file_index2,file_big_npy2,index_rate2], [vc_output3])
with gr.TabItem("伴奏人声分离"):
with gr.Group():
gr.Markdown(value="""
Batch vocal/accompaniment separation, using the UVR5 models. <br>
Use HP2 for tracks without harmony vocals; use HP5 if there are harmonies but the extracted vocals should not keep them. <br>
Example of a valid folder path: E:\codes\py39\\vits_vc_gpu\白鹭霜华测试样例 (just copy it from the file manager address bar).
""")
with gr.Row():
with gr.Column():
dir_wav_input = gr.Textbox(label="输入待处理音频文件夹路径",value="E:\codes\py39\\vits_vc_gpu_train\\todo-songs")
wav_inputs = gr.File(file_count="multiple", label="也可批量输入音频文件,二选一,优先读文件夹")
with gr.Column():
model_choose = gr.Dropdown(label="模型", choices=uvr5_names)
opt_vocal_root = gr.Textbox(label="指定输出人声文件夹",value="opt")
opt_ins_root = gr.Textbox(label="指定输出乐器文件夹",value="opt")
but2=gr.Button("转换", variant="primary")
vc_output4 = gr.Textbox(label="输出信息")
but2.click(uvr, [model_choose, dir_wav_input,opt_vocal_root,wav_inputs,opt_ins_root], [vc_output4])
with gr.TabItem("训练"):
gr.Markdown(value="""
step1: fill in the experiment configuration. Experiment data is stored under logs, one folder per experiment. Enter the experiment name manually; the folder contains the experiment configuration, logs, and trained model files.
""")
with gr.Row():
exp_dir1 = gr.Textbox(label="输入实验名",value="mi-test")
sr2 = gr.Radio(label="目标采样率", choices=["32k","40k","48k"],value="40k", interactive=True)
if_f0_3 = gr.Radio(label="模型是否带音高指导(唱歌一定要,语音可以不要)", choices=["",""],value="", interactive=True)
with gr.Group():# single-speaker only for now; up to 4 speakers planned later # data processing
gr.Markdown(value="""
step2a: automatically walk all files in the training folder that can be decoded as audio, slice and normalize them, and generate 2 wav folders in the experiment directory. Only single-speaker training is supported for now.
""")
with gr.Row():
trainset_dir4 = gr.Textbox(label="输入训练文件夹路径",value="E:\语音音频+标注\米津玄师\src")
spk_id5 = gr.Slider(minimum=0, maximum=4, step=1, label='请指定说话人id', value=0,interactive=True)
but1=gr.Button("处理数据", variant="primary")
info1=gr.Textbox(label="输出信息",value="")
but1.click(preprocess_dataset,[trainset_dir4,exp_dir1,sr2],[info1])
with gr.Group():
gr.Markdown(value="""
step2b: extract pitch with the CPU (if the model uses pitch guidance) and extract features with the GPU (choose the GPU indices).
""")
with gr.Row():
with gr.Column():
gpus6 = gr.Textbox(label="以-分隔输入使用的卡号,例如 0-1-2 使用卡0和卡1和卡2",value=gpus,interactive=True)
gpu_info9 = gr.Textbox(label="显卡信息",value=gpu_info)
with gr.Column():
np7 = gr.Slider(minimum=0, maximum=ncpu, step=1, label='提取音高使用的CPU进程数', value=ncpu,interactive=True)
f0method8 = gr.Radio(label="选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢", choices=["pm", "harvest","dio"], value="harvest", interactive=True)
but2=gr.Button("特征提取", variant="primary")
info2=gr.Textbox(label="输出信息",value="",max_lines=8)
but2.click(extract_f0_feature,[gpus6,np7,f0method8,if_f0_3,exp_dir1],[info2])
with gr.Group():
gr.Markdown(value="""
step3: fill in the training settings and start training the model and the index.
""")
with gr.Row():
save_epoch10 = gr.Slider(minimum=0, maximum=50, step=1, label='保存频率save_every_epoch', value=5,interactive=True)
total_epoch11 = gr.Slider(minimum=0, maximum=100, step=1, label='总训练轮数total_epoch', value=10,interactive=True)
batch_size12 = gr.Slider(minimum=0, maximum=32, step=1, label='batch_size', value=4,interactive=True)
if_save_latest13 = gr.Radio(label="是否仅保存最新的ckpt文件以节省硬盘空间", choices=["", ""], value="", interactive=True)
if_cache_gpu17 = gr.Radio(label="是否缓存所有训练集至显存。10min以下小数据可缓存以加速训练大数据缓存会炸显存也加不了多少速", choices=["", ""], value="", interactive=True)
with gr.Row():
pretrained_G14 = gr.Textbox(label="加载预训练底模G路径", value="pretrained/f0G40k.pth",interactive=True)
pretrained_D15 = gr.Textbox(label="加载预训练底模D路径", value="pretrained/f0D40k.pth",interactive=True)
sr2.change(change_sr2, [sr2,if_f0_3], [pretrained_G14,pretrained_D15])
if_f0_3.change(change_f0, [if_f0_3, sr2], [np7, f0method8, pretrained_G14, pretrained_D15])
gpus16 = gr.Textbox(label="以-分隔输入使用的卡号,例如 0-1-2 使用卡0和卡1和卡2", value=gpus,interactive=True)
but3 = gr.Button("训练模型", variant="primary")
but4 = gr.Button("训练特征索引", variant="primary")
but5 = gr.Button("一键训练", variant="primary")
info3 = gr.Textbox(label="输出信息", value="",max_lines=10)
but3.click(click_train,[exp_dir1,sr2,if_f0_3,spk_id5,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16,if_cache_gpu17],info3)
but4.click(train_index,[exp_dir1],info3)
but5.click(train1key,[exp_dir1,sr2,if_f0_3,trainset_dir4,spk_id5,gpus6,np7,f0method8,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16,if_cache_gpu17],info3)
with gr.TabItem("ckpt处理"):
with gr.Group():
gr.Markdown(value="""Model fusion, which can be used to experiment with timbre blending""")
with gr.Row():
ckpt_a = gr.Textbox(label="A模型路径", value="", interactive=True)
ckpt_b = gr.Textbox(label="B模型路径", value="", interactive=True)
alpha_a = gr.Slider(minimum=0, maximum=1, label='A模型权重', value=0.5, interactive=True)
with gr.Row():
sr_ = gr.Radio(label="目标采样率", choices=["32k","40k","48k"],value="40k", interactive=True)
if_f0_ = gr.Radio(label="模型是否带音高指导", choices=["",""],value="", interactive=True)
info__ = gr.Textbox(label="要置入的模型信息", value="", max_lines=8, interactive=True)
name_to_save0=gr.Textbox(label="保存的模型名不带后缀", value="", max_lines=1, interactive=True)
with gr.Row():
but6 = gr.Button("融合", variant="primary")
info4 = gr.Textbox(label="输出信息", value="", max_lines=8)
but6.click(merge, [ckpt_a,ckpt_b,alpha_a,sr_,if_f0_,info__,name_to_save0], info4)#def merge(path1,path2,alpha1,sr,f0,info):
with gr.Group():
gr.Markdown(value="Edit model info (only supported for small model files extracted under the weights folder)")
with gr.Row():
ckpt_path0 = gr.Textbox(label="模型路径", value="", interactive=True)
info_=gr.Textbox(label="要改的模型信息", value="", max_lines=8, interactive=True)
name_to_save1=gr.Textbox(label="保存的文件名,默认空为和源文件同名", value="", max_lines=8, interactive=True)
with gr.Row():
but7 = gr.Button("修改", variant="primary")
info5 = gr.Textbox(label="输出信息", value="", max_lines=8)
but7.click(change_info, [ckpt_path0,info_,name_to_save1], info5)
with gr.Group():
gr.Markdown(value="View model info (only supported for small model files extracted under the weights folder)")
with gr.Row():
ckpt_path1 = gr.Textbox(label="模型路径", value="", interactive=True)
but8 = gr.Button("查看", variant="primary")
info6 = gr.Textbox(label="输出信息", value="", max_lines=8)
but8.click(show_info, [ckpt_path1], info6)
with gr.Group():
gr.Markdown(value="Model extraction (enter the path of a large checkpoint under the logs folder); useful if training was stopped halfway and no small model was extracted and saved automatically, or if you want to test an intermediate checkpoint")
with gr.Row():
ckpt_path2 = gr.Textbox(label="模型路径", value="E:\codes\py39\logs\mi-test_f0_48k\\G_23333.pth", interactive=True)
save_name = gr.Textbox(label="保存名", value="", interactive=True)
sr__ = gr.Radio(label="目标采样率", choices=["32k","40k","48k"],value="40k", interactive=True)
if_f0__ = gr.Radio(label="模型是否带音高指导,1是0否", choices=["1","0"],value="1", interactive=True)
info___ = gr.Textbox(label="要置入的模型信息", value="", max_lines=8, interactive=True)
but9 = gr.Button("提取", variant="primary")
info7 = gr.Textbox(label="输出信息", value="", max_lines=8)
ckpt_path2.change(change_info_,[ckpt_path2],[sr__,if_f0__])
but9.click(extract_small_model, [ckpt_path2,save_name,sr__,if_f0__,info___], info7)
with gr.TabItem("招募音高曲线前端编辑器"):
gr.Markdown(value="""Join the development group to contact me: xxxxx""")
with gr.TabItem("点击查看交流、问题反馈群号"):
gr.Markdown(value="""xxxxx""")
# app.launch(server_name="0.0.0.0",server_port=7860)
# app.queue(concurrency_count=511, max_size=1022).launch(server_name="127.0.0.1",inbrowser=True,server_port=7861,quiet=True)
app.queue(concurrency_count=511, max_size=1022).launch(server_name="0.0.0.0",inbrowser=True,server_port=7865,quiet=True)

108
infer_uvr5.py Normal file

@@ -0,0 +1,108 @@
import os,sys,torch,warnings,pdb
warnings.filterwarnings("ignore")
import librosa
import importlib
import numpy as np
import hashlib , math
from tqdm import tqdm
from uvr5_pack.lib_v5 import spec_utils
from uvr5_pack.utils import _get_name_params,inference
from uvr5_pack.lib_v5.model_param_init import ModelParameters
from scipy.io import wavfile
class _audio_pre_():
def __init__(self, model_path,device,is_half):
self.model_path = model_path
self.device = device
self.data = {
# Processing Options
'postprocess': False,
'tta': False,
# Constants
'window_size': 512,
'agg': 10,
'high_end_process': 'mirroring',
}
nn_arch_sizes = [
31191, # default
33966,61968, 123821, 123812, 537238 # custom
]
self.nn_architecture = list('{}KB'.format(s) for s in nn_arch_sizes)
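# pick the network variant whose known size (KB) is closest to the checkpoint file size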
model_size = math.ceil(os.stat(model_path ).st_size / 1024)
nn_architecture = '{}KB'.format(min(nn_arch_sizes, key=lambda x:abs(x-model_size)))
nets = importlib.import_module('uvr5_pack.lib_v5.nets' + f'_{nn_architecture}'.replace('_{}KB'.format(nn_arch_sizes[0]), ''), package=None)
model_hash = hashlib.md5(open(model_path,'rb').read()).hexdigest()
param_name ,model_params_d = _get_name_params(model_path , model_hash)
mp = ModelParameters(model_params_d)
model = nets.CascadedASPPNet(mp.param['bins'] * 2)
cpk = torch.load( model_path , map_location='cpu')
model.load_state_dict(cpk)
model.eval()
if(is_half==True):model = model.half().to(device)
else:model = model.to(device)
self.mp = mp
self.model = model
def _path_audio_(self, music_file ,ins_root=None,vocal_root=None):
if(ins_root is None and vocal_root is None):return "No save root."
name=os.path.basename(music_file)
if(ins_root is not None):os.makedirs(ins_root, exist_ok=True)
if(vocal_root is not None):os.makedirs(vocal_root , exist_ok=True)
X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
bands_n = len(self.mp.param['band'])
# print(bands_n)
for d in range(bands_n, 0, -1):
bp = self.mp.param['band'][d]
if d == bands_n: # high-end band
X_wave[d], _ = librosa.core.load(# in theory librosa may mis-load some audio and ffmpeg should be used instead, but that was too much hassle, so it was dropped
music_file, bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
if X_wave[d].ndim == 1:
X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
else: # lower bands
X_wave[d] = librosa.core.resample(X_wave[d+1], self.mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])
# Stft of wave source
X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(X_wave[d], bp['hl'], bp['n_fft'], self.mp.param['mid_side'], self.mp.param['mid_side_b2'], self.mp.param['reverse'])
# pdb.set_trace()
if d == bands_n and self.data['high_end_process'] != 'none':
input_high_end_h = (bp['n_fft']//2 - bp['crop_stop']) + ( self.mp.param['pre_filter_stop'] - self.mp.param['pre_filter_start'])
input_high_end = X_spec_s[d][:, bp['n_fft']//2-input_high_end_h:bp['n_fft']//2, :]
X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
aggresive_set = float(self.data['agg']/100)
aggressiveness = {'value': aggresive_set, 'split_bin': self.mp.param['band'][1]['crop_stop']}
with torch.no_grad():
pred, X_mag, X_phase = inference(X_spec_m,self.device,self.model, aggressiveness,self.data)
# Postprocess
if self.data['postprocess']:
pred_inv = np.clip(X_mag - pred, 0, np.inf)
pred = spec_utils.mask_silence(pred, pred_inv)
y_spec_m = pred * X_phase
v_spec_m = X_spec_m - y_spec_m
if (ins_root is not None):
if self.data['high_end_process'].startswith('mirroring'):
input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], y_spec_m, input_high_end, self.mp)
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp,input_high_end_h, input_high_end_)
else:
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
print ('%s instruments done'%name)
wavfile.write(os.path.join(ins_root, 'instrument_{}.wav'.format(name) ), self.mp.param['sr'], (np.array(wav_instrument)*32768).astype("int16")) #
if (vocal_root is not None):
if self.data['high_end_process'].startswith('mirroring'):
input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], v_spec_m, input_high_end, self.mp)
wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp, input_high_end_h, input_high_end_)
else:
wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
print ('%s vocals done'%name)
wavfile.write(os.path.join(vocal_root , 'vocal_{}.wav'.format(name) ), self.mp.param['sr'], (np.array(wav_vocals)*32768).astype("int16"))
if __name__ == '__main__':
device = 'cuda'
is_half=True
model_path='uvr5_weights/2_HP-UVR.pth'
pre_fun = _audio_pre_(model_path=model_path,device=device,is_half=True)
audio_path = '神女劈观.aac'
save_path = 'opt'
pre_fun._path_audio_(audio_path , save_path,save_path)

18
my_utils.py Normal file

@@ -0,0 +1,18 @@
import ffmpeg,numpy as np
def load_audio(file,sr):
try:
# https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
# This launches a subprocess to decode audio while down-mixing and resampling as necessary.
# Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
out, _ = (
ffmpeg.input(file, threads=0)
.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
.run(cmd=["./ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
)
except ffmpeg.Error as e:
raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
if __name__=='__main__' :
print(load_audio(r"C:\CloudMusic\宮野幸子,森下唯 - 月夜に謳う君 -LUNA-.mp3",16000).shape)

196
requirements.txt Normal file

@@ -0,0 +1,196 @@
absl-py==1.1.0
aiofiles==23.1.0
aiohttp==3.8.4
aiosignal==1.3.1
altair==4.2.0
antlr4-python3-runtime==4.8
anyio==3.6.1
appdirs==1.4.4
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
asttokens==2.0.5
async-timeout==4.0.2
attrs==21.4.0
audioread==3.0.0
Babel==2.10.3
backcall==0.2.0
beautifulsoup4==4.11.1
bitarray==2.7.3
bleach==5.0.0
brotlipy==0.7.0
cachetools==5.2.0
certifi==2021.5.30
cffi
chardet
charset-normalizer==3.0.1
click==8.1.3
cmake==3.25.0
colorama==0.4.5
cryptography
cycler==0.11.0
Cython==0.29.32
debugpy==1.6.0
decorator==5.1.1
defusedxml==0.7.1
entrypoints==0.4
executing==0.8.3
fairseq==0.12.2
faiss-gpu==1.7.2
fastapi==0.92.0
fastjsonschema==2.15.3
ffmpeg==1.4
ffmpy==0.3.0
filelock==3.9.0
fonttools==4.33.3
frozenlist==1.3.3
fsspec==2022.11.0
functorch==2.0.0
future==0.18.3
google==3.0.0
google-auth==2.8.0
google-auth-oauthlib==0.4.6
googleads==3.8.0
gradio==3.19.1
grpcio==1.46.3
h11==0.13.0
httpcore==0.16.3
httplib2==0.21.0
httpx==0.23.1
Hydra==2.5
hydra-core==1.0.7
idna
importlib-metadata==4.11.4
importlib-resources==5.8.0
ipykernel==6.15.0
ipython==8.4.0
ipython-genutils==0.2.0
ipywidgets==7.7.0
jedi==0.18.1
Jinja2==3.1.2
joblib==1.1.0
json5==0.9.8
jsonschema==4.6.0
jupyter-client==7.3.4
jupyter-core==4.10.0
jupyter-server==1.17.1
jupyterlab==3.4.3
jupyterlab-language-pack-zh-CN==3.4.post1
jupyterlab-pygments==0.2.2
jupyterlab-server==2.14.0
jupyterlab-widgets==1.1.0
kiwisolver==1.4.3
lazy-loader==0.1
librosa==0.9.2
linkify-it-py==2.0.0
lit==15.0.7
llvmlite==0.39.0
lxml==4.8.0
Markdown==3.3.7
markdown-it-py==2.2.0
MarkupSafe==2.1.1
matplotlib==3.5.2
matplotlib-inline==0.1.3
mdit-py-plugins==0.3.3
mdurl==0.1.1
mistune==0.8.4
mpmath==1.2.1
msgpack==1.0.4
multidict==6.0.2
nbclassic==0.3.7
nbclient==0.6.4
nbconvert==6.5.0
nbformat==5.4.0
nest-asyncio==1.5.5
networkx==2.8.8
notebook==6.4.12
notebook-shim==0.1.0
numba==0.56.4
numpy==1.23.5
oauth2client==4.1.3
oauthlib==3.2.0
omegaconf==2.0.6
orjson==3.8.6
packaging==21.3
pandas==1.5.2
pandocfilters==1.5.0
parso==0.8.3
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.1.1
pooch==1.6.0
portalocker==2.5.1
praat-parselmouth==0.4.2
prometheus-client==0.14.1
prompt-toolkit==3.0.29
protobuf==3.19.4
psutil==5.9.1
ptyprocess==0.7.0
pure-eval==0.2.2
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycosat==0.6.3
pycparser
pycryptodome==3.16.0
pydantic==1.10.5
pydub==0.25.1
Pygments==2.12.0
pyOpenSSL
pyparsing==3.0.9
pyrsistent==0.18.1
PySocks
python-dateutil==2.8.2
python-multipart==0.0.5
pytz==2022.6
pyworld==0.3.2
PyYAML==6.0
pyzmq==23.2.0
regex==2022.10.31
requests
requests-oauthlib==1.3.1
resampy==0.4.2
rfc3986==1.5.0
rsa==4.8
ruamel-yaml-conda
sacrebleu==2.3.1
scikit-learn==1.1.3
scipy==1.9.3
Send2Trash==1.8.0
six
sniffio==1.2.0
soundfile==0.12.1
soupsieve==2.3.2.post1
soxr==0.3.3
stack-data==0.3.0
starlette==0.25.0
stopit==1.1.1
suds-jurko==0.6
supervisor==4.2.4
sympy==1.11.1
tabulate==0.8.10
tensorboard==2.9.1
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.1
terminado==0.15.0
threadpoolctl==3.1.0
tinycss2==1.1.1
toolz==0.12.0
torch==2.0.0+cu117
torchaudio==2.0.1+cu117
torchgen==0.0.1
torchvision==0.15.1+cu117
tornado==6.1
tqdm
traitlets==5.3.0
triton==2.0.0
typing-extensions==4.2.0
uc-micro-py==1.0.1
urllib3==1.26.13
uvicorn==0.21.1
wcwidth==0.2.5
webencodings==0.5.1
websocket-client==1.3.3
websockets==10.3
Werkzeug==2.1.2
widgetsnbextension==3.6.0
yarl==1.8.1
zipp==3.8.0

186
slicer2.py Normal file

@@ -0,0 +1,186 @@
import numpy as np
# This function is obtained from librosa.
def get_rms(
y,
*,
frame_length=2048,
hop_length=512,
pad_mode="constant",
):
padding = (int(frame_length // 2), int(frame_length // 2))
y = np.pad(y, padding, mode=pad_mode)
axis = -1
# put our new within-frame axis at the end for now
out_strides = y.strides + tuple([y.strides[axis]])
# Reduce the shape on the framing axis
x_shape_trimmed = list(y.shape)
x_shape_trimmed[axis] -= frame_length - 1
out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
xw = np.lib.stride_tricks.as_strided(
y, shape=out_shape, strides=out_strides
)
if axis < 0:
target_axis = axis - 1
else:
target_axis = axis + 1
xw = np.moveaxis(xw, -1, target_axis)
# Downsample along the target axis
slices = [slice(None)] * xw.ndim
slices[axis] = slice(0, None, hop_length)
x = xw[tuple(slices)]
# Calculate power
power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
return np.sqrt(power)
class Slicer:
def __init__(self,
sr: int,
threshold: float = -40.,
min_length: int = 5000,
min_interval: int = 300,
hop_size: int = 20,
max_sil_kept: int = 5000):
if not min_length >= min_interval >= hop_size:
raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
if not max_sil_kept >= hop_size:
raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
min_interval = sr * min_interval / 1000
self.threshold = 10 ** (threshold / 20.)
self.hop_size = round(sr * hop_size / 1000)
self.win_size = min(round(min_interval), 4 * self.hop_size)
self.min_length = round(sr * min_length / 1000 / self.hop_size)
self.min_interval = round(min_interval / self.hop_size)
self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
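# the dB threshold is converted to a linear amplitude ratio, and the length/interval parameters from milliseconds to hop-sized frame counts; slicing below works on frame-level RMS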
def _apply_slice(self, waveform, begin, end):
if len(waveform.shape) > 1:
return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
else:
return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]
# @timeit
def slice(self, waveform):
if len(waveform.shape) > 1:
samples = waveform.mean(axis=0)
else:
samples = waveform
if samples.shape[0] <= self.min_length:
return [waveform]
rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
sil_tags = []
silence_start = None
clip_start = 0
for i, rms in enumerate(rms_list):
# Keep looping while frame is silent.
if rms < self.threshold:
# Record start of silent frames.
if silence_start is None:
silence_start = i
continue
# Keep looping while frame is not silent and silence start has not been recorded.
if silence_start is None:
continue
# Clear recorded silence start if interval is not enough or clip is too short
is_leading_silence = silence_start == 0 and i > self.max_sil_kept
need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
if not is_leading_silence and not need_slice_middle:
silence_start = None
continue
# Need slicing. Record the range of silent frames to be removed.
if i - silence_start <= self.max_sil_kept:
pos = rms_list[silence_start: i + 1].argmin() + silence_start
if silence_start == 0:
sil_tags.append((0, pos))
else:
sil_tags.append((pos, pos))
clip_start = pos
elif i - silence_start <= self.max_sil_kept * 2:
pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
pos += i - self.max_sil_kept
pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
if silence_start == 0:
sil_tags.append((0, pos_r))
clip_start = pos_r
else:
sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
clip_start = max(pos_r, pos)
else:
pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
if silence_start == 0:
sil_tags.append((0, pos_r))
else:
sil_tags.append((pos_l, pos_r))
clip_start = pos_r
silence_start = None
# Deal with trailing silence.
total_frames = rms_list.shape[0]
if silence_start is not None and total_frames - silence_start >= self.min_interval:
silence_end = min(total_frames, silence_start + self.max_sil_kept)
pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
sil_tags.append((pos, total_frames + 1))
# Apply and return slices.
if len(sil_tags) == 0:
return [waveform]
else:
chunks = []
if sil_tags[0][0] > 0:
chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
for i in range(len(sil_tags) - 1):
chunks.append(self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]))
if sil_tags[-1][1] < total_frames:
chunks.append(self._apply_slice(waveform, sil_tags[-1][1], total_frames))
return chunks
def main():
import os.path
from argparse import ArgumentParser
import librosa
import soundfile
parser = ArgumentParser()
parser.add_argument('audio', type=str, help='The audio to be sliced')
parser.add_argument('--out', type=str, help='Output directory of the sliced audio clips')
parser.add_argument('--db_thresh', type=float, required=False, default=-40,
help='The dB threshold for silence detection')
parser.add_argument('--min_length', type=int, required=False, default=5000,
help='The minimum milliseconds required for each sliced audio clip')
parser.add_argument('--min_interval', type=int, required=False, default=300,
help='The minimum milliseconds for a silence part to be sliced')
parser.add_argument('--hop_size', type=int, required=False, default=10,
help='Frame length in milliseconds')
parser.add_argument('--max_sil_kept', type=int, required=False, default=500,
help='The maximum silence length kept around the sliced clip, presented in milliseconds')
args = parser.parse_args()
out = args.out
if out is None:
out = os.path.dirname(os.path.abspath(args.audio))
audio, sr = librosa.load(args.audio, sr=None, mono=False)
slicer = Slicer(
sr=sr,
threshold=args.db_thresh,
min_length=args.min_length,
min_interval=args.min_interval,
hop_size=args.hop_size,
max_sil_kept=args.max_sil_kept
)
chunks = slicer.slice(audio)
if not os.path.exists(out):
os.makedirs(out)
for i, chunk in enumerate(chunks):
if len(chunk.shape) > 1:
chunk = chunk.T
soundfile.write(os.path.join(out, f'%s_%d.wav' % (os.path.basename(args.audio).rsplit('.', maxsplit=1)[0], i)), chunk, sr)
if __name__ == '__main__':
main()

509
train_nsf_sim_cache_sid_load_pretrain.py Normal file

@@ -0,0 +1,509 @@
import sys,os
now_dir=os.getcwd()
sys.path.append(os.path.join(now_dir,"train"))
import utils
hps = utils.get_hparams()
os.environ["CUDA_VISIBLE_DEVICES"]=hps.gpus.replace("-",",")
n_gpus=len(hps.gpus.split("-"))
from random import shuffle
import traceback,json,argparse,itertools,math,torch,pdb
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.cuda.amp import autocast, GradScaler
from infer_pack import commons
from time import time as ttime
from data_utils import TextAudioLoaderMultiNSFsid,TextAudioLoader, TextAudioCollateMultiNSFsid,TextAudioCollate, DistributedBucketSampler
from infer_pack.models import (
SynthesizerTrnMs256NSFsid,SynthesizerTrnMs256NSFsid_nono,
MultiPeriodDiscriminator,
)
from losses import generator_loss, discriminator_loss, feature_loss, kl_loss
from mel_processing import mel_spectrogram_torch, spec_to_mel_torch
global_step = 0
def main():
"""Assume Single Node Multi GPUs Training Only"""
assert torch.cuda.is_available(), "CPU training is not allowed."
# n_gpus = torch.cuda.device_count()
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "5555"
mp.spawn(
run,
nprocs=n_gpus,
args=(
n_gpus,
hps,
),
)
def run(rank, n_gpus, hps):
global global_step
if rank == 0:
logger = utils.get_logger(hps.model_dir)
logger.info(hps)
utils.check_git_hash(hps.model_dir)
writer = SummaryWriter(log_dir=hps.model_dir)
writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
dist.init_process_group(
backend="gloo", init_method="env://", world_size=n_gpus, rank=rank
)
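# note: the gloo backend above also works on Windows; nccl is Linux-only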
torch.manual_seed(hps.train.seed)
torch.cuda.set_device(rank)
if (hps.if_f0 == 1):train_dataset = TextAudioLoaderMultiNSFsid(hps.data.training_files, hps.data)
else:train_dataset = TextAudioLoader(hps.data.training_files, hps.data)
train_sampler = DistributedBucketSampler(
train_dataset,
hps.train.batch_size,
# [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200,1400], # 16s
[100, 200, 300, 400, 500, 600, 700, 800, 900], # 16s
num_replicas=n_gpus,
rank=rank,
shuffle=True,
)
# It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit.
# num_workers=8 -> num_workers=4
if (hps.if_f0 == 1):collate_fn = TextAudioCollateMultiNSFsid()
else:collate_fn = TextAudioCollate()
train_loader = DataLoader(
train_dataset,
num_workers=4,
shuffle=False,
pin_memory=True,
collate_fn=collate_fn,
batch_sampler=train_sampler,
persistent_workers=True,
prefetch_factor=8,
)
if(hps.if_f0==1):net_g = SynthesizerTrnMs256NSFsid(hps.data.filter_length // 2 + 1,hps.train.segment_size // hps.data.hop_length,**hps.model,is_half=hps.train.fp16_run,sr=hps.sample_rate).cuda(rank)
else:net_g = SynthesizerTrnMs256NSFsid_nono(hps.data.filter_length // 2 + 1,hps.train.segment_size // hps.data.hop_length,**hps.model,is_half=hps.train.fp16_run).cuda(rank)
net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)
optim_g = torch.optim.AdamW(
net_g.parameters(),
hps.train.learning_rate,
betas=hps.train.betas,
eps=hps.train.eps,
)
optim_d = torch.optim.AdamW(
net_d.parameters(),
hps.train.learning_rate,
betas=hps.train.betas,
eps=hps.train.eps,
)
# net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
# net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
net_g = DDP(net_g, device_ids=[rank])
net_d = DDP(net_d, device_ids=[rank])
try:# resume automatically if checkpoints can be loaded
_, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d) # loading D usually works fine
if rank == 0:
logger.info("loaded D")
# _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0)
_, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g)
global_step = (epoch_str - 1) * len(train_loader)
# epoch_str = 1
# global_step = 0
except:# nothing to resume from on the first run, so load the pretrained models
traceback.print_exc()
epoch_str = 1
global_step = 0
if rank == 0:
logger.info("loaded pretrained %s %s"%(hps.pretrainG,hps.pretrainD))
print(net_g.module.load_state_dict(torch.load(hps.pretrainG,map_location="cpu")["model"]))## testing: do not load the optimizer state
print(net_d.module.load_state_dict(torch.load(hps.pretrainD,map_location="cpu")["model"]))
scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
)
scheduler_d = torch.optim.lr_scheduler.ExponentialLR(
optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
)
scaler = GradScaler(enabled=hps.train.fp16_run)
cache=[]
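# optional GPU-side cache of the full training set; filled during the first epoch when if_cache_data_in_gpu is enabled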
for epoch in range(epoch_str, hps.train.epochs + 1):
if rank == 0:
train_and_evaluate(
rank,
epoch,
hps,
[net_g, net_d],
[optim_g, optim_d],
[scheduler_g, scheduler_d],
scaler,
[train_loader, None],
logger,
[writer, writer_eval],cache
)
else:
train_and_evaluate(
rank,
epoch,
hps,
[net_g, net_d],
[optim_g, optim_d],
[scheduler_g, scheduler_d],
scaler,
[train_loader, None],
None,
None,cache
)
scheduler_g.step()
scheduler_d.step()
def train_and_evaluate(
rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers,cache
):
net_g, net_d = nets
optim_g, optim_d = optims
train_loader, eval_loader = loaders
if writers is not None:
writer, writer_eval = writers
train_loader.batch_sampler.set_epoch(epoch)
global global_step
net_g.train()
net_d.train()
if(cache==[]or hps.if_cache_data_in_gpu==False):# on the first epoch, fill the cache with the whole training set
# print("caching")
for batch_idx, info in enumerate(train_loader):
if (hps.if_f0 == 1):phone,phone_lengths,pitch,pitchf,spec,spec_lengths,wave,wave_lengths,sid=info
else:phone,phone_lengths,spec,spec_lengths,wave,wave_lengths,sid=info
phone, phone_lengths = phone.cuda(rank, non_blocking=True),phone_lengths.cuda(rank, non_blocking=True )
if (hps.if_f0 == 1):pitch,pitchf = pitch.cuda(rank, non_blocking=True),pitchf.cuda(rank, non_blocking=True)
sid = sid.cuda(rank, non_blocking=True)
spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True)
wave, wave_lengths = wave.cuda(rank, non_blocking=True), wave_lengths.cuda(rank, non_blocking=True)
if(hps.if_cache_data_in_gpu==True):
if (hps.if_f0 == 1):cache.append((batch_idx, (phone,phone_lengths,pitch,pitchf,spec,spec_lengths,wave,wave_lengths ,sid)))
else:cache.append((batch_idx, (phone,phone_lengths,spec,spec_lengths,wave,wave_lengths ,sid)))
            with autocast(enabled=hps.train.fp16_run):
                if hps.if_f0 == 1:
                    y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid)
                else:
                    y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, spec, spec_lengths, sid)
                mel = spec_to_mel_torch(spec, hps.data.filter_length, hps.data.n_mel_channels, hps.data.sampling_rate, hps.data.mel_fmin, hps.data.mel_fmax)
                y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length)
with autocast(enabled=False):
y_hat_mel = mel_spectrogram_torch(
y_hat.float().squeeze(1),
hps.data.filter_length,
hps.data.n_mel_channels,
hps.data.sampling_rate,
hps.data.hop_length,
hps.data.win_length,
hps.data.mel_fmin,
hps.data.mel_fmax,
)
                if hps.train.fp16_run == True:
                    y_hat_mel = y_hat_mel.half()
wave = commons.slice_segments(
wave, ids_slice * hps.data.hop_length, hps.train.segment_size
) # slice
# Discriminator
y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
with autocast(enabled=False):
loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(
y_d_hat_r, y_d_hat_g
)
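            # Discriminator update: gradients are unscaled before commons.clip_grad_value_ measures them,
            # and scaler.step() skips the optimizer step if any gradient overflowed under fp16.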
optim_d.zero_grad()
scaler.scale(loss_disc).backward()
scaler.unscale_(optim_d)
grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
scaler.step(optim_d)
with autocast(enabled=hps.train.fp16_run):
# Generator
y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
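                # Generator objective, computed in fp32 inside autocast(enabled=False):
                # adversarial loss + feature matching + L1 mel-spectrogram loss + KL term.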
with autocast(enabled=False):
loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
loss_fm = feature_loss(fmap_r, fmap_g)
loss_gen, losses_gen = generator_loss(y_d_hat_g)
loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
optim_g.zero_grad()
scaler.scale(loss_gen_all).backward()
scaler.unscale_(optim_g)
grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
scaler.step(optim_g)
scaler.update()
if rank == 0:
if global_step % hps.train.log_interval == 0:
lr = optim_g.param_groups[0]["lr"]
logger.info(
"Train Epoch: {} [{:.0f}%]".format(
epoch, 100.0 * batch_idx / len(train_loader)
)
)
                    # Clamp outlier losses so the log / TensorBoard plots stay readable
                    if loss_mel > 50:
                        loss_mel = 50
                    if loss_kl > 5:
                        loss_kl = 5
                    logger.info([global_step, lr])
                    logger.info(
                        f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f}, loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}"
                    )
scalar_dict = {
"loss/g/total": loss_gen_all,
"loss/d/total": loss_disc,
"learning_rate": lr,
"grad_norm_d": grad_norm_d,
"grad_norm_g": grad_norm_g,
}
scalar_dict.update(
{"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl}
)
scalar_dict.update(
{"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
)
scalar_dict.update(
{"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}
)
scalar_dict.update(
{"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}
)
image_dict = {
"slice/mel_org": utils.plot_spectrogram_to_numpy(
y_mel[0].data.cpu().numpy()
),
"slice/mel_gen": utils.plot_spectrogram_to_numpy(
y_hat_mel[0].data.cpu().numpy()
),
"all/mel": utils.plot_spectrogram_to_numpy(
mel[0].data.cpu().numpy()
),
}
utils.summarize(
writer=writer,
global_step=global_step,
images=image_dict,
scalars=scalar_dict,
)
global_step += 1
# if global_step % hps.train.eval_interval == 0:
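        # Periodic checkpointing: if_latest == 0 keeps a separate G/D checkpoint per save, named by global_step;
        # otherwise a single fixed pair (G_2333333.pth / D_2333333.pth) is overwritten each time.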
if epoch % hps.save_every_epoch == 0:
if(hps.if_latest==0):
utils.save_checkpoint(
net_g,
optim_g,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "G_{}.pth".format(global_step)),
)
utils.save_checkpoint(
net_d,
optim_d,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "D_{}.pth".format(global_step)),
)
else:
utils.save_checkpoint(
net_g,
optim_g,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "G_{}.pth".format(2333333)),
)
utils.save_checkpoint(
net_d,
optim_d,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "D_{}.pth".format(2333333)),
)
    else:  # later epochs iterate the shuffled GPU cache directly
        shuffle(cache)
        # print("using cache")
        for batch_idx, info in cache:
            if hps.if_f0 == 1:
                phone, phone_lengths, pitch, pitchf, spec, spec_lengths, wave, wave_lengths, sid = info
            else:
                phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info
            with autocast(enabled=hps.train.fp16_run):
                if hps.if_f0 == 1:
                    y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid)
                else:
                    y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, spec, spec_lengths, sid)
mel = spec_to_mel_torch(
spec,
hps.data.filter_length,
hps.data.n_mel_channels,
hps.data.sampling_rate,
hps.data.mel_fmin,
hps.data.mel_fmax,
)
y_mel = commons.slice_segments(
mel, ids_slice, hps.train.segment_size // hps.data.hop_length
)
with autocast(enabled=False):
y_hat_mel = mel_spectrogram_torch(
y_hat.float().squeeze(1),
hps.data.filter_length,
hps.data.n_mel_channels,
hps.data.sampling_rate,
hps.data.hop_length,
hps.data.win_length,
hps.data.mel_fmin,
hps.data.mel_fmax,
)
                if hps.train.fp16_run == True:
                    y_hat_mel = y_hat_mel.half()
wave = commons.slice_segments(
wave, ids_slice * hps.data.hop_length, hps.train.segment_size
) # slice
# Discriminator
y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
with autocast(enabled=False):
loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(
y_d_hat_r, y_d_hat_g
)
optim_d.zero_grad()
scaler.scale(loss_disc).backward()
scaler.unscale_(optim_d)
grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
scaler.step(optim_d)
with autocast(enabled=hps.train.fp16_run):
# Generator
y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
with autocast(enabled=False):
loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
loss_fm = feature_loss(fmap_r, fmap_g)
loss_gen, losses_gen = generator_loss(y_d_hat_g)
loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
optim_g.zero_grad()
scaler.scale(loss_gen_all).backward()
scaler.unscale_(optim_g)
grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
scaler.step(optim_g)
scaler.update()
if rank == 0:
if global_step % hps.train.log_interval == 0:
lr = optim_g.param_groups[0]["lr"]
logger.info(
"Train Epoch: {} [{:.0f}%]".format(
epoch, 100.0 * batch_idx / len(train_loader)
)
)
                    # Clamp outlier losses so the log / TensorBoard plots stay readable
                    if loss_mel > 50:
                        loss_mel = 50
                    if loss_kl > 5:
                        loss_kl = 5
                    logger.info([global_step, lr])
                    logger.info(
                        f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f}, loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}"
                    )
scalar_dict = {
"loss/g/total": loss_gen_all,
"loss/d/total": loss_disc,
"learning_rate": lr,
"grad_norm_d": grad_norm_d,
"grad_norm_g": grad_norm_g,
}
scalar_dict.update(
{"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl}
)
scalar_dict.update(
{"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
)
scalar_dict.update(
{"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}
)
scalar_dict.update(
{"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}
)
image_dict = {
"slice/mel_org": utils.plot_spectrogram_to_numpy(
y_mel[0].data.cpu().numpy()
),
"slice/mel_gen": utils.plot_spectrogram_to_numpy(
y_hat_mel[0].data.cpu().numpy()
),
"all/mel": utils.plot_spectrogram_to_numpy(
mel[0].data.cpu().numpy()
),
}
utils.summarize(
writer=writer,
global_step=global_step,
images=image_dict,
scalars=scalar_dict,
)
global_step += 1
# if global_step % hps.train.eval_interval == 0:
if epoch % hps.save_every_epoch == 0:
if(hps.if_latest==0):
utils.save_checkpoint(
net_g,
optim_g,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "G_{}.pth".format(global_step)),
)
utils.save_checkpoint(
net_d,
optim_d,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "D_{}.pth".format(global_step)),
)
else:
utils.save_checkpoint(
net_g,
optim_g,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "G_{}.pth".format(2333333)),
)
utils.save_checkpoint(
net_d,
optim_d,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "D_{}.pth".format(2333333)),
)
if rank == 0:
logger.info("====> Epoch: {}".format(epoch))
    if epoch >= hps.total_epoch:
        if rank == 0:
            logger.info("Training is done. The program is closed.")
            from process_ckpt import savee  # savee(ckpt, sr, if_f0, name, epoch)

            if hasattr(net_g, "module"):
                ckpt = net_g.module.state_dict()
            else:
                ckpt = net_g.state_dict()
            print(
                "saving final ckpt:",
                savee(ckpt, hps.sample_rate, hps.if_f0, hps.name, epoch),
            )
            os._exit(2333333)
if __name__ == "__main__":
main()