From c8261b2ccc8a391933a95ddd7796dba1ebf4be52 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Sat, 15 Apr 2023 20:44:24 +0900 Subject: [PATCH] Reformat and rewrite _get_name_params (#57) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Reformat * rewrite _get_name_params * Add workflow for automatic formatting * Revert "Add workflow for automatic formatting" This reverts commit 9111c5dbc1830248305fb075587a88be07ad3115. * revert Retrieval_based_Voice_Conversion_WebUI.ipynb --------- Co-authored-by: 源文雨 <41315874+fumiama@users.noreply.github.com> --- config.py | 73 +- export_onnx.py | 55 +- extract_f0_print.py | 116 +- extract_feature_print.py | 66 +- extract_locale.py | 10 +- gui.py | 572 ++++++--- infer-web.py | 1381 +++++++++++++++++----- infer/infer-pm-index256.py | 142 ++- infer/train-index.py | 30 +- infer/trans_weights.py | 13 +- infer_pack/commons.py | 4 +- infer_pack/models.py | 268 +++-- infer_pack/models_onnx.py | 233 ++-- infer_pack/transforms.py | 172 +-- infer_uvr5.py | 191 ++- locale/locale_diff.py | 4 +- my_utils.py | 8 +- slicer2.py | 159 ++- train/data_utils.py | 105 +- train/losses.py | 1 + train/mel_processing.py | 17 +- train/process_ckpt.py | 233 +++- train/utils.py | 642 +++++----- train_nsf_sim_cache_sid_load_pretrain.py | 567 +++++---- trainset_preprocess_pipeline_print.py | 117 +- uvr5_pack/lib_v5/dataset.py | 87 +- uvr5_pack/lib_v5/layers.py | 50 +- uvr5_pack/lib_v5/layers_123812KB .py | 50 +- uvr5_pack/lib_v5/layers_123821KB.py | 50 +- uvr5_pack/lib_v5/layers_33966KB.py | 56 +- uvr5_pack/lib_v5/layers_537227KB.py | 56 +- uvr5_pack/lib_v5/layers_537238KB.py | 56 +- uvr5_pack/lib_v5/model_param_init.py | 77 +- uvr5_pack/lib_v5/nets.py | 40 +- uvr5_pack/lib_v5/nets_123812KB.py | 38 +- uvr5_pack/lib_v5/nets_123821KB.py | 38 +- uvr5_pack/lib_v5/nets_33966KB.py | 36 +- uvr5_pack/lib_v5/nets_537227KB.py | 38 +- uvr5_pack/lib_v5/nets_537238KB.py | 38 +- uvr5_pack/lib_v5/nets_61968KB.py | 38 +- uvr5_pack/lib_v5/spec_utils.py | 562 ++++++--- uvr5_pack/name_params.json | 263 ++++ uvr5_pack/utils.py | 254 ++-- vc_infer_pipeline.py | 322 +++-- webui_locale.py | 6 +- 45 files changed, 4878 insertions(+), 2456 deletions(-) create mode 100644 uvr5_pack/name_params.json diff --git a/config.py b/config.py index 20c90a1..e5c0810 100644 --- a/config.py +++ b/config.py @@ -1,13 +1,13 @@ ########################硬件参数######################## -#填写cuda:x, cpu 或 mps, x指代第几张卡,只支持 N卡 / Apple Silicon 加速 -device = "cuda:0" +# 填写cuda:x, cpu 或 mps, x指代第几张卡,只支持 N卡 / Apple Silicon 加速 +device = "cuda:0" -#9-10-20-30-40系显卡无脑True,不影响质量,>=20显卡开启有加速 -is_half = True +# 9-10-20-30-40系显卡无脑True,不影响质量,>=20显卡开启有加速 +is_half = True -#默认0用上所有线程,写数字限制CPU资源使用 -n_cpu = 0 +# 默认0用上所有线程,写数字限制CPU资源使用 +n_cpu = 0 ########################硬件参数######################## @@ -16,31 +16,38 @@ n_cpu = 0 ########################命令行参数######################## import argparse + parser = argparse.ArgumentParser() parser.add_argument("--port", type=int, default=7865, help="Listen port") parser.add_argument("--pycmd", type=str, default="python", help="Python command") -parser.add_argument("--colab", action='store_true', help="Launch in colab") -parser.add_argument("--noparallel", action='store_true', help="Disable parallel processing") -parser.add_argument("--noautoopen", action='store_true', help="Do not open in browser automatically") +parser.add_argument("--colab", action="store_true", help="Launch in colab") +parser.add_argument( + "--noparallel", 
action="store_true", help="Disable parallel processing" +) +parser.add_argument( + "--noautoopen", action="store_true", help="Do not open in browser automatically" +) cmd_opts = parser.parse_args() -python_cmd=cmd_opts.pycmd -listen_port=cmd_opts.port -iscolab=cmd_opts.colab -noparallel=cmd_opts.noparallel -noautoopen=cmd_opts.noautoopen +python_cmd = cmd_opts.pycmd +listen_port = cmd_opts.port +iscolab = cmd_opts.colab +noparallel = cmd_opts.noparallel +noautoopen = cmd_opts.noautoopen ########################命令行参数######################## import sys import torch + # has_mps is only available in nightly pytorch (for now) and MasOS 12.3+. # check `getattr` and try it for compatibility def has_mps() -> bool: if sys.platform != "darwin": return False else: - if not getattr(torch, 'has_mps', False): return False + if not getattr(torch, "has_mps", False): + return False try: torch.zeros(1).to(torch.device("mps")) return True @@ -48,32 +55,34 @@ def has_mps() -> bool: return False -if(not torch.cuda.is_available()): +if not torch.cuda.is_available(): if has_mps(): print("没有发现支持的N卡, 使用MPS进行推理") - device = "mps" + device = "mps" else: print("没有发现支持的N卡, 使用CPU进行推理") - device = "cpu" + device = "cpu" is_half = False -if(device not in ["cpu", "mps"]): +if device not in ["cpu", "mps"]: gpu_name = torch.cuda.get_device_name(int(device.split(":")[-1])) - if("16" in gpu_name or "MX" in gpu_name): + if "16" in gpu_name or "MX" in gpu_name: print("16系显卡/MX系显卡强制单精度") is_half = False from multiprocessing import cpu_count -if(n_cpu==0): n_cpu=cpu_count() -if(is_half): - #6G显存配置 - x_pad = 3 - x_query = 10 - x_center = 60 - x_max = 65 + +if n_cpu == 0: + n_cpu = cpu_count() +if is_half: + # 6G显存配置 + x_pad = 3 + x_query = 10 + x_center = 60 + x_max = 65 else: - #5G显存配置 - x_pad = 1 - x_query = 6 - x_center = 38 - x_max = 41 + # 5G显存配置 + x_pad = 1 + x_query = 6 + x_center = 38 + x_max = 41 diff --git a/export_onnx.py b/export_onnx.py index 80f061b..d4a8c62 100644 --- a/export_onnx.py +++ b/export_onnx.py @@ -5,40 +5,43 @@ person = "Shiroha/shiroha.pth" exported_path = "model.onnx" - cpt = torch.load(person, map_location="cpu") -cpt["config"][-3]=cpt["weight"]["emb_g.weight"].shape[0]#n_spk +cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk print(*cpt["config"]) net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=False) net_g.load_state_dict(cpt["weight"], strict=False) test_phone = torch.rand(1, 200, 256) test_phone_lengths = torch.tensor([200]).long() -test_pitch = torch.randint(size=(1 ,200),low=5,high=255) +test_pitch = torch.randint(size=(1, 200), low=5, high=255) test_pitchf = torch.rand(1, 200) test_ds = torch.LongTensor([0]) test_rnd = torch.rand(1, 192, 200) input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"] -output_names = ["audio", ] -device="cpu" -torch.onnx.export(net_g, - ( - test_phone.to(device), - test_phone_lengths.to(device), - test_pitch.to(device), - test_pitchf.to(device), - test_ds.to(device), - test_rnd.to(device) - ), - exported_path, - dynamic_axes={ - "phone": [1], - "pitch": [1], - "pitchf": [1], - "rnd": [2], - }, - do_constant_folding=False, - opset_version=16, - verbose=False, - input_names=input_names, - output_names=output_names) \ No newline at end of file +output_names = [ + "audio", +] +device = "cpu" +torch.onnx.export( + net_g, + ( + test_phone.to(device), + test_phone_lengths.to(device), + test_pitch.to(device), + test_pitchf.to(device), + test_ds.to(device), + test_rnd.to(device), + ), + exported_path, + dynamic_axes={ + "phone": 
[1], + "pitch": [1], + "pitchf": [1], + "rnd": [2], + }, + do_constant_folding=False, + opset_version=16, + verbose=False, + input_names=input_names, + output_names=output_names, +) diff --git a/extract_f0_print.py b/extract_f0_print.py index 6c9549e..d330c90 100644 --- a/extract_f0_print.py +++ b/extract_f0_print.py @@ -1,21 +1,26 @@ -import os,traceback,sys,parselmouth +import os, traceback, sys, parselmouth import librosa import pyworld from scipy.io import wavfile -import numpy as np,logging -logging.getLogger('numba').setLevel(logging.WARNING) +import numpy as np, logging + +logging.getLogger("numba").setLevel(logging.WARNING) from multiprocessing import Process exp_dir = sys.argv[1] -f = open("%s/extract_f0_feature.log"%exp_dir, "a+") +f = open("%s/extract_f0_feature.log" % exp_dir, "a+") + + def printt(strr): print(strr) f.write("%s\n" % strr) f.flush() + n_p = int(sys.argv[2]) f0method = sys.argv[3] + class FeatureInput(object): def __init__(self, samplerate=16000, hop_size=160): self.fs = samplerate @@ -27,21 +32,30 @@ class FeatureInput(object): self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) - def compute_f0(self, path,f0_method): + def compute_f0(self, path, f0_method): x, sr = librosa.load(path, self.fs) - p_len=x.shape[0]//self.hop + p_len = x.shape[0] // self.hop assert sr == self.fs - if(f0_method=="pm"): + if f0_method == "pm": time_step = 160 / 16000 * 1000 f0_min = 50 f0_max = 1100 - f0 = parselmouth.Sound(x, sr).to_pitch_ac( - time_step=time_step / 1000, voicing_threshold=0.6, - pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency'] - pad_size=(p_len - len(f0) + 1) // 2 - if(pad_size>0 or p_len - len(f0) - pad_size>0): - f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant') - elif(f0_method=="harvest"): + f0 = ( + parselmouth.Sound(x, sr) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=f0_min, + pitch_ceiling=f0_max, + ) + .selected_array["frequency"] + ) + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad( + f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" + ) + elif f0_method == "harvest": f0, t = pyworld.harvest( x.astype(np.double), fs=sr, @@ -50,7 +64,7 @@ class FeatureInput(object): frame_period=1000 * self.hop / sr, ) f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs) - elif(f0_method=="dio"): + elif f0_method == "dio": f0, t = pyworld.dio( x.astype(np.double), fs=sr, @@ -77,45 +91,67 @@ class FeatureInput(object): ) return f0_coarse - def go(self,paths,f0_method): - if (len(paths) == 0): printt("no-f0-todo") + def go(self, paths, f0_method): + if len(paths) == 0: + printt("no-f0-todo") else: - printt("todo-f0-%s"%len(paths)) - n=max(len(paths)//5,1)#每个进程最多打印5条 - for idx,(inp_path,opt_path1,opt_path2) in enumerate(paths): + printt("todo-f0-%s" % len(paths)) + n = max(len(paths) // 5, 1) # 每个进程最多打印5条 + for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths): try: - if(idx%n==0):printt("f0ing,now-%s,all-%s,-%s"%(idx,len(paths),inp_path)) - if(os.path.exists(opt_path1+".npy")==True and os.path.exists(opt_path2+".npy")==True):continue - featur_pit = self.compute_f0(inp_path,f0_method) - np.save(opt_path2,featur_pit,allow_pickle=False,)#nsf + if idx % n == 0: + printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path)) + if ( + os.path.exists(opt_path1 + ".npy") == True + and os.path.exists(opt_path2 + ".npy") == True + ): + continue + featur_pit = 
self.compute_f0(inp_path, f0_method) + np.save( + opt_path2, + featur_pit, + allow_pickle=False, + ) # nsf coarse_pit = self.coarse_f0(featur_pit) - np.save(opt_path1,coarse_pit,allow_pickle=False,)#ori + np.save( + opt_path1, + coarse_pit, + allow_pickle=False, + ) # ori except: - printt("f0fail-%s-%s-%s" % (idx, inp_path,traceback.format_exc())) + printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())) -if __name__=='__main__': + +if __name__ == "__main__": # exp_dir=r"E:\codes\py39\dataset\mi-test" # n_p=16 # f = open("%s/log_extract_f0.log"%exp_dir, "w") printt(sys.argv) featureInput = FeatureInput() - paths=[] - inp_root= "%s/1_16k_wavs"%(exp_dir) - opt_root1="%s/2a_f0"%(exp_dir) - opt_root2="%s/2b-f0nsf"%(exp_dir) + paths = [] + inp_root = "%s/1_16k_wavs" % (exp_dir) + opt_root1 = "%s/2a_f0" % (exp_dir) + opt_root2 = "%s/2b-f0nsf" % (exp_dir) - os.makedirs(opt_root1,exist_ok=True) - os.makedirs(opt_root2,exist_ok=True) + os.makedirs(opt_root1, exist_ok=True) + os.makedirs(opt_root2, exist_ok=True) for name in sorted(list(os.listdir(inp_root))): - inp_path="%s/%s"%(inp_root,name) - if ("spec" in inp_path): continue - opt_path1="%s/%s"%(opt_root1,name) - opt_path2="%s/%s"%(opt_root2,name) - paths.append([inp_path,opt_path1,opt_path2]) + inp_path = "%s/%s" % (inp_root, name) + if "spec" in inp_path: + continue + opt_path1 = "%s/%s" % (opt_root1, name) + opt_path2 = "%s/%s" % (opt_root2, name) + paths.append([inp_path, opt_path1, opt_path2]) - ps=[] + ps = [] for i in range(n_p): - p=Process(target=featureInput.go,args=(paths[i::n_p],f0method,)) + p = Process( + target=featureInput.go, + args=( + paths[i::n_p], + f0method, + ), + ) p.start() ps.append(p) for p in ps: diff --git a/extract_feature_print.py b/extract_feature_print.py index 3cf7092..7cc0601 100644 --- a/extract_feature_print.py +++ b/extract_feature_print.py @@ -1,33 +1,41 @@ -import os,sys,traceback +import os, sys, traceback + # device=sys.argv[1] -n_part=int(sys.argv[2]) -i_part=int(sys.argv[3]) +n_part = int(sys.argv[2]) +i_part = int(sys.argv[3]) if len(sys.argv) == 5: - exp_dir=sys.argv[4] + exp_dir = sys.argv[4] else: - i_gpu=sys.argv[4] - exp_dir=sys.argv[5] - os.environ["CUDA_VISIBLE_DEVICES"]=str(i_gpu) + i_gpu = sys.argv[4] + exp_dir = sys.argv[5] + os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) import torch import torch.nn.functional as F import soundfile as sf import numpy as np from fairseq import checkpoint_utils + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -f = open("%s/extract_f0_feature.log"%exp_dir, "a+") +f = open("%s/extract_f0_feature.log" % exp_dir, "a+") + + def printt(strr): print(strr) f.write("%s\n" % strr) f.flush() + + printt(sys.argv) model_path = "hubert_base.pt" printt(exp_dir) -wavPath = "%s/1_16k_wavs"%exp_dir -outPath = "%s/3_feature256"%exp_dir -os.makedirs(outPath,exist_ok=True) +wavPath = "%s/1_16k_wavs" % exp_dir +outPath = "%s/3_feature256" % exp_dir +os.makedirs(outPath, exist_ok=True) + + # wave must be 16k, hop_size=320 def readwave(wav_path, normalize=False): wav, sr = sf.read(wav_path) @@ -41,6 +49,8 @@ def readwave(wav_path, normalize=False): feats = F.layer_norm(feats, feats.shape) feats = feats.view(1, -1) return feats + + # HuBERT model printt("load model(s) from {}".format(model_path)) models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( @@ -49,27 +59,32 @@ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( ) model = models[0] model = model.to(device) -printt("move model to %s"%device) -if 
device != "cpu": model = model.half() +printt("move model to %s" % device) +if device != "cpu": + model = model.half() model.eval() -todo=sorted(list(os.listdir(wavPath)))[i_part::n_part] -n = max(1,len(todo) // 10) # 最多打印十条 -if(len(todo)==0):printt("no-feature-todo") +todo = sorted(list(os.listdir(wavPath)))[i_part::n_part] +n = max(1, len(todo) // 10) # 最多打印十条 +if len(todo) == 0: + printt("no-feature-todo") else: - printt("all-feature-%s"%len(todo)) - for idx,file in enumerate(todo): + printt("all-feature-%s" % len(todo)) + for idx, file in enumerate(todo): try: if file.endswith(".wav"): - wav_path = "%s/%s"%(wavPath,file) - out_path = "%s/%s"%(outPath,file.replace("wav","npy")) + wav_path = "%s/%s" % (wavPath, file) + out_path = "%s/%s" % (outPath, file.replace("wav", "npy")) - if(os.path.exists(out_path)):continue + if os.path.exists(out_path): + continue feats = readwave(wav_path, normalize=saved_cfg.task.normalize) padding_mask = torch.BoolTensor(feats.shape).fill_(False) inputs = { - "source": feats.half().to(device) if device != "cpu" else feats.to(device), + "source": feats.half().to(device) + if device != "cpu" + else feats.to(device), "padding_mask": padding_mask.to(device), "output_layer": 9, # layer 9 } @@ -78,11 +93,12 @@ else: feats = model.final_proj(logits[0]) feats = feats.squeeze(0).float().cpu().numpy() - if(np.isnan(feats).sum()==0): + if np.isnan(feats).sum() == 0: np.save(out_path, feats, allow_pickle=False) else: - printt("%s-contains nan"%file) - if (idx % n == 0):printt("now-%s,all-%s,%s,%s"%(len(todo),idx,file,feats.shape)) + printt("%s-contains nan" % file) + if idx % n == 0: + printt("now-%s,all-%s,%s,%s" % (len(todo), idx, file, feats.shape)) except: printt(traceback.format_exc()) printt("all-feature-done") diff --git a/extract_locale.py b/extract_locale.py index 1bb9e32..a6f1c69 100644 --- a/extract_locale.py +++ b/extract_locale.py @@ -7,9 +7,10 @@ pattern = r"""i18n\((["'][^"']+["'])\)""" # Initialize the dictionary to store key-value pairs data = {} + def process(fn: str): global data - with open(fn, 'r', encoding='utf-8') as f: + with open(fn, "r", encoding="utf-8") as f: contents = f.read() matches = re.findall(pattern, contents) for key in matches: @@ -17,12 +18,13 @@ def process(fn: str): print("extract:", key) data[key] = key + print("processing infer-web.py") -process('infer-web.py') +process("infer-web.py") print("processing gui.py") -process('gui.py') +process("gui.py") # Save as a JSON file -with open('./locale/zh_CN.json', 'w', encoding='utf-8') as f: +with open("./locale/zh_CN.json", "w", encoding="utf-8") as f: json.dump(data, f, ensure_ascii=False, indent=4) diff --git a/gui.py b/gui.py index e1c5952..2647121 100644 --- a/gui.py +++ b/gui.py @@ -3,32 +3,36 @@ import sounddevice as sd import noisereduce as nr import numpy as np from fairseq import checkpoint_utils -import librosa,torch,parselmouth,faiss,time,threading +import librosa, torch, parselmouth, faiss, time, threading import torch.nn.functional as F import torchaudio.transforms as tat -#import matplotlib.pyplot as plt +# import matplotlib.pyplot as plt from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono from webui_locale import I18nAuto + i18n = I18nAuto() device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + class RVC: - def __init__(self,key,hubert_path,pth_path,index_path,npy_path,index_rate) -> None: - ''' + def __init__( + self, key, hubert_path, pth_path, index_path, npy_path, index_rate + ) -> None: + """ 初始化 - ''' - 
self.f0_up_key=key + """ + self.f0_up_key = key self.time_step = 160 / 16000 * 1000 self.f0_min = 50 self.f0_max = 1100 self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) - self.index=faiss.read_index(index_path) - self.index_rate=index_rate - '''NOT YET USED''' - self.big_npy=np.load(npy_path) + self.index = faiss.read_index(index_path) + self.index_rate = index_rate + """NOT YET USED""" + self.big_npy = np.load(npy_path) model_path = hubert_path print("load model(s) from {}".format(model_path)) models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( @@ -41,9 +45,9 @@ class RVC: self.model.eval() cpt = torch.load(pth_path, map_location="cpu") tgt_sr = cpt["config"][-1] - cpt["config"][-3]=cpt["weight"]["emb_g.weight"].shape[0]#n_spk - if_f0=cpt.get("f0",1) - if(if_f0==1): + cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk + if_f0 = cpt.get("f0", 1) + if if_f0 == 1: self.net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=True) else: self.net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) @@ -52,36 +56,43 @@ class RVC: self.net_g.eval().to(device) self.net_g.half() - - def get_f0_coarse(self,f0): + def get_f0_coarse(self, f0): f0_mel = 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (self.f0_mel_max - self.f0_mel_min) + 1 + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / ( + self.f0_mel_max - self.f0_mel_min + ) + 1 f0_mel[f0_mel <= 1] = 1 f0_mel[f0_mel > 255] = 255 # f0_mel[f0_mel > 188] = 188 f0_coarse = np.rint(f0_mel).astype(np.int) return f0_coarse - - def get_f0(self,x, p_len,f0_up_key=0): - f0 = parselmouth.Sound(x, 16000).to_pitch_ac( - time_step=self.time_step / 1000, voicing_threshold=0.6, - pitch_floor=self.f0_min, pitch_ceiling=self.f0_max).selected_array['frequency'] - pad_size=(p_len - len(f0) + 1) // 2 - if(pad_size>0 or p_len - len(f0) - pad_size>0): - f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant') + def get_f0(self, x, p_len, f0_up_key=0): + f0 = ( + parselmouth.Sound(x, 16000) + .to_pitch_ac( + time_step=self.time_step / 1000, + voicing_threshold=0.6, + pitch_floor=self.f0_min, + pitch_ceiling=self.f0_max, + ) + .selected_array["frequency"] + ) + + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") f0 *= pow(2, f0_up_key / 12) # f0=suofang(f0) f0bak = f0.copy() - f0_coarse=self.get_f0_coarse(f0) + f0_coarse = self.get_f0_coarse(f0) return f0_coarse, f0bak - - def infer(self,feats:torch.Tensor) -> np.ndarray: - ''' + def infer(self, feats: torch.Tensor) -> np.ndarray: + """ 推理函数 - ''' - audio=feats.clone().cpu().numpy() + """ + audio = feats.clone().cpu().numpy() assert feats.dim() == 1, feats.dim() feats = feats.view(1, -1) padding_mask = torch.BoolTensor(feats.shape).fill_(False) @@ -96,209 +107,389 @@ class RVC: feats = self.model.final_proj(logits[0]) ####索引优化 - if(isinstance(self.index,type(None))==False and isinstance(self.big_npy,type(None))==False and self.index_rate!=0): + if ( + isinstance(self.index, type(None)) == False + and isinstance(self.big_npy, type(None)) == False + and self.index_rate != 0 + ): npy = feats[0].cpu().numpy().astype("float32") _, I = self.index.search(npy, 1) - npy=self.big_npy[I.squeeze()].astype("float16") - feats = torch.from_numpy(npy).unsqueeze(0).to(device)*self.index_rate + (1-self.index_rate)*feats + npy = 
self.big_npy[I.squeeze()].astype("float16") + feats = ( + torch.from_numpy(npy).unsqueeze(0).to(device) * self.index_rate + + (1 - self.index_rate) * feats + ) - feats=F.interpolate(feats.permute(0,2,1),scale_factor=2).permute(0,2,1) + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) torch.cuda.synchronize() # p_len = min(feats.shape[1],10000,pitch.shape[0])#太大了爆显存 - p_len = min(feats.shape[1],12000)# + p_len = min(feats.shape[1], 12000) # print(feats.shape) - pitch, pitchf = self.get_f0(audio, p_len,self.f0_up_key) - p_len = min(feats.shape[1],12000,pitch.shape[0])#太大了爆显存 + pitch, pitchf = self.get_f0(audio, p_len, self.f0_up_key) + p_len = min(feats.shape[1], 12000, pitch.shape[0]) # 太大了爆显存 torch.cuda.synchronize() # print(feats.shape,pitch.shape) - feats = feats[:,:p_len, :] + feats = feats[:, :p_len, :] pitch = pitch[:p_len] pitchf = pitchf[:p_len] p_len = torch.LongTensor([p_len]).to(device) pitch = torch.LongTensor(pitch).unsqueeze(0).to(device) pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device) - ii=0#sid - sid=torch.LongTensor([ii]).to(device) + ii = 0 # sid + sid = torch.LongTensor([ii]).to(device) with torch.no_grad(): - infered_audio = self.net_g.infer(feats, p_len,pitch,pitchf,sid)[0][0, 0].data.cpu().float()#nsf + infered_audio = ( + self.net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] + .data.cpu() + .float() + ) # nsf torch.cuda.synchronize() return infered_audio class Config: def __init__(self) -> None: - self.hubert_path:str='' - self.pth_path:str='' - self.index_path:str='' - self.npy_path:str='' - self.pitch:int=12 - self.samplerate:int=44100 - self.block_time:float=1.0#s - self.buffer_num:int=1 - self.threhold:int=-30 - self.crossfade_time:float=0.08 - self.extra_time:float=0.04 - self.I_noise_reduce=False - self.O_noise_reduce=False - self.index_rate=0.3 + self.hubert_path: str = "" + self.pth_path: str = "" + self.index_path: str = "" + self.npy_path: str = "" + self.pitch: int = 12 + self.samplerate: int = 44100 + self.block_time: float = 1.0 # s + self.buffer_num: int = 1 + self.threhold: int = -30 + self.crossfade_time: float = 0.08 + self.extra_time: float = 0.04 + self.I_noise_reduce = False + self.O_noise_reduce = False + self.index_rate = 0.3 + class GUI: def __init__(self) -> None: - self.config=Config() - self.flag_vc=False - + self.config = Config() + self.flag_vc = False + self.launcher() - + def launcher(self): - sg.theme('LightBlue3') - input_devices,output_devices,_, _=self.get_devices() - layout=[ + sg.theme("LightBlue3") + input_devices, output_devices, _, _ = self.get_devices() + layout = [ [ - sg.Frame(title=i18n('加载模型'),layout=[ - [sg.Input(default_text='TEMP\\hubert_base.pt',key='hubert_path'),sg.FileBrowse(i18n('Hubert模型'))], - [sg.Input(default_text='TEMP\\atri.pth',key='pth_path'),sg.FileBrowse(i18n('选择.pth文件'))], - [sg.Input(default_text='TEMP\\added_IVF512_Flat_atri_baseline_src_feat.index',key='index_path'),sg.FileBrowse(i18n('选择.index文件'))], - [sg.Input(default_text='TEMP\\big_src_feature_atri.npy',key='npy_path'),sg.FileBrowse(i18n('选择.npy文件'))] - ]) + sg.Frame( + title=i18n("加载模型"), + layout=[ + [ + sg.Input( + default_text="TEMP\\hubert_base.pt", key="hubert_path" + ), + sg.FileBrowse(i18n("Hubert模型")), + ], + [ + sg.Input(default_text="TEMP\\atri.pth", key="pth_path"), + sg.FileBrowse(i18n("选择.pth文件")), + ], + [ + sg.Input( + default_text="TEMP\\added_IVF512_Flat_atri_baseline_src_feat.index", + key="index_path", + ), + sg.FileBrowse(i18n("选择.index文件")), + ], + [ + sg.Input( + 
default_text="TEMP\\big_src_feature_atri.npy", + key="npy_path", + ), + sg.FileBrowse(i18n("选择.npy文件")), + ], + ], + ) ], [ - sg.Frame(layout=[ - [sg.Text(i18n("输入设备")),sg.Combo(input_devices,key='sg_input_device',default_value=input_devices[sd.default.device[0]])], - [sg.Text(i18n("输出设备")),sg.Combo(output_devices,key='sg_output_device',default_value=output_devices[sd.default.device[1]])] - ],title=i18n("音频设备(请使用同种类驱动)")) + sg.Frame( + layout=[ + [ + sg.Text(i18n("输入设备")), + sg.Combo( + input_devices, + key="sg_input_device", + default_value=input_devices[sd.default.device[0]], + ), + ], + [ + sg.Text(i18n("输出设备")), + sg.Combo( + output_devices, + key="sg_output_device", + default_value=output_devices[sd.default.device[1]], + ), + ], + ], + title=i18n("音频设备(请使用同种类驱动)"), + ) ], [ - sg.Frame(layout=[ - [sg.Text(i18n("响应阈值")),sg.Slider(range=(-60,0),key='threhold',resolution=1,orientation='h',default_value=-30)], - [sg.Text(i18n("音调设置")),sg.Slider(range=(-24,24),key='pitch',resolution=1,orientation='h',default_value=12)], - [sg.Text(i18n('Index Rate')),sg.Slider(range=(0.0,1.0),key='index_rate',resolution=0.01,orientation='h',default_value=0.5)] - ],title=i18n("常规设置")), - sg.Frame(layout=[ - [sg.Text(i18n("采样长度")),sg.Slider(range=(0.1,3.0),key='block_time',resolution=0.1,orientation='h',default_value=1.0)], - [sg.Text(i18n("淡入淡出长度")),sg.Slider(range=(0.01,0.15),key='crossfade_length',resolution=0.01,orientation='h',default_value=0.08)], - [sg.Text(i18n("额外推理时长")),sg.Slider(range=(0.05,3.00),key='extra_time',resolution=0.01,orientation='h',default_value=0.05)], - [sg.Checkbox(i18n('输入降噪'),key='I_noise_reduce'),sg.Checkbox(i18n('输出降噪'),key='O_noise_reduce')] - ],title=i18n("性能设置")) + sg.Frame( + layout=[ + [ + sg.Text(i18n("响应阈值")), + sg.Slider( + range=(-60, 0), + key="threhold", + resolution=1, + orientation="h", + default_value=-30, + ), + ], + [ + sg.Text(i18n("音调设置")), + sg.Slider( + range=(-24, 24), + key="pitch", + resolution=1, + orientation="h", + default_value=12, + ), + ], + [ + sg.Text(i18n("Index Rate")), + sg.Slider( + range=(0.0, 1.0), + key="index_rate", + resolution=0.01, + orientation="h", + default_value=0.5, + ), + ], + ], + title=i18n("常规设置"), + ), + sg.Frame( + layout=[ + [ + sg.Text(i18n("采样长度")), + sg.Slider( + range=(0.1, 3.0), + key="block_time", + resolution=0.1, + orientation="h", + default_value=1.0, + ), + ], + [ + sg.Text(i18n("淡入淡出长度")), + sg.Slider( + range=(0.01, 0.15), + key="crossfade_length", + resolution=0.01, + orientation="h", + default_value=0.08, + ), + ], + [ + sg.Text(i18n("额外推理时长")), + sg.Slider( + range=(0.05, 3.00), + key="extra_time", + resolution=0.01, + orientation="h", + default_value=0.05, + ), + ], + [ + sg.Checkbox(i18n("输入降噪"), key="I_noise_reduce"), + sg.Checkbox(i18n("输出降噪"), key="O_noise_reduce"), + ], + ], + title=i18n("性能设置"), + ), + ], + [ + sg.Button(i18n("开始音频转换"), key="start_vc"), + sg.Button(i18n("停止音频转换"), key="stop_vc"), + sg.Text(i18n("推理时间(ms):")), + sg.Text("0", key="infer_time"), ], - [sg.Button(i18n("开始音频转换"),key='start_vc'),sg.Button(i18n("停止音频转换"),key='stop_vc'),sg.Text(i18n("推理时间(ms):")),sg.Text("0",key='infer_time')] ] - - self.window=sg.Window("RVC - GUI",layout=layout) + + self.window = sg.Window("RVC - GUI", layout=layout) self.event_handler() - + def event_handler(self): while True: event, values = self.window.read() - if event ==sg.WINDOW_CLOSED: - self.flag_vc=False + if event == sg.WINDOW_CLOSED: + self.flag_vc = False exit() - if event == 'start_vc' and self.flag_vc==False: + if event == "start_vc" and 
self.flag_vc == False: self.set_values(values) print(str(self.config.__dict__)) - print('using_cuda:'+str(torch.cuda.is_available())) + print("using_cuda:" + str(torch.cuda.is_available())) self.start_vc() - if event=='stop_vc'and self.flag_vc==True: + if event == "stop_vc" and self.flag_vc == True: self.flag_vc = False + def set_values(self, values): + self.set_devices(values["sg_input_device"], values["sg_output_device"]) + self.config.hubert_path = values["hubert_path"] + self.config.pth_path = values["pth_path"] + self.config.index_path = values["index_path"] + self.config.npy_path = values["npy_path"] + self.config.threhold = values["threhold"] + self.config.pitch = values["pitch"] + self.config.block_time = values["block_time"] + self.config.crossfade_time = values["crossfade_length"] + self.config.extra_time = values["extra_time"] + self.config.I_noise_reduce = values["I_noise_reduce"] + self.config.O_noise_reduce = values["O_noise_reduce"] + self.config.index_rate = values["index_rate"] - def set_values(self,values): - self.set_devices(values["sg_input_device"],values['sg_output_device']) - self.config.hubert_path=values['hubert_path'] - self.config.pth_path=values['pth_path'] - self.config.index_path=values['index_path'] - self.config.npy_path=values['npy_path'] - self.config.threhold=values['threhold'] - self.config.pitch=values['pitch'] - self.config.block_time=values['block_time'] - self.config.crossfade_time=values['crossfade_length'] - self.config.extra_time=values['extra_time'] - self.config.I_noise_reduce=values['I_noise_reduce'] - self.config.O_noise_reduce=values['O_noise_reduce'] - self.config.index_rate=values['index_rate'] - def start_vc(self): torch.cuda.empty_cache() - self.flag_vc=True - self.block_frame=int(self.config.block_time*self.config.samplerate) - self.crossfade_frame=int(self.config.crossfade_time*self.config.samplerate) - self.sola_search_frame=int(0.012*self.config.samplerate) - self.delay_frame=int(0.02*self.config.samplerate)#往前预留0.02s - self.extra_frame=int(self.config.extra_time*self.config.samplerate)#往后预留0.04s - self.rvc=None - self.rvc=RVC(self.config.pitch,self.config.hubert_path,self.config.pth_path,self.config.index_path,self.config.npy_path,self.config.index_rate) - self.input_wav:np.ndarray=np.zeros(self.extra_frame+self.crossfade_frame+self.sola_search_frame+self.block_frame,dtype='float32') - self.output_wav:torch.Tensor=torch.zeros(self.block_frame,device=device,dtype=torch.float32) - self.sola_buffer:torch.Tensor=torch.zeros(self.crossfade_frame,device=device,dtype=torch.float32) - self.fade_in_window:torch.Tensor=torch.linspace(0.0,1.0,steps=self.crossfade_frame,device=device,dtype=torch.float32) - self.fade_out_window:torch.Tensor = 1 - self.fade_in_window - self.resampler1=tat.Resample(orig_freq=self.config.samplerate,new_freq=16000,dtype=torch.float32) - self.resampler2=tat.Resample(orig_freq=40000,new_freq=self.config.samplerate,dtype=torch.float32) - thread_vc=threading.Thread(target=self.soundinput) + self.flag_vc = True + self.block_frame = int(self.config.block_time * self.config.samplerate) + self.crossfade_frame = int(self.config.crossfade_time * self.config.samplerate) + self.sola_search_frame = int(0.012 * self.config.samplerate) + self.delay_frame = int(0.02 * self.config.samplerate) # 往前预留0.02s + self.extra_frame = int( + self.config.extra_time * self.config.samplerate + ) # 往后预留0.04s + self.rvc = None + self.rvc = RVC( + self.config.pitch, + self.config.hubert_path, + self.config.pth_path, + self.config.index_path, + 
self.config.npy_path, + self.config.index_rate, + ) + self.input_wav: np.ndarray = np.zeros( + self.extra_frame + + self.crossfade_frame + + self.sola_search_frame + + self.block_frame, + dtype="float32", + ) + self.output_wav: torch.Tensor = torch.zeros( + self.block_frame, device=device, dtype=torch.float32 + ) + self.sola_buffer: torch.Tensor = torch.zeros( + self.crossfade_frame, device=device, dtype=torch.float32 + ) + self.fade_in_window: torch.Tensor = torch.linspace( + 0.0, 1.0, steps=self.crossfade_frame, device=device, dtype=torch.float32 + ) + self.fade_out_window: torch.Tensor = 1 - self.fade_in_window + self.resampler1 = tat.Resample( + orig_freq=self.config.samplerate, new_freq=16000, dtype=torch.float32 + ) + self.resampler2 = tat.Resample( + orig_freq=40000, new_freq=self.config.samplerate, dtype=torch.float32 + ) + thread_vc = threading.Thread(target=self.soundinput) thread_vc.start() - def soundinput(self): - ''' + """ 接受音频输入 - ''' - with sd.Stream(callback=self.audio_callback, blocksize=self.block_frame,samplerate=self.config.samplerate,dtype='float32'): + """ + with sd.Stream( + callback=self.audio_callback, + blocksize=self.block_frame, + samplerate=self.config.samplerate, + dtype="float32", + ): while self.flag_vc: time.sleep(self.config.block_time) - print('Audio block passed.') - print('ENDing VC') + print("Audio block passed.") + print("ENDing VC") - - def audio_callback(self,indata:np.ndarray,outdata:np.ndarray, frames, times, status): - ''' + def audio_callback( + self, indata: np.ndarray, outdata: np.ndarray, frames, times, status + ): + """ 音频处理 - ''' - start_time=time.perf_counter() - indata=librosa.to_mono(indata.T) + """ + start_time = time.perf_counter() + indata = librosa.to_mono(indata.T) if self.config.I_noise_reduce: - indata[:]=nr.reduce_noise(y=indata,sr=self.config.samplerate) - - '''noise gate''' - frame_length=2048 - hop_length=1024 - rms=librosa.feature.rms(y=indata,frame_length=frame_length,hop_length=hop_length) - db_threhold=librosa.amplitude_to_db(rms,ref=1.0)[0] 0 ] - input_devices_indices = [d["index"] for d in devices if d["max_input_channels"] > 0] + input_devices_indices = [ + d["index"] for d in devices if d["max_input_channels"] > 0 + ] output_devices_indices = [ d["index"] for d in devices if d["max_output_channels"] > 0 ] - return input_devices, output_devices, input_devices_indices, output_devices_indices - - def set_devices(self,input_device,output_device): - '''设置输出设备''' - input_devices,output_devices,input_device_indices, output_device_indices=self.get_devices() - sd.default.device[0]=input_device_indices[input_devices.index(input_device)] - sd.default.device[1]=output_device_indices[output_devices.index(output_device)] - print("input device:"+str(sd.default.device[0])+":"+str(input_device)) - print("output device:"+str(sd.default.device[1])+":"+str(output_device)) - -gui=GUI() \ No newline at end of file + return ( + input_devices, + output_devices, + input_devices_indices, + output_devices_indices, + ) + + def set_devices(self, input_device, output_device): + """设置输出设备""" + ( + input_devices, + output_devices, + input_device_indices, + output_device_indices, + ) = self.get_devices() + sd.default.device[0] = input_device_indices[input_devices.index(input_device)] + sd.default.device[1] = output_device_indices[ + output_devices.index(output_device) + ] + print("input device:" + str(sd.default.device[0]) + ":" + str(input_device)) + print("output device:" + str(sd.default.device[1]) + ":" + str(output_device)) + + +gui = GUI() diff 
--git a/infer-web.py b/infer-web.py index 36fb09a..1c312d0 100644 --- a/infer-web.py +++ b/infer-web.py @@ -3,134 +3,243 @@ import threading from time import sleep from subprocess import Popen from time import sleep -import torch, os,traceback,sys,warnings,shutil,numpy as np +import torch, os, traceback, sys, warnings, shutil, numpy as np import faiss -now_dir=os.getcwd() + +now_dir = os.getcwd() sys.path.append(now_dir) -tmp=os.path.join(now_dir,"TEMP") -shutil.rmtree(tmp,ignore_errors=True) -os.makedirs(tmp,exist_ok=True) -os.makedirs(os.path.join(now_dir,"logs"),exist_ok=True) -os.makedirs(os.path.join(now_dir,"weights"),exist_ok=True) -os.environ["TEMP"]=tmp +tmp = os.path.join(now_dir, "TEMP") +shutil.rmtree(tmp, ignore_errors=True) +os.makedirs(tmp, exist_ok=True) +os.makedirs(os.path.join(now_dir, "logs"), exist_ok=True) +os.makedirs(os.path.join(now_dir, "weights"), exist_ok=True) +os.environ["TEMP"] = tmp warnings.filterwarnings("ignore") torch.manual_seed(114514) from webui_locale import I18nAuto + i18n = I18nAuto() -#判断是否有能用来训练和加速推理的N卡 -ncpu=cpu_count() -ngpu=torch.cuda.device_count() -gpu_infos=[] -if((not torch.cuda.is_available()) or ngpu==0):if_gpu_ok=False +# 判断是否有能用来训练和加速推理的N卡 +ncpu = cpu_count() +ngpu = torch.cuda.device_count() +gpu_infos = [] +if (not torch.cuda.is_available()) or ngpu == 0: + if_gpu_ok = False else: if_gpu_ok = False for i in range(ngpu): - gpu_name=torch.cuda.get_device_name(i) - if(("16"in gpu_name and "V100"not in gpu_name) or "MX"in gpu_name):continue - if("10"in gpu_name or "20"in gpu_name or "30"in gpu_name or "40"in gpu_name or "A50"in gpu_name.upper() or "70"in gpu_name or "80"in gpu_name or "90"in gpu_name or "M4"in gpu_name or "T4"in gpu_name or "TITAN"in gpu_name.upper()):#A10#A100#V100#A40#P40#M40#K80 - if_gpu_ok=True#至少有一张能用的N卡 - gpu_infos.append("%s\t%s"%(i,gpu_name)) -gpu_info="\n".join(gpu_infos)if if_gpu_ok==True and len(gpu_infos)>0 else "很遗憾您这没有能用的显卡来支持您训练" -gpus="-".join([i[0]for i in gpu_infos]) + gpu_name = torch.cuda.get_device_name(i) + if ("16" in gpu_name and "V100" not in gpu_name) or "MX" in gpu_name: + continue + if ( + "10" in gpu_name + or "20" in gpu_name + or "30" in gpu_name + or "40" in gpu_name + or "A50" in gpu_name.upper() + or "70" in gpu_name + or "80" in gpu_name + or "90" in gpu_name + or "M4" in gpu_name + or "T4" in gpu_name + or "TITAN" in gpu_name.upper() + ): # A10#A100#V100#A40#P40#M40#K80 + if_gpu_ok = True # 至少有一张能用的N卡 + gpu_infos.append("%s\t%s" % (i, gpu_name)) +gpu_info = ( + "\n".join(gpu_infos) + if if_gpu_ok == True and len(gpu_infos) > 0 + else "很遗憾您这没有能用的显卡来支持您训练" +) +gpus = "-".join([i[0] for i in gpu_infos]) from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono from scipy.io import wavfile from fairseq import checkpoint_utils import gradio as gr import logging from vc_infer_pipeline import VC -from config import is_half,device,python_cmd,listen_port,iscolab,noparallel,noautoopen +from config import ( + is_half, + device, + python_cmd, + listen_port, + iscolab, + noparallel, + noautoopen, +) from infer_uvr5 import _audio_pre_ from my_utils import load_audio -from train.process_ckpt import show_info,change_info,merge,extract_small_model +from train.process_ckpt import show_info, change_info, merge, extract_small_model + # from trainset_preprocess_pipeline import PreProcess -logging.getLogger('numba').setLevel(logging.WARNING) +logging.getLogger("numba").setLevel(logging.WARNING) + class ToolButton(gr.Button, gr.components.FormComponent): """Small button with 
single emoji as text, fits inside gradio forms""" + def __init__(self, **kwargs): super().__init__(variant="tool", **kwargs) + def get_block_name(self): return "button" -hubert_model=None + +hubert_model = None + + def load_hubert(): global hubert_model - models, _, _ = checkpoint_utils.load_model_ensemble_and_task(["hubert_base.pt"],suffix="",) + models, _, _ = checkpoint_utils.load_model_ensemble_and_task( + ["hubert_base.pt"], + suffix="", + ) hubert_model = models[0] hubert_model = hubert_model.to(device) - if(is_half):hubert_model = hubert_model.half() - else:hubert_model = hubert_model.float() + if is_half: + hubert_model = hubert_model.half() + else: + hubert_model = hubert_model.float() hubert_model.eval() -weight_root="weights" -weight_uvr5_root="uvr5_weights" -names=[] -for name in os.listdir(weight_root): - if name.endswith(".pth"): names.append(name) -uvr5_names=[] -for name in os.listdir(weight_uvr5_root): - if name.endswith(".pth"): uvr5_names.append(name.replace(".pth","")) -def vc_single(sid,input_audio,f0_up_key,f0_file,f0_method,file_index,file_big_npy,index_rate):#spk_item, input_audio0, vc_transform0,f0_file,f0method0 - global tgt_sr,net_g,vc,hubert_model - if input_audio is None:return "You need to upload an audio", None +weight_root = "weights" +weight_uvr5_root = "uvr5_weights" +names = [] +for name in os.listdir(weight_root): + if name.endswith(".pth"): + names.append(name) +uvr5_names = [] +for name in os.listdir(weight_uvr5_root): + if name.endswith(".pth"): + uvr5_names.append(name.replace(".pth", "")) + + +def vc_single( + sid, + input_audio, + f0_up_key, + f0_file, + f0_method, + file_index, + file_big_npy, + index_rate, +): # spk_item, input_audio0, vc_transform0,f0_file,f0method0 + global tgt_sr, net_g, vc, hubert_model + if input_audio is None: + return "You need to upload an audio", None f0_up_key = int(f0_up_key) try: - audio=load_audio(input_audio,16000) + audio = load_audio(input_audio, 16000) times = [0, 0, 0] - if(hubert_model==None):load_hubert() + if hubert_model == None: + load_hubert() if_f0 = cpt.get("f0", 1) - audio_opt=vc.pipeline(hubert_model,net_g,sid,audio,times,f0_up_key,f0_method,file_index,file_big_npy,index_rate,if_f0,f0_file=f0_file) - print("npy: ", times[0], "s, f0: ", times[1], "s, infer: ", times[2], "s", sep='') + audio_opt = vc.pipeline( + hubert_model, + net_g, + sid, + audio, + times, + f0_up_key, + f0_method, + file_index, + file_big_npy, + index_rate, + if_f0, + f0_file=f0_file, + ) + print( + "npy: ", times[0], "s, f0: ", times[1], "s, infer: ", times[2], "s", sep="" + ) return "Success", (tgt_sr, audio_opt) except: - info=traceback.format_exc() + info = traceback.format_exc() print(info) - return info,(None,None) + return info, (None, None) -def vc_multi(sid,dir_path,opt_root,paths,f0_up_key,f0_method,file_index,file_big_npy,index_rate): + +def vc_multi( + sid, + dir_path, + opt_root, + paths, + f0_up_key, + f0_method, + file_index, + file_big_npy, + index_rate, +): try: - dir_path=dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ")#防止小白拷路径头尾带了空格和"和回车 - opt_root=opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + dir_path = ( + dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) # 防止小白拷路径头尾带了空格和"和回车 + opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") os.makedirs(opt_root, exist_ok=True) try: - if(dir_path!=""):paths=[os.path.join(dir_path,name)for name in os.listdir(dir_path)] - else:paths=[path.name for path in paths] + if dir_path != "": + paths = 
[os.path.join(dir_path, name) for name in os.listdir(dir_path)] + else: + paths = [path.name for path in paths] except: traceback.print_exc() paths = [path.name for path in paths] - infos=[] + infos = [] for path in paths: - info,opt=vc_single(sid,path,f0_up_key,None,f0_method,file_index,file_big_npy,index_rate) - if(info=="Success"): + info, opt = vc_single( + sid, + path, + f0_up_key, + None, + f0_method, + file_index, + file_big_npy, + index_rate, + ) + if info == "Success": try: - tgt_sr,audio_opt=opt - wavfile.write("%s/%s" % (opt_root, os.path.basename(path)), tgt_sr, audio_opt) + tgt_sr, audio_opt = opt + wavfile.write( + "%s/%s" % (opt_root, os.path.basename(path)), tgt_sr, audio_opt + ) except: - info=traceback.format_exc() - infos.append("%s->%s"%(os.path.basename(path),info)) + info = traceback.format_exc() + infos.append("%s->%s" % (os.path.basename(path), info)) yield "\n".join(infos) yield "\n".join(infos) except: yield traceback.format_exc() -def uvr(model_name,inp_root,save_root_vocal,paths,save_root_ins): + +def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins): infos = [] try: inp_root = inp_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - save_root_vocal = save_root_vocal.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - save_root_ins = save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - pre_fun = _audio_pre_(model_path=os.path.join(weight_uvr5_root,model_name+".pth"), device=device, is_half=is_half) - if (inp_root != ""):paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)] - else:paths = [path.name for path in paths] + save_root_vocal = ( + save_root_vocal.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) + save_root_ins = ( + save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) + pre_fun = _audio_pre_( + model_path=os.path.join(weight_uvr5_root, model_name + ".pth"), + device=device, + is_half=is_half, + ) + if inp_root != "": + paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)] + else: + paths = [path.name for path in paths] for name in paths: - inp_path=os.path.join(inp_root,name) + inp_path = os.path.join(inp_root, name) try: - pre_fun._path_audio_(inp_path , save_root_ins,save_root_vocal) - infos.append("%s->Success"%(os.path.basename(inp_path))) + pre_fun._path_audio_(inp_path, save_root_ins, save_root_vocal) + infos.append("%s->Success" % (os.path.basename(inp_path))) yield "\n".join(infos) except: - infos.append("%s->%s" % (os.path.basename(inp_path),traceback.format_exc())) + infos.append( + "%s->%s" % (os.path.basename(inp_path), traceback.format_exc()) + ) yield "\n".join(infos) except: infos.append(traceback.format_exc()) @@ -142,441 +251,990 @@ def uvr(model_name,inp_root,save_root_vocal,paths,save_root_ins): except: traceback.print_exc() print("clean_empty_cache") - if torch.cuda.is_available(): torch.cuda.empty_cache() + if torch.cuda.is_available(): + torch.cuda.empty_cache() yield "\n".join(infos) -#一个选项卡全局只能有一个音色 + +# 一个选项卡全局只能有一个音色 def get_vc(sid): - global n_spk,tgt_sr,net_g,vc,cpt - if(sid==[]): + global n_spk, tgt_sr, net_g, vc, cpt + if sid == []: global hubert_model - if (hubert_model != None): # 考虑到轮询, 需要加个判断看是否 sid 是由有模型切换到无模型的 + if hubert_model != None: # 考虑到轮询, 需要加个判断看是否 sid 是由有模型切换到无模型的 print("clean_empty_cache") - del net_g, n_spk, vc, hubert_model,tgt_sr#,cpt - hubert_model = net_g=n_spk=vc=hubert_model=tgt_sr=None - if torch.cuda.is_available(): torch.cuda.empty_cache() + del net_g, n_spk, vc, hubert_model, 
tgt_sr # ,cpt + hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None + if torch.cuda.is_available(): + torch.cuda.empty_cache() ###楼下不这么折腾清理不干净 if_f0 = cpt.get("f0", 1) - if (if_f0 == 1): + if if_f0 == 1: net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half) else: net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) - del net_g,cpt - if torch.cuda.is_available(): torch.cuda.empty_cache() - cpt=None + del net_g, cpt + if torch.cuda.is_available(): + torch.cuda.empty_cache() + cpt = None return {"visible": False, "__type__": "update"} person = "%s/%s" % (weight_root, sid) - print("loading %s"%person) + print("loading %s" % person) cpt = torch.load(person, map_location="cpu") tgt_sr = cpt["config"][-1] - cpt["config"][-3]=cpt["weight"]["emb_g.weight"].shape[0]#n_spk - if_f0=cpt.get("f0",1) - if(if_f0==1): + cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk + if_f0 = cpt.get("f0", 1) + if if_f0 == 1: net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half) else: net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) del net_g.enc_q print(net_g.load_state_dict(cpt["weight"], strict=False)) # 不加这一行清不干净, 真奇葩 net_g.eval().to(device) - if (is_half):net_g = net_g.half() - else:net_g = net_g.float() + if is_half: + net_g = net_g.half() + else: + net_g = net_g.float() vc = VC(tgt_sr, device, is_half) - n_spk=cpt["config"][-3] - return {"visible": True,"maximum": n_spk, "__type__": "update"} + n_spk = cpt["config"][-3] + return {"visible": True, "maximum": n_spk, "__type__": "update"} + def change_choices(): - names=[] + names = [] for name in os.listdir(weight_root): - if name.endswith(".pth"): names.append(name) + if name.endswith(".pth"): + names.append(name) return {"choices": sorted(names), "__type__": "update"} -def clean():return {"value": "", "__type__": "update"} -def change_f0(if_f0_3,sr2):#np7, f0method8,pretrained_G14,pretrained_D15 - if(if_f0_3=="是"):return {"visible": True, "__type__": "update"},{"visible": True, "__type__": "update"},"pretrained/f0G%s.pth"%sr2,"pretrained/f0D%s.pth"%sr2 - return {"visible": False, "__type__": "update"}, {"visible": False, "__type__": "update"},"pretrained/G%s.pth"%sr2,"pretrained/D%s.pth"%sr2 -sr_dict={ - "32k":32000, - "40k":40000, - "48k":48000, + +def clean(): + return {"value": "", "__type__": "update"} + + +def change_f0(if_f0_3, sr2): # np7, f0method8,pretrained_G14,pretrained_D15 + if if_f0_3 == "是": + return ( + {"visible": True, "__type__": "update"}, + {"visible": True, "__type__": "update"}, + "pretrained/f0G%s.pth" % sr2, + "pretrained/f0D%s.pth" % sr2, + ) + return ( + {"visible": False, "__type__": "update"}, + {"visible": False, "__type__": "update"}, + "pretrained/G%s.pth" % sr2, + "pretrained/D%s.pth" % sr2, + ) + + +sr_dict = { + "32k": 32000, + "40k": 40000, + "48k": 48000, } -def if_done(done,p): + +def if_done(done, p): while 1: - if(p.poll()==None):sleep(0.5) - else:break - done[0]=True + if p.poll() == None: + sleep(0.5) + else: + break + done[0] = True -def if_done_multi(done,ps): +def if_done_multi(done, ps): while 1: - #poll==None代表进程未结束 - #只要有一个进程未结束都不停 - flag=1 + # poll==None代表进程未结束 + # 只要有一个进程未结束都不停 + flag = 1 for p in ps: - if(p.poll()==None): + if p.poll() == None: flag = 0 sleep(0.5) break - if(flag==1):break - done[0]=True + if flag == 1: + break + done[0] = True -def preprocess_dataset(trainset_dir,exp_dir,sr,n_p=ncpu): - sr=sr_dict[sr] - os.makedirs("%s/logs/%s"%(now_dir,exp_dir),exist_ok=True) - f = open("%s/logs/%s/preprocess.log"%(now_dir,exp_dir), "w") + +def 
preprocess_dataset(trainset_dir, exp_dir, sr, n_p=ncpu): + sr = sr_dict[sr] + os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True) + f = open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "w") f.close() - cmd=python_cmd + " trainset_preprocess_pipeline_print.py %s %s %s %s/logs/%s "%(trainset_dir,sr,n_p,now_dir,exp_dir)+str(noparallel) + cmd = ( + python_cmd + + " trainset_preprocess_pipeline_print.py %s %s %s %s/logs/%s " + % (trainset_dir, sr, n_p, now_dir, exp_dir) + + str(noparallel) + ) print(cmd) - p = Popen(cmd, shell=True)#, stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir + p = Popen(cmd, shell=True) # , stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir ###煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 - done=[False] - threading.Thread(target=if_done,args=(done,p,)).start() - while(1): - with open("%s/logs/%s/preprocess.log"%(now_dir,exp_dir),"r")as f:yield(f.read()) + done = [False] + threading.Thread( + target=if_done, + args=( + done, + p, + ), + ).start() + while 1: + with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f: + yield (f.read()) sleep(1) - if(done[0]==True):break - with open("%s/logs/%s/preprocess.log"%(now_dir,exp_dir), "r")as f:log = f.read() + if done[0] == True: + break + with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f: + log = f.read() print(log) yield log -#but2.click(extract_f0,[gpus6,np7,f0method8,if_f0_3,trainset_dir4],[info2]) -def extract_f0_feature(gpus,n_p,f0method,if_f0,exp_dir): - gpus=gpus.split("-") - os.makedirs("%s/logs/%s"%(now_dir,exp_dir),exist_ok=True) - f = open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir), "w") + + +# but2.click(extract_f0,[gpus6,np7,f0method8,if_f0_3,trainset_dir4],[info2]) +def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir): + gpus = gpus.split("-") + os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True) + f = open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "w") f.close() - if(if_f0=="是"): - cmd=python_cmd + " extract_f0_print.py %s/logs/%s %s %s"%(now_dir,exp_dir,n_p,f0method) + if if_f0 == "是": + cmd = python_cmd + " extract_f0_print.py %s/logs/%s %s %s" % ( + now_dir, + exp_dir, + n_p, + f0method, + ) print(cmd) - p = Popen(cmd, shell=True,cwd=now_dir)#, stdin=PIPE, stdout=PIPE,stderr=PIPE + p = Popen(cmd, shell=True, cwd=now_dir) # , stdin=PIPE, stdout=PIPE,stderr=PIPE ###煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 - done=[False] - threading.Thread(target=if_done,args=(done,p,)).start() - while(1): - with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir),"r")as f:yield(f.read()) + done = [False] + threading.Thread( + target=if_done, + args=( + done, + p, + ), + ).start() + while 1: + with open( + "%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r" + ) as f: + yield (f.read()) sleep(1) - if(done[0]==True):break - with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir), "r")as f:log = f.read() + if done[0] == True: + break + with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f: + log = f.read() print(log) yield log ####对不同part分别开多进程 - ''' + """ n_part=int(sys.argv[1]) i_part=int(sys.argv[2]) i_gpu=sys.argv[3] exp_dir=sys.argv[4] os.environ["CUDA_VISIBLE_DEVICES"]=str(i_gpu) - ''' - leng=len(gpus) - ps=[] - for idx,n_g in enumerate(gpus): - cmd=python_cmd + " extract_feature_print.py %s %s %s %s %s/logs/%s"%(device,leng,idx,n_g,now_dir,exp_dir) + """ + leng = len(gpus) + ps = [] + for idx, n_g in enumerate(gpus): + cmd = python_cmd + " 
extract_feature_print.py %s %s %s %s %s/logs/%s" % ( + device, + leng, + idx, + n_g, + now_dir, + exp_dir, + ) print(cmd) - p = Popen(cmd, shell=True, cwd=now_dir)#, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir + p = Popen( + cmd, shell=True, cwd=now_dir + ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir ps.append(p) ###煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读 done = [False] - threading.Thread(target=if_done_multi, args=(done, ps,)).start() - while (1): - with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir), "r")as f:yield (f.read()) + threading.Thread( + target=if_done_multi, + args=( + done, + ps, + ), + ).start() + while 1: + with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f: + yield (f.read()) sleep(1) - if (done[0] == True): break - with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir), "r")as f:log = f.read() + if done[0] == True: + break + with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f: + log = f.read() print(log) yield log -def change_sr2(sr2,if_f0_3): - if(if_f0_3=="是"):return "pretrained/f0G%s.pth"%sr2,"pretrained/f0D%s.pth"%sr2 - else:return "pretrained/G%s.pth"%sr2,"pretrained/D%s.pth"%sr2 -#but3.click(click_train,[exp_dir1,sr2,if_f0_3,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16]) -def click_train(exp_dir1,sr2,if_f0_3,spk_id5,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16,if_cache_gpu17): - #生成filelist - exp_dir="%s/logs/%s"%(now_dir,exp_dir1) - os.makedirs(exp_dir,exist_ok=True) - gt_wavs_dir="%s/0_gt_wavs"%(exp_dir) - co256_dir="%s/3_feature256"%(exp_dir) - if(if_f0_3=="是"): + + +def change_sr2(sr2, if_f0_3): + if if_f0_3 == "是": + return "pretrained/f0G%s.pth" % sr2, "pretrained/f0D%s.pth" % sr2 + else: + return "pretrained/G%s.pth" % sr2, "pretrained/D%s.pth" % sr2 + + +# but3.click(click_train,[exp_dir1,sr2,if_f0_3,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16]) +def click_train( + exp_dir1, + sr2, + if_f0_3, + spk_id5, + save_epoch10, + total_epoch11, + batch_size12, + if_save_latest13, + pretrained_G14, + pretrained_D15, + gpus16, + if_cache_gpu17, +): + # 生成filelist + exp_dir = "%s/logs/%s" % (now_dir, exp_dir1) + os.makedirs(exp_dir, exist_ok=True) + gt_wavs_dir = "%s/0_gt_wavs" % (exp_dir) + co256_dir = "%s/3_feature256" % (exp_dir) + if if_f0_3 == "是": f0_dir = "%s/2a_f0" % (exp_dir) - f0nsf_dir="%s/2b-f0nsf"%(exp_dir) - names=set([name.split(".")[0]for name in os.listdir(gt_wavs_dir)])&set([name.split(".")[0]for name in os.listdir(co256_dir)])&set([name.split(".")[0]for name in os.listdir(f0_dir)])&set([name.split(".")[0]for name in os.listdir(f0nsf_dir)]) + f0nsf_dir = "%s/2b-f0nsf" % (exp_dir) + names = ( + set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) + & set([name.split(".")[0] for name in os.listdir(co256_dir)]) + & set([name.split(".")[0] for name in os.listdir(f0_dir)]) + & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)]) + ) else: - names=set([name.split(".")[0]for name in os.listdir(gt_wavs_dir)])&set([name.split(".")[0]for name in os.listdir(co256_dir)]) - opt=[] + names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set( + [name.split(".")[0] for name in os.listdir(co256_dir)] + ) + opt = [] for name in names: - if (if_f0_3 == "是"): - 
opt.append("%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s"%(gt_wavs_dir.replace("\\","\\\\"),name,co256_dir.replace("\\","\\\\"),name,f0_dir.replace("\\","\\\\"),name,f0nsf_dir.replace("\\","\\\\"),name,spk_id5)) + if if_f0_3 == "是": + opt.append( + "%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s" + % ( + gt_wavs_dir.replace("\\", "\\\\"), + name, + co256_dir.replace("\\", "\\\\"), + name, + f0_dir.replace("\\", "\\\\"), + name, + f0nsf_dir.replace("\\", "\\\\"), + name, + spk_id5, + ) + ) else: - opt.append("%s/%s.wav|%s/%s.npy|%s"%(gt_wavs_dir.replace("\\","\\\\"),name,co256_dir.replace("\\","\\\\"),name,spk_id5)) - if (if_f0_3 == "是"): - opt.append("%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature256/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s"%(now_dir,sr2,now_dir,now_dir,now_dir,spk_id5)) + opt.append( + "%s/%s.wav|%s/%s.npy|%s" + % ( + gt_wavs_dir.replace("\\", "\\\\"), + name, + co256_dir.replace("\\", "\\\\"), + name, + spk_id5, + ) + ) + if if_f0_3 == "是": + opt.append( + "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature256/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s" + % (now_dir, sr2, now_dir, now_dir, now_dir, spk_id5) + ) else: - opt.append("%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature256/mute.npy|%s"%(now_dir,sr2,now_dir,spk_id5)) - with open("%s/filelist.txt"%exp_dir,"w")as f:f.write("\n".join(opt)) + opt.append( + "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature256/mute.npy|%s" + % (now_dir, sr2, now_dir, spk_id5) + ) + with open("%s/filelist.txt" % exp_dir, "w") as f: + f.write("\n".join(opt)) print("write filelist done") - #生成config#无需生成config + # 生成config#无需生成config # cmd = python_cmd + " train_nsf_sim_cache_sid_load_pretrain.py -e mi-test -sr 40k -f0 1 -bs 4 -g 0 -te 10 -se 5 -pg pretrained/f0G40k.pth -pd pretrained/f0D40k.pth -l 1 -c 0" - print("use gpus:",gpus16) + print("use gpus:", gpus16) if gpus16: - cmd = python_cmd + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s -pg %s -pd %s -l %s -c %s" % (exp_dir1,sr2,1 if if_f0_3=="是"else 0,batch_size12,gpus16,total_epoch11,save_epoch10,pretrained_G14,pretrained_D15,1 if if_save_latest13=="是"else 0,1 if if_cache_gpu17=="是"else 0) + cmd = ( + python_cmd + + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s -pg %s -pd %s -l %s -c %s" + % ( + exp_dir1, + sr2, + 1 if if_f0_3 == "是" else 0, + batch_size12, + gpus16, + total_epoch11, + save_epoch10, + pretrained_G14, + pretrained_D15, + 1 if if_save_latest13 == "是" else 0, + 1 if if_cache_gpu17 == "是" else 0, + ) + ) else: - cmd = python_cmd + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -te %s -se %s -pg %s -pd %s -l %s -c %s" % (exp_dir1,sr2,1 if if_f0_3=="是"else 0,batch_size12,total_epoch11,save_epoch10,pretrained_G14,pretrained_D15,1 if if_save_latest13=="是"else 0,1 if if_cache_gpu17=="是"else 0) + cmd = ( + python_cmd + + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -te %s -se %s -pg %s -pd %s -l %s -c %s" + % ( + exp_dir1, + sr2, + 1 if if_f0_3 == "是" else 0, + batch_size12, + total_epoch11, + save_epoch10, + pretrained_G14, + pretrained_D15, + 1 if if_save_latest13 == "是" else 0, + 1 if if_cache_gpu17 == "是" else 0, + ) + ) print(cmd) p = Popen(cmd, shell=True, cwd=now_dir) p.wait() return "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log" + + # but4.click(train_index, [exp_dir1], info3) def train_index(exp_dir1): - 
exp_dir="%s/logs/%s"%(now_dir,exp_dir1) - os.makedirs(exp_dir,exist_ok=True) - feature_dir="%s/3_feature256"%(exp_dir) - if(os.path.exists(feature_dir)==False):return "请先进行特征提取!" - listdir_res=list(os.listdir(feature_dir)) - if(len(listdir_res)==0):return "请先进行特征提取!" + exp_dir = "%s/logs/%s" % (now_dir, exp_dir1) + os.makedirs(exp_dir, exist_ok=True) + feature_dir = "%s/3_feature256" % (exp_dir) + if os.path.exists(feature_dir) == False: + return "请先进行特征提取!" + listdir_res = list(os.listdir(feature_dir)) + if len(listdir_res) == 0: + return "请先进行特征提取!" npys = [] for name in sorted(listdir_res): phone = np.load("%s/%s" % (feature_dir, name)) npys.append(phone) big_npy = np.concatenate(npys, 0) - np.save("%s/total_fea.npy"%exp_dir, big_npy) + np.save("%s/total_fea.npy" % exp_dir, big_npy) n_ivf = big_npy.shape[0] // 39 - infos=[] - infos.append("%s,%s"%(big_npy.shape,n_ivf)) + infos = [] + infos.append("%s,%s" % (big_npy.shape, n_ivf)) yield "\n".join(infos) - index = faiss.index_factory(256, "IVF%s,Flat"%n_ivf) + index = faiss.index_factory(256, "IVF%s,Flat" % n_ivf) infos.append("training") yield "\n".join(infos) index_ivf = faiss.extract_index_ivf(index) # - index_ivf.nprobe = int(np.power(n_ivf,0.3)) + index_ivf.nprobe = int(np.power(n_ivf, 0.3)) index.train(big_npy) - faiss.write_index(index, '%s/trained_IVF%s_Flat_nprobe_%s.index'%(exp_dir,n_ivf,index_ivf.nprobe)) + faiss.write_index( + index, + "%s/trained_IVF%s_Flat_nprobe_%s.index" % (exp_dir, n_ivf, index_ivf.nprobe), + ) infos.append("adding") yield "\n".join(infos) index.add(big_npy) - faiss.write_index(index, '%s/added_IVF%s_Flat_nprobe_%s.index'%(exp_dir,n_ivf,index_ivf.nprobe)) - infos.append("成功构建索引, added_IVF%s_Flat_nprobe_%s.index"%(n_ivf,index_ivf.nprobe)) + faiss.write_index( + index, + "%s/added_IVF%s_Flat_nprobe_%s.index" % (exp_dir, n_ivf, index_ivf.nprobe), + ) + infos.append("成功构建索引, added_IVF%s_Flat_nprobe_%s.index" % (n_ivf, index_ivf.nprobe)) yield "\n".join(infos) -#but5.click(train1key, [exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17], info3) -def train1key(exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17): - infos=[] + + +# but5.click(train1key, [exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17], info3) +def train1key( + exp_dir1, + sr2, + if_f0_3, + trainset_dir4, + spk_id5, + gpus6, + np7, + f0method8, + save_epoch10, + total_epoch11, + batch_size12, + if_save_latest13, + pretrained_G14, + pretrained_D15, + gpus16, + if_cache_gpu17, +): + infos = [] + def get_info_str(strr): infos.append(strr) return "\n".join(infos) - os.makedirs("%s/logs/%s"%(now_dir,exp_dir1),exist_ok=True) + + os.makedirs("%s/logs/%s" % (now_dir, exp_dir1), exist_ok=True) #########step1:处理数据 - open("%s/logs/%s/preprocess.log"%(now_dir,exp_dir1), "w").close() - cmd=python_cmd + " trainset_preprocess_pipeline_print.py %s %s %s %s/logs/%s "%(trainset_dir4,sr_dict[sr2],ncpu,now_dir,exp_dir1)+str(noparallel) + open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir1), "w").close() + cmd = ( + python_cmd + + " trainset_preprocess_pipeline_print.py %s %s %s %s/logs/%s " + % (trainset_dir4, sr_dict[sr2], ncpu, now_dir, exp_dir1) + + 
str(noparallel) + ) yield get_info_str("step1:正在处理数据") yield get_info_str(cmd) p = Popen(cmd, shell=True) p.wait() - with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir1), "r")as f: print(f.read()) + with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir1), "r") as f: + print(f.read()) #########step2a:提取音高 open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir1), "w") - if(if_f0_3=="是"): + if if_f0_3 == "是": yield get_info_str("step2a:正在提取音高") - cmd=python_cmd + " extract_f0_print.py %s/logs/%s %s %s"%(now_dir,exp_dir1,np7,f0method8) + cmd = python_cmd + " extract_f0_print.py %s/logs/%s %s %s" % ( + now_dir, + exp_dir1, + np7, + f0method8, + ) yield get_info_str(cmd) - p = Popen(cmd, shell=True,cwd=now_dir) + p = Popen(cmd, shell=True, cwd=now_dir) p.wait() - with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir1), "r")as f:print(f.read()) - else:yield get_info_str("step2a:无需提取音高") + with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir1), "r") as f: + print(f.read()) + else: + yield get_info_str("step2a:无需提取音高") #######step2b:提取特征 yield get_info_str("step2b:正在提取特征") - gpus=gpus16.split("-") - leng=len(gpus) - ps=[] - for idx,n_g in enumerate(gpus): - cmd=python_cmd + " extract_feature_print.py %s %s %s %s %s/logs/%s"%(device,leng,idx,n_g,now_dir,exp_dir1) + gpus = gpus16.split("-") + leng = len(gpus) + ps = [] + for idx, n_g in enumerate(gpus): + cmd = python_cmd + " extract_feature_print.py %s %s %s %s %s/logs/%s" % ( + device, + leng, + idx, + n_g, + now_dir, + exp_dir1, + ) yield get_info_str(cmd) - p = Popen(cmd, shell=True, cwd=now_dir)#, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir + p = Popen( + cmd, shell=True, cwd=now_dir + ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir ps.append(p) - for p in ps:p.wait() - with open("%s/logs/%s/extract_f0_feature.log"%(now_dir,exp_dir1), "r")as f:print(f.read()) + for p in ps: + p.wait() + with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir1), "r") as f: + print(f.read()) #######step3a:训练模型 yield get_info_str("step3a:正在训练模型") - #生成filelist - exp_dir="%s/logs/%s"%(now_dir,exp_dir1) - gt_wavs_dir="%s/0_gt_wavs"%(exp_dir) - co256_dir="%s/3_feature256"%(exp_dir) - if(if_f0_3=="是"): + # 生成filelist + exp_dir = "%s/logs/%s" % (now_dir, exp_dir1) + gt_wavs_dir = "%s/0_gt_wavs" % (exp_dir) + co256_dir = "%s/3_feature256" % (exp_dir) + if if_f0_3 == "是": f0_dir = "%s/2a_f0" % (exp_dir) - f0nsf_dir="%s/2b-f0nsf"%(exp_dir) - names=set([name.split(".")[0]for name in os.listdir(gt_wavs_dir)])&set([name.split(".")[0]for name in os.listdir(co256_dir)])&set([name.split(".")[0]for name in os.listdir(f0_dir)])&set([name.split(".")[0]for name in os.listdir(f0nsf_dir)]) + f0nsf_dir = "%s/2b-f0nsf" % (exp_dir) + names = ( + set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) + & set([name.split(".")[0] for name in os.listdir(co256_dir)]) + & set([name.split(".")[0] for name in os.listdir(f0_dir)]) + & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)]) + ) else: - names=set([name.split(".")[0]for name in os.listdir(gt_wavs_dir)])&set([name.split(".")[0]for name in os.listdir(co256_dir)]) - opt=[] + names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set( + [name.split(".")[0] for name in os.listdir(co256_dir)] + ) + opt = [] for name in names: - if (if_f0_3 == "是"): - 
opt.append("%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s"%(gt_wavs_dir.replace("\\","\\\\"),name,co256_dir.replace("\\","\\\\"),name,f0_dir.replace("\\","\\\\"),name,f0nsf_dir.replace("\\","\\\\"),name,spk_id5)) + if if_f0_3 == "是": + opt.append( + "%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s" + % ( + gt_wavs_dir.replace("\\", "\\\\"), + name, + co256_dir.replace("\\", "\\\\"), + name, + f0_dir.replace("\\", "\\\\"), + name, + f0nsf_dir.replace("\\", "\\\\"), + name, + spk_id5, + ) + ) else: - opt.append("%s/%s.wav|%s/%s.npy|%s"%(gt_wavs_dir.replace("\\","\\\\"),name,co256_dir.replace("\\","\\\\"),name,spk_id5)) - if (if_f0_3 == "是"): - opt.append("%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature256/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s"%(now_dir,sr2,now_dir,now_dir,now_dir,spk_id5)) + opt.append( + "%s/%s.wav|%s/%s.npy|%s" + % ( + gt_wavs_dir.replace("\\", "\\\\"), + name, + co256_dir.replace("\\", "\\\\"), + name, + spk_id5, + ) + ) + if if_f0_3 == "是": + opt.append( + "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature256/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s" + % (now_dir, sr2, now_dir, now_dir, now_dir, spk_id5) + ) else: - opt.append("%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature256/mute.npy|%s"%(now_dir,sr2,now_dir,spk_id5)) - with open("%s/filelist.txt"%exp_dir,"w")as f:f.write("\n".join(opt)) + opt.append( + "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature256/mute.npy|%s" + % (now_dir, sr2, now_dir, spk_id5) + ) + with open("%s/filelist.txt" % exp_dir, "w") as f: + f.write("\n".join(opt)) yield get_info_str("write filelist done") if gpus16: - cmd = python_cmd + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s -pg %s -pd %s -l %s -c %s" % (exp_dir1,sr2,1 if if_f0_3=="是"else 0,batch_size12,gpus16,total_epoch11,save_epoch10,pretrained_G14,pretrained_D15,1 if if_save_latest13=="是"else 0,1 if if_cache_gpu17=="是"else 0) + cmd = ( + python_cmd + + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s -pg %s -pd %s -l %s -c %s" + % ( + exp_dir1, + sr2, + 1 if if_f0_3 == "是" else 0, + batch_size12, + gpus16, + total_epoch11, + save_epoch10, + pretrained_G14, + pretrained_D15, + 1 if if_save_latest13 == "是" else 0, + 1 if if_cache_gpu17 == "是" else 0, + ) + ) else: - cmd = python_cmd + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -te %s -se %s -pg %s -pd %s -l %s -c %s" % (exp_dir1,sr2,1 if if_f0_3=="是"else 0,batch_size12,total_epoch11,save_epoch10,pretrained_G14,pretrained_D15,1 if if_save_latest13=="是"else 0,1 if if_cache_gpu17=="是"else 0) + cmd = ( + python_cmd + + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -te %s -se %s -pg %s -pd %s -l %s -c %s" + % ( + exp_dir1, + sr2, + 1 if if_f0_3 == "是" else 0, + batch_size12, + total_epoch11, + save_epoch10, + pretrained_G14, + pretrained_D15, + 1 if if_save_latest13 == "是" else 0, + 1 if if_cache_gpu17 == "是" else 0, + ) + ) yield get_info_str(cmd) p = Popen(cmd, shell=True, cwd=now_dir) p.wait() yield get_info_str("训练结束, 您可查看控制台训练日志或实验文件夹下的train.log") #######step3b:训练索引 - feature_dir="%s/3_feature256"%(exp_dir) + feature_dir = "%s/3_feature256" % (exp_dir) npys = [] - listdir_res=list(os.listdir(feature_dir)) + listdir_res = list(os.listdir(feature_dir)) for name in sorted(listdir_res): phone = np.load("%s/%s" % (feature_dir, name)) npys.append(phone) big_npy = np.concatenate(npys, 0) - 
np.save("%s/total_fea.npy"%exp_dir, big_npy) + np.save("%s/total_fea.npy" % exp_dir, big_npy) n_ivf = big_npy.shape[0] // 39 - yield get_info_str("%s,%s"%(big_npy.shape,n_ivf)) - index = faiss.index_factory(256, "IVF%s,Flat"%n_ivf) + yield get_info_str("%s,%s" % (big_npy.shape, n_ivf)) + index = faiss.index_factory(256, "IVF%s,Flat" % n_ivf) yield get_info_str("training index") index_ivf = faiss.extract_index_ivf(index) # - index_ivf.nprobe = int(np.power(n_ivf,0.3)) + index_ivf.nprobe = int(np.power(n_ivf, 0.3)) index.train(big_npy) - faiss.write_index(index, '%s/trained_IVF%s_Flat_nprobe_%s.index'%(exp_dir,n_ivf,index_ivf.nprobe)) + faiss.write_index( + index, + "%s/trained_IVF%s_Flat_nprobe_%s.index" % (exp_dir, n_ivf, index_ivf.nprobe), + ) yield get_info_str("adding index") index.add(big_npy) - faiss.write_index(index, '%s/added_IVF%s_Flat_nprobe_%s.index'%(exp_dir,n_ivf,index_ivf.nprobe)) - yield get_info_str("成功构建索引, added_IVF%s_Flat_nprobe_%s.index"%(n_ivf,index_ivf.nprobe)) + faiss.write_index( + index, + "%s/added_IVF%s_Flat_nprobe_%s.index" % (exp_dir, n_ivf, index_ivf.nprobe), + ) + yield get_info_str( + "成功构建索引, added_IVF%s_Flat_nprobe_%s.index" % (n_ivf, index_ivf.nprobe) + ) yield get_info_str("全流程结束!") + # ckpt_path2.change(change_info_,[ckpt_path2],[sr__,if_f0__]) def change_info_(ckpt_path): - if(os.path.exists(ckpt_path.replace(os.path.basename(ckpt_path),"train.log"))==False):return {"__type__": "update"},{"__type__": "update"} + if ( + os.path.exists(ckpt_path.replace(os.path.basename(ckpt_path), "train.log")) + == False + ): + return {"__type__": "update"}, {"__type__": "update"} try: - with open(ckpt_path.replace(os.path.basename(ckpt_path),"train.log"),"r")as f: - info=eval(f.read().strip("\n").split("\n")[0].split("\t")[-1]) - sr,f0=info["sample_rate"],info["if_f0"] - return sr,str(f0) + with open( + ckpt_path.replace(os.path.basename(ckpt_path), "train.log"), "r" + ) as f: + info = eval(f.read().strip("\n").split("\n")[0].split("\t")[-1]) + sr, f0 = info["sample_rate"], info["if_f0"] + return sr, str(f0) except: traceback.print_exc() return {"__type__": "update"}, {"__type__": "update"} with gr.Blocks() as app: - gr.Markdown(value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录使用需遵守的协议-LICENSE.txt.")) + gr.Markdown( + value=i18n( + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录使用需遵守的协议-LICENSE.txt." + ) + ) with gr.Tabs(): with gr.TabItem(i18n("模型推理")): with gr.Row(): sid0 = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names)) refresh_button = gr.Button(i18n("刷新音色列表"), variant="primary") - refresh_button.click( - fn=change_choices, - inputs=[], - outputs=[sid0] - ) + refresh_button.click(fn=change_choices, inputs=[], outputs=[sid0]) clean_button = gr.Button(i18n("卸载音色省显存"), variant="primary") - spk_item = gr.Slider(minimum=0, maximum=2333, step=1, label=i18n("请选择说话人id"), value=0, visible=False, interactive=True) - clean_button.click( - fn=clean, - inputs=[], - outputs=[sid0] + spk_item = gr.Slider( + minimum=0, + maximum=2333, + step=1, + label=i18n("请选择说话人id"), + value=0, + visible=False, + interactive=True, ) + clean_button.click(fn=clean, inputs=[], outputs=[sid0]) sid0.change( fn=get_vc, inputs=[sid0], outputs=[spk_item], ) with gr.Group(): - gr.Markdown(value=i18n("男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ")) + gr.Markdown( + value=i18n("男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ") + ) with gr.Row(): with gr.Column(): - vc_transform0 = gr.Number(label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0) - input_audio0 = gr.Textbox(label=i18n("输入待处理音频文件路径(默认是正确格式示例)"),value="E:\\codes\\py39\\vits_vc_gpu_train\\todo-songs\\冬之花clip1.wav") - f0method0=gr.Radio(label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比"), choices=["pm","harvest"],value="pm", interactive=True) + vc_transform0 = gr.Number( + label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0 + ) + input_audio0 = gr.Textbox( + label=i18n("输入待处理音频文件路径(默认是正确格式示例)"), + value="E:\\codes\\py39\\vits_vc_gpu_train\\todo-songs\\冬之花clip1.wav", + ) + f0method0 = gr.Radio( + label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比"), + choices=["pm", "harvest"], + value="pm", + interactive=True, + ) with gr.Column(): - file_index1 = gr.Textbox(label=i18n("特征检索库文件路径"),value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\added_IVF677_Flat_nprobe_7.index", interactive=True) - file_big_npy1 = gr.Textbox(label=i18n("特征文件路径"),value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy", interactive=True) - index_rate1 = gr.Slider(minimum=0, maximum=1,label='检索特征占比', value=1,interactive=True) + file_index1 = gr.Textbox( + label=i18n("特征检索库文件路径"), + value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\added_IVF677_Flat_nprobe_7.index", + interactive=True, + ) + file_big_npy1 = gr.Textbox( + label=i18n("特征文件路径"), + value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy", + interactive=True, + ) + index_rate1 = gr.Slider( + minimum=0, + maximum=1, + label="检索特征占比", + value=1, + interactive=True, + ) f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调")) - but0=gr.Button(i18n("转换"), variant="primary") + but0 = gr.Button(i18n("转换"), variant="primary") with gr.Column(): vc_output1 = gr.Textbox(label=i18n("输出信息")) vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)")) - but0.click(vc_single, [spk_item, input_audio0, vc_transform0,f0_file,f0method0,file_index1,file_big_npy1,index_rate1], [vc_output1, vc_output2]) + but0.click( + vc_single, + [ + spk_item, + input_audio0, + vc_transform0, + f0_file, + f0method0, + file_index1, + file_big_npy1, + index_rate1, + ], + [vc_output1, vc_output2], + ) with gr.Group(): - gr.Markdown(value=i18n("批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ")) + gr.Markdown( + value=i18n("批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. 
") + ) with gr.Row(): with gr.Column(): - vc_transform1 = gr.Number(label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0) - opt_input = gr.Textbox(label=i18n("指定输出文件夹"),value="opt") - f0method1=gr.Radio(label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比"), choices=["pm","harvest"],value="pm", interactive=True) + vc_transform1 = gr.Number( + label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0 + ) + opt_input = gr.Textbox(label=i18n("指定输出文件夹"), value="opt") + f0method1 = gr.Radio( + label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比"), + choices=["pm", "harvest"], + value="pm", + interactive=True, + ) with gr.Column(): - file_index2 = gr.Textbox(label=i18n("特征检索库文件路径"),value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\added_IVF677_Flat_nprobe_7.index", interactive=True) - file_big_npy2 = gr.Textbox(label=i18n("特征文件路径"),value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy", interactive=True) - index_rate2 = gr.Slider(minimum=0, maximum=1,label=i18n("检索特征占比"), value=1,interactive=True) + file_index2 = gr.Textbox( + label=i18n("特征检索库文件路径"), + value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\added_IVF677_Flat_nprobe_7.index", + interactive=True, + ) + file_big_npy2 = gr.Textbox( + label=i18n("特征文件路径"), + value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy", + interactive=True, + ) + index_rate2 = gr.Slider( + minimum=0, + maximum=1, + label=i18n("检索特征占比"), + value=1, + interactive=True, + ) with gr.Column(): - dir_input = gr.Textbox(label=i18n("输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)"),value="E:\codes\py39\\vits_vc_gpu_train\\todo-songs") - inputs = gr.File(file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹")) - but1=gr.Button(i18n("转换"), variant="primary") + dir_input = gr.Textbox( + label=i18n("输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)"), + value="E:\codes\py39\\vits_vc_gpu_train\\todo-songs", + ) + inputs = gr.File( + file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹") + ) + but1 = gr.Button(i18n("转换"), variant="primary") vc_output3 = gr.Textbox(label=i18n("输出信息")) - but1.click(vc_multi, [spk_item, dir_input,opt_input,inputs, vc_transform1,f0method1,file_index2,file_big_npy2,index_rate2], [vc_output3]) + but1.click( + vc_multi, + [ + spk_item, + dir_input, + opt_input, + inputs, + vc_transform1, + f0method1, + file_index2, + file_big_npy2, + index_rate2, + ], + [vc_output3], + ) with gr.TabItem(i18n("伴奏人声分离")): with gr.Group(): - gr.Markdown(value=i18n("人声伴奏分离批量处理, 使用UVR5模型.
不带和声用HP2, 带和声且提取的人声不需要和声用HP5
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)")) + gr.Markdown( + value=i18n( + "人声伴奏分离批量处理, 使用UVR5模型.
不带和声用HP2, 带和声且提取的人声不需要和声用HP5
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)" + ) + ) with gr.Row(): with gr.Column(): - dir_wav_input = gr.Textbox(label=i18n("输入待处理音频文件夹路径"),value="E:\\codes\\py39\\vits_vc_gpu_train\\todo-songs") - wav_inputs = gr.File(file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹")) + dir_wav_input = gr.Textbox( + label=i18n("输入待处理音频文件夹路径"), + value="E:\\codes\\py39\\vits_vc_gpu_train\\todo-songs", + ) + wav_inputs = gr.File( + file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹") + ) with gr.Column(): model_choose = gr.Dropdown(label=i18n("模型"), choices=uvr5_names) - opt_vocal_root = gr.Textbox(label=i18n("指定输出人声文件夹"),value="opt") - opt_ins_root = gr.Textbox(label=i18n("指定输出乐器文件夹"),value="opt") - but2=gr.Button(i18n("转换"), variant="primary") + opt_vocal_root = gr.Textbox( + label=i18n("指定输出人声文件夹"), value="opt" + ) + opt_ins_root = gr.Textbox(label=i18n("指定输出乐器文件夹"), value="opt") + but2 = gr.Button(i18n("转换"), variant="primary") vc_output4 = gr.Textbox(label=i18n("输出信息")) - but2.click(uvr, [model_choose, dir_wav_input,opt_vocal_root,wav_inputs,opt_ins_root], [vc_output4]) + but2.click( + uvr, + [ + model_choose, + dir_wav_input, + opt_vocal_root, + wav_inputs, + opt_ins_root, + ], + [vc_output4], + ) with gr.TabItem(i18n("训练")): - gr.Markdown(value=i18n("step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ")) + gr.Markdown( + value=i18n( + "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. " + ) + ) with gr.Row(): - exp_dir1 = gr.Textbox(label=i18n("输入实验名"),value="mi-test") - sr2 = gr.Radio(label=i18n("目标采样率"), choices=["32k","40k","48k"],value="40k", interactive=True) - if_f0_3 = gr.Radio(label=i18n("模型是否带音高指导(唱歌一定要, 语音可以不要)"), choices=["是","否"],value="是", interactive=True) - with gr.Group():#暂时单人的, 后面支持最多4人的#数据处理 - gr.Markdown(value=i18n("step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ")) + exp_dir1 = gr.Textbox(label=i18n("输入实验名"), value="mi-test") + sr2 = gr.Radio( + label=i18n("目标采样率"), + choices=["32k", "40k", "48k"], + value="40k", + interactive=True, + ) + if_f0_3 = gr.Radio( + label=i18n("模型是否带音高指导(唱歌一定要, 语音可以不要)"), + choices=["是", "否"], + value="是", + interactive=True, + ) + with gr.Group(): # 暂时单人的, 后面支持最多4人的#数据处理 + gr.Markdown( + value=i18n( + "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. 
" + ) + ) with gr.Row(): - trainset_dir4 = gr.Textbox(label=i18n("输入训练文件夹路径"),value="E:\\语音音频+标注\\米津玄师\\src") - spk_id5 = gr.Slider(minimum=0, maximum=4, step=1, label=i18n("请指定说话人id"), value=0,interactive=True) - but1=gr.Button(i18n("处理数据"), variant="primary") - info1=gr.Textbox(label=i18n("输出信息"),value="") - but1.click(preprocess_dataset,[trainset_dir4,exp_dir1,sr2],[info1]) + trainset_dir4 = gr.Textbox( + label=i18n("输入训练文件夹路径"), value="E:\\语音音频+标注\\米津玄师\\src" + ) + spk_id5 = gr.Slider( + minimum=0, + maximum=4, + step=1, + label=i18n("请指定说话人id"), + value=0, + interactive=True, + ) + but1 = gr.Button(i18n("处理数据"), variant="primary") + info1 = gr.Textbox(label=i18n("输出信息"), value="") + but1.click( + preprocess_dataset, [trainset_dir4, exp_dir1, sr2], [info1] + ) with gr.Group(): gr.Markdown(value=i18n("step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)")) with gr.Row(): with gr.Column(): - gpus6 = gr.Textbox(label=i18n("以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2"),value=gpus,interactive=True) - gpu_info9 = gr.Textbox(label=i18n("显卡信息"),value=gpu_info) + gpus6 = gr.Textbox( + label=i18n("以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2"), + value=gpus, + interactive=True, + ) + gpu_info9 = gr.Textbox(label=i18n("显卡信息"), value=gpu_info) with gr.Column(): - np7 = gr.Slider(minimum=0, maximum=ncpu, step=1, label=i18n("提取音高使用的CPU进程数"), value=ncpu,interactive=True) - f0method8 = gr.Radio(label=i18n("选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢"), choices=["pm", "harvest","dio"], value="harvest", interactive=True) - but2=gr.Button(i18n("特征提取"), variant="primary") - info2=gr.Textbox(label=i18n("输出信息"),value="",max_lines=8) - but2.click(extract_f0_feature,[gpus6,np7,f0method8,if_f0_3,exp_dir1],[info2]) + np7 = gr.Slider( + minimum=0, + maximum=ncpu, + step=1, + label=i18n("提取音高使用的CPU进程数"), + value=ncpu, + interactive=True, + ) + f0method8 = gr.Radio( + label=i18n( + "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢" + ), + choices=["pm", "harvest", "dio"], + value="harvest", + interactive=True, + ) + but2 = gr.Button(i18n("特征提取"), variant="primary") + info2 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8) + but2.click( + extract_f0_feature, + [gpus6, np7, f0method8, if_f0_3, exp_dir1], + [info2], + ) with gr.Group(): gr.Markdown(value=i18n("step3: 填写训练设置, 开始训练模型和索引")) with gr.Row(): - save_epoch10 = gr.Slider(minimum=0, maximum=50, step=1, label=i18n("保存频率save_every_epoch"), value=5,interactive=True) - total_epoch11 = gr.Slider(minimum=0, maximum=1000, step=1, label=i18n("总训练轮数total_epoch"), value=20,interactive=True) - batch_size12 = gr.Slider(minimum=0, maximum=32, step=1, label='每张显卡的batch_size', value=4,interactive=True) - if_save_latest13 = gr.Radio(label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), choices=["是", "否"], value="否", interactive=True) - if_cache_gpu17 = gr.Radio(label=i18n("是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速"), choices=["是", "否"], value="是", interactive=True) + save_epoch10 = gr.Slider( + minimum=0, + maximum=50, + step=1, + label=i18n("保存频率save_every_epoch"), + value=5, + interactive=True, + ) + total_epoch11 = gr.Slider( + minimum=0, + maximum=1000, + step=1, + label=i18n("总训练轮数total_epoch"), + value=20, + interactive=True, + ) + batch_size12 = gr.Slider( + minimum=0, + maximum=32, + step=1, + label="每张显卡的batch_size", + value=4, + interactive=True, + ) + if_save_latest13 = gr.Radio( + label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"), + choices=["是", "否"], + value="否", + interactive=True, + ) + if_cache_gpu17 = gr.Radio( + label=i18n( + "是否缓存所有训练集至显存. 
10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速" + ), + choices=["是", "否"], + value="是", + interactive=True, + ) with gr.Row(): - pretrained_G14 = gr.Textbox(label=i18n("加载预训练底模G路径"), value="pretrained/f0G40k.pth",interactive=True) - pretrained_D15 = gr.Textbox(label=i18n("加载预训练底模D路径"), value="pretrained/f0D40k.pth",interactive=True) - sr2.change(change_sr2, [sr2,if_f0_3], [pretrained_G14,pretrained_D15]) - if_f0_3.change(change_f0, [if_f0_3, sr2], [np7, f0method8, pretrained_G14, pretrained_D15]) - gpus16 = gr.Textbox(label=i18n("以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2"), value=gpus,interactive=True) + pretrained_G14 = gr.Textbox( + label=i18n("加载预训练底模G路径"), + value="pretrained/f0G40k.pth", + interactive=True, + ) + pretrained_D15 = gr.Textbox( + label=i18n("加载预训练底模D路径"), + value="pretrained/f0D40k.pth", + interactive=True, + ) + sr2.change( + change_sr2, [sr2, if_f0_3], [pretrained_G14, pretrained_D15] + ) + if_f0_3.change( + change_f0, + [if_f0_3, sr2], + [np7, f0method8, pretrained_G14, pretrained_D15], + ) + gpus16 = gr.Textbox( + label=i18n("以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2"), + value=gpus, + interactive=True, + ) but3 = gr.Button(i18n("训练模型"), variant="primary") but4 = gr.Button(i18n("训练特征索引"), variant="primary") but5 = gr.Button(i18n("一键训练"), variant="primary") - info3 = gr.Textbox(label=i18n("输出信息"), value="",max_lines=10) - but3.click(click_train,[exp_dir1,sr2,if_f0_3,spk_id5,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16,if_cache_gpu17],info3) - but4.click(train_index,[exp_dir1],info3) - but5.click(train1key,[exp_dir1,sr2,if_f0_3,trainset_dir4,spk_id5,gpus6,np7,f0method8,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16,if_cache_gpu17],info3) + info3 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=10) + but3.click( + click_train, + [ + exp_dir1, + sr2, + if_f0_3, + spk_id5, + save_epoch10, + total_epoch11, + batch_size12, + if_save_latest13, + pretrained_G14, + pretrained_D15, + gpus16, + if_cache_gpu17, + ], + info3, + ) + but4.click(train_index, [exp_dir1], info3) + but5.click( + train1key, + [ + exp_dir1, + sr2, + if_f0_3, + trainset_dir4, + spk_id5, + gpus6, + np7, + f0method8, + save_epoch10, + total_epoch11, + batch_size12, + if_save_latest13, + pretrained_G14, + pretrained_D15, + gpus16, + if_cache_gpu17, + ], + info3, + ) with gr.TabItem(i18n("ckpt处理")): with gr.Group(): @@ -584,45 +1242,109 @@ with gr.Blocks() as app: with gr.Row(): ckpt_a = gr.Textbox(label=i18n("A模型路径"), value="", interactive=True) ckpt_b = gr.Textbox(label=i18n("B模型路径"), value="", interactive=True) - alpha_a = gr.Slider(minimum=0, maximum=1, label=i18n("A模型权重"), value=0.5, interactive=True) + alpha_a = gr.Slider( + minimum=0, + maximum=1, + label=i18n("A模型权重"), + value=0.5, + interactive=True, + ) with gr.Row(): - sr_ = gr.Radio(label=i18n("目标采样率"), choices=["32k","40k","48k"],value="40k", interactive=True) - if_f0_ = gr.Radio(label=i18n("模型是否带音高指导"), choices=["是","否"],value="是", interactive=True) - info__ = gr.Textbox(label=i18n("要置入的模型信息"), value="", max_lines=8, interactive=True) - name_to_save0=gr.Textbox(label=i18n("保存的模型名不带后缀"), value="", max_lines=1, interactive=True) + sr_ = gr.Radio( + label=i18n("目标采样率"), + choices=["32k", "40k", "48k"], + value="40k", + interactive=True, + ) + if_f0_ = gr.Radio( + label=i18n("模型是否带音高指导"), + choices=["是", "否"], + value="是", + interactive=True, + ) + info__ = gr.Textbox( + label=i18n("要置入的模型信息"), value="", max_lines=8, interactive=True + ) + name_to_save0 = gr.Textbox( + 
label=i18n("保存的模型名不带后缀"), + value="", + max_lines=1, + interactive=True, + ) with gr.Row(): but6 = gr.Button(i18n("融合"), variant="primary") info4 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8) - but6.click(merge, [ckpt_a,ckpt_b,alpha_a,sr_,if_f0_,info__,name_to_save0], info4)#def merge(path1,path2,alpha1,sr,f0,info): + but6.click( + merge, + [ckpt_a, ckpt_b, alpha_a, sr_, if_f0_, info__, name_to_save0], + info4, + ) # def merge(path1,path2,alpha1,sr,f0,info): with gr.Group(): gr.Markdown(value=i18n("修改模型信息(仅支持weights文件夹下提取的小模型文件)")) with gr.Row(): - ckpt_path0 = gr.Textbox(label=i18n("模型路径"), value="", interactive=True) - info_=gr.Textbox(label=i18n("要改的模型信息"), value="", max_lines=8, interactive=True) - name_to_save1=gr.Textbox(label=i18n("保存的文件名, 默认空为和源文件同名"), value="", max_lines=8, interactive=True) + ckpt_path0 = gr.Textbox( + label=i18n("模型路径"), value="", interactive=True + ) + info_ = gr.Textbox( + label=i18n("要改的模型信息"), value="", max_lines=8, interactive=True + ) + name_to_save1 = gr.Textbox( + label=i18n("保存的文件名, 默认空为和源文件同名"), + value="", + max_lines=8, + interactive=True, + ) with gr.Row(): but7 = gr.Button(i18n("修改"), variant="primary") info5 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8) - but7.click(change_info, [ckpt_path0,info_,name_to_save1], info5) + but7.click(change_info, [ckpt_path0, info_, name_to_save1], info5) with gr.Group(): gr.Markdown(value=i18n("查看模型信息(仅支持weights文件夹下提取的小模型文件)")) with gr.Row(): - ckpt_path1 = gr.Textbox(label=i18n("模型路径"), value="", interactive=True) + ckpt_path1 = gr.Textbox( + label=i18n("模型路径"), value="", interactive=True + ) but8 = gr.Button(i18n("查看"), variant="primary") info6 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8) but8.click(show_info, [ckpt_path1], info6) with gr.Group(): - gr.Markdown(value=i18n("模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况")) + gr.Markdown( + value=i18n( + "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况" + ) + ) with gr.Row(): - ckpt_path2 = gr.Textbox(label=i18n("模型路径"), value="E:\\codes\\py39\\logs\\mi-test_f0_48k\\G_23333.pth", interactive=True) - save_name = gr.Textbox(label=i18n("保存名"), value="", interactive=True) - sr__ = gr.Radio(label=i18n("目标采样率"), choices=["32k","40k","48k"],value="40k", interactive=True) - if_f0__ = gr.Radio(label=i18n("模型是否带音高指导,1是0否"), choices=["1","0"],value="1", interactive=True) - info___ = gr.Textbox(label=i18n("要置入的模型信息"), value="", max_lines=8, interactive=True) + ckpt_path2 = gr.Textbox( + label=i18n("模型路径"), + value="E:\\codes\\py39\\logs\\mi-test_f0_48k\\G_23333.pth", + interactive=True, + ) + save_name = gr.Textbox( + label=i18n("保存名"), value="", interactive=True + ) + sr__ = gr.Radio( + label=i18n("目标采样率"), + choices=["32k", "40k", "48k"], + value="40k", + interactive=True, + ) + if_f0__ = gr.Radio( + label=i18n("模型是否带音高指导,1是0否"), + choices=["1", "0"], + value="1", + interactive=True, + ) + info___ = gr.Textbox( + label=i18n("要置入的模型信息"), value="", max_lines=8, interactive=True + ) but9 = gr.Button(i18n("提取"), variant="primary") info7 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8) - ckpt_path2.change(change_info_,[ckpt_path2],[sr__,if_f0__]) - but9.click(extract_small_model, [ckpt_path2,save_name,sr__,if_f0__,info___], info7) + ckpt_path2.change(change_info_, [ckpt_path2], [sr__, if_f0__]) + but9.click( + extract_small_model, + [ckpt_path2, save_name, sr__, if_f0__, info___], + info7, + ) # with gr.TabItem(i18n("招募音高曲线前端编辑器")): # gr.Markdown(value=i18n("加开发群联系我xxxxx")) @@ -632,4 +1354,9 @@ with gr.Blocks() 
as app: if iscolab: app.queue(concurrency_count=511, max_size=1022).launch(share=True) else: - app.queue(concurrency_count=511, max_size=1022).launch(server_name="0.0.0.0",inbrowser=not noautoopen,server_port=listen_port,quiet=True) + app.queue(concurrency_count=511, max_size=1022).launch( + server_name="0.0.0.0", + inbrowser=not noautoopen, + server_port=listen_port, + quiet=True, + ) diff --git a/infer/infer-pm-index256.py b/infer/infer-pm-index256.py index add0245..5060345 100644 --- a/infer/infer-pm-index256.py +++ b/infer/infer-pm-index256.py @@ -1,14 +1,19 @@ -''' +""" 对源特征进行检索 -''' -import torch, pdb, os,parselmouth -os.environ["CUDA_VISIBLE_DEVICES"]="0" +""" +import torch, pdb, os, parselmouth + +os.environ["CUDA_VISIBLE_DEVICES"] = "0" import numpy as np import soundfile as sf + # from models import SynthesizerTrn256#hifigan_nonsf # from infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256#hifigan_nsf -from infer_pack.models import SynthesizerTrnMs256NSFsid as SynthesizerTrn256#hifigan_nsf +from infer_pack.models import ( + SynthesizerTrnMs256NSFsid as SynthesizerTrn256, +) # hifigan_nsf + # from infer_pack.models import SynthesizerTrnMs256NSFsid_sim as SynthesizerTrn256#hifigan_nsf # from models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf # from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf @@ -16,15 +21,17 @@ from infer_pack.models import SynthesizerTrnMs256NSFsid as SynthesizerTrn256#hif from scipy.io import wavfile from fairseq import checkpoint_utils + # import pyworld import librosa import torch.nn.functional as F import scipy.signal as signal + # import torchcrepe from time import time as ttime device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -model_path = r"E:\codes\py39\vits_vc_gpu_train\hubert_base.pt"# +model_path = r"E:\codes\py39\vits_vc_gpu_train\hubert_base.pt" # print("load model(s) from {}".format(model_path)) models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( [model_path], @@ -37,7 +44,26 @@ model.eval() # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)#hifigan#512#256 # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],109,256,is_half=True)#hifigan#512#256 -net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)#hifigan#512#256#no_dropout +net_g = SynthesizerTrn256( + 1025, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 10, 2, 2], + 512, + [16, 16, 4, 4], + 183, + 256, + is_half=True, +) # hifigan#512#256#no_dropout # net_g = SynthesizerTrn256(1025,32,192,192,768,2,3,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],0)#ts3 # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2],512,[16,16,4],0)#hifigan-ps-sr # @@ -48,51 +74,66 @@ net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0,"1", [3,7,11],[[1,3,5], [1 # weights=torch.load("infer/ft-mi-freeze-vocoder-flow-enc_q_1k.pt") # weights=torch.load("infer/ft-mi-freeze-vocoder_true_1k.pt") # weights=torch.load("infer/ft-mi-sim1k.pt") -weights=torch.load("infer/ft-mi-no_opt-no_dropout.pt") -print(net_g.load_state_dict(weights,strict=True)) +weights = torch.load("infer/ft-mi-no_opt-no_dropout.pt") 
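+# load_state_dict with strict=True raises on any missing or unexpected key,
+# so the print below can only ever report "<All keys matched successfully>",
+# which doubles as a quick sanity check that the checkpoint fits the model.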
+print(net_g.load_state_dict(weights, strict=True)) net_g.eval().to(device) net_g.half() -def get_f0(x, p_len,f0_up_key=0): + +def get_f0(x, p_len, f0_up_key=0): time_step = 160 / 16000 * 1000 f0_min = 50 f0_max = 1100 f0_mel_min = 1127 * np.log(1 + f0_min / 700) f0_mel_max = 1127 * np.log(1 + f0_max / 700) - f0 = parselmouth.Sound(x, 16000).to_pitch_ac( - time_step=time_step / 1000, voicing_threshold=0.6, - pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency'] + f0 = ( + parselmouth.Sound(x, 16000) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=f0_min, + pitch_ceiling=f0_max, + ) + .selected_array["frequency"] + ) - pad_size=(p_len - len(f0) + 1) // 2 - if(pad_size>0 or p_len - len(f0) - pad_size>0): - f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant') + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") f0 *= pow(2, f0_up_key / 12) f0bak = f0.copy() f0_mel = 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1 + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 f0_mel[f0_mel <= 1] = 1 f0_mel[f0_mel > 255] = 255 # f0_mel[f0_mel > 188] = 188 f0_coarse = np.rint(f0_mel).astype(np.int) return f0_coarse, f0bak + import faiss -index=faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index") -big_npy=np.load("infer/big_src_feature_mi.npy") -ta0=ta1=ta2=0 -for idx,name in enumerate(["冬之花clip1.wav",]):## - wav_path = "todo-songs/%s" % name# - f0_up_key=-2# + +index = faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index") +big_npy = np.load("infer/big_src_feature_mi.npy") +ta0 = ta1 = ta2 = 0 +for idx, name in enumerate( + [ + "冬之花clip1.wav", + ] +): ## + wav_path = "todo-songs/%s" % name # + f0_up_key = -2 # audio, sampling_rate = sf.read(wav_path) if len(audio.shape) > 1: audio = librosa.to_mono(audio.transpose(1, 0)) if sampling_rate != 16000: audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000) - feats = torch.from_numpy(audio).float() if feats.dim() == 2: # double channels feats = feats.mean(-1) @@ -104,8 +145,9 @@ for idx,name in enumerate(["冬之花clip1.wav",]):## "padding_mask": padding_mask.to(device), "output_layer": 9, # layer 9 } - if torch.cuda.is_available(): torch.cuda.synchronize() - t0=ttime() + if torch.cuda.is_available(): + torch.cuda.synchronize() + t0 = ttime() with torch.no_grad(): logits = model.extract_features(**inputs) feats = model.final_proj(logits[0]) @@ -113,35 +155,45 @@ for idx,name in enumerate(["冬之花clip1.wav",]):## ####索引优化 npy = feats[0].cpu().numpy().astype("float32") D, I = index.search(npy, 1) - feats = torch.from_numpy(big_npy[I.squeeze()].astype("float16")).unsqueeze(0).to(device) + feats = ( + torch.from_numpy(big_npy[I.squeeze()].astype("float16")).unsqueeze(0).to(device) + ) - feats=F.interpolate(feats.permute(0,2,1),scale_factor=2).permute(0,2,1) - if torch.cuda.is_available(): torch.cuda.synchronize() - t1=ttime() + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + if torch.cuda.is_available(): + torch.cuda.synchronize() + t1 = ttime() # p_len = min(feats.shape[1],10000,pitch.shape[0])#太大了爆显存 - p_len = min(feats.shape[1],10000)# - pitch, pitchf = get_f0(audio, p_len,f0_up_key) - p_len = min(feats.shape[1],10000,pitch.shape[0])#太大了爆显存 - if torch.cuda.is_available(): 
torch.cuda.synchronize() - t2=ttime() - feats = feats[:,:p_len, :] + p_len = min(feats.shape[1], 10000) # + pitch, pitchf = get_f0(audio, p_len, f0_up_key) + p_len = min(feats.shape[1], 10000, pitch.shape[0]) # 太大了爆显存 + if torch.cuda.is_available(): + torch.cuda.synchronize() + t2 = ttime() + feats = feats[:, :p_len, :] pitch = pitch[:p_len] pitchf = pitchf[:p_len] p_len = torch.LongTensor([p_len]).to(device) pitch = torch.LongTensor(pitch).unsqueeze(0).to(device) - sid=torch.LongTensor([0]).to(device) + sid = torch.LongTensor([0]).to(device) pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device) with torch.no_grad(): - audio = net_g.infer(feats, p_len,pitch,pitchf,sid)[0][0, 0].data.cpu().float().numpy()#nsf - if torch.cuda.is_available(): torch.cuda.synchronize() - t3=ttime() - ta0+=(t1-t0) - ta1+=(t2-t1) - ta2+=(t3-t2) + audio = ( + net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] + .data.cpu() + .float() + .numpy() + ) # nsf + if torch.cuda.is_available(): + torch.cuda.synchronize() + t3 = ttime() + ta0 += t1 - t0 + ta1 += t2 - t1 + ta2 += t3 - t2 # wavfile.write("ft-mi_1k-index256-noD-%s.wav"%name, 40000, audio)## # wavfile.write("ft-mi-freeze-vocoder-flow-enc_q_1k-%s.wav"%name, 40000, audio)## # wavfile.write("ft-mi-sim1k-%s.wav"%name, 40000, audio)## - wavfile.write("ft-mi-no_opt-no_dropout-%s.wav"%name, 40000, audio)## + wavfile.write("ft-mi-no_opt-no_dropout-%s.wav" % name, 40000, audio) ## -print(ta0,ta1,ta2)# +print(ta0, ta1, ta2) # diff --git a/infer/train-index.py b/infer/train-index.py index 847472c..c49f24b 100644 --- a/infer/train-index.py +++ b/infer/train-index.py @@ -1,31 +1,31 @@ -''' +""" 格式:直接cid为自带的index位;aid放不下了,通过字典来查,反正就5w个 -''' -import faiss,numpy as np,os +""" +import faiss, numpy as np, os # ###########如果是原始特征要先写save -inp_root=r"E:\codes\py39\dataset\mi\2-co256" -npys=[] +inp_root = r"E:\codes\py39\dataset\mi\2-co256" +npys = [] for name in sorted(list(os.listdir(inp_root))): - phone=np.load("%s/%s"%(inp_root,name)) + phone = np.load("%s/%s" % (inp_root, name)) npys.append(phone) -big_npy=np.concatenate(npys,0) -print(big_npy.shape)#(6196072, 192)#fp32#4.43G -np.save("infer/big_src_feature_mi.npy",big_npy) +big_npy = np.concatenate(npys, 0) +print(big_npy.shape) # (6196072, 192)#fp32#4.43G +np.save("infer/big_src_feature_mi.npy", big_npy) ##################train+add # big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy") print(big_npy.shape) -index = faiss.index_factory(256, "IVF512,Flat")#mi +index = faiss.index_factory(256, "IVF512,Flat") # mi print("training") -index_ivf = faiss.extract_index_ivf(index)# +index_ivf = faiss.extract_index_ivf(index) # index_ivf.nprobe = 9 index.train(big_npy) -faiss.write_index(index, 'infer/trained_IVF512_Flat_mi_baseline_src_feat.index') +faiss.write_index(index, "infer/trained_IVF512_Flat_mi_baseline_src_feat.index") print("adding") index.add(big_npy) -faiss.write_index(index,"infer/added_IVF512_Flat_mi_baseline_src_feat.index") -''' +faiss.write_index(index, "infer/added_IVF512_Flat_mi_baseline_src_feat.index") +""" 大小(都是FP32) big_src_feature 2.95G (3098036, 256) @@ -33,4 +33,4 @@ big_emb 4.43G (6196072, 192) big_emb双倍是因为求特征要repeat后再加pitch -''' \ No newline at end of file +""" diff --git a/infer/trans_weights.py b/infer/trans_weights.py index 1845d7d..e0f7f0c 100644 --- a/infer/trans_weights.py +++ b/infer/trans_weights.py @@ -1,11 +1,16 @@ -import torch,pdb +import torch, pdb # 
a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-suc\G_1000.pth")["model"]#sim_nsf# # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder-flow-enc_q\G_1000.pth")["model"]#sim_nsf# # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder\G_1000.pth")["model"]#sim_nsf# # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-test\G_1000.pth")["model"]#sim_nsf# -a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-no_opt-no_dropout\G_1000.pth")["model"]#sim_nsf# -for key in a.keys():a[key]=a[key].half() +a = torch.load( + r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-no_opt-no_dropout\G_1000.pth" +)[ + "model" +] # sim_nsf# +for key in a.keys(): + a[key] = a[key].half() # torch.save(a,"ft-mi-freeze-vocoder_true_1k.pt")# # torch.save(a,"ft-mi-sim1k.pt")# -torch.save(a,"ft-mi-no_opt-no_dropout.pt")# +torch.save(a, "ft-mi-no_opt-no_dropout.pt") # diff --git a/infer_pack/commons.py b/infer_pack/commons.py index a66e6e4..4937729 100644 --- a/infer_pack/commons.py +++ b/infer_pack/commons.py @@ -48,8 +48,10 @@ def slice_segments(x, ids_str, segment_size=4): idx_end = idx_str + segment_size ret[i] = x[i, :, idx_str:idx_end] return ret + + def slice_segments2(x, ids_str, segment_size=4): - ret = torch.zeros_like(x[:, :segment_size]) + ret = torch.zeros_like(x[:, :segment_size]) for i in range(x.size(0)): idx_str = ids_str[i] idx_end = idx_str + segment_size diff --git a/infer_pack/models.py b/infer_pack/models.py index 9eda8be..a7e688e 100644 --- a/infer_pack/models.py +++ b/infer_pack/models.py @@ -1,4 +1,4 @@ -import math,pdb,os +import math, pdb, os from time import time as ttime import torch from torch import nn @@ -12,9 +12,20 @@ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm from infer_pack.commons import init_weights import numpy as np from infer_pack import commons + + class TextEncoder256(nn.Module): def __init__( - self, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, f0=True ): + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): super().__init__() self.out_channels = out_channels self.hidden_channels = hidden_channels @@ -24,8 +35,8 @@ class TextEncoder256(nn.Module): self.kernel_size = kernel_size self.p_dropout = p_dropout self.emb_phone = nn.Linear(256, hidden_channels) - self.lrelu=nn.LeakyReLU(0.1,inplace=True) - if(f0==True): + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 self.encoder = attentions.Encoder( hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout @@ -33,12 +44,12 @@ class TextEncoder256(nn.Module): self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) def forward(self, phone, pitch, lengths): - if(pitch==None): + if pitch == None: x = self.emb_phone(phone) else: x = self.emb_phone(phone) + self.emb_pitch(pitch) x = x * math.sqrt(self.hidden_channels) # [b, t, h] - x=self.lrelu(x) + x = self.lrelu(x) x = torch.transpose(x, 1, -1) # [b, h, t] x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( x.dtype @@ -48,8 +59,20 @@ class TextEncoder256(nn.Module): m, logs = torch.split(stats, self.out_channels, dim=1) return m, logs, x_mask + + class TextEncoder256Sim(nn.Module): - def __init__( self, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, f0=True): + def __init__( + self, + out_channels, + 
hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): super().__init__() self.out_channels = out_channels self.hidden_channels = hidden_channels @@ -59,8 +82,8 @@ class TextEncoder256Sim(nn.Module): self.kernel_size = kernel_size self.p_dropout = p_dropout self.emb_phone = nn.Linear(256, hidden_channels) - self.lrelu=nn.LeakyReLU(0.1,inplace=True) - if(f0==True): + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 self.encoder = attentions.Encoder( hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout @@ -68,17 +91,21 @@ class TextEncoder256Sim(nn.Module): self.proj = nn.Conv1d(hidden_channels, out_channels, 1) def forward(self, phone, pitch, lengths): - if(pitch==None): + if pitch == None: x = self.emb_phone(phone) else: x = self.emb_phone(phone) + self.emb_pitch(pitch) x = x * math.sqrt(self.hidden_channels) # [b, t, h] - x=self.lrelu(x) + x = self.lrelu(x) x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(x.dtype) + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) x = self.encoder(x * x_mask, x_mask) x = self.proj(x) * x_mask - return x,x_mask + return x, x_mask + + class ResidualCouplingBlock(nn.Module): def __init__( self, @@ -126,6 +153,8 @@ class ResidualCouplingBlock(nn.Module): def remove_weight_norm(self): for i in range(self.n_flows): self.flows[i * 2].remove_weight_norm() + + class PosteriorEncoder(nn.Module): def __init__( self, @@ -169,6 +198,8 @@ class PosteriorEncoder(nn.Module): def remove_weight_norm(self): self.enc.remove_weight_norm() + + class Generator(torch.nn.Module): def __init__( self, @@ -243,8 +274,10 @@ class Generator(torch.nn.Module): remove_weight_norm(l) for l in self.resblocks: l.remove_weight_norm() + + class SineGen(torch.nn.Module): - """ Definition of sine generator + """Definition of sine generator SineGen(samp_rate, harmonic_num = 0, sine_amp = 0.1, noise_std = 0.003, voiced_threshold = 0, @@ -259,10 +292,15 @@ class SineGen(torch.nn.Module): segment is always sin(np.pi) or cos(0) """ - def __init__(self, samp_rate, harmonic_num=0, - sine_amp=0.1, noise_std=0.003, - voiced_threshold=0, - flag_for_pulse=False): + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False, + ): super(SineGen, self).__init__() self.sine_amp = sine_amp self.noise_std = noise_std @@ -277,8 +315,8 @@ class SineGen(torch.nn.Module): uv = uv * (f0 > self.voiced_threshold) return uv - def forward(self, f0,upp): - """ sine_tensor, uv = forward(f0) + def forward(self, f0, upp): + """sine_tensor, uv = forward(f0) input F0: tensor(batchsize=1, length, dim=1) f0 for unvoiced steps should be 0 output sine_tensor: tensor(batchsize=1, length, dim) @@ -286,32 +324,52 @@ class SineGen(torch.nn.Module): """ with torch.no_grad(): f0 = f0[:, None].transpose(1, 2) - f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,device=f0.device) + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) # fundamental component f0_buf[:, :, 0] = f0[:, :, 0] - for idx in np.arange(self.harmonic_num):f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)# idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic - rad_values = (f0_buf / self.sampling_rate) % 1###%1意味着n_har的乘积无法后处理优化 - rand_ini = torch.rand(f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device) + 
for idx in np.arange(self.harmonic_num): + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( + idx + 2 + ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic + rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 + rand_ini = torch.rand( + f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device + ) rand_ini[:, 0] = 0 rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini - tmp_over_one = torch.cumsum(rad_values, 1)# % 1 #####%1意味着后面的cumsum无法再优化 - tmp_over_one*=upp - tmp_over_one=F.interpolate(tmp_over_one.transpose(2, 1), scale_factor=upp, mode='linear', align_corners=True).transpose(2, 1) - rad_values=F.interpolate(rad_values.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)####### - tmp_over_one%=1 + tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 + tmp_over_one *= upp + tmp_over_one = F.interpolate( + tmp_over_one.transpose(2, 1), + scale_factor=upp, + mode="linear", + align_corners=True, + ).transpose(2, 1) + rad_values = F.interpolate( + rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose( + 2, 1 + ) ####### + tmp_over_one %= 1 tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 cumsum_shift = torch.zeros_like(rad_values) cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 - sine_waves = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi) + sine_waves = torch.sin( + torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi + ) sine_waves = sine_waves * self.sine_amp uv = self._f02uv(f0) - uv = F.interpolate(uv.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1) + uv = F.interpolate( + uv.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose(2, 1) noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 noise = noise_amp * torch.randn_like(sine_waves) sine_waves = sine_waves * uv + noise return sine_waves, uv, noise + + class SourceModuleHnNSF(torch.nn.Module): - """ SourceModule for hn-nsf + """SourceModule for hn-nsf SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshod=0) sampling_rate: sampling_rate in Hz @@ -328,26 +386,37 @@ class SourceModuleHnNSF(torch.nn.Module): uv (batchsize, length, 1) """ - def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1, - add_noise_std=0.003, voiced_threshod=0,is_half=True): + def __init__( + self, + sampling_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshod=0, + is_half=True, + ): super(SourceModuleHnNSF, self).__init__() self.sine_amp = sine_amp self.noise_std = add_noise_std - self.is_half=is_half + self.is_half = is_half # to produce sine waveforms - self.l_sin_gen = SineGen(sampling_rate, harmonic_num, - sine_amp, add_noise_std, voiced_threshod) + self.l_sin_gen = SineGen( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) # to merge source harmonics into a single excitation self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) self.l_tanh = torch.nn.Tanh() - def forward(self, x,upp=None): - sine_wavs, uv, _ = self.l_sin_gen(x,upp) - if(self.is_half):sine_wavs=sine_wavs.half() + def forward(self, x, upp=None): + sine_wavs, uv, _ = self.l_sin_gen(x, upp) + if self.is_half: + sine_wavs = sine_wavs.half() sine_merge = self.l_tanh(self.l_linear(sine_wavs)) - return sine_merge,None,None# noise, uv + return sine_merge, None, None # noise, uv + + class GeneratorNSF(torch.nn.Module): def __init__( self, @@ -360,7 +429,7 @@ class GeneratorNSF(torch.nn.Module): upsample_kernel_sizes, gin_channels, sr, - 
is_half=False
+        is_half=False,
     ):
         super(GeneratorNSF, self).__init__()
         self.num_kernels = len(resblock_kernel_sizes)
@@ -368,9 +437,7 @@ class GeneratorNSF(torch.nn.Module):

         self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
         self.m_source = SourceModuleHnNSF(
-            sampling_rate=sr,
-            harmonic_num=0,
-            is_half=is_half
+            sampling_rate=sr, harmonic_num=0, is_half=is_half
         )
         self.noise_convs = nn.ModuleList()
         self.conv_pre = Conv1d(
@@ -393,9 +460,16 @@ class GeneratorNSF(torch.nn.Module):
                 )
             )
             if i + 1 < len(upsample_rates):
-                stride_f0 = np.prod(upsample_rates[i + 1:])
-                self.noise_convs.append(Conv1d(
-                    1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2))
+                stride_f0 = np.prod(upsample_rates[i + 1 :])
+                self.noise_convs.append(
+                    Conv1d(
+                        1,
+                        c_cur,
+                        kernel_size=stride_f0 * 2,
+                        stride=stride_f0,
+                        padding=stride_f0 // 2,
+                    )
+                )
             else:
                 self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
@@ -413,10 +487,10 @@ class GeneratorNSF(torch.nn.Module):
         if gin_channels != 0:
             self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
-        self.upp=np.prod(upsample_rates)
+        self.upp = np.prod(upsample_rates)

-    def forward(self, x, f0,g=None):
-        har_source, noi_source, uv = self.m_source(f0,self.upp)
+    def forward(self, x, f0, g=None):
+        har_source, noi_source, uv = self.m_source(f0, self.upp)
         har_source = har_source.transpose(1, 2)
         x = self.conv_pre(x)
         if g is not None:
@@ -444,11 +518,15 @@ class GeneratorNSF(torch.nn.Module):
             remove_weight_norm(l)
         for l in self.resblocks:
             l.remove_weight_norm()
-sr2sr={
-    "32k":32000,
-    "40k":40000,
-    "48k":48000,
+
+
+sr2sr = {
+    "32k": 32000,
+    "40k": 40000,
+    "48k": 48000,
 }
+
+
 class SynthesizerTrnMs256NSFsid(nn.Module):
     def __init__(
         self,
@@ -472,10 +550,9 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
         sr,
         **kwargs
     ):
-
         super().__init__()
-        if(type(sr)==type("strr")):
-            sr=sr2sr[sr]
+        if type(sr) == type("strr"):
+            sr = sr2sr[sr]
         self.spec_channels = spec_channels
         self.inter_channels = inter_channels
         self.hidden_channels = hidden_channels
@@ -493,7 +570,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
         self.segment_size = segment_size
         self.gin_channels = gin_channels
         # self.hop_length = hop_length#
-        self.spk_embed_dim=spk_embed_dim
+        self.spk_embed_dim = spk_embed_dim
         self.enc_p = TextEncoder256(
             inter_channels,
             hidden_channels,
@@ -511,7 +588,9 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
             upsample_rates,
             upsample_initial_channel,
             upsample_kernel_sizes,
-            gin_channels=gin_channels, sr=sr, is_half=kwargs["is_half"]
+            gin_channels=gin_channels,
+            sr=sr,
+            is_half=kwargs["is_half"],
         )
         self.enc_q = PosteriorEncoder(
             spec_channels,
@@ -526,13 +605,16 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
             inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
         )
         self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
-        print("gin_channels:",gin_channels,"self.spk_embed_dim:",self.spk_embed_dim)
+        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
+
     def remove_weight_norm(self):
         self.dec.remove_weight_norm()
         self.flow.remove_weight_norm()
         self.enc_q.remove_weight_norm()

-    def forward(self, phone, phone_lengths, pitch,pitchf, y, y_lengths,ds):#here ds is the speaker id, [bs, 1]
+    def forward(
+        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
+    ):  # here ds is the speaker id, [bs, 1]
         # print(1,pitch.shape)#[bs,t]
         g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1] ## the 1 is t, for broadcasting
         m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
@@ -542,20 +624,20 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
             z, y_lengths, self.segment_size
         )
         # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
-        pitchf = commons.slice_segments2(
-            pitchf, ids_slice, self.segment_size
-        )
+        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
         # print(-2,pitchf.shape,z_slice.shape)
-        o = self.dec(z_slice,pitchf, g=g)
+        o = self.dec(z_slice, pitchf, g=g)
         return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

-    def infer(self, phone, phone_lengths, pitch, nsff0,sid,max_len=None):
+    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
         g = self.emb_g(sid).unsqueeze(-1)
         m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
         z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
         z = self.flow(z_p, x_mask, g=g, reverse=True)
-        o = self.dec((z * x_mask)[:, :, :max_len], nsff0,g=g)
+        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
         return o, x_mask, (z, z_p, m_p, logs_p)
+
+
 class SynthesizerTrnMs256NSFsid_nono(nn.Module):
     def __init__(
         self,
@@ -579,7 +661,6 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
         sr=None,
         **kwargs
     ):
-
         super().__init__()
         self.spec_channels = spec_channels
         self.inter_channels = inter_channels
@@ -598,7 +679,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
         self.segment_size = segment_size
         self.gin_channels = gin_channels
         # self.hop_length = hop_length#
-        self.spk_embed_dim=spk_embed_dim
+        self.spk_embed_dim = spk_embed_dim
         self.enc_p = TextEncoder256(
             inter_channels,
             hidden_channels,
@@ -606,7 +687,8 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
             n_heads,
             n_layers,
             kernel_size,
-            p_dropout,f0=False
+            p_dropout,
+            f0=False,
         )
         self.dec = Generator(
             inter_channels,
@@ -616,7 +698,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
             upsample_rates,
             upsample_initial_channel,
             upsample_kernel_sizes,
-            gin_channels=gin_channels
+            gin_channels=gin_channels,
         )
         self.enc_q = PosteriorEncoder(
             spec_channels,
@@ -631,14 +713,14 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
             inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
         )
         self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
-        print("gin_channels:",gin_channels,"self.spk_embed_dim:",self.spk_embed_dim)
+        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

     def remove_weight_norm(self):
         self.dec.remove_weight_norm()
         self.flow.remove_weight_norm()
         self.enc_q.remove_weight_norm()

-    def forward(self, phone, phone_lengths, y, y_lengths,ds):#here ds is the speaker id, [bs, 1]
+    def forward(self, phone, phone_lengths, y, y_lengths, ds):  # here ds is the speaker id, [bs, 1]
         g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1] ## the 1 is t, for broadcasting
         m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
         z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
@@ -649,13 +731,15 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
             o = self.dec(z_slice, g=g)
         return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

-    def infer(self, phone, phone_lengths,sid,max_len=None):
+    def infer(self, phone, phone_lengths, sid, max_len=None):
         g = self.emb_g(sid).unsqueeze(-1)
         m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
         z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
         z = self.flow(z_p, x_mask, g=g, reverse=True)
-        o = self.dec((z * x_mask)[:, :, :max_len],g=g)
+        o = self.dec((z * x_mask)[:, :, :max_len], g=g)
         return o, x_mask, (z, z_p, m_p, logs_p)
+
+
 class SynthesizerTrnMs256NSFsid_sim(nn.Module):
     """
     Synthesizer for Training
@@ -684,7 +768,6 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
         use_sdp=True,
         **kwargs
    ):
-
        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
@@ -703,7 +786,7 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
         self.segment_size = segment_size
         self.gin_channels = gin_channels
         # self.hop_length = hop_length#
-        self.spk_embed_dim=spk_embed_dim
+        self.spk_embed_dim = spk_embed_dim
         self.enc_p = TextEncoder256Sim(
             inter_channels,
             hidden_channels,
@@ -721,20 +804,24 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
             upsample_rates,
             upsample_initial_channel,
             upsample_kernel_sizes,
-            gin_channels=gin_channels,is_half=kwargs["is_half"]
+            gin_channels=gin_channels,
+            is_half=kwargs["is_half"],
         )
         self.flow = ResidualCouplingBlock(
             inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
         )
         self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
-        print("gin_channels:",gin_channels,"self.spk_embed_dim:",self.spk_embed_dim)
+        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
+
     def remove_weight_norm(self):
         self.dec.remove_weight_norm()
         self.flow.remove_weight_norm()
         self.enc_q.remove_weight_norm()

-    def forward(self, phone, phone_lengths, pitch, pitchf, y_lengths,ds): # y (the spec) is no longer needed now
+    def forward(
+        self, phone, phone_lengths, pitch, pitchf, y_lengths, ds
+    ):  # y (the spec) is no longer needed now
         g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1] ## the 1 is t, for broadcasting
         x, x_mask = self.enc_p(phone, pitch, phone_lengths)
         x = self.flow(x, x_mask, g=g, reverse=True)
@@ -742,22 +829,24 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
             x, y_lengths, self.segment_size
         )
-        pitchf = commons.slice_segments2(
-            pitchf, ids_slice, self.segment_size
-        )
+        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
         o = self.dec(z_slice, pitchf, g=g)
         return o, ids_slice
-    def infer(self, phone, phone_lengths, pitch, pitchf, ds,max_len=None): # y (the spec) is no longer needed now
+
+    def infer(
+        self, phone, phone_lengths, pitch, pitchf, ds, max_len=None
+    ):  # y (the spec) is no longer needed now
         g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1] ## the 1 is t, for broadcasting
         x, x_mask = self.enc_p(phone, pitch, phone_lengths)
         x = self.flow(x, x_mask, g=g, reverse=True)
-        o = self.dec((x*x_mask)[:, :, :max_len], pitchf, g=g)
+        o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g)
         return o, o

+
 class MultiPeriodDiscriminator(torch.nn.Module):
     def __init__(self, use_spectral_norm=False):
         super(MultiPeriodDiscriminator, self).__init__()
-        periods = [2, 3, 5, 7, 11,17]
+        periods = [2, 3, 5, 7, 11, 17]
         # periods = [3, 5, 7, 11, 17, 23, 37]

         discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
@@ -767,7 +856,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
         self.discriminators = nn.ModuleList(discs)

     def forward(self, y, y_hat):
-        y_d_rs = []#
+        y_d_rs = []  #
         y_d_gs = []
         fmap_rs = []
         fmap_gs = []
@@ -783,6 +872,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):

         return y_d_rs, y_d_gs, fmap_rs, fmap_gs

+
 class DiscriminatorS(torch.nn.Module):
     def __init__(self, use_spectral_norm=False):
         super(DiscriminatorS, self).__init__()
@@ -812,6 +902,7 @@ class DiscriminatorS(torch.nn.Module):

         return x, fmap

+
 class DiscriminatorP(torch.nn.Module):
     def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
         super(DiscriminatorP, self).__init__()
@@ -889,4 +980,3 @@ class DiscriminatorP(torch.nn.Module):
         x = torch.flatten(x, 1, -1)
         return x, fmap
-
diff --git a/infer_pack/models_onnx.py b/infer_pack/models_onnx.py
index ea90d1c..a5f405c 100644
--- a/infer_pack/models_onnx.py
+++ b/infer_pack/models_onnx.py
@@ -1,4 +1,4 @@
-import
math,pdb,os +import math, pdb, os from time import time as ttime import torch from torch import nn @@ -12,9 +12,20 @@ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm from infer_pack.commons import init_weights import numpy as np from infer_pack import commons + + class TextEncoder256(nn.Module): def __init__( - self, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, f0=True ): + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): super().__init__() self.out_channels = out_channels self.hidden_channels = hidden_channels @@ -24,8 +35,8 @@ class TextEncoder256(nn.Module): self.kernel_size = kernel_size self.p_dropout = p_dropout self.emb_phone = nn.Linear(256, hidden_channels) - self.lrelu=nn.LeakyReLU(0.1,inplace=True) - if(f0==True): + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 self.encoder = attentions.Encoder( hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout @@ -33,12 +44,12 @@ class TextEncoder256(nn.Module): self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) def forward(self, phone, pitch, lengths): - if(pitch==None): + if pitch == None: x = self.emb_phone(phone) else: x = self.emb_phone(phone) + self.emb_pitch(pitch) x = x * math.sqrt(self.hidden_channels) # [b, t, h] - x=self.lrelu(x) + x = self.lrelu(x) x = torch.transpose(x, 1, -1) # [b, h, t] x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( x.dtype @@ -48,8 +59,20 @@ class TextEncoder256(nn.Module): m, logs = torch.split(stats, self.out_channels, dim=1) return m, logs, x_mask + + class TextEncoder256Sim(nn.Module): - def __init__( self, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, f0=True): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): super().__init__() self.out_channels = out_channels self.hidden_channels = hidden_channels @@ -59,8 +82,8 @@ class TextEncoder256Sim(nn.Module): self.kernel_size = kernel_size self.p_dropout = p_dropout self.emb_phone = nn.Linear(256, hidden_channels) - self.lrelu=nn.LeakyReLU(0.1,inplace=True) - if(f0==True): + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 self.encoder = attentions.Encoder( hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout @@ -68,17 +91,21 @@ class TextEncoder256Sim(nn.Module): self.proj = nn.Conv1d(hidden_channels, out_channels, 1) def forward(self, phone, pitch, lengths): - if(pitch==None): + if pitch == None: x = self.emb_phone(phone) else: x = self.emb_phone(phone) + self.emb_pitch(pitch) x = x * math.sqrt(self.hidden_channels) # [b, t, h] - x=self.lrelu(x) + x = self.lrelu(x) x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(x.dtype) + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) x = self.encoder(x * x_mask, x_mask) x = self.proj(x) * x_mask - return x,x_mask + return x, x_mask + + class ResidualCouplingBlock(nn.Module): def __init__( self, @@ -126,6 +153,8 @@ class ResidualCouplingBlock(nn.Module): def remove_weight_norm(self): for i in range(self.n_flows): self.flows[i * 2].remove_weight_norm() + + class PosteriorEncoder(nn.Module): def 
__init__(
        self,
@@ -169,6 +198,8 @@ class PosteriorEncoder(nn.Module):

     def remove_weight_norm(self):
         self.enc.remove_weight_norm()

+
+
 class Generator(torch.nn.Module):
     def __init__(
         self,
@@ -243,8 +274,10 @@ class Generator(torch.nn.Module):
             remove_weight_norm(l)
         for l in self.resblocks:
             l.remove_weight_norm()
+
+
 class SineGen(torch.nn.Module):
-    """ Definition of sine generator
+    """Definition of sine generator
     SineGen(samp_rate, harmonic_num = 0,
             sine_amp = 0.1, noise_std = 0.003,
             voiced_threshold = 0,
@@ -259,10 +292,15 @@ class SineGen(torch.nn.Module):
     segment is always sin(np.pi) or cos(0)
     """

-    def __init__(self, samp_rate, harmonic_num=0,
-                 sine_amp=0.1, noise_std=0.003,
-                 voiced_threshold=0,
-                 flag_for_pulse=False):
+    def __init__(
+        self,
+        samp_rate,
+        harmonic_num=0,
+        sine_amp=0.1,
+        noise_std=0.003,
+        voiced_threshold=0,
+        flag_for_pulse=False,
+    ):
         super(SineGen, self).__init__()
         self.sine_amp = sine_amp
         self.noise_std = noise_std
@@ -277,8 +315,8 @@ class SineGen(torch.nn.Module):
         uv = uv * (f0 > self.voiced_threshold)
         return uv

-    def forward(self, f0,upp):
-        """ sine_tensor, uv = forward(f0)
+    def forward(self, f0, upp):
+        """sine_tensor, uv = forward(f0)
         input F0: tensor(batchsize=1, length, dim=1)
                   f0 for unvoiced steps should be 0
         output sine_tensor: tensor(batchsize=1, length, dim)
@@ -286,32 +324,52 @@ class SineGen(torch.nn.Module):
         """
         with torch.no_grad():
             f0 = f0[:, None].transpose(1, 2)
-            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,device=f0.device)
+            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
             # fundamental component
             f0_buf[:, :, 0] = f0[:, :, 0]
-            for idx in np.arange(self.harmonic_num):f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)# idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
-            rad_values = (f0_buf / self.sampling_rate) % 1###the %1 means the n_har multiplications cannot be optimized away afterwards
-            rand_ini = torch.rand(f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device)
+            for idx in np.arange(self.harmonic_num):
+                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
+                    idx + 2
+                )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
+            rad_values = (f0_buf / self.sampling_rate) % 1  ###the %1 means the n_har multiplications cannot be optimized away afterwards
+            rand_ini = torch.rand(
+                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
+            )
             rand_ini[:, 0] = 0
             rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
-            tmp_over_one = torch.cumsum(rad_values, 1)# % 1 #####a %1 here would keep the cumsum below from being optimized
-            tmp_over_one*=upp
-            tmp_over_one=F.interpolate(tmp_over_one.transpose(2, 1), scale_factor=upp, mode='linear', align_corners=True).transpose(2, 1)
-            rad_values=F.interpolate(rad_values.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)#######
-            tmp_over_one%=1
+            tmp_over_one = torch.cumsum(rad_values, 1)  # % 1 #####a %1 here would keep the cumsum below from being optimized
+            tmp_over_one *= upp
+            tmp_over_one = F.interpolate(
+                tmp_over_one.transpose(2, 1),
+                scale_factor=upp,
+                mode="linear",
+                align_corners=True,
+            ).transpose(2, 1)
+            rad_values = F.interpolate(
+                rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
+            ).transpose(
+                2, 1
+            )  #######
+            tmp_over_one %= 1
             tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
             cumsum_shift = torch.zeros_like(rad_values)
             cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
-            sine_waves = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi)
+            sine_waves = torch.sin(
+                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
+            )
             sine_waves = sine_waves * self.sine_amp
             uv = self._f02uv(f0)
-            uv = F.interpolate(uv.transpose(2, 1), scale_factor=upp,
mode='nearest').transpose(2, 1) + uv = F.interpolate( + uv.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose(2, 1) noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 noise = noise_amp * torch.randn_like(sine_waves) sine_waves = sine_waves * uv + noise return sine_waves, uv, noise + + class SourceModuleHnNSF(torch.nn.Module): - """ SourceModule for hn-nsf + """SourceModule for hn-nsf SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshod=0) sampling_rate: sampling_rate in Hz @@ -328,26 +386,37 @@ class SourceModuleHnNSF(torch.nn.Module): uv (batchsize, length, 1) """ - def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1, - add_noise_std=0.003, voiced_threshod=0,is_half=True): + def __init__( + self, + sampling_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshod=0, + is_half=True, + ): super(SourceModuleHnNSF, self).__init__() self.sine_amp = sine_amp self.noise_std = add_noise_std - self.is_half=is_half + self.is_half = is_half # to produce sine waveforms - self.l_sin_gen = SineGen(sampling_rate, harmonic_num, - sine_amp, add_noise_std, voiced_threshod) + self.l_sin_gen = SineGen( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) # to merge source harmonics into a single excitation self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) self.l_tanh = torch.nn.Tanh() - def forward(self, x,upp=None): - sine_wavs, uv, _ = self.l_sin_gen(x,upp) - if(self.is_half):sine_wavs=sine_wavs.half() + def forward(self, x, upp=None): + sine_wavs, uv, _ = self.l_sin_gen(x, upp) + if self.is_half: + sine_wavs = sine_wavs.half() sine_merge = self.l_tanh(self.l_linear(sine_wavs)) - return sine_merge,None,None# noise, uv + return sine_merge, None, None # noise, uv + + class GeneratorNSF(torch.nn.Module): def __init__( self, @@ -360,7 +429,7 @@ class GeneratorNSF(torch.nn.Module): upsample_kernel_sizes, gin_channels, sr, - is_half=False + is_half=False, ): super(GeneratorNSF, self).__init__() self.num_kernels = len(resblock_kernel_sizes) @@ -368,9 +437,7 @@ class GeneratorNSF(torch.nn.Module): self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) self.m_source = SourceModuleHnNSF( - sampling_rate=sr, - harmonic_num=0, - is_half=is_half + sampling_rate=sr, harmonic_num=0, is_half=is_half ) self.noise_convs = nn.ModuleList() self.conv_pre = Conv1d( @@ -393,9 +460,16 @@ class GeneratorNSF(torch.nn.Module): ) ) if i + 1 < len(upsample_rates): - stride_f0 = np.prod(upsample_rates[i + 1:]) - self.noise_convs.append(Conv1d( - 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2)) + stride_f0 = np.prod(upsample_rates[i + 1 :]) + self.noise_convs.append( + Conv1d( + 1, + c_cur, + kernel_size=stride_f0 * 2, + stride=stride_f0, + padding=stride_f0 // 2, + ) + ) else: self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) @@ -413,10 +487,10 @@ class GeneratorNSF(torch.nn.Module): if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) - self.upp=np.prod(upsample_rates) + self.upp = np.prod(upsample_rates) - def forward(self, x, f0,g=None): - har_source, noi_source, uv = self.m_source(f0,self.upp) + def forward(self, x, f0, g=None): + har_source, noi_source, uv = self.m_source(f0, self.upp) har_source = har_source.transpose(1, 2) x = self.conv_pre(x) if g is not None: @@ -444,11 +518,15 @@ class GeneratorNSF(torch.nn.Module): remove_weight_norm(l) for l in self.resblocks: l.remove_weight_norm() -sr2sr={ - "32k":32000, - 
"40k":40000, - "48k":48000, + + +sr2sr = { + "32k": 32000, + "40k": 40000, + "48k": 48000, } + + class SynthesizerTrnMs256NSFsid(nn.Module): def __init__( self, @@ -472,10 +550,9 @@ class SynthesizerTrnMs256NSFsid(nn.Module): sr, **kwargs ): - super().__init__() - if(type(sr)==type("strr")): - sr=sr2sr[sr] + if type(sr) == type("strr"): + sr = sr2sr[sr] self.spec_channels = spec_channels self.inter_channels = inter_channels self.hidden_channels = hidden_channels @@ -493,7 +570,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module): self.segment_size = segment_size self.gin_channels = gin_channels # self.hop_length = hop_length# - self.spk_embed_dim=spk_embed_dim + self.spk_embed_dim = spk_embed_dim self.enc_p = TextEncoder256( inter_channels, hidden_channels, @@ -511,7 +588,9 @@ class SynthesizerTrnMs256NSFsid(nn.Module): upsample_rates, upsample_initial_channel, upsample_kernel_sizes, - gin_channels=gin_channels, sr=sr, is_half=kwargs["is_half"] + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], ) self.enc_q = PosteriorEncoder( spec_channels, @@ -526,21 +605,22 @@ class SynthesizerTrnMs256NSFsid(nn.Module): inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels ) self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - print("gin_channels:",gin_channels,"self.spk_embed_dim:",self.spk_embed_dim) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + def remove_weight_norm(self): self.dec.remove_weight_norm() self.flow.remove_weight_norm() self.enc_q.remove_weight_norm() - def forward(self, phone, phone_lengths, pitch, nsff0 ,sid, rnd, max_len=None): - + def forward(self, phone, phone_lengths, pitch, nsff0, sid, rnd, max_len=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], nsff0,g=g) + o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) return o + class SynthesizerTrnMs256NSFsid_sim(nn.Module): """ Synthesizer for Training @@ -569,7 +649,6 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module): use_sdp=True, **kwargs ): - super().__init__() self.spec_channels = spec_channels self.inter_channels = inter_channels @@ -588,7 +667,7 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module): self.segment_size = segment_size self.gin_channels = gin_channels # self.hop_length = hop_length# - self.spk_embed_dim=spk_embed_dim + self.spk_embed_dim = spk_embed_dim self.enc_p = TextEncoder256Sim( inter_channels, hidden_channels, @@ -606,30 +685,35 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module): upsample_rates, upsample_initial_channel, upsample_kernel_sizes, - gin_channels=gin_channels,is_half=kwargs["is_half"] + gin_channels=gin_channels, + is_half=kwargs["is_half"], ) self.flow = ResidualCouplingBlock( inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels ) self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - print("gin_channels:",gin_channels,"self.spk_embed_dim:",self.spk_embed_dim) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + def remove_weight_norm(self): self.dec.remove_weight_norm() self.flow.remove_weight_norm() self.enc_q.remove_weight_norm() - - def forward(self, phone, phone_lengths, pitch, pitchf, ds,max_len=None): # y是spec不需要了现在 + + def forward( + self, phone, phone_lengths, pitch, pitchf, ds, max_len=None + ): # y是spec不需要了现在 g = self.emb_g(ds.unsqueeze(0)).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 
x, x_mask = self.enc_p(phone, pitch, phone_lengths) x = self.flow(x, x_mask, g=g, reverse=True) - o = self.dec((x*x_mask)[:, :, :max_len], pitchf, g=g) + o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g) return o + class MultiPeriodDiscriminator(torch.nn.Module): def __init__(self, use_spectral_norm=False): super(MultiPeriodDiscriminator, self).__init__() - periods = [2, 3, 5, 7, 11,17] + periods = [2, 3, 5, 7, 11, 17] # periods = [3, 5, 7, 11, 17, 23, 37] discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] @@ -639,7 +723,7 @@ class MultiPeriodDiscriminator(torch.nn.Module): self.discriminators = nn.ModuleList(discs) def forward(self, y, y_hat): - y_d_rs = []# + y_d_rs = [] # y_d_gs = [] fmap_rs = [] fmap_gs = [] @@ -655,6 +739,7 @@ class MultiPeriodDiscriminator(torch.nn.Module): return y_d_rs, y_d_gs, fmap_rs, fmap_gs + class DiscriminatorS(torch.nn.Module): def __init__(self, use_spectral_norm=False): super(DiscriminatorS, self).__init__() @@ -684,6 +769,7 @@ class DiscriminatorS(torch.nn.Module): return x, fmap + class DiscriminatorP(torch.nn.Module): def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): super(DiscriminatorP, self).__init__() @@ -761,4 +847,3 @@ class DiscriminatorP(torch.nn.Module): x = torch.flatten(x, 1, -1) return x, fmap - diff --git a/infer_pack/transforms.py b/infer_pack/transforms.py index a92f0e4..7d93c48 100644 --- a/infer_pack/transforms.py +++ b/infer_pack/transforms.py @@ -9,66 +9,63 @@ DEFAULT_MIN_BIN_HEIGHT = 1e-3 DEFAULT_MIN_DERIVATIVE = 1e-3 -def piecewise_rational_quadratic_transform(inputs, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=False, - tails=None, - tail_bound=1., - min_bin_width=DEFAULT_MIN_BIN_WIDTH, - min_bin_height=DEFAULT_MIN_BIN_HEIGHT, - min_derivative=DEFAULT_MIN_DERIVATIVE): - +def piecewise_rational_quadratic_transform( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails=None, + tail_bound=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): if tails is None: spline_fn = rational_quadratic_spline spline_kwargs = {} else: spline_fn = unconstrained_rational_quadratic_spline - spline_kwargs = { - 'tails': tails, - 'tail_bound': tail_bound - } + spline_kwargs = {"tails": tails, "tail_bound": tail_bound} outputs, logabsdet = spline_fn( - inputs=inputs, - unnormalized_widths=unnormalized_widths, - unnormalized_heights=unnormalized_heights, - unnormalized_derivatives=unnormalized_derivatives, - inverse=inverse, - min_bin_width=min_bin_width, - min_bin_height=min_bin_height, - min_derivative=min_derivative, - **spline_kwargs + inputs=inputs, + unnormalized_widths=unnormalized_widths, + unnormalized_heights=unnormalized_heights, + unnormalized_derivatives=unnormalized_derivatives, + inverse=inverse, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + min_derivative=min_derivative, + **spline_kwargs ) return outputs, logabsdet def searchsorted(bin_locations, inputs, eps=1e-6): bin_locations[..., -1] += eps - return torch.sum( - inputs[..., None] >= bin_locations, - dim=-1 - ) - 1 + return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 -def unconstrained_rational_quadratic_spline(inputs, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=False, - tails='linear', - tail_bound=1., - min_bin_width=DEFAULT_MIN_BIN_WIDTH, - min_bin_height=DEFAULT_MIN_BIN_HEIGHT, - 
min_derivative=DEFAULT_MIN_DERIVATIVE): +def unconstrained_rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails="linear", + tail_bound=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) outside_interval_mask = ~inside_interval_mask outputs = torch.zeros_like(inputs) logabsdet = torch.zeros_like(inputs) - if tails == 'linear': + if tails == "linear": unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) constant = np.log(np.exp(1 - min_derivative) - 1) unnormalized_derivatives[..., 0] = constant @@ -77,45 +74,57 @@ def unconstrained_rational_quadratic_spline(inputs, outputs[outside_interval_mask] = inputs[outside_interval_mask] logabsdet[outside_interval_mask] = 0 else: - raise RuntimeError('{} tails are not implemented.'.format(tails)) + raise RuntimeError("{} tails are not implemented.".format(tails)) - outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline( + ( + outputs[inside_interval_mask], + logabsdet[inside_interval_mask], + ) = rational_quadratic_spline( inputs=inputs[inside_interval_mask], unnormalized_widths=unnormalized_widths[inside_interval_mask, :], unnormalized_heights=unnormalized_heights[inside_interval_mask, :], unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], inverse=inverse, - left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound, + left=-tail_bound, + right=tail_bound, + bottom=-tail_bound, + top=tail_bound, min_bin_width=min_bin_width, min_bin_height=min_bin_height, - min_derivative=min_derivative + min_derivative=min_derivative, ) return outputs, logabsdet -def rational_quadratic_spline(inputs, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=False, - left=0., right=1., bottom=0., top=1., - min_bin_width=DEFAULT_MIN_BIN_WIDTH, - min_bin_height=DEFAULT_MIN_BIN_HEIGHT, - min_derivative=DEFAULT_MIN_DERIVATIVE): + +def rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + left=0.0, + right=1.0, + bottom=0.0, + top=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): if torch.min(inputs) < left or torch.max(inputs) > right: - raise ValueError('Input to a transform is not within its domain') + raise ValueError("Input to a transform is not within its domain") num_bins = unnormalized_widths.shape[-1] if min_bin_width * num_bins > 1.0: - raise ValueError('Minimal bin width too large for the number of bins') + raise ValueError("Minimal bin width too large for the number of bins") if min_bin_height * num_bins > 1.0: - raise ValueError('Minimal bin height too large for the number of bins') + raise ValueError("Minimal bin height too large for the number of bins") widths = F.softmax(unnormalized_widths, dim=-1) widths = min_bin_width + (1 - min_bin_width * num_bins) * widths cumwidths = torch.cumsum(widths, dim=-1) - cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0) + cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) cumwidths = (right - left) * cumwidths + left cumwidths[..., 0] = left cumwidths[..., -1] = right @@ -126,7 +135,7 @@ def rational_quadratic_spline(inputs, heights = F.softmax(unnormalized_heights, dim=-1) heights = 
min_bin_height + (1 - min_bin_height * num_bins) * heights cumheights = torch.cumsum(heights, dim=-1) - cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0) + cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) cumheights = (top - bottom) * cumheights + bottom cumheights[..., 0] = bottom cumheights[..., -1] = top @@ -150,15 +159,13 @@ def rational_quadratic_spline(inputs, input_heights = heights.gather(-1, bin_idx)[..., 0] if inverse: - a = (((inputs - input_cumheights) * (input_derivatives - + input_derivatives_plus_one - - 2 * input_delta) - + input_heights * (input_delta - input_derivatives))) - b = (input_heights * input_derivatives - - (inputs - input_cumheights) * (input_derivatives - + input_derivatives_plus_one - - 2 * input_delta)) - c = - input_delta * (inputs - input_cumheights) + a = (inputs - input_cumheights) * ( + input_derivatives + input_derivatives_plus_one - 2 * input_delta + ) + input_heights * (input_delta - input_derivatives) + b = input_heights * input_derivatives - (inputs - input_cumheights) * ( + input_derivatives + input_derivatives_plus_one - 2 * input_delta + ) + c = -input_delta * (inputs - input_cumheights) discriminant = b.pow(2) - 4 * a * c assert (discriminant >= 0).all() @@ -167,11 +174,15 @@ def rational_quadratic_spline(inputs, outputs = root * input_bin_widths + input_cumwidths theta_one_minus_theta = root * (1 - root) - denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) - * theta_one_minus_theta) - derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2) - + 2 * input_delta * theta_one_minus_theta - + input_derivatives * (1 - root).pow(2)) + denominator = input_delta + ( + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + * theta_one_minus_theta + ) + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * root.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - root).pow(2) + ) logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) return outputs, -logabsdet @@ -179,15 +190,20 @@ def rational_quadratic_spline(inputs, theta = (inputs - input_cumwidths) / input_bin_widths theta_one_minus_theta = theta * (1 - theta) - numerator = input_heights * (input_delta * theta.pow(2) - + input_derivatives * theta_one_minus_theta) - denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) - * theta_one_minus_theta) + numerator = input_heights * ( + input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta + ) + denominator = input_delta + ( + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + * theta_one_minus_theta + ) outputs = input_cumheights + numerator / denominator - derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) - + 2 * input_delta * theta_one_minus_theta - + input_derivatives * (1 - theta).pow(2)) + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * theta.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - theta).pow(2) + ) logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) return outputs, logabsdet diff --git a/infer_uvr5.py b/infer_uvr5.py index 209c44d..b7d484d 100644 --- a/infer_uvr5.py +++ b/infer_uvr5.py @@ -1,108 +1,171 @@ -import os,sys,torch,warnings,pdb +import os, sys, torch, warnings, pdb + warnings.filterwarnings("ignore") import librosa import importlib -import numpy 
as np -import hashlib , math +import numpy as np +import hashlib, math from tqdm import tqdm from uvr5_pack.lib_v5 import spec_utils -from uvr5_pack.utils import _get_name_params,inference +from uvr5_pack.utils import _get_name_params, inference from uvr5_pack.lib_v5.model_param_init import ModelParameters from scipy.io import wavfile -class _audio_pre_(): - def __init__(self, model_path,device,is_half): + +class _audio_pre_: + def __init__(self, model_path, device, is_half): self.model_path = model_path self.device = device self.data = { # Processing Options - 'postprocess': False, - 'tta': False, + "postprocess": False, + "tta": False, # Constants - 'window_size': 512, - 'agg': 10, - 'high_end_process': 'mirroring', + "window_size": 512, + "agg": 10, + "high_end_process": "mirroring", } nn_arch_sizes = [ - 31191, # default - 33966,61968, 123821, 123812, 537238 # custom + 31191, # default + 33966, + 61968, + 123821, + 123812, + 537238, # custom ] - self.nn_architecture = list('{}KB'.format(s) for s in nn_arch_sizes) - model_size = math.ceil(os.stat(model_path ).st_size / 1024) - nn_architecture = '{}KB'.format(min(nn_arch_sizes, key=lambda x:abs(x-model_size))) - nets = importlib.import_module('uvr5_pack.lib_v5.nets' + f'_{nn_architecture}'.replace('_{}KB'.format(nn_arch_sizes[0]), ''), package=None) - model_hash = hashlib.md5(open(model_path,'rb').read()).hexdigest() - param_name ,model_params_d = _get_name_params(model_path , model_hash) + self.nn_architecture = list("{}KB".format(s) for s in nn_arch_sizes) + model_size = math.ceil(os.stat(model_path).st_size / 1024) + nn_architecture = "{}KB".format( + min(nn_arch_sizes, key=lambda x: abs(x - model_size)) + ) + nets = importlib.import_module( + "uvr5_pack.lib_v5.nets" + + f"_{nn_architecture}".replace("_{}KB".format(nn_arch_sizes[0]), ""), + package=None, + ) + model_hash = hashlib.md5(open(model_path, "rb").read()).hexdigest() + param_name, model_params_d = _get_name_params(model_path, model_hash) mp = ModelParameters(model_params_d) - model = nets.CascadedASPPNet(mp.param['bins'] * 2) - cpk = torch.load( model_path , map_location='cpu') + model = nets.CascadedASPPNet(mp.param["bins"] * 2) + cpk = torch.load(model_path, map_location="cpu") model.load_state_dict(cpk) model.eval() - if(is_half):model = model.half().to(device) - else:model = model.to(device) + if is_half: + model = model.half().to(device) + else: + model = model.to(device) self.mp = mp self.model = model - def _path_audio_(self, music_file ,ins_root=None,vocal_root=None): - if(ins_root is None and vocal_root is None):return "No save root." - name=os.path.basename(music_file) - if(ins_root is not None):os.makedirs(ins_root, exist_ok=True) - if(vocal_root is not None):os.makedirs(vocal_root , exist_ok=True) + def _path_audio_(self, music_file, ins_root=None, vocal_root=None): + if ins_root is None and vocal_root is None: + return "No save root." 
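        # Editorial annotation (not part of the patch): the constructor above resolves
        # which V5 network definition to import by rounding the checkpoint's file size
        # in KB to the closest entry of nn_arch_sizes and importing the matching
        # uvr5_pack.lib_v5.nets_<size>KB module (the 31191 default maps to plain nets).
        # A minimal sketch of that lookup, with a hypothetical model_size value:
        #
        #     nn_arch_sizes = [31191, 33966, 61968, 123821, 123812, 537238]
        #     model_size = 123800  # hypothetical checkpoint size in KB
        #     min(nn_arch_sizes, key=lambda x: abs(x - model_size))  # -> 123812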
+        name = os.path.basename(music_file)
+        if ins_root is not None:
+            os.makedirs(ins_root, exist_ok=True)
+        if vocal_root is not None:
+            os.makedirs(vocal_root, exist_ok=True)
         X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
-        bands_n = len(self.mp.param['band'])
+        bands_n = len(self.mp.param["band"])
         # print(bands_n)
-        for d in range(bands_n, 0, -1):
-            bp = self.mp.param['band'][d]
-            if d == bands_n: # high-end band
-                X_wave[d], _ = librosa.core.load(#in theory librosa loading may be buggy for some audio; ffmpeg should be used for reading instead, but that is too much hassle, so it was dropped
-                    music_file, bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
+        for d in range(bands_n, 0, -1):
+            bp = self.mp.param["band"][d]
+            if d == bands_n:  # high-end band
+                (
+                    X_wave[d],
+                    _,
+                ) = librosa.core.load(  # in theory librosa loading may be buggy for some audio; ffmpeg should be used for reading instead, but that is too much hassle, so it was dropped
+                    music_file,
+                    bp["sr"],
+                    False,
+                    dtype=np.float32,
+                    res_type=bp["res_type"],
+                )
                 if X_wave[d].ndim == 1:
                     X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
-            else: # lower bands
-                X_wave[d] = librosa.core.resample(X_wave[d+1], self.mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])
+            else:  # lower bands
+                X_wave[d] = librosa.core.resample(
+                    X_wave[d + 1],
+                    self.mp.param["band"][d + 1]["sr"],
+                    bp["sr"],
+                    res_type=bp["res_type"],
+                )
             # Stft of wave source
-            X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(X_wave[d], bp['hl'], bp['n_fft'], self.mp.param['mid_side'], self.mp.param['mid_side_b2'], self.mp.param['reverse'])
+            X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
+                X_wave[d],
+                bp["hl"],
+                bp["n_fft"],
+                self.mp.param["mid_side"],
+                self.mp.param["mid_side_b2"],
+                self.mp.param["reverse"],
+            )
             # pdb.set_trace()
-            if d == bands_n and self.data['high_end_process'] != 'none':
-                input_high_end_h = (bp['n_fft']//2 - bp['crop_stop']) + ( self.mp.param['pre_filter_stop'] - self.mp.param['pre_filter_start'])
-                input_high_end = X_spec_s[d][:, bp['n_fft']//2-input_high_end_h:bp['n_fft']//2, :]
+            if d == bands_n and self.data["high_end_process"] != "none":
+                input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
+                    self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
+                )
+                input_high_end = X_spec_s[d][
+                    :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
+                ]

         X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
-        aggresive_set = float(self.data['agg']/100)
-        aggressiveness = {'value': aggresive_set, 'split_bin': self.mp.param['band'][1]['crop_stop']}
+        aggresive_set = float(self.data["agg"] / 100)
+        aggressiveness = {
+            "value": aggresive_set,
+            "split_bin": self.mp.param["band"][1]["crop_stop"],
+        }
         with torch.no_grad():
-            pred, X_mag, X_phase = inference(X_spec_m,self.device,self.model, aggressiveness,self.data)
+            pred, X_mag, X_phase = inference(
+                X_spec_m, self.device, self.model, aggressiveness, self.data
+            )
         # Postprocess
-        if self.data['postprocess']:
+        if self.data["postprocess"]:
             pred_inv = np.clip(X_mag - pred, 0, np.inf)
             pred = spec_utils.mask_silence(pred, pred_inv)
         y_spec_m = pred * X_phase
         v_spec_m = X_spec_m - y_spec_m
-        if (ins_root is not None):
-            if self.data['high_end_process'].startswith('mirroring'):
-                input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], y_spec_m, input_high_end, self.mp)
-                wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp,input_high_end_h, input_high_end_)
+        if ins_root is not None:
+            if self.data["high_end_process"].startswith("mirroring"):
+                input_high_end_ = spec_utils.mirroring(
+                    self.data["high_end_process"], y_spec_m, input_high_end, self.mp
+                )
+                wav_instrument =
spec_utils.cmb_spectrogram_to_wave( + y_spec_m, self.mp, input_high_end_h, input_high_end_ + ) else: wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) - print ('%s instruments done'%name) - wavfile.write(os.path.join(ins_root, 'instrument_{}.wav'.format(name) ), self.mp.param['sr'], (np.array(wav_instrument)*32768).astype("int16")) # - if (vocal_root is not None): - if self.data['high_end_process'].startswith('mirroring'): - input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], v_spec_m, input_high_end, self.mp) - wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp, input_high_end_h, input_high_end_) + print("%s instruments done" % name) + wavfile.write( + os.path.join(ins_root, "instrument_{}.wav".format(name)), + self.mp.param["sr"], + (np.array(wav_instrument) * 32768).astype("int16"), + ) # + if vocal_root is not None: + if self.data["high_end_process"].startswith("mirroring"): + input_high_end_ = spec_utils.mirroring( + self.data["high_end_process"], v_spec_m, input_high_end, self.mp + ) + wav_vocals = spec_utils.cmb_spectrogram_to_wave( + v_spec_m, self.mp, input_high_end_h, input_high_end_ + ) else: wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) - print ('%s vocals done'%name) - wavfile.write(os.path.join(vocal_root , 'vocal_{}.wav'.format(name) ), self.mp.param['sr'], (np.array(wav_vocals)*32768).astype("int16")) + print("%s vocals done" % name) + wavfile.write( + os.path.join(vocal_root, "vocal_{}.wav".format(name)), + self.mp.param["sr"], + (np.array(wav_vocals) * 32768).astype("int16"), + ) -if __name__ == '__main__': - device = 'cuda' - is_half=True - model_path='uvr5_weights/2_HP-UVR.pth' - pre_fun = _audio_pre_(model_path=model_path,device=device,is_half=True) - audio_path = '神女劈观.aac' - save_path = 'opt' - pre_fun._path_audio_(audio_path , save_path,save_path) + +if __name__ == "__main__": + device = "cuda" + is_half = True + model_path = "uvr5_weights/2_HP-UVR.pth" + pre_fun = _audio_pre_(model_path=model_path, device=device, is_half=True) + audio_path = "神女劈观.aac" + save_path = "opt" + pre_fun._path_audio_(audio_path, save_path, save_path) diff --git a/locale/locale_diff.py b/locale/locale_diff.py index de00aec..ccc453e 100644 --- a/locale/locale_diff.py +++ b/locale/locale_diff.py @@ -31,7 +31,9 @@ for lang_file in languages: del lang_data[key] # Sort the keys of the language file to match the order of the standard file - lang_data = OrderedDict(sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0]))) + lang_data = OrderedDict( + sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0])) + ) # Save the updated language file with open(lang_file, "w", encoding="utf-8") as f: diff --git a/my_utils.py b/my_utils.py index e6ac402..89a1527 100644 --- a/my_utils.py +++ b/my_utils.py @@ -1,11 +1,15 @@ import ffmpeg import numpy as np -def load_audio(file,sr): + + +def load_audio(file, sr): try: # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 # This launches a subprocess to decode audio while down-mixing and resampling as necessary. # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 
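        # Editorial annotation (not part of the patch): in the Whisper helper linked
        # above, the raw s16le bytes produced by this ffmpeg pipeline are converted to
        # a mono float32 waveform roughly as follows (a sketch of the idea, not a line
        # from this repository):
        #
        #     audio = np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0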
-        file=file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")#guard against novice users pasting paths with leading/trailing spaces, quotes and line breaks
+        file = (
+            file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+        )  # guard against novice users pasting paths with leading/trailing spaces, quotes and line breaks
         out, _ = (
             ffmpeg.input(file, threads=0)
             .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
diff --git a/slicer2.py b/slicer2.py
index 78c08f0..7d9d16d 100644
--- a/slicer2.py
+++ b/slicer2.py
@@ -18,9 +18,7 @@ def get_rms(
     x_shape_trimmed = list(y.shape)
     x_shape_trimmed[axis] -= frame_length - 1
     out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
-    xw = np.lib.stride_tricks.as_strided(
-        y, shape=out_shape, strides=out_strides
-    )
+    xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
     if axis < 0:
         target_axis = axis - 1
     else:
@@ -38,19 +36,25 @@ def get_rms(


 class Slicer:
-    def __init__(self,
-                 sr: int,
-                 threshold: float = -40.,
-                 min_length: int = 5000,
-                 min_interval: int = 300,
-                 hop_size: int = 20,
-                 max_sil_kept: int = 5000):
+    def __init__(
+        self,
+        sr: int,
+        threshold: float = -40.0,
+        min_length: int = 5000,
+        min_interval: int = 300,
+        hop_size: int = 20,
+        max_sil_kept: int = 5000,
+    ):
         if not min_length >= min_interval >= hop_size:
-            raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
+            raise ValueError(
+                "The following condition must be satisfied: min_length >= min_interval >= hop_size"
+            )
         if not max_sil_kept >= hop_size:
-            raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
+            raise ValueError(
+                "The following condition must be satisfied: max_sil_kept >= hop_size"
+            )
         min_interval = sr * min_interval / 1000
-        self.threshold = 10 ** (threshold / 20.)
+        self.threshold = 10 ** (threshold / 20.0)
         self.hop_size = round(sr * hop_size / 1000)
         self.win_size = min(round(min_interval), 4 * self.hop_size)
         self.min_length = round(sr * min_length / 1000 / self.hop_size)
@@ -59,9 +63,13 @@ class Slicer:

     def _apply_slice(self, waveform, begin, end):
         if len(waveform.shape) > 1:
-            return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
+            return waveform[
+                :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
+            ]
         else:
-            return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]
+            return waveform[
+                begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
+            ]

     # @timeit
     def slice(self, waveform):
@@ -71,7 +79,9 @@ class Slicer:
             samples = waveform
         if samples.shape[0] <= self.min_length:
             return [waveform]
-        rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
+        rms_list = get_rms(
+            y=samples, frame_length=self.win_size, hop_length=self.hop_size
+        ).squeeze(0)
         sil_tags = []
         silence_start = None
         clip_start = 0
@@ -87,23 +97,37 @@ class Slicer:
                 continue
             # Clear recorded silence start if interval is not enough or clip is too short
             is_leading_silence = silence_start == 0 and i > self.max_sil_kept
-            need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
+            need_slice_middle = (
+                i - silence_start >= self.min_interval
+                and i - clip_start >= self.min_length
+            )
             if not is_leading_silence and not need_slice_middle:
                 silence_start = None
                 continue
             # Need slicing. Record the range of silent frames to be removed.
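            # Editorial annotation (not part of the patch): the branches that follow
            # distinguish three cases by the length of the silent run, measured in
            # hop-sized frames: (1) runs up to max_sil_kept get a single cut at the
            # run's minimum-RMS frame; (2) runs up to 2 * max_sil_kept are cut around
            # the quietest frames near each edge, keeping at most max_sil_kept silent
            # frames per side; (3) longer runs keep max_sil_kept frames next to each
            # clip and drop the middle of the silence entirely.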
if i - silence_start <= self.max_sil_kept: - pos = rms_list[silence_start: i + 1].argmin() + silence_start + pos = rms_list[silence_start : i + 1].argmin() + silence_start if silence_start == 0: sil_tags.append((0, pos)) else: sil_tags.append((pos, pos)) clip_start = pos elif i - silence_start <= self.max_sil_kept * 2: - pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin() + pos = rms_list[ + i - self.max_sil_kept : silence_start + self.max_sil_kept + 1 + ].argmin() pos += i - self.max_sil_kept - pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start - pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept + pos_l = ( + rms_list[ + silence_start : silence_start + self.max_sil_kept + 1 + ].argmin() + + silence_start + ) + pos_r = ( + rms_list[i - self.max_sil_kept : i + 1].argmin() + + i + - self.max_sil_kept + ) if silence_start == 0: sil_tags.append((0, pos_r)) clip_start = pos_r @@ -111,8 +135,17 @@ class Slicer: sil_tags.append((min(pos_l, pos), max(pos_r, pos))) clip_start = max(pos_r, pos) else: - pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start - pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept + pos_l = ( + rms_list[ + silence_start : silence_start + self.max_sil_kept + 1 + ].argmin() + + silence_start + ) + pos_r = ( + rms_list[i - self.max_sil_kept : i + 1].argmin() + + i + - self.max_sil_kept + ) if silence_start == 0: sil_tags.append((0, pos_r)) else: @@ -121,9 +154,12 @@ class Slicer: silence_start = None # Deal with trailing silence. total_frames = rms_list.shape[0] - if silence_start is not None and total_frames - silence_start >= self.min_interval: + if ( + silence_start is not None + and total_frames - silence_start >= self.min_interval + ): silence_end = min(total_frames, silence_start + self.max_sil_kept) - pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start + pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start sil_tags.append((pos, total_frames + 1)) # Apply and return slices. 
if len(sil_tags) == 0: @@ -133,9 +169,13 @@ class Slicer: if sil_tags[0][0] > 0: chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0])) for i in range(len(sil_tags) - 1): - chunks.append(self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0])) + chunks.append( + self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]) + ) if sil_tags[-1][1] < total_frames: - chunks.append(self._apply_slice(waveform, sil_tags[-1][1], total_frames)) + chunks.append( + self._apply_slice(waveform, sil_tags[-1][1], total_frames) + ) return chunks @@ -147,18 +187,45 @@ def main(): import soundfile parser = ArgumentParser() - parser.add_argument('audio', type=str, help='The audio to be sliced') - parser.add_argument('--out', type=str, help='Output directory of the sliced audio clips') - parser.add_argument('--db_thresh', type=float, required=False, default=-40, - help='The dB threshold for silence detection') - parser.add_argument('--min_length', type=int, required=False, default=5000, - help='The minimum milliseconds required for each sliced audio clip') - parser.add_argument('--min_interval', type=int, required=False, default=300, - help='The minimum milliseconds for a silence part to be sliced') - parser.add_argument('--hop_size', type=int, required=False, default=10, - help='Frame length in milliseconds') - parser.add_argument('--max_sil_kept', type=int, required=False, default=500, - help='The maximum silence length kept around the sliced clip, presented in milliseconds') + parser.add_argument("audio", type=str, help="The audio to be sliced") + parser.add_argument( + "--out", type=str, help="Output directory of the sliced audio clips" + ) + parser.add_argument( + "--db_thresh", + type=float, + required=False, + default=-40, + help="The dB threshold for silence detection", + ) + parser.add_argument( + "--min_length", + type=int, + required=False, + default=5000, + help="The minimum milliseconds required for each sliced audio clip", + ) + parser.add_argument( + "--min_interval", + type=int, + required=False, + default=300, + help="The minimum milliseconds for a silence part to be sliced", + ) + parser.add_argument( + "--hop_size", + type=int, + required=False, + default=10, + help="Frame length in milliseconds", + ) + parser.add_argument( + "--max_sil_kept", + type=int, + required=False, + default=500, + help="The maximum silence length kept around the sliced clip, presented in milliseconds", + ) args = parser.parse_args() out = args.out if out is None: @@ -170,7 +237,7 @@ def main(): min_length=args.min_length, min_interval=args.min_interval, hop_size=args.hop_size, - max_sil_kept=args.max_sil_kept + max_sil_kept=args.max_sil_kept, ) chunks = slicer.slice(audio) if not os.path.exists(out): @@ -178,8 +245,16 @@ def main(): for i, chunk in enumerate(chunks): if len(chunk.shape) > 1: chunk = chunk.T - soundfile.write(os.path.join(out, f'%s_%d.wav' % (os.path.basename(args.audio).rsplit('.', maxsplit=1)[0], i)), chunk, sr) + soundfile.write( + os.path.join( + out, + f"%s_%d.wav" + % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i), + ), + chunk, + sr, + ) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/train/data_utils.py b/train/data_utils.py index 2e836af..ee7d4d1 100644 --- a/train/data_utils.py +++ b/train/data_utils.py @@ -1,4 +1,4 @@ -import os,traceback +import os, traceback import numpy as np import torch import torch.utils.data @@ -6,6 +6,7 @@ import torch.utils.data from mel_processing import spectrogram_torch from utils import 
load_wav_to_torch, load_filepaths_and_text + class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): """ 1) loads audio, text pairs @@ -15,14 +16,14 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): def __init__(self, audiopaths_and_text, hparams): self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) - self.max_wav_value = hparams.max_wav_value - self.sampling_rate = hparams.sampling_rate - self.filter_length = hparams.filter_length - self.hop_length = hparams.hop_length - self.win_length = hparams.win_length - self.sampling_rate = hparams.sampling_rate - self.min_text_len = getattr(hparams, "min_text_len", 1) - self.max_text_len = getattr(hparams, "max_text_len", 5000) + self.max_wav_value = hparams.max_wav_value + self.sampling_rate = hparams.sampling_rate + self.filter_length = hparams.filter_length + self.hop_length = hparams.hop_length + self.win_length = hparams.win_length + self.sampling_rate = hparams.sampling_rate + self.min_text_len = getattr(hparams, "min_text_len", 1) + self.max_text_len = getattr(hparams, "max_text_len", 5000) self._filter() def _filter(self): @@ -34,12 +35,13 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): # spec_length = wav_length // hop_length audiopaths_and_text_new = [] lengths = [] - for audiopath, text, pitch,pitchf,dv in self.audiopaths_and_text: + for audiopath, text, pitch, pitchf, dv in self.audiopaths_and_text: if self.min_text_len <= len(text) and len(text) <= self.max_text_len: - audiopaths_and_text_new.append([audiopath, text, pitch,pitchf,dv]) + audiopaths_and_text_new.append([audiopath, text, pitch, pitchf, dv]) lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length)) self.audiopaths_and_text = audiopaths_and_text_new self.lengths = lengths + def get_sid(self, sid): sid = torch.LongTensor([int(sid)]) return sid @@ -54,7 +56,7 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): phone, pitch, pitchf = self.get_labels(phone, pitch, pitchf) spec, wav = self.get_audio(file) - dv=self.get_sid(dv) + dv = self.get_sid(dv) len_phone = phone.size()[0] len_spec = spec.size()[-1] @@ -71,9 +73,9 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): pitch = pitch[:len_min] pitchf = pitchf[:len_min] - return (spec, wav, phone, pitch,pitchf,dv) + return (spec, wav, phone, pitch, pitchf, dv) - def get_labels(self, phone, pitch,pitchf): + def get_labels(self, phone, pitch, pitchf): phone = np.load(phone) phone = np.repeat(phone, 2, axis=0) pitch = np.load(pitch) @@ -86,7 +88,7 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): phone = torch.FloatTensor(phone) pitch = torch.LongTensor(pitch) pitchf = torch.FloatTensor(pitchf) - return phone, pitch,pitchf + return phone, pitch, pitchf def get_audio(self, filename): audio, sampling_rate = load_wav_to_torch(filename) @@ -103,10 +105,15 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): try: spec = torch.load(spec_filename) except: - print (spec_filename,traceback.format_exc()) - spec = spectrogram_torch(audio_norm, self.filter_length, - self.sampling_rate, self.hop_length, self.win_length, - center=False) + print(spec_filename, traceback.format_exc()) + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) spec = torch.squeeze(spec, 0) torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) else: @@ -127,6 +134,8 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): def __len__(self): return 
len(self.audiopaths_and_text) + + class TextAudioCollateMultiNSFsid: """Zero-pads model inputs and targets""" @@ -155,7 +164,9 @@ class TextAudioCollateMultiNSFsid: max_phone_len = max([x[2].size(0) for x in batch]) phone_lengths = torch.LongTensor(len(batch)) - phone_padded = torch.FloatTensor(len(batch), max_phone_len, batch[0][2].shape[1])#(spec, wav, phone, pitch) + phone_padded = torch.FloatTensor( + len(batch), max_phone_len, batch[0][2].shape[1] + ) # (spec, wav, phone, pitch) pitch_padded = torch.LongTensor(len(batch), max_phone_len) pitchf_padded = torch.FloatTensor(len(batch), max_phone_len) phone_padded.zero_() @@ -187,7 +198,6 @@ class TextAudioCollateMultiNSFsid: # dv[i] = row[5] sid[i] = row[5] - return ( phone_padded, phone_lengths, @@ -198,9 +208,10 @@ class TextAudioCollateMultiNSFsid: wave_padded, wave_lengths, # dv - sid + sid, ) + class TextAudioLoader(torch.utils.data.Dataset): """ 1) loads audio, text pairs @@ -210,14 +221,14 @@ class TextAudioLoader(torch.utils.data.Dataset): def __init__(self, audiopaths_and_text, hparams): self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) - self.max_wav_value = hparams.max_wav_value - self.sampling_rate = hparams.sampling_rate - self.filter_length = hparams.filter_length - self.hop_length = hparams.hop_length - self.win_length = hparams.win_length - self.sampling_rate = hparams.sampling_rate - self.min_text_len = getattr(hparams, "min_text_len", 1) - self.max_text_len = getattr(hparams, "max_text_len", 5000) + self.max_wav_value = hparams.max_wav_value + self.sampling_rate = hparams.sampling_rate + self.filter_length = hparams.filter_length + self.hop_length = hparams.hop_length + self.win_length = hparams.win_length + self.sampling_rate = hparams.sampling_rate + self.min_text_len = getattr(hparams, "min_text_len", 1) + self.max_text_len = getattr(hparams, "max_text_len", 5000) self._filter() def _filter(self): @@ -229,12 +240,13 @@ class TextAudioLoader(torch.utils.data.Dataset): # spec_length = wav_length // hop_length audiopaths_and_text_new = [] lengths = [] - for audiopath, text,dv in self.audiopaths_and_text: + for audiopath, text, dv in self.audiopaths_and_text: if self.min_text_len <= len(text) and len(text) <= self.max_text_len: - audiopaths_and_text_new.append([audiopath, text,dv]) + audiopaths_and_text_new.append([audiopath, text, dv]) lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length)) self.audiopaths_and_text = audiopaths_and_text_new self.lengths = lengths + def get_sid(self, sid): sid = torch.LongTensor([int(sid)]) return sid @@ -247,7 +259,7 @@ class TextAudioLoader(torch.utils.data.Dataset): phone = self.get_labels(phone) spec, wav = self.get_audio(file) - dv=self.get_sid(dv) + dv = self.get_sid(dv) len_phone = phone.size()[0] len_spec = spec.size()[-1] @@ -257,7 +269,7 @@ class TextAudioLoader(torch.utils.data.Dataset): spec = spec[:, :len_min] wav = wav[:, :len_wav] phone = phone[:len_min, :] - return (spec, wav, phone,dv) + return (spec, wav, phone, dv) def get_labels(self, phone): phone = np.load(phone) @@ -282,10 +294,15 @@ class TextAudioLoader(torch.utils.data.Dataset): try: spec = torch.load(spec_filename) except: - print (spec_filename,traceback.format_exc()) - spec = spectrogram_torch(audio_norm, self.filter_length, - self.sampling_rate, self.hop_length, self.win_length, - center=False) + print(spec_filename, traceback.format_exc()) + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + 
center=False, + ) spec = torch.squeeze(spec, 0) torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) else: @@ -306,6 +323,8 @@ class TextAudioLoader(torch.utils.data.Dataset): def __len__(self): return len(self.audiopaths_and_text) + + class TextAudioCollate: """Zero-pads model inputs and targets""" @@ -334,7 +353,9 @@ class TextAudioCollate: max_phone_len = max([x[2].size(0) for x in batch]) phone_lengths = torch.LongTensor(len(batch)) - phone_padded = torch.FloatTensor(len(batch), max_phone_len, batch[0][2].shape[1]) + phone_padded = torch.FloatTensor( + len(batch), max_phone_len, batch[0][2].shape[1] + ) phone_padded.zero_() sid = torch.LongTensor(len(batch)) @@ -355,7 +376,6 @@ class TextAudioCollate: sid[i] = row[3] - return ( phone_padded, phone_lengths, @@ -363,9 +383,10 @@ class TextAudioCollate: spec_lengths, wave_padded, wave_lengths, - sid + sid, ) + class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): """ Maintain similar input lengths in a batch. @@ -402,7 +423,7 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): if idx_bucket != -1: buckets[idx_bucket].append(i) - for i in range(len(buckets) - 1, -1, -1):# + for i in range(len(buckets) - 1, -1, -1): # if len(buckets[i]) == 0: buckets.pop(i) self.boundaries.pop(i + 1) diff --git a/train/losses.py b/train/losses.py index 80bc9a6..4d71f86 100644 --- a/train/losses.py +++ b/train/losses.py @@ -1,6 +1,7 @@ import torch from torch.nn import functional as F + def feature_loss(fmap_r, fmap_g): loss = 0 for dr, dg in zip(fmap_r, fmap_g): diff --git a/train/mel_processing.py b/train/mel_processing.py index c91b937..315b3d1 100644 --- a/train/mel_processing.py +++ b/train/mel_processing.py @@ -78,7 +78,8 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) center=center, pad_mode="reflect", normalized=False, - onesided=True,return_complex=False + onesided=True, + return_complex=False, ) spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) @@ -139,8 +140,18 @@ def mel_spectrogram_torch( # normalized=False, # onesided=True, # ) - spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], - center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) + spec = torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=False, + ) spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) spec = torch.matmul(mel_basis[fmax_dtype_device], spec) diff --git a/train/process_ckpt.py b/train/process_ckpt.py index 38f94b3..b4c8eb0 100644 --- a/train/process_ckpt.py +++ b/train/process_ckpt.py @@ -1,101 +1,248 @@ -import torch,traceback,os,pdb +import torch, traceback, os, pdb from collections import OrderedDict -def savee(ckpt,sr,if_f0,name,epoch): + +def savee(ckpt, sr, if_f0, name, epoch): try: opt = OrderedDict() opt["weight"] = {} for key in ckpt.keys(): - if ("enc_q" in key): continue + if "enc_q" in key: + continue opt["weight"][key] = ckpt[key].half() - if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000] - elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4,4], 109, 256, 48000] - elif(sr=="32k"):opt["config"] 
= [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000] - opt["info"] = "%sepoch"%epoch + if sr == "40k": + opt["config"] = [ + 1025, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 10, 2, 2], + 512, + [16, 16, 4, 4], + 109, + 256, + 40000, + ] + elif sr == "48k": + opt["config"] = [ + 1025, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 6, 2, 2, 2], + 512, + [16, 16, 4, 4, 4], + 109, + 256, + 48000, + ] + elif sr == "32k": + opt["config"] = [ + 513, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 4, 2, 2, 2], + 512, + [16, 16, 4, 4, 4], + 109, + 256, + 32000, + ] + opt["info"] = "%sepoch" % epoch opt["sr"] = sr - opt["f0"] =if_f0 - torch.save(opt, "weights/%s.pth"%name) + opt["f0"] = if_f0 + torch.save(opt, "weights/%s.pth" % name) return "Success." except: return traceback.format_exc() + def show_info(path): try: a = torch.load(path, map_location="cpu") - return "模型信息:%s\n采样率:%s\n模型是否输入音高引导:%s"%(a.get("info","None"),a.get("sr","None"),a.get("f0","None"),) + return "模型信息:%s\n采样率:%s\n模型是否输入音高引导:%s" % ( + a.get("info", "None"), + a.get("sr", "None"), + a.get("f0", "None"), + ) except: return traceback.format_exc() -def extract_small_model(path,name,sr,if_f0,info): + +def extract_small_model(path, name, sr, if_f0, info): try: ckpt = torch.load(path, map_location="cpu") - if("model"in ckpt):ckpt=ckpt["model"] + if "model" in ckpt: + ckpt = ckpt["model"] opt = OrderedDict() opt["weight"] = {} for key in ckpt.keys(): - if ("enc_q" in key): continue + if "enc_q" in key: + continue opt["weight"][key] = ckpt[key].half() - if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000] - elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4,4], 109, 256, 48000] - elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000] - if(info==""):info="Extracted model." + if sr == "40k": + opt["config"] = [ + 1025, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 10, 2, 2], + 512, + [16, 16, 4, 4], + 109, + 256, + 40000, + ] + elif sr == "48k": + opt["config"] = [ + 1025, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 6, 2, 2, 2], + 512, + [16, 16, 4, 4, 4], + 109, + 256, + 48000, + ] + elif sr == "32k": + opt["config"] = [ + 513, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 4, 2, 2, 2], + 512, + [16, 16, 4, 4, 4], + 109, + 256, + 32000, + ] + if info == "": + info = "Extracted model." opt["info"] = info opt["sr"] = sr - opt["f0"] =int(if_f0) - torch.save(opt, "weights/%s.pth"%name) + opt["f0"] = int(if_f0) + torch.save(opt, "weights/%s.pth" % name) return "Success." 
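# The three sr branches above build the same 18-slot config and differ only
# in slots 0 (spectral bins), 12 (upsample rates), 14 (upsample kernel
# sizes), and 17 (target sample rate). A table-driven sketch (hypothetical
# helper, not part of this patch, assuming the other 14 entries stay fixed):
#
#     _COMMON = [32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11],
#                [[1, 3, 5], [1, 3, 5], [1, 3, 5]]]
#     _PER_SR = {
#         "40k": (1025, [10, 10, 2, 2], [16, 16, 4, 4], 40000),
#         "48k": (1025, [10, 6, 2, 2, 2], [16, 16, 4, 4, 4], 48000),
#         "32k": (513, [10, 4, 2, 2, 2], [16, 16, 4, 4, 4], 32000),
#     }
#
#     def make_config(sr):
#         bins, ups, kernels, rate = _PER_SR[sr]
#         return [bins] + _COMMON + [ups, 512, kernels, 109, 256, rate]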
except: return traceback.format_exc() -def change_info(path,info,name): + +def change_info(path, info, name): try: ckpt = torch.load(path, map_location="cpu") - ckpt["info"]=info - if(name==""):name=os.path.basename(path) - torch.save(ckpt, "weights/%s"%name) + ckpt["info"] = info + if name == "": + name = os.path.basename(path) + torch.save(ckpt, "weights/%s" % name) return "Success." except: return traceback.format_exc() -def merge(path1,path2,alpha1,sr,f0,info,name): + +def merge(path1, path2, alpha1, sr, f0, info, name): try: + def extract(ckpt): a = ckpt["model"] opt = OrderedDict() opt["weight"] = {} for key in a.keys(): - if ("enc_q" in key): continue + if "enc_q" in key: + continue opt["weight"][key] = a[key] return opt + ckpt1 = torch.load(path1, map_location="cpu") ckpt2 = torch.load(path2, map_location="cpu") cfg = ckpt1["config"] - if("model"in ckpt1): ckpt1=extract(ckpt1) - else: ckpt1=ckpt1["weight"] - if("model"in ckpt2): ckpt2=extract(ckpt2) - else: ckpt2=ckpt2["weight"] - if(sorted(list(ckpt1.keys()))!=sorted(list(ckpt2.keys()))):return "Fail to merge the models. The model architectures are not the same." + if "model" in ckpt1: + ckpt1 = extract(ckpt1) + else: + ckpt1 = ckpt1["weight"] + if "model" in ckpt2: + ckpt2 = extract(ckpt2) + else: + ckpt2 = ckpt2["weight"] + if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())): + return "Fail to merge the models. The model architectures are not the same." opt = OrderedDict() opt["weight"] = {} for key in ckpt1.keys(): # try: - if(key=="emb_g.weight"and ckpt1[key].shape!=ckpt2[key].shape): - min_shape0=min(ckpt1[key].shape[0],ckpt2[key].shape[0]) - opt["weight"][key] = (alpha1 * (ckpt1[key][:min_shape0].float()) + (1 - alpha1) * (ckpt2[key][:min_shape0].float())).half() - else: - opt["weight"][key] = (alpha1*(ckpt1[key].float())+(1-alpha1)*(ckpt2[key].float())).half() - # except: - # pdb.set_trace() + if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape: + min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0]) + opt["weight"][key] = ( + alpha1 * (ckpt1[key][:min_shape0].float()) + + (1 - alpha1) * (ckpt2[key][:min_shape0].float()) + ).half() + else: + opt["weight"][key] = ( + alpha1 * (ckpt1[key].float()) + (1 - alpha1) * (ckpt2[key].float()) + ).half() + # except: + # pdb.set_trace() opt["config"] = cfg - ''' + """ if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 40000] elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4], 109, 256, 48000] elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000] - ''' - opt["sr"]=sr - opt["f0"]=1 if f0=="是"else 0 - opt["info"]=info - torch.save(opt, "weights/%s.pth"%name) + """ + opt["sr"] = sr + opt["f0"] = 1 if f0 == "是" else 0 + opt["info"] = info + torch.save(opt, "weights/%s.pth" % name) return "Success." 
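# merge() above is a plain convex blend of two state dicts:
# merged = alpha1 * w1 + (1 - alpha1) * w2, computed in fp32 and stored back
# as fp16 (emb_g.weight is first truncated to the smaller speaker count when
# the shapes differ). A minimal standalone illustration with hypothetical
# tensors (sketch only, not part of this patch):
#
#     import torch
#     w1 = torch.full((2, 2), 1.0)
#     w2 = torch.full((2, 2), 3.0)
#     alpha1 = 0.25
#     merged = (alpha1 * w1.float() + (1 - alpha1) * w2.float()).half()
#     # every element is 0.25 * 1.0 + 0.75 * 3.0 == 2.5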
except: return traceback.format_exc() diff --git a/train/utils.py b/train/utils.py index 71528d6..5ce030a 100644 --- a/train/utils.py +++ b/train/utils.py @@ -1,4 +1,4 @@ -import os,traceback +import os, traceback import glob import sys import argparse @@ -14,44 +14,53 @@ MATPLOTLIB_FLAG = False logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) logger = logging -def load_checkpoint_d(checkpoint_path, combd,sbd, optimizer=None,load_opt=1): - assert os.path.isfile(checkpoint_path) - checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') - ################## - def go(model,bkey): - saved_state_dict = checkpoint_dict[bkey] - if hasattr(model, 'module'):state_dict = model.module.state_dict() - else:state_dict = model.state_dict() - new_state_dict= {} - for k, v in state_dict.items():#模型需要的shape - try: - new_state_dict[k] = saved_state_dict[k] - if(saved_state_dict[k].shape!=state_dict[k].shape): - print("shape-%s-mismatch|need-%s|get-%s"%(k,state_dict[k].shape,saved_state_dict[k].shape))# - raise KeyError - except: - # logger.info(traceback.format_exc()) - logger.info("%s is not in the checkpoint" % k)#pretrain缺失的 - new_state_dict[k] = v#模型自带的随机值 - if hasattr(model, 'module'): - model.module.load_state_dict(new_state_dict,strict=False) - else: - model.load_state_dict(new_state_dict,strict=False) - go(combd,"combd") - go(sbd,"sbd") - ############# - logger.info("Loaded model weights") +def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") - iteration = checkpoint_dict['iteration'] - learning_rate = checkpoint_dict['learning_rate'] - if optimizer is not None and load_opt==1:###加载不了,如果是空的的话,重新初始化,可能还会影响lr时间表的更新,因此在train文件最外围catch - # try: - optimizer.load_state_dict(checkpoint_dict['optimizer']) - # except: - # traceback.print_exc() - logger.info("Loaded checkpoint '{}' (epoch {})" .format(checkpoint_path, iteration)) - return model, optimizer, learning_rate, iteration + ################## + def go(model, bkey): + saved_state_dict = checkpoint_dict[bkey] + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): # 模型需要的shape + try: + new_state_dict[k] = saved_state_dict[k] + if saved_state_dict[k].shape != state_dict[k].shape: + print( + "shape-%s-mismatch|need-%s|get-%s" + % (k, state_dict[k].shape, saved_state_dict[k].shape) + ) # + raise KeyError + except: + # logger.info(traceback.format_exc()) + logger.info("%s is not in the checkpoint" % k) # pretrain缺失的 + new_state_dict[k] = v # 模型自带的随机值 + if hasattr(model, "module"): + model.module.load_state_dict(new_state_dict, strict=False) + else: + model.load_state_dict(new_state_dict, strict=False) + + go(combd, "combd") + go(sbd, "sbd") + ############# + logger.info("Loaded model weights") + + iteration = checkpoint_dict["iteration"] + learning_rate = checkpoint_dict["learning_rate"] + if ( + optimizer is not None and load_opt == 1 + ): ###加载不了,如果是空的的话,重新初始化,可能还会影响lr时间表的更新,因此在train文件最外围catch + # try: + optimizer.load_state_dict(checkpoint_dict["optimizer"]) + # except: + # traceback.print_exc() + logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration)) + return model, optimizer, learning_rate, iteration # def load_checkpoint(checkpoint_path, model, optimizer=None): @@ -83,303 +92,380 @@ def load_checkpoint_d(checkpoint_path, combd,sbd, 
optimizer=None,load_opt=1): # logger.info("Loaded checkpoint '{}' (epoch {})" .format( # checkpoint_path, iteration)) # return model, optimizer, learning_rate, iteration -def load_checkpoint(checkpoint_path, model, optimizer=None,load_opt=1): - assert os.path.isfile(checkpoint_path) - checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') +def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") - saved_state_dict = checkpoint_dict['model'] - if hasattr(model, 'module'): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - new_state_dict= {} - for k, v in state_dict.items():#模型需要的shape - try: - new_state_dict[k] = saved_state_dict[k] - if(saved_state_dict[k].shape!=state_dict[k].shape): - print("shape-%s-mismatch|need-%s|get-%s"%(k,state_dict[k].shape,saved_state_dict[k].shape))# - raise KeyError - except: - # logger.info(traceback.format_exc()) - logger.info("%s is not in the checkpoint" % k)#pretrain缺失的 - new_state_dict[k] = v#模型自带的随机值 - if hasattr(model, 'module'): - model.module.load_state_dict(new_state_dict,strict=False) - else: - model.load_state_dict(new_state_dict,strict=False) - logger.info("Loaded model weights") + saved_state_dict = checkpoint_dict["model"] + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): # 模型需要的shape + try: + new_state_dict[k] = saved_state_dict[k] + if saved_state_dict[k].shape != state_dict[k].shape: + print( + "shape-%s-mismatch|need-%s|get-%s" + % (k, state_dict[k].shape, saved_state_dict[k].shape) + ) # + raise KeyError + except: + # logger.info(traceback.format_exc()) + logger.info("%s is not in the checkpoint" % k) # pretrain缺失的 + new_state_dict[k] = v # 模型自带的随机值 + if hasattr(model, "module"): + model.module.load_state_dict(new_state_dict, strict=False) + else: + model.load_state_dict(new_state_dict, strict=False) + logger.info("Loaded model weights") - iteration = checkpoint_dict['iteration'] - learning_rate = checkpoint_dict['learning_rate'] - if optimizer is not None and load_opt==1:###加载不了,如果是空的的话,重新初始化,可能还会影响lr时间表的更新,因此在train文件最外围catch - # try: - optimizer.load_state_dict(checkpoint_dict['optimizer']) - # except: - # traceback.print_exc() - logger.info("Loaded checkpoint '{}' (epoch {})" .format(checkpoint_path, iteration)) - return model, optimizer, learning_rate, iteration + iteration = checkpoint_dict["iteration"] + learning_rate = checkpoint_dict["learning_rate"] + if ( + optimizer is not None and load_opt == 1 + ): ###加载不了,如果是空的的话,重新初始化,可能还会影响lr时间表的更新,因此在train文件最外围catch + # try: + optimizer.load_state_dict(checkpoint_dict["optimizer"]) + # except: + # traceback.print_exc() + logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration)) + return model, optimizer, learning_rate, iteration def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): - logger.info("Saving model and optimizer state at epoch {} to {}".format( - iteration, checkpoint_path)) - if hasattr(model, 'module'): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save({'model': state_dict, - 'iteration': iteration, - 'optimizer': optimizer.state_dict(), - 'learning_rate': learning_rate}, checkpoint_path) + logger.info( + "Saving model and optimizer state at epoch {} to {}".format( + iteration, checkpoint_path + 
) + ) + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + torch.save( + { + "model": state_dict, + "iteration": iteration, + "optimizer": optimizer.state_dict(), + "learning_rate": learning_rate, + }, + checkpoint_path, + ) + + def save_checkpoint_d(combd, sbd, optimizer, learning_rate, iteration, checkpoint_path): - logger.info("Saving model and optimizer state at epoch {} to {}".format( - iteration, checkpoint_path)) - if hasattr(combd, 'module'): state_dict_combd = combd.module.state_dict() - else:state_dict_combd = combd.state_dict() - if hasattr(sbd, 'module'): state_dict_sbd = sbd.module.state_dict() - else:state_dict_sbd = sbd.state_dict() - torch.save({ - 'combd': state_dict_combd, - 'sbd': state_dict_sbd, - 'iteration': iteration, - 'optimizer': optimizer.state_dict(), - 'learning_rate': learning_rate}, checkpoint_path) + logger.info( + "Saving model and optimizer state at epoch {} to {}".format( + iteration, checkpoint_path + ) + ) + if hasattr(combd, "module"): + state_dict_combd = combd.module.state_dict() + else: + state_dict_combd = combd.state_dict() + if hasattr(sbd, "module"): + state_dict_sbd = sbd.module.state_dict() + else: + state_dict_sbd = sbd.state_dict() + torch.save( + { + "combd": state_dict_combd, + "sbd": state_dict_sbd, + "iteration": iteration, + "optimizer": optimizer.state_dict(), + "learning_rate": learning_rate, + }, + checkpoint_path, + ) -def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050): - for k, v in scalars.items(): - writer.add_scalar(k, v, global_step) - for k, v in histograms.items(): - writer.add_histogram(k, v, global_step) - for k, v in images.items(): - writer.add_image(k, v, global_step, dataformats='HWC') - for k, v in audios.items(): - writer.add_audio(k, v, global_step, audio_sampling_rate) +def summarize( + writer, + global_step, + scalars={}, + histograms={}, + images={}, + audios={}, + audio_sampling_rate=22050, +): + for k, v in scalars.items(): + writer.add_scalar(k, v, global_step) + for k, v in histograms.items(): + writer.add_histogram(k, v, global_step) + for k, v in images.items(): + writer.add_image(k, v, global_step, dataformats="HWC") + for k, v in audios.items(): + writer.add_audio(k, v, global_step, audio_sampling_rate) def latest_checkpoint_path(dir_path, regex="G_*.pth"): - f_list = glob.glob(os.path.join(dir_path, regex)) - f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) - x = f_list[-1] - print(x) - return x + f_list = glob.glob(os.path.join(dir_path, regex)) + f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + x = f_list[-1] + print(x) + return x def plot_spectrogram_to_numpy(spectrogram): - global MATPLOTLIB_FLAG - if not MATPLOTLIB_FLAG: - import matplotlib - matplotlib.use("Agg") - MATPLOTLIB_FLAG = True - mpl_logger = logging.getLogger('matplotlib') - mpl_logger.setLevel(logging.WARNING) - import matplotlib.pylab as plt - import numpy as np - - fig, ax = plt.subplots(figsize=(10,2)) - im = ax.imshow(spectrogram, aspect="auto", origin="lower", - interpolation='none') - plt.colorbar(im, ax=ax) - plt.xlabel("Frames") - plt.ylabel("Channels") - plt.tight_layout() + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib - fig.canvas.draw() - data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') - data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) - plt.close() - return data + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + 
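# latest_checkpoint_path above picks the newest file by stripping every
# non-digit from the matched path and comparing the result as an integer, so
# "G_900.pth" correctly sorts before "G_1000.pth" where a plain string sort
# would put G_1000 first. A quick check (sketch only, not part of this patch):
#
#     sorted(["G_900.pth", "G_1000.pth"])                  # lexicographic
#     # -> ["G_1000.pth", "G_900.pth"]  (wrong "latest")
#     sorted(["G_900.pth", "G_1000.pth"],
#            key=lambda f: int("".join(filter(str.isdigit, f))))
#     # -> ["G_900.pth", "G_1000.pth"]  (numeric, G_1000 is latest)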
mpl_logger = logging.getLogger("matplotlib") + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data def plot_alignment_to_numpy(alignment, info=None): - global MATPLOTLIB_FLAG - if not MATPLOTLIB_FLAG: - import matplotlib - matplotlib.use("Agg") - MATPLOTLIB_FLAG = True - mpl_logger = logging.getLogger('matplotlib') - mpl_logger.setLevel(logging.WARNING) - import matplotlib.pylab as plt - import numpy as np + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib - fig, ax = plt.subplots(figsize=(6, 4)) - im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower', - interpolation='none') - fig.colorbar(im, ax=ax) - xlabel = 'Decoder timestep' - if info is not None: - xlabel += '\n\n' + info - plt.xlabel(xlabel) - plt.ylabel('Encoder timestep') - plt.tight_layout() + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger("matplotlib") + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np - fig.canvas.draw() - data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') - data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) - plt.close() - return data + fig, ax = plt.subplots(figsize=(6, 4)) + im = ax.imshow( + alignment.transpose(), aspect="auto", origin="lower", interpolation="none" + ) + fig.colorbar(im, ax=ax) + xlabel = "Decoder timestep" + if info is not None: + xlabel += "\n\n" + info + plt.xlabel(xlabel) + plt.ylabel("Encoder timestep") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data def load_wav_to_torch(full_path): - sampling_rate, data = read(full_path) - return torch.FloatTensor(data.astype(np.float32)), sampling_rate + sampling_rate, data = read(full_path) + return torch.FloatTensor(data.astype(np.float32)), sampling_rate def load_filepaths_and_text(filename, split="|"): - with open(filename, encoding='utf-8') as f: - filepaths_and_text = [line.strip().split(split) for line in f] - return filepaths_and_text + with open(filename, encoding="utf-8") as f: + filepaths_and_text = [line.strip().split(split) for line in f] + return filepaths_and_text def get_hparams(init=True): - ''' -todo: - 结尾七人组: - 保存频率、总epoch done - bs done - pretrainG、pretrainD done - 卡号:os.en["CUDA_VISIBLE_DEVICES"] done - if_latest todo - 模型:if_f0 todo - 采样率:自动选择config done - 是否缓存数据集进GPU:if_cache_data_in_gpu done + """ + todo: + 结尾七人组: + 保存频率、总epoch done + bs done + pretrainG、pretrainD done + 卡号:os.en["CUDA_VISIBLE_DEVICES"] done + if_latest todo + 模型:if_f0 todo + 采样率:自动选择config done + 是否缓存数据集进GPU:if_cache_data_in_gpu done - -m: - 自动决定training_files路径,改掉train_nsf_load_pretrain.py里的hps.data.training_files done - -c不要了 - ''' - parser = argparse.ArgumentParser() - # parser.add_argument('-c', '--config', type=str, default="configs/40k.json",help='JSON file for configuration') - parser.add_argument('-se', '--save_every_epoch', type=int, required=True,help='checkpoint save frequency (epoch)') - 
parser.add_argument('-te', '--total_epoch', type=int, required=True,help='total_epoch')
-  parser.add_argument('-pg', '--pretrainG', type=str, default="",help='Pretrained Discriminator path')
-  parser.add_argument('-pd', '--pretrainD', type=str, default="",help='Pretrained Generator path')
-  parser.add_argument('-g', '--gpus', type=str, default="0",help='split by -')
-  parser.add_argument('-bs', '--batch_size', type=int, required=True,help='batch size')
-  parser.add_argument('-e', '--experiment_dir', type=str, required=True,help='experiment dir')#-m
-  parser.add_argument('-sr', '--sample_rate', type=str, required=True,help='sample rate, 32k/40k/48k')
-  parser.add_argument('-f0', '--if_f0', type=int, required=True,help='use f0 as one of the inputs of the model, 1 or 0')
-  parser.add_argument('-l', '--if_latest', type=int, required=True,help='if only save the latest G/D pth file, 1 or 0')
-  parser.add_argument('-c', '--if_cache_data_in_gpu', type=int, required=True,help='if caching the dataset in GPU memory, 1 or 0')
+    -m:
+        自动决定training_files路径,改掉train_nsf_load_pretrain.py里的hps.data.training_files        done
+        -c不要了
+    """
+    parser = argparse.ArgumentParser()
+    # parser.add_argument('-c', '--config', type=str, default="configs/40k.json",help='JSON file for configuration')
+    parser.add_argument(
+        "-se",
+        "--save_every_epoch",
+        type=int,
+        required=True,
+        help="checkpoint save frequency (epoch)",
+    )
+    parser.add_argument(
+        "-te", "--total_epoch", type=int, required=True, help="total training epochs"
+    )
+    parser.add_argument(
+        "-pg", "--pretrainG", type=str, default="", help="Pretrained Generator path"
+    )
+    parser.add_argument(
+        "-pd", "--pretrainD", type=str, default="", help="Pretrained Discriminator path"
+    )
+    parser.add_argument(
+        "-g", "--gpus", type=str, default="0", help="gpu ids separated by -"
+    )
+    parser.add_argument(
+        "-bs", "--batch_size", type=int, required=True, help="batch size"
+    )
+    parser.add_argument(
+        "-e", "--experiment_dir", type=str, required=True, help="experiment dir"
+    )  # -m
+    parser.add_argument(
+        "-sr", "--sample_rate", type=str, required=True, help="sample rate, 32k/40k/48k"
+    )
+    parser.add_argument(
+        "-f0",
+        "--if_f0",
+        type=int,
+        required=True,
+        help="use f0 as one of the inputs of the model, 1 or 0",
+    )
+    parser.add_argument(
+        "-l",
+        "--if_latest",
+        type=int,
+        required=True,
+        help="if only save the latest G/D pth file, 1 or 0",
+    )
+    parser.add_argument(
+        "-c",
+        "--if_cache_data_in_gpu",
+        type=int,
+        required=True,
+        help="if caching the dataset in GPU memory, 1 or 0",
+    )
 
-  args = parser.parse_args()
-  name = args.experiment_dir
-  experiment_dir = os.path.join("./logs", args.experiment_dir)
+    args = parser.parse_args()
+    name = args.experiment_dir
+    experiment_dir = os.path.join("./logs", args.experiment_dir)
 
-  if not os.path.exists(experiment_dir):
-    os.makedirs(experiment_dir)
+    if not os.path.exists(experiment_dir):
+        os.makedirs(experiment_dir)
 
-  config_path = "configs/%s.json"%args.sample_rate
-  config_save_path = os.path.join(experiment_dir, "config.json")
-  if init:
-    with open(config_path, "r") as f:
-      data = f.read()
-    with open(config_save_path, "w") as f:
-      f.write(data)
-  else:
-    with open(config_save_path, "r") as f:
-      data = f.read()
-  config = json.loads(data)
+    config_path = "configs/%s.json" % args.sample_rate
+    config_save_path = os.path.join(experiment_dir, "config.json")
+    if init:
+        with open(config_path, "r") as f:
+            data = f.read()
+        with open(config_save_path, "w") as f:
+            f.write(data)
+    else:
+        with open(config_save_path, "r") as f:
+            data 
= f.read() + config = json.loads(data) - hparams = HParams(**config) - hparams.model_dir = hparams.experiment_dir = experiment_dir - hparams.save_every_epoch = args.save_every_epoch - hparams.name = name - hparams.total_epoch = args.total_epoch - hparams.pretrainG = args.pretrainG - hparams.pretrainD = args.pretrainD - hparams.gpus = args.gpus - hparams.train.batch_size = args.batch_size - hparams.sample_rate = args.sample_rate - hparams.if_f0 = args.if_f0 - hparams.if_latest = args.if_latest - hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu - hparams.data.training_files = "%s/filelist.txt"%experiment_dir - return hparams + hparams = HParams(**config) + hparams.model_dir = hparams.experiment_dir = experiment_dir + hparams.save_every_epoch = args.save_every_epoch + hparams.name = name + hparams.total_epoch = args.total_epoch + hparams.pretrainG = args.pretrainG + hparams.pretrainD = args.pretrainD + hparams.gpus = args.gpus + hparams.train.batch_size = args.batch_size + hparams.sample_rate = args.sample_rate + hparams.if_f0 = args.if_f0 + hparams.if_latest = args.if_latest + hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu + hparams.data.training_files = "%s/filelist.txt" % experiment_dir + return hparams def get_hparams_from_dir(model_dir): - config_save_path = os.path.join(model_dir, "config.json") - with open(config_save_path, "r") as f: - data = f.read() - config = json.loads(data) + config_save_path = os.path.join(model_dir, "config.json") + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) - hparams =HParams(**config) - hparams.model_dir = model_dir - return hparams + hparams = HParams(**config) + hparams.model_dir = model_dir + return hparams def get_hparams_from_file(config_path): - with open(config_path, "r") as f: - data = f.read() - config = json.loads(data) + with open(config_path, "r") as f: + data = f.read() + config = json.loads(data) - hparams =HParams(**config) - return hparams + hparams = HParams(**config) + return hparams def check_git_hash(model_dir): - source_dir = os.path.dirname(os.path.realpath(__file__)) - if not os.path.exists(os.path.join(source_dir, ".git")): - logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format( - source_dir - )) - return + source_dir = os.path.dirname(os.path.realpath(__file__)) + if not os.path.exists(os.path.join(source_dir, ".git")): + logger.warn( + "{} is not a git repository, therefore hash value comparison will be ignored.".format( + source_dir + ) + ) + return - cur_hash = subprocess.getoutput("git rev-parse HEAD") + cur_hash = subprocess.getoutput("git rev-parse HEAD") - path = os.path.join(model_dir, "githash") - if os.path.exists(path): - saved_hash = open(path).read() - if saved_hash != cur_hash: - logger.warn("git hash values are different. {}(saved) != {}(current)".format( - saved_hash[:8], cur_hash[:8])) - else: - open(path, "w").write(cur_hash) + path = os.path.join(model_dir, "githash") + if os.path.exists(path): + saved_hash = open(path).read() + if saved_hash != cur_hash: + logger.warn( + "git hash values are different. 
{}(saved) != {}(current)".format( + saved_hash[:8], cur_hash[:8] + ) + ) + else: + open(path, "w").write(cur_hash) def get_logger(model_dir, filename="train.log"): - global logger - logger = logging.getLogger(os.path.basename(model_dir)) - logger.setLevel(logging.DEBUG) - - formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") - if not os.path.exists(model_dir): - os.makedirs(model_dir) - h = logging.FileHandler(os.path.join(model_dir, filename)) - h.setLevel(logging.DEBUG) - h.setFormatter(formatter) - logger.addHandler(h) - return logger + global logger + logger = logging.getLogger(os.path.basename(model_dir)) + logger.setLevel(logging.DEBUG) + + formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") + if not os.path.exists(model_dir): + os.makedirs(model_dir) + h = logging.FileHandler(os.path.join(model_dir, filename)) + h.setLevel(logging.DEBUG) + h.setFormatter(formatter) + logger.addHandler(h) + return logger -class HParams(): - def __init__(self, **kwargs): - for k, v in kwargs.items(): - if type(v) == dict: - v = HParams(**v) - self[k] = v - - def keys(self): - return self.__dict__.keys() +class HParams: + def __init__(self, **kwargs): + for k, v in kwargs.items(): + if type(v) == dict: + v = HParams(**v) + self[k] = v - def items(self): - return self.__dict__.items() + def keys(self): + return self.__dict__.keys() - def values(self): - return self.__dict__.values() + def items(self): + return self.__dict__.items() - def __len__(self): - return len(self.__dict__) + def values(self): + return self.__dict__.values() - def __getitem__(self, key): - return getattr(self, key) + def __len__(self): + return len(self.__dict__) - def __setitem__(self, key, value): - return setattr(self, key, value) + def __getitem__(self, key): + return getattr(self, key) - def __contains__(self, key): - return key in self.__dict__ + def __setitem__(self, key, value): + return setattr(self, key, value) - def __repr__(self): - return self.__dict__.__repr__() + def __contains__(self, key): + return key in self.__dict__ + + def __repr__(self): + return self.__dict__.__repr__() diff --git a/train_nsf_sim_cache_sid_load_pretrain.py b/train_nsf_sim_cache_sid_load_pretrain.py index d2d9c35..1735201 100644 --- a/train_nsf_sim_cache_sid_load_pretrain.py +++ b/train_nsf_sim_cache_sid_load_pretrain.py @@ -1,12 +1,15 @@ -import sys,os -now_dir=os.getcwd() -sys.path.append(os.path.join(now_dir,"train")) +import sys, os + +now_dir = os.getcwd() +sys.path.append(os.path.join(now_dir, "train")) import utils + hps = utils.get_hparams() -os.environ["CUDA_VISIBLE_DEVICES"]=hps.gpus.replace("-",",") -n_gpus=len(hps.gpus.split("-")) +os.environ["CUDA_VISIBLE_DEVICES"] = hps.gpus.replace("-", ",") +n_gpus = len(hps.gpus.split("-")) from random import shuffle -import traceback,json,argparse,itertools,math,torch,pdb +import traceback, json, argparse, itertools, math, torch, pdb + torch.backends.cudnn.deterministic = False torch.backends.cudnn.benchmark = False from torch import nn, optim @@ -20,9 +23,16 @@ from torch.cuda.amp import autocast, GradScaler from infer_pack import commons from time import time as ttime -from data_utils import TextAudioLoaderMultiNSFsid,TextAudioLoader, TextAudioCollateMultiNSFsid,TextAudioCollate, DistributedBucketSampler +from data_utils import ( + TextAudioLoaderMultiNSFsid, + TextAudioLoader, + TextAudioCollateMultiNSFsid, + TextAudioCollate, + DistributedBucketSampler, +) from infer_pack.models import ( - 
SynthesizerTrnMs256NSFsid,SynthesizerTrnMs256NSFsid_nono, + SynthesizerTrnMs256NSFsid, + SynthesizerTrnMs256NSFsid_nono, MultiPeriodDiscriminator, ) from losses import generator_loss, discriminator_loss, feature_loss, kl_loss @@ -32,13 +42,11 @@ from mel_processing import mel_spectrogram_torch, spec_to_mel_torch global_step = 0 - def main(): # n_gpus = torch.cuda.device_count() os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = "5555" - mp.spawn( run, nprocs=n_gpus, @@ -62,13 +70,16 @@ def run(rank, n_gpus, hps): backend="gloo", init_method="env://", world_size=n_gpus, rank=rank ) torch.manual_seed(hps.train.seed) - if torch.cuda.is_available(): torch.cuda.set_device(rank) + if torch.cuda.is_available(): + torch.cuda.set_device(rank) - if (hps.if_f0 == 1):train_dataset = TextAudioLoaderMultiNSFsid(hps.data.training_files, hps.data) - else:train_dataset = TextAudioLoader(hps.data.training_files, hps.data) + if hps.if_f0 == 1: + train_dataset = TextAudioLoaderMultiNSFsid(hps.data.training_files, hps.data) + else: + train_dataset = TextAudioLoader(hps.data.training_files, hps.data) train_sampler = DistributedBucketSampler( train_dataset, - hps.train.batch_size*n_gpus, + hps.train.batch_size * n_gpus, # [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200,1400], # 16s [100, 200, 300, 400, 500, 600, 700, 800, 900], # 16s num_replicas=n_gpus, @@ -77,8 +88,10 @@ def run(rank, n_gpus, hps): ) # It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit. # num_workers=8 -> num_workers=4 - if (hps.if_f0 == 1):collate_fn = TextAudioCollateMultiNSFsid() - else:collate_fn = TextAudioCollate() + if hps.if_f0 == 1: + collate_fn = TextAudioCollateMultiNSFsid() + else: + collate_fn = TextAudioCollate() train_loader = DataLoader( train_dataset, num_workers=4, @@ -89,13 +102,26 @@ def run(rank, n_gpus, hps): persistent_workers=True, prefetch_factor=8, ) - if(hps.if_f0==1): - net_g = SynthesizerTrnMs256NSFsid(hps.data.filter_length // 2 + 1,hps.train.segment_size // hps.data.hop_length,**hps.model,is_half=hps.train.fp16_run,sr=hps.sample_rate) + if hps.if_f0 == 1: + net_g = SynthesizerTrnMs256NSFsid( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model, + is_half=hps.train.fp16_run, + sr=hps.sample_rate, + ) else: - net_g = SynthesizerTrnMs256NSFsid_nono(hps.data.filter_length // 2 + 1,hps.train.segment_size // hps.data.hop_length,**hps.model,is_half=hps.train.fp16_run) - if torch.cuda.is_available(): net_g = net_g.cuda(rank) + net_g = SynthesizerTrnMs256NSFsid_nono( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model, + is_half=hps.train.fp16_run, + ) + if torch.cuda.is_available(): + net_g = net_g.cuda(rank) net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm) - if torch.cuda.is_available(): net_d = net_d.cuda(rank) + if torch.cuda.is_available(): + net_d = net_d.cuda(rank) optim_g = torch.optim.AdamW( net_g.parameters(), hps.train.learning_rate, @@ -110,30 +136,42 @@ def run(rank, n_gpus, hps): ) # net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True) # net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True) - if torch.cuda.is_available(): + if torch.cuda.is_available(): net_g = DDP(net_g, device_ids=[rank]) net_d = DDP(net_d, device_ids=[rank]) else: net_g = DDP(net_g) net_d = DDP(net_d) - try:#如果能加载自动resume - _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), 
net_d, optim_d) # D多半加载没事 + try: # 如果能加载自动resume + _, _, _, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d + ) # D多半加载没事 if rank == 0: logger.info("loaded D") # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0) - _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g) + _, _, _, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g + ) global_step = (epoch_str - 1) * len(train_loader) # epoch_str = 1 # global_step = 0 - except:#如果首次不能加载,加载pretrain + except: # 如果首次不能加载,加载pretrain traceback.print_exc() epoch_str = 1 global_step = 0 if rank == 0: - logger.info("loaded pretrained %s %s"%(hps.pretrainG,hps.pretrainD)) - print(net_g.module.load_state_dict(torch.load(hps.pretrainG,map_location="cpu")["model"]))##测试不加载优化器 - print(net_d.module.load_state_dict(torch.load(hps.pretrainD,map_location="cpu")["model"])) + logger.info("loaded pretrained %s %s" % (hps.pretrainG, hps.pretrainD)) + print( + net_g.module.load_state_dict( + torch.load(hps.pretrainG, map_location="cpu")["model"] + ) + ) ##测试不加载优化器 + print( + net_d.module.load_state_dict( + torch.load(hps.pretrainD, map_location="cpu")["model"] + ) + ) scheduler_g = torch.optim.lr_scheduler.ExponentialLR( optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2 @@ -144,7 +182,7 @@ def run(rank, n_gpus, hps): scaler = GradScaler(enabled=hps.train.fp16_run) - cache=[] + cache = [] for epoch in range(epoch_str, hps.train.epochs + 1): if rank == 0: train_and_evaluate( @@ -157,7 +195,8 @@ def run(rank, n_gpus, hps): scaler, [train_loader, None], logger, - [writer, writer_eval],cache + [writer, writer_eval], + cache, ) else: train_and_evaluate( @@ -170,14 +209,15 @@ def run(rank, n_gpus, hps): scaler, [train_loader, None], None, - None,cache + None, + cache, ) scheduler_g.step() scheduler_d.step() def train_and_evaluate( - rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers,cache + rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers, cache ): net_g, net_d = nets optim_g, optim_d = optims @@ -190,168 +230,90 @@ def train_and_evaluate( net_g.train() net_d.train() - if(cache==[]or hps.if_cache_data_in_gpu==False):#第一个epoch把cache全部填满训练集 + if cache == [] or hps.if_cache_data_in_gpu == False: # 第一个epoch把cache全部填满训练集 # print("caching") for batch_idx, info in enumerate(train_loader): - if (hps.if_f0 == 1):phone,phone_lengths,pitch,pitchf,spec,spec_lengths,wave,wave_lengths,sid=info - else:phone,phone_lengths,spec,spec_lengths,wave,wave_lengths,sid=info + if hps.if_f0 == 1: + ( + phone, + phone_lengths, + pitch, + pitchf, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ) = info + else: + phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info if torch.cuda.is_available(): - phone, phone_lengths = phone.cuda(rank, non_blocking=True), phone_lengths.cuda(rank, non_blocking=True ) - if (hps.if_f0 == 1):pitch,pitchf = pitch.cuda(rank, non_blocking=True),pitchf.cuda(rank, non_blocking=True) + phone, phone_lengths = phone.cuda( + rank, non_blocking=True + ), phone_lengths.cuda(rank, non_blocking=True) + if hps.if_f0 == 1: + pitch, pitchf = pitch.cuda(rank, non_blocking=True), pitchf.cuda( + rank, non_blocking=True + ) sid = sid.cuda(rank, non_blocking=True) - spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, 
non_blocking=True) - wave, wave_lengths = wave.cuda(rank, non_blocking=True), wave_lengths.cuda(rank, non_blocking=True) - if(hps.if_cache_data_in_gpu==True): - if (hps.if_f0 == 1):cache.append((batch_idx, (phone,phone_lengths,pitch,pitchf,spec,spec_lengths,wave,wave_lengths ,sid))) - else:cache.append((batch_idx, (phone,phone_lengths,spec,spec_lengths,wave,wave_lengths ,sid))) - with autocast(enabled=hps.train.fp16_run): - if (hps.if_f0 == 1):y_hat,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, pitch,pitchf, spec, spec_lengths,sid) - else:y_hat,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, spec, spec_lengths,sid) - mel = spec_to_mel_torch(spec,hps.data.filter_length,hps.data.n_mel_channels,hps.data.sampling_rate,hps.data.mel_fmin,hps.data.mel_fmax,) - y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length) - with autocast(enabled=False): - y_hat_mel = mel_spectrogram_torch( - y_hat.float().squeeze(1), - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.hop_length, - hps.data.win_length, - hps.data.mel_fmin, - hps.data.mel_fmax, - ) - if(hps.train.fp16_run==True): - y_hat_mel=y_hat_mel.half() - wave = commons.slice_segments( - wave, ids_slice * hps.data.hop_length, hps.train.segment_size - ) # slice - - # Discriminator - y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach()) - with autocast(enabled=False): - loss_disc, losses_disc_r, losses_disc_g = discriminator_loss( - y_d_hat_r, y_d_hat_g - ) - optim_d.zero_grad() - scaler.scale(loss_disc).backward() - scaler.unscale_(optim_d) - grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) - scaler.step(optim_d) - - with autocast(enabled=hps.train.fp16_run): - # Generator - y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat) - with autocast(enabled=False): - loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel - loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl - loss_fm = feature_loss(fmap_r, fmap_g) - loss_gen, losses_gen = generator_loss(y_d_hat_g) - loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl - optim_g.zero_grad() - scaler.scale(loss_gen_all).backward() - scaler.unscale_(optim_g) - grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) - scaler.step(optim_g) - scaler.update() - - if rank == 0: - if global_step % hps.train.log_interval == 0: - lr = optim_g.param_groups[0]["lr"] - logger.info( - "Train Epoch: {} [{:.0f}%]".format( - epoch, 100.0 * batch_idx / len(train_loader) + spec, spec_lengths = spec.cuda( + rank, non_blocking=True + ), spec_lengths.cuda(rank, non_blocking=True) + wave, wave_lengths = wave.cuda( + rank, non_blocking=True + ), wave_lengths.cuda(rank, non_blocking=True) + if hps.if_cache_data_in_gpu == True: + if hps.if_f0 == 1: + cache.append( + ( + batch_idx, + ( + phone, + phone_lengths, + pitch, + pitchf, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ), ) ) - # Amor For Tensorboard display - if loss_mel > 50: - loss_mel = 50 - if loss_kl > 5: - loss_kl = 5 - - logger.info([global_step, lr]) - logger.info( - f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f},loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}" + else: + cache.append( + ( + batch_idx, + ( + phone, + phone_lengths, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ), + ) ) - scalar_dict = { - "loss/g/total": loss_gen_all, - "loss/d/total": loss_disc, - "learning_rate": lr, - "grad_norm_d": 
grad_norm_d, - "grad_norm_g": grad_norm_g, - } - scalar_dict.update( - {"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl} - ) - - scalar_dict.update( - {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)} - ) - scalar_dict.update( - {"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)} - ) - scalar_dict.update( - {"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)} - ) - image_dict = { - "slice/mel_org": utils.plot_spectrogram_to_numpy( - y_mel[0].data.cpu().numpy() - ), - "slice/mel_gen": utils.plot_spectrogram_to_numpy( - y_hat_mel[0].data.cpu().numpy() - ), - "all/mel": utils.plot_spectrogram_to_numpy( - mel[0].data.cpu().numpy() - ), - } - utils.summarize( - writer=writer, - global_step=global_step, - images=image_dict, - scalars=scalar_dict, - ) - global_step += 1 - # if global_step % hps.train.eval_interval == 0: - if epoch % hps.save_every_epoch == 0 and rank == 0: - if(hps.if_latest==0): - utils.save_checkpoint( - net_g, - optim_g, - hps.train.learning_rate, - epoch, - os.path.join(hps.model_dir, "G_{}.pth".format(global_step)), - ) - utils.save_checkpoint( - net_d, - optim_d, - hps.train.learning_rate, - epoch, - os.path.join(hps.model_dir, "D_{}.pth".format(global_step)), - ) - else: - utils.save_checkpoint( - net_g, - optim_g, - hps.train.learning_rate, - epoch, - os.path.join(hps.model_dir, "G_{}.pth".format(2333333)), - ) - utils.save_checkpoint( - net_d, - optim_d, - hps.train.learning_rate, - epoch, - os.path.join(hps.model_dir, "D_{}.pth".format(2333333)), - ) - - else:#后续的epoch直接使用打乱的cache - shuffle(cache) - # print("using cache") - for batch_idx, info in cache: - if (hps.if_f0 == 1):phone,phone_lengths,pitch,pitchf,spec,spec_lengths,wave,wave_lengths,sid=info - else:phone,phone_lengths,spec,spec_lengths,wave,wave_lengths,sid=info with autocast(enabled=hps.train.fp16_run): - if (hps.if_f0 == 1):y_hat,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, pitch,pitchf, spec, spec_lengths,sid) - else:y_hat,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, spec, spec_lengths,sid) + if hps.if_f0 == 1: + ( + y_hat, + ids_slice, + x_mask, + z_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + ) = net_g( + phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid + ) + else: + ( + y_hat, + ids_slice, + x_mask, + z_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + ) = net_g(phone, phone_lengths, spec, spec_lengths, sid) mel = spec_to_mel_torch( spec, hps.data.filter_length, @@ -374,8 +336,200 @@ def train_and_evaluate( hps.data.mel_fmin, hps.data.mel_fmax, ) - if(hps.train.fp16_run==True): - y_hat_mel=y_hat_mel.half() + if hps.train.fp16_run == True: + y_hat_mel = y_hat_mel.half() + wave = commons.slice_segments( + wave, ids_slice * hps.data.hop_length, hps.train.segment_size + ) # slice + + # Discriminator + y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach()) + with autocast(enabled=False): + loss_disc, losses_disc_r, losses_disc_g = discriminator_loss( + y_d_hat_r, y_d_hat_g + ) + optim_d.zero_grad() + scaler.scale(loss_disc).backward() + scaler.unscale_(optim_d) + grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) + scaler.step(optim_d) + + with autocast(enabled=hps.train.fp16_run): + # Generator + y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat) + with autocast(enabled=False): + loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel + loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl + loss_fm = 
feature_loss(fmap_r, fmap_g) + loss_gen, losses_gen = generator_loss(y_d_hat_g) + loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + optim_g.zero_grad() + scaler.scale(loss_gen_all).backward() + scaler.unscale_(optim_g) + grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) + scaler.step(optim_g) + scaler.update() + + if rank == 0: + if global_step % hps.train.log_interval == 0: + lr = optim_g.param_groups[0]["lr"] + logger.info( + "Train Epoch: {} [{:.0f}%]".format( + epoch, 100.0 * batch_idx / len(train_loader) + ) + ) + # Amor For Tensorboard display + if loss_mel > 50: + loss_mel = 50 + if loss_kl > 5: + loss_kl = 5 + + logger.info([global_step, lr]) + logger.info( + f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f},loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}" + ) + scalar_dict = { + "loss/g/total": loss_gen_all, + "loss/d/total": loss_disc, + "learning_rate": lr, + "grad_norm_d": grad_norm_d, + "grad_norm_g": grad_norm_g, + } + scalar_dict.update( + { + "loss/g/fm": loss_fm, + "loss/g/mel": loss_mel, + "loss/g/kl": loss_kl, + } + ) + + scalar_dict.update( + {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)} + ) + scalar_dict.update( + { + "loss/d_r/{}".format(i): v + for i, v in enumerate(losses_disc_r) + } + ) + scalar_dict.update( + { + "loss/d_g/{}".format(i): v + for i, v in enumerate(losses_disc_g) + } + ) + image_dict = { + "slice/mel_org": utils.plot_spectrogram_to_numpy( + y_mel[0].data.cpu().numpy() + ), + "slice/mel_gen": utils.plot_spectrogram_to_numpy( + y_hat_mel[0].data.cpu().numpy() + ), + "all/mel": utils.plot_spectrogram_to_numpy( + mel[0].data.cpu().numpy() + ), + } + utils.summarize( + writer=writer, + global_step=global_step, + images=image_dict, + scalars=scalar_dict, + ) + global_step += 1 + # if global_step % hps.train.eval_interval == 0: + if epoch % hps.save_every_epoch == 0 and rank == 0: + if hps.if_latest == 0: + utils.save_checkpoint( + net_g, + optim_g, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "G_{}.pth".format(global_step)), + ) + utils.save_checkpoint( + net_d, + optim_d, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "D_{}.pth".format(global_step)), + ) + else: + utils.save_checkpoint( + net_g, + optim_g, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "G_{}.pth".format(2333333)), + ) + utils.save_checkpoint( + net_d, + optim_d, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "D_{}.pth".format(2333333)), + ) + + else: # 后续的epoch直接使用打乱的cache + shuffle(cache) + # print("using cache") + for batch_idx, info in cache: + if hps.if_f0 == 1: + ( + phone, + phone_lengths, + pitch, + pitchf, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ) = info + else: + phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info + with autocast(enabled=hps.train.fp16_run): + if hps.if_f0 == 1: + ( + y_hat, + ids_slice, + x_mask, + z_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + ) = net_g( + phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid + ) + else: + ( + y_hat, + ids_slice, + x_mask, + z_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + ) = net_g(phone, phone_lengths, spec, spec_lengths, sid) + mel = spec_to_mel_torch( + spec, + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.mel_fmin, + hps.data.mel_fmax, + ) + y_mel = commons.slice_segments( + mel, ids_slice, hps.train.segment_size // hps.data.hop_length + ) + with autocast(enabled=False): + y_hat_mel = 
mel_spectrogram_torch( + y_hat.float().squeeze(1), + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + hps.data.mel_fmin, + hps.data.mel_fmax, + ) + if hps.train.fp16_run == True: + y_hat_mel = y_hat_mel.half() wave = commons.slice_segments( wave, ids_slice * hps.data.hop_length, hps.train.segment_size ) # slice @@ -435,17 +589,27 @@ def train_and_evaluate( "grad_norm_g": grad_norm_g, } scalar_dict.update( - {"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl} + { + "loss/g/fm": loss_fm, + "loss/g/mel": loss_mel, + "loss/g/kl": loss_kl, + } ) scalar_dict.update( {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)} ) scalar_dict.update( - {"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)} + { + "loss/d_r/{}".format(i): v + for i, v in enumerate(losses_disc_r) + } ) scalar_dict.update( - {"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)} + { + "loss/d_g/{}".format(i): v + for i, v in enumerate(losses_disc_g) + } ) image_dict = { "slice/mel_org": utils.plot_spectrogram_to_numpy( @@ -467,7 +631,7 @@ def train_and_evaluate( global_step += 1 # if global_step % hps.train.eval_interval == 0: if epoch % hps.save_every_epoch == 0 and rank == 0: - if(hps.if_latest==0): + if hps.if_latest == 0: utils.save_checkpoint( net_g, optim_g, @@ -498,15 +662,20 @@ def train_and_evaluate( os.path.join(hps.model_dir, "D_{}.pth".format(2333333)), ) - if rank == 0: logger.info("====> Epoch: {}".format(epoch)) - if(epoch>=hps.total_epoch and rank == 0): + if epoch >= hps.total_epoch and rank == 0: logger.info("Training is done. The program is closed.") - from process_ckpt import savee#def savee(ckpt,sr,if_f0,name,epoch): - if hasattr(net_g, 'module'):ckpt = net_g.module.state_dict() - else:ckpt = net_g.state_dict() - logger.info("saving final ckpt:%s"%(savee(ckpt,hps.sample_rate,hps.if_f0,hps.name,epoch))) + from process_ckpt import savee # def savee(ckpt,sr,if_f0,name,epoch): + + if hasattr(net_g, "module"): + ckpt = net_g.module.state_dict() + else: + ckpt = net_g.state_dict() + logger.info( + "saving final ckpt:%s" + % (savee(ckpt, hps.sample_rate, hps.if_f0, hps.name, epoch)) + ) os._exit(2333333) diff --git a/trainset_preprocess_pipeline_print.py b/trainset_preprocess_pipeline_print.py index 37c660b..40617a1 100644 --- a/trainset_preprocess_pipeline_print.py +++ b/trainset_preprocess_pipeline_print.py @@ -1,5 +1,6 @@ -import sys,os,multiprocessing -now_dir=os.getcwd() +import sys, os, multiprocessing + +now_dir = os.getcwd() sys.path.append(now_dir) inp_root = sys.argv[1] @@ -7,15 +8,17 @@ sr = int(sys.argv[2]) n_p = int(sys.argv[3]) exp_dir = sys.argv[4] noparallel = sys.argv[5] == "True" -import numpy as np,os,traceback +import numpy as np, os, traceback from slicer2 import Slicer -import librosa,traceback -from scipy.io import wavfile +import librosa, traceback +from scipy.io import wavfile import multiprocessing from my_utils import load_audio mutex = multiprocessing.Lock() -f = open("%s/preprocess.log"%exp_dir, "a+") +f = open("%s/preprocess.log" % exp_dir, "a+") + + def println(strr): mutex.acquire() print(strr) @@ -23,81 +26,101 @@ def println(strr): f.flush() mutex.release() -class PreProcess(): - def __init__(self,sr,exp_dir): + +class PreProcess: + def __init__(self, sr, exp_dir): self.slicer = Slicer( sr=sr, threshold=-32, min_length=800, min_interval=400, hop_size=15, - max_sil_kept=150 + max_sil_kept=150, ) - self.sr=sr - self.per=3.7 - self.overlap=0.3 - 
self.tail=self.per+self.overlap - self.max=0.95 - self.alpha=0.8 - self.exp_dir=exp_dir - self.gt_wavs_dir="%s/0_gt_wavs"%exp_dir - self.wavs16k_dir="%s/1_16k_wavs"%exp_dir - os.makedirs(self.exp_dir,exist_ok=True) - os.makedirs(self.gt_wavs_dir,exist_ok=True) - os.makedirs(self.wavs16k_dir,exist_ok=True) + self.sr = sr + self.per = 3.7 + self.overlap = 0.3 + self.tail = self.per + self.overlap + self.max = 0.95 + self.alpha = 0.8 + self.exp_dir = exp_dir + self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir + self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir + os.makedirs(self.exp_dir, exist_ok=True) + os.makedirs(self.gt_wavs_dir, exist_ok=True) + os.makedirs(self.wavs16k_dir, exist_ok=True) - def norm_write(self,tmp_audio,idx0,idx1): - tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (self.max * self.alpha)) + (1 - self.alpha) * tmp_audio - wavfile.write("%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), self.sr, (tmp_audio*32768).astype(np.int16)) + def norm_write(self, tmp_audio, idx0, idx1): + tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (self.max * self.alpha)) + ( + 1 - self.alpha + ) * tmp_audio + wavfile.write( + "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), + self.sr, + (tmp_audio * 32768).astype(np.int16), + ) tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000) - wavfile.write("%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), 16000, (tmp_audio*32768).astype(np.int16)) + wavfile.write( + "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), + 16000, + (tmp_audio * 32768).astype(np.int16), + ) - def pipeline(self,path, idx0): + def pipeline(self, path, idx0): try: - audio = load_audio(path,self.sr) - idx1=0 + audio = load_audio(path, self.sr) + idx1 = 0 for audio in self.slicer.slice(audio): i = 0 - while (1): + while 1: start = int(self.sr * (self.per - self.overlap) * i) i += 1 - if (len(audio[start:]) > self.tail * self.sr): - tmp_audio = audio[start:start + int(self.per * self.sr)] - self.norm_write(tmp_audio,idx0,idx1) + if len(audio[start:]) > self.tail * self.sr: + tmp_audio = audio[start : start + int(self.per * self.sr)] + self.norm_write(tmp_audio, idx0, idx1) idx1 += 1 else: tmp_audio = audio[start:] break self.norm_write(tmp_audio, idx0, idx1) - println("%s->Suc."%path) + println("%s->Suc." % path) except: - println("%s->%s"%(path,traceback.format_exc())) + println("%s->%s" % (path, traceback.format_exc())) - def pipeline_mp(self,infos): + def pipeline_mp(self, infos): for path, idx0 in infos: - self.pipeline(path,idx0) + self.pipeline(path, idx0) - def pipeline_mp_inp_dir(self,inp_root,n_p): + def pipeline_mp_inp_dir(self, inp_root, n_p): try: - infos = [("%s/%s" % (inp_root, name), idx) for idx, name in enumerate(sorted(list(os.listdir(inp_root))))] + infos = [ + ("%s/%s" % (inp_root, name), idx) + for idx, name in enumerate(sorted(list(os.listdir(inp_root)))) + ] if noparallel: - for i in range(n_p): self.pipeline_mp(infos[i::n_p]) - else: - ps=[] for i in range(n_p): - p=multiprocessing.Process(target=self.pipeline_mp,args=(infos[i::n_p],)) + self.pipeline_mp(infos[i::n_p]) + else: + ps = [] + for i in range(n_p): + p = multiprocessing.Process( + target=self.pipeline_mp, args=(infos[i::n_p],) + ) p.start() ps.append(p) - for p in ps:p.join() + for p in ps: + p.join() except: - println("Fail. %s"%traceback.format_exc()) + println("Fail. 
%s" % traceback.format_exc()) + def preprocess_trainset(inp_root, sr, n_p, exp_dir): - pp=PreProcess(sr,exp_dir) + pp = PreProcess(sr, exp_dir) println("start preprocess") println(sys.argv) - pp.pipeline_mp_inp_dir(inp_root,n_p) + pp.pipeline_mp_inp_dir(inp_root, n_p) println("end preprocess") -if __name__=='__main__': + +if __name__ == "__main__": preprocess_trainset(inp_root, sr, n_p, exp_dir) diff --git a/uvr5_pack/lib_v5/dataset.py b/uvr5_pack/lib_v5/dataset.py index 59454aa..ba0e45b 100644 --- a/uvr5_pack/lib_v5/dataset.py +++ b/uvr5_pack/lib_v5/dataset.py @@ -10,7 +10,6 @@ from uvr5_pack.lib_v5 import spec_utils class VocalRemoverValidationSet(torch.utils.data.Dataset): - def __init__(self, patch_list): self.patch_list = patch_list @@ -21,7 +20,7 @@ class VocalRemoverValidationSet(torch.utils.data.Dataset): path = self.patch_list[idx] data = np.load(path) - X, y = data['X'], data['y'] + X, y = data["X"], data["y"] X_mag = np.abs(X) y_mag = np.abs(y) @@ -30,16 +29,22 @@ class VocalRemoverValidationSet(torch.utils.data.Dataset): def make_pair(mix_dir, inst_dir): - input_exts = ['.wav', '.m4a', '.mp3', '.mp4', '.flac'] + input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"] - X_list = sorted([ - os.path.join(mix_dir, fname) - for fname in os.listdir(mix_dir) - if os.path.splitext(fname)[1] in input_exts]) - y_list = sorted([ - os.path.join(inst_dir, fname) - for fname in os.listdir(inst_dir) - if os.path.splitext(fname)[1] in input_exts]) + X_list = sorted( + [ + os.path.join(mix_dir, fname) + for fname in os.listdir(mix_dir) + if os.path.splitext(fname)[1] in input_exts + ] + ) + y_list = sorted( + [ + os.path.join(inst_dir, fname) + for fname in os.listdir(inst_dir) + if os.path.splitext(fname)[1] in input_exts + ] + ) filelist = list(zip(X_list, y_list)) @@ -47,10 +52,11 @@ def make_pair(mix_dir, inst_dir): def train_val_split(dataset_dir, split_mode, val_rate, val_filelist): - if split_mode == 'random': + if split_mode == "random": filelist = make_pair( - os.path.join(dataset_dir, 'mixtures'), - os.path.join(dataset_dir, 'instruments')) + os.path.join(dataset_dir, "mixtures"), + os.path.join(dataset_dir, "instruments"), + ) random.shuffle(filelist) @@ -60,19 +66,23 @@ def train_val_split(dataset_dir, split_mode, val_rate, val_filelist): val_filelist = filelist[-val_size:] else: train_filelist = [ - pair for pair in filelist - if list(pair) not in val_filelist] - elif split_mode == 'subdirs': + pair for pair in filelist if list(pair) not in val_filelist + ] + elif split_mode == "subdirs": if len(val_filelist) != 0: - raise ValueError('The `val_filelist` option is not available in `subdirs` mode') + raise ValueError( + "The `val_filelist` option is not available in `subdirs` mode" + ) train_filelist = make_pair( - os.path.join(dataset_dir, 'training/mixtures'), - os.path.join(dataset_dir, 'training/instruments')) + os.path.join(dataset_dir, "training/mixtures"), + os.path.join(dataset_dir, "training/instruments"), + ) val_filelist = make_pair( - os.path.join(dataset_dir, 'validation/mixtures'), - os.path.join(dataset_dir, 'validation/instruments')) + os.path.join(dataset_dir, "validation/mixtures"), + os.path.join(dataset_dir, "validation/instruments"), + ) return train_filelist, val_filelist @@ -81,7 +91,9 @@ def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha): perm = np.random.permutation(len(X)) for i, idx in enumerate(tqdm(perm)): if np.random.uniform() < reduction_rate: - y[idx] = spec_utils.reduce_vocal_aggressively(X[idx], y[idx], reduction_mask) + 
y[idx] = spec_utils.reduce_vocal_aggressively( + X[idx], y[idx], reduction_mask + ) if np.random.uniform() < 0.5: # swap channel @@ -116,10 +128,8 @@ def make_padding(width, cropsize, offset): def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset): len_dataset = patches * len(filelist) - X_dataset = np.zeros( - (len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64) - y_dataset = np.zeros( - (len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64) + X_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64) + y_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64) for i, (X_path, y_path) in enumerate(tqdm(filelist)): X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft) @@ -127,22 +137,24 @@ def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset X, y = X / coef, y / coef l, r, roi_size = make_padding(X.shape[2], cropsize, offset) - X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant') - y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode='constant') + X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant") + y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant") starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches) ends = starts + cropsize for j in range(patches): idx = i * patches + j - X_dataset[idx] = X_pad[:, :, starts[j]:ends[j]] - y_dataset[idx] = y_pad[:, :, starts[j]:ends[j]] + X_dataset[idx] = X_pad[:, :, starts[j] : ends[j]] + y_dataset[idx] = y_pad[:, :, starts[j] : ends[j]] return X_dataset, y_dataset def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset): patch_list = [] - patch_dir = 'cs{}_sr{}_hl{}_nf{}_of{}'.format(cropsize, sr, hop_length, n_fft, offset) + patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format( + cropsize, sr, hop_length, n_fft, offset + ) os.makedirs(patch_dir, exist_ok=True) for i, (X_path, y_path) in enumerate(tqdm(filelist)): @@ -153,18 +165,19 @@ def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset): X, y = X / coef, y / coef l, r, roi_size = make_padding(X.shape[2], cropsize, offset) - X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant') - y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode='constant') + X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant") + y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant") len_dataset = int(np.ceil(X.shape[2] / roi_size)) for j in range(len_dataset): - outpath = os.path.join(patch_dir, '{}_p{}.npz'.format(basename, j)) + outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j)) start = j * roi_size if not os.path.exists(outpath): np.savez( outpath, - X=X_pad[:, :, start:start + cropsize], - y=y_pad[:, :, start:start + cropsize]) + X=X_pad[:, :, start : start + cropsize], + y=y_pad[:, :, start : start + cropsize], + ) patch_list.append(outpath) return VocalRemoverValidationSet(patch_list) diff --git a/uvr5_pack/lib_v5/layers.py b/uvr5_pack/lib_v5/layers.py index ca64106..9835dc0 100644 --- a/uvr5_pack/lib_v5/layers.py +++ b/uvr5_pack/lib_v5/layers.py @@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): super(Conv2DBNActiv, self).__init__() self.conv = nn.Sequential( nn.Conv2d( - nin, nout, + nin, + nout, kernel_size=ksize, stride=stride, padding=pad, dilation=dilation, - bias=False), + bias=False, + ), nn.BatchNorm2d(nout), - activ() + activ(), ) def __call__(self, 
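make_training_set above draws a fixed number of random time-axis crops per file after padding. A small sketch of that cropping step, assuming complex spectrograms shaped (channels, bins, frames) and pad amounts computed by make_padding (the name random_crops is illustrative):

import numpy as np

def random_crops(spec, cropsize, patches, left_pad, right_pad):
    # Pad the time axis, then take `patches` random windows of width `cropsize`.
    padded = np.pad(spec, ((0, 0), (0, 0), (left_pad, right_pad)), mode="constant")
    starts = np.random.randint(0, padded.shape[2] - cropsize, patches)
    return np.stack([padded[:, :, s : s + cropsize] for s in starts])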
x): @@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module): class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): super(SeperableConv2DBNActiv, self).__init__() self.conv = nn.Sequential( nn.Conv2d( - nin, nin, + nin, + nin, kernel_size=ksize, stride=stride, padding=pad, dilation=dilation, groups=nin, - bias=False), - nn.Conv2d( - nin, nout, - kernel_size=1, - bias=False), + bias=False, + ), + nn.Conv2d(nin, nout, kernel_size=1, bias=False), nn.BatchNorm2d(nout), - activ() + activ(), ) def __call__(self, x): @@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module): class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): super(Encoder, self).__init__() self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) @@ -65,14 +63,15 @@ class Encoder(nn.Module): class Decoder(nn.Module): - - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): + def __init__( + self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False + ): super(Decoder, self).__init__() self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.dropout = nn.Dropout2d(0.1) if dropout else None def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True) + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) if skip is not None: skip = spec_utils.crop_center(skip, x) x = torch.cat([x, skip], dim=1) @@ -85,28 +84,31 @@ class Decoder(nn.Module): class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): super(ASPPModule, self).__init__() self.conv1 = nn.Sequential( nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) + Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) + nin, nin, 3, 1, dilations[0], dilations[0], activ=activ + ) self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) + nin, nin, 3, 1, dilations[1], dilations[1], activ=activ + ) self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), - nn.Dropout2d(0.1) + Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) ) def forward(self, x): _, _, h, w = x.size() - feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True) + feat1 = F.interpolate( + self.conv1(x), size=(h, w), mode="bilinear", align_corners=True + ) feat2 = self.conv2(x) feat3 = self.conv3(x) feat4 = self.conv4(x) diff --git a/uvr5_pack/lib_v5/layers_123812KB .py b/uvr5_pack/lib_v5/layers_123812KB .py index ca64106..9835dc0 100644 --- a/uvr5_pack/lib_v5/layers_123812KB .py +++ b/uvr5_pack/lib_v5/layers_123812KB .py @@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): super(Conv2DBNActiv, self).__init__() self.conv = nn.Sequential( nn.Conv2d( - nin, nout, + nin, + nout, kernel_size=ksize, stride=stride, padding=pad, dilation=dilation, - bias=False), + bias=False, + ), nn.BatchNorm2d(nout), - activ() + activ(), ) def __call__(self, x): @@ -26,24 +27,22 @@ 
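SeperableConv2DBNActiv factors a k x k convolution into a depthwise pass (groups=nin) plus a 1x1 pointwise pass, which cuts the parameter count from nin*nout*k^2 to nin*k^2 + nin*nout. Stripped of the BatchNorm and activation, the core is:

import torch
import torch.nn as nn

class SeparableConv2d(nn.Module):
    # Depthwise conv (groups == in_channels) followed by a 1x1 pointwise conv.
    def __init__(self, nin, nout, ksize=3, pad=1, dilation=1):
        super().__init__()
        self.depthwise = nn.Conv2d(
            nin, nin, ksize, padding=pad, dilation=dilation, groups=nin, bias=False
        )
        self.pointwise = nn.Conv2d(nin, nout, kernel_size=1, bias=False)

    def forward(self, x):
        return self.pointwise(self.depthwise(x))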
class Conv2DBNActiv(nn.Module): class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): super(SeperableConv2DBNActiv, self).__init__() self.conv = nn.Sequential( nn.Conv2d( - nin, nin, + nin, + nin, kernel_size=ksize, stride=stride, padding=pad, dilation=dilation, groups=nin, - bias=False), - nn.Conv2d( - nin, nout, - kernel_size=1, - bias=False), + bias=False, + ), + nn.Conv2d(nin, nout, kernel_size=1, bias=False), nn.BatchNorm2d(nout), - activ() + activ(), ) def __call__(self, x): @@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module): class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): super(Encoder, self).__init__() self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) @@ -65,14 +63,15 @@ class Encoder(nn.Module): class Decoder(nn.Module): - - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): + def __init__( + self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False + ): super(Decoder, self).__init__() self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.dropout = nn.Dropout2d(0.1) if dropout else None def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True) + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) if skip is not None: skip = spec_utils.crop_center(skip, x) x = torch.cat([x, skip], dim=1) @@ -85,28 +84,31 @@ class Decoder(nn.Module): class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): super(ASPPModule, self).__init__() self.conv1 = nn.Sequential( nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) + Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) + nin, nin, 3, 1, dilations[0], dilations[0], activ=activ + ) self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) + nin, nin, 3, 1, dilations[1], dilations[1], activ=activ + ) self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), - nn.Dropout2d(0.1) + Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) ) def forward(self, x): _, _, h, w = x.size() - feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True) + feat1 = F.interpolate( + self.conv1(x), size=(h, w), mode="bilinear", align_corners=True + ) feat2 = self.conv2(x) feat3 = self.conv3(x) feat4 = self.conv4(x) diff --git a/uvr5_pack/lib_v5/layers_123821KB.py b/uvr5_pack/lib_v5/layers_123821KB.py index ca64106..9835dc0 100644 --- a/uvr5_pack/lib_v5/layers_123821KB.py +++ b/uvr5_pack/lib_v5/layers_123821KB.py @@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): super(Conv2DBNActiv, self).__init__() self.conv = nn.Sequential( nn.Conv2d( - nin, nout, + nin, + nout, kernel_size=ksize, stride=stride, padding=pad, dilation=dilation, - bias=False), + bias=False, + ), nn.BatchNorm2d(nout), - activ() + activ(), ) def __call__(self, x): @@ -26,24 +27,22 @@ class 
Conv2DBNActiv(nn.Module): class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): super(SeperableConv2DBNActiv, self).__init__() self.conv = nn.Sequential( nn.Conv2d( - nin, nin, + nin, + nin, kernel_size=ksize, stride=stride, padding=pad, dilation=dilation, groups=nin, - bias=False), - nn.Conv2d( - nin, nout, - kernel_size=1, - bias=False), + bias=False, + ), + nn.Conv2d(nin, nout, kernel_size=1, bias=False), nn.BatchNorm2d(nout), - activ() + activ(), ) def __call__(self, x): @@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module): class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): super(Encoder, self).__init__() self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) @@ -65,14 +63,15 @@ class Encoder(nn.Module): class Decoder(nn.Module): - - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): + def __init__( + self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False + ): super(Decoder, self).__init__() self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.dropout = nn.Dropout2d(0.1) if dropout else None def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True) + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) if skip is not None: skip = spec_utils.crop_center(skip, x) x = torch.cat([x, skip], dim=1) @@ -85,28 +84,31 @@ class Decoder(nn.Module): class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): super(ASPPModule, self).__init__() self.conv1 = nn.Sequential( nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) + Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) + nin, nin, 3, 1, dilations[0], dilations[0], activ=activ + ) self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) + nin, nin, 3, 1, dilations[1], dilations[1], activ=activ + ) self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), - nn.Dropout2d(0.1) + Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) ) def forward(self, x): _, _, h, w = x.size() - feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True) + feat1 = F.interpolate( + self.conv1(x), size=(h, w), mode="bilinear", align_corners=True + ) feat2 = self.conv2(x) feat3 = self.conv3(x) feat4 = self.conv4(x) diff --git a/uvr5_pack/lib_v5/layers_33966KB.py b/uvr5_pack/lib_v5/layers_33966KB.py index 0262e00..78e5392 100644 --- a/uvr5_pack/lib_v5/layers_33966KB.py +++ b/uvr5_pack/lib_v5/layers_33966KB.py @@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): super(Conv2DBNActiv, self).__init__() self.conv = nn.Sequential( nn.Conv2d( - nin, nout, + nin, + nout, kernel_size=ksize, stride=stride, padding=pad, dilation=dilation, - bias=False), + bias=False, + ), nn.BatchNorm2d(nout), - activ() + activ(), ) def __call__(self, x): @@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module): class 
SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): super(SeperableConv2DBNActiv, self).__init__() self.conv = nn.Sequential( nn.Conv2d( - nin, nin, + nin, + nin, kernel_size=ksize, stride=stride, padding=pad, dilation=dilation, groups=nin, - bias=False), - nn.Conv2d( - nin, nout, - kernel_size=1, - bias=False), + bias=False, + ), + nn.Conv2d(nin, nout, kernel_size=1, bias=False), nn.BatchNorm2d(nout), - activ() + activ(), ) def __call__(self, x): @@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module): class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): super(Encoder, self).__init__() self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) @@ -65,14 +63,15 @@ class Encoder(nn.Module): class Decoder(nn.Module): - - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): + def __init__( + self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False + ): super(Decoder, self).__init__() self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.dropout = nn.Dropout2d(0.1) if dropout else None def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True) + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) if skip is not None: skip = spec_utils.crop_center(skip, x) x = torch.cat([x, skip], dim=1) @@ -85,32 +84,37 @@ class Decoder(nn.Module): class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): super(ASPPModule, self).__init__() self.conv1 = nn.Sequential( nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) + Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) + nin, nin, 3, 1, dilations[0], dilations[0], activ=activ + ) self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) + nin, nin, 3, 1, dilations[1], dilations[1], activ=activ + ) self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) self.conv6 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) self.conv7 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), - nn.Dropout2d(0.1) + Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) ) def forward(self, x): _, _, h, w = x.size() - feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True) + feat1 = F.interpolate( + self.conv1(x), size=(h, w), mode="bilinear", align_corners=True + ) feat2 = self.conv2(x) feat3 = self.conv3(x) feat4 = self.conv4(x) diff --git a/uvr5_pack/lib_v5/layers_537227KB.py b/uvr5_pack/lib_v5/layers_537227KB.py index 0262e00..78e5392 100644 --- a/uvr5_pack/lib_v5/layers_537227KB.py +++ b/uvr5_pack/lib_v5/layers_537227KB.py @@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): super(Conv2DBNActiv, 
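Each dilated ASPP branch passes the same value as both pad and dilation; for a 3x3 kernel at stride 1 that choice keeps the spatial size unchanged (out = in + 2*pad - dilation*(k - 1)), so the branch outputs can be concatenated on the channel axis before the bottleneck. A quick check of that identity:

import torch
import torch.nn as nn

x = torch.randn(1, 8, 32, 32)
for d in (4, 8, 16):
    # pad == dilation keeps H x W fixed for a 3x3 kernel at stride 1
    y = nn.Conv2d(8, 8, kernel_size=3, padding=d, dilation=d)(x)
    assert y.shape == x.shape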
self).__init__() self.conv = nn.Sequential( nn.Conv2d( - nin, nout, + nin, + nout, kernel_size=ksize, stride=stride, padding=pad, dilation=dilation, - bias=False), + bias=False, + ), nn.BatchNorm2d(nout), - activ() + activ(), ) def __call__(self, x): @@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module): class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): super(SeperableConv2DBNActiv, self).__init__() self.conv = nn.Sequential( nn.Conv2d( - nin, nin, + nin, + nin, kernel_size=ksize, stride=stride, padding=pad, dilation=dilation, groups=nin, - bias=False), - nn.Conv2d( - nin, nout, - kernel_size=1, - bias=False), + bias=False, + ), + nn.Conv2d(nin, nout, kernel_size=1, bias=False), nn.BatchNorm2d(nout), - activ() + activ(), ) def __call__(self, x): @@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module): class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): super(Encoder, self).__init__() self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) @@ -65,14 +63,15 @@ class Encoder(nn.Module): class Decoder(nn.Module): - - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): + def __init__( + self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False + ): super(Decoder, self).__init__() self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.dropout = nn.Dropout2d(0.1) if dropout else None def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True) + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) if skip is not None: skip = spec_utils.crop_center(skip, x) x = torch.cat([x, skip], dim=1) @@ -85,32 +84,37 @@ class Decoder(nn.Module): class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): super(ASPPModule, self).__init__() self.conv1 = nn.Sequential( nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) + Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) + nin, nin, 3, 1, dilations[0], dilations[0], activ=activ + ) self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) + nin, nin, 3, 1, dilations[1], dilations[1], activ=activ + ) self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) self.conv6 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) self.conv7 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), - nn.Dropout2d(0.1) + Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) ) def forward(self, x): _, _, h, w = x.size() - feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True) + feat1 = F.interpolate( + self.conv1(x), size=(h, w), mode="bilinear", align_corners=True + ) feat2 = self.conv2(x) feat3 = self.conv3(x) feat4 = self.conv4(x) diff --git a/uvr5_pack/lib_v5/layers_537238KB.py b/uvr5_pack/lib_v5/layers_537238KB.py index 
0262e00..78e5392 100644 --- a/uvr5_pack/lib_v5/layers_537238KB.py +++ b/uvr5_pack/lib_v5/layers_537238KB.py @@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): super(Conv2DBNActiv, self).__init__() self.conv = nn.Sequential( nn.Conv2d( - nin, nout, + nin, + nout, kernel_size=ksize, stride=stride, padding=pad, dilation=dilation, - bias=False), + bias=False, + ), nn.BatchNorm2d(nout), - activ() + activ(), ) def __call__(self, x): @@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module): class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): super(SeperableConv2DBNActiv, self).__init__() self.conv = nn.Sequential( nn.Conv2d( - nin, nin, + nin, + nin, kernel_size=ksize, stride=stride, padding=pad, dilation=dilation, groups=nin, - bias=False), - nn.Conv2d( - nin, nout, - kernel_size=1, - bias=False), + bias=False, + ), + nn.Conv2d(nin, nout, kernel_size=1, bias=False), nn.BatchNorm2d(nout), - activ() + activ(), ) def __call__(self, x): @@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module): class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): super(Encoder, self).__init__() self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) @@ -65,14 +63,15 @@ class Encoder(nn.Module): class Decoder(nn.Module): - - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): + def __init__( + self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False + ): super(Decoder, self).__init__() self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.dropout = nn.Dropout2d(0.1) if dropout else None def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True) + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) if skip is not None: skip = spec_utils.crop_center(skip, x) x = torch.cat([x, skip], dim=1) @@ -85,32 +84,37 @@ class Decoder(nn.Module): class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): super(ASPPModule, self).__init__() self.conv1 = nn.Sequential( nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) + Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) + nin, nin, 3, 1, dilations[0], dilations[0], activ=activ + ) self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) + nin, nin, 3, 1, dilations[1], dilations[1], activ=activ + ) self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) self.conv6 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) self.conv7 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), - nn.Dropout2d(0.1) + Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) ) def forward(self, x): _, _, h, w = x.size() - feat1 = F.interpolate(self.conv1(x), size=(h, 
w), mode='bilinear', align_corners=True) + feat1 = F.interpolate( + self.conv1(x), size=(h, w), mode="bilinear", align_corners=True + ) feat2 = self.conv2(x) feat3 = self.conv3(x) feat4 = self.conv4(x) diff --git a/uvr5_pack/lib_v5/model_param_init.py b/uvr5_pack/lib_v5/model_param_init.py index 514294d..b995c0b 100644 --- a/uvr5_pack/lib_v5/model_param_init.py +++ b/uvr5_pack/lib_v5/model_param_init.py @@ -3,33 +3,33 @@ import os import pathlib default_param = {} -default_param['bins'] = 768 -default_param['unstable_bins'] = 9 # training only -default_param['reduction_bins'] = 762 # training only -default_param['sr'] = 44100 -default_param['pre_filter_start'] = 757 -default_param['pre_filter_stop'] = 768 -default_param['band'] = {} +default_param["bins"] = 768 +default_param["unstable_bins"] = 9 # training only +default_param["reduction_bins"] = 762 # training only +default_param["sr"] = 44100 +default_param["pre_filter_start"] = 757 +default_param["pre_filter_stop"] = 768 +default_param["band"] = {} -default_param['band'][1] = { - 'sr': 11025, - 'hl': 128, - 'n_fft': 960, - 'crop_start': 0, - 'crop_stop': 245, - 'lpf_start': 61, # inference only - 'res_type': 'polyphase' +default_param["band"][1] = { + "sr": 11025, + "hl": 128, + "n_fft": 960, + "crop_start": 0, + "crop_stop": 245, + "lpf_start": 61, # inference only + "res_type": "polyphase", } -default_param['band'][2] = { - 'sr': 44100, - 'hl': 512, - 'n_fft': 1536, - 'crop_start': 24, - 'crop_stop': 547, - 'hpf_start': 81, # inference only - 'res_type': 'sinc_best' +default_param["band"][2] = { + "sr": 44100, + "hl": 512, + "n_fft": 1536, + "crop_start": 24, + "crop_stop": 547, + "hpf_start": 81, # inference only + "res_type": "sinc_best", } @@ -40,21 +40,30 @@ def int_keys(d): k = int(k) r[k] = v return r - + class ModelParameters(object): - def __init__(self, config_path=''): - if '.pth' == pathlib.Path(config_path).suffix: + def __init__(self, config_path=""): + if ".pth" == pathlib.Path(config_path).suffix: import zipfile - - with zipfile.ZipFile(config_path, 'r') as zip: - self.param = json.loads(zip.read('param.json'), object_pairs_hook=int_keys) - elif '.json' == pathlib.Path(config_path).suffix: - with open(config_path, 'r') as f: + + with zipfile.ZipFile(config_path, "r") as zip: + self.param = json.loads( + zip.read("param.json"), object_pairs_hook=int_keys + ) + elif ".json" == pathlib.Path(config_path).suffix: + with open(config_path, "r") as f: self.param = json.loads(f.read(), object_pairs_hook=int_keys) else: self.param = default_param - - for k in ['mid_side', 'mid_side_b', 'mid_side_b2', 'stereo_w', 'stereo_n', 'reverse']: + + for k in [ + "mid_side", + "mid_side_b", + "mid_side_b2", + "stereo_w", + "stereo_n", + "reverse", + ]: if not k in self.param: - self.param[k] = False \ No newline at end of file + self.param[k] = False diff --git a/uvr5_pack/lib_v5/nets.py b/uvr5_pack/lib_v5/nets.py index 70de59a..d4c376e 100644 --- a/uvr5_pack/lib_v5/nets.py +++ b/uvr5_pack/lib_v5/nets.py @@ -7,7 +7,6 @@ from uvr5_pack.lib_v5 import spec_utils class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): super(BaseASPPNet, self).__init__() self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) @@ -39,7 +38,6 @@ class BaseASPPNet(nn.Module): class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): super(CascadedASPPNet, self).__init__() self.stg1_low_band_net = BaseASPPNet(2, 16) @@ -64,13 +62,16 @@ class CascadedASPPNet(nn.Module): mix = x.detach() x = x.clone() - x = x[:, :, :self.max_bin] + x = x[:, :, : 
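int_keys exists because JSON object keys are always strings, while the band tables in model_param_init are indexed by integers (mp.param["band"][1], [2], ...). Passed as object_pairs_hook, it runs on every nested object. A compact equivalent:

import json

def int_keys(pairs):
    # Convert numeric string keys back to ints at every nesting level.
    return {int(k) if k.isdigit() else k: v for k, v in pairs}

params = json.loads('{"band": {"1": {"sr": 11025}}}', object_pairs_hook=int_keys)
assert 1 in params["band"]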
self.max_bin] bandw = x.size()[2] // 2 - aux1 = torch.cat([ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]) - ], dim=2) + aux1 = torch.cat( + [ + self.stg1_low_band_net(x[:, :, :bandw]), + self.stg1_high_band_net(x[:, :, bandw:]), + ], + dim=2, + ) h = torch.cat([x, aux1], dim=1) aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) @@ -82,24 +83,33 @@ class CascadedASPPNet(nn.Module): mask = F.pad( input=mask, pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode='replicate') - + mode="replicate", + ) + if self.training: aux1 = torch.sigmoid(self.aux1_out(aux1)) aux1 = F.pad( input=aux1, pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode='replicate') + mode="replicate", + ) aux2 = torch.sigmoid(self.aux2_out(aux2)) aux2 = F.pad( input=aux2, pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode='replicate') + mode="replicate", + ) return mask * mix, aux1 * mix, aux2 * mix - else: + else: if aggressiveness: - mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3) - mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value']) + mask[:, :, : aggressiveness["split_bin"]] = torch.pow( + mask[:, :, : aggressiveness["split_bin"]], + 1 + aggressiveness["value"] / 3, + ) + mask[:, :, aggressiveness["split_bin"] :] = torch.pow( + mask[:, :, aggressiveness["split_bin"] :], + 1 + aggressiveness["value"], + ) return mask * mix @@ -107,7 +117,7 @@ class CascadedASPPNet(nn.Module): h = self.forward(x_mag, aggressiveness) if self.offset > 0: - h = h[:, :, :, self.offset:-self.offset] + h = h[:, :, :, self.offset : -self.offset] assert h.size()[3] > 0 return h diff --git a/uvr5_pack/lib_v5/nets_123812KB.py b/uvr5_pack/lib_v5/nets_123812KB.py index 957c8e1..ea6c45c 100644 --- a/uvr5_pack/lib_v5/nets_123812KB.py +++ b/uvr5_pack/lib_v5/nets_123812KB.py @@ -6,7 +6,6 @@ from uvr5_pack.lib_v5 import layers_123821KB as layers class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): super(BaseASPPNet, self).__init__() self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) @@ -38,7 +37,6 @@ class BaseASPPNet(nn.Module): class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): super(CascadedASPPNet, self).__init__() self.stg1_low_band_net = BaseASPPNet(2, 32) @@ -63,13 +61,16 @@ class CascadedASPPNet(nn.Module): mix = x.detach() x = x.clone() - x = x[:, :, :self.max_bin] + x = x[:, :, : self.max_bin] bandw = x.size()[2] // 2 - aux1 = torch.cat([ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]) - ], dim=2) + aux1 = torch.cat( + [ + self.stg1_low_band_net(x[:, :, :bandw]), + self.stg1_high_band_net(x[:, :, bandw:]), + ], + dim=2, + ) h = torch.cat([x, aux1], dim=1) aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) @@ -81,24 +82,33 @@ class CascadedASPPNet(nn.Module): mask = F.pad( input=mask, pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode='replicate') - + mode="replicate", + ) + if self.training: aux1 = torch.sigmoid(self.aux1_out(aux1)) aux1 = F.pad( input=aux1, pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode='replicate') + mode="replicate", + ) aux2 = torch.sigmoid(self.aux2_out(aux2)) aux2 = F.pad( input=aux2, pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode='replicate') + mode="replicate", + ) return mask * mix, aux1 * mix, aux2 * mix else: if aggressiveness: - mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, 
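The inference-time aggressiveness branch sharpens the soft mask by exponentiation: values in [0, 1] raised to a power above 1 move toward 0, suppressing low-confidence bins, with a milder exponent below split_bin than above it. As a standalone sketch (the name sharpen_mask is illustrative):

import numpy as np

def sharpen_mask(mask, split_bin, value):
    # Milder sharpening below split_bin (1 + value/3), stronger above (1 + value).
    out = mask.copy()
    out[:, :, :split_bin] = out[:, :, :split_bin] ** (1 + value / 3)
    out[:, :, split_bin:] = out[:, :, split_bin:] ** (1 + value)
    return out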
:aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3) - mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value']) + mask[:, :, : aggressiveness["split_bin"]] = torch.pow( + mask[:, :, : aggressiveness["split_bin"]], + 1 + aggressiveness["value"] / 3, + ) + mask[:, :, aggressiveness["split_bin"] :] = torch.pow( + mask[:, :, aggressiveness["split_bin"] :], + 1 + aggressiveness["value"], + ) return mask * mix @@ -106,7 +116,7 @@ class CascadedASPPNet(nn.Module): h = self.forward(x_mag, aggressiveness) if self.offset > 0: - h = h[:, :, :, self.offset:-self.offset] + h = h[:, :, :, self.offset : -self.offset] assert h.size()[3] > 0 return h diff --git a/uvr5_pack/lib_v5/nets_123821KB.py b/uvr5_pack/lib_v5/nets_123821KB.py index 957c8e1..ea6c45c 100644 --- a/uvr5_pack/lib_v5/nets_123821KB.py +++ b/uvr5_pack/lib_v5/nets_123821KB.py @@ -6,7 +6,6 @@ from uvr5_pack.lib_v5 import layers_123821KB as layers class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): super(BaseASPPNet, self).__init__() self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) @@ -38,7 +37,6 @@ class BaseASPPNet(nn.Module): class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): super(CascadedASPPNet, self).__init__() self.stg1_low_band_net = BaseASPPNet(2, 32) @@ -63,13 +61,16 @@ class CascadedASPPNet(nn.Module): mix = x.detach() x = x.clone() - x = x[:, :, :self.max_bin] + x = x[:, :, : self.max_bin] bandw = x.size()[2] // 2 - aux1 = torch.cat([ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]) - ], dim=2) + aux1 = torch.cat( + [ + self.stg1_low_band_net(x[:, :, :bandw]), + self.stg1_high_band_net(x[:, :, bandw:]), + ], + dim=2, + ) h = torch.cat([x, aux1], dim=1) aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) @@ -81,24 +82,33 @@ class CascadedASPPNet(nn.Module): mask = F.pad( input=mask, pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode='replicate') - + mode="replicate", + ) + if self.training: aux1 = torch.sigmoid(self.aux1_out(aux1)) aux1 = F.pad( input=aux1, pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode='replicate') + mode="replicate", + ) aux2 = torch.sigmoid(self.aux2_out(aux2)) aux2 = F.pad( input=aux2, pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode='replicate') + mode="replicate", + ) return mask * mix, aux1 * mix, aux2 * mix else: if aggressiveness: - mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3) - mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value']) + mask[:, :, : aggressiveness["split_bin"]] = torch.pow( + mask[:, :, : aggressiveness["split_bin"]], + 1 + aggressiveness["value"] / 3, + ) + mask[:, :, aggressiveness["split_bin"] :] = torch.pow( + mask[:, :, aggressiveness["split_bin"] :], + 1 + aggressiveness["value"], + ) return mask * mix @@ -106,7 +116,7 @@ class CascadedASPPNet(nn.Module): h = self.forward(x_mag, aggressiveness) if self.offset > 0: - h = h[:, :, :, self.offset:-self.offset] + h = h[:, :, :, self.offset : -self.offset] assert h.size()[3] > 0 return h diff --git a/uvr5_pack/lib_v5/nets_33966KB.py b/uvr5_pack/lib_v5/nets_33966KB.py index 7cc8262..d2bddb1 100644 --- a/uvr5_pack/lib_v5/nets_33966KB.py +++ b/uvr5_pack/lib_v5/nets_33966KB.py @@ -6,7 +6,6 @@ from uvr5_pack.lib_v5 import layers_33966KB as layers class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16, 
32)): super(BaseASPPNet, self).__init__() self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) @@ -38,7 +37,6 @@ class BaseASPPNet(nn.Module): class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): super(CascadedASPPNet, self).__init__() self.stg1_low_band_net = BaseASPPNet(2, 16) @@ -63,13 +61,16 @@ class CascadedASPPNet(nn.Module): mix = x.detach() x = x.clone() - x = x[:, :, :self.max_bin] + x = x[:, :, : self.max_bin] bandw = x.size()[2] // 2 - aux1 = torch.cat([ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]) - ], dim=2) + aux1 = torch.cat( + [ + self.stg1_low_band_net(x[:, :, :bandw]), + self.stg1_high_band_net(x[:, :, bandw:]), + ], + dim=2, + ) h = torch.cat([x, aux1], dim=1) aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) @@ -81,24 +82,33 @@ class CascadedASPPNet(nn.Module): mask = F.pad( input=mask, pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode='replicate') + mode="replicate", + ) if self.training: aux1 = torch.sigmoid(self.aux1_out(aux1)) aux1 = F.pad( input=aux1, pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode='replicate') + mode="replicate", + ) aux2 = torch.sigmoid(self.aux2_out(aux2)) aux2 = F.pad( input=aux2, pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode='replicate') + mode="replicate", + ) return mask * mix, aux1 * mix, aux2 * mix else: if aggressiveness: - mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3) - mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value']) + mask[:, :, : aggressiveness["split_bin"]] = torch.pow( + mask[:, :, : aggressiveness["split_bin"]], + 1 + aggressiveness["value"] / 3, + ) + mask[:, :, aggressiveness["split_bin"] :] = torch.pow( + mask[:, :, aggressiveness["split_bin"] :], + 1 + aggressiveness["value"], + ) return mask * mix @@ -106,7 +116,7 @@ class CascadedASPPNet(nn.Module): h = self.forward(x_mag, aggressiveness) if self.offset > 0: - h = h[:, :, :, self.offset:-self.offset] + h = h[:, :, :, self.offset : -self.offset] assert h.size()[3] > 0 return h diff --git a/uvr5_pack/lib_v5/nets_537227KB.py b/uvr5_pack/lib_v5/nets_537227KB.py index 3d8006b..1ceac4a 100644 --- a/uvr5_pack/lib_v5/nets_537227KB.py +++ b/uvr5_pack/lib_v5/nets_537227KB.py @@ -7,7 +7,6 @@ from uvr5_pack.lib_v5 import layers_537238KB as layers class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): super(BaseASPPNet, self).__init__() self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) @@ -39,7 +38,6 @@ class BaseASPPNet(nn.Module): class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): super(CascadedASPPNet, self).__init__() self.stg1_low_band_net = BaseASPPNet(2, 64) @@ -64,13 +62,16 @@ class CascadedASPPNet(nn.Module): mix = x.detach() x = x.clone() - x = x[:, :, :self.max_bin] + x = x[:, :, : self.max_bin] bandw = x.size()[2] // 2 - aux1 = torch.cat([ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]) - ], dim=2) + aux1 = torch.cat( + [ + self.stg1_low_band_net(x[:, :, :bandw]), + self.stg1_high_band_net(x[:, :, bandw:]), + ], + dim=2, + ) h = torch.cat([x, aux1], dim=1) aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) @@ -82,24 +83,33 @@ class CascadedASPPNet(nn.Module): mask = F.pad( input=mask, pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode='replicate') - + mode="replicate", + ) + if self.training: aux1 = torch.sigmoid(self.aux1_out(aux1)) aux1 = F.pad( input=aux1, pad=(0, 0, 0, 
self.output_bin - aux1.size()[2]), - mode='replicate') + mode="replicate", + ) aux2 = torch.sigmoid(self.aux2_out(aux2)) aux2 = F.pad( input=aux2, pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode='replicate') + mode="replicate", + ) return mask * mix, aux1 * mix, aux2 * mix else: if aggressiveness: - mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3) - mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value']) + mask[:, :, : aggressiveness["split_bin"]] = torch.pow( + mask[:, :, : aggressiveness["split_bin"]], + 1 + aggressiveness["value"] / 3, + ) + mask[:, :, aggressiveness["split_bin"] :] = torch.pow( + mask[:, :, aggressiveness["split_bin"] :], + 1 + aggressiveness["value"], + ) return mask * mix @@ -107,7 +117,7 @@ class CascadedASPPNet(nn.Module): h = self.forward(x_mag, aggressiveness) if self.offset > 0: - h = h[:, :, :, self.offset:-self.offset] + h = h[:, :, :, self.offset : -self.offset] assert h.size()[3] > 0 return h diff --git a/uvr5_pack/lib_v5/nets_537238KB.py b/uvr5_pack/lib_v5/nets_537238KB.py index 3d8006b..1ceac4a 100644 --- a/uvr5_pack/lib_v5/nets_537238KB.py +++ b/uvr5_pack/lib_v5/nets_537238KB.py @@ -7,7 +7,6 @@ from uvr5_pack.lib_v5 import layers_537238KB as layers class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): super(BaseASPPNet, self).__init__() self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) @@ -39,7 +38,6 @@ class BaseASPPNet(nn.Module): class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): super(CascadedASPPNet, self).__init__() self.stg1_low_band_net = BaseASPPNet(2, 64) @@ -64,13 +62,16 @@ class CascadedASPPNet(nn.Module): mix = x.detach() x = x.clone() - x = x[:, :, :self.max_bin] + x = x[:, :, : self.max_bin] bandw = x.size()[2] // 2 - aux1 = torch.cat([ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]) - ], dim=2) + aux1 = torch.cat( + [ + self.stg1_low_band_net(x[:, :, :bandw]), + self.stg1_high_band_net(x[:, :, bandw:]), + ], + dim=2, + ) h = torch.cat([x, aux1], dim=1) aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) @@ -82,24 +83,33 @@ class CascadedASPPNet(nn.Module): mask = F.pad( input=mask, pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode='replicate') - + mode="replicate", + ) + if self.training: aux1 = torch.sigmoid(self.aux1_out(aux1)) aux1 = F.pad( input=aux1, pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode='replicate') + mode="replicate", + ) aux2 = torch.sigmoid(self.aux2_out(aux2)) aux2 = F.pad( input=aux2, pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode='replicate') + mode="replicate", + ) return mask * mix, aux1 * mix, aux2 * mix else: if aggressiveness: - mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3) - mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value']) + mask[:, :, : aggressiveness["split_bin"]] = torch.pow( + mask[:, :, : aggressiveness["split_bin"]], + 1 + aggressiveness["value"] / 3, + ) + mask[:, :, aggressiveness["split_bin"] :] = torch.pow( + mask[:, :, aggressiveness["split_bin"] :], + 1 + aggressiveness["value"], + ) return mask * mix @@ -107,7 +117,7 @@ class CascadedASPPNet(nn.Module): h = self.forward(x_mag, aggressiveness) if self.offset > 0: - h = h[:, :, :, self.offset:-self.offset] + h = h[:, :, :, self.offset : -self.offset] 
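All of these CascadedASPPNet variants share the same stage-1 layout: the input spectrogram is split in half along the frequency axis (dim 2), each half runs through its own band net, and the results are concatenated back to full height before stage 2. Shape-wise, assuming (batch, channels, bins, frames):

import torch

x = torch.randn(1, 2, 512, 128)
bandw = x.size(2) // 2
low, high = x[:, :, :bandw], x[:, :, bandw:]
# stg1_low_band_net / stg1_high_band_net would transform each half here.
assert torch.cat([low, high], dim=2).shape == x.shape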
assert h.size()[3] > 0 return h diff --git a/uvr5_pack/lib_v5/nets_61968KB.py b/uvr5_pack/lib_v5/nets_61968KB.py index 957c8e1..ea6c45c 100644 --- a/uvr5_pack/lib_v5/nets_61968KB.py +++ b/uvr5_pack/lib_v5/nets_61968KB.py @@ -6,7 +6,6 @@ from uvr5_pack.lib_v5 import layers_123821KB as layers class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): super(BaseASPPNet, self).__init__() self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) @@ -38,7 +37,6 @@ class BaseASPPNet(nn.Module): class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): super(CascadedASPPNet, self).__init__() self.stg1_low_band_net = BaseASPPNet(2, 32) @@ -63,13 +61,16 @@ class CascadedASPPNet(nn.Module): mix = x.detach() x = x.clone() - x = x[:, :, :self.max_bin] + x = x[:, :, : self.max_bin] bandw = x.size()[2] // 2 - aux1 = torch.cat([ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]) - ], dim=2) + aux1 = torch.cat( + [ + self.stg1_low_band_net(x[:, :, :bandw]), + self.stg1_high_band_net(x[:, :, bandw:]), + ], + dim=2, + ) h = torch.cat([x, aux1], dim=1) aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) @@ -81,24 +82,33 @@ class CascadedASPPNet(nn.Module): mask = F.pad( input=mask, pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode='replicate') - + mode="replicate", + ) + if self.training: aux1 = torch.sigmoid(self.aux1_out(aux1)) aux1 = F.pad( input=aux1, pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode='replicate') + mode="replicate", + ) aux2 = torch.sigmoid(self.aux2_out(aux2)) aux2 = F.pad( input=aux2, pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode='replicate') + mode="replicate", + ) return mask * mix, aux1 * mix, aux2 * mix else: if aggressiveness: - mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3) - mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value']) + mask[:, :, : aggressiveness["split_bin"]] = torch.pow( + mask[:, :, : aggressiveness["split_bin"]], + 1 + aggressiveness["value"] / 3, + ) + mask[:, :, aggressiveness["split_bin"] :] = torch.pow( + mask[:, :, aggressiveness["split_bin"] :], + 1 + aggressiveness["value"], + ) return mask * mix @@ -106,7 +116,7 @@ class CascadedASPPNet(nn.Module): h = self.forward(x_mag, aggressiveness) if self.offset > 0: - h = h[:, :, :, self.offset:-self.offset] + h = h[:, :, :, self.offset : -self.offset] assert h.size()[3] > 0 return h diff --git a/uvr5_pack/lib_v5/spec_utils.py b/uvr5_pack/lib_v5/spec_utils.py index fe95916..a3fd46d 100644 --- a/uvr5_pack/lib_v5/spec_utils.py +++ b/uvr5_pack/lib_v5/spec_utils.py @@ -1,8 +1,9 @@ -import os,librosa -import numpy as np -import soundfile as sf +import os, librosa +import numpy as np +import soundfile as sf from tqdm import tqdm -import json,math ,hashlib +import json, math, hashlib + def crop_center(h1, h2): h1_shape = h1.size() @@ -11,7 +12,7 @@ def crop_center(h1, h2): if h1_shape[3] == h2_shape[3]: return h1 elif h1_shape[3] < h2_shape[3]: - raise ValueError('h1_shape[3] must be greater than h2_shape[3]') + raise ValueError("h1_shape[3] must be greater than h2_shape[3]") # s_freq = (h2_shape[2] - h1_shape[2]) // 2 # e_freq = s_freq + h1_shape[2] @@ -22,7 +23,9 @@ def crop_center(h1, h2): return h1 -def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False): +def wave_to_spectrogram( + wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False 
+): if reverse: wave_left = np.flip(np.asfortranarray(wave[0])) wave_right = np.flip(np.asfortranarray(wave[1])) @@ -30,21 +33,23 @@ def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=Fal wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2) wave_right = np.asfortranarray(np.subtract(wave[0], wave[1])) elif mid_side_b2: - wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5)) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5)) + wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5)) + wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5)) else: wave_left = np.asfortranarray(wave[0]) wave_right = np.asfortranarray(wave[1]) spec_left = librosa.stft(wave_left, n_fft, hop_length=hop_length) spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length) - + spec = np.asfortranarray([spec_left, spec_right]) return spec - - -def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False): + + +def wave_to_spectrogram_mt( + wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False +): import threading if reverse: @@ -54,62 +59,75 @@ def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2= wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2) wave_right = np.asfortranarray(np.subtract(wave[0], wave[1])) elif mid_side_b2: - wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5)) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5)) + wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5)) + wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5)) else: wave_left = np.asfortranarray(wave[0]) wave_right = np.asfortranarray(wave[1]) - + def run_thread(**kwargs): global spec_left spec_left = librosa.stft(**kwargs) - thread = threading.Thread(target=run_thread, kwargs={'y': wave_left, 'n_fft': n_fft, 'hop_length': hop_length}) + thread = threading.Thread( + target=run_thread, + kwargs={"y": wave_left, "n_fft": n_fft, "hop_length": hop_length}, + ) thread.start() spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length) - thread.join() - + thread.join() + spec = np.asfortranarray([spec_left, spec_right]) return spec - - + + def combine_spectrograms(specs, mp): - l = min([specs[i].shape[2] for i in specs]) - spec_c = np.zeros(shape=(2, mp.param['bins'] + 1, l), dtype=np.complex64) + l = min([specs[i].shape[2] for i in specs]) + spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), dtype=np.complex64) offset = 0 - bands_n = len(mp.param['band']) - + bands_n = len(mp.param["band"]) + for d in range(1, bands_n + 1): - h = mp.param['band'][d]['crop_stop'] - mp.param['band'][d]['crop_start'] - spec_c[:, offset:offset+h, :l] = specs[d][:, mp.param['band'][d]['crop_start']:mp.param['band'][d]['crop_stop'], :l] + h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"] + spec_c[:, offset : offset + h, :l] = specs[d][ + :, mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l + ] offset += h - - if offset > mp.param['bins']: - raise ValueError('Too much bins') - + + if offset > mp.param["bins"]: + raise ValueError("Too much bins") + # lowpass fiter - if mp.param['pre_filter_start'] > 0: # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']: + if ( + mp.param["pre_filter_start"] > 0 + ): # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']: if bands_n == 1: - spec_c = fft_lp_filter(spec_c, mp.param['pre_filter_start'], 
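The mid_side branch of wave_to_spectrogram encodes the stereo pair as mid = (L + R) / 2 and side = L - R; spectrogram_to_wave inverts it with L = mid + side / 2 and R = mid - side / 2. As a sketch (the helper name to_mid_side is illustrative):

import numpy as np

def to_mid_side(stereo):
    # mid = (L + R) / 2, side = L - R.
    mid = np.asfortranarray((stereo[0] + stereo[1]) / 2)
    side = np.asfortranarray(stereo[0] - stereo[1])
    return mid, side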
mp.param['pre_filter_stop']) + spec_c = fft_lp_filter( + spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"] + ) else: - gp = 1 - for b in range(mp.param['pre_filter_start'] + 1, mp.param['pre_filter_stop']): - g = math.pow(10, -(b - mp.param['pre_filter_start']) * (3.5 - gp) / 20.0) + gp = 1 + for b in range( + mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"] + ): + g = math.pow( + 10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0 + ) gp = g spec_c[:, b, :] *= g - - return np.asfortranarray(spec_c) - -def spectrogram_to_image(spec, mode='magnitude'): - if mode == 'magnitude': + return np.asfortranarray(spec_c) + + +def spectrogram_to_image(spec, mode="magnitude"): + if mode == "magnitude": if np.iscomplexobj(spec): y = np.abs(spec) else: y = spec - y = np.log10(y ** 2 + 1e-8) - elif mode == 'phase': + y = np.log10(y**2 + 1e-8) + elif mode == "phase": if np.iscomplexobj(spec): y = np.angle(spec) else: @@ -121,9 +139,7 @@ def spectrogram_to_image(spec, mode='magnitude'): if y.ndim == 3: img = img.transpose(1, 2, 0) - img = np.concatenate([ - np.max(img, axis=2, keepdims=True), img - ], axis=2) + img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2) return img @@ -136,12 +152,12 @@ def reduce_vocal_aggressively(X, y, softmask): v_mask = v_mag_tmp > y_mag_tmp y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf) - return y_mag * np.exp(1.j * np.angle(y)) + return y_mag * np.exp(1.0j * np.angle(y)) def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32): if min_range < fade_size * 2: - raise ValueError('min_range must be >= fade_area * 2') + raise ValueError("min_range must be >= fade_area * 2") mag = mag.copy() @@ -159,72 +175,106 @@ def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32): if s != 0: weight = np.linspace(0, 1, fade_size) - mag[:, :, s:s + fade_size] += weight * ref[:, :, s:s + fade_size] + mag[:, :, s : s + fade_size] += weight * ref[:, :, s : s + fade_size] else: s -= fade_size if e != mag.shape[2]: weight = np.linspace(1, 0, fade_size) - mag[:, :, e - fade_size:e] += weight * ref[:, :, e - fade_size:e] + mag[:, :, e - fade_size : e] += weight * ref[:, :, e - fade_size : e] else: e += fade_size - mag[:, :, s + fade_size:e - fade_size] += ref[:, :, s + fade_size:e - fade_size] + mag[:, :, s + fade_size : e - fade_size] += ref[ + :, :, s + fade_size : e - fade_size + ] old_e = e return mag - + def align_wave_head_and_tail(a, b): - l = min([a[0].size, b[0].size]) - - return a[:l,:l], b[:l,:l] - + l = min([a[0].size, b[0].size]) + + return a[:l, :l], b[:l, :l] + def cache_or_load(mix_path, inst_path, mp): mix_basename = os.path.splitext(os.path.basename(mix_path))[0] inst_basename = os.path.splitext(os.path.basename(inst_path))[0] - cache_dir = 'mph{}'.format(hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode('utf-8')).hexdigest()) - mix_cache_dir = os.path.join('cache', cache_dir) - inst_cache_dir = os.path.join('cache', cache_dir) + cache_dir = "mph{}".format( + hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest() + ) + mix_cache_dir = os.path.join("cache", cache_dir) + inst_cache_dir = os.path.join("cache", cache_dir) os.makedirs(mix_cache_dir, exist_ok=True) os.makedirs(inst_cache_dir, exist_ok=True) - mix_cache_path = os.path.join(mix_cache_dir, mix_basename + '.npy') - inst_cache_path = os.path.join(inst_cache_dir, inst_basename + '.npy') + mix_cache_path = os.path.join(mix_cache_dir, mix_basename + ".npy") + inst_cache_path = 
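combine_spectrograms packs the per-band spectrograms into one array by cropping each band to its [crop_start, crop_stop) bins and stacking the crops contiguously along the frequency axis, trimming all bands to the shortest frame count. A condensed restatement under those assumptions (the name stack_bands is illustrative):

import numpy as np

def stack_bands(specs, band_params, total_bins):
    frames = min(s.shape[2] for s in specs.values())
    out = np.zeros((2, total_bins + 1, frames), dtype=np.complex64)
    offset = 0
    for d in sorted(band_params):
        bp = band_params[d]
        h = bp["crop_stop"] - bp["crop_start"]
        # Each band contributes crop_stop - crop_start bins, packed contiguously.
        out[:, offset : offset + h, :] = specs[d][
            :, bp["crop_start"] : bp["crop_stop"], :frames
        ]
        offset += h
    return out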
os.path.join(inst_cache_dir, inst_basename + ".npy") if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path): X_spec_m = np.load(mix_cache_path) y_spec_m = np.load(inst_cache_path) else: X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} - - for d in range(len(mp.param['band']), 0, -1): - bp = mp.param['band'][d] - - if d == len(mp.param['band']): # high-end band + + for d in range(len(mp.param["band"]), 0, -1): + bp = mp.param["band"][d] + + if d == len(mp.param["band"]): # high-end band X_wave[d], _ = librosa.load( - mix_path, bp['sr'], False, dtype=np.float32, res_type=bp['res_type']) + mix_path, bp["sr"], False, dtype=np.float32, res_type=bp["res_type"] + ) y_wave[d], _ = librosa.load( - inst_path, bp['sr'], False, dtype=np.float32, res_type=bp['res_type']) - else: # lower bands - X_wave[d] = librosa.resample(X_wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type']) - y_wave[d] = librosa.resample(y_wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type']) - + inst_path, + bp["sr"], + False, + dtype=np.float32, + res_type=bp["res_type"], + ) + else: # lower bands + X_wave[d] = librosa.resample( + X_wave[d + 1], + mp.param["band"][d + 1]["sr"], + bp["sr"], + res_type=bp["res_type"], + ) + y_wave[d] = librosa.resample( + y_wave[d + 1], + mp.param["band"][d + 1]["sr"], + bp["sr"], + res_type=bp["res_type"], + ) + X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d]) - - X_spec_s[d] = wave_to_spectrogram(X_wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']) - y_spec_s[d] = wave_to_spectrogram(y_wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']) - + + X_spec_s[d] = wave_to_spectrogram( + X_wave[d], + bp["hl"], + bp["n_fft"], + mp.param["mid_side"], + mp.param["mid_side_b2"], + mp.param["reverse"], + ) + y_spec_s[d] = wave_to_spectrogram( + y_wave[d], + bp["hl"], + bp["n_fft"], + mp.param["mid_side"], + mp.param["mid_side_b2"], + mp.param["reverse"], + ) + del X_wave, y_wave - + X_spec_m = combine_spectrograms(X_spec_s, mp) y_spec_m = combine_spectrograms(y_spec_s, mp) - + if X_spec_m.shape != y_spec_m.shape: - raise ValueError('The combined spectrograms are different: ' + mix_path) + raise ValueError("The combined spectrograms are different: " + mix_path) _, ext = os.path.splitext(mix_path) @@ -244,72 +294,129 @@ def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse): if reverse: return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) elif mid_side: - return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]) + return np.asfortranarray( + [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)] + ) elif mid_side_b2: - return np.asfortranarray([np.add(wave_right / 1.25, .4 * wave_left), np.subtract(wave_left / 1.25, .4 * wave_right)]) + return np.asfortranarray( + [ + np.add(wave_right / 1.25, 0.4 * wave_left), + np.subtract(wave_left / 1.25, 0.4 * wave_right), + ] + ) else: return np.asfortranarray([wave_left, wave_right]) - - + + def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2): import threading spec_left = np.asfortranarray(spec[0]) spec_right = np.asfortranarray(spec[1]) - + def run_thread(**kwargs): global wave_left wave_left = librosa.istft(**kwargs) - - thread = threading.Thread(target=run_thread, kwargs={'stft_matrix': spec_left, 'hop_length': hop_length}) + + thread = threading.Thread( + 
target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length} + ) thread.start() wave_right = librosa.istft(spec_right, hop_length=hop_length) - thread.join() - + thread.join() + if reverse: return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) elif mid_side: - return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]) + return np.asfortranarray( + [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)] + ) elif mid_side_b2: - return np.asfortranarray([np.add(wave_right / 1.25, .4 * wave_left), np.subtract(wave_left / 1.25, .4 * wave_right)]) + return np.asfortranarray( + [ + np.add(wave_right / 1.25, 0.4 * wave_left), + np.subtract(wave_left / 1.25, 0.4 * wave_right), + ] + ) else: return np.asfortranarray([wave_left, wave_right]) - - + + def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None): wave_band = {} - bands_n = len(mp.param['band']) + bands_n = len(mp.param["band"]) offset = 0 for d in range(1, bands_n + 1): - bp = mp.param['band'][d] - spec_s = np.ndarray(shape=(2, bp['n_fft'] // 2 + 1, spec_m.shape[2]), dtype=complex) - h = bp['crop_stop'] - bp['crop_start'] - spec_s[:, bp['crop_start']:bp['crop_stop'], :] = spec_m[:, offset:offset+h, :] - + bp = mp.param["band"][d] + spec_s = np.ndarray( + shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex + ) + h = bp["crop_stop"] - bp["crop_start"] + spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[ + :, offset : offset + h, : + ] + offset += h - if d == bands_n: # higher - if extra_bins_h: # if --high_end_process bypass - max_bin = bp['n_fft'] // 2 - spec_s[:, max_bin-extra_bins_h:max_bin, :] = extra_bins[:, :extra_bins_h, :] - if bp['hpf_start'] > 0: - spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1) + if d == bands_n: # higher + if extra_bins_h: # if --high_end_process bypass + max_bin = bp["n_fft"] // 2 + spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[ + :, :extra_bins_h, : + ] + if bp["hpf_start"] > 0: + spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) if bands_n == 1: - wave = spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']) + wave = spectrogram_to_wave( + spec_s, + bp["hl"], + mp.param["mid_side"], + mp.param["mid_side_b2"], + mp.param["reverse"], + ) else: - wave = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])) + wave = np.add( + wave, + spectrogram_to_wave( + spec_s, + bp["hl"], + mp.param["mid_side"], + mp.param["mid_side_b2"], + mp.param["reverse"], + ), + ) else: - sr = mp.param['band'][d+1]['sr'] - if d == 1: # lower - spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop']) - wave = librosa.resample(spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']), bp['sr'], sr, res_type="sinc_fastest") - else: # mid - spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1) - spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop']) - wave2 = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])) + sr = mp.param["band"][d + 1]["sr"] + if d == 1: # lower + spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) + wave = librosa.resample( + spectrogram_to_wave( + spec_s, + bp["hl"], + mp.param["mid_side"], + mp.param["mid_side_b2"], + mp.param["reverse"], + ), + bp["sr"], + sr, + 
res_type="sinc_fastest", + ) + else: # mid + spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) + spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) + wave2 = np.add( + wave, + spectrogram_to_wave( + spec_s, + bp["hl"], + mp.param["mid_side"], + mp.param["mid_side_b2"], + mp.param["reverse"], + ), + ) # wave = librosa.core.resample(wave2, bp['sr'], sr, res_type="sinc_fastest") - wave = librosa.core.resample(wave2, bp['sr'], sr,res_type='scipy') - + wave = librosa.core.resample(wave2, bp["sr"], sr, res_type="scipy") + return wave.T @@ -318,7 +425,7 @@ def fft_lp_filter(spec, bin_start, bin_stop): for b in range(bin_start, bin_stop): g -= 1 / (bin_stop - bin_start) spec[:, b, :] = g * spec[:, b, :] - + spec[:, bin_stop:, :] *= 0 return spec @@ -329,42 +436,69 @@ def fft_hp_filter(spec, bin_start, bin_stop): for b in range(bin_start, bin_stop, -1): g -= 1 / (bin_start - bin_stop) spec[:, b, :] = g * spec[:, b, :] - - spec[:, 0:bin_stop+1, :] *= 0 + + spec[:, 0 : bin_stop + 1, :] *= 0 return spec def mirroring(a, spec_m, input_high_end, mp): - if 'mirroring' == a: - mirror = np.flip(np.abs(spec_m[:, mp.param['pre_filter_start']-10-input_high_end.shape[1]:mp.param['pre_filter_start']-10, :]), 1) - mirror = mirror * np.exp(1.j * np.angle(input_high_end)) - - return np.where(np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror) - - if 'mirroring2' == a: - mirror = np.flip(np.abs(spec_m[:, mp.param['pre_filter_start']-10-input_high_end.shape[1]:mp.param['pre_filter_start']-10, :]), 1) + if "mirroring" == a: + mirror = np.flip( + np.abs( + spec_m[ + :, + mp.param["pre_filter_start"] + - 10 + - input_high_end.shape[1] : mp.param["pre_filter_start"] + - 10, + :, + ] + ), + 1, + ) + mirror = mirror * np.exp(1.0j * np.angle(input_high_end)) + + return np.where( + np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror + ) + + if "mirroring2" == a: + mirror = np.flip( + np.abs( + spec_m[ + :, + mp.param["pre_filter_start"] + - 10 + - input_high_end.shape[1] : mp.param["pre_filter_start"] + - 10, + :, + ] + ), + 1, + ) mi = np.multiply(mirror, input_high_end * 1.7) - + return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi) -def ensembling(a, specs): +def ensembling(a, specs): for i in range(1, len(specs)): if i == 1: spec = specs[0] ln = min([spec.shape[2], specs[i].shape[2]]) - spec = spec[:,:,:ln] - specs[i] = specs[i][:,:,:ln] + spec = spec[:, :, :ln] + specs[i] = specs[i][:, :, :ln] - if 'min_mag' == a: + if "min_mag" == a: spec = np.where(np.abs(specs[i]) <= np.abs(spec), specs[i], spec) - if 'max_mag' == a: - spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec) + if "max_mag" == a: + spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec) return spec + def stft(wave, nfft, hl): wave_left = np.asfortranarray(wave[0]) wave_right = np.asfortranarray(wave[1]) @@ -374,6 +508,7 @@ def stft(wave, nfft, hl): return spec + def istft(spec, hl): spec_left = np.asfortranarray(spec[0]) spec_right = np.asfortranarray(spec[1]) @@ -389,62 +524,94 @@ if __name__ == "__main__": import time import argparse from model_param_init import ModelParameters - + p = argparse.ArgumentParser() - p.add_argument('--algorithm', '-a', type=str, choices=['invert', 'invert_p', 'min_mag', 'max_mag', 'deep', 'align'], default='min_mag') - p.add_argument('--model_params', '-m', type=str, default=os.path.join('modelparams', '1band_sr44100_hl512.json')) - p.add_argument('--output_name', '-o', type=str, default='output') - 
p.add_argument('--vocals_only', '-v', action='store_true') - p.add_argument('input', nargs='+') + p.add_argument( + "--algorithm", + "-a", + type=str, + choices=["invert", "invert_p", "min_mag", "max_mag", "deep", "align"], + default="min_mag", + ) + p.add_argument( + "--model_params", + "-m", + type=str, + default=os.path.join("modelparams", "1band_sr44100_hl512.json"), + ) + p.add_argument("--output_name", "-o", type=str, default="output") + p.add_argument("--vocals_only", "-v", action="store_true") + p.add_argument("input", nargs="+") args = p.parse_args() - + start_time = time.time() - - if args.algorithm.startswith('invert') and len(args.input) != 2: - raise ValueError('There should be two input files.') - - if not args.algorithm.startswith('invert') and len(args.input) < 2: - raise ValueError('There must be at least two input files.') - + + if args.algorithm.startswith("invert") and len(args.input) != 2: + raise ValueError("There should be two input files.") + + if not args.algorithm.startswith("invert") and len(args.input) < 2: + raise ValueError("There must be at least two input files.") + wave, specs = {}, {} mp = ModelParameters(args.model_params) - - for i in range(len(args.input)): + + for i in range(len(args.input)): spec = {} - - for d in range(len(mp.param['band']), 0, -1): - bp = mp.param['band'][d] - - if d == len(mp.param['band']): # high-end band + + for d in range(len(mp.param["band"]), 0, -1): + bp = mp.param["band"][d] + + if d == len(mp.param["band"]): # high-end band wave[d], _ = librosa.load( - args.input[i], bp['sr'], False, dtype=np.float32, res_type=bp['res_type']) - - if len(wave[d].shape) == 1: # mono to stereo + args.input[i], + bp["sr"], + False, + dtype=np.float32, + res_type=bp["res_type"], + ) + + if len(wave[d].shape) == 1: # mono to stereo wave[d] = np.array([wave[d], wave[d]]) - else: # lower bands - wave[d] = librosa.resample(wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type']) - - spec[d] = wave_to_spectrogram(wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']) - + else: # lower bands + wave[d] = librosa.resample( + wave[d + 1], + mp.param["band"][d + 1]["sr"], + bp["sr"], + res_type=bp["res_type"], + ) + + spec[d] = wave_to_spectrogram( + wave[d], + bp["hl"], + bp["n_fft"], + mp.param["mid_side"], + mp.param["mid_side_b2"], + mp.param["reverse"], + ) + specs[i] = combine_spectrograms(spec, mp) - + del wave - if args.algorithm == 'deep': + if args.algorithm == "deep": d_spec = np.where(np.abs(specs[0]) <= np.abs(spec[1]), specs[0], spec[1]) v_spec = d_spec - specs[1] - sf.write(os.path.join('{}.wav'.format(args.output_name)), cmb_spectrogram_to_wave(v_spec, mp), mp.param['sr']) - - if args.algorithm.startswith('invert'): + sf.write( + os.path.join("{}.wav".format(args.output_name)), + cmb_spectrogram_to_wave(v_spec, mp), + mp.param["sr"], + ) + + if args.algorithm.startswith("invert"): ln = min([specs[0].shape[2], specs[1].shape[2]]) - specs[0] = specs[0][:,:,:ln] - specs[1] = specs[1][:,:,:ln] - - if 'invert_p' == args.algorithm: + specs[0] = specs[0][:, :, :ln] + specs[1] = specs[1][:, :, :ln] + + if "invert_p" == args.algorithm: X_mag = np.abs(specs[0]) - y_mag = np.abs(specs[1]) - max_mag = np.where(X_mag >= y_mag, X_mag, y_mag) - v_spec = specs[1] - max_mag * np.exp(1.j * np.angle(specs[0])) + y_mag = np.abs(specs[1]) + max_mag = np.where(X_mag >= y_mag, X_mag, y_mag) + v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0])) else: specs[1] = 
reduce_vocal_aggressively(specs[0], specs[1], 0.2) v_spec = specs[0] - specs[1] @@ -458,28 +625,43 @@ if __name__ == "__main__": y_image = spectrogram_to_image(y_mag) v_image = spectrogram_to_image(v_mag) - cv2.imwrite('{}_X.png'.format(args.output_name), X_image) - cv2.imwrite('{}_y.png'.format(args.output_name), y_image) - cv2.imwrite('{}_v.png'.format(args.output_name), v_image) - - sf.write('{}_X.wav'.format(args.output_name), cmb_spectrogram_to_wave(specs[0], mp), mp.param['sr']) - sf.write('{}_y.wav'.format(args.output_name), cmb_spectrogram_to_wave(specs[1], mp), mp.param['sr']) - - sf.write('{}_v.wav'.format(args.output_name), cmb_spectrogram_to_wave(v_spec, mp), mp.param['sr']) - else: - if not args.algorithm == 'deep': - sf.write(os.path.join('ensembled','{}.wav'.format(args.output_name)), cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp), mp.param['sr']) + cv2.imwrite("{}_X.png".format(args.output_name), X_image) + cv2.imwrite("{}_y.png".format(args.output_name), y_image) + cv2.imwrite("{}_v.png".format(args.output_name), v_image) - if args.algorithm == 'align': + sf.write( + "{}_X.wav".format(args.output_name), + cmb_spectrogram_to_wave(specs[0], mp), + mp.param["sr"], + ) + sf.write( + "{}_y.wav".format(args.output_name), + cmb_spectrogram_to_wave(specs[1], mp), + mp.param["sr"], + ) + sf.write( + "{}_v.wav".format(args.output_name), + cmb_spectrogram_to_wave(v_spec, mp), + mp.param["sr"], + ) + else: + if not args.algorithm == "deep": + sf.write( + os.path.join("ensembled", "{}.wav".format(args.output_name)), + cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp), + mp.param["sr"], + ) + + if args.algorithm == "align": trackalignment = [ { - 'file1':'"{}"'.format(args.input[0]), - 'file2':'"{}"'.format(args.input[1]) + "file1": '"{}"'.format(args.input[0]), + "file2": '"{}"'.format(args.input[1]), } ] - for i,e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."): + for i, e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."): os.system(f"python lib/align_tracks.py {e['file1']} {e['file2']}") - #print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1)) + # print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1)) diff --git a/uvr5_pack/name_params.json b/uvr5_pack/name_params.json new file mode 100644 index 0000000..cb66091 --- /dev/null +++ b/uvr5_pack/name_params.json @@ -0,0 +1,263 @@ +{ + "equivalent" : [ + { + "model_hash_name" : [ + { + "hash_name": "47939caf0cfe52a0e81442b85b971dfd", + "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100" + }, + { + "hash_name": "4e4ecb9764c50a8c414fee6e10395bbe", + "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2.json", + "param_name": "4band_v2" + }, + { + "hash_name": "ca106edd563e034bde0bdec4bb7a4b36", + "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2.json", + "param_name": "4band_v2" + }, + { + "hash_name": "e60a1e84803ce4efc0a6551206cc4b71", + "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100" + }, + { + "hash_name": "a82f14e75892e55e994376edbf0c8435", + "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100" + }, + { + "hash_name": "6dd9eaa6f0420af9f1d403aaafa4cc06", + "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", + "param_name": "4band_v2_sn" + }, + { + "hash_name": "08611fb99bd59eaa79ad27c58d137727", + "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", + "param_name": "4band_v2_sn" + }, + 
{ + "hash_name": "5c7bbca45a187e81abbbd351606164e5", + "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", + "param_name": "3band_44100_msb2" + }, + { + "hash_name": "d6b2cb685a058a091e5e7098192d3233", + "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", + "param_name": "3band_44100_msb2" + }, + { + "hash_name": "c1b9f38170a7c90e96f027992eb7c62b", + "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100" + }, + { + "hash_name": "c3448ec923fa0edf3d03a19e633faa53", + "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100" + }, + { + "hash_name": "68aa2c8093d0080704b200d140f59e54", + "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100.json", + "param_name": "3band_44100" + }, + { + "hash_name": "fdc83be5b798e4bd29fe00fe6600e147", + "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", + "param_name": "3band_44100_mid.json" + }, + { + "hash_name": "2ce34bc92fd57f55db16b7a4def3d745", + "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", + "param_name": "3band_44100_mid.json" + }, + { + "hash_name": "52fdca89576f06cf4340b74a4730ee5f", + "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100.json" + }, + { + "hash_name": "41191165b05d38fc77f072fa9e8e8a30", + "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100.json" + }, + { + "hash_name": "89e83b511ad474592689e562d5b1f80e", + "model_params": "uvr5_pack/lib_v5/modelparams/2band_32000.json", + "param_name": "2band_32000.json" + }, + { + "hash_name": "0b954da81d453b716b114d6d7c95177f", + "model_params": "uvr5_pack/lib_v5/modelparams/2band_32000.json", + "param_name": "2band_32000.json" + } + + ], + "v4 Models": [ + { + "hash_name": "6a00461c51c2920fd68937d4609ed6c8", + "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", + "param_name": "1band_sr16000_hl512" + }, + { + "hash_name": "0ab504864d20f1bd378fe9c81ef37140", + "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "param_name": "1band_sr32000_hl512" + }, + { + "hash_name": "7dd21065bf91c10f7fccb57d7d83b07f", + "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "param_name": "1band_sr32000_hl512" + }, + { + "hash_name": "80ab74d65e515caa3622728d2de07d23", + "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "param_name": "1band_sr32000_hl512" + }, + { + "hash_name": "edc115e7fc523245062200c00caa847f", + "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", + "param_name": "1band_sr33075_hl384" + }, + { + "hash_name": "28063e9f6ab5b341c5f6d3c67f2045b7", + "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", + "param_name": "1band_sr33075_hl384" + }, + { + "hash_name": "b58090534c52cbc3e9b5104bad666ef2", + "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", + "param_name": "1band_sr44100_hl512" + }, + { + "hash_name": "0cdab9947f1b0928705f518f3c78ea8f", + "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", + "param_name": "1band_sr44100_hl512" + }, + { + "hash_name": "ae702fed0238afb5346db8356fe25f13", + "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", + "param_name": "1band_sr44100_hl1024" + } + ] + } + ], + "User Models" : [ + { + "1 Band": [ + { + "hash_name": "1band_sr16000_hl512", + "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", + 
"param_name": "1band_sr16000_hl512" + }, + { + "hash_name": "1band_sr32000_hl512", + "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "param_name": "1band_sr16000_hl512" + }, + { + "hash_name": "1band_sr33075_hl384", + "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", + "param_name": "1band_sr33075_hl384" + }, + { + "hash_name": "1band_sr44100_hl256", + "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json", + "param_name": "1band_sr44100_hl256" + }, + { + "hash_name": "1band_sr44100_hl512", + "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", + "param_name": "1band_sr44100_hl512" + }, + { + "hash_name": "1band_sr44100_hl1024", + "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", + "param_name": "1band_sr44100_hl1024" + } + ], + "2 Band": [ + { + "hash_name": "2band_44100_lofi", + "model_params": "uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json", + "param_name": "2band_44100_lofi" + }, + { + "hash_name": "2band_32000", + "model_params": "uvr5_pack/lib_v5/modelparams/2band_32000.json", + "param_name": "2band_32000" + }, + { + "hash_name": "2band_48000", + "model_params": "uvr5_pack/lib_v5/modelparams/2band_48000.json", + "param_name": "2band_48000" + } + ], + "3 Band": [ + { + "hash_name": "3band_44100", + "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100.json", + "param_name": "3band_44100" + }, + { + "hash_name": "3band_44100_mid", + "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", + "param_name": "3band_44100_mid" + }, + { + "hash_name": "3band_44100_msb2", + "model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", + "param_name": "3band_44100_msb2" + } + ], + "4 Band": [ + { + "hash_name": "4band_44100", + "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100" + }, + { + "hash_name": "4band_44100_mid", + "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_mid.json", + "param_name": "4band_44100_mid" + }, + { + "hash_name": "4band_44100_msb", + "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_msb.json", + "param_name": "4band_44100_msb" + }, + { + "hash_name": "4band_44100_msb2", + "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json", + "param_name": "4band_44100_msb2" + }, + { + "hash_name": "4band_44100_reverse", + "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json", + "param_name": "4band_44100_reverse" + }, + { + "hash_name": "4band_44100_sw", + "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_sw.json", + "param_name": "4band_44100_sw" + }, + { + "hash_name": "4band_v2", + "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2.json", + "param_name": "4band_v2" + }, + { + "hash_name": "4band_v2_sn", + "model_params": "uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", + "param_name": "4band_v2_sn" + }, + { + "hash_name": "tmodelparam", + "model_params": "uvr5_pack/lib_v5/modelparams/tmodelparam.json", + "param_name": "User Model Param Set" + } + ] + } + ] +} \ No newline at end of file diff --git a/uvr5_pack/utils.py b/uvr5_pack/utils.py index 6d1e0e9..30bc59e 100644 --- a/uvr5_pack/utils.py +++ b/uvr5_pack/utils.py @@ -1,6 +1,15 @@ import torch import numpy as np from tqdm import tqdm +import json + + +def load_data(file_name: str = "./uvr5_pack/data.json") -> dict: + with open(file_name, "r") as f: + data = json.load(f) + + return data + def make_padding(width, cropsize, offset): left = offset @@ -10,233 +19,102 @@ def 
make_padding(width, cropsize, offset): right = roi_size - (width % roi_size) + left return left, right, roi_size -def inference(X_spec, device, model, aggressiveness,data): - ''' + + +def inference(X_spec, device, model, aggressiveness, data): + """ data : dic configs - ''' - - def _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness,is_half=True): + """ + + def _execute( + X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True + ): model.eval() with torch.no_grad(): preds = [] - + iterations = [n_window] - total_iterations = sum(iterations) - for i in tqdm(range(n_window)): + total_iterations = sum(iterations) + for i in tqdm(range(n_window)): start = i * roi_size - X_mag_window = X_mag_pad[None, :, :, start:start + data['window_size']] + X_mag_window = X_mag_pad[ + None, :, :, start : start + data["window_size"] + ] X_mag_window = torch.from_numpy(X_mag_window) - if(is_half):X_mag_window=X_mag_window.half() - X_mag_window=X_mag_window.to(device) + if is_half: + X_mag_window = X_mag_window.half() + X_mag_window = X_mag_window.to(device) pred = model.predict(X_mag_window, aggressiveness) pred = pred.detach().cpu().numpy() preds.append(pred[0]) - + pred = np.concatenate(preds, axis=2) return pred - + def preprocess(X_spec): X_mag = np.abs(X_spec) X_phase = np.angle(X_spec) return X_mag, X_phase - + X_mag, X_phase = preprocess(X_spec) coef = X_mag.max() X_mag_pre = X_mag / coef n_frame = X_mag_pre.shape[2] - pad_l, pad_r, roi_size = make_padding(n_frame, - data['window_size'], model.offset) + pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset) n_window = int(np.ceil(n_frame / roi_size)) - X_mag_pad = np.pad( - X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant') + X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") - if(list(model.state_dict().values())[0].dtype==torch.float16):is_half=True - else:is_half=False - pred = _execute(X_mag_pad, roi_size, n_window, - device, model, aggressiveness,is_half) + if list(model.state_dict().values())[0].dtype == torch.float16: + is_half = True + else: + is_half = False + pred = _execute( + X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half + ) pred = pred[:, :, :n_frame] - - if data['tta']: + + if data["tta"]: pad_l += roi_size // 2 pad_r += roi_size // 2 n_window += 1 - X_mag_pad = np.pad( - X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant') + X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") - pred_tta = _execute(X_mag_pad, roi_size, n_window, - device, model, aggressiveness,is_half) - pred_tta = pred_tta[:, :, roi_size // 2:] + pred_tta = _execute( + X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half + ) + pred_tta = pred_tta[:, :, roi_size // 2 :] pred_tta = pred_tta[:, :, :n_frame] - return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.j * X_phase) + return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase) else: - return pred * coef, X_mag, np.exp(1.j * X_phase) - + return pred * coef, X_mag, np.exp(1.0j * X_phase) -def _get_name_params(model_path , model_hash): +def _get_name_params(model_path, model_hash): + data = load_data() + flag = False ModelName = model_path - if model_hash == '47939caf0cfe52a0e81442b85b971dfd': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json') - param_name_auto=str('4band_44100') - if model_hash == '4e4ecb9764c50a8c414fee6e10395bbe': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2.json') - 
param_name_auto=str('4band_v2') - if model_hash == 'ca106edd563e034bde0bdec4bb7a4b36': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2.json') - param_name_auto=str('4band_v2') - if model_hash == 'e60a1e84803ce4efc0a6551206cc4b71': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json') - param_name_auto=str('4band_44100') - if model_hash == 'a82f14e75892e55e994376edbf0c8435': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json') - param_name_auto=str('4band_44100') - if model_hash == '6dd9eaa6f0420af9f1d403aaafa4cc06': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2_sn.json') - param_name_auto=str('4band_v2_sn') - if model_hash == '08611fb99bd59eaa79ad27c58d137727': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2_sn.json') - param_name_auto=str('4band_v2_sn') - if model_hash == '5c7bbca45a187e81abbbd351606164e5': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json') - param_name_auto=str('3band_44100_msb2') - if model_hash == 'd6b2cb685a058a091e5e7098192d3233': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json') - param_name_auto=str('3band_44100_msb2') - if model_hash == 'c1b9f38170a7c90e96f027992eb7c62b': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json') - param_name_auto=str('4band_44100') - if model_hash == 'c3448ec923fa0edf3d03a19e633faa53': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json') - param_name_auto=str('4band_44100') - if model_hash == '68aa2c8093d0080704b200d140f59e54': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100.json') - param_name_auto=str('3band_44100.json') - if model_hash == 'fdc83be5b798e4bd29fe00fe6600e147': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_mid.json') - param_name_auto=str('3band_44100_mid.json') - if model_hash == '2ce34bc92fd57f55db16b7a4def3d745': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_mid.json') - param_name_auto=str('3band_44100_mid.json') - if model_hash == '52fdca89576f06cf4340b74a4730ee5f': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json') - param_name_auto=str('4band_44100.json') - if model_hash == '41191165b05d38fc77f072fa9e8e8a30': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json') - param_name_auto=str('4band_44100.json') - if model_hash == '89e83b511ad474592689e562d5b1f80e': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_32000.json') - param_name_auto=str('2band_32000.json') - if model_hash == '0b954da81d453b716b114d6d7c95177f': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_32000.json') - param_name_auto=str('2band_32000.json') + for type in list(data): + for model in list(data[type][0]): + for i in range(len(data[type][0][model])): + if str(data[type][0][model][i]["hash_name"]) == model_hash: + flag = True + elif str(data[type][0][model][i]["hash_name"]) in ModelName: + flag = True - #v4 Models - if model_hash == '6a00461c51c2920fd68937d4609ed6c8': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json') - param_name_auto=str('1band_sr16000_hl512') - if model_hash == '0ab504864d20f1bd378fe9c81ef37140': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json') - param_name_auto=str('1band_sr32000_hl512') - if model_hash == '7dd21065bf91c10f7fccb57d7d83b07f': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json') - param_name_auto=str('1band_sr32000_hl512') - 
if model_hash == '80ab74d65e515caa3622728d2de07d23': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json') - param_name_auto=str('1band_sr32000_hl512') - if model_hash == 'edc115e7fc523245062200c00caa847f': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json') - param_name_auto=str('1band_sr33075_hl384') - if model_hash == '28063e9f6ab5b341c5f6d3c67f2045b7': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json') - param_name_auto=str('1band_sr33075_hl384') - if model_hash == 'b58090534c52cbc3e9b5104bad666ef2': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json') - param_name_auto=str('1band_sr44100_hl512') - if model_hash == '0cdab9947f1b0928705f518f3c78ea8f': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json') - param_name_auto=str('1band_sr44100_hl512') - if model_hash == 'ae702fed0238afb5346db8356fe25f13': - model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json') - param_name_auto=str('1band_sr44100_hl1024') - #User Models - - #1 Band - if '1band_sr16000_hl512' in ModelName: - model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json') - param_name_auto=str('1band_sr16000_hl512') - if '1band_sr32000_hl512' in ModelName: - model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json') - param_name_auto=str('1band_sr32000_hl512') - if '1band_sr33075_hl384' in ModelName: - model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json') - param_name_auto=str('1band_sr33075_hl384') - if '1band_sr44100_hl256' in ModelName: - model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json') - param_name_auto=str('1band_sr44100_hl256') - if '1band_sr44100_hl512' in ModelName: - model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json') - param_name_auto=str('1band_sr44100_hl512') - if '1band_sr44100_hl1024' in ModelName: - model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json') - param_name_auto=str('1band_sr44100_hl1024') - - #2 Band - if '2band_44100_lofi' in ModelName: - model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json') - param_name_auto=str('2band_44100_lofi') - if '2band_32000' in ModelName: - model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_32000.json') - param_name_auto=str('2band_32000') - if '2band_48000' in ModelName: - model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_48000.json') - param_name_auto=str('2band_48000') - - #3 Band - if '3band_44100' in ModelName: - model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100.json') - param_name_auto=str('3band_44100') - if '3band_44100_mid' in ModelName: - model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_mid.json') - param_name_auto=str('3band_44100_mid') - if '3band_44100_msb2' in ModelName: - model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json') - param_name_auto=str('3band_44100_msb2') - - #4 Band - if '4band_44100' in ModelName: - model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json') - param_name_auto=str('4band_44100') - if '4band_44100_mid' in ModelName: - model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_mid.json') - param_name_auto=str('4band_44100_mid') - if '4band_44100_msb' in ModelName: - model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_msb.json') - param_name_auto=str('4band_44100_msb') - if '4band_44100_msb2' in ModelName: - 
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json') - param_name_auto=str('4band_44100_msb2') - if '4band_44100_reverse' in ModelName: - model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json') - param_name_auto=str('4band_44100_reverse') - if '4band_44100_sw' in ModelName: - model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_sw.json') - param_name_auto=str('4band_44100_sw') - if '4band_v2' in ModelName: - model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2.json') - param_name_auto=str('4band_v2') - if '4band_v2_sn' in ModelName: - model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2_sn.json') - param_name_auto=str('4band_v2_sn') - if 'tmodelparam' in ModelName: - model_params_auto=str('uvr5_pack/lib_v5/modelparams/tmodelparam.json') - param_name_auto=str('User Model Param Set') - return param_name_auto , model_params_auto + if flag: + model_params_auto = data[type][0][model][i]["model_params"] + param_name_auto = data[type][0][model][i]["param_name"] + if type == "equivalent": + return param_name_auto, model_params_auto + else: + flag = False + return param_name_auto, model_params_auto diff --git a/vc_infer_pipeline.py b/vc_infer_pipeline.py index d351bfa..0668e41 100644 --- a/vc_infer_pipeline.py +++ b/vc_infer_pipeline.py @@ -1,36 +1,47 @@ -import numpy as np,parselmouth,torch,pdb +import numpy as np, parselmouth, torch, pdb from time import time as ttime import torch.nn.functional as F -from config import x_pad,x_query,x_center,x_max +from config import x_pad, x_query, x_center, x_max import scipy.signal as signal -import pyworld,os,traceback,faiss -class VC(object): - def __init__(self,tgt_sr,device,is_half): - self.sr=16000#hubert输入采样率 - self.window=160#每帧点数 - self.t_pad=self.sr*x_pad#每条前后pad时间 - self.t_pad_tgt=tgt_sr*x_pad - self.t_pad2=self.t_pad*2 - self.t_query=self.sr*x_query#查询切点前后查询时间 - self.t_center=self.sr*x_center#查询切点位置 - self.t_max=self.sr*x_max#免查询时长阈值 - self.device=device - self.is_half=is_half +import pyworld, os, traceback, faiss - def get_f0(self,x, p_len,f0_up_key,f0_method,inp_f0=None): + +class VC(object): + def __init__(self, tgt_sr, device, is_half): + self.sr = 16000 # hubert输入采样率 + self.window = 160 # 每帧点数 + self.t_pad = self.sr * x_pad # 每条前后pad时间 + self.t_pad_tgt = tgt_sr * x_pad + self.t_pad2 = self.t_pad * 2 + self.t_query = self.sr * x_query # 查询切点前后查询时间 + self.t_center = self.sr * x_center # 查询切点位置 + self.t_max = self.sr * x_max # 免查询时长阈值 + self.device = device + self.is_half = is_half + + def get_f0(self, x, p_len, f0_up_key, f0_method, inp_f0=None): time_step = self.window / self.sr * 1000 f0_min = 50 f0_max = 1100 f0_mel_min = 1127 * np.log(1 + f0_min / 700) f0_mel_max = 1127 * np.log(1 + f0_max / 700) - if(f0_method=="pm"): - f0 = parselmouth.Sound(x, self.sr).to_pitch_ac( - time_step=time_step / 1000, voicing_threshold=0.6, - pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency'] - pad_size=(p_len - len(f0) + 1) // 2 - if(pad_size>0 or p_len - len(f0) - pad_size>0): - f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant') - elif(f0_method=="harvest"): + if f0_method == "pm": + f0 = ( + parselmouth.Sound(x, self.sr) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=f0_min, + pitch_ceiling=f0_max, + ) + .selected_array["frequency"] + ) + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad( + f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" + ) + 
elif f0_method == "harvest":
             f0, t = pyworld.harvest(
                 x.astype(np.double),
                 fs=self.sr,
@@ -42,25 +53,45 @@ class VC(object):
         f0 = signal.medfilt(f0, 3)
         f0 *= pow(2, f0_up_key / 12)
         # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
-        tf0=self.sr//self.window#每秒f0点数
-        if (inp_f0 is not None):
-            delta_t=np.round((inp_f0[:,0].max()-inp_f0[:,0].min())*tf0+1).astype("int16")
-            replace_f0=np.interp(list(range(delta_t)), inp_f0[:, 0]*100, inp_f0[:, 1])
-            shape=f0[x_pad*tf0:x_pad*tf0+len(replace_f0)].shape[0]
-            f0[x_pad*tf0:x_pad*tf0+len(replace_f0)]=replace_f0[:shape]
+        tf0 = self.sr // self.window  # 每秒f0点数
+        if inp_f0 is not None:
+            delta_t = np.round(
+                (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
+            ).astype("int16")
+            replace_f0 = np.interp(
+                list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
+            )
+            shape = f0[x_pad * tf0 : x_pad * tf0 + len(replace_f0)].shape[0]
+            f0[x_pad * tf0 : x_pad * tf0 + len(replace_f0)] = replace_f0[:shape]
         # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
         f0bak = f0.copy()
         f0_mel = 1127 * np.log(1 + f0 / 700)
-        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
+        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+            f0_mel_max - f0_mel_min
+        ) + 1
         f0_mel[f0_mel <= 1] = 1
         f0_mel[f0_mel > 255] = 255
         f0_coarse = np.rint(f0_mel).astype(np.int)
-        return f0_coarse, f0bak#1-0
+        return f0_coarse, f0bak  # 1-0
 
-    def vc(self,model,net_g,sid,audio0,pitch,pitchf,times,index,big_npy,index_rate):#,file_index,file_big_npy
+    def vc(
+        self,
+        model,
+        net_g,
+        sid,
+        audio0,
+        pitch,
+        pitchf,
+        times,
+        index,
+        big_npy,
+        index_rate,
+    ):  # ,file_index,file_big_npy
         feats = torch.from_numpy(audio0)
-        if(self.is_half):feats=feats.half()
-        else:feats=feats.float()
+        if self.is_half:
+            feats = feats.half()
+        else:
+            feats = feats.float()
         if feats.dim() == 2:  # double channels
             feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
@@ -75,91 +106,196 @@ class VC(object):
         t0 = ttime()
         with torch.no_grad():
             logits = model.extract_features(**inputs)
-            feats = model.final_proj(logits[0])
+            feats = model.final_proj(logits[0])
 
-        if(isinstance(index,type(None))==False and isinstance(big_npy,type(None))==False and index_rate!=0):
+        if (
+            isinstance(index, type(None)) == False
+            and isinstance(big_npy, type(None)) == False
+            and index_rate != 0
+        ):
             npy = feats[0].cpu().numpy()
-            if(self.is_half):npy=npy.astype("float32")
+            if self.is_half:
+                npy = npy.astype("float32")
             _, I = index.search(npy, 1)
-            npy=big_npy[I.squeeze()]
-            if(self.is_half):npy=npy.astype("float16")
-            feats = torch.from_numpy(npy).unsqueeze(0).to(self.device)*index_rate + (1-index_rate)*feats
+            npy = big_npy[I.squeeze()]
+            if self.is_half:
+                npy = npy.astype("float16")
+            feats = (
+                torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+                + (1 - index_rate) * feats
+            )
 
         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
         t1 = ttime()
-        p_len = audio0.shape[0]//self.window
-        if(feats.shape[1]<p_len):
-            p_len=feats.shape[1]
-            if(pitch!=None and pitchf!=None):
-                pitch=pitch[:,:p_len]
-                pitchf=pitchf[:,:p_len]
-        p_len=torch.tensor([p_len],device=self.device).long()
+        p_len = audio0.shape[0] // self.window
+        if feats.shape[1] < p_len:
+            p_len = feats.shape[1]
+            if pitch != None and pitchf != None:
+                pitch = pitch[:, :p_len]
+                pitchf = pitchf[:, :p_len]
+        p_len = torch.tensor([p_len], device=self.device).long()
         with torch.no_grad():
-            if(pitch!=None and pitchf!=None):
-                audio1 = (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
-            else:
-                audio1 = (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
-        del feats,p_len,padding_mask
-        if torch.cuda.is_available(): torch.cuda.empty_cache()
+            if pitch != None and pitchf != None:
+                audio1 = (
+                    (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768)
+                    .data.cpu()
+                    .float()
+                    .numpy()
+                    .astype(np.int16)
+                )
+            else:
+                audio1 = (
+                    (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768)
+                    .data.cpu()
+                    .float()
+                    .numpy()
+                    .astype(np.int16)
+                )
+        del feats, p_len, padding_mask
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
         t2 = ttime()
-        times[0] += (t1 - t0)
-        times[2] += (t2 - t1)
+        times[0] += t1 - t0
+        times[2] += t2 - t1
         return audio1
 
-    def pipeline(self,model,net_g,sid,audio,times,f0_up_key,f0_method,file_index,file_big_npy,index_rate,if_f0,f0_file=None):
-        if(file_index!=""and file_big_npy!=""and os.path.exists(file_big_npy)==True and os.path.exists(file_index)==True and index_rate!=0):
+    def pipeline(
+        self,
+        model,
+        net_g,
+        sid,
+        audio,
+        times,
+        f0_up_key,
+        f0_method,
+        file_index,
+        file_big_npy,
+        index_rate,
+        if_f0,
+        f0_file=None,
+    ):
+        if (
+            file_index != ""
+            and file_big_npy != ""
+            and os.path.exists(file_big_npy) == True
+            and os.path.exists(file_index) == True
+            and index_rate != 0
+        ):
             try:
                 index = faiss.read_index(file_index)
                 big_npy = np.load(file_big_npy)
             except:
                 traceback.print_exc()
-                index=big_npy=None
+                index = big_npy = None
         else:
-            index=big_npy=None
-        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode='reflect')
+            index = big_npy = None
+        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
         opt_ts = []
-        if(audio_pad.shape[0]>self.t_max):
+        if audio_pad.shape[0] > self.t_max:
             audio_sum = np.zeros_like(audio)
-            for i in range(self.window): audio_sum += audio_pad[i:i - self.window]
-            for t in range(self.t_center, audio.shape[0],self.t_center):opt_ts.append(t - self.t_query + np.where(np.abs(audio_sum[t - self.t_query:t + self.t_query]) == np.abs(audio_sum[t - self.t_query:t + self.t_query]).min())[0][0])
+            for i in range(self.window):
+                audio_sum += audio_pad[i : i - self.window]
+            for t in
range(self.t_center, audio.shape[0], self.t_center): + opt_ts.append( + t + - self.t_query + + np.where( + np.abs(audio_sum[t - self.t_query : t + self.t_query]) + == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min() + )[0][0] + ) s = 0 - audio_opt=[] - t=None - t1=ttime() - audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode='reflect') - p_len=audio_pad.shape[0]//self.window - inp_f0=None - if(hasattr(f0_file,'name') ==True): + audio_opt = [] + t = None + t1 = ttime() + audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") + p_len = audio_pad.shape[0] // self.window + inp_f0 = None + if hasattr(f0_file, "name") == True: try: - with open(f0_file.name,"r")as f: - lines=f.read().strip("\n").split("\n") - inp_f0=[] - for line in lines:inp_f0.append([float(i)for i in line.split(",")]) - inp_f0=np.array(inp_f0,dtype="float32") + with open(f0_file.name, "r") as f: + lines = f.read().strip("\n").split("\n") + inp_f0 = [] + for line in lines: + inp_f0.append([float(i) for i in line.split(",")]) + inp_f0 = np.array(inp_f0, dtype="float32") except: traceback.print_exc() - sid=torch.tensor(sid,device=self.device).unsqueeze(0).long() - pitch, pitchf=None,None - if(if_f0==1): - pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key,f0_method,inp_f0) + sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() + pitch, pitchf = None, None + if if_f0 == 1: + pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, inp_f0) pitch = pitch[:p_len] pitchf = pitchf[:p_len] - pitch = torch.tensor(pitch,device=self.device).unsqueeze(0).long() - pitchf = torch.tensor(pitchf,device=self.device).unsqueeze(0).float() - t2=ttime() - times[1] += (t2 - t1) + pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() + pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() + t2 = ttime() + times[1] += t2 - t1 for t in opt_ts: - t=t//self.window*self.window - if (if_f0 == 1): - audio_opt.append(self.vc(model,net_g,sid,audio_pad[s:t+self.t_pad2+self.window],pitch[:,s//self.window:(t+self.t_pad2)//self.window],pitchf[:,s//self.window:(t+self.t_pad2)//self.window],times,index,big_npy,index_rate)[self.t_pad_tgt:-self.t_pad_tgt]) + t = t // self.window * self.window + if if_f0 == 1: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + pitch[:, s // self.window : (t + self.t_pad2) // self.window], + pitchf[:, s // self.window : (t + self.t_pad2) // self.window], + times, + index, + big_npy, + index_rate, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) else: - audio_opt.append(self.vc(model,net_g,sid,audio_pad[s:t+self.t_pad2+self.window],None,None,times,index,big_npy,index_rate)[self.t_pad_tgt:-self.t_pad_tgt]) + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + None, + None, + times, + index, + big_npy, + index_rate, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) s = t - if (if_f0 == 1): - audio_opt.append(self.vc(model,net_g,sid,audio_pad[t:],pitch[:,t//self.window:]if t is not None else pitch,pitchf[:,t//self.window:]if t is not None else pitchf,times,index,big_npy,index_rate)[self.t_pad_tgt:-self.t_pad_tgt]) + if if_f0 == 1: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[t:], + pitch[:, t // self.window :] if t is not None else pitch, + pitchf[:, t // self.window :] if t is not None else pitchf, + times, + index, + big_npy, + index_rate, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) else: - 
audio_opt.append(self.vc(model,net_g,sid,audio_pad[t:],None,None,times,index,big_npy,index_rate)[self.t_pad_tgt:-self.t_pad_tgt]) - audio_opt=np.concatenate(audio_opt) - del pitch,pitchf,sid - if torch.cuda.is_available(): torch.cuda.empty_cache() + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[t:], + None, + None, + times, + index, + big_npy, + index_rate, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + audio_opt = np.concatenate(audio_opt) + del pitch, pitchf, sid + if torch.cuda.is_available(): + torch.cuda.empty_cache() return audio_opt diff --git a/webui_locale.py b/webui_locale.py index b56b5f6..5ec47e2 100644 --- a/webui_locale.py +++ b/webui_locale.py @@ -1,16 +1,18 @@ import locale import json + def load_language_list(language): with open(f"./locale/{language}.json", "r", encoding="utf-8") as f: language_list = json.load(f) return language_list + class I18nAuto: def __init__(self, language=None): if language is None: - language = 'auto' - if language == 'auto': + language = "auto" + if language == "auto": language = locale.getdefaultlocale()[0] self.language = language print("Use Language:", language)
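
Note on the rewritten _get_name_params: instead of the removed if-chain it now walks the tables in uvr5_pack/name_params.json, matching "equivalent" entries by weight-file hash and "User Models" entries by substring of the model file name. The sketch below shows that traversal in isolation; the inline two-entry table stands in for the real JSON, and it is simplified in that it returns the first hit, where the shipped function lets a later user-model match win:

data = {  # inline stand-in for uvr5_pack/name_params.json
    "equivalent": [
        {
            "model_hash_name": [
                {
                    "hash_name": "47939caf0cfe52a0e81442b85b971dfd",
                    "model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
                    "param_name": "4band_44100",
                }
            ]
        }
    ],
    "User Models": [
        {
            "1 Band": [
                {
                    "hash_name": "1band_sr44100_hl512",
                    "model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
                    "param_name": "1band_sr44100_hl512",
                }
            ]
        }
    ],
}


def get_name_params(model_path, model_hash):
    for group in data:  # "equivalent" first, then "User Models"
        for table in data[group][0].values():
            for entry in table:
                # exact hash match for known models; substring match so a
                # file named after a parameter set picks it up automatically
                if entry["hash_name"] == model_hash or entry["hash_name"] in model_path:
                    return entry["param_name"], entry["model_params"]
    return None, None  # unknown model


print(get_name_params("weights/1band_sr44100_hl512_custom.pth", ""))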
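
The get_f0 hunk above is formatting only; the quantization it preserves maps f0 in Hz onto 255 mel-spaced bins, with bin 1 doubling as "unvoiced". Pulled out of the class for reference (np.int64 replaces the bare np.int alias, which newer NumPy removes; the constants are the ones set at the top of get_f0):

import numpy as np

f0_min, f0_max = 50, 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)


def coarse_f0(f0):
    # Hz -> mel, then rescale voiced frames into bins 1..255; 0 Hz stays at 1.
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
        f0_mel_max - f0_mel_min
    ) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    return np.rint(f0_mel).astype(np.int64)


print(coarse_f0(np.array([0.0, 50.0, 220.0, 1100.0])))  # [  1   1  60 255]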
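
The reformatted inference() in uvr5_pack/utils.py keeps its test-time-augmentation branch: when data["tta"] is set, the spectrogram is re-padded by half a window so block seams fall in different frames, the model runs a second time, and the two estimates are averaged. A toy sketch of that shift-and-average pattern, with predict() as a hypothetical stand-in for model.predict and a 2-D (bins, frames) array standing in for the real 3-D input:

import numpy as np


def predict(mag, roi_size):
    # hypothetical stand-in for the network; any windowed estimator will do
    return mag * 0.5


def infer_with_tta(mag, roi_size):
    pred = predict(mag, roi_size)  # pass 1: plain windowed inference
    # pass 2: offset the input by half a window, so artifacts at window
    # boundaries land in different frames than in pass 1
    shifted = np.pad(mag, ((0, 0), (roi_size // 2, roi_size // 2)), mode="constant")
    pred_tta = predict(shifted, roi_size)[:, roi_size // 2 :][:, : mag.shape[1]]
    return (pred + pred_tta) * 0.5  # average, as inference() does


x = np.random.rand(1025, 64)
print(infer_with_tta(x, roi_size=16).shape)  # (1025, 64)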