1
0
mirror of synced 2024-11-27 17:00:54 +01:00

Reformat and rewrite _get_name_params (#57)

* Reformat

* rewrite _get_name_params

* Add workflow for automatic formatting

* Revert "Add workflow for automatic formatting"

This reverts commit 9111c5dbc1.

* revert Retrieval_based_Voice_Conversion_WebUI.ipynb

---------

Co-authored-by: 源文雨 <41315874+fumiama@users.noreply.github.com>
This commit is contained in:
Ftps 2023-04-15 20:44:24 +09:00 committed by GitHub
parent aaa893c4b1
commit c8261b2ccc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
45 changed files with 4878 additions and 2456 deletions

View File

@ -1,13 +1,13 @@
########################硬件参数########################
#填写cuda:x, cpu 或 mps, x指代第几张卡只支持 N卡 / Apple Silicon 加速
device = "cuda:0"
# 填写cuda:x, cpu 或 mps, x指代第几张卡只支持 N卡 / Apple Silicon 加速
device = "cuda:0"
#9-10-20-30-40系显卡无脑True不影响质量>=20显卡开启有加速
is_half = True
# 9-10-20-30-40系显卡无脑True不影响质量>=20显卡开启有加速
is_half = True
#默认0用上所有线程写数字限制CPU资源使用
n_cpu = 0
# 默认0用上所有线程写数字限制CPU资源使用
n_cpu = 0
########################硬件参数########################
@ -16,31 +16,38 @@ n_cpu = 0
########################命令行参数########################
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--port", type=int, default=7865, help="Listen port")
parser.add_argument("--pycmd", type=str, default="python", help="Python command")
parser.add_argument("--colab", action='store_true', help="Launch in colab")
parser.add_argument("--noparallel", action='store_true', help="Disable parallel processing")
parser.add_argument("--noautoopen", action='store_true', help="Do not open in browser automatically")
parser.add_argument("--colab", action="store_true", help="Launch in colab")
parser.add_argument(
"--noparallel", action="store_true", help="Disable parallel processing"
)
parser.add_argument(
"--noautoopen", action="store_true", help="Do not open in browser automatically"
)
cmd_opts = parser.parse_args()
python_cmd=cmd_opts.pycmd
listen_port=cmd_opts.port
iscolab=cmd_opts.colab
noparallel=cmd_opts.noparallel
noautoopen=cmd_opts.noautoopen
python_cmd = cmd_opts.pycmd
listen_port = cmd_opts.port
iscolab = cmd_opts.colab
noparallel = cmd_opts.noparallel
noautoopen = cmd_opts.noautoopen
########################命令行参数########################
import sys
import torch
# has_mps is only available in nightly pytorch (for now) and MasOS 12.3+.
# check `getattr` and try it for compatibility
def has_mps() -> bool:
if sys.platform != "darwin":
return False
else:
if not getattr(torch, 'has_mps', False): return False
if not getattr(torch, "has_mps", False):
return False
try:
torch.zeros(1).to(torch.device("mps"))
return True
@ -48,32 +55,34 @@ def has_mps() -> bool:
return False
if(not torch.cuda.is_available()):
if not torch.cuda.is_available():
if has_mps():
print("没有发现支持的N卡, 使用MPS进行推理")
device = "mps"
device = "mps"
else:
print("没有发现支持的N卡, 使用CPU进行推理")
device = "cpu"
device = "cpu"
is_half = False
if(device not in ["cpu", "mps"]):
if device not in ["cpu", "mps"]:
gpu_name = torch.cuda.get_device_name(int(device.split(":")[-1]))
if("16" in gpu_name or "MX" in gpu_name):
if "16" in gpu_name or "MX" in gpu_name:
print("16系显卡/MX系显卡强制单精度")
is_half = False
from multiprocessing import cpu_count
if(n_cpu==0): n_cpu=cpu_count()
if(is_half):
#6G显存配置
x_pad = 3
x_query = 10
x_center = 60
x_max = 65
if n_cpu == 0:
n_cpu = cpu_count()
if is_half:
# 6G显存配置
x_pad = 3
x_query = 10
x_center = 60
x_max = 65
else:
#5G显存配置
x_pad = 1
x_query = 6
x_center = 38
x_max = 41
# 5G显存配置
x_pad = 1
x_query = 6
x_center = 38
x_max = 41

View File

@ -5,40 +5,43 @@ person = "Shiroha/shiroha.pth"
exported_path = "model.onnx"
cpt = torch.load(person, map_location="cpu")
cpt["config"][-3]=cpt["weight"]["emb_g.weight"].shape[0]#n_spk
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
print(*cpt["config"])
net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=False)
net_g.load_state_dict(cpt["weight"], strict=False)
test_phone = torch.rand(1, 200, 256)
test_phone_lengths = torch.tensor([200]).long()
test_pitch = torch.randint(size=(1 ,200),low=5,high=255)
test_pitch = torch.randint(size=(1, 200), low=5, high=255)
test_pitchf = torch.rand(1, 200)
test_ds = torch.LongTensor([0])
test_rnd = torch.rand(1, 192, 200)
input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
output_names = ["audio", ]
device="cpu"
torch.onnx.export(net_g,
(
test_phone.to(device),
test_phone_lengths.to(device),
test_pitch.to(device),
test_pitchf.to(device),
test_ds.to(device),
test_rnd.to(device)
),
exported_path,
dynamic_axes={
"phone": [1],
"pitch": [1],
"pitchf": [1],
"rnd": [2],
},
do_constant_folding=False,
opset_version=16,
verbose=False,
input_names=input_names,
output_names=output_names)
output_names = [
"audio",
]
device = "cpu"
torch.onnx.export(
net_g,
(
test_phone.to(device),
test_phone_lengths.to(device),
test_pitch.to(device),
test_pitchf.to(device),
test_ds.to(device),
test_rnd.to(device),
),
exported_path,
dynamic_axes={
"phone": [1],
"pitch": [1],
"pitchf": [1],
"rnd": [2],
},
do_constant_folding=False,
opset_version=16,
verbose=False,
input_names=input_names,
output_names=output_names,
)

View File

@ -1,21 +1,26 @@
import os,traceback,sys,parselmouth
import os, traceback, sys, parselmouth
import librosa
import pyworld
from scipy.io import wavfile
import numpy as np,logging
logging.getLogger('numba').setLevel(logging.WARNING)
import numpy as np, logging
logging.getLogger("numba").setLevel(logging.WARNING)
from multiprocessing import Process
exp_dir = sys.argv[1]
f = open("%s/extract_f0_feature.log"%exp_dir, "a+")
f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
def printt(strr):
print(strr)
f.write("%s\n" % strr)
f.flush()
n_p = int(sys.argv[2])
f0method = sys.argv[3]
class FeatureInput(object):
def __init__(self, samplerate=16000, hop_size=160):
self.fs = samplerate
@ -27,21 +32,30 @@ class FeatureInput(object):
self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
def compute_f0(self, path,f0_method):
def compute_f0(self, path, f0_method):
x, sr = librosa.load(path, self.fs)
p_len=x.shape[0]//self.hop
p_len = x.shape[0] // self.hop
assert sr == self.fs
if(f0_method=="pm"):
if f0_method == "pm":
time_step = 160 / 16000 * 1000
f0_min = 50
f0_max = 1100
f0 = parselmouth.Sound(x, sr).to_pitch_ac(
time_step=time_step / 1000, voicing_threshold=0.6,
pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
pad_size=(p_len - len(f0) + 1) // 2
if(pad_size>0 or p_len - len(f0) - pad_size>0):
f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
elif(f0_method=="harvest"):
f0 = (
parselmouth.Sound(x, sr)
.to_pitch_ac(
time_step=time_step / 1000,
voicing_threshold=0.6,
pitch_floor=f0_min,
pitch_ceiling=f0_max,
)
.selected_array["frequency"]
)
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad(
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
)
elif f0_method == "harvest":
f0, t = pyworld.harvest(
x.astype(np.double),
fs=sr,
@ -50,7 +64,7 @@ class FeatureInput(object):
frame_period=1000 * self.hop / sr,
)
f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
elif(f0_method=="dio"):
elif f0_method == "dio":
f0, t = pyworld.dio(
x.astype(np.double),
fs=sr,
@ -77,45 +91,67 @@ class FeatureInput(object):
)
return f0_coarse
def go(self,paths,f0_method):
if (len(paths) == 0): printt("no-f0-todo")
def go(self, paths, f0_method):
if len(paths) == 0:
printt("no-f0-todo")
else:
printt("todo-f0-%s"%len(paths))
n=max(len(paths)//5,1)#每个进程最多打印5条
for idx,(inp_path,opt_path1,opt_path2) in enumerate(paths):
printt("todo-f0-%s" % len(paths))
n = max(len(paths) // 5, 1) # 每个进程最多打印5条
for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
try:
if(idx%n==0):printt("f0ing,now-%s,all-%s,-%s"%(idx,len(paths),inp_path))
if(os.path.exists(opt_path1+".npy")==True and os.path.exists(opt_path2+".npy")==True):continue
featur_pit = self.compute_f0(inp_path,f0_method)
np.save(opt_path2,featur_pit,allow_pickle=False,)#nsf
if idx % n == 0:
printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path))
if (
os.path.exists(opt_path1 + ".npy") == True
and os.path.exists(opt_path2 + ".npy") == True
):
continue
featur_pit = self.compute_f0(inp_path, f0_method)
np.save(
opt_path2,
featur_pit,
allow_pickle=False,
) # nsf
coarse_pit = self.coarse_f0(featur_pit)
np.save(opt_path1,coarse_pit,allow_pickle=False,)#ori
np.save(
opt_path1,
coarse_pit,
allow_pickle=False,
) # ori
except:
printt("f0fail-%s-%s-%s" % (idx, inp_path,traceback.format_exc()))
printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc()))
if __name__=='__main__':
if __name__ == "__main__":
# exp_dir=r"E:\codes\py39\dataset\mi-test"
# n_p=16
# f = open("%s/log_extract_f0.log"%exp_dir, "w")
printt(sys.argv)
featureInput = FeatureInput()
paths=[]
inp_root= "%s/1_16k_wavs"%(exp_dir)
opt_root1="%s/2a_f0"%(exp_dir)
opt_root2="%s/2b-f0nsf"%(exp_dir)
paths = []
inp_root = "%s/1_16k_wavs" % (exp_dir)
opt_root1 = "%s/2a_f0" % (exp_dir)
opt_root2 = "%s/2b-f0nsf" % (exp_dir)
os.makedirs(opt_root1,exist_ok=True)
os.makedirs(opt_root2,exist_ok=True)
os.makedirs(opt_root1, exist_ok=True)
os.makedirs(opt_root2, exist_ok=True)
for name in sorted(list(os.listdir(inp_root))):
inp_path="%s/%s"%(inp_root,name)
if ("spec" in inp_path): continue
opt_path1="%s/%s"%(opt_root1,name)
opt_path2="%s/%s"%(opt_root2,name)
paths.append([inp_path,opt_path1,opt_path2])
inp_path = "%s/%s" % (inp_root, name)
if "spec" in inp_path:
continue
opt_path1 = "%s/%s" % (opt_root1, name)
opt_path2 = "%s/%s" % (opt_root2, name)
paths.append([inp_path, opt_path1, opt_path2])
ps=[]
ps = []
for i in range(n_p):
p=Process(target=featureInput.go,args=(paths[i::n_p],f0method,))
p = Process(
target=featureInput.go,
args=(
paths[i::n_p],
f0method,
),
)
p.start()
ps.append(p)
for p in ps:

View File

@ -1,33 +1,41 @@
import os,sys,traceback
import os, sys, traceback
# device=sys.argv[1]
n_part=int(sys.argv[2])
i_part=int(sys.argv[3])
n_part = int(sys.argv[2])
i_part = int(sys.argv[3])
if len(sys.argv) == 5:
exp_dir=sys.argv[4]
exp_dir = sys.argv[4]
else:
i_gpu=sys.argv[4]
exp_dir=sys.argv[5]
os.environ["CUDA_VISIBLE_DEVICES"]=str(i_gpu)
i_gpu = sys.argv[4]
exp_dir = sys.argv[5]
os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu)
import torch
import torch.nn.functional as F
import soundfile as sf
import numpy as np
from fairseq import checkpoint_utils
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
f = open("%s/extract_f0_feature.log"%exp_dir, "a+")
f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
def printt(strr):
print(strr)
f.write("%s\n" % strr)
f.flush()
printt(sys.argv)
model_path = "hubert_base.pt"
printt(exp_dir)
wavPath = "%s/1_16k_wavs"%exp_dir
outPath = "%s/3_feature256"%exp_dir
os.makedirs(outPath,exist_ok=True)
wavPath = "%s/1_16k_wavs" % exp_dir
outPath = "%s/3_feature256" % exp_dir
os.makedirs(outPath, exist_ok=True)
# wave must be 16k, hop_size=320
def readwave(wav_path, normalize=False):
wav, sr = sf.read(wav_path)
@ -41,6 +49,8 @@ def readwave(wav_path, normalize=False):
feats = F.layer_norm(feats, feats.shape)
feats = feats.view(1, -1)
return feats
# HuBERT model
printt("load model(s) from {}".format(model_path))
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
@ -49,27 +59,32 @@ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
)
model = models[0]
model = model.to(device)
printt("move model to %s"%device)
if device != "cpu": model = model.half()
printt("move model to %s" % device)
if device != "cpu":
model = model.half()
model.eval()
todo=sorted(list(os.listdir(wavPath)))[i_part::n_part]
n = max(1,len(todo) // 10) # 最多打印十条
if(len(todo)==0):printt("no-feature-todo")
todo = sorted(list(os.listdir(wavPath)))[i_part::n_part]
n = max(1, len(todo) // 10) # 最多打印十条
if len(todo) == 0:
printt("no-feature-todo")
else:
printt("all-feature-%s"%len(todo))
for idx,file in enumerate(todo):
printt("all-feature-%s" % len(todo))
for idx, file in enumerate(todo):
try:
if file.endswith(".wav"):
wav_path = "%s/%s"%(wavPath,file)
out_path = "%s/%s"%(outPath,file.replace("wav","npy"))
wav_path = "%s/%s" % (wavPath, file)
out_path = "%s/%s" % (outPath, file.replace("wav", "npy"))
if(os.path.exists(out_path)):continue
if os.path.exists(out_path):
continue
feats = readwave(wav_path, normalize=saved_cfg.task.normalize)
padding_mask = torch.BoolTensor(feats.shape).fill_(False)
inputs = {
"source": feats.half().to(device) if device != "cpu" else feats.to(device),
"source": feats.half().to(device)
if device != "cpu"
else feats.to(device),
"padding_mask": padding_mask.to(device),
"output_layer": 9, # layer 9
}
@ -78,11 +93,12 @@ else:
feats = model.final_proj(logits[0])
feats = feats.squeeze(0).float().cpu().numpy()
if(np.isnan(feats).sum()==0):
if np.isnan(feats).sum() == 0:
np.save(out_path, feats, allow_pickle=False)
else:
printt("%s-contains nan"%file)
if (idx % n == 0):printt("now-%s,all-%s,%s,%s"%(len(todo),idx,file,feats.shape))
printt("%s-contains nan" % file)
if idx % n == 0:
printt("now-%s,all-%s,%s,%s" % (len(todo), idx, file, feats.shape))
except:
printt(traceback.format_exc())
printt("all-feature-done")

View File

@ -7,9 +7,10 @@ pattern = r"""i18n\((["'][^"']+["'])\)"""
# Initialize the dictionary to store key-value pairs
data = {}
def process(fn: str):
global data
with open(fn, 'r', encoding='utf-8') as f:
with open(fn, "r", encoding="utf-8") as f:
contents = f.read()
matches = re.findall(pattern, contents)
for key in matches:
@ -17,12 +18,13 @@ def process(fn: str):
print("extract:", key)
data[key] = key
print("processing infer-web.py")
process('infer-web.py')
process("infer-web.py")
print("processing gui.py")
process('gui.py')
process("gui.py")
# Save as a JSON file
with open('./locale/zh_CN.json', 'w', encoding='utf-8') as f:
with open("./locale/zh_CN.json", "w", encoding="utf-8") as f:
json.dump(data, f, ensure_ascii=False, indent=4)

546
gui.py
View File

@ -3,32 +3,36 @@ import sounddevice as sd
import noisereduce as nr
import numpy as np
from fairseq import checkpoint_utils
import librosa,torch,parselmouth,faiss,time,threading
import librosa, torch, parselmouth, faiss, time, threading
import torch.nn.functional as F
import torchaudio.transforms as tat
#import matplotlib.pyplot as plt
# import matplotlib.pyplot as plt
from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
from webui_locale import I18nAuto
i18n = I18nAuto()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class RVC:
def __init__(self,key,hubert_path,pth_path,index_path,npy_path,index_rate) -> None:
'''
def __init__(
self, key, hubert_path, pth_path, index_path, npy_path, index_rate
) -> None:
"""
初始化
'''
self.f0_up_key=key
"""
self.f0_up_key = key
self.time_step = 160 / 16000 * 1000
self.f0_min = 50
self.f0_max = 1100
self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
self.index=faiss.read_index(index_path)
self.index_rate=index_rate
'''NOT YET USED'''
self.big_npy=np.load(npy_path)
self.index = faiss.read_index(index_path)
self.index_rate = index_rate
"""NOT YET USED"""
self.big_npy = np.load(npy_path)
model_path = hubert_path
print("load model(s) from {}".format(model_path))
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
@ -41,9 +45,9 @@ class RVC:
self.model.eval()
cpt = torch.load(pth_path, map_location="cpu")
tgt_sr = cpt["config"][-1]
cpt["config"][-3]=cpt["weight"]["emb_g.weight"].shape[0]#n_spk
if_f0=cpt.get("f0",1)
if(if_f0==1):
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
if_f0 = cpt.get("f0", 1)
if if_f0 == 1:
self.net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=True)
else:
self.net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
@ -52,36 +56,43 @@ class RVC:
self.net_g.eval().to(device)
self.net_g.half()
def get_f0_coarse(self,f0):
def get_f0_coarse(self, f0):
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (self.f0_mel_max - self.f0_mel_min) + 1
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (
self.f0_mel_max - self.f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
# f0_mel[f0_mel > 188] = 188
f0_coarse = np.rint(f0_mel).astype(np.int)
return f0_coarse
def get_f0(self,x, p_len,f0_up_key=0):
f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
time_step=self.time_step / 1000, voicing_threshold=0.6,
pitch_floor=self.f0_min, pitch_ceiling=self.f0_max).selected_array['frequency']
def get_f0(self, x, p_len, f0_up_key=0):
f0 = (
parselmouth.Sound(x, 16000)
.to_pitch_ac(
time_step=self.time_step / 1000,
voicing_threshold=0.6,
pitch_floor=self.f0_min,
pitch_ceiling=self.f0_max,
)
.selected_array["frequency"]
)
pad_size=(p_len - len(f0) + 1) // 2
if(pad_size>0 or p_len - len(f0) - pad_size>0):
f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
f0 *= pow(2, f0_up_key / 12)
# f0=suofang(f0)
f0bak = f0.copy()
f0_coarse=self.get_f0_coarse(f0)
f0_coarse = self.get_f0_coarse(f0)
return f0_coarse, f0bak
def infer(self,feats:torch.Tensor) -> np.ndarray:
'''
def infer(self, feats: torch.Tensor) -> np.ndarray:
"""
推理函数
'''
audio=feats.clone().cpu().numpy()
"""
audio = feats.clone().cpu().numpy()
assert feats.dim() == 1, feats.dim()
feats = feats.view(1, -1)
padding_mask = torch.BoolTensor(feats.shape).fill_(False)
@ -96,209 +107,389 @@ class RVC:
feats = self.model.final_proj(logits[0])
####索引优化
if(isinstance(self.index,type(None))==False and isinstance(self.big_npy,type(None))==False and self.index_rate!=0):
if (
isinstance(self.index, type(None)) == False
and isinstance(self.big_npy, type(None)) == False
and self.index_rate != 0
):
npy = feats[0].cpu().numpy().astype("float32")
_, I = self.index.search(npy, 1)
npy=self.big_npy[I.squeeze()].astype("float16")
feats = torch.from_numpy(npy).unsqueeze(0).to(device)*self.index_rate + (1-self.index_rate)*feats
npy = self.big_npy[I.squeeze()].astype("float16")
feats = (
torch.from_numpy(npy).unsqueeze(0).to(device) * self.index_rate
+ (1 - self.index_rate) * feats
)
feats=F.interpolate(feats.permute(0,2,1),scale_factor=2).permute(0,2,1)
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
torch.cuda.synchronize()
# p_len = min(feats.shape[1],10000,pitch.shape[0])#太大了爆显存
p_len = min(feats.shape[1],12000)#
p_len = min(feats.shape[1], 12000) #
print(feats.shape)
pitch, pitchf = self.get_f0(audio, p_len,self.f0_up_key)
p_len = min(feats.shape[1],12000,pitch.shape[0])#太大了爆显存
pitch, pitchf = self.get_f0(audio, p_len, self.f0_up_key)
p_len = min(feats.shape[1], 12000, pitch.shape[0]) # 太大了爆显存
torch.cuda.synchronize()
# print(feats.shape,pitch.shape)
feats = feats[:,:p_len, :]
feats = feats[:, :p_len, :]
pitch = pitch[:p_len]
pitchf = pitchf[:p_len]
p_len = torch.LongTensor([p_len]).to(device)
pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device)
ii=0#sid
sid=torch.LongTensor([ii]).to(device)
ii = 0 # sid
sid = torch.LongTensor([ii]).to(device)
with torch.no_grad():
infered_audio = self.net_g.infer(feats, p_len,pitch,pitchf,sid)[0][0, 0].data.cpu().float()#nsf
infered_audio = (
self.net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
.data.cpu()
.float()
) # nsf
torch.cuda.synchronize()
return infered_audio
class Config:
def __init__(self) -> None:
self.hubert_path:str=''
self.pth_path:str=''
self.index_path:str=''
self.npy_path:str=''
self.pitch:int=12
self.samplerate:int=44100
self.block_time:float=1.0#s
self.buffer_num:int=1
self.threhold:int=-30
self.crossfade_time:float=0.08
self.extra_time:float=0.04
self.I_noise_reduce=False
self.O_noise_reduce=False
self.index_rate=0.3
self.hubert_path: str = ""
self.pth_path: str = ""
self.index_path: str = ""
self.npy_path: str = ""
self.pitch: int = 12
self.samplerate: int = 44100
self.block_time: float = 1.0 # s
self.buffer_num: int = 1
self.threhold: int = -30
self.crossfade_time: float = 0.08
self.extra_time: float = 0.04
self.I_noise_reduce = False
self.O_noise_reduce = False
self.index_rate = 0.3
class GUI:
def __init__(self) -> None:
self.config=Config()
self.flag_vc=False
self.config = Config()
self.flag_vc = False
self.launcher()
def launcher(self):
sg.theme('LightBlue3')
input_devices,output_devices,_, _=self.get_devices()
layout=[
sg.theme("LightBlue3")
input_devices, output_devices, _, _ = self.get_devices()
layout = [
[
sg.Frame(title=i18n('加载模型'),layout=[
[sg.Input(default_text='TEMP\\hubert_base.pt',key='hubert_path'),sg.FileBrowse(i18n('Hubert模型'))],
[sg.Input(default_text='TEMP\\atri.pth',key='pth_path'),sg.FileBrowse(i18n('选择.pth文件'))],
[sg.Input(default_text='TEMP\\added_IVF512_Flat_atri_baseline_src_feat.index',key='index_path'),sg.FileBrowse(i18n('选择.index文件'))],
[sg.Input(default_text='TEMP\\big_src_feature_atri.npy',key='npy_path'),sg.FileBrowse(i18n('选择.npy文件'))]
])
sg.Frame(
title=i18n("加载模型"),
layout=[
[
sg.Input(
default_text="TEMP\\hubert_base.pt", key="hubert_path"
),
sg.FileBrowse(i18n("Hubert模型")),
],
[
sg.Input(default_text="TEMP\\atri.pth", key="pth_path"),
sg.FileBrowse(i18n("选择.pth文件")),
],
[
sg.Input(
default_text="TEMP\\added_IVF512_Flat_atri_baseline_src_feat.index",
key="index_path",
),
sg.FileBrowse(i18n("选择.index文件")),
],
[
sg.Input(
default_text="TEMP\\big_src_feature_atri.npy",
key="npy_path",
),
sg.FileBrowse(i18n("选择.npy文件")),
],
],
)
],
[
sg.Frame(layout=[
[sg.Text(i18n("输入设备")),sg.Combo(input_devices,key='sg_input_device',default_value=input_devices[sd.default.device[0]])],
[sg.Text(i18n("输出设备")),sg.Combo(output_devices,key='sg_output_device',default_value=output_devices[sd.default.device[1]])]
],title=i18n("音频设备(请使用同种类驱动)"))
sg.Frame(
layout=[
[
sg.Text(i18n("输入设备")),
sg.Combo(
input_devices,
key="sg_input_device",
default_value=input_devices[sd.default.device[0]],
),
],
[
sg.Text(i18n("输出设备")),
sg.Combo(
output_devices,
key="sg_output_device",
default_value=output_devices[sd.default.device[1]],
),
],
],
title=i18n("音频设备(请使用同种类驱动)"),
)
],
[
sg.Frame(layout=[
[sg.Text(i18n("响应阈值")),sg.Slider(range=(-60,0),key='threhold',resolution=1,orientation='h',default_value=-30)],
[sg.Text(i18n("音调设置")),sg.Slider(range=(-24,24),key='pitch',resolution=1,orientation='h',default_value=12)],
[sg.Text(i18n('Index Rate')),sg.Slider(range=(0.0,1.0),key='index_rate',resolution=0.01,orientation='h',default_value=0.5)]
],title=i18n("常规设置")),
sg.Frame(layout=[
[sg.Text(i18n("采样长度")),sg.Slider(range=(0.1,3.0),key='block_time',resolution=0.1,orientation='h',default_value=1.0)],
[sg.Text(i18n("淡入淡出长度")),sg.Slider(range=(0.01,0.15),key='crossfade_length',resolution=0.01,orientation='h',default_value=0.08)],
[sg.Text(i18n("额外推理时长")),sg.Slider(range=(0.05,3.00),key='extra_time',resolution=0.01,orientation='h',default_value=0.05)],
[sg.Checkbox(i18n('输入降噪'),key='I_noise_reduce'),sg.Checkbox(i18n('输出降噪'),key='O_noise_reduce')]
],title=i18n("性能设置"))
sg.Frame(
layout=[
[
sg.Text(i18n("响应阈值")),
sg.Slider(
range=(-60, 0),
key="threhold",
resolution=1,
orientation="h",
default_value=-30,
),
],
[
sg.Text(i18n("音调设置")),
sg.Slider(
range=(-24, 24),
key="pitch",
resolution=1,
orientation="h",
default_value=12,
),
],
[
sg.Text(i18n("Index Rate")),
sg.Slider(
range=(0.0, 1.0),
key="index_rate",
resolution=0.01,
orientation="h",
default_value=0.5,
),
],
],
title=i18n("常规设置"),
),
sg.Frame(
layout=[
[
sg.Text(i18n("采样长度")),
sg.Slider(
range=(0.1, 3.0),
key="block_time",
resolution=0.1,
orientation="h",
default_value=1.0,
),
],
[
sg.Text(i18n("淡入淡出长度")),
sg.Slider(
range=(0.01, 0.15),
key="crossfade_length",
resolution=0.01,
orientation="h",
default_value=0.08,
),
],
[
sg.Text(i18n("额外推理时长")),
sg.Slider(
range=(0.05, 3.00),
key="extra_time",
resolution=0.01,
orientation="h",
default_value=0.05,
),
],
[
sg.Checkbox(i18n("输入降噪"), key="I_noise_reduce"),
sg.Checkbox(i18n("输出降噪"), key="O_noise_reduce"),
],
],
title=i18n("性能设置"),
),
],
[
sg.Button(i18n("开始音频转换"), key="start_vc"),
sg.Button(i18n("停止音频转换"), key="stop_vc"),
sg.Text(i18n("推理时间(ms):")),
sg.Text("0", key="infer_time"),
],
[sg.Button(i18n("开始音频转换"),key='start_vc'),sg.Button(i18n("停止音频转换"),key='stop_vc'),sg.Text(i18n("推理时间(ms):")),sg.Text("0",key='infer_time')]
]
self.window=sg.Window("RVC - GUI",layout=layout)
self.window = sg.Window("RVC - GUI", layout=layout)
self.event_handler()
def event_handler(self):
while True:
event, values = self.window.read()
if event ==sg.WINDOW_CLOSED:
self.flag_vc=False
if event == sg.WINDOW_CLOSED:
self.flag_vc = False
exit()
if event == 'start_vc' and self.flag_vc==False:
if event == "start_vc" and self.flag_vc == False:
self.set_values(values)
print(str(self.config.__dict__))
print('using_cuda:'+str(torch.cuda.is_available()))
print("using_cuda:" + str(torch.cuda.is_available()))
self.start_vc()
if event=='stop_vc'and self.flag_vc==True:
if event == "stop_vc" and self.flag_vc == True:
self.flag_vc = False
def set_values(self,values):
self.set_devices(values["sg_input_device"],values['sg_output_device'])
self.config.hubert_path=values['hubert_path']
self.config.pth_path=values['pth_path']
self.config.index_path=values['index_path']
self.config.npy_path=values['npy_path']
self.config.threhold=values['threhold']
self.config.pitch=values['pitch']
self.config.block_time=values['block_time']
self.config.crossfade_time=values['crossfade_length']
self.config.extra_time=values['extra_time']
self.config.I_noise_reduce=values['I_noise_reduce']
self.config.O_noise_reduce=values['O_noise_reduce']
self.config.index_rate=values['index_rate']
def set_values(self, values):
self.set_devices(values["sg_input_device"], values["sg_output_device"])
self.config.hubert_path = values["hubert_path"]
self.config.pth_path = values["pth_path"]
self.config.index_path = values["index_path"]
self.config.npy_path = values["npy_path"]
self.config.threhold = values["threhold"]
self.config.pitch = values["pitch"]
self.config.block_time = values["block_time"]
self.config.crossfade_time = values["crossfade_length"]
self.config.extra_time = values["extra_time"]
self.config.I_noise_reduce = values["I_noise_reduce"]
self.config.O_noise_reduce = values["O_noise_reduce"]
self.config.index_rate = values["index_rate"]
def start_vc(self):
torch.cuda.empty_cache()
self.flag_vc=True
self.block_frame=int(self.config.block_time*self.config.samplerate)
self.crossfade_frame=int(self.config.crossfade_time*self.config.samplerate)
self.sola_search_frame=int(0.012*self.config.samplerate)
self.delay_frame=int(0.02*self.config.samplerate)#往前预留0.02s
self.extra_frame=int(self.config.extra_time*self.config.samplerate)#往后预留0.04s
self.rvc=None
self.rvc=RVC(self.config.pitch,self.config.hubert_path,self.config.pth_path,self.config.index_path,self.config.npy_path,self.config.index_rate)
self.input_wav:np.ndarray=np.zeros(self.extra_frame+self.crossfade_frame+self.sola_search_frame+self.block_frame,dtype='float32')
self.output_wav:torch.Tensor=torch.zeros(self.block_frame,device=device,dtype=torch.float32)
self.sola_buffer:torch.Tensor=torch.zeros(self.crossfade_frame,device=device,dtype=torch.float32)
self.fade_in_window:torch.Tensor=torch.linspace(0.0,1.0,steps=self.crossfade_frame,device=device,dtype=torch.float32)
self.fade_out_window:torch.Tensor = 1 - self.fade_in_window
self.resampler1=tat.Resample(orig_freq=self.config.samplerate,new_freq=16000,dtype=torch.float32)
self.resampler2=tat.Resample(orig_freq=40000,new_freq=self.config.samplerate,dtype=torch.float32)
thread_vc=threading.Thread(target=self.soundinput)
self.flag_vc = True
self.block_frame = int(self.config.block_time * self.config.samplerate)
self.crossfade_frame = int(self.config.crossfade_time * self.config.samplerate)
self.sola_search_frame = int(0.012 * self.config.samplerate)
self.delay_frame = int(0.02 * self.config.samplerate) # 往前预留0.02s
self.extra_frame = int(
self.config.extra_time * self.config.samplerate
) # 往后预留0.04s
self.rvc = None
self.rvc = RVC(
self.config.pitch,
self.config.hubert_path,
self.config.pth_path,
self.config.index_path,
self.config.npy_path,
self.config.index_rate,
)
self.input_wav: np.ndarray = np.zeros(
self.extra_frame
+ self.crossfade_frame
+ self.sola_search_frame
+ self.block_frame,
dtype="float32",
)
self.output_wav: torch.Tensor = torch.zeros(
self.block_frame, device=device, dtype=torch.float32
)
self.sola_buffer: torch.Tensor = torch.zeros(
self.crossfade_frame, device=device, dtype=torch.float32
)
self.fade_in_window: torch.Tensor = torch.linspace(
0.0, 1.0, steps=self.crossfade_frame, device=device, dtype=torch.float32
)
self.fade_out_window: torch.Tensor = 1 - self.fade_in_window
self.resampler1 = tat.Resample(
orig_freq=self.config.samplerate, new_freq=16000, dtype=torch.float32
)
self.resampler2 = tat.Resample(
orig_freq=40000, new_freq=self.config.samplerate, dtype=torch.float32
)
thread_vc = threading.Thread(target=self.soundinput)
thread_vc.start()
def soundinput(self):
'''
"""
接受音频输入
'''
with sd.Stream(callback=self.audio_callback, blocksize=self.block_frame,samplerate=self.config.samplerate,dtype='float32'):
"""
with sd.Stream(
callback=self.audio_callback,
blocksize=self.block_frame,
samplerate=self.config.samplerate,
dtype="float32",
):
while self.flag_vc:
time.sleep(self.config.block_time)
print('Audio block passed.')
print('ENDing VC')
print("Audio block passed.")
print("ENDing VC")
def audio_callback(self,indata:np.ndarray,outdata:np.ndarray, frames, times, status):
'''
def audio_callback(
self, indata: np.ndarray, outdata: np.ndarray, frames, times, status
):
"""
音频处理
'''
start_time=time.perf_counter()
indata=librosa.to_mono(indata.T)
"""
start_time = time.perf_counter()
indata = librosa.to_mono(indata.T)
if self.config.I_noise_reduce:
indata[:]=nr.reduce_noise(y=indata,sr=self.config.samplerate)
indata[:] = nr.reduce_noise(y=indata, sr=self.config.samplerate)
'''noise gate'''
frame_length=2048
hop_length=1024
rms=librosa.feature.rms(y=indata,frame_length=frame_length,hop_length=hop_length)
db_threhold=librosa.amplitude_to_db(rms,ref=1.0)[0]<self.config.threhold
#print(rms.shape,db.shape,db)
"""noise gate"""
frame_length = 2048
hop_length = 1024
rms = librosa.feature.rms(
y=indata, frame_length=frame_length, hop_length=hop_length
)
db_threhold = librosa.amplitude_to_db(rms, ref=1.0)[0] < self.config.threhold
# print(rms.shape,db.shape,db)
for i in range(db_threhold.shape[0]):
if db_threhold[i]:
indata[i*hop_length:(i+1)*hop_length]=0
self.input_wav[:]=np.append(self.input_wav[self.block_frame:],indata)
indata[i * hop_length : (i + 1) * hop_length] = 0
self.input_wav[:] = np.append(self.input_wav[self.block_frame :], indata)
#infer
print('input_wav:'+str(self.input_wav.shape))
#print('infered_wav:'+str(infer_wav.shape))
infer_wav:torch.Tensor=self.resampler2(self.rvc.infer(self.resampler1(torch.from_numpy(self.input_wav))))[-self.crossfade_frame-self.sola_search_frame-self.block_frame:].to(device)
print('infer_wav:'+str(infer_wav.shape))
# infer
print("input_wav:" + str(self.input_wav.shape))
# print('infered_wav:'+str(infer_wav.shape))
infer_wav: torch.Tensor = self.resampler2(
self.rvc.infer(self.resampler1(torch.from_numpy(self.input_wav)))
)[-self.crossfade_frame - self.sola_search_frame - self.block_frame :].to(
device
)
print("infer_wav:" + str(infer_wav.shape))
# SOLA algorithm from https://github.com/yxlllc/DDSP-SVC
cor_nom=F.conv1d(infer_wav[None,None,:self.crossfade_frame + self.sola_search_frame],self.sola_buffer[None,None,:])
cor_den=torch.sqrt(F.conv1d(infer_wav[None,None,:self.crossfade_frame + self.sola_search_frame]**2,torch.ones(1, 1,self.crossfade_frame,device=device))+1e-8)
sola_offset = torch.argmax( cor_nom[0, 0] / cor_den[0, 0])
print('sola offset: ' + str(int(sola_offset)))
cor_nom = F.conv1d(
infer_wav[None, None, : self.crossfade_frame + self.sola_search_frame],
self.sola_buffer[None, None, :],
)
cor_den = torch.sqrt(
F.conv1d(
infer_wav[None, None, : self.crossfade_frame + self.sola_search_frame]
** 2,
torch.ones(1, 1, self.crossfade_frame, device=device),
)
+ 1e-8
)
sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0])
print("sola offset: " + str(int(sola_offset)))
# crossfade
self.output_wav[:]=infer_wav[sola_offset : sola_offset + self.block_frame]
self.output_wav[:self.crossfade_frame] *= self.fade_in_window
self.output_wav[:self.crossfade_frame] += self.sola_buffer[:]
self.output_wav[:] = infer_wav[sola_offset : sola_offset + self.block_frame]
self.output_wav[: self.crossfade_frame] *= self.fade_in_window
self.output_wav[: self.crossfade_frame] += self.sola_buffer[:]
if sola_offset < self.sola_search_frame:
self.sola_buffer[:] = infer_wav[-self.sola_search_frame - self.crossfade_frame + sola_offset: -self.sola_search_frame + sola_offset]* self.fade_out_window
self.sola_buffer[:] = (
infer_wav[
-self.sola_search_frame
- self.crossfade_frame
+ sola_offset : -self.sola_search_frame
+ sola_offset
]
* self.fade_out_window
)
else:
self.sola_buffer[:] = infer_wav[- self.crossfade_frame :]* self.fade_out_window
self.sola_buffer[:] = (
infer_wav[-self.crossfade_frame :] * self.fade_out_window
)
if self.config.O_noise_reduce:
outdata[:]=np.tile(nr.reduce_noise(y=self.output_wav[:].cpu().numpy(),sr=self.config.samplerate),(2,1)).T
outdata[:] = np.tile(
nr.reduce_noise(
y=self.output_wav[:].cpu().numpy(), sr=self.config.samplerate
),
(2, 1),
).T
else:
outdata[:]=self.output_wav[:].repeat(2, 1).t().cpu().numpy()
total_time=time.perf_counter()-start_time
print('infer time:'+str(total_time))
self.window['infer_time'].update(int(total_time*1000))
outdata[:] = self.output_wav[:].repeat(2, 1).t().cpu().numpy()
total_time = time.perf_counter() - start_time
print("infer time:" + str(total_time))
self.window["infer_time"].update(int(total_time * 1000))
def get_devices(self,update: bool = True):
'''获取设备列表'''
def get_devices(self, update: bool = True):
"""获取设备列表"""
if update:
sd._terminate()
sd._initialize()
@ -317,18 +508,33 @@ class GUI:
for d in devices
if d["max_output_channels"] > 0
]
input_devices_indices = [d["index"] for d in devices if d["max_input_channels"] > 0]
input_devices_indices = [
d["index"] for d in devices if d["max_input_channels"] > 0
]
output_devices_indices = [
d["index"] for d in devices if d["max_output_channels"] > 0
]
return input_devices, output_devices, input_devices_indices, output_devices_indices
return (
input_devices,
output_devices,
input_devices_indices,
output_devices_indices,
)
def set_devices(self,input_device,output_device):
'''设置输出设备'''
input_devices,output_devices,input_device_indices, output_device_indices=self.get_devices()
sd.default.device[0]=input_device_indices[input_devices.index(input_device)]
sd.default.device[1]=output_device_indices[output_devices.index(output_device)]
print("input device:"+str(sd.default.device[0])+":"+str(input_device))
print("output device:"+str(sd.default.device[1])+":"+str(output_device))
def set_devices(self, input_device, output_device):
"""设置输出设备"""
(
input_devices,
output_devices,
input_device_indices,
output_device_indices,
) = self.get_devices()
sd.default.device[0] = input_device_indices[input_devices.index(input_device)]
sd.default.device[1] = output_device_indices[
output_devices.index(output_device)
]
print("input device:" + str(sd.default.device[0]) + ":" + str(input_device))
print("output device:" + str(sd.default.device[1]) + ":" + str(output_device))
gui=GUI()
gui = GUI()

File diff suppressed because it is too large Load Diff

View File

@ -1,14 +1,19 @@
'''
"""
对源特征进行检索
'''
import torch, pdb, os,parselmouth
os.environ["CUDA_VISIBLE_DEVICES"]="0"
"""
import torch, pdb, os, parselmouth
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import numpy as np
import soundfile as sf
# from models import SynthesizerTrn256#hifigan_nonsf
# from infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256#hifigan_nsf
from infer_pack.models import SynthesizerTrnMs256NSFsid as SynthesizerTrn256#hifigan_nsf
from infer_pack.models import (
SynthesizerTrnMs256NSFsid as SynthesizerTrn256,
) # hifigan_nsf
# from infer_pack.models import SynthesizerTrnMs256NSFsid_sim as SynthesizerTrn256#hifigan_nsf
# from models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf
# from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf
@ -16,15 +21,17 @@ from infer_pack.models import SynthesizerTrnMs256NSFsid as SynthesizerTrn256#hif
from scipy.io import wavfile
from fairseq import checkpoint_utils
# import pyworld
import librosa
import torch.nn.functional as F
import scipy.signal as signal
# import torchcrepe
from time import time as ttime
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = r"E:\codes\py39\vits_vc_gpu_train\hubert_base.pt"#
model_path = r"E:\codes\py39\vits_vc_gpu_train\hubert_base.pt" #
print("load model(s) from {}".format(model_path))
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
[model_path],
@ -37,7 +44,26 @@ model.eval()
# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)#hifigan#512#256
# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],109,256,is_half=True)#hifigan#512#256
net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)#hifigan#512#256#no_dropout
net_g = SynthesizerTrn256(
1025,
32,
192,
192,
768,
2,
6,
3,
0,
"1",
[3, 7, 11],
[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
[10, 10, 2, 2],
512,
[16, 16, 4, 4],
183,
256,
is_half=True,
) # hifigan#512#256#no_dropout
# net_g = SynthesizerTrn256(1025,32,192,192,768,2,3,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],0)#ts3
# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2],512,[16,16,4],0)#hifigan-ps-sr
#
@ -48,51 +74,66 @@ net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0,"1", [3,7,11],[[1,3,5], [1
# weights=torch.load("infer/ft-mi-freeze-vocoder-flow-enc_q_1k.pt")
# weights=torch.load("infer/ft-mi-freeze-vocoder_true_1k.pt")
# weights=torch.load("infer/ft-mi-sim1k.pt")
weights=torch.load("infer/ft-mi-no_opt-no_dropout.pt")
print(net_g.load_state_dict(weights,strict=True))
weights = torch.load("infer/ft-mi-no_opt-no_dropout.pt")
print(net_g.load_state_dict(weights, strict=True))
net_g.eval().to(device)
net_g.half()
def get_f0(x, p_len,f0_up_key=0):
def get_f0(x, p_len, f0_up_key=0):
time_step = 160 / 16000 * 1000
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
time_step=time_step / 1000, voicing_threshold=0.6,
pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
f0 = (
parselmouth.Sound(x, 16000)
.to_pitch_ac(
time_step=time_step / 1000,
voicing_threshold=0.6,
pitch_floor=f0_min,
pitch_ceiling=f0_max,
)
.selected_array["frequency"]
)
pad_size=(p_len - len(f0) + 1) // 2
if(pad_size>0 or p_len - len(f0) - pad_size>0):
f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
f0 *= pow(2, f0_up_key / 12)
f0bak = f0.copy()
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
f0_mel_max - f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
# f0_mel[f0_mel > 188] = 188
f0_coarse = np.rint(f0_mel).astype(np.int)
return f0_coarse, f0bak
import faiss
index=faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index")
big_npy=np.load("infer/big_src_feature_mi.npy")
ta0=ta1=ta2=0
for idx,name in enumerate(["冬之花clip1.wav",]):##
wav_path = "todo-songs/%s" % name#
f0_up_key=-2#
index = faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index")
big_npy = np.load("infer/big_src_feature_mi.npy")
ta0 = ta1 = ta2 = 0
for idx, name in enumerate(
[
"冬之花clip1.wav",
]
): ##
wav_path = "todo-songs/%s" % name #
f0_up_key = -2 #
audio, sampling_rate = sf.read(wav_path)
if len(audio.shape) > 1:
audio = librosa.to_mono(audio.transpose(1, 0))
if sampling_rate != 16000:
audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
feats = torch.from_numpy(audio).float()
if feats.dim() == 2: # double channels
feats = feats.mean(-1)
@ -104,8 +145,9 @@ for idx,name in enumerate(["冬之花clip1.wav",]):##
"padding_mask": padding_mask.to(device),
"output_layer": 9, # layer 9
}
if torch.cuda.is_available(): torch.cuda.synchronize()
t0=ttime()
if torch.cuda.is_available():
torch.cuda.synchronize()
t0 = ttime()
with torch.no_grad():
logits = model.extract_features(**inputs)
feats = model.final_proj(logits[0])
@ -113,35 +155,45 @@ for idx,name in enumerate(["冬之花clip1.wav",]):##
####索引优化
npy = feats[0].cpu().numpy().astype("float32")
D, I = index.search(npy, 1)
feats = torch.from_numpy(big_npy[I.squeeze()].astype("float16")).unsqueeze(0).to(device)
feats = (
torch.from_numpy(big_npy[I.squeeze()].astype("float16")).unsqueeze(0).to(device)
)
feats=F.interpolate(feats.permute(0,2,1),scale_factor=2).permute(0,2,1)
if torch.cuda.is_available(): torch.cuda.synchronize()
t1=ttime()
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
if torch.cuda.is_available():
torch.cuda.synchronize()
t1 = ttime()
# p_len = min(feats.shape[1],10000,pitch.shape[0])#太大了爆显存
p_len = min(feats.shape[1],10000)#
pitch, pitchf = get_f0(audio, p_len,f0_up_key)
p_len = min(feats.shape[1],10000,pitch.shape[0])#太大了爆显存
if torch.cuda.is_available(): torch.cuda.synchronize()
t2=ttime()
feats = feats[:,:p_len, :]
p_len = min(feats.shape[1], 10000) #
pitch, pitchf = get_f0(audio, p_len, f0_up_key)
p_len = min(feats.shape[1], 10000, pitch.shape[0]) # 太大了爆显存
if torch.cuda.is_available():
torch.cuda.synchronize()
t2 = ttime()
feats = feats[:, :p_len, :]
pitch = pitch[:p_len]
pitchf = pitchf[:p_len]
p_len = torch.LongTensor([p_len]).to(device)
pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
sid=torch.LongTensor([0]).to(device)
sid = torch.LongTensor([0]).to(device)
pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device)
with torch.no_grad():
audio = net_g.infer(feats, p_len,pitch,pitchf,sid)[0][0, 0].data.cpu().float().numpy()#nsf
if torch.cuda.is_available(): torch.cuda.synchronize()
t3=ttime()
ta0+=(t1-t0)
ta1+=(t2-t1)
ta2+=(t3-t2)
audio = (
net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
.data.cpu()
.float()
.numpy()
) # nsf
if torch.cuda.is_available():
torch.cuda.synchronize()
t3 = ttime()
ta0 += t1 - t0
ta1 += t2 - t1
ta2 += t3 - t2
# wavfile.write("ft-mi_1k-index256-noD-%s.wav"%name, 40000, audio)##
# wavfile.write("ft-mi-freeze-vocoder-flow-enc_q_1k-%s.wav"%name, 40000, audio)##
# wavfile.write("ft-mi-sim1k-%s.wav"%name, 40000, audio)##
wavfile.write("ft-mi-no_opt-no_dropout-%s.wav"%name, 40000, audio)##
wavfile.write("ft-mi-no_opt-no_dropout-%s.wav" % name, 40000, audio) ##
print(ta0,ta1,ta2)#
print(ta0, ta1, ta2) #

View File

@ -1,31 +1,31 @@
'''
"""
格式直接cid为自带的index位aid放不下了通过字典来查反正就5w个
'''
import faiss,numpy as np,os
"""
import faiss, numpy as np, os
# ###########如果是原始特征要先写save
inp_root=r"E:\codes\py39\dataset\mi\2-co256"
npys=[]
inp_root = r"E:\codes\py39\dataset\mi\2-co256"
npys = []
for name in sorted(list(os.listdir(inp_root))):
phone=np.load("%s/%s"%(inp_root,name))
phone = np.load("%s/%s" % (inp_root, name))
npys.append(phone)
big_npy=np.concatenate(npys,0)
print(big_npy.shape)#(6196072, 192)#fp32#4.43G
np.save("infer/big_src_feature_mi.npy",big_npy)
big_npy = np.concatenate(npys, 0)
print(big_npy.shape) # (6196072, 192)#fp32#4.43G
np.save("infer/big_src_feature_mi.npy", big_npy)
##################train+add
# big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy")
print(big_npy.shape)
index = faiss.index_factory(256, "IVF512,Flat")#mi
index = faiss.index_factory(256, "IVF512,Flat") # mi
print("training")
index_ivf = faiss.extract_index_ivf(index)#
index_ivf = faiss.extract_index_ivf(index) #
index_ivf.nprobe = 9
index.train(big_npy)
faiss.write_index(index, 'infer/trained_IVF512_Flat_mi_baseline_src_feat.index')
faiss.write_index(index, "infer/trained_IVF512_Flat_mi_baseline_src_feat.index")
print("adding")
index.add(big_npy)
faiss.write_index(index,"infer/added_IVF512_Flat_mi_baseline_src_feat.index")
'''
faiss.write_index(index, "infer/added_IVF512_Flat_mi_baseline_src_feat.index")
"""
大小都是FP32
big_src_feature 2.95G
(3098036, 256)
@ -33,4 +33,4 @@ big_emb 4.43G
(6196072, 192)
big_emb双倍是因为求特征要repeat后再加pitch
'''
"""

View File

@ -1,11 +1,16 @@
import torch,pdb
import torch, pdb
# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-suc\G_1000.pth")["model"]#sim_nsf#
# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder-flow-enc_q\G_1000.pth")["model"]#sim_nsf#
# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder\G_1000.pth")["model"]#sim_nsf#
# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-test\G_1000.pth")["model"]#sim_nsf#
a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-no_opt-no_dropout\G_1000.pth")["model"]#sim_nsf#
for key in a.keys():a[key]=a[key].half()
a = torch.load(
r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-no_opt-no_dropout\G_1000.pth"
)[
"model"
] # sim_nsf#
for key in a.keys():
a[key] = a[key].half()
# torch.save(a,"ft-mi-freeze-vocoder_true_1k.pt")#
# torch.save(a,"ft-mi-sim1k.pt")#
torch.save(a,"ft-mi-no_opt-no_dropout.pt")#
torch.save(a, "ft-mi-no_opt-no_dropout.pt") #

View File

@ -48,8 +48,10 @@ def slice_segments(x, ids_str, segment_size=4):
idx_end = idx_str + segment_size
ret[i] = x[i, :, idx_str:idx_end]
return ret
def slice_segments2(x, ids_str, segment_size=4):
ret = torch.zeros_like(x[:, :segment_size])
ret = torch.zeros_like(x[:, :segment_size])
for i in range(x.size(0)):
idx_str = ids_str[i]
idx_end = idx_str + segment_size

View File

@ -1,4 +1,4 @@
import math,pdb,os
import math, pdb, os
from time import time as ttime
import torch
from torch import nn
@ -12,9 +12,20 @@ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from infer_pack.commons import init_weights
import numpy as np
from infer_pack import commons
class TextEncoder256(nn.Module):
def __init__(
self, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, f0=True ):
self,
out_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
f0=True,
):
super().__init__()
self.out_channels = out_channels
self.hidden_channels = hidden_channels
@ -24,8 +35,8 @@ class TextEncoder256(nn.Module):
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.emb_phone = nn.Linear(256, hidden_channels)
self.lrelu=nn.LeakyReLU(0.1,inplace=True)
if(f0==True):
self.lrelu = nn.LeakyReLU(0.1, inplace=True)
if f0 == True:
self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
self.encoder = attentions.Encoder(
hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
@ -33,12 +44,12 @@ class TextEncoder256(nn.Module):
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def forward(self, phone, pitch, lengths):
if(pitch==None):
if pitch == None:
x = self.emb_phone(phone)
else:
x = self.emb_phone(phone) + self.emb_pitch(pitch)
x = x * math.sqrt(self.hidden_channels) # [b, t, h]
x=self.lrelu(x)
x = self.lrelu(x)
x = torch.transpose(x, 1, -1) # [b, h, t]
x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
x.dtype
@ -48,8 +59,20 @@ class TextEncoder256(nn.Module):
m, logs = torch.split(stats, self.out_channels, dim=1)
return m, logs, x_mask
class TextEncoder256Sim(nn.Module):
def __init__( self, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, f0=True):
def __init__(
self,
out_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
f0=True,
):
super().__init__()
self.out_channels = out_channels
self.hidden_channels = hidden_channels
@ -59,8 +82,8 @@ class TextEncoder256Sim(nn.Module):
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.emb_phone = nn.Linear(256, hidden_channels)
self.lrelu=nn.LeakyReLU(0.1,inplace=True)
if(f0==True):
self.lrelu = nn.LeakyReLU(0.1, inplace=True)
if f0 == True:
self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
self.encoder = attentions.Encoder(
hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
@ -68,17 +91,21 @@ class TextEncoder256Sim(nn.Module):
self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
def forward(self, phone, pitch, lengths):
if(pitch==None):
if pitch == None:
x = self.emb_phone(phone)
else:
x = self.emb_phone(phone) + self.emb_pitch(pitch)
x = x * math.sqrt(self.hidden_channels) # [b, t, h]
x=self.lrelu(x)
x = self.lrelu(x)
x = torch.transpose(x, 1, -1) # [b, h, t]
x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(x.dtype)
x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
x.dtype
)
x = self.encoder(x * x_mask, x_mask)
x = self.proj(x) * x_mask
return x,x_mask
return x, x_mask
class ResidualCouplingBlock(nn.Module):
def __init__(
self,
@ -126,6 +153,8 @@ class ResidualCouplingBlock(nn.Module):
def remove_weight_norm(self):
for i in range(self.n_flows):
self.flows[i * 2].remove_weight_norm()
class PosteriorEncoder(nn.Module):
def __init__(
self,
@ -169,6 +198,8 @@ class PosteriorEncoder(nn.Module):
def remove_weight_norm(self):
self.enc.remove_weight_norm()
class Generator(torch.nn.Module):
def __init__(
self,
@ -243,8 +274,10 @@ class Generator(torch.nn.Module):
remove_weight_norm(l)
for l in self.resblocks:
l.remove_weight_norm()
class SineGen(torch.nn.Module):
""" Definition of sine generator
"""Definition of sine generator
SineGen(samp_rate, harmonic_num = 0,
sine_amp = 0.1, noise_std = 0.003,
voiced_threshold = 0,
@ -259,10 +292,15 @@ class SineGen(torch.nn.Module):
segment is always sin(np.pi) or cos(0)
"""
def __init__(self, samp_rate, harmonic_num=0,
sine_amp=0.1, noise_std=0.003,
voiced_threshold=0,
flag_for_pulse=False):
def __init__(
self,
samp_rate,
harmonic_num=0,
sine_amp=0.1,
noise_std=0.003,
voiced_threshold=0,
flag_for_pulse=False,
):
super(SineGen, self).__init__()
self.sine_amp = sine_amp
self.noise_std = noise_std
@ -277,8 +315,8 @@ class SineGen(torch.nn.Module):
uv = uv * (f0 > self.voiced_threshold)
return uv
def forward(self, f0,upp):
""" sine_tensor, uv = forward(f0)
def forward(self, f0, upp):
"""sine_tensor, uv = forward(f0)
input F0: tensor(batchsize=1, length, dim=1)
f0 for unvoiced steps should be 0
output sine_tensor: tensor(batchsize=1, length, dim)
@ -286,32 +324,52 @@ class SineGen(torch.nn.Module):
"""
with torch.no_grad():
f0 = f0[:, None].transpose(1, 2)
f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,device=f0.device)
f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
# fundamental component
f0_buf[:, :, 0] = f0[:, :, 0]
for idx in np.arange(self.harmonic_num):f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)# idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
rad_values = (f0_buf / self.sampling_rate) % 1###%1意味着n_har的乘积无法后处理优化
rand_ini = torch.rand(f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device)
for idx in np.arange(self.harmonic_num):
f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
idx + 2
) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化
rand_ini = torch.rand(
f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
)
rand_ini[:, 0] = 0
rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
tmp_over_one = torch.cumsum(rad_values, 1)# % 1 #####%1意味着后面的cumsum无法再优化
tmp_over_one*=upp
tmp_over_one=F.interpolate(tmp_over_one.transpose(2, 1), scale_factor=upp, mode='linear', align_corners=True).transpose(2, 1)
rad_values=F.interpolate(rad_values.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)#######
tmp_over_one%=1
tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化
tmp_over_one *= upp
tmp_over_one = F.interpolate(
tmp_over_one.transpose(2, 1),
scale_factor=upp,
mode="linear",
align_corners=True,
).transpose(2, 1)
rad_values = F.interpolate(
rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
).transpose(
2, 1
) #######
tmp_over_one %= 1
tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
cumsum_shift = torch.zeros_like(rad_values)
cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
sine_waves = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi)
sine_waves = torch.sin(
torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
)
sine_waves = sine_waves * self.sine_amp
uv = self._f02uv(f0)
uv = F.interpolate(uv.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)
uv = F.interpolate(
uv.transpose(2, 1), scale_factor=upp, mode="nearest"
).transpose(2, 1)
noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
noise = noise_amp * torch.randn_like(sine_waves)
sine_waves = sine_waves * uv + noise
return sine_waves, uv, noise
class SourceModuleHnNSF(torch.nn.Module):
""" SourceModule for hn-nsf
"""SourceModule for hn-nsf
SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
add_noise_std=0.003, voiced_threshod=0)
sampling_rate: sampling_rate in Hz
@ -328,26 +386,37 @@ class SourceModuleHnNSF(torch.nn.Module):
uv (batchsize, length, 1)
"""
def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
add_noise_std=0.003, voiced_threshod=0,is_half=True):
def __init__(
self,
sampling_rate,
harmonic_num=0,
sine_amp=0.1,
add_noise_std=0.003,
voiced_threshod=0,
is_half=True,
):
super(SourceModuleHnNSF, self).__init__()
self.sine_amp = sine_amp
self.noise_std = add_noise_std
self.is_half=is_half
self.is_half = is_half
# to produce sine waveforms
self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
sine_amp, add_noise_std, voiced_threshod)
self.l_sin_gen = SineGen(
sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
)
# to merge source harmonics into a single excitation
self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
self.l_tanh = torch.nn.Tanh()
def forward(self, x,upp=None):
sine_wavs, uv, _ = self.l_sin_gen(x,upp)
if(self.is_half):sine_wavs=sine_wavs.half()
def forward(self, x, upp=None):
sine_wavs, uv, _ = self.l_sin_gen(x, upp)
if self.is_half:
sine_wavs = sine_wavs.half()
sine_merge = self.l_tanh(self.l_linear(sine_wavs))
return sine_merge,None,None# noise, uv
return sine_merge, None, None # noise, uv
class GeneratorNSF(torch.nn.Module):
def __init__(
self,
@ -360,7 +429,7 @@ class GeneratorNSF(torch.nn.Module):
upsample_kernel_sizes,
gin_channels,
sr,
is_half=False
is_half=False,
):
super(GeneratorNSF, self).__init__()
self.num_kernels = len(resblock_kernel_sizes)
@ -368,9 +437,7 @@ class GeneratorNSF(torch.nn.Module):
self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
self.m_source = SourceModuleHnNSF(
sampling_rate=sr,
harmonic_num=0,
is_half=is_half
sampling_rate=sr, harmonic_num=0, is_half=is_half
)
self.noise_convs = nn.ModuleList()
self.conv_pre = Conv1d(
@ -393,9 +460,16 @@ class GeneratorNSF(torch.nn.Module):
)
)
if i + 1 < len(upsample_rates):
stride_f0 = np.prod(upsample_rates[i + 1:])
self.noise_convs.append(Conv1d(
1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2))
stride_f0 = np.prod(upsample_rates[i + 1 :])
self.noise_convs.append(
Conv1d(
1,
c_cur,
kernel_size=stride_f0 * 2,
stride=stride_f0,
padding=stride_f0 // 2,
)
)
else:
self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
@ -413,10 +487,10 @@ class GeneratorNSF(torch.nn.Module):
if gin_channels != 0:
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
self.upp=np.prod(upsample_rates)
self.upp = np.prod(upsample_rates)
def forward(self, x, f0,g=None):
har_source, noi_source, uv = self.m_source(f0,self.upp)
def forward(self, x, f0, g=None):
har_source, noi_source, uv = self.m_source(f0, self.upp)
har_source = har_source.transpose(1, 2)
x = self.conv_pre(x)
if g is not None:
@ -444,11 +518,15 @@ class GeneratorNSF(torch.nn.Module):
remove_weight_norm(l)
for l in self.resblocks:
l.remove_weight_norm()
sr2sr={
"32k":32000,
"40k":40000,
"48k":48000,
sr2sr = {
"32k": 32000,
"40k": 40000,
"48k": 48000,
}
class SynthesizerTrnMs256NSFsid(nn.Module):
def __init__(
self,
@ -472,10 +550,9 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
sr,
**kwargs
):
super().__init__()
if(type(sr)==type("strr")):
sr=sr2sr[sr]
if type(sr) == type("strr"):
sr = sr2sr[sr]
self.spec_channels = spec_channels
self.inter_channels = inter_channels
self.hidden_channels = hidden_channels
@ -493,7 +570,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
self.segment_size = segment_size
self.gin_channels = gin_channels
# self.hop_length = hop_length#
self.spk_embed_dim=spk_embed_dim
self.spk_embed_dim = spk_embed_dim
self.enc_p = TextEncoder256(
inter_channels,
hidden_channels,
@ -511,7 +588,9 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
gin_channels=gin_channels, sr=sr, is_half=kwargs["is_half"]
gin_channels=gin_channels,
sr=sr,
is_half=kwargs["is_half"],
)
self.enc_q = PosteriorEncoder(
spec_channels,
@ -526,13 +605,16 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
)
self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
print("gin_channels:",gin_channels,"self.spk_embed_dim:",self.spk_embed_dim)
print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
def remove_weight_norm(self):
self.dec.remove_weight_norm()
self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm()
def forward(self, phone, phone_lengths, pitch,pitchf, y, y_lengths,ds):#这里ds是id[bs,1]
def forward(
self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
): # 这里ds是id[bs,1]
# print(1,pitch.shape)#[bs,t]
g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t广播的
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
@ -542,20 +624,20 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
z, y_lengths, self.segment_size
)
# print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
pitchf = commons.slice_segments2(
pitchf, ids_slice, self.segment_size
)
pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
# print(-2,pitchf.shape,z_slice.shape)
o = self.dec(z_slice,pitchf, g=g)
o = self.dec(z_slice, pitchf, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
def infer(self, phone, phone_lengths, pitch, nsff0,sid,max_len=None):
def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec((z * x_mask)[:, :, :max_len], nsff0,g=g)
o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
return o, x_mask, (z, z_p, m_p, logs_p)
class SynthesizerTrnMs256NSFsid_nono(nn.Module):
def __init__(
self,
@ -579,7 +661,6 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
sr=None,
**kwargs
):
super().__init__()
self.spec_channels = spec_channels
self.inter_channels = inter_channels
@ -598,7 +679,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
self.segment_size = segment_size
self.gin_channels = gin_channels
# self.hop_length = hop_length#
self.spk_embed_dim=spk_embed_dim
self.spk_embed_dim = spk_embed_dim
self.enc_p = TextEncoder256(
inter_channels,
hidden_channels,
@ -606,7 +687,8 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
n_heads,
n_layers,
kernel_size,
p_dropout,f0=False
p_dropout,
f0=False,
)
self.dec = Generator(
inter_channels,
@ -616,7 +698,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
gin_channels=gin_channels
gin_channels=gin_channels,
)
self.enc_q = PosteriorEncoder(
spec_channels,
@ -631,14 +713,14 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
)
self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
print("gin_channels:",gin_channels,"self.spk_embed_dim:",self.spk_embed_dim)
print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
def remove_weight_norm(self):
self.dec.remove_weight_norm()
self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm()
def forward(self, phone, phone_lengths, y, y_lengths,ds):#这里ds是id[bs,1]
def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id[bs,1]
g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t广播的
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
@ -649,13 +731,15 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
o = self.dec(z_slice, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
def infer(self, phone, phone_lengths,sid,max_len=None):
def infer(self, phone, phone_lengths, sid, max_len=None):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec((z * x_mask)[:, :, :max_len],g=g)
o = self.dec((z * x_mask)[:, :, :max_len], g=g)
return o, x_mask, (z, z_p, m_p, logs_p)
class SynthesizerTrnMs256NSFsid_sim(nn.Module):
"""
Synthesizer for Training
@ -684,7 +768,6 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
use_sdp=True,
**kwargs
):
super().__init__()
self.spec_channels = spec_channels
self.inter_channels = inter_channels
@ -703,7 +786,7 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
self.segment_size = segment_size
self.gin_channels = gin_channels
# self.hop_length = hop_length#
self.spk_embed_dim=spk_embed_dim
self.spk_embed_dim = spk_embed_dim
self.enc_p = TextEncoder256Sim(
inter_channels,
hidden_channels,
@ -721,20 +804,24 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
gin_channels=gin_channels,is_half=kwargs["is_half"]
gin_channels=gin_channels,
is_half=kwargs["is_half"],
)
self.flow = ResidualCouplingBlock(
inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
)
self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
print("gin_channels:",gin_channels,"self.spk_embed_dim:",self.spk_embed_dim)
print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
def remove_weight_norm(self):
self.dec.remove_weight_norm()
self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm()
def forward(self, phone, phone_lengths, pitch, pitchf, y_lengths,ds): # y是spec不需要了现在
def forward(
self, phone, phone_lengths, pitch, pitchf, y_lengths, ds
): # y是spec不需要了现在
g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t广播的
x, x_mask = self.enc_p(phone, pitch, phone_lengths)
x = self.flow(x, x_mask, g=g, reverse=True)
@ -742,22 +829,24 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
x, y_lengths, self.segment_size
)
pitchf = commons.slice_segments2(
pitchf, ids_slice, self.segment_size
)
pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
o = self.dec(z_slice, pitchf, g=g)
return o, ids_slice
def infer(self, phone, phone_lengths, pitch, pitchf, ds,max_len=None): # y是spec不需要了现在
def infer(
self, phone, phone_lengths, pitch, pitchf, ds, max_len=None
): # y是spec不需要了现在
g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t广播的
x, x_mask = self.enc_p(phone, pitch, phone_lengths)
x = self.flow(x, x_mask, g=g, reverse=True)
o = self.dec((x*x_mask)[:, :, :max_len], pitchf, g=g)
o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g)
return o, o
class MultiPeriodDiscriminator(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(MultiPeriodDiscriminator, self).__init__()
periods = [2, 3, 5, 7, 11,17]
periods = [2, 3, 5, 7, 11, 17]
# periods = [3, 5, 7, 11, 17, 23, 37]
discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
@ -767,7 +856,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
self.discriminators = nn.ModuleList(discs)
def forward(self, y, y_hat):
y_d_rs = []#
y_d_rs = [] #
y_d_gs = []
fmap_rs = []
fmap_gs = []
@ -783,6 +872,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class DiscriminatorS(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(DiscriminatorS, self).__init__()
@ -812,6 +902,7 @@ class DiscriminatorS(torch.nn.Module):
return x, fmap
class DiscriminatorP(torch.nn.Module):
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
super(DiscriminatorP, self).__init__()
@ -889,4 +980,3 @@ class DiscriminatorP(torch.nn.Module):
x = torch.flatten(x, 1, -1)
return x, fmap

View File

@ -1,4 +1,4 @@
import math,pdb,os
import math, pdb, os
from time import time as ttime
import torch
from torch import nn
@ -12,9 +12,20 @@ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from infer_pack.commons import init_weights
import numpy as np
from infer_pack import commons
class TextEncoder256(nn.Module):
def __init__(
self, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, f0=True ):
self,
out_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
f0=True,
):
super().__init__()
self.out_channels = out_channels
self.hidden_channels = hidden_channels
@ -24,8 +35,8 @@ class TextEncoder256(nn.Module):
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.emb_phone = nn.Linear(256, hidden_channels)
self.lrelu=nn.LeakyReLU(0.1,inplace=True)
if(f0==True):
self.lrelu = nn.LeakyReLU(0.1, inplace=True)
if f0 == True:
self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
self.encoder = attentions.Encoder(
hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
@ -33,12 +44,12 @@ class TextEncoder256(nn.Module):
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def forward(self, phone, pitch, lengths):
if(pitch==None):
if pitch == None:
x = self.emb_phone(phone)
else:
x = self.emb_phone(phone) + self.emb_pitch(pitch)
x = x * math.sqrt(self.hidden_channels) # [b, t, h]
x=self.lrelu(x)
x = self.lrelu(x)
x = torch.transpose(x, 1, -1) # [b, h, t]
x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
x.dtype
@ -48,8 +59,20 @@ class TextEncoder256(nn.Module):
m, logs = torch.split(stats, self.out_channels, dim=1)
return m, logs, x_mask
class TextEncoder256Sim(nn.Module):
def __init__( self, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, f0=True):
def __init__(
self,
out_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
f0=True,
):
super().__init__()
self.out_channels = out_channels
self.hidden_channels = hidden_channels
@ -59,8 +82,8 @@ class TextEncoder256Sim(nn.Module):
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.emb_phone = nn.Linear(256, hidden_channels)
self.lrelu=nn.LeakyReLU(0.1,inplace=True)
if(f0==True):
self.lrelu = nn.LeakyReLU(0.1, inplace=True)
if f0 == True:
self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
self.encoder = attentions.Encoder(
hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
@ -68,17 +91,21 @@ class TextEncoder256Sim(nn.Module):
self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
def forward(self, phone, pitch, lengths):
if(pitch==None):
if pitch == None:
x = self.emb_phone(phone)
else:
x = self.emb_phone(phone) + self.emb_pitch(pitch)
x = x * math.sqrt(self.hidden_channels) # [b, t, h]
x=self.lrelu(x)
x = self.lrelu(x)
x = torch.transpose(x, 1, -1) # [b, h, t]
x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(x.dtype)
x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
x.dtype
)
x = self.encoder(x * x_mask, x_mask)
x = self.proj(x) * x_mask
return x,x_mask
return x, x_mask
class ResidualCouplingBlock(nn.Module):
def __init__(
self,
@ -126,6 +153,8 @@ class ResidualCouplingBlock(nn.Module):
def remove_weight_norm(self):
for i in range(self.n_flows):
self.flows[i * 2].remove_weight_norm()
class PosteriorEncoder(nn.Module):
def __init__(
self,
@ -169,6 +198,8 @@ class PosteriorEncoder(nn.Module):
def remove_weight_norm(self):
self.enc.remove_weight_norm()
class Generator(torch.nn.Module):
def __init__(
self,
@ -243,8 +274,10 @@ class Generator(torch.nn.Module):
remove_weight_norm(l)
for l in self.resblocks:
l.remove_weight_norm()
class SineGen(torch.nn.Module):
""" Definition of sine generator
"""Definition of sine generator
SineGen(samp_rate, harmonic_num = 0,
sine_amp = 0.1, noise_std = 0.003,
voiced_threshold = 0,
@ -259,10 +292,15 @@ class SineGen(torch.nn.Module):
segment is always sin(np.pi) or cos(0)
"""
def __init__(self, samp_rate, harmonic_num=0,
sine_amp=0.1, noise_std=0.003,
voiced_threshold=0,
flag_for_pulse=False):
def __init__(
self,
samp_rate,
harmonic_num=0,
sine_amp=0.1,
noise_std=0.003,
voiced_threshold=0,
flag_for_pulse=False,
):
super(SineGen, self).__init__()
self.sine_amp = sine_amp
self.noise_std = noise_std
@ -277,8 +315,8 @@ class SineGen(torch.nn.Module):
uv = uv * (f0 > self.voiced_threshold)
return uv
def forward(self, f0,upp):
""" sine_tensor, uv = forward(f0)
def forward(self, f0, upp):
"""sine_tensor, uv = forward(f0)
input F0: tensor(batchsize=1, length, dim=1)
f0 for unvoiced steps should be 0
output sine_tensor: tensor(batchsize=1, length, dim)
@ -286,32 +324,52 @@ class SineGen(torch.nn.Module):
"""
with torch.no_grad():
f0 = f0[:, None].transpose(1, 2)
f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,device=f0.device)
f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
# fundamental component
f0_buf[:, :, 0] = f0[:, :, 0]
for idx in np.arange(self.harmonic_num):f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)# idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
rad_values = (f0_buf / self.sampling_rate) % 1###%1意味着n_har的乘积无法后处理优化
rand_ini = torch.rand(f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device)
for idx in np.arange(self.harmonic_num):
f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
idx + 2
) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化
rand_ini = torch.rand(
f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
)
rand_ini[:, 0] = 0
rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
tmp_over_one = torch.cumsum(rad_values, 1)# % 1 #####%1意味着后面的cumsum无法再优化
tmp_over_one*=upp
tmp_over_one=F.interpolate(tmp_over_one.transpose(2, 1), scale_factor=upp, mode='linear', align_corners=True).transpose(2, 1)
rad_values=F.interpolate(rad_values.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)#######
tmp_over_one%=1
tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化
tmp_over_one *= upp
tmp_over_one = F.interpolate(
tmp_over_one.transpose(2, 1),
scale_factor=upp,
mode="linear",
align_corners=True,
).transpose(2, 1)
rad_values = F.interpolate(
rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
).transpose(
2, 1
) #######
tmp_over_one %= 1
tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
cumsum_shift = torch.zeros_like(rad_values)
cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
sine_waves = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi)
sine_waves = torch.sin(
torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
)
sine_waves = sine_waves * self.sine_amp
uv = self._f02uv(f0)
uv = F.interpolate(uv.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)
uv = F.interpolate(
uv.transpose(2, 1), scale_factor=upp, mode="nearest"
).transpose(2, 1)
noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
noise = noise_amp * torch.randn_like(sine_waves)
sine_waves = sine_waves * uv + noise
return sine_waves, uv, noise
class SourceModuleHnNSF(torch.nn.Module):
""" SourceModule for hn-nsf
"""SourceModule for hn-nsf
SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
add_noise_std=0.003, voiced_threshod=0)
sampling_rate: sampling_rate in Hz
@ -328,26 +386,37 @@ class SourceModuleHnNSF(torch.nn.Module):
uv (batchsize, length, 1)
"""
def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
add_noise_std=0.003, voiced_threshod=0,is_half=True):
def __init__(
self,
sampling_rate,
harmonic_num=0,
sine_amp=0.1,
add_noise_std=0.003,
voiced_threshod=0,
is_half=True,
):
super(SourceModuleHnNSF, self).__init__()
self.sine_amp = sine_amp
self.noise_std = add_noise_std
self.is_half=is_half
self.is_half = is_half
# to produce sine waveforms
self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
sine_amp, add_noise_std, voiced_threshod)
self.l_sin_gen = SineGen(
sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
)
# to merge source harmonics into a single excitation
self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
self.l_tanh = torch.nn.Tanh()
def forward(self, x,upp=None):
sine_wavs, uv, _ = self.l_sin_gen(x,upp)
if(self.is_half):sine_wavs=sine_wavs.half()
def forward(self, x, upp=None):
sine_wavs, uv, _ = self.l_sin_gen(x, upp)
if self.is_half:
sine_wavs = sine_wavs.half()
sine_merge = self.l_tanh(self.l_linear(sine_wavs))
return sine_merge,None,None# noise, uv
return sine_merge, None, None # noise, uv
class GeneratorNSF(torch.nn.Module):
def __init__(
self,
@ -360,7 +429,7 @@ class GeneratorNSF(torch.nn.Module):
upsample_kernel_sizes,
gin_channels,
sr,
is_half=False
is_half=False,
):
super(GeneratorNSF, self).__init__()
self.num_kernels = len(resblock_kernel_sizes)
@ -368,9 +437,7 @@ class GeneratorNSF(torch.nn.Module):
self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
self.m_source = SourceModuleHnNSF(
sampling_rate=sr,
harmonic_num=0,
is_half=is_half
sampling_rate=sr, harmonic_num=0, is_half=is_half
)
self.noise_convs = nn.ModuleList()
self.conv_pre = Conv1d(
@ -393,9 +460,16 @@ class GeneratorNSF(torch.nn.Module):
)
)
if i + 1 < len(upsample_rates):
stride_f0 = np.prod(upsample_rates[i + 1:])
self.noise_convs.append(Conv1d(
1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2))
stride_f0 = np.prod(upsample_rates[i + 1 :])
self.noise_convs.append(
Conv1d(
1,
c_cur,
kernel_size=stride_f0 * 2,
stride=stride_f0,
padding=stride_f0 // 2,
)
)
else:
self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
@ -413,10 +487,10 @@ class GeneratorNSF(torch.nn.Module):
if gin_channels != 0:
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
self.upp=np.prod(upsample_rates)
self.upp = np.prod(upsample_rates)
def forward(self, x, f0,g=None):
har_source, noi_source, uv = self.m_source(f0,self.upp)
def forward(self, x, f0, g=None):
har_source, noi_source, uv = self.m_source(f0, self.upp)
har_source = har_source.transpose(1, 2)
x = self.conv_pre(x)
if g is not None:
@ -444,11 +518,15 @@ class GeneratorNSF(torch.nn.Module):
remove_weight_norm(l)
for l in self.resblocks:
l.remove_weight_norm()
sr2sr={
"32k":32000,
"40k":40000,
"48k":48000,
sr2sr = {
"32k": 32000,
"40k": 40000,
"48k": 48000,
}
class SynthesizerTrnMs256NSFsid(nn.Module):
def __init__(
self,
@ -472,10 +550,9 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
sr,
**kwargs
):
super().__init__()
if(type(sr)==type("strr")):
sr=sr2sr[sr]
if type(sr) == type("strr"):
sr = sr2sr[sr]
self.spec_channels = spec_channels
self.inter_channels = inter_channels
self.hidden_channels = hidden_channels
@ -493,7 +570,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
self.segment_size = segment_size
self.gin_channels = gin_channels
# self.hop_length = hop_length#
self.spk_embed_dim=spk_embed_dim
self.spk_embed_dim = spk_embed_dim
self.enc_p = TextEncoder256(
inter_channels,
hidden_channels,
@ -511,7 +588,9 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
gin_channels=gin_channels, sr=sr, is_half=kwargs["is_half"]
gin_channels=gin_channels,
sr=sr,
is_half=kwargs["is_half"],
)
self.enc_q = PosteriorEncoder(
spec_channels,
@ -526,21 +605,22 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
)
self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
print("gin_channels:",gin_channels,"self.spk_embed_dim:",self.spk_embed_dim)
print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
def remove_weight_norm(self):
self.dec.remove_weight_norm()
self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm()
def forward(self, phone, phone_lengths, pitch, nsff0 ,sid, rnd, max_len=None):
def forward(self, phone, phone_lengths, pitch, nsff0, sid, rnd, max_len=None):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec((z * x_mask)[:, :, :max_len], nsff0,g=g)
o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
return o
class SynthesizerTrnMs256NSFsid_sim(nn.Module):
"""
Synthesizer for Training
@ -569,7 +649,6 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
use_sdp=True,
**kwargs
):
super().__init__()
self.spec_channels = spec_channels
self.inter_channels = inter_channels
@ -588,7 +667,7 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
self.segment_size = segment_size
self.gin_channels = gin_channels
# self.hop_length = hop_length#
self.spk_embed_dim=spk_embed_dim
self.spk_embed_dim = spk_embed_dim
self.enc_p = TextEncoder256Sim(
inter_channels,
hidden_channels,
@ -606,30 +685,35 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
gin_channels=gin_channels,is_half=kwargs["is_half"]
gin_channels=gin_channels,
is_half=kwargs["is_half"],
)
self.flow = ResidualCouplingBlock(
inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
)
self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
print("gin_channels:",gin_channels,"self.spk_embed_dim:",self.spk_embed_dim)
print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
def remove_weight_norm(self):
self.dec.remove_weight_norm()
self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm()
def forward(self, phone, phone_lengths, pitch, pitchf, ds,max_len=None): # y是spec不需要了现在
def forward(
self, phone, phone_lengths, pitch, pitchf, ds, max_len=None
): # y是spec不需要了现在
g = self.emb_g(ds.unsqueeze(0)).unsqueeze(-1) # [b, 256, 1]##1是t广播的
x, x_mask = self.enc_p(phone, pitch, phone_lengths)
x = self.flow(x, x_mask, g=g, reverse=True)
o = self.dec((x*x_mask)[:, :, :max_len], pitchf, g=g)
o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g)
return o
class MultiPeriodDiscriminator(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(MultiPeriodDiscriminator, self).__init__()
periods = [2, 3, 5, 7, 11,17]
periods = [2, 3, 5, 7, 11, 17]
# periods = [3, 5, 7, 11, 17, 23, 37]
discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
@ -639,7 +723,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
self.discriminators = nn.ModuleList(discs)
def forward(self, y, y_hat):
y_d_rs = []#
y_d_rs = [] #
y_d_gs = []
fmap_rs = []
fmap_gs = []
@ -655,6 +739,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class DiscriminatorS(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(DiscriminatorS, self).__init__()
@ -684,6 +769,7 @@ class DiscriminatorS(torch.nn.Module):
return x, fmap
class DiscriminatorP(torch.nn.Module):
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
super(DiscriminatorP, self).__init__()
@ -761,4 +847,3 @@ class DiscriminatorP(torch.nn.Module):
x = torch.flatten(x, 1, -1)
return x, fmap

View File

@ -9,66 +9,63 @@ DEFAULT_MIN_BIN_HEIGHT = 1e-3
DEFAULT_MIN_DERIVATIVE = 1e-3
def piecewise_rational_quadratic_transform(inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
tails=None,
tail_bound=1.,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE):
def piecewise_rational_quadratic_transform(
inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
tails=None,
tail_bound=1.0,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE,
):
if tails is None:
spline_fn = rational_quadratic_spline
spline_kwargs = {}
else:
spline_fn = unconstrained_rational_quadratic_spline
spline_kwargs = {
'tails': tails,
'tail_bound': tail_bound
}
spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
outputs, logabsdet = spline_fn(
inputs=inputs,
unnormalized_widths=unnormalized_widths,
unnormalized_heights=unnormalized_heights,
unnormalized_derivatives=unnormalized_derivatives,
inverse=inverse,
min_bin_width=min_bin_width,
min_bin_height=min_bin_height,
min_derivative=min_derivative,
**spline_kwargs
inputs=inputs,
unnormalized_widths=unnormalized_widths,
unnormalized_heights=unnormalized_heights,
unnormalized_derivatives=unnormalized_derivatives,
inverse=inverse,
min_bin_width=min_bin_width,
min_bin_height=min_bin_height,
min_derivative=min_derivative,
**spline_kwargs
)
return outputs, logabsdet
def searchsorted(bin_locations, inputs, eps=1e-6):
bin_locations[..., -1] += eps
return torch.sum(
inputs[..., None] >= bin_locations,
dim=-1
) - 1
return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
def unconstrained_rational_quadratic_spline(inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
tails='linear',
tail_bound=1.,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE):
def unconstrained_rational_quadratic_spline(
inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
tails="linear",
tail_bound=1.0,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE,
):
inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
outside_interval_mask = ~inside_interval_mask
outputs = torch.zeros_like(inputs)
logabsdet = torch.zeros_like(inputs)
if tails == 'linear':
if tails == "linear":
unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
constant = np.log(np.exp(1 - min_derivative) - 1)
unnormalized_derivatives[..., 0] = constant
@ -77,45 +74,57 @@ def unconstrained_rational_quadratic_spline(inputs,
outputs[outside_interval_mask] = inputs[outside_interval_mask]
logabsdet[outside_interval_mask] = 0
else:
raise RuntimeError('{} tails are not implemented.'.format(tails))
raise RuntimeError("{} tails are not implemented.".format(tails))
outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
(
outputs[inside_interval_mask],
logabsdet[inside_interval_mask],
) = rational_quadratic_spline(
inputs=inputs[inside_interval_mask],
unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
inverse=inverse,
left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
left=-tail_bound,
right=tail_bound,
bottom=-tail_bound,
top=tail_bound,
min_bin_width=min_bin_width,
min_bin_height=min_bin_height,
min_derivative=min_derivative
min_derivative=min_derivative,
)
return outputs, logabsdet
def rational_quadratic_spline(inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
left=0., right=1., bottom=0., top=1.,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE):
def rational_quadratic_spline(
inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
left=0.0,
right=1.0,
bottom=0.0,
top=1.0,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE,
):
if torch.min(inputs) < left or torch.max(inputs) > right:
raise ValueError('Input to a transform is not within its domain')
raise ValueError("Input to a transform is not within its domain")
num_bins = unnormalized_widths.shape[-1]
if min_bin_width * num_bins > 1.0:
raise ValueError('Minimal bin width too large for the number of bins')
raise ValueError("Minimal bin width too large for the number of bins")
if min_bin_height * num_bins > 1.0:
raise ValueError('Minimal bin height too large for the number of bins')
raise ValueError("Minimal bin height too large for the number of bins")
widths = F.softmax(unnormalized_widths, dim=-1)
widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
cumwidths = torch.cumsum(widths, dim=-1)
cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0)
cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
cumwidths = (right - left) * cumwidths + left
cumwidths[..., 0] = left
cumwidths[..., -1] = right
@ -126,7 +135,7 @@ def rational_quadratic_spline(inputs,
heights = F.softmax(unnormalized_heights, dim=-1)
heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
cumheights = torch.cumsum(heights, dim=-1)
cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0)
cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
cumheights = (top - bottom) * cumheights + bottom
cumheights[..., 0] = bottom
cumheights[..., -1] = top
@ -150,15 +159,13 @@ def rational_quadratic_spline(inputs,
input_heights = heights.gather(-1, bin_idx)[..., 0]
if inverse:
a = (((inputs - input_cumheights) * (input_derivatives
+ input_derivatives_plus_one
- 2 * input_delta)
+ input_heights * (input_delta - input_derivatives)))
b = (input_heights * input_derivatives
- (inputs - input_cumheights) * (input_derivatives
+ input_derivatives_plus_one
- 2 * input_delta))
c = - input_delta * (inputs - input_cumheights)
a = (inputs - input_cumheights) * (
input_derivatives + input_derivatives_plus_one - 2 * input_delta
) + input_heights * (input_delta - input_derivatives)
b = input_heights * input_derivatives - (inputs - input_cumheights) * (
input_derivatives + input_derivatives_plus_one - 2 * input_delta
)
c = -input_delta * (inputs - input_cumheights)
discriminant = b.pow(2) - 4 * a * c
assert (discriminant >= 0).all()
@ -167,11 +174,15 @@ def rational_quadratic_spline(inputs,
outputs = root * input_bin_widths + input_cumwidths
theta_one_minus_theta = root * (1 - root)
denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
* theta_one_minus_theta)
derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2)
+ 2 * input_delta * theta_one_minus_theta
+ input_derivatives * (1 - root).pow(2))
denominator = input_delta + (
(input_derivatives + input_derivatives_plus_one - 2 * input_delta)
* theta_one_minus_theta
)
derivative_numerator = input_delta.pow(2) * (
input_derivatives_plus_one * root.pow(2)
+ 2 * input_delta * theta_one_minus_theta
+ input_derivatives * (1 - root).pow(2)
)
logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
return outputs, -logabsdet
@ -179,15 +190,20 @@ def rational_quadratic_spline(inputs,
theta = (inputs - input_cumwidths) / input_bin_widths
theta_one_minus_theta = theta * (1 - theta)
numerator = input_heights * (input_delta * theta.pow(2)
+ input_derivatives * theta_one_minus_theta)
denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
* theta_one_minus_theta)
numerator = input_heights * (
input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
)
denominator = input_delta + (
(input_derivatives + input_derivatives_plus_one - 2 * input_delta)
* theta_one_minus_theta
)
outputs = input_cumheights + numerator / denominator
derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2)
+ 2 * input_delta * theta_one_minus_theta
+ input_derivatives * (1 - theta).pow(2))
derivative_numerator = input_delta.pow(2) * (
input_derivatives_plus_one * theta.pow(2)
+ 2 * input_delta * theta_one_minus_theta
+ input_derivatives * (1 - theta).pow(2)
)
logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
return outputs, logabsdet

View File

@ -1,108 +1,171 @@
import os,sys,torch,warnings,pdb
import os, sys, torch, warnings, pdb
warnings.filterwarnings("ignore")
import librosa
import importlib
import numpy as np
import hashlib , math
import numpy as np
import hashlib, math
from tqdm import tqdm
from uvr5_pack.lib_v5 import spec_utils
from uvr5_pack.utils import _get_name_params,inference
from uvr5_pack.utils import _get_name_params, inference
from uvr5_pack.lib_v5.model_param_init import ModelParameters
from scipy.io import wavfile
class _audio_pre_():
def __init__(self, model_path,device,is_half):
class _audio_pre_:
def __init__(self, model_path, device, is_half):
self.model_path = model_path
self.device = device
self.data = {
# Processing Options
'postprocess': False,
'tta': False,
"postprocess": False,
"tta": False,
# Constants
'window_size': 512,
'agg': 10,
'high_end_process': 'mirroring',
"window_size": 512,
"agg": 10,
"high_end_process": "mirroring",
}
nn_arch_sizes = [
31191, # default
33966,61968, 123821, 123812, 537238 # custom
31191, # default
33966,
61968,
123821,
123812,
537238, # custom
]
self.nn_architecture = list('{}KB'.format(s) for s in nn_arch_sizes)
model_size = math.ceil(os.stat(model_path ).st_size / 1024)
nn_architecture = '{}KB'.format(min(nn_arch_sizes, key=lambda x:abs(x-model_size)))
nets = importlib.import_module('uvr5_pack.lib_v5.nets' + f'_{nn_architecture}'.replace('_{}KB'.format(nn_arch_sizes[0]), ''), package=None)
model_hash = hashlib.md5(open(model_path,'rb').read()).hexdigest()
param_name ,model_params_d = _get_name_params(model_path , model_hash)
self.nn_architecture = list("{}KB".format(s) for s in nn_arch_sizes)
model_size = math.ceil(os.stat(model_path).st_size / 1024)
nn_architecture = "{}KB".format(
min(nn_arch_sizes, key=lambda x: abs(x - model_size))
)
nets = importlib.import_module(
"uvr5_pack.lib_v5.nets"
+ f"_{nn_architecture}".replace("_{}KB".format(nn_arch_sizes[0]), ""),
package=None,
)
model_hash = hashlib.md5(open(model_path, "rb").read()).hexdigest()
param_name, model_params_d = _get_name_params(model_path, model_hash)
mp = ModelParameters(model_params_d)
model = nets.CascadedASPPNet(mp.param['bins'] * 2)
cpk = torch.load( model_path , map_location='cpu')
model = nets.CascadedASPPNet(mp.param["bins"] * 2)
cpk = torch.load(model_path, map_location="cpu")
model.load_state_dict(cpk)
model.eval()
if(is_half):model = model.half().to(device)
else:model = model.to(device)
if is_half:
model = model.half().to(device)
else:
model = model.to(device)
self.mp = mp
self.model = model
def _path_audio_(self, music_file ,ins_root=None,vocal_root=None):
if(ins_root is None and vocal_root is None):return "No save root."
name=os.path.basename(music_file)
if(ins_root is not None):os.makedirs(ins_root, exist_ok=True)
if(vocal_root is not None):os.makedirs(vocal_root , exist_ok=True)
def _path_audio_(self, music_file, ins_root=None, vocal_root=None):
if ins_root is None and vocal_root is None:
return "No save root."
name = os.path.basename(music_file)
if ins_root is not None:
os.makedirs(ins_root, exist_ok=True)
if vocal_root is not None:
os.makedirs(vocal_root, exist_ok=True)
X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
bands_n = len(self.mp.param['band'])
bands_n = len(self.mp.param["band"])
# print(bands_n)
for d in range(bands_n, 0, -1):
bp = self.mp.param['band'][d]
if d == bands_n: # high-end band
X_wave[d], _ = librosa.core.load(#理论上librosa读取可能对某些音频有bug应该上ffmpeg读取但是太麻烦了弃坑
music_file, bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
bp = self.mp.param["band"][d]
if d == bands_n: # high-end band
(
X_wave[d],
_,
) = librosa.core.load( # 理论上librosa读取可能对某些音频有bug应该上ffmpeg读取但是太麻烦了弃坑
music_file,
bp["sr"],
False,
dtype=np.float32,
res_type=bp["res_type"],
)
if X_wave[d].ndim == 1:
X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
else: # lower bands
X_wave[d] = librosa.core.resample(X_wave[d+1], self.mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])
else: # lower bands
X_wave[d] = librosa.core.resample(
X_wave[d + 1],
self.mp.param["band"][d + 1]["sr"],
bp["sr"],
res_type=bp["res_type"],
)
# Stft of wave source
X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(X_wave[d], bp['hl'], bp['n_fft'], self.mp.param['mid_side'], self.mp.param['mid_side_b2'], self.mp.param['reverse'])
X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
X_wave[d],
bp["hl"],
bp["n_fft"],
self.mp.param["mid_side"],
self.mp.param["mid_side_b2"],
self.mp.param["reverse"],
)
# pdb.set_trace()
if d == bands_n and self.data['high_end_process'] != 'none':
input_high_end_h = (bp['n_fft']//2 - bp['crop_stop']) + ( self.mp.param['pre_filter_stop'] - self.mp.param['pre_filter_start'])
input_high_end = X_spec_s[d][:, bp['n_fft']//2-input_high_end_h:bp['n_fft']//2, :]
if d == bands_n and self.data["high_end_process"] != "none":
input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
)
input_high_end = X_spec_s[d][
:, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
]
X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
aggresive_set = float(self.data['agg']/100)
aggressiveness = {'value': aggresive_set, 'split_bin': self.mp.param['band'][1]['crop_stop']}
aggresive_set = float(self.data["agg"] / 100)
aggressiveness = {
"value": aggresive_set,
"split_bin": self.mp.param["band"][1]["crop_stop"],
}
with torch.no_grad():
pred, X_mag, X_phase = inference(X_spec_m,self.device,self.model, aggressiveness,self.data)
pred, X_mag, X_phase = inference(
X_spec_m, self.device, self.model, aggressiveness, self.data
)
# Postprocess
if self.data['postprocess']:
if self.data["postprocess"]:
pred_inv = np.clip(X_mag - pred, 0, np.inf)
pred = spec_utils.mask_silence(pred, pred_inv)
y_spec_m = pred * X_phase
v_spec_m = X_spec_m - y_spec_m
if (ins_root is not None):
if self.data['high_end_process'].startswith('mirroring'):
input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], y_spec_m, input_high_end, self.mp)
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp,input_high_end_h, input_high_end_)
if ins_root is not None:
if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], y_spec_m, input_high_end, self.mp
)
wav_instrument = spec_utils.cmb_spectrogram_to_wave(
y_spec_m, self.mp, input_high_end_h, input_high_end_
)
else:
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
print ('%s instruments done'%name)
wavfile.write(os.path.join(ins_root, 'instrument_{}.wav'.format(name) ), self.mp.param['sr'], (np.array(wav_instrument)*32768).astype("int16")) #
if (vocal_root is not None):
if self.data['high_end_process'].startswith('mirroring'):
input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], v_spec_m, input_high_end, self.mp)
wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp, input_high_end_h, input_high_end_)
print("%s instruments done" % name)
wavfile.write(
os.path.join(ins_root, "instrument_{}.wav".format(name)),
self.mp.param["sr"],
(np.array(wav_instrument) * 32768).astype("int16"),
) #
if vocal_root is not None:
if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], v_spec_m, input_high_end, self.mp
)
wav_vocals = spec_utils.cmb_spectrogram_to_wave(
v_spec_m, self.mp, input_high_end_h, input_high_end_
)
else:
wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
print ('%s vocals done'%name)
wavfile.write(os.path.join(vocal_root , 'vocal_{}.wav'.format(name) ), self.mp.param['sr'], (np.array(wav_vocals)*32768).astype("int16"))
print("%s vocals done" % name)
wavfile.write(
os.path.join(vocal_root, "vocal_{}.wav".format(name)),
self.mp.param["sr"],
(np.array(wav_vocals) * 32768).astype("int16"),
)
if __name__ == '__main__':
device = 'cuda'
is_half=True
model_path='uvr5_weights/2_HP-UVR.pth'
pre_fun = _audio_pre_(model_path=model_path,device=device,is_half=True)
audio_path = '神女劈观.aac'
save_path = 'opt'
pre_fun._path_audio_(audio_path , save_path,save_path)
if __name__ == "__main__":
device = "cuda"
is_half = True
model_path = "uvr5_weights/2_HP-UVR.pth"
pre_fun = _audio_pre_(model_path=model_path, device=device, is_half=True)
audio_path = "神女劈观.aac"
save_path = "opt"
pre_fun._path_audio_(audio_path, save_path, save_path)

View File

@ -31,7 +31,9 @@ for lang_file in languages:
del lang_data[key]
# Sort the keys of the language file to match the order of the standard file
lang_data = OrderedDict(sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0])))
lang_data = OrderedDict(
sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0]))
)
# Save the updated language file
with open(lang_file, "w", encoding="utf-8") as f:

View File

@ -1,11 +1,15 @@
import ffmpeg
import numpy as np
def load_audio(file,sr):
def load_audio(file, sr):
try:
# https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
# This launches a subprocess to decode audio while down-mixing and resampling as necessary.
# Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
file=file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")#防止小白拷路径头尾带了空格和"和回车
file = (
file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
) # 防止小白拷路径头尾带了空格和"和回车
out, _ = (
ffmpeg.input(file, threads=0)
.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)

View File

@ -18,9 +18,7 @@ def get_rms(
x_shape_trimmed = list(y.shape)
x_shape_trimmed[axis] -= frame_length - 1
out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
xw = np.lib.stride_tricks.as_strided(
y, shape=out_shape, strides=out_strides
)
xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
if axis < 0:
target_axis = axis - 1
else:
@ -38,19 +36,25 @@ def get_rms(
class Slicer:
def __init__(self,
sr: int,
threshold: float = -40.,
min_length: int = 5000,
min_interval: int = 300,
hop_size: int = 20,
max_sil_kept: int = 5000):
def __init__(
self,
sr: int,
threshold: float = -40.0,
min_length: int = 5000,
min_interval: int = 300,
hop_size: int = 20,
max_sil_kept: int = 5000,
):
if not min_length >= min_interval >= hop_size:
raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
raise ValueError(
"The following condition must be satisfied: min_length >= min_interval >= hop_size"
)
if not max_sil_kept >= hop_size:
raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
raise ValueError(
"The following condition must be satisfied: max_sil_kept >= hop_size"
)
min_interval = sr * min_interval / 1000
self.threshold = 10 ** (threshold / 20.)
self.threshold = 10 ** (threshold / 20.0)
self.hop_size = round(sr * hop_size / 1000)
self.win_size = min(round(min_interval), 4 * self.hop_size)
self.min_length = round(sr * min_length / 1000 / self.hop_size)
@ -59,9 +63,13 @@ class Slicer:
def _apply_slice(self, waveform, begin, end):
if len(waveform.shape) > 1:
return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
return waveform[
:, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
]
else:
return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]
return waveform[
begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
]
# @timeit
def slice(self, waveform):
@ -71,7 +79,9 @@ class Slicer:
samples = waveform
if samples.shape[0] <= self.min_length:
return [waveform]
rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
rms_list = get_rms(
y=samples, frame_length=self.win_size, hop_length=self.hop_size
).squeeze(0)
sil_tags = []
silence_start = None
clip_start = 0
@ -87,23 +97,37 @@ class Slicer:
continue
# Clear recorded silence start if interval is not enough or clip is too short
is_leading_silence = silence_start == 0 and i > self.max_sil_kept
need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
need_slice_middle = (
i - silence_start >= self.min_interval
and i - clip_start >= self.min_length
)
if not is_leading_silence and not need_slice_middle:
silence_start = None
continue
# Need slicing. Record the range of silent frames to be removed.
if i - silence_start <= self.max_sil_kept:
pos = rms_list[silence_start: i + 1].argmin() + silence_start
pos = rms_list[silence_start : i + 1].argmin() + silence_start
if silence_start == 0:
sil_tags.append((0, pos))
else:
sil_tags.append((pos, pos))
clip_start = pos
elif i - silence_start <= self.max_sil_kept * 2:
pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
pos = rms_list[
i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
].argmin()
pos += i - self.max_sil_kept
pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
pos_l = (
rms_list[
silence_start : silence_start + self.max_sil_kept + 1
].argmin()
+ silence_start
)
pos_r = (
rms_list[i - self.max_sil_kept : i + 1].argmin()
+ i
- self.max_sil_kept
)
if silence_start == 0:
sil_tags.append((0, pos_r))
clip_start = pos_r
@ -111,8 +135,17 @@ class Slicer:
sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
clip_start = max(pos_r, pos)
else:
pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
pos_l = (
rms_list[
silence_start : silence_start + self.max_sil_kept + 1
].argmin()
+ silence_start
)
pos_r = (
rms_list[i - self.max_sil_kept : i + 1].argmin()
+ i
- self.max_sil_kept
)
if silence_start == 0:
sil_tags.append((0, pos_r))
else:
@ -121,9 +154,12 @@ class Slicer:
silence_start = None
# Deal with trailing silence.
total_frames = rms_list.shape[0]
if silence_start is not None and total_frames - silence_start >= self.min_interval:
if (
silence_start is not None
and total_frames - silence_start >= self.min_interval
):
silence_end = min(total_frames, silence_start + self.max_sil_kept)
pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
sil_tags.append((pos, total_frames + 1))
# Apply and return slices.
if len(sil_tags) == 0:
@ -133,9 +169,13 @@ class Slicer:
if sil_tags[0][0] > 0:
chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
for i in range(len(sil_tags) - 1):
chunks.append(self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]))
chunks.append(
self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0])
)
if sil_tags[-1][1] < total_frames:
chunks.append(self._apply_slice(waveform, sil_tags[-1][1], total_frames))
chunks.append(
self._apply_slice(waveform, sil_tags[-1][1], total_frames)
)
return chunks
@ -147,18 +187,45 @@ def main():
import soundfile
parser = ArgumentParser()
parser.add_argument('audio', type=str, help='The audio to be sliced')
parser.add_argument('--out', type=str, help='Output directory of the sliced audio clips')
parser.add_argument('--db_thresh', type=float, required=False, default=-40,
help='The dB threshold for silence detection')
parser.add_argument('--min_length', type=int, required=False, default=5000,
help='The minimum milliseconds required for each sliced audio clip')
parser.add_argument('--min_interval', type=int, required=False, default=300,
help='The minimum milliseconds for a silence part to be sliced')
parser.add_argument('--hop_size', type=int, required=False, default=10,
help='Frame length in milliseconds')
parser.add_argument('--max_sil_kept', type=int, required=False, default=500,
help='The maximum silence length kept around the sliced clip, presented in milliseconds')
parser.add_argument("audio", type=str, help="The audio to be sliced")
parser.add_argument(
"--out", type=str, help="Output directory of the sliced audio clips"
)
parser.add_argument(
"--db_thresh",
type=float,
required=False,
default=-40,
help="The dB threshold for silence detection",
)
parser.add_argument(
"--min_length",
type=int,
required=False,
default=5000,
help="The minimum milliseconds required for each sliced audio clip",
)
parser.add_argument(
"--min_interval",
type=int,
required=False,
default=300,
help="The minimum milliseconds for a silence part to be sliced",
)
parser.add_argument(
"--hop_size",
type=int,
required=False,
default=10,
help="Frame length in milliseconds",
)
parser.add_argument(
"--max_sil_kept",
type=int,
required=False,
default=500,
help="The maximum silence length kept around the sliced clip, presented in milliseconds",
)
args = parser.parse_args()
out = args.out
if out is None:
@ -170,7 +237,7 @@ def main():
min_length=args.min_length,
min_interval=args.min_interval,
hop_size=args.hop_size,
max_sil_kept=args.max_sil_kept
max_sil_kept=args.max_sil_kept,
)
chunks = slicer.slice(audio)
if not os.path.exists(out):
@ -178,8 +245,16 @@ def main():
for i, chunk in enumerate(chunks):
if len(chunk.shape) > 1:
chunk = chunk.T
soundfile.write(os.path.join(out, f'%s_%d.wav' % (os.path.basename(args.audio).rsplit('.', maxsplit=1)[0], i)), chunk, sr)
soundfile.write(
os.path.join(
out,
f"%s_%d.wav"
% (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
),
chunk,
sr,
)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@ -1,4 +1,4 @@
import os,traceback
import os, traceback
import numpy as np
import torch
import torch.utils.data
@ -6,6 +6,7 @@ import torch.utils.data
from mel_processing import spectrogram_torch
from utils import load_wav_to_torch, load_filepaths_and_text
class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
"""
1) loads audio, text pairs
@ -15,14 +16,14 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
def __init__(self, audiopaths_and_text, hparams):
self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
self.max_wav_value = hparams.max_wav_value
self.sampling_rate = hparams.sampling_rate
self.filter_length = hparams.filter_length
self.hop_length = hparams.hop_length
self.win_length = hparams.win_length
self.sampling_rate = hparams.sampling_rate
self.min_text_len = getattr(hparams, "min_text_len", 1)
self.max_text_len = getattr(hparams, "max_text_len", 5000)
self.max_wav_value = hparams.max_wav_value
self.sampling_rate = hparams.sampling_rate
self.filter_length = hparams.filter_length
self.hop_length = hparams.hop_length
self.win_length = hparams.win_length
self.sampling_rate = hparams.sampling_rate
self.min_text_len = getattr(hparams, "min_text_len", 1)
self.max_text_len = getattr(hparams, "max_text_len", 5000)
self._filter()
def _filter(self):
@ -34,12 +35,13 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
# spec_length = wav_length // hop_length
audiopaths_and_text_new = []
lengths = []
for audiopath, text, pitch,pitchf,dv in self.audiopaths_and_text:
for audiopath, text, pitch, pitchf, dv in self.audiopaths_and_text:
if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
audiopaths_and_text_new.append([audiopath, text, pitch,pitchf,dv])
audiopaths_and_text_new.append([audiopath, text, pitch, pitchf, dv])
lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
self.audiopaths_and_text = audiopaths_and_text_new
self.lengths = lengths
def get_sid(self, sid):
sid = torch.LongTensor([int(sid)])
return sid
@ -54,7 +56,7 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
phone, pitch, pitchf = self.get_labels(phone, pitch, pitchf)
spec, wav = self.get_audio(file)
dv=self.get_sid(dv)
dv = self.get_sid(dv)
len_phone = phone.size()[0]
len_spec = spec.size()[-1]
@ -71,9 +73,9 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
pitch = pitch[:len_min]
pitchf = pitchf[:len_min]
return (spec, wav, phone, pitch,pitchf,dv)
return (spec, wav, phone, pitch, pitchf, dv)
def get_labels(self, phone, pitch,pitchf):
def get_labels(self, phone, pitch, pitchf):
phone = np.load(phone)
phone = np.repeat(phone, 2, axis=0)
pitch = np.load(pitch)
@ -86,7 +88,7 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
phone = torch.FloatTensor(phone)
pitch = torch.LongTensor(pitch)
pitchf = torch.FloatTensor(pitchf)
return phone, pitch,pitchf
return phone, pitch, pitchf
def get_audio(self, filename):
audio, sampling_rate = load_wav_to_torch(filename)
@ -103,10 +105,15 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
try:
spec = torch.load(spec_filename)
except:
print (spec_filename,traceback.format_exc())
spec = spectrogram_torch(audio_norm, self.filter_length,
self.sampling_rate, self.hop_length, self.win_length,
center=False)
print(spec_filename, traceback.format_exc())
spec = spectrogram_torch(
audio_norm,
self.filter_length,
self.sampling_rate,
self.hop_length,
self.win_length,
center=False,
)
spec = torch.squeeze(spec, 0)
torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
else:
@ -127,6 +134,8 @@ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
def __len__(self):
return len(self.audiopaths_and_text)
class TextAudioCollateMultiNSFsid:
"""Zero-pads model inputs and targets"""
@ -155,7 +164,9 @@ class TextAudioCollateMultiNSFsid:
max_phone_len = max([x[2].size(0) for x in batch])
phone_lengths = torch.LongTensor(len(batch))
phone_padded = torch.FloatTensor(len(batch), max_phone_len, batch[0][2].shape[1])#(spec, wav, phone, pitch)
phone_padded = torch.FloatTensor(
len(batch), max_phone_len, batch[0][2].shape[1]
) # (spec, wav, phone, pitch)
pitch_padded = torch.LongTensor(len(batch), max_phone_len)
pitchf_padded = torch.FloatTensor(len(batch), max_phone_len)
phone_padded.zero_()
@ -187,7 +198,6 @@ class TextAudioCollateMultiNSFsid:
# dv[i] = row[5]
sid[i] = row[5]
return (
phone_padded,
phone_lengths,
@ -198,9 +208,10 @@ class TextAudioCollateMultiNSFsid:
wave_padded,
wave_lengths,
# dv
sid
sid,
)
class TextAudioLoader(torch.utils.data.Dataset):
"""
1) loads audio, text pairs
@ -210,14 +221,14 @@ class TextAudioLoader(torch.utils.data.Dataset):
def __init__(self, audiopaths_and_text, hparams):
self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
self.max_wav_value = hparams.max_wav_value
self.sampling_rate = hparams.sampling_rate
self.filter_length = hparams.filter_length
self.hop_length = hparams.hop_length
self.win_length = hparams.win_length
self.sampling_rate = hparams.sampling_rate
self.min_text_len = getattr(hparams, "min_text_len", 1)
self.max_text_len = getattr(hparams, "max_text_len", 5000)
self.max_wav_value = hparams.max_wav_value
self.sampling_rate = hparams.sampling_rate
self.filter_length = hparams.filter_length
self.hop_length = hparams.hop_length
self.win_length = hparams.win_length
self.sampling_rate = hparams.sampling_rate
self.min_text_len = getattr(hparams, "min_text_len", 1)
self.max_text_len = getattr(hparams, "max_text_len", 5000)
self._filter()
def _filter(self):
@ -229,12 +240,13 @@ class TextAudioLoader(torch.utils.data.Dataset):
# spec_length = wav_length // hop_length
audiopaths_and_text_new = []
lengths = []
for audiopath, text,dv in self.audiopaths_and_text:
for audiopath, text, dv in self.audiopaths_and_text:
if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
audiopaths_and_text_new.append([audiopath, text,dv])
audiopaths_and_text_new.append([audiopath, text, dv])
lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
self.audiopaths_and_text = audiopaths_and_text_new
self.lengths = lengths
def get_sid(self, sid):
sid = torch.LongTensor([int(sid)])
return sid
@ -247,7 +259,7 @@ class TextAudioLoader(torch.utils.data.Dataset):
phone = self.get_labels(phone)
spec, wav = self.get_audio(file)
dv=self.get_sid(dv)
dv = self.get_sid(dv)
len_phone = phone.size()[0]
len_spec = spec.size()[-1]
@ -257,7 +269,7 @@ class TextAudioLoader(torch.utils.data.Dataset):
spec = spec[:, :len_min]
wav = wav[:, :len_wav]
phone = phone[:len_min, :]
return (spec, wav, phone,dv)
return (spec, wav, phone, dv)
def get_labels(self, phone):
phone = np.load(phone)
@ -282,10 +294,15 @@ class TextAudioLoader(torch.utils.data.Dataset):
try:
spec = torch.load(spec_filename)
except:
print (spec_filename,traceback.format_exc())
spec = spectrogram_torch(audio_norm, self.filter_length,
self.sampling_rate, self.hop_length, self.win_length,
center=False)
print(spec_filename, traceback.format_exc())
spec = spectrogram_torch(
audio_norm,
self.filter_length,
self.sampling_rate,
self.hop_length,
self.win_length,
center=False,
)
spec = torch.squeeze(spec, 0)
torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
else:
@ -306,6 +323,8 @@ class TextAudioLoader(torch.utils.data.Dataset):
def __len__(self):
return len(self.audiopaths_and_text)
class TextAudioCollate:
"""Zero-pads model inputs and targets"""
@ -334,7 +353,9 @@ class TextAudioCollate:
max_phone_len = max([x[2].size(0) for x in batch])
phone_lengths = torch.LongTensor(len(batch))
phone_padded = torch.FloatTensor(len(batch), max_phone_len, batch[0][2].shape[1])
phone_padded = torch.FloatTensor(
len(batch), max_phone_len, batch[0][2].shape[1]
)
phone_padded.zero_()
sid = torch.LongTensor(len(batch))
@ -355,7 +376,6 @@ class TextAudioCollate:
sid[i] = row[3]
return (
phone_padded,
phone_lengths,
@ -363,9 +383,10 @@ class TextAudioCollate:
spec_lengths,
wave_padded,
wave_lengths,
sid
sid,
)
class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
"""
Maintain similar input lengths in a batch.
@ -402,7 +423,7 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
if idx_bucket != -1:
buckets[idx_bucket].append(i)
for i in range(len(buckets) - 1, -1, -1):#
for i in range(len(buckets) - 1, -1, -1): #
if len(buckets[i]) == 0:
buckets.pop(i)
self.boundaries.pop(i + 1)

View File

@ -1,6 +1,7 @@
import torch
from torch.nn import functional as F
def feature_loss(fmap_r, fmap_g):
loss = 0
for dr, dg in zip(fmap_r, fmap_g):

View File

@ -78,7 +78,8 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False)
center=center,
pad_mode="reflect",
normalized=False,
onesided=True,return_complex=False
onesided=True,
return_complex=False,
)
spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
@ -139,8 +140,18 @@ def mel_spectrogram_torch(
# normalized=False,
# onesided=True,
# )
spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
spec = torch.stft(
y,
n_fft,
hop_length=hop_size,
win_length=win_size,
window=hann_window[wnsize_dtype_device],
center=center,
pad_mode="reflect",
normalized=False,
onesided=True,
return_complex=False,
)
spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
spec = torch.matmul(mel_basis[fmax_dtype_device], spec)

View File

@ -1,101 +1,248 @@
import torch,traceback,os,pdb
import torch, traceback, os, pdb
from collections import OrderedDict
def savee(ckpt,sr,if_f0,name,epoch):
def savee(ckpt, sr, if_f0, name, epoch):
try:
opt = OrderedDict()
opt["weight"] = {}
for key in ckpt.keys():
if ("enc_q" in key): continue
if "enc_q" in key:
continue
opt["weight"][key] = ckpt[key].half()
if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000]
elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4,4], 109, 256, 48000]
elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000]
opt["info"] = "%sepoch"%epoch
if sr == "40k":
opt["config"] = [
1025,
32,
192,
192,
768,
2,
6,
3,
0,
"1",
[3, 7, 11],
[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
[10, 10, 2, 2],
512,
[16, 16, 4, 4],
109,
256,
40000,
]
elif sr == "48k":
opt["config"] = [
1025,
32,
192,
192,
768,
2,
6,
3,
0,
"1",
[3, 7, 11],
[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
[10, 6, 2, 2, 2],
512,
[16, 16, 4, 4, 4],
109,
256,
48000,
]
elif sr == "32k":
opt["config"] = [
513,
32,
192,
192,
768,
2,
6,
3,
0,
"1",
[3, 7, 11],
[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
[10, 4, 2, 2, 2],
512,
[16, 16, 4, 4, 4],
109,
256,
32000,
]
opt["info"] = "%sepoch" % epoch
opt["sr"] = sr
opt["f0"] =if_f0
torch.save(opt, "weights/%s.pth"%name)
opt["f0"] = if_f0
torch.save(opt, "weights/%s.pth" % name)
return "Success."
except:
return traceback.format_exc()
def show_info(path):
try:
a = torch.load(path, map_location="cpu")
return "模型信息:%s\n采样率:%s\n模型是否输入音高引导:%s"%(a.get("info","None"),a.get("sr","None"),a.get("f0","None"),)
return "模型信息:%s\n采样率:%s\n模型是否输入音高引导:%s" % (
a.get("info", "None"),
a.get("sr", "None"),
a.get("f0", "None"),
)
except:
return traceback.format_exc()
def extract_small_model(path,name,sr,if_f0,info):
def extract_small_model(path, name, sr, if_f0, info):
try:
ckpt = torch.load(path, map_location="cpu")
if("model"in ckpt):ckpt=ckpt["model"]
if "model" in ckpt:
ckpt = ckpt["model"]
opt = OrderedDict()
opt["weight"] = {}
for key in ckpt.keys():
if ("enc_q" in key): continue
if "enc_q" in key:
continue
opt["weight"][key] = ckpt[key].half()
if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000]
elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4,4], 109, 256, 48000]
elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000]
if(info==""):info="Extracted model."
if sr == "40k":
opt["config"] = [
1025,
32,
192,
192,
768,
2,
6,
3,
0,
"1",
[3, 7, 11],
[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
[10, 10, 2, 2],
512,
[16, 16, 4, 4],
109,
256,
40000,
]
elif sr == "48k":
opt["config"] = [
1025,
32,
192,
192,
768,
2,
6,
3,
0,
"1",
[3, 7, 11],
[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
[10, 6, 2, 2, 2],
512,
[16, 16, 4, 4, 4],
109,
256,
48000,
]
elif sr == "32k":
opt["config"] = [
513,
32,
192,
192,
768,
2,
6,
3,
0,
"1",
[3, 7, 11],
[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
[10, 4, 2, 2, 2],
512,
[16, 16, 4, 4, 4],
109,
256,
32000,
]
if info == "":
info = "Extracted model."
opt["info"] = info
opt["sr"] = sr
opt["f0"] =int(if_f0)
torch.save(opt, "weights/%s.pth"%name)
opt["f0"] = int(if_f0)
torch.save(opt, "weights/%s.pth" % name)
return "Success."
except:
return traceback.format_exc()
def change_info(path,info,name):
def change_info(path, info, name):
try:
ckpt = torch.load(path, map_location="cpu")
ckpt["info"]=info
if(name==""):name=os.path.basename(path)
torch.save(ckpt, "weights/%s"%name)
ckpt["info"] = info
if name == "":
name = os.path.basename(path)
torch.save(ckpt, "weights/%s" % name)
return "Success."
except:
return traceback.format_exc()
def merge(path1,path2,alpha1,sr,f0,info,name):
def merge(path1, path2, alpha1, sr, f0, info, name):
try:
def extract(ckpt):
a = ckpt["model"]
opt = OrderedDict()
opt["weight"] = {}
for key in a.keys():
if ("enc_q" in key): continue
if "enc_q" in key:
continue
opt["weight"][key] = a[key]
return opt
ckpt1 = torch.load(path1, map_location="cpu")
ckpt2 = torch.load(path2, map_location="cpu")
cfg = ckpt1["config"]
if("model"in ckpt1): ckpt1=extract(ckpt1)
else: ckpt1=ckpt1["weight"]
if("model"in ckpt2): ckpt2=extract(ckpt2)
else: ckpt2=ckpt2["weight"]
if(sorted(list(ckpt1.keys()))!=sorted(list(ckpt2.keys()))):return "Fail to merge the models. The model architectures are not the same."
if "model" in ckpt1:
ckpt1 = extract(ckpt1)
else:
ckpt1 = ckpt1["weight"]
if "model" in ckpt2:
ckpt2 = extract(ckpt2)
else:
ckpt2 = ckpt2["weight"]
if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())):
return "Fail to merge the models. The model architectures are not the same."
opt = OrderedDict()
opt["weight"] = {}
for key in ckpt1.keys():
# try:
if(key=="emb_g.weight"and ckpt1[key].shape!=ckpt2[key].shape):
min_shape0=min(ckpt1[key].shape[0],ckpt2[key].shape[0])
opt["weight"][key] = (alpha1 * (ckpt1[key][:min_shape0].float()) + (1 - alpha1) * (ckpt2[key][:min_shape0].float())).half()
else:
opt["weight"][key] = (alpha1*(ckpt1[key].float())+(1-alpha1)*(ckpt2[key].float())).half()
# except:
# pdb.set_trace()
if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape:
min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0])
opt["weight"][key] = (
alpha1 * (ckpt1[key][:min_shape0].float())
+ (1 - alpha1) * (ckpt2[key][:min_shape0].float())
).half()
else:
opt["weight"][key] = (
alpha1 * (ckpt1[key].float()) + (1 - alpha1) * (ckpt2[key].float())
).half()
# except:
# pdb.set_trace()
opt["config"] = cfg
'''
"""
if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 40000]
elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4], 109, 256, 48000]
elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000]
'''
opt["sr"]=sr
opt["f0"]=1 if f0==""else 0
opt["info"]=info
torch.save(opt, "weights/%s.pth"%name)
"""
opt["sr"] = sr
opt["f0"] = 1 if f0 == "" else 0
opt["info"] = info
torch.save(opt, "weights/%s.pth" % name)
return "Success."
except:
return traceback.format_exc()

View File

@ -1,4 +1,4 @@
import os,traceback
import os, traceback
import glob
import sys
import argparse
@ -14,44 +14,53 @@ MATPLOTLIB_FLAG = False
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging
def load_checkpoint_d(checkpoint_path, combd,sbd, optimizer=None,load_opt=1):
assert os.path.isfile(checkpoint_path)
checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
##################
def go(model,bkey):
saved_state_dict = checkpoint_dict[bkey]
if hasattr(model, 'module'):state_dict = model.module.state_dict()
else:state_dict = model.state_dict()
new_state_dict= {}
for k, v in state_dict.items():#模型需要的shape
try:
new_state_dict[k] = saved_state_dict[k]
if(saved_state_dict[k].shape!=state_dict[k].shape):
print("shape-%s-mismatch|need-%s|get-%s"%(k,state_dict[k].shape,saved_state_dict[k].shape))#
raise KeyError
except:
# logger.info(traceback.format_exc())
logger.info("%s is not in the checkpoint" % k)#pretrain缺失的
new_state_dict[k] = v#模型自带的随机值
if hasattr(model, 'module'):
model.module.load_state_dict(new_state_dict,strict=False)
else:
model.load_state_dict(new_state_dict,strict=False)
go(combd,"combd")
go(sbd,"sbd")
#############
logger.info("Loaded model weights")
def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1):
assert os.path.isfile(checkpoint_path)
checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
iteration = checkpoint_dict['iteration']
learning_rate = checkpoint_dict['learning_rate']
if optimizer is not None and load_opt==1:###加载不了如果是空的的话重新初始化可能还会影响lr时间表的更新因此在train文件最外围catch
# try:
optimizer.load_state_dict(checkpoint_dict['optimizer'])
# except:
# traceback.print_exc()
logger.info("Loaded checkpoint '{}' (epoch {})" .format(checkpoint_path, iteration))
return model, optimizer, learning_rate, iteration
##################
def go(model, bkey):
saved_state_dict = checkpoint_dict[bkey]
if hasattr(model, "module"):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
new_state_dict = {}
for k, v in state_dict.items(): # 模型需要的shape
try:
new_state_dict[k] = saved_state_dict[k]
if saved_state_dict[k].shape != state_dict[k].shape:
print(
"shape-%s-mismatch|need-%s|get-%s"
% (k, state_dict[k].shape, saved_state_dict[k].shape)
) #
raise KeyError
except:
# logger.info(traceback.format_exc())
logger.info("%s is not in the checkpoint" % k) # pretrain缺失的
new_state_dict[k] = v # 模型自带的随机值
if hasattr(model, "module"):
model.module.load_state_dict(new_state_dict, strict=False)
else:
model.load_state_dict(new_state_dict, strict=False)
go(combd, "combd")
go(sbd, "sbd")
#############
logger.info("Loaded model weights")
iteration = checkpoint_dict["iteration"]
learning_rate = checkpoint_dict["learning_rate"]
if (
optimizer is not None and load_opt == 1
): ###加载不了如果是空的的话重新初始化可能还会影响lr时间表的更新因此在train文件最外围catch
# try:
optimizer.load_state_dict(checkpoint_dict["optimizer"])
# except:
# traceback.print_exc()
logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration))
return model, optimizer, learning_rate, iteration
# def load_checkpoint(checkpoint_path, model, optimizer=None):
@ -83,303 +92,380 @@ def load_checkpoint_d(checkpoint_path, combd,sbd, optimizer=None,load_opt=1):
# logger.info("Loaded checkpoint '{}' (epoch {})" .format(
# checkpoint_path, iteration))
# return model, optimizer, learning_rate, iteration
def load_checkpoint(checkpoint_path, model, optimizer=None,load_opt=1):
assert os.path.isfile(checkpoint_path)
checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1):
assert os.path.isfile(checkpoint_path)
checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
saved_state_dict = checkpoint_dict['model']
if hasattr(model, 'module'):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
new_state_dict= {}
for k, v in state_dict.items():#模型需要的shape
try:
new_state_dict[k] = saved_state_dict[k]
if(saved_state_dict[k].shape!=state_dict[k].shape):
print("shape-%s-mismatch|need-%s|get-%s"%(k,state_dict[k].shape,saved_state_dict[k].shape))#
raise KeyError
except:
# logger.info(traceback.format_exc())
logger.info("%s is not in the checkpoint" % k)#pretrain缺失的
new_state_dict[k] = v#模型自带的随机值
if hasattr(model, 'module'):
model.module.load_state_dict(new_state_dict,strict=False)
else:
model.load_state_dict(new_state_dict,strict=False)
logger.info("Loaded model weights")
saved_state_dict = checkpoint_dict["model"]
if hasattr(model, "module"):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
new_state_dict = {}
for k, v in state_dict.items(): # 模型需要的shape
try:
new_state_dict[k] = saved_state_dict[k]
if saved_state_dict[k].shape != state_dict[k].shape:
print(
"shape-%s-mismatch|need-%s|get-%s"
% (k, state_dict[k].shape, saved_state_dict[k].shape)
) #
raise KeyError
except:
# logger.info(traceback.format_exc())
logger.info("%s is not in the checkpoint" % k) # pretrain缺失的
new_state_dict[k] = v # 模型自带的随机值
if hasattr(model, "module"):
model.module.load_state_dict(new_state_dict, strict=False)
else:
model.load_state_dict(new_state_dict, strict=False)
logger.info("Loaded model weights")
iteration = checkpoint_dict['iteration']
learning_rate = checkpoint_dict['learning_rate']
if optimizer is not None and load_opt==1:###加载不了如果是空的的话重新初始化可能还会影响lr时间表的更新因此在train文件最外围catch
# try:
optimizer.load_state_dict(checkpoint_dict['optimizer'])
# except:
# traceback.print_exc()
logger.info("Loaded checkpoint '{}' (epoch {})" .format(checkpoint_path, iteration))
return model, optimizer, learning_rate, iteration
iteration = checkpoint_dict["iteration"]
learning_rate = checkpoint_dict["learning_rate"]
if (
optimizer is not None and load_opt == 1
): ###加载不了如果是空的的话重新初始化可能还会影响lr时间表的更新因此在train文件最外围catch
# try:
optimizer.load_state_dict(checkpoint_dict["optimizer"])
# except:
# traceback.print_exc()
logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration))
return model, optimizer, learning_rate, iteration
def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
logger.info("Saving model and optimizer state at epoch {} to {}".format(
iteration, checkpoint_path))
if hasattr(model, 'module'):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
torch.save({'model': state_dict,
'iteration': iteration,
'optimizer': optimizer.state_dict(),
'learning_rate': learning_rate}, checkpoint_path)
logger.info(
"Saving model and optimizer state at epoch {} to {}".format(
iteration, checkpoint_path
)
)
if hasattr(model, "module"):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
torch.save(
{
"model": state_dict,
"iteration": iteration,
"optimizer": optimizer.state_dict(),
"learning_rate": learning_rate,
},
checkpoint_path,
)
def save_checkpoint_d(combd, sbd, optimizer, learning_rate, iteration, checkpoint_path):
logger.info("Saving model and optimizer state at epoch {} to {}".format(
iteration, checkpoint_path))
if hasattr(combd, 'module'): state_dict_combd = combd.module.state_dict()
else:state_dict_combd = combd.state_dict()
if hasattr(sbd, 'module'): state_dict_sbd = sbd.module.state_dict()
else:state_dict_sbd = sbd.state_dict()
torch.save({
'combd': state_dict_combd,
'sbd': state_dict_sbd,
'iteration': iteration,
'optimizer': optimizer.state_dict(),
'learning_rate': learning_rate}, checkpoint_path)
logger.info(
"Saving model and optimizer state at epoch {} to {}".format(
iteration, checkpoint_path
)
)
if hasattr(combd, "module"):
state_dict_combd = combd.module.state_dict()
else:
state_dict_combd = combd.state_dict()
if hasattr(sbd, "module"):
state_dict_sbd = sbd.module.state_dict()
else:
state_dict_sbd = sbd.state_dict()
torch.save(
{
"combd": state_dict_combd,
"sbd": state_dict_sbd,
"iteration": iteration,
"optimizer": optimizer.state_dict(),
"learning_rate": learning_rate,
},
checkpoint_path,
)
def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
for k, v in scalars.items():
writer.add_scalar(k, v, global_step)
for k, v in histograms.items():
writer.add_histogram(k, v, global_step)
for k, v in images.items():
writer.add_image(k, v, global_step, dataformats='HWC')
for k, v in audios.items():
writer.add_audio(k, v, global_step, audio_sampling_rate)
def summarize(
writer,
global_step,
scalars={},
histograms={},
images={},
audios={},
audio_sampling_rate=22050,
):
for k, v in scalars.items():
writer.add_scalar(k, v, global_step)
for k, v in histograms.items():
writer.add_histogram(k, v, global_step)
for k, v in images.items():
writer.add_image(k, v, global_step, dataformats="HWC")
for k, v in audios.items():
writer.add_audio(k, v, global_step, audio_sampling_rate)
def latest_checkpoint_path(dir_path, regex="G_*.pth"):
f_list = glob.glob(os.path.join(dir_path, regex))
f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
x = f_list[-1]
print(x)
return x
f_list = glob.glob(os.path.join(dir_path, regex))
f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
x = f_list[-1]
print(x)
return x
def plot_spectrogram_to_numpy(spectrogram):
global MATPLOTLIB_FLAG
if not MATPLOTLIB_FLAG:
import matplotlib
matplotlib.use("Agg")
MATPLOTLIB_FLAG = True
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING)
import matplotlib.pylab as plt
import numpy as np
global MATPLOTLIB_FLAG
if not MATPLOTLIB_FLAG:
import matplotlib
fig, ax = plt.subplots(figsize=(10,2))
im = ax.imshow(spectrogram, aspect="auto", origin="lower",
interpolation='none')
plt.colorbar(im, ax=ax)
plt.xlabel("Frames")
plt.ylabel("Channels")
plt.tight_layout()
matplotlib.use("Agg")
MATPLOTLIB_FLAG = True
mpl_logger = logging.getLogger("matplotlib")
mpl_logger.setLevel(logging.WARNING)
import matplotlib.pylab as plt
import numpy as np
fig.canvas.draw()
data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
plt.close()
return data
fig, ax = plt.subplots(figsize=(10, 2))
im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
plt.colorbar(im, ax=ax)
plt.xlabel("Frames")
plt.ylabel("Channels")
plt.tight_layout()
fig.canvas.draw()
data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="")
data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
plt.close()
return data
def plot_alignment_to_numpy(alignment, info=None):
global MATPLOTLIB_FLAG
if not MATPLOTLIB_FLAG:
import matplotlib
matplotlib.use("Agg")
MATPLOTLIB_FLAG = True
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING)
import matplotlib.pylab as plt
import numpy as np
global MATPLOTLIB_FLAG
if not MATPLOTLIB_FLAG:
import matplotlib
fig, ax = plt.subplots(figsize=(6, 4))
im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
interpolation='none')
fig.colorbar(im, ax=ax)
xlabel = 'Decoder timestep'
if info is not None:
xlabel += '\n\n' + info
plt.xlabel(xlabel)
plt.ylabel('Encoder timestep')
plt.tight_layout()
matplotlib.use("Agg")
MATPLOTLIB_FLAG = True
mpl_logger = logging.getLogger("matplotlib")
mpl_logger.setLevel(logging.WARNING)
import matplotlib.pylab as plt
import numpy as np
fig.canvas.draw()
data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
plt.close()
return data
fig, ax = plt.subplots(figsize=(6, 4))
im = ax.imshow(
alignment.transpose(), aspect="auto", origin="lower", interpolation="none"
)
fig.colorbar(im, ax=ax)
xlabel = "Decoder timestep"
if info is not None:
xlabel += "\n\n" + info
plt.xlabel(xlabel)
plt.ylabel("Encoder timestep")
plt.tight_layout()
fig.canvas.draw()
data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="")
data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
plt.close()
return data
def load_wav_to_torch(full_path):
sampling_rate, data = read(full_path)
return torch.FloatTensor(data.astype(np.float32)), sampling_rate
sampling_rate, data = read(full_path)
return torch.FloatTensor(data.astype(np.float32)), sampling_rate
def load_filepaths_and_text(filename, split="|"):
with open(filename, encoding='utf-8') as f:
filepaths_and_text = [line.strip().split(split) for line in f]
return filepaths_and_text
with open(filename, encoding="utf-8") as f:
filepaths_and_text = [line.strip().split(split) for line in f]
return filepaths_and_text
def get_hparams(init=True):
'''
todo:
结尾七人组
保存频率总epoch done
bs done
pretrainGpretrainD done
卡号os.en["CUDA_VISIBLE_DEVICES"] done
if_latest todo
模型if_f0 todo
采样率自动选择config done
是否缓存数据集进GPU:if_cache_data_in_gpu done
"""
todo:
结尾七人组
保存频率总epoch done
bs done
pretrainGpretrainD done
卡号os.en["CUDA_VISIBLE_DEVICES"] done
if_latest todo
模型if_f0 todo
采样率自动选择config done
是否缓存数据集进GPU:if_cache_data_in_gpu done
-m:
自动决定training_files路径,改掉train_nsf_load_pretrain.py里的hps.data.training_files done
-c不要了
'''
parser = argparse.ArgumentParser()
# parser.add_argument('-c', '--config', type=str, default="configs/40k.json",help='JSON file for configuration')
parser.add_argument('-se', '--save_every_epoch', type=int, required=True,help='checkpoint save frequency (epoch)')
parser.add_argument('-te', '--total_epoch', type=int, required=True,help='total_epoch')
parser.add_argument('-pg', '--pretrainG', type=str, default="",help='Pretrained Discriminator path')
parser.add_argument('-pd', '--pretrainD', type=str, default="",help='Pretrained Generator path')
parser.add_argument('-g', '--gpus', type=str, default="0",help='split by -')
parser.add_argument('-bs', '--batch_size', type=int, required=True,help='batch size')
parser.add_argument('-e', '--experiment_dir', type=str, required=True,help='experiment dir')#-m
parser.add_argument('-sr', '--sample_rate', type=str, required=True,help='sample rate, 32k/40k/48k')
parser.add_argument('-f0', '--if_f0', type=int, required=True,help='use f0 as one of the inputs of the model, 1 or 0')
parser.add_argument('-l', '--if_latest', type=int, required=True,help='if only save the latest G/D pth file, 1 or 0')
parser.add_argument('-c', '--if_cache_data_in_gpu', type=int, required=True,help='if caching the dataset in GPU memory, 1 or 0')
-m:
自动决定training_files路径,改掉train_nsf_load_pretrain.py里的hps.data.training_files done
-c不要了
"""
parser = argparse.ArgumentParser()
# parser.add_argument('-c', '--config', type=str, default="configs/40k.json",help='JSON file for configuration')
parser.add_argument(
"-se",
"--save_every_epoch",
type=int,
required=True,
help="checkpoint save frequency (epoch)",
)
parser.add_argument(
"-te", "--total_epoch", type=int, required=True, help="total_epoch"
)
parser.add_argument(
"-pg", "--pretrainG", type=str, default="", help="Pretrained Discriminator path"
)
parser.add_argument(
"-pd", "--pretrainD", type=str, default="", help="Pretrained Generator path"
)
parser.add_argument("-g", "--gpus", type=str, default="0", help="split by -")
parser.add_argument(
"-bs", "--batch_size", type=int, required=True, help="batch size"
)
parser.add_argument(
"-e", "--experiment_dir", type=str, required=True, help="experiment dir"
) # -m
parser.add_argument(
"-sr", "--sample_rate", type=str, required=True, help="sample rate, 32k/40k/48k"
)
parser.add_argument(
"-f0",
"--if_f0",
type=int,
required=True,
help="use f0 as one of the inputs of the model, 1 or 0",
)
parser.add_argument(
"-l",
"--if_latest",
type=int,
required=True,
help="if only save the latest G/D pth file, 1 or 0",
)
parser.add_argument(
"-c",
"--if_cache_data_in_gpu",
type=int,
required=True,
help="if caching the dataset in GPU memory, 1 or 0",
)
args = parser.parse_args()
name = args.experiment_dir
experiment_dir = os.path.join("./logs", args.experiment_dir)
args = parser.parse_args()
name = args.experiment_dir
experiment_dir = os.path.join("./logs", args.experiment_dir)
if not os.path.exists(experiment_dir):
os.makedirs(experiment_dir)
if not os.path.exists(experiment_dir):
os.makedirs(experiment_dir)
config_path = "configs/%s.json"%args.sample_rate
config_save_path = os.path.join(experiment_dir, "config.json")
if init:
with open(config_path, "r") as f:
data = f.read()
with open(config_save_path, "w") as f:
f.write(data)
else:
with open(config_save_path, "r") as f:
data = f.read()
config = json.loads(data)
config_path = "configs/%s.json" % args.sample_rate
config_save_path = os.path.join(experiment_dir, "config.json")
if init:
with open(config_path, "r") as f:
data = f.read()
with open(config_save_path, "w") as f:
f.write(data)
else:
with open(config_save_path, "r") as f:
data = f.read()
config = json.loads(data)
hparams = HParams(**config)
hparams.model_dir = hparams.experiment_dir = experiment_dir
hparams.save_every_epoch = args.save_every_epoch
hparams.name = name
hparams.total_epoch = args.total_epoch
hparams.pretrainG = args.pretrainG
hparams.pretrainD = args.pretrainD
hparams.gpus = args.gpus
hparams.train.batch_size = args.batch_size
hparams.sample_rate = args.sample_rate
hparams.if_f0 = args.if_f0
hparams.if_latest = args.if_latest
hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu
hparams.data.training_files = "%s/filelist.txt"%experiment_dir
return hparams
hparams = HParams(**config)
hparams.model_dir = hparams.experiment_dir = experiment_dir
hparams.save_every_epoch = args.save_every_epoch
hparams.name = name
hparams.total_epoch = args.total_epoch
hparams.pretrainG = args.pretrainG
hparams.pretrainD = args.pretrainD
hparams.gpus = args.gpus
hparams.train.batch_size = args.batch_size
hparams.sample_rate = args.sample_rate
hparams.if_f0 = args.if_f0
hparams.if_latest = args.if_latest
hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu
hparams.data.training_files = "%s/filelist.txt" % experiment_dir
return hparams
def get_hparams_from_dir(model_dir):
config_save_path = os.path.join(model_dir, "config.json")
with open(config_save_path, "r") as f:
data = f.read()
config = json.loads(data)
config_save_path = os.path.join(model_dir, "config.json")
with open(config_save_path, "r") as f:
data = f.read()
config = json.loads(data)
hparams =HParams(**config)
hparams.model_dir = model_dir
return hparams
hparams = HParams(**config)
hparams.model_dir = model_dir
return hparams
def get_hparams_from_file(config_path):
with open(config_path, "r") as f:
data = f.read()
config = json.loads(data)
with open(config_path, "r") as f:
data = f.read()
config = json.loads(data)
hparams =HParams(**config)
return hparams
hparams = HParams(**config)
return hparams
def check_git_hash(model_dir):
source_dir = os.path.dirname(os.path.realpath(__file__))
if not os.path.exists(os.path.join(source_dir, ".git")):
logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format(
source_dir
))
return
source_dir = os.path.dirname(os.path.realpath(__file__))
if not os.path.exists(os.path.join(source_dir, ".git")):
logger.warn(
"{} is not a git repository, therefore hash value comparison will be ignored.".format(
source_dir
)
)
return
cur_hash = subprocess.getoutput("git rev-parse HEAD")
cur_hash = subprocess.getoutput("git rev-parse HEAD")
path = os.path.join(model_dir, "githash")
if os.path.exists(path):
saved_hash = open(path).read()
if saved_hash != cur_hash:
logger.warn("git hash values are different. {}(saved) != {}(current)".format(
saved_hash[:8], cur_hash[:8]))
else:
open(path, "w").write(cur_hash)
path = os.path.join(model_dir, "githash")
if os.path.exists(path):
saved_hash = open(path).read()
if saved_hash != cur_hash:
logger.warn(
"git hash values are different. {}(saved) != {}(current)".format(
saved_hash[:8], cur_hash[:8]
)
)
else:
open(path, "w").write(cur_hash)
def get_logger(model_dir, filename="train.log"):
global logger
logger = logging.getLogger(os.path.basename(model_dir))
logger.setLevel(logging.DEBUG)
global logger
logger = logging.getLogger(os.path.basename(model_dir))
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
if not os.path.exists(model_dir):
os.makedirs(model_dir)
h = logging.FileHandler(os.path.join(model_dir, filename))
h.setLevel(logging.DEBUG)
h.setFormatter(formatter)
logger.addHandler(h)
return logger
formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
if not os.path.exists(model_dir):
os.makedirs(model_dir)
h = logging.FileHandler(os.path.join(model_dir, filename))
h.setLevel(logging.DEBUG)
h.setFormatter(formatter)
logger.addHandler(h)
return logger
class HParams():
def __init__(self, **kwargs):
for k, v in kwargs.items():
if type(v) == dict:
v = HParams(**v)
self[k] = v
class HParams:
def __init__(self, **kwargs):
for k, v in kwargs.items():
if type(v) == dict:
v = HParams(**v)
self[k] = v
def keys(self):
return self.__dict__.keys()
def keys(self):
return self.__dict__.keys()
def items(self):
return self.__dict__.items()
def items(self):
return self.__dict__.items()
def values(self):
return self.__dict__.values()
def values(self):
return self.__dict__.values()
def __len__(self):
return len(self.__dict__)
def __len__(self):
return len(self.__dict__)
def __getitem__(self, key):
return getattr(self, key)
def __getitem__(self, key):
return getattr(self, key)
def __setitem__(self, key, value):
return setattr(self, key, value)
def __setitem__(self, key, value):
return setattr(self, key, value)
def __contains__(self, key):
return key in self.__dict__
def __contains__(self, key):
return key in self.__dict__
def __repr__(self):
return self.__dict__.__repr__()
def __repr__(self):
return self.__dict__.__repr__()

View File

@ -1,12 +1,15 @@
import sys,os
now_dir=os.getcwd()
sys.path.append(os.path.join(now_dir,"train"))
import sys, os
now_dir = os.getcwd()
sys.path.append(os.path.join(now_dir, "train"))
import utils
hps = utils.get_hparams()
os.environ["CUDA_VISIBLE_DEVICES"]=hps.gpus.replace("-",",")
n_gpus=len(hps.gpus.split("-"))
os.environ["CUDA_VISIBLE_DEVICES"] = hps.gpus.replace("-", ",")
n_gpus = len(hps.gpus.split("-"))
from random import shuffle
import traceback,json,argparse,itertools,math,torch,pdb
import traceback, json, argparse, itertools, math, torch, pdb
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False
from torch import nn, optim
@ -20,9 +23,16 @@ from torch.cuda.amp import autocast, GradScaler
from infer_pack import commons
from time import time as ttime
from data_utils import TextAudioLoaderMultiNSFsid,TextAudioLoader, TextAudioCollateMultiNSFsid,TextAudioCollate, DistributedBucketSampler
from data_utils import (
TextAudioLoaderMultiNSFsid,
TextAudioLoader,
TextAudioCollateMultiNSFsid,
TextAudioCollate,
DistributedBucketSampler,
)
from infer_pack.models import (
SynthesizerTrnMs256NSFsid,SynthesizerTrnMs256NSFsid_nono,
SynthesizerTrnMs256NSFsid,
SynthesizerTrnMs256NSFsid_nono,
MultiPeriodDiscriminator,
)
from losses import generator_loss, discriminator_loss, feature_loss, kl_loss
@ -32,13 +42,11 @@ from mel_processing import mel_spectrogram_torch, spec_to_mel_torch
global_step = 0
def main():
# n_gpus = torch.cuda.device_count()
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "5555"
mp.spawn(
run,
nprocs=n_gpus,
@ -62,13 +70,16 @@ def run(rank, n_gpus, hps):
backend="gloo", init_method="env://", world_size=n_gpus, rank=rank
)
torch.manual_seed(hps.train.seed)
if torch.cuda.is_available(): torch.cuda.set_device(rank)
if torch.cuda.is_available():
torch.cuda.set_device(rank)
if (hps.if_f0 == 1):train_dataset = TextAudioLoaderMultiNSFsid(hps.data.training_files, hps.data)
else:train_dataset = TextAudioLoader(hps.data.training_files, hps.data)
if hps.if_f0 == 1:
train_dataset = TextAudioLoaderMultiNSFsid(hps.data.training_files, hps.data)
else:
train_dataset = TextAudioLoader(hps.data.training_files, hps.data)
train_sampler = DistributedBucketSampler(
train_dataset,
hps.train.batch_size*n_gpus,
hps.train.batch_size * n_gpus,
# [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200,1400], # 16s
[100, 200, 300, 400, 500, 600, 700, 800, 900], # 16s
num_replicas=n_gpus,
@ -77,8 +88,10 @@ def run(rank, n_gpus, hps):
)
# It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit.
# num_workers=8 -> num_workers=4
if (hps.if_f0 == 1):collate_fn = TextAudioCollateMultiNSFsid()
else:collate_fn = TextAudioCollate()
if hps.if_f0 == 1:
collate_fn = TextAudioCollateMultiNSFsid()
else:
collate_fn = TextAudioCollate()
train_loader = DataLoader(
train_dataset,
num_workers=4,
@ -89,13 +102,26 @@ def run(rank, n_gpus, hps):
persistent_workers=True,
prefetch_factor=8,
)
if(hps.if_f0==1):
net_g = SynthesizerTrnMs256NSFsid(hps.data.filter_length // 2 + 1,hps.train.segment_size // hps.data.hop_length,**hps.model,is_half=hps.train.fp16_run,sr=hps.sample_rate)
if hps.if_f0 == 1:
net_g = SynthesizerTrnMs256NSFsid(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
**hps.model,
is_half=hps.train.fp16_run,
sr=hps.sample_rate,
)
else:
net_g = SynthesizerTrnMs256NSFsid_nono(hps.data.filter_length // 2 + 1,hps.train.segment_size // hps.data.hop_length,**hps.model,is_half=hps.train.fp16_run)
if torch.cuda.is_available(): net_g = net_g.cuda(rank)
net_g = SynthesizerTrnMs256NSFsid_nono(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
**hps.model,
is_half=hps.train.fp16_run,
)
if torch.cuda.is_available():
net_g = net_g.cuda(rank)
net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm)
if torch.cuda.is_available(): net_d = net_d.cuda(rank)
if torch.cuda.is_available():
net_d = net_d.cuda(rank)
optim_g = torch.optim.AdamW(
net_g.parameters(),
hps.train.learning_rate,
@ -117,23 +143,35 @@ def run(rank, n_gpus, hps):
net_g = DDP(net_g)
net_d = DDP(net_d)
try:#如果能加载自动resume
_, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d) # D多半加载没事
try: # 如果能加载自动resume
_, _, _, epoch_str = utils.load_checkpoint(
utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d
) # D多半加载没事
if rank == 0:
logger.info("loaded D")
# _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0)
_, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g)
_, _, _, epoch_str = utils.load_checkpoint(
utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g
)
global_step = (epoch_str - 1) * len(train_loader)
# epoch_str = 1
# global_step = 0
except:#如果首次不能加载加载pretrain
except: # 如果首次不能加载加载pretrain
traceback.print_exc()
epoch_str = 1
global_step = 0
if rank == 0:
logger.info("loaded pretrained %s %s"%(hps.pretrainG,hps.pretrainD))
print(net_g.module.load_state_dict(torch.load(hps.pretrainG,map_location="cpu")["model"]))##测试不加载优化器
print(net_d.module.load_state_dict(torch.load(hps.pretrainD,map_location="cpu")["model"]))
logger.info("loaded pretrained %s %s" % (hps.pretrainG, hps.pretrainD))
print(
net_g.module.load_state_dict(
torch.load(hps.pretrainG, map_location="cpu")["model"]
)
) ##测试不加载优化器
print(
net_d.module.load_state_dict(
torch.load(hps.pretrainD, map_location="cpu")["model"]
)
)
scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
@ -144,7 +182,7 @@ def run(rank, n_gpus, hps):
scaler = GradScaler(enabled=hps.train.fp16_run)
cache=[]
cache = []
for epoch in range(epoch_str, hps.train.epochs + 1):
if rank == 0:
train_and_evaluate(
@ -157,7 +195,8 @@ def run(rank, n_gpus, hps):
scaler,
[train_loader, None],
logger,
[writer, writer_eval],cache
[writer, writer_eval],
cache,
)
else:
train_and_evaluate(
@ -170,14 +209,15 @@ def run(rank, n_gpus, hps):
scaler,
[train_loader, None],
None,
None,cache
None,
cache,
)
scheduler_g.step()
scheduler_d.step()
def train_and_evaluate(
rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers,cache
rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers, cache
):
net_g, net_d = nets
optim_g, optim_d = optims
@ -190,168 +230,90 @@ def train_and_evaluate(
net_g.train()
net_d.train()
if(cache==[]or hps.if_cache_data_in_gpu==False):#第一个epoch把cache全部填满训练集
if cache == [] or hps.if_cache_data_in_gpu == False: # 第一个epoch把cache全部填满训练集
# print("caching")
for batch_idx, info in enumerate(train_loader):
if (hps.if_f0 == 1):phone,phone_lengths,pitch,pitchf,spec,spec_lengths,wave,wave_lengths,sid=info
else:phone,phone_lengths,spec,spec_lengths,wave,wave_lengths,sid=info
if hps.if_f0 == 1:
(
phone,
phone_lengths,
pitch,
pitchf,
spec,
spec_lengths,
wave,
wave_lengths,
sid,
) = info
else:
phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info
if torch.cuda.is_available():
phone, phone_lengths = phone.cuda(rank, non_blocking=True), phone_lengths.cuda(rank, non_blocking=True )
if (hps.if_f0 == 1):pitch,pitchf = pitch.cuda(rank, non_blocking=True),pitchf.cuda(rank, non_blocking=True)
phone, phone_lengths = phone.cuda(
rank, non_blocking=True
), phone_lengths.cuda(rank, non_blocking=True)
if hps.if_f0 == 1:
pitch, pitchf = pitch.cuda(rank, non_blocking=True), pitchf.cuda(
rank, non_blocking=True
)
sid = sid.cuda(rank, non_blocking=True)
spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True)
wave, wave_lengths = wave.cuda(rank, non_blocking=True), wave_lengths.cuda(rank, non_blocking=True)
if(hps.if_cache_data_in_gpu==True):
if (hps.if_f0 == 1):cache.append((batch_idx, (phone,phone_lengths,pitch,pitchf,spec,spec_lengths,wave,wave_lengths ,sid)))
else:cache.append((batch_idx, (phone,phone_lengths,spec,spec_lengths,wave,wave_lengths ,sid)))
with autocast(enabled=hps.train.fp16_run):
if (hps.if_f0 == 1):y_hat,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, pitch,pitchf, spec, spec_lengths,sid)
else:y_hat,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, spec, spec_lengths,sid)
mel = spec_to_mel_torch(spec,hps.data.filter_length,hps.data.n_mel_channels,hps.data.sampling_rate,hps.data.mel_fmin,hps.data.mel_fmax,)
y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length)
with autocast(enabled=False):
y_hat_mel = mel_spectrogram_torch(
y_hat.float().squeeze(1),
hps.data.filter_length,
hps.data.n_mel_channels,
hps.data.sampling_rate,
hps.data.hop_length,
hps.data.win_length,
hps.data.mel_fmin,
hps.data.mel_fmax,
)
if(hps.train.fp16_run==True):
y_hat_mel=y_hat_mel.half()
wave = commons.slice_segments(
wave, ids_slice * hps.data.hop_length, hps.train.segment_size
) # slice
# Discriminator
y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
with autocast(enabled=False):
loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(
y_d_hat_r, y_d_hat_g
)
optim_d.zero_grad()
scaler.scale(loss_disc).backward()
scaler.unscale_(optim_d)
grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
scaler.step(optim_d)
with autocast(enabled=hps.train.fp16_run):
# Generator
y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
with autocast(enabled=False):
loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
loss_fm = feature_loss(fmap_r, fmap_g)
loss_gen, losses_gen = generator_loss(y_d_hat_g)
loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
optim_g.zero_grad()
scaler.scale(loss_gen_all).backward()
scaler.unscale_(optim_g)
grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
scaler.step(optim_g)
scaler.update()
if rank == 0:
if global_step % hps.train.log_interval == 0:
lr = optim_g.param_groups[0]["lr"]
logger.info(
"Train Epoch: {} [{:.0f}%]".format(
epoch, 100.0 * batch_idx / len(train_loader)
spec, spec_lengths = spec.cuda(
rank, non_blocking=True
), spec_lengths.cuda(rank, non_blocking=True)
wave, wave_lengths = wave.cuda(
rank, non_blocking=True
), wave_lengths.cuda(rank, non_blocking=True)
if hps.if_cache_data_in_gpu == True:
if hps.if_f0 == 1:
cache.append(
(
batch_idx,
(
phone,
phone_lengths,
pitch,
pitchf,
spec,
spec_lengths,
wave,
wave_lengths,
sid,
),
)
)
# Amor For Tensorboard display
if loss_mel > 50:
loss_mel = 50
if loss_kl > 5:
loss_kl = 5
logger.info([global_step, lr])
logger.info(
f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f},loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}"
else:
cache.append(
(
batch_idx,
(
phone,
phone_lengths,
spec,
spec_lengths,
wave,
wave_lengths,
sid,
),
)
)
scalar_dict = {
"loss/g/total": loss_gen_all,
"loss/d/total": loss_disc,
"learning_rate": lr,
"grad_norm_d": grad_norm_d,
"grad_norm_g": grad_norm_g,
}
scalar_dict.update(
{"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl}
)
scalar_dict.update(
{"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
)
scalar_dict.update(
{"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}
)
scalar_dict.update(
{"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}
)
image_dict = {
"slice/mel_org": utils.plot_spectrogram_to_numpy(
y_mel[0].data.cpu().numpy()
),
"slice/mel_gen": utils.plot_spectrogram_to_numpy(
y_hat_mel[0].data.cpu().numpy()
),
"all/mel": utils.plot_spectrogram_to_numpy(
mel[0].data.cpu().numpy()
),
}
utils.summarize(
writer=writer,
global_step=global_step,
images=image_dict,
scalars=scalar_dict,
)
global_step += 1
# if global_step % hps.train.eval_interval == 0:
if epoch % hps.save_every_epoch == 0 and rank == 0:
if(hps.if_latest==0):
utils.save_checkpoint(
net_g,
optim_g,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "G_{}.pth".format(global_step)),
)
utils.save_checkpoint(
net_d,
optim_d,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "D_{}.pth".format(global_step)),
)
else:
utils.save_checkpoint(
net_g,
optim_g,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "G_{}.pth".format(2333333)),
)
utils.save_checkpoint(
net_d,
optim_d,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "D_{}.pth".format(2333333)),
)
else:#后续的epoch直接使用打乱的cache
shuffle(cache)
# print("using cache")
for batch_idx, info in cache:
if (hps.if_f0 == 1):phone,phone_lengths,pitch,pitchf,spec,spec_lengths,wave,wave_lengths,sid=info
else:phone,phone_lengths,spec,spec_lengths,wave,wave_lengths,sid=info
with autocast(enabled=hps.train.fp16_run):
if (hps.if_f0 == 1):y_hat,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, pitch,pitchf, spec, spec_lengths,sid)
else:y_hat,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q) = net_g(phone, phone_lengths, spec, spec_lengths,sid)
if hps.if_f0 == 1:
(
y_hat,
ids_slice,
x_mask,
z_mask,
(z, z_p, m_p, logs_p, m_q, logs_q),
) = net_g(
phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid
)
else:
(
y_hat,
ids_slice,
x_mask,
z_mask,
(z, z_p, m_p, logs_p, m_q, logs_q),
) = net_g(phone, phone_lengths, spec, spec_lengths, sid)
mel = spec_to_mel_torch(
spec,
hps.data.filter_length,
@ -374,8 +336,200 @@ def train_and_evaluate(
hps.data.mel_fmin,
hps.data.mel_fmax,
)
if(hps.train.fp16_run==True):
y_hat_mel=y_hat_mel.half()
if hps.train.fp16_run == True:
y_hat_mel = y_hat_mel.half()
wave = commons.slice_segments(
wave, ids_slice * hps.data.hop_length, hps.train.segment_size
) # slice
# Discriminator
y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
with autocast(enabled=False):
loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(
y_d_hat_r, y_d_hat_g
)
optim_d.zero_grad()
scaler.scale(loss_disc).backward()
scaler.unscale_(optim_d)
grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
scaler.step(optim_d)
with autocast(enabled=hps.train.fp16_run):
# Generator
y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
with autocast(enabled=False):
loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
loss_fm = feature_loss(fmap_r, fmap_g)
loss_gen, losses_gen = generator_loss(y_d_hat_g)
loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
optim_g.zero_grad()
scaler.scale(loss_gen_all).backward()
scaler.unscale_(optim_g)
grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
scaler.step(optim_g)
scaler.update()
if rank == 0:
if global_step % hps.train.log_interval == 0:
lr = optim_g.param_groups[0]["lr"]
logger.info(
"Train Epoch: {} [{:.0f}%]".format(
epoch, 100.0 * batch_idx / len(train_loader)
)
)
# Amor For Tensorboard display
if loss_mel > 50:
loss_mel = 50
if loss_kl > 5:
loss_kl = 5
logger.info([global_step, lr])
logger.info(
f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f},loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}"
)
scalar_dict = {
"loss/g/total": loss_gen_all,
"loss/d/total": loss_disc,
"learning_rate": lr,
"grad_norm_d": grad_norm_d,
"grad_norm_g": grad_norm_g,
}
scalar_dict.update(
{
"loss/g/fm": loss_fm,
"loss/g/mel": loss_mel,
"loss/g/kl": loss_kl,
}
)
scalar_dict.update(
{"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
)
scalar_dict.update(
{
"loss/d_r/{}".format(i): v
for i, v in enumerate(losses_disc_r)
}
)
scalar_dict.update(
{
"loss/d_g/{}".format(i): v
for i, v in enumerate(losses_disc_g)
}
)
image_dict = {
"slice/mel_org": utils.plot_spectrogram_to_numpy(
y_mel[0].data.cpu().numpy()
),
"slice/mel_gen": utils.plot_spectrogram_to_numpy(
y_hat_mel[0].data.cpu().numpy()
),
"all/mel": utils.plot_spectrogram_to_numpy(
mel[0].data.cpu().numpy()
),
}
utils.summarize(
writer=writer,
global_step=global_step,
images=image_dict,
scalars=scalar_dict,
)
global_step += 1
# if global_step % hps.train.eval_interval == 0:
if epoch % hps.save_every_epoch == 0 and rank == 0:
if hps.if_latest == 0:
utils.save_checkpoint(
net_g,
optim_g,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "G_{}.pth".format(global_step)),
)
utils.save_checkpoint(
net_d,
optim_d,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "D_{}.pth".format(global_step)),
)
else:
utils.save_checkpoint(
net_g,
optim_g,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "G_{}.pth".format(2333333)),
)
utils.save_checkpoint(
net_d,
optim_d,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "D_{}.pth".format(2333333)),
)
else: # 后续的epoch直接使用打乱的cache
shuffle(cache)
# print("using cache")
for batch_idx, info in cache:
if hps.if_f0 == 1:
(
phone,
phone_lengths,
pitch,
pitchf,
spec,
spec_lengths,
wave,
wave_lengths,
sid,
) = info
else:
phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info
with autocast(enabled=hps.train.fp16_run):
if hps.if_f0 == 1:
(
y_hat,
ids_slice,
x_mask,
z_mask,
(z, z_p, m_p, logs_p, m_q, logs_q),
) = net_g(
phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid
)
else:
(
y_hat,
ids_slice,
x_mask,
z_mask,
(z, z_p, m_p, logs_p, m_q, logs_q),
) = net_g(phone, phone_lengths, spec, spec_lengths, sid)
mel = spec_to_mel_torch(
spec,
hps.data.filter_length,
hps.data.n_mel_channels,
hps.data.sampling_rate,
hps.data.mel_fmin,
hps.data.mel_fmax,
)
y_mel = commons.slice_segments(
mel, ids_slice, hps.train.segment_size // hps.data.hop_length
)
with autocast(enabled=False):
y_hat_mel = mel_spectrogram_torch(
y_hat.float().squeeze(1),
hps.data.filter_length,
hps.data.n_mel_channels,
hps.data.sampling_rate,
hps.data.hop_length,
hps.data.win_length,
hps.data.mel_fmin,
hps.data.mel_fmax,
)
if hps.train.fp16_run == True:
y_hat_mel = y_hat_mel.half()
wave = commons.slice_segments(
wave, ids_slice * hps.data.hop_length, hps.train.segment_size
) # slice
@ -435,17 +589,27 @@ def train_and_evaluate(
"grad_norm_g": grad_norm_g,
}
scalar_dict.update(
{"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl}
{
"loss/g/fm": loss_fm,
"loss/g/mel": loss_mel,
"loss/g/kl": loss_kl,
}
)
scalar_dict.update(
{"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
)
scalar_dict.update(
{"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}
{
"loss/d_r/{}".format(i): v
for i, v in enumerate(losses_disc_r)
}
)
scalar_dict.update(
{"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}
{
"loss/d_g/{}".format(i): v
for i, v in enumerate(losses_disc_g)
}
)
image_dict = {
"slice/mel_org": utils.plot_spectrogram_to_numpy(
@ -467,7 +631,7 @@ def train_and_evaluate(
global_step += 1
# if global_step % hps.train.eval_interval == 0:
if epoch % hps.save_every_epoch == 0 and rank == 0:
if(hps.if_latest==0):
if hps.if_latest == 0:
utils.save_checkpoint(
net_g,
optim_g,
@ -498,15 +662,20 @@ def train_and_evaluate(
os.path.join(hps.model_dir, "D_{}.pth".format(2333333)),
)
if rank == 0:
logger.info("====> Epoch: {}".format(epoch))
if(epoch>=hps.total_epoch and rank == 0):
if epoch >= hps.total_epoch and rank == 0:
logger.info("Training is done. The program is closed.")
from process_ckpt import savee#def savee(ckpt,sr,if_f0,name,epoch):
if hasattr(net_g, 'module'):ckpt = net_g.module.state_dict()
else:ckpt = net_g.state_dict()
logger.info("saving final ckpt:%s"%(savee(ckpt,hps.sample_rate,hps.if_f0,hps.name,epoch)))
from process_ckpt import savee # def savee(ckpt,sr,if_f0,name,epoch):
if hasattr(net_g, "module"):
ckpt = net_g.module.state_dict()
else:
ckpt = net_g.state_dict()
logger.info(
"saving final ckpt:%s"
% (savee(ckpt, hps.sample_rate, hps.if_f0, hps.name, epoch))
)
os._exit(2333333)

View File

@ -1,5 +1,6 @@
import sys,os,multiprocessing
now_dir=os.getcwd()
import sys, os, multiprocessing
now_dir = os.getcwd()
sys.path.append(now_dir)
inp_root = sys.argv[1]
@ -7,15 +8,17 @@ sr = int(sys.argv[2])
n_p = int(sys.argv[3])
exp_dir = sys.argv[4]
noparallel = sys.argv[5] == "True"
import numpy as np,os,traceback
import numpy as np, os, traceback
from slicer2 import Slicer
import librosa,traceback
from scipy.io import wavfile
import librosa, traceback
from scipy.io import wavfile
import multiprocessing
from my_utils import load_audio
mutex = multiprocessing.Lock()
f = open("%s/preprocess.log"%exp_dir, "a+")
f = open("%s/preprocess.log" % exp_dir, "a+")
def println(strr):
mutex.acquire()
print(strr)
@ -23,81 +26,101 @@ def println(strr):
f.flush()
mutex.release()
class PreProcess():
def __init__(self,sr,exp_dir):
class PreProcess:
def __init__(self, sr, exp_dir):
self.slicer = Slicer(
sr=sr,
threshold=-32,
min_length=800,
min_interval=400,
hop_size=15,
max_sil_kept=150
max_sil_kept=150,
)
self.sr=sr
self.per=3.7
self.overlap=0.3
self.tail=self.per+self.overlap
self.max=0.95
self.alpha=0.8
self.exp_dir=exp_dir
self.gt_wavs_dir="%s/0_gt_wavs"%exp_dir
self.wavs16k_dir="%s/1_16k_wavs"%exp_dir
os.makedirs(self.exp_dir,exist_ok=True)
os.makedirs(self.gt_wavs_dir,exist_ok=True)
os.makedirs(self.wavs16k_dir,exist_ok=True)
self.sr = sr
self.per = 3.7
self.overlap = 0.3
self.tail = self.per + self.overlap
self.max = 0.95
self.alpha = 0.8
self.exp_dir = exp_dir
self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir
self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir
os.makedirs(self.exp_dir, exist_ok=True)
os.makedirs(self.gt_wavs_dir, exist_ok=True)
os.makedirs(self.wavs16k_dir, exist_ok=True)
def norm_write(self,tmp_audio,idx0,idx1):
tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (self.max * self.alpha)) + (1 - self.alpha) * tmp_audio
wavfile.write("%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), self.sr, (tmp_audio*32768).astype(np.int16))
def norm_write(self, tmp_audio, idx0, idx1):
tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (self.max * self.alpha)) + (
1 - self.alpha
) * tmp_audio
wavfile.write(
"%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
self.sr,
(tmp_audio * 32768).astype(np.int16),
)
tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000)
wavfile.write("%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), 16000, (tmp_audio*32768).astype(np.int16))
wavfile.write(
"%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
16000,
(tmp_audio * 32768).astype(np.int16),
)
def pipeline(self,path, idx0):
def pipeline(self, path, idx0):
try:
audio = load_audio(path,self.sr)
idx1=0
audio = load_audio(path, self.sr)
idx1 = 0
for audio in self.slicer.slice(audio):
i = 0
while (1):
while 1:
start = int(self.sr * (self.per - self.overlap) * i)
i += 1
if (len(audio[start:]) > self.tail * self.sr):
tmp_audio = audio[start:start + int(self.per * self.sr)]
self.norm_write(tmp_audio,idx0,idx1)
if len(audio[start:]) > self.tail * self.sr:
tmp_audio = audio[start : start + int(self.per * self.sr)]
self.norm_write(tmp_audio, idx0, idx1)
idx1 += 1
else:
tmp_audio = audio[start:]
break
self.norm_write(tmp_audio, idx0, idx1)
println("%s->Suc."%path)
println("%s->Suc." % path)
except:
println("%s->%s"%(path,traceback.format_exc()))
println("%s->%s" % (path, traceback.format_exc()))
def pipeline_mp(self,infos):
def pipeline_mp(self, infos):
for path, idx0 in infos:
self.pipeline(path,idx0)
self.pipeline(path, idx0)
def pipeline_mp_inp_dir(self,inp_root,n_p):
def pipeline_mp_inp_dir(self, inp_root, n_p):
try:
infos = [("%s/%s" % (inp_root, name), idx) for idx, name in enumerate(sorted(list(os.listdir(inp_root))))]
infos = [
("%s/%s" % (inp_root, name), idx)
for idx, name in enumerate(sorted(list(os.listdir(inp_root))))
]
if noparallel:
for i in range(n_p): self.pipeline_mp(infos[i::n_p])
else:
ps=[]
for i in range(n_p):
p=multiprocessing.Process(target=self.pipeline_mp,args=(infos[i::n_p],))
self.pipeline_mp(infos[i::n_p])
else:
ps = []
for i in range(n_p):
p = multiprocessing.Process(
target=self.pipeline_mp, args=(infos[i::n_p],)
)
p.start()
ps.append(p)
for p in ps:p.join()
for p in ps:
p.join()
except:
println("Fail. %s"%traceback.format_exc())
println("Fail. %s" % traceback.format_exc())
def preprocess_trainset(inp_root, sr, n_p, exp_dir):
pp=PreProcess(sr,exp_dir)
pp = PreProcess(sr, exp_dir)
println("start preprocess")
println(sys.argv)
pp.pipeline_mp_inp_dir(inp_root,n_p)
pp.pipeline_mp_inp_dir(inp_root, n_p)
println("end preprocess")
if __name__=='__main__':
if __name__ == "__main__":
preprocess_trainset(inp_root, sr, n_p, exp_dir)

View File

@ -10,7 +10,6 @@ from uvr5_pack.lib_v5 import spec_utils
class VocalRemoverValidationSet(torch.utils.data.Dataset):
def __init__(self, patch_list):
self.patch_list = patch_list
@ -21,7 +20,7 @@ class VocalRemoverValidationSet(torch.utils.data.Dataset):
path = self.patch_list[idx]
data = np.load(path)
X, y = data['X'], data['y']
X, y = data["X"], data["y"]
X_mag = np.abs(X)
y_mag = np.abs(y)
@ -30,16 +29,22 @@ class VocalRemoverValidationSet(torch.utils.data.Dataset):
def make_pair(mix_dir, inst_dir):
input_exts = ['.wav', '.m4a', '.mp3', '.mp4', '.flac']
input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"]
X_list = sorted([
os.path.join(mix_dir, fname)
for fname in os.listdir(mix_dir)
if os.path.splitext(fname)[1] in input_exts])
y_list = sorted([
os.path.join(inst_dir, fname)
for fname in os.listdir(inst_dir)
if os.path.splitext(fname)[1] in input_exts])
X_list = sorted(
[
os.path.join(mix_dir, fname)
for fname in os.listdir(mix_dir)
if os.path.splitext(fname)[1] in input_exts
]
)
y_list = sorted(
[
os.path.join(inst_dir, fname)
for fname in os.listdir(inst_dir)
if os.path.splitext(fname)[1] in input_exts
]
)
filelist = list(zip(X_list, y_list))
@ -47,10 +52,11 @@ def make_pair(mix_dir, inst_dir):
def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
if split_mode == 'random':
if split_mode == "random":
filelist = make_pair(
os.path.join(dataset_dir, 'mixtures'),
os.path.join(dataset_dir, 'instruments'))
os.path.join(dataset_dir, "mixtures"),
os.path.join(dataset_dir, "instruments"),
)
random.shuffle(filelist)
@ -60,19 +66,23 @@ def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
val_filelist = filelist[-val_size:]
else:
train_filelist = [
pair for pair in filelist
if list(pair) not in val_filelist]
elif split_mode == 'subdirs':
pair for pair in filelist if list(pair) not in val_filelist
]
elif split_mode == "subdirs":
if len(val_filelist) != 0:
raise ValueError('The `val_filelist` option is not available in `subdirs` mode')
raise ValueError(
"The `val_filelist` option is not available in `subdirs` mode"
)
train_filelist = make_pair(
os.path.join(dataset_dir, 'training/mixtures'),
os.path.join(dataset_dir, 'training/instruments'))
os.path.join(dataset_dir, "training/mixtures"),
os.path.join(dataset_dir, "training/instruments"),
)
val_filelist = make_pair(
os.path.join(dataset_dir, 'validation/mixtures'),
os.path.join(dataset_dir, 'validation/instruments'))
os.path.join(dataset_dir, "validation/mixtures"),
os.path.join(dataset_dir, "validation/instruments"),
)
return train_filelist, val_filelist
@ -81,7 +91,9 @@ def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha):
perm = np.random.permutation(len(X))
for i, idx in enumerate(tqdm(perm)):
if np.random.uniform() < reduction_rate:
y[idx] = spec_utils.reduce_vocal_aggressively(X[idx], y[idx], reduction_mask)
y[idx] = spec_utils.reduce_vocal_aggressively(
X[idx], y[idx], reduction_mask
)
if np.random.uniform() < 0.5:
# swap channel
@ -116,10 +128,8 @@ def make_padding(width, cropsize, offset):
def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset):
len_dataset = patches * len(filelist)
X_dataset = np.zeros(
(len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
y_dataset = np.zeros(
(len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
X_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
y_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
for i, (X_path, y_path) in enumerate(tqdm(filelist)):
X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
@ -127,22 +137,24 @@ def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset
X, y = X / coef, y / coef
l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant')
y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode='constant')
X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches)
ends = starts + cropsize
for j in range(patches):
idx = i * patches + j
X_dataset[idx] = X_pad[:, :, starts[j]:ends[j]]
y_dataset[idx] = y_pad[:, :, starts[j]:ends[j]]
X_dataset[idx] = X_pad[:, :, starts[j] : ends[j]]
y_dataset[idx] = y_pad[:, :, starts[j] : ends[j]]
return X_dataset, y_dataset
def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
patch_list = []
patch_dir = 'cs{}_sr{}_hl{}_nf{}_of{}'.format(cropsize, sr, hop_length, n_fft, offset)
patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(
cropsize, sr, hop_length, n_fft, offset
)
os.makedirs(patch_dir, exist_ok=True)
for i, (X_path, y_path) in enumerate(tqdm(filelist)):
@ -153,18 +165,19 @@ def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
X, y = X / coef, y / coef
l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant')
y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode='constant')
X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
len_dataset = int(np.ceil(X.shape[2] / roi_size))
for j in range(len_dataset):
outpath = os.path.join(patch_dir, '{}_p{}.npz'.format(basename, j))
outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j))
start = j * roi_size
if not os.path.exists(outpath):
np.savez(
outpath,
X=X_pad[:, :, start:start + cropsize],
y=y_pad[:, :, start:start + cropsize])
X=X_pad[:, :, start : start + cropsize],
y=y_pad[:, :, start : start + cropsize],
)
patch_list.append(outpath)
return VocalRemoverValidationSet(patch_list)

View File

@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nout,
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False),
bias=False,
),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module):
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nin,
nin,
nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False),
nn.Conv2d(
nin, nout,
kernel_size=1,
bias=False),
bias=False,
),
nn.Conv2d(nin, nout, kernel_size=1, bias=False),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module):
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
@ -65,14 +63,15 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
@ -85,28 +84,31 @@ class Decoder(nn.Module):
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
nn.Dropout2d(0.1)
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

View File

@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nout,
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False),
bias=False,
),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module):
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nin,
nin,
nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False),
nn.Conv2d(
nin, nout,
kernel_size=1,
bias=False),
bias=False,
),
nn.Conv2d(nin, nout, kernel_size=1, bias=False),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module):
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
@ -65,14 +63,15 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
@ -85,28 +84,31 @@ class Decoder(nn.Module):
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
nn.Dropout2d(0.1)
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

View File

@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nout,
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False),
bias=False,
),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module):
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nin,
nin,
nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False),
nn.Conv2d(
nin, nout,
kernel_size=1,
bias=False),
bias=False,
),
nn.Conv2d(nin, nout, kernel_size=1, bias=False),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module):
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
@ -65,14 +63,15 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
@ -85,28 +84,31 @@ class Decoder(nn.Module):
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
nn.Dropout2d(0.1)
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

View File

@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nout,
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False),
bias=False,
),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module):
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nin,
nin,
nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False),
nn.Conv2d(
nin, nout,
kernel_size=1,
bias=False),
bias=False,
),
nn.Conv2d(nin, nout, kernel_size=1, bias=False),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module):
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
@ -65,14 +63,15 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
@ -85,32 +84,37 @@ class Decoder(nn.Module):
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv6 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv7 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ),
nn.Dropout2d(0.1)
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

View File

@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nout,
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False),
bias=False,
),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module):
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nin,
nin,
nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False),
nn.Conv2d(
nin, nout,
kernel_size=1,
bias=False),
bias=False,
),
nn.Conv2d(nin, nout, kernel_size=1, bias=False),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module):
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
@ -65,14 +63,15 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
@ -85,32 +84,37 @@ class Decoder(nn.Module):
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv6 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv7 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ),
nn.Dropout2d(0.1)
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

View File

@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nout,
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False),
bias=False,
),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module):
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nin,
nin,
nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False),
nn.Conv2d(
nin, nout,
kernel_size=1,
bias=False),
bias=False,
),
nn.Conv2d(nin, nout, kernel_size=1, bias=False),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module):
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
@ -65,14 +63,15 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
@ -85,32 +84,37 @@ class Decoder(nn.Module):
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv6 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv7 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ),
nn.Dropout2d(0.1)
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

View File

@ -3,33 +3,33 @@ import os
import pathlib
default_param = {}
default_param['bins'] = 768
default_param['unstable_bins'] = 9 # training only
default_param['reduction_bins'] = 762 # training only
default_param['sr'] = 44100
default_param['pre_filter_start'] = 757
default_param['pre_filter_stop'] = 768
default_param['band'] = {}
default_param["bins"] = 768
default_param["unstable_bins"] = 9 # training only
default_param["reduction_bins"] = 762 # training only
default_param["sr"] = 44100
default_param["pre_filter_start"] = 757
default_param["pre_filter_stop"] = 768
default_param["band"] = {}
default_param['band'][1] = {
'sr': 11025,
'hl': 128,
'n_fft': 960,
'crop_start': 0,
'crop_stop': 245,
'lpf_start': 61, # inference only
'res_type': 'polyphase'
default_param["band"][1] = {
"sr": 11025,
"hl": 128,
"n_fft": 960,
"crop_start": 0,
"crop_stop": 245,
"lpf_start": 61, # inference only
"res_type": "polyphase",
}
default_param['band'][2] = {
'sr': 44100,
'hl': 512,
'n_fft': 1536,
'crop_start': 24,
'crop_stop': 547,
'hpf_start': 81, # inference only
'res_type': 'sinc_best'
default_param["band"][2] = {
"sr": 44100,
"hl": 512,
"n_fft": 1536,
"crop_start": 24,
"crop_stop": 547,
"hpf_start": 81, # inference only
"res_type": "sinc_best",
}
@ -43,18 +43,27 @@ def int_keys(d):
class ModelParameters(object):
def __init__(self, config_path=''):
if '.pth' == pathlib.Path(config_path).suffix:
def __init__(self, config_path=""):
if ".pth" == pathlib.Path(config_path).suffix:
import zipfile
with zipfile.ZipFile(config_path, 'r') as zip:
self.param = json.loads(zip.read('param.json'), object_pairs_hook=int_keys)
elif '.json' == pathlib.Path(config_path).suffix:
with open(config_path, 'r') as f:
with zipfile.ZipFile(config_path, "r") as zip:
self.param = json.loads(
zip.read("param.json"), object_pairs_hook=int_keys
)
elif ".json" == pathlib.Path(config_path).suffix:
with open(config_path, "r") as f:
self.param = json.loads(f.read(), object_pairs_hook=int_keys)
else:
self.param = default_param
for k in ['mid_side', 'mid_side_b', 'mid_side_b2', 'stereo_w', 'stereo_n', 'reverse']:
for k in [
"mid_side",
"mid_side_b",
"mid_side_b2",
"stereo_w",
"stereo_n",
"reverse",
]:
if not k in self.param:
self.param[k] = False

View File

@ -7,7 +7,6 @@ from uvr5_pack.lib_v5 import spec_utils
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
@ -39,7 +38,6 @@ class BaseASPPNet(nn.Module):
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 16)
@ -64,13 +62,16 @@ class CascadedASPPNet(nn.Module):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
aux1 = torch.cat(
[
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:]),
],
dim=2,
)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
@ -82,24 +83,33 @@ class CascadedASPPNet(nn.Module):
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
mode="replicate",
)
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
mode="replicate",
)
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
mode="replicate",
)
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
mask[:, :, : aggressiveness["split_bin"]],
1 + aggressiveness["value"] / 3,
)
mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
mask[:, :, aggressiveness["split_bin"] :],
1 + aggressiveness["value"],
)
return mask * mix
@ -107,7 +117,7 @@ class CascadedASPPNet(nn.Module):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
h = h[:, :, :, self.offset : -self.offset]
assert h.size()[3] > 0
return h

View File

@ -6,7 +6,6 @@ from uvr5_pack.lib_v5 import layers_123821KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
@ -38,7 +37,6 @@ class BaseASPPNet(nn.Module):
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 32)
@ -63,13 +61,16 @@ class CascadedASPPNet(nn.Module):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
aux1 = torch.cat(
[
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:]),
],
dim=2,
)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
@ -81,24 +82,33 @@ class CascadedASPPNet(nn.Module):
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
mode="replicate",
)
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
mode="replicate",
)
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
mode="replicate",
)
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
mask[:, :, : aggressiveness["split_bin"]],
1 + aggressiveness["value"] / 3,
)
mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
mask[:, :, aggressiveness["split_bin"] :],
1 + aggressiveness["value"],
)
return mask * mix
@ -106,7 +116,7 @@ class CascadedASPPNet(nn.Module):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
h = h[:, :, :, self.offset : -self.offset]
assert h.size()[3] > 0
return h

View File

@ -6,7 +6,6 @@ from uvr5_pack.lib_v5 import layers_123821KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
@ -38,7 +37,6 @@ class BaseASPPNet(nn.Module):
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 32)
@ -63,13 +61,16 @@ class CascadedASPPNet(nn.Module):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
aux1 = torch.cat(
[
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:]),
],
dim=2,
)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
@ -81,24 +82,33 @@ class CascadedASPPNet(nn.Module):
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
mode="replicate",
)
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
mode="replicate",
)
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
mode="replicate",
)
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
mask[:, :, : aggressiveness["split_bin"]],
1 + aggressiveness["value"] / 3,
)
mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
mask[:, :, aggressiveness["split_bin"] :],
1 + aggressiveness["value"],
)
return mask * mix
@ -106,7 +116,7 @@ class CascadedASPPNet(nn.Module):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
h = h[:, :, :, self.offset : -self.offset]
assert h.size()[3] > 0
return h

View File

@ -6,7 +6,6 @@ from uvr5_pack.lib_v5 import layers_33966KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16, 32)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
@ -38,7 +37,6 @@ class BaseASPPNet(nn.Module):
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 16)
@ -63,13 +61,16 @@ class CascadedASPPNet(nn.Module):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
aux1 = torch.cat(
[
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:]),
],
dim=2,
)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
@ -81,24 +82,33 @@ class CascadedASPPNet(nn.Module):
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
mode="replicate",
)
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
mode="replicate",
)
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
mode="replicate",
)
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
mask[:, :, : aggressiveness["split_bin"]],
1 + aggressiveness["value"] / 3,
)
mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
mask[:, :, aggressiveness["split_bin"] :],
1 + aggressiveness["value"],
)
return mask * mix
@ -106,7 +116,7 @@ class CascadedASPPNet(nn.Module):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
h = h[:, :, :, self.offset : -self.offset]
assert h.size()[3] > 0
return h

View File

@ -7,7 +7,6 @@ from uvr5_pack.lib_v5 import layers_537238KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
@ -39,7 +38,6 @@ class BaseASPPNet(nn.Module):
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 64)
@ -64,13 +62,16 @@ class CascadedASPPNet(nn.Module):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
aux1 = torch.cat(
[
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:]),
],
dim=2,
)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
@ -82,24 +83,33 @@ class CascadedASPPNet(nn.Module):
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
mode="replicate",
)
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
mode="replicate",
)
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
mode="replicate",
)
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
mask[:, :, : aggressiveness["split_bin"]],
1 + aggressiveness["value"] / 3,
)
mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
mask[:, :, aggressiveness["split_bin"] :],
1 + aggressiveness["value"],
)
return mask * mix
@ -107,7 +117,7 @@ class CascadedASPPNet(nn.Module):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
h = h[:, :, :, self.offset : -self.offset]
assert h.size()[3] > 0
return h

View File

@ -7,7 +7,6 @@ from uvr5_pack.lib_v5 import layers_537238KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
@ -39,7 +38,6 @@ class BaseASPPNet(nn.Module):
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 64)
@ -64,13 +62,16 @@ class CascadedASPPNet(nn.Module):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
aux1 = torch.cat(
[
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:]),
],
dim=2,
)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
@ -82,24 +83,33 @@ class CascadedASPPNet(nn.Module):
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
mode="replicate",
)
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
mode="replicate",
)
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
mode="replicate",
)
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
mask[:, :, : aggressiveness["split_bin"]],
1 + aggressiveness["value"] / 3,
)
mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
mask[:, :, aggressiveness["split_bin"] :],
1 + aggressiveness["value"],
)
return mask * mix
@ -107,7 +117,7 @@ class CascadedASPPNet(nn.Module):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
h = h[:, :, :, self.offset : -self.offset]
assert h.size()[3] > 0
return h

View File

@ -6,7 +6,6 @@ from uvr5_pack.lib_v5 import layers_123821KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
@ -38,7 +37,6 @@ class BaseASPPNet(nn.Module):
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 32)
@ -63,13 +61,16 @@ class CascadedASPPNet(nn.Module):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
aux1 = torch.cat(
[
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:]),
],
dim=2,
)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
@ -81,24 +82,33 @@ class CascadedASPPNet(nn.Module):
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
mode="replicate",
)
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
mode="replicate",
)
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
mode="replicate",
)
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
mask[:, :, : aggressiveness["split_bin"]],
1 + aggressiveness["value"] / 3,
)
mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
mask[:, :, aggressiveness["split_bin"] :],
1 + aggressiveness["value"],
)
return mask * mix
@ -106,7 +116,7 @@ class CascadedASPPNet(nn.Module):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
h = h[:, :, :, self.offset : -self.offset]
assert h.size()[3] > 0
return h

View File

@ -1,8 +1,9 @@
import os,librosa
import numpy as np
import soundfile as sf
import os, librosa
import numpy as np
import soundfile as sf
from tqdm import tqdm
import json,math ,hashlib
import json, math, hashlib
def crop_center(h1, h2):
h1_shape = h1.size()
@ -11,7 +12,7 @@ def crop_center(h1, h2):
if h1_shape[3] == h2_shape[3]:
return h1
elif h1_shape[3] < h2_shape[3]:
raise ValueError('h1_shape[3] must be greater than h2_shape[3]')
raise ValueError("h1_shape[3] must be greater than h2_shape[3]")
# s_freq = (h2_shape[2] - h1_shape[2]) // 2
# e_freq = s_freq + h1_shape[2]
@ -22,7 +23,9 @@ def crop_center(h1, h2):
return h1
def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
def wave_to_spectrogram(
wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
if reverse:
wave_left = np.flip(np.asfortranarray(wave[0]))
wave_right = np.flip(np.asfortranarray(wave[1]))
@ -30,8 +33,8 @@ def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=Fal
wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
elif mid_side_b2:
wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5))
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5))
wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
else:
wave_left = np.asfortranarray(wave[0])
wave_right = np.asfortranarray(wave[1])
@ -44,7 +47,9 @@ def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=Fal
return spec
def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
def wave_to_spectrogram_mt(
wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
import threading
if reverse:
@ -54,8 +59,8 @@ def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=
wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
elif mid_side_b2:
wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5))
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5))
wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
else:
wave_left = np.asfortranarray(wave[0])
wave_right = np.asfortranarray(wave[1])
@ -64,7 +69,10 @@ def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=
global spec_left
spec_left = librosa.stft(**kwargs)
thread = threading.Thread(target=run_thread, kwargs={'y': wave_left, 'n_fft': n_fft, 'hop_length': hop_length})
thread = threading.Thread(
target=run_thread,
kwargs={"y": wave_left, "n_fft": n_fft, "hop_length": hop_length},
)
thread.start()
spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length)
thread.join()
@ -76,40 +84,50 @@ def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=
def combine_spectrograms(specs, mp):
l = min([specs[i].shape[2] for i in specs])
spec_c = np.zeros(shape=(2, mp.param['bins'] + 1, l), dtype=np.complex64)
spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), dtype=np.complex64)
offset = 0
bands_n = len(mp.param['band'])
bands_n = len(mp.param["band"])
for d in range(1, bands_n + 1):
h = mp.param['band'][d]['crop_stop'] - mp.param['band'][d]['crop_start']
spec_c[:, offset:offset+h, :l] = specs[d][:, mp.param['band'][d]['crop_start']:mp.param['band'][d]['crop_stop'], :l]
h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"]
spec_c[:, offset : offset + h, :l] = specs[d][
:, mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l
]
offset += h
if offset > mp.param['bins']:
raise ValueError('Too much bins')
if offset > mp.param["bins"]:
raise ValueError("Too much bins")
# lowpass fiter
if mp.param['pre_filter_start'] > 0: # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']:
if (
mp.param["pre_filter_start"] > 0
): # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']:
if bands_n == 1:
spec_c = fft_lp_filter(spec_c, mp.param['pre_filter_start'], mp.param['pre_filter_stop'])
spec_c = fft_lp_filter(
spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"]
)
else:
gp = 1
for b in range(mp.param['pre_filter_start'] + 1, mp.param['pre_filter_stop']):
g = math.pow(10, -(b - mp.param['pre_filter_start']) * (3.5 - gp) / 20.0)
for b in range(
mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]
):
g = math.pow(
10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0
)
gp = g
spec_c[:, b, :] *= g
return np.asfortranarray(spec_c)
def spectrogram_to_image(spec, mode='magnitude'):
if mode == 'magnitude':
def spectrogram_to_image(spec, mode="magnitude"):
if mode == "magnitude":
if np.iscomplexobj(spec):
y = np.abs(spec)
else:
y = spec
y = np.log10(y ** 2 + 1e-8)
elif mode == 'phase':
y = np.log10(y**2 + 1e-8)
elif mode == "phase":
if np.iscomplexobj(spec):
y = np.angle(spec)
else:
@ -121,9 +139,7 @@ def spectrogram_to_image(spec, mode='magnitude'):
if y.ndim == 3:
img = img.transpose(1, 2, 0)
img = np.concatenate([
np.max(img, axis=2, keepdims=True), img
], axis=2)
img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2)
return img
@ -136,12 +152,12 @@ def reduce_vocal_aggressively(X, y, softmask):
v_mask = v_mag_tmp > y_mag_tmp
y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf)
return y_mag * np.exp(1.j * np.angle(y))
return y_mag * np.exp(1.0j * np.angle(y))
def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
if min_range < fade_size * 2:
raise ValueError('min_range must be >= fade_area * 2')
raise ValueError("min_range must be >= fade_area * 2")
mag = mag.copy()
@ -159,17 +175,19 @@ def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
if s != 0:
weight = np.linspace(0, 1, fade_size)
mag[:, :, s:s + fade_size] += weight * ref[:, :, s:s + fade_size]
mag[:, :, s : s + fade_size] += weight * ref[:, :, s : s + fade_size]
else:
s -= fade_size
if e != mag.shape[2]:
weight = np.linspace(1, 0, fade_size)
mag[:, :, e - fade_size:e] += weight * ref[:, :, e - fade_size:e]
mag[:, :, e - fade_size : e] += weight * ref[:, :, e - fade_size : e]
else:
e += fade_size
mag[:, :, s + fade_size:e - fade_size] += ref[:, :, s + fade_size:e - fade_size]
mag[:, :, s + fade_size : e - fade_size] += ref[
:, :, s + fade_size : e - fade_size
]
old_e = e
return mag
@ -178,22 +196,24 @@ def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
def align_wave_head_and_tail(a, b):
l = min([a[0].size, b[0].size])
return a[:l,:l], b[:l,:l]
return a[:l, :l], b[:l, :l]
def cache_or_load(mix_path, inst_path, mp):
mix_basename = os.path.splitext(os.path.basename(mix_path))[0]
inst_basename = os.path.splitext(os.path.basename(inst_path))[0]
cache_dir = 'mph{}'.format(hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode('utf-8')).hexdigest())
mix_cache_dir = os.path.join('cache', cache_dir)
inst_cache_dir = os.path.join('cache', cache_dir)
cache_dir = "mph{}".format(
hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest()
)
mix_cache_dir = os.path.join("cache", cache_dir)
inst_cache_dir = os.path.join("cache", cache_dir)
os.makedirs(mix_cache_dir, exist_ok=True)
os.makedirs(inst_cache_dir, exist_ok=True)
mix_cache_path = os.path.join(mix_cache_dir, mix_basename + '.npy')
inst_cache_path = os.path.join(inst_cache_dir, inst_basename + '.npy')
mix_cache_path = os.path.join(mix_cache_dir, mix_basename + ".npy")
inst_cache_path = os.path.join(inst_cache_dir, inst_basename + ".npy")
if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path):
X_spec_m = np.load(mix_cache_path)
@ -201,22 +221,52 @@ def cache_or_load(mix_path, inst_path, mp):
else:
X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
for d in range(len(mp.param['band']), 0, -1):
bp = mp.param['band'][d]
for d in range(len(mp.param["band"]), 0, -1):
bp = mp.param["band"][d]
if d == len(mp.param['band']): # high-end band
if d == len(mp.param["band"]): # high-end band
X_wave[d], _ = librosa.load(
mix_path, bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
mix_path, bp["sr"], False, dtype=np.float32, res_type=bp["res_type"]
)
y_wave[d], _ = librosa.load(
inst_path, bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
else: # lower bands
X_wave[d] = librosa.resample(X_wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])
y_wave[d] = librosa.resample(y_wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])
inst_path,
bp["sr"],
False,
dtype=np.float32,
res_type=bp["res_type"],
)
else: # lower bands
X_wave[d] = librosa.resample(
X_wave[d + 1],
mp.param["band"][d + 1]["sr"],
bp["sr"],
res_type=bp["res_type"],
)
y_wave[d] = librosa.resample(
y_wave[d + 1],
mp.param["band"][d + 1]["sr"],
bp["sr"],
res_type=bp["res_type"],
)
X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d])
X_spec_s[d] = wave_to_spectrogram(X_wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
y_spec_s[d] = wave_to_spectrogram(y_wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
X_spec_s[d] = wave_to_spectrogram(
X_wave[d],
bp["hl"],
bp["n_fft"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
)
y_spec_s[d] = wave_to_spectrogram(
y_wave[d],
bp["hl"],
bp["n_fft"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
)
del X_wave, y_wave
@ -224,7 +274,7 @@ def cache_or_load(mix_path, inst_path, mp):
y_spec_m = combine_spectrograms(y_spec_s, mp)
if X_spec_m.shape != y_spec_m.shape:
raise ValueError('The combined spectrograms are different: ' + mix_path)
raise ValueError("The combined spectrograms are different: " + mix_path)
_, ext = os.path.splitext(mix_path)
@ -244,9 +294,16 @@ def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse):
if reverse:
return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
elif mid_side:
return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
return np.asfortranarray(
[np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
)
elif mid_side_b2:
return np.asfortranarray([np.add(wave_right / 1.25, .4 * wave_left), np.subtract(wave_left / 1.25, .4 * wave_right)])
return np.asfortranarray(
[
np.add(wave_right / 1.25, 0.4 * wave_left),
np.subtract(wave_left / 1.25, 0.4 * wave_right),
]
)
else:
return np.asfortranarray([wave_left, wave_right])
@ -261,7 +318,9 @@ def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
global wave_left
wave_left = librosa.istft(**kwargs)
thread = threading.Thread(target=run_thread, kwargs={'stft_matrix': spec_left, 'hop_length': hop_length})
thread = threading.Thread(
target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length}
)
thread.start()
wave_right = librosa.istft(spec_right, hop_length=hop_length)
thread.join()
@ -269,46 +328,94 @@ def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
if reverse:
return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
elif mid_side:
return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
return np.asfortranarray(
[np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
)
elif mid_side_b2:
return np.asfortranarray([np.add(wave_right / 1.25, .4 * wave_left), np.subtract(wave_left / 1.25, .4 * wave_right)])
return np.asfortranarray(
[
np.add(wave_right / 1.25, 0.4 * wave_left),
np.subtract(wave_left / 1.25, 0.4 * wave_right),
]
)
else:
return np.asfortranarray([wave_left, wave_right])
def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None):
wave_band = {}
bands_n = len(mp.param['band'])
bands_n = len(mp.param["band"])
offset = 0
for d in range(1, bands_n + 1):
bp = mp.param['band'][d]
spec_s = np.ndarray(shape=(2, bp['n_fft'] // 2 + 1, spec_m.shape[2]), dtype=complex)
h = bp['crop_stop'] - bp['crop_start']
spec_s[:, bp['crop_start']:bp['crop_stop'], :] = spec_m[:, offset:offset+h, :]
bp = mp.param["band"][d]
spec_s = np.ndarray(
shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex
)
h = bp["crop_stop"] - bp["crop_start"]
spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[
:, offset : offset + h, :
]
offset += h
if d == bands_n: # higher
if extra_bins_h: # if --high_end_process bypass
max_bin = bp['n_fft'] // 2
spec_s[:, max_bin-extra_bins_h:max_bin, :] = extra_bins[:, :extra_bins_h, :]
if bp['hpf_start'] > 0:
spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1)
if d == bands_n: # higher
if extra_bins_h: # if --high_end_process bypass
max_bin = bp["n_fft"] // 2
spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[
:, :extra_bins_h, :
]
if bp["hpf_start"] > 0:
spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
if bands_n == 1:
wave = spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
wave = spectrogram_to_wave(
spec_s,
bp["hl"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
)
else:
wave = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']))
wave = np.add(
wave,
spectrogram_to_wave(
spec_s,
bp["hl"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
),
)
else:
sr = mp.param['band'][d+1]['sr']
if d == 1: # lower
spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'])
wave = librosa.resample(spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']), bp['sr'], sr, res_type="sinc_fastest")
else: # mid
spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1)
spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'])
wave2 = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']))
sr = mp.param["band"][d + 1]["sr"]
if d == 1: # lower
spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
wave = librosa.resample(
spectrogram_to_wave(
spec_s,
bp["hl"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
),
bp["sr"],
sr,
res_type="sinc_fastest",
)
else: # mid
spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
wave2 = np.add(
wave,
spectrogram_to_wave(
spec_s,
bp["hl"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
),
)
# wave = librosa.core.resample(wave2, bp['sr'], sr, res_type="sinc_fastest")
wave = librosa.core.resample(wave2, bp['sr'], sr,res_type='scipy')
wave = librosa.core.resample(wave2, bp["sr"], sr, res_type="scipy")
return wave.T
@ -330,20 +437,46 @@ def fft_hp_filter(spec, bin_start, bin_stop):
g -= 1 / (bin_start - bin_stop)
spec[:, b, :] = g * spec[:, b, :]
spec[:, 0:bin_stop+1, :] *= 0
spec[:, 0 : bin_stop + 1, :] *= 0
return spec
def mirroring(a, spec_m, input_high_end, mp):
if 'mirroring' == a:
mirror = np.flip(np.abs(spec_m[:, mp.param['pre_filter_start']-10-input_high_end.shape[1]:mp.param['pre_filter_start']-10, :]), 1)
mirror = mirror * np.exp(1.j * np.angle(input_high_end))
if "mirroring" == a:
mirror = np.flip(
np.abs(
spec_m[
:,
mp.param["pre_filter_start"]
- 10
- input_high_end.shape[1] : mp.param["pre_filter_start"]
- 10,
:,
]
),
1,
)
mirror = mirror * np.exp(1.0j * np.angle(input_high_end))
return np.where(np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror)
return np.where(
np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror
)
if 'mirroring2' == a:
mirror = np.flip(np.abs(spec_m[:, mp.param['pre_filter_start']-10-input_high_end.shape[1]:mp.param['pre_filter_start']-10, :]), 1)
if "mirroring2" == a:
mirror = np.flip(
np.abs(
spec_m[
:,
mp.param["pre_filter_start"]
- 10
- input_high_end.shape[1] : mp.param["pre_filter_start"]
- 10,
:,
]
),
1,
)
mi = np.multiply(mirror, input_high_end * 1.7)
return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi)
@ -355,16 +488,17 @@ def ensembling(a, specs):
spec = specs[0]
ln = min([spec.shape[2], specs[i].shape[2]])
spec = spec[:,:,:ln]
specs[i] = specs[i][:,:,:ln]
spec = spec[:, :, :ln]
specs[i] = specs[i][:, :, :ln]
if 'min_mag' == a:
if "min_mag" == a:
spec = np.where(np.abs(specs[i]) <= np.abs(spec), specs[i], spec)
if 'max_mag' == a:
if "max_mag" == a:
spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec)
return spec
def stft(wave, nfft, hl):
wave_left = np.asfortranarray(wave[0])
wave_right = np.asfortranarray(wave[1])
@ -374,6 +508,7 @@ def stft(wave, nfft, hl):
return spec
def istft(spec, hl):
spec_left = np.asfortranarray(spec[0])
spec_right = np.asfortranarray(spec[1])
@ -391,20 +526,31 @@ if __name__ == "__main__":
from model_param_init import ModelParameters
p = argparse.ArgumentParser()
p.add_argument('--algorithm', '-a', type=str, choices=['invert', 'invert_p', 'min_mag', 'max_mag', 'deep', 'align'], default='min_mag')
p.add_argument('--model_params', '-m', type=str, default=os.path.join('modelparams', '1band_sr44100_hl512.json'))
p.add_argument('--output_name', '-o', type=str, default='output')
p.add_argument('--vocals_only', '-v', action='store_true')
p.add_argument('input', nargs='+')
p.add_argument(
"--algorithm",
"-a",
type=str,
choices=["invert", "invert_p", "min_mag", "max_mag", "deep", "align"],
default="min_mag",
)
p.add_argument(
"--model_params",
"-m",
type=str,
default=os.path.join("modelparams", "1band_sr44100_hl512.json"),
)
p.add_argument("--output_name", "-o", type=str, default="output")
p.add_argument("--vocals_only", "-v", action="store_true")
p.add_argument("input", nargs="+")
args = p.parse_args()
start_time = time.time()
if args.algorithm.startswith('invert') and len(args.input) != 2:
raise ValueError('There should be two input files.')
if args.algorithm.startswith("invert") and len(args.input) != 2:
raise ValueError("There should be two input files.")
if not args.algorithm.startswith('invert') and len(args.input) < 2:
raise ValueError('There must be at least two input files.')
if not args.algorithm.startswith("invert") and len(args.input) < 2:
raise ValueError("There must be at least two input files.")
wave, specs = {}, {}
mp = ModelParameters(args.model_params)
@ -412,39 +558,60 @@ if __name__ == "__main__":
for i in range(len(args.input)):
spec = {}
for d in range(len(mp.param['band']), 0, -1):
bp = mp.param['band'][d]
for d in range(len(mp.param["band"]), 0, -1):
bp = mp.param["band"][d]
if d == len(mp.param['band']): # high-end band
if d == len(mp.param["band"]): # high-end band
wave[d], _ = librosa.load(
args.input[i], bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
args.input[i],
bp["sr"],
False,
dtype=np.float32,
res_type=bp["res_type"],
)
if len(wave[d].shape) == 1: # mono to stereo
if len(wave[d].shape) == 1: # mono to stereo
wave[d] = np.array([wave[d], wave[d]])
else: # lower bands
wave[d] = librosa.resample(wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])
else: # lower bands
wave[d] = librosa.resample(
wave[d + 1],
mp.param["band"][d + 1]["sr"],
bp["sr"],
res_type=bp["res_type"],
)
spec[d] = wave_to_spectrogram(wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
spec[d] = wave_to_spectrogram(
wave[d],
bp["hl"],
bp["n_fft"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
)
specs[i] = combine_spectrograms(spec, mp)
del wave
if args.algorithm == 'deep':
if args.algorithm == "deep":
d_spec = np.where(np.abs(specs[0]) <= np.abs(spec[1]), specs[0], spec[1])
v_spec = d_spec - specs[1]
sf.write(os.path.join('{}.wav'.format(args.output_name)), cmb_spectrogram_to_wave(v_spec, mp), mp.param['sr'])
sf.write(
os.path.join("{}.wav".format(args.output_name)),
cmb_spectrogram_to_wave(v_spec, mp),
mp.param["sr"],
)
if args.algorithm.startswith('invert'):
if args.algorithm.startswith("invert"):
ln = min([specs[0].shape[2], specs[1].shape[2]])
specs[0] = specs[0][:,:,:ln]
specs[1] = specs[1][:,:,:ln]
specs[0] = specs[0][:, :, :ln]
specs[1] = specs[1][:, :, :ln]
if 'invert_p' == args.algorithm:
if "invert_p" == args.algorithm:
X_mag = np.abs(specs[0])
y_mag = np.abs(specs[1])
max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
v_spec = specs[1] - max_mag * np.exp(1.j * np.angle(specs[0]))
v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0]))
else:
specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2)
v_spec = specs[0] - specs[1]
@ -458,28 +625,43 @@ if __name__ == "__main__":
y_image = spectrogram_to_image(y_mag)
v_image = spectrogram_to_image(v_mag)
cv2.imwrite('{}_X.png'.format(args.output_name), X_image)
cv2.imwrite('{}_y.png'.format(args.output_name), y_image)
cv2.imwrite('{}_v.png'.format(args.output_name), v_image)
cv2.imwrite("{}_X.png".format(args.output_name), X_image)
cv2.imwrite("{}_y.png".format(args.output_name), y_image)
cv2.imwrite("{}_v.png".format(args.output_name), v_image)
sf.write('{}_X.wav'.format(args.output_name), cmb_spectrogram_to_wave(specs[0], mp), mp.param['sr'])
sf.write('{}_y.wav'.format(args.output_name), cmb_spectrogram_to_wave(specs[1], mp), mp.param['sr'])
sf.write(
"{}_X.wav".format(args.output_name),
cmb_spectrogram_to_wave(specs[0], mp),
mp.param["sr"],
)
sf.write(
"{}_y.wav".format(args.output_name),
cmb_spectrogram_to_wave(specs[1], mp),
mp.param["sr"],
)
sf.write('{}_v.wav'.format(args.output_name), cmb_spectrogram_to_wave(v_spec, mp), mp.param['sr'])
sf.write(
"{}_v.wav".format(args.output_name),
cmb_spectrogram_to_wave(v_spec, mp),
mp.param["sr"],
)
else:
if not args.algorithm == 'deep':
sf.write(os.path.join('ensembled','{}.wav'.format(args.output_name)), cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp), mp.param['sr'])
if args.algorithm == 'align':
if not args.algorithm == "deep":
sf.write(
os.path.join("ensembled", "{}.wav".format(args.output_name)),
cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp),
mp.param["sr"],
)
if args.algorithm == "align":
trackalignment = [
{
'file1':'"{}"'.format(args.input[0]),
'file2':'"{}"'.format(args.input[1])
"file1": '"{}"'.format(args.input[0]),
"file2": '"{}"'.format(args.input[1]),
}
]
for i,e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."):
for i, e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."):
os.system(f"python lib/align_tracks.py {e['file1']} {e['file2']}")
#print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1))
# print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1))

263
uvr5_pack/name_params.json Normal file
View File

@ -0,0 +1,263 @@
{
"equivalent" : [
{
"model_hash_name" : [
{
"hash_name": "47939caf0cfe52a0e81442b85b971dfd",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "4e4ecb9764c50a8c414fee6e10395bbe",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_v2.json",
"param_name": "4band_v2"
},
{
"hash_name": "ca106edd563e034bde0bdec4bb7a4b36",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_v2.json",
"param_name": "4band_v2"
},
{
"hash_name": "e60a1e84803ce4efc0a6551206cc4b71",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "a82f14e75892e55e994376edbf0c8435",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "6dd9eaa6f0420af9f1d403aaafa4cc06",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_v2_sn.json",
"param_name": "4band_v2_sn"
},
{
"hash_name": "08611fb99bd59eaa79ad27c58d137727",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_v2_sn.json",
"param_name": "4band_v2_sn"
},
{
"hash_name": "5c7bbca45a187e81abbbd351606164e5",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json",
"param_name": "3band_44100_msb2"
},
{
"hash_name": "d6b2cb685a058a091e5e7098192d3233",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json",
"param_name": "3band_44100_msb2"
},
{
"hash_name": "c1b9f38170a7c90e96f027992eb7c62b",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "c3448ec923fa0edf3d03a19e633faa53",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "68aa2c8093d0080704b200d140f59e54",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100.json",
"param_name": "3band_44100"
},
{
"hash_name": "fdc83be5b798e4bd29fe00fe6600e147",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_mid.json",
"param_name": "3band_44100_mid.json"
},
{
"hash_name": "2ce34bc92fd57f55db16b7a4def3d745",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_mid.json",
"param_name": "3band_44100_mid.json"
},
{
"hash_name": "52fdca89576f06cf4340b74a4730ee5f",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100.json"
},
{
"hash_name": "41191165b05d38fc77f072fa9e8e8a30",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100.json"
},
{
"hash_name": "89e83b511ad474592689e562d5b1f80e",
"model_params": "uvr5_pack/lib_v5/modelparams/2band_32000.json",
"param_name": "2band_32000.json"
},
{
"hash_name": "0b954da81d453b716b114d6d7c95177f",
"model_params": "uvr5_pack/lib_v5/modelparams/2band_32000.json",
"param_name": "2band_32000.json"
}
],
"v4 Models": [
{
"hash_name": "6a00461c51c2920fd68937d4609ed6c8",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json",
"param_name": "1band_sr16000_hl512"
},
{
"hash_name": "0ab504864d20f1bd378fe9c81ef37140",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
"param_name": "1band_sr32000_hl512"
},
{
"hash_name": "7dd21065bf91c10f7fccb57d7d83b07f",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
"param_name": "1band_sr32000_hl512"
},
{
"hash_name": "80ab74d65e515caa3622728d2de07d23",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
"param_name": "1band_sr32000_hl512"
},
{
"hash_name": "edc115e7fc523245062200c00caa847f",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json",
"param_name": "1band_sr33075_hl384"
},
{
"hash_name": "28063e9f6ab5b341c5f6d3c67f2045b7",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json",
"param_name": "1band_sr33075_hl384"
},
{
"hash_name": "b58090534c52cbc3e9b5104bad666ef2",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
"param_name": "1band_sr44100_hl512"
},
{
"hash_name": "0cdab9947f1b0928705f518f3c78ea8f",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
"param_name": "1band_sr44100_hl512"
},
{
"hash_name": "ae702fed0238afb5346db8356fe25f13",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json",
"param_name": "1band_sr44100_hl1024"
}
]
}
],
"User Models" : [
{
"1 Band": [
{
"hash_name": "1band_sr16000_hl512",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json",
"param_name": "1band_sr16000_hl512"
},
{
"hash_name": "1band_sr32000_hl512",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
"param_name": "1band_sr16000_hl512"
},
{
"hash_name": "1band_sr33075_hl384",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json",
"param_name": "1band_sr33075_hl384"
},
{
"hash_name": "1band_sr44100_hl256",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json",
"param_name": "1band_sr44100_hl256"
},
{
"hash_name": "1band_sr44100_hl512",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
"param_name": "1band_sr44100_hl512"
},
{
"hash_name": "1band_sr44100_hl1024",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json",
"param_name": "1band_sr44100_hl1024"
}
],
"2 Band": [
{
"hash_name": "2band_44100_lofi",
"model_params": "uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json",
"param_name": "2band_44100_lofi"
},
{
"hash_name": "2band_32000",
"model_params": "uvr5_pack/lib_v5/modelparams/2band_32000.json",
"param_name": "2band_32000"
},
{
"hash_name": "2band_48000",
"model_params": "uvr5_pack/lib_v5/modelparams/2band_48000.json",
"param_name": "2band_48000"
}
],
"3 Band": [
{
"hash_name": "3band_44100",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100.json",
"param_name": "3band_44100"
},
{
"hash_name": "3band_44100_mid",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_mid.json",
"param_name": "3band_44100_mid"
},
{
"hash_name": "3band_44100_msb2",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json",
"param_name": "3band_44100_msb2"
}
],
"4 Band": [
{
"hash_name": "4band_44100",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "4band_44100_mid",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_mid.json",
"param_name": "4band_44100_mid"
},
{
"hash_name": "4band_44100_msb",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_msb.json",
"param_name": "4band_44100_msb"
},
{
"hash_name": "4band_44100_msb2",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json",
"param_name": "4band_44100_msb2"
},
{
"hash_name": "4band_44100_reverse",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json",
"param_name": "4band_44100_reverse"
},
{
"hash_name": "4band_44100_sw",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_sw.json",
"param_name": "4band_44100_sw"
},
{
"hash_name": "4band_v2",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_v2.json",
"param_name": "4band_v2"
},
{
"hash_name": "4band_v2_sn",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_v2_sn.json",
"param_name": "4band_v2_sn"
},
{
"hash_name": "tmodelparam",
"model_params": "uvr5_pack/lib_v5/modelparams/tmodelparam.json",
"param_name": "User Model Param Set"
}
]
}
]
}

View File

@ -1,6 +1,15 @@
import torch
import numpy as np
from tqdm import tqdm
import json
def load_data(file_name: str = "./uvr5_pack/data.json") -> dict:
with open(file_name, "r") as f:
data = json.load(f)
return data
def make_padding(width, cropsize, offset):
left = offset
@ -10,12 +19,16 @@ def make_padding(width, cropsize, offset):
right = roi_size - (width % roi_size) + left
return left, right, roi_size
def inference(X_spec, device, model, aggressiveness,data):
'''
data dic configs
'''
def _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness,is_half=True):
def inference(X_spec, device, model, aggressiveness, data):
"""
data dic configs
"""
def _execute(
X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True
):
model.eval()
with torch.no_grad():
preds = []
@ -25,10 +38,13 @@ def inference(X_spec, device, model, aggressiveness,data):
total_iterations = sum(iterations)
for i in tqdm(range(n_window)):
start = i * roi_size
X_mag_window = X_mag_pad[None, :, :, start:start + data['window_size']]
X_mag_window = X_mag_pad[
None, :, :, start : start + data["window_size"]
]
X_mag_window = torch.from_numpy(X_mag_window)
if(is_half):X_mag_window=X_mag_window.half()
X_mag_window=X_mag_window.to(device)
if is_half:
X_mag_window = X_mag_window.half()
X_mag_window = X_mag_window.to(device)
pred = model.predict(X_mag_window, aggressiveness)
@ -50,193 +66,55 @@ def inference(X_spec, device, model, aggressiveness,data):
X_mag_pre = X_mag / coef
n_frame = X_mag_pre.shape[2]
pad_l, pad_r, roi_size = make_padding(n_frame,
data['window_size'], model.offset)
pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset)
n_window = int(np.ceil(n_frame / roi_size))
X_mag_pad = np.pad(
X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant')
X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
if(list(model.state_dict().values())[0].dtype==torch.float16):is_half=True
else:is_half=False
pred = _execute(X_mag_pad, roi_size, n_window,
device, model, aggressiveness,is_half)
if list(model.state_dict().values())[0].dtype == torch.float16:
is_half = True
else:
is_half = False
pred = _execute(
X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
)
pred = pred[:, :, :n_frame]
if data['tta']:
if data["tta"]:
pad_l += roi_size // 2
pad_r += roi_size // 2
n_window += 1
X_mag_pad = np.pad(
X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant')
X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
pred_tta = _execute(X_mag_pad, roi_size, n_window,
device, model, aggressiveness,is_half)
pred_tta = pred_tta[:, :, roi_size // 2:]
pred_tta = _execute(
X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
)
pred_tta = pred_tta[:, :, roi_size // 2 :]
pred_tta = pred_tta[:, :, :n_frame]
return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.j * X_phase)
return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase)
else:
return pred * coef, X_mag, np.exp(1.j * X_phase)
return pred * coef, X_mag, np.exp(1.0j * X_phase)
def _get_name_params(model_path , model_hash):
def _get_name_params(model_path, model_hash):
data = load_data()
flag = False
ModelName = model_path
if model_hash == '47939caf0cfe52a0e81442b85b971dfd':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if model_hash == '4e4ecb9764c50a8c414fee6e10395bbe':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2.json')
param_name_auto=str('4band_v2')
if model_hash == 'ca106edd563e034bde0bdec4bb7a4b36':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2.json')
param_name_auto=str('4band_v2')
if model_hash == 'e60a1e84803ce4efc0a6551206cc4b71':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if model_hash == 'a82f14e75892e55e994376edbf0c8435':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if model_hash == '6dd9eaa6f0420af9f1d403aaafa4cc06':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2_sn.json')
param_name_auto=str('4band_v2_sn')
if model_hash == '08611fb99bd59eaa79ad27c58d137727':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2_sn.json')
param_name_auto=str('4band_v2_sn')
if model_hash == '5c7bbca45a187e81abbbd351606164e5':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json')
param_name_auto=str('3band_44100_msb2')
if model_hash == 'd6b2cb685a058a091e5e7098192d3233':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json')
param_name_auto=str('3band_44100_msb2')
if model_hash == 'c1b9f38170a7c90e96f027992eb7c62b':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if model_hash == 'c3448ec923fa0edf3d03a19e633faa53':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if model_hash == '68aa2c8093d0080704b200d140f59e54':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100.json')
param_name_auto=str('3band_44100.json')
if model_hash == 'fdc83be5b798e4bd29fe00fe6600e147':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_mid.json')
param_name_auto=str('3band_44100_mid.json')
if model_hash == '2ce34bc92fd57f55db16b7a4def3d745':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_mid.json')
param_name_auto=str('3band_44100_mid.json')
if model_hash == '52fdca89576f06cf4340b74a4730ee5f':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100.json')
if model_hash == '41191165b05d38fc77f072fa9e8e8a30':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100.json')
if model_hash == '89e83b511ad474592689e562d5b1f80e':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_32000.json')
param_name_auto=str('2band_32000.json')
if model_hash == '0b954da81d453b716b114d6d7c95177f':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_32000.json')
param_name_auto=str('2band_32000.json')
for type in list(data):
for model in list(data[type][0]):
for i in range(len(data[type][0][model])):
if str(data[type][0][model][i]["hash_name"]) == model_hash:
flag = True
elif str(data[type][0][model][i]["hash_name"]) in ModelName:
flag = True
#v4 Models
if model_hash == '6a00461c51c2920fd68937d4609ed6c8':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json')
param_name_auto=str('1band_sr16000_hl512')
if model_hash == '0ab504864d20f1bd378fe9c81ef37140':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json')
param_name_auto=str('1band_sr32000_hl512')
if model_hash == '7dd21065bf91c10f7fccb57d7d83b07f':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json')
param_name_auto=str('1band_sr32000_hl512')
if model_hash == '80ab74d65e515caa3622728d2de07d23':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json')
param_name_auto=str('1band_sr32000_hl512')
if model_hash == 'edc115e7fc523245062200c00caa847f':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json')
param_name_auto=str('1band_sr33075_hl384')
if model_hash == '28063e9f6ab5b341c5f6d3c67f2045b7':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json')
param_name_auto=str('1band_sr33075_hl384')
if model_hash == 'b58090534c52cbc3e9b5104bad666ef2':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json')
param_name_auto=str('1band_sr44100_hl512')
if model_hash == '0cdab9947f1b0928705f518f3c78ea8f':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json')
param_name_auto=str('1band_sr44100_hl512')
if model_hash == 'ae702fed0238afb5346db8356fe25f13':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json')
param_name_auto=str('1band_sr44100_hl1024')
#User Models
#1 Band
if '1band_sr16000_hl512' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json')
param_name_auto=str('1band_sr16000_hl512')
if '1band_sr32000_hl512' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json')
param_name_auto=str('1band_sr32000_hl512')
if '1band_sr33075_hl384' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json')
param_name_auto=str('1band_sr33075_hl384')
if '1band_sr44100_hl256' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json')
param_name_auto=str('1band_sr44100_hl256')
if '1band_sr44100_hl512' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json')
param_name_auto=str('1band_sr44100_hl512')
if '1band_sr44100_hl1024' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json')
param_name_auto=str('1band_sr44100_hl1024')
#2 Band
if '2band_44100_lofi' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json')
param_name_auto=str('2band_44100_lofi')
if '2band_32000' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_32000.json')
param_name_auto=str('2band_32000')
if '2band_48000' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_48000.json')
param_name_auto=str('2band_48000')
#3 Band
if '3band_44100' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100.json')
param_name_auto=str('3band_44100')
if '3band_44100_mid' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_mid.json')
param_name_auto=str('3band_44100_mid')
if '3band_44100_msb2' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json')
param_name_auto=str('3band_44100_msb2')
#4 Band
if '4band_44100' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if '4band_44100_mid' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_mid.json')
param_name_auto=str('4band_44100_mid')
if '4band_44100_msb' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_msb.json')
param_name_auto=str('4band_44100_msb')
if '4band_44100_msb2' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json')
param_name_auto=str('4band_44100_msb2')
if '4band_44100_reverse' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json')
param_name_auto=str('4band_44100_reverse')
if '4band_44100_sw' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_sw.json')
param_name_auto=str('4band_44100_sw')
if '4band_v2' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2.json')
param_name_auto=str('4band_v2')
if '4band_v2_sn' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2_sn.json')
param_name_auto=str('4band_v2_sn')
if 'tmodelparam' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/tmodelparam.json')
param_name_auto=str('User Model Param Set')
return param_name_auto , model_params_auto
if flag:
model_params_auto = data[type][0][model][i]["model_params"]
param_name_auto = data[type][0][model][i]["param_name"]
if type == "equivalent":
return param_name_auto, model_params_auto
else:
flag = False
return param_name_auto, model_params_auto

View File

@ -1,36 +1,47 @@
import numpy as np,parselmouth,torch,pdb
import numpy as np, parselmouth, torch, pdb
from time import time as ttime
import torch.nn.functional as F
from config import x_pad,x_query,x_center,x_max
from config import x_pad, x_query, x_center, x_max
import scipy.signal as signal
import pyworld,os,traceback,faiss
class VC(object):
def __init__(self,tgt_sr,device,is_half):
self.sr=16000#hubert输入采样率
self.window=160#每帧点数
self.t_pad=self.sr*x_pad#每条前后pad时间
self.t_pad_tgt=tgt_sr*x_pad
self.t_pad2=self.t_pad*2
self.t_query=self.sr*x_query#查询切点前后查询时间
self.t_center=self.sr*x_center#查询切点位置
self.t_max=self.sr*x_max#免查询时长阈值
self.device=device
self.is_half=is_half
import pyworld, os, traceback, faiss
def get_f0(self,x, p_len,f0_up_key,f0_method,inp_f0=None):
class VC(object):
def __init__(self, tgt_sr, device, is_half):
self.sr = 16000 # hubert输入采样率
self.window = 160 # 每帧点数
self.t_pad = self.sr * x_pad # 每条前后pad时间
self.t_pad_tgt = tgt_sr * x_pad
self.t_pad2 = self.t_pad * 2
self.t_query = self.sr * x_query # 查询切点前后查询时间
self.t_center = self.sr * x_center # 查询切点位置
self.t_max = self.sr * x_max # 免查询时长阈值
self.device = device
self.is_half = is_half
def get_f0(self, x, p_len, f0_up_key, f0_method, inp_f0=None):
time_step = self.window / self.sr * 1000
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
if(f0_method=="pm"):
f0 = parselmouth.Sound(x, self.sr).to_pitch_ac(
time_step=time_step / 1000, voicing_threshold=0.6,
pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
pad_size=(p_len - len(f0) + 1) // 2
if(pad_size>0 or p_len - len(f0) - pad_size>0):
f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
elif(f0_method=="harvest"):
if f0_method == "pm":
f0 = (
parselmouth.Sound(x, self.sr)
.to_pitch_ac(
time_step=time_step / 1000,
voicing_threshold=0.6,
pitch_floor=f0_min,
pitch_ceiling=f0_max,
)
.selected_array["frequency"]
)
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad(
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
)
elif f0_method == "harvest":
f0, t = pyworld.harvest(
x.astype(np.double),
fs=self.sr,
@ -42,25 +53,45 @@ class VC(object):
f0 = signal.medfilt(f0, 3)
f0 *= pow(2, f0_up_key / 12)
# with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
tf0=self.sr//self.window#每秒f0点数
if (inp_f0 is not None):
delta_t=np.round((inp_f0[:,0].max()-inp_f0[:,0].min())*tf0+1).astype("int16")
replace_f0=np.interp(list(range(delta_t)), inp_f0[:, 0]*100, inp_f0[:, 1])
shape=f0[x_pad*tf0:x_pad*tf0+len(replace_f0)].shape[0]
f0[x_pad*tf0:x_pad*tf0+len(replace_f0)]=replace_f0[:shape]
tf0 = self.sr // self.window # 每秒f0点数
if inp_f0 is not None:
delta_t = np.round(
(inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
).astype("int16")
replace_f0 = np.interp(
list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
)
shape = f0[x_pad * tf0 : x_pad * tf0 + len(replace_f0)].shape[0]
f0[x_pad * tf0 : x_pad * tf0 + len(replace_f0)] = replace_f0[:shape]
# with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
f0bak = f0.copy()
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
f0_mel_max - f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
f0_coarse = np.rint(f0_mel).astype(np.int)
return f0_coarse, f0bak#1-0
return f0_coarse, f0bak # 1-0
def vc(self,model,net_g,sid,audio0,pitch,pitchf,times,index,big_npy,index_rate):#,file_index,file_big_npy
def vc(
self,
model,
net_g,
sid,
audio0,
pitch,
pitchf,
times,
index,
big_npy,
index_rate,
): # ,file_index,file_big_npy
feats = torch.from_numpy(audio0)
if(self.is_half):feats=feats.half()
else:feats=feats.float()
if self.is_half:
feats = feats.half()
else:
feats = feats.float()
if feats.dim() == 2: # double channels
feats = feats.mean(-1)
assert feats.dim() == 1, feats.dim()
@ -75,91 +106,196 @@ class VC(object):
t0 = ttime()
with torch.no_grad():
logits = model.extract_features(**inputs)
feats = model.final_proj(logits[0])
feats = model.final_proj(logits[0])
if(isinstance(index,type(None))==False and isinstance(big_npy,type(None))==False and index_rate!=0):
if (
isinstance(index, type(None)) == False
and isinstance(big_npy, type(None)) == False
and index_rate != 0
):
npy = feats[0].cpu().numpy()
if(self.is_half):npy=npy.astype("float32")
if self.is_half:
npy = npy.astype("float32")
_, I = index.search(npy, 1)
npy=big_npy[I.squeeze()]
if(self.is_half):npy=npy.astype("float16")
feats = torch.from_numpy(npy).unsqueeze(0).to(self.device)*index_rate + (1-index_rate)*feats
npy = big_npy[I.squeeze()]
if self.is_half:
npy = npy.astype("float16")
feats = (
torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+ (1 - index_rate) * feats
)
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
t1 = ttime()
p_len = audio0.shape[0]//self.window
if(feats.shape[1]<p_len):
p_len=feats.shape[1]
if(pitch!=None and pitchf!=None):
pitch=pitch[:,:p_len]
pitchf=pitchf[:,:p_len]
p_len=torch.tensor([p_len],device=self.device).long()
p_len = audio0.shape[0] // self.window
if feats.shape[1] < p_len:
p_len = feats.shape[1]
if pitch != None and pitchf != None:
pitch = pitch[:, :p_len]
pitchf = pitchf[:, :p_len]
p_len = torch.tensor([p_len], device=self.device).long()
with torch.no_grad():
if(pitch!=None and pitchf!=None):
audio1 = (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
if pitch != None and pitchf != None:
audio1 = (
(net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768)
.data.cpu()
.float()
.numpy()
.astype(np.int16)
)
else:
audio1 = (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
del feats,p_len,padding_mask
if torch.cuda.is_available(): torch.cuda.empty_cache()
audio1 = (
(net_g.infer(feats, p_len, sid)[0][0, 0] * 32768)
.data.cpu()
.float()
.numpy()
.astype(np.int16)
)
del feats, p_len, padding_mask
if torch.cuda.is_available():
torch.cuda.empty_cache()
t2 = ttime()
times[0] += (t1 - t0)
times[2] += (t2 - t1)
times[0] += t1 - t0
times[2] += t2 - t1
return audio1
def pipeline(self,model,net_g,sid,audio,times,f0_up_key,f0_method,file_index,file_big_npy,index_rate,if_f0,f0_file=None):
if(file_big_npy!=""and file_index!=""and os.path.exists(file_big_npy)==True and os.path.exists(file_index)==True and index_rate!=0):
def pipeline(
self,
model,
net_g,
sid,
audio,
times,
f0_up_key,
f0_method,
file_index,
file_big_npy,
index_rate,
if_f0,
f0_file=None,
):
if (
file_big_npy != ""
and file_index != ""
and os.path.exists(file_big_npy) == True
and os.path.exists(file_index) == True
and index_rate != 0
):
try:
index = faiss.read_index(file_index)
big_npy = np.load(file_big_npy)
except:
traceback.print_exc()
index=big_npy=None
index = big_npy = None
else:
index=big_npy=None
audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode='reflect')
index = big_npy = None
audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
opt_ts = []
if(audio_pad.shape[0]>self.t_max):
if audio_pad.shape[0] > self.t_max:
audio_sum = np.zeros_like(audio)
for i in range(self.window): audio_sum += audio_pad[i:i - self.window]
for t in range(self.t_center, audio.shape[0],self.t_center):opt_ts.append(t - self.t_query + np.where(np.abs(audio_sum[t - self.t_query:t + self.t_query]) == np.abs(audio_sum[t - self.t_query:t + self.t_query]).min())[0][0])
for i in range(self.window):
audio_sum += audio_pad[i : i - self.window]
for t in range(self.t_center, audio.shape[0], self.t_center):
opt_ts.append(
t
- self.t_query
+ np.where(
np.abs(audio_sum[t - self.t_query : t + self.t_query])
== np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
)[0][0]
)
s = 0
audio_opt=[]
t=None
t1=ttime()
audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode='reflect')
p_len=audio_pad.shape[0]//self.window
inp_f0=None
if(hasattr(f0_file,'name') ==True):
audio_opt = []
t = None
t1 = ttime()
audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
p_len = audio_pad.shape[0] // self.window
inp_f0 = None
if hasattr(f0_file, "name") == True:
try:
with open(f0_file.name,"r")as f:
lines=f.read().strip("\n").split("\n")
inp_f0=[]
for line in lines:inp_f0.append([float(i)for i in line.split(",")])
inp_f0=np.array(inp_f0,dtype="float32")
with open(f0_file.name, "r") as f:
lines = f.read().strip("\n").split("\n")
inp_f0 = []
for line in lines:
inp_f0.append([float(i) for i in line.split(",")])
inp_f0 = np.array(inp_f0, dtype="float32")
except:
traceback.print_exc()
sid=torch.tensor(sid,device=self.device).unsqueeze(0).long()
pitch, pitchf=None,None
if(if_f0==1):
pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key,f0_method,inp_f0)
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
pitch, pitchf = None, None
if if_f0 == 1:
pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, inp_f0)
pitch = pitch[:p_len]
pitchf = pitchf[:p_len]
pitch = torch.tensor(pitch,device=self.device).unsqueeze(0).long()
pitchf = torch.tensor(pitchf,device=self.device).unsqueeze(0).float()
t2=ttime()
times[1] += (t2 - t1)
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
t2 = ttime()
times[1] += t2 - t1
for t in opt_ts:
t=t//self.window*self.window
if (if_f0 == 1):
audio_opt.append(self.vc(model,net_g,sid,audio_pad[s:t+self.t_pad2+self.window],pitch[:,s//self.window:(t+self.t_pad2)//self.window],pitchf[:,s//self.window:(t+self.t_pad2)//self.window],times,index,big_npy,index_rate)[self.t_pad_tgt:-self.t_pad_tgt])
t = t // self.window * self.window
if if_f0 == 1:
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[s : t + self.t_pad2 + self.window],
pitch[:, s // self.window : (t + self.t_pad2) // self.window],
pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
times,
index,
big_npy,
index_rate,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
else:
audio_opt.append(self.vc(model,net_g,sid,audio_pad[s:t+self.t_pad2+self.window],None,None,times,index,big_npy,index_rate)[self.t_pad_tgt:-self.t_pad_tgt])
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[s : t + self.t_pad2 + self.window],
None,
None,
times,
index,
big_npy,
index_rate,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
s = t
if (if_f0 == 1):
audio_opt.append(self.vc(model,net_g,sid,audio_pad[t:],pitch[:,t//self.window:]if t is not None else pitch,pitchf[:,t//self.window:]if t is not None else pitchf,times,index,big_npy,index_rate)[self.t_pad_tgt:-self.t_pad_tgt])
if if_f0 == 1:
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[t:],
pitch[:, t // self.window :] if t is not None else pitch,
pitchf[:, t // self.window :] if t is not None else pitchf,
times,
index,
big_npy,
index_rate,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
else:
audio_opt.append(self.vc(model,net_g,sid,audio_pad[t:],None,None,times,index,big_npy,index_rate)[self.t_pad_tgt:-self.t_pad_tgt])
audio_opt=np.concatenate(audio_opt)
del pitch,pitchf,sid
if torch.cuda.is_available(): torch.cuda.empty_cache()
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[t:],
None,
None,
times,
index,
big_npy,
index_rate,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
audio_opt = np.concatenate(audio_opt)
del pitch, pitchf, sid
if torch.cuda.is_available():
torch.cuda.empty_cache()
return audio_opt

View File

@ -1,16 +1,18 @@
import locale
import json
def load_language_list(language):
with open(f"./locale/{language}.json", "r", encoding="utf-8") as f:
language_list = json.load(f)
return language_list
class I18nAuto:
def __init__(self, language=None):
if language is None:
language = 'auto'
if language == 'auto':
language = "auto"
if language == "auto":
language = locale.getdefaultlocale()[0]
self.language = language
print("Use Language:", language)