
Add files via upload

Commit f1730d42d4 by RVC-Boss, 2023-05-28 22:58:33 +08:00 (committed by GitHub)
Parent: 7789c46ded
No known key found for this signature in database (GPG Key ID: 4AEE18F83AFDEB23)

4 changed files with 455 additions and 56 deletions

MDXNet.py (new file, 198 lines)

@@ -0,0 +1,198 @@
import soundfile as sf
import torch, pdb, time, argparse, os, warnings, sys, librosa
import numpy as np
import onnxruntime as ort
from scipy.io.wavfile import write
from tqdm import tqdm
import torch.nn as nn
dim_c = 4


class Conv_TDF_net_trim:
    def __init__(self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024):
        super(Conv_TDF_net_trim, self).__init__()

        self.dim_f = dim_f
        self.dim_t = 2**dim_t
        self.n_fft = n_fft
        self.hop = hop
        self.n_bins = self.n_fft // 2 + 1
        self.chunk_size = hop * (self.dim_t - 1)
        self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to(device)
        self.target_name = target_name
        self.blender = "blender" in model_name

        out_c = dim_c * 4 if target_name == "*" else dim_c
        self.freq_pad = torch.zeros([1, out_c, self.n_bins - self.dim_f, self.dim_t]).to(device)

        self.n = L // 2

    def stft(self, x):
        x = x.reshape([-1, self.chunk_size])
        x = torch.stft(
            x,
            n_fft=self.n_fft,
            hop_length=self.hop,
            window=self.window,
            center=True,
            return_complex=True,
        )
        x = torch.view_as_real(x)
        x = x.permute([0, 3, 1, 2])
        # pack (stereo, re/im) into dim_c = 4 channels
        x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape(
            [-1, dim_c, self.n_bins, self.dim_t]
        )
        return x[:, :, : self.dim_f]

    def istft(self, x, freq_pad=None):
        freq_pad = self.freq_pad.repeat([x.shape[0], 1, 1, 1]) if freq_pad is None else freq_pad
        x = torch.cat([x, freq_pad], -2)
        c = 4 * 2 if self.target_name == "*" else 2
        x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape(
            [-1, 2, self.n_bins, self.dim_t]
        )
        x = x.permute([0, 2, 3, 1])
        x = x.contiguous()
        x = torch.view_as_complex(x)
        x = torch.istft(x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True)
        return x.reshape([-1, c, self.chunk_size])
def get_models(device, dim_f, dim_t, n_fft):
    return Conv_TDF_net_trim(
        device=device,
        model_name="Conv-TDF",
        target_name="vocals",
        L=11,
        dim_f=dim_f,
        dim_t=dim_t,
        n_fft=n_fft,
    )


warnings.filterwarnings("ignore")
cpu = torch.device("cpu")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class Predictor:
    def __init__(self, args):
        self.args = args
        self.model_ = get_models(device=cpu, dim_f=args.dim_f, dim_t=args.dim_t, n_fft=args.n_fft)
        self.model = ort.InferenceSession(
            os.path.join(args.onnx, self.model_.target_name + ".onnx"),
            providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
        )
        print("onnx load done")

    def demix(self, mix):
        samples = mix.shape[-1]
        margin = self.args.margin
        chunk_size = self.args.chunks * 44100  # args.chunks is in seconds of 44.1 kHz audio
        assert margin != 0, "margin cannot be zero!"
        if margin > chunk_size:
            margin = chunk_size

        segmented_mix = {}
        if self.args.chunks == 0 or samples < chunk_size:
            chunk_size = samples

        counter = -1
        for skip in range(0, samples, chunk_size):
            counter += 1
            s_margin = 0 if counter == 0 else margin
            end = min(skip + chunk_size + margin, samples)
            start = skip - s_margin
            segmented_mix[skip] = mix[:, start:end].copy()
            if end == samples:
                break

        sources = self.demix_base(segmented_mix, margin_size=margin)
        """
        mix: (2, big_sample)
        segmented_mix: offset -> (2, small_sample)
        sources: (1, 2, big_sample)
        """
        return sources
    def demix_base(self, mixes, margin_size):
        chunked_sources = []
        progress_bar = tqdm(total=len(mixes))
        progress_bar.set_description("Processing")
        for mix in mixes:
            cmix = mixes[mix]
            sources = []
            n_sample = cmix.shape[1]
            model = self.model_
            trim = model.n_fft // 2
            gen_size = model.chunk_size - 2 * trim
            pad = gen_size - n_sample % gen_size
            # pad so the chunk splits evenly into overlapping windows
            mix_p = np.concatenate(
                (np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1
            )
            mix_waves = []
            i = 0
            while i < n_sample + pad:
                waves = np.array(mix_p[:, i : i + model.chunk_size])
                mix_waves.append(waves)
                i += gen_size
            mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(cpu)
            with torch.no_grad():
                _ort = self.model
                spek = model.stft(mix_waves)
                if self.args.denoise:
                    # average the predictions on the spectrogram and its negation
                    spec_pred = (
                        -_ort.run(None, {"input": -spek.cpu().numpy()})[0] * 0.5
                        + _ort.run(None, {"input": spek.cpu().numpy()})[0] * 0.5
                    )
                    tar_waves = model.istft(torch.tensor(spec_pred))
                else:
                    tar_waves = model.istft(
                        torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0])
                    )
            tar_signal = (
                tar_waves[:, :, trim:-trim].transpose(0, 1).reshape(2, -1).numpy()[:, :-pad]
            )

            start = 0 if mix == 0 else margin_size
            end = None if mix == list(mixes.keys())[::-1][0] else -margin_size
            if margin_size == 0:
                end = None
            sources.append(tar_signal[:, start:end])

            progress_bar.update(1)
            chunked_sources.append(sources)
        _sources = np.concatenate(chunked_sources, axis=-1)
        # del self.model
        progress_bar.close()
        return _sources
    def prediction(self, m, vocal_root, others_root):
        os.makedirs(vocal_root, exist_ok=True)
        os.makedirs(others_root, exist_ok=True)
        basename = os.path.basename(m)
        mix, rate = librosa.load(m, mono=False, sr=44100)
        if mix.ndim == 1:
            mix = np.asfortranarray([mix, mix])
        mix = mix.T
        sources = self.demix(mix.T)
        opt = sources[0].T
        sf.write("%s/%s_main_vocal.wav" % (vocal_root, basename), mix - opt, rate)
        sf.write("%s/%s_others.wav" % (others_root, basename), opt, rate)
class MDXNetDereverb:
    def __init__(self, chunks):
        self.onnx = "uvr5_weights/onnx_dereverb_By_FoxJoy"
        self.shifts = 10  # 'Predict with randomised equivariant stabilisation'
        self.mixing = "min_mag"  # ['default', 'min_mag', 'max_mag']
        self.chunks = chunks
        self.margin = 44100
        self.dim_t = 9
        self.dim_f = 3072
        self.n_fft = 6144
        self.denoise = True
        self.pred = Predictor(self)  # this object doubles as Predictor's args namespace

    def _path_audio_(self, input, vocal_root, others_root):
        self.pred.prediction(input, vocal_root, others_root)
if __name__ == "__main__":
    dereverb = MDXNetDereverb(15)
    from time import time as ttime

    t0 = ttime()
    dereverb._path_audio_(
        "雪雪伴奏对消HP5.wav",
        "vocal",
        "others",
    )
    t1 = ttime()
    print(t1 - t0)
'''
runtime\python.exe MDXNet.py
6G:
15/9:0.8G->6.8G
14:0.8G->6.5G
25:
half15:0.7G->6.6G,22.69s
fp32-15:0.7G->6.6G,20.85s
'''
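A note on the wiring above: MDXNetDereverb passes itself to Predictor as the args namespace, so its attributes (chunks, margin, denoise, onnx, dim_f, dim_t, n_fft) are read back as args.*. A minimal driver sketch, assuming the FoxJoy ONNX weights are in place at uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx (the input path is a placeholder):

from MDXNet import MDXNetDereverb

dereverb = MDXNetDereverb(15)  # process 15-second chunks with a 1 s margin
dereverb._path_audio_(
    "input.wav",   # placeholder input path
    "opt/vocal",   # receives <name>_main_vocal.wav (mix minus prediction)
    "opt/others",  # receives <name>_others.wav (the predicted source)
)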

infer-web.py (modified)

@@ -21,6 +21,7 @@ warnings.filterwarnings("ignore")
 torch.manual_seed(114514)
 from i18n import I18nAuto
 import ffmpeg
+from MDXNet import MDXNetDereverb

 i18n = I18nAuto()
 i18n.print()
@@ -82,7 +83,7 @@ import gradio as gr
 import logging
 from vc_infer_pipeline import VC
 from config import Config
-from infer_uvr5 import _audio_pre_
+from infer_uvr5 import _audio_pre_, _audio_pre_new
 from my_utils import load_audio
 from train.process_ckpt import show_info, change_info, merge, extract_small_model
@@ -133,7 +134,7 @@ for root, dirs, files in os.walk(index_root, topdown=False):
         index_paths.append("%s/%s" % (root, name))
 uvr5_names = []
 for name in os.listdir(weight_uvr5_root):
-    if name.endswith(".pth"):
+    if name.endswith(".pth") or "onnx" in name:
         uvr5_names.append(name.replace(".pth", ""))
@@ -150,6 +151,7 @@ def vc_single(
     filter_radius,
     resample_sr,
     rms_mix_rate,
+    protect
 ):  # spk_item, input_audio0, vc_transform0, f0_file, f0method0
     global tgt_sr, net_g, vc, hubert_model, version
     if input_audio_path is None:
@@ -197,6 +199,7 @@ def vc_single(
             resample_sr,
             rms_mix_rate,
             version,
+            protect,
             f0_file=f0_file,
         )
         if resample_sr >= 16000 and tgt_sr != resample_sr:
@@ -232,6 +235,7 @@ def vc_multi(
     filter_radius,
     resample_sr,
     rms_mix_rate,
+    protect
 ):
     try:
         dir_path = (
@@ -262,6 +266,7 @@ def vc_multi(
                 filter_radius,
                 resample_sr,
                 rms_mix_rate,
+                protect
             )
             if "Success" in info:
                 try:
@@ -288,12 +293,16 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg):
     save_root_ins = (
         save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
     )
-    pre_fun = _audio_pre_(
-        agg=int(agg),
-        model_path=os.path.join(weight_uvr5_root, model_name + ".pth"),
-        device=config.device,
-        is_half=config.is_half,
-    )
+    if model_name == "onnx_dereverb_By_FoxJoy":
+        pre_fun = MDXNetDereverb(15)
+    else:
+        func = _audio_pre_ if "DeEcho" not in model_name else _audio_pre_new
+        pre_fun = func(
+            agg=int(agg),
+            model_path=os.path.join(weight_uvr5_root, model_name + ".pth"),
+            device=config.device,
+            is_half=config.is_half,
+        )
     if inp_root != "":
         paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)]
     else:
@@ -336,8 +345,12 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg):
         yield "\n".join(infos)
     finally:
         try:
-            del pre_fun.model
-            del pre_fun
+            if model_name == "onnx_dereverb_By_FoxJoy":
+                del pre_fun.pred.model
+                del pre_fun.pred.model_
+            else:
+                del pre_fun.model
+                del pre_fun
         except:
             traceback.print_exc()
         print("clean_empty_cache")
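The split teardown matters because the two model families keep their weights in different places. A standalone restatement of the finally-block logic (a hypothetical helper, not part of the commit):

import torch

def release_uvr_model(pre_fun, model_name: str) -> None:
    # Hypothetical helper restating the finally-block above.
    if model_name == "onnx_dereverb_By_FoxJoy":
        # MDXNetDereverb keeps its ONNX session (.model) and the
        # STFT helper (.model_) on the nested Predictor, pre_fun.pred.
        del pre_fun.pred.model
        del pre_fun.pred.model_
    else:
        # The VR-architecture wrappers keep the torch model on .model.
        del pre_fun.model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # mirrors the "clean_empty_cache" step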
@@ -790,7 +803,7 @@ def train_index(exp_dir1, version19):
         faiss.write_index(
             index,
             "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index"
-            % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
+            % (exp_dir, n_ivf, index_ivf.nprobe,exp_dir1, version19),
         )
     # faiss.write_index(index, '%s/trained_IVF%s_Flat_FastScan_%s.index'%(exp_dir,n_ivf,version19))
     infos.append("adding")
@@ -801,11 +814,11 @@ def train_index(exp_dir1, version19):
     faiss.write_index(
         index,
         "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index"
-        % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
+        % (exp_dir, n_ivf, index_ivf.nprobe,exp_dir1, version19),
     )
     infos.append(
         "成功构建索引added_IVF%s_Flat_nprobe_%s_%s_%s.index"
-        % (n_ivf, index_ivf.nprobe, exp_dir1, version19)
+        % (n_ivf, index_ivf.nprobe,exp_dir1, version19)
     )
     # faiss.write_index(index, '%s/added_IVF%s_Flat_FastScan_%s.index'%(exp_dir,n_ivf,version19))
     # infos.append("成功构建索引added_IVF%s_Flat_FastScan_%s.index"%(n_ivf,version19))
@@ -1030,7 +1043,7 @@ def train1key(
     faiss.write_index(
         index,
         "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index"
-        % (model_log_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
+        % (model_log_dir, n_ivf, index_ivf.nprobe,exp_dir1, version19),
     )
     yield get_info_str("adding index")
     batch_size_add = 8192
@@ -1039,11 +1052,11 @@ def train1key(
     faiss.write_index(
         index,
         "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index"
-        % (model_log_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
+        % (model_log_dir, n_ivf, index_ivf.nprobe,exp_dir1, version19),
     )
     yield get_info_str(
         "成功构建索引, added_IVF%s_Flat_nprobe_%s_%s_%s.index"
-        % (n_ivf, index_ivf.nprobe, exp_dir1, version19)
+        % (n_ivf, index_ivf.nprobe, exp_dir1,version19)
     )
     yield get_info_str(i18n("全流程结束!"))
@@ -1161,8 +1174,8 @@ with gr.Blocks() as app:
                     value="E:\\codes\\py39\\test-20230416b\\todo-songs\\冬之花clip1.wav",
                 )
                 f0method0 = gr.Radio(
-                    label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比"),
-                    choices=["pm", "harvest"],
+                    label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU"),
+                    choices=["pm", "harvest", "crepe"],
                     value="pm",
                     interactive=True,
                 )
@@ -1197,9 +1210,10 @@ with gr.Blocks() as app:
                     minimum=0,
                     maximum=1,
                     label=i18n("检索特征占比"),
-                    value=0.76,
+                    value=0.88,
                     interactive=True,
                 )
+                with gr.Column():
                     resample_sr0 = gr.Slider(
                         minimum=0,
                         maximum=48000,
@@ -1215,9 +1229,17 @@ with gr.Blocks() as app:
                         value=1,
                         interactive=True,
                     )
+                    protect0 = gr.Slider(
+                        minimum=0,
+                        maximum=0.5,
+                        label=i18n("保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"),
+                        value=0.33,
+                        step=0.01,
+                        interactive=True,
+                    )
                 f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"))
                 but0 = gr.Button(i18n("转换"), variant="primary")
-                with gr.Column():
+                with gr.Row():
                     vc_output1 = gr.Textbox(label=i18n("输出信息"))
                     vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)"))
                 but0.click(
@@ -1235,6 +1257,7 @@ with gr.Blocks() as app:
                         filter_radius0,
                         resample_sr0,
                         rms_mix_rate0,
+                        protect0
                     ],
                     [vc_output1, vc_output2],
                 )
@@ -1249,8 +1272,8 @@ with gr.Blocks() as app:
                 )
                 opt_input = gr.Textbox(label=i18n("指定输出文件夹"), value="opt")
                 f0method1 = gr.Radio(
-                    label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比"),
-                    choices=["pm", "harvest"],
+                    label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU"),
+                    choices=["pm", "harvest", "crepe"],
                     value="pm",
                     interactive=True,
                 )
@@ -1285,6 +1308,7 @@ with gr.Blocks() as app:
                     value=1,
                     interactive=True,
                 )
+                with gr.Column():
                     resample_sr1 = gr.Slider(
                         minimum=0,
                         maximum=48000,
@@ -1300,6 +1324,14 @@ with gr.Blocks() as app:
                         value=1,
                         interactive=True,
                     )
+                    protect1 = gr.Slider(
+                        minimum=0,
+                        maximum=0.5,
+                        label=i18n("保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"),
+                        value=0.33,
+                        step=0.01,
+                        interactive=True,
+                    )
                 with gr.Column():
                     dir_input = gr.Textbox(
                         label=i18n("输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)"),
@@ -1308,8 +1340,9 @@ with gr.Blocks() as app:
                     inputs = gr.File(
                         file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹")
                     )
-                but1 = gr.Button(i18n("转换"), variant="primary")
-                vc_output3 = gr.Textbox(label=i18n("输出信息"))
+                with gr.Row():
+                    but1 = gr.Button(i18n("转换"), variant="primary")
+                    vc_output3 = gr.Textbox(label=i18n("输出信息"))
                 but1.click(
                     vc_multi,
                     [
@@ -1326,14 +1359,26 @@ with gr.Blocks() as app:
                         filter_radius1,
                         resample_sr1,
                         rms_mix_rate1,
+                        protect1
                     ],
                     [vc_output3],
                 )
-        with gr.TabItem(i18n("伴奏人声分离")):
+        with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")):
             with gr.Group():
                 gr.Markdown(
                     value=i18n(
-                        "人声伴奏分离批量处理, 使用UVR5模型. <br>不带和声用HP2, 带和声且提取的人声不需要和声用HP5<br>合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)"
+                        "人声伴奏分离批量处理, 使用UVR5模型。<br>"
+                        "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。<br>"
+                        "模型分为三类:<br>"
+                        "1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;<br>"
+                        "2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;<br>"
+                        "3、去混响、去延迟模型(by FoxJoy):<br>"
+                        "  (1)MDX-Net:对于双通道混响是最好的选择,不能去除单通道混响;<br>"
+                        "  (234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。<br>"
+                        "去混响/去延迟,附:<br>"
+                        "1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;<br>"
+                        "2、MDX-Net-Dereverb模型挺慢的;<br>"
+                        "3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。"
                     )
                 )
                 with gr.Row():
@@ -1384,7 +1429,7 @@ with gr.Blocks() as app:
         exp_dir1 = gr.Textbox(label=i18n("输入实验名"), value="mi-test")
         sr2 = gr.Radio(
             label=i18n("目标采样率"),
-            choices=["32k", "40k", "48k"],
+            choices=["40k", "48k"],
             value="40k",
             interactive=True,
         )

infer_uvr5.py (modified)

@@ -1,5 +1,7 @@
 import os, sys, torch, warnings, pdb
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+from json import load as ll
 warnings.filterwarnings("ignore")
 import librosa
 import importlib
@@ -10,7 +12,8 @@ from uvr5_pack.lib_v5 import spec_utils
 from uvr5_pack.utils import _get_name_params, inference
 from uvr5_pack.lib_v5.model_param_init import ModelParameters
 from scipy.io import wavfile
+from uvr5_pack.lib_v5.nets_new import CascadedNet
+from uvr5_pack.lib_v5 import nets_61968KB as nets

 class _audio_pre_:
     def __init__(self, agg, model_path, device, is_half):
@@ -25,28 +28,7 @@ class _audio_pre_:
             "agg": agg,
             "high_end_process": "mirroring",
         }
-        nn_arch_sizes = [
-            31191,  # default
-            33966,
-            61968,
-            123821,
-            123812,
-            537238,  # custom
-        ]
-        self.nn_architecture = list("{}KB".format(s) for s in nn_arch_sizes)
-        model_size = math.ceil(os.stat(model_path).st_size / 1024)
-        nn_architecture = "{}KB".format(
-            min(nn_arch_sizes, key=lambda x: abs(x - model_size))
-        )
-        nets = importlib.import_module(
-            "uvr5_pack.lib_v5.nets"
-            + f"_{nn_architecture}".replace("_{}KB".format(nn_arch_sizes[0]), ""),
-            package=None,
-        )
-        model_hash = hashlib.md5(open(model_path, "rb").read()).hexdigest()
-        param_name, model_params_d = _get_name_params(model_path, model_hash)
-        mp = ModelParameters(model_params_d)
+        mp = ModelParameters("uvr5_pack/lib_v5/modelparams/4band_v2.json")
         model = nets.CascadedASPPNet(mp.param["bins"] * 2)
         cpk = torch.load(model_path, map_location="cpu")
         model.load_state_dict(cpk)
@@ -164,12 +146,148 @@ class _audio_pre_:
             (np.array(wav_vocals) * 32768).astype("int16"),
         )

+class _audio_pre_new:
+    def __init__(self, agg, model_path, device, is_half):
+        self.model_path = model_path
+        self.device = device
+        self.data = {
+            # Processing Options
+            "postprocess": False,
+            "tta": False,
+            # Constants
+            "window_size": 512,
+            "agg": agg,
+            "high_end_process": "mirroring",
+        }
+        mp = ModelParameters("uvr5_pack/lib_v5/modelparams/4band_v3.json")
+        nout = 64 if "DeReverb" in model_path else 48
+        model = CascadedNet(mp.param["bins"] * 2, nout)
+        cpk = torch.load(model_path, map_location="cpu")
+        model.load_state_dict(cpk)
+        model.eval()
+        if is_half:
+            model = model.half().to(device)
+        else:
+            model = model.to(device)
+        self.mp = mp
+        self.model = model
+
+    def _path_audio_(self, music_file, vocal_root=None, ins_root=None):  # for the 3 VR models, vocal and ins are swapped
+        if ins_root is None and vocal_root is None:
+            return "No save root."
+        name = os.path.basename(music_file)
+        if ins_root is not None:
+            os.makedirs(ins_root, exist_ok=True)
+        if vocal_root is not None:
+            os.makedirs(vocal_root, exist_ok=True)
+        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
+        bands_n = len(self.mp.param["band"])
+        # print(bands_n)
+        for d in range(bands_n, 0, -1):
+            bp = self.mp.param["band"][d]
+            if d == bands_n:  # high-end band
+                (
+                    X_wave[d],
+                    _,
+                ) = librosa.core.load(  # in theory librosa may mis-read some audio; ffmpeg would be safer but was too much hassle
+                    music_file,
+                    bp["sr"],
+                    False,
+                    dtype=np.float32,
+                    res_type=bp["res_type"],
+                )
+                if X_wave[d].ndim == 1:
+                    X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
+            else:  # lower bands
+                X_wave[d] = librosa.core.resample(
+                    X_wave[d + 1],
+                    self.mp.param["band"][d + 1]["sr"],
+                    bp["sr"],
+                    res_type=bp["res_type"],
+                )
+            # Stft of wave source
+            X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
+                X_wave[d],
+                bp["hl"],
+                bp["n_fft"],
+                self.mp.param["mid_side"],
+                self.mp.param["mid_side_b2"],
+                self.mp.param["reverse"],
+            )
+            # pdb.set_trace()
+            if d == bands_n and self.data["high_end_process"] != "none":
+                input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
+                    self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
+                )
+                input_high_end = X_spec_s[d][
+                    :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
+                ]
+        X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
+        aggresive_set = float(self.data["agg"] / 100)
+        aggressiveness = {
+            "value": aggresive_set,
+            "split_bin": self.mp.param["band"][1]["crop_stop"],
+        }
+        with torch.no_grad():
+            pred, X_mag, X_phase = inference(
+                X_spec_m, self.device, self.model, aggressiveness, self.data
+            )
+        # Postprocess
+        if self.data["postprocess"]:
+            pred_inv = np.clip(X_mag - pred, 0, np.inf)
+            pred = spec_utils.mask_silence(pred, pred_inv)
+        y_spec_m = pred * X_phase
+        v_spec_m = X_spec_m - y_spec_m
+        if ins_root is not None:
+            if self.data["high_end_process"].startswith("mirroring"):
+                input_high_end_ = spec_utils.mirroring(
+                    self.data["high_end_process"], y_spec_m, input_high_end, self.mp
+                )
+                wav_instrument = spec_utils.cmb_spectrogram_to_wave(
+                    y_spec_m, self.mp, input_high_end_h, input_high_end_
+                )
+            else:
+                wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
+            print("%s instruments done" % name)
+            wavfile.write(
+                os.path.join(
+                    ins_root, "main_vocal_{}_{}.wav".format(name, self.data["agg"])
+                ),
+                self.mp.param["sr"],
+                (np.array(wav_instrument) * 32768).astype("int16"),
+            )  #
+        if vocal_root is not None:
+            if self.data["high_end_process"].startswith("mirroring"):
+                input_high_end_ = spec_utils.mirroring(
+                    self.data["high_end_process"], v_spec_m, input_high_end, self.mp
+                )
+                wav_vocals = spec_utils.cmb_spectrogram_to_wave(
+                    v_spec_m, self.mp, input_high_end_h, input_high_end_
+                )
+            else:
+                wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
+            print("%s vocals done" % name)
+            wavfile.write(
+                os.path.join(
+                    vocal_root, "others_{}_{}.wav".format(name, self.data["agg"])
+                ),
+                self.mp.param["sr"],
+                (np.array(wav_vocals) * 32768).astype("int16"),
+            )
 if __name__ == "__main__":
     device = "cuda"
     is_half = True
-    model_path = "uvr5_weights/2_HP-UVR.pth"
-    pre_fun = _audio_pre_(model_path=model_path, device=device, is_half=True)
-    audio_path = "神女劈观.aac"
+    # model_path = "uvr5_weights/2_HP-UVR.pth"
+    # model_path = "uvr5_weights/VR-DeEchoDeReverb.pth"
+    # model_path = "uvr5_weights/VR-DeEchoNormal.pth"
+    model_path = "uvr5_weights/DeEchoNormal.pth"
+    # pre_fun = _audio_pre_(model_path=model_path, device=device, is_half=True, agg=10)
+    pre_fun = _audio_pre_new(model_path=model_path, device=device, is_half=True, agg=10)
+    audio_path = "雪雪伴奏对消HP5.wav"
     save_path = "opt"
     pre_fun._path_audio_(audio_path, save_path, save_path)

vc_infer_pipeline.py (modified)

@@ -2,7 +2,7 @@ import numpy as np, parselmouth, torch, pdb
 from time import time as ttime
 import torch.nn.functional as F
 import scipy.signal as signal
-import pyworld, os, traceback, faiss, librosa
+import pyworld, os, traceback, faiss, librosa, torchcrepe
 from scipy import signal
 from functools import lru_cache
@@ -103,6 +103,27 @@ class VC(object):
             f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
             if filter_radius > 2:
                 f0 = signal.medfilt(f0, 3)
+        elif f0_method == "crepe":
+            model = "full"
+            # Pick a batch size that doesn't cause memory errors on your gpu
+            batch_size = 512
+            # Compute pitch using first gpu
+            audio = torch.tensor(np.copy(x))[None].float()
+            f0, pd = torchcrepe.predict(
+                audio,
+                self.sr,
+                self.window,
+                f0_min,
+                f0_max,
+                model,
+                batch_size=batch_size,
+                device=self.device,
+                return_periodicity=True,
+            )
+            pd = torchcrepe.filter.median(pd, 3)
+            f0 = torchcrepe.filter.mean(f0, 3)
+            f0[pd < 0.1] = 0
+            f0 = f0[0].cpu().numpy()
         f0 *= pow(2, f0_up_key / 12)
         # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
         tf0 = self.sr // self.window  # f0 points per second
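For readers unfamiliar with torchcrepe: a toy, self-contained version of this branch, assuming torchcrepe's published predict/filter API (the signature matches the call above) and the pipeline's 16 kHz sample rate with a 160-sample hop (self.sr, self.window):

import numpy as np
import torch
import torchcrepe

sr, hop = 16000, 160                      # assumed values of self.sr / self.window
t = np.arange(sr, dtype=np.float32) / sr
audio = torch.tensor(np.sin(2 * np.pi * 220.0 * t))[None]  # 1 s of a 220 Hz tone, shape (1, samples)

f0, pd = torchcrepe.predict(
    audio, sr, hop, 50.0, 1100.0, "full",  # fmin/fmax as in RVC's f0_min/f0_max
    batch_size=512, device="cpu", return_periodicity=True,
)
pd = torchcrepe.filter.median(pd, 3)       # smooth the periodicity track
f0 = torchcrepe.filter.mean(f0, 3)         # smooth the pitch track
f0[pd < 0.1] = 0                           # gate out unvoiced frames
f0 = f0[0].cpu().numpy()                   # ~220 on voiced frames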
@@ -141,6 +162,7 @@ class VC(object):
         big_npy,
         index_rate,
         version,
+        protect
     ):  # ,file_index,file_big_npy
         feats = torch.from_numpy(audio0)
         if self.is_half:
@@ -162,7 +184,8 @@ class VC(object):
         with torch.no_grad():
             logits = model.extract_features(**inputs)
             feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
+        if protect < 0.5:
+            feats0 = feats.clone()
         if (
             isinstance(index, type(None)) == False
             and isinstance(big_npy, type(None)) == False
@@ -188,6 +211,8 @@ class VC(object):
         )
         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+        if protect < 0.5:
+            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
         t1 = ttime()
         p_len = audio0.shape[0] // self.window
         if feats.shape[1] < p_len:
@@ -195,6 +220,14 @@ class VC(object):
         if pitch != None and pitchf != None:
             pitch = pitch[:, :p_len]
             pitchf = pitchf[:, :p_len]
+        if protect < 0.5:
+            pitchff = pitchf.clone()
+            pitchff[pitchf > 0] = 1
+            pitchff[pitchf < 1] = protect
+            pitchff = pitchff.unsqueeze(-1)
+            feats = feats * pitchff + feats0 * (1 - pitchff)
+            feats = feats.to(feats0.dtype)
         p_len = torch.tensor([p_len], device=self.device).long()
         with torch.no_grad():
             if pitch != None and pitchf != None:
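What the blend above does: voiced frames (pitchf > 0) keep the index-retrieved features, while unvoiced frames are pulled back toward the pre-retrieval HuBERT features feats0 by the protect weight, which is what shields consonants and breaths per the UI label. A toy illustration with made-up shapes:

import torch

protect = 0.33
feats = torch.randn(1, 6, 256)    # index-mixed features (toy shapes)
feats0 = torch.randn(1, 6, 256)   # unmixed copy saved before retrieval
pitchf = torch.tensor([[0.0, 110.0, 0.0, 220.0, 0.0, 330.0]])

pitchff = pitchf.clone()
pitchff[pitchf > 0] = 1           # voiced: weight 1, keep feats
pitchff[pitchf < 1] = protect     # unvoiced: weight protect, lean toward feats0
pitchff = pitchff.unsqueeze(-1)   # (1, 6, 1), broadcasts over the feature dim
feats = feats * pitchff + feats0 * (1 - pitchff)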
@@ -235,6 +268,7 @@ class VC(object):
         resample_sr,
         rms_mix_rate,
         version,
+        protect,
         f0_file=None,
     ):
         if (
@@ -322,6 +356,7 @@ class VC(object):
                         big_npy,
                         index_rate,
                         version,
+                        protect
                     )[self.t_pad_tgt : -self.t_pad_tgt]
                 )
             else:
@@ -338,6 +373,7 @@ class VC(object):
                         big_npy,
                         index_rate,
                         version,
+                        protect
                     )[self.t_pad_tgt : -self.t_pad_tgt]
                 )
             s = t
@@ -355,6 +391,7 @@ class VC(object):
                     big_npy,
                     index_rate,
                     version,
+                    protect
                 )[self.t_pad_tgt : -self.t_pad_tgt]
             )
         else:
@@ -371,6 +408,7 @@ class VC(object):
                     big_npy,
                     index_rate,
                     version,
+                    protect
                 )[self.t_pad_tgt : -self.t_pad_tgt]
             )
         audio_opt = np.concatenate(audio_opt)