From a7857f12ee9cbc6789407813b106b6647b9bb951 Mon Sep 17 00:00:00 2001
From: liujing04 <129054828+liujing04@users.noreply.github.com>
Date: Fri, 31 Mar 2023 17:49:09 +0800
Subject: [PATCH] Add files via upload

---
 configs/32k.json                      |  46 ++++++++
 configs/40k.json                      |  46 ++++++++
 configs/48k.json                      |  46 ++++++++
 infer/infer-pm-index256.py            | 147 +++++++++++++++++++++++
 infer/train-index.py                  |  36 ++++++
 infer/trans_weights.py                |  11 ++
 trainset_preprocess_pipeline_print.py | 104 ++++++++++++++++
 vc_infer_pipeline.py                  | 164 ++++++++++++++++++++++++++
 使用需遵守的协议-LICENSE.txt          |  49 ++++++++
 9 files changed, 649 insertions(+)
 create mode 100644 configs/32k.json
 create mode 100644 configs/40k.json
 create mode 100644 configs/48k.json
 create mode 100644 infer/infer-pm-index256.py
 create mode 100644 infer/train-index.py
 create mode 100644 infer/trans_weights.py
 create mode 100644 trainset_preprocess_pipeline_print.py
 create mode 100644 vc_infer_pipeline.py
 create mode 100644 使用需遵守的协议-LICENSE.txt

diff --git a/configs/32k.json b/configs/32k.json
new file mode 100644
index 0000000..d5f16d6
--- /dev/null
+++ b/configs/32k.json
@@ -0,0 +1,46 @@
+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "epochs": 20000,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 4,
+    "fp16_run": true,
+    "lr_decay": 0.999875,
+    "segment_size": 12800,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sampling_rate": 32000,
+    "filter_length": 1024,
+    "hop_length": 320,
+    "win_length": 1024,
+    "n_mel_channels": 80,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,4,2,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}
diff --git a/configs/40k.json b/configs/40k.json
new file mode 100644
index 0000000..4ffc87b
--- /dev/null
+++ b/configs/40k.json
@@ -0,0 +1,46 @@
+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "epochs": 20000,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 4,
+    "fp16_run": true,
+    "lr_decay": 0.999875,
+    "segment_size": 12800,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sampling_rate": 40000,
+    "filter_length": 2048,
+    "hop_length": 400,
+    "win_length": 2048,
+    "n_mel_channels": 125,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,10,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}
diff --git a/configs/48k.json b/configs/48k.json
new file mode 100644
index 0000000..2d0e05b
--- /dev/null
+++ b/configs/48k.json
@@ -0,0 +1,46 @@
+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "epochs": 20000,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 4,
+    "fp16_run": true,
+    "lr_decay": 0.999875,
+    "segment_size": 11520,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sampling_rate": 48000,
+    "filter_length": 2048,
+    "hop_length": 480,
+    "win_length": 2048,
+    "n_mel_channels": 128,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,6,2,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}
diff --git a/infer/infer-pm-index256.py b/infer/infer-pm-index256.py
new file mode 100644
index 0000000..dd94834
--- /dev/null
+++ b/infer/infer-pm-index256.py
@@ -0,0 +1,147 @@
+'''
+Run retrieval over the source features.
+'''
+import torch, pdb, os, parselmouth
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+import numpy as np
+import soundfile as sf
+# from models import SynthesizerTrn256  # hifigan_nonsf
+# from infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256  # hifigan_nsf
+from infer_pack.models import SynthesizerTrnMs256NSFsid as SynthesizerTrn256  # hifigan_nsf
+# from infer_pack.models import SynthesizerTrnMs256NSFsid_sim as SynthesizerTrn256  # hifigan_nsf
+# from models import SynthesizerTrn256NSFsim as SynthesizerTrn256  # hifigan_nsf
+# from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256  # hifigan_nsf
+
+from scipy.io import wavfile
+from fairseq import checkpoint_utils
+# import pyworld
+import librosa
+import torch.nn.functional as F
+import scipy.signal as signal
+# import torchcrepe
+from time import time as ttime
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model_path = r"E:\codes\py39\vits_vc_gpu_train\hubert_base.pt"
+print("load model(s) from {}".format(model_path))
+models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+    [model_path],
+    suffix="",
+)
+model = models[0]
+model = model.to(device)
+model = model.half()
+model.eval()
+
+# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)  # hifigan#512#256
+# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],109,256,is_half=True)  # hifigan#512#256
+net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)  # hifigan#512#256#no_dropout
+# net_g = SynthesizerTrn256(1025,32,192,192,768,2,3,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],0)  # ts3
+# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2],512,[16,16,4],0)  # hifigan-ps-sr
+#
+# net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [5,5], 512, [15,15], 0)  # ms
+# net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,10], 512, [16,16], 0)  # idwt2
+
+# weights = torch.load("infer/ft-mi_1k-noD.pt")
+# weights = torch.load("infer/ft-mi-freeze-vocoder-flow-enc_q_1k.pt")
+# weights = torch.load("infer/ft-mi-freeze-vocoder_true_1k.pt")
+# weights = torch.load("infer/ft-mi-sim1k.pt")
+weights = torch.load("infer/ft-mi-no_opt-no_dropout.pt")
+print(net_g.load_state_dict(weights, strict=True))
+
+net_g.eval().to(device)
+net_g.half()
+
+def get_f0(x, p_len, f0_up_key=0):
+    time_step = 160 / 16000 * 1000
+    f0_min = 50
+    f0_max = 1100
+    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+    f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+    f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
+        time_step=time_step / 1000, voicing_threshold=0.6,
+        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
+
+    pad_size = (p_len - len(f0) + 1) // 2
+    if (pad_size > 0 or p_len - len(f0) - pad_size > 0):
+        f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')
+    f0 *= pow(2, f0_up_key / 12)
+    f0bak = f0.copy()
+
+    f0_mel = 1127 * np.log(1 + f0 / 700)
+    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
+    f0_mel[f0_mel <= 1] = 1
+    f0_mel[f0_mel > 255] = 255
+    # f0_mel[f0_mel > 188] = 188
+    f0_coarse = np.rint(f0_mel).astype(int)
+    return f0_coarse, f0bak
+
+import faiss
+index = faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index")
+big_npy = np.load("infer/big_src_feature_mi.npy")
+ta0 = ta1 = ta2 = 0
+for idx, name in enumerate(["冬之花clip1.wav", ]):
+    wav_path = "todo-songs/%s" % name
+    f0_up_key = -2
+    audio, sampling_rate = sf.read(wav_path)
+    if len(audio.shape) > 1:
+        audio = librosa.to_mono(audio.transpose(1, 0))
+    if sampling_rate != 16000:
+        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
+
+    feats = torch.from_numpy(audio).float()
+    if feats.dim() == 2:  # double channels
+        feats = feats.mean(-1)
+    assert feats.dim() == 1, feats.dim()
+    feats = feats.view(1, -1)
+    padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+    inputs = {
+        "source": feats.half().to(device),
+        "padding_mask": padding_mask.to(device),
+        "output_layer": 9,  # layer 9
+    }
+    torch.cuda.synchronize()
+    t0 = ttime()
+    with torch.no_grad():
+        logits = model.extract_features(**inputs)
+        feats = model.final_proj(logits[0])
+
+    #### retrieval via the index
+    npy = feats[0].cpu().numpy().astype("float32")
+    D, I = index.search(npy, 1)
+    feats = torch.from_numpy(big_npy[I.squeeze()].astype("float16")).unsqueeze(0).to(device)
+
+    feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+    torch.cuda.synchronize()
+    t1 = ttime()
+    # p_len = min(feats.shape[1], 10000, pitch.shape[0])  # too long -> GPU OOM
+    p_len = min(feats.shape[1], 10000)
+    pitch, pitchf = get_f0(audio, p_len, f0_up_key)
+    p_len = min(feats.shape[1], 10000, pitch.shape[0])  # too long -> GPU OOM
+    torch.cuda.synchronize()
+    t2 = ttime()
+    feats = feats[:, :p_len, :]
+    pitch = pitch[:p_len]
+    pitchf = pitchf[:p_len]
+    p_len = torch.LongTensor([p_len]).to(device)
+    pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
+    sid = torch.LongTensor([0]).to(device)
+    pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device)
+    with torch.no_grad():
+        audio = net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0].data.cpu().float().numpy()  # nsf
+    torch.cuda.synchronize()
+    t3 = ttime()
+    ta0 += (t1 - t0)
+    ta1 += (t2 - t1)
+    ta2 += (t3 - t2)
+    # wavfile.write("ft-mi_1k-index256-noD-%s.wav" % name, 40000, audio)
+    # wavfile.write("ft-mi-freeze-vocoder-flow-enc_q_1k-%s.wav" % name, 40000, audio)
+    # wavfile.write("ft-mi-sim1k-%s.wav" % name, 40000, audio)
+    wavfile.write("ft-mi-no_opt-no_dropout-%s.wav" % name, 40000, audio)
+
+print(ta0, ta1, ta2)
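The retrieval step in the loop above is the heart of this script: every HuBERT frame of the source audio is replaced by its nearest neighbour among the training-set features, so the synthesizer only receives timbre vectors it saw during training. A standalone sketch of just that step, assuming the index and feature matrix written by infer/train-index.py; the retrieve helper is illustrative, not part of the patch:

import faiss
import numpy as np

# big_npy holds the training-set HuBERT features, one 256-dim row per frame;
# the faiss index (IVF512,Flat) was built over the same matrix.
index = faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index")
big_npy = np.load("infer/big_src_feature_mi.npy")

def retrieve(feats: np.ndarray) -> np.ndarray:
    """Replace each source frame by its nearest training-set frame."""
    # faiss expects float32 queries of shape (n_frames, dim)
    D, I = index.search(feats.astype("float32"), 1)
    return big_npy[I.squeeze(1)]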
diff --git a/infer/train-index.py b/infer/train-index.py
new file mode 100644
index 0000000..847472c
--- /dev/null
+++ b/infer/train-index.py
@@ -0,0 +1,36 @@
+'''
+Format: the cid is simply the row position inside the index itself; the aid no longer fits,
+so it is looked up through a dict — there are only about 50k of them anyway.
+'''
+import faiss, numpy as np, os
+
+# ########### if starting from raw features, run this save step first
+inp_root = r"E:\codes\py39\dataset\mi\2-co256"
+npys = []
+for name in sorted(list(os.listdir(inp_root))):
+    phone = np.load("%s/%s" % (inp_root, name))
+    npys.append(phone)
+big_npy = np.concatenate(npys, 0)
+print(big_npy.shape)  # (6196072, 192)#fp32#4.43G
+np.save("infer/big_src_feature_mi.npy", big_npy)
+
+################## train + add
+# big_npy = np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy")
+print(big_npy.shape)
+index = faiss.index_factory(256, "IVF512,Flat")  # mi
+print("training")
+index_ivf = faiss.extract_index_ivf(index)
+index_ivf.nprobe = 9
+index.train(big_npy)
+faiss.write_index(index, 'infer/trained_IVF512_Flat_mi_baseline_src_feat.index')
+print("adding")
+index.add(big_npy)
+faiss.write_index(index, "infer/added_IVF512_Flat_mi_baseline_src_feat.index")
+'''
+Sizes (all FP32):
+big_src_feature 2.95G
+    (3098036, 256)
+big_emb 4.43G
+    (6196072, 192)
+big_emb is twice as large because the features are repeated before pitch is added.
+'''
\ No newline at end of file
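The sizes quoted in the closing docstring check out: an fp32 matrix occupies rows * cols * 4 bytes. A quick verification, reading the author's "G" as GiB:

def fp32_gib(rows: int, cols: int) -> float:
    """Size of an fp32 matrix in GiB (4 bytes per element)."""
    return rows * cols * 4 / 2**30

print(fp32_gib(3098036, 256))  # big_src_feature -> ~2.95
print(fp32_gib(6196072, 192))  # big_emb         -> ~4.43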
diff --git a/infer/trans_weights.py b/infer/trans_weights.py
new file mode 100644
index 0000000..1845d7d
--- /dev/null
+++ b/infer/trans_weights.py
@@ -0,0 +1,11 @@
+import torch, pdb
+
+# a = torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-suc\G_1000.pth")["model"]  # sim_nsf
+# a = torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder-flow-enc_q\G_1000.pth")["model"]  # sim_nsf
+# a = torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder\G_1000.pth")["model"]  # sim_nsf
+# a = torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-test\G_1000.pth")["model"]  # sim_nsf
+a = torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-no_opt-no_dropout\G_1000.pth")["model"]  # sim_nsf
+for key in a.keys():
+    a[key] = a[key].half()
+# torch.save(a, "ft-mi-freeze-vocoder_true_1k.pt")
+# torch.save(a, "ft-mi-sim1k.pt")
+torch.save(a, "ft-mi-no_opt-no_dropout.pt")
diff --git a/trainset_preprocess_pipeline_print.py b/trainset_preprocess_pipeline_print.py
new file mode 100644
index 0000000..e5c9d45
--- /dev/null
+++ b/trainset_preprocess_pipeline_print.py
@@ -0,0 +1,104 @@
+import sys, os, pdb, multiprocessing
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+inp_root = sys.argv[1]
+sr = int(sys.argv[2])
+n_p = int(sys.argv[3])
+exp_dir = sys.argv[4]
+import numpy as np, ffmpeg, os, traceback
+from slicer2 import Slicer
+from joblib import Parallel, delayed
+import librosa, traceback
+from scipy.io import wavfile
+import multiprocessing
+from my_utils import load_audio
+from time import sleep
+
+f = open("%s/preprocess.log" % exp_dir, "a+")
+
+def printt(strr):
+    print(strr)
+    f.write("%s\n" % strr)
+    f.flush()
+
+class PreProcess():
+    def __init__(self, sr, exp_dir):
+        self.slicer = Slicer(
+            sr=sr,
+            threshold=-32,
+            min_length=800,
+            min_interval=400,
+            hop_size=15,
+            max_sil_kept=150
+        )
+        self.sr = sr
+        self.per = 3.7
+        self.overlap = 0.3
+        self.tail = self.per + self.overlap
+        self.max = 0.95
+        self.alpha = 0.8
+        self.exp_dir = exp_dir
+        self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir
+        self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir
+        os.makedirs(self.exp_dir, exist_ok=True)
+        os.makedirs(self.gt_wavs_dir, exist_ok=True)
+        os.makedirs(self.wavs16k_dir, exist_ok=True)
+
+    def norm_write(self, tmp_audio, idx0, idx1):
+        tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (self.max * self.alpha)) + (1 - self.alpha) * tmp_audio
+        wavfile.write("%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), self.sr, (tmp_audio * 32768).astype(np.int16))
+        tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000)
+        wavfile.write("%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), 16000, (tmp_audio * 32768).astype(np.int16))
+
+    def pipeline(self, path, idx0):
+        try:
+            audio = load_audio(path, self.sr)
+            idx1 = 0
+            for audio in self.slicer.slice(audio):
+                i = 0
+                while (1):
+                    start = int(self.sr * (self.per - self.overlap) * i)
+                    i += 1
+                    if (len(audio[start:]) > self.tail * self.sr):
+                        tmp_audio = audio[start:start + int(self.per * self.sr)]
+                        self.norm_write(tmp_audio, idx0, idx1)
+                        idx1 += 1
+                    else:
+                        tmp_audio = audio[start:]
+                        break
+                self.norm_write(tmp_audio, idx0, idx1)
+            printt("%s->Suc." % path)
+        except:
+            printt("%s->%s" % (path, traceback.format_exc()))
+
+    def pipeline_mp(self, infos):
+        for path, idx0 in infos:
+            self.pipeline(path, idx0)
+
+    def pipeline_mp_inp_dir(self, inp_root, n_p):
+        try:
+            infos = [("%s/%s" % (inp_root, name), idx) for idx, name in enumerate(sorted(list(os.listdir(inp_root))))]
+            ps = []
+            for i in range(n_p):
+                p = multiprocessing.Process(target=self.pipeline_mp, args=(infos[i::n_p],))
+                p.start()
+                ps.append(p)
+            for p in ps:
+                p.join()
+        except:
+            printt("Fail. %s" % traceback.format_exc())
+
+if __name__ == '__main__':
+    # f = open("logs/log_preprocess.log", "w")
+    printt(sys.argv)
+    ######################################################
+    # inp_root = r"E:\语音音频+标注\米津玄师\src"
+    # inp_root = r"E:\codes\py39\vits_vc_gpu_train\todo-songs"
+    # sr = 40000
+    # n_p = 6
+    # exp_dir = r"E:\codes\py39\dataset\mi-test"
+    ######################################################
+    printt("start preprocess")
+    pp = PreProcess(sr, exp_dir)
+    pp.pipeline_mp_inp_dir(inp_root, n_p)
+    printt("end preprocess")
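The inner loop of PreProcess.pipeline cuts each voiced slice into per = 3.7 s segments whose start points are per - overlap = 3.4 s apart, and stops cutting once less than tail = per + overlap = 4.0 s remains, writing the remainder as a final, possibly shorter piece. A pure-arithmetic sketch of those boundaries — the 40 kHz sample rate is an assumption here (matching 40k.json), and segment_bounds is an illustrative helper:

sr = 40000            # assumed sample rate
per, overlap = 3.7, 0.3
tail = per + overlap  # 4.0 s

def segment_bounds(n_samples: int):
    """Yield (start, end) cut points using the same rule as PreProcess.pipeline."""
    i = 0
    while True:
        start = int(sr * (per - overlap) * i)
        i += 1
        if n_samples - start > tail * sr:
            yield start, start + int(per * sr)
        else:
            yield start, n_samples  # final, possibly shorter piece
            return

print(list(segment_bounds(10 * sr)))
# [(0, 148000), (136000, 284000), (272000, 400000)]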
diff --git a/vc_infer_pipeline.py b/vc_infer_pipeline.py
new file mode 100644
index 0000000..30b03e3
--- /dev/null
+++ b/vc_infer_pipeline.py
@@ -0,0 +1,164 @@
+import numpy as np, parselmouth, torch, pdb
+from time import time as ttime
+import torch.nn.functional as F
+from config import x_pad, x_query, x_center, x_max
+import scipy.signal as signal
+import pyworld, os, traceback, faiss
+
+class VC(object):
+    def __init__(self, tgt_sr, device, is_half):
+        self.sr = 16000  # hubert input sample rate
+        self.window = 160  # samples per frame
+        self.t_pad = self.sr * x_pad  # padding (samples) before/after each clip
+        self.t_pad_tgt = tgt_sr * x_pad
+        self.t_pad2 = self.t_pad * 2
+        self.t_query = self.sr * x_query  # search radius (samples) around each candidate cut point
+        self.t_center = self.sr * x_center  # spacing (samples) between candidate cut points
+        self.t_max = self.sr * x_max  # clips shorter than this skip the cut-point search
+        self.device = device
+        self.is_half = is_half
+
+    def get_f0(self, x, p_len, f0_up_key, f0_method, inp_f0=None):
+        time_step = self.window / self.sr * 1000
+        f0_min = 50
+        f0_max = 1100
+        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+        if (f0_method == "pm"):
+            f0 = parselmouth.Sound(x, self.sr).to_pitch_ac(
+                time_step=time_step / 1000, voicing_threshold=0.6,
+                pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
+            pad_size = (p_len - len(f0) + 1) // 2
+            if (pad_size > 0 or p_len - len(f0) - pad_size > 0):
+                f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')
+        elif (f0_method == "harvest"):
+            f0, t = pyworld.harvest(
+                x.astype(np.double),
+                fs=self.sr,
+                f0_ceil=f0_max,
+                frame_period=10,
+            )
+            f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
+            f0 = signal.medfilt(f0, 3)
+        f0 *= pow(2, f0_up_key / 12)
+        # with open("test.txt", "w") as f: f.write("\n".join([str(i) for i in f0.tolist()]))
+        tf0 = self.sr // self.window  # f0 points per second
+        if (inp_f0 is not None):
+            delta_t = np.round((inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1).astype("int16")
+            replace_f0 = np.interp(list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1])
+            shape = f0[x_pad * tf0:x_pad * tf0 + len(replace_f0)].shape[0]
+            f0[x_pad * tf0:x_pad * tf0 + len(replace_f0)] = replace_f0[:shape]
+        # with open("test_opt.txt", "w") as f: f.write("\n".join([str(i) for i in f0.tolist()]))
+        f0bak = f0.copy()
+        f0_mel = 1127 * np.log(1 + f0 / 700)
+        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
+        f0_mel[f0_mel <= 1] = 1
+        f0_mel[f0_mel > 255] = 255
+        f0_coarse = np.rint(f0_mel).astype(int)
+        return f0_coarse, f0bak  # 1-0
+
+    def vc(self, model, net_g, sid, audio0, pitch, pitchf, times, index, big_npy, index_rate):  # ,file_index,file_big_npy
+        feats = torch.from_numpy(audio0)
+        if self.is_half:
+            feats = feats.half()
+        else:
+            feats = feats.float()
+        if feats.dim() == 2:  # double channels
+            feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
+
+        inputs = {
+            "source": feats.to(self.device),
+            "padding_mask": padding_mask,
+            "output_layer": 9,  # layer 9
+        }
+        t0 = ttime()
+        with torch.no_grad():
+            logits = model.extract_features(**inputs)
+            feats = model.final_proj(logits[0])
+
+        if (index is not None and big_npy is not None and index_rate != 0):
+            npy = feats[0].cpu().numpy()
+            if self.is_half:
+                npy = npy.astype("float32")
+            D, I = index.search(npy, 1)
+            npy = big_npy[I.squeeze()]
+            if self.is_half:
+                npy = npy.astype("float16")
+            feats = torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats
+
+        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+        t1 = ttime()
+        p_len = audio0.shape[0] // self.window
+        if (feats.shape[1] < p_len):
+            p_len = feats.shape[1]
+            if (pitch is not None and pitchf is not None):
+                pitch = pitch[:, :p_len]
+                pitchf = pitchf[:, :p_len]
+        p_len = torch.tensor([p_len], device=self.device).long()
+        with torch.no_grad():
+            if (pitch is not None and pitchf is not None):
+                audio1 = (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
+            else:
+                audio1 = (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
+        del feats, p_len, padding_mask
+        torch.cuda.empty_cache()
+        t2 = ttime()
+        times[0] += (t1 - t0)
+        times[2] += (t2 - t1)
+        return audio1
+
+    def pipeline(self, model, net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=None):
+        if (file_big_npy != "" and file_index != "" and os.path.exists(file_big_npy) and os.path.exists(file_index) and index_rate != 0):
+            try:
+                index = faiss.read_index(file_index)
+                big_npy = np.load(file_big_npy)
+            except:
+                traceback.print_exc()
+                index, big_npy = None, None
+        else:
+            index, big_npy = None, None
+        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode='reflect')
+        opt_ts = []
+        if (audio_pad.shape[0] > self.t_max):
+            audio_sum = np.zeros_like(audio)
+            for i in range(self.window):
+                audio_sum += audio_pad[i:i - self.window]
+            for t in range(self.t_center, audio.shape[0], self.t_center):
+                window_abs = np.abs(audio_sum[t - self.t_query:t + self.t_query])
+                opt_ts.append(t - self.t_query + np.where(window_abs == window_abs.min())[0][0])
+        s = 0
+        audio_opt = []
+        t = None
+        t1 = ttime()
+        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode='reflect')
+        p_len = audio_pad.shape[0] // self.window
+        inp_f0 = None
+        if hasattr(f0_file, 'name'):
+            try:
+                with open(f0_file.name, "r") as f:
+                    lines = f.read().strip("\n").split("\n")
+                inp_f0 = []
+                for line in lines:
+                    inp_f0.append([float(i) for i in line.split(",")])
+                inp_f0 = np.array(inp_f0, dtype="float32")
+            except:
+                traceback.print_exc()
+        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
+        pitch, pitchf = None, None
+        if (if_f0 == 1):
+            pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, inp_f0)
+            pitch = pitch[:p_len]
+            pitchf = pitchf[:p_len]
+            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
+            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
+        t2 = ttime()
+        times[1] += (t2 - t1)
+        for t in opt_ts:
+            t = t // self.window * self.window
+            if (if_f0 == 1):
+                audio_opt.append(self.vc(model, net_g, sid, audio_pad[s:t + self.t_pad2 + self.window],
+                                         pitch[:, s // self.window:(t + self.t_pad2) // self.window],
+                                         pitchf[:, s // self.window:(t + self.t_pad2) // self.window],
+                                         times, index, big_npy, index_rate)[self.t_pad_tgt:-self.t_pad_tgt])
+            else:
+                audio_opt.append(self.vc(model, net_g, sid, audio_pad[s:t + self.t_pad2 + self.window],
+                                         None, None, times, index, big_npy, index_rate)[self.t_pad_tgt:-self.t_pad_tgt])
+            s = t
+        if (if_f0 == 1):
+            audio_opt.append(self.vc(model, net_g, sid, audio_pad[t:],
+                                     pitch[:, t // self.window:] if t is not None else pitch,
+                                     pitchf[:, t // self.window:] if t is not None else pitchf,
+                                     times, index, big_npy, index_rate)[self.t_pad_tgt:-self.t_pad_tgt])
+        else:
+            audio_opt.append(self.vc(model, net_g, sid, audio_pad[t:],
+                                     None, None, times, index, big_npy, index_rate)[self.t_pad_tgt:-self.t_pad_tgt])
+        audio_opt = np.concatenate(audio_opt)
+        del pitch, pitchf, sid
+        torch.cuda.empty_cache()
+        return audio_opt
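Both get_f0 implementations in this patch end with the same quantization: f0 in Hz goes through the mel-style map 1127 * ln(1 + f0 / 700) and is then squeezed linearly into integers 1..255 for the coarse pitch embedding, with unvoiced frames (0 Hz) pinned to 1. A sketch of just that mapping (coarse_f0 is an illustrative helper):

import numpy as np

f0_min, f0_max = 50.0, 1100.0
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)

def coarse_f0(f0: np.ndarray) -> np.ndarray:
    """Quantize f0 (Hz) to ints in 1..255; unvoiced frames (0 Hz) map to 1."""
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
    return np.rint(np.clip(f0_mel, 1, 255)).astype(int)

print(coarse_f0(np.array([0.0, 50.0, 220.0, 1100.0])))  # -> [1, 1, 60, 255]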
diff --git a/使用需遵守的协议-LICENSE.txt b/使用需遵守的协议-LICENSE.txt
new file mode 100644
index 0000000..89a59b2
--- /dev/null
+++ b/使用需遵守的协议-LICENSE.txt
@@ -0,0 +1,49 @@
+MIT License
+
+Copyright (c) 2023 lj1995
+
+This software and its related code are open-sourced under the MIT license. The author has no control over the software; those who use the software, or distribute audio exported by it, bear full responsibility.
+If you do not accept these terms, you may not use or reference any code or files inside the software package.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+The licenses of the referenced libraries are as follows:
+#################
+ContentVec
+https://github.com/auspicious3000/contentvec/blob/main/LICENSE
+MIT License
+#################
+VITS
+https://github.com/jaywalnut310/vits/blob/main/LICENSE
+MIT License
+#################
+HIFIGAN
+https://github.com/jik876/hifi-gan/blob/master/LICENSE
+MIT License
+#################
+gradio
+https://github.com/gradio-app/gradio/blob/main/LICENSE
+Apache License 2.0
+#################
+ffmpeg
+https://github.com/FFmpeg/FFmpeg/blob/master/COPYING.LGPLv3
+https://github.com/BtbN/FFmpeg-Builds/releases/download/autobuild-2021-02-28-12-32/ffmpeg-n4.3.2-160-gfbb9368226-win64-lgpl-4.3.zip
+LGPLv3 License
+MIT License
+#################
+ultimatevocalremovergui
+https://github.com/Anjok07/ultimatevocalremovergui/blob/master/LICENSE
+https://github.com/yang123qwe/vocal_separation_by_uvr5
+MIT License
+#################
+audio-slicer
+https://github.com/openvpi/audio-slicer/blob/main/LICENSE
+MIT License
\ No newline at end of file