Add files via upload

This commit is contained in:
Anjok07 2022-04-07 02:34:01 -05:00 committed by GitHub
parent ea201f7079
commit 67d5f16ea9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
46 changed files with 2149 additions and 0 deletions

BIN
img/UVR-Icon.ico Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

BIN
img/UVR-banner-2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

BIN
img/UVR-banner.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

BIN
img/UVRV5.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 56 KiB

BIN
img/refresh.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.6 KiB

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

170
lib_v5/dataset.py Normal file
View File

@ -0,0 +1,170 @@
import os
import random
import numpy as np
import torch
import torch.utils.data
from tqdm import tqdm
from lib_v5 import spec_utils
class VocalRemoverValidationSet(torch.utils.data.Dataset):
def __init__(self, patch_list):
self.patch_list = patch_list
def __len__(self):
return len(self.patch_list)
def __getitem__(self, idx):
path = self.patch_list[idx]
data = np.load(path)
X, y = data['X'], data['y']
X_mag = np.abs(X)
y_mag = np.abs(y)
return X_mag, y_mag
def make_pair(mix_dir, inst_dir):
input_exts = ['.wav', '.m4a', '.mp3', '.mp4', '.flac']
X_list = sorted([
os.path.join(mix_dir, fname)
for fname in os.listdir(mix_dir)
if os.path.splitext(fname)[1] in input_exts])
y_list = sorted([
os.path.join(inst_dir, fname)
for fname in os.listdir(inst_dir)
if os.path.splitext(fname)[1] in input_exts])
filelist = list(zip(X_list, y_list))
return filelist
def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
if split_mode == 'random':
filelist = make_pair(
os.path.join(dataset_dir, 'mixtures'),
os.path.join(dataset_dir, 'instruments'))
random.shuffle(filelist)
if len(val_filelist) == 0:
val_size = int(len(filelist) * val_rate)
train_filelist = filelist[:-val_size]
val_filelist = filelist[-val_size:]
else:
train_filelist = [
pair for pair in filelist
if list(pair) not in val_filelist]
elif split_mode == 'subdirs':
if len(val_filelist) != 0:
raise ValueError('The `val_filelist` option is not available in `subdirs` mode')
train_filelist = make_pair(
os.path.join(dataset_dir, 'training/mixtures'),
os.path.join(dataset_dir, 'training/instruments'))
val_filelist = make_pair(
os.path.join(dataset_dir, 'validation/mixtures'),
os.path.join(dataset_dir, 'validation/instruments'))
return train_filelist, val_filelist
def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha):
perm = np.random.permutation(len(X))
for i, idx in enumerate(tqdm(perm)):
if np.random.uniform() < reduction_rate:
y[idx] = spec_utils.reduce_vocal_aggressively(X[idx], y[idx], reduction_mask)
if np.random.uniform() < 0.5:
# swap channel
X[idx] = X[idx, ::-1]
y[idx] = y[idx, ::-1]
if np.random.uniform() < 0.02:
# mono
X[idx] = X[idx].mean(axis=0, keepdims=True)
y[idx] = y[idx].mean(axis=0, keepdims=True)
if np.random.uniform() < 0.02:
# inst
X[idx] = y[idx]
if np.random.uniform() < mixup_rate and i < len(perm) - 1:
lam = np.random.beta(mixup_alpha, mixup_alpha)
X[idx] = lam * X[idx] + (1 - lam) * X[perm[i + 1]]
y[idx] = lam * y[idx] + (1 - lam) * y[perm[i + 1]]
return X, y
def make_padding(width, cropsize, offset):
left = offset
roi_size = cropsize - left * 2
if roi_size == 0:
roi_size = cropsize
right = roi_size - (width % roi_size) + left
return left, right, roi_size
def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset):
len_dataset = patches * len(filelist)
X_dataset = np.zeros(
(len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
y_dataset = np.zeros(
(len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
for i, (X_path, y_path) in enumerate(tqdm(filelist)):
X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
coef = np.max([np.abs(X).max(), np.abs(y).max()])
X, y = X / coef, y / coef
l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant')
y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode='constant')
starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches)
ends = starts + cropsize
for j in range(patches):
idx = i * patches + j
X_dataset[idx] = X_pad[:, :, starts[j]:ends[j]]
y_dataset[idx] = y_pad[:, :, starts[j]:ends[j]]
return X_dataset, y_dataset
def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
patch_list = []
patch_dir = 'cs{}_sr{}_hl{}_nf{}_of{}'.format(cropsize, sr, hop_length, n_fft, offset)
os.makedirs(patch_dir, exist_ok=True)
for i, (X_path, y_path) in enumerate(tqdm(filelist)):
basename = os.path.splitext(os.path.basename(X_path))[0]
X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
coef = np.max([np.abs(X).max(), np.abs(y).max()])
X, y = X / coef, y / coef
l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant')
y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode='constant')
len_dataset = int(np.ceil(X.shape[2] / roi_size))
for j in range(len_dataset):
outpath = os.path.join(patch_dir, '{}_p{}.npz'.format(basename, j))
start = j * roi_size
if not os.path.exists(outpath):
np.savez(
outpath,
X=X_pad[:, :, start:start + cropsize],
y=y_pad[:, :, start:start + cropsize])
patch_list.append(outpath)
return VocalRemoverValidationSet(patch_list)

116
lib_v5/layers_123812KB .py Normal file
View File

@ -0,0 +1,116 @@
import torch
from torch import nn
import torch.nn.functional as F
from lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False),
nn.BatchNorm2d(nout),
activ()
)
def __call__(self, x):
return self.conv(x)
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False),
nn.Conv2d(
nin, nout,
kernel_size=1,
bias=False),
nn.BatchNorm2d(nout),
activ()
)
def __call__(self, x):
return self.conv(x)
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
def __call__(self, x):
skip = self.conv1(x)
h = self.conv2(skip)
return h, skip
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
h = self.conv(x)
if self.dropout is not None:
h = self.dropout(h)
return h
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
bottle = self.bottleneck(out)
return bottle

116
lib_v5/layers_123821KB.py Normal file
View File

@ -0,0 +1,116 @@
import torch
from torch import nn
import torch.nn.functional as F
from lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False),
nn.BatchNorm2d(nout),
activ()
)
def __call__(self, x):
return self.conv(x)
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False),
nn.Conv2d(
nin, nout,
kernel_size=1,
bias=False),
nn.BatchNorm2d(nout),
activ()
)
def __call__(self, x):
return self.conv(x)
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
def __call__(self, x):
skip = self.conv1(x)
h = self.conv2(skip)
return h, skip
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
h = self.conv(x)
if self.dropout is not None:
h = self.dropout(h)
return h
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
bottle = self.bottleneck(out)
return bottle

122
lib_v5/layers_537238KB.py Normal file
View File

@ -0,0 +1,122 @@
import torch
from torch import nn
import torch.nn.functional as F
from lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False),
nn.BatchNorm2d(nout),
activ()
)
def __call__(self, x):
return self.conv(x)
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False),
nn.Conv2d(
nin, nout,
kernel_size=1,
bias=False),
nn.BatchNorm2d(nout),
activ()
)
def __call__(self, x):
return self.conv(x)
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
def __call__(self, x):
skip = self.conv1(x)
h = self.conv2(skip)
return h, skip
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
h = self.conv(x)
if self.dropout is not None:
h = self.dropout(h)
return h
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.conv6 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.conv7 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ),
nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
feat6 = self.conv6(x)
feat7 = self.conv7(x)
out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
bottle = self.bottleneck(out)
return bottle

View File

@ -0,0 +1,60 @@
import json
import os
import pathlib
default_param = {}
default_param['bins'] = 768
default_param['unstable_bins'] = 9 # training only
default_param['reduction_bins'] = 762 # training only
default_param['sr'] = 44100
default_param['pre_filter_start'] = 757
default_param['pre_filter_stop'] = 768
default_param['band'] = {}
default_param['band'][1] = {
'sr': 11025,
'hl': 128,
'n_fft': 960,
'crop_start': 0,
'crop_stop': 245,
'lpf_start': 61, # inference only
'res_type': 'polyphase'
}
default_param['band'][2] = {
'sr': 44100,
'hl': 512,
'n_fft': 1536,
'crop_start': 24,
'crop_stop': 547,
'hpf_start': 81, # inference only
'res_type': 'sinc_best'
}
def int_keys(d):
r = {}
for k, v in d:
if k.isdigit():
k = int(k)
r[k] = v
return r
class ModelParameters(object):
def __init__(self, config_path=''):
if '.pth' == pathlib.Path(config_path).suffix:
import zipfile
with zipfile.ZipFile(config_path, 'r') as zip:
self.param = json.loads(zip.read('param.json'), object_pairs_hook=int_keys)
elif '.json' == pathlib.Path(config_path).suffix:
with open(config_path, 'r') as f:
self.param = json.loads(f.read(), object_pairs_hook=int_keys)
else:
self.param = default_param
for k in ['mid_side', 'mid_side_b', 'mid_side_b2', 'stereo_w', 'stereo_n', 'reverse']:
if not k in self.param:
self.param[k] = False

View File

@ -0,0 +1,19 @@
{
"bins": 1024,
"unstable_bins": 0,
"reduction_bins": 0,
"band": {
"1": {
"sr": 32000,
"hl": 512,
"n_fft": 2048,
"crop_start": 0,
"crop_stop": 1024,
"hpf_start": -1,
"res_type": "kaiser_fast"
}
},
"sr": 32000,
"pre_filter_start": 1000,
"pre_filter_stop": 1021
}

View File

@ -0,0 +1,19 @@
{
"bins": 256,
"unstable_bins": 0,
"reduction_bins": 0,
"band": {
"1": {
"sr": 44100,
"hl": 256,
"n_fft": 512,
"crop_start": 0,
"crop_stop": 256,
"hpf_start": -1,
"res_type": "sinc_best"
}
},
"sr": 44100,
"pre_filter_start": 256,
"pre_filter_stop": 256
}

View File

@ -0,0 +1,19 @@
{
"bins": 1024,
"unstable_bins": 0,
"reduction_bins": 0,
"band": {
"1": {
"sr": 44100,
"hl": 512,
"n_fft": 2048,
"crop_start": 0,
"crop_stop": 1024,
"hpf_start": -1,
"res_type": "sinc_best"
}
},
"sr": 44100,
"pre_filter_start": 1023,
"pre_filter_stop": 1024
}

View File

@ -0,0 +1,30 @@
{
"bins": 768,
"unstable_bins": 7,
"reduction_bins": 705,
"band": {
"1": {
"sr": 6000,
"hl": 66,
"n_fft": 512,
"crop_start": 0,
"crop_stop": 240,
"lpf_start": 60,
"lpf_stop": 118,
"res_type": "sinc_fastest"
},
"2": {
"sr": 32000,
"hl": 352,
"n_fft": 1024,
"crop_start": 22,
"crop_stop": 505,
"hpf_start": 44,
"hpf_stop": 23,
"res_type": "sinc_medium"
}
},
"sr": 32000,
"pre_filter_start": 710,
"pre_filter_stop": 731
}

View File

@ -0,0 +1,30 @@
{
"bins": 512,
"unstable_bins": 7,
"reduction_bins": 510,
"band": {
"1": {
"sr": 11025,
"hl": 160,
"n_fft": 768,
"crop_start": 0,
"crop_stop": 192,
"lpf_start": 41,
"lpf_stop": 139,
"res_type": "sinc_fastest"
},
"2": {
"sr": 44100,
"hl": 640,
"n_fft": 1024,
"crop_start": 10,
"crop_stop": 320,
"hpf_start": 47,
"hpf_stop": 15,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 510,
"pre_filter_stop": 512
}

View File

@ -0,0 +1,30 @@
{
"bins": 768,
"unstable_bins": 7,
"reduction_bins": 705,
"band": {
"1": {
"sr": 6000,
"hl": 66,
"n_fft": 512,
"crop_start": 0,
"crop_stop": 240,
"lpf_start": 60,
"lpf_stop": 240,
"res_type": "sinc_fastest"
},
"2": {
"sr": 48000,
"hl": 528,
"n_fft": 1536,
"crop_start": 22,
"crop_stop": 505,
"hpf_start": 82,
"hpf_stop": 22,
"res_type": "sinc_medium"
}
},
"sr": 48000,
"pre_filter_start": 710,
"pre_filter_stop": 731
}

View File

@ -0,0 +1,42 @@
{
"bins": 768,
"unstable_bins": 5,
"reduction_bins": 733,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 768,
"crop_start": 0,
"crop_stop": 278,
"lpf_start": 28,
"lpf_stop": 140,
"res_type": "polyphase"
},
"2": {
"sr": 22050,
"hl": 256,
"n_fft": 768,
"crop_start": 14,
"crop_stop": 322,
"hpf_start": 70,
"hpf_stop": 14,
"lpf_start": 283,
"lpf_stop": 314,
"res_type": "polyphase"
},
"3": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 131,
"crop_stop": 313,
"hpf_start": 154,
"hpf_stop": 141,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 757,
"pre_filter_stop": 768
}

View File

@ -0,0 +1,43 @@
{
"mid_side": true,
"bins": 768,
"unstable_bins": 5,
"reduction_bins": 733,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 768,
"crop_start": 0,
"crop_stop": 278,
"lpf_start": 28,
"lpf_stop": 140,
"res_type": "polyphase"
},
"2": {
"sr": 22050,
"hl": 256,
"n_fft": 768,
"crop_start": 14,
"crop_stop": 322,
"hpf_start": 70,
"hpf_stop": 14,
"lpf_start": 283,
"lpf_stop": 314,
"res_type": "polyphase"
},
"3": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 131,
"crop_stop": 313,
"hpf_start": 154,
"hpf_stop": 141,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 757,
"pre_filter_stop": 768
}

View File

@ -0,0 +1,43 @@
{
"mid_side_b2": true,
"bins": 640,
"unstable_bins": 7,
"reduction_bins": 565,
"band": {
"1": {
"sr": 11025,
"hl": 108,
"n_fft": 1024,
"crop_start": 0,
"crop_stop": 187,
"lpf_start": 92,
"lpf_stop": 186,
"res_type": "polyphase"
},
"2": {
"sr": 22050,
"hl": 216,
"n_fft": 768,
"crop_start": 0,
"crop_stop": 212,
"hpf_start": 68,
"hpf_stop": 34,
"lpf_start": 174,
"lpf_stop": 209,
"res_type": "polyphase"
},
"3": {
"sr": 44100,
"hl": 432,
"n_fft": 640,
"crop_start": 66,
"crop_stop": 307,
"hpf_start": 86,
"hpf_stop": 72,
"res_type": "polyphase"
}
},
"sr": 44100,
"pre_filter_start": 639,
"pre_filter_stop": 640
}

View File

@ -0,0 +1,54 @@
{
"bins": 768,
"unstable_bins": 7,
"reduction_bins": 668,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 1024,
"crop_start": 0,
"crop_stop": 186,
"lpf_start": 37,
"lpf_stop": 73,
"res_type": "polyphase"
},
"2": {
"sr": 11025,
"hl": 128,
"n_fft": 512,
"crop_start": 4,
"crop_stop": 185,
"hpf_start": 36,
"hpf_stop": 18,
"lpf_start": 93,
"lpf_stop": 185,
"res_type": "polyphase"
},
"3": {
"sr": 22050,
"hl": 256,
"n_fft": 512,
"crop_start": 46,
"crop_stop": 186,
"hpf_start": 93,
"hpf_stop": 46,
"lpf_start": 164,
"lpf_stop": 186,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 121,
"crop_stop": 382,
"hpf_start": 138,
"hpf_stop": 123,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 740,
"pre_filter_stop": 768
}

View File

@ -0,0 +1,55 @@
{
"bins": 768,
"unstable_bins": 7,
"mid_side": true,
"reduction_bins": 668,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 1024,
"crop_start": 0,
"crop_stop": 186,
"lpf_start": 37,
"lpf_stop": 73,
"res_type": "polyphase"
},
"2": {
"sr": 11025,
"hl": 128,
"n_fft": 512,
"crop_start": 4,
"crop_stop": 185,
"hpf_start": 36,
"hpf_stop": 18,
"lpf_start": 93,
"lpf_stop": 185,
"res_type": "polyphase"
},
"3": {
"sr": 22050,
"hl": 256,
"n_fft": 512,
"crop_start": 46,
"crop_stop": 186,
"hpf_start": 93,
"hpf_stop": 46,
"lpf_start": 164,
"lpf_stop": 186,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 121,
"crop_stop": 382,
"hpf_start": 138,
"hpf_stop": 123,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 740,
"pre_filter_stop": 768
}

View File

@ -0,0 +1,55 @@
{
"mid_side_b": true,
"bins": 768,
"unstable_bins": 7,
"reduction_bins": 668,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 1024,
"crop_start": 0,
"crop_stop": 186,
"lpf_start": 37,
"lpf_stop": 73,
"res_type": "polyphase"
},
"2": {
"sr": 11025,
"hl": 128,
"n_fft": 512,
"crop_start": 4,
"crop_stop": 185,
"hpf_start": 36,
"hpf_stop": 18,
"lpf_start": 93,
"lpf_stop": 185,
"res_type": "polyphase"
},
"3": {
"sr": 22050,
"hl": 256,
"n_fft": 512,
"crop_start": 46,
"crop_stop": 186,
"hpf_start": 93,
"hpf_stop": 46,
"lpf_start": 164,
"lpf_stop": 186,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 121,
"crop_stop": 382,
"hpf_start": 138,
"hpf_stop": 123,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 740,
"pre_filter_stop": 768
}

View File

@ -0,0 +1,55 @@
{
"mid_side_b": true,
"bins": 768,
"unstable_bins": 7,
"reduction_bins": 668,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 1024,
"crop_start": 0,
"crop_stop": 186,
"lpf_start": 37,
"lpf_stop": 73,
"res_type": "polyphase"
},
"2": {
"sr": 11025,
"hl": 128,
"n_fft": 512,
"crop_start": 4,
"crop_stop": 185,
"hpf_start": 36,
"hpf_stop": 18,
"lpf_start": 93,
"lpf_stop": 185,
"res_type": "polyphase"
},
"3": {
"sr": 22050,
"hl": 256,
"n_fft": 512,
"crop_start": 46,
"crop_stop": 186,
"hpf_start": 93,
"hpf_stop": 46,
"lpf_start": 164,
"lpf_stop": 186,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 121,
"crop_stop": 382,
"hpf_start": 138,
"hpf_stop": 123,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 740,
"pre_filter_stop": 768
}

View File

@ -0,0 +1,55 @@
{
"reverse": true,
"bins": 768,
"unstable_bins": 7,
"reduction_bins": 668,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 1024,
"crop_start": 0,
"crop_stop": 186,
"lpf_start": 37,
"lpf_stop": 73,
"res_type": "polyphase"
},
"2": {
"sr": 11025,
"hl": 128,
"n_fft": 512,
"crop_start": 4,
"crop_stop": 185,
"hpf_start": 36,
"hpf_stop": 18,
"lpf_start": 93,
"lpf_stop": 185,
"res_type": "polyphase"
},
"3": {
"sr": 22050,
"hl": 256,
"n_fft": 512,
"crop_start": 46,
"crop_stop": 186,
"hpf_start": 93,
"hpf_stop": 46,
"lpf_start": 164,
"lpf_stop": 186,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 121,
"crop_stop": 382,
"hpf_start": 138,
"hpf_stop": 123,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 740,
"pre_filter_stop": 768
}

View File

@ -0,0 +1,55 @@
{
"stereo_w": true,
"bins": 768,
"unstable_bins": 7,
"reduction_bins": 668,
"band": {
"1": {
"sr": 11025,
"hl": 128,
"n_fft": 1024,
"crop_start": 0,
"crop_stop": 186,
"lpf_start": 37,
"lpf_stop": 73,
"res_type": "polyphase"
},
"2": {
"sr": 11025,
"hl": 128,
"n_fft": 512,
"crop_start": 4,
"crop_stop": 185,
"hpf_start": 36,
"hpf_stop": 18,
"lpf_start": 93,
"lpf_stop": 185,
"res_type": "polyphase"
},
"3": {
"sr": 22050,
"hl": 256,
"n_fft": 512,
"crop_start": 46,
"crop_stop": 186,
"hpf_start": 93,
"hpf_stop": 46,
"lpf_start": 164,
"lpf_stop": 186,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 512,
"n_fft": 768,
"crop_start": 121,
"crop_stop": 382,
"hpf_start": 138,
"hpf_stop": 123,
"res_type": "sinc_medium"
}
},
"sr": 44100,
"pre_filter_start": 740,
"pre_filter_stop": 768
}

View File

@ -0,0 +1,54 @@
{
"bins": 672,
"unstable_bins": 8,
"reduction_bins": 637,
"band": {
"1": {
"sr": 7350,
"hl": 80,
"n_fft": 640,
"crop_start": 0,
"crop_stop": 85,
"lpf_start": 25,
"lpf_stop": 53,
"res_type": "polyphase"
},
"2": {
"sr": 7350,
"hl": 80,
"n_fft": 320,
"crop_start": 4,
"crop_stop": 87,
"hpf_start": 25,
"hpf_stop": 12,
"lpf_start": 31,
"lpf_stop": 62,
"res_type": "polyphase"
},
"3": {
"sr": 14700,
"hl": 160,
"n_fft": 512,
"crop_start": 17,
"crop_stop": 216,
"hpf_start": 48,
"hpf_stop": 24,
"lpf_start": 139,
"lpf_stop": 210,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 480,
"n_fft": 960,
"crop_start": 78,
"crop_stop": 383,
"hpf_start": 130,
"hpf_stop": 86,
"res_type": "kaiser_fast"
}
},
"sr": 44100,
"pre_filter_start": 668,
"pre_filter_stop": 672
}

View File

@ -0,0 +1,55 @@
{
"bins": 672,
"unstable_bins": 8,
"reduction_bins": 637,
"band": {
"1": {
"sr": 7350,
"hl": 80,
"n_fft": 640,
"crop_start": 0,
"crop_stop": 85,
"lpf_start": 25,
"lpf_stop": 53,
"res_type": "polyphase"
},
"2": {
"sr": 7350,
"hl": 80,
"n_fft": 320,
"crop_start": 4,
"crop_stop": 87,
"hpf_start": 25,
"hpf_stop": 12,
"lpf_start": 31,
"lpf_stop": 62,
"res_type": "polyphase"
},
"3": {
"sr": 14700,
"hl": 160,
"n_fft": 512,
"crop_start": 17,
"crop_stop": 216,
"hpf_start": 48,
"hpf_stop": 24,
"lpf_start": 139,
"lpf_stop": 210,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 480,
"n_fft": 960,
"crop_start": 78,
"crop_stop": 383,
"hpf_start": 130,
"hpf_stop": 86,
"convert_channels": "stereo_n",
"res_type": "kaiser_fast"
}
},
"sr": 44100,
"pre_filter_start": 668,
"pre_filter_stop": 672
}

View File

@ -0,0 +1,43 @@
{
"mid_side_b2": true,
"bins": 1280,
"unstable_bins": 7,
"reduction_bins": 565,
"band": {
"1": {
"sr": 11025,
"hl": 108,
"n_fft": 2048,
"crop_start": 0,
"crop_stop": 374,
"lpf_start": 92,
"lpf_stop": 186,
"res_type": "polyphase"
},
"2": {
"sr": 22050,
"hl": 216,
"n_fft": 1536,
"crop_start": 0,
"crop_stop": 424,
"hpf_start": 68,
"hpf_stop": 34,
"lpf_start": 348,
"lpf_stop": 418,
"res_type": "polyphase"
},
"3": {
"sr": 44100,
"hl": 432,
"n_fft": 1280,
"crop_start": 132,
"crop_stop": 614,
"hpf_start": 172,
"hpf_stop": 144,
"res_type": "polyphase"
}
},
"sr": 44100,
"pre_filter_start": 1280,
"pre_filter_stop": 1280
}

112
lib_v5/nets_123812KB.py Normal file
View File

@ -0,0 +1,112 @@
import torch
from torch import nn
import torch.nn.functional as F
from lib_v5 import layers_123821KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
def __call__(self, x):
h, e1 = self.enc1(x)
h, e2 = self.enc2(h)
h, e3 = self.enc3(h)
h, e4 = self.enc4(h)
h = self.aspp(h)
h = self.dec4(h, e4)
h = self.dec3(h, e3)
h = self.dec2(h, e2)
h = self.dec1(h, e1)
return h
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 32)
self.stg1_high_band_net = BaseASPPNet(2, 32)
self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
self.stg2_full_band_net = BaseASPPNet(16, 32)
self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
self.stg3_full_band_net = BaseASPPNet(32, 64)
self.out = nn.Conv2d(64, 2, 1, bias=False)
self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
self.max_bin = n_fft // 2
self.output_bin = n_fft // 2 + 1
self.offset = 128
def forward(self, x, aggressiveness=None):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
h = torch.cat([x, aux1, aux2], dim=1)
h = self.stg3_full_band_net(self.stg3_bridge(h))
mask = torch.sigmoid(self.out(h))
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
return mask * mix
def predict(self, x_mag, aggressiveness=None):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
assert h.size()[3] > 0
return h

112
lib_v5/nets_123821KB.py Normal file
View File

@ -0,0 +1,112 @@
import torch
from torch import nn
import torch.nn.functional as F
from lib_v5 import layers_123821KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
def __call__(self, x):
h, e1 = self.enc1(x)
h, e2 = self.enc2(h)
h, e3 = self.enc3(h)
h, e4 = self.enc4(h)
h = self.aspp(h)
h = self.dec4(h, e4)
h = self.dec3(h, e3)
h = self.dec2(h, e2)
h = self.dec1(h, e1)
return h
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 32)
self.stg1_high_band_net = BaseASPPNet(2, 32)
self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
self.stg2_full_band_net = BaseASPPNet(16, 32)
self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
self.stg3_full_band_net = BaseASPPNet(32, 64)
self.out = nn.Conv2d(64, 2, 1, bias=False)
self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
self.max_bin = n_fft // 2
self.output_bin = n_fft // 2 + 1
self.offset = 128
def forward(self, x, aggressiveness=None):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
h = torch.cat([x, aux1, aux2], dim=1)
h = self.stg3_full_band_net(self.stg3_bridge(h))
mask = torch.sigmoid(self.out(h))
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
return mask * mix
def predict(self, x_mag, aggressiveness=None):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
assert h.size()[3] > 0
return h

113
lib_v5/nets_537238KB.py Normal file
View File

@ -0,0 +1,113 @@
import torch
import numpy as np
from torch import nn
import torch.nn.functional as F
from lib_v5 import layers_537238KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
def __call__(self, x):
h, e1 = self.enc1(x)
h, e2 = self.enc2(h)
h, e3 = self.enc3(h)
h, e4 = self.enc4(h)
h = self.aspp(h)
h = self.dec4(h, e4)
h = self.dec3(h, e3)
h = self.dec2(h, e2)
h = self.dec1(h, e1)
return h
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 64)
self.stg1_high_band_net = BaseASPPNet(2, 64)
self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
self.stg2_full_band_net = BaseASPPNet(32, 64)
self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
self.stg3_full_band_net = BaseASPPNet(64, 128)
self.out = nn.Conv2d(128, 2, 1, bias=False)
self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)
self.max_bin = n_fft // 2
self.output_bin = n_fft // 2 + 1
self.offset = 128
def forward(self, x, aggressiveness=None):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
h = torch.cat([x, aux1, aux2], dim=1)
h = self.stg3_full_band_net(self.stg3_bridge(h))
mask = torch.sigmoid(self.out(h))
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
return mask * mix
def predict(self, x_mag, aggressiveness=None):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
assert h.size()[3] > 0
return h

472
lib_v5/spec_utils.py Normal file
View File

@ -0,0 +1,472 @@
import os
import librosa
import numpy as np
import soundfile as sf
import math
import json
import hashlib
from tqdm import tqdm
def crop_center(h1, h2):
h1_shape = h1.size()
h2_shape = h2.size()
if h1_shape[3] == h2_shape[3]:
return h1
elif h1_shape[3] < h2_shape[3]:
raise ValueError('h1_shape[3] must be greater than h2_shape[3]')
# s_freq = (h2_shape[2] - h1_shape[2]) // 2
# e_freq = s_freq + h1_shape[2]
s_time = (h1_shape[3] - h2_shape[3]) // 2
e_time = s_time + h2_shape[3]
h1 = h1[:, :, :, s_time:e_time]
return h1
def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
if reverse:
wave_left = np.flip(np.asfortranarray(wave[0]))
wave_right = np.flip(np.asfortranarray(wave[1]))
elif mid_side:
wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
elif mid_side_b2:
wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5))
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5))
else:
wave_left = np.asfortranarray(wave[0])
wave_right = np.asfortranarray(wave[1])
spec_left = librosa.stft(wave_left, n_fft, hop_length=hop_length)
spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length)
spec = np.asfortranarray([spec_left, spec_right])
return spec
def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
import threading
if reverse:
wave_left = np.flip(np.asfortranarray(wave[0]))
wave_right = np.flip(np.asfortranarray(wave[1]))
elif mid_side:
wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
elif mid_side_b2:
wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5))
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5))
else:
wave_left = np.asfortranarray(wave[0])
wave_right = np.asfortranarray(wave[1])
def run_thread(**kwargs):
global spec_left
spec_left = librosa.stft(**kwargs)
thread = threading.Thread(target=run_thread, kwargs={'y': wave_left, 'n_fft': n_fft, 'hop_length': hop_length})
thread.start()
spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length)
thread.join()
spec = np.asfortranarray([spec_left, spec_right])
return spec
def combine_spectrograms(specs, mp):
l = min([specs[i].shape[2] for i in specs])
spec_c = np.zeros(shape=(2, mp.param['bins'] + 1, l), dtype=np.complex64)
offset = 0
bands_n = len(mp.param['band'])
for d in range(1, bands_n + 1):
h = mp.param['band'][d]['crop_stop'] - mp.param['band'][d]['crop_start']
spec_c[:, offset:offset+h, :l] = specs[d][:, mp.param['band'][d]['crop_start']:mp.param['band'][d]['crop_stop'], :l]
offset += h
if offset > mp.param['bins']:
raise ValueError('Too much bins')
# lowpass fiter
if mp.param['pre_filter_start'] > 0: # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']:
if bands_n == 1:
spec_c = fft_lp_filter(spec_c, mp.param['pre_filter_start'], mp.param['pre_filter_stop'])
else:
gp = 1
for b in range(mp.param['pre_filter_start'] + 1, mp.param['pre_filter_stop']):
g = math.pow(10, -(b - mp.param['pre_filter_start']) * (3.5 - gp) / 20.0)
gp = g
spec_c[:, b, :] *= g
return np.asfortranarray(spec_c)
def spectrogram_to_image(spec, mode='magnitude'):
if mode == 'magnitude':
if np.iscomplexobj(spec):
y = np.abs(spec)
else:
y = spec
y = np.log10(y ** 2 + 1e-8)
elif mode == 'phase':
if np.iscomplexobj(spec):
y = np.angle(spec)
else:
y = spec
y -= y.min()
y *= 255 / y.max()
img = np.uint8(y)
if y.ndim == 3:
img = img.transpose(1, 2, 0)
img = np.concatenate([
np.max(img, axis=2, keepdims=True), img
], axis=2)
return img
def reduce_vocal_aggressively(X, y, softmask):
v = X - y
y_mag_tmp = np.abs(y)
v_mag_tmp = np.abs(v)
v_mask = v_mag_tmp > y_mag_tmp
y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf)
return y_mag * np.exp(1.j * np.angle(y))
def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
if min_range < fade_size * 2:
raise ValueError('min_range must be >= fade_area * 2')
mag = mag.copy()
idx = np.where(ref.mean(axis=(0, 1)) < thres)[0]
starts = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0])
ends = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1])
uninformative = np.where(ends - starts > min_range)[0]
if len(uninformative) > 0:
starts = starts[uninformative]
ends = ends[uninformative]
old_e = None
for s, e in zip(starts, ends):
if old_e is not None and s - old_e < fade_size:
s = old_e - fade_size * 2
if s != 0:
weight = np.linspace(0, 1, fade_size)
mag[:, :, s:s + fade_size] += weight * ref[:, :, s:s + fade_size]
else:
s -= fade_size
if e != mag.shape[2]:
weight = np.linspace(1, 0, fade_size)
mag[:, :, e - fade_size:e] += weight * ref[:, :, e - fade_size:e]
else:
e += fade_size
mag[:, :, s + fade_size:e - fade_size] += ref[:, :, s + fade_size:e - fade_size]
old_e = e
return mag
def align_wave_head_and_tail(a, b):
l = min([a[0].size, b[0].size])
return a[:l,:l], b[:l,:l]
def cache_or_load(mix_path, inst_path, mp):
mix_basename = os.path.splitext(os.path.basename(mix_path))[0]
inst_basename = os.path.splitext(os.path.basename(inst_path))[0]
cache_dir = 'mph{}'.format(hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode('utf-8')).hexdigest())
mix_cache_dir = os.path.join('cache', cache_dir)
inst_cache_dir = os.path.join('cache', cache_dir)
os.makedirs(mix_cache_dir, exist_ok=True)
os.makedirs(inst_cache_dir, exist_ok=True)
mix_cache_path = os.path.join(mix_cache_dir, mix_basename + '.npy')
inst_cache_path = os.path.join(inst_cache_dir, inst_basename + '.npy')
if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path):
X_spec_m = np.load(mix_cache_path)
y_spec_m = np.load(inst_cache_path)
else:
X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
for d in range(len(mp.param['band']), 0, -1):
bp = mp.param['band'][d]
if d == len(mp.param['band']): # high-end band
X_wave[d], _ = librosa.load(
mix_path, bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
y_wave[d], _ = librosa.load(
inst_path, bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
else: # lower bands
X_wave[d] = librosa.resample(X_wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])
y_wave[d] = librosa.resample(y_wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])
X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d])
X_spec_s[d] = wave_to_spectrogram(X_wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
y_spec_s[d] = wave_to_spectrogram(y_wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
del X_wave, y_wave
X_spec_m = combine_spectrograms(X_spec_s, mp)
y_spec_m = combine_spectrograms(y_spec_s, mp)
if X_spec_m.shape != y_spec_m.shape:
raise ValueError('The combined spectrograms are different: ' + mix_path)
_, ext = os.path.splitext(mix_path)
np.save(mix_cache_path, X_spec_m)
np.save(inst_cache_path, y_spec_m)
return X_spec_m, y_spec_m
def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse):
spec_left = np.asfortranarray(spec[0])
spec_right = np.asfortranarray(spec[1])
wave_left = librosa.istft(spec_left, hop_length=hop_length)
wave_right = librosa.istft(spec_right, hop_length=hop_length)
if reverse:
return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
elif mid_side:
return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
elif mid_side_b2:
return np.asfortranarray([np.add(wave_right / 1.25, .4 * wave_left), np.subtract(wave_left / 1.25, .4 * wave_right)])
else:
return np.asfortranarray([wave_left, wave_right])
def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
import threading
spec_left = np.asfortranarray(spec[0])
spec_right = np.asfortranarray(spec[1])
def run_thread(**kwargs):
global wave_left
wave_left = librosa.istft(**kwargs)
thread = threading.Thread(target=run_thread, kwargs={'stft_matrix': spec_left, 'hop_length': hop_length})
thread.start()
wave_right = librosa.istft(spec_right, hop_length=hop_length)
thread.join()
if reverse:
return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
elif mid_side:
return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
elif mid_side_b2:
return np.asfortranarray([np.add(wave_right / 1.25, .4 * wave_left), np.subtract(wave_left / 1.25, .4 * wave_right)])
else:
return np.asfortranarray([wave_left, wave_right])
def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None):
wave_band = {}
bands_n = len(mp.param['band'])
offset = 0
for d in range(1, bands_n + 1):
bp = mp.param['band'][d]
spec_s = np.ndarray(shape=(2, bp['n_fft'] // 2 + 1, spec_m.shape[2]), dtype=complex)
h = bp['crop_stop'] - bp['crop_start']
spec_s[:, bp['crop_start']:bp['crop_stop'], :] = spec_m[:, offset:offset+h, :]
offset += h
if d == bands_n: # higher
if extra_bins_h: # if --high_end_process bypass
max_bin = bp['n_fft'] // 2
spec_s[:, max_bin-extra_bins_h:max_bin, :] = extra_bins[:, :extra_bins_h, :]
if bp['hpf_start'] > 0:
spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1)
if bands_n == 1:
wave = spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
else:
wave = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']))
else:
sr = mp.param['band'][d+1]['sr']
if d == 1: # lower
spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'])
wave = librosa.resample(spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']), bp['sr'], sr, res_type="sinc_fastest")
else: # mid
spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1)
spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'])
wave2 = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']))
wave = librosa.resample(wave2, bp['sr'], sr, res_type="sinc_fastest")
return wave.T
def fft_lp_filter(spec, bin_start, bin_stop):
g = 1.0
for b in range(bin_start, bin_stop):
g -= 1 / (bin_stop - bin_start)
spec[:, b, :] = g * spec[:, b, :]
spec[:, bin_stop:, :] *= 0
return spec
def fft_hp_filter(spec, bin_start, bin_stop):
g = 1.0
for b in range(bin_start, bin_stop, -1):
g -= 1 / (bin_start - bin_stop)
spec[:, b, :] = g * spec[:, b, :]
spec[:, 0:bin_stop+1, :] *= 0
return spec
def mirroring(a, spec_m, input_high_end, mp):
if 'mirroring' == a:
mirror = np.flip(np.abs(spec_m[:, mp.param['pre_filter_start']-10-input_high_end.shape[1]:mp.param['pre_filter_start']-10, :]), 1)
mirror = mirror * np.exp(1.j * np.angle(input_high_end))
return np.where(np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror)
if 'mirroring2' == a:
mirror = np.flip(np.abs(spec_m[:, mp.param['pre_filter_start']-10-input_high_end.shape[1]:mp.param['pre_filter_start']-10, :]), 1)
mi = np.multiply(mirror, input_high_end * 1.7)
return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi)
def ensembling(a, specs):
for i in range(1, len(specs)):
if i == 1:
spec = specs[0]
ln = min([spec.shape[2], specs[i].shape[2]])
spec = spec[:,:,:ln]
specs[i] = specs[i][:,:,:ln]
if 'min_mag' == a:
spec = np.where(np.abs(specs[i]) <= np.abs(spec), specs[i], spec)
if 'max_mag' == a:
spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec)
return spec
if __name__ == "__main__":
import cv2
import sys
import time
import argparse
from model_param_init import ModelParameters
p = argparse.ArgumentParser()
p.add_argument('--algorithm', '-a', type=str, choices=['invert', 'invert_p', 'min_mag', 'max_mag', 'deep', 'align'], default='min_mag')
p.add_argument('--model_params', '-m', type=str, default=os.path.join('modelparams', '1band_sr44100_hl512.json'))
p.add_argument('--output_name', '-o', type=str, default='output')
p.add_argument('--vocals_only', '-v', action='store_true')
p.add_argument('input', nargs='+')
args = p.parse_args()
start_time = time.time()
if args.algorithm.startswith('invert') and len(args.input) != 2:
raise ValueError('There should be two input files.')
if not args.algorithm.startswith('invert') and len(args.input) < 2:
raise ValueError('There must be at least two input files.')
wave, specs = {}, {}
mp = ModelParameters(args.model_params)
for i in range(len(args.input)):
spec = {}
for d in range(len(mp.param['band']), 0, -1):
bp = mp.param['band'][d]
if d == len(mp.param['band']): # high-end band
wave[d], _ = librosa.load(
args.input[i], bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
if len(wave[d].shape) == 1: # mono to stereo
wave[d] = np.array([wave[d], wave[d]])
else: # lower bands
wave[d] = librosa.resample(wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])
spec[d] = wave_to_spectrogram(wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
specs[i] = combine_spectrograms(spec, mp)
del wave
if args.algorithm == 'deep':
d_spec = np.where(np.abs(specs[0]) <= np.abs(spec[1]), specs[0], spec[1])
v_spec = d_spec - specs[1]
sf.write(os.path.join('{}.wav'.format(args.output_name)), cmb_spectrogram_to_wave(v_spec, mp), mp.param['sr'])
if args.algorithm.startswith('invert'):
ln = min([specs[0].shape[2], specs[1].shape[2]])
specs[0] = specs[0][:,:,:ln]
specs[1] = specs[1][:,:,:ln]
if 'invert_p' == args.algorithm:
X_mag = np.abs(specs[0])
y_mag = np.abs(specs[1])
max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
v_spec = specs[1] - max_mag * np.exp(1.j * np.angle(specs[0]))
else:
specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2)
v_spec = specs[0] - specs[1]
if not args.vocals_only:
X_mag = np.abs(specs[0])
y_mag = np.abs(specs[1])
v_mag = np.abs(v_spec)
X_image = spectrogram_to_image(X_mag)
y_image = spectrogram_to_image(y_mag)
v_image = spectrogram_to_image(v_mag)
cv2.imwrite('{}_X.png'.format(args.output_name), X_image)
cv2.imwrite('{}_y.png'.format(args.output_name), y_image)
cv2.imwrite('{}_v.png'.format(args.output_name), v_image)
sf.write('{}_X.wav'.format(args.output_name), cmb_spectrogram_to_wave(specs[0], mp), mp.param['sr'])
sf.write('{}_y.wav'.format(args.output_name), cmb_spectrogram_to_wave(specs[1], mp), mp.param['sr'])
sf.write('{}_v.wav'.format(args.output_name), cmb_spectrogram_to_wave(v_spec, mp), mp.param['sr'])
else:
if not args.algorithm == 'deep':
sf.write(os.path.join('ensembled','{}.wav'.format(args.output_name)), cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp), mp.param['sr'])
if args.algorithm == 'align':
trackalignment = [
{
'file1':'"{}"'.format(args.input[0]),
'file2':'"{}"'.format(args.input[1])
}
]
for i,e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."):
os.system(f"python lib/align_tracks.py {e['file1']} {e['file2']}")
#print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1))