diff --git a/lib_v4/dataset.py b/lib_v4/dataset.py
deleted file mode 100644
index 6e515b2..0000000
--- a/lib_v4/dataset.py
+++ /dev/null
@@ -1,170 +0,0 @@
-import os
-import random
-
-import numpy as np
-import torch
-import torch.utils.data
-from tqdm import tqdm
-
-from lib_v4 import spec_utils
-
-
-class VocalRemoverValidationSet(torch.utils.data.Dataset):
-
-    def __init__(self, patch_list):
-        self.patch_list = patch_list
-
-    def __len__(self):
-        return len(self.patch_list)
-
-    def __getitem__(self, idx):
-        path = self.patch_list[idx]
-        data = np.load(path)
-
-        X, y = data['X'], data['y']
-
-        X_mag = np.abs(X)
-        y_mag = np.abs(y)
-
-        return X_mag, y_mag
-
-
-def make_pair(mix_dir, inst_dir):
-    input_exts = ['.wav', '.m4a', '.mp3', '.mp4', '.flac']
-
-    X_list = sorted([
-        os.path.join(mix_dir, fname)
-        for fname in os.listdir(mix_dir)
-        if os.path.splitext(fname)[1] in input_exts])
-    y_list = sorted([
-        os.path.join(inst_dir, fname)
-        for fname in os.listdir(inst_dir)
-        if os.path.splitext(fname)[1] in input_exts])
-
-    filelist = list(zip(X_list, y_list))
-
-    return filelist
-
-
-def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
-    if split_mode == 'random':
-        filelist = make_pair(
-            os.path.join(dataset_dir, 'mixtures'),
-            os.path.join(dataset_dir, 'instruments'))
-
-        random.shuffle(filelist)
-
-        if len(val_filelist) == 0:
-            val_size = int(len(filelist) * val_rate)
-            train_filelist = filelist[:-val_size]
-            val_filelist = filelist[-val_size:]
-        else:
-            train_filelist = [
-                pair for pair in filelist
-                if list(pair) not in val_filelist]
-    elif split_mode == 'subdirs':
-        if len(val_filelist) != 0:
-            raise ValueError('The `val_filelist` option is not available in `subdirs` mode')
-
-        train_filelist = make_pair(
-            os.path.join(dataset_dir, 'training/mixtures'),
-            os.path.join(dataset_dir, 'training/instruments'))
-
-        val_filelist = make_pair(
-            os.path.join(dataset_dir, 'validation/mixtures'),
-            os.path.join(dataset_dir, 'validation/instruments'))
-
-    return train_filelist, val_filelist
-
-
-def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha):
-    perm = np.random.permutation(len(X))
-    for i, idx in enumerate(tqdm(perm)):
-        if np.random.uniform() < reduction_rate:
-            y[idx] = spec_utils.reduce_vocal_aggressively(X[idx], y[idx], reduction_mask)
-
-        if np.random.uniform() < 0.5:
-            # swap channel
-            X[idx] = X[idx, ::-1]
-            y[idx] = y[idx, ::-1]
-        if np.random.uniform() < 0.02:
-            # mono
-            X[idx] = X[idx].mean(axis=0, keepdims=True)
-            y[idx] = y[idx].mean(axis=0, keepdims=True)
-        if np.random.uniform() < 0.02:
-            # inst
-            X[idx] = y[idx]
-
-        if np.random.uniform() < mixup_rate and i < len(perm) - 1:
-            lam = np.random.beta(mixup_alpha, mixup_alpha)
-            X[idx] = lam * X[idx] + (1 - lam) * X[perm[i + 1]]
-            y[idx] = lam * y[idx] + (1 - lam) * y[perm[i + 1]]
-
-    return X, y
-
-
-def make_padding(width, cropsize, offset):
-    left = offset
-    roi_size = cropsize - left * 2
-    if roi_size == 0:
-        roi_size = cropsize
-    right = roi_size - (width % roi_size) + left
-
-    return left, right, roi_size
-
-
-def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset):
-    len_dataset = patches * len(filelist)
-
-    X_dataset = np.zeros(
-        (len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
-    y_dataset = np.zeros(
-        (len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
-
-    for i, (X_path, y_path) in enumerate(tqdm(filelist)):
-        X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
-        coef = np.max([np.abs(X).max(), np.abs(y).max()])
-        X, y = X / coef, y / coef
-
-        l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
-        X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant')
-        y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode='constant')
-
-        starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches)
-        ends = starts + cropsize
-        for j in range(patches):
-            idx = i * patches + j
-            X_dataset[idx] = X_pad[:, :, starts[j]:ends[j]]
-            y_dataset[idx] = y_pad[:, :, starts[j]:ends[j]]
-
-    return X_dataset, y_dataset
-
-
-def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
-    patch_list = []
-    patch_dir = 'cs{}_sr{}_hl{}_nf{}_of{}'.format(cropsize, sr, hop_length, n_fft, offset)
-    os.makedirs(patch_dir, exist_ok=True)
-
-    for i, (X_path, y_path) in enumerate(tqdm(filelist)):
-        basename = os.path.splitext(os.path.basename(X_path))[0]
-
-        X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
-        coef = np.max([np.abs(X).max(), np.abs(y).max()])
-        X, y = X / coef, y / coef
-
-        l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
-        X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant')
-        y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode='constant')
-
-        len_dataset = int(np.ceil(X.shape[2] / roi_size))
-        for j in range(len_dataset):
-            outpath = os.path.join(patch_dir, '{}_p{}.npz'.format(basename, j))
-            start = j * roi_size
-            if not os.path.exists(outpath):
-                np.savez(
-                    outpath,
-                    X=X_pad[:, :, start:start + cropsize],
-                    y=y_pad[:, :, start:start + cropsize])
-            patch_list.append(outpath)
-
-    return VocalRemoverValidationSet(patch_list)
diff --git a/lib_v4/layers.py b/lib_v4/layers.py
deleted file mode 100644
index 48bc975..0000000
--- a/lib_v4/layers.py
+++ /dev/null
@@ -1,116 +0,0 @@
-import torch
-from torch import nn
-import torch.nn.functional as F
-
-from lib_v4 import spec_utils
-
-
-class Conv2DBNActiv(nn.Module):
-
-    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
-        super(Conv2DBNActiv, self).__init__()
-        self.conv = nn.Sequential(
-            nn.Conv2d(
-                nin, nout,
-                kernel_size=ksize,
-                stride=stride,
-                padding=pad,
-                dilation=dilation,
-                bias=False),
-            nn.BatchNorm2d(nout),
-            activ()
-        )
-
-    def __call__(self, x):
-        return self.conv(x)
-
-
-class SeperableConv2DBNActiv(nn.Module):
-
-    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
-        super(SeperableConv2DBNActiv, self).__init__()
-        self.conv = nn.Sequential(
-            nn.Conv2d(
-                nin, nin,
-                kernel_size=ksize,
-                stride=stride,
-                padding=pad,
-                dilation=dilation,
-                groups=nin,
-                bias=False),
-            nn.Conv2d(
-                nin, nout,
-                kernel_size=1,
-                bias=False),
-            nn.BatchNorm2d(nout),
-            activ()
-        )
-
-    def __call__(self, x):
-        return self.conv(x)
-
-
-class Encoder(nn.Module):
-
-    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
-        super(Encoder, self).__init__()
-        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
-        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
-
-    def __call__(self, x):
-        skip = self.conv1(x)
-        h = self.conv2(skip)
-
-        return h, skip
-
-
-class Decoder(nn.Module):
-
-    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
-        super(Decoder, self).__init__()
-        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
-        self.dropout = nn.Dropout2d(0.1) if dropout else None
-
-    def __call__(self, x, skip=None):
-        x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
-        if skip is not None:
-            skip = spec_utils.crop_center(skip, x)
-            x = torch.cat([x, skip], dim=1)
-        h = self.conv(x)
-
-        if self.dropout is not None:
-            h = self.dropout(h)
-
-        return h
-
-
-class ASPPModule(nn.Module):
-
-    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
-        super(ASPPModule, self).__init__()
-        self.conv1 = nn.Sequential(
-            nn.AdaptiveAvgPool2d((1, None)),
-            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
-        )
-        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
-        self.conv3 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
-        self.conv4 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
-        self.conv5 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
-        self.bottleneck = nn.Sequential(
-            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
-            nn.Dropout2d(0.1)
-        )
-
-    def forward(self, x):
-        _, _, h, w = x.size()
-        feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
-        feat2 = self.conv2(x)
-        feat3 = self.conv3(x)
-        feat4 = self.conv4(x)
-        feat5 = self.conv5(x)
-        out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
-        bottle = self.bottleneck(out)
-        return bottle
diff --git a/lib_v4/nets.py b/lib_v4/nets.py
deleted file mode 100644
index 266e63a..0000000
--- a/lib_v4/nets.py
+++ /dev/null
@@ -1,108 +0,0 @@
-import torch
-from torch import nn
-import torch.nn.functional as F
-
-from lib_v4 import layers
-
-
-class BaseASPPNet(nn.Module):
-
-    def __init__(self, nin, ch, dilations=(4, 8, 16)):
-        super(BaseASPPNet, self).__init__()
-        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
-        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
-        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
-        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
-
-        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
-
-        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
-        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
-        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
-        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
-
-    def __call__(self, x):
-        h, e1 = self.enc1(x)
-        h, e2 = self.enc2(h)
-        h, e3 = self.enc3(h)
-        h, e4 = self.enc4(h)
-
-        h = self.aspp(h)
-
-        h = self.dec4(h, e4)
-        h = self.dec3(h, e3)
-        h = self.dec2(h, e2)
-        h = self.dec1(h, e1)
-
-        return h
-
-
-class CascadedASPPNet(nn.Module):
-
-    def __init__(self, n_fft):
-        super(CascadedASPPNet, self).__init__()
-        self.stg1_low_band_net = BaseASPPNet(2, 16)
-        self.stg1_high_band_net = BaseASPPNet(2, 16)
-
-        self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
-        self.stg2_full_band_net = BaseASPPNet(8, 16)
-
-        self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
-        self.stg3_full_band_net = BaseASPPNet(16, 32)
-
-        self.out = nn.Conv2d(32, 2, 1, bias=False)
-        self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
-        self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)
-
-        self.max_bin = n_fft // 2
-        self.output_bin = n_fft // 2 + 1
-
-        self.offset = 128
-
-    def forward(self, x):
-        mix = x.detach()
-        x = x.clone()
-
-        x = x[:, :, :self.max_bin]
-
-        bandw = x.size()[2] // 2
-        aux1 = torch.cat([
-            self.stg1_low_band_net(x[:, :, :bandw]),
-            self.stg1_high_band_net(x[:, :, bandw:])
-        ], dim=2)
-
-        h = torch.cat([x, aux1], dim=1)
-        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
-
-        h = torch.cat([x, aux1, aux2], dim=1)
-        h = self.stg3_full_band_net(self.stg3_bridge(h))
-
-        mask = torch.sigmoid(self.out(h))
-        mask = F.pad(
-            input=mask,
-            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
-            mode='replicate')
-
-        if self.training:
-            aux1 = torch.sigmoid(self.aux1_out(aux1))
-            aux1 = F.pad(
-                input=aux1,
-                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
-                mode='replicate')
-            aux2 = torch.sigmoid(self.aux2_out(aux2))
-            aux2 = F.pad(
-                input=aux2,
-                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
-                mode='replicate')
-            return mask * mix, aux1 * mix, aux2 * mix
-        else:
-            return mask * mix
-
-    def predict(self, x_mag):
-        h = self.forward(x_mag)
-
-        if self.offset > 0:
-            h = h[:, :, :, self.offset:-self.offset]
-            assert h.size()[3] > 0
-
-        return h
diff --git a/lib_v4/spec_utils.py b/lib_v4/spec_utils.py
deleted file mode 100644
index 8cf19f8..0000000
--- a/lib_v4/spec_utils.py
+++ /dev/null
@@ -1,216 +0,0 @@
-import os
-
-import librosa
-import numpy as np
-import soundfile as sf
-
-
-def crop_center(h1, h2):
-    h1_shape = h1.size()
-    h2_shape = h2.size()
-
-    if h1_shape[3] == h2_shape[3]:
-        return h1
-    elif h1_shape[3] < h2_shape[3]:
-        raise ValueError('h1_shape[3] must be greater than h2_shape[3]')
-
-    # s_freq = (h2_shape[2] - h1_shape[2]) // 2
-    # e_freq = s_freq + h1_shape[2]
-    s_time = (h1_shape[3] - h2_shape[3]) // 2
-    e_time = s_time + h2_shape[3]
-    h1 = h1[:, :, :, s_time:e_time]
-
-    return h1
-
-
-def wave_to_spectrogram(wave, hop_length, n_fft):
-    wave_left = np.asfortranarray(wave[0])
-    wave_right = np.asfortranarray(wave[1])
-
-    spec_left = librosa.stft(wave_left, n_fft, hop_length=hop_length)
-    spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length)
-    spec = np.asfortranarray([spec_left, spec_right])
-
-    return spec
-
-
-def spectrogram_to_image(spec, mode='magnitude'):
-    if mode == 'magnitude':
-        if np.iscomplexobj(spec):
-            y = np.abs(spec)
-        else:
-            y = spec
-        y = np.log10(y ** 2 + 1e-8)
-    elif mode == 'phase':
-        if np.iscomplexobj(spec):
-            y = np.angle(spec)
-        else:
-            y = spec
-
-    y -= y.min()
-    y *= 255 / y.max()
-    img = np.uint8(y)
-
-    if y.ndim == 3:
-        img = img.transpose(1, 2, 0)
-        img = np.concatenate([
-            np.max(img, axis=2, keepdims=True), img
-        ], axis=2)
-
-    return img
-
-
-def reduce_vocal_aggressively(X, y, softmask):
-    v = X - y
-    y_mag_tmp = np.abs(y)
-    v_mag_tmp = np.abs(v)
-
-    v_mask = v_mag_tmp > y_mag_tmp
-    y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf)
-
-    return y_mag * np.exp(1.j * np.angle(y))
-
-
-def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
-    if min_range < fade_size * 2:
-        raise ValueError('min_range must be >= fade_area * 2')
-
-    mag = mag.copy()
-
-    idx = np.where(ref.mean(axis=(0, 1)) < thres)[0]
-    starts = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0])
-    ends = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1])
-    uninformative = np.where(ends - starts > min_range)[0]
-    if len(uninformative) > 0:
-        starts = starts[uninformative]
-        ends = ends[uninformative]
-        old_e = None
-        for s, e in zip(starts, ends):
-            if old_e is not None and s - old_e < fade_size:
-                s = old_e - fade_size * 2
-
-            if s != 0:
-                weight = np.linspace(0, 1, fade_size)
-                mag[:, :, s:s + fade_size] += weight * ref[:, :, s:s + fade_size]
-            else:
-                s -= fade_size
-
-            if e != mag.shape[2]:
-                weight = np.linspace(1, 0, fade_size)
-                mag[:, :, e - fade_size:e] += weight * ref[:, :, e - fade_size:e]
-            else:
-                e += fade_size
-
-            mag[:, :, s + fade_size:e - fade_size] += ref[:, :, s + fade_size:e - fade_size]
-            old_e = e
-
-    return mag
-
-
-def align_wave_head_and_tail(a, b, sr):
-    a, _ = librosa.effects.trim(a)
-    b, _ = librosa.effects.trim(b)
-
-    a_mono = a[:, :sr * 4].sum(axis=0)
-    b_mono = b[:, :sr * 4].sum(axis=0)
-
-    a_mono -= a_mono.mean()
-    b_mono -= b_mono.mean()
-
-    offset = len(a_mono) - 1
-    delay = np.argmax(np.correlate(a_mono, b_mono, 'full')) - offset
-
-    if delay > 0:
-        a = a[:, delay:]
-    else:
-        b = b[:, np.abs(delay):]
-
-    if a.shape[1] < b.shape[1]:
-        b = b[:, :a.shape[1]]
-    else:
-        a = a[:, :b.shape[1]]
-
-    return a, b
-
-
-def cache_or_load(mix_path, inst_path, sr, hop_length, n_fft):
-    mix_basename = os.path.splitext(os.path.basename(mix_path))[0]
-    inst_basename = os.path.splitext(os.path.basename(inst_path))[0]
-
-    cache_dir = 'sr{}_hl{}_nf{}'.format(sr, hop_length, n_fft)
-    mix_cache_dir = os.path.join(os.path.dirname(mix_path), cache_dir)
-    inst_cache_dir = os.path.join(os.path.dirname(inst_path), cache_dir)
-    os.makedirs(mix_cache_dir, exist_ok=True)
-    os.makedirs(inst_cache_dir, exist_ok=True)
-
-    mix_cache_path = os.path.join(mix_cache_dir, mix_basename + '.npy')
-    inst_cache_path = os.path.join(inst_cache_dir, inst_basename + '.npy')
-
-    if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path):
-        X = np.load(mix_cache_path)
-        y = np.load(inst_cache_path)
-    else:
-        X, _ = librosa.load(
-            mix_path, sr, False, dtype=np.float32, res_type='kaiser_fast')
-        y, _ = librosa.load(
-            inst_path, sr, False, dtype=np.float32, res_type='kaiser_fast')
-
-        X, y = align_wave_head_and_tail(X, y, sr)
-
-        X = wave_to_spectrogram(X, hop_length, n_fft)
-        y = wave_to_spectrogram(y, hop_length, n_fft)
-
-        _, ext = os.path.splitext(mix_path)
-        np.save(mix_cache_path, X)
-        np.save(inst_cache_path, y)
-
-    return X, y
-
-
-def spectrogram_to_wave(spec, hop_length=1024):
-    spec_left = np.asfortranarray(spec[0])
-    spec_right = np.asfortranarray(spec[1])
-
-    wave_left = librosa.istft(spec_left, hop_length=hop_length)
-    wave_right = librosa.istft(spec_right, hop_length=hop_length)
-    wave = np.asfortranarray([wave_left, wave_right])
-
-    return wave
-
-
-if __name__ == "__main__":
-    import cv2
-    import sys
-
-    X, _ = librosa.load(
-        sys.argv[1], 44100, False, dtype=np.float32, res_type='kaiser_fast')
-    y, _ = librosa.load(
-        sys.argv[2], 44100, False, dtype=np.float32, res_type='kaiser_fast')
-
-    X, y = align_wave_head_and_tail(X, y, 44100)
-
-    X_spec = wave_to_spectrogram(X, 1024, 2048)
-    y_spec = wave_to_spectrogram(y, 1024, 2048)
-
-    y_spec = reduce_vocal_aggressively(X_spec, y_spec, 0.2)
-    v_spec = X_spec - y_spec
-
-    # v_mask = np.abs(v_spec) > np.abs(y_spec)
-    # y_spec = X_spec - v_spec * v_mask
-    # v_spec = X_spec - y_spec
-
-    X_mag = np.abs(X_spec)
-    y_mag = np.abs(y_spec)
-    v_mag = np.abs(v_spec)
-
-    X_image = spectrogram_to_image(X_mag)
-    y_image = spectrogram_to_image(y_mag)
-    v_image = spectrogram_to_image(v_mag)
-
-    cv2.imwrite('test_X.jpg', X_image)
-    cv2.imwrite('test_y.jpg', y_image)
-    cv2.imwrite('test_v.jpg', v_image)
-
-    sf.write('test_X.wav', spectrogram_to_wave(X_spec).T, 44100)
-    sf.write('test_y.wav', spectrogram_to_wave(y_spec).T, 44100)
-    sf.write('test_v.wav', spectrogram_to_wave(v_spec).T, 44100)