# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import math

import julius
from torch import nn

from .tasnet_v2 import ConvTasNet
from .utils import capture_init, center_trim


class BLSTM(nn.Module):
    def __init__(self, dim, layers=1):
        super().__init__()
        self.lstm = nn.LSTM(bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim)
        self.linear = nn.Linear(2 * dim, dim)

    def forward(self, x):
        x = x.permute(2, 0, 1)
        x = self.lstm(x)[0]
        x = self.linear(x)
        x = x.permute(1, 2, 0)
        return x
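
# Shape note (a minimal sketch, not from the original file): BLSTM consumes and
# returns tensors shaped (batch, dim, time); the permutes move time to the front
# for nn.LSTM and back afterwards. Assuming `import torch`:
#
#   blstm = BLSTM(dim=64)
#   y = blstm(torch.randn(1, 64, 100))  # -> torch.Size([1, 64, 100])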


def rescale_conv(conv, reference):
    std = conv.weight.std().detach()
    scale = (std / reference)**0.5
    conv.weight.data /= scale
    if conv.bias is not None:
        conv.bias.data /= scale


def rescale_module(module, reference):
    for sub in module.modules():
        if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d)):
            rescale_conv(sub, reference)
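
# Rescaling note (illustrative, not from the original file): dividing the weight
# by (std / reference) ** 0.5 sets the new std to sqrt(std * reference), the
# geometric mean, pulling it toward `reference` without forcing it there:
#
#   conv = nn.Conv1d(2, 4, 8)
#   rescale_conv(conv, reference=0.1)
#   # conv.weight.std() is now roughly sqrt(old_std * 0.1)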


def auto_load_demucs_model_v2(sources, demucs_model_name):
    if '48' in demucs_model_name:
        channels = 48
    elif 'unittest' in demucs_model_name:
        channels = 4
    else:
        channels = 64

    if 'tasnet' in demucs_model_name:
        init_demucs_model = ConvTasNet(sources, X=10)
    else:
        init_demucs_model = Demucs(sources, channels=channels)

    return init_demucs_model
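
# Usage sketch (hedged; the model name below is hypothetical and only needs to
# contain '48', 'unittest', or 'tasnet' to steer the dispatch):
#
#   sources = ["drums", "bass", "other", "vocals"]
#   model = auto_load_demucs_model_v2(sources, "demucs48_hq")  # Demucs(channels=48)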


class Demucs(nn.Module):
    @capture_init
    def __init__(self,
                 sources,
                 audio_channels=2,
                 channels=64,
                 depth=6,
                 rewrite=True,
                 glu=True,
                 rescale=0.1,
                 resample=True,
                 kernel_size=8,
                 stride=4,
                 growth=2.,
                 lstm_layers=2,
                 context=3,
                 normalize=False,
                 samplerate=44100,
                 segment_length=4 * 10 * 44100):
        """
        Args:
            sources (list[str]): list of source names
            audio_channels (int): stereo or mono
            channels (int): first convolution channels
            depth (int): number of encoder/decoder layers
            rewrite (bool): add 1x1 convolution to each encoder layer
                and a convolution to each decoder layer.
                For the decoder layer, `context` gives the kernel size.
            glu (bool): use GLU instead of ReLU
            resample (bool): upsample x2 the input and downsample /2 the output.
            rescale (float): rescale initial weights of convolutions
                to get their standard deviation closer to `rescale`
            kernel_size (int): kernel size for convolutions
            stride (int): stride for convolutions
            growth (float): multiply (resp. divide) the number of channels by that
                for each layer of the encoder (resp. decoder)
            lstm_layers (int): number of LSTM layers, 0 = no LSTM
            context (int): kernel size of the convolution in the
                decoder before the transposed convolution. If > 1,
                will provide some context from neighboring time steps.
            normalize (bool): normalize the input by the mean and std of the
                mono mixture, and undo the scaling on the output.
            samplerate (int): stored as meta information for easing
                future evaluations of the model.
            segment_length (int): stored as meta information for easing
                future evaluations of the model. Length of the segments on which
                the model was trained.
        """
        super().__init__()
        self.audio_channels = audio_channels
        self.sources = sources
        self.kernel_size = kernel_size
        self.context = context
        self.stride = stride
        self.depth = depth
        self.resample = resample
        self.channels = channels
        self.normalize = normalize
        self.samplerate = samplerate
        self.segment_length = segment_length

        self.encoder = nn.ModuleList()
        self.decoder = nn.ModuleList()

        if glu:
            activation = nn.GLU(dim=1)
            ch_scale = 2
        else:
            activation = nn.ReLU()
            ch_scale = 1
        in_channels = audio_channels
        for index in range(depth):
            encode = []
            encode += [nn.Conv1d(in_channels, channels, kernel_size, stride), nn.ReLU()]
            if rewrite:
                encode += [nn.Conv1d(channels, ch_scale * channels, 1), activation]
            self.encoder.append(nn.Sequential(*encode))

            decode = []
            if index > 0:
                out_channels = in_channels
            else:
                out_channels = len(self.sources) * audio_channels
            if rewrite:
                decode += [nn.Conv1d(channels, ch_scale * channels, context), activation]
            decode += [nn.ConvTranspose1d(channels, out_channels, kernel_size, stride)]
            if index > 0:
                decode.append(nn.ReLU())
            self.decoder.insert(0, nn.Sequential(*decode))
            in_channels = channels
            channels = int(growth * channels)

        channels = in_channels

        if lstm_layers:
            self.lstm = BLSTM(channels, lstm_layers)
        else:
            self.lstm = None

        if rescale:
            rescale_module(self, reference=rescale)
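
    # Channel-width note (illustrative, not from the original file): with the
    # defaults (channels=64, growth=2, depth=6), the encoder widths run
    # 64, 128, 256, 512, 1024, 2048; the decoder mirrors them in reverse, and
    # the BLSTM (when enabled) operates at the innermost width, 2048.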
    def valid_length(self, length):
        """
        Return the nearest valid length to use with the model so that
        there are no time steps left over in the convolutions, e.g. for all
        layers, (size of the input - kernel_size) % stride == 0.

        If the mixture has a valid length, the estimated sources
        will have exactly the same length when context = 1. If context > 1,
        the two signals can be center trimmed to match.

        For training, extracts should have a valid length. For evaluation
        on full tracks we recommend passing `pad = True` to :meth:`forward`.
        """
        if self.resample:
            length *= 2
        for _ in range(self.depth):
            length = math.ceil((length - self.kernel_size) / self.stride) + 1
            length = max(1, length)
            length += self.context - 1
        for _ in range(self.depth):
            length = (length - 1) * self.stride + self.kernel_size

        if self.resample:
            length = math.ceil(length / 2)
        return int(length)
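
    # Padding sketch (hedged; `mix` and `model` are hypothetical placeholders,
    # and center_trim is imported at the top of this file):
    #
    #   import torch.nn.functional as F
    #   length = mix.shape[-1]
    #   delta = model.valid_length(length) - length
    #   padded = F.pad(mix, (delta // 2, delta - delta // 2))
    #   out = center_trim(model(padded), length)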
    def forward(self, mix):
        x = mix

        if self.normalize:
            mono = mix.mean(dim=1, keepdim=True)
            mean = mono.mean(dim=-1, keepdim=True)
            std = mono.std(dim=-1, keepdim=True)
        else:
            mean = 0
            std = 1

        x = (x - mean) / (1e-5 + std)

        if self.resample:
            x = julius.resample_frac(x, 1, 2)

        saved = []
        for encode in self.encoder:
            x = encode(x)
            saved.append(x)
        if self.lstm:
            x = self.lstm(x)
        for decode in self.decoder:
            skip = center_trim(saved.pop(-1), x)
            x = x + skip
            x = decode(x)

        if self.resample:
            x = julius.resample_frac(x, 2, 1)
        x = x * std + mean
        x = x.view(x.size(0), len(self.sources), self.audio_channels, x.size(-1))
        return x
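
# End-to-end sketch (hedged; shapes assume the default four-source setup):
#
#   import torch
#   model = Demucs(["drums", "bass", "other", "vocals"])
#   mix = torch.randn(1, 2, model.valid_length(44100))
#   with torch.no_grad():
#       out = model(mix)  # (batch, n_sources, audio_channels, time) == (1, 4, 2, T)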