# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import math

import julius
from torch import nn

from .tasnet_v2 import ConvTasNet
from .utils import capture_init, center_trim


class BLSTM(nn.Module):
    def __init__(self, dim, layers=1):
        super().__init__()
        self.lstm = nn.LSTM(bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim)
        self.linear = nn.Linear(2 * dim, dim)

    def forward(self, x):
        x = x.permute(2, 0, 1)
        x = self.lstm(x)[0]
        x = self.linear(x)
        x = x.permute(1, 2, 0)
        return x
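
# Shape note (a minimal sketch, not from the original file): BLSTM consumes and
# returns tensors shaped (batch, dim, time); the permutes move time to the front
# for nn.LSTM and back afterwards. Assuming `import torch`:
#
#   blstm = BLSTM(dim=64)
#   y = blstm(torch.randn(1, 64, 100))  # -> torch.Size([1, 64, 100])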


def rescale_conv(conv, reference):
    std = conv.weight.std().detach()
    scale = (std / reference)**0.5
    conv.weight.data /= scale
    if conv.bias is not None:
        conv.bias.data /= scale


def rescale_module(module, reference):
    for sub in module.modules():
        if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d)):
            rescale_conv(sub, reference)
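
# Rescaling note (illustrative, not from the original file): dividing the weight
# by (std / reference) ** 0.5 sets the new std to sqrt(std * reference), the
# geometric mean, pulling it toward `reference` without forcing it there:
#
#   conv = nn.Conv1d(2, 4, 8)
#   rescale_conv(conv, reference=0.1)
#   # conv.weight.std() is now roughly sqrt(old_std * 0.1)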


def auto_load_demucs_model_v2(sources, demucs_model_name):
    if '48' in demucs_model_name:
        channels = 48
    elif 'unittest' in demucs_model_name:
        channels = 4
    else:
        channels = 64

    if 'tasnet' in demucs_model_name:
        init_demucs_model = ConvTasNet(sources, X=10)
    else:
        init_demucs_model = Demucs(sources, channels=channels)

    return init_demucs_model
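
# Usage sketch (hedged; the model name below is hypothetical and only needs to
# contain '48', 'unittest', or 'tasnet' to steer the dispatch):
#
#   sources = ["drums", "bass", "other", "vocals"]
#   model = auto_load_demucs_model_v2(sources, "demucs48_hq")  # Demucs(channels=48)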


class Demucs(nn.Module):
    @capture_init
    def __init__(self,
                 sources,
                 audio_channels=2,
                 channels=64,
                 depth=6,
                 rewrite=True,
                 glu=True,
                 rescale=0.1,
                 resample=True,
                 kernel_size=8,
                 stride=4,
                 growth=2.,
                 lstm_layers=2,
                 context=3,
                 normalize=False,
                 samplerate=44100,
                 segment_length=4 * 10 * 44100):
        """
        Args:
            sources (list[str]): list of source names
            audio_channels (int): stereo or mono
            channels (int): first convolution channels
            depth (int): number of encoder/decoder layers
            rewrite (bool): add 1x1 convolution to each encoder layer
                and a convolution to each decoder layer.
                For the decoder layer, `context` gives the kernel size.
            glu (bool): use GLU instead of ReLU
            resample (bool): upsample x2 the input and downsample /2 the output.
            rescale (float): rescale initial weights of convolutions
                to get their standard deviation closer to `rescale`
            kernel_size (int): kernel size for convolutions
            stride (int): stride for convolutions
            growth (float): multiply (resp. divide) the number of channels by that
                for each layer of the encoder (resp. decoder)
            lstm_layers (int): number of LSTM layers, 0 = no LSTM
            context (int): kernel size of the convolution in the
                decoder before the transposed convolution. If > 1,
                will provide some context from neighboring time steps.
            normalize (bool): normalize the input by the mean and std of the
                mono mixture, and undo the scaling on the output.
            samplerate (int): stored as meta information for easing
                future evaluations of the model.
            segment_length (int): stored as meta information for easing
                future evaluations of the model. Length of the segments on which
                the model was trained.
        """
        super().__init__()
        self.audio_channels = audio_channels
        self.sources = sources
        self.kernel_size = kernel_size
        self.context = context
        self.stride = stride
        self.depth = depth
        self.resample = resample
        self.channels = channels
        self.normalize = normalize
        self.samplerate = samplerate
        self.segment_length = segment_length

        self.encoder = nn.ModuleList()
        self.decoder = nn.ModuleList()

        if glu:
            activation = nn.GLU(dim=1)
            ch_scale = 2
        else:
            activation = nn.ReLU()
            ch_scale = 1
        in_channels = audio_channels
        for index in range(depth):
            encode = []
            encode += [nn.Conv1d(in_channels, channels, kernel_size, stride), nn.ReLU()]
            if rewrite:
                encode += [nn.Conv1d(channels, ch_scale * channels, 1), activation]
            self.encoder.append(nn.Sequential(*encode))

            decode = []
            if index > 0:
                out_channels = in_channels
            else:
                out_channels = len(self.sources) * audio_channels
            if rewrite:
                decode += [nn.Conv1d(channels, ch_scale * channels, context), activation]
            decode += [nn.ConvTranspose1d(channels, out_channels, kernel_size, stride)]
            if index > 0:
                decode.append(nn.ReLU())
            self.decoder.insert(0, nn.Sequential(*decode))
            in_channels = channels
            channels = int(growth * channels)

        channels = in_channels

        if lstm_layers:
            self.lstm = BLSTM(channels, lstm_layers)
        else:
            self.lstm = None

        if rescale:
            rescale_module(self, reference=rescale)
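
    # Channel-width note (illustrative, not from the original file): with the
    # defaults (channels=64, growth=2, depth=6), the encoder widths run
    # 64, 128, 256, 512, 1024, 2048; the decoder mirrors them in reverse, and
    # the BLSTM (when enabled) operates at the innermost width, 2048.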
    def valid_length(self, length):
        """
        Return the nearest valid length to use with the model so that
        there are no time steps left over in the convolutions, e.g. for all
        layers, (size of the input - kernel_size) % stride == 0.

        If the mixture has a valid length, the estimated sources
        will have exactly the same length when context = 1. If context > 1,
        the two signals can be center trimmed to match.

        For training, extracts should have a valid length. For evaluation
        on full tracks we recommend passing `pad = True` to :meth:`forward`.
        """
        if self.resample:
            length *= 2
        for _ in range(self.depth):
            length = math.ceil((length - self.kernel_size) / self.stride) + 1
            length = max(1, length)
            length += self.context - 1
        for _ in range(self.depth):
            length = (length - 1) * self.stride + self.kernel_size

        if self.resample:
            length = math.ceil(length / 2)
        return int(length)
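
    # Padding sketch (hedged; `mix` and `model` are hypothetical placeholders,
    # and center_trim is imported at the top of this file):
    #
    #   import torch.nn.functional as F
    #   length = mix.shape[-1]
    #   delta = model.valid_length(length) - length
    #   padded = F.pad(mix, (delta // 2, delta - delta // 2))
    #   out = center_trim(model(padded), length)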
    def forward(self, mix):
        x = mix

        if self.normalize:
            mono = mix.mean(dim=1, keepdim=True)
            mean = mono.mean(dim=-1, keepdim=True)
            std = mono.std(dim=-1, keepdim=True)
        else:
            mean = 0
            std = 1

        x = (x - mean) / (1e-5 + std)

        if self.resample:
            x = julius.resample_frac(x, 1, 2)

        saved = []
        for encode in self.encoder:
            x = encode(x)
            saved.append(x)
        if self.lstm:
            x = self.lstm(x)
        for decode in self.decoder:
            skip = center_trim(saved.pop(-1), x)
            x = x + skip
            x = decode(x)

        if self.resample:
            x = julius.resample_frac(x, 2, 1)
        x = x * std + mean
        x = x.view(x.size(0), len(self.sources), self.audio_channels, x.size(-1))
        return x
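
# End-to-end sketch (hedged; shapes assume the default four-source setup):
#
#   import torch
#   model = Demucs(["drums", "bass", "other", "vocals"])
#   mix = torch.randn(1, 2, model.valid_length(44100))
#   with torch.no_grad():
#       out = model(mix)  # (batch, n_sources, audio_channels, time) == (1, 4, 2, T)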