Add files via upload

2024-11-27 17:00:54 +01:00 · 2023-03-31 17:47:00 +08:00 · 2023-03-31 17:47:00 +08:00 · 09862d29ec
commit 09862d29ec
parent d3f019120e
66 changed files with 6653 additions and 0 deletions
--- a/infer_pack/pycache/attentions.cpython-39.pyc
+++ b/infer_pack/pycache/attentions.cpython-39.pyc
--- a/infer_pack/pycache/commons.cpython-39.pyc
+++ b/infer_pack/pycache/commons.cpython-39.pyc
--- a/infer_pack/pycache/models.cpython-39.pyc
+++ b/infer_pack/pycache/models.cpython-39.pyc
--- a/infer_pack/pycache/modules.cpython-39.pyc
+++ b/infer_pack/pycache/modules.cpython-39.pyc
--- a/infer_pack/pycache/transforms.cpython-39.pyc
+++ b/infer_pack/pycache/transforms.cpython-39.pyc
--- a/infer_pack/attentions.py
+++ b/infer_pack/attentions.py
@ -0,0 +1,417 @@
+import copy
+import math
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from infer_pack import commons
+from infer_pack import modules
+from infer_pack.modules import LayerNorm
+
+
+class Encoder(nn.Module):
+    """Stack of ``n_layers`` self-attention + feed-forward sub-layers.
+
+    Each sub-layer output is passed through dropout, added to the sub-layer
+    input and then layer-normalised.  Self-attention is created with
+    relative-position embeddings of half-width ``window_size``.  Extra keyword
+    arguments are accepted and ignored.
+    """
+
+    def __init__(
+        self,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size=1,
+        p_dropout=0.0,
+        window_size=10,
+        **kwargs
+    ):
+        super().__init__()
+        # Keep the hyper-parameters around for introspection.
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.window_size = window_size
+
+        self.drop = nn.Dropout(p_dropout)
+        self.attn_layers = nn.ModuleList()
+        self.norm_layers_1 = nn.ModuleList()
+        self.ffn_layers = nn.ModuleList()
+        self.norm_layers_2 = nn.ModuleList()
+        # One (attention, norm, feed-forward, norm) group per layer; modules are
+        # instantiated in this exact order so parameter initialisation is stable.
+        for _ in range(self.n_layers):
+            attention = MultiHeadAttention(
+                hidden_channels,
+                hidden_channels,
+                n_heads,
+                p_dropout=p_dropout,
+                window_size=window_size,
+            )
+            self.attn_layers.append(attention)
+            self.norm_layers_1.append(LayerNorm(hidden_channels))
+            feed_forward = FFN(
+                hidden_channels,
+                hidden_channels,
+                filter_channels,
+                kernel_size,
+                p_dropout=p_dropout,
+            )
+            self.ffn_layers.append(feed_forward)
+            self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+    def forward(self, x, x_mask):
+        """Run every layer over ``x`` and return the masked result.
+
+        x: input features (presumably [b, hidden_channels, t] -- confirm
+           against callers).
+        x_mask: mask multiplied into ``x`` (presumably [b, 1, t]); its outer
+           product with itself is used as the attention mask.
+        """
+        pair_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+        x = x * x_mask
+        layer_groups = zip(
+            self.attn_layers,
+            self.norm_layers_1,
+            self.ffn_layers,
+            self.norm_layers_2,
+        )
+        for attention, norm_attn, feed_forward, norm_ffn in layer_groups:
+            # Self-attention sub-layer with residual connection.
+            x = norm_attn(x + self.drop(attention(x, x, pair_mask)))
+            # Feed-forward sub-layer with residual connection.
+            x = norm_ffn(x + self.drop(feed_forward(x, x_mask)))
+        return x * x_mask
+
+
+class Decoder(nn.Module):
+    """Stack of causal self-attention, encoder-decoder attention and FFN layers.
+
+    Each sub-layer output is passed through dropout, added to the sub-layer
+    input and layer-normalised.  The feed-forward blocks are built with
+    ``causal=True``.  Extra keyword arguments are accepted and ignored.
+    """
+
+    def __init__(
+        self,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size=1,
+        p_dropout=0.0,
+        proximal_bias=False,
+        proximal_init=True,
+        **kwargs
+    ):
+        super().__init__()
+        # Keep the hyper-parameters around for introspection.
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.proximal_bias = proximal_bias
+        self.proximal_init = proximal_init
+
+        self.drop = nn.Dropout(p_dropout)
+        self.self_attn_layers = nn.ModuleList()
+        self.norm_layers_0 = nn.ModuleList()
+        self.encdec_attn_layers = nn.ModuleList()
+        self.norm_layers_1 = nn.ModuleList()
+        self.ffn_layers = nn.ModuleList()
+        self.norm_layers_2 = nn.ModuleList()
+        # Modules are instantiated in this exact order so parameter
+        # initialisation is stable.
+        for _ in range(self.n_layers):
+            self_attention = MultiHeadAttention(
+                hidden_channels,
+                hidden_channels,
+                n_heads,
+                p_dropout=p_dropout,
+                proximal_bias=proximal_bias,
+                proximal_init=proximal_init,
+            )
+            self.self_attn_layers.append(self_attention)
+            self.norm_layers_0.append(LayerNorm(hidden_channels))
+            cross_attention = MultiHeadAttention(
+                hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
+            )
+            self.encdec_attn_layers.append(cross_attention)
+            self.norm_layers_1.append(LayerNorm(hidden_channels))
+            feed_forward = FFN(
+                hidden_channels,
+                hidden_channels,
+                filter_channels,
+                kernel_size,
+                p_dropout=p_dropout,
+                causal=True,
+            )
+            self.ffn_layers.append(feed_forward)
+            self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+    def forward(self, x, x_mask, h, h_mask):
+        """
+        x: decoder input
+        x_mask: mask multiplied into the decoder input
+        h: encoder output
+        h_mask: mask for the encoder output, combined with ``x_mask`` to
+            form the encoder-decoder attention mask
+        """
+        # Lower-triangular mask so a position only attends to itself and
+        # earlier positions.
+        causal_mask = commons.subsequent_mask(x_mask.size(2)).to(
+            device=x.device, dtype=x.dtype
+        )
+        cross_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+        x = x * x_mask
+        layer_groups = zip(
+            self.self_attn_layers,
+            self.norm_layers_0,
+            self.encdec_attn_layers,
+            self.norm_layers_1,
+            self.ffn_layers,
+            self.norm_layers_2,
+        )
+        for self_attn, norm_0, cross_attn, norm_1, ffn, norm_2 in layer_groups:
+            x = norm_0(x + self.drop(self_attn(x, x, causal_mask)))
+            x = norm_1(x + self.drop(cross_attn(x, h, cross_mask)))
+            x = norm_2(x + self.drop(ffn(x, x_mask)))
+        return x * x_mask
+
+
+class MultiHeadAttention(nn.Module):
+    # Multi-head scaled dot-product attention over [b, d, t] tensors, with
+    # optional learned relative-position embeddings (window_size), an optional
+    # proximal bias, and an optional banded (block_length) mask.
+    def __init__(
+        self,
+        channels,
+        out_channels,
+        n_heads,
+        p_dropout=0.0,
+        window_size=None,
+        heads_share=True,
+        block_length=None,
+        proximal_bias=False,
+        proximal_init=False,
+    ):
+        # channels: size of the query/key/value projections; must be divisible
+        #   by n_heads.
+        # out_channels: size of the output projection.
+        # window_size: when not None, relative-position embeddings covering
+        #   offsets -window_size..+window_size are created.
+        # heads_share: when True a single relative embedding is shared by all
+        #   heads, otherwise one per head.
+        # block_length: when not None, attention is limited to a band of this
+        #   half-width (only applied when a mask is passed to attention()).
+        # proximal_bias: add a -log1p(|i - j|) bias to the scores.
+        # proximal_init: initialise the key projection as a copy of the query
+        #   projection.
+        super().__init__()
+        assert channels % n_heads == 0
+
+        self.channels = channels
+        self.out_channels = out_channels
+        self.n_heads = n_heads
+        self.p_dropout = p_dropout
+        self.window_size = window_size
+        self.heads_share = heads_share
+        self.block_length = block_length
+        self.proximal_bias = proximal_bias
+        self.proximal_init = proximal_init
+        # Attention weights of the most recent forward() call.
+        self.attn = None
+
+        # Per-head channel count; q/k/v/o projections are kernel-size-1 convs.
+        self.k_channels = channels // n_heads
+        self.conv_q = nn.Conv1d(channels, channels, 1)
+        self.conv_k = nn.Conv1d(channels, channels, 1)
+        self.conv_v = nn.Conv1d(channels, channels, 1)
+        self.conv_o = nn.Conv1d(channels, out_channels, 1)
+        self.drop = nn.Dropout(p_dropout)
+
+        if window_size is not None:
+            # Learned embeddings for the 2 * window_size + 1 relative offsets,
+            # for keys and for values.
+            n_heads_rel = 1 if heads_share else n_heads
+            rel_stddev = self.k_channels**-0.5
+            self.emb_rel_k = nn.Parameter(
+                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
+                * rel_stddev
+            )
+            self.emb_rel_v = nn.Parameter(
+                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
+                * rel_stddev
+            )
+
+        nn.init.xavier_uniform_(self.conv_q.weight)
+        nn.init.xavier_uniform_(self.conv_k.weight)
+        nn.init.xavier_uniform_(self.conv_v.weight)
+        if proximal_init:
+            # Start with identical query and key projections.
+            with torch.no_grad():
+                self.conv_k.weight.copy_(self.conv_q.weight)
+                self.conv_k.bias.copy_(self.conv_q.bias)
+
+    def forward(self, x, c, attn_mask=None):
+        # x provides the queries, c provides keys and values. The attention
+        # weights are stored on self.attn as a side effect.
+        q = self.conv_q(x)
+        k = self.conv_k(c)
+        v = self.conv_v(c)
+
+        x, self.attn = self.attention(q, k, v, mask=attn_mask)
+
+        x = self.conv_o(x)
+        return x
+
+    def attention(self, query, key, value, mask=None):
+        # Returns (output [b, d, t_t], attention weights [b, n_h, t_t, t_s]).
+        # reshape [b, d, t] -> [b, n_h, t, d_k]
+        b, d, t_s, t_t = (*key.size(), query.size(2))
+        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
+        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+
+        # Scaled dot-product scores.
+        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
+        if self.window_size is not None:
+            assert (
+                t_s == t_t
+            ), "Relative attention is only available for self-attention."
+            # Add the contribution of the relative-position key embeddings.
+            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
+            rel_logits = self._matmul_with_relative_keys(
+                query / math.sqrt(self.k_channels), key_relative_embeddings
+            )
+            scores_local = self._relative_position_to_absolute_position(rel_logits)
+            scores = scores + scores_local
+        if self.proximal_bias:
+            assert t_s == t_t, "Proximal bias is only available for self-attention."
+            scores = scores + self._attention_bias_proximal(t_s).to(
+                device=scores.device, dtype=scores.dtype
+            )
+        if mask is not None:
+            # Masked positions get a large negative score rather than -inf.
+            scores = scores.masked_fill(mask == 0, -1e4)
+            # NOTE: the block_length band is only applied when a mask is given.
+            if self.block_length is not None:
+                assert (
+                    t_s == t_t
+                ), "Local attention is only available for self-attention."
+                block_mask = (
+                    torch.ones_like(scores)
+                    .triu(-self.block_length)
+                    .tril(self.block_length)
+                )
+                scores = scores.masked_fill(block_mask == 0, -1e4)
+        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
+        p_attn = self.drop(p_attn)
+        output = torch.matmul(p_attn, value)
+        if self.window_size is not None:
+            # Add the contribution of the relative-position value embeddings.
+            relative_weights = self._absolute_position_to_relative_position(p_attn)
+            value_relative_embeddings = self._get_relative_embeddings(
+                self.emb_rel_v, t_s
+            )
+            output = output + self._matmul_with_relative_values(
+                relative_weights, value_relative_embeddings
+            )
+        output = (
+            output.transpose(2, 3).contiguous().view(b, d, t_t)
+        )  # [b, n_h, t_t, d_k] -> [b, d, t_t]
+        return output, p_attn
+
+    def _matmul_with_relative_values(self, x, y):
+        """
+        x: [b, h, l, m]
+        y: [h or 1, m, d]
+        ret: [b, h, l, d]
+        """
+        ret = torch.matmul(x, y.unsqueeze(0))
+        return ret
+
+    def _matmul_with_relative_keys(self, x, y):
+        """
+        x: [b, h, l, d]
+        y: [h or 1, m, d]
+        ret: [b, h, l, m]
+        """
+        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
+        return ret
+
+    def _get_relative_embeddings(self, relative_embeddings, length):
+        # Returns the 2 * length - 1 embeddings along dim 1 needed for a
+        # sequence of the given length: zero-padded when length exceeds
+        # window_size + 1, sliced from the middle otherwise.
+        # NOTE: max_relative_position is computed but not used below.
+        max_relative_position = 2 * self.window_size + 1
+        # Pad first before slice to avoid using cond ops.
+        pad_length = max(length - (self.window_size + 1), 0)
+        slice_start_position = max((self.window_size + 1) - length, 0)
+        slice_end_position = slice_start_position + 2 * length - 1
+        if pad_length > 0:
+            padded_relative_embeddings = F.pad(
+                relative_embeddings,
+                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
+            )
+        else:
+            padded_relative_embeddings = relative_embeddings
+        used_relative_embeddings = padded_relative_embeddings[
+            :, slice_start_position:slice_end_position
+        ]
+        return used_relative_embeddings
+
+    def _relative_position_to_absolute_position(self, x):
+        """
+        x: [b, h, l, 2*l-1]
+        ret: [b, h, l, l]
+        """
+        batch, heads, length, _ = x.size()
+        # Concat columns of pad to shift from relative to absolute indexing.
+        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
+
+        # Concat extra elements so to add up to shape (len+1, 2*len-1).
+        x_flat = x.view([batch, heads, length * 2 * length])
+        x_flat = F.pad(
+            x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
+        )
+
+        # Reshape and slice out the padded elements.
+        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
+            :, :, :length, length - 1 :
+        ]
+        return x_final
+
+    def _absolute_position_to_relative_position(self, x):
+        """
+        x: [b, h, l, l]
+        ret: [b, h, l, 2*l-1]
+        """
+        batch, heads, length, _ = x.size()
+        # Pad along the column (last) dimension.
+        x = F.pad(
+            x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
+        )
+        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
+        # add 0's in the beginning that will skew the elements after reshape
+        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
+        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
+        return x_final
+
+    def _attention_bias_proximal(self, length):
+        """Bias for self-attention to encourage attention to close positions.
+        Args:
+          length: an integer scalar.
+        Returns:
+          a Tensor with shape [1, 1, length, length]
+        """
+        r = torch.arange(length, dtype=torch.float32)
+        # diff[i, j] = j - i; the bias is -log(1 + |i - j|).
+        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+
+
+class FFN(nn.Module):
+    """Two-convolution feed-forward block with masking and dropout.
+
+    The input is padded before each convolution, either only on the left
+    (``causal=True``) or split between both sides (``causal=False``).
+    ``activation="gelu"`` selects ``x * sigmoid(1.702 * x)``; any other value
+    selects ReLU.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        filter_channels,
+        kernel_size,
+        p_dropout=0.0,
+        activation=None,
+        causal=False,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.activation = activation
+        self.causal = causal
+
+        # Pick the padding strategy once, at construction time.
+        self.padding = self._causal_padding if causal else self._same_padding
+
+        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
+        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
+        self.drop = nn.Dropout(p_dropout)
+
+    def forward(self, x, x_mask):
+        """Apply conv -> activation -> dropout -> conv, masking at every step."""
+        hidden = self.conv_1(self.padding(x * x_mask))
+        if self.activation == "gelu":
+            hidden = hidden * torch.sigmoid(1.702 * hidden)
+        else:
+            hidden = torch.relu(hidden)
+        hidden = self.drop(hidden)
+        projected = self.conv_2(self.padding(hidden * x_mask))
+        return projected * x_mask
+
+    def _causal_padding(self, x):
+        """Left-pad the last dimension by ``kernel_size - 1``."""
+        if self.kernel_size == 1:
+            return x
+        pad_spec = [[0, 0], [0, 0], [self.kernel_size - 1, 0]]
+        return F.pad(x, commons.convert_pad_shape(pad_spec))
+
+    def _same_padding(self, x):
+        """Pad the last dimension on both sides by ``kernel_size - 1`` in total."""
+        if self.kernel_size == 1:
+            return x
+        left = (self.kernel_size - 1) // 2
+        right = self.kernel_size // 2
+        pad_spec = [[0, 0], [0, 0], [left, right]]
+        return F.pad(x, commons.convert_pad_shape(pad_spec))
--- a/infer_pack/commons.py
+++ b/infer_pack/commons.py
@ -0,0 +1,164 @@
+import math
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+
+def init_weights(m, mean=0.0, std=0.01):
+    """Re-draw ``m.weight`` from N(mean, std) when the class name contains "Conv".
+
+    Modules whose class name does not contain "Conv" are left untouched.
+    """
+    if "Conv" in m.__class__.__name__:
+        m.weight.data.normal_(mean, std)
+
+
+def get_padding(kernel_size, dilation=1):
+    """Return ``int((kernel_size * dilation - dilation) / 2)``."""
+    dilated_span = kernel_size * dilation - dilation
+    return int(dilated_span / 2)
+
+
+def convert_pad_shape(pad_shape):
+    """Flatten per-dimension ``[before, after]`` pairs, last dimension first.
+
+    The resulting flat list has the layout that ``F.pad`` takes as its
+    ``pad`` argument.
+    """
+    flat = []
+    for pair in pad_shape[::-1]:
+        flat.extend(pair)
+    return flat
+
+
+def kl_divergence(m_p, logs_p, m_q, logs_q):
+    """KL(P||Q)
+
+    Element-wise value of
+    ``(logs_q - logs_p) - 0.5
+    + 0.5 * (exp(2 * logs_p) + (m_p - m_q) ** 2) * exp(-2 * logs_q)``.
+    """
+    result = (logs_q - logs_p) - 0.5
+    variance_and_mean_term = torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)
+    result += 0.5 * variance_and_mean_term * torch.exp(-2.0 * logs_q)
+    return result
+
+
+def rand_gumbel(shape):
+    """Sample from the Gumbel distribution, protect from overflows."""
+    # Rescale the uniform draw so both logarithms receive strictly positive
+    # arguments.
+    u = torch.rand(shape) * 0.99998 + 0.00001
+    neg_log_u = -torch.log(u)
+    return -torch.log(neg_log_u)
+
+
+def rand_gumbel_like(x):
+    """Gumbel noise with the size, dtype and device of ``x``."""
+    return rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
+
+
+def slice_segments(x, ids_str, segment_size=4):
+    """Cut a ``segment_size`` window along the last axis of each batch item.
+
+    ``ids_str[i]`` is the start index of the window taken from ``x[i]``.
+    """
+    segments = torch.zeros_like(x[:, :, :segment_size])
+    batch = x.size(0)
+    for i in range(batch):
+        start = ids_str[i]
+        segments[i] = x[i, :, start : start + segment_size]
+    return segments
+def slice_segments2(x, ids_str, segment_size=4):
+    """Cut a ``segment_size`` window along axis 1 of each batch item.
+
+    ``ids_str[i]`` is the start index of the window taken from ``x[i]``.
+    """
+    segments = torch.zeros_like(x[:, :segment_size])
+    batch = x.size(0)
+    for i in range(batch):
+        start = ids_str[i]
+        segments[i] = x[i, start : start + segment_size]
+    return segments
+
+
+def rand_slice_segments(x, x_lengths=None, segment_size=4):
+    """Slice one randomly positioned segment per batch item.
+
+    Start indices are drawn uniformly from ``[0, x_lengths - segment_size]``;
+    when ``x_lengths`` is None the size of the last axis of ``x`` is used.
+    Returns ``(segments, start_indices)``.
+    """
+    batch, _, n_frames = x.size()
+    if x_lengths is None:
+        x_lengths = n_frames
+    max_start = x_lengths - segment_size + 1
+    uniform = torch.rand([batch]).to(device=x.device)
+    ids_str = (uniform * max_start).to(dtype=torch.long)
+    return slice_segments(x, ids_str, segment_size), ids_str
+
+
+def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
+    """Build a sinusoidal timing signal of shape ``[1, channels, length]``.
+
+    The first ``channels // 2`` rows are sines and the next ``channels // 2``
+    rows are cosines of the position scaled by geometrically spaced
+    timescales between ``min_timescale`` and ``max_timescale``; an odd
+    ``channels`` gets one trailing row of zeros.
+
+    Args:
+      length: number of positions.
+      channels: number of output channels.
+      min_timescale: smallest timescale.
+      max_timescale: largest timescale.
+    Returns:
+      a float Tensor with shape [1, channels, length]
+    """
+    position = torch.arange(length, dtype=torch.float)
+    num_timescales = channels // 2
+    # Guard the divisor: with a single timescale (channels 2 or 3) the
+    # unguarded ``num_timescales - 1`` is zero and the division would raise.
+    log_timescale_increment = math.log(
+        float(max_timescale) / float(min_timescale)
+    ) / max(num_timescales - 1, 1)
+    inv_timescales = min_timescale * torch.exp(
+        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
+    )
+    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
+    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
+    # Add a zero row when ``channels`` is odd.
+    signal = F.pad(signal, [0, 0, 0, channels % 2])
+    signal = signal.view(1, channels, length)
+    return signal
+
+
+def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
+    """Add the sinusoidal timing signal to a 3-D tensor ``x``.
+
+    The signal is generated for ``x.size(1)`` channels and ``x.size(2)``
+    positions and cast to the dtype and device of ``x``.
+    """
+    _, n_channels, n_steps = x.size()
+    timing = get_timing_signal_1d(n_steps, n_channels, min_timescale, max_timescale)
+    timing = timing.to(dtype=x.dtype, device=x.device)
+    return x + timing
+
+
+def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
+    """Concatenate the sinusoidal timing signal to ``x`` along ``axis``.
+
+    The signal is generated for ``x.size(1)`` channels and ``x.size(2)``
+    positions and cast to the dtype and device of ``x``.
+    """
+    _, n_channels, n_steps = x.size()
+    timing = get_timing_signal_1d(n_steps, n_channels, min_timescale, max_timescale)
+    timing = timing.to(dtype=x.dtype, device=x.device)
+    return torch.cat([x, timing], axis)
+
+
+def subsequent_mask(length):
+    """Return a ``[1, 1, length, length]`` lower-triangular mask of ones."""
+    lower_triangle = torch.tril(torch.ones(length, length))
+    return lower_triangle.unsqueeze(0).unsqueeze(0)
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+    # Gated activation: sum the two inputs, apply tanh to the first
+    # n_channels[0] channels and sigmoid to the remaining ones, then multiply.
+    split = n_channels[0]
+    summed = input_a + input_b
+    tanh_part = torch.tanh(summed[:, :split, :])
+    sigmoid_part = torch.sigmoid(summed[:, split:, :])
+    return tanh_part * sigmoid_part
+
+
+def convert_pad_shape(pad_shape):
+    """Flatten per-dimension ``[before, after]`` pairs, last dimension first.
+
+    NOTE: this module defines ``convert_pad_shape`` twice with the same
+    behavior; this later definition is the one left bound after import.
+    """
+    return [value for pair in reversed(pad_shape) for value in pair]
+
+
+def shift_1d(x):
+    """Shift a 3-D tensor one step to the right along its last axis.
+
+    A zero is inserted at the front and the last element is dropped.
+    """
+    front_padded = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))
+    return front_padded[:, :, :-1]
+
+
+def sequence_mask(length, max_length=None):
+    """Boolean mask that is True at positions below each entry of ``length``.
+
+    ``length`` is a 1-D tensor; the result has shape
+    ``[len(length), max_length]``.  ``max_length`` defaults to
+    ``length.max()``.
+    """
+    if max_length is None:
+        max_length = length.max()
+    positions = torch.arange(max_length, dtype=length.dtype, device=length.device)
+    return positions.unsqueeze(0) < length.unsqueeze(1)
+
+
+def generate_path(duration, mask):
+    """
+    duration: [b, 1, t_x]
+    mask: [b, 1, t_y, t_x]
+
+    Returns the masked alignment path obtained by differencing the sequence
+    masks of the cumulative durations; it has the same shape as ``mask``.
+    """
+    b, _, t_y, t_x = mask.shape
+    cum_duration_flat = torch.cumsum(duration, -1).view(b * t_x)
+
+    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
+    path = path.view(b, t_x, t_y)
+    # Subtract the mask of the previous cumulative duration so that only the
+    # frames between two consecutive cumulative durations stay set.
+    previous = F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
+    path = path - previous
+    return path.unsqueeze(1).transpose(2, 3) * mask
+
+
+def clip_grad_value_(parameters, clip_value, norm_type=2):
+    """Clamp gradients in place and return their total norm before clamping.
+
+    parameters: a tensor or an iterable of tensors; entries without a
+        gradient are skipped.
+    clip_value: gradients are clamped to ``[-clip_value, clip_value]``;
+        ``None`` disables clamping.
+    norm_type: order of the norm used for the returned total.
+    """
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+    with_grad = [p for p in parameters if p.grad is not None]
+    norm_type = float(norm_type)
+    if clip_value is not None:
+        clip_value = float(clip_value)
+
+    accumulated = 0
+    for param in with_grad:
+        # The norm is measured before this parameter's gradient is clamped.
+        accumulated += param.grad.data.norm(norm_type).item() ** norm_type
+        if clip_value is not None:
+            param.grad.data.clamp_(min=-clip_value, max=clip_value)
+    return accumulated ** (1.0 / norm_type)
--- a/infer_pack/models.py
+++ b/infer_pack/models.py
@ -0,0 +1,892 @@
+import math,pdb,os
+from time import time as ttime
+import torch
+from torch import nn
+from torch.nn import functional as F
+from infer_pack import modules
+from infer_pack import attentions
+from infer_pack import commons
+from infer_pack.commons import init_weights, get_padding
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+from infer_pack.commons import init_weights
+import numpy as np
+from infer_pack import commons
+class TextEncoder256(nn.Module):
+    """Encode 256-dimensional phone features (plus optional pitch ids).
+
+    Phone features are projected with a linear layer, optionally summed with
+    a pitch embedding, passed through the attention encoder and projected to
+    ``2 * out_channels`` channels that are split into a mean and a log-scale.
+    """
+
+    def __init__(
+        self,
+        out_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        f0=True,
+    ):
+        """
+        out_channels: channels of each of the two returned statistics.
+        hidden_channels: width of the embeddings and of the encoder.
+        filter_channels, n_heads, n_layers, kernel_size, p_dropout:
+            forwarded to ``attentions.Encoder``.
+        f0: when true, a 256-entry pitch embedding is created.
+        """
+        super().__init__()
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.emb_phone = nn.Linear(256, hidden_channels)
+        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
+        if f0:
+            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
+        self.encoder = attentions.Encoder(
+            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
+        )
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+    def forward(self, phone, pitch, lengths):
+        """Return ``(m, logs, x_mask)``.
+
+        phone: phone features whose last dimension is 256.
+        pitch: pitch ids for ``emb_pitch``, or None to skip the pitch
+            embedding (required when the module was built with ``f0`` false).
+        lengths: valid length of each batch item, used to build ``x_mask``.
+        """
+        # Identity check: ``==`` on a tensor would go through the tensor's
+        # overloaded comparison instead of testing for "no pitch given".
+        if pitch is None:
+            x = self.emb_phone(phone)
+        else:
+            x = self.emb_phone(phone) + self.emb_pitch(pitch)
+        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
+        x = self.lrelu(x)
+        x = torch.transpose(x, 1, -1)  # [b, h, t]
+        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
+            x.dtype
+        )
+        x = self.encoder(x * x_mask, x_mask)
+        stats = self.proj(x) * x_mask
+
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        return m, logs, x_mask
+class TextEncoder256Sim(nn.Module):
+    """Encode 256-dimensional phone features (plus optional pitch ids).
+
+    Same pipeline as a linear phone projection, optional pitch embedding and
+    attention encoder, but the final projection has ``out_channels`` channels
+    and is returned directly rather than split into two statistics.
+    """
+
+    def __init__(
+        self,
+        out_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        f0=True,
+    ):
+        """
+        out_channels: channels of the returned projection.
+        hidden_channels: width of the embeddings and of the encoder.
+        filter_channels, n_heads, n_layers, kernel_size, p_dropout:
+            forwarded to ``attentions.Encoder``.
+        f0: when true, a 256-entry pitch embedding is created.
+        """
+        super().__init__()
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.emb_phone = nn.Linear(256, hidden_channels)
+        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
+        if f0:
+            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
+        self.encoder = attentions.Encoder(
+            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
+        )
+        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+
+    def forward(self, phone, pitch, lengths):
+        """Return ``(x, x_mask)``.
+
+        phone: phone features whose last dimension is 256.
+        pitch: pitch ids for ``emb_pitch``, or None to skip the pitch
+            embedding (required when the module was built with ``f0`` false).
+        lengths: valid length of each batch item, used to build ``x_mask``.
+        """
+        # Identity check: ``==`` on a tensor would go through the tensor's
+        # overloaded comparison instead of testing for "no pitch given".
+        if pitch is None:
+            x = self.emb_phone(phone)
+        else:
+            x = self.emb_phone(phone) + self.emb_pitch(pitch)
+        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
+        x = self.lrelu(x)
+        x = torch.transpose(x, 1, -1)  # [b, h, t]
+        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
+            x.dtype
+        )
+        x = self.encoder(x * x_mask, x_mask)
+        x = self.proj(x) * x_mask
+        return x, x_mask
+class ResidualCouplingBlock(nn.Module):
+    """Chain of ``n_flows`` residual coupling layers, each followed by a Flip.
+
+    ``forward`` runs the chain front-to-back, or back-to-front when
+    ``reverse`` is true.
+    """
+
+    def __init__(
+        self,
+        channels,
+        hidden_channels,
+        kernel_size,
+        dilation_rate,
+        n_layers,
+        n_flows=4,
+        gin_channels=0,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+
+        # Layout: coupling layers at even indices, Flip modules at odd indices.
+        self.flows = nn.ModuleList()
+        for _ in range(n_flows):
+            coupling = modules.ResidualCouplingLayer(
+                channels,
+                hidden_channels,
+                kernel_size,
+                dilation_rate,
+                n_layers,
+                gin_channels=gin_channels,
+                mean_only=True,
+            )
+            self.flows.append(coupling)
+            self.flows.append(modules.Flip())
+
+    def forward(self, x, x_mask, g=None, reverse=False):
+        """Apply every flow to ``x``; ``g`` is passed through to each flow."""
+        if reverse:
+            # In reverse mode each flow returns only the transformed tensor.
+            for flow in reversed(self.flows):
+                x = flow(x, x_mask, g=g, reverse=reverse)
+            return x
+        # In forward mode each flow returns a pair; the second item is dropped.
+        for flow in self.flows:
+            x, _ = flow(x, x_mask, g=g, reverse=reverse)
+        return x
+
+    def remove_weight_norm(self):
+        """Call ``remove_weight_norm`` on every coupling layer (even indices)."""
+        for index in range(0, 2 * self.n_flows, 2):
+            self.flows[index].remove_weight_norm()
+class PosteriorEncoder(nn.Module):
+    """Encode an input sequence into a sampled latent plus its statistics.
+
+    A 1x1 convolution lifts the input to ``hidden_channels``, a ``modules.WN``
+    stack processes it, and a final 1x1 convolution produces
+    ``2 * out_channels`` channels that are split into ``m`` and ``logs``.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        hidden_channels,
+        kernel_size,
+        dilation_rate,
+        n_layers,
+        gin_channels=0,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+
+        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+        self.enc = modules.WN(
+            hidden_channels,
+            kernel_size,
+            dilation_rate,
+            n_layers,
+            gin_channels=gin_channels,
+        )
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+    def forward(self, x, x_lengths, g=None):
+        """Return ``(z, m, logs, x_mask)``.
+
+        ``z`` is sampled as ``m + noise * exp(logs)`` with standard normal
+        noise and is zeroed outside the valid lengths.
+        """
+        valid = commons.sequence_mask(x_lengths, x.size(2))
+        x_mask = torch.unsqueeze(valid, 1).to(x.dtype)
+        hidden = self.pre(x) * x_mask
+        hidden = self.enc(hidden, x_mask, g=g)
+        stats = self.proj(hidden) * x_mask
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        noise = torch.randn_like(m)
+        z = (m + noise * torch.exp(logs)) * x_mask
+        return z, m, logs, x_mask
+
+    def remove_weight_norm(self):
+        """Remove weight normalisation from the WN stack."""
+        self.enc.remove_weight_norm()
+class Generator(torch.nn.Module):
+    # Upsampling generator: a pre-convolution, a series of weight-normalised
+    # transposed convolutions each followed by a bank of residual blocks, and
+    # a final convolution to a single tanh-squashed channel.
+    def __init__(
+        self,
+        initial_channel,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        gin_channels=0,
+    ):
+        # initial_channel: channels of the input to conv_pre.
+        # resblock: "1" selects modules.ResBlock1, anything else ResBlock2.
+        # resblock_kernel_sizes / resblock_dilation_sizes: zipped together,
+        #   one residual block per pair at every upsampling stage.
+        # upsample_rates / upsample_kernel_sizes: stride and kernel size of
+        #   each transposed convolution.
+        # upsample_initial_channel: channels after conv_pre; halved at every
+        #   upsampling stage.
+        # gin_channels: when non-zero, a 1x1 conditioning conv is created.
+        super(Generator, self).__init__()
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        self.conv_pre = Conv1d(
+            initial_channel, upsample_initial_channel, 7, 1, padding=3
+        )
+        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
+
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            self.ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        upsample_initial_channel // (2**i),
+                        upsample_initial_channel // (2 ** (i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2,
+                    )
+                )
+            )
+
+        # Flat list: num_kernels residual blocks for each upsampling stage.
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            for j, (k, d) in enumerate(
+                zip(resblock_kernel_sizes, resblock_dilation_sizes)
+            ):
+                self.resblocks.append(resblock(ch, k, d))
+
+        # `ch` is the channel count of the last upsampling stage.
+        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+        self.ups.apply(init_weights)
+
+        if gin_channels != 0:
+            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+
+    def forward(self, x, g=None):
+        # x: input to conv_pre; g: optional conditioning passed through
+        # self.cond and added after conv_pre.
+        x = self.conv_pre(x)
+        if g is not None:
+            x = x + self.cond(g)
+
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, modules.LRELU_SLOPE)
+            x = self.ups[i](x)
+            # Average the outputs of this stage's residual blocks.
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+        # NOTE(review): no slope is passed here, so F.leaky_relu's default is
+        # used rather than modules.LRELU_SLOPE -- confirm this is intended.
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+
+        return x
+
+    def remove_weight_norm(self):
+        # Strip weight normalisation from the upsampling layers and ask each
+        # residual block to do the same.
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+class SineGen(torch.nn.Module):
+    """ Definition of sine generator
+    SineGen(samp_rate, harmonic_num = 0,
+            sine_amp = 0.1, noise_std = 0.003,
+            voiced_threshold = 0,
+            flag_for_pulse=False)
+    samp_rate: sampling rate in Hz
+    harmonic_num: number of harmonic overtones (default 0)
+    sine_amp: amplitude of sine-waveform (default 0.1)
+    noise_std: std of Gaussian noise (default 0.003)
+    voiced_threshold: F0 threshold for U/V classification (default 0)
+    flag_for_pulse: this SineGen is used inside PulseGen (default False)
+    Note: flag_for_pulse is accepted by __init__ but is neither stored nor
+        read anywhere in this class.
+    """
+
+    def __init__(self, samp_rate, harmonic_num=0,
+                 sine_amp=0.1, noise_std=0.003,
+                 voiced_threshold=0,
+                 flag_for_pulse=False):
+        super(SineGen, self).__init__()
+        self.sine_amp = sine_amp
+        self.noise_std = noise_std
+        self.harmonic_num = harmonic_num
+        # Output channels: the fundamental plus harmonic_num overtones.
+        self.dim = self.harmonic_num + 1
+        self.sampling_rate = samp_rate
+        self.voiced_threshold = voiced_threshold
+
+    def _f02uv(self, f0):
+        # generate uv signal: 1 where f0 is above the voiced threshold, else 0
+        uv = torch.ones_like(f0)
+        uv = uv * (f0 > self.voiced_threshold)
+        return uv
+
+    def forward(self, f0,upp):
+        """ sine_waves, uv, noise = forward(f0, upp)
+        input f0: F0 per frame; an axis is inserted at position 1 and swapped
+                  with axis 2 below (presumably f0 is (batch, length) --
+                  confirm against callers)
+                  f0 for unvoiced steps should be 0
+        input upp: upsampling factor, used as the interpolation scale_factor
+        output sine_waves: upsampled sine signal, one channel per harmonic,
+                  with unvoiced steps zeroed and noise added
+        output uv: upsampled voiced/unvoiced flag
+        output noise: the noise that was added to sine_waves
+        """
+        with torch.no_grad():
+            f0 = f0[:, None].transpose(1, 2)
+            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,device=f0.device)
+            # fundamental component
+            f0_buf[:, :, 0] = f0[:, :, 0]
+            for idx in np.arange(self.harmonic_num):f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)# idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
+            rad_values = (f0_buf / self.sampling_rate) % 1  # the % 1 here means the n_har multiplication cannot be optimised in post-processing
+            # Random initial phase for every overtone; the fundamental gets 0.
+            rand_ini = torch.rand(f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device)
+            rand_ini[:, 0] = 0
+            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+            tmp_over_one = torch.cumsum(rad_values, 1)  # % 1 (disabled): taking % 1 here would mean the later cumsum could not be optimised any further
+            # Accumulated phase, scaled and linearly interpolated by upp.
+            tmp_over_one*=upp
+            tmp_over_one=F.interpolate(tmp_over_one.transpose(2, 1), scale_factor=upp, mode='linear', align_corners=True).transpose(2, 1)
+            rad_values=F.interpolate(rad_values.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)
+            tmp_over_one%=1
+            # A drop in the wrapped phase marks a wrap-around; subtract 1 at
+            # those steps before the final cumulative sum.
+            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
+            cumsum_shift = torch.zeros_like(rad_values)
+            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
+            sine_waves = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi)
+            sine_waves = sine_waves * self.sine_amp
+            uv = self._f02uv(f0)
+            uv = F.interpolate(uv.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)
+            # Noise std is noise_std on voiced steps and sine_amp / 3 on
+            # unvoiced steps.
+            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+            noise = noise_amp * torch.randn_like(sine_waves)
+            sine_waves = sine_waves * uv + noise
+        return sine_waves, uv, noise
+class SourceModuleHnNSF(torch.nn.Module):
+    """Source module for hn-NSF: merges SineGen harmonics into one excitation.
+
+    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0)
+    sampling_rate: sampling rate in Hz
+    harmonic_num: number of harmonics above F0 (default: 0)
+    sine_amp: amplitude of sine source signal (default: 0.1)
+    add_noise_std: std of additive Gaussian noise (default: 0.003)
+        note that amplitude of noise in unvoiced is decided
+        by sine_amp
+    voiced_threshod: threshold to set U/V given F0 (default: 0); the
+        misspelled keyword is kept because callers may pass it by name
+    is_half: stored on the instance; the dtype used in forward is taken
+        from the linear layer's weights instead
+
+    sine_merge, None, None = SourceModuleHnNSF(F0_sampled, upp)
+    sine_merge: tanh(linear(sine harmonics)), last dimension of size 1
+    The second and third return slots are always None; they are kept so
+    callers can still unpack three values.
+    """
+
+    def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0,is_half=True):
+        super(SourceModuleHnNSF, self).__init__()
+
+        self.sine_amp = sine_amp
+        self.noise_std = add_noise_std
+        self.is_half = is_half
+        # to produce sine waveforms
+        self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
+                                 sine_amp, add_noise_std, voiced_threshod)
+
+        # to merge source harmonics into a single excitation
+        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
+        self.l_tanh = torch.nn.Tanh()
+
+    def forward(self, x, upp=None):
+        """Generate the merged excitation for F0 contour x.
+
+        x: F0 contour forwarded unchanged to SineGen
+        upp: upsampling factor forwarded to SineGen; the None default is
+            kept for interface compatibility, but SineGen multiplies by
+            it, so callers must pass a real value
+        returns: (sine_merge, None, None)
+        """
+        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
+        # Follow the linear layer's actual dtype instead of trusting the
+        # is_half flag: casting to half while the weights are float32 (or
+        # the reverse) makes the matmul below fail with a dtype mismatch.
+        sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype)
+        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
+        return sine_merge, None, None  # noise, uv
+class GeneratorNSF(torch.nn.Module):
+    """Upsampling decoder that mixes a sine-based source signal into every stage.
+
+    The latent input is projected by conv_pre and passed through one
+    transposed convolution per entry of upsample_rates. After each one, a
+    convolution of the source signal (noise_convs) is added, and the
+    outputs of num_kernels residual blocks are averaged.
+    """
+
+    def __init__(
+        self,
+        initial_channel,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        gin_channels,
+        sr,
+        is_half=False
+    ):
+        super(GeneratorNSF, self).__init__()
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+
+        # NOTE: f0_upsamp is constructed here but not used by forward() below.
+        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
+        self.m_source = SourceModuleHnNSF(
+            sampling_rate=sr,
+            harmonic_num=0,
+            is_half=is_half
+        )
+        self.noise_convs = nn.ModuleList()
+        self.conv_pre = Conv1d(
+            initial_channel, upsample_initial_channel, 7, 1, padding=3
+        )
+        # The string "1" selects ResBlock1; any other value selects ResBlock2.
+        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
+
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            # Channel count halves with every upsampling stage.
+            c_cur = upsample_initial_channel // (2 ** (i + 1))
+            self.ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        upsample_initial_channel // (2**i),
+                        upsample_initial_channel // (2 ** (i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2,
+                    )
+                )
+            )
+            if i + 1 < len(upsample_rates):
+                # Stride equals the product of the upsample rates still to
+                # come, so the source is downsampled by that factor here.
+                stride_f0 = np.prod(upsample_rates[i + 1:])
+                self.noise_convs.append(Conv1d(
+                    1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2))
+            else:
+                # Last stage: only a 1x1 channel projection of the source.
+                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
+
+        # num_kernels residual blocks per upsampling stage, stored flat;
+        # forward() indexes them as i * num_kernels + j.
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            for j, (k, d) in enumerate(
+                zip(resblock_kernel_sizes, resblock_dilation_sizes)
+            ):
+                self.resblocks.append(resblock(ch, k, d))
+
+        # `ch` is the channel count of the final stage (set by the loop above).
+        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+        self.ups.apply(init_weights)
+
+        # Optional conditioning projection; absent when gin_channels == 0.
+        if gin_channels != 0:
+            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+
+        # Total upsampling factor, passed to the source module in forward().
+        self.upp=np.prod(upsample_rates)
+
+    def forward(self, x, f0,g=None):
+        """Decode x into a waveform, injecting a source signal derived from f0.
+
+        x: input passed to conv_pre
+        f0: F0 contour passed to the source module together with self.upp
+        g: optional conditioning added after conv_pre via self.cond
+        returns: tanh-squashed output of conv_post (a single channel)
+        """
+        # The source module returns (merged_source, None, None).
+        har_source, noi_source, uv = self.m_source(f0,self.upp)
+        har_source = har_source.transpose(1, 2)
+        x = self.conv_pre(x)
+        if g is not None:
+            x = x + self.cond(g)
+
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, modules.LRELU_SLOPE)
+            x = self.ups[i](x)
+            x_source = self.noise_convs[i](har_source)
+            x = x + x_source
+            # Average the outputs of this stage's residual blocks.
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+        # NOTE: no slope is passed here, so F.leaky_relu uses its default
+        # rather than modules.LRELU_SLOPE.
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+        return x
+
+    def remove_weight_norm(self):
+        """Remove weight normalization from the upsampling layers and residual blocks."""
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+# Maps sample-rate tags to their value in Hz; SynthesizerTrnMs256NSFsid
+# uses it to resolve a string `sr` argument.
+sr2sr={
+    "32k":32000,
+    "40k":40000,
+    "48k":48000,
+}
+class SynthesizerTrnMs256NSFsid(nn.Module):
+    """Synthesizer with pitch input, an NSF decoder and a speaker-id embedding.
+
+    Sub-modules: enc_p (TextEncoder256), dec (GeneratorNSF),
+    enc_q (PosteriorEncoder), flow (ResidualCouplingBlock) and
+    emb_g (speaker embedding table).
+
+    sr: sampling rate in Hz, or one of the string keys of sr2sr.
+    **kwargs: `is_half` (default False) is forwarded to the decoder; any
+        other keyword is ignored.
+    """
+
+    def __init__(
+        self,
+        spec_channels,
+        segment_size,
+        inter_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        spk_embed_dim,
+        gin_channels,
+        sr,
+        **kwargs
+    ):
+
+        super().__init__()
+        # Accept a tag such as "40k" as well as a plain number of Hz.
+        if isinstance(sr, str):
+            sr = sr2sr[sr]
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.gin_channels = gin_channels
+        self.spk_embed_dim = spk_embed_dim
+        self.enc_p = TextEncoder256(
+            inter_channels,
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout,
+        )
+        self.dec = GeneratorNSF(
+            inter_channels,
+            resblock,
+            resblock_kernel_sizes,
+            resblock_dilation_sizes,
+            upsample_rates,
+            upsample_initial_channel,
+            upsample_kernel_sizes,
+            gin_channels=gin_channels,
+            sr=sr,
+            # Fall back to GeneratorNSF's own default rather than raising a
+            # bare KeyError when the caller omits is_half.
+            is_half=kwargs.get("is_half", False),
+        )
+        self.enc_q = PosteriorEncoder(
+            spec_channels,
+            inter_channels,
+            hidden_channels,
+            5,
+            1,
+            16,
+            gin_channels=gin_channels,
+        )
+        self.flow = ResidualCouplingBlock(
+            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
+        )
+        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+        print("gin_channels:",gin_channels,"self.spk_embed_dim:",self.spk_embed_dim)
+
+    def remove_weight_norm(self):
+        """Remove weight normalization from the decoder, flow and posterior encoder."""
+        self.dec.remove_weight_norm()
+        self.flow.remove_weight_norm()
+        self.enc_q.remove_weight_norm()
+
+    def forward(self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds):
+        """Training pass; `ds` holds the speaker ids, [bs, 1].
+
+        Returns (o, ids_slice, x_mask, y_mask,
+        (z, z_p, m_p, logs_p, m_q, logs_q)), where `o` is the decoder
+        output for a random segment of length `segment_size`.
+        """
+        # [b, 256, 1]; the trailing 1 is the time axis and is broadcast.
+        g = self.emb_g(ds).unsqueeze(-1)
+        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
+        z_p = self.flow(z, y_mask, g=g)
+        z_slice, ids_slice = commons.rand_slice_segments(
+            z, y_lengths, self.segment_size
+        )
+        # Slice pitchf with the same segment ids that were drawn for z.
+        pitchf = commons.slice_segments2(
+            pitchf, ids_slice, self.segment_size
+        )
+        o = self.dec(z_slice, pitchf, g=g)
+        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
+
+    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
+        """Inference pass: sample around the prior, invert the flow, decode.
+
+        Returns (o, x_mask, (z, z_p, m_p, logs_p)).
+        """
+        g = self.emb_g(sid).unsqueeze(-1)
+        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+        # Sample around the prior mean with the noise scaled by 0.66666.
+        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+        z = self.flow(z_p, x_mask, g=g, reverse=True)
+        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
+        return o, x_mask, (z, z_p, m_p, logs_p)
+class SynthesizerTrnMs256NSFsid_nono(nn.Module):
+    """Synthesizer variant without pitch input.
+
+    The text encoder is built with f0=False and the decoder is a plain
+    Generator. `sr` and any extra keyword arguments are accepted but
+    not used.
+    """
+
+    def __init__(
+        self,
+        spec_channels,
+        segment_size,
+        inter_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        spk_embed_dim,
+        gin_channels,
+        sr=None,
+        **kwargs
+    ):
+
+        super().__init__()
+
+        # Keep the hyper-parameters on the instance.
+        self.spec_channels = spec_channels
+        self.segment_size = segment_size
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.gin_channels = gin_channels
+        self.spk_embed_dim = spk_embed_dim
+
+        # Sub-modules, created in the same order as before.
+        self.enc_p = TextEncoder256(
+            inter_channels,
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout,
+            f0=False,
+        )
+        self.dec = Generator(
+            inter_channels,
+            resblock,
+            resblock_kernel_sizes,
+            resblock_dilation_sizes,
+            upsample_rates,
+            upsample_initial_channel,
+            upsample_kernel_sizes,
+            gin_channels=gin_channels,
+        )
+        self.enc_q = PosteriorEncoder(
+            spec_channels,
+            inter_channels,
+            hidden_channels,
+            5,
+            1,
+            16,
+            gin_channels=gin_channels,
+        )
+        self.flow = ResidualCouplingBlock(
+            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
+        )
+        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+        print("gin_channels:",gin_channels,"self.spk_embed_dim:",self.spk_embed_dim)
+
+    def remove_weight_norm(self):
+        """Remove weight normalization from the decoder, flow and posterior encoder."""
+        self.dec.remove_weight_norm()
+        self.flow.remove_weight_norm()
+        self.enc_q.remove_weight_norm()
+
+    def forward(self, phone, phone_lengths, y, y_lengths,ds):
+        """Training pass; `ds` holds the speaker ids, [bs, 1].
+
+        Returns (o, ids_slice, x_mask, y_mask,
+        (z, z_p, m_p, logs_p, m_q, logs_q)).
+        """
+        # [b, 256, 1]; the trailing 1 is the time axis and is broadcast.
+        speaker = self.emb_g(ds).unsqueeze(-1)
+        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=speaker)
+        z_p = self.flow(z, y_mask, g=speaker)
+        z_slice, ids_slice = commons.rand_slice_segments(
+            z, y_lengths, self.segment_size
+        )
+        o = self.dec(z_slice, g=speaker)
+        latents = (z, z_p, m_p, logs_p, m_q, logs_q)
+        return o, ids_slice, x_mask, y_mask, latents
+
+    def infer(self, phone, phone_lengths,sid,max_len=None):
+        """Inference pass: sample around the prior, invert the flow, decode.
+
+        Returns (o, x_mask, (z, z_p, m_p, logs_p)).
+        """
+        speaker = self.emb_g(sid).unsqueeze(-1)
+        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
+        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+        z = self.flow(z_p, x_mask, g=speaker, reverse=True)
+        masked = (z * x_mask)[:, :, :max_len]
+        o = self.dec(masked, g=speaker)
+        return o, x_mask, (z, z_p, m_p, logs_p)
+class SynthesizerTrnMs256NSFsid_sim(nn.Module):
+    """
+    Synthesizer variant without a posterior encoder.
+
+    enc_p (TextEncoder256Sim) returns (x, x_mask); x is passed through the
+    reversed flow and decoded by a GeneratorNSF.
+
+    **kwargs:
+        sr: sampling rate in Hz or one of the string keys of sr2sr.
+            Required, because GeneratorNSF cannot be built without it.
+        is_half: forwarded to the decoder (default False).
+    `use_sdp` is accepted but not used.
+    """
+
+    def __init__(
+        self,
+        spec_channels,
+        segment_size,
+        inter_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        spk_embed_dim,
+        # hop_length,
+        gin_channels=0,
+        use_sdp=True,
+        **kwargs
+    ):
+
+        super().__init__()
+        # GeneratorNSF has a required `sr` parameter. The previous code
+        # never supplied it, so construction always failed with an opaque
+        # "missing argument" TypeError. Read it from kwargs instead and
+        # fail with a clear message when it is absent.
+        sr = kwargs.get("sr")
+        if isinstance(sr, str):
+            sr = sr2sr[sr]
+        if sr is None:
+            raise TypeError(
+                "SynthesizerTrnMs256NSFsid_sim requires the keyword argument "
+                "'sr' (sampling rate in Hz or a key of sr2sr)"
+            )
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.gin_channels = gin_channels
+        self.spk_embed_dim = spk_embed_dim
+        self.enc_p = TextEncoder256Sim(
+            inter_channels,
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout,
+        )
+        self.dec = GeneratorNSF(
+            inter_channels,
+            resblock,
+            resblock_kernel_sizes,
+            resblock_dilation_sizes,
+            upsample_rates,
+            upsample_initial_channel,
+            upsample_kernel_sizes,
+            gin_channels=gin_channels,
+            sr=sr,
+            is_half=kwargs.get("is_half", False),
+        )
+
+        self.flow = ResidualCouplingBlock(
+            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
+        )
+        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+        print("gin_channels:",gin_channels,"self.spk_embed_dim:",self.spk_embed_dim)
+
+    def remove_weight_norm(self):
+        """Remove weight normalization from the decoder and the flow."""
+        self.dec.remove_weight_norm()
+        self.flow.remove_weight_norm()
+        # This class never creates `enc_q`; only touch it if a subclass or
+        # caller attached one, instead of raising AttributeError.
+        enc_q = getattr(self, "enc_q", None)
+        if enc_q is not None:
+            enc_q.remove_weight_norm()
+
+    def forward(self, phone, phone_lengths, pitch, pitchf, y_lengths, ds):
+        """Training pass; y (the spec) is no longer needed here.
+
+        Returns (o, ids_slice).
+        """
+        # [b, 256, 1]; the trailing 1 is the time axis and is broadcast.
+        g = self.emb_g(ds).unsqueeze(-1)
+        x, x_mask = self.enc_p(phone, pitch, phone_lengths)
+        x = self.flow(x, x_mask, g=g, reverse=True)
+        z_slice, ids_slice = commons.rand_slice_segments(
+            x, y_lengths, self.segment_size
+        )
+
+        # Slice pitchf with the same segment ids that were drawn for x.
+        pitchf = commons.slice_segments2(
+            pitchf, ids_slice, self.segment_size
+        )
+        o = self.dec(z_slice, pitchf, g=g)
+        return o, ids_slice
+
+    def infer(self, phone, phone_lengths, pitch, pitchf, ds, max_len=None):
+        """Inference pass; y (the spec) is no longer needed here.
+
+        Returns the decoder output twice, (o, o); the original return
+        shape is kept for callers that unpack two values.
+        """
+        # [b, 256, 1]; the trailing 1 is the time axis and is broadcast.
+        g = self.emb_g(ds).unsqueeze(-1)
+        x, x_mask = self.enc_p(phone, pitch, phone_lengths)
+        x = self.flow(x, x_mask, g=g, reverse=True)
+        o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g)
+        return o, o
+
+class MultiPeriodDiscriminator(torch.nn.Module):
+    """One DiscriminatorS followed by one DiscriminatorP per period."""
+
+    def __init__(self, use_spectral_norm=False):
+        super(MultiPeriodDiscriminator, self).__init__()
+        periods = [2, 3, 5, 7, 11,17]
+
+        # The scale discriminator comes first, then the period ones in order.
+        self.discriminators = nn.ModuleList(
+            [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
+            + [
+                DiscriminatorP(period, use_spectral_norm=use_spectral_norm)
+                for period in periods
+            ]
+        )
+
+    def forward(self, y, y_hat):
+        """Run every discriminator on y and on y_hat.
+
+        Returns four lists, each with one entry per discriminator:
+        outputs for y, outputs for y_hat, feature maps for y and
+        feature maps for y_hat.
+        """
+        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
+        for disc in self.discriminators:
+            out_r, maps_r = disc(y)
+            out_g, maps_g = disc(y_hat)
+            y_d_rs.append(out_r)
+            y_d_gs.append(out_g)
+            fmap_rs.append(maps_r)
+            fmap_gs.append(maps_g)
+
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+class DiscriminatorS(torch.nn.Module):
+    """Stack of normalized 1-D convolutions that also returns its feature maps."""
+
+    def __init__(self, use_spectral_norm=False):
+        super(DiscriminatorS, self).__init__()
+        if use_spectral_norm == False:
+            norm_f = weight_norm
+        else:
+            norm_f = spectral_norm
+
+        # (in_channels, out_channels, kernel, stride, groups, padding)
+        conv_specs = [
+            (1, 16, 15, 1, 1, 7),
+            (16, 64, 41, 4, 4, 20),
+            (64, 256, 41, 4, 16, 20),
+            (256, 1024, 41, 4, 64, 20),
+            (1024, 1024, 41, 4, 256, 20),
+            (1024, 1024, 5, 1, 1, 2),
+        ]
+        self.convs = nn.ModuleList(
+            [
+                norm_f(Conv1d(c_in, c_out, kernel, step, groups=n_groups, padding=pad))
+                for c_in, c_out, kernel, step, n_groups, pad in conv_specs
+            ]
+        )
+        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+
+    def forward(self, x):
+        """Return (flattened output, list of per-layer feature maps)."""
+        fmap = []
+        for conv in self.convs:
+            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
+            fmap.append(x)
+
+        # The post convolution is recorded without an activation.
+        x = self.conv_post(x)
+        fmap.append(x)
+
+        return torch.flatten(x, 1, -1), fmap
+
+class DiscriminatorP(torch.nn.Module):
+    """Discriminator that folds the time axis into (frames, period) before 2-D convs."""
+
+    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+        super(DiscriminatorP, self).__init__()
+        self.period = period
+        self.use_spectral_norm = use_spectral_norm
+        if use_spectral_norm == False:
+            norm_f = weight_norm
+        else:
+            norm_f = spectral_norm
+
+        # Four strided layers that widen the channels, then one stride-1 layer.
+        layers = []
+        for c_in, c_out in [(1, 32), (32, 128), (128, 512), (512, 1024)]:
+            layers.append(
+                norm_f(
+                    Conv2d(
+                        c_in,
+                        c_out,
+                        (kernel_size, 1),
+                        (stride, 1),
+                        padding=(get_padding(kernel_size, 1), 0),
+                    )
+                )
+            )
+        layers.append(
+            norm_f(
+                Conv2d(
+                    1024,
+                    1024,
+                    (kernel_size, 1),
+                    1,
+                    padding=(get_padding(kernel_size, 1), 0),
+                )
+            )
+        )
+        self.convs = nn.ModuleList(layers)
+        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+
+    def forward(self, x):
+        """Return (flattened output, list of per-layer feature maps)."""
+        fmap = []
+
+        # 1d to 2d: right-pad (reflect) up to a multiple of the period, then fold.
+        b, c, t = x.shape
+        remainder = t % self.period
+        if remainder != 0:
+            n_pad = self.period - remainder
+            x = F.pad(x, (0, n_pad), "reflect")
+            t = t + n_pad
+        x = x.view(b, c, t // self.period, self.period)
+
+        for conv in self.convs:
+            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
+            fmap.append(x)
+
+        # The post convolution is recorded without an activation.
+        x = self.conv_post(x)
+        fmap.append(x)
+
+        return torch.flatten(x, 1, -1), fmap
+
--- a/infer_pack/modules.py
+++ b/infer_pack/modules.py
@ -0,0 +1,522 @@
+import copy
+import math
+import numpy as np
+import scipy
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm
+
+from infer_pack import commons
+from infer_pack.commons import init_weights, get_padding
+from infer_pack.transforms import piecewise_rational_quadratic_transform
+
+
+# Negative slope passed to F.leaky_relu by the residual blocks in this module.
+LRELU_SLOPE = 0.1
+
+
+class LayerNorm(nn.Module):
+    """Layer normalization applied over dimension 1 of the input.
+
+    `gamma` (initialised to ones) and `beta` (initialised to zeros) are
+    the learnable scale and shift, one value per channel.
+    """
+
+    def __init__(self, channels, eps=1e-5):
+        super().__init__()
+        self.channels, self.eps = channels, eps
+
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+
+    def forward(self, x):
+        # F.layer_norm works on the trailing dimension, so swap dimension 1
+        # to the end, normalize, and swap back.
+        swapped = x.transpose(1, -1)
+        normed = F.layer_norm(
+            swapped, (self.channels,), self.gamma, self.beta, self.eps
+        )
+        return normed.transpose(1, -1)
+
+
+class ConvReluNorm(nn.Module):
+    """Stack of Conv1d -> LayerNorm -> ReLU -> Dropout with a residual projection.
+
+    in_channels: channels of the input; the residual sum in forward adds
+        the input to the projection output
+    hidden_channels: channels of every convolution in the stack
+    out_channels: channels produced by the final 1x1 projection
+    kernel_size: kernel size of the stacked convolutions (padding is
+        kernel_size // 2)
+    n_layers: number of convolution layers; must be greater than 1
+    p_dropout: dropout probability applied after each ReLU
+
+    The projection is zero-initialised, so a freshly built module returns
+    its masked input.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        hidden_channels,
+        out_channels,
+        kernel_size,
+        n_layers,
+        p_dropout,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.hidden_channels = hidden_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+        # The message used to say "larger than 0" although the check
+        # rejects n_layers == 1; it now matches the enforced condition.
+        assert n_layers > 1, "Number of layers should be larger than 1."
+
+        self.conv_layers = nn.ModuleList()
+        self.norm_layers = nn.ModuleList()
+        # First layer maps in_channels -> hidden_channels.
+        self.conv_layers.append(
+            nn.Conv1d(
+                in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
+            )
+        )
+        self.norm_layers.append(LayerNorm(hidden_channels))
+        self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
+        # Remaining layers keep hidden_channels.
+        for _ in range(n_layers - 1):
+            self.conv_layers.append(
+                nn.Conv1d(
+                    hidden_channels,
+                    hidden_channels,
+                    kernel_size,
+                    padding=kernel_size // 2,
+                )
+            )
+            self.norm_layers.append(LayerNorm(hidden_channels))
+        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+        # Zero-init so the residual branch contributes nothing at the start.
+        self.proj.weight.data.zero_()
+        self.proj.bias.data.zero_()
+
+    def forward(self, x, x_mask):
+        """Return (x + proj(stack(x))) * x_mask.
+
+        x_mask is multiplied into the input of every convolution and into
+        the final result.
+        """
+        x_org = x
+        for conv, norm in zip(self.conv_layers, self.norm_layers):
+            x = conv(x * x_mask)
+            x = norm(x)
+            x = self.relu_drop(x)
+        x = x_org + self.proj(x)
+        return x * x_mask
+
+
+class DDSConv(nn.Module):
+    """
+    Dilated and Depth-Separable Convolution
+
+    Each layer is a depthwise dilated Conv1d followed by a 1x1 Conv1d,
+    each followed by LayerNorm and GELU, and is added residually to the
+    running signal.
+    """
+
+    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
+        super().__init__()
+        self.channels = channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+
+        self.drop = nn.Dropout(p_dropout)
+        self.convs_sep = nn.ModuleList()
+        self.convs_1x1 = nn.ModuleList()
+        self.norms_1 = nn.ModuleList()
+        self.norms_2 = nn.ModuleList()
+        for depth in range(n_layers):
+            # The dilation grows as kernel_size ** depth.
+            rate = kernel_size**depth
+            pad = (kernel_size * rate - rate) // 2
+            depthwise = nn.Conv1d(
+                channels,
+                channels,
+                kernel_size,
+                groups=channels,
+                dilation=rate,
+                padding=pad,
+            )
+            self.convs_sep.append(depthwise)
+            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
+            self.norms_1.append(LayerNorm(channels))
+            self.norms_2.append(LayerNorm(channels))
+
+    def forward(self, x, x_mask, g=None):
+        """Apply all layers residually; `g`, if given, is added to x first."""
+        if g is not None:
+            x = x + g
+        layer_parts = zip(self.convs_sep, self.convs_1x1, self.norms_1, self.norms_2)
+        for depthwise, pointwise, norm_a, norm_b in layer_parts:
+            branch = depthwise(x * x_mask)
+            branch = F.gelu(norm_a(branch))
+            branch = pointwise(branch)
+            branch = F.gelu(norm_b(branch))
+            x = x + self.drop(branch)
+        return x * x_mask
+
+
+class WN(torch.nn.Module):
+    """Stack of dilated, weight-normalized convolutions with res/skip outputs.
+
+    hidden_channels: channel count of the input and of the output
+    kernel_size: kernel size of the dilated convolutions (must be odd)
+    dilation_rate: layer i uses dilation dilation_rate ** i
+    n_layers: number of layers
+    gin_channels: channels of the optional conditioning input g;
+        0 means no conditioning layer is created
+    p_dropout: dropout probability applied to the gated activations
+    """
+
+    def __init__(
+        self,
+        hidden_channels,
+        kernel_size,
+        dilation_rate,
+        n_layers,
+        gin_channels=0,
+        p_dropout=0,
+    ):
+        super(WN, self).__init__()
+        assert kernel_size % 2 == 1
+        self.hidden_channels = hidden_channels
+        # NOTE: the trailing comma stores kernel_size as a 1-tuple.
+        self.kernel_size = (kernel_size,)
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+        self.p_dropout = p_dropout
+
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.drop = nn.Dropout(p_dropout)
+
+        if gin_channels != 0:
+            # A single projection yields the conditioning for all layers;
+            # forward() slices out 2 * hidden_channels per layer.
+            cond_layer = torch.nn.Conv1d(
+                gin_channels, 2 * hidden_channels * n_layers, 1
+            )
+            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
+
+        for i in range(n_layers):
+            dilation = dilation_rate**i
+            padding = int((kernel_size * dilation - dilation) / 2)
+            # 2 * hidden_channels outputs feed the gated activation.
+            in_layer = torch.nn.Conv1d(
+                hidden_channels,
+                2 * hidden_channels,
+                kernel_size,
+                dilation=dilation,
+                padding=padding,
+            )
+            in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
+            self.in_layers.append(in_layer)
+
+            # last one is not necessary
+            # (the final layer only emits skip channels, no residual half)
+            if i < n_layers - 1:
+                res_skip_channels = 2 * hidden_channels
+            else:
+                res_skip_channels = hidden_channels
+
+            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
+            self.res_skip_layers.append(res_skip_layer)
+
+    def forward(self, x, x_mask, g=None, **kwargs):
+        """Return the masked sum of all layers' skip outputs.
+
+        x: input; x_mask: mask multiplied into the residual path and the
+        result; g: optional conditioning, projected by cond_layer.
+        Extra keyword arguments are accepted and ignored.
+        """
+        output = torch.zeros_like(x)
+        n_channels_tensor = torch.IntTensor([self.hidden_channels])
+
+        if g is not None:
+            g = self.cond_layer(g)
+
+        for i in range(self.n_layers):
+            x_in = self.in_layers[i](x)
+            if g is not None:
+                # Take this layer's 2 * hidden_channels slice of the conditioning.
+                cond_offset = i * 2 * self.hidden_channels
+                g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
+            else:
+                g_l = torch.zeros_like(x_in)
+
+            # Helper lives in commons; presumably a tanh * sigmoid gate over
+            # (x_in + g_l), judging by its name -- verify in commons.
+            acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
+            acts = self.drop(acts)
+
+            res_skip_acts = self.res_skip_layers[i](acts)
+            if i < self.n_layers - 1:
+                # First half updates the residual stream, second half is
+                # accumulated into the output.
+                res_acts = res_skip_acts[:, : self.hidden_channels, :]
+                x = (x + res_acts) * x_mask
+                output = output + res_skip_acts[:, self.hidden_channels :, :]
+            else:
+                output = output + res_skip_acts
+        return output * x_mask
+
+    def remove_weight_norm(self):
+        """Remove weight normalization from every layer that has it."""
+        if self.gin_channels != 0:
+            torch.nn.utils.remove_weight_norm(self.cond_layer)
+        for l in self.in_layers:
+            torch.nn.utils.remove_weight_norm(l)
+        for l in self.res_skip_layers:
+            torch.nn.utils.remove_weight_norm(l)
+
+
+class ResBlock1(torch.nn.Module):
+    """Residual block of three (dilated conv, plain conv) pairs.
+
+    `convs1` uses dilation[0], dilation[1] and dilation[2]; `convs2`
+    always uses dilation 1. All six convolutions are weight-normalized.
+    """
+
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super(ResBlock1, self).__init__()
+
+        def build_conv(rate):
+            # Weight-normalized Conv1d with stride 1 at the given dilation.
+            return weight_norm(
+                Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    1,
+                    dilation=rate,
+                    padding=get_padding(kernel_size, rate),
+                )
+            )
+
+        self.convs1 = nn.ModuleList([build_conv(dilation[pos]) for pos in range(3)])
+        self.convs1.apply(init_weights)
+
+        self.convs2 = nn.ModuleList([build_conv(1) for _ in range(3)])
+        self.convs2.apply(init_weights)
+
+    def forward(self, x, x_mask=None):
+        """Apply the three conv pairs residually; mask before each conv if given."""
+        use_mask = x_mask is not None
+        for dilated, plain in zip(self.convs1, self.convs2):
+            hidden = F.leaky_relu(x, LRELU_SLOPE)
+            if use_mask:
+                hidden = hidden * x_mask
+            hidden = dilated(hidden)
+            hidden = F.leaky_relu(hidden, LRELU_SLOPE)
+            if use_mask:
+                hidden = hidden * x_mask
+            hidden = plain(hidden)
+            x = hidden + x
+        if use_mask:
+            x = x * x_mask
+        return x
+
+    def remove_weight_norm(self):
+        """Remove weight normalization from all six convolutions."""
+        for conv in self.convs1:
+            remove_weight_norm(conv)
+        for conv in self.convs2:
+            remove_weight_norm(conv)
+
+
+class ResBlock2(torch.nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
+        super(ResBlock2, self).__init__()
+        self.convs = nn.ModuleList(
+            [
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[0],
+                        padding=get_padding(kernel_size, dilation[0]),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[1],
+                        padding=get_padding(kernel_size, dilation[1]),
+                    )
+                ),
+            ]
+        )
+        self.convs.apply(init_weights)
+
+    def forward(self, x, x_mask=None):
+        for c in self.convs:
+            xt = F.leaky_relu(x, LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c(xt)
+            x = xt + x
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.convs:
+            remove_weight_norm(l)
+
+
+class Log(nn.Module):
+    def forward(self, x, x_mask, reverse=False, **kwargs):
+        if not reverse:
+            y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
+            logdet = torch.sum(-y, [1, 2])
+            return y, logdet
+        else:
+            x = torch.exp(x) * x_mask
+            return x
+
+
+class Flip(nn.Module):
+    def forward(self, x, *args, reverse=False, **kwargs):
+        x = torch.flip(x, [1])
+        if not reverse:
+            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+            return x, logdet
+        else:
+            return x
+
+
+class ElementwiseAffine(nn.Module):
+    def __init__(self, channels):
+        super().__init__()
+        self.channels = channels
+        self.m = nn.Parameter(torch.zeros(channels, 1))
+        self.logs = nn.Parameter(torch.zeros(channels, 1))
+
+    def forward(self, x, x_mask, reverse=False, **kwargs):
+        if not reverse:
+            y = self.m + torch.exp(self.logs) * x
+            y = y * x_mask
+            logdet = torch.sum(self.logs * x_mask, [1, 2])
+            return y, logdet
+        else:
+            x = (x - self.m) * torch.exp(-self.logs) * x_mask
+            return x
+
+
+class ResidualCouplingLayer(nn.Module):
+    def __init__(
+        self,
+        channels,
+        hidden_channels,
+        kernel_size,
+        dilation_rate,
+        n_layers,
+        p_dropout=0,
+        gin_channels=0,
+        mean_only=False,
+    ):
+        assert channels % 2 == 0, "channels should be divisible by 2"
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.half_channels = channels // 2
+        self.mean_only = mean_only
+
+        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+        self.enc = WN(
+            hidden_channels,
+            kernel_size,
+            dilation_rate,
+            n_layers,
+            p_dropout=p_dropout,
+            gin_channels=gin_channels,
+        )
+        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+        self.post.weight.data.zero_()
+        self.post.bias.data.zero_()
+
+    def forward(self, x, x_mask, g=None, reverse=False):
+        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+        h = self.pre(x0) * x_mask
+        h = self.enc(h, x_mask, g=g)
+        stats = self.post(h) * x_mask
+        if not self.mean_only:
+            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
+        else:
+            m = stats
+            logs = torch.zeros_like(m)
+
+        if not reverse:
+            x1 = m + x1 * torch.exp(logs) * x_mask
+            x = torch.cat([x0, x1], 1)
+            logdet = torch.sum(logs, [1, 2])
+            return x, logdet
+        else:
+            x1 = (x1 - m) * torch.exp(-logs) * x_mask
+            x = torch.cat([x0, x1], 1)
+            return x
+
+    def remove_weight_norm(self):
+        self.enc.remove_weight_norm()
+
+
+class ConvFlow(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        filter_channels,
+        kernel_size,
+        n_layers,
+        num_bins=10,
+        tail_bound=5.0,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.num_bins = num_bins
+        self.tail_bound = tail_bound
+        self.half_channels = in_channels // 2
+
+        self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
+        self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
+        self.proj = nn.Conv1d(
+            filter_channels, self.half_channels * (num_bins * 3 - 1), 1
+        )
+        self.proj.weight.data.zero_()
+        self.proj.bias.data.zero_()
+
+    def forward(self, x, x_mask, g=None, reverse=False):
+        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+        h = self.pre(x0)
+        h = self.convs(h, x_mask, g=g)
+        h = self.proj(h) * x_mask
+
+        b, c, t = x0.shape
+        h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2)  # [b, cx?, t] -> [b, c, t, ?]
+
+        unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
+        unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
+            self.filter_channels
+        )
+        unnormalized_derivatives = h[..., 2 * self.num_bins :]
+
+        x1, logabsdet = piecewise_rational_quadratic_transform(
+            x1,
+            unnormalized_widths,
+            unnormalized_heights,
+            unnormalized_derivatives,
+            inverse=reverse,
+            tails="linear",
+            tail_bound=self.tail_bound,
+        )
+
+        x = torch.cat([x0, x1], 1) * x_mask
+        logdet = torch.sum(logabsdet * x_mask, [1, 2])
+        if not reverse:
+            return x, logdet
+        else:
+            return x
--- a/infer_pack/transforms.py
+++ b/infer_pack/transforms.py
@ -0,0 +1,193 @@
+import torch
+from torch.nn import functional as F
+
+import numpy as np
+
+
+# Default lower bounds used as keyword defaults by the spline functions in
+# this module: minimum bin width, minimum bin height and minimum derivative.
+DEFAULT_MIN_BIN_WIDTH = 1e-3
+DEFAULT_MIN_BIN_HEIGHT = 1e-3
+DEFAULT_MIN_DERIVATIVE = 1e-3
+
+
+def piecewise_rational_quadratic_transform(inputs, 
+                                           unnormalized_widths,
+                                           unnormalized_heights,
+                                           unnormalized_derivatives,
+                                           inverse=False,
+                                           tails=None, 
+                                           tail_bound=1.,
+                                           min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+                                           min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+                                           min_derivative=DEFAULT_MIN_DERIVATIVE):
+
+    if tails is None:
+        spline_fn = rational_quadratic_spline
+        spline_kwargs = {}
+    else:
+        spline_fn = unconstrained_rational_quadratic_spline
+        spline_kwargs = {
+            'tails': tails,
+            'tail_bound': tail_bound
+        }
+
+    outputs, logabsdet = spline_fn(
+            inputs=inputs,
+            unnormalized_widths=unnormalized_widths,
+            unnormalized_heights=unnormalized_heights,
+            unnormalized_derivatives=unnormalized_derivatives,
+            inverse=inverse,
+            min_bin_width=min_bin_width,
+            min_bin_height=min_bin_height,
+            min_derivative=min_derivative,
+            **spline_kwargs
+    )
+    return outputs, logabsdet
+
+
+def searchsorted(bin_locations, inputs, eps=1e-6):
+    bin_locations[..., -1] += eps
+    return torch.sum(
+        inputs[..., None] >= bin_locations,
+        dim=-1
+    ) - 1
+
+
+def unconstrained_rational_quadratic_spline(inputs,
+                                            unnormalized_widths,
+                                            unnormalized_heights,
+                                            unnormalized_derivatives,
+                                            inverse=False,
+                                            tails='linear',
+                                            tail_bound=1.,
+                                            min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+                                            min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+                                            min_derivative=DEFAULT_MIN_DERIVATIVE):
+    inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
+    outside_interval_mask = ~inside_interval_mask
+
+    outputs = torch.zeros_like(inputs)
+    logabsdet = torch.zeros_like(inputs)
+
+    if tails == 'linear':
+        unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
+        constant = np.log(np.exp(1 - min_derivative) - 1)
+        unnormalized_derivatives[..., 0] = constant
+        unnormalized_derivatives[..., -1] = constant
+
+        outputs[outside_interval_mask] = inputs[outside_interval_mask]
+        logabsdet[outside_interval_mask] = 0
+    else:
+        raise RuntimeError('{} tails are not implemented.'.format(tails))
+
+    outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
+        inputs=inputs[inside_interval_mask],
+        unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
+        unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
+        unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
+        inverse=inverse,
+        left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
+        min_bin_width=min_bin_width,
+        min_bin_height=min_bin_height,
+        min_derivative=min_derivative
+    )
+
+    return outputs, logabsdet
+
+def rational_quadratic_spline(inputs,
+                              unnormalized_widths,
+                              unnormalized_heights,
+                              unnormalized_derivatives,
+                              inverse=False,
+                              left=0., right=1., bottom=0., top=1.,
+                              min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+                              min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+                              min_derivative=DEFAULT_MIN_DERIVATIVE):
+    """Evaluate (or invert) a piecewise rational-quadratic spline.
+
+    Bin widths and heights are obtained from a softmax over the last
+    dimension of the unnormalized tensors, floored at ``min_bin_width`` /
+    ``min_bin_height`` and rescaled to span ``[left, right]`` and
+    ``[bottom, top]``. Knot derivatives are ``min_derivative + softplus(...)``.
+
+    NOTE(review): ``unnormalized_derivatives`` presumably carries one more
+    entry than the number of bins along its last dimension (it is gathered
+    both directly and via ``[..., 1:]``) -- confirm against callers.
+
+    Returns:
+        ``(outputs, logabsdet)``. With ``inverse=True`` the outputs are the
+        inverted values and the returned log-determinant is negated.
+
+    Raises:
+        ValueError: if any input lies outside ``[left, right]`` (this bound is
+            checked in both directions), or if ``min_bin_width`` /
+            ``min_bin_height`` times the number of bins exceeds 1.
+    """
+    if torch.min(inputs) < left or torch.max(inputs) > right:
+        raise ValueError('Input to a transform is not within its domain')
+
+    num_bins = unnormalized_widths.shape[-1]
+
+    if min_bin_width * num_bins > 1.0:
+        raise ValueError('Minimal bin width too large for the number of bins')
+    if min_bin_height * num_bins > 1.0:
+        raise ValueError('Minimal bin height too large for the number of bins')
+
+    # Normalised widths: each at least min_bin_width, summing to 1.
+    widths = F.softmax(unnormalized_widths, dim=-1)
+    widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
+    # Bin edges along the input axis, rescaled to [left, right].
+    cumwidths = torch.cumsum(widths, dim=-1)
+    cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0)
+    cumwidths = (right - left) * cumwidths + left
+    # Pin the end points exactly, then recompute widths from the edges.
+    cumwidths[..., 0] = left
+    cumwidths[..., -1] = right
+    widths = cumwidths[..., 1:] - cumwidths[..., :-1]
+
+    derivatives = min_derivative + F.softplus(unnormalized_derivatives)
+
+    # Same construction for the output axis, spanning [bottom, top].
+    heights = F.softmax(unnormalized_heights, dim=-1)
+    heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
+    cumheights = torch.cumsum(heights, dim=-1)
+    cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0)
+    cumheights = (top - bottom) * cumheights + bottom
+    cumheights[..., 0] = bottom
+    cumheights[..., -1] = top
+    heights = cumheights[..., 1:] - cumheights[..., :-1]
+
+    # Locate each input's bin: by height edges when inverting, by width edges otherwise.
+    if inverse:
+        bin_idx = searchsorted(cumheights, inputs)[..., None]
+    else:
+        bin_idx = searchsorted(cumwidths, inputs)[..., None]
+
+    # Per-input quantities of the selected bin.
+    input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
+    input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
+
+    input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
+    # Average slope of every bin.
+    delta = heights / widths
+    input_delta = delta.gather(-1, bin_idx)[..., 0]
+
+    # Derivatives at the left and right edge of the selected bin.
+    input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
+    input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
+
+    input_heights = heights.gather(-1, bin_idx)[..., 0]
+
+    if inverse:
+        # Coefficients of the quadratic a*root^2 + b*root + c = 0 solved below.
+        a = (((inputs - input_cumheights) * (input_derivatives
+                                             + input_derivatives_plus_one
+                                             - 2 * input_delta)
+              + input_heights * (input_delta - input_derivatives)))
+        b = (input_heights * input_derivatives
+             - (inputs - input_cumheights) * (input_derivatives
+                                              + input_derivatives_plus_one
+                                              - 2 * input_delta))
+        c = - input_delta * (inputs - input_cumheights)
+
+        discriminant = b.pow(2) - 4 * a * c
+        assert (discriminant >= 0).all()
+
+        # Root written as 2c / (-b - sqrt(disc)) rather than the textbook form.
+        root = (2 * c) / (-b - torch.sqrt(discriminant))
+        outputs = root * input_bin_widths + input_cumwidths
+
+        theta_one_minus_theta = root * (1 - root)
+        denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
+                                     * theta_one_minus_theta)
+        derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2)
+                                                     + 2 * input_delta * theta_one_minus_theta
+                                                     + input_derivatives * (1 - root).pow(2))
+        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
+
+        # The inverse map's log-determinant is the negative of the forward one.
+        return outputs, -logabsdet
+    else:
+        # Relative position of the input inside its bin.
+        theta = (inputs - input_cumwidths) / input_bin_widths
+        theta_one_minus_theta = theta * (1 - theta)
+
+        numerator = input_heights * (input_delta * theta.pow(2)
+                                     + input_derivatives * theta_one_minus_theta)
+        denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
+                                     * theta_one_minus_theta)
+        outputs = input_cumheights + numerator / denominator
+
+        derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2)
+                                                     + 2 * input_delta * theta_one_minus_theta
+                                                     + input_derivatives * (1 - theta).pow(2))
+        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
+
+        return outputs, logabsdet
--- a/todo-songs/冬之花clip1.wav
+++ b/todo-songs/冬之花clip1.wav
--- a/train/pycache/data_utils.cpython-39.pyc
+++ b/train/pycache/data_utils.cpython-39.pyc
--- a/train/pycache/losses.cpython-39.pyc
+++ b/train/pycache/losses.cpython-39.pyc
--- a/train/pycache/mel_processing.cpython-39.pyc
+++ b/train/pycache/mel_processing.cpython-39.pyc
--- a/train/pycache/process_ckpt.cpython-39.pyc
+++ b/train/pycache/process_ckpt.cpython-39.pyc
--- a/train/pycache/utils.cpython-39.pyc
+++ b/train/pycache/utils.cpython-39.pyc
--- a/train/cmd.txt
+++ b/train/cmd.txt
@ -0,0 +1 @@
+python train_nsf_sim_cache_sid.py -c configs/mi_mix40k_nsf_co256_cs1sid_ms2048.json -m ft-mi
--- a/train/data_utils.py
+++ b/train/data_utils.py
@ -0,0 +1,485 @@
+import os,traceback
+import numpy as np
+import torch
+import torch.utils.data
+
+from mel_processing import spectrogram_torch
+from utils import load_wav_to_torch, load_filepaths_and_text
+
+class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
+    """
+    1) loads audio, text pairs
+    2) normalizes text and converts them to sequences of integers
+    3) computes spectrograms from audio files.
+    """
+
+    def __init__(self, audiopaths_and_text, hparams):
+        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
+        self.max_wav_value  = hparams.max_wav_value
+        self.sampling_rate  = hparams.sampling_rate
+        self.filter_length  = hparams.filter_length
+        self.hop_length     = hparams.hop_length
+        self.win_length     = hparams.win_length
+        self.sampling_rate  = hparams.sampling_rate
+        self.min_text_len   = getattr(hparams, "min_text_len", 1)
+        self.max_text_len   = getattr(hparams, "max_text_len", 5000)
+        self._filter()
+
+    def _filter(self):
+        """
+        Filter text & store spec lengths
+        """
+        # Store spectrogram lengths for Bucketing
+        # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
+        # spec_length = wav_length // hop_length
+        audiopaths_and_text_new = []
+        lengths = []
+        for audiopath, text, pitch,pitchf,dv in self.audiopaths_and_text:
+            if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
+                audiopaths_and_text_new.append([audiopath, text, pitch,pitchf,dv])
+                lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
+        self.audiopaths_and_text = audiopaths_and_text_new
+        self.lengths = lengths
+    def get_sid(self, sid):
+        sid = torch.LongTensor([int(sid)])
+        return sid
+
+    def get_audio_text_pair(self, audiopath_and_text):
+        # separate filename and text
+        file = audiopath_and_text[0]
+        phone = audiopath_and_text[1]
+        pitch = audiopath_and_text[2]
+        pitchf = audiopath_and_text[3]
+        dv = audiopath_and_text[4]
+
+        phone, pitch, pitchf = self.get_labels(phone, pitch, pitchf)
+        spec, wav = self.get_audio(file)
+        dv=self.get_sid(dv)
+
+        len_phone = phone.size()[0]
+        len_spec = spec.size()[-1]
+        # print(123,phone.shape,pitch.shape,spec.shape)
+        if len_phone != len_spec:
+            len_min = min(len_phone, len_spec)
+            # amor
+            len_wav = len_min * self.hop_length
+
+            spec = spec[:, :len_min]
+            wav = wav[:, :len_wav]
+
+            phone = phone[:len_min, :]
+            pitch = pitch[:len_min]
+            pitchf = pitchf[:len_min]
+
+        return (spec, wav, phone, pitch,pitchf,dv)
+
+    def get_labels(self, phone, pitch,pitchf):
+        phone = np.load(phone)
+        phone = np.repeat(phone, 2, axis=0)
+        pitch = np.load(pitch)
+        pitchf = np.load(pitchf)
+        n_num = min(phone.shape[0], 900)  # DistributedBucketSampler
+        # print(234,phone.shape,pitch.shape)
+        phone = phone[:n_num, :]
+        pitch = pitch[:n_num]
+        pitchf = pitchf[:n_num]
+        phone = torch.FloatTensor(phone)
+        pitch = torch.LongTensor(pitch)
+        pitchf = torch.FloatTensor(pitchf)
+        return phone, pitch,pitchf
+
+    def get_audio(self, filename):
+        audio, sampling_rate = load_wav_to_torch(filename)
+        if sampling_rate != self.sampling_rate:
+            raise ValueError(
+                "{} SR doesn't match target {} SR".format(
+                    sampling_rate, self.sampling_rate
+                )
+            )
+        audio_norm = audio / self.max_wav_value
+        audio_norm = audio_norm.unsqueeze(0)
+        spec_filename = filename.replace(".wav", ".spec.pt")
+        if os.path.exists(spec_filename):
+            try:
+                spec = torch.load(spec_filename)
+            except:
+                print (spec_filename,traceback.format_exc())
+                spec = spectrogram_torch(audio_norm, self.filter_length,
+                                         self.sampling_rate, self.hop_length, self.win_length,
+                                         center=False)
+                spec = torch.squeeze(spec, 0)
+                torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
+        else:
+            spec = spectrogram_torch(
+                audio_norm,
+                self.filter_length,
+                self.sampling_rate,
+                self.hop_length,
+                self.win_length,
+                center=False,
+            )
+            spec = torch.squeeze(spec, 0)
+            torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
+        return spec, audio_norm
+
+    def __getitem__(self, index):
+        return self.get_audio_text_pair(self.audiopaths_and_text[index])
+
+    def __len__(self):
+        return len(self.audiopaths_and_text)
+class TextAudioCollateMultiNSFsid:
+    """Zero-pads model inputs and targets"""
+
+    def __init__(self, return_ids=False):
+        self.return_ids = return_ids
+
+    def __call__(self, batch):
+        """Collate's training batch from normalized text and aduio
+        PARAMS
+        ------
+        batch: [text_normalized, spec_normalized, wav_normalized]
+        """
+        # Right zero-pad all one-hot text sequences to max input length
+        _, ids_sorted_decreasing = torch.sort(
+            torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True
+        )
+
+        max_spec_len = max([x[0].size(1) for x in batch])
+        max_wave_len = max([x[1].size(1) for x in batch])
+        spec_lengths = torch.LongTensor(len(batch))
+        wave_lengths = torch.LongTensor(len(batch))
+        spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len)
+        wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len)
+        spec_padded.zero_()
+        wave_padded.zero_()
+
+        max_phone_len = max([x[2].size(0) for x in batch])
+        phone_lengths = torch.LongTensor(len(batch))
+        phone_padded = torch.FloatTensor(len(batch), max_phone_len, batch[0][2].shape[1])#(spec, wav, phone, pitch)
+        pitch_padded = torch.LongTensor(len(batch), max_phone_len)
+        pitchf_padded = torch.FloatTensor(len(batch), max_phone_len)
+        phone_padded.zero_()
+        pitch_padded.zero_()
+        pitchf_padded.zero_()
+        # dv = torch.FloatTensor(len(batch), 256)#gin=256
+        sid = torch.LongTensor(len(batch))
+
+        for i in range(len(ids_sorted_decreasing)):
+            row = batch[ids_sorted_decreasing[i]]
+
+            spec = row[0]
+            spec_padded[i, :, : spec.size(1)] = spec
+            spec_lengths[i] = spec.size(1)
+
+            wave = row[1]
+            wave_padded[i, :, : wave.size(1)] = wave
+            wave_lengths[i] = wave.size(1)
+
+            phone = row[2]
+            phone_padded[i, : phone.size(0), :] = phone
+            phone_lengths[i] = phone.size(0)
+
+            pitch = row[3]
+            pitch_padded[i, : pitch.size(0)] = pitch
+            pitchf = row[4]
+            pitchf_padded[i, : pitchf.size(0)] = pitchf
+
+            # dv[i] = row[5]
+            sid[i] = row[5]
+
+
+        return (
+            phone_padded,
+            phone_lengths,
+            pitch_padded,
+            pitchf_padded,
+            spec_padded,
+            spec_lengths,
+            wave_padded,
+            wave_lengths,
+            # dv
+            sid
+        )
+
+class TextAudioLoader(torch.utils.data.Dataset):
+    """
+    1) loads audio, text pairs
+    2) normalizes text and converts them to sequences of integers
+    3) computes spectrograms from audio files.
+    """
+
+    def __init__(self, audiopaths_and_text, hparams):
+        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
+        self.max_wav_value  = hparams.max_wav_value
+        self.sampling_rate  = hparams.sampling_rate
+        self.filter_length  = hparams.filter_length
+        self.hop_length     = hparams.hop_length
+        self.win_length     = hparams.win_length
+        self.sampling_rate  = hparams.sampling_rate
+        self.min_text_len   = getattr(hparams, "min_text_len", 1)
+        self.max_text_len   = getattr(hparams, "max_text_len", 5000)
+        self._filter()
+
+    def _filter(self):
+        """
+        Filter text & store spec lengths
+        """
+        # Store spectrogram lengths for Bucketing
+        # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
+        # spec_length = wav_length // hop_length
+        audiopaths_and_text_new = []
+        lengths = []
+        for audiopath, text,dv in self.audiopaths_and_text:
+            if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
+                audiopaths_and_text_new.append([audiopath, text,dv])
+                lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
+        self.audiopaths_and_text = audiopaths_and_text_new
+        self.lengths = lengths
+    def get_sid(self, sid):
+        sid = torch.LongTensor([int(sid)])
+        return sid
+
+    def get_audio_text_pair(self, audiopath_and_text):
+        # separate filename and text
+        file = audiopath_and_text[0]
+        phone = audiopath_and_text[1]
+        dv = audiopath_and_text[2]
+
+        phone = self.get_labels(phone)
+        spec, wav = self.get_audio(file)
+        dv=self.get_sid(dv)
+
+        len_phone = phone.size()[0]
+        len_spec = spec.size()[-1]
+        if len_phone != len_spec:
+            len_min = min(len_phone, len_spec)
+            len_wav = len_min * self.hop_length
+            spec = spec[:, :len_min]
+            wav = wav[:, :len_wav]
+            phone = phone[:len_min, :]
+        return (spec, wav, phone,dv)
+
+    def get_labels(self, phone):
+        phone = np.load(phone)
+        phone = np.repeat(phone, 2, axis=0)
+        n_num = min(phone.shape[0], 900)  # DistributedBucketSampler
+        phone = phone[:n_num, :]
+        phone = torch.FloatTensor(phone)
+        return phone
+
+    def get_audio(self, filename):
+        audio, sampling_rate = load_wav_to_torch(filename)
+        if sampling_rate != self.sampling_rate:
+            raise ValueError(
+                "{} SR doesn't match target {} SR".format(
+                    sampling_rate, self.sampling_rate
+                )
+            )
+        audio_norm = audio / self.max_wav_value
+        audio_norm = audio_norm.unsqueeze(0)
+        spec_filename = filename.replace(".wav", ".spec.pt")
+        if os.path.exists(spec_filename):
+            try:
+                spec = torch.load(spec_filename)
+            except:
+                print (spec_filename,traceback.format_exc())
+                spec = spectrogram_torch(audio_norm, self.filter_length,
+                                         self.sampling_rate, self.hop_length, self.win_length,
+                                         center=False)
+                spec = torch.squeeze(spec, 0)
+                torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
+        else:
+            spec = spectrogram_torch(
+                audio_norm,
+                self.filter_length,
+                self.sampling_rate,
+                self.hop_length,
+                self.win_length,
+                center=False,
+            )
+            spec = torch.squeeze(spec, 0)
+            torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
+        return spec, audio_norm
+
+    def __getitem__(self, index):
+        return self.get_audio_text_pair(self.audiopaths_and_text[index])
+
+    def __len__(self):
+        return len(self.audiopaths_and_text)
+class TextAudioCollate:
+    """Zero-pads model inputs and targets"""
+
+    def __init__(self, return_ids=False):
+        self.return_ids = return_ids
+
+    def __call__(self, batch):
+        """Collate's training batch from normalized text and aduio
+        PARAMS
+        ------
+        batch: [text_normalized, spec_normalized, wav_normalized]
+        """
+        # Right zero-pad all one-hot text sequences to max input length
+        _, ids_sorted_decreasing = torch.sort(
+            torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True
+        )
+
+        max_spec_len = max([x[0].size(1) for x in batch])
+        max_wave_len = max([x[1].size(1) for x in batch])
+        spec_lengths = torch.LongTensor(len(batch))
+        wave_lengths = torch.LongTensor(len(batch))
+        spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len)
+        wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len)
+        spec_padded.zero_()
+        wave_padded.zero_()
+
+        max_phone_len = max([x[2].size(0) for x in batch])
+        phone_lengths = torch.LongTensor(len(batch))
+        phone_padded = torch.FloatTensor(len(batch), max_phone_len, batch[0][2].shape[1])
+        phone_padded.zero_()
+        sid = torch.LongTensor(len(batch))
+
+        for i in range(len(ids_sorted_decreasing)):
+            row = batch[ids_sorted_decreasing[i]]
+
+            spec = row[0]
+            spec_padded[i, :, : spec.size(1)] = spec
+            spec_lengths[i] = spec.size(1)
+
+            wave = row[1]
+            wave_padded[i, :, : wave.size(1)] = wave
+            wave_lengths[i] = wave.size(1)
+
+            phone = row[2]
+            phone_padded[i, : phone.size(0), :] = phone
+            phone_lengths[i] = phone.size(0)
+
+            sid[i] = row[3]
+
+
+        return (
+            phone_padded,
+            phone_lengths,
+            spec_padded,
+            spec_lengths,
+            wave_padded,
+            wave_lengths,
+            sid
+        )
+
+class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
+    """
+    Maintain similar input lengths in a batch.
+    Length groups are specified by boundaries.
+    Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}.
+
+    It removes samples which are not included in the boundaries.
+    Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
+    """
+
+    def __init__(
+        self,
+        dataset,
+        batch_size,
+        boundaries,
+        num_replicas=None,
+        rank=None,
+        shuffle=True,
+    ):
+        super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
+        self.lengths = dataset.lengths
+        self.batch_size = batch_size
+        self.boundaries = boundaries
+
+        self.buckets, self.num_samples_per_bucket = self._create_buckets()
+        self.total_size = sum(self.num_samples_per_bucket)
+        self.num_samples = self.total_size // self.num_replicas
+
+    def _create_buckets(self):
+        buckets = [[] for _ in range(len(self.boundaries) - 1)]
+        for i in range(len(self.lengths)):
+            length = self.lengths[i]
+            idx_bucket = self._bisect(length)
+            if idx_bucket != -1:
+                buckets[idx_bucket].append(i)
+
+        for i in range(len(buckets) - 1, -1, -1):#
+            if len(buckets[i]) == 0:
+                buckets.pop(i)
+                self.boundaries.pop(i + 1)
+
+        num_samples_per_bucket = []
+        for i in range(len(buckets)):
+            len_bucket = len(buckets[i])
+            total_batch_size = self.num_replicas * self.batch_size
+            rem = (
+                total_batch_size - (len_bucket % total_batch_size)
+            ) % total_batch_size
+            num_samples_per_bucket.append(len_bucket + rem)
+        return buckets, num_samples_per_bucket
+
+    def __iter__(self):
+        # deterministically shuffle based on epoch
+        g = torch.Generator()
+        g.manual_seed(self.epoch)
+
+        indices = []
+        if self.shuffle:
+            for bucket in self.buckets:
+                indices.append(torch.randperm(len(bucket), generator=g).tolist())
+        else:
+            for bucket in self.buckets:
+                indices.append(list(range(len(bucket))))
+
+        batches = []
+        for i in range(len(self.buckets)):
+            bucket = self.buckets[i]
+            len_bucket = len(bucket)
+            ids_bucket = indices[i]
+            num_samples_bucket = self.num_samples_per_bucket[i]
+
+            # add extra samples to make it evenly divisible
+            rem = num_samples_bucket - len_bucket
+            ids_bucket = (
+                ids_bucket
+                + ids_bucket * (rem // len_bucket)
+                + ids_bucket[: (rem % len_bucket)]
+            )
+
+            # subsample
+            ids_bucket = ids_bucket[self.rank :: self.num_replicas]
+
+            # batching
+            for j in range(len(ids_bucket) // self.batch_size):
+                batch = [
+                    bucket[idx]
+                    for idx in ids_bucket[
+                        j * self.batch_size : (j + 1) * self.batch_size
+                    ]
+                ]
+                batches.append(batch)
+
+        if self.shuffle:
+            batch_ids = torch.randperm(len(batches), generator=g).tolist()
+            batches = [batches[i] for i in batch_ids]
+        self.batches = batches
+
+        assert len(self.batches) * self.batch_size == self.num_samples
+        return iter(self.batches)
+
+    def _bisect(self, x, lo=0, hi=None):
+        if hi is None:
+            hi = len(self.boundaries) - 1
+
+        if hi > lo:
+            mid = (hi + lo) // 2
+            if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
+                return mid
+            elif x <= self.boundaries[mid]:
+                return self._bisect(x, lo, mid)
+            else:
+                return self._bisect(x, mid + 1, hi)
+        else:
+            return -1
+
+    def __len__(self):
+        return self.num_samples // self.batch_size
--- a/train/losses.py
+++ b/train/losses.py
@ -0,0 +1,58 @@
+import torch
+from torch.nn import functional as F
+
+def feature_loss(fmap_r, fmap_g):
+    loss = 0
+    for dr, dg in zip(fmap_r, fmap_g):
+        for rl, gl in zip(dr, dg):
+            rl = rl.float().detach()
+            gl = gl.float()
+            loss += torch.mean(torch.abs(rl - gl))
+
+    return loss * 2
+
+
+def discriminator_loss(disc_real_outputs, disc_generated_outputs):
+    loss = 0
+    r_losses = []
+    g_losses = []
+    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
+        dr = dr.float()
+        dg = dg.float()
+        r_loss = torch.mean((1 - dr) ** 2)
+        g_loss = torch.mean(dg**2)
+        loss += r_loss + g_loss
+        r_losses.append(r_loss.item())
+        g_losses.append(g_loss.item())
+
+    return loss, r_losses, g_losses
+
+
+def generator_loss(disc_outputs):
+    loss = 0
+    gen_losses = []
+    for dg in disc_outputs:
+        dg = dg.float()
+        l = torch.mean((1 - dg) ** 2)
+        gen_losses.append(l)
+        loss += l
+
+    return loss, gen_losses
+
+
+def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
+    """
+    z_p, logs_q: [b, h, t_t]
+    m_p, logs_p: [b, h, t_t]
+    """
+    z_p = z_p.float()
+    logs_q = logs_q.float()
+    m_p = m_p.float()
+    logs_p = logs_p.float()
+    z_mask = z_mask.float()
+
+    kl = logs_p - logs_q - 0.5
+    kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
+    kl = torch.sum(kl * z_mask)
+    l = kl / torch.sum(z_mask)
+    return l
--- a/train/mel_processing.py
+++ b/train/mel_processing.py
@ -0,0 +1,149 @@
+import math
+import os
+import random
+import torch
+from torch import nn
+import torch.nn.functional as F
+import torch.utils.data
+import numpy as np
+import librosa
+import librosa.util as librosa_util
+from librosa.util import normalize, pad_center, tiny
+from scipy.signal import get_window
+from scipy.io.wavfile import read
+from librosa.filters import mel as librosa_mel_fn
+
+# 32768.0 == 2**15. NOTE(review): presumably the int16 full-scale value used
+# to normalise PCM samples -- it is not referenced in this part of the file; confirm against callers.
+MAX_WAV_VALUE = 32768.0
+
+
+def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+    """
+    PARAMS
+    ------
+    C: compression factor
+    """
+    return torch.log(torch.clamp(x, min=clip_val) * C)
+
+
+def dynamic_range_decompression_torch(x, C=1):
+    """
+    PARAMS
+    ------
+    C: compression factor used to compress
+    """
+    return torch.exp(x) / C
+
+
+def spectral_normalize_torch(magnitudes):
+    output = dynamic_range_compression_torch(magnitudes)
+    return output
+
+
+def spectral_de_normalize_torch(magnitudes):
+    output = dynamic_range_decompression_torch(magnitudes)
+    return output
+
+
+# Module-level caches, filled lazily by the spectrogram functions below.
+# mel_basis: mel filterbank tensors keyed by "<fmax>_<dtype>_<device>".
+mel_basis = {}
+# hann_window: Hann window tensors keyed by "<win_size>_<dtype>_<device>".
+hann_window = {}
+
+
+def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
+    if torch.min(y) < -1.0:
+        print("min value is ", torch.min(y))
+    if torch.max(y) > 1.0:
+        print("max value is ", torch.max(y))
+
+    global hann_window
+    dtype_device = str(y.dtype) + "_" + str(y.device)
+    wnsize_dtype_device = str(win_size) + "_" + dtype_device
+    if wnsize_dtype_device not in hann_window:
+        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
+            dtype=y.dtype, device=y.device
+        )
+
+    y = torch.nn.functional.pad(
+        y.unsqueeze(1),
+        (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
+        mode="reflect",
+    )
+    y = y.squeeze(1)
+
+    spec = torch.stft(
+        y,
+        n_fft,
+        hop_length=hop_size,
+        win_length=win_size,
+        window=hann_window[wnsize_dtype_device],
+        center=center,
+        pad_mode="reflect",
+        normalized=False,
+        onesided=True,return_complex=False
+    )
+
+    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
+    return spec
+
+
+def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
+    global mel_basis
+    dtype_device = str(spec.dtype) + "_" + str(spec.device)
+    fmax_dtype_device = str(fmax) + "_" + dtype_device
+    if fmax_dtype_device not in mel_basis:
+        mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
+        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
+            dtype=spec.dtype, device=spec.device
+        )
+    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
+    spec = spectral_normalize_torch(spec)
+    return spec
+
+
+def mel_spectrogram_torch(
+    y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
+):
+    if torch.min(y) < -1.0:
+        print("min value is ", torch.min(y))
+    if torch.max(y) > 1.0:
+        print("max value is ", torch.max(y))
+
+    global mel_basis, hann_window
+    dtype_device = str(y.dtype) + "_" + str(y.device)
+    fmax_dtype_device = str(fmax) + "_" + dtype_device
+    wnsize_dtype_device = str(win_size) + "_" + dtype_device
+    if fmax_dtype_device not in mel_basis:
+        mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
+        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
+            dtype=y.dtype, device=y.device
+        )
+    if wnsize_dtype_device not in hann_window:
+        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
+            dtype=y.dtype, device=y.device
+        )
+
+    y = torch.nn.functional.pad(
+        y.unsqueeze(1),
+        (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
+        mode="reflect",
+    )
+    y = y.squeeze(1)
+
+    # spec = torch.stft(
+    #     y,
+    #     n_fft,
+    #     hop_length=hop_size,
+    #     win_length=win_size,
+    #     window=hann_window[wnsize_dtype_device],
+    #     center=center,
+    #     pad_mode="reflect",
+    #     normalized=False,
+    #     onesided=True,
+    # )
+    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
+                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
+    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
+
+    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
+    spec = spectral_normalize_torch(spec)
+
+    return spec
--- a/train/process_ckpt.py
+++ b/train/process_ckpt.py
@ -0,0 +1,97 @@
+import torch,traceback,os,pdb
+from collections import OrderedDict
+
+def savee(ckpt,sr,if_f0,name,epoch):
+    try:
+        opt = OrderedDict()
+        opt["weight"] = {}
+        for key in ckpt.keys():
+            if ("enc_q" in key): continue
+            opt["weight"][key] = ckpt[key].half()
+        if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000]
+        elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4,4], 109, 256, 48000]
+        elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000]
+        opt["info"] = "%sepoch"%epoch
+        opt["sr"] = sr
+        opt["f0"] =if_f0
+        torch.save(opt, "weights/%s.pth"%name)
+        return "Success."
+    except:
+        return traceback.format_exc()
+
+def show_info(path):
+    try:
+        a = torch.load(path, map_location="cpu")
+        return "模型信息:%s\n采样率:%s\n模型是否输入音高引导:%s"%(a.get("info","None"),a.get("sr","None"),a.get("f0","None"),)
+    except:
+        return traceback.format_exc()
+
+def extract_small_model(path,name,sr,if_f0,info):
+    try:
+        ckpt = torch.load(path, map_location="cpu")
+        if("model"in ckpt):ckpt=ckpt["model"]
+        opt = OrderedDict()
+        opt["weight"] = {}
+        for key in ckpt.keys():
+            if ("enc_q" in key): continue
+            opt["weight"][key] = ckpt[key].half()
+        if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000]
+        elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4,4], 109, 256, 48000]
+        elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000]
+        if(info==""):info="Extracted model."
+        opt["info"] = info
+        opt["sr"] = sr
+        opt["f0"] =if_f0
+        torch.save(opt, "weights/%s.pth"%name)
+        return "Success."
+    except:
+        return traceback.format_exc()
+
+def change_info(path,info,name):
+    try:
+        ckpt = torch.load(path, map_location="cpu")
+        ckpt["info"]=info
+        if(name==""):name=os.path.basename(path)
+        torch.save(ckpt, "weights/%s"%name)
+        return "Success."
+    except:
+        return traceback.format_exc()
+
+def merge(path1,path2,alpha1,sr,f0,info,name):
+    try:
+        def extract(ckpt):
+            a = ckpt["model"]
+            opt = OrderedDict()
+            opt["weight"] = {}
+            for key in a.keys():
+                if ("enc_q" in key): continue
+                opt["weight"][key] = a[key]
+            return opt
+        ckpt1 = torch.load(path1, map_location="cpu")
+        ckpt2 = torch.load(path2, map_location="cpu")
+        if("model"in ckpt1):ckpt1=extract(ckpt1)
+        else:ckpt1=ckpt1["weight"]
+        if("model"in ckpt2):ckpt2=extract(ckpt2)
+        else:ckpt2=ckpt2["weight"]
+        if(sorted(list(ckpt1.keys()))!=sorted(list(ckpt2.keys()))):return "Fail to merge the models. The model architectures are not the same."
+        opt = OrderedDict()
+        opt["weight"] = {}
+        for key in ckpt1.keys():
+            # try:
+                if(key=="emb_g.weight"and ckpt1[key].shape!=ckpt2[key].shape):
+                    min_shape0=min(ckpt1[key].shape[0],ckpt2[key].shape[0])
+                    opt["weight"][key] = (alpha1 * (ckpt1[key][:min_shape0].float()) + (1 - alpha1) * (ckpt2[key][:min_shape0].float())).half()
+                else:
+                    opt["weight"][key] = (alpha1*(ckpt1[key].float())+(1-alpha1)*(ckpt2[key].float())).half()
+            # except:
+            #     pdb.set_trace()
+        if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 40000]
+        elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4], 109, 256, 48000]
+        elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000]
+        opt["sr"]=sr
+        opt["f0"]=1 if f0=="是"else 0
+        opt["info"]=info
+        torch.save(opt, "weights/%s.pth"%name)
+        return "Success."
+    except:
+        return traceback.format_exc()
--- a/train/utils.py
+++ b/train/utils.py
@ -0,0 +1,385 @@
+import os,traceback
+import glob
+import sys
+import argparse
+import logging
+import json
+import subprocess
+import numpy as np
+from scipy.io.wavfile import read
+import torch
+
+# Set to True once a plotting helper has switched matplotlib to the "Agg" backend.
+MATPLOTLIB_FLAG = False
+
+# Send log records of level DEBUG and above to stdout.
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+# Module-level logger: starts out as the logging module itself and is
+# rebound to a named logger by get_logger().
+logger = logging
+
+def load_checkpoint_d(checkpoint_path, combd,sbd, optimizer=None,load_opt=1):
+  assert os.path.isfile(checkpoint_path)
+  checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
+
+  ##################
+  def go(model,bkey):
+    saved_state_dict = checkpoint_dict[bkey]
+    if hasattr(model, 'module'):state_dict = model.module.state_dict()
+    else:state_dict = model.state_dict()
+    new_state_dict= {}
+    for k, v in state_dict.items():#模型需要的shape
+      try:
+        new_state_dict[k] = saved_state_dict[k]
+        if(saved_state_dict[k].shape!=state_dict[k].shape):
+          print("shape-%s-mismatch|need-%s|get-%s"%(k,state_dict[k].shape,saved_state_dict[k].shape))#
+          raise KeyError
+      except:
+        # logger.info(traceback.format_exc())
+        logger.info("%s is not in the checkpoint" % k)#pretrain缺失的
+        new_state_dict[k] = v#模型自带的随机值
+    if hasattr(model, 'module'):
+      model.module.load_state_dict(new_state_dict,strict=False)
+    else:
+      model.load_state_dict(new_state_dict,strict=False)
+  go(combd,"combd")
+  go(sbd,"sbd")
+  #############
+  logger.info("Loaded model weights")
+
+  iteration = checkpoint_dict['iteration']
+  learning_rate = checkpoint_dict['learning_rate']
+  if optimizer is not None and load_opt==1:###加载不了，如果是空的的话，重新初始化，可能还会影响lr时间表的更新，因此在train文件最外围catch
+  #   try:
+      optimizer.load_state_dict(checkpoint_dict['optimizer'])
+  #   except:
+  #     traceback.print_exc()
+  logger.info("Loaded checkpoint '{}' (iteration {})" .format(checkpoint_path, iteration))
+  return model, optimizer, learning_rate, iteration
+
+
+# def load_checkpoint(checkpoint_path, model, optimizer=None):
+#   assert os.path.isfile(checkpoint_path)
+#   checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
+#   iteration = checkpoint_dict['iteration']
+#   learning_rate = checkpoint_dict['learning_rate']
+#   if optimizer is not None:
+#     optimizer.load_state_dict(checkpoint_dict['optimizer'])
+#   # print(1111)
+#   saved_state_dict = checkpoint_dict['model']
+#   # print(1111)
+#
+#   if hasattr(model, 'module'):
+#     state_dict = model.module.state_dict()
+#   else:
+#     state_dict = model.state_dict()
+#   new_state_dict= {}
+#   for k, v in state_dict.items():
+#     try:
+#       new_state_dict[k] = saved_state_dict[k]
+#     except:
+#       logger.info("%s is not in the checkpoint" % k)
+#       new_state_dict[k] = v
+#   if hasattr(model, 'module'):
+#     model.module.load_state_dict(new_state_dict)
+#   else:
+#     model.load_state_dict(new_state_dict)
+#   logger.info("Loaded checkpoint '{}' (iteration {})" .format(
+#     checkpoint_path, iteration))
+#   return model, optimizer, learning_rate, iteration
+def load_checkpoint(checkpoint_path, model, optimizer=None,load_opt=1):
+  assert os.path.isfile(checkpoint_path)
+  checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
+
+  saved_state_dict = checkpoint_dict['model']
+  if hasattr(model, 'module'):
+    state_dict = model.module.state_dict()
+  else:
+    state_dict = model.state_dict()
+  new_state_dict= {}
+  for k, v in state_dict.items():#模型需要的shape
+    try:
+      new_state_dict[k] = saved_state_dict[k]
+      if(saved_state_dict[k].shape!=state_dict[k].shape):
+        print("shape-%s-mismatch|need-%s|get-%s"%(k,state_dict[k].shape,saved_state_dict[k].shape))#
+        raise KeyError
+    except:
+      # logger.info(traceback.format_exc())
+      logger.info("%s is not in the checkpoint" % k)#pretrain缺失的
+      new_state_dict[k] = v#模型自带的随机值
+  if hasattr(model, 'module'):
+    model.module.load_state_dict(new_state_dict,strict=False)
+  else:
+    model.load_state_dict(new_state_dict,strict=False)
+  logger.info("Loaded model weights")
+
+  iteration = checkpoint_dict['iteration']
+  learning_rate = checkpoint_dict['learning_rate']
+  if optimizer is not None and load_opt==1:###加载不了，如果是空的的话，重新初始化，可能还会影响lr时间表的更新，因此在train文件最外围catch
+  #   try:
+      optimizer.load_state_dict(checkpoint_dict['optimizer'])
+  #   except:
+  #     traceback.print_exc()
+  logger.info("Loaded checkpoint '{}' (iteration {})" .format(checkpoint_path, iteration))
+  return model, optimizer, learning_rate, iteration
+
+
+def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
+  logger.info("Saving model and optimizer state at iteration {} to {}".format(
+    iteration, checkpoint_path))
+  if hasattr(model, 'module'):
+    state_dict = model.module.state_dict()
+  else:
+    state_dict = model.state_dict()
+  torch.save({'model': state_dict,
+              'iteration': iteration,
+              'optimizer': optimizer.state_dict(),
+              'learning_rate': learning_rate}, checkpoint_path)
+def save_checkpoint_d(combd, sbd, optimizer, learning_rate, iteration, checkpoint_path):
+  logger.info("Saving model and optimizer state at iteration {} to {}".format(
+    iteration, checkpoint_path))
+  if hasattr(combd, 'module'): state_dict_combd = combd.module.state_dict()
+  else:state_dict_combd = combd.state_dict()
+  if hasattr(sbd, 'module'): state_dict_sbd = sbd.module.state_dict()
+  else:state_dict_sbd = sbd.state_dict()
+  torch.save({
+              'combd': state_dict_combd,
+              'sbd': state_dict_sbd,
+              'iteration': iteration,
+              'optimizer': optimizer.state_dict(),
+              'learning_rate': learning_rate}, checkpoint_path)
+
+
+def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
+  for k, v in scalars.items():
+    writer.add_scalar(k, v, global_step)
+  for k, v in histograms.items():
+    writer.add_histogram(k, v, global_step)
+  for k, v in images.items():
+    writer.add_image(k, v, global_step, dataformats='HWC')
+  for k, v in audios.items():
+    writer.add_audio(k, v, global_step, audio_sampling_rate)
+
+
+def latest_checkpoint_path(dir_path, regex="G_*.pth"):
+  f_list = glob.glob(os.path.join(dir_path, regex))
+  f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
+  x = f_list[-1]
+  print(x)
+  return x
+
+
+def plot_spectrogram_to_numpy(spectrogram):
+  global MATPLOTLIB_FLAG
+  if not MATPLOTLIB_FLAG:
+    import matplotlib
+    matplotlib.use("Agg")
+    MATPLOTLIB_FLAG = True
+    mpl_logger = logging.getLogger('matplotlib')
+    mpl_logger.setLevel(logging.WARNING)
+  import matplotlib.pylab as plt
+  import numpy as np
+  
+  fig, ax = plt.subplots(figsize=(10,2))
+  im = ax.imshow(spectrogram, aspect="auto", origin="lower",
+                  interpolation='none')
+  plt.colorbar(im, ax=ax)
+  plt.xlabel("Frames")
+  plt.ylabel("Channels")
+  plt.tight_layout()
+
+  fig.canvas.draw()
+  data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
+  data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+  plt.close()
+  return data
+
+
+def plot_alignment_to_numpy(alignment, info=None):
+  global MATPLOTLIB_FLAG
+  if not MATPLOTLIB_FLAG:
+    import matplotlib
+    matplotlib.use("Agg")
+    MATPLOTLIB_FLAG = True
+    mpl_logger = logging.getLogger('matplotlib')
+    mpl_logger.setLevel(logging.WARNING)
+  import matplotlib.pylab as plt
+  import numpy as np
+
+  fig, ax = plt.subplots(figsize=(6, 4))
+  im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
+                  interpolation='none')
+  fig.colorbar(im, ax=ax)
+  xlabel = 'Decoder timestep'
+  if info is not None:
+      xlabel += '\n\n' + info
+  plt.xlabel(xlabel)
+  plt.ylabel('Encoder timestep')
+  plt.tight_layout()
+
+  fig.canvas.draw()
+  data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
+  data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+  plt.close()
+  return data
+
+
+def load_wav_to_torch(full_path):
+  sampling_rate, data = read(full_path)
+  return torch.FloatTensor(data.astype(np.float32)), sampling_rate
+
+
+def load_filepaths_and_text(filename, split="|"):
+  with open(filename, encoding='utf-8') as f:
+    filepaths_and_text = [line.strip().split(split) for line in f]
+  return filepaths_and_text
+
+
+def get_hparams(init=True):
+  '''
+todo:
+  结尾七人组：
+    保存频率、总epoch                     done
+    bs                                    done
+    pretrainG、pretrainD                  done
+    卡号：os.en["CUDA_VISIBLE_DEVICES"]   done
+    if_latest                             todo
+  模型：if_f0                             todo
+  采样率：自动选择config                  done
+  是否缓存数据集进GPU:if_cache_data_in_gpu done
+
+  -m:
+    自动决定training_files路径,改掉train_nsf_load_pretrain.py里的hps.data.training_files    done
+  -c不要了
+  '''
+  parser = argparse.ArgumentParser()
+  # parser.add_argument('-c', '--config', type=str, default="configs/40k.json",help='JSON file for configuration')
+  parser.add_argument('-se', '--save_every_epoch', type=int, required=True,help='checkpoint save frequency (epoch)')
+  parser.add_argument('-te', '--total_epoch', type=int, required=True,help='total_epoch')
+  parser.add_argument('-pg', '--pretrainG', type=str, default="",help='Pretrained Discriminator path')
+  parser.add_argument('-pd', '--pretrainD', type=str, default="",help='Pretrained Generator path')
+  parser.add_argument('-g', '--gpus', type=str, default="0",help='split by -')
+  parser.add_argument('-bs', '--batch_size', type=int, required=True,help='batch size')
+  parser.add_argument('-e', '--experiment_dir', type=str, required=True,help='experiment dir')#-m
+  parser.add_argument('-sr', '--sample_rate', type=str, required=True,help='sample rate, 32k/40k/48k')
+  parser.add_argument('-f0', '--if_f0', type=int, required=True,help='use f0 as one of the inputs of the model, 1 or 0')
+  parser.add_argument('-l', '--if_latest', type=int, required=True,help='if only save the latest G/D pth file, 1 or 0')
+  parser.add_argument('-c', '--if_cache_data_in_gpu', type=int, required=True,help='if caching the dataset in GPU memory, 1 or 0')
+
+  args = parser.parse_args()
+  name = args.experiment_dir
+  experiment_dir = os.path.join("./logs", args.experiment_dir)
+
+  if not os.path.exists(experiment_dir):
+    os.makedirs(experiment_dir)
+
+  config_path = "configs/%s.json"%args.sample_rate
+  config_save_path = os.path.join(experiment_dir, "config.json")
+  if init:
+    with open(config_path, "r") as f:
+      data = f.read()
+    with open(config_save_path, "w") as f:
+      f.write(data)
+  else:
+    with open(config_save_path, "r") as f:
+      data = f.read()
+  config = json.loads(data)
+
+  hparams = HParams(**config)
+  hparams.model_dir = hparams.experiment_dir = experiment_dir
+  hparams.save_every_epoch = args.save_every_epoch
+  hparams.name = name
+  hparams.total_epoch = args.total_epoch
+  hparams.pretrainG = args.pretrainG
+  hparams.pretrainD = args.pretrainD
+  hparams.gpus = args.gpus
+  hparams.train.batch_size = args.batch_size
+  hparams.sample_rate = args.sample_rate
+  hparams.if_f0 = args.if_f0
+  hparams.if_latest = args.if_latest
+  hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu
+  hparams.data.training_files = "%s/filelist.txt"%experiment_dir
+  return hparams
+
+
+def get_hparams_from_dir(model_dir):
+  config_save_path = os.path.join(model_dir, "config.json")
+  with open(config_save_path, "r") as f:
+    data = f.read()
+  config = json.loads(data)
+
+  hparams =HParams(**config)
+  hparams.model_dir = model_dir
+  return hparams
+
+
+def get_hparams_from_file(config_path):
+  with open(config_path, "r") as f:
+    data = f.read()
+  config = json.loads(data)
+
+  hparams =HParams(**config)
+  return hparams
+
+
+def check_git_hash(model_dir):
+  source_dir = os.path.dirname(os.path.realpath(__file__))
+  if not os.path.exists(os.path.join(source_dir, ".git")):
+    logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format(
+      source_dir
+    ))
+    return
+
+  cur_hash = subprocess.getoutput("git rev-parse HEAD")
+
+  path = os.path.join(model_dir, "githash")
+  if os.path.exists(path):
+    saved_hash = open(path).read()
+    if saved_hash != cur_hash:
+      logger.warn("git hash values are different. {}(saved) != {}(current)".format(
+        saved_hash[:8], cur_hash[:8]))
+  else:
+    open(path, "w").write(cur_hash)
+
+
+def get_logger(model_dir, filename="train.log"):
+  global logger
+  logger = logging.getLogger(os.path.basename(model_dir))
+  logger.setLevel(logging.DEBUG)
+  
+  formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
+  if not os.path.exists(model_dir):
+    os.makedirs(model_dir)
+  h = logging.FileHandler(os.path.join(model_dir, filename))
+  h.setLevel(logging.DEBUG)
+  h.setFormatter(formatter)
+  logger.addHandler(h)
+  return logger
+
+
+class HParams():
+  def __init__(self, **kwargs):
+    for k, v in kwargs.items():
+      if type(v) == dict:
+        v = HParams(**v)
+      self[k] = v
+    
+  def keys(self):
+    return self.__dict__.keys()
+
+  def items(self):
+    return self.__dict__.items()
+
+  def values(self):
+    return self.__dict__.values()
+
+  def __len__(self):
+    return len(self.__dict__)
+
+  def __getitem__(self, key):
+    return getattr(self, key)
+
+  def __setitem__(self, key, value):
+    return setattr(self, key, value)
+
+  def __contains__(self, key):
+    return key in self.__dict__
+
+  def __repr__(self):
+    return self.__dict__.__repr__()
--- a/uvr5_pack/pycache/utils.cpython-39.pyc
+++ b/uvr5_pack/pycache/utils.cpython-39.pyc
--- a/uvr5_pack/lib_v5/pycache/layers_123821KB.cpython-39.pyc
+++ b/uvr5_pack/lib_v5/pycache/layers_123821KB.cpython-39.pyc
--- a/uvr5_pack/lib_v5/pycache/model_param_init.cpython-39.pyc
+++ b/uvr5_pack/lib_v5/pycache/model_param_init.cpython-39.pyc
--- a/uvr5_pack/lib_v5/pycache/nets_61968KB.cpython-39.pyc
+++ b/uvr5_pack/lib_v5/pycache/nets_61968KB.cpython-39.pyc
--- a/uvr5_pack/lib_v5/pycache/spec_utils.cpython-39.pyc
+++ b/uvr5_pack/lib_v5/pycache/spec_utils.cpython-39.pyc
--- a/uvr5_pack/lib_v5/dataset.py
+++ b/uvr5_pack/lib_v5/dataset.py
@ -0,0 +1,170 @@
+import os
+import random
+
+import numpy as np
+import torch
+import torch.utils.data
+from tqdm import tqdm
+
+from uvr5_pack.lib_v5 import spec_utils
+
+
+class VocalRemoverValidationSet(torch.utils.data.Dataset):
+
+    def __init__(self, patch_list):
+        self.patch_list = patch_list
+
+    def __len__(self):
+        return len(self.patch_list)
+
+    def __getitem__(self, idx):
+        path = self.patch_list[idx]
+        data = np.load(path)
+
+        X, y = data['X'], data['y']
+
+        X_mag = np.abs(X)
+        y_mag = np.abs(y)
+
+        return X_mag, y_mag
+
+
+def make_pair(mix_dir, inst_dir):
+    input_exts = ['.wav', '.m4a', '.mp3', '.mp4', '.flac']
+
+    X_list = sorted([
+        os.path.join(mix_dir, fname)
+        for fname in os.listdir(mix_dir)
+        if os.path.splitext(fname)[1] in input_exts])
+    y_list = sorted([
+        os.path.join(inst_dir, fname)
+        for fname in os.listdir(inst_dir)
+        if os.path.splitext(fname)[1] in input_exts])
+
+    filelist = list(zip(X_list, y_list))
+
+    return filelist
+
+
+def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
+    if split_mode == 'random':
+        filelist = make_pair(
+            os.path.join(dataset_dir, 'mixtures'),
+            os.path.join(dataset_dir, 'instruments'))
+
+        random.shuffle(filelist)
+
+        if len(val_filelist) == 0:
+            val_size = int(len(filelist) * val_rate)
+            train_filelist = filelist[:-val_size]
+            val_filelist = filelist[-val_size:]
+        else:
+            train_filelist = [
+                pair for pair in filelist
+                if list(pair) not in val_filelist]
+    elif split_mode == 'subdirs':
+        if len(val_filelist) != 0:
+            raise ValueError('The `val_filelist` option is not available in `subdirs` mode')
+
+        train_filelist = make_pair(
+            os.path.join(dataset_dir, 'training/mixtures'),
+            os.path.join(dataset_dir, 'training/instruments'))
+
+        val_filelist = make_pair(
+            os.path.join(dataset_dir, 'validation/mixtures'),
+            os.path.join(dataset_dir, 'validation/instruments'))
+
+    return train_filelist, val_filelist
+
+
+def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha):
+    """Randomly augment the paired arrays ``X`` and ``y`` in place.
+
+    The items are visited in a random permutation, and each augmentation is
+    applied independently with its own probability. The order of the random
+    draws determines the result, so the statements must not be reordered.
+
+    Args:
+        X, y: indexable batches of the same length; they are modified in
+            place and also returned. Presumably the mixture and instrumental
+            spectrogram patches -- TODO confirm against the caller.
+        reduction_rate: probability of replacing ``y[idx]`` with the result
+            of ``spec_utils.reduce_vocal_aggressively``.
+        reduction_mask: forwarded to ``reduce_vocal_aggressively``.
+        mixup_rate: probability of blending an item with the next one in
+            the permutation.
+        mixup_alpha: both parameters of the Beta distribution the blend
+            factor is drawn from.
+
+    Returns:
+        (X, y)
+    """
+    perm = np.random.permutation(len(X))
+    for i, idx in enumerate(tqdm(perm)):
+        if np.random.uniform() < reduction_rate:
+            y[idx] = spec_utils.reduce_vocal_aggressively(X[idx], y[idx], reduction_mask)
+
+        if np.random.uniform() < 0.5:
+            # swap channel
+            X[idx] = X[idx, ::-1]
+            y[idx] = y[idx, ::-1]
+        if np.random.uniform() < 0.02:
+            # mono
+            X[idx] = X[idx].mean(axis=0, keepdims=True)
+            y[idx] = y[idx].mean(axis=0, keepdims=True)
+        if np.random.uniform() < 0.02:
+            # inst
+            X[idx] = y[idx]
+
+        # Mixup with the next item of the permutation; the last item has no partner.
+        if np.random.uniform() < mixup_rate and i < len(perm) - 1:
+            lam = np.random.beta(mixup_alpha, mixup_alpha)
+            X[idx] = lam * X[idx] + (1 - lam) * X[perm[i + 1]]
+            y[idx] = lam * y[idx] + (1 - lam) * y[perm[i + 1]]
+
+    return X, y
+
+
+def make_padding(width, cropsize, offset):
+    left = offset
+    roi_size = cropsize - left * 2
+    if roi_size == 0:
+        roi_size = cropsize
+    right = roi_size - (width % roi_size) + left
+
+    return left, right, roi_size
+
+
+def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset):
+    len_dataset = patches * len(filelist)
+
+    X_dataset = np.zeros(
+        (len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
+    y_dataset = np.zeros(
+        (len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
+
+    for i, (X_path, y_path) in enumerate(tqdm(filelist)):
+        X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
+        coef = np.max([np.abs(X).max(), np.abs(y).max()])
+        X, y = X / coef, y / coef
+
+        l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
+        X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant')
+        y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode='constant')
+
+        starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches)
+        ends = starts + cropsize
+        for j in range(patches):
+            idx = i * patches + j
+            X_dataset[idx] = X_pad[:, :, starts[j]:ends[j]]
+            y_dataset[idx] = y_pad[:, :, starts[j]:ends[j]]
+
+    return X_dataset, y_dataset
+
+
+def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
+    patch_list = []
+    patch_dir = 'cs{}_sr{}_hl{}_nf{}_of{}'.format(cropsize, sr, hop_length, n_fft, offset)
+    os.makedirs(patch_dir, exist_ok=True)
+
+    for i, (X_path, y_path) in enumerate(tqdm(filelist)):
+        basename = os.path.splitext(os.path.basename(X_path))[0]
+
+        X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
+        coef = np.max([np.abs(X).max(), np.abs(y).max()])
+        X, y = X / coef, y / coef
+
+        l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
+        X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant')
+        y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode='constant')
+
+        len_dataset = int(np.ceil(X.shape[2] / roi_size))
+        for j in range(len_dataset):
+            outpath = os.path.join(patch_dir, '{}_p{}.npz'.format(basename, j))
+            start = j * roi_size
+            if not os.path.exists(outpath):
+                np.savez(
+                    outpath,
+                    X=X_pad[:, :, start:start + cropsize],
+                    y=y_pad[:, :, start:start + cropsize])
+            patch_list.append(outpath)
+
+    return VocalRemoverValidationSet(patch_list)
--- a/uvr5_pack/lib_v5/layers.py
+++ b/uvr5_pack/lib_v5/layers.py
@ -0,0 +1,116 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from uvr5_pack.lib_v5 import spec_utils
+
+
+class Conv2DBNActiv(nn.Module):
+    """Conv2d -> BatchNorm2d -> activation, applied in that order.
+
+    Args:
+        nin: input channel count.
+        nout: output channel count.
+        ksize: convolution kernel size.
+        stride: convolution stride.
+        pad: padding given to the convolution.
+        dilation: convolution dilation.
+        activ: activation class, instantiated with no arguments.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+        super(Conv2DBNActiv, self).__init__()
+        # Sub-module order is kept as-is: it determines the state_dict keys
+        # (conv.0, conv.1, ...).
+        self.conv = nn.Sequential(
+            nn.Conv2d(
+                nin, nout,
+                kernel_size=ksize,
+                stride=stride,
+                padding=pad,
+                dilation=dilation,
+                bias=False),
+            nn.BatchNorm2d(nout),
+            activ()
+        )
+
+    def forward(self, x):
+        """Run ``x`` through the conv/BN/activation stack."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        return self.conv(x)
+
+
+class SeperableConv2DBNActiv(nn.Module):
+    """Depthwise conv -> 1x1 pointwise conv -> BatchNorm2d -> activation.
+
+    Args:
+        nin: input channel count (also the group count of the depthwise conv).
+        nout: output channel count of the pointwise conv.
+        ksize: depthwise kernel size.
+        stride: depthwise stride.
+        pad: padding given to the depthwise conv.
+        dilation: depthwise dilation.
+        activ: activation class, instantiated with no arguments.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+        super(SeperableConv2DBNActiv, self).__init__()
+        # Sub-module order is kept as-is: it determines the state_dict keys.
+        self.conv = nn.Sequential(
+            # Depthwise stage: groups == nin, one filter per input channel.
+            nn.Conv2d(
+                nin, nin,
+                kernel_size=ksize,
+                stride=stride,
+                padding=pad,
+                dilation=dilation,
+                groups=nin,
+                bias=False),
+            # Pointwise stage: 1x1 conv mixes channels, nin -> nout.
+            nn.Conv2d(
+                nin, nout,
+                kernel_size=1,
+                bias=False),
+            nn.BatchNorm2d(nout),
+            activ()
+        )
+
+    def forward(self, x):
+        """Run ``x`` through the separable conv/BN/activation stack."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        return self.conv(x)
+
+
+class Encoder(nn.Module):
+    """Two Conv2DBNActiv layers; returns the final output and the first layer's output.
+
+    Args:
+        nin: input channel count.
+        nout: output channel count of both convolutions.
+        ksize: kernel size of both convolutions.
+        stride: stride of the second convolution (the first always uses 1).
+        pad: padding of both convolutions.
+        activ: activation class used by both layers.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
+        super(Encoder, self).__init__()
+        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
+
+    def forward(self, x):
+        """Return ``(h, skip)`` where skip = conv1(x) and h = conv2(skip)."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        skip = self.conv1(x)
+        h = self.conv2(skip)
+
+        return h, skip
+
+
+class Decoder(nn.Module):
+    """Upsample by 2, optionally concatenate a skip tensor, then convolve.
+
+    Args:
+        nin: channel count seen by the convolution (after any concatenation).
+        nout: output channel count.
+        ksize: kernel size.
+        stride: accepted but not used; the convolution is always built with stride 1.
+        pad: padding.
+        activ: activation class.
+        dropout: when truthy, Dropout2d(0.1) is applied to the output.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
+        super(Decoder, self).__init__()
+        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+        self.dropout = nn.Dropout2d(0.1) if dropout else None
+
+    def forward(self, x, skip=None):
+        """Return the decoded tensor for ``x`` (and ``skip`` when given)."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
+        if skip is not None:
+            # NOTE(review): crop_center is a project helper; presumably it
+            # trims ``skip`` to the size of ``x`` -- confirm in spec_utils.
+            skip = spec_utils.crop_center(skip, x)
+            x = torch.cat([x, skip], dim=1)
+        h = self.conv(x)
+
+        if self.dropout is not None:
+            h = self.dropout(h)
+
+        return h
+
+
+class ASPPModule(nn.Module):
+    """Five parallel branches over one input, concatenated and fused by a 1x1 conv.
+
+    Branches: pooled 1x1 conv (conv1), plain 1x1 conv (conv2) and three
+    dilated separable 3x3 convs (conv3-conv5) using ``dilations[0..2]``.
+    """
+
+    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
+        super().__init__()
+
+        def pointwise():
+            return Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+
+        def dilated(rate):
+            # The same value is used for padding and dilation.
+            return SeperableConv2DBNActiv(nin, nin, 3, 1, rate, rate, activ=activ)
+
+        # Attribute names and creation order are unchanged (state_dict keys).
+        self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, None)), pointwise())
+        self.conv2 = pointwise()
+        self.conv3 = dilated(dilations[0])
+        self.conv4 = dilated(dilations[1])
+        self.conv5 = dilated(dilations[2])
+        self.bottleneck = nn.Sequential(
+            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
+            nn.Dropout2d(0.1),
+        )
+
+    def forward(self, x):
+        """Concatenate all branch outputs along dim 1 and fuse them."""
+        height, width = x.size()[2:]
+        # conv1 pools dim 2 down to size 1; resize it back to the input size.
+        pooled = F.interpolate(
+            self.conv1(x), size=(height, width), mode='bilinear', align_corners=True)
+        branches = [pooled]
+        for branch in (self.conv2, self.conv3, self.conv4, self.conv5):
+            branches.append(branch(x))
+        return self.bottleneck(torch.cat(branches, dim=1))
--- a/uvr5_pack/lib_v5/layers_123812KB
+++ b/uvr5_pack/lib_v5/layers_123812KB
@ -0,0 +1,116 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from uvr5_pack.lib_v5 import spec_utils
+
+
+class Conv2DBNActiv(nn.Module):
+    """Conv2d -> BatchNorm2d -> activation, applied in that order.
+
+    Args:
+        nin: input channel count.
+        nout: output channel count.
+        ksize: convolution kernel size.
+        stride: convolution stride.
+        pad: padding given to the convolution.
+        dilation: convolution dilation.
+        activ: activation class, instantiated with no arguments.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+        super(Conv2DBNActiv, self).__init__()
+        # Sub-module order is kept as-is: it determines the state_dict keys
+        # (conv.0, conv.1, ...).
+        self.conv = nn.Sequential(
+            nn.Conv2d(
+                nin, nout,
+                kernel_size=ksize,
+                stride=stride,
+                padding=pad,
+                dilation=dilation,
+                bias=False),
+            nn.BatchNorm2d(nout),
+            activ()
+        )
+
+    def forward(self, x):
+        """Run ``x`` through the conv/BN/activation stack."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        return self.conv(x)
+
+
+class SeperableConv2DBNActiv(nn.Module):
+    """Depthwise conv -> 1x1 pointwise conv -> BatchNorm2d -> activation.
+
+    Args:
+        nin: input channel count (also the group count of the depthwise conv).
+        nout: output channel count of the pointwise conv.
+        ksize: depthwise kernel size.
+        stride: depthwise stride.
+        pad: padding given to the depthwise conv.
+        dilation: depthwise dilation.
+        activ: activation class, instantiated with no arguments.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+        super(SeperableConv2DBNActiv, self).__init__()
+        # Sub-module order is kept as-is: it determines the state_dict keys.
+        self.conv = nn.Sequential(
+            # Depthwise stage: groups == nin, one filter per input channel.
+            nn.Conv2d(
+                nin, nin,
+                kernel_size=ksize,
+                stride=stride,
+                padding=pad,
+                dilation=dilation,
+                groups=nin,
+                bias=False),
+            # Pointwise stage: 1x1 conv mixes channels, nin -> nout.
+            nn.Conv2d(
+                nin, nout,
+                kernel_size=1,
+                bias=False),
+            nn.BatchNorm2d(nout),
+            activ()
+        )
+
+    def forward(self, x):
+        """Run ``x`` through the separable conv/BN/activation stack."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        return self.conv(x)
+
+
+class Encoder(nn.Module):
+    """Two Conv2DBNActiv layers; returns the final output and the first layer's output.
+
+    Args:
+        nin: input channel count.
+        nout: output channel count of both convolutions.
+        ksize: kernel size of both convolutions.
+        stride: stride of the second convolution (the first always uses 1).
+        pad: padding of both convolutions.
+        activ: activation class used by both layers.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
+        super(Encoder, self).__init__()
+        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
+
+    def forward(self, x):
+        """Return ``(h, skip)`` where skip = conv1(x) and h = conv2(skip)."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        skip = self.conv1(x)
+        h = self.conv2(skip)
+
+        return h, skip
+
+
+class Decoder(nn.Module):
+    """Upsample by 2, optionally concatenate a skip tensor, then convolve.
+
+    Args:
+        nin: channel count seen by the convolution (after any concatenation).
+        nout: output channel count.
+        ksize: kernel size.
+        stride: accepted but not used; the convolution is always built with stride 1.
+        pad: padding.
+        activ: activation class.
+        dropout: when truthy, Dropout2d(0.1) is applied to the output.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
+        super(Decoder, self).__init__()
+        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+        self.dropout = nn.Dropout2d(0.1) if dropout else None
+
+    def forward(self, x, skip=None):
+        """Return the decoded tensor for ``x`` (and ``skip`` when given)."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
+        if skip is not None:
+            # NOTE(review): crop_center is a project helper; presumably it
+            # trims ``skip`` to the size of ``x`` -- confirm in spec_utils.
+            skip = spec_utils.crop_center(skip, x)
+            x = torch.cat([x, skip], dim=1)
+        h = self.conv(x)
+
+        if self.dropout is not None:
+            h = self.dropout(h)
+
+        return h
+
+
+class ASPPModule(nn.Module):
+    """Five parallel branches over one input, concatenated and fused by a 1x1 conv.
+
+    Branches: pooled 1x1 conv (conv1), plain 1x1 conv (conv2) and three
+    dilated separable 3x3 convs (conv3-conv5) using ``dilations[0..2]``.
+    """
+
+    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
+        super().__init__()
+
+        def pointwise():
+            return Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+
+        def dilated(rate):
+            # The same value is used for padding and dilation.
+            return SeperableConv2DBNActiv(nin, nin, 3, 1, rate, rate, activ=activ)
+
+        # Attribute names and creation order are unchanged (state_dict keys).
+        self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, None)), pointwise())
+        self.conv2 = pointwise()
+        self.conv3 = dilated(dilations[0])
+        self.conv4 = dilated(dilations[1])
+        self.conv5 = dilated(dilations[2])
+        self.bottleneck = nn.Sequential(
+            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
+            nn.Dropout2d(0.1),
+        )
+
+    def forward(self, x):
+        """Concatenate all branch outputs along dim 1 and fuse them."""
+        height, width = x.size()[2:]
+        # conv1 pools dim 2 down to size 1; resize it back to the input size.
+        pooled = F.interpolate(
+            self.conv1(x), size=(height, width), mode='bilinear', align_corners=True)
+        branches = [pooled]
+        for branch in (self.conv2, self.conv3, self.conv4, self.conv5):
+            branches.append(branch(x))
+        return self.bottleneck(torch.cat(branches, dim=1))
--- a/uvr5_pack/lib_v5/layers_123821KB.py
+++ b/uvr5_pack/lib_v5/layers_123821KB.py
@ -0,0 +1,116 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from uvr5_pack.lib_v5 import spec_utils
+
+
+class Conv2DBNActiv(nn.Module):
+    """Conv2d -> BatchNorm2d -> activation, applied in that order.
+
+    Args:
+        nin: input channel count.
+        nout: output channel count.
+        ksize: convolution kernel size.
+        stride: convolution stride.
+        pad: padding given to the convolution.
+        dilation: convolution dilation.
+        activ: activation class, instantiated with no arguments.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+        super(Conv2DBNActiv, self).__init__()
+        # Sub-module order is kept as-is: it determines the state_dict keys
+        # (conv.0, conv.1, ...).
+        self.conv = nn.Sequential(
+            nn.Conv2d(
+                nin, nout,
+                kernel_size=ksize,
+                stride=stride,
+                padding=pad,
+                dilation=dilation,
+                bias=False),
+            nn.BatchNorm2d(nout),
+            activ()
+        )
+
+    def forward(self, x):
+        """Run ``x`` through the conv/BN/activation stack."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        return self.conv(x)
+
+
+class SeperableConv2DBNActiv(nn.Module):
+    """Depthwise conv -> 1x1 pointwise conv -> BatchNorm2d -> activation.
+
+    Args:
+        nin: input channel count (also the group count of the depthwise conv).
+        nout: output channel count of the pointwise conv.
+        ksize: depthwise kernel size.
+        stride: depthwise stride.
+        pad: padding given to the depthwise conv.
+        dilation: depthwise dilation.
+        activ: activation class, instantiated with no arguments.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+        super(SeperableConv2DBNActiv, self).__init__()
+        # Sub-module order is kept as-is: it determines the state_dict keys.
+        self.conv = nn.Sequential(
+            # Depthwise stage: groups == nin, one filter per input channel.
+            nn.Conv2d(
+                nin, nin,
+                kernel_size=ksize,
+                stride=stride,
+                padding=pad,
+                dilation=dilation,
+                groups=nin,
+                bias=False),
+            # Pointwise stage: 1x1 conv mixes channels, nin -> nout.
+            nn.Conv2d(
+                nin, nout,
+                kernel_size=1,
+                bias=False),
+            nn.BatchNorm2d(nout),
+            activ()
+        )
+
+    def forward(self, x):
+        """Run ``x`` through the separable conv/BN/activation stack."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        return self.conv(x)
+
+
+class Encoder(nn.Module):
+    """Two Conv2DBNActiv layers; returns the final output and the first layer's output.
+
+    Args:
+        nin: input channel count.
+        nout: output channel count of both convolutions.
+        ksize: kernel size of both convolutions.
+        stride: stride of the second convolution (the first always uses 1).
+        pad: padding of both convolutions.
+        activ: activation class used by both layers.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
+        super(Encoder, self).__init__()
+        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
+
+    def forward(self, x):
+        """Return ``(h, skip)`` where skip = conv1(x) and h = conv2(skip)."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        skip = self.conv1(x)
+        h = self.conv2(skip)
+
+        return h, skip
+
+
+class Decoder(nn.Module):
+    """Upsample by 2, optionally concatenate a skip tensor, then convolve.
+
+    Args:
+        nin: channel count seen by the convolution (after any concatenation).
+        nout: output channel count.
+        ksize: kernel size.
+        stride: accepted but not used; the convolution is always built with stride 1.
+        pad: padding.
+        activ: activation class.
+        dropout: when truthy, Dropout2d(0.1) is applied to the output.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
+        super(Decoder, self).__init__()
+        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+        self.dropout = nn.Dropout2d(0.1) if dropout else None
+
+    def forward(self, x, skip=None):
+        """Return the decoded tensor for ``x`` (and ``skip`` when given)."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
+        if skip is not None:
+            # NOTE(review): crop_center is a project helper; presumably it
+            # trims ``skip`` to the size of ``x`` -- confirm in spec_utils.
+            skip = spec_utils.crop_center(skip, x)
+            x = torch.cat([x, skip], dim=1)
+        h = self.conv(x)
+
+        if self.dropout is not None:
+            h = self.dropout(h)
+
+        return h
+
+
+class ASPPModule(nn.Module):
+    """Five parallel branches over one input, concatenated and fused by a 1x1 conv.
+
+    Branches: pooled 1x1 conv (conv1), plain 1x1 conv (conv2) and three
+    dilated separable 3x3 convs (conv3-conv5) using ``dilations[0..2]``.
+    """
+
+    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
+        super().__init__()
+
+        def pointwise():
+            return Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+
+        def dilated(rate):
+            # The same value is used for padding and dilation.
+            return SeperableConv2DBNActiv(nin, nin, 3, 1, rate, rate, activ=activ)
+
+        # Attribute names and creation order are unchanged (state_dict keys).
+        self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, None)), pointwise())
+        self.conv2 = pointwise()
+        self.conv3 = dilated(dilations[0])
+        self.conv4 = dilated(dilations[1])
+        self.conv5 = dilated(dilations[2])
+        self.bottleneck = nn.Sequential(
+            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
+            nn.Dropout2d(0.1),
+        )
+
+    def forward(self, x):
+        """Concatenate all branch outputs along dim 1 and fuse them."""
+        height, width = x.size()[2:]
+        # conv1 pools dim 2 down to size 1; resize it back to the input size.
+        pooled = F.interpolate(
+            self.conv1(x), size=(height, width), mode='bilinear', align_corners=True)
+        branches = [pooled]
+        for branch in (self.conv2, self.conv3, self.conv4, self.conv5):
+            branches.append(branch(x))
+        return self.bottleneck(torch.cat(branches, dim=1))
--- a/uvr5_pack/lib_v5/layers_33966KB.py
+++ b/uvr5_pack/lib_v5/layers_33966KB.py
@ -0,0 +1,122 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from uvr5_pack.lib_v5 import spec_utils
+
+
+class Conv2DBNActiv(nn.Module):
+    """Conv2d -> BatchNorm2d -> activation, applied in that order.
+
+    Args:
+        nin: input channel count.
+        nout: output channel count.
+        ksize: convolution kernel size.
+        stride: convolution stride.
+        pad: padding given to the convolution.
+        dilation: convolution dilation.
+        activ: activation class, instantiated with no arguments.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+        super(Conv2DBNActiv, self).__init__()
+        # Sub-module order is kept as-is: it determines the state_dict keys
+        # (conv.0, conv.1, ...).
+        self.conv = nn.Sequential(
+            nn.Conv2d(
+                nin, nout,
+                kernel_size=ksize,
+                stride=stride,
+                padding=pad,
+                dilation=dilation,
+                bias=False),
+            nn.BatchNorm2d(nout),
+            activ()
+        )
+
+    def forward(self, x):
+        """Run ``x`` through the conv/BN/activation stack."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        return self.conv(x)
+
+
+class SeperableConv2DBNActiv(nn.Module):
+    """Depthwise conv -> 1x1 pointwise conv -> BatchNorm2d -> activation.
+
+    Args:
+        nin: input channel count (also the group count of the depthwise conv).
+        nout: output channel count of the pointwise conv.
+        ksize: depthwise kernel size.
+        stride: depthwise stride.
+        pad: padding given to the depthwise conv.
+        dilation: depthwise dilation.
+        activ: activation class, instantiated with no arguments.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+        super(SeperableConv2DBNActiv, self).__init__()
+        # Sub-module order is kept as-is: it determines the state_dict keys.
+        self.conv = nn.Sequential(
+            # Depthwise stage: groups == nin, one filter per input channel.
+            nn.Conv2d(
+                nin, nin,
+                kernel_size=ksize,
+                stride=stride,
+                padding=pad,
+                dilation=dilation,
+                groups=nin,
+                bias=False),
+            # Pointwise stage: 1x1 conv mixes channels, nin -> nout.
+            nn.Conv2d(
+                nin, nout,
+                kernel_size=1,
+                bias=False),
+            nn.BatchNorm2d(nout),
+            activ()
+        )
+
+    def forward(self, x):
+        """Run ``x`` through the separable conv/BN/activation stack."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        return self.conv(x)
+
+
+class Encoder(nn.Module):
+    """Two Conv2DBNActiv layers; returns the final output and the first layer's output.
+
+    Args:
+        nin: input channel count.
+        nout: output channel count of both convolutions.
+        ksize: kernel size of both convolutions.
+        stride: stride of the second convolution (the first always uses 1).
+        pad: padding of both convolutions.
+        activ: activation class used by both layers.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
+        super(Encoder, self).__init__()
+        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
+
+    def forward(self, x):
+        """Return ``(h, skip)`` where skip = conv1(x) and h = conv2(skip)."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        skip = self.conv1(x)
+        h = self.conv2(skip)
+
+        return h, skip
+
+
+class Decoder(nn.Module):
+    """Upsample by 2, optionally concatenate a skip tensor, then convolve.
+
+    Args:
+        nin: channel count seen by the convolution (after any concatenation).
+        nout: output channel count.
+        ksize: kernel size.
+        stride: accepted but not used; the convolution is always built with stride 1.
+        pad: padding.
+        activ: activation class.
+        dropout: when truthy, Dropout2d(0.1) is applied to the output.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
+        super(Decoder, self).__init__()
+        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+        self.dropout = nn.Dropout2d(0.1) if dropout else None
+
+    def forward(self, x, skip=None):
+        """Return the decoded tensor for ``x`` (and ``skip`` when given)."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
+        if skip is not None:
+            # NOTE(review): crop_center is a project helper; presumably it
+            # trims ``skip`` to the size of ``x`` -- confirm in spec_utils.
+            skip = spec_utils.crop_center(skip, x)
+            x = torch.cat([x, skip], dim=1)
+        h = self.conv(x)
+
+        if self.dropout is not None:
+            h = self.dropout(h)
+
+        return h
+
+
+class ASPPModule(nn.Module):
+    """Seven parallel branches over one input, concatenated and fused by a 1x1 conv.
+
+    Branches: pooled 1x1 conv (conv1), plain 1x1 conv (conv2) and five
+    dilated separable 3x3 convs (conv3-conv7).
+    """
+
+    def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
+        super().__init__()
+
+        def pointwise():
+            return Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+
+        def dilated(rate):
+            # The same value is used for padding and dilation.
+            return SeperableConv2DBNActiv(nin, nin, 3, 1, rate, rate, activ=activ)
+
+        # Attribute names and creation order are unchanged (state_dict keys).
+        self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, None)), pointwise())
+        self.conv2 = pointwise()
+        self.conv3 = dilated(dilations[0])
+        self.conv4 = dilated(dilations[1])
+        self.conv5 = dilated(dilations[2])
+        # NOTE(review): conv6 and conv7 reuse dilations[2]; dilations[3] and
+        # dilations[4] are never read. Kept as-is because changing the
+        # dilation would alter the output of already-trained weights.
+        self.conv6 = dilated(dilations[2])
+        self.conv7 = dilated(dilations[2])
+        self.bottleneck = nn.Sequential(
+            Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ),
+            nn.Dropout2d(0.1),
+        )
+
+    def forward(self, x):
+        """Concatenate all branch outputs along dim 1 and fuse them."""
+        height, width = x.size()[2:]
+        # conv1 pools dim 2 down to size 1; resize it back to the input size.
+        pooled = F.interpolate(
+            self.conv1(x), size=(height, width), mode='bilinear', align_corners=True)
+        branches = [pooled]
+        for branch in (self.conv2, self.conv3, self.conv4,
+                       self.conv5, self.conv6, self.conv7):
+            branches.append(branch(x))
+        return self.bottleneck(torch.cat(branches, dim=1))
--- a/uvr5_pack/lib_v5/layers_537227KB.py
+++ b/uvr5_pack/lib_v5/layers_537227KB.py
@ -0,0 +1,122 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from uvr5_pack.lib_v5 import spec_utils
+
+
+class Conv2DBNActiv(nn.Module):
+    """Conv2d -> BatchNorm2d -> activation, applied in that order.
+
+    Args:
+        nin: input channel count.
+        nout: output channel count.
+        ksize: convolution kernel size.
+        stride: convolution stride.
+        pad: padding given to the convolution.
+        dilation: convolution dilation.
+        activ: activation class, instantiated with no arguments.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+        super(Conv2DBNActiv, self).__init__()
+        # Sub-module order is kept as-is: it determines the state_dict keys
+        # (conv.0, conv.1, ...).
+        self.conv = nn.Sequential(
+            nn.Conv2d(
+                nin, nout,
+                kernel_size=ksize,
+                stride=stride,
+                padding=pad,
+                dilation=dilation,
+                bias=False),
+            nn.BatchNorm2d(nout),
+            activ()
+        )
+
+    def forward(self, x):
+        """Run ``x`` through the conv/BN/activation stack."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        return self.conv(x)
+
+
+class SeperableConv2DBNActiv(nn.Module):
+    """Depthwise conv -> 1x1 pointwise conv -> BatchNorm2d -> activation.
+
+    Args:
+        nin: input channel count (also the group count of the depthwise conv).
+        nout: output channel count of the pointwise conv.
+        ksize: depthwise kernel size.
+        stride: depthwise stride.
+        pad: padding given to the depthwise conv.
+        dilation: depthwise dilation.
+        activ: activation class, instantiated with no arguments.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+        super(SeperableConv2DBNActiv, self).__init__()
+        # Sub-module order is kept as-is: it determines the state_dict keys.
+        self.conv = nn.Sequential(
+            # Depthwise stage: groups == nin, one filter per input channel.
+            nn.Conv2d(
+                nin, nin,
+                kernel_size=ksize,
+                stride=stride,
+                padding=pad,
+                dilation=dilation,
+                groups=nin,
+                bias=False),
+            # Pointwise stage: 1x1 conv mixes channels, nin -> nout.
+            nn.Conv2d(
+                nin, nout,
+                kernel_size=1,
+                bias=False),
+            nn.BatchNorm2d(nout),
+            activ()
+        )
+
+    def forward(self, x):
+        """Run ``x`` through the separable conv/BN/activation stack."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        return self.conv(x)
+
+
+class Encoder(nn.Module):
+    """Two Conv2DBNActiv layers; returns the final output and the first layer's output.
+
+    Args:
+        nin: input channel count.
+        nout: output channel count of both convolutions.
+        ksize: kernel size of both convolutions.
+        stride: stride of the second convolution (the first always uses 1).
+        pad: padding of both convolutions.
+        activ: activation class used by both layers.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
+        super(Encoder, self).__init__()
+        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
+
+    def forward(self, x):
+        """Return ``(h, skip)`` where skip = conv1(x) and h = conv2(skip)."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        skip = self.conv1(x)
+        h = self.conv2(skip)
+
+        return h, skip
+
+
+class Decoder(nn.Module):
+    """Upsample by 2, optionally concatenate a skip tensor, then convolve.
+
+    Args:
+        nin: channel count seen by the convolution (after any concatenation).
+        nout: output channel count.
+        ksize: kernel size.
+        stride: accepted but not used; the convolution is always built with stride 1.
+        pad: padding.
+        activ: activation class.
+        dropout: when truthy, Dropout2d(0.1) is applied to the output.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
+        super(Decoder, self).__init__()
+        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+        self.dropout = nn.Dropout2d(0.1) if dropout else None
+
+    def forward(self, x, skip=None):
+        """Return the decoded tensor for ``x`` (and ``skip`` when given)."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
+        if skip is not None:
+            # NOTE(review): crop_center is a project helper; presumably it
+            # trims ``skip`` to the size of ``x`` -- confirm in spec_utils.
+            skip = spec_utils.crop_center(skip, x)
+            x = torch.cat([x, skip], dim=1)
+        h = self.conv(x)
+
+        if self.dropout is not None:
+            h = self.dropout(h)
+
+        return h
+
+
+class ASPPModule(nn.Module):
+    """Seven parallel branches over one input, concatenated and fused by a 1x1 conv.
+
+    Branches: pooled 1x1 conv (conv1), plain 1x1 conv (conv2) and five
+    dilated separable 3x3 convs (conv3-conv7).
+    """
+
+    def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
+        super().__init__()
+
+        def pointwise():
+            return Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+
+        def dilated(rate):
+            # The same value is used for padding and dilation.
+            return SeperableConv2DBNActiv(nin, nin, 3, 1, rate, rate, activ=activ)
+
+        # Attribute names and creation order are unchanged (state_dict keys).
+        self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, None)), pointwise())
+        self.conv2 = pointwise()
+        self.conv3 = dilated(dilations[0])
+        self.conv4 = dilated(dilations[1])
+        self.conv5 = dilated(dilations[2])
+        # NOTE(review): conv6 and conv7 reuse dilations[2]; dilations[3] and
+        # dilations[4] are never read. Kept as-is because changing the
+        # dilation would alter the output of already-trained weights.
+        self.conv6 = dilated(dilations[2])
+        self.conv7 = dilated(dilations[2])
+        self.bottleneck = nn.Sequential(
+            Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ),
+            nn.Dropout2d(0.1),
+        )
+
+    def forward(self, x):
+        """Concatenate all branch outputs along dim 1 and fuse them."""
+        height, width = x.size()[2:]
+        # conv1 pools dim 2 down to size 1; resize it back to the input size.
+        pooled = F.interpolate(
+            self.conv1(x), size=(height, width), mode='bilinear', align_corners=True)
+        branches = [pooled]
+        for branch in (self.conv2, self.conv3, self.conv4,
+                       self.conv5, self.conv6, self.conv7):
+            branches.append(branch(x))
+        return self.bottleneck(torch.cat(branches, dim=1))
--- a/uvr5_pack/lib_v5/layers_537238KB.py
+++ b/uvr5_pack/lib_v5/layers_537238KB.py
@ -0,0 +1,122 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from uvr5_pack.lib_v5 import spec_utils
+
+
+class Conv2DBNActiv(nn.Module):
+    """Conv2d -> BatchNorm2d -> activation, applied in that order.
+
+    Args:
+        nin: input channel count.
+        nout: output channel count.
+        ksize: convolution kernel size.
+        stride: convolution stride.
+        pad: padding given to the convolution.
+        dilation: convolution dilation.
+        activ: activation class, instantiated with no arguments.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+        super(Conv2DBNActiv, self).__init__()
+        # Sub-module order is kept as-is: it determines the state_dict keys
+        # (conv.0, conv.1, ...).
+        self.conv = nn.Sequential(
+            nn.Conv2d(
+                nin, nout,
+                kernel_size=ksize,
+                stride=stride,
+                padding=pad,
+                dilation=dilation,
+                bias=False),
+            nn.BatchNorm2d(nout),
+            activ()
+        )
+
+    def forward(self, x):
+        """Run ``x`` through the conv/BN/activation stack."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        return self.conv(x)
+
+
+class SeperableConv2DBNActiv(nn.Module):
+    """Depthwise conv -> 1x1 pointwise conv -> BatchNorm2d -> activation.
+
+    Args:
+        nin: input channel count (also the group count of the depthwise conv).
+        nout: output channel count of the pointwise conv.
+        ksize: depthwise kernel size.
+        stride: depthwise stride.
+        pad: padding given to the depthwise conv.
+        dilation: depthwise dilation.
+        activ: activation class, instantiated with no arguments.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+        super(SeperableConv2DBNActiv, self).__init__()
+        # Sub-module order is kept as-is: it determines the state_dict keys.
+        self.conv = nn.Sequential(
+            # Depthwise stage: groups == nin, one filter per input channel.
+            nn.Conv2d(
+                nin, nin,
+                kernel_size=ksize,
+                stride=stride,
+                padding=pad,
+                dilation=dilation,
+                groups=nin,
+                bias=False),
+            # Pointwise stage: 1x1 conv mixes channels, nin -> nout.
+            nn.Conv2d(
+                nin, nout,
+                kernel_size=1,
+                bias=False),
+            nn.BatchNorm2d(nout),
+            activ()
+        )
+
+    def forward(self, x):
+        """Run ``x`` through the separable conv/BN/activation stack."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        return self.conv(x)
+
+
+class Encoder(nn.Module):
+    """Two Conv2DBNActiv layers; returns the final output and the first layer's output.
+
+    Args:
+        nin: input channel count.
+        nout: output channel count of both convolutions.
+        ksize: kernel size of both convolutions.
+        stride: stride of the second convolution (the first always uses 1).
+        pad: padding of both convolutions.
+        activ: activation class used by both layers.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
+        super(Encoder, self).__init__()
+        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
+
+    def forward(self, x):
+        """Return ``(h, skip)`` where skip = conv1(x) and h = conv2(skip)."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        skip = self.conv1(x)
+        h = self.conv2(skip)
+
+        return h, skip
+
+
+class Decoder(nn.Module):
+    """Upsample by 2, optionally concatenate a skip tensor, then convolve.
+
+    Args:
+        nin: channel count seen by the convolution (after any concatenation).
+        nout: output channel count.
+        ksize: kernel size.
+        stride: accepted but not used; the convolution is always built with stride 1.
+        pad: padding.
+        activ: activation class.
+        dropout: when truthy, Dropout2d(0.1) is applied to the output.
+    """
+
+    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
+        super(Decoder, self).__init__()
+        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+        self.dropout = nn.Dropout2d(0.1) if dropout else None
+
+    def forward(self, x, skip=None):
+        """Return the decoded tensor for ``x`` (and ``skip`` when given)."""
+        # Defined as ``forward`` (not ``__call__``) so nn.Module.__call__
+        # dispatches here and registered hooks are still executed.
+        x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
+        if skip is not None:
+            # NOTE(review): crop_center is a project helper; presumably it
+            # trims ``skip`` to the size of ``x`` -- confirm in spec_utils.
+            skip = spec_utils.crop_center(skip, x)
+            x = torch.cat([x, skip], dim=1)
+        h = self.conv(x)
+
+        if self.dropout is not None:
+            h = self.dropout(h)
+
+        return h
+
+
+class ASPPModule(nn.Module):
+    """Seven parallel branches over one input, concatenated and fused by a 1x1 conv.
+
+    Branches: pooled 1x1 conv (conv1), plain 1x1 conv (conv2) and five
+    dilated separable 3x3 convs (conv3-conv7).
+    """
+
+    def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
+        super().__init__()
+
+        def pointwise():
+            return Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+
+        def dilated(rate):
+            # The same value is used for padding and dilation.
+            return SeperableConv2DBNActiv(nin, nin, 3, 1, rate, rate, activ=activ)
+
+        # Attribute names and creation order are unchanged (state_dict keys).
+        self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, None)), pointwise())
+        self.conv2 = pointwise()
+        self.conv3 = dilated(dilations[0])
+        self.conv4 = dilated(dilations[1])
+        self.conv5 = dilated(dilations[2])
+        # NOTE(review): conv6 and conv7 reuse dilations[2]; dilations[3] and
+        # dilations[4] are never read. Kept as-is because changing the
+        # dilation would alter the output of already-trained weights.
+        self.conv6 = dilated(dilations[2])
+        self.conv7 = dilated(dilations[2])
+        self.bottleneck = nn.Sequential(
+            Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ),
+            nn.Dropout2d(0.1),
+        )
+
+    def forward(self, x):
+        """Concatenate all branch outputs along dim 1 and fuse them."""
+        height, width = x.size()[2:]
+        # conv1 pools dim 2 down to size 1; resize it back to the input size.
+        pooled = F.interpolate(
+            self.conv1(x), size=(height, width), mode='bilinear', align_corners=True)
+        branches = [pooled]
+        for branch in (self.conv2, self.conv3, self.conv4,
+                       self.conv5, self.conv6, self.conv7):
+            branches.append(branch(x))
+        return self.bottleneck(torch.cat(branches, dim=1))
--- a/uvr5_pack/lib_v5/model_param_init.py
+++ b/uvr5_pack/lib_v5/model_param_init.py
@ -0,0 +1,60 @@
+import json
+import os
+import pathlib
+
+# Parameter set used by ModelParameters when the config path is neither a
+# .pth nor a .json file. Band entries are keyed by integer band number.
+default_param = {
+    'bins': 768,
+    'unstable_bins': 9,  # training only
+    'reduction_bins': 762,  # training only
+    'sr': 44100,
+    'pre_filter_start': 757,
+    'pre_filter_stop': 768,
+    'band': {
+        1: {
+            'sr': 11025,
+            'hl': 128,
+            'n_fft': 960,
+            'crop_start': 0,
+            'crop_stop': 245,
+            'lpf_start': 61,  # inference only
+            'res_type': 'polyphase',
+        },
+        2: {
+            'sr': 44100,
+            'hl': 512,
+            'n_fft': 1536,
+            'crop_start': 24,
+            'crop_stop': 547,
+            'hpf_start': 81,  # inference only
+            'res_type': 'sinc_best',
+        },
+    },
+}
+
+
+def int_keys(d):
+    """Build a dict from (key, value) pairs, turning all-digit keys into ints.
+
+    Written to be used as a ``json.loads`` ``object_pairs_hook``, which
+    passes each JSON object as a list of pairs. Other keys are kept as-is.
+    """
+    return {(int(key) if key.isdigit() else key): value for key, value in d}
+    
+
+class ModelParameters(object):
+    """Model parameters loaded from a .pth archive, a .json file, or the defaults.
+
+    The result is stored in ``self.param``. All-digit JSON keys (the band
+    numbers) are converted to ints by ``int_keys``.
+
+    Args:
+        config_path: path whose suffix selects the source. ``.pth`` is opened
+            as a zip archive and its ``param.json`` member is parsed;
+            ``.json`` is parsed directly; anything else uses a copy of
+            ``default_param``.
+    """
+
+    def __init__(self, config_path=''):
+        suffix = pathlib.Path(config_path).suffix
+        if suffix == '.pth':
+            import zipfile
+
+            # Named ``archive`` so the builtin ``zip`` is not shadowed.
+            with zipfile.ZipFile(config_path, 'r') as archive:
+                self.param = json.loads(archive.read('param.json'), object_pairs_hook=int_keys)
+        elif suffix == '.json':
+            with open(config_path, 'r') as f:
+                self.param = json.loads(f.read(), object_pairs_hook=int_keys)
+        else:
+            import copy
+
+            # Deep copy: the flag defaults added below (and any later edits by
+            # callers) must not leak into the module-level default_param.
+            self.param = copy.deepcopy(default_param)
+
+        # Optional boolean flags default to False when the config omits them.
+        for k in ['mid_side', 'mid_side_b', 'mid_side_b2', 'stereo_w', 'stereo_n', 'reverse']:
+            self.param.setdefault(k, False)
--- a/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json
+++ b/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json
@ -0,0 +1,19 @@
+{
+	"bins": 1024,
+	"unstable_bins": 0,
+	"reduction_bins": 0,
+	"band": {
+		"1": {
+			"sr": 16000,
+			"hl": 512,
+			"n_fft": 2048,
+			"crop_start": 0,
+			"crop_stop": 1024,
+			"hpf_start": -1,
+			"res_type": "sinc_best"
+		}
+	},
+	"sr": 16000,
+	"pre_filter_start": 1023,
+	"pre_filter_stop": 1024
+}
--- a/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json
+++ b/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json
@ -0,0 +1,19 @@
+{
+	"bins": 1024,
+	"unstable_bins": 0,
+	"reduction_bins": 0,
+	"band": {
+		"1": {
+			"sr": 32000,
+			"hl": 512,
+			"n_fft": 2048,
+			"crop_start": 0,
+			"crop_stop": 1024,
+			"hpf_start": -1,
+			"res_type": "kaiser_fast"
+		}
+	},
+	"sr": 32000,
+	"pre_filter_start": 1000,
+	"pre_filter_stop": 1021
+}
--- a/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json
+++ b/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json
@ -0,0 +1,19 @@
+{
+	"bins": 1024,
+	"unstable_bins": 0,
+	"reduction_bins": 0,
+	"band": {
+		"1": {
+			"sr": 33075,
+			"hl": 384,
+			"n_fft": 2048,
+			"crop_start": 0,
+			"crop_stop": 1024,
+			"hpf_start": -1,
+			"res_type": "sinc_best"
+		}
+	},
+	"sr": 33075,
+	"pre_filter_start": 1000,
+	"pre_filter_stop": 1021
+}
--- a/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json
+++ b/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json
@ -0,0 +1,19 @@
+{
+	"bins": 1024,
+	"unstable_bins": 0,
+	"reduction_bins": 0,
+	"band": {
+		"1": {
+			"sr": 44100,
+			"hl": 1024,
+			"n_fft": 2048,
+			"crop_start": 0,
+			"crop_stop": 1024,
+			"hpf_start": -1,
+			"res_type": "sinc_best"
+		}
+	},
+	"sr": 44100,
+	"pre_filter_start": 1023,
+	"pre_filter_stop": 1024
+}
--- a/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json
+++ b/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json
@ -0,0 +1,19 @@
+{
+	"bins": 256,
+	"unstable_bins": 0,
+	"reduction_bins": 0,
+	"band": {
+		"1": {
+			"sr": 44100,
+			"hl": 256,
+			"n_fft": 512,
+			"crop_start": 0,
+			"crop_stop": 256,
+			"hpf_start": -1,
+			"res_type": "sinc_best"
+		}
+	},
+	"sr": 44100,
+	"pre_filter_start": 256,
+	"pre_filter_stop": 256
+}
--- a/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json
+++ b/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json
@ -0,0 +1,19 @@
+{
+	"bins": 1024,
+	"unstable_bins": 0,
+	"reduction_bins": 0,
+	"band": {
+		"1": {
+			"sr": 44100,
+			"hl": 512,
+			"n_fft": 2048,
+			"crop_start": 0,
+			"crop_stop": 1024,
+			"hpf_start": -1,
+			"res_type": "sinc_best"
+		}
+	},
+	"sr": 44100,
+	"pre_filter_start": 1023,
+	"pre_filter_stop": 1024
+}
--- a/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json
+++ b/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json
@ -0,0 +1,19 @@
+{
+	"bins": 1024,
+	"unstable_bins": 0,
+	"reduction_bins": 0,
+	"band": {
+		"1": {
+			"sr": 44100,
+			"hl": 512,
+			"n_fft": 2048,
+			"crop_start": 0,
+			"crop_stop": 700,
+			"hpf_start": -1,
+			"res_type": "sinc_best"
+		}
+	},
+	"sr": 44100,
+	"pre_filter_start": 1023,
+	"pre_filter_stop": 700
+}
--- a/uvr5_pack/lib_v5/modelparams/2band_32000.json
+++ b/uvr5_pack/lib_v5/modelparams/2band_32000.json
@ -0,0 +1,30 @@
+{
+	"bins": 768,
+	"unstable_bins": 7,
+	"reduction_bins": 705,
+	"band": {
+		"1": {
+			"sr": 6000,
+			"hl": 66,
+			"n_fft": 512,
+			"crop_start": 0,
+			"crop_stop": 240,
+			"lpf_start": 60,
+			"lpf_stop": 118,
+			"res_type": "sinc_fastest"
+		},
+		"2": {
+			"sr": 32000,
+			"hl": 352,
+			"n_fft": 1024,
+			"crop_start": 22,
+			"crop_stop": 505,
+			"hpf_start": 44,
+			"hpf_stop": 23,
+			"res_type": "sinc_medium"
+		}
+	},
+	"sr": 32000,
+	"pre_filter_start": 710,
+	"pre_filter_stop": 731
+}
--- a/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json
+++ b/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json
@ -0,0 +1,30 @@
+{
+	"bins": 512,
+	"unstable_bins": 7,
+	"reduction_bins": 510,
+	"band": {
+		"1": {
+			"sr": 11025,
+			"hl": 160,
+			"n_fft": 768,
+			"crop_start": 0,
+			"crop_stop": 192,
+			"lpf_start": 41,
+			"lpf_stop": 139,
+			"res_type": "sinc_fastest"
+		},
+		"2": {
+			"sr": 44100,
+			"hl": 640,
+			"n_fft": 1024,
+			"crop_start": 10,
+			"crop_stop": 320,
+			"hpf_start": 47,
+			"hpf_stop": 15,
+			"res_type": "sinc_medium"
+		}
+	},
+	"sr": 44100,
+	"pre_filter_start": 510,
+	"pre_filter_stop": 512
+}
--- a/uvr5_pack/lib_v5/modelparams/2band_48000.json
+++ b/uvr5_pack/lib_v5/modelparams/2band_48000.json
@ -0,0 +1,30 @@
+{
+	"bins": 768,
+	"unstable_bins": 7,
+	"reduction_bins": 705,
+	"band": {
+		"1": {
+			"sr": 6000,
+			"hl": 66,
+			"n_fft": 512,
+			"crop_start": 0,
+			"crop_stop": 240,
+			"lpf_start": 60,
+			"lpf_stop": 240,
+			"res_type": "sinc_fastest"
+		},
+		"2": {
+			"sr": 48000,
+			"hl": 528,
+			"n_fft": 1536,
+			"crop_start": 22,
+			"crop_stop": 505,
+			"hpf_start": 82,
+			"hpf_stop": 22,
+			"res_type": "sinc_medium"
+		}
+	},
+	"sr": 48000,
+	"pre_filter_start": 710,
+	"pre_filter_stop": 731
+}
--- a/uvr5_pack/lib_v5/modelparams/3band_44100.json
+++ b/uvr5_pack/lib_v5/modelparams/3band_44100.json
@ -0,0 +1,42 @@
+{
+	"bins": 768,
+	"unstable_bins": 5,
+	"reduction_bins": 733,
+	"band": {
+		"1": {
+			"sr": 11025,
+			"hl": 128,
+			"n_fft": 768,
+			"crop_start": 0,
+			"crop_stop": 278,
+			"lpf_start": 28,
+			"lpf_stop": 140,
+			"res_type": "polyphase"
+		},
+		"2": {
+			"sr": 22050,
+			"hl": 256,
+			"n_fft": 768,
+			"crop_start": 14,
+			"crop_stop": 322,
+			"hpf_start": 70,
+			"hpf_stop": 14,
+			"lpf_start": 283,
+			"lpf_stop": 314,
+			"res_type": "polyphase"
+		},	
+		"3": {
+			"sr": 44100,
+			"hl": 512,
+			"n_fft": 768,
+			"crop_start": 131,
+			"crop_stop": 313,
+			"hpf_start": 154,
+			"hpf_stop": 141,
+			"res_type": "sinc_medium"
+		}
+	},
+	"sr": 44100,
+	"pre_filter_start": 757,
+	"pre_filter_stop": 768
+}
--- a/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json
+++ b/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json
@ -0,0 +1,43 @@
+{
+	"mid_side": true,
+	"bins": 768,
+	"unstable_bins": 5,
+	"reduction_bins": 733,
+	"band": {
+		"1": {
+			"sr": 11025,
+			"hl": 128,
+			"n_fft": 768,
+			"crop_start": 0,
+			"crop_stop": 278,
+			"lpf_start": 28,
+			"lpf_stop": 140,
+			"res_type": "polyphase"
+		},
+		"2": {
+			"sr": 22050,
+			"hl": 256,
+			"n_fft": 768,
+			"crop_start": 14,
+			"crop_stop": 322,
+			"hpf_start": 70,
+			"hpf_stop": 14,
+			"lpf_start": 283,
+			"lpf_stop": 314,
+			"res_type": "polyphase"
+		},	
+		"3": {
+			"sr": 44100,
+			"hl": 512,
+			"n_fft": 768,
+			"crop_start": 131,
+			"crop_stop": 313,
+			"hpf_start": 154,
+			"hpf_stop": 141,
+			"res_type": "sinc_medium"
+		}
+	},
+	"sr": 44100,
+	"pre_filter_start": 757,
+	"pre_filter_stop": 768
+}
--- a/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json
+++ b/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json
@ -0,0 +1,43 @@
+{
+	"mid_side_b2": true,
+	"bins": 640,
+	"unstable_bins": 7,
+	"reduction_bins": 565,
+	"band": {
+		"1": {
+			"sr": 11025,
+			"hl": 108,
+			"n_fft": 1024,
+			"crop_start": 0,
+			"crop_stop": 187,
+			"lpf_start": 92,
+			"lpf_stop": 186,
+			"res_type": "polyphase"
+		},
+		"2": {
+			"sr": 22050,
+			"hl": 216,
+			"n_fft": 768,
+			"crop_start": 0,
+			"crop_stop": 212,
+			"hpf_start": 68,
+			"hpf_stop": 34,
+			"lpf_start": 174,
+			"lpf_stop": 209,
+			"res_type": "polyphase"
+		},	
+		"3": {
+			"sr": 44100,
+			"hl": 432,
+			"n_fft": 640,
+			"crop_start": 66,
+			"crop_stop": 307,
+			"hpf_start": 86,
+			"hpf_stop": 72,
+			"res_type": "kaiser_fast"
+		}
+	},
+	"sr": 44100,
+	"pre_filter_start": 639,
+	"pre_filter_stop": 640
+}
--- a/uvr5_pack/lib_v5/modelparams/4band_44100.json
+++ b/uvr5_pack/lib_v5/modelparams/4band_44100.json
@ -0,0 +1,54 @@
+{
+	"bins": 768,
+	"unstable_bins": 7,
+	"reduction_bins": 668,
+	"band": {
+		"1": {
+			"sr": 11025,
+			"hl": 128,
+			"n_fft": 1024,
+			"crop_start": 0,
+			"crop_stop": 186,
+			"lpf_start": 37,
+			"lpf_stop": 73,
+			"res_type": "polyphase"
+		},
+		"2": {
+			"sr": 11025,
+			"hl": 128,
+			"n_fft": 512,
+			"crop_start": 4,
+			"crop_stop": 185,			
+			"hpf_start": 36,
+			"hpf_stop": 18,
+			"lpf_start": 93,
+			"lpf_stop": 185,
+			"res_type": "polyphase"
+		},
+		"3": {
+			"sr": 22050,
+			"hl": 256,
+			"n_fft": 512,
+			"crop_start": 46,
+			"crop_stop": 186,
+			"hpf_start": 93,
+			"hpf_stop": 46,
+			"lpf_start": 164,
+			"lpf_stop": 186,
+			"res_type": "polyphase"
+		},	
+		"4": {
+			"sr": 44100,
+			"hl": 512,
+			"n_fft": 768,
+			"crop_start": 121,
+			"crop_stop": 382,
+			"hpf_start": 138,
+			"hpf_stop": 123,
+			"res_type": "sinc_medium"
+		}
+	},
+	"sr": 44100,
+	"pre_filter_start": 740,
+	"pre_filter_stop": 768
+}
--- a/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json
+++ b/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json
@ -0,0 +1,55 @@
+{
+	"bins": 768,
+	"unstable_bins": 7,
+	"mid_side": true,
+	"reduction_bins": 668,
+	"band": {
+		"1": {
+			"sr": 11025,
+			"hl": 128,
+			"n_fft": 1024,
+			"crop_start": 0,
+			"crop_stop": 186,
+			"lpf_start": 37,
+			"lpf_stop": 73,
+			"res_type": "polyphase"
+		},
+		"2": {
+			"sr": 11025,
+			"hl": 128,
+			"n_fft": 512,
+			"crop_start": 4,
+			"crop_stop": 185,			
+			"hpf_start": 36,
+			"hpf_stop": 18,
+			"lpf_start": 93,
+			"lpf_stop": 185,
+			"res_type": "polyphase"
+		},
+		"3": {
+			"sr": 22050,
+			"hl": 256,
+			"n_fft": 512,
+			"crop_start": 46,
+			"crop_stop": 186,
+			"hpf_start": 93,
+			"hpf_stop": 46,
+			"lpf_start": 164,
+			"lpf_stop": 186,
+			"res_type": "polyphase"
+		},	
+		"4": {
+			"sr": 44100,
+			"hl": 512,
+			"n_fft": 768,
+			"crop_start": 121,
+			"crop_stop": 382,
+			"hpf_start": 138,
+			"hpf_stop": 123,
+			"res_type": "sinc_medium"
+		}
+	},
+	"sr": 44100,
+	"pre_filter_start": 740,
+	"pre_filter_stop": 768
+}
--- a/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json
+++ b/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json
@ -0,0 +1,55 @@
+{
+	"mid_side_b": true,
+	"bins": 768,
+	"unstable_bins": 7,
+	"reduction_bins": 668,
+	"band": {
+		"1": {
+			"sr": 11025,
+			"hl": 128,
+			"n_fft": 1024,
+			"crop_start": 0,
+			"crop_stop": 186,
+			"lpf_start": 37,
+			"lpf_stop": 73,
+			"res_type": "polyphase"
+		},
+		"2": {
+			"sr": 11025,
+			"hl": 128,
+			"n_fft": 512,
+			"crop_start": 4,
+			"crop_stop": 185,			
+			"hpf_start": 36,
+			"hpf_stop": 18,
+			"lpf_start": 93,
+			"lpf_stop": 185,
+			"res_type": "polyphase"
+		},
+		"3": {
+			"sr": 22050,
+			"hl": 256,
+			"n_fft": 512,
+			"crop_start": 46,
+			"crop_stop": 186,
+			"hpf_start": 93,
+			"hpf_stop": 46,
+			"lpf_start": 164,
+			"lpf_stop": 186,
+			"res_type": "polyphase"
+		},	
+		"4": {
+			"sr": 44100,
+			"hl": 512,
+			"n_fft": 768,
+			"crop_start": 121,
+			"crop_stop": 382,
+			"hpf_start": 138,
+			"hpf_stop": 123,
+			"res_type": "sinc_medium"
+		}
+	},
+	"sr": 44100,
+	"pre_filter_start": 740,
+	"pre_filter_stop": 768
+}
--- a/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json
+++ b/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json
@ -0,0 +1,55 @@
+{
+	"mid_side_b": true,
+	"bins": 768,
+	"unstable_bins": 7,
+	"reduction_bins": 668,
+	"band": {
+		"1": {
+			"sr": 11025,
+			"hl": 128,
+			"n_fft": 1024,
+			"crop_start": 0,
+			"crop_stop": 186,
+			"lpf_start": 37,
+			"lpf_stop": 73,
+			"res_type": "polyphase"
+		},
+		"2": {
+			"sr": 11025,
+			"hl": 128,
+			"n_fft": 512,
+			"crop_start": 4,
+			"crop_stop": 185,			
+			"hpf_start": 36,
+			"hpf_stop": 18,
+			"lpf_start": 93,
+			"lpf_stop": 185,
+			"res_type": "polyphase"
+		},
+		"3": {
+			"sr": 22050,
+			"hl": 256,
+			"n_fft": 512,
+			"crop_start": 46,
+			"crop_stop": 186,
+			"hpf_start": 93,
+			"hpf_stop": 46,
+			"lpf_start": 164,
+			"lpf_stop": 186,
+			"res_type": "polyphase"
+		},	
+		"4": {
+			"sr": 44100,
+			"hl": 512,
+			"n_fft": 768,
+			"crop_start": 121,
+			"crop_stop": 382,
+			"hpf_start": 138,
+			"hpf_stop": 123,
+			"res_type": "sinc_medium"
+		}
+	},
+	"sr": 44100,
+	"pre_filter_start": 740,
+	"pre_filter_stop": 768
+}
--- a/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json
+++ b/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json
@ -0,0 +1,55 @@
+{
+	"reverse": true,
+	"bins": 768,
+	"unstable_bins": 7,
+	"reduction_bins": 668,
+	"band": {
+		"1": {
+			"sr": 11025,
+			"hl": 128,
+			"n_fft": 1024,
+			"crop_start": 0,
+			"crop_stop": 186,
+			"lpf_start": 37,
+			"lpf_stop": 73,
+			"res_type": "polyphase"
+		},
+		"2": {
+			"sr": 11025,
+			"hl": 128,
+			"n_fft": 512,
+			"crop_start": 4,
+			"crop_stop": 185,			
+			"hpf_start": 36,
+			"hpf_stop": 18,
+			"lpf_start": 93,
+			"lpf_stop": 185,
+			"res_type": "polyphase"
+		},
+		"3": {
+			"sr": 22050,
+			"hl": 256,
+			"n_fft": 512,
+			"crop_start": 46,
+			"crop_stop": 186,
+			"hpf_start": 93,
+			"hpf_stop": 46,
+			"lpf_start": 164,
+			"lpf_stop": 186,
+			"res_type": "polyphase"
+		},	
+		"4": {
+			"sr": 44100,
+			"hl": 512,
+			"n_fft": 768,
+			"crop_start": 121,
+			"crop_stop": 382,
+			"hpf_start": 138,
+			"hpf_stop": 123,
+			"res_type": "sinc_medium"
+		}
+	},
+	"sr": 44100,
+	"pre_filter_start": 740,
+	"pre_filter_stop": 768
+}
--- a/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json
+++ b/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json
@ -0,0 +1,55 @@
+{
+	"stereo_w": true,
+	"bins": 768,
+	"unstable_bins": 7,
+	"reduction_bins": 668,
+	"band": {
+		"1": {
+			"sr": 11025,
+			"hl": 128,
+			"n_fft": 1024,
+			"crop_start": 0,
+			"crop_stop": 186,
+			"lpf_start": 37,
+			"lpf_stop": 73,
+			"res_type": "polyphase"
+		},
+		"2": {
+			"sr": 11025,
+			"hl": 128,
+			"n_fft": 512,
+			"crop_start": 4,
+			"crop_stop": 185,			
+			"hpf_start": 36,
+			"hpf_stop": 18,
+			"lpf_start": 93,
+			"lpf_stop": 185,
+			"res_type": "polyphase"
+		},
+		"3": {
+			"sr": 22050,
+			"hl": 256,
+			"n_fft": 512,
+			"crop_start": 46,
+			"crop_stop": 186,
+			"hpf_start": 93,
+			"hpf_stop": 46,
+			"lpf_start": 164,
+			"lpf_stop": 186,
+			"res_type": "polyphase"
+		},	
+		"4": {
+			"sr": 44100,
+			"hl": 512,
+			"n_fft": 768,
+			"crop_start": 121,
+			"crop_stop": 382,
+			"hpf_start": 138,
+			"hpf_stop": 123,
+			"res_type": "sinc_medium"
+		}
+	},
+	"sr": 44100,
+	"pre_filter_start": 740,
+	"pre_filter_stop": 768
+}
--- a/uvr5_pack/lib_v5/modelparams/4band_v2.json
+++ b/uvr5_pack/lib_v5/modelparams/4band_v2.json
@ -0,0 +1,54 @@
+{
+	"bins": 672,
+	"unstable_bins": 8,
+	"reduction_bins": 637,
+	"band": {
+		"1": {
+			"sr": 7350,
+			"hl": 80,
+			"n_fft": 640,
+			"crop_start": 0,
+			"crop_stop": 85,
+			"lpf_start": 25,
+			"lpf_stop": 53,
+			"res_type": "polyphase"
+		},
+		"2": {
+			"sr": 7350,
+			"hl": 80,
+			"n_fft": 320,
+			"crop_start": 4,
+			"crop_stop": 87,
+			"hpf_start": 25,
+			"hpf_stop": 12,
+			"lpf_start": 31,
+			"lpf_stop": 62,
+			"res_type": "polyphase"
+		},		
+		"3": {
+			"sr": 14700,
+			"hl": 160,
+			"n_fft": 512,
+			"crop_start": 17,
+			"crop_stop": 216,
+			"hpf_start": 48,
+			"hpf_stop": 24,
+			"lpf_start": 139,
+			"lpf_stop": 210,
+			"res_type": "polyphase"
+		},	
+		"4": {
+			"sr": 44100,
+			"hl": 480,
+			"n_fft": 960,
+			"crop_start": 78,
+			"crop_stop": 383,
+			"hpf_start": 130,
+			"hpf_stop": 86,
+			"res_type": "kaiser_fast"
+		}
+	},
+	"sr": 44100,
+	"pre_filter_start": 668,
+	"pre_filter_stop": 672
+}
--- a/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json
+++ b/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json
@ -0,0 +1,55 @@
+{
+	"bins": 672,
+	"unstable_bins": 8,
+	"reduction_bins": 637,
+	"band": {
+		"1": {
+			"sr": 7350,
+			"hl": 80,
+			"n_fft": 640,
+			"crop_start": 0,
+			"crop_stop": 85,
+			"lpf_start": 25,
+			"lpf_stop": 53,
+			"res_type": "polyphase"
+		},
+		"2": {
+			"sr": 7350,
+			"hl": 80,
+			"n_fft": 320,
+			"crop_start": 4,
+			"crop_stop": 87,
+			"hpf_start": 25,
+			"hpf_stop": 12,
+			"lpf_start": 31,
+			"lpf_stop": 62,
+			"res_type": "polyphase"
+		},		
+		"3": {
+			"sr": 14700,
+			"hl": 160,
+			"n_fft": 512,
+			"crop_start": 17,
+			"crop_stop": 216,
+			"hpf_start": 48,
+			"hpf_stop": 24,
+			"lpf_start": 139,
+			"lpf_stop": 210,
+			"res_type": "polyphase"
+		},	
+		"4": {
+			"sr": 44100,
+			"hl": 480,
+			"n_fft": 960,
+			"crop_start": 78,
+			"crop_stop": 383,
+			"hpf_start": 130,
+			"hpf_stop": 86,
+			"convert_channels": "stereo_n",
+			"res_type": "kaiser_fast"
+		}
+	},
+	"sr": 44100,
+	"pre_filter_start": 668,
+	"pre_filter_stop": 672
+}
--- a/uvr5_pack/lib_v5/modelparams/ensemble.json
+++ b/uvr5_pack/lib_v5/modelparams/ensemble.json
@ -0,0 +1,43 @@
+{
+	"mid_side_b2": true,
+	"bins": 1280,
+	"unstable_bins": 7,
+	"reduction_bins": 565,
+	"band": {
+		"1": {
+			"sr": 11025,
+			"hl": 108,
+			"n_fft": 2048,
+			"crop_start": 0,
+			"crop_stop": 374,
+			"lpf_start": 92,
+			"lpf_stop": 186,
+			"res_type": "polyphase"
+		},
+		"2": {
+			"sr": 22050,
+			"hl": 216,
+			"n_fft": 1536,
+			"crop_start": 0,
+			"crop_stop": 424,
+			"hpf_start": 68,
+			"hpf_stop": 34,
+			"lpf_start": 348,
+			"lpf_stop": 418,
+			"res_type": "polyphase"
+		},	
+		"3": {
+			"sr": 44100,
+			"hl": 432,
+			"n_fft": 1280,
+			"crop_start": 132,
+			"crop_stop": 614,
+			"hpf_start": 172,
+			"hpf_stop": 144,
+			"res_type": "polyphase"
+		}
+	},
+	"sr": 44100,
+	"pre_filter_start": 1280,
+	"pre_filter_stop": 1280
+}
--- a/uvr5_pack/lib_v5/nets.py
+++ b/uvr5_pack/lib_v5/nets.py
@ -0,0 +1,113 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from uvr5_pack.lib_v5 import layers
+from uvr5_pack.lib_v5 import spec_utils
+
+
+class BaseASPPNet(nn.Module):
+
+    def __init__(self, nin, ch, dilations=(4, 8, 16)):
+        super(BaseASPPNet, self).__init__()
+        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
+        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
+        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
+        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
+
+        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
+
+        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
+        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
+        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
+        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
+
+    def __call__(self, x):
+        h, e1 = self.enc1(x)
+        h, e2 = self.enc2(h)
+        h, e3 = self.enc3(h)
+        h, e4 = self.enc4(h)
+
+        h = self.aspp(h)
+
+        h = self.dec4(h, e4)
+        h = self.dec3(h, e3)
+        h = self.dec2(h, e2)
+        h = self.dec1(h, e1)
+
+        return h
+
+
+class CascadedASPPNet(nn.Module):
+
+    def __init__(self, n_fft):
+        super(CascadedASPPNet, self).__init__()
+        self.stg1_low_band_net = BaseASPPNet(2, 16)
+        self.stg1_high_band_net = BaseASPPNet(2, 16)
+
+        self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
+        self.stg2_full_band_net = BaseASPPNet(8, 16)
+
+        self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
+        self.stg3_full_band_net = BaseASPPNet(16, 32)
+
+        self.out = nn.Conv2d(32, 2, 1, bias=False)
+        self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
+        self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)
+
+        self.max_bin = n_fft // 2
+        self.output_bin = n_fft // 2 + 1
+
+        self.offset = 128
+
+    def forward(self, x, aggressiveness=None):
+        mix = x.detach()
+        x = x.clone()
+
+        x = x[:, :, :self.max_bin]
+
+        bandw = x.size()[2] // 2
+        aux1 = torch.cat([
+            self.stg1_low_band_net(x[:, :, :bandw]),
+            self.stg1_high_band_net(x[:, :, bandw:])
+        ], dim=2)
+
+        h = torch.cat([x, aux1], dim=1)
+        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
+
+        h = torch.cat([x, aux1, aux2], dim=1)
+        h = self.stg3_full_band_net(self.stg3_bridge(h))
+
+        mask = torch.sigmoid(self.out(h))
+        mask = F.pad(
+            input=mask,
+            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
+            mode='replicate')
+ 
+        if self.training:
+            aux1 = torch.sigmoid(self.aux1_out(aux1))
+            aux1 = F.pad(
+                input=aux1,
+                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
+                mode='replicate')
+            aux2 = torch.sigmoid(self.aux2_out(aux2))
+            aux2 = F.pad(
+                input=aux2,
+                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
+                mode='replicate')
+            return mask * mix, aux1 * mix, aux2 * mix
+        else:       
+            if aggressiveness:
+                mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
+                mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
+
+            return mask * mix
+
+    def predict(self, x_mag, aggressiveness=None):
+        h = self.forward(x_mag, aggressiveness)
+
+        if self.offset > 0:
+            h = h[:, :, :, self.offset:-self.offset]
+            assert h.size()[3] > 0
+
+        return h
--- a/uvr5_pack/lib_v5/nets_123812KB.py
+++ b/uvr5_pack/lib_v5/nets_123812KB.py
@ -0,0 +1,112 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from uvr5_pack.lib_v5 import layers_123821KB as layers
+
+
+class BaseASPPNet(nn.Module):
+
+    def __init__(self, nin, ch, dilations=(4, 8, 16)):
+        super(BaseASPPNet, self).__init__()
+        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
+        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
+        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
+        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
+
+        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
+
+        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
+        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
+        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
+        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
+
+    def __call__(self, x):
+        h, e1 = self.enc1(x)
+        h, e2 = self.enc2(h)
+        h, e3 = self.enc3(h)
+        h, e4 = self.enc4(h)
+
+        h = self.aspp(h)
+
+        h = self.dec4(h, e4)
+        h = self.dec3(h, e3)
+        h = self.dec2(h, e2)
+        h = self.dec1(h, e1)
+
+        return h
+
+
+class CascadedASPPNet(nn.Module):
+
+    def __init__(self, n_fft):
+        super(CascadedASPPNet, self).__init__()
+        self.stg1_low_band_net = BaseASPPNet(2, 32)
+        self.stg1_high_band_net = BaseASPPNet(2, 32)
+
+        self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
+        self.stg2_full_band_net = BaseASPPNet(16, 32)
+
+        self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
+        self.stg3_full_band_net = BaseASPPNet(32, 64)
+
+        self.out = nn.Conv2d(64, 2, 1, bias=False)
+        self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
+        self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
+
+        self.max_bin = n_fft // 2
+        self.output_bin = n_fft // 2 + 1
+
+        self.offset = 128
+
+    def forward(self, x, aggressiveness=None):
+        mix = x.detach()
+        x = x.clone()
+
+        x = x[:, :, :self.max_bin]
+
+        bandw = x.size()[2] // 2
+        aux1 = torch.cat([
+            self.stg1_low_band_net(x[:, :, :bandw]),
+            self.stg1_high_band_net(x[:, :, bandw:])
+        ], dim=2)
+
+        h = torch.cat([x, aux1], dim=1)
+        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
+
+        h = torch.cat([x, aux1, aux2], dim=1)
+        h = self.stg3_full_band_net(self.stg3_bridge(h))
+
+        mask = torch.sigmoid(self.out(h))
+        mask = F.pad(
+            input=mask,
+            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
+            mode='replicate')
+ 
+        if self.training:
+            aux1 = torch.sigmoid(self.aux1_out(aux1))
+            aux1 = F.pad(
+                input=aux1,
+                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
+                mode='replicate')
+            aux2 = torch.sigmoid(self.aux2_out(aux2))
+            aux2 = F.pad(
+                input=aux2,
+                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
+                mode='replicate')
+            return mask * mix, aux1 * mix, aux2 * mix
+        else:
+            if aggressiveness:
+                mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
+                mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
+
+            return mask * mix
+
+    def predict(self, x_mag, aggressiveness=None):
+        h = self.forward(x_mag, aggressiveness)
+
+        if self.offset > 0:
+            h = h[:, :, :, self.offset:-self.offset]
+            assert h.size()[3] > 0
+
+        return h
--- a/uvr5_pack/lib_v5/nets_123821KB.py
+++ b/uvr5_pack/lib_v5/nets_123821KB.py
@ -0,0 +1,112 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from uvr5_pack.lib_v5 import layers_123821KB as layers
+
+
+class BaseASPPNet(nn.Module):
+
+    def __init__(self, nin, ch, dilations=(4, 8, 16)):
+        super(BaseASPPNet, self).__init__()
+        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
+        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
+        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
+        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
+
+        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
+
+        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
+        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
+        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
+        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
+
+    def __call__(self, x):
+        h, e1 = self.enc1(x)
+        h, e2 = self.enc2(h)
+        h, e3 = self.enc3(h)
+        h, e4 = self.enc4(h)
+
+        h = self.aspp(h)
+
+        h = self.dec4(h, e4)
+        h = self.dec3(h, e3)
+        h = self.dec2(h, e2)
+        h = self.dec1(h, e1)
+
+        return h
+
+
+class CascadedASPPNet(nn.Module):
+
+    def __init__(self, n_fft):
+        super(CascadedASPPNet, self).__init__()
+        self.stg1_low_band_net = BaseASPPNet(2, 32)
+        self.stg1_high_band_net = BaseASPPNet(2, 32)
+
+        self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
+        self.stg2_full_band_net = BaseASPPNet(16, 32)
+
+        self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
+        self.stg3_full_band_net = BaseASPPNet(32, 64)
+
+        self.out = nn.Conv2d(64, 2, 1, bias=False)
+        self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
+        self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
+
+        self.max_bin = n_fft // 2
+        self.output_bin = n_fft // 2 + 1
+
+        self.offset = 128
+
+    def forward(self, x, aggressiveness=None):
+        mix = x.detach()
+        x = x.clone()
+
+        x = x[:, :, :self.max_bin]
+
+        bandw = x.size()[2] // 2
+        aux1 = torch.cat([
+            self.stg1_low_band_net(x[:, :, :bandw]),
+            self.stg1_high_band_net(x[:, :, bandw:])
+        ], dim=2)
+
+        h = torch.cat([x, aux1], dim=1)
+        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
+
+        h = torch.cat([x, aux1, aux2], dim=1)
+        h = self.stg3_full_band_net(self.stg3_bridge(h))
+
+        mask = torch.sigmoid(self.out(h))
+        mask = F.pad(
+            input=mask,
+            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
+            mode='replicate')
+ 
+        if self.training:
+            aux1 = torch.sigmoid(self.aux1_out(aux1))
+            aux1 = F.pad(
+                input=aux1,
+                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
+                mode='replicate')
+            aux2 = torch.sigmoid(self.aux2_out(aux2))
+            aux2 = F.pad(
+                input=aux2,
+                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
+                mode='replicate')
+            return mask * mix, aux1 * mix, aux2 * mix
+        else:
+            if aggressiveness:
+                mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
+                mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
+
+            return mask * mix
+
+    def predict(self, x_mag, aggressiveness=None):
+        h = self.forward(x_mag, aggressiveness)
+
+        if self.offset > 0:
+            h = h[:, :, :, self.offset:-self.offset]
+            assert h.size()[3] > 0
+
+        return h
--- a/uvr5_pack/lib_v5/nets_33966KB.py
+++ b/uvr5_pack/lib_v5/nets_33966KB.py
@ -0,0 +1,112 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from uvr5_pack.lib_v5 import layers_33966KB as layers
+
+
+class BaseASPPNet(nn.Module):
+
+    def __init__(self, nin, ch, dilations=(4, 8, 16, 32)):
+        super(BaseASPPNet, self).__init__()
+        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
+        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
+        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
+        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
+
+        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
+
+        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
+        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
+        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
+        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
+
+    def __call__(self, x):
+        h, e1 = self.enc1(x)
+        h, e2 = self.enc2(h)
+        h, e3 = self.enc3(h)
+        h, e4 = self.enc4(h)
+
+        h = self.aspp(h)
+
+        h = self.dec4(h, e4)
+        h = self.dec3(h, e3)
+        h = self.dec2(h, e2)
+        h = self.dec1(h, e1)
+
+        return h
+
+
+class CascadedASPPNet(nn.Module):
+
+    def __init__(self, n_fft):
+        super(CascadedASPPNet, self).__init__()
+        self.stg1_low_band_net = BaseASPPNet(2, 16)
+        self.stg1_high_band_net = BaseASPPNet(2, 16)
+
+        self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
+        self.stg2_full_band_net = BaseASPPNet(8, 16)
+
+        self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
+        self.stg3_full_band_net = BaseASPPNet(16, 32)
+
+        self.out = nn.Conv2d(32, 2, 1, bias=False)
+        self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
+        self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)
+
+        self.max_bin = n_fft // 2
+        self.output_bin = n_fft // 2 + 1
+
+        self.offset = 128
+
+    def forward(self, x, aggressiveness=None):
+        mix = x.detach()
+        x = x.clone()
+
+        x = x[:, :, :self.max_bin]
+
+        bandw = x.size()[2] // 2
+        aux1 = torch.cat([
+            self.stg1_low_band_net(x[:, :, :bandw]),
+            self.stg1_high_band_net(x[:, :, bandw:])
+        ], dim=2)
+
+        h = torch.cat([x, aux1], dim=1)
+        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
+
+        h = torch.cat([x, aux1, aux2], dim=1)
+        h = self.stg3_full_band_net(self.stg3_bridge(h))
+
+        mask = torch.sigmoid(self.out(h))
+        mask = F.pad(
+            input=mask,
+            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
+            mode='replicate')
+
+        if self.training:
+            aux1 = torch.sigmoid(self.aux1_out(aux1))
+            aux1 = F.pad(
+                input=aux1,
+                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
+                mode='replicate')
+            aux2 = torch.sigmoid(self.aux2_out(aux2))
+            aux2 = F.pad(
+                input=aux2,
+                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
+                mode='replicate')
+            return mask * mix, aux1 * mix, aux2 * mix
+        else:
+            if aggressiveness:
+                mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
+                mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
+
+            return mask * mix
+
+    def predict(self, x_mag, aggressiveness=None):
+        h = self.forward(x_mag, aggressiveness)
+
+        if self.offset > 0:
+            h = h[:, :, :, self.offset:-self.offset]
+            assert h.size()[3] > 0
+
+        return h
--- a/uvr5_pack/lib_v5/nets_537227KB.py
+++ b/uvr5_pack/lib_v5/nets_537227KB.py
@ -0,0 +1,113 @@
+import torch
+import numpy as np
+from torch import nn
+import torch.nn.functional as F
+
+from uvr5_pack.lib_v5 import layers_537238KB as layers
+
+
+class BaseASPPNet(nn.Module):
+
+    def __init__(self, nin, ch, dilations=(4, 8, 16)):
+        super(BaseASPPNet, self).__init__()
+        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
+        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
+        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
+        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
+
+        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
+
+        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
+        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
+        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
+        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
+
+    def __call__(self, x):
+        h, e1 = self.enc1(x)
+        h, e2 = self.enc2(h)
+        h, e3 = self.enc3(h)
+        h, e4 = self.enc4(h)
+
+        h = self.aspp(h)
+
+        h = self.dec4(h, e4)
+        h = self.dec3(h, e3)
+        h = self.dec2(h, e2)
+        h = self.dec1(h, e1)
+
+        return h
+
+
+class CascadedASPPNet(nn.Module):
+
+    def __init__(self, n_fft):
+        super(CascadedASPPNet, self).__init__()
+        self.stg1_low_band_net = BaseASPPNet(2, 64)
+        self.stg1_high_band_net = BaseASPPNet(2, 64)
+
+        self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
+        self.stg2_full_band_net = BaseASPPNet(32, 64)
+
+        self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
+        self.stg3_full_band_net = BaseASPPNet(64, 128)
+
+        self.out = nn.Conv2d(128, 2, 1, bias=False)
+        self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
+        self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)
+
+        self.max_bin = n_fft // 2
+        self.output_bin = n_fft // 2 + 1
+
+        self.offset = 128
+
+    def forward(self, x, aggressiveness=None):
+        mix = x.detach()
+        x = x.clone()
+
+        x = x[:, :, :self.max_bin]
+
+        bandw = x.size()[2] // 2
+        aux1 = torch.cat([
+            self.stg1_low_band_net(x[:, :, :bandw]),
+            self.stg1_high_band_net(x[:, :, bandw:])
+        ], dim=2)
+
+        h = torch.cat([x, aux1], dim=1)
+        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
+
+        h = torch.cat([x, aux1, aux2], dim=1)
+        h = self.stg3_full_band_net(self.stg3_bridge(h))
+
+        mask = torch.sigmoid(self.out(h))
+        mask = F.pad(
+            input=mask,
+            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
+            mode='replicate')
+ 
+        if self.training:
+            aux1 = torch.sigmoid(self.aux1_out(aux1))
+            aux1 = F.pad(
+                input=aux1,
+                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
+                mode='replicate')
+            aux2 = torch.sigmoid(self.aux2_out(aux2))
+            aux2 = F.pad(
+                input=aux2,
+                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
+                mode='replicate')
+            return mask * mix, aux1 * mix, aux2 * mix
+        else:
+            if aggressiveness:
+                mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
+                mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
+
+            return mask * mix
+
+    def predict(self, x_mag, aggressiveness=None):
+        h = self.forward(x_mag, aggressiveness)
+
+        if self.offset > 0:
+            h = h[:, :, :, self.offset:-self.offset]
+            assert h.size()[3] > 0
+
+        return h
--- a/uvr5_pack/lib_v5/nets_537238KB.py
+++ b/uvr5_pack/lib_v5/nets_537238KB.py
@ -0,0 +1,113 @@
+import torch
+import numpy as np
+from torch import nn
+import torch.nn.functional as F
+
+from uvr5_pack.lib_v5 import layers_537238KB as layers
+
+
+class BaseASPPNet(nn.Module):
+    """Four encoders, an ASPP block and four decoders wired with skip connections."""
+
+    def __init__(self, nin, ch, dilations=(4, 8, 16)):
+        """Build the sub-modules.
+
+        Args:
+            nin: Channel count passed to the first encoder.
+            ch: Base width; later layers use multiples of it.
+            dilations: Dilation rates forwarded to ``layers.ASPPModule``.
+        """
+        super().__init__()
+        # Encoder widths: nin -> ch -> 2ch -> 4ch -> 8ch.
+        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
+        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
+        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
+        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
+
+        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
+
+        # Each decoder is sized for its input plus the matching encoder skip.
+        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
+        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
+        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
+        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
+
+    def __call__(self, x):
+        """Encode ``x``, apply the ASPP block, then decode using the skips in reverse order."""
+        skips = []
+        h = x
+        for encoder in (self.enc1, self.enc2, self.enc3, self.enc4):
+            h, skip = encoder(h)
+            skips.append(skip)
+
+        h = self.aspp(h)
+
+        # The most recently stored skip pairs with the first decoder.
+        for decoder in (self.dec4, self.dec3, self.dec2, self.dec1):
+            h = decoder(h, skips.pop())
+
+        return h
+
+
+class CascadedASPPNet(nn.Module):
+    """Three-stage cascade of ``BaseASPPNet`` sub-networks that predicts a sigmoid mask."""
+
+    def __init__(self, n_fft):
+        """Build the cascade.
+
+        Args:
+            n_fft: FFT size; ``n_fft // 2`` rows of dim 2 are processed and the
+                mask is padded back to ``n_fft // 2 + 1`` rows.
+        """
+        super().__init__()
+        # Stage 1: one network per half of dim 2.
+        self.stg1_low_band_net = BaseASPPNet(2, 64)
+        self.stg1_high_band_net = BaseASPPNet(2, 64)
+
+        # Stage 2: bridge convolution followed by a full-band network.
+        self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
+        self.stg2_full_band_net = BaseASPPNet(32, 64)
+
+        # Stage 3: bridge convolution followed by a wider full-band network.
+        self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
+        self.stg3_full_band_net = BaseASPPNet(64, 128)
+
+        # 1x1 convolutions that map features to two mask channels.
+        self.out = nn.Conv2d(128, 2, 1, bias=False)
+        self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
+        self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)
+
+        self.max_bin = n_fft // 2
+        self.output_bin = n_fft // 2 + 1
+
+        # Entries removed from each end of dim 3 by ``predict``.
+        self.offset = 128
+
+    def _pad_bins(self, t):
+        """Repeat the last row of dim 2 until ``t`` has ``self.output_bin`` rows."""
+        return F.pad(
+            input=t,
+            pad=(0, 0, 0, self.output_bin - t.size()[2]),
+            mode='replicate')
+
+    def forward(self, x, aggressiveness=None):
+        """Run the cascade and multiply the predicted mask with the input.
+
+        Args:
+            x: Input tensor; dim 2 is cropped to ``self.max_bin`` rows (presumably
+                (batch, channel, bin, frame) magnitudes -- TODO confirm).
+            aggressiveness: Optional mapping with keys ``'split_bin'`` and
+                ``'value'``; only used outside training mode.
+
+        Returns:
+            ``(mask * x, aux1 * x, aux2 * x)`` in training mode, else ``mask * x``.
+        """
+        mix = x.detach()
+        cropped = x.clone()[:, :, :self.max_bin]
+
+        half = cropped.size()[2] // 2
+        low = self.stg1_low_band_net(cropped[:, :, :half])
+        high = self.stg1_high_band_net(cropped[:, :, half:])
+        aux1 = torch.cat([low, high], dim=2)
+
+        aux2 = self.stg2_full_band_net(
+            self.stg2_bridge(torch.cat([cropped, aux1], dim=1)))
+
+        feat = self.stg3_full_band_net(
+            self.stg3_bridge(torch.cat([cropped, aux1, aux2], dim=1)))
+
+        mask = self._pad_bins(torch.sigmoid(self.out(feat)))
+
+        if self.training:
+            aux1 = self._pad_bins(torch.sigmoid(self.aux1_out(aux1)))
+            aux2 = self._pad_bins(torch.sigmoid(self.aux2_out(aux2)))
+            return mask * mix, aux1 * mix, aux2 * mix
+
+        if aggressiveness:
+            split = aggressiveness['split_bin']
+            value = aggressiveness['value']
+            # In-place power on the mask; rows below ``split`` get a third of the extra exponent.
+            mask[:, :, :split] = torch.pow(mask[:, :, :split], 1 + value / 3)
+            mask[:, :, split:] = torch.pow(mask[:, :, split:], 1 + value)
+
+        return mask * mix
+
+    def predict(self, x_mag, aggressiveness=None):
+        """Run ``forward`` and strip ``self.offset`` entries from both ends of dim 3."""
+        out = self.forward(x_mag, aggressiveness)
+
+        if self.offset <= 0:
+            return out
+
+        trimmed = out[:, :, :, self.offset:-self.offset]
+        assert trimmed.size()[3] > 0
+
+        return trimmed
--- a/uvr5_pack/lib_v5/nets_61968KB.py
+++ b/uvr5_pack/lib_v5/nets_61968KB.py
@ -0,0 +1,112 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from uvr5_pack.lib_v5 import layers_123821KB as layers
+
+
+class BaseASPPNet(nn.Module):
+    """Encoder / ASPP / decoder network with one skip connection per encoder level."""
+
+    def __init__(self, nin, ch, dilations=(4, 8, 16)):
+        """Create the four encoders, the ASPP module and the four decoders.
+
+        Args:
+            nin: Channel count passed to the first encoder.
+            ch: Base width that the later layers scale by 2, 4, 8 and 16.
+            dilations: Dilation rates forwarded to ``layers.ASPPModule``.
+        """
+        super().__init__()
+        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
+        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
+        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
+        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
+
+        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
+
+        # Decoder input widths add the matching encoder skip to the incoming features.
+        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
+        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
+        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
+        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
+
+    def __call__(self, x):
+        """Apply encoders 1-4, the ASPP block, then decoders 4-1 with their skips."""
+        encoders = (self.enc1, self.enc2, self.enc3, self.enc4)
+        decoders = (self.dec4, self.dec3, self.dec2, self.dec1)
+
+        skips = []
+        h = x
+        for encoder in encoders:
+            h, skip = encoder(h)
+            skips.append(skip)
+
+        h = self.aspp(h)
+
+        # Decoders consume the stored skips last-in, first-out.
+        for decoder, skip in zip(decoders, reversed(skips)):
+            h = decoder(h, skip)
+
+        return h
+
+
+class CascadedASPPNet(nn.Module):
+    """Three-stage cascade of ``BaseASPPNet`` sub-networks that predicts a sigmoid mask."""
+
+    def __init__(self, n_fft):
+        """Build the cascade.
+
+        Args:
+            n_fft: FFT size; ``n_fft // 2`` rows of dim 2 are processed and the
+                mask is padded back to ``n_fft // 2 + 1`` rows.
+        """
+        super().__init__()
+        # Stage 1: one network per half of dim 2.
+        self.stg1_low_band_net = BaseASPPNet(2, 32)
+        self.stg1_high_band_net = BaseASPPNet(2, 32)
+
+        # Stage 2: bridge convolution followed by a full-band network.
+        self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
+        self.stg2_full_band_net = BaseASPPNet(16, 32)
+
+        # Stage 3: bridge convolution followed by a wider full-band network.
+        self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
+        self.stg3_full_band_net = BaseASPPNet(32, 64)
+
+        # 1x1 convolutions that map features to two mask channels.
+        self.out = nn.Conv2d(64, 2, 1, bias=False)
+        self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
+        self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
+
+        self.max_bin = n_fft // 2
+        self.output_bin = n_fft // 2 + 1
+
+        # Entries removed from each end of dim 3 by ``predict``.
+        self.offset = 128
+
+    def _pad_bins(self, t):
+        """Repeat the last row of dim 2 until ``t`` has ``self.output_bin`` rows."""
+        return F.pad(
+            input=t,
+            pad=(0, 0, 0, self.output_bin - t.size()[2]),
+            mode='replicate')
+
+    def forward(self, x, aggressiveness=None):
+        """Run the cascade and multiply the predicted mask with the input.
+
+        Args:
+            x: Input tensor; dim 2 is cropped to ``self.max_bin`` rows (presumably
+                (batch, channel, bin, frame) magnitudes -- TODO confirm).
+            aggressiveness: Optional mapping with keys ``'split_bin'`` and
+                ``'value'``; only used outside training mode.
+
+        Returns:
+            ``(mask * x, aux1 * x, aux2 * x)`` in training mode, else ``mask * x``.
+        """
+        mix = x.detach()
+        cropped = x.clone()[:, :, :self.max_bin]
+
+        half = cropped.size()[2] // 2
+        low = self.stg1_low_band_net(cropped[:, :, :half])
+        high = self.stg1_high_band_net(cropped[:, :, half:])
+        aux1 = torch.cat([low, high], dim=2)
+
+        aux2 = self.stg2_full_band_net(
+            self.stg2_bridge(torch.cat([cropped, aux1], dim=1)))
+
+        feat = self.stg3_full_band_net(
+            self.stg3_bridge(torch.cat([cropped, aux1, aux2], dim=1)))
+
+        mask = self._pad_bins(torch.sigmoid(self.out(feat)))
+
+        if self.training:
+            aux1 = self._pad_bins(torch.sigmoid(self.aux1_out(aux1)))
+            aux2 = self._pad_bins(torch.sigmoid(self.aux2_out(aux2)))
+            return mask * mix, aux1 * mix, aux2 * mix
+
+        if aggressiveness:
+            split = aggressiveness['split_bin']
+            value = aggressiveness['value']
+            # In-place power on the mask; rows below ``split`` get a third of the extra exponent.
+            mask[:, :, :split] = torch.pow(mask[:, :, :split], 1 + value / 3)
+            mask[:, :, split:] = torch.pow(mask[:, :, split:], 1 + value)
+
+        return mask * mix
+
+    def predict(self, x_mag, aggressiveness=None):
+        """Run ``forward`` and strip ``self.offset`` entries from both ends of dim 3."""
+        out = self.forward(x_mag, aggressiveness)
+
+        if self.offset <= 0:
+            return out
+
+        trimmed = out[:, :, :, self.offset:-self.offset]
+        assert trimmed.size()[3] > 0
+
+        return trimmed
--- a/uvr5_pack/lib_v5/spec_utils.py
+++ b/uvr5_pack/lib_v5/spec_utils.py
@ -0,0 +1,485 @@
+import os,librosa
+import numpy  as  np
+import soundfile  as  sf
+from tqdm import tqdm
+import json,math ,hashlib
+
+def crop_center(h1, h2):
+    """Trim ``h1`` symmetrically along dim 3 so it is as long as ``h2`` there.
+
+    Args:
+        h1: Tensor to crop; must be at least as long as ``h2`` along dim 3.
+        h2: Tensor whose dim-3 length is the target.
+
+    Returns:
+        ``h1`` itself when the lengths already match, otherwise the centred slice.
+
+    Raises:
+        ValueError: If ``h1`` is shorter than ``h2`` along dim 3.
+    """
+    width_1 = h1.size()[3]
+    width_2 = h2.size()[3]
+
+    if width_1 < width_2:
+        raise ValueError('h1_shape[3] must be greater than h2_shape[3]')
+    if width_1 == width_2:
+        return h1
+
+    # Any odd leftover entry is dropped from the right-hand side.
+    begin = (width_1 - width_2) // 2
+
+    return h1[:, :, :, begin:begin + width_2]
+
+
+def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
+    """Compute the STFT of both channels of ``wave``.
+
+    Exactly one channel transform is applied, checked in this order:
+    ``reverse`` (each channel flipped), ``mid_side`` (``(w0 + w1) / 2`` and
+    ``w0 - w1``), ``mid_side_b2`` (``w1 + 0.5 * w0`` and ``w0 - 0.5 * w1``),
+    otherwise the channels are used unchanged.
+
+    Args:
+        wave: Two-channel signal indexed as ``wave[0]`` and ``wave[1]``.
+        hop_length: Hop length passed to ``librosa.stft``.
+        n_fft: FFT size passed to ``librosa.stft``.
+        mid_side: Select the mid/side transform.
+        mid_side_b2: Select the alternative mid/side transform.
+        reverse: Flip each channel before the transform.
+
+    Returns:
+        Fortran-ordered array stacking the two channel spectrograms.
+    """
+    if reverse:
+        wave_left = np.flip(np.asfortranarray(wave[0]))
+        wave_right = np.flip(np.asfortranarray(wave[1]))
+    elif mid_side:
+        wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
+        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
+    elif mid_side_b2:
+        wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5))
+        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5))
+    else:
+        wave_left = np.asfortranarray(wave[0])
+        wave_right = np.asfortranarray(wave[1])
+
+    # n_fft is passed by keyword: newer librosa releases reject it positionally.
+    spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length)
+    spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)
+
+    return np.asfortranarray([spec_left, spec_right])
+   
+   
+def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
+    """Threaded variant of ``wave_to_spectrogram``.
+
+    The first channel's STFT runs on a worker thread while the second runs on
+    the calling thread. The channel transforms are the same as in
+    ``wave_to_spectrogram`` (``reverse``, then ``mid_side``, then
+    ``mid_side_b2``, else unchanged).
+
+    Args:
+        wave: Two-channel signal indexed as ``wave[0]`` and ``wave[1]``.
+        hop_length: Hop length passed to ``librosa.stft``.
+        n_fft: FFT size passed to ``librosa.stft``.
+        mid_side: Select the mid/side transform.
+        mid_side_b2: Select the alternative mid/side transform.
+        reverse: Flip each channel before the transform.
+
+    Returns:
+        Fortran-ordered array stacking the two channel spectrograms.
+
+    Raises:
+        Exception: Whatever ``librosa.stft`` raised on the worker thread is
+            re-raised on the calling thread.
+    """
+    import threading
+
+    if reverse:
+        wave_left = np.flip(np.asfortranarray(wave[0]))
+        wave_right = np.flip(np.asfortranarray(wave[1]))
+    elif mid_side:
+        wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
+        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
+    elif mid_side_b2:
+        wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5))
+        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5))
+    else:
+        wave_left = np.asfortranarray(wave[0])
+        wave_right = np.asfortranarray(wave[1])
+
+    # Per-call result holder: a module-level global would be shared between
+    # concurrent calls and could leak a stale value if the worker failed.
+    outcome = {}
+
+    def run_thread(**kwargs):
+        try:
+            outcome['spec'] = librosa.stft(**kwargs)
+        except Exception as exc:  # handed back to the caller below
+            outcome['error'] = exc
+
+    thread = threading.Thread(target=run_thread, kwargs={'y': wave_left, 'n_fft': n_fft, 'hop_length': hop_length})
+    thread.start()
+    try:
+        spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)
+    finally:
+        thread.join()
+
+    if 'error' in outcome:
+        raise outcome['error']
+    spec_left = outcome['spec']
+
+    return np.asfortranarray([spec_left, spec_right])
+    
+    
+def combine_spectrograms(specs, mp):
+    """Stack the cropped rows of every band spectrogram into one array.
+
+    Args:
+        specs: Mapping from band number (1..N) to an array indexed as
+            (channel, bin, frame).
+        mp: Object whose ``param`` dict provides ``'bins'``, ``'band'``
+            (per-band ``'crop_start'`` / ``'crop_stop'``), ``'pre_filter_start'``
+            and ``'pre_filter_stop'``.
+
+    Returns:
+        Fortran-ordered complex64 array of shape ``(2, bins + 1, frames)``, where
+        ``frames`` is the shortest frame count among the inputs.
+
+    Raises:
+        ValueError: If the cropped bands add up to more than ``bins`` rows.
+    """
+    n_frames = min([specs[key].shape[2] for key in specs])
+    params = mp.param
+    spec_c = np.zeros(shape=(2, params['bins'] + 1, n_frames), dtype=np.complex64)
+    bands = params['band']
+    bands_n = len(bands)
+
+    row = 0
+    for d in range(1, bands_n + 1):
+        start, stop = bands[d]['crop_start'], bands[d]['crop_stop']
+        height = stop - start
+        spec_c[:, row:row + height, :n_frames] = specs[d][:, start:stop, :n_frames]
+        row += height
+
+    if row > params['bins']:
+        raise ValueError('Too much bins')
+
+    # Low-pass filter
+    lp_start = params['pre_filter_start']
+    if lp_start > 0:
+        if bands_n == 1:
+            spec_c = fft_lp_filter(spec_c, lp_start, params['pre_filter_stop'])
+        else:
+            # Each row's gain depends on the gain applied to the previous row.
+            prev_gain = 1
+            for b in range(lp_start + 1, params['pre_filter_stop']):
+                gain = math.pow(10, -(b - lp_start) * (3.5 - prev_gain) / 20.0)
+                prev_gain = gain
+                spec_c[:, b, :] *= gain
+
+    return np.asfortranarray(spec_c)
+    
+
+def spectrogram_to_image(spec, mode='magnitude'):
+    """Convert a spectrogram to an 8-bit image array.
+
+    Args:
+        spec: Real or complex array. A 3-D input is treated as channel-first.
+        mode: ``'magnitude'`` renders ``log10(|spec| ** 2 + 1e-8)``;
+            ``'phase'`` renders ``angle(spec)`` (a real input is used as given).
+
+    Returns:
+        ``uint8`` array scaled so the smallest value maps to 0 and the largest
+        to 255. For a 3-D input the result is channel-last, with the per-pixel
+        channel maximum prepended as an extra channel.
+
+    Raises:
+        ValueError: If ``mode`` is neither ``'magnitude'`` nor ``'phase'``.
+    """
+    if mode == 'magnitude':
+        y = np.abs(spec) if np.iscomplexobj(spec) else spec
+        y = np.log10(y ** 2 + 1e-8)
+    elif mode == 'phase':
+        y = np.angle(spec) if np.iscomplexobj(spec) else np.asarray(spec)
+    else:
+        raise ValueError("mode must be 'magnitude' or 'phase', got {!r}".format(mode))
+
+    # Out-of-place arithmetic: a real 'phase' input is the caller's own array
+    # and must not be modified.
+    y = y - y.min()
+    peak = y.max()
+    if peak > 0:
+        # A constant image has peak 0; skip the division and render it black.
+        y = y * (255 / peak)
+    img = np.uint8(y)
+
+    if y.ndim == 3:
+        img = img.transpose(1, 2, 0)
+        img = np.concatenate([
+            np.max(img, axis=2, keepdims=True), img
+        ], axis=2)
+
+    return img
+
+
+def reduce_vocal_aggressively(X, y, softmask):
+    """Lower the magnitude of ``y`` wherever the residual ``X - y`` is louder than ``y``.
+
+    Args:
+        X: Complex spectrogram (the mixture, judging by the caller's usage).
+        y: Complex spectrogram to attenuate; its phase is preserved.
+        softmask: Fraction of the residual magnitude subtracted from ``|y|``.
+
+    Returns:
+        Complex array with the reduced magnitude (clipped at zero) and the phase of ``y``.
+    """
+    y_mag = np.abs(y)
+    residual_mag = np.abs(X - y)
+
+    # Only positions where the residual dominates are attenuated.
+    dominated = residual_mag > y_mag
+    reduced = np.clip(y_mag - residual_mag * dominated * softmask, 0, np.inf)
+
+    return reduced * np.exp(1.j * np.angle(y))
+
+
+def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
+    """Add ``ref`` back into ``mag`` over long runs of quiet frames.
+
+    A frame (index along axis 2) is quiet when the mean of ``ref`` over axes 0
+    and 1 is below ``thres``. Consecutive quiet frames form a run; runs longer
+    than ``min_range`` receive ``ref`` added to ``mag``, with linear fades of
+    ``fade_size`` frames at the run edges.
+
+    Args:
+        mag: 3-D array to fill in; it is copied, not modified.
+        ref: 3-D array of the same shape supplying the added values.
+        thres: Mean level below which a frame counts as quiet.
+        min_range: Minimum run length (in frames) that is processed.
+        fade_size: Length of the fade-in / fade-out ramps.
+
+    Returns:
+        Modified copy of ``mag``.
+
+    Raises:
+        ValueError: If ``min_range < fade_size * 2``.
+    """
+    if min_range < fade_size * 2:
+        raise ValueError('min_range must be >= fade_size * 2')
+
+    mag = mag.copy()
+
+    idx = np.where(ref.mean(axis=(0, 1)) < thres)[0]
+    if idx.size == 0:
+        # No quiet frame at all: nothing to fill in (idx[0] below would fail).
+        return mag
+
+    # Split the quiet indices into runs of consecutive frames.
+    starts = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0])
+    ends = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1])
+    uninformative = np.where(ends - starts > min_range)[0]
+    if len(uninformative) > 0:
+        starts = starts[uninformative]
+        ends = ends[uninformative]
+        old_e = None
+        for s, e in zip(starts, ends):
+            # A run that begins right after the previous one is extended backwards.
+            if old_e is not None and s - old_e < fade_size:
+                s = old_e - fade_size * 2
+
+            if s != 0:
+                weight = np.linspace(0, 1, fade_size)
+                mag[:, :, s:s + fade_size] += weight * ref[:, :, s:s + fade_size]
+            else:
+                # Run starts at frame 0: no fade-in, shift so the body starts at 0.
+                s -= fade_size
+
+            # NOTE(review): e is the last index of a run, so it looks like it can
+            # never equal mag.shape[2] -- confirm whether shape[2] - 1 was intended.
+            if e != mag.shape[2]:
+                weight = np.linspace(1, 0, fade_size)
+                mag[:, :, e - fade_size:e] += weight * ref[:, :, e - fade_size:e]
+            else:
+                e += fade_size
+
+            mag[:, :, s + fade_size:e - fade_size] += ref[:, :, s + fade_size:e - fade_size]
+            old_e = e
+
+    return mag
+    
+
+def align_wave_head_and_tail(a, b):
+    """Truncate two channel-first signals to their common length.
+
+    Args:
+        a: 2-D array indexed as (channel, sample).
+        b: 2-D array indexed as (channel, sample).
+
+    Returns:
+        Tuple ``(a, b)`` with axis 1 cut to the shorter of the two lengths.
+    """
+    l = min([a[0].size, b[0].size])
+
+    # Only the sample axis is cut; slicing axis 0 as well would drop a channel
+    # whenever the common length is below the channel count.
+    return a[:, :l], b[:, :l]
+    
+
+def cache_or_load(mix_path, inst_path, mp):
+    """Return the combined spectrograms of a mix / instrumental pair, cached on disk.
+
+    The cache lives under ``cache/mph<sha1 of mp.param>/<basename>.npy``. When
+    both cache files exist they are loaded; otherwise both audio files are
+    decoded, converted band by band and the results are saved.
+
+    Args:
+        mix_path: Path of the mixture audio file.
+        inst_path: Path of the instrumental audio file.
+        mp: Object whose ``param`` dict describes the bands (``'sr'``,
+            ``'res_type'``, ``'hl'``, ``'n_fft'``) and the channel transform flags.
+
+    Returns:
+        Tuple ``(X_spec_m, y_spec_m)`` of combined spectrograms.
+
+    Raises:
+        ValueError: If the two combined spectrograms differ in shape.
+    """
+    mix_basename = os.path.splitext(os.path.basename(mix_path))[0]
+    inst_basename = os.path.splitext(os.path.basename(inst_path))[0]
+
+    cache_dir = 'mph{}'.format(hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode('utf-8')).hexdigest())
+    # NOTE(review): both files share one directory keyed only by the parameter
+    # hash, so a mix and an instrumental with the same basename map to the same
+    # .npy file -- confirm that callers always use distinct basenames.
+    mix_cache_dir = os.path.join('cache', cache_dir)
+    inst_cache_dir = os.path.join('cache', cache_dir)
+
+    os.makedirs(mix_cache_dir, exist_ok=True)
+    os.makedirs(inst_cache_dir, exist_ok=True)
+
+    mix_cache_path = os.path.join(mix_cache_dir, mix_basename + '.npy')
+    inst_cache_path = os.path.join(inst_cache_dir, inst_basename + '.npy')
+
+    if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path):
+        X_spec_m = np.load(mix_cache_path)
+        y_spec_m = np.load(inst_cache_path)
+    else:
+        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
+        bands = mp.param['band']
+        top_band = len(bands)
+
+        # Walk from the highest band down: lower bands are resampled from the band above.
+        for d in range(top_band, 0, -1):
+            bp = bands[d]
+
+            if d == top_band:  # high-end band
+                # Keyword arguments: newer librosa releases reject sr/mono positionally.
+                X_wave[d], _ = librosa.load(
+                    mix_path, sr=bp['sr'], mono=False, dtype=np.float32, res_type=bp['res_type'])
+                y_wave[d], _ = librosa.load(
+                    inst_path, sr=bp['sr'], mono=False, dtype=np.float32, res_type=bp['res_type'])
+            else:  # lower bands
+                X_wave[d] = librosa.resample(
+                    X_wave[d + 1], orig_sr=bands[d + 1]['sr'], target_sr=bp['sr'], res_type=bp['res_type'])
+                y_wave[d] = librosa.resample(
+                    y_wave[d + 1], orig_sr=bands[d + 1]['sr'], target_sr=bp['sr'], res_type=bp['res_type'])
+
+            X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d])
+
+            X_spec_s[d] = wave_to_spectrogram(X_wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
+            y_spec_s[d] = wave_to_spectrogram(y_wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
+
+        # Free the decoded audio before building the combined arrays.
+        del X_wave, y_wave
+
+        X_spec_m = combine_spectrograms(X_spec_s, mp)
+        y_spec_m = combine_spectrograms(y_spec_s, mp)
+
+        if X_spec_m.shape != y_spec_m.shape:
+            raise ValueError('The combined spectrograms are different: ' + mix_path)
+
+        np.save(mix_cache_path, X_spec_m)
+        np.save(inst_cache_path, y_spec_m)
+
+    return X_spec_m, y_spec_m
+
+
+def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse):
+    """Invert a two-channel spectrogram and undo the channel transform.
+
+    Note the flag order (``mid_side, mid_side_b2, reverse``) differs from
+    ``spectrogram_to_wave_mt``.
+
+    Args:
+        spec: Array indexed as ``spec[0]`` and ``spec[1]`` (one spectrogram each).
+        hop_length: Hop length passed to ``librosa.istft``.
+        mid_side: Recombine as ``a + b / 2`` and ``a - b / 2``.
+        mid_side_b2: Recombine as ``b / 1.25 + 0.4 * a`` and ``a / 1.25 - 0.4 * b``.
+        reverse: Flip both channels; checked before the other two flags.
+
+    Returns:
+        Fortran-ordered array holding the two output channels.
+    """
+    first = np.asfortranarray(spec[0])
+    second = np.asfortranarray(spec[1])
+
+    a = librosa.istft(first, hop_length=hop_length)
+    b = librosa.istft(second, hop_length=hop_length)
+
+    if reverse:
+        channels = [np.flip(a), np.flip(b)]
+    elif mid_side:
+        channels = [np.add(a, b / 2), np.subtract(a, b / 2)]
+    elif mid_side_b2:
+        channels = [np.add(b / 1.25, .4 * a), np.subtract(a / 1.25, .4 * b)]
+    else:
+        channels = [a, b]
+
+    return np.asfortranarray(channels)
+    
+    
+def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
+    """Threaded variant of ``spectrogram_to_wave``.
+
+    The first channel is inverted on a worker thread while the second is
+    inverted on the calling thread. Note the flag order
+    (``mid_side, reverse, mid_side_b2``) differs from ``spectrogram_to_wave``.
+
+    Args:
+        spec: Array indexed as ``spec[0]`` and ``spec[1]`` (one spectrogram each).
+        hop_length: Hop length passed to ``librosa.istft``.
+        mid_side: Recombine as ``a + b / 2`` and ``a - b / 2``.
+        reverse: Flip both channels; checked before the other two flags.
+        mid_side_b2: Recombine as ``b / 1.25 + 0.4 * a`` and ``a / 1.25 - 0.4 * b``.
+
+    Returns:
+        Fortran-ordered array holding the two output channels.
+
+    Raises:
+        Exception: Whatever ``librosa.istft`` raised on the worker thread is
+            re-raised on the calling thread.
+    """
+    import threading
+
+    spec_left = np.asfortranarray(spec[0])
+    spec_right = np.asfortranarray(spec[1])
+
+    # Per-call result holder: a module-level global would be shared between
+    # concurrent calls and could leak a stale value if the worker failed.
+    outcome = {}
+
+    def run_thread(**kwargs):
+        try:
+            outcome['wave'] = librosa.istft(**kwargs)
+        except Exception as exc:  # handed back to the caller below
+            outcome['error'] = exc
+
+    thread = threading.Thread(target=run_thread, kwargs={'stft_matrix': spec_left, 'hop_length': hop_length})
+    thread.start()
+    try:
+        wave_right = librosa.istft(spec_right, hop_length=hop_length)
+    finally:
+        thread.join()
+
+    if 'error' in outcome:
+        raise outcome['error']
+    wave_left = outcome['wave']
+
+    if reverse:
+        return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
+    elif mid_side:
+        return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
+    elif mid_side_b2:
+        return np.asfortranarray([np.add(wave_right / 1.25, .4 * wave_left), np.subtract(wave_left / 1.25, .4 * wave_right)])
+    else:
+        return np.asfortranarray([wave_left, wave_right])
+    
+    
+def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None):
+    """Split a combined spectrogram back into bands and rebuild the waveform.
+
+    Bands are processed from 1 (lowest) to N. Each band's rows are copied back
+    to their ``crop_start:crop_stop`` position, filtered, inverted and added to
+    the running waveform, which is resampled to the next band's rate.
+
+    Args:
+        spec_m: Combined spectrogram indexed as (channel, bin, frame).
+        mp: Object whose ``param`` dict describes the bands and transform flags.
+        extra_bins_h: Optional number of rows from ``extra_bins`` to copy in
+            just below ``n_fft // 2`` of the highest band.
+        extra_bins: Array supplying those rows; required when ``extra_bins_h`` is set.
+
+    Returns:
+        The transposed waveform array (``wave.T``).
+    """
+    bands_n = len(mp.param['band'])
+    offset = 0
+
+    for d in range(1, bands_n + 1):
+        bp = mp.param['band'][d]
+        # Zero-initialised: rows outside the crop range must be silent, not
+        # whatever an uninitialised buffer happens to contain.
+        spec_s = np.zeros(shape=(2, bp['n_fft'] // 2 + 1, spec_m.shape[2]), dtype=complex)
+        h = bp['crop_stop'] - bp['crop_start']
+        spec_s[:, bp['crop_start']:bp['crop_stop'], :] = spec_m[:, offset:offset+h, :]
+
+        offset += h
+        if d == bands_n: # higher
+            if extra_bins_h: # if --high_end_process bypass
+                max_bin = bp['n_fft'] // 2
+                spec_s[:, max_bin-extra_bins_h:max_bin, :] = extra_bins[:, :extra_bins_h, :]
+            if bp['hpf_start'] > 0:
+                spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1)
+            if bands_n == 1:
+                wave = spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
+            else:
+                wave = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']))
+        else:
+            sr = mp.param['band'][d+1]['sr']
+            if d == 1: # lower
+                spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'])
+                # Sample rates by keyword: newer librosa releases reject them positionally.
+                wave = librosa.resample(
+                    spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']),
+                    orig_sr=bp['sr'], target_sr=sr, res_type="sinc_fastest")
+            else: # mid
+                spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1)
+                spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'])
+                wave2 = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']))
+                wave = librosa.resample(wave2, orig_sr=bp['sr'], target_sr=sr, res_type='scipy')
+
+    return wave.T
+
+
+def fft_lp_filter(spec, bin_start, bin_stop):
+    """Apply a linear low-pass ramp to ``spec`` in place.
+
+    Rows ``bin_start`` .. ``bin_stop - 1`` of axis 1 are scaled by a gain that
+    falls linearly towards zero; rows from ``bin_stop`` on are multiplied by zero.
+
+    Args:
+        spec: Array indexed as (channel, bin, frame); modified in place.
+        bin_start: First row of the ramp.
+        bin_stop: First fully muted row.
+
+    Returns:
+        The same ``spec`` object.
+    """
+    width = bin_stop - bin_start
+    gain = 1.0
+    for row in range(bin_start, bin_stop):
+        gain -= 1 / width
+        spec[:, row, :] = gain * spec[:, row, :]
+
+    spec[:, bin_stop:, :] *= 0
+
+    return spec
+
+
+def fft_hp_filter(spec, bin_start, bin_stop):
+    """Apply a linear high-pass ramp to ``spec`` in place.
+
+    Rows are visited downwards from ``bin_start`` to ``bin_stop + 1`` of axis 1
+    and scaled by a gain that falls linearly towards zero; rows ``0`` ..
+    ``bin_stop`` are multiplied by zero.
+
+    Args:
+        spec: Array indexed as (channel, bin, frame); modified in place.
+        bin_start: Highest row of the ramp (expected to be above ``bin_stop``).
+        bin_stop: Highest fully muted row.
+
+    Returns:
+        The same ``spec`` object.
+    """
+    width = bin_start - bin_stop
+    gain = 1.0
+    for row in range(bin_start, bin_stop, -1):
+        gain -= 1 / width
+        spec[:, row, :] = gain * spec[:, row, :]
+
+    spec[:, 0:bin_stop+1, :] *= 0
+
+    return spec
+
+
+def mirroring(a, spec_m, input_high_end, mp):
+    """Build a high-end replacement from flipped lower rows of ``spec_m``.
+
+    The source rows end 10 rows below ``mp.param['pre_filter_start']`` and span
+    as many rows as ``input_high_end`` has along axis 1.
+
+    Args:
+        a: ``'mirroring'`` (flipped magnitudes with the phase of
+            ``input_high_end``) or ``'mirroring2'`` (flipped magnitudes
+            multiplied by ``1.7 * input_high_end``).
+        spec_m: Spectrogram indexed as (channel, bin, frame).
+        input_high_end: Original high-end rows.
+        mp: Object whose ``param`` dict provides ``'pre_filter_start'``.
+
+    Returns:
+        Element-wise, ``input_high_end`` where its magnitude does not exceed the
+        candidate's, else the candidate. ``None`` for any other value of ``a``.
+    """
+    if a not in ('mirroring', 'mirroring2'):
+        return None
+
+    stop = mp.param['pre_filter_start'] - 10
+    start = stop - input_high_end.shape[1]
+    flipped_mag = np.flip(np.abs(spec_m[:, start:stop, :]), 1)
+
+    if a == 'mirroring':
+        candidate = flipped_mag * np.exp(1.j * np.angle(input_high_end))
+    else:
+        candidate = np.multiply(flipped_mag, input_high_end * 1.7)
+
+    return np.where(np.abs(input_high_end) <= np.abs(candidate), input_high_end, candidate)
+
+
+def ensembling(a, specs):
+    """Merge several spectrograms element-wise by magnitude.
+
+    Args:
+        a: ``'min_mag'`` keeps the element with the smaller magnitude,
+            ``'max_mag'`` the larger. Any other value leaves the first
+            spectrogram as it is (only truncated to the common length).
+        specs: Sequence, or dict keyed ``0..n-1``, of arrays indexed as
+            (channel, bin, frame). It is not modified.
+
+    Returns:
+        The merged spectrogram, truncated along axis 2 to the shortest input
+        seen. A single input is returned unchanged.
+
+    Raises:
+        ValueError: If ``specs`` is empty.
+    """
+    if len(specs) == 0:
+        raise ValueError('specs must contain at least one spectrogram')
+
+    # Start from the first entry so that a single input works too.
+    spec = specs[0]
+
+    for i in range(1, len(specs)):
+        ln = min([spec.shape[2], specs[i].shape[2]])
+        spec = spec[:, :, :ln]
+        # Local view: the caller's container keeps its full-length arrays.
+        other = specs[i][:, :, :ln]
+
+        if 'min_mag' == a:
+            spec = np.where(np.abs(other) <= np.abs(spec), other, spec)
+        if 'max_mag' == a:
+            spec = np.where(np.abs(other) >= np.abs(spec), other, spec)
+
+    return spec
+
+def stft(wave, nfft, hl):
+    """Compute the STFT of both channels of ``wave``.
+
+    Args:
+        wave: Two-channel signal indexed as ``wave[0]`` and ``wave[1]``.
+        nfft: FFT size passed to ``librosa.stft``.
+        hl: Hop length passed to ``librosa.stft``.
+
+    Returns:
+        Fortran-ordered array stacking the two channel spectrograms.
+    """
+    wave_left = np.asfortranarray(wave[0])
+    wave_right = np.asfortranarray(wave[1])
+    # n_fft by keyword: newer librosa releases reject it positionally.
+    spec_left = librosa.stft(wave_left, n_fft=nfft, hop_length=hl)
+    spec_right = librosa.stft(wave_right, n_fft=nfft, hop_length=hl)
+    spec = np.asfortranarray([spec_left, spec_right])
+
+    return spec
+
+def istft(spec, hl):
+    """Invert a two-channel spectrogram.
+
+    Args:
+        spec: Array indexed as ``spec[0]`` and ``spec[1]`` (one spectrogram each).
+        hl: Hop length passed to ``librosa.istft``.
+
+    Returns:
+        Fortran-ordered array stacking the two channel waveforms.
+    """
+    spec_left = np.asfortranarray(spec[0])
+    spec_right = np.asfortranarray(spec[1])
+
+    wave_left = librosa.istft(spec_left, hop_length=hl)
+    wave_right = librosa.istft(spec_right, hop_length=hl)
+    wave = np.asfortranarray([wave_left, wave_right])
+
+    # The result was previously built but never returned.
+    return wave
+
+
+if __name__ == "__main__":
+    # Command-line tool: combines two or more audio files in the spectral
+    # domain (inversion, min/max-magnitude ensembling, "deep" extraction) and
+    # optionally launches the external track-alignment script.
+    import cv2
+    import sys
+    import argparse
+    import subprocess
+    from model_param_init import ModelParameters
+
+    p = argparse.ArgumentParser()
+    p.add_argument('--algorithm', '-a', type=str, choices=['invert', 'invert_p', 'min_mag', 'max_mag', 'deep', 'align'], default='min_mag')
+    p.add_argument('--model_params', '-m', type=str, default=os.path.join('modelparams', '1band_sr44100_hl512.json'))
+    p.add_argument('--output_name', '-o', type=str, default='output')
+    p.add_argument('--vocals_only', '-v', action='store_true')
+    p.add_argument('input', nargs='+')
+    args = p.parse_args()
+
+    if args.algorithm.startswith('invert') and len(args.input) != 2:
+        raise ValueError('There should be two input files.')
+
+    if not args.algorithm.startswith('invert') and len(args.input) < 2:
+        raise ValueError('There must be at least two input files.')
+
+    wave, specs = {}, {}
+    mp = ModelParameters(args.model_params)
+    bands = mp.param['band']
+    top_band = len(bands)
+
+    # Build one combined spectrogram per input file.
+    for i, input_path in enumerate(args.input):
+        spec = {}
+
+        for d in range(top_band, 0, -1):
+            bp = bands[d]
+
+            if d == top_band: # high-end band
+                # Keyword arguments: newer librosa releases reject sr/mono positionally.
+                wave[d], _ = librosa.load(
+                    input_path, sr=bp['sr'], mono=False, dtype=np.float32, res_type=bp['res_type'])
+
+                if len(wave[d].shape) == 1: # mono to stereo
+                    wave[d] = np.array([wave[d], wave[d]])
+            else: # lower bands
+                wave[d] = librosa.resample(
+                    wave[d+1], orig_sr=bands[d+1]['sr'], target_sr=bp['sr'], res_type=bp['res_type'])
+
+            spec[d] = wave_to_spectrogram(wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
+
+        specs[i] = combine_spectrograms(spec, mp)
+
+    del wave
+
+    if args.algorithm == 'deep':
+        # Compare the two combined spectrograms (specs), not a per-band entry
+        # of the last input, and cut both to their common frame count first.
+        ln = min([specs[0].shape[2], specs[1].shape[2]])
+        first = specs[0][:,:,:ln]
+        second = specs[1][:,:,:ln]
+        d_spec = np.where(np.abs(first) <= np.abs(second), first, second)
+        v_spec = d_spec - second
+        sf.write(os.path.join('{}.wav'.format(args.output_name)), cmb_spectrogram_to_wave(v_spec, mp), mp.param['sr'])
+
+    if args.algorithm.startswith('invert'):
+        ln = min([specs[0].shape[2], specs[1].shape[2]])
+        specs[0] = specs[0][:,:,:ln]
+        specs[1] = specs[1][:,:,:ln]
+
+        if 'invert_p' == args.algorithm:
+            X_mag = np.abs(specs[0])
+            y_mag = np.abs(specs[1])
+            max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
+            v_spec = specs[1] - max_mag * np.exp(1.j * np.angle(specs[0]))
+        else:
+            specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2)
+            v_spec = specs[0] - specs[1]
+
+            if not args.vocals_only:
+                X_mag = np.abs(specs[0])
+                y_mag = np.abs(specs[1])
+                v_mag = np.abs(v_spec)
+
+                X_image = spectrogram_to_image(X_mag)
+                y_image = spectrogram_to_image(y_mag)
+                v_image = spectrogram_to_image(v_mag)
+
+                cv2.imwrite('{}_X.png'.format(args.output_name), X_image)
+                cv2.imwrite('{}_y.png'.format(args.output_name), y_image)
+                cv2.imwrite('{}_v.png'.format(args.output_name), v_image)
+
+                sf.write('{}_X.wav'.format(args.output_name), cmb_spectrogram_to_wave(specs[0], mp), mp.param['sr'])
+                sf.write('{}_y.wav'.format(args.output_name), cmb_spectrogram_to_wave(specs[1], mp), mp.param['sr'])
+
+        sf.write('{}_v.wav'.format(args.output_name), cmb_spectrogram_to_wave(v_spec, mp), mp.param['sr'])
+    else:
+        if not args.algorithm == 'deep':
+            # The output directory is not guaranteed to exist.
+            os.makedirs('ensembled', exist_ok=True)
+            sf.write(os.path.join('ensembled','{}.wav'.format(args.output_name)), cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp), mp.param['sr'])
+
+    if args.algorithm == 'align':
+
+        trackalignment = [
+            {
+                'file1': args.input[0],
+                'file2': args.input[1]
+            }
+        ]
+
+        for i,e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."):
+            # Argument list instead of a shell string: file names containing
+            # quotes or shell metacharacters are passed through verbatim.
+            # The current interpreter is used to run the helper script.
+            subprocess.run([sys.executable, 'lib/align_tracks.py', e['file1'], e['file2']])
--- a/uvr5_pack/utils.py
+++ b/uvr5_pack/utils.py
@ -0,0 +1,242 @@
+import torch
+import numpy as np
+from tqdm import tqdm
+
+def make_padding(width, cropsize, offset):
+    """Work out the padding used to tile `width` frames with crops.
+
+    The region of interest is the crop minus `offset` frames on each side
+    (`cropsize - 2 * offset`); if that comes out as zero, the full
+    `cropsize` is used instead.
+
+    Returns:
+        (left, right, roi_size): `left` equals `offset`; `right` is what
+        is missing to reach the next multiple of `roi_size` (a whole extra
+        region when `width` is already a multiple) plus `offset`.
+    """
+    usable = cropsize - 2 * offset
+    roi_size = usable if usable != 0 else cropsize
+    remainder = width % roi_size
+    pad_right = offset + (roi_size - remainder)
+
+    return offset, pad_right, roi_size
+def inference(X_spec, device, model, aggressiveness, data):
+    '''
+    Run `model.predict` over a spectrogram in fixed-size windows.
+
+    X_spec: 3-D array; its magnitude (`np.abs`) and phase (`np.angle`) are
+        taken, and only the last axis (frames) is padded and windowed.
+    device: target passed to `Tensor.to` for every window.
+    model: object providing `eval()`, `state_dict()`, `offset` and
+        `predict(window, aggressiveness)`.
+    aggressiveness: forwarded to `model.predict` unchanged.
+    data: config dict; `data['window_size']` is the number of frames per
+        window, and a truthy `data['tta']` enables a second pass shifted
+        by half a region that is averaged with the first.
+
+    Returns a tuple `(pred, X_mag, phase)`: the prediction rescaled by the
+    peak input magnitude and cut to the input frame count, the input
+    magnitude, and `exp(1j * angle(X_spec))`.
+    '''
+
+    def _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True):
+        # Predict each window and join the results along the frame axis.
+        model.eval()
+        with torch.no_grad():
+            preds = []
+            for i in tqdm(range(n_window)):
+                start = i * roi_size
+                # Leading None adds the batch axis for the model.
+                X_mag_window = X_mag_pad[None, :, :, start:start + data['window_size']]
+                X_mag_window = torch.from_numpy(X_mag_window)
+                if is_half:
+                    X_mag_window = X_mag_window.half()
+                X_mag_window = X_mag_window.to(device)
+
+                pred = model.predict(X_mag_window, aggressiveness)
+
+                # Drop the batch axis again before collecting.
+                preds.append(pred.detach().cpu().numpy()[0])
+
+            return np.concatenate(preds, axis=2)
+
+    X_mag = np.abs(X_spec)
+    X_phase = np.angle(X_spec)
+
+    coef = X_mag.max()
+    # An all-zero input has a zero peak; dividing by it would feed NaNs to
+    # the model. Normalise by 1 instead; the final `* coef` still yields 0.
+    scale = coef if coef > 0 else 1.0
+    X_mag_pre = X_mag / scale
+
+    n_frame = X_mag_pre.shape[2]
+    pad_l, pad_r, roi_size = make_padding(n_frame, data['window_size'], model.offset)
+    n_window = int(np.ceil(n_frame / roi_size))
+
+    X_mag_pad = np.pad(
+        X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant')
+
+    # Feed half-precision windows only when the model's first state tensor
+    # is float16; a model without any state is treated as full precision.
+    first_tensor = next(iter(model.state_dict().values()), None)
+    is_half = first_tensor is not None and first_tensor.dtype == torch.float16
+
+    pred = _execute(X_mag_pad, roi_size, n_window,
+                    device, model, aggressiveness, is_half)
+    pred = pred[:, :, :n_frame]
+    phase = np.exp(1.j * X_phase)
+
+    if not data['tta']:
+        return pred * coef, X_mag, phase
+
+    # Second pass: shift the window grid by half a region, then realign
+    # the result with the first pass before averaging.
+    half_roi = roi_size // 2
+    pad_l += half_roi
+    pad_r += half_roi
+    n_window += 1
+
+    X_mag_pad = np.pad(
+        X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant')
+
+    pred_tta = _execute(X_mag_pad, roi_size, n_window,
+                        device, model, aggressiveness, is_half)
+    pred_tta = pred_tta[:, :, half_roi:]
+    pred_tta = pred_tta[:, :, :n_frame]
+
+    return (pred + pred_tta) * 0.5 * coef, X_mag, phase
+            
+
+
+# Directory that holds the model-parameter JSON files.
+_MODEL_PARAMS_DIR = 'uvr5_pack/lib_v5/modelparams/'
+
+# Known model hash -> (returned parameter name, parameter file stem).
+# NOTE: some names carry a '.json' suffix while others do not; they are
+# kept exactly as they were because callers receive that string.
+_PARAMS_BY_HASH = {
+    '47939caf0cfe52a0e81442b85b971dfd': ('4band_44100', '4band_44100'),
+    '4e4ecb9764c50a8c414fee6e10395bbe': ('4band_v2', '4band_v2'),
+    'ca106edd563e034bde0bdec4bb7a4b36': ('4band_v2', '4band_v2'),
+    'e60a1e84803ce4efc0a6551206cc4b71': ('4band_44100', '4band_44100'),
+    'a82f14e75892e55e994376edbf0c8435': ('4band_44100', '4band_44100'),
+    '6dd9eaa6f0420af9f1d403aaafa4cc06': ('4band_v2_sn', '4band_v2_sn'),
+    '08611fb99bd59eaa79ad27c58d137727': ('4band_v2_sn', '4band_v2_sn'),
+    '5c7bbca45a187e81abbbd351606164e5': ('3band_44100_msb2', '3band_44100_msb2'),
+    'd6b2cb685a058a091e5e7098192d3233': ('3band_44100_msb2', '3band_44100_msb2'),
+    'c1b9f38170a7c90e96f027992eb7c62b': ('4band_44100', '4band_44100'),
+    'c3448ec923fa0edf3d03a19e633faa53': ('4band_44100', '4band_44100'),
+    '68aa2c8093d0080704b200d140f59e54': ('3band_44100.json', '3band_44100'),
+    'fdc83be5b798e4bd29fe00fe6600e147': ('3band_44100_mid.json', '3band_44100_mid'),
+    '2ce34bc92fd57f55db16b7a4def3d745': ('3band_44100_mid.json', '3band_44100_mid'),
+    '52fdca89576f06cf4340b74a4730ee5f': ('4band_44100.json', '4band_44100'),
+    '41191165b05d38fc77f072fa9e8e8a30': ('4band_44100.json', '4band_44100'),
+    '89e83b511ad474592689e562d5b1f80e': ('2band_32000.json', '2band_32000'),
+    '0b954da81d453b716b114d6d7c95177f': ('2band_32000.json', '2band_32000'),
+    # v4 models
+    '6a00461c51c2920fd68937d4609ed6c8': ('1band_sr16000_hl512', '1band_sr16000_hl512'),
+    '0ab504864d20f1bd378fe9c81ef37140': ('1band_sr32000_hl512', '1band_sr32000_hl512'),
+    '7dd21065bf91c10f7fccb57d7d83b07f': ('1band_sr32000_hl512', '1band_sr32000_hl512'),
+    '80ab74d65e515caa3622728d2de07d23': ('1band_sr32000_hl512', '1band_sr32000_hl512'),
+    'edc115e7fc523245062200c00caa847f': ('1band_sr33075_hl384', '1band_sr33075_hl384'),
+    '28063e9f6ab5b341c5f6d3c67f2045b7': ('1band_sr33075_hl384', '1band_sr33075_hl384'),
+    'b58090534c52cbc3e9b5104bad666ef2': ('1band_sr44100_hl512', '1band_sr44100_hl512'),
+    '0cdab9947f1b0928705f518f3c78ea8f': ('1band_sr44100_hl512', '1band_sr44100_hl512'),
+    'ae702fed0238afb5346db8356fe25f13': ('1band_sr44100_hl1024', '1band_sr44100_hl1024'),
+}
+
+# Parameter-set names looked for inside the model path (user models).
+# Order matters: every pattern is tried and the LAST one found wins, so a
+# longer name listed after its prefix (e.g. '3band_44100_mid' after
+# '3band_44100') takes precedence.
+_PARAM_NAME_PATTERNS = (
+    # 1 band
+    '1band_sr16000_hl512',
+    '1band_sr32000_hl512',
+    '1band_sr33075_hl384',
+    '1band_sr44100_hl256',
+    '1band_sr44100_hl512',
+    '1band_sr44100_hl1024',
+    # 2 band
+    '2band_44100_lofi',
+    '2band_32000',
+    '2band_48000',
+    # 3 band
+    '3band_44100',
+    '3band_44100_mid',
+    '3band_44100_msb2',
+    # 4 band
+    '4band_44100',
+    '4band_44100_mid',
+    '4band_44100_msb',
+    '4band_44100_msb2',
+    '4band_44100_reverse',
+    '4band_44100_sw',
+    '4band_v2',
+    '4band_v2_sn',
+)
+
+
+def _get_name_params(model_path, model_hash):
+    """Pick the parameter-set name and JSON path for a model.
+
+    The hash table is consulted first; any parameter-set name found as a
+    substring of `model_path` then overrides it, and a path containing
+    'tmodelparam' overrides everything.
+
+    Args:
+        model_path: string searched for parameter-set names.
+        model_hash: hash string compared against the known-model table.
+
+    Returns:
+        (param_name, model_params_path) as two strings.
+
+    Raises:
+        ValueError: if neither the hash nor the path identifies a
+            parameter set (previously an UnboundLocalError at the return).
+    """
+    match = _PARAMS_BY_HASH.get(model_hash)
+
+    # Every pattern is tested so that the last match wins.
+    for pattern in _PARAM_NAME_PATTERNS:
+        if pattern in model_path:
+            match = (pattern, pattern)
+
+    if 'tmodelparam' in model_path:
+        match = ('User Model Param Set', 'tmodelparam')
+
+    if match is None:
+        raise ValueError(
+            'No model parameters known for model {!r} (hash {!r})'.format(
+                model_path, model_hash))
+
+    param_name, file_stem = match
+    return param_name, _MODEL_PARAMS_DIR + file_stem + '.json'
				`@ -0,0 +1 @@`
				`python train_nsf_sim_cache_sid.py -c configs/mi_mix40k_nsf_co256_cs1sid_ms2048.json -m ft-mi`