ultimatevocalremovergui/inference_v5.py

from functools import total_ordering
import pprint
import argparse
import os
import importlib
from statistics import mode

import cv2
import librosa
import math
import numpy as np
import soundfile as sf
from tqdm import tqdm

from lib_v5 import dataset
from lib_v5 import spec_utils
from lib_v5.model_param_init import ModelParameters
import torch

# Command line text parsing and widget manipulation
from collections import defaultdict
import tkinter as tk
import traceback  # Error Message Recent Calls
import time  # Timer

class VocalRemover(object):

    def __init__(self, data, text_widget: tk.Text):
        self.data = data
        self.text_widget = text_widget
        self.models = defaultdict(lambda: None)
        self.devices = defaultdict(lambda: None)
        # self.offset = model.offset


data = {
    # Paths
    'input_paths': None,
    'export_path': None,
    # Processing Options
    'gpu': -1,
    'postprocess': True,
    'tta': True,
    'output_image': True,
    # Models
    'instrumentalModel': None,
    'useModel': None,
    # Constants
    'window_size': 512,
    'agg': 10
}

default_window_size = data['window_size']
default_agg = data['agg']

def update_progress(progress_var, total_files, file_num, step: float = 1):
    """Calculate the progress for the progress widget in the GUI"""
    base = (100 / total_files)
    progress = base * (file_num - 1)
    progress += base * step

    progress_var.set(progress)


def get_baseText(total_files, file_num):
    """Create the base text for the command widget"""
    text = 'File {file_num}/{total_files} '.format(file_num=file_num,
                                                total_files=total_files)
    return text


def determineModelFolderName():
    """
    Determine the name that is used for the folder and appended
    to the back of the music files
    """
    modelFolderName = ''
    if not data['modelFolder']:
        # Model Test Mode not selected
        return modelFolderName

    # -Instrumental-
    if os.path.isfile(data['instrumentalModel']):
        modelFolderName += os.path.splitext(os.path.basename(data['instrumentalModel']))[0]

    if modelFolderName:
        modelFolderName = '/' + modelFolderName

    return modelFolderName

def main(window: tk.Wm, text_widget: tk.Text, button_widget: tk.Button, progress_var: tk.Variable,
         **kwargs: dict):

    global args
    global model_params_d
    global nn_arch_sizes

    nn_arch_sizes = [
        31191, # default
        33966, 123821, 123812, 537238 # custom
    ]

    p = argparse.ArgumentParser()
    p.add_argument('--paramone', type=str, default='lib_v5/modelparams/4band_44100.json')
    p.add_argument('--paramtwo', type=str, default='lib_v5/modelparams/4band_v2.json')
    p.add_argument('--paramthree', type=str, default='lib_v5/modelparams/3band_44100_msb2.json')
    p.add_argument('--paramfour', type=str, default='lib_v5/modelparams/4band_v2_sn.json')
    p.add_argument('--aggressiveness',type=float, default=data['agg']/100)
    p.add_argument('--nn_architecture', type=str, choices= ['auto'] + list('{}KB'.format(s) for s in nn_arch_sizes), default='auto')
    p.add_argument('--high_end_process', type=str, default='mirroring')
    args = p.parse_args()


    def save_files(wav_instrument, wav_vocals):
        """Save output music files"""
        vocal_name = '(Vocals)'
        instrumental_name = '(Instrumental)'
        save_path = os.path.dirname(base_name)

        # Swap names if vocal model

        VModel="Vocal"

        if VModel in model_name:
            # Reverse names
            vocal_name, instrumental_name = instrumental_name, vocal_name

        # Save Temp File
        # For instrumental the instrumental is the temp file
        # and for vocal the instrumental is the temp file due
        # to reversement
        sf.write(f'temp.wav',
                 wav_instrument, mp.param['sr'])

        appendModelFolderName = modelFolderName.replace('/', '_')
        # -Save files-
        # Instrumental
        if instrumental_name is not None:
            instrumental_path = '{save_path}/{file_name}.wav'.format(
                save_path=save_path,
                file_name=f'{os.path.basename(base_name)}_{instrumental_name}{appendModelFolderName}',
            )

            sf.write(instrumental_path,
                     wav_instrument, mp.param['sr'])
        # Vocal
        if vocal_name is not None:
            vocal_path = '{save_path}/{file_name}.wav'.format(
                save_path=save_path,
                file_name=f'{os.path.basename(base_name)}_{vocal_name}{appendModelFolderName}',
            )
            sf.write(vocal_path,
                     wav_vocals, mp.param['sr'])

    data.update(kwargs)

    # Update default settings
    global default_window_size
    global default_agg
    default_window_size = data['window_size']
    default_agg = data['agg']

    stime = time.perf_counter()
    progress_var.set(0)
    text_widget.clear()
    button_widget.configure(state=tk.DISABLED)  # Disable Button

    vocal_remover = VocalRemover(data, text_widget)
    modelFolderName = determineModelFolderName()
    if modelFolderName:
        folder_path = f'{data["export_path"]}{modelFolderName}'
        if not os.path.isdir(folder_path):
            os.mkdir(folder_path)

    # Separation Preperation
    try:    #Load File(s)
                for file_num, music_file in enumerate(data['input_paths'], start=1):
                        # Determine File Name
                        base_name = f'{data["export_path"]}{modelFolderName}/{file_num}_{os.path.splitext(os.path.basename(music_file))[0]}'

                        model_name = os.path.basename(data[f'{data["useModel"]}Model'])
                        model = vocal_remover.models[data['useModel']]
                        device = vocal_remover.devices[data['useModel']]
                        # -Get text and update progress-
                        base_text = get_baseText(total_files=len(data['input_paths']),
                                                    file_num=file_num)
                        progress_kwargs = {'progress_var': progress_var,
                                        'total_files': len(data['input_paths']),
                                        'file_num': file_num}
                        update_progress(**progress_kwargs,
                                        step=0)


                #Load Model(s)
                        text_widget.write(base_text + 'Loading models...')


                        if 'auto' == args.nn_architecture:
                            model_size = math.ceil(os.stat(data['instrumentalModel']).st_size / 1024)
                            args.nn_architecture = '{}KB'.format(min(nn_arch_sizes, key=lambda x:abs(x-model_size)))

                        nets = importlib.import_module('lib_v5.nets' + f'_{args.nn_architecture}'.replace('_{}KB'.format(nn_arch_sizes[0]), ''), package=None)

                        ModelName=(data['instrumentalModel'])

                        ModelParam1="4BAND_44100"
                        ModelParam2="4BAND_44100_B"
                        ModelParam3="MSB2"
                        ModelParam4="4BAND_44100_SN"

                        if ModelParam1 in ModelName:
                            model_params_d=args.paramone
                        if ModelParam2 in ModelName:
                            model_params_d=args.paramtwo
                        if ModelParam3 in ModelName:
                            model_params_d=args.paramthree
                        if ModelParam4 in ModelName:
                            model_params_d=args.paramfour

                        print('Model Parameters:', model_params_d)

                        mp = ModelParameters(model_params_d)

                        # -Instrumental-
                        if os.path.isfile(data['instrumentalModel']):
                            device = torch.device('cpu')
                            model = nets.CascadedASPPNet(mp.param['bins'] * 2)
                            model.load_state_dict(torch.load(data['instrumentalModel'],
                                                            map_location=device))
                            if torch.cuda.is_available() and data['gpu'] >= 0:
                                device = torch.device('cuda:{}'.format(data['gpu']))
                                model.to(device)

                            vocal_remover.models['instrumental'] = model
                            vocal_remover.devices['instrumental'] = device

                        text_widget.write(' Done!\n')

                        model_name = os.path.basename(data[f'{data["useModel"]}Model'])

                        mp = ModelParameters(model_params_d)

                        # -Go through the different steps of seperation-
                        # Wave source
                        text_widget.write(base_text + 'Loading wave source...')

                        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}

                        bands_n = len(mp.param['band'])

                        for d in range(bands_n, 0, -1):
                            bp = mp.param['band'][d]

                            if d == bands_n: # high-end band
                                X_wave[d], _ = librosa.load(
                                    music_file, bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])

                                if X_wave[d].ndim == 1:
                                    X_wave[d] = np.asarray([X_wave[d], X_wave[d]])
                            else: # lower bands
                                X_wave[d] = librosa.resample(X_wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])

                            # Stft of wave source

                            X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(X_wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'],
                                                                            mp.param['mid_side_b2'], mp.param['reverse'])

                            if d == bands_n and args.high_end_process != 'none':
                                input_high_end_h = (bp['n_fft']//2 - bp['crop_stop']) + (mp.param['pre_filter_stop'] - mp.param['pre_filter_start'])
                                input_high_end = X_spec_s[d][:, bp['n_fft']//2-input_high_end_h:bp['n_fft']//2, :]

                        text_widget.write('Done!\n')

                        update_progress(**progress_kwargs,
                                        step=0.1)

                        text_widget.write(base_text + 'Stft of wave source...')

                        text_widget.write(' Done!\n')

                        text_widget.write(base_text + "Please Wait...\n")

                        X_spec_m = spec_utils.combine_spectrograms(X_spec_s, mp)

                        del X_wave, X_spec_s

                        def inference(X_spec, device, model, aggressiveness):

                            def _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness):
                                model.eval()

                                with torch.no_grad():
                                    preds = []

                                    iterations = [n_window]

                                    total_iterations = sum(iterations)

                                    text_widget.write(base_text + "Processing "f"{total_iterations} Slices... ")

                                    for i in tqdm(range(n_window)):
                                        update_progress(**progress_kwargs,
                                            step=(0.1 + (0.8/n_window * i)))
                                        start = i * roi_size
                                        X_mag_window = X_mag_pad[None, :, :, start:start + data['window_size']]
                                        X_mag_window = torch.from_numpy(X_mag_window).to(device)

                                        pred = model.predict(X_mag_window, aggressiveness)

                                        pred = pred.detach().cpu().numpy()
                                        preds.append(pred[0])

                                    pred = np.concatenate(preds, axis=2)
                                    text_widget.write('Done!\n')
                                return pred

                            def preprocess(X_spec):
                                X_mag = np.abs(X_spec)
                                X_phase = np.angle(X_spec)

                                return X_mag, X_phase

                            X_mag, X_phase = preprocess(X_spec)

                            coef = X_mag.max()
                            X_mag_pre = X_mag / coef

                            n_frame = X_mag_pre.shape[2]
                            pad_l, pad_r, roi_size = dataset.make_padding(n_frame,
                                                                        data['window_size'], model.offset)
                            n_window = int(np.ceil(n_frame / roi_size))

                            X_mag_pad = np.pad(
                                X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant')

                            pred = _execute(X_mag_pad, roi_size, n_window,
                                                device, model, aggressiveness)
                            pred = pred[:, :, :n_frame]

                            if data['tta']:
                                pad_l += roi_size // 2
                                pad_r += roi_size // 2
                                n_window += 1

                                X_mag_pad = np.pad(
                                    X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant')

                                pred_tta = _execute(X_mag_pad, roi_size, n_window,
                                                        device, model, aggressiveness)
                                pred_tta = pred_tta[:, :, roi_size // 2:]
                                pred_tta = pred_tta[:, :, :n_frame]

                                return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.j * X_phase)
                            else:
                                return pred * coef, X_mag, np.exp(1.j * X_phase)


                        aggressiveness = {'value': args.aggressiveness, 'split_bin': mp.param['band'][1]['crop_stop']}


                        if data['tta']:
                            text_widget.write(base_text + "Running Inferences (TTA)...\n")
                        else:
                            text_widget.write(base_text + "Running Inference...\n")

                        pred, X_mag, X_phase = inference(X_spec_m,
                                                                device,
                                                                model, aggressiveness)

                        update_progress(**progress_kwargs,
                                        step=0.9)
                        # Postprocess
                        if data['postprocess']:
                            text_widget.write(base_text + 'Post processing...')
                            pred_inv = np.clip(X_mag - pred, 0, np.inf)
                            pred = spec_utils.mask_silence(pred, pred_inv)
                            text_widget.write(' Done!\n')

                            update_progress(**progress_kwargs,
                                            step=0.95)

                        # Inverse stft
                        text_widget.write(base_text + 'Inverse stft of instruments and vocals...')  # nopep8
                        y_spec_m = pred * X_phase
                        v_spec_m = X_spec_m - y_spec_m

                        if args.high_end_process.startswith('mirroring'):
                            input_high_end_ = spec_utils.mirroring(args.high_end_process, y_spec_m, input_high_end, mp)

                            wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, mp, input_high_end_h, input_high_end_)
                        else:
                            wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, mp)

                        if args.high_end_process.startswith('mirroring'):
                            input_high_end_ = spec_utils.mirroring(args.high_end_process, v_spec_m, input_high_end, mp)

                            wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, mp, input_high_end_h, input_high_end_)
                        else:
                            wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, mp)

                        text_widget.write('Done!\n')

                        update_progress(**progress_kwargs,
                                        step=1)
                        # Save output music files
                        text_widget.write(base_text + 'Saving Files...')
                        save_files(wav_instrument, wav_vocals)
                        text_widget.write(' Done!\n')

                        update_progress(**progress_kwargs,
                                        step=1)

                        # Save output image
                        if data['output_image']:
                            with open('{}_Instruments.jpg'.format(base_name), mode='wb') as f:
                                image = spec_utils.spectrogram_to_image(y_spec_m)
                                _, bin_image = cv2.imencode('.jpg', image)
                                bin_image.tofile(f)
                            with open('{}_Vocals.jpg'.format(base_name), mode='wb') as f:
                                image = spec_utils.spectrogram_to_image(v_spec_m)
                                _, bin_image = cv2.imencode('.jpg', image)
                                bin_image.tofile(f)

                        text_widget.write(base_text + 'Completed Seperation!\n\n')
    except Exception as e:
        traceback_text = ''.join(traceback.format_tb(e.__traceback__))
        message = f'Traceback Error: "{traceback_text}"\n{type(e).__name__}: "{e}"\nFile: {music_file}\nPlease contact the creator and attach a screenshot of this error with the file and settings that caused it!'
        tk.messagebox.showerror(master=window,
                                title='Untracked Error',
                                message=message)
        print(traceback_text)
        print(type(e).__name__, e)
        print(message)
        progress_var.set(0)
        button_widget.configure(state=tk.NORMAL)  # Enable Button
        return

    os.remove('temp.wav')

    progress_var.set(0)
    text_widget.write(f'\nConversion(s) Completed!\n')
    text_widget.write(f'Time Elapsed: {time.strftime("%H:%M:%S", time.gmtime(int(time.perf_counter() - stime)))}')  # nopep8
    torch.cuda.empty_cache()
    button_widget.configure(state=tk.NORMAL)  # Enable Button