File size: 5,130 Bytes
6af7d00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# reference: https://huggingface.co/spaces/r3gm/Audio_separator
import gradio as gr
import shutil
import numpy as np
from pathlib import Path
import os
import time
import torch
from huggingface_hub import hf_hub_download
from uvr_processing import process_uvr_task, run_mdx, get_model_params
from utils import convert_to_stereo_and_wav
import onnxruntime as ort
import io


# Hugging Face Hub repo that hosts the pretrained MDX-Net ONNX checkpoints.
MODEL_ID = "masszhou/mdxnet"
# Resolved at import time: each hf_hub_download() call fetches the checkpoint
# (or reuses the local HF cache) and returns its on-disk path.
# Maps stem role -> local ONNX model path.
MODELS_PATH = {
    "bgm": Path(hf_hub_download(repo_id=MODEL_ID, filename="UVR-MDX-NET-Inst_HQ_3.onnx")), 
    "basic_vocal": Path(hf_hub_download(repo_id=MODEL_ID, filename="UVR-MDX-NET-Voc_FT.onnx")),
    "main_vocal": Path(hf_hub_download(repo_id=MODEL_ID, filename="UVR_MDXNET_KARA_2.onnx"))
}

def get_device_info() -> str:
    """Return a human-readable description of the compute device in use."""
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        device = f"GPU ({gpu_name})"
    else:
        device = "CPU"
    return f"Current running environment: {device}"


def inference(audio_file: str,
              stem: str = "vocal") -> list[str]:
    """Separate an audio file into background and vocal stems.

    Runs the full UVR separation pipeline (``process_uvr_task``) with the
    module-level MODELS_PATH checkpoints and returns both result paths.

    Args:
        audio_file: Path of the input audio file.
        stem: Kept for backward compatibility with existing callers.
            NOTE(review): only validated for non-emptiness; the pipeline
            always produces both stems regardless of its value.

    Returns:
        ``[background_path, vocals_path]`` as strings.

    Raises:
        ValueError: If ``audio_file`` or ``stem`` is empty.
    """
    if not audio_file:
        raise ValueError("The audio path is missing.")

    if not stem:
        raise ValueError("Please select 'vocal' or 'background' stem.")

    input_path = Path(audio_file)
    output_dir = Path("./output")

    start_time = time.time()
    background_path, vocals_path = process_uvr_task(
        input_file_path=input_path,
        output_dir=output_dir,
        models_path=MODELS_PATH,
        )
    execution_time = time.time() - start_time
    print(f"Execution time: {execution_time} seconds")

    print(f"Background file: {background_path}")
    print(f"Vocals file: {vocals_path}")

    # The previous "static/results" directory creation and file copies were
    # dead leftovers (the copies were commented out) and have been removed.
    return [str(background_path), str(vocals_path)]


def inference_bgm(audio_file: str) -> list[str]:
    """Extract only the background (instrumental) stem from an audio file.

    Converts the input to 44100 Hz stereo WAV, then runs the "bgm"
    MDX-Net model and returns the background stem path in a list.
    """
    model_params = get_model_params(Path("./mdx_models"))
    # Pipeline expects 44100 Hz stereo WAV input.
    wav_path = convert_to_stereo_and_wav(Path(audio_file))
    device = "cuda" if torch.cuda.is_available() else "cpu"
    bgm_path, _ = run_mdx(
        model_params=model_params,
        input_filename=wav_path,
        output_dir=Path("./output"),
        model_path=MODELS_PATH["bgm"],
        denoise=False,
        device_base=device,
    )
    return [str(bgm_path)]


def return_original_file(file):
    """Read the uploaded file and return (original filename, raw bytes).

    Used as the ``gr.DownloadData`` callback so the user gets back the
    file they uploaded, under its original basename.
    """
    # NOTE(review): filename comes from ``file.name`` while the content is
    # read from ``file.path`` — confirm both attributes exist on the gradio
    # file object in the deployed gradio version.
    filename = os.path.basename(file.name)
    content = Path(file.path).read_bytes()
    return (filename, content)


def get_gui(theme, title, description):
    """Build the Gradio Blocks UI.

    Renders the title, description, and device banner, then a file-upload
    widget whose uploads are streamed straight back to the user for
    download via ``return_original_file``.
    """
    with gr.Blocks(theme=theme) as app:
        # Header: title, description, and current compute device.
        for markdown_text in (title, description, get_device_info()):
            gr.Markdown(markdown_text)

        audio_input = gr.File(file_types=[".mp3", ".wav"], label="上传音频")
        gr.DownloadData(return_original_file, audio_input)

    return app


if __name__ == "__main__":
    title = "<center><strong><font size='7'>Vocal BGM Separator</font></strong></center>"
    description = "This demo uses the MDX-Net models to perform Ultimate Vocal Remover (uvr) task for vocal and background sound separation."
    theme = "NoCrypt/miku"

    # The ONNX checkpoints are already resolved once at import time into the
    # module-level MODELS_PATH; the duplicate hf_hub_download calls that used
    # to live here were unused and have been removed.
    print(f"ort.get_available_providers(): {ort.get_available_providers()}")
    print(gr.__version__)

    # GUI tab: manual upload / download workflow.
    # predict(audio_file, api_name="/inference") -> result
    app_gui = get_gui(theme, title, description)

    # API tab: background-stem extraction.
    # predict(audio_file, api_name="/predict") -> output
    app_api = gr.Interface(
        fn=inference_bgm,
        inputs=gr.Audio(type="filepath"),
        outputs=gr.File(file_count="multiple"),
    )

    app = gr.TabbedInterface(
        interface_list=[app_gui, app_api],
        tab_names=["GUI", "API"],
    )

    app.queue(default_concurrency_limit=40)
    app.launch()