File size: 5,130 Bytes
6af7d00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# reference: https://huggingface.co/spaces/r3gm/Audio_separator
import gradio as gr
import shutil
import numpy as np
from pathlib import Path
import os
import time
import torch
from huggingface_hub import hf_hub_download
from uvr_processing import process_uvr_task, run_mdx, get_model_params
from utils import convert_to_stereo_and_wav
import onnxruntime as ort
import io


# Hugging Face Hub repo that hosts the pretrained MDX-Net ONNX checkpoints.
MODEL_ID = "masszhou/mdxnet"
# Resolved at import time: each hf_hub_download() call fetches the checkpoint
# (or reuses the local HF cache) and returns its on-disk path.
# Maps stem role -> local ONNX model path.
MODELS_PATH = {
    "bgm": Path(hf_hub_download(repo_id=MODEL_ID, filename="UVR-MDX-NET-Inst_HQ_3.onnx")), 
    "basic_vocal": Path(hf_hub_download(repo_id=MODEL_ID, filename="UVR-MDX-NET-Voc_FT.onnx")),
    "main_vocal": Path(hf_hub_download(repo_id=MODEL_ID, filename="UVR_MDXNET_KARA_2.onnx"))
}

def get_device_info() -> str:
    """Return a human-readable description of the compute device in use."""
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        device = f"GPU ({gpu_name})"
    else:
        device = "CPU"
    return f"Current running environment: {device}"


def inference(audio_file: str,
              stem: str = "vocal") -> list[str]:
    """Separate an audio file into background and vocal stems.

    Runs the full UVR separation pipeline (``process_uvr_task``) with the
    module-level MODELS_PATH checkpoints and returns both result paths.

    Args:
        audio_file: Path of the input audio file.
        stem: Kept for backward compatibility with existing callers.
            NOTE(review): only validated for non-emptiness; the pipeline
            always produces both stems regardless of its value.

    Returns:
        ``[background_path, vocals_path]`` as strings.

    Raises:
        ValueError: If ``audio_file`` or ``stem`` is empty.
    """
    if not audio_file:
        raise ValueError("The audio path is missing.")

    if not stem:
        raise ValueError("Please select 'vocal' or 'background' stem.")

    input_path = Path(audio_file)
    output_dir = Path("./output")

    start_time = time.time()
    background_path, vocals_path = process_uvr_task(
        input_file_path=input_path,
        output_dir=output_dir,
        models_path=MODELS_PATH,
        )
    execution_time = time.time() - start_time
    print(f"Execution time: {execution_time} seconds")

    print(f"Background file: {background_path}")
    print(f"Vocals file: {vocals_path}")

    # The previous "static/results" directory creation and file copies were
    # dead leftovers (the copies were commented out) and have been removed.
    return [str(background_path), str(vocals_path)]


def inference_bgm(audio_file: str) -> list[str]:
    """Extract only the background (instrumental) stem from an audio file.

    Converts the input to 44100 Hz stereo WAV, then runs the "bgm"
    MDX-Net model and returns the background stem path in a list.
    """
    model_params = get_model_params(Path("./mdx_models"))
    # Pipeline expects 44100 Hz stereo WAV input.
    wav_path = convert_to_stereo_and_wav(Path(audio_file))
    device = "cuda" if torch.cuda.is_available() else "cpu"
    bgm_path, _ = run_mdx(
        model_params=model_params,
        input_filename=wav_path,
        output_dir=Path("./output"),
        model_path=MODELS_PATH["bgm"],
        denoise=False,
        device_base=device,
    )
    return [str(bgm_path)]


def return_original_file(file):
    """Read the uploaded file and return (original filename, raw bytes).

    Used as the ``gr.DownloadData`` callback so the user gets back the
    file they uploaded, under its original basename.
    """
    # NOTE(review): filename comes from ``file.name`` while the content is
    # read from ``file.path`` — confirm both attributes exist on the gradio
    # file object in the deployed gradio version.
    filename = os.path.basename(file.name)
    content = Path(file.path).read_bytes()
    return (filename, content)


def get_gui(theme, title, description):
    """Build the Gradio Blocks UI.

    Renders the title, description, and device banner, then a file-upload
    widget whose uploads are streamed straight back to the user for
    download via ``return_original_file``.
    """
    with gr.Blocks(theme=theme) as app:
        # Header: title, description, and current compute device.
        for markdown_text in (title, description, get_device_info()):
            gr.Markdown(markdown_text)

        audio_input = gr.File(file_types=[".mp3", ".wav"], label="上传音频")
        gr.DownloadData(return_original_file, audio_input)

    return app


if __name__ == "__main__":
    title = "<center><strong><font size='7'>Vocal BGM Separator</font></strong></center>"
    description = "This demo uses the MDX-Net models to perform Ultimate Vocal Remover (uvr) task for vocal and background sound separation."
    theme = "NoCrypt/miku"

    # The ONNX checkpoints are already resolved once at import time into the
    # module-level MODELS_PATH; the duplicate hf_hub_download calls that used
    # to live here were unused and have been removed.
    print(f"ort.get_available_providers(): {ort.get_available_providers()}")
    print(gr.__version__)

    # GUI tab: manual upload / download workflow.
    # predict(audio_file, api_name="/inference") -> result
    app_gui = get_gui(theme, title, description)

    # API tab: background-stem extraction.
    # predict(audio_file, api_name="/predict") -> output
    app_api = gr.Interface(
        fn=inference_bgm,
        inputs=gr.Audio(type="filepath"),
        outputs=gr.File(file_count="multiple"),
    )

    app = gr.TabbedInterface(
        interface_list=[app_gui, app_api],
        tab_names=["GUI", "API"],
    )

    app.queue(default_concurrency_limit=40)
    app.launch()