File size: 11,831 Bytes
e71a85c
 
 
 
f474942
e71a85c
 
 
 
 
 
 
 
 
f474942
 
 
e71a85c
f474942
 
 
e71a85c
f474942
e71a85c
 
f2c9c4b
 
e71a85c
 
 
 
 
 
 
 
 
 
 
f474942
 
e71a85c
 
 
f474942
 
 
 
 
 
37d9c3a
f474942
 
37d9c3a
 
f474942
37d9c3a
f474942
 
 
 
 
 
 
 
 
e71a85c
f474942
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e71a85c
 
f474942
 
e71a85c
f474942
 
 
e71a85c
f474942
 
 
 
 
e71a85c
 
 
f474942
e71a85c
f474942
e71a85c
f474942
e71a85c
 
 
 
 
 
 
 
 
f474942
e71a85c
f474942
e71a85c
f474942
e71a85c
 
f474942
 
e71a85c
f474942
 
 
 
 
e71a85c
 
 
f474942
 
 
e71a85c
 
 
 
 
 
 
 
 
 
f474942
e71a85c
 
 
 
 
f474942
 
 
 
 
 
 
bcbd1f5
f474942
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bcbd1f5
f474942
e71a85c
f474942
e92135f
f474942
 
 
e71a85c
f474942
e71a85c
f474942
 
 
 
 
e71a85c
f474942
f2c9c4b
e92135f
f474942
 
e92135f
f474942
 
 
e71a85c
e92135f
f474942
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e71a85c
 
f474942
 
 
 
 
 
 
 
e92135f
f474942
 
 
 
 
e71a85c
d5a6e23
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
import gradio as gr
import requests
import random
import os
import zipfile
import librosa
import time
from infer_rvc_python import BaseLoader
from pydub import AudioSegment
from tts_voice import tts_order_voice
import edge_tts
import tempfile
from audio_separator.separator import Separator
import model_handler
import logging
import aiohttp
import asyncio

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Constants
# Working directory for downloaded archives and their extracted contents
# (created below at import time).
TEMP_DIR = "temp"
# Name prefix for the artifacts that download_from_url creates under TEMP_DIR.
MODEL_PREFIX = "model"


# Separation models offered in the "Vocal Separator" tab. "model_name" is the
# label shown in the dropdown; "checkpoint" is the value inf_handler passes to
# separator.load_model().
UVR_5_MODELS = [
    {"model_name": "BS-Roformer-Viperx-1297", "checkpoint": "model_bs_roformer_ep_317_sdr_12.9755.ckpt"},
    {"model_name": "MDX23C-InstVoc HQ 2", "checkpoint": "MDX23C-8KFFT-InstVoc_HQ_2.ckpt"},
    {"model_name": "Kim Vocal 2", "checkpoint": "Kim_Vocal_2.onnx"},
    {"model_name": "5_HP-Karaoke", "checkpoint": "5_HP-Karaoke-UVR.pth"},
    {"model_name": "UVR-DeNoise by FoxJoy", "checkpoint": "UVR-DeNoise.pth"},
    {"model_name": "UVR-DeEcho-DeReverb by FoxJoy", "checkpoint": "UVR-DeEcho-DeReverb.pth"},
]
# In-memory registry of voice models, looked up by "model_name". It is mutated
# at runtime: download_from_url and upload_model append entries, so it is not
# a constant despite the upper-case name. Nothing in this file persists it.
MODELS = [
    {"model": "model.pth", "index": "model.index", "model_name": "Test Model"},
]
# Substrings rejected (after lower-casing the input) in download URLs and
# model names. NOTE(review): 'badword3' / 'badword4' look like placeholders —
# confirm the intended list.
BAD_WORDS = ['puttana', 'whore', 'badword3', 'badword4']
# Upper bound for a downloaded model archive, in bytes.
MAX_FILE_SIZE = 500_000_000  # 500 MB

os.makedirs(TEMP_DIR, exist_ok=True)

# `spaces` is an optional dependency: whether it imports is the only switch
# used below to decide if the converter is restricted to CPU.
try:
    import spaces
    spaces_status = True
except ImportError:
    spaces_status = False
    logger.warning("Spaces module not found; running in CPU mode")

# Module-level singletons shared by every request handler in this file.
separator = Separator()
# only_cpu is True exactly when the `spaces` import failed.
# NOTE(review): hubert_path / rmvpe_path are passed as None — presumably the
# loader then resolves its own defaults; confirm against infer_rvc_python.
converter = BaseLoader(only_cpu=not spaces_status, hubert_path=None, rmvpe_path=None)

class BadWordError(Exception):
    """Signals that user-supplied input contains an entry from BAD_WORDS."""

async def text_to_speech_edge(text, language_code):
    """Synthesize *text* with edge-tts and return the path of the MP3 written.

    Args:
        text: The text to speak.
        language_code: Key into ``tts_order_voice``. An unknown key falls back
            to the first voice in that mapping.

    Returns:
        Path of a temporary ``.mp3`` file. It is created with ``delete=False``,
        so it outlives this call and is not removed by this function on
        success.

    Raises:
        ValueError: If *text* is ``None``, empty, or whitespace only.
    """
    # `not text` comes first so a None value produces the intended ValueError
    # rather than an AttributeError from calling .strip() on None.
    if not text or not text.strip():
        raise ValueError("Text input cannot be empty")

    voice = tts_order_voice.get(language_code)
    if voice is None:
        # Unknown language: use the first configured voice.
        voice = next(iter(tts_order_voice.values()))

    communicate = edge_tts.Communicate(text, voice)
    # Only the unique file name is needed here; the handle is closed right
    # away so edge-tts can write to the path itself.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    try:
        await communicate.save(tmp_path)
    except Exception:
        # Synthesis failed: do not leave the empty placeholder file behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    return tmp_path

async def download_from_url(url, name, progress=gr.Progress()):
    """Download a zipped voice model from Hugging Face and register it in MODELS.

    The archive is streamed into a fresh directory under TEMP_DIR, extracted
    there, and the first ``.pth`` and ``.index`` files found are appended to
    MODELS under *name*.

    Args:
        url: ``https://huggingface.co/...`` link to a zip archive. A
            ``/blob/`` segment is rewritten to ``/resolve/`` before fetching.
        name: Display name to register the model under.
        progress: Gradio progress tracker, injected through the default value.

    Returns:
        A three-item list: status message, path of the ``.pth`` file, path of
        the ``.index`` file.

    Raises:
        ValueError: If the URL is not an https URL on huggingface.co, the name
            is empty, the download fails or exceeds MAX_FILE_SIZE, the archive
            cannot be unzipped, or it lacks a ``.pth`` or ``.index`` file.
        BadWordError: If the URL or name contains an entry from BAD_WORDS.
    """
    # Function-scope imports: these names are not imported at module level.
    import shutil
    from urllib.parse import urlparse

    parsed = urlparse(url or "")
    # Compare the parsed host instead of a string prefix: a prefix test also
    # accepts look-alike hosts such as "https://huggingface.co.example.org".
    if parsed.scheme != "https" or (parsed.hostname or "").lower() != "huggingface.co":
        raise ValueError("URL must be from Hugging Face")
    if not name or not name.strip():
        raise ValueError("Model name cannot be empty")
    lowered_url = url.lower()
    lowered_name = name.lower()
    if any(bad_word in lowered_url or bad_word in lowered_name for bad_word in BAD_WORDS):
        raise BadWordError("Input contains restricted words")

    # One fresh directory per download, so a download can never overwrite or
    # merge into the files of an earlier or concurrent one.
    work_dir = tempfile.mkdtemp(prefix=MODEL_PREFIX, dir=TEMP_DIR)
    archive_path = os.path.join(work_dir, "download.zip")
    unzipped_dir = os.path.join(work_dir, "extracted")
    size_error = f"File size exceeds {MAX_FILE_SIZE / 1_000_000} MB limit"

    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url.replace("/blob/", "/resolve/")) as response:
                if response.status != 200:
                    raise ValueError("Failed to download file")
                total = int(response.headers.get('content-length', 0))
                if total > MAX_FILE_SIZE:
                    raise ValueError(size_error)
                current = 0
                with open(archive_path, "wb") as f:
                    async for data in response.content.iter_chunked(4096):
                        current += len(data)
                        # Content-Length may be absent or understated, so the
                        # cap is also enforced on the bytes actually received.
                        if current > MAX_FILE_SIZE:
                            raise ValueError(size_error)
                        f.write(data)
                        # Without a declared length there is no ratio to
                        # report (and dividing by zero must be avoided).
                        if total > 0:
                            progress(min(current / total, 1.0), desc="Downloading model")

        try:
            with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                zip_ref.extractall(unzipped_dir)
        except Exception as e:
            logger.error(f"Failed to unzip file: {e}")
            raise ValueError("Failed to unzip file") from e
        # The archive is no longer needed once its contents are on disk.
        os.remove(archive_path)

        pth_files = []
        index_files = []
        for root, _, files in os.walk(unzipped_dir):
            for file in files:
                if file.endswith(".pth"):
                    pth_files.append(os.path.join(root, file))
                elif file.endswith(".index"):
                    index_files.append(os.path.join(root, file))

        if not pth_files or not index_files:
            raise ValueError("No .pth or .index files found in the zip")
    except BaseException:
        # Any failure (including cancellation) discards the partial download;
        # the exception itself is always re-raised unchanged.
        shutil.rmtree(work_dir, ignore_errors=True)
        raise

    pth_file = pth_files[0]
    index_file = index_files[0]
    MODELS.append({"model": pth_file, "index": index_file, "model_name": name})
    return [f"Downloaded as {name}", pth_file, index_file]

def inf_handler(audio, model_name):
    """Run the shared separator on *audio* and return its first two outputs.

    Args:
        audio: Path of the audio file to separate.
        model_name: A "model_name" from UVR_5_MODELS. When no entry matches,
            the separator's own default model is loaded instead.

    Returns:
        Tuple of the first and second items returned by
        ``separator.separate``.
        NOTE(review): the UI wires these to "Vocals" and "Instrumental" in
        that order — confirm the separator emits its stems in that order.
    """
    checkpoint = next(
        (entry["checkpoint"] for entry in UVR_5_MODELS if entry["model_name"] == model_name),
        None,
    )
    if checkpoint is None:
        # No matching entry: load_model() is called without arguments.
        separator.load_model()
    else:
        separator.load_model(checkpoint)

    stems = separator.separate(audio)
    return stems[0], stems[1]

def run(model, audio_files, pitch_alg, pitch_lvl, index_inf, r_m_f, e_r, c_b_p):
    """Convert the uploaded audio with the selected voice model.

    Args:
        model: A "model_name" registered in MODELS, or a path to a ``.pth``
            file (used as-is when no registry entry matches).
        audio_files: One audio path, or a list of paths.
        pitch_alg: Forwarded to the converter as ``pitch_algo``.
        pitch_lvl: Forwarded as ``pitch_lvl``.
        index_inf: Forwarded as ``index_influence``.
        r_m_f: Forwarded as ``respiration_median_filtering``.
        e_r: Forwarded as ``envelope_ratio``.
        c_b_p: Forwarded as ``consonant_breath_protection``.

    Returns:
        The first item of the converter's result.

    Raises:
        ValueError: If no audio was supplied, or the resolved model file does
            not end in ``.pth``.
    """
    if not audio_files:
        raise ValueError("Please upload an audio file")
    # Accept a bare path as well as a list of paths.
    inputs = [audio_files] if isinstance(audio_files, str) else audio_files

    # Each call registers its settings under its own random tag.
    random_tag = f"USER_{random.randint(10000000, 99999999)}"

    entry = next((m for m in MODELS if m["model_name"] == model), None)
    if entry is None:
        # Not a registered name: treat the value itself as the model path.
        file_m, file_index = model, None
    else:
        file_m, file_index = entry["model"], entry["index"]

    if not file_m.endswith(".pth"):
        raise ValueError("Model file must be a .pth file")

    logger.info(f"Running inference with model: {file_m}, tag: {random_tag}")
    # Only the first input decides the resample rate: 44100 for .mp3, else 0.
    resample_sr = 44100 if inputs[0].endswith('.mp3') else 0
    converter.apply_conf(
        tag=random_tag,
        file_model=file_m,
        pitch_algo=pitch_alg,
        pitch_lvl=pitch_lvl,
        file_index=file_index,
        index_influence=index_inf,
        respiration_median_filtering=r_m_f,
        envelope_ratio=e_r,
        consonant_breath_protection=c_b_p,
        resample_sr=resample_sr,
    )
    # NOTE(review): the reason for this short pause is not evident from this
    # file — confirm whether apply_conf needs time to settle.
    time.sleep(0.1)
    return convert_now(inputs, random_tag, converter)[0]

def convert_now(audio_files, random_tag, converter):
    """Invoke *converter* on *audio_files* using the settings stored under *random_tag*.

    The converter is always called with ``overwrite=False`` and
    ``parallel_workers=8``; its return value is passed through unchanged.
    """
    fixed_options = {"overwrite": False, "parallel_workers": 8}
    return converter(audio_files, random_tag, **fixed_options)

def upload_model(index_file, pth_file, model_name):
    """Register an uploaded model/index pair in MODELS under *model_name*.

    Args:
        index_file: Uploaded ``.index`` file, either an object exposing the
            path as ``.name`` or a plain path string.
        pth_file: Uploaded ``.pth`` file, in the same two accepted forms.
        model_name: Display name to register the model under.

    Returns:
        A success message for the status box.

    Raises:
        ValueError: If either file is missing, or *model_name* is ``None``,
            empty, or whitespace only.
    """
    if not index_file or not pth_file:
        raise ValueError("Both index and model files are required")
    # `not model_name` first, so None yields the intended ValueError instead
    # of an AttributeError from calling .strip() on None.
    if not model_name or not model_name.strip():
        raise ValueError("Model name cannot be empty")

    # Depending on version and configuration, Gradio's File component passes
    # either a file wrapper (path in `.name`) or the path string itself.
    MODELS.append({
        "model": getattr(pth_file, "name", pth_file),
        "index": getattr(index_file, "name", index_file),
        "model_name": model_name,
    })
    return "Model uploaded successfully!"

def json_to_markdown_table(json_data):
    """Render a mapping as a two-column (Key / Value) Markdown table.

    Rows follow the mapping's iteration order; every row, including the last,
    ends with a newline. Values are interpolated with their default string
    formatting and are not escaped.
    """
    header = "| Key | Value |\n| --- | --- |\n"
    rows = [f"| {label} | {content} |\n" for label, content in json_data.items()]
    return header + "".join(rows)

def model_info(name):
    """Return a Markdown table describing the registered model called *name*.

    The first MODELS entry whose "model_name" equals *name* is inspected with
    ``model_handler.model_info``. If no entry matches, the string
    "Model not found" is returned instead of a table.
    """
    entry = next((m for m in MODELS if m["model_name"] == name), None)
    if entry is None:
        return "Model not found"

    details = model_handler.model_info(entry["model"])
    # Map the handler's keys onto the labels shown to the user.
    summary = {
        "Model Name": entry["model_name"],
        "Model Config": details['config'],
        "Epochs Trained": details['epochs'],
        "Sample Rate": details['sr'],
        "Pitch Guidance": details['f0'],
        "Model Precision": details['size'],
    }
    return json_to_markdown_table(summary)

# UI definition. Components are created in display order inside nested
# context managers; each .click() call binds a button to one of the handler
# functions defined above.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="pink", secondary_hue="rose"), title="Ilaria RVC 💖") as app:
    gr.Markdown("# Ilaria RVC 💖")
    gr.Markdown("Support the project by donating on [Ko-Fi](https://ko-fi.com/ilariaowo)")

    # --- Tab 1: voice conversion -------------------------------------------
    with gr.Tab("Inference"):
        with gr.Row(equal_height=True):
            models_dropdown = gr.Dropdown(label="Select Model", choices=[m["model_name"] for m in MODELS], value=MODELS[0]["model_name"])
            refresh_button = gr.Button("Refresh Models", variant="secondary")
            # The choices above are a snapshot taken at build time. Refresh
            # rebuilds them from MODELS so models added later through the
            # "Model Loader" tab become selectable.
            refresh_button.click(lambda: gr.Dropdown(choices=[m["model_name"] for m in MODELS]), outputs=models_dropdown)

        # Input for run(); also the output target of the text-to-speech button.
        sound_gui = gr.Audio(label="Input Audio", type="filepath")

        with gr.Accordion("Text-to-Speech", open=False):
            text_tts = gr.Textbox(label="Text Input", placeholder="Enter text to convert to speech", lines=3)
            dropdown_tts = gr.Dropdown(label="Language and Voice", choices=list(tts_order_voice.keys()), value=list(tts_order_voice.keys())[0])
            button_tts = gr.Button("Generate Speech", variant="primary")
            # The generated MP3 is written into sound_gui, so synthesized
            # speech becomes the input of the next conversion.
            button_tts.click(text_to_speech_edge, inputs=[text_tts, dropdown_tts], outputs=sound_gui)

        # Controls whose values are forwarded by run() to converter.apply_conf.
        with gr.Accordion("Conversion Settings", open=False):
            pitch_algo_conf = gr.Radio(choices=["pm", "harvest", "crepe", "rmvpe", "rmvpe+"], value="rmvpe", label="Pitch Algorithm", info="Select the algorithm for pitch detection")
            with gr.Row(equal_height=True):
                pitch_lvl_conf = gr.Slider(label="Pitch Level", minimum=-24, maximum=24, step=1, value=0, info="Adjust pitch: negative for male, positive for female")
                index_inf_conf = gr.Slider(minimum=0, maximum=1, value=0.75, label="Index Influence", info="Controls how much accent is applied")
            with gr.Row(equal_height=True):
                respiration_filter_conf = gr.Slider(minimum=0, maximum=7, value=3, step=1, label="Respiration Median Filtering")
                envelope_ratio_conf = gr.Slider(minimum=0, maximum=1, value=0.25, label="Envelope Ratio")
            consonant_protec_conf = gr.Slider(minimum=0, maximum=0.5, value=0.5, label="Consonant Breath Protection")

        with gr.Row(equal_height=True):
            button_conf = gr.Button("Convert Audio", variant="primary")
            output_conf = gr.Audio(type="filepath", label="Converted Audio")
        # The inputs list is positional: its order must match run()'s
        # parameter order.
        button_conf.click(run, inputs=[models_dropdown, sound_gui, pitch_algo_conf, pitch_lvl_conf, index_inf_conf, respiration_filter_conf, envelope_ratio_conf, consonant_protec_conf], outputs=output_conf)

    # --- Tab 2: add models to the in-memory MODELS registry ----------------
    with gr.Tab("Model Loader"):
        with gr.Accordion("Download Model", open=False):
            gr.Markdown("Download a model from Hugging Face (RVC model, max 500 MB)")
            model_url = gr.Textbox(label="Hugging Face Model URL", placeholder="https://huggingface.co/username/model")
            model_name = gr.Textbox(label="Model Name", placeholder="Enter a unique model name")
            download_button = gr.Button("Download Model", variant="primary")
            status = gr.Textbox(label="Status", interactive=False)
            model_pth = gr.Textbox(label="Model .pth File", interactive=False)
            index_pth = gr.Textbox(label="Index .index File", interactive=False)
            # download_from_url returns three items, mapped in order onto the
            # three read-only textboxes.
            download_button.click(download_from_url, [model_url, model_name], [status, model_pth, index_pth])

        with gr.Accordion("Upload Model", open=False):
            index_file_upload = gr.File(label="Index File (.index)")
            pth_file_upload = gr.File(label="Model File (.pth)")
            model_name_upload = gr.Textbox(label="Model Name", placeholder="Enter a unique model name")
            upload_button = gr.Button("Upload Model", variant="primary")
            upload_status = gr.Textbox(label="Status", interactive=False)
            # Argument order here is (index, pth, name), matching upload_model.
            upload_button.click(upload_model, [index_file_upload, pth_file_upload, model_name_upload], upload_status)

    # --- Tab 3: stem separation --------------------------------------------
    with gr.Tab("Vocal Separator"):
        gr.Markdown("Separate vocals and instruments using UVR models (CPU only)")
        uvr5_audio_file = gr.Audio(label="Input Audio", type="filepath")
        with gr.Row(equal_height=True):
            # No default value is set; when the selection matches no entry,
            # inf_handler calls separator.load_model() without arguments.
            uvr5_model = gr.Dropdown(label="UVR Model", choices=[m["model_name"] for m in UVR_5_MODELS])
            uvr5_button = gr.Button("Separate", variant="primary")
        uvr5_output_voc = gr.Audio(label="Vocals", type="filepath")
        uvr5_output_inst = gr.Audio(label="Instrumental", type="filepath")
        # inf_handler's first return value goes to "Vocals", its second to
        # "Instrumental".
        uvr5_button.click(inf_handler, [uvr5_audio_file, uvr5_model], [uvr5_output_voc, uvr5_output_inst])

# Runs at import time (there is no `if __name__ == "__main__"` guard).
# share=True asks Gradio to create a public share link for the app.
app.queue().launch(share=True)