Spaces:
Running
Running
File size: 11,831 Bytes
e71a85c f474942 e71a85c f474942 e71a85c f474942 e71a85c f474942 e71a85c f2c9c4b e71a85c f474942 e71a85c f474942 37d9c3a f474942 37d9c3a f474942 37d9c3a f474942 e71a85c f474942 e71a85c f474942 e71a85c f474942 e71a85c f474942 e71a85c f474942 e71a85c f474942 e71a85c f474942 e71a85c f474942 e71a85c f474942 e71a85c f474942 e71a85c f474942 e71a85c f474942 e71a85c f474942 e71a85c f474942 e71a85c f474942 bcbd1f5 f474942 bcbd1f5 f474942 e71a85c f474942 e92135f f474942 e71a85c f474942 e71a85c f474942 e71a85c f474942 f2c9c4b e92135f f474942 e92135f f474942 e71a85c e92135f f474942 e71a85c f474942 e92135f f474942 e71a85c d5a6e23 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 |
import asyncio
import logging
import os
import random
import tempfile
import time
import zipfile
from urllib.parse import urlparse

import aiohttp
import edge_tts
import gradio as gr
import librosa
import requests
from audio_separator.separator import Separator
from infer_rvc_python import BaseLoader
from pydub import AudioSegment

import model_handler
from tts_voice import tts_order_voice
# Configure logging: timestamped, level-tagged messages at INFO and above.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Constants
# Directory that receives downloaded archives and their extracted contents.
TEMP_DIR = "temp"
# File-name prefix for downloaded model archives inside TEMP_DIR.
MODEL_PREFIX = "model"
# Separation models offered in the "Vocal Separator" tab. "model_name" is the
# label shown in the dropdown; "checkpoint" is what inf_handler passes to
# separator.load_model().
UVR_5_MODELS = [
    {"model_name": "BS-Roformer-Viperx-1297", "checkpoint": "model_bs_roformer_ep_317_sdr_12.9755.ckpt"},
    {"model_name": "MDX23C-InstVoc HQ 2", "checkpoint": "MDX23C-8KFFT-InstVoc_HQ_2.ckpt"},
    {"model_name": "Kim Vocal 2", "checkpoint": "Kim_Vocal_2.onnx"},
    {"model_name": "5_HP-Karaoke", "checkpoint": "5_HP-Karaoke-UVR.pth"},
    {"model_name": "UVR-DeNoise by FoxJoy", "checkpoint": "UVR-DeNoise.pth"},
    {"model_name": "UVR-DeEcho-DeReverb by FoxJoy", "checkpoint": "UVR-DeEcho-DeReverb.pth"},
]
# In-memory registry of voice models. download_from_url and upload_model
# append to it; run and model_info look entries up by "model_name".
MODELS = [
    {"model": "model.pth", "index": "model.index", "model_name": "Test Model"},
]
# Substrings rejected (case-insensitively) in download URLs and model names.
# NOTE(review): 'badword3' / 'badword4' look like placeholders - confirm.
BAD_WORDS = ['puttana', 'whore', 'badword3', 'badword4']
# Upper bound on a downloaded archive, compared against Content-Length.
MAX_FILE_SIZE = 500_000_000 # 500 MB
# Make sure the working directory exists before anything writes into it.
os.makedirs(TEMP_DIR, exist_ok=True)
# The `spaces` package is optional: record whether it could be imported so the
# converter below can be restricted to CPU when it is missing.
try:
    import spaces
    spaces_status = True
except ImportError:
    spaces_status = False
    logger.warning("Spaces module not found; running in CPU mode")
# Shared audio-separation engine used by inf_handler.
separator = Separator()
# Shared voice converter used by run(); CPU-only whenever `spaces` is absent.
converter = BaseLoader(only_cpu=not spaces_status, hubert_path=None, rmvpe_path=None)
class BadWordError(Exception):
    """Raised when a download URL or model name contains a restricted word."""
async def text_to_speech_edge(text, language_code):
    """Synthesize *text* with Edge TTS and return the path of the saved MP3.

    Args:
        text: Text to speak. ``None``, empty, and whitespace-only values are
            rejected.
        language_code: Key into ``tts_order_voice``. Unknown keys fall back to
            the first configured voice.

    Returns:
        Path of a newly created ``.mp3`` temporary file. The file is not
        deleted automatically; the caller owns it.

    Raises:
        ValueError: If *text* is missing or blank.
    """
    # Gradio may hand over None for an untouched textbox; treat it like "".
    if not text or not text.strip():
        raise ValueError("Text input cannot be empty")

    voice = tts_order_voice.get(language_code)
    if voice is None:
        # Resolve the fallback lazily so a valid language never touches it.
        voice = tts_order_voice[next(iter(tts_order_voice.keys()))]

    communicate = edge_tts.Communicate(text, voice)

    # Reserve a unique path, then close the handle before the library writes
    # to it (an open handle blocks re-opening the file on some platforms).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    saved = False
    try:
        await communicate.save(tmp_path)
        saved = True
    finally:
        # Do not leave an empty or partial file behind when synthesis fails
        # or the task is cancelled.
        if not saved and os.path.exists(tmp_path):
            os.remove(tmp_path)
    return tmp_path
async def download_from_url(url, name, progress=gr.Progress()):
    """Download a zipped voice model from Hugging Face and register it.

    The archive is streamed into ``TEMP_DIR``, extracted into its own unique
    directory, and the first ``.pth`` / ``.index`` pair found inside is
    appended to ``MODELS`` under *name*.

    Args:
        url: ``https://huggingface.co/...`` link to a zip file. ``/blob/``
            links are rewritten to their ``/resolve/`` download form.
        name: Display name for the model; surrounding whitespace is stripped.
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        ``[status_message, pth_path, index_path]``.

    Raises:
        ValueError: For a non Hugging Face URL, an empty name, a failed or
            oversized download, a bad archive, or an archive without both a
            ``.pth`` and an ``.index`` file.
        BadWordError: If the URL or name contains a restricted word.
    """
    url = (url or "").strip()
    name = (name or "").strip()

    # Compare the parsed host, not a string prefix: a prefix test would also
    # accept hosts such as "huggingface.co.evil.example".
    parsed = urlparse(url)
    if parsed.scheme != "https" or parsed.hostname != "huggingface.co":
        raise ValueError("URL must be from Hugging Face")
    if not name:
        raise ValueError("Model name cannot be empty")

    lowered_inputs = (url.lower(), name.lower())
    if any(bad_word in text for bad_word in BAD_WORDS for text in lowered_inputs):
        raise BadWordError("Input contains restricted words")

    # mkdtemp guarantees a fresh directory, so concurrent or repeated
    # downloads can never overwrite or mix with each other's files.
    unzipped_dir = tempfile.mkdtemp(prefix=MODEL_PREFIX, dir=TEMP_DIR)
    filename = f"{unzipped_dir}.zip"
    size_error = f"File size exceeds {MAX_FILE_SIZE / 1_000_000} MB limit"

    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url.replace("/blob/", "/resolve/")) as response:
                if response.status != 200:
                    raise ValueError("Failed to download file")
                total = int(response.headers.get('content-length', 0))
                if total > MAX_FILE_SIZE:
                    raise ValueError(size_error)
                current = 0
                with open(filename, "wb") as f:
                    async for data in response.content.iter_chunked(4096):
                        current += len(data)
                        # Content-Length may be absent or wrong, so enforce
                        # the limit on the bytes actually received as well.
                        if current > MAX_FILE_SIZE:
                            raise ValueError(size_error)
                        f.write(data)
                        if total:
                            # Skipped when the size is unknown (would divide
                            # by zero); clamped because the body can exceed
                            # the advertised length.
                            progress(min(current / total, 1.0), desc="Downloading model")

        try:
            with zipfile.ZipFile(filename, 'r') as zip_ref:
                # NOTE(review): the extracted size is not bounded here.
                zip_ref.extractall(unzipped_dir)
        except Exception as e:
            # zipfile signals bad input with several unrelated exception
            # types; report all of them uniformly but keep the cause chained.
            logger.error("Failed to unzip file: %s", e)
            raise ValueError("Failed to unzip file") from e
    finally:
        # The archive is only an intermediate artefact; never leave it behind.
        if os.path.exists(filename):
            os.remove(filename)

    # Single pass over the extracted tree, collecting both file kinds.
    pth_files = []
    index_files = []
    for root, _, files in os.walk(unzipped_dir):
        for file in files:
            if file.endswith(".pth"):
                pth_files.append(os.path.join(root, file))
            elif file.endswith(".index"):
                index_files.append(os.path.join(root, file))
    if not pth_files or not index_files:
        raise ValueError("No .pth or .index files found in the zip")

    pth_file = pth_files[0]
    index_file = index_files[0]
    MODELS.append({"model": pth_file, "index": index_file, "model_name": name})
    return [f"Downloaded as {name}", pth_file, index_file]
def inf_handler(audio, model_name):
    """Split *audio* into two stems with the selected separation model.

    Args:
        audio: Path of the input audio file.
        model_name: A ``model_name`` from ``UVR_5_MODELS``. Any other value
            (including ``None``) loads the separator's default model.

    Returns:
        Tuple of the first two output paths reported by the separator.
        NOTE(review): the UI labels them "Vocals" and "Instrumental" in that
        order - confirm the separator really returns them in that order.

    Raises:
        ValueError: If no audio was supplied or fewer than two stems came back.
    """
    if not audio:
        raise ValueError("Please upload an audio file")

    checkpoint = next(
        (entry["checkpoint"] for entry in UVR_5_MODELS if entry["model_name"] == model_name),
        None,
    )
    if checkpoint is None:
        separator.load_model()
    else:
        separator.load_model(checkpoint)

    output_files = separator.separate(audio)
    # Guard the indexing below instead of failing with a bare IndexError.
    if len(output_files) < 2:
        raise ValueError("Separation did not produce two output files")
    return output_files[0], output_files[1]
def run(model, audio_files, pitch_alg, pitch_lvl, index_inf, r_m_f, e_r, c_b_p):
    """Convert the given audio with a registered voice model.

    Args:
        model: ``model_name`` of an entry in ``MODELS``, or a direct path to
            a ``.pth`` file.
        audio_files: One input path or a list of input paths.
        pitch_alg: Pitch algorithm name, forwarded as ``pitch_algo``.
        pitch_lvl: Pitch level, forwarded as ``pitch_lvl``.
        index_inf: Forwarded as ``index_influence``.
        r_m_f: Forwarded as ``respiration_median_filtering``.
        e_r: Forwarded as ``envelope_ratio``.
        c_b_p: Forwarded as ``consonant_breath_protection``.

    Returns:
        The first output returned by the converter.

    Raises:
        ValueError: If no audio or model was given, the resolved model is not
            a ``.pth`` file, or the converter returned nothing.
    """
    if not audio_files:
        raise ValueError("Please upload an audio file")
    # A cleared dropdown yields None, which would otherwise crash on
    # ``.endswith`` below.
    if not model:
        raise ValueError("Please select a model")
    if isinstance(audio_files, str):
        audio_files = [audio_files]

    random_tag = f"USER_{random.randint(10000000, 99999999)}"

    # Resolve a display name to its files; an unknown name is treated as a
    # direct model path with no index.
    entry = next((m for m in MODELS if m["model_name"] == model), None)
    if entry is None:
        file_m, file_index = model, None
    else:
        file_m, file_index = entry["model"], entry["index"]
    if not file_m.endswith(".pth"):
        raise ValueError("Model file must be a .pth file")

    logger.info("Running inference with model: %s, tag: %s", file_m, random_tag)
    converter.apply_conf(
        tag=random_tag,
        file_model=file_m,
        pitch_algo=pitch_alg,
        pitch_lvl=pitch_lvl,
        file_index=file_index,
        index_influence=index_inf,
        respiration_median_filtering=r_m_f,
        envelope_ratio=e_r,
        consonant_breath_protection=c_b_p,
        # MP3 inputs are resampled to 44.1 kHz; the extension check is
        # case-insensitive so ".MP3" is treated the same way.
        resample_sr=44100 if audio_files[0].lower().endswith('.mp3') else 0,
    )
    # NOTE(review): short pause kept from the original; its purpose is not
    # visible in this file - confirm whether it is still required.
    time.sleep(0.1)
    result = convert_now(audio_files, random_tag, converter)
    if not result:
        raise ValueError("Conversion produced no output")
    return result[0]
def convert_now(audio_files, random_tag, converter):
    """Invoke *converter* on *audio_files* under *random_tag*.

    The converter is called without overwriting existing outputs and with
    eight parallel workers; whatever it returns is passed straight back.
    """
    call_options = {"overwrite": False, "parallel_workers": 8}
    return converter(audio_files, random_tag, **call_options)
def upload_model(index_file, pth_file, model_name):
    """Register an uploaded model/index file pair under *model_name*.

    Args:
        index_file: Uploaded ``.index`` file, either a path or an object
            exposing the path as ``.name``.
        pth_file: Uploaded ``.pth`` file, in the same forms as *index_file*.
        model_name: Display name; surrounding whitespace is stripped.

    Returns:
        A confirmation message for the status box.

    Raises:
        ValueError: If either file is missing or the name is blank.
    """
    if not index_file or not pth_file:
        raise ValueError("Both index and model files are required")
    # The textbox value may be None as well as blank.
    if not model_name or not model_name.strip():
        raise ValueError("Model name cannot be empty")

    def _path_of(upload):
        # Uploads may arrive as plain paths or as file wrappers carrying the
        # path in ``.name``. Check for paths first, because ``Path.name``
        # would return only the base name.
        if isinstance(upload, (str, os.PathLike)):
            return os.fspath(upload)
        return upload.name

    MODELS.append({
        "model": _path_of(pth_file),
        "index": _path_of(index_file),
        "model_name": model_name.strip(),
    })
    return "Model uploaded successfully!"
def json_to_markdown_table(json_data):
    """Render a mapping as a two-column Markdown table.

    Args:
        json_data: Mapping whose items become ``| key | value |`` rows, in
            iteration order.

    Returns:
        The table as a string: a header row, a separator row, then one row
        per item, each terminated by a newline.
    """
    def _cell(value):
        # A raw "|" or line break would end the cell or row early, so escape
        # pipes and flatten multi-line values onto one line.
        return " ".join(str(value).splitlines()).replace("|", "\\|")

    lines = ["| Key | Value |", "| --- | --- |"]
    lines.extend(f"| {_cell(key)} | {_cell(value)} |" for key, value in json_data.items())
    return "\n".join(lines) + "\n"
def model_info(name):
    """Return a Markdown table describing the registered model called *name*.

    The first entry of ``MODELS`` whose ``model_name`` equals *name* is
    inspected through ``model_handler.model_info``. When no entry matches,
    the string ``"Model not found"`` is returned instead.
    """
    entry = next((candidate for candidate in MODELS if candidate["model_name"] == name), None)
    if entry is None:
        return "Model not found"

    details = model_handler.model_info(entry["model"])
    summary = {
        "Model Name": entry["model_name"],
        "Model Config": details['config'],
        "Epochs Trained": details['epochs'],
        "Sample Rate": details['sr'],
        "Pitch Guidance": details['f0'],
        "Model Precision": details['size'],
    }
    return json_to_markdown_table(summary)
# Build the Gradio interface: three tabs (Inference, Model Loader, Vocal
# Separator) wired to the handler functions defined above.
# NOTE(review): the nesting below was reconstructed from the statement order;
# confirm it against the intended layout.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="pink", secondary_hue="rose"), title="Ilaria RVC 💖") as app:
    gr.Markdown("# Ilaria RVC 💖")
    gr.Markdown("Support the project by donating on [Ko-Fi](https://ko-fi.com/ilariaowo)")
    # --- Tab 1: voice conversion -------------------------------------------
    with gr.Tab("Inference"):
        with gr.Row(equal_height=True):
            models_dropdown = gr.Dropdown(label="Select Model", choices=[m["model_name"] for m in MODELS], value=MODELS[0]["model_name"])
            refresh_button = gr.Button("Refresh Models", variant="secondary")
            # Re-read MODELS so newly downloaded/uploaded models become selectable.
            refresh_button.click(lambda: gr.Dropdown(choices=[m["model_name"] for m in MODELS]), outputs=models_dropdown)
        sound_gui = gr.Audio(label="Input Audio", type="filepath")
        with gr.Accordion("Text-to-Speech", open=False):
            text_tts = gr.Textbox(label="Text Input", placeholder="Enter text to convert to speech", lines=3)
            dropdown_tts = gr.Dropdown(label="Language and Voice", choices=list(tts_order_voice.keys()), value=list(tts_order_voice.keys())[0])
            button_tts = gr.Button("Generate Speech", variant="primary")
            # The generated speech is placed into the input audio widget.
            button_tts.click(text_to_speech_edge, inputs=[text_tts, dropdown_tts], outputs=sound_gui)
        with gr.Accordion("Conversion Settings", open=False):
            pitch_algo_conf = gr.Radio(choices=["pm", "harvest", "crepe", "rmvpe", "rmvpe+"], value="rmvpe", label="Pitch Algorithm", info="Select the algorithm for pitch detection")
            with gr.Row(equal_height=True):
                pitch_lvl_conf = gr.Slider(label="Pitch Level", minimum=-24, maximum=24, step=1, value=0, info="Adjust pitch: negative for male, positive for female")
                index_inf_conf = gr.Slider(minimum=0, maximum=1, value=0.75, label="Index Influence", info="Controls how much accent is applied")
            with gr.Row(equal_height=True):
                respiration_filter_conf = gr.Slider(minimum=0, maximum=7, value=3, step=1, label="Respiration Median Filtering")
                envelope_ratio_conf = gr.Slider(minimum=0, maximum=1, value=0.25, label="Envelope Ratio")
                consonant_protec_conf = gr.Slider(minimum=0, maximum=0.5, value=0.5, label="Consonant Breath Protection")
        with gr.Row(equal_height=True):
            button_conf = gr.Button("Convert Audio", variant="primary")
            output_conf = gr.Audio(type="filepath", label="Converted Audio")
            # The input order must match the parameter order of run().
            button_conf.click(run, inputs=[models_dropdown, sound_gui, pitch_algo_conf, pitch_lvl_conf, index_inf_conf, respiration_filter_conf, envelope_ratio_conf, consonant_protec_conf], outputs=output_conf)
    # --- Tab 2: add models by download or upload ---------------------------
    with gr.Tab("Model Loader"):
        with gr.Accordion("Download Model", open=False):
            gr.Markdown("Download a model from Hugging Face (RVC model, max 500 MB)")
            model_url = gr.Textbox(label="Hugging Face Model URL", placeholder="https://huggingface.co/username/model")
            model_name = gr.Textbox(label="Model Name", placeholder="Enter a unique model name")
            download_button = gr.Button("Download Model", variant="primary")
            status = gr.Textbox(label="Status", interactive=False)
            model_pth = gr.Textbox(label="Model .pth File", interactive=False)
            index_pth = gr.Textbox(label="Index .index File", interactive=False)
            # The three outputs receive the three-element list returned by download_from_url.
            download_button.click(download_from_url, [model_url, model_name], [status, model_pth, index_pth])
        with gr.Accordion("Upload Model", open=False):
            index_file_upload = gr.File(label="Index File (.index)")
            pth_file_upload = gr.File(label="Model File (.pth)")
            model_name_upload = gr.Textbox(label="Model Name", placeholder="Enter a unique model name")
            upload_button = gr.Button("Upload Model", variant="primary")
            upload_status = gr.Textbox(label="Status", interactive=False)
            upload_button.click(upload_model, [index_file_upload, pth_file_upload, model_name_upload], upload_status)
    # --- Tab 3: stem separation --------------------------------------------
    with gr.Tab("Vocal Separator"):
        gr.Markdown("Separate vocals and instruments using UVR models (CPU only)")
        uvr5_audio_file = gr.Audio(label="Input Audio", type="filepath")
        with gr.Row(equal_height=True):
            # No default value is set, so inf_handler may receive None here.
            uvr5_model = gr.Dropdown(label="UVR Model", choices=[m["model_name"] for m in UVR_5_MODELS])
            uvr5_button = gr.Button("Separate", variant="primary")
        uvr5_output_voc = gr.Audio(label="Vocals", type="filepath")
        uvr5_output_inst = gr.Audio(label="Instrumental", type="filepath")
        # The first returned path goes to "Vocals", the second to "Instrumental".
        uvr5_button.click(inf_handler, [uvr5_audio_file, uvr5_model], [uvr5_output_voc, uvr5_output_inst])
# Enable request queuing and start the server with a public share link.
app.queue().launch(share=True)