import gradio as gr
import subprocess
import os
import numpy as np
import librosa
import soundfile as sf
import matplotlib
matplotlib.use("Agg")  # non-interactive backend: plots are rendered in Gradio worker threads, not the main thread
import matplotlib.pyplot as plt
import librosa.display
import gc
import torch
import time
import warnings
import json
from scipy import signal
from scipy.stats import kurtosis, skew
import spaces
import urllib.request
from datetime import timedelta

warnings.filterwarnings("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "true"
torch.set_float32_matmul_precision("high")
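# (Note on the line above: "high" lets float32 matmuls use TF32 tensor cores on
# supported GPUs, trading a little precision for speed.)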
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

output_folder = "output_file"
os.makedirs(output_folder, exist_ok=True)
print(f"Output folder ready: {output_folder}")
def setup():
    """Fetch the Apollo inference script, model checkpoints, and configs if missing."""
    os.makedirs("Apollo/model", exist_ok=True)
    os.makedirs("Apollo/configs", exist_ok=True)
    files_to_download = {
        "Apollo/inference.py": "https://raw.githubusercontent.com/jarredou/Apollo-Colab-Inference/main/inference.py",
        "Apollo/model/pytorch_model.bin": "https://huggingface.co/JusperLee/Apollo/resolve/main/pytorch_model.bin",
        "Apollo/model/apollo_model.ckpt": "https://huggingface.co/jarredou/lew_apollo_vocal_enhancer/resolve/main/apollo_model.ckpt",
        "Apollo/model/apollo_model_v2.ckpt": "https://huggingface.co/jarredou/lew_apollo_vocal_enhancer/resolve/main/apollo_model_v2.ckpt",
        "Apollo/model/apollo_universal_model.ckpt": "https://huggingface.co/ASesYusuf1/Apollo_universal_model/resolve/main/apollo_universal_model.ckpt",
        "Apollo/configs/config_apollo_vocal.yaml": "https://huggingface.co/jarredou/lew_apollo_vocal_enhancer/resolve/main/config_apollo_vocal.yaml",
        "Apollo/configs/config_apollo.yaml": "https://huggingface.co/ASesYusuf1/Apollo_universal_model/resolve/main/config_apollo.yaml",
        "Apollo/configs/apollo.yaml": "https://huggingface.co/JusperLee/Apollo/resolve/main/apollo.yaml",
    }
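    # Try wget first; fall back to urllib when wget is unavailable or fails.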
    for file_path, url in files_to_download.items():
        if not os.path.exists(file_path):
            print(f"Downloading {file_path}...")
            try:
                subprocess.run(["wget", "-O", file_path, url], check=True, capture_output=True, text=True)
                print(f"Downloaded {file_path} with wget")
            except (subprocess.CalledProcessError, FileNotFoundError) as e:
                print(f"wget failed for {file_path}: {e}. Falling back to urllib...")
                try:
                    urllib.request.urlretrieve(url, file_path)
                    print(f"Downloaded {file_path} with urllib")
                except Exception as e:
                    print(f"Failed to download {file_path}: {e}")
                    raise Exception(f"Failed to download {file_path}")

try:
    setup()
except Exception as e:
    print(f"Setup failed: {e}")
    raise
# Increased the ZeroGPU duration from 60 to 120 seconds.
# NOTE (assumption): the @spaces.GPU decorator below is inferred from `import spaces`,
# the Space running on ZeroGPU, and the duration comment above; the original
# decorator line appears to have been lost in extraction.
@spaces.GPU(duration=120)
def process_audio(input_file, model, chunk_size, overlap, progress=gr.Progress()):
    """Run Apollo/inference.py on a single file, streaming its progress to the UI."""
    if not input_file:
        return "No file uploaded.", None, None, None
    input_file_path = input_file
    original_file_name = os.path.splitext(os.path.basename(input_file_path))[0]
    output_file_path = f'{output_folder}/{original_file_name}_processed.wav'
    model_paths = {
        'MP3 Enhancer': ('Apollo/model/pytorch_model.bin', 'Apollo/configs/apollo.yaml'),
        'Lew Vocal Enhancer': ('Apollo/model/apollo_model.ckpt', 'Apollo/configs/apollo.yaml'),
        'Lew Vocal Enhancer v2 (beta)': ('Apollo/model/apollo_model_v2.ckpt', 'Apollo/configs/config_apollo_vocal.yaml'),
        'Apollo Universal Model': ('Apollo/model/apollo_universal_model.ckpt', 'Apollo/configs/config_apollo.yaml')
    }
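    # Each UI model name above maps to a (checkpoint, config) pair handed to
    # Apollo/inference.py via --ckpt and --config.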
    if model not in model_paths:
        return "Invalid model selected.", None, None, None
    ckpt, config = model_paths[model]
    if not os.path.exists(ckpt) or not os.path.exists(config):
        return f"Model files not found: {ckpt} or {config}", None, None, None
    print(f"Model selected: {model}")
    print("Processing started. Please wait...")
    start_time = time.time()
    command = [
        "python", "Apollo/inference.py",
        "--in_wav", input_file_path,
        "--out_wav", output_file_path,
        "--chunk_size", str(chunk_size),
        "--overlap", str(overlap),
        "--ckpt", ckpt,
        "--config", config
    ]
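    # inference.py is expected to emit progress as JSON lines such as
    # {"percentage": 42.0, "elapsed_time": 10.5}; any non-JSON output is echoed as-is.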
    try:
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True
        )
        progress(0.0, desc="Processing started...")
        for line in process.stdout:
            try:
                data = json.loads(line.strip())
                if "percentage" in data:
                    percentage = data["percentage"]
                    elapsed_time = data["elapsed_time"]
                    if percentage > 0:
                        # Linear extrapolation: remaining time = elapsed rate * remaining percentage.
                        time_remaining = (elapsed_time / percentage) * (100 - percentage)
                        time_remaining_str = str(timedelta(seconds=int(time_remaining)))
                    else:
                        time_remaining_str = "Calculating..."
                    progress(percentage / 100, desc=f"Processing: {percentage:.1f}% | Time remaining: {time_remaining_str}")
                else:
                    print(f"Processing: {line.strip()}")
            except json.JSONDecodeError:
                print(f"Processing: {line.strip()}")
        process.stdout.close()
        process.wait()
        if process.returncode != 0:
            return f"Error processing audio: Non-zero exit code {process.returncode}.", None, None, None
        total_duration = str(timedelta(seconds=int(time.time() - start_time)))
        progress(1.0, desc=f"Processing completed. Total time: {total_duration}")
        return output_file_path, input_file_path, None, f"Processing completed. Total time: {total_duration}"
    except Exception as e:
        return f"Error in process_audio: {str(e)}", None, None, None
def mid_side_separation(audio_file):
    """Split a stereo file into mono Mid ((L+R)/2) and Side ((L-R)/2) WAV files."""
    try:
        print(f"Loading audio file: {audio_file}")
        y, sr = librosa.load(audio_file, sr=None, mono=False)
        print(f"Audio shape: {y.shape}, Sample rate: {sr}")
        if y.ndim == 1:
            raise ValueError("Stereo audio file required! Please upload a stereo .wav or .mp3 file.")
        left, right = y[0], y[1]
        print("Performing Mid/Side separation...")
        mid = (left + right) / 2
        side = (left - right) / 2
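        # The transform is lossless: (mid + side) == left and (mid - side) == right,
        # so mid_side_combine() below reconstructs the stereo image exactly (up to
        # whatever enhancement is applied to each channel in between).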
        mid_path = os.path.join(output_folder, "mid.wav")
        side_path = os.path.join(output_folder, "side.wav")
        print(f"Saving Mid to {mid_path} and Side to {side_path}")
        sf.write(mid_path, mid, sr)
        sf.write(side_path, side, sr)
        print("Mid/Side separation completed.")
        return mid_path, side_path, sr
    except Exception as e:
        print(f"Error in mid/side separation: {str(e)}")
        raise ValueError(f"Error in mid/side separation: {str(e)}")
def mid_side_combine(mid_file, side_file, output_path):
    """Recombine mono Mid/Side files into a stereo WAV: L = M + S, R = M - S."""
    try:
        print(f"Combining Mid: {mid_file} and Side: {side_file}")
        mid_data, sr_mid = librosa.load(mid_file, sr=None, mono=True)
        side_data, sr_side = librosa.load(side_file, sr=None, mono=True)
        if sr_mid != sr_side:
            raise ValueError("Mid and Side sample rates do not match!")
        # Defensive: enhanced renders can differ in length by a few samples; trim to the shorter.
        n = min(len(mid_data), len(side_data))
        mid_data, side_data = mid_data[:n], side_data[:n]
        left = mid_data + side_data
        right = mid_data - side_data
        stereo = np.stack([left, right], axis=0)
        print(f"Saving combined audio to {output_path}")
        sf.write(output_path, stereo.T, sr_mid)
        return output_path
    except Exception as e:
        print(f"Error in mid/side combination: {str(e)}")
        raise ValueError(f"Error in mid/side combination: {str(e)}")
# Increased the ZeroGPU duration from 60 to 120 seconds.
# NOTE (assumption): decorator inferred, as in process_audio above.
@spaces.GPU(duration=120)
def process_mid_side_upscale(input_file, model, chunk_size, overlap, progress=gr.Progress()):
    """Enhance a stereo file by splitting it into Mid/Side, processing each, and recombining."""
    if not input_file:
        return "No file uploaded.", None, None, None
    try:
        total_start_time = time.time()
        print(f"Starting Mid/Side upscale for: {input_file}")
        # Mid/Side separation
        print("Separating Mid and Side channels...")
        mid_path, side_path, sr = mid_side_separation(input_file)
        print(f"Mid path: {mid_path}, Side path: {side_path}, Sample rate: {sr}")
        # Process the Mid channel
        print("Processing Mid channel...")
        mid_restored, _, _, mid_status = process_audio(mid_path, model, chunk_size, overlap, progress=progress)
        if not mid_restored.endswith(".wav"):
            return f"Mid channel processing failed: {mid_status}", None, None, None
        print(f"Mid channel processed: {mid_restored}")
        # Process the Side channel
        print("Processing Side channel...")
        side_restored, _, _, side_status = process_audio(side_path, model, chunk_size, overlap, progress=progress)
        if not side_restored.endswith(".wav"):
            return f"Side channel processing failed: {side_status}", None, None, None
        print(f"Side channel processed: {side_restored}")
        # Build the output path from the original file name
        original_file_name = os.path.splitext(os.path.basename(input_file))[0]
        final_output_path = os.path.join(output_folder, f"{original_file_name}_upscaled.wav")
        # Recombine the processed Mid and Side channels
        print("Combining processed Mid and Side channels...")
        final_audio = mid_side_combine(mid_restored, side_restored, final_output_path)
        print(f"Final audio saved: {final_audio}")
        total_duration = str(timedelta(seconds=int(time.time() - total_start_time)))
        progress(1.0, desc=f"Mid/Side upscaling completed. Total time: {total_duration}")
        return final_audio, input_file, None, f"Mid/Side upscaling completed. Total time: {total_duration}"
    except Exception as e:
        error_msg = f"Error in Mid/Side upscale: {str(e)}"
        print(error_msg)
        return error_msg, None, None, None
def spectrum(audio_file):
    """Plot a spectrogram and report the highest frequency with significant energy."""
    if not audio_file:
        return None, "No file selected"
    try:
        # Analyze in 30-second chunks so long files never need a full-length STFT in memory.
        chunk_duration = 30
        hop_length = 512
        n_fft = 2048
        with sf.SoundFile(audio_file) as sf_desc:
            duration = len(sf_desc) / sf_desc.samplerate
            num_chunks = int(np.ceil(duration / chunk_duration))
            freqs = librosa.fft_frequencies(sr=sf_desc.samplerate, n_fft=n_fft)
            total_frames = int(np.ceil(duration * sf_desc.samplerate / hop_length))
        # Preallocate with a -80 dB floor rather than zeros: amplitude_to_db(ref=np.max)
        # yields values <= 0 dB, so zero-filled gaps would read as full power.
        S_db_full = np.full((len(freqs), total_frames), -80.0)
        for chunk_idx in range(num_chunks):
            start_time = chunk_idx * chunk_duration
            y, sr = librosa.load(audio_file, offset=start_time, duration=chunk_duration, sr=None)
            S_chunk = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))
            S_db_chunk = librosa.amplitude_to_db(S_chunk, ref=np.max)
            start_frame = int(start_time * sr / hop_length)
            # Clip to the preallocated width; the final chunk can overshoot by a frame.
            end_frame = min(start_frame + S_db_chunk.shape[1], total_frames)
            S_db_full[:, start_frame:end_frame] = S_db_chunk[:, :end_frame - start_frame]
            del S_chunk, S_db_chunk
            gc.collect()
        # Downsample in time, then take the highest bin that ever rises within 60 dB of the peak.
        downsample_factor = 4
        S_db_downsampled = S_db_full[:, ::downsample_factor]
        threshold = np.max(S_db_downsampled) - 60
        significant_freqs = freqs[np.any(S_db_downsampled > threshold, axis=1)]
        max_freq = np.max(significant_freqs) if len(significant_freqs) > 0 else sr / 2
        plt.figure(figsize=(15, 8))
        display_hop = 4
        librosa.display.specshow(
            S_db_full[:, ::display_hop],
            sr=sr,
            hop_length=hop_length * display_hop,
            x_axis='time',
            y_axis='hz',
            cmap='magma'
        )
        freq_ticks = [2000, 4000, 6000, 8000, 10000, 12000, 14000, 16000, 18000, 20000]
        plt.yticks(freq_ticks, [f"{f/1000:.0f}" for f in freq_ticks])
        plt.colorbar(format='%+2.0f dB')
        plt.title('Frequency Spectrum', fontsize=16)
        plt.xlabel('Time (seconds)', fontsize=12)
        plt.ylabel('Frequency (kHz)', fontsize=12)
        output_image_path = os.path.join(output_folder, 'spectrum.png')
        plt.savefig(output_image_path, bbox_inches='tight', dpi=150)
        plt.close()
        del S_db_full, S_db_downsampled
        gc.collect()
        # Snap the estimate to the nearest labeled tick for a readable summary.
        closest_freq = min(freq_ticks, key=lambda x: abs(x - max_freq))
        return output_image_path, f"Maximum Frequency {int(closest_freq)} Hz"
    except Exception as e:
        return None, f"Error: {str(e)}"
def show_credits():
    return """This Web UI was created using AI tools and written by U.Z.S.

**Apollo-Colab-Inference** (https://github.com/jarredou/Apollo-Colab-Inference):
Developed by Jarred Ou; provides a Colab-based inference implementation of the Apollo model.

**Apollo** (https://github.com/JusperLee/Apollo):
Created by Jusper Lee; a deep-learning-based model for vocal clarity and audio quality.
"""
app = gr.Blocks(
    css="""
    .gradio-container { background-color: #121212; color: white; font-family: Arial, sans-serif; }
    .gradio-button {
        background-color: #6a0dad;
        color: white;
        border: 1px solid #5a0b8a;
        border-radius: 5px;
        padding: 10px 20px;
    }
    .gradio-button:hover { background-color: #5a0b8a; }
    .gradio-input, .gradio-file {
        background-color: rgba(106, 13, 173, 0.2);
        border: 1px solid #5a0b8a;
        color: white;
        border-radius: 5px;
    }
    .gradio-input:focus, .gradio-file:focus {
        border-color: #ffffff;
        box-shadow: 0 0 5px rgba(255, 255, 255, 0.5);
    }
    .gradio-slider {
        background-color: rgba(106, 13, 173, 0.2);
        color: white;
    }
    .gradio-label { color: white; font-weight: bold; }
    .gradio-tabs { background-color: rgba(106, 13, 173, 0.2); }
    .gradio-tab { padding: 15px; }
    .model-note { color: #ff9800; font-size: 0.9em; }
    /* Hide footer elements */
    footer {display: none !important;}
    #footer {display: none !important;}
    .gradio-footer {display: none !important;}
    @media (max-width: 600px) {
        .gradio-button { width: 100%; font-size: 16px; }
        .gradio-input, .gradio-file { width: 100%; font-size: 16px; }
        .gradio-slider { width: 100%; }
        .gradio-label { font-size: 14px; }
    }
    """
)
with app:
    with gr.Tab("Audio Enhancer"):
        gr.Markdown("# 🎵 Audio Enhancement Tool")
        with gr.Row():
            with gr.Column():
                audio_input = gr.File(
                    label="Select Audio File",
                    file_types=[".wav", ".mp3"],
                    elem_classes=["gradio-file"]
                )
                model = gr.Radio(
                    ["MP3 Enhancer", "Lew Vocal Enhancer", "Lew Vocal Enhancer v2 (beta)", "Apollo Universal Model"],
                    label="Select Model",
                    value="Apollo Universal Model"
                )
                gr.Markdown("**For the Universal model, please set Chunk Size to 19**", elem_classes="model-note")
                with gr.Row():
                    chunk_size = gr.Slider(
                        minimum=3,
                        maximum=25,
                        step=1,
                        value=19,
                        label="Chunk Size",
                        interactive=True
                    )
                    overlap = gr.Slider(
                        minimum=2,
                        maximum=10,
                        step=1,
                        value=2,
                        label="Overlap",
                        interactive=True
                    )
                process_button = gr.Button("Process Audio", variant="primary")
            with gr.Column():
                output_audio = gr.Audio(label="Processed Audio")
                original_audio = gr.Audio(label="Original Audio")
                status_message = gr.Textbox(label="Status", interactive=False)
        process_button.click(
            process_audio,
            inputs=[audio_input, model, chunk_size, overlap],
            outputs=[output_audio, original_audio, status_message, status_message]
        )
with gr.Tab("Spectrum Analyzer"): | |
gr.Markdown("# 📊 Frequency Spectrum Analysis") | |
with gr.Row(): | |
with gr.Column(): | |
spectrogram_input = gr.File( | |
label="Select Audio File", | |
file_types=[".wav", ".mp3"], | |
elem_classes=["gradio-file"] | |
) | |
spectrum_button = gr.Button("Analyze Spectrum", variant="primary") | |
with gr.Column(): | |
output_spectrum = gr.Image(label="Frequency Spectrum", interactive=False) | |
max_freq_info = gr.Textbox(label="Frequency Analysis", interactive=False) | |
spectrum_button.click( | |
spectrum, | |
inputs=[spectrogram_input], | |
outputs=[output_spectrum, max_freq_info] | |
) | |
with gr.Tab("Mid/Side Processor"): | |
gr.Markdown("# 🎚️ Mid/Side Channel Processing") | |
gr.Markdown("Upload a stereo audio file to separate, enhance, and recombine its Mid and Side channels.") | |
with gr.Row(): | |
with gr.Column(): | |
ms_input = gr.File( | |
label="Select Stereo Audio File", | |
file_types=[".wav", ".mp3"], | |
elem_classes=["gradio-file"] | |
) | |
ms_model = gr.Radio( | |
["MP3 Enhancer", "Lew Vocal Enhancer", "Lew Vocal Enhancer v2 (beta)", "Apollo Universal Model"], | |
label="Select Model", | |
value="Apollo Universal Model" | |
) | |
with gr.Row(): | |
ms_chunk_size = gr.Slider( | |
minimum=3, | |
maximum=25, | |
step=1, | |
value=19, | |
label="Chunk Size" | |
) | |
ms_overlap = gr.Slider( | |
minimum=2, | |
maximum=10, | |
step=1, | |
value=2, | |
label="Overlap" | |
) | |
ms_process_button = gr.Button("Process Mid/Side", variant="primary") | |
with gr.Column(): | |
ms_output = gr.Audio(label="Processed Audio") | |
ms_original = gr.Audio(label="Original Audio") | |
ms_status_message = gr.Textbox(label="Status", interactive=False) | |
ms_process_button.click( | |
process_mid_side_upscale, | |
inputs=[ms_input, ms_model, ms_chunk_size, ms_overlap], | |
outputs=[ms_output, ms_original, ms_status_message, ms_status_message] | |
) | |
with gr.Tab("About"): | |
gr.Markdown("## ℹ️ About This Tool") | |
gr.Markdown(show_credits()) | |
gr.Markdown("### 🚀 Features") | |
gr.Markdown(""" | |
- High-quality audio enhancement using Apollo models | |
- Frequency spectrum visualization | |
- Advanced Mid/Side channel processing | |
- GPU-accelerated processing | |
""") | |
gr.Markdown("<div class='footer'>Developed by U.Z.S using AI tools</div>") | |
if __name__ == "__main__": | |
app.launch( | |
server_name="0.0.0.0", | |
server_port=7860, | |
show_api=False, | |
) |