Spaces:
Runtime error
Runtime error
""" | |
Wav2Vec2 XLS-R 1B Portuguese - Hugging Face Space | |
""" | |
import gradio as gr | |
import torch | |
import librosa | |
import numpy as np | |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor | |
import warnings | |
# Silence every Python warning for the whole process.
warnings.filterwarnings("ignore")

# One-time start-up work: pick the compute device, then load the processor
# and the CTC model once so that every request reuses the same objects.
model_name = "jonatasgrosman/wav2vec2-xls-r-1b-portuguese"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print(f"Loading model {model_name}...")
processor = Wav2Vec2Processor.from_pretrained(model_name)
# .to() and .eval() both return the module itself, so the calls can be chained.
model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device).eval()
print(f"Model loaded on device: {device}")
def transcribe_audio(audio_path):
    """Transcribe an audio file with the module-level Wav2Vec2 CTC model.

    The file is loaded with librosa as mono audio resampled to 16 kHz, passed
    through ``processor`` and ``model``, and decoded greedily (arg-max token
    per frame).

    Args:
        audio_path: Path of the audio file, handed to ``librosa.load``.

    Returns:
        A ``(transcription, confidence)`` tuple. ``confidence`` is the mean,
        over all output frames, of the highest softmax probability in each
        frame, so it lies in [0, 1]. If anything raises, the function does
        not propagate the exception and returns ``("Error: <message>", 0.0)``
        instead.
    """
    # Single source of truth for the rate used by both the loader and the
    # processor, so the two cannot drift apart.
    target_sr = 16000
    try:
        # Load and preprocess audio (the returned rate equals target_sr).
        speech_array, _ = librosa.load(audio_path, sr=target_sr, mono=True)

        # Turn the waveform into model inputs and move them to the device.
        inputs = processor(
            speech_array,
            sampling_rate=target_sr,
            return_tensors="pt",
            padding=True,
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            logits = model(**inputs).logits

        # Greedy CTC decoding of the single item in the batch.
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0])

        # Average the per-frame top probability. A single global maximum over
        # every frame and token is ~1.0 as soon as one frame is confident and
        # therefore carries no information about the utterance as a whole.
        frame_top_probs = torch.softmax(logits, dim=-1).max(dim=-1).values
        confidence = frame_top_probs[0].mean().item()

        return transcription, confidence
    except Exception as e:
        # Best-effort boundary for the UI: report the failure as text rather
        # than raising.
        return f"Error: {str(e)}", 0.0
def process_audio(audio):
    """Turn the audio value received from Gradio into the two UI outputs.

    Returns a ``(markdown_summary, plain_transcription)`` pair. When ``audio``
    is ``None`` the summary is a prompt asking for a file and the plain
    transcription is an empty string.
    """
    if audio is None:
        return "Please provide an audio file.", ""

    text, score = transcribe_audio(audio)

    # Markdown summary: transcription and confidence separated by a blank line.
    summary = "\n\n".join(
        (
            f"**Transcription:** {text}",
            f"**Confidence:** {score:.2%}",
        )
    )
    return summary, text
# Build the Gradio interface: audio input and a button on the left, the
# formatted result and the raw transcription on the right.
with gr.Blocks(title="Wav2Vec2 XLS-R 1B Portuguese") as demo:
    # The heading emoji is written as an escape sequence because the literal
    # glyph had been corrupted by an encoding round-trip.
    # NOTE(review): the studio-microphone emoji (U+1F399 + U+FE0F) is assumed
    # to be the original glyph - confirm.
    gr.Markdown("# \U0001F399\uFE0F Wav2Vec2 XLS-R 1B - Portuguese ASR")
    gr.Markdown("Speech recognition for Portuguese using jonatasgrosman/wav2vec2-xls-r-1b-portuguese")

    with gr.Row():
        with gr.Column():
            # type="filepath" makes Gradio pass a file path to the handler.
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Audio Input",
            )
            submit_btn = gr.Button("Transcribe", variant="primary")

        with gr.Column():
            output_text = gr.Markdown(label="Results")
            transcription_output = gr.Textbox(
                label="Transcription Text",
                lines=3,
                interactive=False,
            )

    # process_audio returns (markdown summary, plain text), in the same order
    # as the two output components listed here.
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[output_text, transcription_output],
    )

    # No examples section: a previous one was removed because it raised
    # FileNotFoundError.

# Launch the app. server_name and server_port are deliberately not passed so
# that Hugging Face Spaces can handle the server configuration.
if __name__ == "__main__":
    demo.launch()