# Hugging Face Space (status when this page was captured: Runtime error)
import os
import tempfile

import gradio as gr
import requests
import torch
import torchaudio
from moviepy.editor import VideoFileClip
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, pipeline
from transformers import WhisperProcessor, WhisperForConditionalGeneration
# Load models once at import time so every request reuses the same weights.
ACCENT_MODEL_ID = "sreyan88/wav2vec2-large-xlsr-53-english-accent"
accent_model = Wav2Vec2ForSequenceClassification.from_pretrained(ACCENT_MODEL_ID)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(ACCENT_MODEL_ID)
# Whisper operates on 30-second windows; enabling chunking lets the pipeline
# transcribe recordings that are longer than a single window.
whisper = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    chunk_length_s=30,
)
# Helpers
def download_and_extract_audio(url):
    """Download the video at *url* and extract its audio track to a WAV file.

    The video is streamed into a temporary ``.mp4`` file, its audio is written
    next to it with a ``.wav`` extension, and the temporary video is removed
    before returning.

    Args:
        url: Direct URL of the video file to download.

    Returns:
        Path of the extracted WAV file. The caller owns this file and is
        responsible for deleting it.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
        ValueError: If the downloaded video has no audio track.
    """
    video_path = None
    try:
        # A timeout keeps a stalled server from hanging the request forever;
        # the context manager releases the connection when we are done.
        with requests.get(url, stream=True, timeout=30) as response:
            # Fail early instead of saving an HTML error page as "video".
            response.raise_for_status()
            with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as video_file:
                video_path = video_file.name
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        video_file.write(chunk)

        # Swap only the extension; str.replace could also rewrite a ".mp4"
        # occurring elsewhere in the path.
        audio_path = os.path.splitext(video_path)[0] + ".wav"

        clip = VideoFileClip(video_path)
        try:
            if clip.audio is None:
                raise ValueError("The downloaded video has no audio track.")
            clip.audio.write_audiofile(audio_path)
        finally:
            # Release the reader held open on the video file.
            clip.close()
        return audio_path
    finally:
        # The video is only an intermediate artifact; do not leave it behind.
        if video_path is not None and os.path.exists(video_path):
            os.remove(video_path)
def classify_accent(audio_path):
    """Predict the English accent spoken in an audio file.

    Args:
        audio_path: Path of an audio file readable by ``torchaudio.load``.

    Returns:
        A ``(label, confidence)`` tuple: the predicted accent name and the
        softmax probability of that class as a percentage rounded to two
        decimals.
    """
    # Load and process audio
    waveform, sample_rate = torchaudio.load(audio_path)

    # The feature extractor rejects audio whose sampling rate differs from the
    # one it was configured with, so resample before handing the audio over.
    target_rate = feature_extractor.sampling_rate
    if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)
        sample_rate = target_rate

    # Only the first channel is classified; pass it as a NumPy array, which is
    # the feature extractor's documented input type.
    inputs = feature_extractor(
        waveform[0].numpy(),
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding=True,
    )
    with torch.no_grad():
        logits = accent_model(**inputs).logits

    predicted_class = torch.argmax(logits, dim=1).item()
    confidence = torch.softmax(logits, dim=1)[0, predicted_class].item()

    # NOTE(review): this order is assumed to match the model's class indices;
    # verify against the model's own label mapping.
    labels = ["american", "australian", "british", "indian", "canadian"]  # match model classes
    if predicted_class < len(labels):
        label = labels[predicted_class]
    else:
        # The model produced a class beyond the hard-coded list; fall back to
        # the name stored in the model config rather than raising IndexError.
        label = str(accent_model.config.id2label.get(predicted_class, predicted_class))
    return label, round(confidence * 100, 2)
def full_pipeline(url):
    """Run download, accent classification and transcription for one video.

    Args:
        url: Public URL of the video to analyse.

    Returns:
        A ``(transcription, accent, confidence, summary)`` tuple, where
        *accent* is the capitalized accent label, *confidence* is a
        percentage, and *summary* is a Markdown-formatted sentence combining
        the two.
    """
    audio_path = download_and_extract_audio(url)
    try:
        accent, confidence = classify_accent(audio_path)
        transcription = whisper(audio_path)["text"]
    finally:
        # The extracted audio is not removed anywhere else, so delete it here
        # to keep temporary files from piling up across requests.
        if os.path.exists(audio_path):
            os.remove(audio_path)

    accent_label = accent.capitalize()
    summary = f"The speaker’s accent is most likely **{accent_label}** with a confidence score of **{confidence}%**."
    return transcription, accent_label, confidence, summary
# Gradio Interface
demo = gr.Interface(
    fn=full_pipeline,
    inputs=gr.Textbox(label="Public Video URL (MP4 or Loom)"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="Accent"),
        gr.Number(label="Confidence Score (%)"),
        # The summary text contains Markdown bold markers (**...**); a Markdown
        # component renders them, whereas a Textbox shows the asterisks as-is.
        gr.Markdown(label="Summary"),
    ],
    title="Accent Classifier for English Speakers",
    description="Paste a video URL to detect English accent and transcription. Built with Wav2Vec2 and Whisper on Hugging Face 🤗.",
)
demo.launch()