Spaces:
Runtime error
Runtime error
File size: 2,550 Bytes
9471255 bd89cf2 fa15dd8 bd89cf2 fa15dd8 bd89cf2 fa15dd8 bd89cf2 fa15dd8 bd89cf2 fa15dd8 bd89cf2 fa15dd8 bd89cf2 fa15dd8 bd89cf2 9471255 bd89cf2 fa15dd8 bd89cf2 9471255 bd89cf2 9471255 bd89cf2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
import gradio as gr
import tempfile
import requests
from moviepy.editor import VideoFileClip
import torchaudio
import torch
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, pipeline
from transformers import WhisperProcessor, WhisperForConditionalGeneration
# Load Models (done once at import time; first run downloads weights from the HF Hub)
# Fine-tuned wav2vec2 classifier head used to predict the accent label from raw audio.
accent_model = Wav2Vec2ForSequenceClassification.from_pretrained("sreyan88/wav2vec2-large-xlsr-53-english-accent")
# Matching feature extractor: converts a waveform into the padded tensor inputs the model expects.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("sreyan88/wav2vec2-large-xlsr-53-english-accent")
# Whisper ASR pipeline used only for transcription; independent of the accent classifier.
whisper = pipeline("automatic-speech-recognition", model="openai/whisper-small")
# Helpers
def download_and_extract_audio(url):
    """Download a video from a public URL and extract its audio track to a WAV file.

    Args:
        url: Direct link to a video file (e.g. an MP4 or Loom download URL).

    Returns:
        Path to the extracted ``.wav`` file (sits next to the temp video file).

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status.
        ValueError: if the downloaded video has no audio track.
    """
    # Timeout prevents the request from hanging forever on a dead host;
    # raise_for_status stops us from saving an HTML error page as ".mp4".
    r = requests.get(url, stream=True, timeout=30)
    r.raise_for_status()
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as video_file:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                video_file.write(chunk)
        video_path = video_file.name
    clip = VideoFileClip(video_path)
    try:
        if clip.audio is None:
            # Fail with a clear message instead of an AttributeError below.
            raise ValueError("Downloaded video has no audio track")
        audio_path = video_path.replace(".mp4", ".wav")
        clip.audio.write_audiofile(audio_path)
    finally:
        # Release moviepy's ffmpeg reader; otherwise the subprocess/file handles leak.
        clip.close()
    return audio_path
def classify_accent(audio_path):
    """Predict the English accent spoken in an audio file.

    Args:
        audio_path: Path to an audio file readable by torchaudio (e.g. WAV).

    Returns:
        Tuple of (accent label, confidence as a percentage rounded to 2 decimals).
    """
    waveform, sample_rate = torchaudio.load(audio_path)
    # wav2vec2 XLSR models are trained on 16 kHz audio; video soundtracks are
    # typically 44.1/48 kHz, so resample before feature extraction.
    target_sr = 16000
    if sample_rate != target_sr:
        waveform = torchaudio.functional.resample(waveform, sample_rate, target_sr)
        sample_rate = target_sr
    # waveform[0]: use the first channel only (mono input for the extractor).
    inputs = feature_extractor(waveform[0], sampling_rate=sample_rate, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = accent_model(**inputs).logits
    predicted_class = torch.argmax(logits, dim=1).item()
    confidence = torch.softmax(logits, dim=1)[0, predicted_class].item()
    # NOTE(review): label order assumed to match the model's class indices — verify
    # against the model card's id2label mapping.
    labels = ["american", "australian", "british", "indian", "canadian"]
    return labels[predicted_class], round(confidence * 100, 2)
def full_pipeline(url):
    """End-to-end flow: download the video, classify the accent, transcribe the speech.

    Args:
        url: Public URL of the video to analyze.

    Returns:
        Tuple of (transcription text, capitalized accent label,
        confidence percentage, markdown summary sentence).
    """
    wav_path = download_and_extract_audio(url)
    accent_label, score = classify_accent(wav_path)
    text = whisper(wav_path)["text"]
    pretty_label = accent_label.capitalize()
    summary = (
        f"The speaker’s accent is most likely **{pretty_label}** "
        f"with a confidence score of **{score}%**."
    )
    return text, pretty_label, score, summary
# Gradio Interface
# Build the UI as a named app object, then start the server.
demo = gr.Interface(
    fn=full_pipeline,
    inputs=gr.Textbox(label="Public Video URL (MP4 or Loom)"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="Accent"),
        gr.Number(label="Confidence Score (%)"),
        gr.Textbox(label="Summary"),
    ],
    title="Accent Classifier for English Speakers",
    description="Paste a video URL to detect English accent and transcription. Built with Wav2Vec2 and Whisper on Hugging Face 🤗.",
)
demo.launch()
|