EdgarDataScientist's picture
Update app.py
bd89cf2 verified
raw
history blame
2.55 kB
import gradio as gr
import tempfile
import requests
from moviepy.editor import VideoFileClip
import torchaudio
import torch
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, pipeline
from transformers import WhisperProcessor, WhisperForConditionalGeneration
# Load Models
# The accent classifier and its feature extractor are loaded from the same
# checkpoint id, so the id is spelled once and shared.
ACCENT_CHECKPOINT = "sreyan88/wav2vec2-large-xlsr-53-english-accent"

accent_model = Wav2Vec2ForSequenceClassification.from_pretrained(ACCENT_CHECKPOINT)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(ACCENT_CHECKPOINT)

# Speech-to-text pipeline used to transcribe the extracted audio.
whisper = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
)
# Helpers
def download_and_extract_audio(url):
    """Download a video from ``url`` and write its audio track to a WAV file.

    The video is streamed into a temporary ``.mp4`` file, the audio track is
    written next to it with a ``.wav`` suffix, and the temporary video file is
    removed again before returning.

    Args:
        url: HTTP(S) address of the video to download.

    Returns:
        Path of the WAV file that was written. The caller owns this file and
        is responsible for deleting it.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an error status.
        ValueError: If the downloaded video has no audio track.
    """
    import os  # function-scope import keeps this block self-contained

    video_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    video_path = video_file.name
    try:
        # Closing both the temp file and the HTTP response is handled by the
        # ``with`` statement, including when the request itself raises.
        with video_file, requests.get(url, stream=True, timeout=30) as response:
            # Fail early instead of saving an HTML error page as "video".
            response.raise_for_status()
            # 64 KiB chunks avoid the per-call overhead of 1 KiB writes.
            for chunk in response.iter_content(chunk_size=64 * 1024):
                if chunk:
                    video_file.write(chunk)

        # Swap only the file extension; a plain str.replace would also touch
        # any ".mp4" occurring earlier in the path.
        audio_path = os.path.splitext(video_path)[0] + ".wav"

        clip = VideoFileClip(video_path)
        try:
            if clip.audio is None:
                raise ValueError("The downloaded video has no audio track.")
            clip.audio.write_audiofile(audio_path)
        finally:
            # Release the reader held by the clip.
            clip.close()
    finally:
        # Best-effort cleanup of the temporary video; never mask the real error.
        try:
            os.remove(video_path)
        except OSError:
            pass
    return audio_path
def classify_accent(audio_path):
    """Predict the English accent spoken in an audio file.

    Args:
        audio_path: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        A ``(label, confidence)`` tuple: the predicted accent name and the
        softmax probability of that class expressed as a percentage rounded
        to two decimals.
    """
    # Load and process audio
    waveform, sample_rate = torchaudio.load(audio_path)

    # The feature extractor is configured for one specific sampling rate;
    # audio extracted from arbitrary videos usually has a different one, so
    # resample before extracting features instead of passing the file's rate.
    target_rate = feature_extractor.sampling_rate
    if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)

    inputs = feature_extractor(
        waveform[0].numpy(),  # first channel only, as a numpy array
        sampling_rate=target_rate,
        return_tensors="pt",
        padding=True,
    )

    with torch.no_grad():
        logits = accent_model(**inputs).logits

    predicted_class = torch.argmax(logits, dim=1).item()
    probabilities = torch.softmax(logits, dim=1)
    confidence = probabilities[0, predicted_class].item()

    # NOTE(review): this order is assumed to match the model's class indices;
    # confirm against the checkpoint's label mapping.
    labels = ["american", "australian", "british", "indian", "canadian"]
    return labels[predicted_class], round(confidence * 100, 2)
def full_pipeline(url):
    """Run download, accent classification and transcription for one video.

    Args:
        url: Address of the video to analyse. Surrounding whitespace is
            ignored.

    Returns:
        A ``(transcription, accent, confidence, summary)`` tuple, in the order
        of the interface's output components: the transcribed text, the
        capitalized accent label, the confidence percentage, and a
        Markdown-formatted one-sentence summary.

    Raises:
        ValueError: If ``url`` is empty or blank.
    """
    import os  # function-scope import keeps this block self-contained

    url = (url or "").strip()
    if not url:
        raise ValueError("Please provide a video URL.")

    audio_path = download_and_extract_audio(url)
    try:
        accent, confidence = classify_accent(audio_path)
        transcription = whisper(audio_path)["text"]
    finally:
        # The WAV file is only needed for the two calls above; remove it so
        # repeated requests do not pile up temp files. Best-effort only.
        try:
            os.remove(audio_path)
        except OSError:
            pass

    accent_name = accent.capitalize()
    summary = f"The speaker’s accent is most likely **{accent_name}** with a confidence score of **{confidence}%**."
    return transcription, accent_name, confidence, summary
# Gradio Interface
# One output component per element of the tuple returned by full_pipeline.
output_components = [
    gr.Textbox(label="Transcription"),
    gr.Textbox(label="Accent"),
    gr.Number(label="Confidence Score (%)"),
    gr.Textbox(label="Summary"),
]

demo = gr.Interface(
    fn=full_pipeline,
    inputs=gr.Textbox(label="Public Video URL (MP4 or Loom)"),
    outputs=output_components,
    title="Accent Classifier for English Speakers",
    description="Paste a video URL to detect English accent and transcription. Built with Wav2Vec2 and Whisper on Hugging Face 🤗.",
)

# Start the web app as soon as the module is executed.
demo.launch()