"""Gradio app: download a video, classify the speaker's English accent, transcribe it.

The module loads the accent classifier and the Whisper ASR pipeline at import
time and launches the Gradio interface at the bottom of the file.
"""

import contextlib
import os
import tempfile
from typing import Tuple

import gradio as gr
import requests
import torch
import torchaudio
from moviepy.editor import VideoFileClip
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, pipeline
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load Models
ACCENT_MODEL_ID = "sreyan88/wav2vec2-large-xlsr-53-english-accent"
accent_model = Wav2Vec2ForSequenceClassification.from_pretrained(ACCENT_MODEL_ID)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(ACCENT_MODEL_ID)
whisper = pipeline("automatic-speech-recognition", model="openai/whisper-small")

# Class-index -> accent name; must match the order of the model's output classes.
ACCENT_LABELS = ["american", "australian", "british", "indian", "canadian"]

# (connect, read) timeouts in seconds so a dead host cannot hang a request forever.
_DOWNLOAD_TIMEOUT = (10, 60)
_DOWNLOAD_CHUNK_SIZE = 64 * 1024


# Helpers
def _remove_file(path: str) -> None:
    """Delete *path*, ignoring failures (best-effort temp-file cleanup)."""
    with contextlib.suppress(OSError):
        os.remove(path)


def _label_for(class_index: int) -> str:
    """Return the accent name for *class_index*.

    Falls back to the model config's ``id2label`` mapping (or a generic
    ``class_<n>`` name) when the index is outside ``ACCENT_LABELS``, instead of
    raising ``IndexError``.
    """
    if 0 <= class_index < len(ACCENT_LABELS):
        return ACCENT_LABELS[class_index]
    id2label = getattr(accent_model.config, "id2label", None) or {}
    return str(id2label.get(class_index, f"class_{class_index}"))


def download_and_extract_audio(url: str) -> str:
    """Download the video at *url* and extract its audio track to a WAV file.

    The response body is written verbatim to a temporary ``.mp4`` file, so
    *url* must resolve directly to a video file that MoviePy can open.

    Args:
        url: Publicly reachable URL of the video.

    Returns:
        Path of the temporary ``.wav`` file. The caller owns this file and is
        responsible for deleting it.

    Raises:
        requests.RequestException: The download failed or returned an HTTP
            error status.
        ValueError: The video contains no audio track.
    """
    # Reserve a unique temp path first so cleanup works on every failure path.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as video_file:
        video_path = video_file.name
    # splitext only touches the extension; str.replace would rewrite any
    # ".mp4" occurring elsewhere in the path.
    audio_path = os.path.splitext(video_path)[0] + ".wav"

    try:
        with requests.get(url, stream=True, timeout=_DOWNLOAD_TIMEOUT) as response:
            # Without this an HTML error page would be saved as "video".
            response.raise_for_status()
            with open(video_path, "wb") as video_file:
                for chunk in response.iter_content(chunk_size=_DOWNLOAD_CHUNK_SIZE):
                    if chunk:
                        video_file.write(chunk)

        clip = VideoFileClip(video_path)
        try:
            if clip.audio is None:
                raise ValueError("The downloaded video has no audio track.")
            clip.audio.write_audiofile(audio_path)
        finally:
            # Release the reader before the video file is removed below.
            clip.close()
    except Exception:
        # Do not leave a partial WAV behind when extraction fails.
        _remove_file(audio_path)
        raise
    finally:
        # The video is only an intermediate artifact.
        _remove_file(video_path)

    return audio_path


def classify_accent(audio_path: str) -> Tuple[str, float]:
    """Classify the accent of the speech stored in *audio_path*.

    Args:
        audio_path: Path of an audio file readable by ``torchaudio.load``.

    Returns:
        ``(label, confidence)`` where *label* is the predicted accent name and
        *confidence* is the softmax probability of that class as a percentage
        rounded to two decimals.
    """
    # Load and process audio
    waveform, sample_rate = torchaudio.load(audio_path)
    speech = waveform[0]  # first channel only

    # The feature extractor rejects audio whose sampling rate differs from the
    # one it was configured with, so resample before calling it.
    target_rate = feature_extractor.sampling_rate
    if sample_rate != target_rate:
        speech = torchaudio.functional.resample(speech, sample_rate, target_rate)

    inputs = feature_extractor(
        speech.numpy(),
        sampling_rate=target_rate,
        return_tensors="pt",
        padding=True,
    )
    with torch.no_grad():
        logits = accent_model(**inputs).logits

    probabilities = torch.softmax(logits, dim=1)
    predicted_class = torch.argmax(probabilities, dim=1).item()
    confidence = probabilities[0, predicted_class].item()
    return _label_for(predicted_class), round(confidence * 100, 2)


def full_pipeline(url: str):
    """Run download, accent classification and transcription for *url*.

    Args:
        url: Publicly reachable URL of the video to analyse.

    Returns:
        ``(transcription, accent, confidence, summary)`` in the order expected
        by the Gradio output components below.

    Raises:
        ValueError: *url* is empty, or the video has no audio track.
        requests.RequestException: The download failed.
    """
    url = (url or "").strip()
    if not url:
        raise ValueError("Please provide a public video URL.")

    audio_path = download_and_extract_audio(url)
    try:
        accent, confidence = classify_accent(audio_path)
        transcription = whisper(audio_path)["text"]
    finally:
        # The WAV is only needed for the two model calls above.
        _remove_file(audio_path)

    accent_name = accent.capitalize()
    summary = f"The speaker’s accent is most likely **{accent_name}** with a confidence score of **{confidence}%**."
    return transcription, accent_name, confidence, summary


# Gradio Interface
gr.Interface(
    fn=full_pipeline,
    inputs=gr.Textbox(label="Public Video URL (MP4 or Loom)"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="Accent"),
        gr.Number(label="Confidence Score (%)"),
        gr.Textbox(label="Summary"),
    ],
    title="Accent Classifier for English Speakers",
    description="Paste a video URL to detect English accent and transcription. Built with Wav2Vec2 and Whisper on Hugging Face 🤗."
).launch()