import gradio as gr
import tempfile
import requests
from moviepy.editor import VideoFileClip
import torchaudio
import torch
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, pipeline

# Load models: a wav2vec2 accent classifier and Whisper for transcription
accent_model = Wav2Vec2ForSequenceClassification.from_pretrained("sreyan88/wav2vec2-large-xlsr-53-english-accent")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("sreyan88/wav2vec2-large-xlsr-53-english-accent")

# chunk_length_s lets the pipeline transcribe clips longer than Whisper's 30-second window
whisper = pipeline("automatic-speech-recognition", model="openai/whisper-small", chunk_length_s=30)

# Helpers
def download_and_extract_audio(url):
    """Download a video from a public URL and write its audio track to a WAV file."""
    r = requests.get(url, stream=True, timeout=60)
    r.raise_for_status()  # fail early on 4xx/5xx instead of writing an error page to disk
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as video_file:
        for chunk in r.iter_content(chunk_size=8192):
            if chunk:
                video_file.write(chunk)
        video_path = video_file.name
    clip = VideoFileClip(video_path)
    audio_path = video_path.replace(".mp4", ".wav")
    clip.audio.write_audiofile(audio_path)
    clip.close()  # release the file handle held by moviepy
    return audio_path

def classify_accent(audio_path):
    """Classify the speaker's English accent from a WAV file."""
    waveform, sample_rate = torchaudio.load(audio_path)
    # Downmix to mono and resample to the 16 kHz rate wav2vec2 expects;
    # the feature extractor raises an error if the sampling rate doesn't match.
    waveform = waveform.mean(dim=0)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
        sample_rate = 16000
    inputs = feature_extractor(waveform, sampling_rate=sample_rate, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = accent_model(**inputs).logits
    predicted_class = torch.argmax(logits, dim=-1).item()
    confidence = torch.softmax(logits, dim=-1)[0, predicted_class].item()
    # Read label names from the model config rather than hard-coding them,
    # so the mapping can't drift out of sync with the checkpoint's class order.
    label = accent_model.config.id2label[predicted_class]
    return label, round(confidence * 100, 2)

def full_pipeline(url):
    audio_path = download_and_extract_audio(url)
    accent, confidence = classify_accent(audio_path)
    transcription = whisper(audio_path)["text"]
    summary = f"The speaker’s accent is most likely **{accent.capitalize()}** with a confidence score of **{confidence}%**."
    return transcription, accent.capitalize(), confidence, summary
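
# Quick smoke test for the pipeline outside Gradio (the URL below is a
# placeholder, not a real asset):
#   transcription, accent, confidence, summary = full_pipeline("https://example.com/video.mp4")
#   print(summary)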

# Gradio Interface
gr.Interface(
    fn=full_pipeline,
    inputs=gr.Textbox(label="Public Video URL (MP4 or Loom)"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="Accent"),
        gr.Number(label="Confidence Score (%)"),
        gr.Textbox(label="Summary"),
    ],
    title="Accent Classifier for English Speakers",
    description="Paste a video URL to detect English accent and transcription. Built with Wav2Vec2 and Whisper on Hugging Face 🤗."
).launch()
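
# A minimal sketch of the dependencies, inferred from the imports above
# (package names are assumptions; pin versions as needed). Note that the
# `moviepy.editor` import requires moviepy 1.x:
#   pip install gradio requests "moviepy<2.0" torchaudio torch transformers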