import gradio as gr
import tempfile
import requests
from moviepy.editor import VideoFileClip
import torchaudio
import torch
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, pipeline

# Load models: a wav2vec2 accent classifier and Whisper for transcription
accent_model = Wav2Vec2ForSequenceClassification.from_pretrained("sreyan88/wav2vec2-large-xlsr-53-english-accent")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("sreyan88/wav2vec2-large-xlsr-53-english-accent")

# chunk_length_s lets the pipeline transcribe clips longer than Whisper's 30-second window
whisper = pipeline("automatic-speech-recognition", model="openai/whisper-small", chunk_length_s=30)

# Helpers
def download_and_extract_audio(url):
    """Download a video from a public URL and write its audio track to a WAV file."""
    r = requests.get(url, stream=True, timeout=60)
    r.raise_for_status()  # fail early on 4xx/5xx instead of writing an error page to disk
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as video_file:
        for chunk in r.iter_content(chunk_size=8192):
            if chunk:
                video_file.write(chunk)
        video_path = video_file.name
    clip = VideoFileClip(video_path)
    audio_path = video_path.replace(".mp4", ".wav")
    clip.audio.write_audiofile(audio_path)
    clip.close()  # release the file handle held by moviepy
    return audio_path

def classify_accent(audio_path):
    """Classify the speaker's English accent from a WAV file."""
    waveform, sample_rate = torchaudio.load(audio_path)
    # Downmix to mono and resample to the 16 kHz rate wav2vec2 expects;
    # the feature extractor raises an error if the sampling rate doesn't match.
    waveform = waveform.mean(dim=0)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
        sample_rate = 16000
    inputs = feature_extractor(waveform, sampling_rate=sample_rate, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = accent_model(**inputs).logits
    predicted_class = torch.argmax(logits, dim=-1).item()
    confidence = torch.softmax(logits, dim=-1)[0, predicted_class].item()
    # Read label names from the model config rather than hard-coding them,
    # so the mapping can't drift out of sync with the checkpoint's class order.
    label = accent_model.config.id2label[predicted_class]
    return label, round(confidence * 100, 2)

def full_pipeline(url):
    audio_path = download_and_extract_audio(url)
    accent, confidence = classify_accent(audio_path)
    transcription = whisper(audio_path)["text"]
    summary = f"The speaker’s accent is most likely **{accent.capitalize()}** with a confidence score of **{confidence}%**."
    return transcription, accent.capitalize(), confidence, summary
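
# Quick smoke test for the pipeline outside Gradio (the URL below is a
# placeholder, not a real asset):
#   transcription, accent, confidence, summary = full_pipeline("https://example.com/video.mp4")
#   print(summary)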

# Gradio Interface
gr.Interface(
    fn=full_pipeline,
    inputs=gr.Textbox(label="Public Video URL (MP4 or Loom)"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="Accent"),
        gr.Number(label="Confidence Score (%)"),
        gr.Textbox(label="Summary"),
    ],
    title="Accent Classifier for English Speakers",
    description="Paste a video URL to detect English accent and transcription. Built with Wav2Vec2 and Whisper on Hugging Face 🤗."
).launch()
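
# A minimal sketch of the dependencies, inferred from the imports above
# (package names are assumptions; pin versions as needed). Note that the
# `moviepy.editor` import requires moviepy 1.x:
#   pip install gradio requests "moviepy<2.0" torchaudio torch transformers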