Spaces:

Sajidahamed
/

AccentClassification

Sleeping

File size: 2,535 Bytes

7cebfba
98c4440
 
 
 
b883875
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6d3380
7cebfba
 
b883875
7cebfba
b883875
7cebfba
 
 
 
 
 
 
 
 
 
 
e6d3380
 
 
 
7cebfba
 
 
 
b883875
 
 
 
7cebfba
 
b883875
7cebfba
98c4440
7cebfba

import os
import shutil
import subprocess
import tempfile
from urllib.parse import urlparse

import gradio as gr
import torchaudio
from speechbrain.pretrained import EncoderClassifier

# Lazily-created classifier shared by all requests so the checkpoint is
# loaded once per process instead of once per call.
_ACCENT_MODEL = None


def _get_accent_model():
    """Return the shared SpeechBrain classifier, loading it on first use."""
    global _ACCENT_MODEL
    if _ACCENT_MODEL is None:
        # NOTE(review): the checkpoint name ("lang-id-commonlanguage") suggests
        # spoken-language identification rather than English-accent
        # classification -- confirm this is the intended model.
        _ACCENT_MODEL = EncoderClassifier.from_hparams(
            source="speechbrain/lang-id-commonlanguage_ecapa",
            savedir="tmp_accent_model"
        )
    return _ACCENT_MODEL


def _resolve_upload(video_file, workdir):
    """Return a filesystem path for the uploaded video.

    Accepts a path (``str`` / ``os.PathLike``), raw ``bytes``, an object whose
    ``name`` attribute points at an existing file, or a readable binary
    file-like object. Data that is not already on disk is written into
    *workdir*.
    """
    if isinstance(video_file, (str, os.PathLike)):
        return os.fspath(video_file)

    saved_path = os.path.join(workdir, "uploaded_input.mp4")
    if isinstance(video_file, (bytes, bytearray)):
        with open(saved_path, "wb") as out:
            out.write(video_file)
        return saved_path

    named_path = getattr(video_file, "name", None)
    if isinstance(named_path, str) and os.path.isfile(named_path):
        return named_path

    with open(saved_path, "wb") as out:
        shutil.copyfileobj(video_file, out)
    return saved_path


def _download_video(video_link, dest_path):
    """Download *video_link* to *dest_path*; return True if a non-empty file exists.

    The link is passed to the downloader as a single argv entry (no shell), and
    anything that is not a plain http(s) URL is rejected before a process is
    spawned.
    """
    link = video_link.strip()
    # Whitespace never appears in a valid URL, and a leading "-" would be
    # parsed as a command-line option by the downloader.
    if not link or link.startswith("-") or any(ch.isspace() for ch in link):
        return False
    if "://" not in link:
        # Scheme-less links were previously accepted by the downloaders;
        # default to http, which is what wget assumes.
        link = "http://" + link
    parsed = urlparse(link)
    if parsed.scheme not in ("http", "https") or not parsed.netloc:
        return False

    if "youtube.com" in link or "youtu.be" in link:
        command = ["yt-dlp", "-o", dest_path, link]
    else:
        command = ["wget", "-O", dest_path, link]
    try:
        subprocess.run(command, check=False)
    except OSError:
        # Downloader binary missing or not executable.
        return False
    return os.path.exists(dest_path) and os.path.getsize(dest_path) > 0


def _extract_audio(video_path, audio_path):
    """Extract 16 kHz mono audio with ffmpeg; return True on a plausible result."""
    command = [
        "ffmpeg", "-y", "-i", video_path,
        "-ar", "16000", "-ac", "1", "-vn", audio_path,
    ]
    try:
        subprocess.run(command, check=False)
    except OSError:
        # ffmpeg missing or not executable.
        return False
    # Outputs under 1000 bytes are treated as a failed extraction.
    return os.path.exists(audio_path) and os.path.getsize(audio_path) >= 1000


def accent_detect(video_link, video_file):
    """Classify the speech in a video and describe the prediction.

    An uploaded file takes precedence over the link. All intermediate files
    live in a per-call temporary directory, so a failed download can never
    fall back to a file left by an earlier request.

    Args:
        video_link: URL of a YouTube video or a directly downloadable video.
            Ignored when ``video_file`` is given; must be longer than 8
            characters after stripping to be considered.
        video_file: Uploaded video as delivered by the UI (path, bytes, or
            file-like object), or ``None``.

    Returns:
        A human-readable string: either the predicted label with its score, or
        an error message explaining which step failed.
    """
    has_upload = video_file is not None
    link = video_link.strip() if isinstance(video_link, str) else ""
    if not has_upload and len(link) <= 8:
        return "Please upload a video file or provide a valid video link."

    with tempfile.TemporaryDirectory(prefix="accent_") as workdir:
        if has_upload:
            video_path = _resolve_upload(video_file, workdir)
        else:
            video_path = os.path.join(workdir, "input_video.mp4")
            if not _download_video(link, video_path):
                return "Failed to download the video. Please check your link."

        audio_path = os.path.join(workdir, "audio.wav")
        if not _extract_audio(video_path, audio_path):
            return "Audio extraction failed. Please use a different video."

        # Load the waveform into memory before the temporary directory is removed.
        signal, _sample_rate = torchaudio.load(audio_path)

    # Keep only the first channel if the file is somehow multi-channel.
    if signal.shape[0] > 1:
        signal = signal[0].unsqueeze(0)

    prediction = _get_accent_model().classify_batch(signal)
    pred_label = prediction[3][0]
    pred_scores = prediction[1][0]
    # NOTE(review): the score is reported as a percentage as-is; confirm that
    # the classifier returns a probability-scaled value here.
    confidence = float(pred_scores.max()) * 100
    explanation = (
        f"Predicted Accent: {pred_label} ({confidence:.1f}%)\n"
        f"The model is {confidence:.0f}% confident this is a {pred_label} English accent."
    )
    return explanation

# Input widgets, listed in the positional order of accent_detect's parameters
# (video_link first, then video_file).
_link_box = gr.Textbox(
    placeholder="https://youtube.com/yourvideo",
    label="YouTube or direct MP4 link (optional)",
)
_upload_box = gr.File(label="Or upload a video file (MP4, WEBM, etc.)")

# Gradio UI wiring: two optional inputs in, one plain-text result out.
demo = gr.Interface(
    title="🗣️ English Accent Classifier (Gradio Demo)",
    description="Paste a YouTube/direct MP4 link or upload a video file with English speech. The tool predicts the English accent and confidence.",
    fn=accent_detect,
    inputs=[_link_box, _upload_box],
    outputs="text",
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()