EdgarDataScientist's picture
Update app.py
f6f6edc verified
raw
history blame
2.93 kB
import gradio as gr
from moviepy.editor import VideoFileClip
from speechbrain.pretrained import EncoderClassifier
import torchaudio
from pytube import YouTube
import os
# Identifier of the pretrained accent-ID model; passed as ``source=`` to
# ``EncoderClassifier.from_hparams`` in ``classify_accent``.
CLASSIFIER = "Jzuluaga/accent-id-commonaccent_xlsr-en-english"
def download_video(url):
    """Download the video at *url* to a local file and return its path.

    YouTube links (``youtube.com`` / ``youtu.be``) are fetched with pytube,
    using the first progressive MP4 stream.  Any other URL is treated as a
    direct file link and streamed into a uniquely named temporary ``.mp4``
    file so that concurrent requests cannot overwrite each other.

    Args:
        url: Public video URL.

    Returns:
        Path of the downloaded file.  The caller is responsible for
        deleting it.

    Raises:
        RuntimeError: If the URL is empty or the download fails for any
            reason.  The underlying exception is attached as ``__cause__``.
    """
    try:
        if not url or not url.strip():
            raise ValueError("No URL provided.")
        url = url.strip()

        if "youtube.com" in url or "youtu.be" in url:
            yt = YouTube(url)
            stream = yt.streams.filter(progressive=True, file_extension="mp4").first()
            if not stream:
                raise ValueError("No suitable video stream found.")
            return stream.download()

        # Direct link: stream the body to disk in chunks.
        import tempfile

        import requests

        fd, local_filename = tempfile.mkstemp(suffix=".mp4")
        try:
            # Opening the descriptor first guarantees it is closed even if
            # the HTTP request itself fails.
            with os.fdopen(fd, "wb") as f:
                # A timeout keeps a stalled server from hanging the request.
                with requests.get(url, stream=True, timeout=30) as r:
                    r.raise_for_status()
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:  # skip keep-alive chunks
                            f.write(chunk)
        except Exception:
            # Do not leave a partial download behind; the caller never
            # learns this path, so nobody else would remove it.
            if os.path.exists(local_filename):
                os.remove(local_filename)
            raise
        return local_filename
    except Exception as e:
        raise RuntimeError(f"Failed to download video: {e}") from e
def extract_audio(video_path):
    """Write the audio track of *video_path* to a WAV file and return its path.

    Args:
        video_path: Path of a local video file readable by moviepy.

    Returns:
        Path of a newly created, uniquely named temporary ``.wav`` file.
        The caller is responsible for deleting it.

    Raises:
        ValueError: If the video has no audio track.
    """
    import tempfile

    clip = VideoFileClip(video_path)
    try:
        # moviepy sets ``audio`` to None for silent videos; fail with a clear
        # message instead of an AttributeError on None.
        if clip.audio is None:
            raise ValueError("The video has no audio track.")

        # Unique name so concurrent requests do not clobber one another.
        fd, audio_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        try:
            clip.audio.write_audiofile(audio_path, logger=None)
        except Exception:
            # The caller never sees this path on failure, so clean up here.
            if os.path.exists(audio_path):
                os.remove(audio_path)
            raise
    finally:
        # Always release the reader, even when extraction fails.
        clip.close()
    return audio_path
# Sample rate the XLSR-based model was trained on; ``classify_batch`` does not
# resample for us, so the input must already match it.
_TARGET_SAMPLE_RATE = 16000

# Lazily created classifier, shared by all calls to avoid re-loading the model.
_accent_classifier = None


def _get_accent_classifier():
    """Load the pretrained accent classifier once and reuse it afterwards."""
    global _accent_classifier
    if _accent_classifier is None:
        _accent_classifier = EncoderClassifier.from_hparams(
            source=CLASSIFIER,
            savedir="pretrained_models/accent_classifier",
            run_opts={"device": "cpu"},  # or "cuda" if GPU available
        )
    return _accent_classifier


def classify_accent(audio_path):
    """Predict the English accent spoken in the audio file at *audio_path*.

    The audio is mixed down to mono and resampled to 16 kHz before being
    passed to the classifier as a single-utterance batch.

    Args:
        audio_path: Path of an audio file readable by ``torchaudio.load``.

    Returns:
        ``(predicted_accent, confidence)`` where ``confidence`` is a string
        such as ``"87.31%"``.

    Raises:
        ValueError: If the audio file contains no samples.
    """
    waveform, sample_rate = torchaudio.load(audio_path)
    if waveform.numel() == 0:
        raise ValueError("The audio track is empty.")

    # torchaudio returns (channels, samples).  Without a mixdown a stereo file
    # would be interpreted as a batch of two separate utterances.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sample_rate != _TARGET_SAMPLE_RATE:
        waveform = torchaudio.functional.resample(
            waveform, sample_rate, _TARGET_SAMPLE_RATE
        )

    prediction = _get_accent_classifier().classify_batch(waveform)
    # SpeechBrain's classify_batch returns (out_prob, score, index, text_lab).
    predicted_accent = prediction[3][0]
    # NOTE(review): ``exp()`` assumes the score is a log-probability, as the
    # original code did - confirm against the model's output layer.
    confidence = prediction[1].exp().max().item() * 100
    return predicted_accent, f"{confidence:.2f}%"
def process_video(url):
    """Run the full pipeline for one URL: download, extract audio, classify.

    This is the Gradio callback, so it never raises: every failure is
    reported as text in the first output field.

    Args:
        url: Public video URL entered by the user.

    Returns:
        ``(accent, confidence)`` on success, or ``("Error: <reason>", "")``
        on failure.
    """
    # Reject blank input up front with a message the user can act on.
    if not url or not url.strip():
        return "Error: Please enter a video URL.", ""

    video_path = None
    audio_path = None
    try:
        video_path = download_video(url.strip())
        audio_path = extract_audio(video_path)
        accent, confidence = classify_accent(audio_path)
        return accent, confidence
    except Exception as e:
        return f"Error: {e}", ""
    finally:
        for f in (video_path, audio_path):
            if not f:
                continue
            try:
                os.remove(f)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not replace
                # the result (or the error message) with an exception.
                pass
# Gradio UI: one URL textbox in, two textboxes (accent label, confidence) out.
iface = gr.Interface(
    title="English Accent Classifier",
    description="Paste a public video URL to detect the English accent and confidence score.",
    fn=process_video,
    inputs=gr.Textbox(label="Enter Public Video URL (YouTube, Loom, direct MP4)"),
    outputs=[
        gr.Textbox(label="Detected Accent"),
        gr.Textbox(label="Confidence Score"),
    ],
)

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()