# EdgarDataScientist's picture
# Update app.py
# 5a4c42c verified
import gradio as gr
import os
import tempfile
import requests
from moviepy.editor import VideoFileClip
from speechbrain.pretrained import EncoderClassifier
import torchaudio
import torch
# --- Real Accent Analyzer using SpeechBrain embeddings ---
class RealAccentAnalyzer:
    """Rank a recording against a fixed set of English accent labels.

    A SpeechBrain speaker-embedding model is used as a proxy for accent: the
    recording is embedded and compared, by cosine similarity, with one
    reference embedding per accent.

    NOTE: the reference embeddings are random placeholders (see
    ``_load_reference_embeddings``), so the reported accent carries no real
    meaning until they are replaced with embeddings of actual accent samples.
    """

    # Accent labels for which a reference embedding is generated.
    ACCENTS = ("American", "British", "Indian", "Australian", "Canadian")
    # Size of each placeholder reference embedding.
    EMBEDDING_DIM = 192
    # Sample rate (Hz) the audio is resampled to before embedding.
    TARGET_SAMPLE_RATE = 16000

    def __init__(self):
        # Pre-trained speaker embedding model (used as a proxy for accent)
        self.classifier = EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb"
        )
        self.reference_embeddings = self._load_reference_embeddings()

    def _load_reference_embeddings(self):
        """Return a dict mapping each accent label to a reference embedding.

        The embeddings are random ``(1, EMBEDDING_DIM)`` tensors drawn with
        ``torch.randn``; they are stand-ins, not derived from real audio.
        """
        return {
            accent: torch.randn(1, self.EMBEDDING_DIM) for accent in self.ACCENTS
        }

    def _extract_embedding(self, audio_path):
        """Load ``audio_path`` and return its embedding as a detached tensor.

        Multi-channel audio is averaged down to one channel and the signal is
        resampled to ``TARGET_SAMPLE_RATE`` before it is passed to the model.

        Raises:
            ValueError: if the audio file contains no samples.
        """
        signal, fs = torchaudio.load(audio_path)
        if signal.numel() == 0:
            raise ValueError("The audio track is empty.")
        if signal.shape[0] > 1:
            # Down-mix to mono, keeping the channel dimension.
            signal = torch.mean(signal, dim=0, keepdim=True)
        if fs != self.TARGET_SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(
                orig_freq=fs, new_freq=self.TARGET_SAMPLE_RATE
            )
            signal = resampler(signal)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            embedding = self.classifier.encode_batch(signal)
        return embedding.squeeze().detach()

    def _compare_embeddings(self, emb):
        """Return a dict mapping accent label to cosine similarity with ``emb``.

        Both vectors are flattened to 1-D before the comparison. Comparing a
        1-D embedding with a ``(1, D)`` reference along ``dim=0`` broadcasts
        to ``(1, D)`` and yields D values instead of one, which makes
        ``.item()`` raise.
        """
        query = emb.flatten()
        return {
            accent: torch.nn.functional.cosine_similarity(
                query, ref_emb.flatten(), dim=0
            ).item()
            for accent, ref_emb in self.reference_embeddings.items()
        }

    def analyze(self, audio_path):
        """Embed ``audio_path`` and report the most similar reference accent.

        Returns:
            dict with keys ``"accent"`` (best-matching label), ``"score"``
            (its cosine similarity), ``"explanation"`` (human-readable
            summary) and ``"all_scores"`` (similarity for every accent).
        """
        emb = self._extract_embedding(audio_path)
        similarities = self._compare_embeddings(emb)
        top_accent = max(similarities, key=similarities.get)
        confidence = similarities[top_accent]
        explanation = f"The speaker most likely has a {top_accent} English accent with similarity score {confidence:.2f}."
        return {
            "accent": top_accent,
            "score": confidence,
            "explanation": explanation,
            "all_scores": similarities,
        }
# --- Download and Extract Audio ---
def download_and_extract_audio(url):
    """Download the video at ``url`` and extract its audio track to a WAV file.

    URLs containing ``youtube.com`` or ``youtu.be`` are fetched with
    ``pytubefix``; any other URL is streamed to disk with ``requests``.

    Args:
        url: Public video URL (YouTube or a direct link to an MP4 file).

    Returns:
        Path to ``audio.wav`` inside a freshly created temporary directory.
        The caller owns that file (and its directory) and must delete it.

    Raises:
        ValueError: if ``url`` is empty or blank.
        RuntimeError: if no suitable YouTube stream exists or the video has
            no audio track.
        requests.RequestException: if the direct download fails.
    """
    import shutil  # function-scope import: only used for failure cleanup

    if not url or not url.strip():
        raise ValueError("No video URL provided.")
    url = url.strip()

    temp_dir = tempfile.mkdtemp()
    video_path = os.path.join(temp_dir, "video.mp4")
    audio_path = os.path.join(temp_dir, "audio.wav")
    try:
        if "youtube.com" in url or "youtu.be" in url:
            from pytubefix import YouTube

            yt = YouTube(url)
            stream = yt.streams.filter(progressive=True, file_extension="mp4").first()
            if not stream:
                raise RuntimeError("No suitable video stream found.")
            stream.download(output_path=temp_dir, filename="video.mp4")
        else:
            # A timeout keeps a stalled server from hanging the request
            # forever; the context manager releases the connection.
            with requests.get(url, stream=True, timeout=30) as r:
                r.raise_for_status()
                with open(video_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)

        clip = VideoFileClip(video_path)
        try:
            # A video without an audio track has no audio clip to write.
            if clip.audio is None:
                raise RuntimeError("The video has no audio track.")
            clip.audio.write_audiofile(audio_path, logger=None)
        finally:
            clip.close()
    except Exception:
        # Nothing usable was produced: do not leave the temp directory behind.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise

    # Only the audio is returned, so the downloaded video is no longer needed.
    # Best effort: a failure to delete it must not fail the extraction.
    try:
        os.remove(video_path)
    except OSError:
        pass
    return audio_path
# --- Gradio Interface ---
# Shared analyzer instance, created on first use (see _get_analyzer).
_analyzer = None


def _get_analyzer():
    """Return the shared RealAccentAnalyzer, creating it on first use.

    Building the analyzer loads the embedding model and draws new reference
    embeddings, so it is done once rather than on every request.
    """
    global _analyzer
    if _analyzer is None:
        _analyzer = RealAccentAnalyzer()
    return _analyzer


def analyze_from_url(url):
    """Gradio callback: download a video, analyze its audio, report the accent.

    Args:
        url: Public video URL entered by the user.

    Returns:
        A 3-tuple of strings ``(accent, confidence, explanation)``. On any
        failure the tuple is ``("Error", "0%", <error message>)`` so that the
        interface always receives three values.
    """
    import shutil  # function-scope import: only used for cleanup

    audio_path = None
    try:
        audio_path = download_and_extract_audio(url)
        results = _get_analyzer().analyze(audio_path)
        return (
            results["accent"],
            f"{results['score']*100:.1f}%",
            results["explanation"]
        )
    except Exception as e:
        # UI boundary: report the failure in the output fields.
        return ("Error", "0%", f"Error processing video/audio: {e}")
    finally:
        # Clean up on success and on failure alike.
        if audio_path is not None:
            work_dir = os.path.dirname(os.path.abspath(audio_path))
            temp_root = os.path.abspath(tempfile.gettempdir())
            if os.path.dirname(work_dir) == temp_root:
                # The audio lives in its own directory directly under the
                # system temp dir: remove the whole directory.
                shutil.rmtree(work_dir, ignore_errors=True)
            else:
                # Unexpected location: delete only the audio file itself.
                try:
                    os.remove(audio_path)
                except OSError:
                    pass
# Single text input: the URL of the video to analyze.
_url_input = gr.Textbox(label="Enter Public Video URL (YouTube or direct MP4)")

# One text field per element of the tuple returned by analyze_from_url.
_result_outputs = [
    gr.Textbox(label=caption)
    for caption in ("Detected Accent", "Confidence Score", "Explanation")
]

# Web UI wiring the URL input to analyze_from_url and its three outputs.
iface = gr.Interface(
    fn=analyze_from_url,
    inputs=_url_input,
    outputs=_result_outputs,
    title="Accent Analyzer (Real Embeddings with SpeechBrain)",
    description="Paste a public video URL. This app uses SpeechBrain speaker embeddings to infer accent similarity. It's experimental!",
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()