|
import os |
|
import subprocess |
|
import sys |
|
|
|
|
|
try: |
|
import yt_dlp as youtube_dl |
|
except ImportError: |
|
subprocess.check_call([sys.executable, "-m", "pip", "install", "yt-dlp"]) |
|
import yt_dlp as youtube_dl |
|
|
|
import gradio as gr |
|
from transformers import pipeline |
|
|
|
def download_video(video_url, filename="downloaded_video.mp4"):
    """Download the media behind ``video_url`` with yt-dlp.

    Args:
        video_url: URL passed to ``YoutubeDL.download``.
        filename: Value used for the ``outtmpl`` option, i.e. where the
            download is written. Defaults to ``downloaded_video.mp4``.

    Returns:
        ``filename``, exactly as it was passed in.
    """
    browser_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    )
    # NOTE(review): yt-dlp documents custom request headers under
    # `http_headers`; confirm that the `user_agent` key is actually honoured.
    options = dict(
        format="bestaudio/best",  # audio-only stream if offered, else best overall
        outtmpl=filename,
        noplaylist=True,  # a playlist URL yields only the single video
        quiet=True,
        user_agent=browser_agent,
    )
    with youtube_dl.YoutubeDL(options) as downloader:
        downloader.download([video_url])
    return filename
|
|
|
def extract_audio(video_filename, audio_filename="extracted_audio.wav"):
    """Run ffmpeg to turn ``video_filename`` into a 16 kHz mono WAV file.

    Args:
        video_filename: Path of the input media file handed to ``ffmpeg -i``.
        audio_filename: Path of the WAV file to write. Defaults to
            ``extracted_audio.wav``.

    Returns:
        ``audio_filename``, exactly as it was passed in.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status
            (``check=True``).
    """
    ffmpeg_args = ["ffmpeg", "-y"]  # -y: overwrite the output without asking
    ffmpeg_args += ["-i", video_filename]
    ffmpeg_args += ["-vn"]  # drop any video stream
    ffmpeg_args += ["-acodec", "pcm_s16le"]  # 16-bit little-endian PCM
    ffmpeg_args += ["-ar", "16000"]  # 16 kHz sample rate
    ffmpeg_args += ["-ac", "1"]  # single (mono) channel
    ffmpeg_args.append(audio_filename)

    subprocess.run(ffmpeg_args, check=True)
    return audio_filename
|
|
|
# Pipelines that have already been built, keyed by model name, so each model
# is loaded once and then reused by later calls.
_CLASSIFIER_CACHE = {}


def _get_classifier(model_name):
    """Return the audio-classification pipeline for ``model_name``, building it on first use."""
    classifier = _CLASSIFIER_CACHE.get(model_name)
    if classifier is None:
        classifier = pipeline("audio-classification", model=model_name)
        _CLASSIFIER_CACHE[model_name] = classifier
    return classifier


def classify_accent(audio_file, model_name="superb/wav2vec2-base-superb-sid"):
    """Classify ``audio_file`` and format the first prediction as text.

    The pipeline for ``model_name`` is built once and cached; previously it
    was rebuilt on every call.

    Args:
        audio_file: Audio input passed straight to the pipeline (the caller
            in this file passes a WAV file path).
        model_name: Model identifier given to ``pipeline``. Defaults to
            ``superb/wav2vec2-base-superb-sid``.

    Returns:
        A two-line string with the label and the confidence as a percentage
        of the first prediction, or ``"No result."`` when the pipeline
        returns nothing.
    """
    results = _get_classifier(model_name)(audio_file)
    if not results:
        return "No result."

    top = results[0]
    return f"Speaker ID (as accent proxy): {top['label']}\nConfidence: {top['score'] * 100:.2f}%"
|
|
|
def accent_classifier(video_url):
    """Download a video, extract its audio, and classify it.

    All intermediate files live in a per-call temporary directory, so
    overlapping requests cannot overwrite or delete each other's files, and
    everything written there (including leftovers from a failed download) is
    removed when the call finishes.

    Args:
        video_url: URL of the video to analyse. ``None``, empty, or
            whitespace-only values are rejected without any download.

    Returns:
        The classification text from ``classify_accent`` on success, or a
        string starting with ``"Error occurred: "`` describing the failure.
        Exceptions are reported in the returned text rather than raised,
        because this function is the UI callback.
    """
    import tempfile  # local import: only this function needs it

    url = (video_url or "").strip()
    if not url:
        return "Error occurred: no video URL provided."

    try:
        with tempfile.TemporaryDirectory() as workdir:
            video_file = download_video(
                url, os.path.join(workdir, "downloaded_video.mp4")
            )
            audio_file = extract_audio(
                video_file, os.path.join(workdir, "extracted_audio.wav")
            )
            return classify_accent(audio_file)
    except Exception as e:
        # Top-level UI boundary: surface the failure as text for the user.
        return f"Error occurred: {e}"
|
|
|
# Gradio front end: a single URL text box in, plain text out, backed by
# accent_classifier.
iface = gr.Interface(
    title="Accent Classifier",
    description="Download a video, extract the audio, and classify the speaker (as an accent proxy) using a Hugging Face model.",
    fn=accent_classifier,
    inputs=gr.Textbox(
        placeholder="Paste a public YouTube or Vimeo video link here",
        label="Video URL",
    ),
    outputs="text",
)

# Launch the web UI only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()
|
|