EdgarDataScientist committed on
Commit
5cb7e51
·
verified ·
1 Parent(s): b159229

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -44
app.py CHANGED
@@ -1,59 +1,73 @@
1
  import gradio as gr
2
- import tempfile
3
- import requests
4
  from moviepy.editor import VideoFileClip
 
5
  import torchaudio
6
- import torch
7
- from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, pipeline
8
- from transformers import WhisperProcessor, WhisperForConditionalGeneration
9
 
10
# Load Models
# The accent classifier and its feature extractor are loaded from the same
# checkpoint name.
accent_model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "sreyan88/wav2vec2-large-xlsr-53-english-accent"
)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    "sreyan88/wav2vec2-large-xlsr-53-english-accent"
)

# Speech-recognition pipeline; full_pipeline reads its "text" output.
whisper = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
- # Helpers
17
def download_and_extract_audio(url):
    """Download the video at ``url`` and write its audio track to a WAV file.

    The video is streamed into a temporary ``.mp4`` file, the audio track is
    written next to it with a ``.wav`` suffix, and the temporary video is
    removed again before returning.

    Args:
        url: Public URL of the video to download.

    Returns:
        Path of the extracted ``.wav`` file. The caller owns this file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the downloaded video has no audio track.
    """
    import os  # local import: the module header does not import os

    video_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as video_file:
            video_path = video_file.name
            # The context manager releases the connection; the timeout keeps a
            # stalled server from hanging the request forever.
            with requests.get(url, stream=True, timeout=30) as response:
                # Fail early instead of saving an HTML error page as "video".
                response.raise_for_status()
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        video_file.write(chunk)

        # Swap only the trailing suffix; str.replace would also touch an
        # earlier ".mp4" elsewhere in the path.
        audio_path = video_path[: -len(".mp4")] + ".wav"

        clip = VideoFileClip(video_path)
        try:
            if clip.audio is None:
                raise ValueError("The downloaded video has no audio track.")
            clip.audio.write_audiofile(audio_path)
        finally:
            # Always release the reader held by the clip.
            clip.close()
    finally:
        # The temporary video is no longer needed once the audio is extracted
        # (or once anything above has failed).
        if video_path is not None and os.path.exists(video_path):
            os.remove(video_path)
    return audio_path
28
 
def classify_accent(audio_path):
    """Predict the accent of the speaker in an audio file.

    Args:
        audio_path: Path of an audio file readable by ``torchaudio.load``.

    Returns:
        A ``(label, confidence)`` tuple: the predicted accent name and the
        softmax probability of that class as a percentage rounded to two
        decimals.
    """
    # Load and process audio (only the first channel is used).
    waveform, sample_rate = torchaudio.load(audio_path)
    channel = waveform[0]

    # Feed the model audio at the rate the feature extractor is configured
    # for instead of whatever rate the file happens to have.
    # NOTE(review): the extractor is expected to reject mismatching rates --
    # confirm against the transformers documentation.
    expected_rate = feature_extractor.sampling_rate
    if sample_rate != expected_rate:
        channel = torchaudio.functional.resample(channel, sample_rate, expected_rate)

    inputs = feature_extractor(
        channel,
        sampling_rate=expected_rate,
        return_tensors="pt",
        padding=True,
    )
    with torch.no_grad():
        logits = accent_model(**inputs).logits

    predicted_class = torch.argmax(logits, dim=1).item()
    confidence = torch.softmax(logits, dim=1)[0, predicted_class].item()

    # NOTE(review): label order is hard-coded and must match the model's class
    # indices -- verify against accent_model.config.id2label.
    labels = ["american", "australian", "british", "indian", "canadian"]  # match model classes
    return labels[predicted_class], round(confidence * 100, 2)
39
 
40
def full_pipeline(url):
    """Run the whole analysis for one video URL.

    Downloads the video and extracts its audio, classifies the accent,
    transcribes the speech, and builds a one-sentence summary.

    Args:
        url: Public video URL, forwarded to ``download_and_extract_audio``.

    Returns:
        A 4-tuple ``(transcription, accent, confidence, summary)`` where
        ``accent`` is capitalized and ``confidence`` is the value returned by
        ``classify_accent``.
    """
    import os  # local import: the module header does not import os

    audio_path = download_and_extract_audio(url)
    try:
        accent, confidence = classify_accent(audio_path)
        transcription = whisper(audio_path)["text"]
    finally:
        # The extracted audio is a throw-away artifact; remove it whether or
        # not the models succeeded so repeated requests do not fill the disk.
        if os.path.exists(audio_path):
            os.remove(audio_path)

    accent_name = accent.capitalize()
    summary = f"The speaker’s accent is most likely **{accent_name}** with a confidence score of **{confidence}%**."
    return transcription, accent_name, confidence, summary
 
 
 
 
 
46
 
47
# Gradio Interface
# One text input for the URL; the four outputs follow the order of the tuple
# returned by full_pipeline.
url_input = gr.Textbox(label="Public Video URL (MP4 or Loom)")
result_fields = [
    gr.Textbox(label="Transcription"),
    gr.Textbox(label="Accent"),
    gr.Number(label="Confidence Score (%)"),
    gr.Textbox(label="Summary"),
]

demo = gr.Interface(
    fn=full_pipeline,
    inputs=url_input,
    outputs=result_fields,
    title="Accent Classifier for English Speakers",
    description="Paste a video URL to detect English accent and transcription. Built with Wav2Vec2 and Whisper on Hugging Face 🤗.",
)
demo.launch()
 
 
 
 
1
  import gradio as gr
 
 
2
  from moviepy.editor import VideoFileClip
3
+ from speechbrain.pretrained import EncoderClassifier
4
  import torchaudio
5
+ from pytube import YouTube
6
+ import os
 
7
 
8
# Identifier of the pretrained accent model (presumably a Hugging Face Hub
# repo id -- confirm); passed as ``source`` to EncoderClassifier.from_hparams
# in classify_accent.
CLASSIFIER = "Jzuluaga/accent-id-commonaccent_xlsr-en-english"
 
 
9
 
10
def download_video(url):
    """Download the video behind ``url`` and return the local file path.

    YouTube links (``youtube.com`` or the ``youtu.be`` short form) are fetched
    with pytube. Every other URL is streamed with ``requests`` and is assumed
    to point directly at an MP4 file.

    Args:
        url: Public video URL.

    Returns:
        Path of the downloaded video file. The caller is responsible for
        deleting it.

    Raises:
        ValueError: If YouTube offers no progressive MP4 stream for the video.
        requests.HTTPError: If a direct download answers with an error status.
    """
    if any(marker in url for marker in ("youtube.com", "youtu.be")):
        stream = (
            YouTube(url)
            .streams.filter(progressive=True, file_extension="mp4")
            .first()
        )
        # first() yields None when the filter matches nothing; fail with a
        # clear message instead of an AttributeError on None.
        if stream is None:
            raise ValueError("No progressive MP4 stream is available for this YouTube URL.")
        return stream.download()

    # Direct download (assumes URL is a direct MP4 link).
    import tempfile

    import requests

    # A unique temp file keeps concurrent requests from overwriting each
    # other's download, which a fixed file name would allow.
    handle, local_filename = tempfile.mkstemp(suffix=".mp4")
    try:
        with os.fdopen(handle, "wb") as sink, requests.get(
            url, stream=True, timeout=30
        ) as response:
            response.raise_for_status()
            for chunk in response.iter_content(chunk_size=8192):
                sink.write(chunk)
    except Exception:
        # Do not leave a partial download behind; the caller never learns
        # this path when an exception propagates.
        os.remove(local_filename)
        raise
    return local_filename
27
 
28
def extract_audio(video_path):
    """Write the audio track of a video file to a WAV file.

    Args:
        video_path: Path of a video file readable by moviepy.

    Returns:
        Path of the newly written ``.wav`` file. The caller is responsible
        for deleting it.

    Raises:
        ValueError: If the video has no audio track.
    """
    import tempfile

    clip = VideoFileClip(video_path)
    try:
        if clip.audio is None:
            raise ValueError("The video has no audio track to analyse.")

        # A unique path per call avoids clashes between concurrent requests.
        handle, audio_path = tempfile.mkstemp(suffix=".wav")
        os.close(handle)  # write_audiofile opens the path by name itself
        try:
            clip.audio.write_audiofile(audio_path, logger=None)
        except Exception:
            # Do not leave an empty or partial WAV behind on failure.
            os.remove(audio_path)
            raise
    finally:
        # Release the clip's reader even when extraction fails.
        clip.close()
    return audio_path
34
 
# Sample rate the audio is converted to before classification.
# NOTE(review): 16 kHz is the usual rate for XLSR/wav2vec2 models -- confirm
# against the model card of CLASSIFIER.
_TARGET_SAMPLE_RATE = 16000

# Lazily created classifier, shared by all calls.
_ACCENT_CLASSIFIER = None


def _get_accent_classifier():
    """Return the shared accent classifier, loading it on first use."""
    global _ACCENT_CLASSIFIER
    if _ACCENT_CLASSIFIER is None:
        _ACCENT_CLASSIFIER = EncoderClassifier.from_hparams(
            source=CLASSIFIER,
            savedir="pretrained_models/accent_classifier",
            run_opts={"device": "cpu"},  # or "cuda" if GPU available
        )
    return _ACCENT_CLASSIFIER


def classify_accent(audio_path):
    """Predict the accent of the speaker in an audio file.

    Args:
        audio_path: Path of an audio file readable by ``torchaudio.load``.

    Returns:
        A ``(label, confidence)`` tuple: the predicted accent label and the
        confidence formatted as a percentage string such as ``"87.65%"``.
    """
    # Loading the model is expensive, so reuse one instance across requests.
    classifier = _get_accent_classifier()

    waveform, sample_rate = torchaudio.load(audio_path)

    # torchaudio returns (channels, time) while classify_batch treats the
    # first axis as the batch. Downmix to one channel so a stereo file is
    # scored once, not as two separate utterances.
    mono = waveform.mean(dim=0, keepdim=True)
    if sample_rate != _TARGET_SAMPLE_RATE:
        mono = torchaudio.functional.resample(mono, sample_rate, _TARGET_SAMPLE_RATE)

    # classify_batch returns (out_prob, score, index, text_lab).
    _, score, _, labels = classifier.classify_batch(mono)
    predicted_accent = labels[0]
    # assumes the model emits log-probabilities, so exp() yields a
    # probability -- TODO confirm for this checkpoint.
    confidence = score.exp().max().item() * 100
    return predicted_accent, f"{confidence:.2f}%"
 
 
47
 
48
def process_video(url):
    """Download a video, extract its audio and classify the speaker's accent.

    Args:
        url: Public video URL, forwarded to ``download_video``.

    Returns:
        The ``(accent, confidence)`` tuple produced by ``classify_accent``.

    Raises:
        Whatever the download, extraction or classification step raises.
        Temporary files created before the failure are removed first.
    """
    # Bind both names up front: if an early step raises, the cleanup below
    # must not hit an unbound variable and mask the real error.
    video_path = None
    audio_path = None
    try:
        video_path = download_video(url)
        audio_path = extract_audio(video_path)
        accent, confidence = classify_accent(audio_path)
    finally:
        # Cleanup temp files if they exist
        for path in (video_path, audio_path):
            if path is not None and os.path.exists(path):
                os.remove(path)
    return accent, confidence
59
 
60
# Gradio interface
# One URL text box in; two text boxes out, matching the (accent, confidence)
# tuple returned by process_video.
url_input = gr.Textbox(label="Enter Public Video URL (YouTube, Loom, direct MP4)")
result_fields = [
    gr.Textbox(label="Detected Accent"),
    gr.Textbox(label="Confidence Score"),
]

iface = gr.Interface(
    fn=process_video,
    inputs=url_input,
    outputs=result_fields,
    title="English Accent Classifier",
    description="Paste a public video URL to detect the English accent and confidence score.",
)

# Start the web app only when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()