EdgarDataScientist committed
Commit bd89cf2 · verified · 1 parent: 9471255

Update app.py

Files changed (1): app.py +44 -58
app.py CHANGED
@@ -1,73 +1,59 @@
 import gradio as gr
+import tempfile
+import requests
 from moviepy.editor import VideoFileClip
-from speechbrain.pretrained import EncoderClassifier
 import torchaudio
-from pytube import YouTube
-import os
+import torch
+from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, pipeline
+from transformers import WhisperProcessor, WhisperForConditionalGeneration

-CLASSIFIER = "Jzuluaga/accent-id-commonaccent_xlsr-en-english"
+# Load Models
+accent_model = Wav2Vec2ForSequenceClassification.from_pretrained("sreyan88/wav2vec2-large-xlsr-53-english-accent")
+feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("sreyan88/wav2vec2-large-xlsr-53-english-accent")

-def download_video(url):
-    """Handles YouTube and direct video links"""
-    if "youtube.com" in url:
-        yt = YouTube(url)
-        stream = yt.streams.filter(progressive=True, file_extension='mp4').first()
-        video_path = stream.download()
-        return video_path
-    else:  # Direct download (assumes URL is direct mp4 link)
-        # Download file locally
-        import requests
-        local_filename = "temp_video.mp4"
-        with requests.get(url, stream=True) as r:
-            r.raise_for_status()
-            with open(local_filename, 'wb') as f:
-                for chunk in r.iter_content(chunk_size=8192):
-                    f.write(chunk)
-        return local_filename
+whisper = pipeline("automatic-speech-recognition", model="openai/whisper-small")

-def extract_audio(video_path):
+# Helpers
+def download_and_extract_audio(url):
+    r = requests.get(url, stream=True)
+    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as video_file:
+        for chunk in r.iter_content(chunk_size=1024):
+            if chunk:
+                video_file.write(chunk)
+        video_path = video_file.name
     clip = VideoFileClip(video_path)
-    audio_path = "temp_audio.wav"
-    clip.audio.write_audiofile(audio_path, logger=None)
-    clip.close()
+    audio_path = video_path.replace(".mp4", ".wav")
+    clip.audio.write_audiofile(audio_path)
     return audio_path

 def classify_accent(audio_path):
-    classifier = EncoderClassifier.from_hparams(
-        source=CLASSIFIER,
-        savedir="pretrained_models/accent_classifier",
-        run_opts={"device":"cpu"}  # or "cuda" if GPU available
-    )
+    # Load and process audio
     waveform, sample_rate = torchaudio.load(audio_path)
-    prediction = classifier.classify_batch(waveform)
-    # prediction format: (scores, probabilities, embeddings, predicted_labels)
-    predicted_accent = prediction[3][0]
-    confidence = prediction[1].exp().max().item() * 100
-    return predicted_accent, f"{confidence:.2f}%"
+    inputs = feature_extractor(waveform[0], sampling_rate=sample_rate, return_tensors="pt", padding=True)
+    with torch.no_grad():
+        logits = accent_model(**inputs).logits
+    predicted_class = torch.argmax(logits, dim=1).item()
+    confidence = torch.softmax(logits, dim=1)[0, predicted_class].item()
+    labels = ["american", "australian", "british", "indian", "canadian"]  # match model classes
+    return labels[predicted_class], round(confidence * 100, 2)

-def process_video(url):
-    try:
-        video_path = download_video(url)
-        audio_path = extract_audio(video_path)
-        accent, confidence = classify_accent(audio_path)
-    finally:
-        # Cleanup temp files if they exist
-        for f in [video_path, audio_path]:
-            if os.path.exists(f):
-                os.remove(f)
-    return accent, confidence
+def full_pipeline(url):
+    audio_path = download_and_extract_audio(url)
+    accent, confidence = classify_accent(audio_path)
+    transcription = whisper(audio_path)["text"]
+    summary = f"The speaker’s accent is most likely **{accent.capitalize()}** with a confidence score of **{confidence}%**."
+    return transcription, accent.capitalize(), confidence, summary

-# Gradio interface
-iface = gr.Interface(
-    fn=process_video,
-    inputs=gr.Textbox(label="Enter Public Video URL (YouTube, Loom, direct MP4)"),
+# Gradio Interface
+gr.Interface(
+    fn=full_pipeline,
+    inputs=gr.Textbox(label="Public Video URL (MP4 or Loom)"),
     outputs=[
-        gr.Textbox(label="Detected Accent"),
-        gr.Textbox(label="Confidence Score")
+        gr.Textbox(label="Transcription"),
+        gr.Textbox(label="Accent"),
+        gr.Number(label="Confidence Score (%)"),
+        gr.Textbox(label="Summary"),
     ],
-    title="English Accent Classifier",
-    description="Paste a public video URL to detect the English accent and confidence score."
-)
-
-if __name__ == "__main__":
-    iface.launch()
+    title="Accent Classifier for English Speakers",
+    description="Paste a video URL to detect English accent and transcription. Built with Wav2Vec2 and Whisper on Hugging Face 🤗."
+).launch()
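
A note on the new `classify_accent`: `Wav2Vec2FeatureExtractor` raises a `ValueError` when it is handed a `sampling_rate` different from the 16 kHz the XLSR checkpoint was trained at, and MoviePy writes the extracted WAV at the source video's native rate (typically 44.1 or 48 kHz), so most real uploads would fail at feature extraction. A minimal guard is to downmix and resample before calling the extractor; the sketch below assumes this (the helper name is ours, not part of the commit):

```python
import torch
import torchaudio

TARGET_SR = 16000  # wav2vec 2.0 / XLSR checkpoints are trained on 16 kHz audio

def load_16k_mono(audio_path: str) -> torch.Tensor:
    """Load a WAV file, downmix to mono, and resample to 16 kHz."""
    waveform, sample_rate = torchaudio.load(audio_path)  # (channels, time)
    waveform = waveform.mean(dim=0)                      # downmix -> (time,)
    if sample_rate != TARGET_SR:
        waveform = torchaudio.functional.resample(waveform, sample_rate, TARGET_SR)
    return waveform
```

`classify_accent` would then pass `sampling_rate=TARGET_SR` to the feature extractor instead of whatever rate `torchaudio.load` reports.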
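The hardcoded `labels` list is also fragile: nothing ties its order to the checkpoint's output head, and a mismatch silently mislabels every prediction. When the checkpoint ships a real `id2label` mapping in its config, reading the name from there is safer; a sketch, assuming the mapping is populated rather than left at the default placeholders:

```python
# Map the winning logit index to its class name via the model config,
# falling back to the hand-written list when the config only carries
# placeholder names like "LABEL_0".
name = accent_model.config.id2label.get(predicted_class, "")
if not name or name.startswith("LABEL_"):
    name = labels[predicted_class]  # fall back to the manual list
```

Either way, the list should be verified against the model card before trusting the output.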
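Temp-file hygiene regressed slightly in this commit. The old `process_video` at least attempted cleanup (though its `finally` block had its own bug: if `download_video` raised, `video_path` was never bound and the cleanup itself crashed with a `NameError`), while the new pipeline leaks both the `NamedTemporaryFile(delete=False)` MP4 and the derived WAV on every request, and never calls `clip.close()`. A guarded wrapper in the spirit of the old version (the wrapper name is ours):

```python
import os

def full_pipeline_tidy(url):
    """Run the pipeline, then delete the temporary MP4/WAV pair."""
    audio_path = None
    try:
        audio_path = download_and_extract_audio(url)
        accent, confidence = classify_accent(audio_path)
        transcription = whisper(audio_path)["text"]
        summary = (
            f"The speaker’s accent is most likely **{accent.capitalize()}** "
            f"with a confidence score of **{confidence}%**."
        )
        return transcription, accent.capitalize(), confidence, summary
    finally:
        if audio_path is not None:
            # The WAV path was derived from the MP4 path, so both are recoverable.
            for path in (audio_path, audio_path.replace(".wav", ".mp4")):
                if os.path.exists(path):
                    os.remove(path)
```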
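Finally, the Whisper pipeline operates on 30-second windows, and depending on the installed transformers version, audio past the first window may be truncated unless chunked inference is enabled; the new `download_and_extract_audio` also never calls `r.raise_for_status()`, and the `WhisperProcessor`/`WhisperForConditionalGeneration` import is unused once the pipeline API is chosen. If longer clips matter, passing `chunk_length_s` when building the pipeline is the usual fix:

```python
# Chunked long-form transcription: the pipeline splits the audio into
# overlapping 30 s windows and stitches the decoded text back together.
whisper = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    chunk_length_s=30,
)
```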