Spaces:

EdgarDataScientist
/

Accent_Classification

Runtime error

App Files Files Community

Accent_Classification / app.py

EdgarDataScientist

Update app.py

bd89cf2 verified 3 months ago

raw

history blame

2.55 kB

	import os
	import tempfile

	import gradio as gr
	import requests
	import torch
	import torchaudio
	from moviepy.editor import VideoFileClip
	from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, pipeline
	from transformers import WhisperProcessor, WhisperForConditionalGeneration

	# Load Models
	# The accent classifier and its feature extractor are loaded from the same checkpoint.
	accent_model = Wav2Vec2ForSequenceClassification.from_pretrained("sreyan88/wav2vec2-large-xlsr-53-english-accent")
	feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("sreyan88/wav2vec2-large-xlsr-53-english-accent")

	# Whisper works on 30-second windows. chunk_length_s makes the pipeline split
	# longer recordings into windows and stitch the text back together, so videos
	# longer than 30 seconds are transcribed in full.
	whisper = pipeline("automatic-speech-recognition", model="openai/whisper-small", chunk_length_s=30)

	# Helpers
	def download_and_extract_audio(url):
	    """Download the video at *url* and write its audio track to a WAV file.

	    The response body is streamed into a temporary ``.mp4`` file, the audio
	    track is written next to it with a ``.wav`` extension, and the temporary
	    video file is removed before returning.

	    Args:
	        url: Address of the video, fetched with ``requests.get``.

	    Returns:
	        Path of the extracted ``.wav`` file. The file is not deleted here;
	        the caller owns it.

	    Raises:
	        requests.RequestException: If the request fails, times out, or the
	            server answers with an HTTP error status.
	        ValueError: If the downloaded video has no audio track.
	    """
	    video_path = None
	    try:
	        # The timeout keeps a stalled server from hanging the request forever;
	        # the context manager releases the streamed connection.
	        with requests.get(url, stream=True, timeout=30) as response:
	            # Fail early instead of saving an HTTP error page as "video".
	            response.raise_for_status()
	            with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as video_file:
	                video_path = video_file.name
	                for chunk in response.iter_content(chunk_size=64 * 1024):
	                    if chunk:
	                        video_file.write(chunk)
	        # Swap only the extension; str.replace would also rewrite a ".mp4"
	        # occurring anywhere else in the path.
	        audio_path = os.path.splitext(video_path)[0] + ".wav"
	        clip = VideoFileClip(video_path)
	        try:
	            if clip.audio is None:
	                raise ValueError("The downloaded video has no audio track.")
	            clip.audio.write_audiofile(audio_path)
	        finally:
	            # Release the reader held by the clip.
	            clip.close()
	    finally:
	        # The temp file was created with delete=False, so remove it explicitly.
	        if video_path is not None:
	            os.remove(video_path)
	    return audio_path

	def classify_accent(audio_path):
	    """Predict the accent spoken in the audio file at *audio_path*.

	    Args:
	        audio_path: Path of an audio file readable by ``torchaudio.load``.

	    Returns:
	        A ``(label, confidence)`` tuple: the predicted accent name and its
	        softmax probability as a percentage rounded to two decimals.
	    """
	    waveform, sample_rate = torchaudio.load(audio_path)
	    # Only the first channel is classified.
	    speech = waveform[0]
	    # The feature extractor raises when the given sampling rate differs from
	    # the one it is configured for, so bring the audio to that rate first.
	    target_rate = feature_extractor.sampling_rate
	    if sample_rate != target_rate:
	        speech = torchaudio.functional.resample(speech, sample_rate, target_rate)
	    inputs = feature_extractor(speech.numpy(), sampling_rate=target_rate, return_tensors="pt", padding=True)
	    with torch.no_grad():
	        logits = accent_model(**inputs).logits
	    predicted_class = torch.argmax(logits, dim=1).item()
	    probabilities = torch.softmax(logits, dim=1)
	    confidence = probabilities[0, predicted_class].item()
	    # NOTE(review): hard-coded and assumed to follow the checkpoint's class
	    # order - confirm against accent_model.config.id2label.
	    labels = ["american", "australian", "british", "indian", "canadian"]
	    return labels[predicted_class], round(confidence * 100, 2)

	def full_pipeline(url):
	    """Run download, accent classification and transcription for one video URL.

	    Args:
	        url: Video URL passed on to ``download_and_extract_audio``.

	    Returns:
	        A ``(transcription, accent, confidence, summary)`` tuple: the Whisper
	        transcription text, the capitalized accent label, the confidence
	        percentage, and a one-sentence summary combining the last two.
	    """
	    audio_path = download_and_extract_audio(url)
	    try:
	        accent, confidence = classify_accent(audio_path)
	        transcription = whisper(audio_path)["text"]
	    finally:
	        # The extracted WAV is a temporary artifact that nothing else removes;
	        # delete it even when classification or transcription fails.
	        os.remove(audio_path)
	    accent_name = accent.capitalize()
	    summary = f"The speaker’s accent is most likely {accent_name} with a confidence score of {confidence}%."
	    return transcription, accent_name, confidence, summary

	# Gradio Interface
	# One text input (the video URL) mapped onto the four values returned by
	# full_pipeline, in the same order.
	url_input = gr.Textbox(label="Public Video URL (MP4 or Loom)")
	result_outputs = [
	    gr.Textbox(label="Transcription"),
	    gr.Textbox(label="Accent"),
	    gr.Number(label="Confidence Score (%)"),
	    gr.Textbox(label="Summary"),
	]
	demo = gr.Interface(
	    fn=full_pipeline,
	    inputs=url_input,
	    outputs=result_outputs,
	    title="Accent Classifier for English Speakers",
	    description="Paste a video URL to detect English accent and transcription. Built with Wav2Vec2 and Whisper on Hugging Face 🤗.",
	)
	demo.launch()