# accent-detector/src/streamlit_app.py
# Author: amirjamali
# Commit 55dbd8d (unverified): Enhance Dockerfile and Streamlit configuration; add upload directory, max upload size, and XSRF protection; update README with troubleshooting tips for file uploads
# File size: 28 kB
import streamlit as st
import os
import yt_dlp
import subprocess
import librosa
import numpy as np
import torch
import sys
# Global flag for SpeechBrain availability
HAS_SPEECHBRAIN = False
EncoderClassifier = None

# EncoderClassifier has moved between modules across SpeechBrain releases, so
# probe the known locations from newest to oldest and keep the first that works.
import importlib

for _speechbrain_module in (
    "speechbrain.inference.classifiers",  # new path (SpeechBrain 1.0+)
    "speechbrain.pretrained.interfaces",  # legacy path
    "speechbrain.pretrained",  # very old path
):
    try:
        EncoderClassifier = importlib.import_module(_speechbrain_module).EncoderClassifier
    except (ImportError, AttributeError):
        # Module missing, or present without the class: try the next location.
        continue
    HAS_SPEECHBRAIN = True
    break
else:
    # Every location failed; EncoderClassifier stays None and AccentDetector
    # reports the missing model in the UI when it is constructed.
    # The notice goes to stderr rather than st.error() because this runs at
    # import time, before st.set_page_config(), which Streamlit documents as
    # having to be the first Streamlit command on the page.
    print(
        "⚠️ Unable to import SpeechBrain. Limited functionality available.",
        file=sys.stderr,
    )
# Handle potential compatibility issues with transformers.
# HAS_AUTO_PROCESSOR tells AccentDetector whether AutoProcessor can be used or
# whether it must fall back to AutoFeatureExtractor.
try:
    from transformers import AutoProcessor, AutoModelForAudioClassification
    HAS_AUTO_PROCESSOR = True
except ImportError:
    HAS_AUTO_PROCESSOR = False
    AutoProcessor = None
    try:
        from transformers import AutoModelForAudioClassification
    except ImportError:
        # transformers is missing altogether. Leave the name bound to None so
        # AccentDetector.__init__ (which wraps model loading in try/except)
        # degrades to "no accent model" instead of the whole app crashing here.
        AutoModelForAudioClassification = None
    # Reported on stderr rather than st.warning(): this runs at import time,
    # before st.set_page_config(), which Streamlit documents as having to be
    # the first Streamlit command on the page.
    print(
        "Using a compatible but limited version of transformers. Some features may be limited.",
        file=sys.stderr,
    )
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import tempfile
import time
# Deployment instructions:
# To deploy this app:
# 1. Make sure Docker is installed
# 2. Build the Docker image: docker build -t accent-detector .
# 3. Run the container: docker run -p 8501:8501 --volume /tmp/accent-detector:/app/uploads accent-detector
# For Windows: docker run -p 8501:8501 --volume C:\temp\accent-detector:/app/uploads accent-detector
# 4. Access the app at http://localhost:8501
#
# For cloud deployment:
# - Streamlit Cloud: Connect your GitHub repository to Streamlit Cloud
# - Hugging Face Spaces: Use the Docker deployment option with proper volume mounts
# - Azure/AWS/GCP: Deploy the container using their container services with persistent storage
#
# Troubleshooting file uploads:
# - Set maxUploadSize in .streamlit/config.toml
# - Ensure write permissions on upload directories
# - For 403 errors, check file size and format compatibility
# Load environment variables (if .env file exists).
# This stays best-effort: a problem reading .env must not stop the app, but
# only ordinary exceptions are swallowed (not KeyboardInterrupt/SystemExit)
# and the reason is written to stderr so it can be diagnosed.
try:
    load_dotenv()
except Exception as exc:
    print(f"Could not load .env file: {exc}", file=sys.stderr)
# Check for OpenAI API access - optional for enhanced explanations.
# have_openai is True only when the package imports AND a non-blank key is set;
# an empty OPENAI_API_KEY (common in .env templates) counts as "not configured"
# so the app uses its built-in explanation text instead of a failing API call.
try:
    import openai
    openai.api_key = os.getenv("OPENAI_API_KEY")
    have_openai = bool(openai.api_key and openai.api_key.strip())
except (ImportError, AttributeError):
    have_openai = False
# Lookup table from accent code (as compared against model labels in
# AccentDetector.classify_accent) to the human-readable accent name.
ENGLISH_ACCENTS = dict(
    (
        ("en-us", "American English"),
        ("en-gb", "British English"),
        ("en-au", "Australian English"),
        ("en-ca", "Canadian English"),
        ("en-ie", "Irish English"),
        ("en-scotland", "Scottish English"),
        ("en-in", "Indian English"),
        ("en-za", "South African English"),
        ("en-ng", "Nigerian English"),
        ("en-caribbean", "Caribbean English"),
    )
)
def download_video(url, video_path="video.mp4", cookies_file=None):
    """Download a video from a URL with yt-dlp.

    Args:
        url: Address of the video to download.
        video_path: Output template/path handed to yt-dlp ("outtmpl").
        cookies_file: Optional path to a cookies.txt file. It is only passed
            to yt-dlp when the file actually exists.

    Returns:
        True when the download finished and ``video_path`` exists on disk,
        False otherwise. Failures are reported through Streamlit messages
        rather than raised.
    """
    ydl_opts = {
        "outtmpl": video_path,
        "quiet": False,
        "no_warnings": False,
        "verbose": True  # More detailed output for debugging
    }
    # Only use cookies if explicitly provided via file upload
    # Don't try to access browser cookies in Docker container
    if cookies_file and os.path.exists(cookies_file):
        ydl_opts["cookiefile"] = cookies_file

    # Evaluated before the try block so the error handler below can always
    # rely on it; it only selects which hint is shown after a failure.
    url_lower = str(url).lower()
    is_youtube = "youtube" in url_lower or "youtu.be" in url_lower

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
    except Exception as e:
        error_msg = str(e)
        st.error(f"Download error: {error_msg}")
        # Provide specific guidance based on error type
        error_lower = error_msg.lower()
        if is_youtube and ("bot" in error_lower or "sign in" in error_lower):
            st.warning("YouTube requires authentication. Please upload a cookies.txt file or try a direct video link.")
        elif "not find" in error_lower and "cookies" in error_lower:
            st.warning("Browser cookies could not be accessed. Please upload a cookies.txt file.")
        elif "network" in error_lower or "timeout" in error_lower:
            st.warning("Network error. Please check your internet connection and try again.")
        return False

    if os.path.exists(video_path):
        return True
    st.error(f"Video downloaded but file not found: {video_path}")
    return False
def extract_audio(video_path="video.mp4", audio_path="audio.wav"):
    """Extract audio from video file using ffmpeg.

    The audio is written as 16-bit PCM WAV, mono, 16 kHz.

    Args:
        video_path: Path of the input media file.
        audio_path: Path of the WAV file to create.

    Returns:
        True if ``audio_path`` exists after ffmpeg has run, False otherwise.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status
            (the error is also shown in the Streamlit UI before re-raising).
    """
    command = [
        'ffmpeg',
        '-y',  # overwrite an existing output instead of prompting for confirmation
        '-i', video_path,
        '-vn',
        '-acodec', 'pcm_s16le',
        '-ar', '16000',
        '-ac', '1',
        audio_path,
    ]
    try:
        subprocess.run(command, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        # Decode leniently: ffmpeg diagnostics are not guaranteed to be valid
        # UTF-8, and a decode failure here would hide the real error.
        ffmpeg_output = e.stderr.decode('utf-8', errors='replace') if e.stderr else ""
        st.error(f"Error extracting audio: {e}")
        st.error(f"ffmpeg output: {ffmpeg_output}")
        raise
    return os.path.exists(audio_path)
class AccentDetector:
    """Speech analysis pipeline: language identification, then accent classification.

    Construction tries to load two models and records the outcome in
    ``have_lang_id`` and ``have_accent_model``. Loading problems are reported
    through Streamlit messages instead of being raised, and every analysis
    method degrades to a default answer when its model is unavailable.
    """

    def __init__(self):
        # Initialize language identification model
        self.have_lang_id = False
        try:
            if EncoderClassifier is not None:
                self.lang_id = EncoderClassifier.from_hparams(
                    source="speechbrain/lang-id-commonlanguage_ecapa",
                    savedir="tmp_model"
                )
                self.have_lang_id = True
            else:
                st.error("SpeechBrain not available. Language identification disabled.")
        except Exception as e:
            st.error(f"Error loading language ID model: {str(e)}")

        # Initialize the accent classifier
        self.have_accent_model = False
        try:
            self.model_name = "speechbrain/lang-id-voxlingua107-ecapa"
            # Handle case where AutoProcessor is not available
            if HAS_AUTO_PROCESSOR:
                self.processor = AutoProcessor.from_pretrained(self.model_name)
            else:
                # Fall back to using feature_extractor
                from transformers import AutoFeatureExtractor
                self.processor = AutoFeatureExtractor.from_pretrained(self.model_name)
            self.model = AutoModelForAudioClassification.from_pretrained(self.model_name)
            self.have_accent_model = True
        except Exception as e:
            st.warning(f"Could not load accent model: {str(e)}")
            self.have_accent_model = False

    @staticmethod
    def _is_english_label(label):
        """Return True when a language label denotes English.

        The label is lower-cased and split on non-alphanumeric characters; it
        counts as English when one of the resulting tokens is "en", "eng" or
        "english". Whole-token matching keeps labels that merely contain those
        letters (for example "Bengali") from being treated as English.
        """
        normalized = "".join(ch if ch.isalnum() else " " for ch in str(label).lower())
        return any(token in ("en", "eng", "english") for token in normalized.split())

    @staticmethod
    def _fallback_explanation(accent, confidence, is_english, language):
        """Build the plain-text explanation used when OpenAI is unavailable or fails."""
        if is_english:
            return f"The speaker has a {accent} accent with {confidence*100:.1f}% confidence. The speech was identified as English."
        return f"The speech was identified as {language}, not English. English confidence is low."

    def is_english(self, audio_path, threshold=0.7):
        """
        Determine if the speech is English and return confidence score

        Args:
            audio_path: Path of the audio file to classify.
            threshold: Accepted for interface compatibility; it is not
                applied to the decision.

        Returns:
            Tuple ``(is_english, language_label, score)``. When the language
            model is unavailable the result is ``(True, "en", 1.0)``; when
            classification raises, it is ``(True, "en", 0.5)``.
        """
        if not getattr(self, 'have_lang_id', False):
            # If language ID model is not available, assume English
            st.warning("Language identification is not available. Assuming English speech.")
            return True, "en", 1.0
        try:
            out_prob, score, index, lang = self.lang_id.classify_file(audio_path)
            # The label may arrive wrapped in a list/tuple (one entry per item
            # in the batch); unwrap the single entry before treating it as text.
            if isinstance(lang, (list, tuple)):
                lang = lang[0] if lang else ""
            lang = str(lang)
            score = float(score)
            return self._is_english_label(lang), lang, score
        except Exception as e:
            st.warning(f"Error identifying language: {str(e)}. Assuming English speech.")
            return True, "en", 0.5

    def classify_accent(self, audio_path):
        """
        Classify the specific English accent

        Args:
            audio_path: Path of the audio file; it is loaded at 16 kHz.

        Returns:
            Tuple ``(accent_name, confidence)`` with confidence taken from the
            softmax of the model logits. ``("Unknown English Accent", 0.0)``
            is returned when the accent model is unavailable or fails.
        """
        if not getattr(self, 'have_accent_model', False):
            return "Unknown English Accent", 0.0
        try:
            # Load and preprocess audio
            audio, sr = librosa.load(audio_path, sr=16000)
            inputs = self.processor(audio, sampling_rate=sr, return_tensors="pt")
            # Get predictions
            with torch.no_grad():
                outputs = self.model(**inputs)
            # Get probabilities
            probs = outputs.logits.softmax(dim=-1)[0]
            prediction_id = probs.argmax().item()
            confidence = probs[prediction_id].item()
            # Get predicted label
            accent_code = self.model.config.id2label[prediction_id]
            # Map to English accent if possible
            if accent_code.startswith('en-'):
                # Confidence is kept as-is for recognised English accent codes.
                accent = ENGLISH_ACCENTS.get(accent_code, f"English ({accent_code})")
            else:
                # If it's not an English accent code, use our pre-classification
                is_english, _, _ = self.is_english(audio_path)
                accent = "General English" if is_english else f"Non-English ({accent_code})"
                confidence *= 0.7  # Reduce confidence for non-specific matches
            return accent, confidence
        except Exception as e:
            st.error(f"Error in accent classification: {str(e)}")
            return "Unknown English Accent", 0.0

    def generate_explanation(self, audio_path, accent, confidence, is_english, language):
        """
        Generate an explanation of the accent detection results using OpenAI API (if available)

        Args:
            audio_path: Kept for interface compatibility; the explanation is
                built from the already-computed values below, so the audio is
                not analysed again.
            accent: Accent name to describe.
            confidence: Accent confidence as a fraction (shown as a percentage).
            is_english: Whether the speech was identified as English.
            language: Detected language label.

        Returns:
            The model-written summary, or a fixed-format sentence when OpenAI
            is not configured or the request fails.
        """
        fallback = self._fallback_explanation(accent, confidence, is_english, language)
        if not have_openai:
            return fallback
        try:
            import openai
            prompt = (
                "\n"
                "Audio analysis detected a speaker with the following characteristics:\n"
                f"- Primary accent/language: {accent}\n"
                f"- Confidence score: {confidence*100:.1f}%\n"
                f"- Detected language category: {language}\n"
                f"- Is English: {is_english}\n"
                "Based on this information, provide a 2-3 sentence summary about the speaker's accent.\n"
                "Focus on how clear their English is and any notable accent characteristics.\n"
                "This is for hiring purposes to evaluate English speaking abilities.\n"
            )
            response = openai.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are an accent analysis specialist providing factual assessments."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=150
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            st.error(f"Error generating explanation: {str(e)}")
            return fallback

    def analyze_audio(self, audio_path):
        """
        Complete analysis pipeline returning all needed results

        Returns:
            Dict with keys ``is_english``, ``accent``, ``accent_confidence``
            (percentage), ``english_confidence`` (percentage),
            ``language_detected``, ``explanation`` and ``audio_viz`` (a
            Matplotlib figure of the waveform, or None if plotting failed).
            The caller owns the figure and should close it after rendering.
        """
        # Check if it's English
        is_english, lang, lang_score = self.is_english(audio_path)
        # Classify accent if it's English
        if is_english:
            accent, accent_confidence = self.classify_accent(audio_path)
            english_confidence = lang_score * 100  # Scale to percentage
        else:
            accent = f"Non-English ({lang})"
            accent_confidence = lang_score
            english_confidence = max(0, min(30, lang_score * 50))  # Cap at 30% if non-English
        # Generate explanation
        explanation = self.generate_explanation(audio_path, accent, accent_confidence, is_english, lang)
        # Create visualization of the audio waveform
        try:
            y, sr = librosa.load(audio_path, sr=None)
            fig, ax = plt.subplots(figsize=(10, 2))
            ax.plot(y)
            ax.set_xlabel('Sample')
            ax.set_ylabel('Amplitude')
            ax.set_title('Audio Waveform')
            # Lay out this specific figure rather than pyplot's "current" one.
            fig.tight_layout()
            audio_viz = fig
        except Exception as e:
            st.warning(f"Could not generate audio visualization: {str(e)}")
            audio_viz = None
        return {
            "is_english": is_english,
            "accent": accent,
            "accent_confidence": accent_confidence * 100,  # Scale to percentage
            "english_confidence": english_confidence,
            "language_detected": lang,
            "explanation": explanation,
            "audio_viz": audio_viz
        }
def process_uploaded_audio(uploaded_file):
    """Process uploaded audio file.

    The upload is written into a private temporary directory, converted to WAV
    first when it is an MP4, analysed with AccentDetector, and the temporary
    directory is removed again whether or not the analysis succeeds.

    Args:
        uploaded_file: Upload object providing ``name`` and ``getbuffer()``
            (as used with the value returned by ``st.file_uploader``).

    Returns:
        The result dictionary produced by ``AccentDetector.analyze_audio``.

    Raises:
        Exception: Any failure is shown in the Streamlit UI and re-raised so
            the caller can add format-specific guidance.
    """
    file_extension = os.path.splitext(uploaded_file.name)[1].lower()
    # Nanosecond stamp keeps file names distinct between runs, like the
    # timestamp-based names used before, but without same-second collisions.
    unique_tag = str(time.time_ns())
    try:
        # The directory (and everything written into it) is deleted when the
        # with-block exits, on success and on error alike.
        with tempfile.TemporaryDirectory(prefix="accent_upload_") as work_dir:
            # Write the uploaded file to disk with proper extension
            temp_input_path = os.path.join(work_dir, f"uploaded_audio_{unique_tag}{file_extension}")
            with open(temp_input_path, "wb") as f:
                f.write(uploaded_file.getbuffer())

            if file_extension == ".mp4":
                # For MP4 files, extract the audio using ffmpeg
                st.info("Extracting audio from video file...")
                audio_path = os.path.join(work_dir, f"extracted_audio_{unique_tag}.wav")
                if not extract_audio(temp_input_path, audio_path):
                    raise FileNotFoundError(f"Extracted audio file not found: {audio_path}")
            else:
                # For audio files, use them directly
                audio_path = temp_input_path

            detector = AccentDetector()
            return detector.analyze_audio(audio_path)
    except Exception as e:
        st.error(f"Error processing audio: {str(e)}")
        raise
# --- Streamlit App ---
# Page-level configuration, followed by the title and introductory copy.
st.set_page_config(page_title="🎤 English Accent Detector", page_icon="🎤", layout="wide")
st.title("🎤 English Accent Detection Tool")

_APP_SUMMARY = """
This application analyzes a speaker's English accent from video URLs or audio uploads,
providing detailed insights for hiring evaluation purposes.
"""
st.markdown(_APP_SUMMARY)

# Usage tips, rendered as an info box inside their own container.
_USAGE_TIPS = """
💡 **Tips for best results:**
- Use **Loom** or **Vimeo** videos (more reliable than YouTube)
- For YouTube videos, you may need to provide cookies
- Audio clips of 15-30 seconds work best
- Clear speech with minimal background noise is ideal
"""
with st.container():
    st.info(_USAGE_TIPS)

# Short list of what the analysis reports.
_APP_FEATURES = """
This app analyzes a speaker's English accent from a video or audio source.
It provides:
- Classification of the accent (British, American, etc.)
- Confidence score for English proficiency
- Explanation of accent characteristics
"""
st.markdown(_APP_FEATURES)
# Create tabs for different input methods
tab1, tab2 = st.tabs(["Video URL", "Upload Audio"])

# Tab 1: download a video from a URL, extract its audio and analyse the accent.
with tab1:
    st.markdown("### 🎬 Analyze video from URL")
    url = st.text_input("Enter a public video URL",
                        placeholder="https://www.loom.com/..., https://vimeo.com/..., or direct MP4 link")
    # Recommend alternative sources
    st.caption("⚠️ **Note**: YouTube videos often require authentication. For best results, use Loom, Vimeo or direct video links.")
    # Add file uploader for cookies.txt
    cookies_file = None
    uploaded_cookies = st.file_uploader("Upload cookies.txt file for YouTube (if needed)",
                                        type="txt",
                                        help="Only needed for YouTube videos that require authentication")
    if uploaded_cookies is not None:
        # The cookies are written to disk only when an analysis actually runs
        # (see below), so credentials are not left behind on every rerun.
        st.success("Cookies file uploaded successfully!")
    with st.expander("Having trouble with YouTube videos?"):
        st.markdown("""
### YouTube Authentication Issues
YouTube's anti-bot measures often block automated video downloads. To solve this:
#### Option 1: Use Alternative Video Sources (Recommended)
These typically work without authentication issues:
- [Loom](https://www.loom.com/) - Great for screen recordings
- [Vimeo](https://vimeo.com/) - High-quality video hosting
- [Streamable](https://streamable.com/) - Simple video sharing
- Any direct MP4 link
#### Option 2: Upload Cookies for YouTube
1. Install a browser extension like [Get cookies.txt](https://chrome.google.com/webstore/detail/get-cookiestxt-locally/cclelndahbckbenkjhflpdbgdldlbecc)
2. Login to YouTube in your browser
3. Use the extension to export cookies to a .txt file
4. Upload the cookies.txt file using the uploader above
#### Option 3: Use Audio Upload Instead
The 'Upload Audio' tab allows direct analysis of audio files without URL issues.
""")
    if st.button("Analyze Video"):
        if not url:
            st.warning("Please enter a valid URL")
        else:
            # Generate unique filenames using timestamp to avoid conflicts
            timestamp = str(int(time.time()))
            video_path = f"video_{timestamp}.mp4"
            audio_path = f"audio_{timestamp}.wav"
            try:
                # Create a placeholder for status updates
                status = st.empty()
                if uploaded_cookies is not None:
                    # Save the uploaded cookies file to a temporary file
                    cookies_file = f"cookies_{timestamp}.txt"
                    with open(cookies_file, "wb") as f:
                        f.write(uploaded_cookies.getbuffer())
                # Download and process the video
                status.text("Downloading video...")
                download_success = download_video(url, video_path, cookies_file)
                if not download_success:
                    st.error("Failed to download video")
                else:
                    status.text("Extracting audio...")
                    extract_success = extract_audio(video_path, audio_path)
                    if not extract_success:
                        st.error("Failed to extract audio")
                    else:
                        status.text("Analyzing accent... (this may take a moment)")
                        detector = AccentDetector()
                        results = detector.analyze_audio(audio_path)
                        # Display results
                        st.success("✅ Analysis Complete!")
                        # Create columns for results
                        col1, col2 = st.columns([2, 1])
                        with col1:
                            st.subheader("Accent Analysis Results")
                            st.markdown(f"**Detected Accent:** {results['accent']}")
                            st.markdown(f"**English Proficiency:** {results['english_confidence']:.1f}%")
                            st.markdown(f"**Accent Confidence:** {results['accent_confidence']:.1f}%")
                            # Show explanation in a box
                            st.markdown("### Expert Analysis")
                            st.info(results['explanation'])
                        with col2:
                            if results['audio_viz'] is not None:
                                st.pyplot(results['audio_viz'])
                                # Figures created through pyplot stay registered until
                                # closed; release it once it has been rendered.
                                plt.close(results['audio_viz'])
                            # Show audio playback
                            st.audio(audio_path)
            except Exception as e:
                st.error(f"Error during analysis: {str(e)}")
            finally:
                # Clean up files on every outcome (failed download, failed
                # extraction, exception), not only after a successful analysis.
                for _leftover in (video_path, audio_path, cookies_file):
                    try:
                        if _leftover and os.path.exists(_leftover):
                            os.remove(_leftover)
                    except OSError as e:
                        st.warning(f"Couldn't clean up temporary files: {str(e)}")
# Tab 2: analyse an uploaded audio (or MP4) file directly.
with tab2:
    st.markdown("### 🎵 Upload Audio File")
    st.caption("**Recommended option!** Direct audio upload is more reliable than video URLs.")
    # Add some information about file size limits
    st.info("📝 **File Requirements**: \n"
            "• Maximum file size: 200MB \n"
            "• Supported formats: WAV, MP3, M4A, OGG, FLAC, MP4 \n"
            "• Recommended length: 15-60 seconds of clear speech")
    uploaded_file = st.file_uploader("Upload an audio file",
                                     type=["wav", "mp3", "m4a", "ogg", "flac", "mp4"],
                                     help="Support for WAV, MP3, M4A, OGG, FLAC and MP4 formats",
                                     accept_multiple_files=False)
    if uploaded_file is not None:
        # Show a preview of the audio
        st.markdown("#### Audio Preview:")
        st.audio(uploaded_file)
        st.markdown("#### Ready for Analysis")
        col1, col2 = st.columns([1, 3])
        with col1:
            analyze_button = st.button("Analyze Audio", type="primary", use_container_width=True)
        with col2:
            st.caption("Tip: 15-30 seconds of clear speech works best for accent detection")
        if analyze_button:
            with st.spinner("Analyzing audio... (this may take 15-30 seconds)"):
                try:
                    # Check file size before processing
                    file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
                    if file_size_mb > 190:  # Stay below the 200MB limit with some buffer
                        st.error(f"File size ({file_size_mb:.1f}MB) is too large. Maximum allowed is 190MB.")
                        st.info("Tip: Try trimming your audio to just the speech segment for better results.")
                    else:
                        # Check the file type and inform user about processing steps
                        file_extension = os.path.splitext(uploaded_file.name)[1].lower()
                        if file_extension == '.mp4':
                            st.info("Processing video file - extracting audio track...")
                        # Process the file
                        results = process_uploaded_audio(uploaded_file)
                        # Display results
                        st.success("✅ Analysis Complete!")
                        # Create columns for results
                        col1, col2 = st.columns([2, 1])
                        with col1:
                            st.subheader("Accent Analysis Results")
                            st.markdown(f"**Detected Accent:** {results['accent']}")
                            st.markdown(f"**English Proficiency:** {results['english_confidence']:.1f}%")
                            st.markdown(f"**Accent Confidence:** {results['accent_confidence']:.1f}%")
                            # Show explanation in a box
                            st.markdown("### Expert Analysis")
                            st.info(results['explanation'])
                        with col2:
                            if results['audio_viz'] is not None:
                                st.pyplot(results['audio_viz'])
                                # Figures created through pyplot stay registered until
                                # closed; release it once it has been rendered.
                                plt.close(results['audio_viz'])
                except subprocess.CalledProcessError as e:
                    st.error("Error processing audio file")
                    st.error(f"FFmpeg error: {e.stderr.decode('utf-8') if e.stderr else str(e)}")
                    st.info("Troubleshooting tips:\n"
                            "• Try a different audio file format (WAV or MP3 recommended)\n"
                            "• Make sure the file is not corrupted\n"
                            "• Try a shorter audio clip")
                except PermissionError as e:
                    st.error(f"Permission error: {str(e)}")
                    st.info("The app doesn't have permission to access or create temporary files. "
                            "This could be due to Docker container permissions. "
                            "Contact the administrator or try using a different file.")
                except OSError as e:
                    st.error(f"System error: {str(e)}")
                    st.info("Check that the file isn't corrupted and try with a smaller audio clip.")
                except Exception as e:
                    # Anything else: show the message, then a hint chosen from
                    # keywords found in the error text.
                    error_msg = str(e)
                    st.error(f"Error during analysis: {error_msg}")
                    if "403" in error_msg:
                        st.warning("Received a 403 Forbidden error. This may be due to: \n"
                                   "• File size exceeding limits\n"
                                   "• Temporary file permission issues\n"
                                   "• Network restrictions")
                        st.info("Try a smaller audio file (less than 50MB) or a different format.")
                    elif "timeout" in error_msg.lower():
                        st.warning("The request timed out. Try a shorter audio clip or check your internet connection.")
                    elif "memory" in error_msg.lower():
                        st.warning("Out of memory error. Try a shorter audio clip.")
                    else:
                        st.info("If the problem persists, try a different audio file format such as MP3 or WAV.")
# Footer: horizontal rule followed by the attribution line.
for _footer_line in ("---", "Deployed using Streamlit • Built with SpeechBrain and Transformers"):
    st.markdown(_footer_line)

# Collapsible overview of the processing stages.
_HOW_IT_WORKS = """
This app uses a multi-stage process to analyze a speaker's accent:
1. **Audio Extraction**: The audio track is extracted from the input video or directly processed from uploaded audio.
2. **Language Identification**: First, we determine if the speech is English using SpeechBrain's language identification model.
3. **Accent Classification**: For English speech, we analyze the specific accent using a transformer-based model trained on diverse accent data.
4. **English Proficiency Score**: A confidence score is calculated based on both language identification and accent clarity.
5. **Analysis Summary**: An explanation is generated describing accent characteristics relevant for hiring evaluations.
"""
with st.expander("ℹ️ How It Works"):
    st.markdown(_HOW_IT_WORKS)