# English Accent Detection Tool (Streamlit app).
#
# Deployment instructions:
# To deploy this app:
# 1. Make sure Docker is installed
# 2. Build the Docker image: docker build -t accent-detector .
# 3. Run the container: docker run -p 8501:8501 accent-detector
# 4. Access the app at http://localhost:8501
#
# For cloud deployment:
# - Streamlit Cloud: Connect your GitHub repository to Streamlit Cloud
# - Hugging Face Spaces: Use the Docker deployment option
# - Azure/AWS/GCP: Deploy the container using their container services

import importlib
import os
import subprocess
import sys  # noqa: F401  (kept: may be relied on by tooling around this script)
import tempfile
import time  # noqa: F401  (kept: previously used for timestamped file names)
from typing import Any, Dict, Optional, Tuple

import librosa
import matplotlib.pyplot as plt
import numpy as np  # noqa: F401
import streamlit as st
import torch
import yt_dlp
from dotenv import load_dotenv

# Streamlit documents set_page_config as having to be the first Streamlit
# command of the script. The optional-dependency checks below may emit
# st.error / st.warning, so the page has to be configured before them.
st.set_page_config(
    page_title="🎤 English Accent Detector",
    page_icon="🎤",
    layout="wide",
)

# Global flag for SpeechBrain availability
HAS_SPEECHBRAIN = False
EncoderClassifier = None

# Handle SpeechBrain import with fallbacks for different versions.
for _speechbrain_module in (
    "speechbrain.inference.classifiers",  # new path (SpeechBrain 1.0+)
    "speechbrain.pretrained.interfaces",  # legacy path
    "speechbrain.pretrained",  # very old path
):
    try:
        EncoderClassifier = importlib.import_module(
            _speechbrain_module
        ).EncoderClassifier
    except (ImportError, AttributeError):
        continue
    HAS_SPEECHBRAIN = True
    break
else:
    # All import paths failed; AccentDetector copes with a missing classifier.
    st.error("⚠️ Unable to import SpeechBrain. Limited functionality available.")

# Handle potential compatibility issues with transformers
try:
    from transformers import AutoModelForAudioClassification, AutoProcessor

    HAS_AUTO_PROCESSOR = True
except ImportError:
    from transformers import AutoModelForAudioClassification

    HAS_AUTO_PROCESSOR = False
    st.warning(
        "Using a compatible but limited version of transformers. "
        "Some features may be limited."
    )

# Load environment variables (if .env file exists). This is best-effort: a
# broken .env must not prevent the app from starting.
try:
    load_dotenv()
except Exception:
    pass

# Check for OpenAI API access - optional for enhanced explanations
try:
    import openai

    openai.api_key = os.getenv("OPENAI_API_KEY")
    # An empty key is as unusable as a missing one.
    have_openai = bool(openai.api_key)
except (ImportError, AttributeError):
    have_openai = False

# English accent categories
ENGLISH_ACCENTS = {
    "en-us": "American English",
    "en-gb": "British English",
    "en-au": "Australian English",
    "en-ca": "Canadian English",
    "en-ie": "Irish English",
    "en-scotland": "Scottish English",
    "en-in": "Indian English",
    "en-za": "South African English",
    "en-ng": "Nigerian English",
    "en-caribbean": "Caribbean English",
}


def download_video(url, video_path="video.mp4", cookies_file=None):
    """Download a video from a URL with yt-dlp.

    Args:
        url: Address of the video to download.
        video_path: Output template/path handed to yt-dlp.
        cookies_file: Optional path to a cookies.txt file. It is only used
            when the file actually exists.

    Returns:
        True when ``video_path`` exists after the download, False otherwise.
        Failures are reported through Streamlit instead of being raised.
    """
    ydl_opts = {
        "outtmpl": video_path,
        "quiet": False,
        "no_warnings": False,
        "verbose": True,  # More detailed output for debugging
    }

    # Only use cookies if explicitly provided via file upload.
    # Don't try to access browser cookies in Docker container.
    if cookies_file and os.path.exists(cookies_file):
        ydl_opts["cookiefile"] = cookies_file

    # Computed before the try block so the error handler can always read it.
    lowered_url = (url or "").lower()
    is_youtube = "youtube" in lowered_url or "youtu.be" in lowered_url

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
    except Exception as e:
        error_msg = str(e)
        lowered_msg = error_msg.lower()
        st.error(f"Download error: {error_msg}")

        # Provide specific guidance based on error type
        if is_youtube and ("bot" in lowered_msg or "sign in" in lowered_msg):
            st.warning(
                "YouTube requires authentication. Please upload a cookies.txt "
                "file or try a direct video link."
            )
        elif "not find" in lowered_msg and "cookies" in lowered_msg:
            st.warning(
                "Browser cookies could not be accessed. "
                "Please upload a cookies.txt file."
            )
        elif "network" in lowered_msg or "timeout" in lowered_msg:
            st.warning(
                "Network error. Please check your internet connection and try again."
            )
        return False

    if os.path.exists(video_path):
        return True
    st.error(f"Video downloaded but file not found: {video_path}")
    return False


def extract_audio(video_path="video.mp4", audio_path="audio.wav"):
    """Extract a mono 16 kHz 16-bit PCM WAV track from a video using ffmpeg.

    Args:
        video_path: Input media file.
        audio_path: Destination WAV file; overwritten if it already exists.

    Returns:
        True when ``audio_path`` exists after ffmpeg finished.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status
            (the error is shown in the UI first, then re-raised).
    """
    command = [
        "ffmpeg",
        "-y",  # never block on the interactive "overwrite?" prompt
        "-i", video_path,
        "-vn",
        "-acodec", "pcm_s16le",
        "-ar", "16000",
        "-ac", "1",
        audio_path,
    ]
    try:
        subprocess.run(command, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        st.error(f"Error extracting audio: {e}")
        # ffmpeg may echo arbitrary metadata bytes; never fail while decoding.
        stderr_text = (e.stderr or b"").decode("utf-8", errors="replace")
        st.error(f"ffmpeg output: {stderr_text}")
        raise
    return os.path.exists(audio_path)


class AccentDetector:
    """Language-identification and accent-classification pipeline.

    Both models are optional: when one cannot be loaded the matching
    ``have_*`` flag stays False and the methods fall back to defaults.
    """

    def __init__(self):
        # Initialize language identification model
        self.have_lang_id = False
        try:
            if EncoderClassifier is not None:
                self.lang_id = EncoderClassifier.from_hparams(
                    source="speechbrain/lang-id-commonlanguage_ecapa",
                    savedir="tmp_model",
                )
                self.have_lang_id = True
            else:
                st.error(
                    "SpeechBrain not available. Language identification disabled."
                )
        except Exception as e:
            st.error(f"Error loading language ID model: {str(e)}")

        # Initialize the accent classifier.
        # NOTE(review): this checkpoint is published under the SpeechBrain
        # organisation; confirm it can be loaded through the transformers
        # Auto* classes. If it cannot, have_accent_model stays False and
        # classify_accent always returns the "Unknown" default.
        self.have_accent_model = False
        try:
            self.model_name = "speechbrain/lang-id-voxlingua107-ecapa"

            # Handle case where AutoProcessor is not available
            if HAS_AUTO_PROCESSOR:
                self.processor = AutoProcessor.from_pretrained(self.model_name)
            else:
                # Fall back to using feature_extractor
                from transformers import AutoFeatureExtractor

                self.processor = AutoFeatureExtractor.from_pretrained(
                    self.model_name
                )

            self.model = AutoModelForAudioClassification.from_pretrained(
                self.model_name
            )
            self.have_accent_model = True
        except Exception as e:
            st.warning(f"Could not load accent model: {str(e)}")
            self.have_accent_model = False

    @staticmethod
    def _fallback_explanation(accent, confidence, is_english, language):
        """Build the template explanation used when OpenAI is unavailable or fails."""
        if is_english:
            return (
                f"The speaker has a {accent} accent with {confidence*100:.1f}% "
                "confidence. The speech was identified as English."
            )
        return (
            f"The speech was identified as {language}, not English. "
            "English confidence is low."
        )

    def is_english(self, audio_path, threshold=0.7) -> Tuple[bool, str, float]:
        """Determine if the speech is English and return the model's score.

        Args:
            audio_path: Path of the audio file to classify.
            threshold: Accepted for backward compatibility; the decision is
                currently based on the predicted label only.

        Returns:
            ``(is_english, language_label, score)``. When the language model
            is unavailable the result is ``(True, "en", 1.0)``; when
            classification fails it is ``(True, "en", 0.5)``.
        """
        if not getattr(self, "have_lang_id", False):
            # If language ID model is not available, assume English
            st.warning(
                "Language identification is not available. Assuming English speech."
            )
            return True, "en", 1.0

        try:
            out_prob, score, index, lang = self.lang_id.classify_file(audio_path)
            # SpeechBrain returns the decoded labels as a list (one entry per
            # batch item); calling .lower() on that list used to raise and
            # silently force the "assume English" fallback below.
            if isinstance(lang, (list, tuple)):
                lang = lang[0] if lang else ""
            lang = str(lang)
            score = float(score)

            # Check if language is English (slightly fuzzy match)
            label = lang.lower()
            english = "eng" in label or "en-" in label or label == "en"
            return english, lang, score
        except Exception as e:
            st.warning(
                f"Error identifying language: {str(e)}. Assuming English speech."
            )
            return True, "en", 0.5

    def classify_accent(self, audio_path) -> Tuple[str, float]:
        """Classify the specific English accent.

        Returns:
            ``(accent_name, confidence)`` with confidence taken from the
            softmax output. ``("Unknown English Accent", 0.0)`` is returned
            when the accent model is unavailable or classification fails.
        """
        if not self.have_accent_model:
            return "Unknown English Accent", 0.0

        try:
            # Load and preprocess audio
            audio, sr = librosa.load(audio_path, sr=16000)
            inputs = self.processor(audio, sampling_rate=sr, return_tensors="pt")

            # Get predictions
            with torch.no_grad():
                outputs = self.model(**inputs)

            # Get probabilities
            probs = outputs.logits.softmax(dim=-1)[0]
            prediction_id = probs.argmax().item()
            confidence = probs[prediction_id].item()

            # Get predicted label
            accent_code = self.model.config.id2label[prediction_id]

            # Map to English accent if possible; confidence is kept as-is.
            if accent_code.startswith("en-"):
                accent = ENGLISH_ACCENTS.get(accent_code, f"English ({accent_code})")
                return accent, confidence

            # Not an English accent code: fall back to our pre-classification.
            english, _, _ = self.is_english(audio_path)
            accent = "General English" if english else f"Non-English ({accent_code})"
            # Reduce confidence for non-specific matches
            return accent, confidence * 0.7
        except Exception as e:
            st.error(f"Error in accent classification: {str(e)}")
            return "Unknown English Accent", 0.0

    def generate_explanation(self, audio_path, accent, confidence, is_english, language):
        """Explain the detection results, using the OpenAI API if available.

        Args:
            audio_path: Path of the analysed audio (kept for interface
                compatibility; the already computed results are used).
            accent: Accent label to describe.
            confidence: Accent confidence as a 0-1 fraction.
            is_english: Whether the speech was classified as English.
            language: Detected language label.

        Returns:
            A short explanation string; a template sentence is used when
            OpenAI is not configured or the request fails.
        """
        fallback = self._fallback_explanation(accent, confidence, is_english, language)
        if not have_openai:
            return fallback

        try:
            # Reuse the caller's language-ID results rather than running the
            # model a second time on the same file.
            prompt = (
                "Audio analysis detected a speaker with the following characteristics:\n"
                f"- Primary accent/language: {accent}\n"
                f"- Confidence score: {confidence*100:.1f}%\n"
                f"- Detected language category: {language}\n"
                f"- Is English: {is_english}\n"
                "\n"
                "Based on this information, provide a 2-3 sentence summary about "
                "the speaker's accent.\n"
                "Focus on how clear their English is and any notable accent "
                "characteristics.\n"
                "This is for hiring purposes to evaluate English speaking abilities.\n"
            )

            response = openai.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        "content": "You are an accent analysis specialist providing factual assessments.",
                    },
                    {"role": "user", "content": prompt},
                ],
                max_tokens=150,
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            st.error(f"Error generating explanation: {str(e)}")
            return fallback

    def analyze_audio(self, audio_path) -> Dict[str, Any]:
        """Run the complete analysis pipeline.

        Returns:
            A dict with the keys ``is_english``, ``accent``,
            ``accent_confidence`` (percentage), ``english_confidence``
            (percentage), ``language_detected``, ``explanation`` and
            ``audio_viz`` (a matplotlib figure or None). The caller owns the
            figure and should close it after rendering.
        """
        # Check if it's English
        is_english, lang, lang_score = self.is_english(audio_path)

        # Classify accent if it's English
        if is_english:
            accent, accent_confidence = self.classify_accent(audio_path)
            english_confidence = lang_score * 100  # Scale to percentage
        else:
            accent = f"Non-English ({lang})"
            accent_confidence = lang_score
            # Cap at 30% if non-English
            english_confidence = max(0, min(30, lang_score * 50))

        # Generate explanation
        explanation = self.generate_explanation(
            audio_path, accent, accent_confidence, is_english, lang
        )

        # Create visualization of the audio waveform
        try:
            y, sr = librosa.load(audio_path, sr=None)
            fig, ax = plt.subplots(figsize=(10, 2))
            ax.plot(y)
            ax.set_xlabel("Sample")
            ax.set_ylabel("Amplitude")
            ax.set_title("Audio Waveform")
            plt.tight_layout()
            audio_viz = fig
        except Exception as e:
            st.warning(f"Could not generate audio visualization: {str(e)}")
            audio_viz = None

        return {
            "is_english": is_english,
            "accent": accent,
            "accent_confidence": accent_confidence * 100,  # Scale to percentage
            "english_confidence": english_confidence,
            "language_detected": lang,
            "explanation": explanation,
            "audio_viz": audio_viz,
        }


def process_uploaded_audio(uploaded_file):
    """Analyse an uploaded audio file and return the results dict.

    The upload is written to a temporary file that keeps the original
    extension (falling back to ``.wav``) and is removed again even when the
    analysis raises.
    """
    upload_name = getattr(uploaded_file, "name", "") or ""
    suffix = os.path.splitext(upload_name)[1] or ".wav"

    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_file.write(uploaded_file.getvalue())
        audio_path = temp_file.name

    try:
        detector = AccentDetector()
        return detector.analyze_audio(audio_path)
    finally:
        # Clean up; a failure to delete must not mask the analysis outcome.
        try:
            os.unlink(audio_path)
        except OSError:
            pass


def render_results(results: Dict[str, Any], audio_bytes: Optional[bytes] = None) -> None:
    """Render an analysis result dict (and optional WAV playback) in the page."""
    st.success("✅ Analysis Complete!")

    # Create columns for results
    details_col, media_col = st.columns([2, 1])

    with details_col:
        st.subheader("Accent Analysis Results")
        st.markdown(f"**Detected Accent:** {results['accent']}")
        st.markdown(f"**English Proficiency:** {results['english_confidence']:.1f}%")
        st.markdown(f"**Accent Confidence:** {results['accent_confidence']:.1f}%")

        # Show explanation in a box
        st.markdown("### Expert Analysis")
        st.info(results["explanation"])

    with media_col:
        figure = results["audio_viz"]
        if figure is not None:
            st.pyplot(figure)
            # Close the figure so reruns do not accumulate open figures.
            plt.close(figure)

        # Show audio playback
        if audio_bytes is not None:
            st.audio(audio_bytes, format="audio/wav")


def analyze_video_url(url: str, uploaded_cookies=None) -> None:
    """Download a video, extract its audio, analyse it and render the results.

    All intermediate files, including the uploaded cookies, live in a
    temporary directory that is removed when the function exits, whether the
    analysis succeeded or raised.
    """
    # Create a placeholder for status updates
    status = st.empty()

    with tempfile.TemporaryDirectory(prefix="accent_detector_") as workdir:
        video_path = os.path.join(workdir, "video.mp4")
        audio_path = os.path.join(workdir, "audio.wav")

        cookies_file = None
        if uploaded_cookies is not None:
            # Cookies are credentials: keep them on disk only for this job.
            cookies_file = os.path.join(workdir, "cookies.txt")
            with open(cookies_file, "wb") as f:
                f.write(uploaded_cookies.getbuffer())

        # Download and process the video
        status.text("Downloading video...")
        if not download_video(url, video_path, cookies_file):
            st.error("Failed to download video")
            return

        status.text("Extracting audio...")
        if not extract_audio(video_path, audio_path):
            st.error("Failed to extract audio")
            return

        status.text("Analyzing accent... (this may take a moment)")
        detector = AccentDetector()
        results = detector.analyze_audio(audio_path)

        # Read the audio now; the file disappears with the directory.
        with open(audio_path, "rb") as f:
            audio_bytes = f.read()

    render_results(results, audio_bytes)


# --- Streamlit App ---
st.title("🎤 English Accent Detection Tool")
st.markdown("""
This application analyzes a speaker's English accent from video URLs or audio uploads,
providing detailed insights for hiring evaluation purposes.
""")

# Add container for tips
with st.container():
    st.info("""
💡 **Tips for best results:**
- Use **Loom** or **Vimeo** videos (more reliable than YouTube)
- For YouTube videos, you may need to provide cookies
- Audio clips of 15-30 seconds work best
- Clear speech with minimal background noise is ideal
""")

st.markdown("""
This app analyzes a speaker's English accent from a video or audio source.
It provides:
- Classification of the accent (British, American, etc.)
- Confidence score for English proficiency
- Explanation of accent characteristics
""")

# Create tabs for different input methods
tab1, tab2 = st.tabs(["Video URL", "Upload Audio"])

with tab1:
    st.markdown("### 🎬 Analyze video from URL")
    url = st.text_input(
        "Enter a public video URL",
        placeholder="https://www.loom.com/..., https://vimeo.com/..., or direct MP4 link",
    )

    # Recommend alternative sources
    st.caption(
        "⚠️ **Note**: YouTube videos often require authentication. "
        "For best results, use Loom, Vimeo or direct video links."
    )

    # Add file uploader for cookies.txt. The upload is only written to disk
    # (inside a temporary directory) once an analysis is actually started.
    uploaded_cookies = st.file_uploader(
        "Upload cookies.txt file for YouTube (if needed)",
        type="txt",
        help="Only needed for YouTube videos that require authentication",
    )
    if uploaded_cookies is not None:
        st.success("Cookies file uploaded successfully!")

    with st.expander("Having trouble with YouTube videos?"):
        st.markdown("""
### YouTube Authentication Issues

YouTube's anti-bot measures often block automated video downloads. To solve this:

#### Option 1: Use Alternative Video Sources (Recommended)
These typically work without authentication issues:
- [Loom](https://www.loom.com/) - Great for screen recordings
- [Vimeo](https://vimeo.com/) - High-quality video hosting
- [Streamable](https://streamable.com/) - Simple video sharing
- Any direct MP4 link

#### Option 2: Upload Cookies for YouTube
1. Install a browser extension like [Get cookies.txt](https://chrome.google.com/webstore/detail/get-cookiestxt-locally/cclelndahbckbenkjhflpdbgdldlbecc)
2. Login to YouTube in your browser
3. Use the extension to export cookies to a .txt file
4. Upload the cookies.txt file using the uploader above

#### Option 3: Use Audio Upload Instead
The 'Upload Audio' tab allows direct analysis of audio files without URL issues.
""")

    if st.button("Analyze Video"):
        if not url:
            st.warning("Please enter a valid URL")
        else:
            try:
                analyze_video_url(url, uploaded_cookies)
            except Exception as e:
                st.error(f"Error during analysis: {str(e)}")

with tab2:
    st.markdown("### 🎵 Upload Audio File")
    st.caption(
        "**Recommended option!** Direct audio upload is more reliable than video URLs."
    )

    uploaded_file = st.file_uploader(
        "Upload an audio file",
        type=["wav", "mp3", "m4a", "ogg", "flac"],
        help="Support for WAV, MP3, M4A, OGG and FLAC formats",
    )

    if uploaded_file is not None:
        # Show a preview of the audio
        st.markdown("#### Audio Preview:")
        st.audio(uploaded_file)

        st.markdown("#### Ready for Analysis")
        col1, col2 = st.columns([1, 3])
        with col1:
            analyze_button = st.button(
                "Analyze Audio", type="primary", use_container_width=True
            )
        with col2:
            st.caption(
                "Tip: 15-30 seconds of clear speech works best for accent detection"
            )

        if analyze_button:
            with st.spinner("Analyzing audio... (this may take 15-30 seconds)"):
                try:
                    results = process_uploaded_audio(uploaded_file)
                    render_results(results)
                except Exception as e:
                    st.error(f"Error during analysis: {str(e)}")

# Add footer with deployment info
st.markdown("---")
st.markdown("Deployed using Streamlit • Built with SpeechBrain and Transformers")

# Add a section for how it works
with st.expander("ℹ️ How It Works"):
    st.markdown("""
This app uses a multi-stage process to analyze a speaker's accent:

1. **Audio Extraction**: The audio track is extracted from the input video or directly processed from uploaded audio.

2. **Language Identification**: First, we determine if the speech is English using SpeechBrain's language identification model.

3. **Accent Classification**: For English speech, we analyze the specific accent using a transformer-based model trained on diverse accent data.

4. **English Proficiency Score**: A confidence score is calculated based on both language identification and accent clarity.

5. **Analysis Summary**: An explanation is generated describing accent characteristics relevant for hiring evaluations.
""")