amirjamali committed on
Commit
5bacc9d
·
unverified ·
1 Parent(s): 1b3a125

Refactor the Dockerfile for improved dependency management, and enhance Streamlit app initialization with better error handling for the language identification and accent classification models

Browse files
Dockerfile CHANGED
@@ -1,83 +1,47 @@
1
- FROM python:3.9
2
 
3
- WORKDIR /app
4
-
5
- # Set environment variables to avoid permission errors
6
- ENV HOME=/app \
7
- PYTHONUNBUFFERED=1 \
8
- PYTHONDONTWRITEBYTECODE=1 \
9
- MPLCONFIGDIR=/tmp/matplotlib \
10
- TRANSFORMERS_CACHE=/app/.cache/huggingface \
11
- XDG_CACHE_HOME=/app/.cache
12
 
13
- # Create non-root user for better security
14
- RUN groupadd -g 1000 appuser && \
15
- useradd -u 1000 -g appuser -s /bin/sh -m appuser
16
 
17
- # Install system dependencies including ffmpeg for audio processing
18
- # Add retry logic and better mirrors for improved network reliability
19
- RUN apt-get update --allow-releaseinfo-change || (sleep 2 && apt-get update) && \
20
- apt-get install -y \
21
- build-essential \
22
- curl \
23
- software-properties-common \
24
- git \
25
- ffmpeg \
26
- libsndfile1 \
27
- ca-certificates \
28
- && apt-get clean \
29
  && rm -rf /var/lib/apt/lists/*
30
 
31
- # Set pip to have more retries and timeout
32
- ENV PIP_DEFAULT_TIMEOUT=100
33
- ENV PIP_RETRIES=3
34
-
35
- # Copy requirements and install Python dependencies
36
- COPY requirements.txt ./
37
 
38
- # First install torch and torchaudio separately for better compatibility
39
- RUN pip install --upgrade pip && \
40
- pip install torch==2.0.1 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cpu
41
 
42
- # Then install the rest of the requirements with retries
43
- RUN pip install --no-cache-dir -r requirements.txt || \
44
- (sleep 2 && pip install --no-cache-dir -r requirements.txt) || \
45
- (sleep 5 && pip install --no-cache-dir -r requirements.txt --use-deprecated=legacy-resolver)
46
-
47
- # Install SpeechBrain directly using Git for better compatibility
48
- RUN pip install git+https://github.com/speechbrain/speechbrain@v0.5.14
49
 
50
  # Copy source code
51
  COPY src/ ./src/
52
 
53
- # Create directories with proper permissions
54
- RUN mkdir -p /app/cookies /app/tmp_model /app/.streamlit /app/.cache && \
55
- chown -R appuser:appuser /app
56
-
57
- # Switch to non-root user for better security
58
- USER appuser
59
-
60
- # Create a Streamlit configuration file to avoid permission issues
61
- RUN mkdir -p /app/.streamlit && \
62
- echo '\
63
- [server]\n\
64
- port = 8501\n\
65
- address = "0.0.0.0"\n\
66
- headless = true\n\
67
- enableCORS = false\n\
68
- \n\
69
- [browser]\n\
70
- gatherUsageStats = false\n\
71
- \n\
72
- [runner]\n\
73
- fastReruns = true\n\
74
- ' > /app/.streamlit/config.toml
75
 
76
- # Expose the port Streamlit will run on
77
  EXPOSE 8501
78
 
79
- # Health check to ensure the service is running
80
  HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
81
 
82
- # Run the Streamlit app
83
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py"]
 
1
+ FROM python:3.9-slim
2
 
3
+ # Set environment variables: unbuffered Python output, no .pyc files, Matplotlib config dir under /tmp (lines continue with "\", the Dockerfile escape character)
4
+ ENV PYTHONUNBUFFERED=1 \
5
+     PYTHONDONTWRITEBYTECODE=1 \
6
+     MPLCONFIGDIR=/tmp/matplotlib
 
 
 
 
 
7
 
8
+ WORKDIR /app
 
 
9
 
10
+ # Install system dependencies in a single layer and remove the apt lists in that same layer (lines continue with "\", not a backtick)
11
+ RUN apt-get update && \
12
+     apt-get install -y --no-install-recommends \
13
+     build-essential \
14
+     curl \
15
+     git \
16
+     ffmpeg \
17
+     libsndfile1 \
18
+     && apt-get clean \
 
 
 
19
  && rm -rf /var/lib/apt/lists/*
20
 
21
+ # Create necessary directories
22
+ RUN mkdir -p /app/tmp_model /tmp/matplotlib
 
 
 
 
23
 
24
+ # Copy requirements first (for better caching)
25
+ COPY requirements.txt .
 
26
 
27
+ # Install Python dependencies in a fixed order for compatibility: pip, then torch/torchaudio, then requirements.txt, then SpeechBrain from its git tag
28
+ RUN pip install --no-cache-dir --upgrade pip && \
29
+     pip install --no-cache-dir torch==2.0.1 torchaudio==2.0.2 && \
30
+     pip install --no-cache-dir -r requirements.txt && \
31
+     pip install --no-cache-dir git+https://github.com/speechbrain/speechbrain.git@v0.5.14
 
 
32
 
33
  # Copy source code
34
  COPY src/ ./src/
35
 
36
+ # Set up Streamlit configuration (printf expands "\n" itself; the single-quoted string keeps the shell from interpreting anything inside it)
37
+ RUN mkdir -p .streamlit && \
38
+     printf '[server]\nport = 8501\naddress = "0.0.0.0"\nheadless = true\n\n[browser]\ngatherUsageStats = false\n\n[runner]\nfastReruns = true\n' > ./.streamlit/config.toml
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
+ # Expose port
41
  EXPOSE 8501
42
 
43
+ # Health check
44
  HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
45
 
46
+ # Run the app
47
+ ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
src/__pycache__/streamlit_app.cpython-312.pyc ADDED
Binary file (24.6 kB). View file
 
src/streamlit_app.py CHANGED
@@ -139,7 +139,8 @@ def extract_audio(video_path="video.mp4", audio_path="audio.wav"):
139
 
140
  class AccentDetector:
141
  def __init__(self):
142
- # Initialize the language identification model
 
143
  try:
144
  if EncoderClassifier is not None:
145
  self.lang_id = EncoderClassifier.from_hparams(
@@ -149,12 +150,10 @@ class AccentDetector:
149
  self.have_lang_id = True
150
  else:
151
  st.error("SpeechBrain not available. Language identification disabled.")
152
- self.have_lang_id = False
153
  except Exception as e:
154
  st.error(f"Error loading language ID model: {str(e)}")
155
- self.have_lang_id = False
156
- # Initialize the English accent classifier - using VoxLingua107 for now
157
- # In production, you'd use a more specialized accent model
158
  try:
159
  self.model_name = "speechbrain/lang-id-voxlingua107-ecapa"
160
 
@@ -162,10 +161,11 @@ class AccentDetector:
162
  if HAS_AUTO_PROCESSOR:
163
  self.processor = AutoProcessor.from_pretrained(self.model_name)
164
  else:
165
- # Fall back to using feature_extractor directly if AutoProcessor is not available
166
  from transformers import AutoFeatureExtractor
167
  self.processor = AutoFeatureExtractor.from_pretrained(self.model_name)
168
- self.model = AutoModelForAudioClassification.from_pretrained(self.model_name)
 
169
  self.have_accent_model = True
170
  except Exception as e:
171
  st.warning(f"Could not load accent model: {str(e)}")
 
139
 
140
  class AccentDetector:
141
  def __init__(self):
142
+ # Initialize language identification model
143
+ self.have_lang_id = False
144
  try:
145
  if EncoderClassifier is not None:
146
  self.lang_id = EncoderClassifier.from_hparams(
 
150
  self.have_lang_id = True
151
  else:
152
  st.error("SpeechBrain not available. Language identification disabled.")
 
153
  except Exception as e:
154
  st.error(f"Error loading language ID model: {str(e)}")
155
+ # Initialize the accent classifier
156
+ self.have_accent_model = False
 
157
  try:
158
  self.model_name = "speechbrain/lang-id-voxlingua107-ecapa"
159
 
 
161
  if HAS_AUTO_PROCESSOR:
162
  self.processor = AutoProcessor.from_pretrained(self.model_name)
163
  else:
164
+ # Fall back to using feature_extractor
165
  from transformers import AutoFeatureExtractor
166
  self.processor = AutoFeatureExtractor.from_pretrained(self.model_name)
167
+
168
+ self.model = AutoModelForAudioClassification.from_pretrained(self.model_name)
169
  self.have_accent_model = True
170
  except Exception as e:
171
  st.warning(f"Could not load accent model: {str(e)}")