satyamr196 committed on
Commit
f928012
·
1 Parent(s): ac711b2

Fix the gunicorn WORKER TIMEOUT error: the request was taking too long to respond, so the generateTranscript function now runs in a background thread (via threading)

Browse files
Files changed (2) hide show
  1. ASR_Server.py +15 -11
  2. Dockerfile +35 -7
ASR_Server.py CHANGED
@@ -2,17 +2,12 @@ from flask import Flask, jsonify
2
  from datasets import load_dataset, Audio
3
  import pandas as pd
4
  import os
 
5
 
6
  import os
7
  os.environ["HF_HOME"] = "/tmp/huggingface"
8
 
9
 
10
- # Load dataset without decoding audio (required!)
11
- dataset = load_dataset("satyamr196/asr_fairness_audio", split="train")
12
- # dataset = dataset.with_format("python", decode_audio=False)
13
- dataset = dataset.cast_column("audio", Audio(decode=False))
14
-
15
- print(" ___ ")
16
  csv_path = "test.csv"
17
  df = pd.read_csv(csv_path)
18
  print(f"CSV Loaded with {len(df)} rows")
@@ -88,7 +83,7 @@ print(f"CSV Loaded with {len(df)} rows")
88
  # df.to_csv(output_csv_path, index=False)
89
  # print(f"✅ Transcripts saved to {output_csv_path}")
90
 
91
- def generateTranscript(ASR_model, dataset, csv_path, output_dir="./"):
92
  import os
93
  import time
94
  import tqdm
@@ -96,6 +91,11 @@ def generateTranscript(ASR_model, dataset, csv_path, output_dir="./"):
96
  import soundfile as sf
97
  from transformers import pipeline
98
 
 
 
 
 
 
99
  output_csv_path = os.path.join("./", f"test_with_{ASR_model}.csv")
100
  # Check if transcript already exists
101
  if os.path.exists(output_csv_path):
@@ -108,14 +108,14 @@ def generateTranscript(ASR_model, dataset, csv_path, output_dir="./"):
108
 
109
  # Initialize ASR pipeline
110
  pipe = pipeline("automatic-speech-recognition", model=ASR_model, device=-1)
111
- print("Device set to use CPU")
112
 
113
  # Column with filenames in the CSV
114
  filename_column = df.columns[0]
115
  df[filename_column] = df[filename_column].str.strip().str.lower()
116
 
117
  # Build map from filename -> dataset sample (without decoding audio)
118
- print("Creating dataset map from filenames...")
119
  # dataset = dataset.with_format("python", decode_audio=False)
120
  dataset_map = {
121
  os.path.basename(sample["audio"]["path"]).lower(): sample
@@ -199,8 +199,12 @@ def asr_models():
199
  "Fairseq S2T",
200
  "ESPnet"
201
  ]
202
- generateTranscript("openai/whisper-base", dataset, csv_path, output_dir="./") ;
203
- # print("Transcript generation completed.")
 
 
 
 
204
  return jsonify({"asr_models": models})
205
 
206
  # if __name__ == "__main__":
 
2
  from datasets import load_dataset, Audio
3
  import pandas as pd
4
  import os
5
+ import threading
6
 
7
  import os
8
  os.environ["HF_HOME"] = "/tmp/huggingface"
9
 
10
 
 
 
 
 
 
 
11
  csv_path = "test.csv"
12
  df = pd.read_csv(csv_path)
13
  print(f"CSV Loaded with {len(df)} rows")
 
83
  # df.to_csv(output_csv_path, index=False)
84
  # print(f"✅ Transcripts saved to {output_csv_path}")
85
 
86
+ def generateTranscript(ASR_model, csv_path, output_dir="./"):
87
  import os
88
  import time
89
  import tqdm
 
91
  import soundfile as sf
92
  from transformers import pipeline
93
 
94
+ # Load dataset without decoding audio (required!)
95
+ dataset = load_dataset("satyamr196/asr_fairness_audio", split="train")
96
+ # dataset = dataset.with_format("python", decode_audio=False)
97
+ dataset = dataset.cast_column("audio", Audio(decode=False))
98
+
99
  output_csv_path = os.path.join("./", f"test_with_{ASR_model}.csv")
100
  # Check if transcript already exists
101
  if os.path.exists(output_csv_path):
 
108
 
109
  # Initialize ASR pipeline
110
  pipe = pipeline("automatic-speech-recognition", model=ASR_model, device=-1)
111
+ # print("Device set to use CPU")
112
 
113
  # Column with filenames in the CSV
114
  filename_column = df.columns[0]
115
  df[filename_column] = df[filename_column].str.strip().str.lower()
116
 
117
  # Build map from filename -> dataset sample (without decoding audio)
118
+ # print("Creating dataset map from filenames...")
119
  # dataset = dataset.with_format("python", decode_audio=False)
120
  dataset_map = {
121
  os.path.basename(sample["audio"]["path"]).lower(): sample
 
199
  "Fairseq S2T",
200
  "ESPnet"
201
  ]
202
+ def background_job():
203
+ generateTranscript("openai/whisper-base", csv_path, output_dir="./")
204
+
205
+ # Start the background job in a separate thread
206
+ threading.Thread(target=background_job).start()
207
+ print("Transcription started in background")
208
  return jsonify({"asr_models": models})
209
 
210
  # if __name__ == "__main__":
Dockerfile CHANGED
@@ -1,14 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  FROM python:3.9
2
 
3
- # RUN useradd -m -u 1000 user
4
- # USER user
5
- ENV PATH="/home/user/.local/bin:$PATH"
 
6
  ENV HF_HOME=/tmp/huggingface
7
 
 
 
 
 
 
 
8
  WORKDIR /app
9
 
10
- COPY ./requirements.txt requirements.txt
11
- RUN pip install --no-cache-dir --upgrade -r requirements.txt
 
 
 
 
12
 
13
- COPY . /app
14
- CMD ["gunicorn","-b", "0.0.0.0:7860","ASR_Server:app"]
 
1
+ # FROM python:3.9
2
+
3
+ # # RUN useradd -m -u 1000 user
4
+ # # USER user
5
+ # ENV PATH="/home/user/.local/bin:$PATH"
6
+ # ENV HF_HOME=/tmp/huggingface
7
+
8
+ # WORKDIR /app
9
+
10
+ # COPY ./requirements.txt requirements.txt
11
+ # RUN pip install --no-cache-dir --upgrade -r requirements.txt
12
+
13
+ # COPY . /app
14
+ # CMD ["gunicorn","-b", "0.0.0.0:7860","ASR_Server:app"]
15
+
16
+ # Base image
17
+
18
  FROM python:3.9
19
 
20
+ # Avoid interactive prompts during install
21
+ ENV DEBIAN_FRONTEND=noninteractive
22
+
23
+ # Set HF cache to avoid permission denied errors
24
  ENV HF_HOME=/tmp/huggingface
25
 
26
+ # Install system packages
27
+ RUN apt-get update && apt-get install -y \
28
+ libsndfile1 \
29
+ && rm -rf /var/lib/apt/lists/*
30
+
31
+ # Set working directory
32
  WORKDIR /app
33
 
34
+ # Copy code
35
+ COPY . .
36
+
37
+ # Install dependencies
38
+ RUN pip install --upgrade pip
39
+ RUN pip install -r requirements.txt
40
 
41
+ # Run the Flask app with Gunicorn on HF's required port
42
+ CMD ["gunicorn", "-b", "0.0.0.0:7860", "ASR_Server:app"]