Spaces:
Running
Running
Commit
·
f928012
1
Parent(s):
ac711b2
Fix Gunicorn WORKER TIMEOUT error: the request handler was taking too long to respond, so the transcription function (generateTranscript) is now run in a background thread via threading instead of blocking the request.
Browse files- ASR_Server.py +15 -11
- Dockerfile +35 -7
ASR_Server.py
CHANGED
@@ -2,17 +2,12 @@ from flask import Flask, jsonify
|
|
2 |
from datasets import load_dataset, Audio
|
3 |
import pandas as pd
|
4 |
import os
|
|
|
5 |
|
6 |
import os
|
7 |
os.environ["HF_HOME"] = "/tmp/huggingface"
|
8 |
|
9 |
|
10 |
-
# Load dataset without decoding audio (required!)
|
11 |
-
dataset = load_dataset("satyamr196/asr_fairness_audio", split="train")
|
12 |
-
# dataset = dataset.with_format("python", decode_audio=False)
|
13 |
-
dataset = dataset.cast_column("audio", Audio(decode=False))
|
14 |
-
|
15 |
-
print(" ___ ")
|
16 |
csv_path = "test.csv"
|
17 |
df = pd.read_csv(csv_path)
|
18 |
print(f"CSV Loaded with {len(df)} rows")
|
@@ -88,7 +83,7 @@ print(f"CSV Loaded with {len(df)} rows")
|
|
88 |
# df.to_csv(output_csv_path, index=False)
|
89 |
# print(f"✅ Transcripts saved to {output_csv_path}")
|
90 |
|
91 |
-
def generateTranscript(ASR_model,
|
92 |
import os
|
93 |
import time
|
94 |
import tqdm
|
@@ -96,6 +91,11 @@ def generateTranscript(ASR_model, dataset, csv_path, output_dir="./"):
|
|
96 |
import soundfile as sf
|
97 |
from transformers import pipeline
|
98 |
|
|
|
|
|
|
|
|
|
|
|
99 |
output_csv_path = os.path.join("./", f"test_with_{ASR_model}.csv")
|
100 |
# Check if transcript already exists
|
101 |
if os.path.exists(output_csv_path):
|
@@ -108,14 +108,14 @@ def generateTranscript(ASR_model, dataset, csv_path, output_dir="./"):
|
|
108 |
|
109 |
# Initialize ASR pipeline
|
110 |
pipe = pipeline("automatic-speech-recognition", model=ASR_model, device=-1)
|
111 |
-
print("Device set to use CPU")
|
112 |
|
113 |
# Column with filenames in the CSV
|
114 |
filename_column = df.columns[0]
|
115 |
df[filename_column] = df[filename_column].str.strip().str.lower()
|
116 |
|
117 |
# Build map from filename -> dataset sample (without decoding audio)
|
118 |
-
print("Creating dataset map from filenames...")
|
119 |
# dataset = dataset.with_format("python", decode_audio=False)
|
120 |
dataset_map = {
|
121 |
os.path.basename(sample["audio"]["path"]).lower(): sample
|
@@ -199,8 +199,12 @@ def asr_models():
|
|
199 |
"Fairseq S2T",
|
200 |
"ESPnet"
|
201 |
]
|
202 |
-
|
203 |
-
|
|
|
|
|
|
|
|
|
204 |
return jsonify({"asr_models": models})
|
205 |
|
206 |
# if __name__ == "__main__":
|
|
|
2 |
from datasets import load_dataset, Audio
|
3 |
import pandas as pd
|
4 |
import os
|
5 |
+
import threading
|
6 |
|
7 |
import os
|
8 |
os.environ["HF_HOME"] = "/tmp/huggingface"
|
9 |
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
csv_path = "test.csv"
|
12 |
df = pd.read_csv(csv_path)
|
13 |
print(f"CSV Loaded with {len(df)} rows")
|
|
|
83 |
# df.to_csv(output_csv_path, index=False)
|
84 |
# print(f"✅ Transcripts saved to {output_csv_path}")
|
85 |
|
86 |
+
def generateTranscript(ASR_model, csv_path, output_dir="./"):
|
87 |
import os
|
88 |
import time
|
89 |
import tqdm
|
|
|
91 |
import soundfile as sf
|
92 |
from transformers import pipeline
|
93 |
|
94 |
+
# Load dataset without decoding audio (required!)
|
95 |
+
dataset = load_dataset("satyamr196/asr_fairness_audio", split="train")
|
96 |
+
# dataset = dataset.with_format("python", decode_audio=False)
|
97 |
+
dataset = dataset.cast_column("audio", Audio(decode=False))
|
98 |
+
|
99 |
output_csv_path = os.path.join("./", f"test_with_{ASR_model}.csv")
|
100 |
# Check if transcript already exists
|
101 |
if os.path.exists(output_csv_path):
|
|
|
108 |
|
109 |
# Initialize ASR pipeline
|
110 |
pipe = pipeline("automatic-speech-recognition", model=ASR_model, device=-1)
|
111 |
+
# print("Device set to use CPU")
|
112 |
|
113 |
# Column with filenames in the CSV
|
114 |
filename_column = df.columns[0]
|
115 |
df[filename_column] = df[filename_column].str.strip().str.lower()
|
116 |
|
117 |
# Build map from filename -> dataset sample (without decoding audio)
|
118 |
+
# print("Creating dataset map from filenames...")
|
119 |
# dataset = dataset.with_format("python", decode_audio=False)
|
120 |
dataset_map = {
|
121 |
os.path.basename(sample["audio"]["path"]).lower(): sample
|
|
|
199 |
"Fairseq S2T",
|
200 |
"ESPnet"
|
201 |
]
|
202 |
+
def background_job():
|
203 |
+
generateTranscript("openai/whisper-base", csv_path, output_dir="./")
|
204 |
+
|
205 |
+
# Start the background job in a separate thread
|
206 |
+
threading.Thread(target=background_job).start()
|
207 |
+
print("Transcription started in background")
|
208 |
return jsonify({"asr_models": models})
|
209 |
|
210 |
# if __name__ == "__main__":
|
Dockerfile
CHANGED
@@ -1,14 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
FROM python:3.9
|
2 |
|
3 |
-
#
|
4 |
-
|
5 |
-
|
|
|
6 |
ENV HF_HOME=/tmp/huggingface
|
7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
WORKDIR /app
|
9 |
|
10 |
-
|
11 |
-
|
|
|
|
|
|
|
|
|
12 |
|
13 |
-
|
14 |
-
CMD ["gunicorn","-b", "0.0.0.0:7860","ASR_Server:app"]
|
|
|
1 |
+
# FROM python:3.9
|
2 |
+
|
3 |
+
# # RUN useradd -m -u 1000 user
|
4 |
+
# # USER user
|
5 |
+
# ENV PATH="/home/user/.local/bin:$PATH"
|
6 |
+
# ENV HF_HOME=/tmp/huggingface
|
7 |
+
|
8 |
+
# WORKDIR /app
|
9 |
+
|
10 |
+
# COPY ./requirements.txt requirements.txt
|
11 |
+
# RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
12 |
+
|
13 |
+
# COPY . /app
|
14 |
+
# CMD ["gunicorn","-b", "0.0.0.0:7860","ASR_Server:app"]
|
15 |
+
|
16 |
+
# Base image
|
17 |
+
|
18 |
FROM python:3.9
|
19 |
|
20 |
+
# Avoid interactive prompts during install
|
21 |
+
ENV DEBIAN_FRONTEND=noninteractive
|
22 |
+
|
23 |
+
# Set HF cache to avoid permission denied errors
|
24 |
ENV HF_HOME=/tmp/huggingface
|
25 |
|
26 |
+
# Install system packages
|
27 |
+
RUN apt-get update && apt-get install -y \
|
28 |
+
libsndfile1 \
|
29 |
+
&& rm -rf /var/lib/apt/lists/*
|
30 |
+
|
31 |
+
# Set working directory
|
32 |
WORKDIR /app
|
33 |
|
34 |
+
# Copy code
|
35 |
+
COPY . .
|
36 |
+
|
37 |
+
# Install dependencies
|
38 |
+
RUN pip install --upgrade pip
|
39 |
+
RUN pip install -r requirements.txt
|
40 |
|
41 |
+
# Run the Flask app with Gunicorn on HF's required port
|
42 |
+
CMD ["gunicorn", "-b", "0.0.0.0:7860", "ASR_Server:app"]
|