Spaces:
Running
Running
Commit
·
0e84bb7
1
Parent(s):
a4cec90
Write files to the /data directory because writing to the ./ directory is not allowed in HF Spaces
Browse files- ASR_Server.py +21 -81
ASR_Server.py
CHANGED
@@ -14,81 +14,12 @@ job_status = {
|
|
14 |
"message": "No job running"
|
15 |
}
|
16 |
csv_path = "test.csv"
|
|
|
17 |
df = pd.read_csv(csv_path)
|
18 |
print(f"CSV Loaded with {len(df)} rows")
|
19 |
|
20 |
-
|
21 |
-
|
22 |
-
# import time
|
23 |
-
# import pandas as pd
|
24 |
-
# import librosa
|
25 |
-
# import tqdm
|
26 |
-
# from transformers import pipeline
|
27 |
-
|
28 |
-
# os.makedirs(output_dir, exist_ok=True)
|
29 |
-
# # output_csv_path = os.path.join(output_dir, f"test_with_{ASR_model.replace('/', '_')}.csv")
|
30 |
-
# output_csv_path = os.path.join(output_dir, f"test_with_{ASR_model}.csv")
|
31 |
-
|
32 |
-
# if os.path.exists(output_csv_path):
|
33 |
-
# print(f"Transcript already exists for model {ASR_model}. Skipping transcription.")
|
34 |
-
# return
|
35 |
-
|
36 |
-
# # Load metadata CSV
|
37 |
-
# df = pd.read_csv(csv_path)
|
38 |
-
# print(f"CSV Loaded with {len(df)} rows")
|
39 |
-
|
40 |
-
# # Prepare
|
41 |
-
# df[df.columns[0]] = df[df.columns[0]].str.strip().str.lower()
|
42 |
-
# filename_column = df.columns[0]
|
43 |
-
# transcripts = []
|
44 |
-
# rtfx_score = []
|
45 |
-
|
46 |
-
# # Load ASR model
|
47 |
-
# pipe = pipeline("automatic-speech-recognition", model=ASR_model)
|
48 |
-
|
49 |
-
# # Create a map of dataset samples by file name (assumes filename is in dataset)
|
50 |
-
# dataset_map = {
|
51 |
-
# sample["audio"]["path"].split("/")[-1].lower(): sample for sample in dataset
|
52 |
-
# }
|
53 |
-
|
54 |
-
# for idx, row in tqdm.tqdm(df.iterrows(), total=len(df)):
|
55 |
-
# filename = row[filename_column].strip().lower() + ".wav"
|
56 |
-
# if filename in dataset_map:
|
57 |
-
# sample = dataset_map[filename]
|
58 |
-
# try:
|
59 |
-
# audio_array = sample["audio"]["array"]
|
60 |
-
# sample_rate = sample["audio"]["sampling_rate"]
|
61 |
-
|
62 |
-
# start_time = time.time()
|
63 |
-
# result = pipe({"array": audio_array, "sampling_rate": sample_rate})
|
64 |
-
# end_time = time.time()
|
65 |
-
|
66 |
-
# transcript = result["text"]
|
67 |
-
# duration = librosa.get_duration(y=audio_array, sr=sample_rate)
|
68 |
-
# rtfx = (end_time - start_time) / duration if duration > 0 else 0
|
69 |
-
|
70 |
-
# transcripts.append(transcript)
|
71 |
-
# rtfx_score.append(rtfx)
|
72 |
-
|
73 |
-
# print(f"✅ {filename}: RTFX = {rtfx:.2f}")
|
74 |
-
|
75 |
-
# except Exception as e:
|
76 |
-
# print(f"❌ Error with {filename}: {e}")
|
77 |
-
# transcripts.append("")
|
78 |
-
# rtfx_score.append(0)
|
79 |
-
# else:
|
80 |
-
# print(f"⚠️ File not in dataset: {filename}")
|
81 |
-
# transcripts.append("")
|
82 |
-
# rtfx_score.append(0)
|
83 |
-
|
84 |
-
# # Append to original DataFrame
|
85 |
-
# df['transcript'] = transcripts
|
86 |
-
# df['rtfx'] = rtfx_score
|
87 |
-
|
88 |
-
# df.to_csv(output_csv_path, index=False)
|
89 |
-
# print(f"✅ Transcripts saved to {output_csv_path}")
|
90 |
-
|
91 |
-
def generateTranscript(ASR_model, csv_path, output_dir="./"):
|
92 |
import os
|
93 |
import time
|
94 |
import tqdm
|
@@ -109,7 +40,7 @@ def generateTranscript(ASR_model, csv_path, output_dir="./"):
|
|
109 |
# dataset = dataset.with_format("python", decode_audio=False)
|
110 |
dataset = dataset.cast_column("audio", Audio(decode=False))
|
111 |
|
112 |
-
output_csv_path = os.path.join(
|
113 |
# Check if transcript already exists
|
114 |
if os.path.exists(output_csv_path):
|
115 |
print(f"Transcript already exists for model {ASR_model}. Skipping transcription.")
|
@@ -122,9 +53,17 @@ def generateTranscript(ASR_model, csv_path, output_dir="./"):
|
|
122 |
total = len(df)
|
123 |
job_status["total"] = total
|
124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
# Initialize ASR pipeline
|
126 |
-
pipe = pipeline("automatic-speech-recognition", model=ASR_model, device
|
127 |
-
# print("Device set to use CPU")
|
128 |
|
129 |
# Column with filenames in the CSV
|
130 |
filename_column = df.columns[0]
|
@@ -175,6 +114,7 @@ def generateTranscript(ASR_model, csv_path, output_dir="./"):
|
|
175 |
|
176 |
job_status["completed"] = idx + 1
|
177 |
job_status["message"] = f"Processing {idx + 1}/{total}"
|
|
|
178 |
|
179 |
# Save results
|
180 |
df["transcript"] = transcripts
|
@@ -182,11 +122,11 @@ def generateTranscript(ASR_model, csv_path, output_dir="./"):
|
|
182 |
|
183 |
os.makedirs(output_dir, exist_ok=True)
|
184 |
# Create the directory if it doesn't exist
|
185 |
-
|
186 |
-
if not os.path.exists(
|
187 |
-
os.makedirs(
|
188 |
-
print(f"Created directory: {
|
189 |
-
|
190 |
df.to_csv(output_csv_path, index=False)
|
191 |
job_status["running"] = False
|
192 |
job_status["message"] = "Transcription completed."
|
@@ -221,7 +161,7 @@ def asr_models():
|
|
221 |
"ESPnet"
|
222 |
]
|
223 |
def background_job():
|
224 |
-
generateTranscript("openai/whisper-base"
|
225 |
|
226 |
# Start the background job in a separate thread
|
227 |
threading.Thread(target=background_job).start()
|
|
|
14 |
"message": "No job running"
|
15 |
}
|
16 |
csv_path = "test.csv"
|
17 |
+
output_dir="/data"
|
18 |
df = pd.read_csv(csv_path)
|
19 |
print(f"CSV Loaded with {len(df)} rows")
|
20 |
|
21 |
+
|
22 |
+
def generateTranscript(ASR_model):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
import os
|
24 |
import time
|
25 |
import tqdm
|
|
|
40 |
# dataset = dataset.with_format("python", decode_audio=False)
|
41 |
dataset = dataset.cast_column("audio", Audio(decode=False))
|
42 |
|
43 |
+
output_csv_path = os.path.join(output_dir, f"test_with_{ASR_model}.csv")
|
44 |
# Check if transcript already exists
|
45 |
if os.path.exists(output_csv_path):
|
46 |
print(f"Transcript already exists for model {ASR_model}. Skipping transcription.")
|
|
|
53 |
total = len(df)
|
54 |
job_status["total"] = total
|
55 |
|
56 |
+
import torch
|
57 |
+
# Check if GPU is available
|
58 |
+
if torch.cuda.is_available():
|
59 |
+
device = 0
|
60 |
+
print("Device set to use GPU")
|
61 |
+
else:
|
62 |
+
device = -1
|
63 |
+
print("Device set to use CPU")
|
64 |
+
|
65 |
# Initialize ASR pipeline
|
66 |
+
pipe = pipeline("automatic-speech-recognition", model=ASR_model, device=device)
|
|
|
67 |
|
68 |
# Column with filenames in the CSV
|
69 |
filename_column = df.columns[0]
|
|
|
114 |
|
115 |
job_status["completed"] = idx + 1
|
116 |
job_status["message"] = f"Processing {idx + 1}/{total}"
|
117 |
+
job_status["%_completed"] = {idx + 1}*100/{total}
|
118 |
|
119 |
# Save results
|
120 |
df["transcript"] = transcripts
|
|
|
122 |
|
123 |
os.makedirs(output_dir, exist_ok=True)
|
124 |
# Create the directory if it doesn't exist
|
125 |
+
csv_output_dir = os.path.dirname(output_csv_path) # Get the directory path
|
126 |
+
if not os.path.exists(csv_output_dir): # Check if directory exists
|
127 |
+
os.makedirs(csv_output_dir) # Create directory if it doesn't exist
|
128 |
+
print(f"Created directory: {csv_output_dir}")
|
129 |
+
|
130 |
df.to_csv(output_csv_path, index=False)
|
131 |
job_status["running"] = False
|
132 |
job_status["message"] = "Transcription completed."
|
|
|
161 |
"ESPnet"
|
162 |
]
|
163 |
def background_job():
|
164 |
+
generateTranscript("openai/whisper-base")
|
165 |
|
166 |
# Start the background job in a separate thread
|
167 |
threading.Thread(target=background_job).start()
|