satyamr196 committed on
Commit
0e84bb7
·
1 Parent(s): a4cec90

Write files to the /data directory because the ./ directory is not writable in HF Spaces

Browse files
Files changed (1) hide show
  1. ASR_Server.py +21 -81
ASR_Server.py CHANGED
@@ -14,81 +14,12 @@ job_status = {
14
  "message": "No job running"
15
  }
16
  csv_path = "test.csv"
 
17
  df = pd.read_csv(csv_path)
18
  print(f"CSV Loaded with {len(df)} rows")
19
 
20
- # def generateTranscript(ASR_model, dataset, csv_path, output_dir="./"):
21
- # import os
22
- # import time
23
- # import pandas as pd
24
- # import librosa
25
- # import tqdm
26
- # from transformers import pipeline
27
-
28
- # os.makedirs(output_dir, exist_ok=True)
29
- # # output_csv_path = os.path.join(output_dir, f"test_with_{ASR_model.replace('/', '_')}.csv")
30
- # output_csv_path = os.path.join(output_dir, f"test_with_{ASR_model}.csv")
31
-
32
- # if os.path.exists(output_csv_path):
33
- # print(f"Transcript already exists for model {ASR_model}. Skipping transcription.")
34
- # return
35
-
36
- # # Load metadata CSV
37
- # df = pd.read_csv(csv_path)
38
- # print(f"CSV Loaded with {len(df)} rows")
39
-
40
- # # Prepare
41
- # df[df.columns[0]] = df[df.columns[0]].str.strip().str.lower()
42
- # filename_column = df.columns[0]
43
- # transcripts = []
44
- # rtfx_score = []
45
-
46
- # # Load ASR model
47
- # pipe = pipeline("automatic-speech-recognition", model=ASR_model)
48
-
49
- # # Create a map of dataset samples by file name (assumes filename is in dataset)
50
- # dataset_map = {
51
- # sample["audio"]["path"].split("/")[-1].lower(): sample for sample in dataset
52
- # }
53
-
54
- # for idx, row in tqdm.tqdm(df.iterrows(), total=len(df)):
55
- # filename = row[filename_column].strip().lower() + ".wav"
56
- # if filename in dataset_map:
57
- # sample = dataset_map[filename]
58
- # try:
59
- # audio_array = sample["audio"]["array"]
60
- # sample_rate = sample["audio"]["sampling_rate"]
61
-
62
- # start_time = time.time()
63
- # result = pipe({"array": audio_array, "sampling_rate": sample_rate})
64
- # end_time = time.time()
65
-
66
- # transcript = result["text"]
67
- # duration = librosa.get_duration(y=audio_array, sr=sample_rate)
68
- # rtfx = (end_time - start_time) / duration if duration > 0 else 0
69
-
70
- # transcripts.append(transcript)
71
- # rtfx_score.append(rtfx)
72
-
73
- # print(f"✅ {filename}: RTFX = {rtfx:.2f}")
74
-
75
- # except Exception as e:
76
- # print(f"❌ Error with {filename}: {e}")
77
- # transcripts.append("")
78
- # rtfx_score.append(0)
79
- # else:
80
- # print(f"⚠️ File not in dataset: {filename}")
81
- # transcripts.append("")
82
- # rtfx_score.append(0)
83
-
84
- # # Append to original DataFrame
85
- # df['transcript'] = transcripts
86
- # df['rtfx'] = rtfx_score
87
-
88
- # df.to_csv(output_csv_path, index=False)
89
- # print(f"✅ Transcripts saved to {output_csv_path}")
90
-
91
- def generateTranscript(ASR_model, csv_path, output_dir="./"):
92
  import os
93
  import time
94
  import tqdm
@@ -109,7 +40,7 @@ def generateTranscript(ASR_model, csv_path, output_dir="./"):
109
  # dataset = dataset.with_format("python", decode_audio=False)
110
  dataset = dataset.cast_column("audio", Audio(decode=False))
111
 
112
- output_csv_path = os.path.join("./", f"test_with_{ASR_model}.csv")
113
  # Check if transcript already exists
114
  if os.path.exists(output_csv_path):
115
  print(f"Transcript already exists for model {ASR_model}. Skipping transcription.")
@@ -122,9 +53,17 @@ def generateTranscript(ASR_model, csv_path, output_dir="./"):
122
  total = len(df)
123
  job_status["total"] = total
124
 
 
 
 
 
 
 
 
 
 
125
  # Initialize ASR pipeline
126
- pipe = pipeline("automatic-speech-recognition", model=ASR_model, device=-1)
127
- # print("Device set to use CPU")
128
 
129
  # Column with filenames in the CSV
130
  filename_column = df.columns[0]
@@ -175,6 +114,7 @@ def generateTranscript(ASR_model, csv_path, output_dir="./"):
175
 
176
  job_status["completed"] = idx + 1
177
  job_status["message"] = f"Processing {idx + 1}/{total}"
 
178
 
179
  # Save results
180
  df["transcript"] = transcripts
@@ -182,11 +122,11 @@ def generateTranscript(ASR_model, csv_path, output_dir="./"):
182
 
183
  os.makedirs(output_dir, exist_ok=True)
184
  # Create the directory if it doesn't exist
185
- output_dir = os.path.dirname(os.path.join(output_dir, f"test_with_{ASR_model}.csv")) # Get the directory path
186
- if not os.path.exists(output_dir): # Check if directory exists
187
- os.makedirs(output_dir) # Create directory if it doesn't exist
188
- print(f"Created directory: {output_dir}")
189
-
190
  df.to_csv(output_csv_path, index=False)
191
  job_status["running"] = False
192
  job_status["message"] = "Transcription completed."
@@ -221,7 +161,7 @@ def asr_models():
221
  "ESPnet"
222
  ]
223
  def background_job():
224
- generateTranscript("openai/whisper-base", csv_path, output_dir="./")
225
 
226
  # Start the background job in a separate thread
227
  threading.Thread(target=background_job).start()
 
14
  "message": "No job running"
15
  }
16
  csv_path = "test.csv"
17
+ output_dir="/data"
18
  df = pd.read_csv(csv_path)
19
  print(f"CSV Loaded with {len(df)} rows")
20
 
21
+
22
+ def generateTranscript(ASR_model):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  import os
24
  import time
25
  import tqdm
 
40
  # dataset = dataset.with_format("python", decode_audio=False)
41
  dataset = dataset.cast_column("audio", Audio(decode=False))
42
 
43
+ output_csv_path = os.path.join(output_dir, f"test_with_{ASR_model}.csv")
44
  # Check if transcript already exists
45
  if os.path.exists(output_csv_path):
46
  print(f"Transcript already exists for model {ASR_model}. Skipping transcription.")
 
53
  total = len(df)
54
  job_status["total"] = total
55
 
56
+ import torch
57
+ # Check if GPU is available
58
+ if torch.cuda.is_available():
59
+ device = 0
60
+ print("Device set to use GPU")
61
+ else:
62
+ device = -1
63
+ print("Device set to use CPU")
64
+
65
  # Initialize ASR pipeline
66
+ pipe = pipeline("automatic-speech-recognition", model=ASR_model, device=device)
 
67
 
68
  # Column with filenames in the CSV
69
  filename_column = df.columns[0]
 
114
 
115
  job_status["completed"] = idx + 1
116
  job_status["message"] = f"Processing {idx + 1}/{total}"
117
+ job_status["%_completed"] = {idx + 1}*100/{total}
118
 
119
  # Save results
120
  df["transcript"] = transcripts
 
122
 
123
  os.makedirs(output_dir, exist_ok=True)
124
  # Create the directory if it doesn't exist
125
+ csv_output_dir = os.path.dirname(output_csv_path) # Get the directory path
126
+ if not os.path.exists(csv_output_dir): # Check if directory exists
127
+ os.makedirs(csv_output_dir) # Create directory if it doesn't exist
128
+ print(f"Created directory: {csv_output_dir}")
129
+
130
  df.to_csv(output_csv_path, index=False)
131
  job_status["running"] = False
132
  job_status["message"] = "Transcription completed."
 
161
  "ESPnet"
162
  ]
163
  def background_job():
164
+ generateTranscript("openai/whisper-base")
165
 
166
  # Start the background job in a separate thread
167
  threading.Thread(target=background_job).start()