satyamr196 commited on
Commit
547836e
·
1 Parent(s): fb5d66c

Major Update: created all the required routes, added utils folder containing helper functions

Browse files
ASR_Server.py CHANGED
@@ -1,12 +1,17 @@
1
- from flask import Flask, jsonify
 
2
  from datasets import load_dataset, Audio
3
  import pandas as pd
4
  import os
5
  import threading
 
 
 
 
6
 
7
- import os
8
  os.environ["HF_HOME"] = "/tmp/huggingface"
9
-
10
  #Check cpu score
11
  import timeit
12
  cpu_score = timeit.timeit("sum(range(1000000))", number=5)
@@ -15,14 +20,22 @@ print(f"🧠 CPU benchmark score: {cpu_score:.2f}")
15
  job_status = {
16
  "running": False,
17
  "model": None,
18
- "completed": 0,
19
- "message": "No job running"
 
 
20
  }
 
21
  csv_path = "test.csv"
22
- output_dir="data"
 
23
  df = pd.read_csv(csv_path)
24
  print(f"CSV Loaded with {len(df)} rows")
25
 
 
 
 
 
26
 
27
  def generateTranscript(ASR_model):
28
  import os
@@ -36,36 +49,25 @@ def generateTranscript(ASR_model):
36
  "running": True,
37
  "model": ASR_model,
38
  "completed": 0,
39
- "message": "Starting transcription..."
 
 
40
  })
41
 
42
-
43
- # Load dataset without decoding audio (required!)
44
- dataset = load_dataset("satyamr196/asr_fairness_audio", split="train")
45
- # dataset = dataset.with_format("python", decode_audio=False)
46
- dataset = dataset.cast_column("audio", Audio(decode=False))
47
-
48
- output_csv_path = os.path.join(output_dir, f"test_with_{ASR_model}.csv")
49
  # Check if transcript already exists
50
- if os.path.exists(output_csv_path):
 
 
 
51
  print(f"Transcript already exists for model {ASR_model}. Skipping transcription.")
52
  return
53
 
54
- # Load CSV
55
- df = pd.read_csv(csv_path)
56
- print(f"CSV Loaded with {len(df)} rows")
57
 
58
  total = len(df)
59
  job_status["total"] = total
60
-
61
- # import torch
62
- # # Check if GPU is available
63
- # if torch.cuda.is_available():
64
- # device = 0
65
- # print("Device set to use GPU")
66
- # else:
67
- # device = -1
68
- # print("Device set to use CPU")
69
 
70
  # Initialize ASR pipeline
71
  pipe = pipeline("automatic-speech-recognition", model=ASR_model)
@@ -79,7 +81,7 @@ def generateTranscript(ASR_model):
79
  # dataset = dataset.with_format("python", decode_audio=False)
80
  dataset_map = {
81
  os.path.basename(sample["audio"]["path"]).lower(): sample
82
- for sample in dataset
83
  }
84
 
85
  transcripts = []
@@ -96,7 +98,9 @@ def generateTranscript(ASR_model):
96
  audio_array, sample_rate = sf.read(file_path)
97
 
98
  start_time = time.time()
99
- result = pipe({"array": audio_array, "sampling_rate": sample_rate})
 
 
100
  end_time = time.time()
101
 
102
  transcript = result["text"]
@@ -106,7 +110,7 @@ def generateTranscript(ASR_model):
106
  transcripts.append(transcript)
107
  rtfx_score.append(rtfx)
108
 
109
- print(f"✅ {filename}: RTFX = {rtfx:.2f}")
110
 
111
  except Exception as e:
112
  print(f"❌ Error with {filename}: {e}")
@@ -125,22 +129,21 @@ def generateTranscript(ASR_model):
125
  df["transcript"] = transcripts
126
  df["rtfx"] = rtfx_score
127
 
128
- os.makedirs(output_dir, exist_ok=True)
129
- # Create the directory if it doesn't exist
130
- csv_output_dir = os.path.dirname(output_csv_path) # Get the directory path
131
- if not os.path.exists(csv_output_dir): # Check if directory exists
132
- os.makedirs(csv_output_dir) # Create directory if it doesn't exist
133
- print(f"Created directory: {csv_output_dir}")
134
-
135
- df.to_csv(output_csv_path, index=False)
136
  job_status["running"] = False
137
  job_status["message"] = "Transcription completed."
 
 
 
138
 
139
- print(f"\n📄 Transcripts saved to: {output_csv_path}")
140
 
 
 
 
141
 
 
142
 
143
  app = Flask(__name__)
 
144
 
145
  @app.route("/")
146
  def home():
@@ -165,8 +168,9 @@ def asr_models():
165
  "Fairseq S2T",
166
  "ESPnet"
167
  ]
 
168
  def background_job():
169
- generateTranscript("openai/whisper-tiny")
170
 
171
  # Start the background job in a separate thread
172
  threading.Thread(target=background_job).start()
@@ -177,5 +181,94 @@ def asr_models():
177
  def get_status():
178
  return jsonify(job_status)
179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  # if __name__ == "__main__":
181
  # app.run(debug=True)
 
1
+ from flask import Flask, jsonify, request
2
+ from flask_cors import CORS
3
  from datasets import load_dataset, Audio
4
  import pandas as pd
5
  import os
6
  import threading
7
+ from dotenv import load_dotenv
8
+ from utils.load_csv import upload_csv, download_csv
9
+ from utils.generate_results import generateResults
10
+ from utils.generate_box_plot import box_plot_data
11
 
12
+ # Set the cache directory for Hugging Face datasets
13
  os.environ["HF_HOME"] = "/tmp/huggingface"
14
+ ASR_model = "openai/whisper-tiny" # Replace with your ASR model
15
  #Check cpu score
16
  import timeit
17
  cpu_score = timeit.timeit("sum(range(1000000))", number=5)
 
20
  job_status = {
21
  "running": False,
22
  "model": None,
23
+ "completed": None,
24
+ "%_completed" : None,
25
+ "message": "No Transcription in progress",
26
+ "total": None
27
  }
28
+
29
  csv_path = "test.csv"
30
+ csv_transcript = f"test_with_{ASR_model.replace("/","_")}.csv"
31
+ csv_result = f"test_with_{ASR_model.replace("/","_")}_WER.csv"
32
  df = pd.read_csv(csv_path)
33
  print(f"CSV Loaded with {len(df)} rows")
34
 
35
+ # # Load dataset without decoding audio (required!)
36
+ # dataset = load_dataset("satyamr196/asr_fairness_audio", split="train")
37
+ # # dataset = dataset.with_format("python", decode_audio=False)
38
+ # dataset = dataset.cast_column("audio", Audio(decode=False))
39
 
40
  def generateTranscript(ASR_model):
41
  import os
 
49
  "running": True,
50
  "model": ASR_model,
51
  "completed": 0,
52
+ "%_completed" : 0,
53
+ "message": "Starting transcription...",
54
+ "total": None
55
  })
56
 
 
 
 
 
 
 
 
57
  # Check if transcript already exists
58
+ df_transcript = download_csv(csv_transcript)
59
+ if(df_transcript is None):
60
+ print(f"CSV not found in the dataset repo. Proceeding to generate transcript.")
61
+ else:
62
  print(f"Transcript already exists for model {ASR_model}. Skipping transcription.")
63
  return
64
 
65
+ # # Load test.csv
66
+ # df = pd.read_csv(csv_path)
67
+ # print(f"CSV Loaded with {len(df)} rows")
68
 
69
  total = len(df)
70
  job_status["total"] = total
 
 
 
 
 
 
 
 
 
71
 
72
  # Initialize ASR pipeline
73
  pipe = pipeline("automatic-speech-recognition", model=ASR_model)
 
81
  # dataset = dataset.with_format("python", decode_audio=False)
82
  dataset_map = {
83
  os.path.basename(sample["audio"]["path"]).lower(): sample
84
+ # for sample in dataset #uncomment this line to use the dataset
85
  }
86
 
87
  transcripts = []
 
98
  audio_array, sample_rate = sf.read(file_path)
99
 
100
  start_time = time.time()
101
+ # result = pipe({"array": audio_array, "sampling_rate": sample_rate})
102
+ result = pipe({"array": audio_array, "sampling_rate": sample_rate},return_timestamps=True)
103
+
104
  end_time = time.time()
105
 
106
  transcript = result["text"]
 
110
  transcripts.append(transcript)
111
  rtfx_score.append(rtfx)
112
 
113
+ print(f"✅ {filename}: RTFX = {rtfx:.2f}, Progress: {(idx + 1) * 100 / total} %")
114
 
115
  except Exception as e:
116
  print(f"❌ Error with {filename}: {e}")
 
129
  df["transcript"] = transcripts
130
  df["rtfx"] = rtfx_score
131
 
 
 
 
 
 
 
 
 
132
  job_status["running"] = False
133
  job_status["message"] = "Transcription completed."
134
+ # df.to_csv(csv_result, index=False)
135
+ upload_csv(df, csv_transcript)
136
+ print(f"\n📄 Transcripts saved to: {csv_transcript}")
137
 
 
138
 
139
+ # generateTranscript(ASR_model)
140
+ # print(generate_results(ASR_model))
141
+ # print(box_plot_data(ASR_model))
142
 
143
+ # ! FLASK SERVER CODE :-
144
 
145
  app = Flask(__name__)
146
+ CORS(app,origins="*")
147
 
148
  @app.route("/")
149
  def home():
 
168
  "Fairseq S2T",
169
  "ESPnet"
170
  ]
171
+
172
  def background_job():
173
+ generateTranscript(ASR_model)
174
 
175
  # Start the background job in a separate thread
176
  threading.Thread(target=background_job).start()
 
181
  def get_status():
182
  return jsonify(job_status)
183
 
184
@app.route('/api', methods=['GET'])
def api():
    """Return evaluation results for a model, or start transcribing it.

    Query parameters:
        ASR_model: Identifier of the model to evaluate (required).

    Behaviour:
        * Missing ``ASR_model``            -> 400 with an ``error`` message.
        * Transcript CSV already uploaded  -> results from ``generateResults``
          plus the four box-plot dictionaries from ``box_plot_data``.
        * Results could not be computed    -> 500 with an ``error`` message.
        * A transcription job is running   -> message plus current ``job_status``.
        * Otherwise                        -> starts ``generateTranscript`` in a
          background thread and returns an informational message.
    """
    model = request.args.get('ASR_model', default="", type=str)
    # Validate before doing any work with the model name.
    if not model:
        return jsonify({'error': 'ASR_model parameter is required'}), 400  # Return 400 if model is missing

    # Single quotes inside the f-string keep this parseable on Python < 3.12.
    csv_transcript = f"test_with_{model.replace('/', '_')}.csv"

    if download_csv(csv_transcript) is not None:
        # The transcript exists in the results repo: compute the metrics.
        Results = generateResults(model)
        if Results is None:
            # generateResults returns None when it cannot load a usable
            # transcript CSV; unpacking None below would raise TypeError.
            return jsonify({
                'error': f'Results could not be generated for {model}',
                'model': model
            }), 500

        wer_Gender, wer_SEG, wer_Ethnicity, wer_Language = box_plot_data(model)

        return jsonify({
            'message': f'{model} has been evaluated and results are shown below',
            'endpoint': "/api",
            'model': model,
            'greet' : "Welcome to ASR-FairBench",
            **Results,
            'wer_Gender' : wer_Gender,
            'wer_SEG' : wer_SEG,
            'wer_Ethnicity' : wer_Ethnicity,
            'wer_Language' : wer_Language
        })

    # No transcript yet. Refuse to start a second job while one is running.
    if job_status["running"]:
        return jsonify({
            'message': f'Transcription for {job_status["model"]} is in progress. Please wait for it to complete. Then submit your model again.',
            'status': job_status
        })

    response = jsonify({
        'message': f'Given Model {model} is being Evaluated, Please come back after a few hours and run the query again. Usually, it completes within an hour'
    })

    # Generate the transcript in a separate thread so this request returns
    # immediately.
    thread = threading.Thread(target=generateTranscript, args=(model,))
    thread.start()

    return response
226
+
227
@app.route("/insert", methods=["POST"])
def insert_document():
    """Add one model's row to ``leaderboard.csv`` in the results dataset.

    Expects a JSON object in the request body containing at least a
    non-empty ``"Model"`` field. The response is always a JSON object:

        * ``{"exists": True}``  - the model is already on the leaderboard.
        * ``{"exists": False, "message": ...}`` - the row was added and uploaded.
        * ``{"exists": "Error", "error": ...}`` - invalid body, upload failure
          or any other exception.
    """
    try:
        # silent=True yields None instead of raising on a missing/invalid
        # JSON body, so the problem can be reported in the usual error shape.
        data = request.get_json(silent=True)
        if not isinstance(data, dict) or not data.get("Model"):
            # Without this check a row with no model name would be appended.
            return jsonify({
                "exists": "Error",
                "error": "Request body must be a JSON object with a non-empty 'Model' field"
            })

        model_name = data["Model"]
        csv_filename = "leaderboard.csv"
        new_row = pd.DataFrame([data])

        # Try to download the leaderboard CSV from the HF dataset.
        df = download_csv(csv_filename)

        if df is None:
            # Not found: start a new leaderboard with this single entry.
            df = new_row
        else:
            # Do not insert the same model twice.
            if model_name in df["Model"].values:
                return jsonify({"exists": True})
            df = pd.concat([df, new_row], ignore_index=True)

        # Upload the updated CSV back to the Hugging Face dataset.
        if not upload_csv(df, csv_filename):
            return jsonify({"exists": "Error", "error": "Upload to Hugging Face failed"})

        return jsonify({"exists": False, "message": "Data inserted into leaderboard successfully!"})

    except Exception as e:
        # Route boundary: report any failure to the client as JSON.
        return jsonify({"exists": "Error", "error": str(e)})
256
+
257
+ # Fetch all documents
258
@app.route("/fetch", methods=["GET"])
def fetch_documents():
    """Return every leaderboard row as JSON.

    Responds with ``{"data": [...]}`` (one dict per CSV row) on success, or
    ``{"error": ...}`` when ``leaderboard.csv`` cannot be downloaded or any
    exception is raised.
    """
    try:
        csv_filename = "leaderboard.csv"
        df = download_csv(csv_filename)

        if df is None:
            return jsonify({"error": "Leaderboard CSV not found in Hugging Face dataset."})

        # Empty CSV cells are read as NaN, which would be serialised as a
        # bare ``NaN`` token (not valid JSON for strict parsers). Convert
        # them to None so they are emitted as ``null``.
        cleaned = df.astype(object).where(df.notna(), None)

        documents = cleaned.to_dict(orient="records")  # list of row dicts
        return jsonify({"data": documents})

    except Exception as e:
        # Route boundary: report any failure to the client as JSON.
        return jsonify({"error": str(e)})
272
+
273
  # if __name__ == "__main__":
274
  # app.run(debug=True)
requirements.txt CHANGED
@@ -14,4 +14,5 @@ flask
14
  pymongo
15
  flask-cors
16
  pandas
17
- tqdm
 
 
14
  pymongo
15
  flask-cors
16
  pandas
17
+ tqdm
18
+ dotenv
test.csv CHANGED
@@ -1,4 +1,4 @@
1
- hash_name,transcription,age,gender,first_language,socioeconomic_bkgd,ethnicity,combined_column
2
  96ce0c2debfa8656fe16d30187d683df,hey facebook set my status uh to available,31 - 45,male,English,Low,"Native American, American Indian, or Alaska Native","male_English_Low_Native American, American Indian, or Alaska Native"
3
  f3d12b16dd637efb9ce8142632d63f34,send text message to wolfgang hey uncle wolfgang i really miss you hope all is well let's talk soon,46 - 65,male,German,Affluent,White,male_German_Affluent_White
4
  9859c9ab6ca0377d593ad245f39bc224,text john i am sorry about your loss,23 - 30,female,English,Low,Black or African American,female_English_Low_Black or African American
 
1
+ hash_name,transcription,age,gender,first_language,socioeconomic_bkgd,ethnicity,combined_column
2
  96ce0c2debfa8656fe16d30187d683df,hey facebook set my status uh to available,31 - 45,male,English,Low,"Native American, American Indian, or Alaska Native","male_English_Low_Native American, American Indian, or Alaska Native"
3
  f3d12b16dd637efb9ce8142632d63f34,send text message to wolfgang hey uncle wolfgang i really miss you hope all is well let's talk soon,46 - 65,male,German,Affluent,White,male_German_Affluent_White
4
  9859c9ab6ca0377d593ad245f39bc224,text john i am sorry about your loss,23 - 30,female,English,Low,Black or African American,female_English_Low_Black or African American
utils/__init__.py ADDED
File without changes
utils/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (145 Bytes). View file
 
utils/__pycache__/generateResults.cpython-313.pyc ADDED
Binary file (10.1 kB). View file
 
utils/__pycache__/generate_box_plot.cpython-313.pyc ADDED
Binary file (5.48 kB). View file
 
utils/__pycache__/generate_results.cpython-313.pyc ADDED
Binary file (10.1 kB). View file
 
utils/__pycache__/load_csv.cpython-313.pyc ADDED
Binary file (1.61 kB). View file
 
utils/audio_duration.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from pydub import AudioSegment
2
+ #The audio duration calculation
3
def get_audio_duration(audio_file):
    """Return the duration of ``audio_file`` in seconds.

    The file is loaded with ``AudioSegment.from_file``; the length of the
    loaded segment is in milliseconds and is divided by 1000 here.
    """
    segment = AudioSegment.from_file(audio_file)
    duration_ms = len(segment)
    return duration_ms / 1000
utils/generate_box_plot.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils.load_csv import download_csv
2
+
3
def box_plot_data(ASR_model):
    """Collect the per-group WER values used for the box plots of one model.

    Downloads ``test_with_<model>_WER.csv`` (``/`` in the model name replaced
    by ``_``) and, for each demographic attribute, builds a dictionary that
    maps a group label to the list of ``WER`` values of the rows in that
    group. Groups with no rows map to an empty list.

    Args:
        ASR_model: Model identifier, e.g. ``"openai/whisper-tiny"``.

    Returns:
        A 4-tuple ``(wer_Gender, wer_SEG, wer_Ethnicity, wer_Language)`` of
        ``dict[str, list]``.

    Raises:
        FileNotFoundError: If the WER CSV cannot be downloaded.
    """
    # Single quotes inside the f-string keep this parseable on Python < 3.12.
    csv_result = f"test_with_{ASR_model.replace('/', '_')}_WER.csv"
    df = download_csv(csv_result)
    if df is None:
        # download_csv returns None on any failure; fail with a clear message
        # instead of an AttributeError on ``None.columns``.
        raise FileNotFoundError(
            f"WER results file {csv_result} could not be downloaded; "
            "generate the results for this model first."
        )

    # Display actual column names to check for issues
    print(df.columns)

    # Trim column names of any leading or trailing spaces
    df.columns = df.columns.str.strip()

    def wer_by_group(column, labelled_values):
        """Map each label to the WER list of rows where ``column == value``."""
        return {
            label: df.loc[df[column] == value, "WER"].tolist()
            for label, value in labelled_values
        }

    def same_label(values):
        """Pair every value with itself for groups whose label is the value."""
        return [(value, value) for value in values]

    # Gender labels are capitalised in the output but lowercase in the data.
    gender_levels = [("Male", "male"), ("Female", "female")]
    seg_levels = ["Low", "Affluent", "Medium"]
    ethnicity_levels = [
        "Asian, South Asian or Asian American",
        "Black or African American",
        "Hispanic, Latino, or Spanish",
        "Middle Eastern or North African",
        "Native American, American Indian, or Alaska Native",
        "Native Hawaiian or Other Pacific Islander",
        "White",
    ]
    # Each language is listed once; the output keeps the same key order as
    # before (a repeated key in a dict literal keeps its first position).
    language_levels = [
        "English", "German", "French", "Arabic", "Cantonese", "Creole",
        "Dutch", "English/Turkish", "Filipino", "Hindi", "Hmong",
        "Indonesian", "Italian", "Japanese", "Korean", "Laotian", "Malay",
        "Malaysian", "Mandarin", "Marathi", "Nepali", "Other", "Portuguese",
        "Russian", "Spanish", "Tagalog", "Turkish", "Ukrainian", "Urdu",
        "Vietnamese",
    ]

    wer_Gender = wer_by_group("gender", gender_levels)
    wer_SEG = wer_by_group("socioeconomic_bkgd", same_label(seg_levels))
    wer_Ethnicity = wer_by_group("ethnicity", same_label(ethnicity_levels))
    wer_Language = wer_by_group("first_language", same_label(language_levels))

    return wer_Gender, wer_SEG, wer_Ethnicity, wer_Language
utils/generate_results.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import re
3
+ import numpy as np
4
+ from jiwer import wer
5
+ import statsmodels.api as sm
6
+ import statsmodels.formula.api as smf
7
+ from utils.load_csv import download_csv, upload_csv
8
+
9
def generateResults(ASR_model):
    """Score one ASR model's transcripts for accuracy and demographic fairness.

    Downloads ``test_with_<model>.csv`` (``/`` in the model name replaced by
    ``_``), computes a per-row WER against the reference text, drops WER
    outliers with the 1.5 * IQR rule, uploads the result as
    ``test_with_<model>_WER.csv`` and derives per-attribute fairness scores
    from a mixed-effects model.

    Args:
        ASR_model: Model identifier, e.g. ``"openai/whisper-tiny"``.

    Returns:
        A dict of plain Python values with the keys
        ``'Predicted Error Rates'``, ``'Raw Fairness Scores'``,
        ``'Adjusted Category Fairness Score'``, ``'Overall Fairness Score'``,
        ``'Avg_wer'``, ``'Avg_rtfx'``, ``'FAAS'`` and ``'ASR_model'``; or
        ``None`` when the transcript CSV cannot be downloaded or has no
        ``transcript`` column.
    """
    from scipy.stats import chi2

    def normalize_text(text):
        """Lowercase ``text`` and keep only letters, digits and whitespace.

        ``None``, NaN and float values become the empty string.
        """
        if text is None or pd.isna(text):  # Check for None or NaN
            return ""
        if isinstance(text, float):  # Floats are treated as missing text
            return ""
        if not isinstance(text, str):
            # e.g. an all-digit cell parsed as an integer by pandas
            text = str(text)
        text = text.lower()
        text = re.sub(r'[^a-z0-9\s]', '', text)  # Keep only letters, digits, and spaces
        return text.strip()

    # Single quotes inside the f-strings keep this parseable on Python < 3.12.
    file_stem = f"test_with_{ASR_model.replace('/', '_')}"
    csv_transcript = f"{file_stem}.csv"
    csv_result = f"{file_stem}_WER.csv"

    df = download_csv(csv_transcript)
    if df is None:
        print(f"CSV not found in the dataset repo. Please generate the transcript file first.")
        return None

    if 'transcript' not in df.columns:
        # Nothing below can work without the model output; stop before
        # uploading a results file that has no WER column.
        print("Column 'transcript' not found in CSV")
        return None

    # The reference text is the second column of the CSV; the model output is
    # read by name, matching the check above, rather than by position.
    df['normalized_transcription'] = df[df.columns[1]].apply(normalize_text)
    df['normalized_transcript'] = df['transcript'].apply(normalize_text)

    # Per-row WER; maximum error (1.0) if either text is missing.
    df['WER'] = [
        wer(original, transcript) if original and transcript else 1.0
        for original, transcript in zip(df['normalized_transcription'],
                                        df['normalized_transcript'])
    ]

    # Remove WER outliers with the 1.5 * IQR rule.
    Q1 = df['WER'].quantile(0.25)
    Q3 = df['WER'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # .copy() so the columns added below modify an independent frame rather
    # than a slice of the unfiltered one.
    df = df[(df['WER'] >= lower_bound) & (df['WER'] <= upper_bound)].copy()

    # Save the updated CSV (upload_csv reports its own failures).
    if upload_csv(df, csv_result):
        print(f"WER calculations saved to {csv_result}")

    avg_wer = df["WER"].mean()
    avg_rtfx = df["rtfx"].mean()
    print(f"Average WER: {avg_wer} and Avg RTFX : {avg_rtfx}")

    categories = ['gender', 'first_language', 'socioeconomic_bkgd', 'ethnicity']
    wer_column = 'WER'

    def calculate_wer_disparity(data, protected_attribute, wer_column):
        """Return ``{group: mean WER}`` for every value of the attribute."""
        wer_disparity = {}
        for group in data[protected_attribute].unique():
            group_data = data[data[protected_attribute] == group]
            wer_disparity[group] = group_data[wer_column].mean()
        return wer_disparity

    # Log the mean WER of each group for each protected attribute.
    for attribute in categories:
        disparity = calculate_wer_disparity(df, attribute, wer_column)
        print(f"WER Disparity for {attribute}:", disparity)

    # Word error count = reference length * WER.
    df["Reference_words"] = df["normalized_transcription"].str.split().str.len()
    df["WER_count"] = df["Reference_words"] * df["WER"]

    for col in categories:
        df[col] = df[col].astype("category")

    # Log of reference word count (to adjust for different transcript lengths)
    df["log_Ref_Words"] = np.log(df["Reference_words"] + 1)  # Adding 1 to avoid log(0)

    # Fit a mixed-effects model with a random effect on ``combined_column``.
    # NOTE(review): the original comment called this a Poisson regression, but
    # ``smf.mixedlm`` fits a linear mixed model, while the code below
    # exponentiates its linear predictor - confirm this is intended.
    mixed_model = smf.mixedlm(
        formula="WER_count ~ log_Ref_Words + age + gender + first_language + socioeconomic_bkgd + ethnicity",  # Fixed effects
        data=df,
        groups=df["combined_column"]
    ).fit()

    params = mixed_model.params

    # Hold the continuous predictor at its mean for all predictions.
    fixed_log_ref = df["log_Ref_Words"].mean()
    baseline_log = params["Intercept"] + params["log_Ref_Words"] * fixed_log_ref
    exposure = np.exp(fixed_log_ref) - 1

    def compute_predicted_error_rate(category, level, params, baseline_log, exposure):
        """Computes the predicted WER (error rate) for a given level of a demographic attribute."""
        coef_name = f"{category}[T.{level}]"
        effect = params.get(coef_name, 0)  # For the baseline level, effect is 0.
        pred_count = np.exp(baseline_log + effect)
        return pred_count / exposure

    def compute_category_fairness(category, params, baseline_log, exposure, df):
        """
        For a given category, compute:
          - Predicted error rates for each subgroup level.
          - Raw fairness scores (0-100 scale: 100 = best, 0 = worst) based on linear scaling.
          - A weighted category fairness score using group proportions.
        """
        pred_series = pd.Series({
            lvl: compute_predicted_error_rate(category, lvl, params, baseline_log, exposure)
            for lvl in df[category].cat.categories
        })
        min_pred, max_pred = pred_series.min(), pred_series.max()

        # If all levels are identical, assign 100 to everyone.
        if max_pred == min_pred:
            raw_fairness = pred_series.apply(lambda x: 100.0)
        else:
            raw_fairness = pred_series.apply(
                lambda x: 100 * (1 - (x - min_pred) / (max_pred - min_pred))
            )

        # Weight the subgroup scores by their sample proportions, aligned to
        # the fairness-score index.
        group_proportions = df[category].value_counts(normalize=True)
        group_proportions = group_proportions.reindex(raw_fairness.index, fill_value=0)
        weighted_category_fairness = np.average(raw_fairness, weights=group_proportions)

        return pred_series, raw_fairness, weighted_category_fairness

    def perform_lrt(attribute, df):
        """Performs Likelihood Ratio Test (LRT) to test the overall significance of an attribute."""
        full_model = smf.mixedlm(f"WER ~ {attribute} + log_Ref_Words", df, groups=df["combined_column"]).fit()
        reduced_model = smf.mixedlm("WER ~ log_Ref_Words", df, groups=df["combined_column"]).fit()
        lr_stat = 2 * (full_model.llf - reduced_model.llf)
        df_diff = full_model.df_modelwc - reduced_model.df_modelwc
        return chi2.sf(lr_stat, df_diff)

    results = {}
    adjusted_category_scores = []  # Adjusted fairness score of each category.

    for cat in categories:
        preds, raw_fairness, category_raw_score = compute_category_fairness(
            cat, params, baseline_log, exposure, df
        )
        # Overall significance of this attribute.
        lrt_p_value = perform_lrt(cat, df)

        # If p-value < 0.05, penalize the fairness score proportionally.
        multiplier = (lrt_p_value / 0.05) if lrt_p_value < 0.05 else 1.0
        adjusted_score = category_raw_score * multiplier

        results[cat] = {
            'Predicted Error Rates': preds,
            'Raw Fairness Scores': raw_fairness,
            'Adjusted Category Fairness Score': adjusted_score
        }
        adjusted_category_scores.append(adjusted_score)

    # Overall fairness: unweighted mean of the adjusted category scores.
    overall_fairness_score = np.average(adjusted_category_scores)
    # FAAS is the Fairness Adjusted ASR Score based on which models will be ranked
    faas = 10 * np.log10(overall_fairness_score / avg_wer)
    print("Fairness Adjusted ASR Score for the model is", faas)
    print("________________________________")

    # Convert pandas/NumPy objects to plain dicts and floats.
    Results = {
        'Predicted Error Rates': {
            cat: results[cat]['Predicted Error Rates'].to_dict() for cat in categories
        },
        'Raw Fairness Scores': {
            cat: results[cat]['Raw Fairness Scores'].to_dict() for cat in categories
        },
        'Adjusted Category Fairness Score': {
            cat: float(results[cat]['Adjusted Category Fairness Score']) for cat in categories
        },
        'Overall Fairness Score': float(overall_fairness_score),
        'Avg_wer': float(avg_wer),
        'Avg_rtfx': float(avg_rtfx),
        'FAAS': float(faas),
        'ASR_model': ASR_model,
    }
    return Results
utils/load_csv.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ from huggingface_hub import HfApi, hf_hub_download
3
+ import os
4
+ import io
5
+ import pandas as pd
6
+
7
+ # Load environment variables from .env file
8
+ load_dotenv()
9
+
10
+ # ASR_model = "openai/whisper-largev2" # Replace with your ASR model
11
+ # csv_path = "test.csv" #read from local
12
+ # csv_transcript = f"test_with_{ASR_model.replace("/","_")}.csv" # to save in dataset repo
13
+ # csv_result = f"test_with_{ASR_model.replace("/","_")}_WER.csv" # to save in dataset repo
14
+
15
+ # df = pd.read_csv(csv_path)
16
+ # print(f"CSV Loaded with {len(df)} rows")
17
+
18
def upload_csv(df, csv_filename):
    """Upload ``df`` as a CSV file to the results dataset on the Hugging Face Hub.

    The frame is serialised (without its index) into an in-memory buffer and
    uploaded to the ``satyamr196/asr_fairness_results`` dataset repo under
    ``csv_filename``, using the token from the ``HF_TOKEN`` environment
    variable.

    Args:
        df: DataFrame to serialise.
        csv_filename: Destination path of the file inside the repo.

    Returns:
        ``True`` if the upload call completed, ``False`` if it raised (the
        error is printed, not propagated).
    """
    payload = io.BytesIO()
    df.to_csv(payload, index=False)
    payload.seek(0)  # rewind so the upload reads from the start

    try:
        hub = HfApi(token=os.getenv("HF_TOKEN"))
        print(f"✅ CSV uploading : {csv_filename}")
        hub.upload_file(
            path_or_fileobj=payload,
            path_in_repo=csv_filename,
            repo_id="satyamr196/asr_fairness_results",
            repo_type="dataset",
        )
    except Exception as e:
        print(f"⚠️ Could not upload CSV: {csv_filename} — {e}")
        return False
    return True
37
+
38
+ # upload_csv(df,f"test_with_{ASR_model.replace("/","_")}_WER.csv");
39
+
40
def download_csv(csv_filename):
    """Load a CSV file from the results dataset on the Hugging Face Hub.

    Args:
        csv_filename: Path of the file inside the
            ``satyamr196/asr_fairness_results`` dataset repo.

    Returns:
        The file's contents as a DataFrame, or ``None`` if downloading or
        parsing raised any exception. Callers use ``None`` to mean "not
        available".
    """
    try:
        local_path = hf_hub_download(
            repo_id="satyamr196/asr_fairness_results",
            filename=csv_filename,
            repo_type="dataset",
        )
        frame = pd.read_csv(local_path)
    except Exception:
        # Deliberately silent: a missing file is an expected outcome here.
        return None
    return frame
51
+
52
+
53
+ # # # Load the csv from the Hugging Face Hub
54
+ # df = download_csv(csv_result)
55
+ # if(df is None):
56
+ # print(f"CSV not found in the dataset repo. Please upload the file first.")
57
+ # else:
58
+ # print(f"CSV Loaded with {len(df)} rows")
59
+
60
+ # print(df)