from flask import Flask, jsonify, request
from flask_cors import CORS
from datasets import load_dataset, Audio
import pandas as pd
import os
import re
import threading
from dotenv import load_dotenv
from datetime import datetime
import pytz
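
# Local helpers: CSV upload/download against the HF dataset repo, WER and
# fairness result generation, box-plot data, model-ID validation, and email
# notifications.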
from utils.load_csv import upload_csv, download_csv
from utils.generate_results import generateResults
from utils.generate_box_plot import box_plot_data
from utils.model_validity import is_valid_asr_model
from utils.send_email import send_email

# Set the cache directory for Hugging Face datasets
# (a writable location on Hugging Face Spaces)
os.environ["HF_HOME"] = "/tmp/huggingface"

# ASR_model = "openai/whisper-tiny"  # Replace with your ASR model

# Rough CPU benchmark to gauge how fast this host is
import timeit

cpu_score = timeit.timeit("sum(range(1000000))", number=5)
print(f"CPU benchmark score: {cpu_score:.2f}")
job_status = {
    "running": False,
    "model": None,
    "completed": None,
    "%_completed": None,
    "message": "No Transcription in progress",
    "total": None,
}

csv_path = "test.csv"
# csv_transcript = f'test_with_{ASR_model.replace("/", "_")}.csv'
# csv_result = f'test_with_{ASR_model.replace("/","_")}_WER.csv'
df = pd.read_csv(csv_path)
print(f"CSV Loaded with {len(df)} rows")
# Load dataset without decoding audio (required!)
dataset = load_dataset("satyamr196/asr_fairness_audio", split="train")
# dataset = dataset.with_format("python", decode_audio=False)
dataset = dataset.cast_column("audio", Audio(decode=False))
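
# With decode=False, each sample's "audio" field stays a metadata dict
# ({"path": ..., "bytes": ...}); waveforms are only decoded when read.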

def generateTranscript(ASR_model):
    import os
    import time

    import pandas as pd
    import soundfile as sf
    import tqdm
    from transformers import pipeline

    job_status.update({
        "running": True,
        "model": ASR_model,
        "completed": 0,
        "%_completed": 0,
        "message": "Starting transcription...",
        "total": None,
    })

    csv_transcript = f'test_with_{ASR_model.replace("/", "_")}.csv'
    csv_result = f'test_with_{ASR_model.replace("/", "_")}_WER.csv'

    # Skip the expensive transcription step if a transcript already exists
    df_transcript = download_csv(csv_transcript)
    if df_transcript is None:
        print("CSV not found in the dataset repo. Proceeding to generate transcript.")
        # Get the current IST time for the notification email
        ist = pytz.timezone("Asia/Kolkata")
        current_time = datetime.now(ist).strftime("%H:%M:%S %d %b %Y")
        send_email(
            to_email="raianand.1991@gmail.com",
            subject=f"Audit Started for ASR model {ASR_model}",
            message_body=f"Audit started at {current_time} for ASR model {ASR_model}.",
            bcc_emails=["pedanticsatoshi0@getsafesurfer.com"],
        )
    else:
        print("Transcript already exists for this model. Skipping transcription.")
        job_status.update({
            "running": False,
            "model": None,
            "completed": None,
            "%_completed": None,
            "message": "No Transcription in progress",
            "total": None,
        })
        return

    # # Load test.csv
    # df = pd.read_csv(csv_path)
    # print(f"CSV Loaded with {len(df)} rows")

    total = len(df)
    job_status["total"] = total

    # Initialize the ASR pipeline
    pipe = pipeline("automatic-speech-recognition", model=ASR_model)

    # The first CSV column holds the audio filenames (without extension);
    # note this mutates the module-level df loaded at startup
    filename_column = df.columns[0]
    df[filename_column] = df[filename_column].str.strip().str.lower()

    # Build a map from filename -> dataset sample (audio is still not decoded)
    dataset_map = {
        os.path.basename(sample["audio"]["path"]).lower(): sample
        for sample in dataset
    }

    transcripts = []
    rtfx_score = []

    for idx, row in tqdm.tqdm(df.iterrows(), total=total):
        filename = row[filename_column] + ".wav"
        if filename in dataset_map:
            sample = dataset_map[filename]
            try:
                # Decode the audio only when it is actually needed
                file_path = sample["audio"]["path"]
                audio_array, sample_rate = sf.read(file_path)

                start_time = time.time()
                result = pipe({"array": audio_array, "sampling_rate": sample_rate})
                end_time = time.time()

                transcript = result["text"]
                duration = len(audio_array) / sample_rate
                # Processing time divided by audio duration (a real-time
                # factor); lower means faster than real time
                rtfx = (end_time - start_time) / duration if duration > 0 else 0

                transcripts.append(transcript)
                rtfx_score.append(rtfx)
                print(f"{filename}: RTFX = {rtfx:.2f}, Progress: {(idx + 1) * 100 / total:.1f} %")
            except Exception as e:
                print(f"Error with {filename}: {e}")
                transcripts.append("")
                rtfx_score.append(0)
        else:
            print(f"File not found in dataset: {filename}")
            transcripts.append("")
            rtfx_score.append(0)

        job_status["completed"] = idx + 1
        job_status["message"] = f"Processing {idx + 1}/{total}"
        job_status["%_completed"] = (idx + 1) * 100 / total

    # Save the results; upload before clearing the job status so /status
    # never reports "done" while the CSV is still being written
    df["transcript"] = transcripts
    df["rtfx"] = rtfx_score
    # df.to_csv(csv_result, index=False)
    upload_csv(df, csv_transcript)
    print(f"\nTranscripts saved to: {csv_transcript}")

    job_status.update({
        "running": False,
        "model": None,
        "completed": None,
        "%_completed": None,
        "message": "No Transcription in progress",
        "total": None,
    })

# generateTranscript(ASR_model)
# print(generate_results(ASR_model))
# print(box_plot_data(ASR_model))

# ! FLASK SERVER CODE :-
app = Flask(__name__)
CORS(app, origins="*")
@app.route("/")
def home():
return jsonify(
{
"message": "Welcome to the ASR Server! Please use the App link to access ASR-FairBench application.",
"App link": "https://huggingface.co/spaces/satyamr196/ASR-FairBench"
}
)
@app.route("/asr_models")
def asr_models():
models = [
"DeepSpeech",
"Wav2Vec",
"Jasper",
"QuartzNet",
"Conformer",
"whisper",
"Kaldi",
"SpeechBrain",
"Fairseq S2T",
"ESPnet"
]
def background_job():
generateTranscript("openai/whisper-tiny")
# Start the background job in a separate thread
threading.Thread(target=background_job).start()
print("Transcription started in background")
return jsonify({"asr_models": models})
@app.route("/status")
def get_status():
return jsonify(job_status)
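
# Example: poll progress from a client (assuming the default Flask port):
#   curl http://localhost:5000/status
# The response mirrors the job_status dict above, e.g. {"running": true, ...}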

@app.route('/api', methods=['GET'])
def api():
    model = request.args.get('ASR_model', default="", type=str)
    # model = re.sub(r"\s+", "", model)
    model = re.sub(r"[^a-zA-Z0-9/_\-.]", "", model)  # sanitize the model ID
    csv_transcript = f'test_with_{model.replace("/", "_")}.csv'
    csv_result = f'test_with_{model.replace("/", "_")}_WER.csv'

    if not model:
        return jsonify({'error': 'ASR_model parameter is required'}), 400
    elif not is_valid_asr_model(model):
        # 400 if the model ID does not resolve on the Hugging Face Hub
        return jsonify({'message': 'Invalid ASR model ID; please check that your model is available on Hugging Face'}), 400
    elif download_csv(csv_transcript) is not None:
        # Transcript already exists: load it from the Hub and return results
        Results = generateResults(model)
        wer_Gender, wer_SEG, wer_Ethnicity, wer_Language = box_plot_data(model)

        return jsonify({
            'message': f'{model} has been evaluated and the results are shown below',
            'endpoint': "/api",
            'model': model,
            'greet': "Welcome to ASR-FairBench",
            **Results,
            'wer_Gender': wer_Gender,
            'wer_SEG': wer_SEG,
            'wer_Ethnicity': wer_Ethnicity,
            'wer_Language': wer_Language,
        })
    else:
        # Only one transcription job runs at a time
        if job_status["running"]:
            return jsonify({
                'message': 'Transcription for a previously submitted ASR model is in progress. Please wait for it to complete, then submit your model again.',
                'status': job_status,
            })

        response = jsonify({
            'message': f'Model {model} is being evaluated. Please come back after a few hours and run the query again; evaluation usually completes within an hour.'
        })

        # Run generateTranscript(model) in a separate thread so this request
        # can return immediately
        # thread = threading.Thread(target=generateTranscript, args=(model,), daemon=True)
        thread = threading.Thread(target=generateTranscript, args=(model,))
        thread.start()
        return response
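
# Example: submit a model for evaluation (the ID must exist on the HF Hub):
#   curl "http://localhost:5000/api?ASR_model=openai/whisper-tiny"
# The first call starts a background transcription job; once the transcript
# CSV exists, the same call returns the WER/fairness results.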
@app.route("/insert", methods=["POST"])
def insert_document():
try:
data = request.json # Get JSON data from request
model_name = data.get("Model")
csv_filename = "leaderboard.csv"
# Try to download the leaderboard CSV from HF dataset
df = download_csv(csv_filename)
if df is None:
# If not found, create a new DataFrame with this single entry
df = pd.DataFrame([data])
else:
# Check if the model already exists in leaderboard
if model_name in df["Model"].values:
return jsonify({"exists": True})
# Append the new row
df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
# Upload the updated CSV back to the Hugging Face dataset
success = upload_csv(df, csv_filename)
if not success:
return jsonify({"exists": "Error", "error": "Upload to Hugging Face failed"})
return jsonify({"exists": False, "message": "Data inserted into leaderboard successfully!"})
except Exception as e:
return jsonify({"exists": "Error", "error": str(e)})

# Fetch all leaderboard entries
@app.route("/fetch", methods=["GET"])
def fetch_documents():
    try:
        csv_filename = "leaderboard.csv"
        df = download_csv(csv_filename)
        if df is None:
            return jsonify({"error": "Leaderboard CSV not found in Hugging Face dataset."})
        documents = df.to_dict(orient="records")  # Convert the DataFrame to a list of dicts
        return jsonify({"data": documents})
    except Exception as e:
        return jsonify({"error": str(e)})
# if __name__ == "__main__":
# app.run(debug=True) |
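
# To run locally, uncomment the block above and execute this file with
# Python. On Hugging Face Spaces the `app` object is usually served by a
# WSGI server instead, e.g. (assuming this file is named app.py):
#   gunicorn app:app --bind 0.0.0.0:7860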