# ASR-FairBench-Server / ASR_Server.py
from flask import Flask, jsonify, request
from flask_cors import CORS
from datasets import load_dataset, Audio
import pandas as pd
import os
import re
import threading
from dotenv import load_dotenv
from datetime import datetime
import pytz
from utils.load_csv import upload_csv, download_csv
from utils.generate_results import generateResults
from utils.generate_box_plot import box_plot_data
from utils.model_validity import is_valid_asr_model
from utils.send_email import send_email
# Set the cache directory for Hugging Face datasets
os.environ["HF_HOME"] = "/tmp/huggingface"
# ASR_model = "openai/whisper-tiny" # Replace with your ASR model
# Quick CPU benchmark to gauge how fast this host is
import timeit
cpu_score = timeit.timeit("sum(range(1000000))", number=5)
print(f"🧠 CPU benchmark score: {cpu_score:.2f}")
job_status = {
    "running": False,
    "model": None,
    "completed": None,
    "%_completed": None,
    "message": "No Transcription in progress",
    "total": None
}
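# job_status is simple in-process shared state: the background transcription
# thread updates it and the /status endpoint returns it verbatim, so a client
# can poll progress, e.g. (hypothetical Space URL):
#   curl https://<your-space>.hf.space/status
#   -> {"running": true, "model": "openai/whisper-tiny", "%_completed": 12.5, ...}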
csv_path = "test.csv"
# csv_transcript = f'test_with_{ASR_model.replace("/", "_")}.csv'
# csv_result = f'test_with_{ASR_model.replace("/","_")}_WER.csv'
df = pd.read_csv(csv_path)
print(f"CSV Loaded with {len(df)} rows")
# Load dataset without decoding audio (required!)
dataset = load_dataset("satyamr196/asr_fairness_audio", split="train")
# dataset = dataset.with_format("python", decode_audio=False)
dataset = dataset.cast_column("audio", Audio(decode=False))
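# With decode=False, each sample's "audio" entry is a plain dict such as
#   {"path": ".../clip.wav", "bytes": ...}
# so nothing is decoded up front; generateTranscript below reads the audio
# itself with soundfile only when a row actually needs transcribing.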
def generateTranscript(ASR_model):
    import os
    import io
    import time
    import tqdm
    import pandas as pd
    import soundfile as sf
    from transformers import pipeline
    job_status.update({
        "running": True,
        "model": ASR_model,
        "completed": 0,
        "%_completed": 0,
        "message": "Starting transcription...",
        "total": None
    })
csv_transcript = f'test_with_{ASR_model.replace("/", "_")}.csv'
csv_result = f'test_with_{ASR_model.replace("/","_")}_WER.csv'
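    # e.g. ASR_model = "openai/whisper-tiny" gives
    #   test_with_openai_whisper-tiny.csv and test_with_openai_whisper-tiny_WER.csv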
# Check if transcript already exists
df_transcript = download_csv(csv_transcript)
    if df_transcript is None:
        print("CSV not found in the dataset repo. Proceeding to generate transcript.")
# Get current IST time
ist = pytz.timezone("Asia/Kolkata")
current_time = datetime.now(ist).strftime("%H:%M:%S %d %b %Y")
send_email(
to_email="raianand.1991@gmail.com",
subject=f"Audit Started for ASR model {ASR_model}",
message_body=f"Audit started at {current_time} for ASR model {ASR_model}.",
bcc_emails=["pedanticsatoshi0@getsafesurfer.com"]
)
    else:
        print("Transcript already exists for this model. Skipping transcription.")
        job_status.update({
            "running": False,
            "model": None,
            "completed": None,
            "%_completed": None,
            "message": "No Transcription in progress",
            "total": None
        })
return
    # Work on a local copy so a background job never mutates the shared,
    # module-level DataFrame (the commented-out reload above had the same intent)
    df_local = df.copy()
    total = len(df_local)
    job_status["total"] = total
# Initialize ASR pipeline
pipe = pipeline("automatic-speech-recognition", model=ASR_model)
    # Column with filenames in the CSV
    filename_column = df_local.columns[0]
    df_local[filename_column] = df_local[filename_column].str.strip().str.lower()
    # Build a map from filename -> dataset sample (audio stays undecoded)
    # print("Creating dataset map from filenames...")
    # dataset = dataset.with_format("python", decode_audio=False)
    dataset_map = {
        os.path.basename(sample["audio"]["path"]).lower(): sample
        for sample in dataset
    }
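    # Keys are bare lowercase filenames (e.g. "clip_0001.wav", hypothetical),
    # matching the stripped/lowercased filename column prepared above.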
transcripts = []
rtfx_score = []
    for idx, row in tqdm.tqdm(df_local.iterrows(), total=total):
        filename = row[filename_column] + ".wav"
        if filename in dataset_map:
            sample = dataset_map[filename]
            try:
                # Decode audio only when needed; prefer in-memory bytes when
                # the cached sample carries them, else fall back to the path
                audio_info = sample["audio"]
                if audio_info.get("bytes"):
                    audio_array, sample_rate = sf.read(io.BytesIO(audio_info["bytes"]))
                else:
                    audio_array, sample_rate = sf.read(audio_info["path"])
                # Downmix to mono if multi-channel; the ASR pipeline expects a 1-D array
                if audio_array.ndim > 1:
                    audio_array = audio_array.mean(axis=1)
                start_time = time.time()
                result = pipe({"array": audio_array, "sampling_rate": sample_rate})
                end_time = time.time()
                transcript = result["text"]
                duration = len(audio_array) / sample_rate
                # Note: this ratio is processing_time / audio_duration (a real-time
                # factor, lower is better); kept as-is to match the existing
                # "rtfx" column semantics downstream
                rtfx = (end_time - start_time) / duration if duration > 0 else 0
                transcripts.append(transcript)
                rtfx_score.append(rtfx)
                print(f"✅ {filename}: RTFX = {rtfx:.2f}, Progress: {(idx + 1) * 100 / total:.2f} %")
            except Exception as e:
                print(f"❌ Error with {filename}: {e}")
                transcripts.append("")
                rtfx_score.append(0)
else:
print(f"❌ File not found in dataset: {filename}")
transcripts.append("")
rtfx_score.append(0)
job_status["completed"] = idx + 1
job_status["message"] = f"Processing {idx + 1}/{total}"
job_status["%_completed"] = (idx + 1) * 100 / total
    # Save results
    df_local["transcript"] = transcripts
    df_local["rtfx"] = rtfx_score
    job_status.update({
        "running": False,
        "model": None,
        "completed": None,
        "%_completed": None,
        "message": "No Transcription in progress",
        "total": None
    })
    # df_local.to_csv(csv_result, index=False)
    upload_csv(df_local, csv_transcript)
    print(f"\n📄 Transcripts saved to: {csv_transcript}")
# generateTranscript(ASR_model)
# print(generate_results(ASR_model))
# print(box_plot_data(ASR_model))
# ! FLASK SERVER CODE
app = Flask(__name__)
CORS(app, origins="*")
@app.route("/")
def home():
return jsonify(
{
"message": "Welcome to the ASR Server! Please use the App link to access ASR-FairBench application.",
"App link": "https://huggingface.co/spaces/satyamr196/ASR-FairBench"
}
)
@app.route("/asr_models")
def asr_models():
models = [
"DeepSpeech",
"Wav2Vec",
"Jasper",
"QuartzNet",
"Conformer",
"whisper",
"Kaldi",
"SpeechBrain",
"Fairseq S2T",
"ESPnet"
]
    # Leftover debug hook, disabled: starting a hard-coded transcription of
    # "openai/whisper-tiny" on every listing request is a side effect this
    # read-only endpoint should not have.
    # def background_job():
    #     generateTranscript("openai/whisper-tiny")
    # threading.Thread(target=background_job).start()
    # print("Transcription started in background")
return jsonify({"asr_models": models})
@app.route("/status")
def get_status():
return jsonify(job_status)
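# Example query (model id taken from the defaults used elsewhere in this file):
#   GET /api?ASR_model=openai/whisper-tiny
# Returns cached results if the transcript CSV already exists on the Hub;
# otherwise it starts a background transcription job and asks you to retry.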
@app.route('/api', methods=['GET'])
def api():
model = request.args.get('ASR_model', default="", type=str)
# model = re.sub(r"\s+", "", model)
model = re.sub(r"[^a-zA-Z0-9/_\-.]", "", model) # sanitize the model ID
csv_transcript = f'test_with_{model.replace("/","_")}.csv'
csv_result = f'test_with_{model.replace("/","_")}_WER.csv'
    if not model:
        return jsonify({'error': 'ASR_model parameter is required'}), 400
    elif not is_valid_asr_model(model):
        return jsonify({'message': 'Invalid ASR model ID, please check if your model is available on Hugging Face'}), 400  # 400 if the model id is invalid
    elif download_csv(csv_transcript) is not None:
# Load the CSV file from the Hugging Face Hub
Results = generateResults(model)
wer_Gender, wer_SEG, wer_Ethnicity, wer_Language = box_plot_data(model)
return jsonify({
'message': f'{model} has been evaluated and results are shown below',
'endpoint': "/api",
'model': model,
'greet' : "Welcome to ASR-FairBench",
**Results,
'wer_Gender' : wer_Gender,
'wer_SEG' : wer_SEG,
'wer_Ethnicity' : wer_Ethnicity,
'wer_Language' : wer_Language
})
    else:
        # Only one transcription job runs at a time; report progress if busy
        if job_status["running"]:
            return jsonify({
                'message': 'Transcription for a previously submitted ASR model is in progress. Please wait for it to complete, then submit your model again.',
                'status': job_status
            })
        response = jsonify({
            'message': f'Given Model {model} is being Evaluated. Please run the query again later; evaluation usually completes within an hour.'
        })
        # Run generateTranscript(model) in a background thread
        # thread = threading.Thread(target=generateTranscript, args=(model,), daemon=True)
        thread = threading.Thread(target=generateTranscript, args=(model,))
        thread.start()
        return response
@app.route("/insert", methods=["POST"])
def insert_document():
try:
data = request.json # Get JSON data from request
model_name = data.get("Model")
csv_filename = "leaderboard.csv"
# Try to download the leaderboard CSV from HF dataset
df = download_csv(csv_filename)
if df is None:
# If not found, create a new DataFrame with this single entry
df = pd.DataFrame([data])
else:
# Check if the model already exists in leaderboard
if model_name in df["Model"].values:
return jsonify({"exists": True})
# Append the new row
df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
# Upload the updated CSV back to the Hugging Face dataset
success = upload_csv(df, csv_filename)
if not success:
return jsonify({"exists": "Error", "error": "Upload to Hugging Face failed"})
return jsonify({"exists": False, "message": "Data inserted into leaderboard successfully!"})
except Exception as e:
return jsonify({"exists": "Error", "error": str(e)})
# Fetch all leaderboard entries
@app.route("/fetch", methods=["GET"])
def fetch_documents():
try:
csv_filename = "leaderboard.csv"
df = download_csv(csv_filename)
if df is None:
return jsonify({"error": "Leaderboard CSV not found in Hugging Face dataset."})
documents = df.to_dict(orient="records") # Convert DataFrame to list of dicts
return jsonify({"data": documents})
except Exception as e:
return jsonify({"error": str(e)})
# if __name__ == "__main__":
# app.run(debug=True)