# ASR-FairBench-Server / ASR_Server.py
from flask import Flask, jsonify, request
from flask_cors import CORS
from datasets import load_dataset, Audio
import pandas as pd
import os
import re
import threading
from dotenv import load_dotenv
from datetime import datetime
import pytz
from utils.load_csv import upload_csv, download_csv
from utils.generate_results import generateResults
from utils.generate_box_plot import box_plot_data
from utils.model_validity import is_valid_asr_model
from utils.send_email import send_email
# Set the cache directory for Hugging Face datasets
os.environ["HF_HOME"] = "/tmp/huggingface"
# ASR_model = "openai/whisper-tiny" # Replace with your ASR model
# Quick CPU benchmark to gauge how fast this host is
import timeit
cpu_score = timeit.timeit("sum(range(1000000))", number=5)
print(f"🧠 CPU benchmark score: {cpu_score:.2f}")
job_status = {
    "running": False,
    "model": None,
    "completed": None,
    "%_completed": None,
    "message": "No Transcription in progress",
    "total": None
}
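# job_status is simple in-process shared state: the background transcription
# thread updates it and the /status endpoint returns it verbatim, so a client
# can poll progress, e.g. (hypothetical Space URL):
#   curl https://<your-space>.hf.space/status
#   -> {"running": true, "model": "openai/whisper-tiny", "%_completed": 12.5, ...}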
csv_path = "test.csv"
# csv_transcript = f'test_with_{ASR_model.replace("/", "_")}.csv'
# csv_result = f'test_with_{ASR_model.replace("/","_")}_WER.csv'
df = pd.read_csv(csv_path)
print(f"CSV Loaded with {len(df)} rows")
# Load dataset without decoding audio (required!)
dataset = load_dataset("satyamr196/asr_fairness_audio", split="train")
# dataset = dataset.with_format("python", decode_audio=False)
dataset = dataset.cast_column("audio", Audio(decode=False))
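# With decode=False, each sample's "audio" entry is a plain dict such as
#   {"path": ".../clip.wav", "bytes": ...}
# so nothing is decoded up front; generateTranscript below reads the audio
# itself with soundfile only when a row actually needs transcribing.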
def generateTranscript(ASR_model):
    import os
    import io
    import time
    import tqdm
    import pandas as pd
    import soundfile as sf
    from transformers import pipeline
    job_status.update({
        "running": True,
        "model": ASR_model,
        "completed": 0,
        "%_completed": 0,
        "message": "Starting transcription...",
        "total": None
    })
csv_transcript = f'test_with_{ASR_model.replace("/", "_")}.csv'
csv_result = f'test_with_{ASR_model.replace("/","_")}_WER.csv'
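    # e.g. ASR_model = "openai/whisper-tiny" gives
    #   test_with_openai_whisper-tiny.csv and test_with_openai_whisper-tiny_WER.csv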
# Check if transcript already exists
df_transcript = download_csv(csv_transcript)
    if df_transcript is None:
        print("CSV not found in the dataset repo. Proceeding to generate transcript.")
# Get current IST time
ist = pytz.timezone("Asia/Kolkata")
current_time = datetime.now(ist).strftime("%H:%M:%S %d %b %Y")
send_email(
to_email="raianand.1991@gmail.com",
subject=f"Audit Started for ASR model {ASR_model}",
message_body=f"Audit started at {current_time} for ASR model {ASR_model}.",
bcc_emails=["pedanticsatoshi0@getsafesurfer.com"]
)
    else:
        print("Transcript already exists for this model. Skipping transcription.")
        job_status.update({
            "running": False,
            "model": None,
            "completed": None,
            "%_completed": None,
            "message": "No Transcription in progress",
            "total": None
        })
return
    # Work on a local copy so a background job never mutates the shared,
    # module-level DataFrame (the commented-out reload above had the same intent)
    df_local = df.copy()
    total = len(df_local)
    job_status["total"] = total
# Initialize ASR pipeline
pipe = pipeline("automatic-speech-recognition", model=ASR_model)
    # Column with filenames in the CSV
    filename_column = df_local.columns[0]
    df_local[filename_column] = df_local[filename_column].str.strip().str.lower()
    # Build a map from filename -> dataset sample (audio stays undecoded)
    # print("Creating dataset map from filenames...")
    # dataset = dataset.with_format("python", decode_audio=False)
    dataset_map = {
        os.path.basename(sample["audio"]["path"]).lower(): sample
        for sample in dataset
    }
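    # Keys are bare lowercase filenames (e.g. "clip_0001.wav", hypothetical),
    # matching the stripped/lowercased filename column prepared above.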
transcripts = []
rtfx_score = []
    for idx, row in tqdm.tqdm(df_local.iterrows(), total=total):
        filename = row[filename_column] + ".wav"
        if filename in dataset_map:
            sample = dataset_map[filename]
            try:
                # Decode audio only when needed; prefer in-memory bytes when
                # the cached sample carries them, else fall back to the path
                audio_info = sample["audio"]
                if audio_info.get("bytes"):
                    audio_array, sample_rate = sf.read(io.BytesIO(audio_info["bytes"]))
                else:
                    audio_array, sample_rate = sf.read(audio_info["path"])
                # Downmix to mono if multi-channel; the ASR pipeline expects a 1-D array
                if audio_array.ndim > 1:
                    audio_array = audio_array.mean(axis=1)
                start_time = time.time()
                result = pipe({"array": audio_array, "sampling_rate": sample_rate})
                end_time = time.time()
                transcript = result["text"]
                duration = len(audio_array) / sample_rate
                # Note: this ratio is processing_time / audio_duration (a real-time
                # factor, lower is better); kept as-is to match the existing
                # "rtfx" column semantics downstream
                rtfx = (end_time - start_time) / duration if duration > 0 else 0
                transcripts.append(transcript)
                rtfx_score.append(rtfx)
                print(f"✅ {filename}: RTFX = {rtfx:.2f}, Progress: {(idx + 1) * 100 / total:.2f} %")
            except Exception as e:
                print(f"❌ Error with {filename}: {e}")
                transcripts.append("")
                rtfx_score.append(0)
else:
print(f"❌ File not found in dataset: {filename}")
transcripts.append("")
rtfx_score.append(0)
job_status["completed"] = idx + 1
job_status["message"] = f"Processing {idx + 1}/{total}"
job_status["%_completed"] = (idx + 1) * 100 / total
    # Save results
    df_local["transcript"] = transcripts
    df_local["rtfx"] = rtfx_score
    job_status.update({
        "running": False,
        "model": None,
        "completed": None,
        "%_completed": None,
        "message": "No Transcription in progress",
        "total": None
    })
    # df_local.to_csv(csv_result, index=False)
    upload_csv(df_local, csv_transcript)
    print(f"\n📄 Transcripts saved to: {csv_transcript}")
# generateTranscript(ASR_model)
# print(generate_results(ASR_model))
# print(box_plot_data(ASR_model))
# ! FLASK SERVER CODE
app = Flask(__name__)
CORS(app, origins="*")
@app.route("/")
def home():
return jsonify(
{
"message": "Welcome to the ASR Server! Please use the App link to access ASR-FairBench application.",
"App link": "https://huggingface.co/spaces/satyamr196/ASR-FairBench"
}
)
@app.route("/asr_models")
def asr_models():
models = [
"DeepSpeech",
"Wav2Vec",
"Jasper",
"QuartzNet",
"Conformer",
"whisper",
"Kaldi",
"SpeechBrain",
"Fairseq S2T",
"ESPnet"
]
    # Leftover debug hook, disabled: starting a hard-coded transcription of
    # "openai/whisper-tiny" on every listing request is a side effect this
    # read-only endpoint should not have.
    # def background_job():
    #     generateTranscript("openai/whisper-tiny")
    # threading.Thread(target=background_job).start()
    # print("Transcription started in background")
return jsonify({"asr_models": models})
@app.route("/status")
def get_status():
return jsonify(job_status)
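# Example query (model id taken from the defaults used elsewhere in this file):
#   GET /api?ASR_model=openai/whisper-tiny
# Returns cached results if the transcript CSV already exists on the Hub;
# otherwise it starts a background transcription job and asks you to retry.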
@app.route('/api', methods=['GET'])
def api():
model = request.args.get('ASR_model', default="", type=str)
# model = re.sub(r"\s+", "", model)
model = re.sub(r"[^a-zA-Z0-9/_\-.]", "", model) # sanitize the model ID
csv_transcript = f'test_with_{model.replace("/","_")}.csv'
csv_result = f'test_with_{model.replace("/","_")}_WER.csv'
    if not model:
        return jsonify({'error': 'ASR_model parameter is required'}), 400
    elif not is_valid_asr_model(model):
        return jsonify({'message': 'Invalid ASR model ID, please check if your model is available on Hugging Face'}), 400  # 400 if the model id is invalid
    elif download_csv(csv_transcript) is not None:
# Load the CSV file from the Hugging Face Hub
Results = generateResults(model)
wer_Gender, wer_SEG, wer_Ethnicity, wer_Language = box_plot_data(model)
return jsonify({
'message': f'{model} has been evaluated and results are shown below',
'endpoint': "/api",
'model': model,
'greet' : "Welcome to ASR-FairBench",
**Results,
'wer_Gender' : wer_Gender,
'wer_SEG' : wer_SEG,
'wer_Ethnicity' : wer_Ethnicity,
'wer_Language' : wer_Language
})
    else:
        # Only one transcription job runs at a time; report progress if busy
        if job_status["running"]:
            return jsonify({
                'message': 'Transcription for a previously submitted ASR model is in progress. Please wait for it to complete, then submit your model again.',
                'status': job_status
            })
        response = jsonify({
            'message': f'Given Model {model} is being Evaluated. Please run the query again later; evaluation usually completes within an hour.'
        })
        # Run generateTranscript(model) in a background thread
        # thread = threading.Thread(target=generateTranscript, args=(model,), daemon=True)
        thread = threading.Thread(target=generateTranscript, args=(model,))
        thread.start()
        return response
@app.route("/insert", methods=["POST"])
def insert_document():
try:
data = request.json # Get JSON data from request
model_name = data.get("Model")
csv_filename = "leaderboard.csv"
# Try to download the leaderboard CSV from HF dataset
df = download_csv(csv_filename)
if df is None:
# If not found, create a new DataFrame with this single entry
df = pd.DataFrame([data])
else:
# Check if the model already exists in leaderboard
if model_name in df["Model"].values:
return jsonify({"exists": True})
# Append the new row
df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
# Upload the updated CSV back to the Hugging Face dataset
success = upload_csv(df, csv_filename)
if not success:
return jsonify({"exists": "Error", "error": "Upload to Hugging Face failed"})
return jsonify({"exists": False, "message": "Data inserted into leaderboard successfully!"})
except Exception as e:
return jsonify({"exists": "Error", "error": str(e)})
# Fetch all leaderboard entries
@app.route("/fetch", methods=["GET"])
def fetch_documents():
try:
csv_filename = "leaderboard.csv"
df = download_csv(csv_filename)
if df is None:
return jsonify({"error": "Leaderboard CSV not found in Hugging Face dataset."})
documents = df.to_dict(orient="records") # Convert DataFrame to list of dicts
return jsonify({"data": documents})
except Exception as e:
return jsonify({"error": str(e)})
# if __name__ == "__main__":
# app.run(debug=True)