from flask import Flask, jsonify, request from flask_cors import CORS from datasets import load_dataset, Audio import pandas as pd import os import re import threading from dotenv import load_dotenv from datetime import datetime import pytz from utils.load_csv import upload_csv, download_csv from utils.generate_results import generateResults from utils.generate_box_plot import box_plot_data from utils.model_validity import is_valid_asr_model from utils.send_email import send_email # Set the cache directory for Hugging Face datasets os.environ["HF_HOME"] = "/tmp/huggingface" # ASR_model = "openai/whisper-tiny" # Replace with your ASR model #Check cpu score import timeit cpu_score = timeit.timeit("sum(range(1000000))", number=5) print(f"🧠 CPU benchmark score: {cpu_score:.2f}") job_status = { "running": False, "model": None, "completed": None, "%_completed" : None, "message": "No Transcription in progress", "total": None } csv_path = "test.csv" # csv_transcript = f'test_with_{ASR_model.replace("/", "_")}.csv' # csv_result = f'test_with_{ASR_model.replace("/","_")}_WER.csv' df = pd.read_csv(csv_path) print(f"CSV Loaded with {len(df)} rows") # Load dataset without decoding audio (required!) dataset = load_dataset("satyamr196/asr_fairness_audio", split="train") # dataset = dataset.with_format("python", decode_audio=False) dataset = dataset.cast_column("audio", Audio(decode=False)) def generateTranscript(ASR_model): import os import time import tqdm import pandas as pd import soundfile as sf from transformers import pipeline job_status.update({ "running": True, "model": ASR_model, "completed": 0, "%_completed" : 0, "message": "Starting transcription...", "total": None }) csv_transcript = f'test_with_{ASR_model.replace("/", "_")}.csv' csv_result = f'test_with_{ASR_model.replace("/","_")}_WER.csv' # Check if transcript already exists df_transcript = download_csv(csv_transcript) if(df_transcript is None): print(f"CSV not found in the dataset repo. Proceeding to generate transcript.") # Get current IST time ist = pytz.timezone("Asia/Kolkata") current_time = datetime.now(ist).strftime("%H:%M:%S %d %b %Y") send_email( to_email="raianand.1991@gmail.com", subject=f"Audit Started for ASR model {ASR_model}", message_body=f"Audit started at {current_time} for ASR model {ASR_model}.", bcc_emails=["pedanticsatoshi0@getsafesurfer.com"] ) else: print(f"Transcript already exists for previously submitted model. Skipping transcription.") job_status.update({ "running": False, "model": None, "completed": None, "%_completed" : None, "message": "No Transcription in progress", "total": None }) return # # Load test.csv # df = pd.read_csv(csv_path) # print(f"CSV Loaded with {len(df)} rows") total = len(df) job_status["total"] = total # Initialize ASR pipeline pipe = pipeline("automatic-speech-recognition", model=ASR_model) # Column with filenames in the CSV filename_column = df.columns[0] df[filename_column] = df[filename_column].str.strip().str.lower() # Build map from filename -> dataset sample (without decoding audio) # print("Creating dataset map from filenames...") # dataset = dataset.with_format("python", decode_audio=False) dataset_map = { os.path.basename(sample["audio"]["path"]).lower(): sample for sample in dataset #uncomment this line to use the dataset } transcripts = [] rtfx_score = [] for idx, row in tqdm.tqdm(df.iterrows(), total=len(df)): filename = row[filename_column] + ".wav" if filename in dataset_map: sample = dataset_map[filename] try: # Decode audio only when needed file_path = sample["audio"]["path"] audio_array, sample_rate = sf.read(file_path) start_time = time.time() result = pipe({"array": audio_array, "sampling_rate": sample_rate}) end_time = time.time() transcript = result["text"] duration = len(audio_array) / sample_rate rtfx = (end_time - start_time) / duration if duration > 0 else 0 transcripts.append(transcript) rtfx_score.append(rtfx) print(f"āœ… {filename}: RTFX = {rtfx:.2f}, Progress: {(idx + 1) * 100 / total} %") except Exception as e: print(f"āŒ Error with {filename}: {e}") transcripts.append("") rtfx_score.append(0) else: print(f"āŒ File not found in dataset: {filename}") transcripts.append("") rtfx_score.append(0) job_status["completed"] = idx + 1 job_status["message"] = f"Processing {idx + 1}/{total}" job_status["%_completed"] = (idx + 1) * 100 / total # Save results df["transcript"] = transcripts df["rtfx"] = rtfx_score job_status.update({ "running": False, "model": None, "completed": None, "%_completed" : None, "message": "No Transcription in progress", "total": None }) # df.to_csv(csv_result, index=False) upload_csv(df, csv_transcript) print(f"\nšŸ“„ Transcripts saved to: {csv_transcript}") # generateTranscript(ASR_model) # print(generate_results(ASR_model)) # print(box_plot_data(ASR_model)) # ! FLASK SERVER CODE :- app = Flask(__name__) CORS(app,origins="*") @app.route("/") def home(): return jsonify( { "message": "Welcome to the ASR Server! Please use the App link to access ASR-FairBench application.", "App link": "https://huggingface.co/spaces/satyamr196/ASR-FairBench" } ) @app.route("/asr_models") def asr_models(): models = [ "DeepSpeech", "Wav2Vec", "Jasper", "QuartzNet", "Conformer", "whisper", "Kaldi", "SpeechBrain", "Fairseq S2T", "ESPnet" ] def background_job(): generateTranscript("openai/whisper-tiny") # Start the background job in a separate thread threading.Thread(target=background_job).start() print("Transcription started in background") return jsonify({"asr_models": models}) @app.route("/status") def get_status(): return jsonify(job_status) @app.route('/api', methods=['GET']) def api(): model = request.args.get('ASR_model', default="", type=str) # model = re.sub(r"\s+", "", model) model = re.sub(r"[^a-zA-Z0-9/_\-.]", "", model) # sanitize the model ID csv_transcript = f'test_with_{model.replace("/","_")}.csv' csv_result = f'test_with_{model.replace("/","_")}_WER.csv' if not model: return jsonify({'error': 'ASR_model parameter is required'}) elif not is_valid_asr_model(model): return jsonify({'message': 'Invalid ASR model ID, please check if your model is available on Hugging Face'}), 400 # Return 400 if model is invalid elif (download_csv(csv_transcript) is not None): # Load the CSV file from the Hugging Face Hub Results = generateResults(model) wer_Gender, wer_SEG, wer_Ethnicity, wer_Language = box_plot_data(model) return jsonify({ 'message': f'{model} has been evaluated and results are shown below', 'endpoint': "/api", 'model': model, 'greet' : "Welcome to ASR-FairBench", **Results, 'wer_Gender' : wer_Gender, 'wer_SEG' : wer_SEG, 'wer_Ethnicity' : wer_Ethnicity, 'wer_Language' : wer_Language }) else: # Check if `generateTranscript` is already running for this model if job_status["running"] : return jsonify({ 'message': f'Transcription for previously submitted ASR model is in progress. Please wait for it to complete. Then submit your model again.', 'status': job_status }) response = jsonify({ 'message': f'Given Model {model} is being Evaluated, Please come back after a few hours and run the query again. Usually, it completes within an hour' }) # Run `generateTranscript(model)` in a separate thread # Start the transcript generation in a separate thread # thread = threading.Thread(target=generateTranscript, args=(model,), daemon=True) thread = threading.Thread(target=generateTranscript, args=(model,)) thread.start() return response @app.route("/insert", methods=["POST"]) def insert_document(): try: data = request.json # Get JSON data from request model_name = data.get("Model") csv_filename = "leaderboard.csv" # Try to download the leaderboard CSV from HF dataset df = download_csv(csv_filename) if df is None: # If not found, create a new DataFrame with this single entry df = pd.DataFrame([data]) else: # Check if the model already exists in leaderboard if model_name in df["Model"].values: return jsonify({"exists": True}) # Append the new row df = pd.concat([df, pd.DataFrame([data])], ignore_index=True) # Upload the updated CSV back to the Hugging Face dataset success = upload_csv(df, csv_filename) if not success: return jsonify({"exists": "Error", "error": "Upload to Hugging Face failed"}) return jsonify({"exists": False, "message": "Data inserted into leaderboard successfully!"}) except Exception as e: return jsonify({"exists": "Error", "error": str(e)}) # Fetch all documents @app.route("/fetch", methods=["GET"]) def fetch_documents(): try: csv_filename = "leaderboard.csv" df = download_csv(csv_filename) if df is None: return jsonify({"error": "Leaderboard CSV not found in Hugging Face dataset."}) documents = df.to_dict(orient="records") # Convert DataFrame to list of dicts return jsonify({"data": documents}) except Exception as e: return jsonify({"error": str(e)}) # if __name__ == "__main__": # app.run(debug=True)