from flask import Flask, jsonify, request
from flask_cors import CORS
from datasets import load_dataset, Audio
import pandas as pd
import os
import re
import threading
from dotenv import load_dotenv
from datetime import datetime
import pytz

from utils.load_csv import upload_csv, download_csv
from utils.generate_results import generateResults
from utils.generate_box_plot import box_plot_data
from utils.model_validity import is_valid_asr_model
from utils.send_email import send_email

# Set the cache directory for Hugging Face datasets
os.environ["HF_HOME"] = "/tmp/huggingface"

# ASR_model = "openai/whisper-tiny"  # Replace with your ASR model

# Check CPU speed
import timeit
cpu_score = timeit.timeit("sum(range(1000000))", number=5)
print(f"🧠 CPU benchmark score: {cpu_score:.2f}")
job_status = {
    "running": False,
    "model": None,
    "completed": None,
    "%_completed": None,
    "message": "No Transcription in progress",
    "total": None,
}

csv_path = "test.csv"
# csv_transcript = f'test_with_{ASR_model.replace("/", "_")}.csv'
# csv_result = f'test_with_{ASR_model.replace("/", "_")}_WER.csv'
df = pd.read_csv(csv_path)
print(f"CSV loaded with {len(df)} rows")

# Load the dataset without decoding audio (required!)
dataset = load_dataset("satyamr196/asr_fairness_audio", split="train")
# dataset = dataset.with_format("python", decode_audio=False)
dataset = dataset.cast_column("audio", Audio(decode=False))
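# With Audio(decode=False), each sample's "audio" column holds only the file
# path and raw bytes instead of a decoded waveform, so the full dataset can be
# iterated cheaply and individual files decoded on demand with soundfile below.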
def generateTranscript(ASR_model):
    import os
    import time
    import tqdm
    import pandas as pd
    import soundfile as sf
    from transformers import pipeline

    job_status.update({
        "running": True,
        "model": ASR_model,
        "completed": 0,
        "%_completed": 0,
        "message": "Starting transcription...",
        "total": None,
    })
    csv_transcript = f'test_with_{ASR_model.replace("/", "_")}.csv'
    csv_result = f'test_with_{ASR_model.replace("/", "_")}_WER.csv'

    # Check whether a transcript already exists
    df_transcript = download_csv(csv_transcript)
    if df_transcript is None:
        print("CSV not found in the dataset repo. Proceeding to generate transcript.")
        # Get the current IST time
        ist = pytz.timezone("Asia/Kolkata")
        current_time = datetime.now(ist).strftime("%H:%M:%S %d %b %Y")
        send_email(
            to_email="raianand.1991@gmail.com",
            subject=f"Audit Started for ASR model {ASR_model}",
            message_body=f"Audit started at {current_time} for ASR model {ASR_model}.",
            bcc_emails=["pedanticsatoshi0@getsafesurfer.com"],
        )
    else:
        print("Transcript already exists for a previously submitted model. Skipping transcription.")
        job_status.update({
            "running": False,
            "model": None,
            "completed": None,
            "%_completed": None,
            "message": "No Transcription in progress",
            "total": None,
        })
        return
    # # Load test.csv
    # df = pd.read_csv(csv_path)
    # print(f"CSV loaded with {len(df)} rows")
    total = len(df)
    job_status["total"] = total

    # Initialize the ASR pipeline
    pipe = pipeline("automatic-speech-recognition", model=ASR_model)

    # Column with the filenames in the CSV
    filename_column = df.columns[0]
    df[filename_column] = df[filename_column].str.strip().str.lower()

    # Build a map from filename -> dataset sample (without decoding audio)
    # print("Creating dataset map from filenames...")
    dataset_map = {
        os.path.basename(sample["audio"]["path"]).lower(): sample
        for sample in dataset
    }
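    # The map gives O(1) lookups by audio-file basename while iterating the
    # CSV rows below, instead of rescanning the dataset for every row.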
    transcripts = []
    rtfx_score = []

    for idx, row in tqdm.tqdm(df.iterrows(), total=len(df)):
        filename = row[filename_column] + ".wav"
        if filename in dataset_map:
            sample = dataset_map[filename]
            try:
                # Decode the audio only when needed
                file_path = sample["audio"]["path"]
                audio_array, sample_rate = sf.read(file_path)

                start_time = time.time()
                result = pipe({"array": audio_array, "sampling_rate": sample_rate})
                end_time = time.time()

                transcript = result["text"]
                duration = len(audio_array) / sample_rate
                # Processing time per second of audio; lower means faster
                rtfx = (end_time - start_time) / duration if duration > 0 else 0

                transcripts.append(transcript)
                rtfx_score.append(rtfx)
                print(f"✅ {filename}: RTFX = {rtfx:.2f}, Progress: {(idx + 1) * 100 / total:.1f} %")
            except Exception as e:
                print(f"❌ Error with {filename}: {e}")
                transcripts.append("")
                rtfx_score.append(0)
        else:
            print(f"❌ File not found in dataset: {filename}")
            transcripts.append("")
            rtfx_score.append(0)

        job_status["completed"] = idx + 1
        job_status["message"] = f"Processing {idx + 1}/{total}"
        job_status["%_completed"] = (idx + 1) * 100 / total
    # Save the results
    df["transcript"] = transcripts
    df["rtfx"] = rtfx_score

    job_status.update({
        "running": False,
        "model": None,
        "completed": None,
        "%_completed": None,
        "message": "No Transcription in progress",
        "total": None,
    })

    # df.to_csv(csv_result, index=False)
    upload_csv(df, csv_transcript)
    print(f"\n📄 Transcripts saved to: {csv_transcript}")
# generateTranscript(ASR_model)
# print(generateResults(ASR_model))
# print(box_plot_data(ASR_model))

# ! FLASK SERVER CODE :-
app = Flask(__name__)
CORS(app, origins="*")

@app.route("/")  # decorator restored; root path assumed from the welcome message
def home():
    return jsonify(
        {
            "message": "Welcome to the ASR Server! Please use the App link to access the ASR-FairBench application.",
            "App link": "https://huggingface.co/spaces/satyamr196/ASR-FairBench",
        }
    )
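# Example (assuming a local dev server on Flask's default port 5000):
#   curl http://localhost:5000/
# returns the welcome message and App link as JSON.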
@app.route("/asr_models")  # decorator restored; path assumed
def asr_models():
    models = [
        "DeepSpeech",
        "Wav2Vec",
        "Jasper",
        "QuartzNet",
        "Conformer",
        "whisper",
        "Kaldi",
        "SpeechBrain",
        "Fairseq S2T",
        "ESPnet",
    ]

    def background_job():
        generateTranscript("openai/whisper-tiny")

    # Start the background job in a separate thread
    threading.Thread(target=background_job).start()
    print("Transcription started in background")

    return jsonify({"asr_models": models})
@app.route("/status")  # decorator restored; path assumed
def get_status():
    return jsonify(job_status)
@app.route("/api")  # decorator restored; path taken from the "endpoint" field below
def api():
    model = request.args.get("ASR_model", default="", type=str)
    # model = re.sub(r"\s+", "", model)
    model = re.sub(r"[^a-zA-Z0-9/_\-.]", "", model)  # sanitize the model ID
    csv_transcript = f'test_with_{model.replace("/", "_")}.csv'
    csv_result = f'test_with_{model.replace("/", "_")}_WER.csv'

    if not model:
        return jsonify({"error": "ASR_model parameter is required"})
    elif not is_valid_asr_model(model):
        # Return 400 if the model ID is invalid
        return jsonify({"message": "Invalid ASR model ID, please check if your model is available on Hugging Face"}), 400
    elif download_csv(csv_transcript) is not None:
        # Load the results CSV from the Hugging Face Hub
        Results = generateResults(model)
        wer_Gender, wer_SEG, wer_Ethnicity, wer_Language = box_plot_data(model)
        return jsonify({
            "message": f"{model} has been evaluated and the results are shown below",
            "endpoint": "/api",
            "model": model,
            "greet": "Welcome to ASR-FairBench",
            **Results,
            "wer_Gender": wer_Gender,
            "wer_SEG": wer_SEG,
            "wer_Ethnicity": wer_Ethnicity,
            "wer_Language": wer_Language,
        })
    else:
        # Check whether `generateTranscript` is already running for another model
        if job_status["running"]:
            return jsonify({
                "message": "Transcription for a previously submitted ASR model is in progress. Please wait for it to complete, then submit your model again.",
                "status": job_status,
            })
        response = jsonify({
            "message": f"Given model {model} is being evaluated. Please come back after a few hours and run the query again; it usually completes within an hour."
        })
        # Start the transcript generation in a separate thread
        # thread = threading.Thread(target=generateTranscript, args=(model,), daemon=True)
        thread = threading.Thread(target=generateTranscript, args=(model,))
        thread.start()
        return response
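# Example (assuming a local dev server on port 5000):
#   curl "http://localhost:5000/api?ASR_model=openai/whisper-tiny"
# The first call kicks off a background evaluation; once the transcript CSV
# exists on the Hub, the same call returns the WER results and box-plot data.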
@app.route("/insert", methods=["POST"])  # decorator restored; path and method assumed
def insert_document():
    try:
        data = request.json  # Get the JSON data from the request
        model_name = data.get("Model")
        csv_filename = "leaderboard.csv"

        # Try to download the leaderboard CSV from the HF dataset
        df = download_csv(csv_filename)
        if df is None:
            # If not found, create a new DataFrame with this single entry
            df = pd.DataFrame([data])
        else:
            # Check whether the model already exists in the leaderboard
            if model_name in df["Model"].values:
                return jsonify({"exists": True})
            # Append the new row
            df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)

        # Upload the updated CSV back to the Hugging Face dataset
        success = upload_csv(df, csv_filename)
        if not success:
            return jsonify({"exists": "Error", "error": "Upload to Hugging Face failed"})

        return jsonify({"exists": False, "message": "Data inserted into leaderboard successfully!"})
    except Exception as e:
        return jsonify({"exists": "Error", "error": str(e)})
# Fetch all leaderboard documents
@app.route("/fetch")  # decorator restored; path assumed
def fetch_documents():
    try:
        csv_filename = "leaderboard.csv"
        df = download_csv(csv_filename)
        if df is None:
            return jsonify({"error": "Leaderboard CSV not found in Hugging Face dataset."})

        documents = df.to_dict(orient="records")  # Convert the DataFrame to a list of dicts
        return jsonify({"data": documents})
    except Exception as e:
        return jsonify({"error": str(e)})
# if __name__ == "__main__":
#     app.run(debug=True)