from flask import Flask, jsonify, request
from flask_cors import CORS
from datasets import load_dataset, Audio
import pandas as pd
import os
import re
import threading
from dotenv import load_dotenv
from datetime import datetime
import pytz
from utils.load_csv import upload_csv, download_csv
from utils.generate_results import generateResults
from utils.generate_box_plot import box_plot_data
from utils.model_validity import is_valid_asr_model
from utils.send_email import send_email

# Set the cache directory for Hugging Face datasets
os.environ["HF_HOME"] = "/tmp/huggingface"
# ASR_model = "openai/whisper-tiny"  # Replace with your ASR model
# Quick CPU benchmark (lower is faster): time five runs of summing a million ints
import timeit
cpu_score = timeit.timeit("sum(range(1000000))", number=5)
print(f"🧠 CPU benchmark score: {cpu_score:.2f}")


# Shared, mutable job status: updated by the background transcription thread
# and exposed via the /status endpoint.
job_status = {
    "running": False,
    "model": None,
    "completed": None,
    "%_completed": None,
    "message": "No Transcription in progress",
    "total": None
}
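
# Small helper so the idle-status reset is not duplicated at every exit path.
def reset_job_status():
    job_status.update({
        "running": False,
        "model": None,
        "completed": None,
        "%_completed": None,
        "message": "No Transcription in progress",
        "total": None
    })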

csv_path = "test.csv"
# csv_transcript = f'test_with_{ASR_model.replace("/", "_")}.csv'
# csv_result = f'test_with_{ASR_model.replace("/","_")}_WER.csv'
df = pd.read_csv(csv_path)
print(f"CSV Loaded with {len(df)} rows")

# Load the dataset without decoding audio; Audio(decode=False) keeps only file
# metadata, so each clip is read from disk lazily during transcription.
dataset = load_dataset("satyamr196/asr_fairness_audio", split="train")
dataset = dataset.cast_column("audio", Audio(decode=False))
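# With decode=False each sample carries file metadata only, e.g. (illustrative):
#   sample["audio"] -> {"path": "/.../xyz.wav", "bytes": ...}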

def generateTranscript(ASR_model):
    # Heavy dependencies are imported lazily so the server starts up quickly;
    # os and pandas are already imported at module level.
    import time
    import tqdm
    import soundfile as sf
    from transformers import pipeline

    job_status.update({
        "running": True,
        "model": ASR_model,
        "completed": 0,
        "%_completed": 0,
        "message": "Starting transcription...",
        "total": None
    })

    csv_transcript = f'test_with_{ASR_model.replace("/", "_")}.csv'
    csv_result = f'test_with_{ASR_model.replace("/","_")}_WER.csv'
    # Check if transcript already exists
    df_transcript = download_csv(csv_transcript)
    if df_transcript is None:
        print("Transcript CSV not found in the dataset repo. Proceeding to generate transcript.")
        # Get current IST time
        ist = pytz.timezone("Asia/Kolkata")
        current_time = datetime.now(ist).strftime("%H:%M:%S %d %b %Y")
        send_email(
            to_email="raianand.1991@gmail.com",
            subject=f"Audit Started for ASR model {ASR_model}",
            message_body=f"Audit started at {current_time} for ASR model {ASR_model}.",
            bcc_emails=["pedanticsatoshi0@getsafesurfer.com"]
        )
    else:
        print(f"Transcript already exists for previously submitted model. Skipping transcription.")
        job_status.update({
            "running": False,
            "model": None,
            "completed": None, 
            "%_completed" : None,
            "message": "No Transcription in progress",
            "total": None
        })
        return

    # test.csv is already loaded into the module-level df at startup; this
    # background thread adds result columns to that DataFrame in place.

    total = len(df)
    job_status["total"] = total
    
    # Initialize ASR pipeline
    pipe = pipeline("automatic-speech-recognition", model=ASR_model)

    # Column with filenames in the CSV
    filename_column = df.columns[0]
    df[filename_column] = df[filename_column].str.strip().str.lower()

    # Build a map from lowercase filename -> dataset sample (audio still undecoded)
    dataset_map = {
        os.path.basename(sample["audio"]["path"]).lower(): sample
        for sample in dataset
    }

    transcripts = []
    rtfx_score = []

    for idx, row in tqdm.tqdm(df.iterrows(), total=len(df)):
        filename = row[filename_column] + ".wav"

        if filename in dataset_map:
            sample = dataset_map[filename]
            try:
                # Decode audio only when needed
                file_path = sample["audio"]["path"]
                audio_array, sample_rate = sf.read(file_path)

                start_time = time.time()
                result = pipe({"array": audio_array, "sampling_rate": sample_rate})

                end_time = time.time()

                transcript = result["text"]
                duration = len(audio_array) / sample_rate
                rtfx = (end_time - start_time) / duration if duration > 0 else 0

                transcripts.append(transcript)
                rtfx_score.append(rtfx)

                print(f"βœ… {filename}: RTFX = {rtfx:.2f}, Progress: {(idx + 1) * 100 / total} %")

            except Exception as e:
                print(f"❌ Error with {filename}: {e}")
                transcripts.append("")
                rtfx_score.append(0)
        else:
            print(f"❌ File not found in dataset: {filename}")
            transcripts.append("")
            rtfx_score.append(0)

        job_status["completed"] = idx + 1
        job_status["message"] = f"Processing {idx + 1}/{total}"
        job_status["%_completed"] = (idx + 1) * 100 / total

    # Save results
    df["transcript"] = transcripts
    df["rtfx"] = rtfx_score

    # Upload results to the HF dataset repo before marking the job idle, so a
    # concurrent request cannot start a duplicate job for the same model.
    upload_csv(df, csv_transcript)
    print(f"\n📄 Transcripts saved to: {csv_transcript}")
    reset_job_status()


# generateTranscript(ASR_model)
# print(generateResults(ASR_model))
# print(box_plot_data(ASR_model))

# ! FLASK SERVER CODE :-

app = Flask(__name__)
CORS(app,origins="*")

@app.route("/")
def home():
    return jsonify(
        {
            "message": "Welcome to the ASR Server! Please use the App link to access ASR-FairBench application.",
            "App link": "https://huggingface.co/spaces/satyamr196/ASR-FairBench"
        }
    )

@app.route("/asr_models")
def asr_models():
    models = [
        "DeepSpeech",
        "Wav2Vec",
        "Jasper",
        "QuartzNet",
        "Conformer",
        "whisper",
        "Kaldi",
        "SpeechBrain",
        "Fairseq S2T",
        "ESPnet"
    ]

    def background_job():
        generateTranscript("openai/whisper-tiny")

    # Start the job in a background thread so the request returns immediately;
    # presumably a warm-up run for the default whisper-tiny model.
    threading.Thread(target=background_job).start()
    print("Transcription started in background")
    return jsonify({"asr_models": models})

@app.route("/status")
def get_status():
    return jsonify(job_status)
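
# Example poll (hypothetical host/port; values illustrative):
#   curl http://localhost:5000/status
#   -> {"running": true, "model": "...", "completed": 42, "%_completed": ...,
#       "message": "Processing 42/...", "total": ...}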

@app.route('/api', methods=['GET'])
def api():
    model = request.args.get('ASR_model', default="", type=str)
    model = re.sub(r"[^a-zA-Z0-9/_\-.]", "", model)  # sanitize the model ID
    if not model:
        return jsonify({'error': 'ASR_model parameter is required'}), 400
    csv_transcript = f'test_with_{model.replace("/","_")}.csv'
    csv_result = f'test_with_{model.replace("/","_")}_WER.csv'
    if not is_valid_asr_model(model):
        return jsonify({'message': 'Invalid ASR model ID, please check if your model is available on Hugging Face'}), 400  # 400 if model is invalid
    elif download_csv(csv_transcript) is not None:
        # Load the CSV file from the Hugging Face Hub
        Results = generateResults(model)
        wer_Gender, wer_SEG, wer_Ethnicity, wer_Language = box_plot_data(model)

        return jsonify({
            'message': f'{model} has been evaluated and results are shown below',
            'endpoint': "/api",
            'model': model,
            'greet': "Welcome to ASR-FairBench",
            **Results,
            'wer_Gender': wer_Gender,
            'wer_SEG': wer_SEG,
            'wer_Ethnicity': wer_Ethnicity,
            'wer_Language': wer_Language
        })
    else:
        # A transcription job is already running (job_status is shared across
        # requests), so ask the user to retry once it finishes.
        if job_status["running"]:
            return jsonify({
                'message': 'Transcription for a previously submitted ASR model is in progress. Please wait for it to complete, then submit your model again.',
                'status': job_status
            })

        response = jsonify({
            'message': f'Model {model} is being evaluated. Please run the query again later; evaluation usually completes within an hour.'
        })

        # Run generateTranscript(model) in a background thread so this
        # request can return immediately.
        thread = threading.Thread(target=generateTranscript, args=(model,))
        thread.start()

        return response
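
# Example query (hypothetical host/port):
#   curl "http://localhost:5000/api?ASR_model=openai/whisper-tiny"
# The first call starts a background evaluation; once the transcript CSV
# exists, the same call returns the fairness metrics and box-plot data.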

@app.route("/insert", methods=["POST"])
def insert_document():
    try:
        data = request.json  # Get JSON data from request
        model_name = data.get("Model")
        if not model_name:
            return jsonify({"exists": "Error", "error": "'Model' field is required"})
        csv_filename = "leaderboard.csv"

        # Try to download the leaderboard CSV from HF dataset
        df = download_csv(csv_filename)

        if df is None:
            # If not found, create a new DataFrame with this single entry
            df = pd.DataFrame([data])
        else:
            # Check if the model already exists in leaderboard
            if model_name in df["Model"].values:
                return jsonify({"exists": True})
            # Append the new row
            df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)

        # Upload the updated CSV back to the Hugging Face dataset
        success = upload_csv(df, csv_filename)
        if not success:
            return jsonify({"exists": "Error", "error": "Upload to Hugging Face failed"})

        return jsonify({"exists": False, "message": "Data inserted into leaderboard successfully!"})

    except Exception as e:
        return jsonify({"exists": "Error", "error": str(e)})

# Fetch all documents
@app.route("/fetch", methods=["GET"])
def fetch_documents():
    try:
        csv_filename = "leaderboard.csv"
        df = download_csv(csv_filename)

        if df is None:
            return jsonify({"error": "Leaderboard CSV not found in Hugging Face dataset."})

        documents = df.to_dict(orient="records")  # Convert DataFrame to list of dicts
        return jsonify({"data": documents})

    except Exception as e:
        return jsonify({"error": str(e)})

# Local development entry point; in deployment the app is presumably served by
# the hosting platform's WSGI server, so this guard stays false there.
if __name__ == "__main__":
    app.run(debug=True)