"""Flask server that runs an ASR transcription job in a background thread.

Endpoints:
    /            - welcome message with a link to the application.
    /asr_models  - returns a list of model names and starts the background
                   transcription job (if none is already running).
    /status      - returns the shared ``job_status`` dictionary as JSON.
"""
import os
import threading
import time

# HF_HOME must be exported BEFORE the Hugging Face libraries are imported:
# they resolve their cache directory when the module is first imported, so
# setting it afterwards does not redirect the cache.
os.environ["HF_HOME"] = "/tmp/huggingface"

import pandas as pd
from datasets import Audio, load_dataset
from flask import Flask, jsonify

# Shared progress record, mutated by the worker thread and served by /status.
job_status = {
    "running": False,
    "model": None,
    "completed": 0,
    "message": "No job running",
}

# Serialises the "is a job running? if not, claim it" decision in /asr_models.
_job_lock = threading.Lock()

csv_path = "test.csv"
output_dir = "/data"

# Loaded once at import time; the job itself re-reads the CSV so it always
# works on a fresh copy.
df = pd.read_csv(csv_path)
print(f"CSV Loaded with {len(df)} rows")


def _transcribe_file(pipe, file_path):
    """Transcribe one audio file and time the inference call.

    Args:
        pipe: The ASR pipeline callable; it is given a dict with ``array``
            and ``sampling_rate`` and must return a mapping with ``"text"``.
        file_path: Path of the audio file to read with ``soundfile``.

    Returns:
        A ``(transcript, rtfx)`` tuple, where ``rtfx`` is the inference time
        divided by the audio duration (0 when the audio is empty).
    """
    import soundfile as sf

    audio_array, sample_rate = sf.read(file_path)
    # soundfile returns (frames, channels) for multi-channel files; the
    # pipeline only accepts single-channel input, so down-mix to mono.
    if audio_array.ndim > 1:
        audio_array = audio_array.mean(axis=1)

    start_time = time.perf_counter()
    result = pipe({"array": audio_array, "sampling_rate": sample_rate})
    elapsed = time.perf_counter() - start_time

    duration = len(audio_array) / sample_rate
    # NOTE(review): this is processing time / audio duration (the classic
    # real-time factor). "RTFx" often denotes the inverse - confirm which
    # one downstream consumers expect before changing it.
    rtfx = elapsed / duration if duration > 0 else 0
    return result["text"], rtfx


def _run_transcription(ASR_model):
    """Do the actual transcription work for ``generateTranscript``."""
    # A model id containing "/" (e.g. "openai/whisper-base") yields a nested
    # path such as <output_dir>/test_with_openai/whisper-base.csv.
    output_csv_path = os.path.join(output_dir, f"test_with_{ASR_model}.csv")

    # Cheap check first: avoid loading the dataset and model when the
    # transcript has already been produced.
    if os.path.exists(output_csv_path):
        print(f"Transcript already exists for model {ASR_model}. Skipping transcription.")
        job_status["running"] = False
        job_status["message"] = f"Transcript already exists for model {ASR_model}."
        return

    # Heavy third-party imports stay local so that importing this module
    # (and starting the web server) does not pay for them.
    import torch
    import tqdm
    from transformers import pipeline

    # Load the dataset without decoding audio; files are read on demand.
    dataset = load_dataset("satyamr196/asr_fairness_audio", split="train")
    dataset = dataset.cast_column("audio", Audio(decode=False))

    frame = pd.read_csv(csv_path)
    print(f"CSV Loaded with {len(frame)} rows")
    total = len(frame)
    job_status["total"] = total

    if torch.cuda.is_available():
        device = 0
        print("Device set to use GPU")
    else:
        device = -1
        print("Device set to use CPU")

    pipe = pipeline("automatic-speech-recognition", model=ASR_model, device=device)

    # The first CSV column holds the audio file names (without extension).
    filename_column = frame.columns[0]
    frame[filename_column] = frame[filename_column].str.strip().str.lower()

    # Map lower-cased file name -> dataset sample (audio still undecoded).
    dataset_map = {
        os.path.basename(sample["audio"]["path"]).lower(): sample
        for sample in dataset
    }

    transcripts = []
    rtfx_score = []

    rows = tqdm.tqdm(frame.iterrows(), total=total)
    # Count positions ourselves instead of trusting the index label.
    for done, (_, row) in enumerate(rows, start=1):
        # f-string (not "+") so a missing/NaN cell becomes an unmatched name
        # instead of raising and aborting the whole job.
        filename = f"{row[filename_column]}.wav"
        sample = dataset_map.get(filename)

        transcript, rtfx = "", 0
        if sample is None:
            print(f"❌ File not found in dataset: {filename}")
        else:
            try:
                transcript, rtfx = _transcribe_file(pipe, sample["audio"]["path"])
                print(f"✅ {filename}: RTFX = {rtfx:.2f}")
            except Exception as e:
                # Best effort: one bad file must not stop the job.
                print(f"❌ Error with {filename}: {e}")
                transcript, rtfx = "", 0

        transcripts.append(transcript)
        rtfx_score.append(rtfx)

        job_status["completed"] = done
        job_status["message"] = f"Processing {done}/{total}"
        job_status["%_completed"] = done * 100 / total

    # Save results next to the original columns.
    frame["transcript"] = transcripts
    frame["rtfx"] = rtfx_score

    os.makedirs(output_dir, exist_ok=True)
    csv_output_dir = os.path.dirname(output_csv_path)
    if not os.path.exists(csv_output_dir):
        os.makedirs(csv_output_dir)
        print(f"Created directory: {csv_output_dir}")

    frame.to_csv(output_csv_path, index=False)

    job_status["running"] = False
    job_status["message"] = "Transcription completed."
    print(f"\n📄 Transcripts saved to: {output_csv_path}")


def generateTranscript(ASR_model):
    """Transcribe every file listed in the CSV with the given ASR model.

    Progress is published through the module-level ``job_status`` dict. The
    result is written to ``<output_dir>/test_with_<ASR_model>.csv`` with two
    added columns, ``transcript`` and ``rtfx``. If that file already exists
    the job is skipped.

    Args:
        ASR_model: Model identifier passed to the ``transformers`` ASR
            pipeline (e.g. ``"openai/whisper-base"``).

    Raises:
        Exception: Any error outside per-file transcription is re-raised
            after ``job_status`` has been marked as failed.
    """
    job_status.update({
        "running": True,
        "model": ASR_model,
        "completed": 0,
        "total": 0,
        "%_completed": 0,
        "message": "Starting transcription...",
    })
    try:
        _run_transcription(ASR_model)
    except Exception as exc:
        # Never leave the status stuck on "running" after a failure.
        job_status["running"] = False
        job_status["message"] = f"Transcription failed: {exc}"
        raise


app = Flask(__name__)


@app.route("/")
def home():
    """Return a welcome message pointing at the application."""
    return jsonify(
        {
            "message": "Welcome to the ASR Server! Please use the App link to access ASR-FairBench application.",
            "App link": "https://github.com/SatyamR196/ASR-FairBench"
        }
    )


@app.route("/asr_models")
def asr_models():
    """Return the model list and start the background job if none is running."""
    models = [
        "DeepSpeech",
        "Wav2Vec",
        "Jasper",
        "QuartzNet",
        "Conformer",
        "whisper",
        "Kaldi",
        "SpeechBrain",
        "Fairseq S2T",
        "ESPnet"
    ]

    def background_job():
        generateTranscript("openai/whisper-base")

    # Claim the job slot atomically so repeated requests cannot start
    # several transcription threads that overwrite each other's status.
    with _job_lock:
        already_running = job_status["running"]
        if not already_running:
            job_status["running"] = True

    if already_running:
        print("Transcription already running; not starting another job")
    else:
        # Start the background job in a separate thread.
        threading.Thread(target=background_job).start()
        print("Transcription started in background")

    return jsonify({"asr_models": models})


@app.route("/status")
def get_status():
    """Return the current job status as JSON."""
    return jsonify(job_status)

# if __name__ == "__main__":
#     app.run(debug=True)