from flask import Flask, jsonify, request
from flask_cors import CORS
from datasets import load_dataset, Audio
import pandas as pd
import os
import re
import threading
from dotenv import load_dotenv
from datetime import datetime
import pytz
from utils.load_csv import upload_csv, download_csv
from utils.generate_results import generateResults
from utils.generate_box_plot import box_plot_data
from utils.model_validity import is_valid_asr_model
from utils.send_email import send_email

# Set the cache directory for Hugging Face datasets
os.environ["HF_HOME"] = "/tmp/huggingface"
# ASR_model = "openai/whisper-tiny"  # Replace with your ASR model
# Quick CPU benchmark (lower is faster): time five runs of summing a million ints
import timeit
cpu_score = timeit.timeit("sum(range(1000000))", number=5)
print(f"🧠 CPU benchmark score: {cpu_score:.2f}")


# Shared, mutable job status: updated by the background transcription thread
# and exposed via the /status endpoint.
job_status = {
    "running": False,
    "model": None,
    "completed": None,
    "%_completed": None,
    "message": "No Transcription in progress",
    "total": None
}
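
# Small helper so the idle-status reset is not duplicated at every exit path.
def reset_job_status():
    job_status.update({
        "running": False,
        "model": None,
        "completed": None,
        "%_completed": None,
        "message": "No Transcription in progress",
        "total": None
    })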

csv_path = "test.csv"
# csv_transcript = f'test_with_{ASR_model.replace("/", "_")}.csv'
# csv_result = f'test_with_{ASR_model.replace("/","_")}_WER.csv'
df = pd.read_csv(csv_path)
print(f"CSV Loaded with {len(df)} rows")

# Load the dataset without decoding audio; Audio(decode=False) keeps only file
# metadata, so each clip is read from disk lazily during transcription.
dataset = load_dataset("satyamr196/asr_fairness_audio", split="train")
dataset = dataset.cast_column("audio", Audio(decode=False))
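# With decode=False each sample carries file metadata only, e.g. (illustrative):
#   sample["audio"] -> {"path": "/.../xyz.wav", "bytes": ...}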

def generateTranscript(ASR_model):
    # Heavy dependencies are imported lazily so the server starts up quickly;
    # os and pandas are already imported at module level.
    import time
    import tqdm
    import soundfile as sf
    from transformers import pipeline

    job_status.update({
        "running": True,
        "model": ASR_model,
        "completed": 0,
        "%_completed": 0,
        "message": "Starting transcription...",
        "total": None
    })

    csv_transcript = f'test_with_{ASR_model.replace("/", "_")}.csv'
    csv_result = f'test_with_{ASR_model.replace("/","_")}_WER.csv'
    # Check if transcript already exists
    df_transcript = download_csv(csv_transcript)
    if df_transcript is None:
        print("Transcript CSV not found in the dataset repo. Proceeding to generate transcript.")
        # Get current IST time
        ist = pytz.timezone("Asia/Kolkata")
        current_time = datetime.now(ist).strftime("%H:%M:%S %d %b %Y")
        send_email(
            to_email="raianand.1991@gmail.com",
            subject=f"Audit Started for ASR model {ASR_model}",
            message_body=f"Audit started at {current_time} for ASR model {ASR_model}.",
            bcc_emails=["pedanticsatoshi0@getsafesurfer.com"]
        )
    else:
        print(f"Transcript already exists for previously submitted model. Skipping transcription.")
        job_status.update({
            "running": False,
            "model": None,
            "completed": None, 
            "%_completed" : None,
            "message": "No Transcription in progress",
            "total": None
        })
        return

    # test.csv is already loaded into the module-level df at startup; this
    # background thread adds result columns to that DataFrame in place.

    total = len(df)
    job_status["total"] = total
    
    # Initialize ASR pipeline
    pipe = pipeline("automatic-speech-recognition", model=ASR_model)

    # Column with filenames in the CSV
    filename_column = df.columns[0]
    df[filename_column] = df[filename_column].str.strip().str.lower()

    # Build a map from lowercase filename -> dataset sample (audio still undecoded)
    dataset_map = {
        os.path.basename(sample["audio"]["path"]).lower(): sample
        for sample in dataset
    }

    transcripts = []
    rtfx_score = []

    for idx, row in tqdm.tqdm(df.iterrows(), total=len(df)):
        filename = row[filename_column] + ".wav"

        if filename in dataset_map:
            sample = dataset_map[filename]
            try:
                # Decode audio only when needed
                file_path = sample["audio"]["path"]
                audio_array, sample_rate = sf.read(file_path)

                start_time = time.time()
                result = pipe({"array": audio_array, "sampling_rate": sample_rate})

                end_time = time.time()

                transcript = result["text"]
                duration = len(audio_array) / sample_rate
                rtfx = (end_time - start_time) / duration if duration > 0 else 0

                transcripts.append(transcript)
                rtfx_score.append(rtfx)

                print(f"βœ… {filename}: RTFX = {rtfx:.2f}, Progress: {(idx + 1) * 100 / total} %")

            except Exception as e:
                print(f"❌ Error with {filename}: {e}")
                transcripts.append("")
                rtfx_score.append(0)
        else:
            print(f"❌ File not found in dataset: {filename}")
            transcripts.append("")
            rtfx_score.append(0)

        job_status["completed"] = idx + 1
        job_status["message"] = f"Processing {idx + 1}/{total}"
        job_status["%_completed"] = (idx + 1) * 100 / total

    # Save results
    df["transcript"] = transcripts
    df["rtfx"] = rtfx_score

    # Upload results to the HF dataset repo before marking the job idle, so a
    # concurrent request cannot start a duplicate job for the same model.
    upload_csv(df, csv_transcript)
    print(f"\n📄 Transcripts saved to: {csv_transcript}")
    reset_job_status()


# generateTranscript(ASR_model)
# print(generateResults(ASR_model))
# print(box_plot_data(ASR_model))

# ! FLASK SERVER CODE :-

app = Flask(__name__)
CORS(app,origins="*")

@app.route("/")
def home():
    return jsonify(
        {
            "message": "Welcome to the ASR Server! Please use the App link to access ASR-FairBench application.",
            "App link": "https://huggingface.co/spaces/satyamr196/ASR-FairBench"
        }
    )

@app.route("/asr_models")
def asr_models():
    models = [
        "DeepSpeech",
        "Wav2Vec",
        "Jasper",
        "QuartzNet",
        "Conformer",
        "whisper",
        "Kaldi",
        "SpeechBrain",
        "Fairseq S2T",
        "ESPnet"
    ]

    def background_job():
        generateTranscript("openai/whisper-tiny")

    # Start the job in a background thread so the request returns immediately;
    # presumably a warm-up run for the default whisper-tiny model.
    threading.Thread(target=background_job).start()
    print("Transcription started in background")
    return jsonify({"asr_models": models})

@app.route("/status")
def get_status():
    return jsonify(job_status)
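
# Example poll (hypothetical host/port; values illustrative):
#   curl http://localhost:5000/status
#   -> {"running": true, "model": "...", "completed": 42, "%_completed": ...,
#       "message": "Processing 42/...", "total": ...}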

@app.route('/api', methods=['GET'])
def api():
    model = request.args.get('ASR_model', default="", type=str)
    model = re.sub(r"[^a-zA-Z0-9/_\-.]", "", model)  # sanitize the model ID
    if not model:
        return jsonify({'error': 'ASR_model parameter is required'}), 400
    csv_transcript = f'test_with_{model.replace("/","_")}.csv'
    csv_result = f'test_with_{model.replace("/","_")}_WER.csv'
    if not is_valid_asr_model(model):
        return jsonify({'message': 'Invalid ASR model ID, please check if your model is available on Hugging Face'}), 400  # 400 if model is invalid
    elif download_csv(csv_transcript) is not None:
        # Load the CSV file from the Hugging Face Hub
        Results = generateResults(model)
        wer_Gender, wer_SEG, wer_Ethnicity, wer_Language = box_plot_data(model)

        return jsonify({
            'message': f'{model} has been evaluated and results are shown below',
            'endpoint': "/api",
            'model': model,
            'greet': "Welcome to ASR-FairBench",
            **Results,
            'wer_Gender': wer_Gender,
            'wer_SEG': wer_SEG,
            'wer_Ethnicity': wer_Ethnicity,
            'wer_Language': wer_Language
        })
    else:
        # A transcription job is already running (job_status is shared across
        # requests), so ask the user to retry once it finishes.
        if job_status["running"]:
            return jsonify({
                'message': 'Transcription for a previously submitted ASR model is in progress. Please wait for it to complete, then submit your model again.',
                'status': job_status
            })

        response = jsonify({
            'message': f'Model {model} is being evaluated. Please run the query again later; evaluation usually completes within an hour.'
        })

        # Run generateTranscript(model) in a background thread so this
        # request can return immediately.
        thread = threading.Thread(target=generateTranscript, args=(model,))
        thread.start()

        return response
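
# Example query (hypothetical host/port):
#   curl "http://localhost:5000/api?ASR_model=openai/whisper-tiny"
# The first call starts a background evaluation; once the transcript CSV
# exists, the same call returns the fairness metrics and box-plot data.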

@app.route("/insert", methods=["POST"])
def insert_document():
    try:
        data = request.json  # Get JSON data from request
        model_name = data.get("Model")
        if not model_name:
            return jsonify({"exists": "Error", "error": "'Model' field is required"})
        csv_filename = "leaderboard.csv"

        # Try to download the leaderboard CSV from HF dataset
        df = download_csv(csv_filename)

        if df is None:
            # If not found, create a new DataFrame with this single entry
            df = pd.DataFrame([data])
        else:
            # Check if the model already exists in leaderboard
            if model_name in df["Model"].values:
                return jsonify({"exists": True})
            # Append the new row
            df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)

        # Upload the updated CSV back to the Hugging Face dataset
        success = upload_csv(df, csv_filename)
        if not success:
            return jsonify({"exists": "Error", "error": "Upload to Hugging Face failed"})

        return jsonify({"exists": False, "message": "Data inserted into leaderboard successfully!"})

    except Exception as e:
        return jsonify({"exists": "Error", "error": str(e)})

# Fetch all documents
@app.route("/fetch", methods=["GET"])
def fetch_documents():
    try:
        csv_filename = "leaderboard.csv"
        df = download_csv(csv_filename)

        if df is None:
            return jsonify({"error": "Leaderboard CSV not found in Hugging Face dataset."})

        documents = df.to_dict(orient="records")  # Convert DataFrame to list of dicts
        return jsonify({"data": documents})

    except Exception as e:
        return jsonify({"error": str(e)})

# Local development entry point; in deployment the app is presumably served by
# the hosting platform's WSGI server, so this guard stays false there.
if __name__ == "__main__":
    app.run(debug=True)