satyamr196 commited on
Commit
16b9706
·
1 Parent(s): 62021f3

1) Modified the code to accommodate Microsoft ASR models as well, which require their own custom code to run — hence set trust_remote_code=True in the pipeline. 2) Added extra error handling. 3) The pipeline is now loaded in the main thread instead of a background thread.

Browse files
Files changed (1) hide show
  1. ASR_Server.py +48 -11
ASR_Server.py CHANGED
@@ -43,7 +43,7 @@ dataset = load_dataset("satyamr196/asr_fairness_audio", split="train")
43
  # dataset = dataset.with_format("python", decode_audio=False)
44
  dataset = dataset.cast_column("audio", Audio(decode=False))
45
 
46
- def generateTranscript(ASR_model):
47
  import os
48
  import time
49
  import tqdm
@@ -94,8 +94,10 @@ def generateTranscript(ASR_model):
94
  total = len(df)
95
  job_status["total"] = total
96
 
 
 
97
  # Initialize ASR pipeline
98
- pipe = pipeline("automatic-speech-recognition", model=ASR_model)
99
 
100
  # Column with filenames in the CSV
101
  filename_column = df.columns[0]
@@ -213,6 +215,8 @@ def get_status():
213
 
214
  @app.route('/api', methods=['GET'])
215
  def api():
 
 
216
  model = request.args.get('ASR_model', default="", type=str)
217
  # model = re.sub(r"\s+", "", model)
218
  model = re.sub(r"[^a-zA-Z0-9/_\-.]", "", model) # sanitize the model ID
@@ -246,17 +250,50 @@ def api():
246
  'status': job_status
247
  })
248
 
249
- response = jsonify({
250
- 'message': f'Given Model {model} is being Evaluated, Please come back after a few hours and run the query again. Usually, it completes within an hour'
251
- })
252
-
253
- # Run `generateTranscript(model)` in a separate thread
254
- # Start the transcript generation in a separate thread
255
- # thread = threading.Thread(target=generateTranscript, args=(model,), daemon=True)
256
- thread = threading.Thread(target=generateTranscript, args=(model,))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
  thread.start()
 
 
 
 
 
 
 
 
258
 
259
- return response
 
 
 
260
 
261
  @app.route("/insert", methods=["POST"])
262
  def insert_document():
 
43
  # dataset = dataset.with_format("python", decode_audio=False)
44
  dataset = dataset.cast_column("audio", Audio(decode=False))
45
 
46
+ def generateTranscript(ASR_model, pipe=None):
47
  import os
48
  import time
49
  import tqdm
 
94
  total = len(df)
95
  job_status["total"] = total
96
 
97
+ if pipe is None:
98
+ pipe = pipeline("automatic-speech-recognition", model=ASR_model, trust_remote_code=True)
99
  # Initialize ASR pipeline
100
+ # pipe = pipeline("automatic-speech-recognition", model=ASR_model, trust_remote_code=True)
101
 
102
  # Column with filenames in the CSV
103
  filename_column = df.columns[0]
 
215
 
216
  @app.route('/api', methods=['GET'])
217
  def api():
218
+ from transformers import pipeline
219
+
220
  model = request.args.get('ASR_model', default="", type=str)
221
  # model = re.sub(r"\s+", "", model)
222
  model = re.sub(r"[^a-zA-Z0-9/_\-.]", "", model) # sanitize the model ID
 
250
  'status': job_status
251
  })
252
 
253
+ try:
254
+ print(f"⏳ Loading model {model} in main thread...")
255
+ pipe = pipeline("automatic-speech-recognition", model=model, trust_remote_code=True)
256
+ except Exception as e:
257
+ return jsonify({
258
+ "error": f"Model load failed: {str(e)}",
259
+ "message": f"Model load failed: {str(e)}"
260
+ }), 500
261
+
262
+ def thread_wrapper(model, pipe):
263
+ try:
264
+ job_status["running"] = True
265
+ job_status["error"] = None
266
+ job_status["model"] = model
267
+ generateTranscript(model, pipe)
268
+ job_status["running"] = False
269
+ # return jsonify({
270
+ # 'message': f'Given Model {model} is being Evaluated, Please come back after a few hours and run the query again. Usually, it completes within an hour'
271
+ # }),200
272
+ except Exception as e:
273
+ print(f"❌ Background transcription for {model} failed:", e)
274
+ job_status["running"] = False
275
+ job_status["error"] = str(e)
276
+ # return jsonify({
277
+ # "error": f"Background transcription failed: {str(e)}",
278
+ # "message": f"Background transcription failed: {str(e)}"
279
+ # }), 500
280
+
281
+ # Then use:
282
+ thread = threading.Thread(target=thread_wrapper, args=(model, pipe), daemon=True)
283
  thread.start()
284
+ # thread = threading.Thread(target=generateTranscript, args=(model,), daemon=True)
285
+ # thread = threading.Thread(target=generateTranscript, args=(model,pipe))
286
+ # thread.start()
287
+ if job_status.get("error"):
288
+ return jsonify({
289
+ 'message': f'❌transcription for model "{job_status.get("model")}" failed.',
290
+ 'error': job_status["error"]
291
+ }), 500
292
 
293
+ return jsonify({
294
+ 'message': f'Given Model {model} is being Evaluated. Please come back after a few hours and run the query again.',
295
+ 'status': job_status
296
+ }), 202
297
 
298
  @app.route("/insert", methods=["POST"])
299
  def insert_document():