Spaces: Running on Zero

fix model_type error

app.py CHANGED
@@ -1,9 +1,10 @@
-# app.py
-import spaces
 import re
 import torch
 import gradio as gr
+import spaces  # zeroGPU support
 from transformers import pipeline
+from funasr import AutoModel
+from funasr.utils.postprocess_utils import rich_transcription_postprocess
 
 # List of Whisper model IDs
 WHISPER_MODELS = [
@@ -13,30 +14,7 @@ WHISPER_MODELS = [
     "openai/whisper-small",
     "openai/whisper-medium",
     "openai/whisper-base",
-
-    "Jingmiao/whisper-small-zh_tw",
-    "DDTChen/whisper-medium-zh-tw",
-    "kimbochen/whisper-small-zh-tw",
-    "ChrisTorng/whisper-large-v3-turbo-common_voice_19_0-zh-TW-ct2",
-    "JacobLinCool/whisper-large-v3-turbo-zh-TW-clean-1",
-    "JunWorks/whisper-small-zhTW",
-    "WANGTINGTING/whisper-large-v2-zh-TW-vol2",
-    "xmzhu/whisper-tiny-zh-TW",
-    "ingrenn/whisper-small-common-voice-13-zh-TW",
-    "jun-han/whisper-small-zh-TW",
-    "xmzhu/whisper-tiny-zh-TW-baseline",
-    "JacobLinCool/whisper-large-v3-turbo-common_voice_16_1-zh-TW-2",
-    "JacobLinCool/whisper-large-v3-common_voice_19_0-zh-TW-full-1",
-    "momo103197/whisper-small-zh-TW-mix",
-    "JacobLinCool/whisper-large-v3-turbo-zh-TW-clean-1-merged",
-    "JacobLinCool/whisper-large-v2-common_voice_19_0-zh-TW-full-1",
-    "kimas1269/whisper-meduim_zhtw",
-    "JunWorks/whisper-base-zhTW",
-    "JunWorks/whisper-small-zhTW-frozenDecoder",
-    "sandy1990418/whisper-large-v3-turbo-zh-tw",
-    "JacobLinCool/whisper-large-v3-turbo-common_voice_16_1-zh-TW-pissa-merged",
-    "momo103197/whisper-small-zh-TW-16",
-    "k1nto/Belle-whisper-large-v3-zh-punct-ct2"
+    # ... additional multilingual Whisper variants
 ]
 
 # List of SenseVoice model IDs
@@ -48,62 +26,75 @@ SENSEVOICE_MODELS = [
     "apinge/sensevoice-small"
 ]
 
-# Cache pipelines
-…
+# Cache Whisper pipelines
+whisper_pipes = {}
+# Cache SenseVoice models
+sense_models = {}
 
-def …
-    if model_id not in …
-    # run on GPU if available
+def get_whisper_pipe(model_id):
+    if model_id not in whisper_pipes:
         device = 0 if torch.cuda.is_available() else -1
-…
-…
+        whisper_pipes[model_id] = pipeline(
+            "automatic-speech-recognition",
+            model=model_id,
+            device=device
+        )
+    return whisper_pipes[model_id]
+
 
+def get_sense_model(model_id):
+    if model_id not in sense_models:
+        device_str = "cuda:0" if torch.cuda.is_available() else "cpu"
+        sense_models[model_id] = AutoModel(
+            model=model_id,
+            vad_model="fsmn-vad",
+            vad_kwargs={"max_single_segment_time": 30000},
+            device=device_str,
+            hub="hf",
+        )
+    return sense_models[model_id]
+
+# Decorate with @spaces.GPU to allocate GPU only during transcription
 @spaces.GPU
 def transcribe(whisper_model, sense_model, audio_path, enable_punct):
-    # …
-…
-…
-    text_whisper = …
+    # Whisper transcription
+    pipe = get_whisper_pipe(whisper_model)
+    out = pipe(audio_path)
+    text_whisper = out.get("text", "").strip()
 
-    # …
-…
-…
-…
+    # SenseVoice transcription using FunASR
+    model = get_sense_model(sense_model)
+    res = model.generate(
+        input=audio_path,
+        cache={},
+        language="auto",
+        use_itn=True,
+        batch_size_s=60,
+        merge_vad=True,
+        merge_length_s=15,
+    )
+    text_sense = rich_transcription_postprocess(res[0]["text"])  # apply punctuation/normalization
 
-    # …
+    # Strip punctuation if disabled
     if not enable_punct:
         text_sense = re.sub(r"[^\w\s]", "", text_sense)
 
     return text_whisper, text_sense
 
-…
-…
+# Gradio UI setup
+demo = gr.Blocks()
+with demo:
+    gr.Markdown("## Whisper vs. FunASR SenseVoice Comparison (ZeroGPU Enabled)")
     with gr.Row():
-        whisper_dd = gr.Dropdown(
-            choices=WHISPER_MODELS,
-            value=WHISPER_MODELS[0],
-            label="Whisper Model"
-        )
-        sense_dd = gr.Dropdown(
-            choices=SENSEVOICE_MODELS,
-            value=SENSEVOICE_MODELS[0],
-            label="SenseVoice Model"
-        )
+        whisper_dd = gr.Dropdown(choices=WHISPER_MODELS, value=WHISPER_MODELS[0], label="Whisper Model")
+        sense_dd = gr.Dropdown(choices=SENSEVOICE_MODELS, value=SENSEVOICE_MODELS[0], label="SenseVoice Model")
     punct = gr.Checkbox(label="Enable Punctuation (SenseVoice)", value=True)
-    audio_in = gr.Audio(
-        sources=["upload","microphone"],
-        type="filepath",
-        label="Upload or Record Audio"
-    )
+    audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload or Record Audio")
     with gr.Row():
         out_whisper = gr.Textbox(label="Whisper Transcript")
         out_sense = gr.Textbox(label="SenseVoice Transcript")
     btn = gr.Button("Transcribe")
-    btn.click(
-        fn=transcribe,
-        inputs=[whisper_dd, sense_dd, audio_in, punct],
-        outputs=[out_whisper, out_sense]
-    )
+    btn.click(fn=transcribe, inputs=[whisper_dd, sense_dd, audio_input, punct], outputs=[out_whisper, out_sense])
 
 if __name__ == "__main__":
     demo.launch()
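The punctuation toggle relies on Python's Unicode-aware re module: \w matches CJK ideographs as word characters, so re.sub(r"[^\w\s]", "", text) strips both ASCII and full-width punctuation while leaving Chinese and English text intact. A quick check (the sample string is illustrative):

    import re

    text = "你好，世界！Hello, world."
    # \w keeps CJK ideographs and ASCII letters; "，", "！", "," and "." are stripped
    print(re.sub(r"[^\w\s]", "", text))  # -> 你好世界Hello world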
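To sanity-check the new FunASR path without the Gradio UI, transcribe can be called directly; outside a ZeroGPU Space the @spaces.GPU decorator is expected to act as a passthrough. A minimal sketch, assuming the file above is saved as app.py, its dependencies are installed, and a short local clip sample.wav (hypothetical) exists:

    # Local smoke test for the updated transcribe() function.
    # Importing app builds the Blocks UI but does not launch it,
    # since app.py only calls demo.launch() under __main__.
    from app import transcribe, WHISPER_MODELS, SENSEVOICE_MODELS

    whisper_text, sense_text = transcribe(
        WHISPER_MODELS[0],       # first Whisper checkpoint in the list
        SENSEVOICE_MODELS[0],    # first SenseVoice checkpoint in the list
        "sample.wav",            # hypothetical local test clip
        enable_punct=True,
    )
    print("Whisper:   ", whisper_text)
    print("SenseVoice:", sense_text)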