Luigi committed on
Commit a8b76d6 · 1 Parent(s): fe1810a

allow user to enable / disable multilingual feature on whisper

Files changed (1)
  1. app.py +9 -6
app.py CHANGED
@@ -143,7 +143,7 @@ def get_diarization_pipe():
 
 
 # —————— Whisper Transcription ——————
-def _transcribe_fwhisper_cpu_stream(model_id, language, audio_path):
+def _transcribe_fwhisper_cpu_stream(model_id, language, audio_path, whisper_multilingual_en):
     """
     Generator-based streaming transcription with accumulation using Faster-Whisper on CPU.
     Yields (accumulated_text, diar_html) tuples for Gradio streaming.
@@ -172,6 +172,7 @@ def _transcribe_fwhisper_cpu_stream(model_id, language, audio_path):
         language=None if language == "auto" else language,
         vad_filter=True,
         batch_size=16,
+        multilingual=whisper_multilingual_en,
     )
     os.unlink(tmp.name)
     text = converter.convert("".join(s.text for s in segments).strip())
@@ -182,7 +183,7 @@ def _transcribe_fwhisper_cpu_stream(model_id, language, audio_path):
 
 
 @spaces.GPU
-def _transcribe_fwhisper_gpu_stream(model_id, language, audio_path):
+def _transcribe_fwhisper_gpu_stream(model_id, language, audio_path, whisper_multilingual_en):
     """
     Generator-based streaming transcription with accumulation using Faster-Whisper on CUDA.
     Yields (accumulated_text, diar_html) tuples for Gradio streaming.
@@ -212,6 +213,7 @@ def _transcribe_fwhisper_gpu_stream(model_id, language, audio_path):
         language=None if language == "auto" else language,
         vad_filter=True,
         batch_size=16,
+        multilingual=whisper_multilingual_en,
     )
     os.unlink(tmp.name)
     text = converter.convert("".join(s.text for s in segments).strip())
@@ -219,12 +221,12 @@ def _transcribe_fwhisper_gpu_stream(model_id, language, audio_path):
     yield "", format_diarization_html(snippets)
     return
 
-def transcribe_fwhisper_stream(model_id, language, audio_path, device_sel):
+def transcribe_fwhisper_stream(model_id, language, audio_path, device_sel, whisper_multilingual_en):
     """Dispatch to CPU or GPU streaming generators, preserving two-value yields."""
     if device_sel == "GPU" and torch.cuda.is_available():
-        yield from _transcribe_fwhisper_gpu_stream(model_id, language, audio_path)
+        yield from _transcribe_fwhisper_gpu_stream(model_id, language, audio_path, whisper_multilingual_en)
     else:
-        yield from _transcribe_fwhisper_cpu_stream(model_id, language, audio_path)
+        yield from _transcribe_fwhisper_cpu_stream(model_id, language, audio_path, whisper_multilingual_en)
 
 # —————— SenseVoice Transcription ——————
 def _transcribe_sense_cpu_stream(model_id: str, language: str, audio_path: str,
@@ -324,6 +326,7 @@ with Demo:
         whisper_dd = gr.Dropdown(choices=WHISPER_MODELS, value=WHISPER_MODELS[0], label="Whisper Model")
         whisper_lang = gr.Dropdown(choices=WHISPER_LANGUAGES, value="auto", label="Whisper Language")
         device_radio = gr.Radio(choices=["GPU","CPU"], value="GPU", label="Device")
+        whisper_multilingual_en = gr.Checkbox(label="Multilingual", value=True)
         btn_w = gr.Button("Transcribe with Faster-Whisper")
 
     with gr.Column():
@@ -353,7 +356,7 @@ with Demo:
     # wire the callbacks into those shared boxes
     btn_w.click(
         fn=transcribe_fwhisper_stream,
-        inputs=[whisper_dd, whisper_lang, audio_input, device_radio],
+        inputs=[whisper_dd, whisper_lang, audio_input, device_radio, whisper_multilingual_en],
         outputs=[out_w, out_w_d]
     )
     btn_s.click(
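
For context, a minimal self-contained sketch of the pattern this commit wires up: a Gradio checkbox whose boolean value is forwarded to Faster-Whisper's transcribe() as the multilingual flag. The model choice, helper names, and call shape below are illustrative assumptions rather than code from this repo, and it presumes a faster-whisper release that exposes the multilingual parameter, as the diff above does.

# Hedged sketch (illustrative only): checkbox value -> multilingual flag on Faster-Whisper.
import gradio as gr
from faster_whisper import WhisperModel

# Assumption: a small CPU model for the example; the real app picks from WHISPER_MODELS.
model = WhisperModel("tiny", device="cpu", compute_type="int8")

def transcribe(audio_path, multilingual_enabled):
    # multilingual=True lets the decoder switch languages within one file
    # (code-switching); False keeps the initially detected or forced language.
    segments, _info = model.transcribe(
        audio_path,
        vad_filter=True,
        multilingual=multilingual_enabled,
    )
    return "".join(s.text for s in segments).strip()

with gr.Blocks() as demo:
    audio = gr.Audio(type="filepath", label="Audio")
    multilingual_cb = gr.Checkbox(label="Multilingual", value=True)
    out = gr.Textbox(label="Transcript")
    gr.Button("Transcribe").click(fn=transcribe,
                                  inputs=[audio, multilingual_cb],
                                  outputs=out)

demo.launch()

In the app itself, the same value travels from the whisper_multilingual_en checkbox through transcribe_fwhisper_stream into the batched CPU and GPU transcribe calls shown in the diff.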