Spaces: Running on Zero
Commit: get HF_TOKEN from secret variables
app.py CHANGED
@@ -44,7 +44,6 @@ WHISPER_MODELS = [
     "momo103197/whisper-small-zh-TW-16",
     "k1nto/Belle-whisper-large-v3-zh-punct-ct2"
 ]
-
 SENSEVOICE_MODELS = [
     "FunAudioLLM/SenseVoiceSmall",
     "AXERA-TECH/SenseVoice",
@@ -55,14 +54,16 @@ SENSEVOICE_MODELS = [
 
 # —————— Language Options ——————
 WHISPER_LANGUAGES = [
-    "auto", "af","am","ar","as","az","ba","be","bg","bn","bo",
-    "…
+    "auto", "af","am","ar","as","az","ba","be","bg","bn","bo",
+    "br","bs","ca","cs","cy","da","de","el","en","es","et",
+    "eu","fa","fi","fo","fr","gl","gu","ha","haw","he","hi",
+    "hr","ht","hu","hy","id","is","it","ja","jw","ka","kk",
+    "km","kn","ko","la","lb","ln","lo","lt","lv","mg","mi",
+    "mk","ml","mn","mr","ms","mt","my","ne","nl","nn","no",
+    "oc","pa","pl","ps","pt","ro","ru","sa","sd","si","sk",
+    "sl","sn","so","sq","sr","su","sv","sw","ta","te","tg",
+    "th","tk","tl","tr","tt","uk","ur","uz","vi","yi","yo",
+    "zh","yue"
 ]
 SENSEVOICE_LANGUAGES = ["auto", "zh", "yue", "en", "ja", "ko", "nospeech"]
 
@@ -102,52 +103,57 @@ def get_sense_model(model_id: str):
 def get_diarization_pipe():
     global dar_pipe
     if dar_pipe is None:
+        # Pull token from environment (HF_TOKEN or HUGGINGFACE_TOKEN)
+        token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
         dar_pipe = DiarizationPipeline.from_pretrained(
             "pyannote/speaker-diarization-3.1",
-            use_auth_token=True
+            use_auth_token=token or True
         )
     return dar_pipe
 
 # —————— Transcription Functions ——————
-def transcribe_whisper(model_id: str,
-    …
+def transcribe_whisper(model_id: str,
+                       language: str,
+                       audio_path: str,
+                       device_sel: str,
+                       enable_diar: bool):
+    # select device: 0 for GPU, -1 for CPU
     use_gpu = (device_sel == "GPU" and torch.cuda.is_available())
     device = 0 if use_gpu else -1
     pipe = get_whisper_pipe(model_id, device)
     # full transcription
-    if language == "auto":
-        result = pipe(audio_path)
-    else:
-        result = pipe(audio_path, generate_kwargs={"language": language})
+    result = (pipe(audio_path) if language == "auto"
+              else pipe(audio_path, generate_kwargs={"language": language}))
     transcript = result.get("text", "").strip()
     diar_text = ""
-    # optional diarization
+    # optional speaker diarization
     if enable_diar:
         diarizer = get_diarization_pipe()
-        …
+        diary = diarizer(audio_path)
         snippets = []
-        for turn, _, speaker in …
-            start_ms = int(turn.start * …
-            end_ms = int(turn.end * 1000)
+        for turn, _, speaker in diary.itertracks(yield_label=True):
+            start_ms, end_ms = int(turn.start*1000), int(turn.end*1000)
             segment = AudioSegment.from_file(audio_path)[start_ms:end_ms]
             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                 segment.export(tmp.name, format="wav")
-            if language == "auto":
-                seg_out = pipe(tmp.name)
-            else:
-                seg_out = pipe(tmp.name, generate_kwargs={"language": language})
+            seg_out = (pipe(tmp.name) if language == "auto"
+                       else pipe(tmp.name, generate_kwargs={"language": language}))
             os.unlink(tmp.name)
-            …
-            snippets.append(f"[{speaker}] {…
+            text = seg_out.get("text", "").strip()
+            snippets.append(f"[{speaker}] {text}")
         diar_text = "\n".join(snippets)
     return transcript, diar_text
 
 @spaces.GPU
-def transcribe_sense(model_id: str, …
+def transcribe_sense(model_id: str,
+                     language: str,
+                     audio_path: str,
+                     enable_punct: bool,
+                     enable_diar: bool):
     model = get_sense_model(model_id)
-    # …
+    # no diarization
     if not enable_diar:
-        …
+        segs = model.generate(
             input=audio_path,
             cache={},
             language=language,
@@ -156,21 +162,20 @@ def transcribe_sense(model_id: str, language: str, audio_path: str, enable_punct
             merge_vad=True,
             merge_length_s=15,
         )
-        text = rich_transcription_postprocess(…
+        text = rich_transcription_postprocess(segs[0]['text'])
         if not enable_punct:
             text = re.sub(r"[^\w\s]", "", text)
         return text, ""
-    # with diarization
+    # with diarization
     diarizer = get_diarization_pipe()
-    …
-    for turn, _, speaker in …
-        start_ms = int(turn.start * …
-        end_ms = int(turn.end * 1000)
+    diary = diarizer(audio_path)
+    snippets = []
+    for turn, _, speaker in diary.itertracks(yield_label=True):
+        start_ms, end_ms = int(turn.start*1000), int(turn.end*1000)
         segment = AudioSegment.from_file(audio_path)[start_ms:end_ms]
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
             segment.export(tmp.name, format="wav")
-        …
+        segs = model.generate(
             input=tmp.name,
             cache={},
             language=language,
@@ -180,65 +185,52 @@ def transcribe_sense(model_id: str, language: str, audio_path: str, enable_punct
             merge_length_s=0,
         )
         os.unlink(tmp.name)
-        txt = rich_transcription_postprocess(…
+        txt = rich_transcription_postprocess(segs[0]['text'])
         if not enable_punct:
             txt = re.sub(r"[^\w\s]", "", txt)
-        …
-    # also return full non-diarized transcript for comparison
-    segments_full = model.generate(
+        snippets.append(f"[{speaker}] {txt}")
+    full = rich_transcription_postprocess(model.generate(
         input=audio_path,
         cache={},
         language=language,
         use_itn=True,
         batch_size_s=300,
         merge_vad=True,
-        merge_length_s=15
-    )
-    text_full = rich_transcription_postprocess(segments_full[0]['text'])
+        merge_length_s=15
+    )[0]['text'])
     if not enable_punct:
-        …
-    return …
+        full = re.sub(r"[^\w\s]", "", full)
+    return full, "\n".join(snippets)
 
 # —————— Gradio UI ——————
 demo = gr.Blocks()
 with demo:
-    gr.Markdown("## Whisper vs. SenseVoice (Language, Device & …
-    …
-    audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio Input")
-    …
+    gr.Markdown("## Whisper vs. SenseVoice (Language, Device & Diarization)")
+    audio_input = gr.Audio(sources=["upload","microphone"], type="filepath", label="Audio Input")
     with gr.Row():
-        # Whisper column
         with gr.Column():
             gr.Markdown("### Whisper ASR")
             whisper_dd = gr.Dropdown(choices=WHISPER_MODELS, value=WHISPER_MODELS[0], label="Whisper Model")
             whisper_lang = gr.Dropdown(choices=WHISPER_LANGUAGES, value="auto", label="Whisper Language")
-            device_radio = gr.Radio(choices=["GPU", …
-            diar_check = gr.Checkbox(label="Enable …
-            …
-            outputs=[out_whisper, out_whisper_diar]
-            )
+            device_radio = gr.Radio(choices=["GPU","CPU"], value="GPU", label="Device")
+            diar_check = gr.Checkbox(label="Enable Diarization")
+            btn_w = gr.Button("Transcribe with Whisper")
+            out_w = gr.Textbox(label="Transcript")
+            out_w_d = gr.Textbox(label="Diarized Transcript")
+            btn_w.click(fn=transcribe_whisper,
+                        inputs=[whisper_dd, whisper_lang, audio_input, device_radio, diar_check],
+                        outputs=[out_w, out_w_d])
-        # SenseVoice column
         with gr.Column():
             gr.Markdown("### FunASR SenseVoice ASR")
             sense_dd = gr.Dropdown(choices=SENSEVOICE_MODELS, value=SENSEVOICE_MODELS[0], label="SenseVoice Model")
             sense_lang = gr.Dropdown(choices=SENSEVOICE_LANGUAGES, value="auto", label="SenseVoice Language")
             punct = gr.Checkbox(label="Enable Punctuation", value=True)
-            …
-            outputs=[out_sense, out_sense_diar]
-            )
+            diar_s = gr.Checkbox(label="Enable Diarization")
+            btn_s = gr.Button("Transcribe with SenseVoice")
+            out_s = gr.Textbox(label="Transcript")
+            out_s_d = gr.Textbox(label="Diarized Transcript")
+            btn_s.click(fn=transcribe_sense,
+                        inputs=[sense_dd, sense_lang, audio_input, punct, diar_s],
+                        outputs=[out_s, out_s_d])
 if __name__ == "__main__":
     demo.launch()
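Note on the change itself: secrets defined in a Space's settings are exposed to the running app as environment variables, which is why a plain os.getenv is enough to pick up HF_TOKEN. The snippet below isolates the pattern; it assumes DiarizationPipeline is pyannote.audio's Pipeline (the import is outside this diff).

```python
import os
from pyannote.audio import Pipeline as DiarizationPipeline

# Space secrets surface as environment variables; check both common names.
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")

# pyannote/speaker-diarization-3.1 is a gated checkpoint, so credentials are
# required. `token or True` passes the token when one is set and otherwise
# falls back to True, i.e. "use whatever login huggingface_hub has cached".
dar_pipe = DiarizationPipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=token or True,
)
```

The old line passed bare True, which only works where someone has run huggingface-cli login; a deployed Space has no cached login, hence the switch to reading the secret.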
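Both transcribe functions now repeat the same diarize-then-transcribe loop. For reference, a minimal sketch of that pattern as a standalone helper; the name diarized_transcript and the asr callable are illustrative, not part of this commit, and unlike the committed loop it decodes the source file once rather than once per speaker turn:

```python
import os
import tempfile

from pydub import AudioSegment

def diarized_transcript(diarizer, asr, audio_path: str) -> str:
    """Label each speaker turn, slice it out, and transcribe it separately.

    `diarizer` is a pyannote pipeline; `asr` is any callable mapping a wav
    path to text (e.g. a wrapped transformers ASR pipeline).
    """
    audio = AudioSegment.from_file(audio_path)  # decode once, slice per turn
    snippets = []
    for turn, _, speaker in diarizer(audio_path).itertracks(yield_label=True):
        # pydub slices in milliseconds; turn.start/turn.end are in seconds
        clip = audio[int(turn.start * 1000):int(turn.end * 1000)]
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            clip.export(tmp.name, format="wav")
        try:
            snippets.append(f"[{speaker}] {asr(tmp.name)}")
        finally:
            os.unlink(tmp.name)  # always clean up the temp slice
    return "\n".join(snippets)
```

With such a helper, transcribe_whisper's diarization branch would reduce to one call with `lambda p: pipe(p).get("text", "").strip()` as the asr argument, and transcribe_sense's to one with its model.generate wrapper.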