add speaker diarization to SenseVoice transcript
app.py CHANGED
@@ -55,8 +55,7 @@ SENSEVOICE_MODELS = [
 
 # —————— Language Options ——————
 WHISPER_LANGUAGES = [
-    "auto",
-    "af","am","ar","as","az","ba","be","bg","bn","bo","br","bs","ca",
+    "auto", "af","am","ar","as","az","ba","be","bg","bn","bo","br","bs","ca",
     "cs","cy","da","de","el","en","es","et","eu","fa","fi","fo","fr",
     "gl","gu","ha","haw","he","hi","hr","ht","hu","hy","id","is","it",
     "ja","jw","ka","kk","km","kn","ko","la","lb","ln","lo","lt","lv",
@@ -111,7 +110,7 @@ def get_diarization_pipe():
 
 # —————— Transcription Functions ——————
 def transcribe_whisper(model_id: str, language: str, audio_path: str, device_sel: str, enable_diar: bool):
-    # select device
+    # select device for Whisper
     use_gpu = (device_sel == "GPU" and torch.cuda.is_available())
     device = 0 if use_gpu else -1
     pipe = get_whisper_pipe(model_id, device)
@@ -122,7 +121,7 @@ def transcribe_whisper(model_id: str, language: str, audio_path: str, device_sel
     result = pipe(audio_path, generate_kwargs={"language": language})
     transcript = result.get("text", "").strip()
     diar_text = ""
-    # optional diarization
+    # optional diarization for Whisper
     if enable_diar:
         diarizer = get_diarization_pipe()
         diarization = diarizer(audio_path)
@@ -144,9 +143,50 @@ def transcribe_whisper(model_id: str, language: str, audio_path: str, device_sel
     return transcript, diar_text
 
 @spaces.GPU
-def transcribe_sense(model_id: str, language: str, audio_path: str, enable_punct: bool):
+def transcribe_sense(model_id: str, language: str, audio_path: str, enable_punct: bool, enable_diar: bool):
     model = get_sense_model(model_id)
-
+    # if no diarization, full file
+    if not enable_diar:
+        segments = model.generate(
+            input=audio_path,
+            cache={},
+            language=language,
+            use_itn=True,
+            batch_size_s=300,
+            merge_vad=True,
+            merge_length_s=15,
+        )
+        text = rich_transcription_postprocess(segments[0]['text'])
+        if not enable_punct:
+            text = re.sub(r"[^\w\s]", "", text)
+        return text, ""
+    # with diarization: split by speaker
+    diarizer = get_diarization_pipe()
+    diarization = diarizer(audio_path)
+    speaker_snippets = []
+    for turn, _, speaker in diarization.itertracks(yield_label=True):
+        start_ms = int(turn.start * 1000)
+        end_ms = int(turn.end * 1000)
+        segment = AudioSegment.from_file(audio_path)[start_ms:end_ms]
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+            segment.export(tmp.name, format="wav")
+        segments = model.generate(
+            input=tmp.name,
+            cache={},
+            language=language,
+            use_itn=True,
+            batch_size_s=300,
+            merge_vad=False,
+            merge_length_s=0,
+        )
+        os.unlink(tmp.name)
+        txt = rich_transcription_postprocess(segments[0]['text'])
+        if not enable_punct:
+            txt = re.sub(r"[^\w\s]", "", txt)
+        speaker_snippets.append(f"[{speaker}] {txt}")
+    full_text = "\n".join(speaker_snippets)
+    # also return full non-diarized transcript for comparison
+    segments_full = model.generate(
         input=audio_path,
         cache={},
         language=language,
@@ -155,15 +195,15 @@ def transcribe_sense(model_id: str, language: str, audio_path: str, enable_punct
         merge_vad=True,
         merge_length_s=15,
     )
-
+    text_full = rich_transcription_postprocess(segments_full[0]['text'])
     if not enable_punct:
-
-    return
+        text_full = re.sub(r"[^\w\s]", "", text_full)
+    return text_full, full_text
 
 # —————— Gradio UI ——————
 demo = gr.Blocks()
 with demo:
-    gr.Markdown("## Whisper vs. SenseVoice
+    gr.Markdown("## Whisper vs. SenseVoice (Language, Device & Speaker Diarization)")
 
     audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio Input")
 
@@ -176,12 +216,12 @@ with demo:
         device_radio = gr.Radio(choices=["GPU", "CPU"], value="GPU", label="Device")
         diar_check = gr.Checkbox(label="Enable Speaker Diarization", value=False)
        whisper_btn = gr.Button("Transcribe with Whisper")
-        out_whisper = gr.Textbox(label="
-
+        out_whisper = gr.Textbox(label="Transcript")
+        out_whisper_diar = gr.Textbox(label="Diarized Transcript")
         whisper_btn.click(
             fn=transcribe_whisper,
             inputs=[whisper_dd, whisper_lang, audio_input, device_radio, diar_check],
-            outputs=[out_whisper,
+            outputs=[out_whisper, out_whisper_diar]
         )
 
     # SenseVoice column
@@ -190,12 +230,14 @@ with demo:
         sense_dd = gr.Dropdown(choices=SENSEVOICE_MODELS, value=SENSEVOICE_MODELS[0], label="SenseVoice Model")
        sense_lang = gr.Dropdown(choices=SENSEVOICE_LANGUAGES, value="auto", label="SenseVoice Language")
        punct = gr.Checkbox(label="Enable Punctuation", value=True)
+        diar_sense = gr.Checkbox(label="Enable Speaker Diarization", value=False)
         sense_btn = gr.Button("Transcribe with SenseVoice")
-        out_sense = gr.Textbox(label="
+        out_sense = gr.Textbox(label="Transcript")
+        out_sense_diar = gr.Textbox(label="Diarized Transcript")
         sense_btn.click(
             fn=transcribe_sense,
-            inputs=[sense_dd, sense_lang, audio_input, punct],
-            outputs=[out_sense]
+            inputs=[sense_dd, sense_lang, audio_input, punct, diar_sense],
+            outputs=[out_sense, out_sense_diar]
         )
 
 if __name__ == "__main__":
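
Both transcription paths call get_diarization_pipe(), but the diff does not show its body. A minimal sketch of what such a helper typically looks like, assuming pyannote.audio with the gated pyannote/speaker-diarization-3.1 model and an HF_TOKEN environment variable (both assumptions, not taken from this commit):

import os

import torch
from pyannote.audio import Pipeline

_diar_pipe = None

def get_diarization_pipe():
    # Load the pyannote speaker-diarization pipeline once and reuse it.
    # Model name and token handling are assumptions, not shown in the diff.
    global _diar_pipe
    if _diar_pipe is None:
        _diar_pipe = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=os.environ.get("HF_TOKEN"),
        )
        if torch.cuda.is_available():
            _diar_pipe.to(torch.device("cuda"))
    return _diar_pipe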
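
For reference, the new transcribe_sense signature returns a pair: the plain transcript and the per-speaker transcript (empty when diarization is disabled). A hypothetical standalone call, with "FunAudioLLM/SenseVoiceSmall" standing in for whatever SENSEVOICE_MODELS[0] resolves to in app.py:

from app import transcribe_sense

plain, diarized = transcribe_sense(
    model_id="FunAudioLLM/SenseVoiceSmall",
    language="auto",
    audio_path="meeting.wav",
    enable_punct=True,
    enable_diar=True,
)
print(plain)     # single transcript of the whole file
print(diarized)  # one "[SPEAKER_XX] ..." line per diarization turn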
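
One design note on the diarized branch: AudioSegment.from_file(audio_path) runs inside the speaker-turn loop, so the source file is decoded once per turn. If that becomes slow on long recordings, the decode could be hoisted out of the loop; a small sketch with a hypothetical helper (slice_speaker_turns is not part of the commit):

from pydub import AudioSegment

def slice_speaker_turns(audio_path, diarization):
    # Decode the source file once, then slice per diarization turn.
    audio = AudioSegment.from_file(audio_path)
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        yield speaker, audio[int(turn.start * 1000):int(turn.end * 1000)]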