Luigi committed
Commit 7833553 · 1 Parent(s): f737f82

Add speaker diarization

Files changed (2)
  1. app.py +47 -24
  2. requirements.txt +20 -12
app.py CHANGED
@@ -1,9 +1,12 @@
 import os
 import re
+import tempfile

 import torch
 import gradio as gr
 from transformers import pipeline
+from pydub import AudioSegment
+from pyannote.audio import Pipeline as DiarizationPipeline

 import spaces  # zeroGPU support
 from funasr import AutoModel
@@ -67,10 +70,10 @@ SENSEVOICE_LANGUAGES = ["auto", "zh", "yue", "en", "ja", "ko", "nospeech"]
 # —————— Caches ——————
 whisper_pipes = {}
 sense_models = {}
+dar_pipe = None

 # —————— Helpers ——————
 def get_whisper_pipe(model_id: str, device: int):
-    # HuggingFace pipeline caching by model and device (-1=cpu, 0=gpu)
     key = (model_id, device)
     if key not in whisper_pipes:
         whisper_pipes[key] = pipeline(
@@ -96,32 +99,50 @@ def get_sense_model(model_id: str):
     )
     return sense_models[model_id]

-# —————— Whisper Transcribers ——————
-@spaces.GPU
-def _transcribe_whisper_gpu(model_id: str, language: str, audio_path: str):
-    pipe = get_whisper_pipe(model_id, device=0)
-    if language == "auto":
-        result = pipe(audio_path)
-    else:
-        result = pipe(audio_path, generate_kwargs={"language": language})
-    return result.get("text", "").strip()

-def _transcribe_whisper_cpu(model_id: str, language: str, audio_path: str):
-    pipe = get_whisper_pipe(model_id, device=-1)
+def get_diarization_pipe():
+    global dar_pipe
+    if dar_pipe is None:
+        dar_pipe = DiarizationPipeline.from_pretrained(
+            "pyannote/speaker-diarization@2.1",
+            use_auth_token=True
+        )
+    return dar_pipe
+
+# —————— Transcription Functions ——————
+def transcribe_whisper(model_id: str, language: str, audio_path: str, device_sel: str, enable_diar: bool):
+    # select device
+    use_gpu = (device_sel == "GPU" and torch.cuda.is_available())
+    device = 0 if use_gpu else -1
+    pipe = get_whisper_pipe(model_id, device)
+    # full transcription
     if language == "auto":
         result = pipe(audio_path)
     else:
         result = pipe(audio_path, generate_kwargs={"language": language})
-    return result.get("text", "").strip()
-
-# Wrapper selects GPU or CPU path
-def transcribe_whisper(model_id: str, language: str, audio_path: str, device_sel: str):
-    if device_sel == "GPU" and torch.cuda.is_available():
-        return _transcribe_whisper_gpu(model_id, language, audio_path)
-    else:
-        return _transcribe_whisper_cpu(model_id, language, audio_path)
+    transcript = result.get("text", "").strip()
+    diar_text = ""
+    # optional diarization
+    if enable_diar:
+        diarizer = get_diarization_pipe()
+        diarization = diarizer(audio_path)
+        snippets = []
+        for turn, _, speaker in diarization.itertracks(yield_label=True):
+            start_ms = int(turn.start * 1000)
+            end_ms = int(turn.end * 1000)
+            segment = AudioSegment.from_file(audio_path)[start_ms:end_ms]
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+                segment.export(tmp.name, format="wav")
+            if language == "auto":
+                seg_out = pipe(tmp.name)
+            else:
+                seg_out = pipe(tmp.name, generate_kwargs={"language": language})
+            os.unlink(tmp.name)
+            txt = seg_out.get("text", "").strip()
+            snippets.append(f"[{speaker}] {txt}")
+        diar_text = "\n".join(snippets)
+    return transcript, diar_text

-# —————— SenseVoice Transcriber ——————
 @spaces.GPU
 def transcribe_sense(model_id: str, language: str, audio_path: str, enable_punct: bool):
     model = get_sense_model(model_id)
@@ -142,7 +163,7 @@ def transcribe_sense(model_id: str, language: str, audio_path: str, enable_punct
 # —————— Gradio UI ——————
 demo = gr.Blocks()
 with demo:
-    gr.Markdown("## Whisper vs. SenseVoice Transcription (Language & Device)")
+    gr.Markdown("## Whisper vs. SenseVoice Transcription (with Language, Device & Diarization)")

     audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio Input")

@@ -153,12 +174,14 @@
         whisper_dd = gr.Dropdown(choices=WHISPER_MODELS, value=WHISPER_MODELS[0], label="Whisper Model")
         whisper_lang = gr.Dropdown(choices=WHISPER_LANGUAGES, value="auto", label="Whisper Language")
         device_radio = gr.Radio(choices=["GPU", "CPU"], value="GPU", label="Device")
+        diar_check = gr.Checkbox(label="Enable Speaker Diarization", value=False)
         whisper_btn = gr.Button("Transcribe with Whisper")
         out_whisper = gr.Textbox(label="Whisper Transcript")
+        out_diar = gr.Textbox(label="Diarized Transcript (Whisper)")
         whisper_btn.click(
             fn=transcribe_whisper,
-            inputs=[whisper_dd, whisper_lang, audio_input, device_radio],
-            outputs=[out_whisper]
+            inputs=[whisper_dd, whisper_lang, audio_input, device_radio, diar_check],
+            outputs=[out_whisper, out_diar]
         )

         # SenseVoice column
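For a quick sanity check of the new code path outside Gradio, transcribe_whisper can be called directly. A minimal sketch, assuming a local sample.wav and a completed `huggingface-cli login` (pyannote/speaker-diarization@2.1 is a gated checkpoint); "openai/whisper-small" is only an illustrative stand-in for a real entry of WHISPER_MODELS, which this diff does not show:

# Hypothetical smoke test of the new transcribe_whisper() signature.
# "openai/whisper-small" and sample.wav are assumptions, not from this commit.
text, diarized = transcribe_whisper(
    model_id="openai/whisper-small",
    language="auto",
    audio_path="sample.wav",
    device_sel="CPU",   # forces device=-1 in get_whisper_pipe
    enable_diar=True,
)
print(text)      # full-audio transcript
print(diarized)  # typically one "[SPEAKER_XX] ..." line per diarized turn

One design note: the loop re-reads the entire file via AudioSegment.from_file(audio_path) on every turn; hoisting that call above the for loop would avoid repeated decoding on long recordings.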
requirements.txt CHANGED
@@ -1,12 +1,20 @@
-gradio>=5.0
-transformers>=4.30.0
-torch>=1.10
-torchaudio>=0.10
-accelerate>=0.20.0
-ffmpeg-python>=0.2.0
-
-# FunASR core & post-processing
-funasr>=0.1.1
-
-# HF utilities (already present on Spaces, but pin if you need a specific version)
-huggingface-hub>=0.14.1
+# Gradio UI
+gradio>=3.39.0
+
+# Core ASR
+torch>=2.0.0
+transformers>=4.35.0
+
+# FunASR SenseVoice
+funasr>=0.6.4
+
+# Audio handling
+pydub>=0.25.1
+ffmpeg-python>=0.2.0  # wrapper for ffmpeg; you’ll still need system ffmpeg installed
+
+# Speaker Diarization
+pyannote.audio>=2.1.1
+huggingface-hub>=0.18.0  # for pyannote model download/auth
+
+# (Optional) if you want GPU-accelerated pipelines outside of HF Spaces
+# accelerate>=0.20.0
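One deployment caveat: use_auth_token=True in get_diarization_pipe() relies on a token already cached by huggingface-cli login, and the gated pyannote/speaker-diarization@2.1 model additionally requires accepting its user conditions on the Hub. A sketch of an alternative that reads the token from an environment variable (the HF_TOKEN secret name is an assumption, not part of this commit):

import os
from pyannote.audio import Pipeline as DiarizationPipeline

# Assumption: the Space exposes an HF_TOKEN secret; pass it explicitly
# instead of depending on a cached CLI login.
dar_pipe = DiarizationPipeline.from_pretrained(
    "pyannote/speaker-diarization@2.1",
    use_auth_token=os.environ.get("HF_TOKEN"),
)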