Luigi committed
Commit 2d01cbb · 1 Parent(s): f8ba113

clean code

Files changed (1)
  1. app.py +11 -24
app.py CHANGED

@@ -4,7 +4,6 @@ import tempfile
 
 import torch
 import gradio as gr
-from transformers import pipeline
 from faster_whisper import WhisperModel
 from pydub import AudioSegment
 from pyannote.audio import Pipeline as DiarizationPipeline
@@ -90,18 +89,6 @@ def format_diarization_html(snippets):
     return "<div>" + "".join(html_lines) + "</div>"
 
 # —————— Helpers ——————
-def get_whisper_pipe(model_id: str, device: int):
-    key = (model_id, device)
-    if key not in whisper_pipes:
-        whisper_pipes[key] = pipeline(
-            "automatic-speech-recognition",
-            model=model_id,
-            device=device,
-            chunk_length_s=30,
-            stride_length_s=5,
-            return_timestamps=False,
-        )
-    return whisper_pipes[key]
 
 # —————— Faster-Whisper Cache & Factory ——————
 _fwhisper_models: dict[tuple[str, str], WhisperModel] = {}
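Note: the deleted `get_whisper_pipe` memoized transformers pipelines keyed by `(model_id, device)`; the faster-whisper cache `_fwhisper_models` that replaces it presumably follows the same keyed-factory pattern. A minimal sketch of such a factory, assuming the `compute_type` choices below (they are not shown in this commit):

```python
from faster_whisper import WhisperModel

_fwhisper_models: dict[tuple[str, str], WhisperModel] = {}

def get_fwhisper_model(model_id: str, device: str) -> WhisperModel:
    """Return a cached WhisperModel, loading it on first use."""
    key = (model_id, device)
    if key not in _fwhisper_models:
        # Assumed compute types: int8 keeps CPU memory low, float16 suits CUDA.
        compute = "float16" if device == "cuda" else "int8"
        _fwhisper_models[key] = WhisperModel(model_id, device=device, compute_type=compute)
    return _fwhisper_models[key]
```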
@@ -150,7 +137,7 @@ def get_diarization_pipe():
         use_auth_token=token or True
     )
 
-    dar_pipe.to(torch.device("cpu"))
+    # dar_pipe.to(torch.device("cpu"))
     return dar_pipe
 
 
@@ -163,14 +150,14 @@ def transcribe_with_fwhisper(model: WhisperModel, audio_path: str, language: str
     lang_arg = None if language == "auto" else language
     segments, _ = model.transcribe(
         audio_path,
-        beam_size=5,
-        best_of=5,
+        beam_size=1,
+        best_of=1,
         language=lang_arg,
         vad_filter=True,
     )
     return "".join(seg.text for seg in segments).strip()
 
-def _transcribe_whisper_cpu(model_id, language, audio_path, enable_diar):
+def _transcribe_fwhisper_cpu(model_id, language, audio_path, enable_diar):
     model = get_fwhisper_model(model_id, "cpu")
     cprint('Whisper (faster-whisper) using CPU', 'red')
     # Diarization-only branch
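Note: lowering `beam_size` and `best_of` from 5 to 1 turns decoding into a single greedy pass, trading a small amount of accuracy for a noticeable speedup. A minimal call with the new settings (model size and file path are placeholders):

```python
from faster_whisper import WhisperModel

model = WhisperModel("base", device="cpu", compute_type="int8")  # placeholder size
segments, _info = model.transcribe(
    "sample.wav",     # placeholder path
    beam_size=1,      # greedy decoding: keep one hypothesis per step
    best_of=1,        # no temperature-based candidate resampling
    language=None,    # None = auto-detect, matching lang_arg above
    vad_filter=True,  # skip non-speech with the built-in VAD
)
text = "".join(seg.text for seg in segments).strip()
```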
@@ -196,7 +183,7 @@ def _transcribe_whisper_cpu(model_id, language, audio_path, enable_diar):
 
 
 @spaces.GPU
-def _transcribe_whisper_gpu(model_id, language, audio_path, enable_diar):
+def _transcribe_fwhisper_gpu(model_id, language, audio_path, enable_diar):
     pipe = get_fwhisper_model(model_id, "cuda")
     cprint('Whisper (faster-whisper) using CUDA', 'green')
     # Diarization-only branch
@@ -221,10 +208,10 @@ def _transcribe_whisper_gpu(model_id, language, audio_path, enable_diar):
     return transcript, ""
 
 
-def transcribe_whisper(model_id, language, audio_path, device_sel, enable_diar):
+def transcribe_fwhisper(model_id, language, audio_path, device_sel, enable_diar):
     if device_sel == "GPU" and torch.cuda.is_available():
-        return _transcribe_whisper_gpu(model_id, language, audio_path, enable_diar)
-    return _transcribe_whisper_cpu(model_id, language, audio_path, enable_diar)
+        return _transcribe_fwhisper_gpu(model_id, language, audio_path, enable_diar)
+    return _transcribe_fwhisper_cpu(model_id, language, audio_path, enable_diar)
 
 
 # —————— SenseVoice Transcription ——————
@@ -368,7 +355,7 @@ with Demo:
 
     with gr.Row():
         with gr.Column():
-            gr.Markdown("### Whisper ASR")
+            gr.Markdown("### Faster-Whisper ASR")
             whisper_dd = gr.Dropdown(choices=WHISPER_MODELS, value=WHISPER_MODELS[0], label="Whisper Model")
            whisper_lang = gr.Dropdown(choices=WHISPER_LANGUAGES, value="auto", label="Whisper Language")
             device_radio = gr.Radio(choices=["GPU", "CPU"], value="GPU", label="Device")
@@ -378,8 +365,8 @@ with Demo:
             # Toggle visibility based on checkbox
             diar_check.change(lambda e: gr.update(visible=not e), inputs=diar_check, outputs=out_w)
             diar_check.change(lambda e: gr.update(visible=e), inputs=diar_check, outputs=out_w_d)
-            btn_w = gr.Button("Transcribe with Whisper")
-            btn_w.click(fn=transcribe_whisper,
+            btn_w = gr.Button("Transcribe with Faster-Whisper")
+            btn_w.click(fn=transcribe_fwhisper,
                         inputs=[whisper_dd, whisper_lang, audio_input, device_radio, diar_check],
                         outputs=[out_w, out_w_d])
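Note: the paired `diar_check.change` handlers toggle which output is visible, and `btn_w.click` now routes to the renamed `transcribe_fwhisper`. A self-contained sketch of the same visibility pattern (component names are illustrative):

```python
import gradio as gr

with gr.Blocks() as demo:
    diar_check = gr.Checkbox(label="Enable diarization")
    out_plain = gr.Textbox(label="Transcript", visible=True)
    out_diar = gr.HTML(visible=False)  # diarized view rendered as HTML

    # One handler per output: hide the plain box when diarization is on, and vice versa.
    diar_check.change(lambda e: gr.update(visible=not e), inputs=diar_check, outputs=out_plain)
    diar_check.change(lambda e: gr.update(visible=e), inputs=diar_check, outputs=out_diar)

demo.launch()
```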
 
372