Spaces: Running on Zero
clean code
app.py CHANGED
@@ -4,7 +4,6 @@ import tempfile
 
 import torch
 import gradio as gr
-from transformers import pipeline
 from faster_whisper import WhisperModel
 from pydub import AudioSegment
 from pyannote.audio import Pipeline as DiarizationPipeline
@@ -90,18 +89,6 @@ def format_diarization_html(snippets):
     return "<div>" + "".join(html_lines) + "</div>"
 
 # —————— Helpers ——————
-def get_whisper_pipe(model_id: str, device: int):
-    key = (model_id, device)
-    if key not in whisper_pipes:
-        whisper_pipes[key] = pipeline(
-            "automatic-speech-recognition",
-            model=model_id,
-            device=device,
-            chunk_length_s=30,
-            stride_length_s=5,
-            return_timestamps=False,
-        )
-    return whisper_pipes[key]
 
 # —————— Faster-Whisper Cache & Factory ——————
 _fwhisper_models: dict[tuple[str, str], WhisperModel] = {}
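Note: the transformers-based `get_whisper_pipe` cache is removed, while the faster-whisper path keeps the same cache-and-factory shape via `_fwhisper_models`. The factory body is outside this diff, so the sketch below is an assumption about what `get_fwhisper_model` plausibly does; the `compute_type` choices in particular are illustrative, not the Space's code.

```python
# Hypothetical sketch of the surviving factory; only the cache dict and the
# call sites appear in this diff, so treat the body as an assumption.
from faster_whisper import WhisperModel

_fwhisper_models: dict[tuple[str, str], WhisperModel] = {}

def get_fwhisper_model(model_id: str, device: str) -> WhisperModel:
    key = (model_id, device)
    if key not in _fwhisper_models:
        # float16 on GPU / int8 on CPU are common faster-whisper choices
        compute_type = "float16" if device == "cuda" else "int8"
        _fwhisper_models[key] = WhisperModel(model_id, device=device,
                                             compute_type=compute_type)
    return _fwhisper_models[key]
```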
@@ -150,7 +137,7 @@ def get_diarization_pipe():
         use_auth_token=token or True
     )
 
-    dar_pipe.to(torch.device("cpu"))
+    # dar_pipe.to(torch.device("cpu"))
     return dar_pipe
 
 
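Note: commenting out the `.to(torch.device("cpu"))` call stops pinning the pyannote pipeline to CPU at construction time, so the caller can move it to whatever device is actually available. A minimal sketch of the resulting pattern; the `run_diarization` wrapper is illustrative, not code from this diff.

```python
# Illustrative wrapper: with the CPU pin removed, the pipeline can be moved
# to CUDA lazily, right where a GPU is known to be available.
import torch

def run_diarization(audio_path: str):
    dar = get_diarization_pipe()      # cached pyannote pipeline from app.py
    if torch.cuda.is_available():
        dar.to(torch.device("cuda"))  # pyannote pipelines move like nn.Modules
    return dar(audio_path)
```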
@@ -163,14 +150,14 @@ def transcribe_with_fwhisper(model: WhisperModel, audio_path: str, language: str
     lang_arg = None if language == "auto" else language
     segments, _ = model.transcribe(
         audio_path,
-        beam_size=
-        best_of=
+        beam_size=1,
+        best_of=1,
         language=lang_arg,
         vad_filter=True,
     )
     return "".join(seg.text for seg in segments).strip()
 
-def _transcribe_whisper_cpu(model_id, language, audio_path, enable_diar):
+def _transcribe_fwhisper_cpu(model_id, language, audio_path, enable_diar):
     model = get_fwhisper_model(model_id, "cpu")
     cprint('Whisper (faster-whisper) using CPU', 'red')
     # Diarization-only branch
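Note: `beam_size=1` together with `best_of=1` makes faster-whisper decode greedily, keeping a single hypothesis per step instead of the library default of 5. That is noticeably faster, at some cost in accuracy on difficult audio. A usage sketch; the model id and file name are placeholders.

```python
# Placeholder model id and audio path; transcribe_with_fwhisper is the
# helper patched above, now running greedy decoding.
model = get_fwhisper_model("large-v3", "cpu")
text = transcribe_with_fwhisper(model, "sample.wav", "auto")
print(text)
```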
@@ -196,7 +183,7 @@ def _transcribe_whisper_cpu(model_id, language, audio_path, enable_diar):
 
 
 @spaces.GPU
-def _transcribe_whisper_gpu(model_id, language, audio_path, enable_diar):
+def _transcribe_fwhisper_gpu(model_id, language, audio_path, enable_diar):
     pipe = get_fwhisper_model(model_id, "cuda")
     cprint('Whisper (faster-whisper) using CUDA', 'green')
     # Diarization-only branch
@@ -221,10 +208,10 @@ def _transcribe_whisper_gpu(model_id, language, audio_path, enable_diar):
     return transcript, ""
 
 
-def transcribe_whisper(model_id, language, audio_path, device_sel, enable_diar):
+def transcribe_fwhisper(model_id, language, audio_path, device_sel, enable_diar):
     if device_sel == "GPU" and torch.cuda.is_available():
-        return _transcribe_whisper_gpu(model_id, language, audio_path, enable_diar)
-    return _transcribe_whisper_cpu(model_id, language, audio_path, enable_diar)
+        return _transcribe_fwhisper_gpu(model_id, language, audio_path, enable_diar)
+    return _transcribe_fwhisper_cpu(model_id, language, audio_path, enable_diar)
 
 
 # —————— SenseVoice Transcription ——————
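Note: the rename keeps the ZeroGPU dispatch pattern intact. Only the `@spaces.GPU`-decorated function touches CUDA, so the Space borrows a GPU just for that call while everything else stays on CPU. The pattern in miniature, with placeholder names and bodies:

```python
import spaces
import torch

@spaces.GPU                        # ZeroGPU grants a GPU for this call only
def _work_gpu(audio_path: str) -> str:
    # placeholder for the CUDA-backed transcription body
    return f"gpu:{audio_path}"

def _work_cpu(audio_path: str) -> str:
    # placeholder for the CPU fallback body
    return f"cpu:{audio_path}"

def work(audio_path: str, device_sel: str) -> str:
    # same dispatch shape as transcribe_fwhisper above
    if device_sel == "GPU" and torch.cuda.is_available():
        return _work_gpu(audio_path)
    return _work_cpu(audio_path)
```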
@@ -368,7 +355,7 @@ with Demo:
 
     with gr.Row():
         with gr.Column():
-            gr.Markdown("### Whisper ASR")
+            gr.Markdown("### Faster-Whisper ASR")
             whisper_dd = gr.Dropdown(choices=WHISPER_MODELS, value=WHISPER_MODELS[0], label="Whisper Model")
             whisper_lang = gr.Dropdown(choices=WHISPER_LANGUAGES, value="auto", label="Whisper Language")
             device_radio = gr.Radio(choices=["GPU", "CPU"], value="GPU", label="Device")
@@ -378,8 +365,8 @@ with Demo:
             # Toggle visibility based on checkbox
             diar_check.change(lambda e: gr.update(visible=not e), inputs=diar_check, outputs=out_w)
             diar_check.change(lambda e: gr.update(visible=e), inputs=diar_check, outputs=out_w_d)
-            btn_w = gr.Button("Transcribe with Whisper")
-            btn_w.click(fn=transcribe_whisper,
+            btn_w = gr.Button("Transcribe with Faster-Whisper")
+            btn_w.click(fn=transcribe_fwhisper,
                         inputs=[whisper_dd, whisper_lang, audio_input, device_radio, diar_check],
                         outputs=[out_w, out_w_d])
 
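Note: in the UI wiring above, two `.change` handlers on the same checkbox flip which output box is visible, and the button routes through the renamed dispatcher. A self-contained sketch of just that show/hide pattern; the component names mirror the diff, but the demo itself is illustrative.

```python
# Stand-alone sketch of the same Gradio pattern; gr.update(visible=...)
# patches a component's properties without recreating it.
import gradio as gr

with gr.Blocks() as demo:
    diar_check = gr.Checkbox(label="Enable diarization")
    out_w = gr.Textbox(label="Transcript", visible=True)
    out_w_d = gr.HTML(visible=False)
    diar_check.change(lambda e: gr.update(visible=not e), inputs=diar_check, outputs=out_w)
    diar_check.change(lambda e: gr.update(visible=e), inputs=diar_check, outputs=out_w_d)
```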