Luigi committed
Commit c5bcdb3 · 1 Parent(s): 38f97a7

get HF_TOKEN from secret variables
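
In a nutshell: instead of a hard-coded `use_auth_token=True`, the diarization loader now reads the gated-model token from the environment, where Spaces expose secret variables. A minimal sketch of the pattern, mirroring the diff below (the `or True` fallback should keep the old behavior of using a locally cached `huggingface-cli login` token when no secret is set):

```python
import os
from pyannote.audio import Pipeline as DiarizationPipeline

# Prefer a token injected via Space secrets (exposed as env vars);
# fall back to True so huggingface_hub can use a cached login token.
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
pipe = DiarizationPipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=token or True,
)
```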

Files changed (1)
  1. app.py +67 -75
app.py CHANGED
@@ -44,7 +44,6 @@ WHISPER_MODELS = [
     "momo103197/whisper-small-zh-TW-16",
     "k1nto/Belle-whisper-large-v3-zh-punct-ct2"
 ]
-
 SENSEVOICE_MODELS = [
     "FunAudioLLM/SenseVoiceSmall",
     "AXERA-TECH/SenseVoice",
@@ -55,14 +54,16 @@ SENSEVOICE_MODELS = [
 
 # —————— Language Options ——————
 WHISPER_LANGUAGES = [
-    "auto", "af","am","ar","as","az","ba","be","bg","bn","bo","br","bs","ca",
-    "cs","cy","da","de","el","en","es","et","eu","fa","fi","fo","fr",
-    "gl","gu","ha","haw","he","hi","hr","ht","hu","hy","id","is","it",
-    "ja","jw","ka","kk","km","kn","ko","la","lb","ln","lo","lt","lv",
-    "mg","mi","mk","ml","mn","mr","ms","mt","my","ne","nl","nn","no",
-    "oc","pa","pl","ps","pt","ro","ru","sa","sd","si","sk","sl","sn",
-    "so","sq","sr","su","sv","sw","ta","te","tg","th","tk","tl","tr",
-    "tt","uk","ur","uz","vi","yi","yo","zh","yue"
+    "auto", "af","am","ar","as","az","ba","be","bg","bn","bo",
+    "br","bs","ca","cs","cy","da","de","el","en","es","et",
+    "eu","fa","fi","fo","fr","gl","gu","ha","haw","he","hi",
+    "hr","ht","hu","hy","id","is","it","ja","jw","ka","kk",
+    "km","kn","ko","la","lb","ln","lo","lt","lv","mg","mi",
+    "mk","ml","mn","mr","ms","mt","my","ne","nl","nn","no",
+    "oc","pa","pl","ps","pt","ro","ru","sa","sd","si","sk",
+    "sl","sn","so","sq","sr","su","sv","sw","ta","te","tg",
+    "th","tk","tl","tr","tt","uk","ur","uz","vi","yi","yo",
+    "zh","yue"
 ]
 SENSEVOICE_LANGUAGES = ["auto", "zh", "yue", "en", "ja", "ko", "nospeech"]
 
@@ -102,52 +103,57 @@ def get_sense_model(model_id: str):
 def get_diarization_pipe():
     global dar_pipe
     if dar_pipe is None:
+        # Pull token from environment (HF_TOKEN or HUGGINGFACE_TOKEN)
+        token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
         dar_pipe = DiarizationPipeline.from_pretrained(
             "pyannote/speaker-diarization-3.1",
-            use_auth_token=True
+            use_auth_token=token or True
         )
     return dar_pipe
 
 # —————— Transcription Functions ——————
-def transcribe_whisper(model_id: str, language: str, audio_path: str, device_sel: str, enable_diar: bool):
-    # select device for Whisper
+def transcribe_whisper(model_id: str,
+                       language: str,
+                       audio_path: str,
+                       device_sel: str,
+                       enable_diar: bool):
+    # select device: 0 for GPU, -1 for CPU
     use_gpu = (device_sel == "GPU" and torch.cuda.is_available())
     device = 0 if use_gpu else -1
     pipe = get_whisper_pipe(model_id, device)
     # full transcription
-    if language == "auto":
-        result = pipe(audio_path)
-    else:
-        result = pipe(audio_path, generate_kwargs={"language": language})
+    result = (pipe(audio_path) if language == "auto"
+              else pipe(audio_path, generate_kwargs={"language": language}))
    transcript = result.get("text", "").strip()
     diar_text = ""
-    # optional diarization for Whisper
+    # optional speaker diarization
     if enable_diar:
         diarizer = get_diarization_pipe()
-        diarization = diarizer(audio_path)
+        diary = diarizer(audio_path)
         snippets = []
-        for turn, _, speaker in diarization.itertracks(yield_label=True):
-            start_ms = int(turn.start * 1000)
-            end_ms = int(turn.end * 1000)
+        for turn, _, speaker in diary.itertracks(yield_label=True):
+            start_ms, end_ms = int(turn.start*1000), int(turn.end*1000)
             segment = AudioSegment.from_file(audio_path)[start_ms:end_ms]
             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                 segment.export(tmp.name, format="wav")
-            if language == "auto":
-                seg_out = pipe(tmp.name)
-            else:
-                seg_out = pipe(tmp.name, generate_kwargs={"language": language})
+            seg_out = (pipe(tmp.name) if language == "auto"
+                       else pipe(tmp.name, generate_kwargs={"language": language}))
            os.unlink(tmp.name)
-            txt = seg_out.get("text", "").strip()
-            snippets.append(f"[{speaker}] {txt}")
+            text = seg_out.get("text", "").strip()
+            snippets.append(f"[{speaker}] {text}")
         diar_text = "\n".join(snippets)
     return transcript, diar_text
 
 @spaces.GPU
-def transcribe_sense(model_id: str, language: str, audio_path: str, enable_punct: bool, enable_diar: bool):
+def transcribe_sense(model_id: str,
+                     language: str,
+                     audio_path: str,
+                     enable_punct: bool,
+                     enable_diar: bool):
     model = get_sense_model(model_id)
-    # if no diarization, full file
+    # no diarization
     if not enable_diar:
-        segments = model.generate(
+        segs = model.generate(
             input=audio_path,
             cache={},
             language=language,
@@ -156,21 +162,20 @@ def transcribe_sense(model_id: str, language: str, audio_path: str, enable_punct
             merge_vad=True,
             merge_length_s=15,
         )
-        text = rich_transcription_postprocess(segments[0]['text'])
+        text = rich_transcription_postprocess(segs[0]['text'])
         if not enable_punct:
             text = re.sub(r"[^\w\s]", "", text)
         return text, ""
-    # with diarization: split by speaker
+    # with diarization
     diarizer = get_diarization_pipe()
-    diarization = diarizer(audio_path)
-    speaker_snippets = []
-    for turn, _, speaker in diarization.itertracks(yield_label=True):
-        start_ms = int(turn.start * 1000)
-        end_ms = int(turn.end * 1000)
+    diary = diarizer(audio_path)
+    snippets = []
+    for turn, _, speaker in diary.itertracks(yield_label=True):
+        start_ms, end_ms = int(turn.start*1000), int(turn.end*1000)
         segment = AudioSegment.from_file(audio_path)[start_ms:end_ms]
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            segment.export(tmp.name, format="wav")
-        segments = model.generate(
+        segs = model.generate(
            input=tmp.name,
            cache={},
            language=language,
@@ -180,65 +185,52 @@ def transcribe_sense(model_id: str, language: str, audio_path: str, enable_punct
            merge_length_s=0,
        )
        os.unlink(tmp.name)
-        txt = rich_transcription_postprocess(segments[0]['text'])
+        txt = rich_transcription_postprocess(segs[0]['text'])
        if not enable_punct:
            txt = re.sub(r"[^\w\s]", "", txt)
-        speaker_snippets.append(f"[{speaker}] {txt}")
-    full_text = "\n".join(speaker_snippets)
-    # also return full non-diarized transcript for comparison
-    segments_full = model.generate(
+        snippets.append(f"[{speaker}] {txt}")
+    full = rich_transcription_postprocess(model.generate(
        input=audio_path,
        cache={},
        language=language,
        use_itn=True,
        batch_size_s=300,
        merge_vad=True,
-        merge_length_s=15,
-    )
-    text_full = rich_transcription_postprocess(segments_full[0]['text'])
+        merge_length_s=15
+    )[0]['text'])
    if not enable_punct:
-        text_full = re.sub(r"[^\w\s]", "", text_full)
-    return text_full, full_text
+        full = re.sub(r"[^\w\s]", "", full)
+    return full, "\n".join(snippets)
 
 # —————— Gradio UI ——————
 demo = gr.Blocks()
 with demo:
-    gr.Markdown("## Whisper vs. SenseVoice (Language, Device & Speaker Diarization)")
-
-    audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio Input")
-
+    gr.Markdown("## Whisper vs. SenseVoice (Language, Device & Diarization)")
+    audio_input = gr.Audio(sources=["upload","microphone"], type="filepath", label="Audio Input")
     with gr.Row():
-        # Whisper column
         with gr.Column():
            gr.Markdown("### Whisper ASR")
            whisper_dd = gr.Dropdown(choices=WHISPER_MODELS, value=WHISPER_MODELS[0], label="Whisper Model")
            whisper_lang = gr.Dropdown(choices=WHISPER_LANGUAGES, value="auto", label="Whisper Language")
-            device_radio = gr.Radio(choices=["GPU", "CPU"], value="GPU", label="Device")
-            diar_check = gr.Checkbox(label="Enable Speaker Diarization", value=False)
-            whisper_btn = gr.Button("Transcribe with Whisper")
-            out_whisper = gr.Textbox(label="Transcript")
-            out_whisper_diar = gr.Textbox(label="Diarized Transcript")
-            whisper_btn.click(
-                fn=transcribe_whisper,
-                inputs=[whisper_dd, whisper_lang, audio_input, device_radio, diar_check],
-                outputs=[out_whisper, out_whisper_diar]
-            )
-
-        # SenseVoice column
+            device_radio = gr.Radio(choices=["GPU","CPU"], value="GPU", label="Device")
+            diar_check = gr.Checkbox(label="Enable Diarization")
+            btn_w = gr.Button("Transcribe with Whisper")
+            out_w = gr.Textbox(label="Transcript")
+            out_w_d = gr.Textbox(label="Diarized Transcript")
+            btn_w.click(fn=transcribe_whisper,
+                        inputs=[whisper_dd, whisper_lang, audio_input, device_radio, diar_check],
+                        outputs=[out_w, out_w_d])
        with gr.Column():
            gr.Markdown("### FunASR SenseVoice ASR")
            sense_dd = gr.Dropdown(choices=SENSEVOICE_MODELS, value=SENSEVOICE_MODELS[0], label="SenseVoice Model")
            sense_lang = gr.Dropdown(choices=SENSEVOICE_LANGUAGES, value="auto", label="SenseVoice Language")
            punct = gr.Checkbox(label="Enable Punctuation", value=True)
-            diar_sense = gr.Checkbox(label="Enable Speaker Diarization", value=False)
-            sense_btn = gr.Button("Transcribe with SenseVoice")
-            out_sense = gr.Textbox(label="Transcript")
-            out_sense_diar = gr.Textbox(label="Diarized Transcript")
-            sense_btn.click(
-                fn=transcribe_sense,
-                inputs=[sense_dd, sense_lang, audio_input, punct, diar_sense],
-                outputs=[out_sense, out_sense_diar]
-            )
-
+            diar_s = gr.Checkbox(label="Enable Diarization")
+            btn_s = gr.Button("Transcribe with SenseVoice")
+            out_s = gr.Textbox(label="Transcript")
+            out_s_d = gr.Textbox(label="Diarized Transcript")
+            btn_s.click(fn=transcribe_sense,
+                        inputs=[sense_dd, sense_lang, audio_input, punct, diar_s],
+                        outputs=[out_s, out_s_d])
 if __name__ == "__main__":
     demo.launch()
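
One note on the accompanying refactor: both transcription paths now share the same per-speaker pattern (diarize with pyannote, slice each turn with pydub, transcribe each slice). A minimal standalone sketch of that loop, with `run_asr` as a hypothetical callable standing in for either the Whisper pipeline or SenseVoice; unlike the committed code, which reopens the audio file on every turn, this sketch loads it once:

```python
import os
import tempfile

from pydub import AudioSegment

def transcribe_per_speaker(diarization, audio_path: str, run_asr) -> str:
    """Slice audio_path along diarization turns and label each transcript."""
    audio = AudioSegment.from_file(audio_path)  # load once, slice per turn
    snippets = []
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        clip = audio[int(turn.start * 1000):int(turn.end * 1000)]  # ms slicing
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            clip.export(tmp.name, format="wav")
        try:
            snippets.append(f"[{speaker}] {run_asr(tmp.name).strip()}")
        finally:
            os.unlink(tmp.name)
    return "\n".join(snippets)
```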