Spaces: Running on Zero
Commit: get HF_TOKEN from secret variables
app.py CHANGED
@@ -44,7 +44,6 @@ WHISPER_MODELS = [
     "momo103197/whisper-small-zh-TW-16",
     "k1nto/Belle-whisper-large-v3-zh-punct-ct2"
 ]
-
 SENSEVOICE_MODELS = [
     "FunAudioLLM/SenseVoiceSmall",
     "AXERA-TECH/SenseVoice",
@@ -55,14 +54,16 @@ SENSEVOICE_MODELS = [
 
 # —————— Language Options ——————
 WHISPER_LANGUAGES = [
-    "auto", "af","am","ar","as","az","ba","be","bg","bn","bo",
-    "…
+    "auto", "af","am","ar","as","az","ba","be","bg","bn","bo",
+    "br","bs","ca","cs","cy","da","de","el","en","es","et",
+    "eu","fa","fi","fo","fr","gl","gu","ha","haw","he","hi",
+    "hr","ht","hu","hy","id","is","it","ja","jw","ka","kk",
+    "km","kn","ko","la","lb","ln","lo","lt","lv","mg","mi",
+    "mk","ml","mn","mr","ms","mt","my","ne","nl","nn","no",
+    "oc","pa","pl","ps","pt","ro","ru","sa","sd","si","sk",
+    "sl","sn","so","sq","sr","su","sv","sw","ta","te","tg",
+    "th","tk","tl","tr","tt","uk","ur","uz","vi","yi","yo",
+    "zh","yue"
 ]
 SENSEVOICE_LANGUAGES = ["auto", "zh", "yue", "en", "ja", "ko", "nospeech"]
 
@@ -102,52 +103,57 @@ def get_sense_model(model_id: str):
 def get_diarization_pipe():
     global dar_pipe
     if dar_pipe is None:
+        # Pull token from environment (HF_TOKEN or HUGGINGFACE_TOKEN)
+        token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
         dar_pipe = DiarizationPipeline.from_pretrained(
             "pyannote/speaker-diarization-3.1",
-            use_auth_token=True
+            use_auth_token=token or True
         )
     return dar_pipe
 
 # —————— Transcription Functions ——————
-def transcribe_whisper(model_id: str,
-    …
+def transcribe_whisper(model_id: str,
+                       language: str,
+                       audio_path: str,
+                       device_sel: str,
+                       enable_diar: bool):
+    # select device: 0 for GPU, -1 for CPU
     use_gpu = (device_sel == "GPU" and torch.cuda.is_available())
     device = 0 if use_gpu else -1
     pipe = get_whisper_pipe(model_id, device)
     # full transcription
-    if language == "auto":
-        result = pipe(audio_path)
-    else:
-        result = pipe(audio_path, generate_kwargs={"language": language})
+    result = (pipe(audio_path) if language == "auto"
+              else pipe(audio_path, generate_kwargs={"language": language}))
     transcript = result.get("text", "").strip()
     diar_text = ""
-    # optional diarization
+    # optional speaker diarization
     if enable_diar:
         diarizer = get_diarization_pipe()
-        …
+        diary = diarizer(audio_path)
         snippets = []
-        for turn, _, speaker in …
-            start_ms = int(turn.start * …
-            end_ms = int(turn.end * 1000)
+        for turn, _, speaker in diary.itertracks(yield_label=True):
+            start_ms, end_ms = int(turn.start*1000), int(turn.end*1000)
             segment = AudioSegment.from_file(audio_path)[start_ms:end_ms]
             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                 segment.export(tmp.name, format="wav")
-            if language == "auto":
-                seg_out = pipe(tmp.name)
-            else:
-                seg_out = pipe(tmp.name, generate_kwargs={"language": language})
+            seg_out = (pipe(tmp.name) if language == "auto"
+                       else pipe(tmp.name, generate_kwargs={"language": language}))
             os.unlink(tmp.name)
-            …
-            snippets.append(f"[{speaker}] {…
+            text = seg_out.get("text", "").strip()
+            snippets.append(f"[{speaker}] {text}")
         diar_text = "\n".join(snippets)
     return transcript, diar_text
 
 @spaces.GPU
-def transcribe_sense(model_id: str, …
+def transcribe_sense(model_id: str,
+                     language: str,
+                     audio_path: str,
+                     enable_punct: bool,
+                     enable_diar: bool):
     model = get_sense_model(model_id)
-    # …
+    # no diarization
     if not enable_diar:
-        …
+        segs = model.generate(
             input=audio_path,
             cache={},
             language=language,
@@ -156,21 +162,20 @@ def transcribe_sense(model_id: str, language: str, audio_path: str, enable_punct
             merge_vad=True,
             merge_length_s=15,
         )
-        text = rich_transcription_postprocess(…
+        text = rich_transcription_postprocess(segs[0]['text'])
         if not enable_punct:
             text = re.sub(r"[^\w\s]", "", text)
         return text, ""
-    # with diarization
+    # with diarization
     diarizer = get_diarization_pipe()
-    …
-    for turn, _, speaker in …
-        start_ms = int(turn.start * …
-        end_ms = int(turn.end * 1000)
+    diary = diarizer(audio_path)
+    snippets = []
+    for turn, _, speaker in diary.itertracks(yield_label=True):
+        start_ms, end_ms = int(turn.start*1000), int(turn.end*1000)
         segment = AudioSegment.from_file(audio_path)[start_ms:end_ms]
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
             segment.export(tmp.name, format="wav")
-        …
+        segs = model.generate(
             input=tmp.name,
             cache={},
             language=language,
@@ -180,65 +185,52 @@ def transcribe_sense(model_id: str, language: str, audio_path: str, enable_punct
             merge_length_s=0,
         )
         os.unlink(tmp.name)
-        txt = rich_transcription_postprocess(…
+        txt = rich_transcription_postprocess(segs[0]['text'])
         if not enable_punct:
             txt = re.sub(r"[^\w\s]", "", txt)
-        …
-    # also return full non-diarized transcript for comparison
-    segments_full = model.generate(
+        snippets.append(f"[{speaker}] {txt}")
+    full = rich_transcription_postprocess(model.generate(
         input=audio_path,
         cache={},
         language=language,
         use_itn=True,
         batch_size_s=300,
         merge_vad=True,
-        merge_length_s=15
-    )
-    text_full = rich_transcription_postprocess(segments_full[0]['text'])
+        merge_length_s=15
+    )[0]['text'])
     if not enable_punct:
-        …
-    return …
+        full = re.sub(r"[^\w\s]", "", full)
+    return full, "\n".join(snippets)
 
 # —————— Gradio UI ——————
 demo = gr.Blocks()
 with demo:
-    gr.Markdown("## Whisper vs. SenseVoice (Language, Device & …
-    …
-    audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio Input")
-    …
+    gr.Markdown("## Whisper vs. SenseVoice (Language, Device & Diarization)")
+    audio_input = gr.Audio(sources=["upload","microphone"], type="filepath", label="Audio Input")
     with gr.Row():
-        # Whisper column
         with gr.Column():
             gr.Markdown("### Whisper ASR")
             whisper_dd = gr.Dropdown(choices=WHISPER_MODELS, value=WHISPER_MODELS[0], label="Whisper Model")
             whisper_lang = gr.Dropdown(choices=WHISPER_LANGUAGES, value="auto", label="Whisper Language")
-            device_radio = gr.Radio(choices=["GPU", …
-            diar_check = gr.Checkbox(label="Enable …
-            …
-            outputs=[out_whisper, out_whisper_diar]
-            )
+            device_radio = gr.Radio(choices=["GPU","CPU"], value="GPU", label="Device")
+            diar_check = gr.Checkbox(label="Enable Diarization")
+            btn_w = gr.Button("Transcribe with Whisper")
+            out_w = gr.Textbox(label="Transcript")
+            out_w_d = gr.Textbox(label="Diarized Transcript")
+            btn_w.click(fn=transcribe_whisper,
+                        inputs=[whisper_dd, whisper_lang, audio_input, device_radio, diar_check],
+                        outputs=[out_w, out_w_d])
-        # SenseVoice column
         with gr.Column():
             gr.Markdown("### FunASR SenseVoice ASR")
             sense_dd = gr.Dropdown(choices=SENSEVOICE_MODELS, value=SENSEVOICE_MODELS[0], label="SenseVoice Model")
             sense_lang = gr.Dropdown(choices=SENSEVOICE_LANGUAGES, value="auto", label="SenseVoice Language")
             punct = gr.Checkbox(label="Enable Punctuation", value=True)
-            …
-            outputs=[out_sense, out_sense_diar]
-            )
+            diar_s = gr.Checkbox(label="Enable Diarization")
+            btn_s = gr.Button("Transcribe with SenseVoice")
+            out_s = gr.Textbox(label="Transcript")
+            out_s_d = gr.Textbox(label="Diarized Transcript")
+            btn_s.click(fn=transcribe_sense,
+                        inputs=[sense_dd, sense_lang, audio_input, punct, diar_s],
+                        outputs=[out_s, out_s_d])
 if __name__ == "__main__":
     demo.launch()
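Note on the change itself: secrets defined in a Space's settings are exposed to the running app as environment variables, which is why a plain os.getenv is enough to pick up HF_TOKEN. The snippet below isolates the pattern; it assumes DiarizationPipeline is pyannote.audio's Pipeline (the import is outside this diff).

```python
import os
from pyannote.audio import Pipeline as DiarizationPipeline

# Space secrets surface as environment variables; check both common names.
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")

# pyannote/speaker-diarization-3.1 is a gated checkpoint, so credentials are
# required. `token or True` passes the token when one is set and otherwise
# falls back to True, i.e. "use whatever login huggingface_hub has cached".
dar_pipe = DiarizationPipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=token or True,
)
```

The old line passed bare True, which only works where someone has run huggingface-cli login; a deployed Space has no cached login, hence the switch to reading the secret.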
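Both transcribe functions now repeat the same diarize-then-transcribe loop. For reference, a minimal sketch of that pattern as a standalone helper; the name diarized_transcript and the asr callable are illustrative, not part of this commit, and unlike the committed loop it decodes the source file once rather than once per speaker turn:

```python
import os
import tempfile

from pydub import AudioSegment

def diarized_transcript(diarizer, asr, audio_path: str) -> str:
    """Label each speaker turn, slice it out, and transcribe it separately.

    `diarizer` is a pyannote pipeline; `asr` is any callable mapping a wav
    path to text (e.g. a wrapped transformers ASR pipeline).
    """
    audio = AudioSegment.from_file(audio_path)  # decode once, slice per turn
    snippets = []
    for turn, _, speaker in diarizer(audio_path).itertracks(yield_label=True):
        # pydub slices in milliseconds; turn.start/turn.end are in seconds
        clip = audio[int(turn.start * 1000):int(turn.end * 1000)]
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            clip.export(tmp.name, format="wav")
        try:
            snippets.append(f"[{speaker}] {asr(tmp.name)}")
        finally:
            os.unlink(tmp.name)  # always clean up the temp slice
    return "\n".join(snippets)
```

With such a helper, transcribe_whisper's diarization branch would reduce to one call with `lambda p: pipe(p).get("text", "").strip()` as the asr argument, and transcribe_sense's to one with its model.generate wrapper.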