Automatic Speech Recognition
Transformers
Safetensors
Japanese
whisper
audio
hf-asr-leaderboard
asahi417 committed on
Commit 59da090 · verified · 1 Parent(s): 58e589d

Update pipeline/kotoba_whisper.py

Files changed (1)
  1. pipeline/kotoba_whisper.py +4 -23
pipeline/kotoba_whisper.py CHANGED
@@ -249,6 +249,8 @@ class KotobaWhisperPipeline(AutomaticSpeechRecognitionPipeline):
         encoder = self.model.get_encoder()
         # Consume values so we can let extra information flow freely through
         # the pipeline (important for `partial` in microphone)
+        if type(return_timestamps) is not bool:
+            raise ValueError("return_timestamps should be bool")
         if "input_features" in model_inputs:
             inputs = model_inputs.pop("input_features")
         elif "input_values" in model_inputs:
@@ -260,18 +262,7 @@ class KotobaWhisperPipeline(AutomaticSpeechRecognitionPipeline):
             )
 
         # custom processing for Whisper timestamps and word-level timestamps
-        if return_timestamps:
-            generate_kwargs["return_timestamps"] = return_timestamps
-            if return_timestamps == "word":
-                generate_kwargs["return_token_timestamps"] = True
-                generate_kwargs["return_segments"] = True
-
-                if stride is not None:
-                    if isinstance(stride, tuple):
-                        generate_kwargs["num_frames"] = stride[0] // self.feature_extractor.hop_length
-                    else:
-                        generate_kwargs["num_frames"] = [s[0] // self.feature_extractor.hop_length for s in stride]
-
+        generate_kwargs["return_timestamps"] = True
         if inputs.shape[-1] > self.feature_extractor.nb_max_frames:
             generate_kwargs["input_features"] = inputs
         else:
@@ -279,17 +270,7 @@ class KotobaWhisperPipeline(AutomaticSpeechRecognitionPipeline):
 
         tokens = self.model.generate(attention_mask=attention_mask, **generate_kwargs)
         # whisper longform generation stores timestamps in "segments"
-        if return_timestamps == "word":
-            if "segments" not in tokens:
-                out = {"tokens": tokens["sequences"], "token_timestamps": tokens["token_timestamps"]}
-            else:
-                token_timestamps = [
-                    torch.cat([segment["token_timestamps"] for segment in segment_list])
-                    for segment_list in tokens["segments"]
-                ]
-                out = {"tokens": tokens["sequences"], "token_timestamps": token_timestamps}
-        else:
-            out = {"tokens": tokens}
+        out = {"tokens": tokens}
         if self.type == "seq2seq_whisper":
             if stride is not None:
                 out["stride"] = stride