Automatic Speech Recognition
Transformers
Safetensors
Japanese
whisper
audio
hf-asr-leaderboard
asahi417 committed on
Commit 59da090 · verified · 1 Parent(s): 58e589d

Update pipeline/kotoba_whisper.py

Files changed (1)
  1. pipeline/kotoba_whisper.py +4 -23
pipeline/kotoba_whisper.py CHANGED
@@ -249,6 +249,8 @@ class KotobaWhisperPipeline(AutomaticSpeechRecognitionPipeline):
         encoder = self.model.get_encoder()
         # Consume values so we can let extra information flow freely through
         # the pipeline (important for `partial` in microphone)
+        if type(return_timestamps) is not bool:
+            raise ValueError("return_timestamps should be bool")
         if "input_features" in model_inputs:
             inputs = model_inputs.pop("input_features")
         elif "input_values" in model_inputs:
@@ -260,18 +262,7 @@ class KotobaWhisperPipeline(AutomaticSpeechRecognitionPipeline):
             )
 
         # custom processing for Whisper timestamps and word-level timestamps
-        if return_timestamps:
-            generate_kwargs["return_timestamps"] = return_timestamps
-            if return_timestamps == "word":
-                generate_kwargs["return_token_timestamps"] = True
-                generate_kwargs["return_segments"] = True
-
-                if stride is not None:
-                    if isinstance(stride, tuple):
-                        generate_kwargs["num_frames"] = stride[0] // self.feature_extractor.hop_length
-                    else:
-                        generate_kwargs["num_frames"] = [s[0] // self.feature_extractor.hop_length for s in stride]
-
+        generate_kwargs["return_timestamps"] = True
         if inputs.shape[-1] > self.feature_extractor.nb_max_frames:
             generate_kwargs["input_features"] = inputs
         else:
@@ -279,17 +270,7 @@ class KotobaWhisperPipeline(AutomaticSpeechRecognitionPipeline):
 
         tokens = self.model.generate(attention_mask=attention_mask, **generate_kwargs)
         # whisper longform generation stores timestamps in "segments"
-        if return_timestamps == "word":
-            if "segments" not in tokens:
-                out = {"tokens": tokens["sequences"], "token_timestamps": tokens["token_timestamps"]}
-            else:
-                token_timestamps = [
-                    torch.cat([segment["token_timestamps"] for segment in segment_list])
-                    for segment_list in tokens["segments"]
-                ]
-                out = {"tokens": tokens["sequences"], "token_timestamps": token_timestamps}
-        else:
-            out = {"tokens": tokens}
+        out = {"tokens": tokens}
         if self.type == "seq2seq_whisper":
             if stride is not None:
                 out["stride"] = stride