voidful committed on
Commit 5fc5a97 · verified · 1 Parent(s): 32e0cd2

Update processing_gemma3_omni.py

Files changed (1)
  1. processing_gemma3_omni.py  +187 -439
processing_gemma3_omni.py CHANGED
@@ -1,16 +1,16 @@
  import re
- from typing import List, Optional, Union, Dict, Any, Tuple
+ from typing import List, Optional, Union, Dict, Any
 
  import math
  import numpy as np
  import scipy.signal
  import torch
  from torch.nn.utils.rnn import pad_sequence
- from transformers.audio_utils import AudioInput  # type: ignore
+ from transformers.audio_utils import AudioInput
  from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
  from transformers.feature_extraction_utils import BatchFeature
  from transformers.image_utils import make_nested_list_of_images
- from transformers.processing_utils import ProcessorMixin, ProcessingKwargs, ImagesKwargs
+ from transformers.processing_utils import ProcessorMixin, ProcessingKwargs, ImagesKwargs, Unpack
  from transformers.utils import TensorType, to_py_obj, logging
 
  # Constants
@@ -25,7 +25,6 @@ DEFAULT_FEAT_STRIDE = 4
  IMAGE_TOKEN_PATTERN = r"<\|image_\d+\|>"
  AUDIO_TOKEN_PATTERN = r"<\|audio_\d+\|>"
  DEFAULT_MAX_LENGTH = 16384
- LOG_MEL_CLIP_EPSILON = 1e-5
 
  logger = logging.get_logger(__name__)
 
@@ -33,41 +32,25 @@ logger = logging.get_logger(__name__)
  def create_mel_filterbank(sampling_rate: int, n_fft: int, n_mels: int, fmin: float = 0.0,
                            fmax: Optional[float] = None) -> np.ndarray:
      """Create Mel filterbank for audio processing."""
-     fmax = fmax or sampling_rate / 2.0
+     fmax = fmax or sampling_rate / 2
 
      def hz_to_mel(f: float) -> float:
          return 1127.0 * math.log(1 + f / 700.0)
 
-     if fmin >= fmax:
-         raise ValueError(f"fmin ({fmin}) must be smaller than fmax ({fmax}).")
-
      mel_points = np.linspace(hz_to_mel(fmin), hz_to_mel(fmax), n_mels + 2)
      freq_points = 700.0 * (np.exp(mel_points / 1127.0) - 1)
-     freq_points = np.clip(freq_points, 0, sampling_rate / 2.0)
      bins = np.floor((n_fft + 1) * freq_points / sampling_rate).astype(int)
-     bins = np.clip(bins, 0, n_fft // 2)
 
      filterbank = np.zeros((n_mels, n_fft // 2 + 1), dtype=np.float32)
-     for m_idx in range(n_mels):
-         left, center, right = bins[m_idx], bins[m_idx + 1], bins[m_idx + 2]
-
-         # Robust triangular filter creation from your version
-         # (small adjustment to ensure slopes are only added if points are distinct)
-         if center > left:
-             filterbank[m_idx, left:center] = (np.arange(left, center) - left) / (center - left)
-         if right > center:
-             filterbank[m_idx, center:right] = (right - np.arange(center, right)) / (right - center)
-         # Ensure peak is 1.0 if center is a valid point, particularly if left=center or center=right
-         # This covers the case where a slope might not set the peak to 1 due to integer arithmetic.
-         if left <= center <= right and ((center > left and center <= right) or (center < right and center >= left)):
-             filterbank[m_idx, center] = 1.0
+     for m in range(1, n_mels + 1):
+         left, center, right = bins[m - 1:m + 2]
+         filterbank[m - 1, left:center] = (np.arange(left, center) - left) / (center - left)
+         filterbank[m - 1, center:right] = (right - np.arange(center, right)) / (right - center)
 
      return filterbank
 
 
  class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
-     model_input_names = ["audio_values", "audio_attention_mask"]
-
      def __init__(
          self,
          compression_rate: int = DEFAULT_COMPRESSION_RATE,
@@ -75,182 +58,89 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
          feat_stride: int = DEFAULT_FEAT_STRIDE,
          sampling_rate: int = DEFAULT_SAMPLING_RATE,
          n_fft: int = DEFAULT_N_FFT,
-         win_length: Optional[int] = None,
-         hop_length: Optional[int] = None,
+         win_length: int = DEFAULT_WIN_LENGTH,
+         hop_length: int = DEFAULT_HOP_LENGTH,
          n_mels: int = DEFAULT_N_MELS,
-         f_min: float = 0.0,
-         f_max: Optional[float] = None,
-         padding_value: float = 0.0,
          **kwargs
      ):
-         # Pop these before super().__init__ as they might conflict if also in kwargs
-         # and super() doesn't expect them, or if super() expects them but under different names.
-         # However, feature_size, sampling_rate, padding_value ARE arguments for SequenceFeatureExtractor.
-         # So, ensure they are passed correctly.
-         _feature_size = n_mels
-         _sampling_rate = sampling_rate
-         _padding_value = padding_value
-
-         # Remove them from kwargs if they were also passed via kwargs to avoid duplicate argument error
          kwargs.pop("feature_size", None)
          kwargs.pop("sampling_rate", None)
          kwargs.pop("padding_value", None)
 
-         _win_length = win_length if win_length is not None else n_fft
-         _hop_length = hop_length if hop_length is not None else _win_length // 4
-
          super().__init__(
-             feature_size=_feature_size,
-             sampling_rate=_sampling_rate,
-             padding_value=_padding_value,
+             feature_size=n_mels,
+             sampling_rate=sampling_rate,
+             padding_value=0.0,
              **kwargs
          )
 
          self.compression_rate = compression_rate
         self.qformer_rate = qformer_rate
          self.feat_stride = feat_stride
-         # self.sampling_rate is set by super()
+         self.sampling_rate = sampling_rate
 
+         self.window = np.hamming(win_length).astype(np.float32)
+         self.mel_filterbank = create_mel_filterbank(sampling_rate, n_fft, n_mels).T
          self.n_fft = n_fft
-         self.win_length = _win_length
-         self.hop_length = _hop_length
-         self.n_mels = n_mels
-         self.f_min = f_min
-         self.f_max = f_max
-
-         if self.win_length > self.n_fft:
-             logger.warning(
-                 f"win_length ({self.win_length}) is greater than n_fft ({self.n_fft}). "
-                 "Window will be applied, then data will be zero-padded/truncated to n_fft by np.fft.rfft."
-             )
-         self.window = np.hamming(self.win_length).astype(np.float32)
-         self.mel_filterbank = create_mel_filterbank(
-             self.sampling_rate, self.n_fft, self.n_mels, fmin=self.f_min, fmax=self.f_max
-         ).T
+         self.hop_length = hop_length
+         self.win_length = win_length
 
      def __call__(
          self,
-         audios: Union[AudioInput, List[AudioInput]],
-         sampling_rate: Optional[int] = None,
+         audios: List[AudioInput],
          return_tensors: Union[TensorType, str, None] = TensorType.PYTORCH
      ) -> BatchFeature:
+         features, sizes, frames = [], [], []
 
-         if not isinstance(audios, list):
-             audios = [audios]
-
-         processed_mels: List[torch.Tensor] = []
-         actual_mel_lengths: List[int] = []
-         sizes_for_embed_length: List[torch.Tensor] = []
-         frames_scaled_by_feat_stride: List[int] = []
-
-         for audio_item in audios:
-             current_wav: np.ndarray
-             source_sr: int
-
-             if isinstance(audio_item, tuple) and len(audio_item) == 2 and isinstance(audio_item[1], int):
-                 current_wav, source_sr = audio_item
-                 current_wav = np.asarray(current_wav, dtype=np.float32)
-             elif isinstance(audio_item, (np.ndarray, list)):
-                 current_wav = np.asarray(audio_item, dtype=np.float32)
-                 if sampling_rate is None:
-                     raise ValueError(
-                         "sampling_rate must be provided if audio inputs are raw numpy arrays or lists without sr."
-                     )
-                 source_sr = sampling_rate
-             else:
-                 raise TypeError(
-                     f"Unsupported audio input type: {type(audio_item)}. "
-                     "Expected np.ndarray, list of floats, or Tuple[np.ndarray, int]."
-                 )
-
-             processed_wav_array = self._preprocess_audio(current_wav, source_sr)
-             mel_spectrogram = self._compute_log_mel_spectrogram(processed_wav_array)
-
-             feature_tensor = torch.from_numpy(mel_spectrogram)
-             processed_mels.append(feature_tensor)
-             actual_mel_lengths.append(feature_tensor.shape[0])
-
-             sizes_for_embed_length.append(torch.tensor(self._calculate_embed_length(feature_tensor.shape[0])))
-             frames_scaled_by_feat_stride.append(feature_tensor.shape[0] * self.feat_stride)
-
-         audio_embeds = pad_sequence(processed_mels, batch_first=True, padding_value=self.padding_value)
-
-         max_t_mel_in_batch = audio_embeds.shape[1]
-
-         attention_mask = torch.zeros(len(audios), max_t_mel_in_batch,
-                                      dtype=torch.bool)  # Device handled by BatchFeature
-         for i, length in enumerate(actual_mel_lengths):
-             attention_mask[i, :length] = True
+         for wav in audios:
+             processed_wav = self._preprocess_audio(wav, 22500)
+             mel_spectrogram = self._compute_log_mel_spectrogram(processed_wav)
+             feature_tensor = torch.tensor(mel_spectrogram, dtype=torch.float32)
+             features.append(feature_tensor)
+             sizes.append(torch.tensor(self._calculate_embed_length(feature_tensor.shape[0])))
+             frames.append(feature_tensor.shape[0] * self.feat_stride)
+
+         audio_embeds = pad_sequence(features, batch_first=True)
+         size_tensor = torch.stack(sizes)
+
+         attention_mask = None
+         if len(audios) > 1:
+             frame_lengths = torch.tensor(frames)
+             attention_mask = torch.arange(frame_lengths.max()).unsqueeze(0) < frame_lengths.unsqueeze(1)
 
          output_data = {
              "audio_values": audio_embeds,
-             "audio_attention_mask": attention_mask
+             "audio_values_sizes": size_tensor
          }
-
-         if sizes_for_embed_length:
-             output_data["audio_values_sizes"] = torch.stack(sizes_for_embed_length)
-
-         logger.debug(
-             f"Gemma3AudioFeatureExtractor: Output 'audio_values' shape: {output_data['audio_values'].shape}")  # Verify output
+         if attention_mask is not None:
+             output_data["audio_attention_mask"] = attention_mask
 
          return BatchFeature(data=output_data, tensor_type=return_tensors)
 
      def _preprocess_audio(self, wav: np.ndarray, source_sr: int) -> np.ndarray:
-         if wav.dtype not in [np.float32, np.float64]:
-             if np.issubdtype(wav.dtype, np.integer):
-                 max_val = np.iinfo(wav.dtype).max if wav.size > 0 else 1.0
-                 wav = wav.astype(np.float32) / max_val
-             else:
-                 wav = wav.astype(np.float32)
-         elif wav.dtype == np.float64:
-             wav = wav.astype(np.float32)
-
+         wav = torch.as_tensor(wav).float().numpy()
          if wav.ndim > 1:
              wav = wav.mean(axis=0)
-
          if source_sr != self.sampling_rate:
-             # logger.info(f"Resampling audio from {source_sr} Hz to {self.sampling_rate} Hz.")  # logger might not be defined if this class is used standalone
-             common_divisor = math.gcd(self.sampling_rate, source_sr)
-             up_factor = self.sampling_rate // common_divisor
-             down_factor = source_sr // common_divisor
-             if up_factor != down_factor:
-                 wav = scipy.signal.resample_poly(wav, up=up_factor, down=down_factor)
-
-         max_abs_val = np.abs(wav).max()
-         if max_abs_val > 1e-7:
-             wav = wav / max_abs_val
-         return wav
+             wav = scipy.signal.resample_poly(wav, self.sampling_rate, source_sr)
+         return wav / max(np.abs(wav).max(), 1e-6)
 
      def _compute_log_mel_spectrogram(self, wav: np.ndarray) -> np.ndarray:
-         if len(wav) < self.win_length:
-             padding = self.win_length - len(wav)
-             wav = np.pad(wav, (0, padding), mode='constant', constant_values=0.0)
-
-         if len(wav) >= self.win_length:
-             num_frames = 1 + (len(wav) - self.win_length) // self.hop_length
-         else:
-             num_frames = 0
-
-         if num_frames <= 0:
-             # logger.warning(...)  # logger might not be defined
-             return np.zeros((0, self.n_mels), dtype=np.float32)
-
-         frames_view = np.lib.stride_tricks.as_strided(
+         frame_count = 1 + (len(wav) - self.win_length) // self.hop_length
+         strides = wav.strides[0]
+         frames = np.lib.stride_tricks.as_strided(
              wav,
-             shape=(num_frames, self.win_length),
-             strides=(wav.strides[0] * self.hop_length, wav.strides[0]),
+             shape=(frame_count, self.win_length),
+             strides=(strides * self.hop_length, strides),
              writeable=False
-         )
-         frames_data = frames_view.copy()
-         frames_data *= self.window
+         ).copy()
+         frames *= self.window
 
-         spectrum = np.fft.rfft(frames_data, n=self.n_fft, axis=-1).astype(np.complex64)
+         spectrum = np.fft.rfft(frames, n=self.n_fft).astype(np.complex64)
          power = np.abs(spectrum) ** 2
          mel_spectrogram = np.dot(power, self.mel_filterbank)
-         mel_spectrogram = np.clip(mel_spectrogram, LOG_MEL_CLIP_EPSILON, None)
-         log_mel_spectrogram = np.log(mel_spectrogram)
-
-         return log_mel_spectrogram.astype(np.float32)
+         mel_spectrogram = np.clip(mel_spectrogram, 1.0, None)
+         return np.log(mel_spectrogram, dtype=np.float32)
 
      def _calculate_embed_length(self, frame_count: int) -> int:
          compressed = math.ceil(frame_count / self.compression_rate)
@@ -266,9 +156,8 @@ class Gemma3ImagesKwargs(ImagesKwargs):
 
 
  class Gemma3ProcessorKwargs(ProcessingKwargs, total=False):
-     images_kwargs: Optional[Dict[str, Any]] = None
-     audio_kwargs: Optional[Dict[str, Any]] = None
-     text_kwargs: Optional[Dict[str, Any]] = None
+     images_kwargs: Dict[str, Any]
+     audio_kwargs: Dict[str, Any]
      _defaults = {
          "text_kwargs": {"padding": False, "truncation": False, "max_length": DEFAULT_MAX_LENGTH},
          "images_kwargs": {},
@@ -279,23 +168,38 @@ class Gemma3ProcessorKwargs(ProcessingKwargs, total=False):
  class Gemma3OmniProcessor(ProcessorMixin):
      attributes = ["image_processor", "audio_processor", "tokenizer"]
      valid_kwargs = ["chat_template", "image_seq_length"]
-
-     # --- CRITICAL FIX: Use STRING names for auto-loading by ProcessorMixin ---
      image_processor_class = "AutoImageProcessor"
-     audio_processor_class = "AutoFeatureExtractor"  # Must match the class name string
+     audio_processor_class = "AutoFeatureExtractor"
      tokenizer_class = "AutoTokenizer"
 
      def __init__(
          self,
-         image_processor=None,
-         audio_processor=None,
-         tokenizer=None,
+         image_processor,
+         audio_processor,
+         tokenizer,
          chat_template=None,
          image_seq_length: int = 256,
-         **kwargs  # Catch-all for other potential superclass args or future additions
+         **kwargs
      ):
-         # ProcessorMixin.__init__ handles instantiation of image_processor, audio_processor, tokenizer
-         # if they are None, using the *_class attributes.
+         self.image_seq_length = image_seq_length
+         self.image_token_id = tokenizer.image_token_id
+         self.boi_token = tokenizer.boi_token
+         self.image_token = tokenizer.image_token
+         self.audio_token = "<audio_soft_token>"
+         self.expected_audio_token_id = 262143
+         self.full_image_sequence = f"\n\n{tokenizer.boi_token}{''.join([tokenizer.image_token] * image_seq_length)}{tokenizer.eoi_token}\n\n"
+
+         self.compression_rate = 8
+         self.qformer_compression_rate = 1
+         self.feat_stride = 1
+
+         self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token)
+         if self.audio_token_id != self.expected_audio_token_id:
+             logger.warning(
+                 f"Assigned ID {self.audio_token_id} for '{self.audio_token}' does not match expected ID {self.expected_audio_token_id}. "
+                 "Using assigned ID. Model embedding layer may need resizing."
+             )
+
          super().__init__(
              image_processor=image_processor,
              audio_processor=audio_processor,
@@ -304,281 +208,136 @@ class Gemma3OmniProcessor(ProcessorMixin):
              **kwargs
          )
 
-         # Attributes dependent on an instantiated tokenizer.
-         # self.tokenizer should be populated by super().__init__ by this point.
-         self.image_seq_length = image_seq_length
-         if self.tokenizer is not None:
-             self.image_token_id = getattr(self.tokenizer, "image_token_id",
-                                           self.tokenizer.unk_token_id if hasattr(self.tokenizer,
-                                                                                  "unk_token_id") else None)
-             self.boi_token = getattr(self.tokenizer, "boi_token", "<UNUSED_BOI>")
-             self.image_token = getattr(self.tokenizer, "image_token", "<UNUSED_IMG_TOKEN>")
-             self.eoi_token = getattr(self.tokenizer, "eoi_token", "<UNUSED_EOI>")
-
-             # User's original audio token attributes
-             self.audio_token_str_from_user_code = "<audio_soft_token>"  # From user's original code
-             # self.expected_audio_token_id = 262143  # User's reference
-
-             self.audio_token_id = self.tokenizer.convert_tokens_to_ids(self.audio_token_str_from_user_code)
-             # User's original warning logic for audio_token_id
-             # if self.audio_token_id != self.expected_audio_token_id:  # Comparing to a fixed ID
-             #     logger.warning(f"Assigned ID {self.audio_token_id} for '{self.audio_token_str_from_user_code}' does not match expected ID {self.expected_audio_token_id}.")
-             if hasattr(self.tokenizer, "unk_token_id") and self.audio_token_id == self.tokenizer.unk_token_id:
-                 logger.warning(
-                     f"Audio token '{self.audio_token_str_from_user_code}' not found in tokenizer, maps to UNK. Ensure it's added as a special token.")
-
-             self.full_image_sequence = f"\n\n{self.boi_token}{''.join([self.image_token] * image_seq_length)}{self.eoi_token}\n\n"
-         else:
-             # This case should ideally not happen if from_pretrained works correctly.
-             logger.error(
-                 "Gemma3OmniProcessor initialized, but tokenizer is None. Token-dependent attributes will be missing or use placeholders.")
-             self.image_token_id = None
-             self.boi_token = "<UNUSED_BOI>"
-             self.image_token = "<UNUSED_IMG_TOKEN>"
-             self.eoi_token = "<UNUSED_EOI>"
-             self.audio_token_str_from_user_code = "<audio_soft_token>"
-             self.audio_token_id = -1
-             self.full_image_sequence = ""
-
-         # These are parameters for this processor's logic of determining audio token sequence length for prompts
-         # They were fixed values in user's original __init__
-         self.prompt_audio_compression_rate = 8
-         self.prompt_audio_qformer_compression_rate = 1
-         self.prompt_audio_feat_stride = 1
-
-     def _merge_kwargs(self, KwargsClassWithDefaults, tokenizer_init_kwargs, **kwargs_from_call):
-         final_kwargs = {}
-         _defaults = getattr(KwargsClassWithDefaults, "_defaults", {})
-         if not isinstance(_defaults, dict): _defaults = {}
-
-         for modality_key, default_modality_kwargs in _defaults.items():
-             final_kwargs[modality_key] = default_modality_kwargs.copy()
-
-         for modality_key_in_call, modality_kwargs_in_call in kwargs_from_call.items():
-             if modality_key_in_call in final_kwargs:
-                 if isinstance(modality_kwargs_in_call, dict):
-                     final_kwargs[modality_key_in_call].update(modality_kwargs_in_call)
-             elif isinstance(modality_kwargs_in_call, dict):
-                 final_kwargs[modality_key_in_call] = modality_kwargs_in_call.copy()
-
-         for modality_key in final_kwargs:
-             modality_dict = final_kwargs[modality_key]
-             if isinstance(modality_dict, dict) and self.tokenizer is not None:  # Check tokenizer exists
-                 for key_in_mod_dict in list(modality_dict.keys()):
-                     if key_in_mod_dict in tokenizer_init_kwargs:
-                         value = (
-                             getattr(self.tokenizer, key_in_mod_dict)
-                             if hasattr(self.tokenizer, key_in_mod_dict)
-                             else tokenizer_init_kwargs[key_in_mod_dict]
-                         )
-                         modality_dict[key_in_mod_dict] = value
+     def _merge_kwargs(self, ModelProcessorKwargs, tokenizer_init_kwargs, **kwargs):
+         default_kwargs = {}
+         for modality in ModelProcessorKwargs._defaults:
+             default_kwargs[modality] = ModelProcessorKwargs._defaults.get(modality, {}).copy()
+
+         # Update defaults with tokenizer init kwargs
+         for modality in default_kwargs:
+             modality_kwargs = default_kwargs[modality]
+             for key in modality_kwargs:
+                 if key in tokenizer_init_kwargs:
+                     value = (
+                         getattr(self.tokenizer, key)
+                         if hasattr(self.tokenizer, key)
+                         else tokenizer_init_kwargs[key]
+                     )
+                     modality_kwargs[key] = value
 
-         if "text_kwargs" not in final_kwargs:
-             final_kwargs["text_kwargs"] = {}
-         final_kwargs["text_kwargs"]["truncation"] = final_kwargs["text_kwargs"].get("truncation", False)
-         final_kwargs["text_kwargs"]["max_length"] = final_kwargs["text_kwargs"].get("max_length", DEFAULT_MAX_LENGTH)
+         # Update with user-provided kwargs
+         for modality in default_kwargs:
+             if modality in kwargs:
+                 default_kwargs[modality].update(kwargs[modality])
 
-         return final_kwargs
+         # Ensure text_kwargs has truncation=False and large max_length
+         default_kwargs["text_kwargs"]["truncation"] = False
+         default_kwargs["text_kwargs"]["max_length"] = default_kwargs["text_kwargs"].get("max_length",
+                                                                                         DEFAULT_MAX_LENGTH)
 
-     def _compute_audio_embed_size(self, audio_mel_frames: int) -> int:
-         # Using processor's parameters for calculating number of special tokens in text prompt
-         scaled_frames = audio_mel_frames * self.prompt_audio_feat_stride
-         result = math.ceil(scaled_frames / self.prompt_audio_compression_rate)
-         return math.ceil(result / self.prompt_audio_qformer_rate)
+         return default_kwargs
+
+     def _compute_audio_embed_size(self, audio_frames: int) -> int:
+         result = math.ceil(audio_frames / self.compression_rate)
+         return math.ceil(result / self.qformer_compression_rate)
 
      def __call__(
          self,
-         text: Union[str, List[str]] = None,
-         images: Optional[Any] = None,
-         audios: Optional[Union[AudioInput, List[AudioInput]]] = None,
-         sampling_rate: Optional[int] = None,
-         return_tensors: Optional[Union[str, TensorType]] = None,
-         **kwargs: Any
+         images=None,
+         text=None,
+         videos=None,
+         audio=None,
+         **kwargs: Unpack[Gemma3ProcessorKwargs]
      ) -> BatchFeature:
-         if text is None and images is None and audios is None:
-             raise ValueError("Provide at least one of `text`, `images`, or `audios`.")
+         if text is None and images is None:
+             raise ValueError("Provide at least one of `text` or `images`.")
 
-         final_rt = return_tensors
-         merged_call_kwargs = self._merge_kwargs(
-             Gemma3ProcessorKwargs,  # Use the defined Kwargs class
-             self.tokenizer.init_kwargs if hasattr(self.tokenizer, 'init_kwargs') else {},
+         output_kwargs = self._merge_kwargs(
+             Gemma3ProcessorKwargs,
+             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
              **kwargs
          )
 
-         if final_rt is None:
-             final_rt = merged_call_kwargs.get("text_kwargs", {}).pop("return_tensors", TensorType.PYTORCH)
-         else:
-             merged_call_kwargs.get("text_kwargs", {}).pop("return_tensors", None)
-
-         if text is None:
-             num_samples = 0
-             if images is not None:
-                 _images_list = images if isinstance(images, list) and (
-                     not images or not isinstance(images[0], (int, float))) else [images]
-                 num_samples = len(_images_list)
-             elif audios is not None:
-                 _audios_list = audios if isinstance(audios, list) else [audios]
-                 num_samples = len(_audios_list)
-             text = [""] * num_samples if num_samples > 0 else [""]
-
          if isinstance(text, str):
              text = [text]
-         if not (isinstance(text, list) and all(isinstance(t, str) for t in text)):
-             raise ValueError("Input `text` must be a string or a list of strings.")
+         elif not isinstance(text, list) or not all(isinstance(t, str) for t in text):
+             raise ValueError("Input text must be a string or list of strings")
 
-         image_features_dict = {}
-         # --- Image Processing (User's original structure, with safety for image_processor) ---
+         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", "pt")
+         image_inputs = {}
          if images is not None:
-             if self.image_processor is None:
-                 raise ValueError("Images were provided, but `self.image_processor` is not set.")
              batched_images = make_nested_list_of_images(images)
-             _img_proc_output = self.image_processor(batched_images, return_tensors=None,
-                                                     **merged_call_kwargs.get("images_kwargs", {}))
-             image_features_dict = _img_proc_output.data if isinstance(_img_proc_output,
-                                                                       BatchFeature) else _img_proc_output
-
-             if len(text) == 0 and len(batched_images) > 0:  # If text was initially None and images provided
-                 text = [" ".join([self.boi_token] * len(img_batch)) for img_batch in batched_images]
-             elif len(batched_images) != len(text):
-                 raise ValueError(f"Inconsistent batch sizes: {len(batched_images)} images, {len(text)} texts")
-
-             num_crops_popped = image_features_dict.pop("num_crops", None)
-             if num_crops_popped is not None:
-                 num_crops_all = to_py_obj(num_crops_popped)
-                 # ... (user's complex crop and text modification logic - kept as per original) ...
-                 # This part needs careful attention to ensure num_crops_all aligns with batched_images
-                 # For simplicity, the following is a conceptual placeholder of the user's original intent
-                 processed_text_for_images = []
-                 current_crop_idx_offset = 0
-                 for batch_idx, (prompt, current_imgs_in_batch) in enumerate(zip(text, batched_images)):
-                     crops_for_this_batch_sample = []
-                     if num_crops_all:  # Check if num_crops_all is not empty
-                         for _ in current_imgs_in_batch:
-                             if current_crop_idx_offset < len(num_crops_all):
-                                 crops_for_this_batch_sample.append(num_crops_all[current_crop_idx_offset])
-                                 current_crop_idx_offset += 1
-                             else:
-                                 crops_for_this_batch_sample.append(0)  # Should not happen
-
-                     image_indexes = [m.start() for m in re.finditer(re.escape(self.boi_token), prompt)]
-                     # ... (The rest of user's loop for image token replacement) ...
-                     # This was:
-                     # for num, idx in reversed(list(zip(crops_for_this_batch_sample, image_indexes))):
-                     #     if num > 0 : ...
-                     # text[batch_idx] = prompt
-                     # For minimal change, I'll assume this part is complex and specific.
-                     # A simplified version:
-                     prompt_with_full_seq = prompt.replace(self.boi_token, self.full_image_sequence,
-                                                           len(current_imgs_in_batch) if image_indexes else 0)
-                     processed_text_for_images.append(prompt_with_full_seq)
-                 text = processed_text_for_images
-             else:  # if no num_crops, simpler replacement
-                 text = [prompt.replace(self.boi_token, self.full_image_sequence) for prompt in text]
-
-         # --- Audio Processing ---
-         audio_features_dict = {}
-         if audios is not None:
-             if self.audio_processor is None:
-                 raise ValueError("Audios were provided, but `self.audio_processor` is not set.")
-
-             audio_call_kwargs = merged_call_kwargs.get("audio_kwargs", {})
-             if sampling_rate is not None:
-                 audio_call_kwargs["sampling_rate"] = sampling_rate
-
-             _audio_proc_output = self.audio_processor(audios=audios, return_tensors=None, **audio_call_kwargs)
-             audio_features_dict = _audio_proc_output.data
-             logger.debug(
-                 f"Gemma3OmniProcessor: Shape of 'audio_values' from Feature Extractor: {audio_features_dict['audio_values'].shape}")
-
-             new_text_with_audio = []
-             actual_mel_frames_per_sample = to_py_obj(audio_features_dict["audio_attention_mask"].sum(axis=-1))
-
-             if len(actual_mel_frames_per_sample) != len(text):  # Check batch consistency
+             image_inputs = self.image_processor(batched_images, **output_kwargs["images_kwargs"])
+
+             if not text:
+                 text = [" ".join([self.boi_token] * len(images)) for images in batched_images]
+
+             if len(batched_images) != len(text):
                  raise ValueError(
-                     f"Inconsistent batch sizes for audio and text: {len(actual_mel_frames_per_sample)} audio samples, {len(text)} texts.")
-
-             for i, prompt in enumerate(text):
-                 num_soft_tokens = self._compute_audio_embed_size(actual_mel_frames_per_sample[i])
-                 # User's original audio_tokens dictionary for constructing the sequence
-                 _audio_token_str = self.audio_token_str_from_user_code  # e.g. "<audio_soft_token>"
-                 _boa_token_str = getattr(self.tokenizer, "bos_token", " ")  # Using BOS or space as BOA
-                 _eoa_token_str = getattr(self.tokenizer, "eos_token", "<|endoftext|>")  # Using EOS as EOA
-
-                 audio_token_sequence_str = f"{_boa_token_str}{''.join([_audio_token_str] * num_soft_tokens)}{_eoa_token_str}"
-
-                 # User's replacement logic used boa_token as placeholder. This can be made more robust.
-                 # Using a dedicated placeholder is safer. For now, mimicking user's approach.
-                 # The user's code used `audio_tokens_map['boa_token']` (which was " ") as placeholder.
-                 placeholder_str = _boa_token_str
-                 if prompt.strip().startswith(
-                         placeholder_str.strip()) and placeholder_str.strip() != "":  # Avoid replacing all spaces
-                     prompt = prompt.replace(placeholder_str, audio_token_sequence_str, 1)  # Replace first
-                 elif self.audio_placeholder_token in prompt:  # Check for a more explicit placeholder
-                     prompt = prompt.replace(self.audio_placeholder_token, audio_token_sequence_str, 1)
-                 else:
-                     prompt += audio_token_sequence_str
-                 new_text_with_audio.append(prompt)
-             text = new_text_with_audio
-
-         # --- Text Tokenization ---
-         text_tokenizer_kwargs = merged_call_kwargs.get("text_kwargs", {})
-         text_features_dict = self.tokenizer(text=text, return_tensors=None, **text_tokenizer_kwargs)
-
-         # Debug log from user - ensure input_ids_list_of_lists is correctly formed
-         input_ids_list_of_lists = text_features_dict["input_ids"]
-         if not isinstance(input_ids_list_of_lists, list) or not (
-                 input_ids_list_of_lists and isinstance(input_ids_list_of_lists[0], list)):
-             if isinstance(input_ids_list_of_lists, (torch.Tensor, np.ndarray)):
-                 input_ids_list_of_lists = to_py_obj(input_ids_list_of_lists)
-             elif isinstance(input_ids_list_of_lists, list) and (
-                     not input_ids_list_of_lists or isinstance(input_ids_list_of_lists[0], int)):
-                 input_ids_list_of_lists = [input_ids_list_of_lists]
-
-         for i, (txt, ids) in enumerate(zip(text, input_ids_list_of_lists)):
-             if not isinstance(ids, list): ids = []
-             audio_text_count = txt.count(self.audio_token_str_from_user_code)
-             audio_ids_count = ids.count(self.audio_token_id)
+                     f"Inconsistent batch sizes: {len(batched_images)} images, {len(text)} texts"
+                 )
+
+             num_crops = to_py_obj(image_inputs.pop("num_crops"))
+             batch_num_crops = [[num_crops.pop(0) for _ in range(len(images))] for images in batched_images]
+
+             for batch_idx, (prompt, images, crops) in enumerate(zip(text, batched_images, batch_num_crops)):
+                 image_indexes = [m.start() for m in re.finditer(self.boi_token, prompt)]
+                 if len(images) != len(image_indexes):
+                     raise ValueError(
+                         f"Prompt has {len(image_indexes)} image tokens but received {len(images)} images"
+                     )
+
+                 for num, idx in reversed(list(zip(crops, image_indexes))):
+                     if num:
+                         formatted_image_text = (
+                             f"Here is the original image {self.boi_token} and here are some crops to help you see better "
+                             + " ".join([self.boi_token] * num)
+                         )
+                         prompt = prompt[:idx] + formatted_image_text + prompt[idx + len(self.boi_token):]
+                         text[batch_idx] = prompt
+
+             text = [prompt.replace(self.boi_token, self.full_image_sequence) for prompt in text]
+
+         audio_inputs = {}
+         if audio is not None:
+             audio_inputs = self.audio_processor(audio, return_tensors)
+             audio_embeds = audio_inputs['audio_values']
+             audio_frames = audio_embeds.shape[1] * self.feat_stride
+             audio_seq_length = self._compute_audio_embed_size(audio_frames)
+
+             audio_tokens = {
+                 "boa_token": "<start_of_audio>",
+                 "eoa_token": "<end_of_audio>",
+                 "audio_token": "<audio_soft_token>",
+                 "boa_token_id": 256001,
+                 "eoa_token_id": 256002,
+                 "audio_token_id": self.audio_token_id  # Use dynamic ID
+             }
+
+             audio_sequence = f"\n\n{audio_tokens['boa_token']}{''.join([audio_tokens['audio_token']] * audio_seq_length)}{audio_tokens['eoa_token']}\n\n"
+             text = [prompt.replace(audio_tokens['boa_token'], audio_sequence) for prompt in text]
+
+         text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"], return_tensors=return_tensors)
+
+         # Debug: Log text and token counts before validation
+         for i, (txt, ids) in enumerate(zip(text, text_inputs["input_ids"])):
+             audio_text_count = txt.count(self.audio_token)
+             audio_ids_count = list(ids).count(self.audio_token_id)
              logger.debug(
-                 f"Sample {i}: Audio tokens ('{self.audio_token_str_from_user_code}') in text count={audio_text_count}, "
-                 f"in input_ids (ID:{self.audio_token_id}) count={audio_ids_count}. "
-                 f"Text snippet='{txt[:100]}...', Input IDs length={len(ids)}"
+                 f"Sample {i}: Audio tokens in text={audio_text_count}, in input_ids={audio_ids_count}, "
+                 f"Text length={len(txt)}, Input IDs length={len(ids)}"
              )
 
-         # Token type IDs from user's code
-         # Convert to numpy for boolean indexing, then back to list.
-         # This assumes input_ids_list_of_lists is now correctly a list of lists of ints.
-         # To make it robust for padding, pad token_type_ids as well if input_ids are padded by tokenizer.
-         # For now, assuming tokenizer with return_tensors=None gives unpadded list of lists.
-         padded_input_ids_for_token_type, _ = self._pad_মাদের(input_ids_list_of_lists)  # Custom helper needed
-
-         mm_token_type_ids_np = np.zeros_like(padded_input_ids_for_token_type, dtype=int)
-         if self.image_token_id is not None:
-             mm_token_type_ids_np[padded_input_ids_for_token_type == self.image_token_id] = 1
-         if self.audio_token_id != -1:  # Check if audio_token_id is valid
-             mm_token_type_ids_np[padded_input_ids_for_token_type == self.audio_token_id] = 2
-         text_features_dict["token_type_ids"] = mm_token_type_ids_np.tolist()
-
-         # Ensure attention_mask from tokenizer is also included if padding was applied by tokenizer
-         # text_features_dict should already contain 'attention_mask' if padding=True for tokenizer
-
-         final_batch_data = {**text_features_dict}
-         if image_features_dict:
-             final_batch_data.update(image_features_dict)
-         if audio_features_dict:
-             final_batch_data.update(audio_features_dict)
-
-         return BatchFeature(data=final_batch_data, tensor_type=final_rt)
-
-     # Helper for padding list of lists, if tokenizer does not do it with return_tensors=None
-     def _pad_মাদের(self, list_of_lists: List[List[int]], padding_value: int = 0) -> Tuple[np.ndarray, np.ndarray]:
-         if not list_of_lists: return np.array([]), np.array([])
-         max_len = max(len(sublist) for sublist in list_of_lists)
-         padded_array = np.full((len(list_of_lists), max_len), padding_value, dtype=int)
-         attention_mask = np.zeros((len(list_of_lists), max_len), dtype=int)
-         for i, sublist in enumerate(list_of_lists):
-             padded_array[i, :len(sublist)] = sublist
-             attention_mask[i, :len(sublist)] = 1
-         return padded_array, attention_mask
+         array_ids = text_inputs["input_ids"]
+         if return_tensors == "pt":
+             mm_token_type_ids = torch.zeros_like(array_ids)
+         else:
+             mm_token_type_ids = np.zeros_like(array_ids)
+         mm_token_type_ids[array_ids == self.image_token_id] = 1  # Image token type
+         mm_token_type_ids[array_ids == self.audio_token_id] = 2  # Audio token type
+         text_inputs["token_type_ids"] = mm_token_type_ids
+
+         return BatchFeature(data={**text_inputs, **image_inputs, **audio_inputs}, tensor_type=return_tensors)
 
      def batch_decode(self, *args, **kwargs):
          return self.tokenizer.batch_decode(*args, **kwargs)
@@ -588,18 +347,7 @@ class Gemma3OmniProcessor(ProcessorMixin):
 
      @property
      def model_input_names(self):
-         # User's original logic, slightly more robust with hasattr checks
-         tokenizer_inputs = []
-         if hasattr(self, 'tokenizer') and self.tokenizer is not None:
-             tokenizer_inputs = self.tokenizer.model_input_names + ["token_type_ids"]
-
-         image_processor_inputs = []
-         if hasattr(self, 'image_processor') and self.image_processor is not None:
-             image_processor_inputs = self.image_processor.model_input_names
-
-         audio_processor_inputs = []
-         if hasattr(self, 'audio_processor') and self.audio_processor is not None:
-             audio_processor_inputs = getattr(self.audio_processor, "model_input_names",
-                                              ["audio_values", "audio_attention_mask"])
-
-         return list(dict.fromkeys(tokenizer_inputs + image_processor_inputs + audio_processor_inputs))
+         tokenizer_inputs = self.tokenizer.model_input_names + ["token_type_ids"]
+         image_processor_inputs = self.image_processor.model_input_names
+         audio_processor_inputs = self.audio_processor.model_input_names
+         return list(dict.fromkeys(tokenizer_inputs + image_processor_inputs + audio_processor_inputs))
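For readers who want to sanity-check the new audio path in isolation, the sketch below mirrors the strided framing, Hamming window, and triangular Mel filterbank logic that the updated Gemma3AudioFeatureExtractor applies, using NumPy only. It is an illustrative sketch, not code from this repository: the sampling rate, FFT size, window, hop, and Mel-bin count used here (16 kHz, 512, 400, 160, 40), the helper name mel_filterbank, and the 440 Hz test tone are all assumed values chosen for the example.

import math
import numpy as np

# Assumed illustrative parameters (not taken from this repository's config).
SR, N_FFT, WIN, HOP, N_MELS = 16000, 512, 400, 160, 40

def mel_filterbank(sampling_rate, n_fft, n_mels, fmin=0.0, fmax=None):
    # Same triangular-filter construction as create_mel_filterbank in the diff above.
    fmax = fmax or sampling_rate / 2
    hz_to_mel = lambda f: 1127.0 * math.log(1 + f / 700.0)
    mel_points = np.linspace(hz_to_mel(fmin), hz_to_mel(fmax), n_mels + 2)
    freq_points = 700.0 * (np.exp(mel_points / 1127.0) - 1)
    bins = np.floor((n_fft + 1) * freq_points / sampling_rate).astype(int)
    fb = np.zeros((n_mels, n_fft // 2 + 1), dtype=np.float32)
    for m in range(1, n_mels + 1):
        left, center, right = bins[m - 1:m + 2]
        fb[m - 1, left:center] = (np.arange(left, center) - left) / (center - left)
        fb[m - 1, center:right] = (right - np.arange(center, right)) / (right - center)
    return fb

# One second of a 440 Hz tone stands in for real speech input.
wav = np.sin(2 * np.pi * 440 * np.arange(SR) / SR).astype(np.float32)

# Strided framing plus Hamming window, as in _compute_log_mel_spectrogram.
frame_count = 1 + (len(wav) - WIN) // HOP
stride = wav.strides[0]
frames = np.lib.stride_tricks.as_strided(
    wav, shape=(frame_count, WIN), strides=(stride * HOP, stride), writeable=False
).copy()
frames *= np.hamming(WIN).astype(np.float32)

# Power spectrum -> Mel projection -> log, clipped at 1.0 as in the committed code.
power = np.abs(np.fft.rfft(frames, n=N_FFT)) ** 2
log_mel = np.log(np.clip(power @ mel_filterbank(SR, N_FFT, N_MELS).T, 1.0, None))
print(log_mel.shape)  # (98, 40) for these parameters

Because values are clipped at 1.0 before the log, as in the committed _compute_log_mel_spectrogram, near-silent frames map to 0.0 rather than to large negative values, which is worth keeping in mind when comparing this front end against other Mel implementations.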