BAAI /
Commit 5644dea committed by 3v324v23 · 1 Parent(s): d1ed198
Files changed (37)
  1. attention_temporal_videoae.py +0 -1314
  2. base_encoder.py +0 -68
  3. builder.py +0 -17
  4. config.json +4 -2
  5. configuration_qwen2.py +0 -169
  6. llava_arch.py +30 -105
  7. llava_qwen.py +673 -9
  8. modeling_qwen2.py +2 -0
  9. multimodal_encoder/.ipynb_checkpoints/base_encoder-checkpoint.py +0 -68
  10. multimodal_encoder/.ipynb_checkpoints/builder-checkpoint.py +0 -29
  11. multimodal_encoder/.ipynb_checkpoints/clip_encoder-checkpoint.py +0 -179
  12. multimodal_encoder/.ipynb_checkpoints/siglip_encoder-checkpoint.py +0 -151
  13. multimodal_encoder/__pycache__/base_encoder.cpython-310.pyc +0 -0
  14. multimodal_encoder/__pycache__/builder.cpython-310.pyc +0 -0
  15. multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc +0 -0
  16. multimodal_encoder/__pycache__/siglip_encoder.cpython-310.pyc +0 -0
  17. multimodal_encoder/base_encoder.py +0 -68
  18. multimodal_encoder/builder.py +0 -20
  19. multimodal_projector/__pycache__/builder.cpython-310.pyc +0 -0
  20. multimodal_projector/__pycache__/pooler_projector.cpython-310.pyc +0 -0
  21. multimodal_projector/pooler_projector.py +0 -33
  22. multimodal_resampler/__pycache__/builder.cpython-310.pyc +0 -0
  23. multimodal_resampler/__pycache__/masked_drop.cpython-310.pyc +0 -0
  24. multimodal_resampler/__pycache__/perceiver.cpython-310.pyc +0 -0
  25. multimodal_resampler/__pycache__/qformer.cpython-310.pyc +0 -0
  26. multimodal_resampler/__pycache__/spatial_pool.cpython-310.pyc +0 -0
  27. multimodal_resampler/builder.py +0 -34
  28. multimodal_resampler/masked_drop.py +0 -80
  29. multimodal_resampler/perceiver.py +0 -155
  30. multimodal_resampler/qformer.py +0 -1160
  31. sae.py +1434 -10
  32. sae_utils.py +0 -302
  33. siglip_encoder.py +0 -154
  34. utils_encoder.py +0 -296
  35. multimodal_projector/builder.py → vision_projector_builder.py +29 -1
  36. multimodal_resampler/spatial_pool.py → vision_resampler_builder.py +23 -0
  37. multimodal_encoder/siglip_encoder.py → vision_tower_builder.py +92 -17
attention_temporal_videoae.py DELETED
@@ -1,1314 +0,0 @@
1
- from inspect import isfunction
2
- import math
3
- import torch
4
- import torch as th
5
- import torch.nn.functional as F
6
- from torch import nn, einsum
7
- from einops import rearrange, repeat
8
- from typing import Optional, Any
9
-
10
- try:
11
- import xformers
12
- import xformers.ops
13
-
14
- XFORMERS_IS_AVAILBLE = True
15
- except:
16
- XFORMERS_IS_AVAILBLE = False
17
-
18
- from .utils_encoder import (
19
- conv_nd,
20
- zero_module,
21
- normalization,
22
- )
23
-
24
-
25
- def exists(val):
26
- return val is not None
27
-
28
-
29
- def uniq(arr):
30
- return {el: True for el in arr}.keys()
31
-
32
-
33
- def default(val, d):
34
- if exists(val):
35
- return val
36
- return d() if isfunction(d) else d
37
-
38
-
39
- def max_neg_value(t):
40
- return -torch.finfo(t.dtype).max
41
-
42
-
43
- def init_(tensor):
44
- dim = tensor.shape[-1]
45
- std = 1 / math.sqrt(dim)
46
- tensor.uniform_(-std, std)
47
- return tensor
48
-
49
-
50
- # feedforward
51
- class GEGLU(nn.Module):
52
- def __init__(self, dim_in, dim_out):
53
- super().__init__()
54
- self.proj = nn.Linear(dim_in, dim_out * 2)
55
-
56
- def forward(self, x):
57
- x, gate = self.proj(x).chunk(2, dim=-1)
58
- return x * F.gelu(gate)
59
-
60
-
61
- class FeedForward(nn.Module):
62
- def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
63
- super().__init__()
64
- inner_dim = int(dim * mult)
65
- dim_out = default(dim_out, dim)
66
- project_in = (
67
- nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU())
68
- if not glu
69
- else GEGLU(dim, inner_dim)
70
- )
71
-
72
- self.net = nn.Sequential(
73
- project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
74
- )
75
-
76
- def forward(self, x):
77
- return self.net(x)
78
-
79
-
80
- def zero_module(module):
81
- """
82
- Zero out the parameters of a module and return it.
83
- """
84
- for p in module.parameters():
85
- p.detach().zero_()
86
- return module
87
-
88
-
89
- def Normalize(in_channels, num_groups=32):
90
- return torch.nn.GroupNorm(
91
- num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True
92
- )
93
-
94
-
95
- # ---------------------------------------------------------------------------------------------------
96
- class RelativePosition(nn.Module):
97
- """https://github.com/evelinehong/Transformer_Relative_Position_PyTorch/blob/master/relative_position.py"""
98
-
99
- def __init__(self, num_units, max_relative_position):
100
- super().__init__()
101
- self.num_units = num_units
102
- self.max_relative_position = max_relative_position
103
- self.embeddings_table = nn.Parameter(
104
- th.Tensor(max_relative_position * 2 + 1, num_units)
105
- )
106
- nn.init.xavier_uniform_(self.embeddings_table)
107
-
108
- def forward(self, length_q, length_k):
109
- device = self.embeddings_table.device
110
- range_vec_q = th.arange(length_q, device=device)
111
- range_vec_k = th.arange(length_k, device=device)
112
- distance_mat = range_vec_k[None, :] - range_vec_q[:, None]
113
- distance_mat_clipped = th.clamp(
114
- distance_mat, -self.max_relative_position, self.max_relative_position
115
- )
116
- final_mat = distance_mat_clipped + self.max_relative_position
117
- # final_mat = th.LongTensor(final_mat).to(self.embeddings_table.device)
118
- # final_mat = th.tensor(final_mat, device=self.embeddings_table.device, dtype=torch.long)
119
- final_mat = final_mat.long()
120
- embeddings = self.embeddings_table[final_mat]
121
- return embeddings
122
-
123
-
124
- class TemporalCrossAttention(nn.Module):
125
- def __init__(
126
- self,
127
- query_dim,
128
- context_dim=None,
129
- heads=8,
130
- dim_head=64,
131
- dropout=0.0,
132
- temporal_length=None, # For relative positional representation and image-video joint training.
133
- image_length=None, # For image-video joint training.
134
- use_relative_position=False, # whether use relative positional representation in temporal attention.
135
- img_video_joint_train=False, # For image-video joint training.
136
- use_tempoal_causal_attn=False,
137
- bidirectional_causal_attn=False,
138
- tempoal_attn_type=None,
139
- joint_train_mode="same_batch",
140
- **kwargs,
141
- ):
142
- super().__init__()
143
- inner_dim = dim_head * heads
144
- context_dim = default(context_dim, query_dim)
145
- self.context_dim = context_dim
146
-
147
- self.scale = dim_head**-0.5
148
- self.heads = heads
149
- self.temporal_length = temporal_length
150
- self.use_relative_position = use_relative_position
151
- self.img_video_joint_train = img_video_joint_train
152
- self.bidirectional_causal_attn = bidirectional_causal_attn
153
- self.joint_train_mode = joint_train_mode
154
- assert joint_train_mode in ["same_batch", "diff_batch"]
155
- self.tempoal_attn_type = tempoal_attn_type
156
-
157
- if bidirectional_causal_attn:
158
- assert use_tempoal_causal_attn
159
- if tempoal_attn_type:
160
- assert tempoal_attn_type in ["sparse_causal", "sparse_causal_first"]
161
- assert not use_tempoal_causal_attn
162
- assert not (
163
- img_video_joint_train and (self.joint_train_mode == "same_batch")
164
- )
165
- self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
166
- self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
167
- self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
168
-
169
- assert not (
170
- img_video_joint_train
171
- and (self.joint_train_mode == "same_batch")
172
- and use_tempoal_causal_attn
173
- )
174
- if img_video_joint_train:
175
- if self.joint_train_mode == "same_batch":
176
- mask = torch.ones(
177
- [1, temporal_length + image_length, temporal_length + image_length]
178
- )
179
- # mask[:, image_length:, :] = 0
180
- # mask[:, :, image_length:] = 0
181
- mask[:, temporal_length:, :] = 0
182
- mask[:, :, temporal_length:] = 0
183
- self.mask = mask
184
- else:
185
- self.mask = None
186
- elif use_tempoal_causal_attn:
187
- # normal causal attn
188
- self.mask = torch.tril(torch.ones([1, temporal_length, temporal_length]))
189
- elif tempoal_attn_type == "sparse_causal":
190
- # all frames interact with only the `prev` & self frame
191
- mask1 = torch.tril(
192
- torch.ones([1, temporal_length, temporal_length])
193
- ).bool() # true indicates keeping
194
- mask2 = torch.zeros(
195
- [1, temporal_length, temporal_length]
196
- ) # initialize to same shape with mask1
197
- mask2[:, 2:temporal_length, : temporal_length - 2] = torch.tril(
198
- torch.ones([1, temporal_length - 2, temporal_length - 2])
199
- )
200
- mask2 = (1 - mask2).bool() # false indicates masking
201
- self.mask = mask1 & mask2
202
- elif tempoal_attn_type == "sparse_causal_first":
203
- # all frames interact with only the `first` & self frame
204
- mask1 = torch.tril(
205
- torch.ones([1, temporal_length, temporal_length])
206
- ).bool() # true indicates keeping
207
- mask2 = torch.zeros([1, temporal_length, temporal_length])
208
- mask2[:, 2:temporal_length, 1 : temporal_length - 1] = torch.tril(
209
- torch.ones([1, temporal_length - 2, temporal_length - 2])
210
- )
211
- mask2 = (1 - mask2).bool() # false indicates masking
212
- self.mask = mask1 & mask2
213
- else:
214
- self.mask = None
215
-
216
- if use_relative_position:
217
- assert temporal_length is not None
218
- self.relative_position_k = RelativePosition(
219
- num_units=dim_head, max_relative_position=temporal_length
220
- )
221
- self.relative_position_v = RelativePosition(
222
- num_units=dim_head, max_relative_position=temporal_length
223
- )
224
-
225
- self.to_out = nn.Sequential(
226
- nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
227
- )
228
-
229
- nn.init.constant_(self.to_q.weight, 0)
230
- nn.init.constant_(self.to_k.weight, 0)
231
- nn.init.constant_(self.to_v.weight, 0)
232
- nn.init.constant_(self.to_out[0].weight, 0)
233
- nn.init.constant_(self.to_out[0].bias, 0)
234
-
235
- def forward(self, x, context=None, mask=None):
236
- # if context is None:
237
- # print(f'[Temp Attn] x={x.shape},context=None')
238
- # else:
239
- # print(f'[Temp Attn] x={x.shape},context={context.shape}')
240
-
241
- nh = self.heads
242
- out = x
243
- q = self.to_q(out)
244
- # if context is not None:
245
- # print(f'temporal context 1 ={context.shape}')
246
- # print(f'x={x.shape}')
247
- context = default(context, x)
248
- # print(f'temporal context 2 ={context.shape}')
249
- k = self.to_k(context)
250
- v = self.to_v(context)
251
- # print(f'q ={q.shape},k={k.shape}')
252
-
253
- q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=nh), (q, k, v))
254
- sim = einsum("b i d, b j d -> b i j", q, k) * self.scale
255
-
256
- if self.use_relative_position:
257
- len_q, len_k, len_v = q.shape[1], k.shape[1], v.shape[1]
258
- k2 = self.relative_position_k(len_q, len_k)
259
- sim2 = einsum("b t d, t s d -> b t s", q, k2) * self.scale # TODO check
260
- sim += sim2
261
- # print('mask',mask)
262
- if exists(self.mask):
263
- if mask is None:
264
- mask = self.mask.to(sim.device)
265
- else:
266
- mask = self.mask.to(sim.device).bool() & mask # .to(sim.device)
267
- else:
268
- mask = mask
269
- # if self.img_video_joint_train:
270
- # # process mask (make mask same shape with sim)
271
- # c, h, w = mask.shape
272
- # c, t, s = sim.shape
273
- # # assert(h == w and t == s),f"mask={mask.shape}, sim={sim.shape}, h={h}, w={w}, t={t}, s={s}"
274
-
275
- # if h > t:
276
- # mask = mask[:, :t, :]
277
- # elif h < t: # pad zeros to mask (no attention) only initial mask =1 area compute weights
278
- # mask_ = torch.zeros([c,t,w]).to(mask.device)
279
- # mask_[:, :h, :] = mask
280
- # mask = mask_
281
- # c, h, w = mask.shape
282
- # if w > s:
283
- # mask = mask[:, :, :s]
284
- # elif w < s: # pad zeros to mask
285
- # mask_ = torch.zeros([c,h,s]).to(mask.device)
286
- # mask_[:, :, :w] = mask
287
- # mask = mask_
288
-
289
- # max_neg_value = -torch.finfo(sim.dtype).max
290
- # sim = sim.float().masked_fill(mask == 0, max_neg_value)
291
- if mask is not None:
292
- max_neg_value = -1e9
293
- sim = sim + (1 - mask.float()) * max_neg_value # 1=masking,0=no masking
294
- # print('sim after masking: ', sim)
295
-
296
- # if torch.isnan(sim).any() or torch.isinf(sim).any() or (not sim.any()):
297
- # print(f'sim [after masking], isnan={torch.isnan(sim).any()}, isinf={torch.isinf(sim).any()}, allzero={not sim.any()}')
298
-
299
- attn = sim.softmax(dim=-1)
300
- # print('attn after softmax: ', attn)
301
- # if torch.isnan(attn).any() or torch.isinf(attn).any() or (not attn.any()):
302
- # print(f'attn [after softmax], isnan={torch.isnan(attn).any()}, isinf={torch.isinf(attn).any()}, allzero={not attn.any()}')
303
-
304
- # attn = torch.where(torch.isnan(attn), torch.full_like(attn,0), attn)
305
- # if torch.isinf(attn.detach()).any():
306
- # import pdb;pdb.set_trace()
307
- # if torch.isnan(attn.detach()).any():
308
- # import pdb;pdb.set_trace()
309
- out = einsum("b i j, b j d -> b i d", attn, v)
310
-
311
- if self.bidirectional_causal_attn:
312
- mask_reverse = torch.triu(
313
- torch.ones(
314
- [1, self.temporal_length, self.temporal_length], device=sim.device
315
- )
316
- )
317
- sim_reverse = sim.float().masked_fill(mask_reverse == 0, max_neg_value)
318
- attn_reverse = sim_reverse.softmax(dim=-1)
319
- out_reverse = einsum("b i j, b j d -> b i d", attn_reverse, v)
320
- out += out_reverse
321
-
322
- if self.use_relative_position:
323
- v2 = self.relative_position_v(len_q, len_v)
324
- out2 = einsum("b t s, t s d -> b t d", attn, v2) # TODO check
325
- out += out2 # TODO check:先add还是先merge head?先计算rpr,on split head之后的数据,然后再merge。
326
- out = rearrange(out, "(b h) n d -> b n (h d)", h=nh) # merge head
327
- return self.to_out(out)
328
-
329
-
330
- # ---------------------------------------------------------------------------------------------------
331
-
332
-
333
- class SpatialSelfAttention(nn.Module):
334
- def __init__(self, in_channels):
335
- super().__init__()
336
- self.in_channels = in_channels
337
-
338
- self.norm = Normalize(in_channels)
339
- self.q = torch.nn.Conv2d(
340
- in_channels, in_channels, kernel_size=1, stride=1, padding=0
341
- )
342
- self.k = torch.nn.Conv2d(
343
- in_channels, in_channels, kernel_size=1, stride=1, padding=0
344
- )
345
- self.v = torch.nn.Conv2d(
346
- in_channels, in_channels, kernel_size=1, stride=1, padding=0
347
- )
348
- self.proj_out = torch.nn.Conv2d(
349
- in_channels, in_channels, kernel_size=1, stride=1, padding=0
350
- )
351
-
352
- def forward(self, x):
353
- h_ = x
354
- h_ = self.norm(h_)
355
- q = self.q(h_)
356
- k = self.k(h_)
357
- v = self.v(h_)
358
-
359
- # compute attention
360
- b, c, h, w = q.shape
361
- q = rearrange(q, "b c h w -> b (h w) c")
362
- k = rearrange(k, "b c h w -> b c (h w)")
363
- w_ = torch.einsum("bij,bjk->bik", q, k)
364
-
365
- w_ = w_ * (int(c) ** (-0.5))
366
- w_ = torch.nn.functional.softmax(w_, dim=2)
367
-
368
- # attend to values
369
- v = rearrange(v, "b c h w -> b c (h w)")
370
- w_ = rearrange(w_, "b i j -> b j i")
371
- h_ = torch.einsum("bij,bjk->bik", v, w_)
372
- h_ = rearrange(h_, "b c (h w) -> b c h w", h=h)
373
- h_ = self.proj_out(h_)
374
-
375
- return x + h_
376
-
377
-
378
- class CrossAttention(nn.Module):
379
- def __init__(
380
- self,
381
- query_dim,
382
- context_dim=None,
383
- heads=8,
384
- dim_head=64,
385
- dropout=0.0,
386
- sa_shared_kv=False,
387
- shared_type="only_first",
388
- **kwargs,
389
- ):
390
- super().__init__()
391
- inner_dim = dim_head * heads
392
- context_dim = default(context_dim, query_dim)
393
- self.sa_shared_kv = sa_shared_kv
394
- assert shared_type in [
395
- "only_first",
396
- "all_frames",
397
- "first_and_prev",
398
- "only_prev",
399
- "full",
400
- "causal",
401
- "full_qkv",
402
- ]
403
- self.shared_type = shared_type
404
-
405
- self.scale = dim_head**-0.5
406
- self.heads = heads
407
- self.dim_head = dim_head
408
-
409
- self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
410
- self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
411
- self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
412
-
413
- self.to_out = nn.Sequential(
414
- nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
415
- )
416
- self.attention_op: Optional[Any] = None
417
-
418
- def forward(self, x, context=None, mask=None):
419
- h = self.heads
420
- b = x.shape[0]
421
-
422
- q = self.to_q(x)
423
- context = default(context, x)
424
- k = self.to_k(context)
425
- v = self.to_v(context)
426
- if self.sa_shared_kv:
427
- if self.shared_type == "only_first":
428
- k, v = map(
429
- lambda xx: rearrange(xx[0].unsqueeze(0), "b n c -> (b n) c")
430
- .unsqueeze(0)
431
- .repeat(b, 1, 1),
432
- (k, v),
433
- )
434
- else:
435
- raise NotImplementedError
436
-
437
- q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=h), (q, k, v))
438
-
439
- sim = einsum("b i d, b j d -> b i j", q, k) * self.scale
440
-
441
- if exists(mask):
442
- mask = rearrange(mask, "b ... -> b (...)")
443
- max_neg_value = -torch.finfo(sim.dtype).max
444
- mask = repeat(mask, "b j -> (b h) () j", h=h)
445
- sim.masked_fill_(~mask, max_neg_value)
446
-
447
- # attention, what we cannot get enough of
448
- attn = sim.softmax(dim=-1)
449
-
450
- out = einsum("b i j, b j d -> b i d", attn, v)
451
- out = rearrange(out, "(b h) n d -> b n (h d)", h=h)
452
- return self.to_out(out)
453
-
454
- def efficient_forward(self, x, context=None, mask=None):
455
- q = self.to_q(x)
456
- context = default(context, x)
457
- k = self.to_k(context)
458
- v = self.to_v(context)
459
-
460
- b, _, _ = q.shape
461
- q, k, v = map(
462
- lambda t: t.unsqueeze(3)
463
- .reshape(b, t.shape[1], self.heads, self.dim_head)
464
- .permute(0, 2, 1, 3)
465
- .reshape(b * self.heads, t.shape[1], self.dim_head)
466
- .contiguous(),
467
- (q, k, v),
468
- )
469
- # actually compute the attention, what we cannot get enough of
470
- out = xformers.ops.memory_efficient_attention(
471
- q, k, v, attn_bias=None, op=self.attention_op
472
- )
473
-
474
- if exists(mask):
475
- raise NotImplementedError
476
- out = (
477
- out.unsqueeze(0)
478
- .reshape(b, self.heads, out.shape[1], self.dim_head)
479
- .permute(0, 2, 1, 3)
480
- .reshape(b, out.shape[1], self.heads * self.dim_head)
481
- )
482
- return self.to_out(out)
483
-
484
-
485
- class VideoSpatialCrossAttention(CrossAttention):
486
- def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0):
487
- super().__init__(query_dim, context_dim, heads, dim_head, dropout)
488
-
489
- def forward(self, x, context=None, mask=None):
490
- b, c, t, h, w = x.shape
491
- if context is not None:
492
- context = context.repeat(t, 1, 1)
493
- x = super().forward(spatial_attn_reshape(x), context=context) + x
494
- return spatial_attn_reshape_back(x, b, h)
495
-
496
-
497
- # class BasicTransformerBlockST(nn.Module):
498
- # def __init__(
499
- # self,
500
- # # Spatial Stuff
501
- # dim,
502
- # n_heads,
503
- # d_head,
504
- # dropout=0.0,
505
- # context_dim=None,
506
- # gated_ff=True,
507
- # checkpoint=True,
508
- # # Temporal Stuff
509
- # temporal_length=None,
510
- # image_length=None,
511
- # use_relative_position=True,
512
- # img_video_joint_train=False,
513
- # cross_attn_on_tempoal=False,
514
- # temporal_crossattn_type="selfattn",
515
- # order="stst",
516
- # temporalcrossfirst=False,
517
- # temporal_context_dim=None,
518
- # split_stcontext=False,
519
- # local_spatial_temporal_attn=False,
520
- # window_size=2,
521
- # random_t=False,
522
- # **kwargs,
523
- # ):
524
- # super().__init__()
525
- # # Self attention
526
- # self.attn1 = CrossAttention(
527
- # query_dim=dim,
528
- # heads=n_heads,
529
- # dim_head=d_head,
530
- # dropout=dropout,
531
- # **kwargs,
532
- # )
533
- # self.attn2 = CrossAttention(
534
- # query_dim=dim,
535
- # context_dim=context_dim,
536
- # heads=n_heads,
537
- # dim_head=d_head,
538
- # dropout=dropout,
539
- # **kwargs,
540
- # )
541
- # if XFORMERS_IS_AVAILBLE:
542
- # self.attn1.forward = self.attn1.efficient_forward
543
- # self.attn2.forward = self.attn2.efficient_forward
544
-
545
- # self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
546
- # # cross attention if context is not None
547
-
548
- # self.norm1 = nn.LayerNorm(dim)
549
- # self.norm2 = nn.LayerNorm(dim)
550
- # self.norm3 = nn.LayerNorm(dim)
551
- # self.checkpoint = checkpoint
552
- # self.order = order
553
- # assert self.order in ["stst", "sstt", "st_parallel"]
554
- # self.temporalcrossfirst = temporalcrossfirst
555
- # self.split_stcontext = split_stcontext
556
- # self.local_spatial_temporal_attn = local_spatial_temporal_attn
557
- # if self.local_spatial_temporal_attn:
558
- # assert self.order == "stst"
559
- # assert self.order == "stst"
560
- # self.window_size = window_size
561
- # if not split_stcontext:
562
- # temporal_context_dim = context_dim
563
- # # Temporal attention
564
- # assert temporal_crossattn_type in ["selfattn", "crossattn", "skip"]
565
- # self.temporal_crossattn_type = temporal_crossattn_type
566
- # self.attn1_tmp = TemporalCrossAttention(
567
- # query_dim=dim,
568
- # heads=n_heads,
569
- # dim_head=d_head,
570
- # dropout=dropout,
571
- # temporal_length=temporal_length,
572
- # image_length=image_length,
573
- # use_relative_position=use_relative_position,
574
- # img_video_joint_train=img_video_joint_train,
575
- # **kwargs,
576
- # )
577
- # self.attn2_tmp = TemporalCrossAttention(
578
- # query_dim=dim,
579
- # heads=n_heads,
580
- # dim_head=d_head,
581
- # dropout=dropout,
582
- # # cross attn
583
- # context_dim=(
584
- # temporal_context_dim if temporal_crossattn_type == "crossattn" else None
585
- # ),
586
- # # temporal attn
587
- # temporal_length=temporal_length,
588
- # image_length=image_length,
589
- # use_relative_position=use_relative_position,
590
- # img_video_joint_train=img_video_joint_train,
591
- # **kwargs,
592
- # )
593
- # self.norm4 = nn.LayerNorm(dim)
594
- # self.norm5 = nn.LayerNorm(dim)
595
- # self.random_t = random_t
596
- # # self.norm1_tmp = nn.LayerNorm(dim)
597
- # # self.norm2_tmp = nn.LayerNorm(dim)
598
-
599
- # ##############################################################################################################################################
600
- # def forward(
601
- # self,
602
- # x,
603
- # context=None,
604
- # temporal_context=None,
605
- # no_temporal_attn=None,
606
- # attn_mask=None,
607
- # **kwargs,
608
- # ):
609
- # # print(f'no_temporal_attn={no_temporal_attn}')
610
-
611
- # if not self.split_stcontext:
612
- # # st cross attention use the same context vector
613
- # temporal_context = context.detach().clone()
614
-
615
- # if context is None and temporal_context is None:
616
- # # self-attention models
617
- # if no_temporal_attn:
618
- # raise NotImplementedError
619
- # return checkpoint(
620
- # self._forward_nocontext, (x), self.parameters(), self.checkpoint
621
- # )
622
- # else:
623
- # # cross-attention models
624
- # if no_temporal_attn:
625
- # forward_func = self._forward_no_temporal_attn
626
- # else:
627
- # forward_func = self._forward
628
- # inputs = (
629
- # (x, context, temporal_context)
630
- # if temporal_context is not None
631
- # else (x, context)
632
- # )
633
- # return checkpoint(forward_func, inputs, self.parameters(), self.checkpoint)
634
- # # if attn_mask is not None:
635
- # # return checkpoint(self._forward, (x, context, temporal_context, attn_mask), self.parameters(), self.checkpoint)
636
- # # return checkpoint(self._forward, (x, context, temporal_context), self.parameters(), self.checkpoint)
637
-
638
- # def _forward(
639
- # self,
640
- # x,
641
- # context=None,
642
- # temporal_context=None,
643
- # mask=None,
644
- # no_temporal_attn=None,
645
- # ):
646
- # assert x.dim() == 5, f"x shape = {x.shape}"
647
- # b, c, t, h, w = x.shape
648
-
649
- # if self.order in ["stst", "sstt"]:
650
- # x = self._st_cross_attn(
651
- # x,
652
- # context,
653
- # temporal_context=temporal_context,
654
- # order=self.order,
655
- # mask=mask,
656
- # ) # no_temporal_attn=no_temporal_attn,
657
- # elif self.order == "st_parallel":
658
- # x = self._st_cross_attn_parallel(
659
- # x,
660
- # context,
661
- # temporal_context=temporal_context,
662
- # order=self.order,
663
- # ) # no_temporal_attn=no_temporal_attn,
664
- # else:
665
- # raise NotImplementedError
666
-
667
- # x = self.ff(self.norm3(x)) + x
668
- # if (no_temporal_attn is None) or (not no_temporal_attn):
669
- # x = rearrange(x, "(b h w) t c -> b c t h w", b=b, h=h, w=w) # 3d -> 5d
670
- # elif no_temporal_attn:
671
- # x = rearrange(x, "(b t) (h w) c -> b c t h w", b=b, h=h, w=w) # 3d -> 5d
672
- # return x
673
-
674
- # def _forward_no_temporal_attn(
675
- # self,
676
- # x,
677
- # context=None,
678
- # temporal_context=None,
679
- # ):
680
- # # temporary implementation :(
681
- # # because checkpoint does not support non-tensor inputs currently.
682
- # assert x.dim() == 5, f"x shape = {x.shape}"
683
- # b, c, t, h, w = x.shape
684
-
685
- # if self.order in ["stst", "sstt"]:
686
- # # x = self._st_cross_attn(x, context, temporal_context=temporal_context, order=self.order, no_temporal_attn=True,)
687
- # # mask = torch.zeros([1, t, t], device=x.device).bool() if context is None else torch.zeros([1, context.shape[1], t], device=x.device).bool()
688
- # mask = torch.zeros([1, t, t], device=x.device).bool()
689
- # x = self._st_cross_attn(
690
- # x,
691
- # context,
692
- # temporal_context=temporal_context,
693
- # order=self.order,
694
- # mask=mask,
695
- # )
696
- # elif self.order == "st_parallel":
697
- # x = self._st_cross_attn_parallel(
698
- # x,
699
- # context,
700
- # temporal_context=temporal_context,
701
- # order=self.order,
702
- # no_temporal_attn=True,
703
- # )
704
- # else:
705
- # raise NotImplementedError
706
-
707
- # x = self.ff(self.norm3(x)) + x
708
- # x = rearrange(x, "(b h w) t c -> b c t h w", b=b, h=h, w=w) # 3d -> 5d
709
- # # x = rearrange(x, '(b t) (h w) c -> b c t h w', b=b,h=h,w=w) # 3d -> 5d
710
- # return x
711
-
712
- # def _forward_nocontext(self, x, no_temporal_attn=None):
713
- # assert x.dim() == 5, f"x shape = {x.shape}"
714
- # b, c, t, h, w = x.shape
715
-
716
- # if self.order in ["stst", "sstt"]:
717
- # x = self._st_cross_attn(
718
- # x, order=self.order, no_temporal_attn=no_temporal_attn
719
- # )
720
- # elif self.order == "st_parallel":
721
- # x = self._st_cross_attn_parallel(
722
- # x, order=self.order, no_temporal_attn=no_temporal_attn
723
- # )
724
- # else:
725
- # raise NotImplementedError
726
-
727
- # x = self.ff(self.norm3(x)) + x
728
- # x = rearrange(x, "(b h w) t c -> b c t h w", b=b, h=h, w=w) # 3d -> 5d
729
-
730
- # return x
731
-
732
- # ##############################################################################################################################################
733
-
734
- # def _st_cross_attn(
735
- # self, x, context=None, temporal_context=None, order="stst", mask=None
736
- # ): # no_temporal_attn=None,
737
- # b, c, t, h, w = x.shape
738
- # # if context is not None:
739
- # # print(f'[_st_cross_attn input] x={x.shape}, context={context.shape}')
740
- # # else:
741
- # # print(f'[_st_cross_attn input] x={x.shape}')
742
-
743
- # if order == "stst":
744
- # # spatial self attention
745
- # x = rearrange(x, "b c t h w -> (b t) (h w) c")
746
- # # print(f'before attn1,x={x.shape}')
747
-
748
- # x = self.attn1(self.norm1(x)) + x
749
- # x = rearrange(x, "(b t) (h w) c -> b c t h w", b=b, h=h)
750
-
751
- # # temporal self attention
752
- # # if (no_temporal_attn is None) or (not no_temporal_attn):
753
- # if self.local_spatial_temporal_attn:
754
- # x = local_spatial_temporal_attn_reshape(x, window_size=self.window_size)
755
- # else:
756
- # x = rearrange(x, "b c t h w -> (b h w) t c")
757
- # x = self.attn1_tmp(self.norm4(x), mask=mask) + x
758
-
759
- # if self.local_spatial_temporal_attn:
760
- # x = local_spatial_temporal_attn_reshape_back(
761
- # x, window_size=self.window_size, b=b, h=h, w=w, t=t
762
- # )
763
- # else:
764
- # x = rearrange(x, "(b h w) t c -> b c t h w", b=b, h=h, w=w) # 3d -> 5d
765
-
766
- # # spatial cross attention
767
- # x = rearrange(x, "b c t h w -> (b t) (h w) c")
768
- # # print(f'before attn2, x={x.shape}')
769
- # # if context is not None:
770
- # # print(f'[before attn2] context={context.shape}')
771
- # if context is not None:
772
- # if self.random_t:
773
- # context_ = []
774
- # for i in range(context.shape[0]):
775
- # context_.append(context[i].unsqueeze(0).repeat(t, 1, 1))
776
- # context_ = torch.cat(context_, dim=0)
777
- # else:
778
- # if context.shape[0] == t: # img captions no_temporal_attn or
779
- # context_ = context
780
- # else:
781
- # # repeat conditions with t times
782
- # context_ = []
783
- # for i in range(context.shape[0]):
784
- # context_.append(context[i].unsqueeze(0).repeat(t, 1, 1))
785
- # context_ = torch.cat(context_, dim=0)
786
- # else:
787
- # context_ = None
788
-
789
- # # if context_ is not None:
790
- # # print(f'[before attn2] x={x.shape}, context_={context_.shape}')
791
- # # else:
792
- # # print(f'[before attn2] x={x.shape}')
793
-
794
- # x = self.attn2(self.norm2(x), context=context_) + x
795
-
796
- # # temporal cross attention
797
- # # if (no_temporal_attn is None) or (not no_temporal_attn):
798
- # x = rearrange(x, "(b t) (h w) c -> b c t h w", b=b, h=h)
799
- # x = rearrange(x, "b c t h w -> (b h w) t c")
800
- # if self.temporal_crossattn_type == "crossattn":
801
- # # tmporal cross attention
802
- # if temporal_context is not None:
803
- # # print(f'STATTN context={context.shape}, temporal_context={temporal_context.shape}')
804
- # temporal_context = torch.cat(
805
- # [context, temporal_context], dim=1
806
- # ) # blc
807
- # # print(f'STATTN after concat temporal_context={temporal_context.shape}')
808
- # temporal_context = temporal_context.repeat(h * w, 1, 1)
809
- # # print(f'after repeat temporal_context={temporal_context.shape}')
810
- # else:
811
- # temporal_context = context[0:1, ...].repeat(h * w, 1, 1)
812
- # # print(f'STATTN after concat x={x.shape}')
813
- # x = (
814
- # self.attn2_tmp(self.norm5(x), context=temporal_context, mask=mask)
815
- # + x
816
- # )
817
- # elif self.temporal_crossattn_type == "selfattn":
818
- # # temporal self attention
819
- # x = self.attn2_tmp(self.norm5(x), context=None, mask=mask) + x
820
- # elif self.temporal_crossattn_type == "skip":
821
- # # no temporal cross and self attention
822
- # pass
823
- # else:
824
- # raise NotImplementedError
825
-
826
- # elif order == "sstt":
827
- # # spatial self attention
828
- # x = rearrange(x, "b c t h w -> (b t) (h w) c")
829
- # x = self.attn1(self.norm1(x)) + x
830
-
831
- # # spatial cross attention
832
- # context_ = context.repeat(t, 1, 1) if context is not None else None
833
- # x = self.attn2(self.norm2(x), context=context_) + x
834
- # x = rearrange(x, "(b t) (h w) c -> b c t h w", b=b, h=h)
835
-
836
- # if (no_temporal_attn is None) or (not no_temporal_attn):
837
- # if self.temporalcrossfirst:
838
- # # temporal cross attention
839
- # if self.temporal_crossattn_type == "crossattn":
840
- # # if temporal_context is not None:
841
- # temporal_context = context.repeat(h * w, 1, 1)
842
- # x = (
843
- # self.attn2_tmp(
844
- # self.norm5(x), context=temporal_context, mask=mask
845
- # )
846
- # + x
847
- # )
848
- # elif self.temporal_crossattn_type == "selfattn":
849
- # x = self.attn2_tmp(self.norm5(x), context=None, mask=mask) + x
850
- # elif self.temporal_crossattn_type == "skip":
851
- # pass
852
- # else:
853
- # raise NotImplementedError
854
- # # temporal self attention
855
- # x = rearrange(x, "b c t h w -> (b h w) t c")
856
- # x = self.attn1_tmp(self.norm4(x), mask=mask) + x
857
- # else:
858
- # # temporal self attention
859
- # x = rearrange(x, "b c t h w -> (b h w) t c")
860
- # x = self.attn1_tmp(self.norm4(x), mask=mask) + x
861
- # # temporal cross attention
862
- # if self.temporal_crossattn_type == "crossattn":
863
- # if temporal_context is not None:
864
- # temporal_context = context.repeat(h * w, 1, 1)
865
- # x = (
866
- # self.attn2_tmp(
867
- # self.norm5(x), context=temporal_context, mask=mask
868
- # )
869
- # + x
870
- # )
871
- # elif self.temporal_crossattn_type == "selfattn":
872
- # x = self.attn2_tmp(self.norm5(x), context=None, mask=mask) + x
873
- # elif self.temporal_crossattn_type == "skip":
874
- # pass
875
- # else:
876
- # raise NotImplementedError
877
- # else:
878
- # raise NotImplementedError
879
-
880
- # return x
881
-
882
- # def _st_cross_attn_parallel(
883
- # self, x, context=None, temporal_context=None, order="sst", no_temporal_attn=None
884
- # ):
885
- # """order: x -> Self Attn -> Cross Attn -> attn_s
886
- # x -> Temp Self Attn -> attn_t
887
- # x' = x + attn_s + attn_t
888
- # """
889
- # if no_temporal_attn is not None:
890
- # raise NotImplementedError
891
-
892
- # B, C, T, H, W = x.shape
893
- # # spatial self attention
894
- # h = x
895
- # h = rearrange(h, "b c t h w -> (b t) (h w) c")
896
- # h = self.attn1(self.norm1(h)) + h
897
- # # spatial cross
898
- # # context_ = context.repeat(T, 1, 1) if context is not None else None
899
- # if context is not None:
900
- # context_ = []
901
- # for i in range(context.shape[0]):
902
- # context_.append(context[i].unsqueeze(0).repeat(T, 1, 1))
903
- # context_ = torch.cat(context_, dim=0)
904
- # else:
905
- # context_ = None
906
-
907
- # h = self.attn2(self.norm2(h), context=context_) + h
908
- # h = rearrange(h, "(b t) (h w) c -> b c t h w", b=B, h=H)
909
-
910
- # # temporal self
911
- # h2 = x
912
- # h2 = rearrange(h2, "b c t h w -> (b h w) t c")
913
- # h2 = self.attn1_tmp(self.norm4(h2)) # + h2
914
- # h2 = rearrange(h2, "(b h w) t c -> b c t h w", b=B, h=H, w=W)
915
- # out = h + h2
916
- # return rearrange(out, "b c t h w -> (b h w) t c")
917
-
918
- ##############################################################################################################################################
919
-
920
-
921
- def spatial_attn_reshape(x):
922
- return rearrange(x, "b c t h w -> (b t) (h w) c")
923
-
924
-
925
- def spatial_attn_reshape_back(x, b, h):
926
- return rearrange(x, "(b t) (h w) c -> b c t h w", b=b, h=h)
927
-
928
-
929
- def temporal_attn_reshape(x):
930
- return rearrange(x, "b c t h w -> (b h w) t c")
931
-
932
-
933
- def temporal_attn_reshape_back(x, b, h, w):
934
- return rearrange(x, "(b h w) t c -> b c t h w", b=b, h=h, w=w)
935
-
936
-
937
- def local_spatial_temporal_attn_reshape(x, window_size):
938
- B, C, T, H, W = x.shape
939
- NH = H // window_size
940
- NW = W // window_size
941
- # x = x.view(B, C, T, NH, window_size, NW, window_size)
942
- # tokens = x.permute(0, 1, 2, 3, 5, 4, 6).contiguous()
943
- # tokens = tokens.view(-1, window_size, window_size, C)
944
- x = rearrange(
945
- x,
946
- "b c t (nh wh) (nw ww) -> b c t nh wh nw ww",
947
- nh=NH,
948
- nw=NW,
949
- wh=window_size,
950
- ww=window_size,
951
- ).contiguous() # # B, C, T, NH, NW, window_size, window_size
952
- x = rearrange(
953
- x, "b c t nh wh nw ww -> (b nh nw) (t wh ww) c"
954
- ) # (B, NH, NW) (T, window_size, window_size) C
955
- return x
956
-
957
-
958
- def local_spatial_temporal_attn_reshape_back(x, window_size, b, h, w, t):
959
- B, L, C = x.shape
960
- NH = h // window_size
961
- NW = w // window_size
962
- x = rearrange(
963
- x,
964
- "(b nh nw) (t wh ww) c -> b c t nh wh nw ww",
965
- b=b,
966
- nh=NH,
967
- nw=NW,
968
- t=t,
969
- wh=window_size,
970
- ww=window_size,
971
- )
972
- x = rearrange(x, "b c t nh wh nw ww -> b c t (nh wh) (nw ww)")
973
- return x
974
-
975
-
976
- class SpatialTemporalTransformer(nn.Module):
977
- """
978
- Transformer block for video-like data (5D tensor).
979
- First, project the input (aka embedding) with NO reshape.
980
- Then apply standard transformer action.
981
- The 5D -> 3D reshape operation will be done in the specific attention module.
982
- """
983
-
984
- def __init__(
985
- self,
986
- in_channels,
987
- n_heads,
988
- d_head,
989
- depth=1,
990
- dropout=0.0,
991
- context_dim=None,
992
- # Temporal stuff
993
- temporal_length=None,
994
- image_length=None,
995
- use_relative_position=True,
996
- img_video_joint_train=False,
997
- cross_attn_on_tempoal=False,
998
- temporal_crossattn_type="selfattn",
999
- order="stst",
1000
- temporalcrossfirst=False,
1001
- split_stcontext=False,
1002
- temporal_context_dim=None,
1003
- **kwargs,
1004
- ):
1005
- super().__init__()
1006
-
1007
- self.in_channels = in_channels
1008
- inner_dim = n_heads * d_head
1009
-
1010
- self.norm = Normalize(in_channels)
1011
- self.proj_in = nn.Conv3d(
1012
- in_channels, inner_dim, kernel_size=1, stride=1, padding=0
1013
- )
1014
-
1015
- self.transformer_blocks = nn.ModuleList(
1016
- [
1017
- BasicTransformerBlockST(
1018
- inner_dim,
1019
- n_heads,
1020
- d_head,
1021
- dropout=dropout,
1022
- # cross attn
1023
- context_dim=context_dim,
1024
- # temporal attn
1025
- temporal_length=temporal_length,
1026
- image_length=image_length,
1027
- use_relative_position=use_relative_position,
1028
- img_video_joint_train=img_video_joint_train,
1029
- temporal_crossattn_type=temporal_crossattn_type,
1030
- order=order,
1031
- temporalcrossfirst=temporalcrossfirst,
1032
- split_stcontext=split_stcontext,
1033
- temporal_context_dim=temporal_context_dim,
1034
- **kwargs,
1035
- )
1036
- for d in range(depth)
1037
- ]
1038
- )
1039
-
1040
- self.proj_out = zero_module(
1041
- nn.Conv3d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
1042
- )
1043
-
1044
- def forward(self, x, context=None, temporal_context=None, **kwargs):
1045
- # note: if no context is given, cross-attention defaults to self-attention
1046
- assert x.dim() == 5, f"x shape = {x.shape}"
1047
- b, c, t, h, w = x.shape
1048
- x_in = x
1049
-
1050
- x = self.norm(x)
1051
- x = self.proj_in(x)
1052
-
1053
- for block in self.transformer_blocks:
1054
- x = block(x, context=context, temporal_context=temporal_context, **kwargs)
1055
-
1056
- x = self.proj_out(x)
1057
- return x + x_in
1058
-
1059
-
1060
- # ---------------------------------------------------------------------------------------------------
1061
-
1062
-
1063
- class STAttentionBlock2(nn.Module):
1064
- def __init__(
1065
- self,
1066
- channels,
1067
- num_heads=1,
1068
- num_head_channels=-1,
1069
- use_checkpoint=False, # not used, only used in ResBlock
1070
- use_new_attention_order=False, # QKVAttention or QKVAttentionLegacy
1071
- temporal_length=16, # used in relative positional representation.
1072
- image_length=8, # used for image-video joint training.
1073
- use_relative_position=False, # whether use relative positional representation in temporal attention.
1074
- img_video_joint_train=False,
1075
- # norm_type="groupnorm",
1076
- attn_norm_type="group",
1077
- use_tempoal_causal_attn=False,
1078
- ):
1079
- """
1080
- version 1: guided_diffusion implemented version
1081
- version 2: remove args input argument
1082
- """
1083
- super().__init__()
1084
-
1085
- if num_head_channels == -1:
1086
- self.num_heads = num_heads
1087
- else:
1088
- assert (
1089
- channels % num_head_channels == 0
1090
- ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
1091
- self.num_heads = channels // num_head_channels
1092
- self.use_checkpoint = use_checkpoint
1093
-
1094
- self.temporal_length = temporal_length
1095
- self.image_length = image_length
1096
- self.use_relative_position = use_relative_position
1097
- self.img_video_joint_train = img_video_joint_train
1098
- self.attn_norm_type = attn_norm_type
1099
- assert self.attn_norm_type in ["group", "no_norm"]
1100
- self.use_tempoal_causal_attn = use_tempoal_causal_attn
1101
-
1102
- if self.attn_norm_type == "group":
1103
- self.norm_s = normalization(channels)
1104
- self.norm_t = normalization(channels)
1105
-
1106
- self.qkv_s = conv_nd(1, channels, channels * 3, 1)
1107
- self.qkv_t = conv_nd(1, channels, channels * 3, 1)
1108
-
1109
- if self.img_video_joint_train:
1110
- mask = th.ones(
1111
- [1, temporal_length + image_length, temporal_length + image_length]
1112
- )
1113
- mask[:, temporal_length:, :] = 0
1114
- mask[:, :, temporal_length:] = 0
1115
- self.register_buffer("mask", mask)
1116
- else:
1117
- self.mask = None
1118
-
1119
- if use_new_attention_order:
1120
- # split qkv before split heads
1121
- self.attention_s = QKVAttention(self.num_heads)
1122
- self.attention_t = QKVAttention(self.num_heads)
1123
- else:
1124
- # split heads before split qkv
1125
- self.attention_s = QKVAttentionLegacy(self.num_heads)
1126
- self.attention_t = QKVAttentionLegacy(self.num_heads)
1127
-
1128
- if use_relative_position:
1129
- self.relative_position_k = RelativePosition(
1130
- num_units=channels // self.num_heads,
1131
- max_relative_position=temporal_length,
1132
- )
1133
- self.relative_position_v = RelativePosition(
1134
- num_units=channels // self.num_heads,
1135
- max_relative_position=temporal_length,
1136
- )
1137
-
1138
- self.proj_out_s = zero_module(
1139
- conv_nd(1, channels, channels, 1)
1140
- ) # conv_dim, in_channels, out_channels, kernel_size
1141
- self.proj_out_t = zero_module(
1142
- conv_nd(1, channels, channels, 1)
1143
- ) # conv_dim, in_channels, out_channels, kernel_size
1144
-
1145
- def forward(self, x, mask=None):
1146
- b, c, t, h, w = x.shape
1147
-
1148
- # spatial
1149
- out = rearrange(x, "b c t h w -> (b t) c (h w)")
1150
- if self.attn_norm_type == "no_norm":
1151
- qkv = self.qkv_s(out)
1152
- else:
1153
- qkv = self.qkv_s(self.norm_s(out))
1154
- out = self.attention_s(qkv)
1155
- out = self.proj_out_s(out)
1156
- out = rearrange(out, "(b t) c (h w) -> b c t h w", b=b, h=h)
1157
- x += out
1158
-
1159
- # temporal
1160
- out = rearrange(x, "b c t h w -> (b h w) c t")
1161
- if self.attn_norm_type == "no_norm":
1162
- qkv = self.qkv_t(out)
1163
- else:
1164
- qkv = self.qkv_t(self.norm_t(out))
1165
-
1166
- # relative positional embedding
1167
- if self.use_relative_position:
1168
- len_q = qkv.size()[-1]
1169
- len_k, len_v = len_q, len_q
1170
- k_rp = self.relative_position_k(len_q, len_k)
1171
- v_rp = self.relative_position_v(len_q, len_v) # [T,T,head_dim]
1172
- out = self.attention_t(
1173
- qkv,
1174
- rp=(k_rp, v_rp),
1175
- mask=self.mask,
1176
- use_tempoal_causal_attn=self.use_tempoal_causal_attn,
1177
- )
1178
- else:
1179
- out = self.attention_t(
1180
- qkv,
1181
- rp=None,
1182
- mask=self.mask,
1183
- use_tempoal_causal_attn=self.use_tempoal_causal_attn,
1184
- )
1185
-
1186
- out = self.proj_out_t(out)
1187
- out = rearrange(out, "(b h w) c t -> b c t h w", b=b, h=h, w=w)
1188
-
1189
- return x + out
1190
-
1191
-
1192
- # ---------------------------------------------------------------------------------------------------------------
1193
-
1194
-
1195
- class QKVAttentionLegacy(nn.Module):
1196
- """
1197
- A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping
1198
- """
1199
-
1200
- def __init__(self, n_heads):
1201
- super().__init__()
1202
- self.n_heads = n_heads
1203
-
1204
- def forward(self, qkv, rp=None, mask=None):
1205
- """
1206
- Apply QKV attention.
1207
-
1208
- :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
1209
- :return: an [N x (H * C) x T] tensor after attention.
1210
- """
1211
- if rp is not None or mask is not None:
1212
- raise NotImplementedError
1213
- bs, width, length = qkv.shape
1214
- assert width % (3 * self.n_heads) == 0
1215
- ch = width // (3 * self.n_heads)
1216
- q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
1217
- scale = 1 / math.sqrt(math.sqrt(ch))
1218
- weight = th.einsum(
1219
- "bct,bcs->bts", q * scale, k * scale
1220
- ) # More stable with f16 than dividing afterwards
1221
- weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
1222
- a = th.einsum("bts,bcs->bct", weight, v)
1223
- return a.reshape(bs, -1, length)
1224
-
1225
- @staticmethod
1226
- def count_flops(model, _x, y):
1227
- return count_flops_attn(model, _x, y)
1228
-
1229
-
1230
- # ---------------------------------------------------------------------------------------------------------------
1231
-
1232
-
1233
- class QKVAttention(nn.Module):
1234
- """
1235
- A module which performs QKV attention and splits in a different order.
1236
- """
1237
-
1238
- def __init__(self, n_heads):
1239
- super().__init__()
1240
- self.n_heads = n_heads
1241
-
1242
- def forward(self, qkv, rp=None, mask=None, use_tempoal_causal_attn=False):
1243
- """
1244
- Apply QKV attention.
1245
-
1246
- :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
1247
- :return: an [N x (H * C) x T] tensor after attention.
1248
- """
1249
- bs, width, length = qkv.shape
1250
- assert width % (3 * self.n_heads) == 0
1251
- ch = width // (3 * self.n_heads)
1252
- # print('qkv', qkv.size())
1253
- qkv=qkv.contiguous()
1254
- q, k, v = qkv.chunk(3, dim=1)
1255
- scale = 1 / math.sqrt(math.sqrt(ch))
1256
- # print('bs, self.n_heads, ch, length', bs, self.n_heads, ch, length)
1257
-
1258
- weight = th.einsum(
1259
- "bct,bcs->bts",
1260
- (q * scale).view(bs * self.n_heads, ch, length),
1261
- (k * scale).view(bs * self.n_heads, ch, length),
1262
- ) # More stable with f16 than dividing afterwards
1263
- # weight:[b,t,s] b=bs*n_heads*T
1264
-
1265
- if rp is not None:
1266
- k_rp, v_rp = rp # [length, length, head_dim] [8, 8, 48]
1267
- weight2 = th.einsum(
1268
- "bct,tsc->bst", (q * scale).view(bs * self.n_heads, ch, length), k_rp
1269
- )
1270
- weight += weight2
1271
-
1272
- if use_tempoal_causal_attn:
1273
- # weight = torch.tril(weight)
1274
- assert mask is None, f"Not implemented for merging two masks!"
1275
- mask = torch.tril(torch.ones(weight.shape))
1276
- else:
1277
- if mask is not None: # only keep upper-left matrix
1278
- # process mask
1279
- c, t, _ = weight.shape
1280
-
1281
- if mask.shape[-1] > t:
1282
- mask = mask[:, :t, :t]
1283
- elif mask.shape[-1] < t: # pad ones
1284
- mask_ = th.zeros([c, t, t]).to(mask.device)
1285
- t_ = mask.shape[-1]
1286
- mask_[:, :t_, :t_] = mask
1287
- mask = mask_
1288
- else:
1289
- assert (
1290
- weight.shape[-1] == mask.shape[-1]
1291
- ), f"weight={weight.shape}, mask={mask.shape}"
1292
-
1293
- if mask is not None:
1294
- INF = -1e8 # float('-inf')
1295
- weight = weight.float().masked_fill(mask == 0, INF)
1296
-
1297
- weight = F.softmax(weight.float(), dim=-1).type(
1298
- weight.dtype
1299
- ) # [256, 8, 8] [b, t, t] b=bs*n_heads*h*w,t=nframes
1300
- # weight = F.softmax(weight, dim=-1)#[256, 8, 8] [b, t, t] b=bs*n_heads*h*w,t=nframes
1301
- a = th.einsum(
1302
- "bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length)
1303
- ) # [256, 48, 8] [b, head_dim, t]
1304
-
1305
- if rp is not None:
1306
- a2 = th.einsum("bts,tsc->btc", weight, v_rp).transpose(1, 2) # btc->bct
1307
- a += a2
1308
-
1309
- return a.reshape(bs, -1, length)
1310
-
1311
-
1312
- # ---------------------------------------------------------------------------------------------------------------
1313
-
1314
- # ---------------------------------------------------------------------------------------------------------------
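A quick illustration of the relative-position indexing used by the RelativePosition class in the deleted file above: pairwise frame distances are clamped to [-max_relative_position, max_relative_position] and shifted to be non-negative so they can index the embedding table. Toy sizes; this snippet is an illustration, not part of the repository.

import torch

length_q, length_k, max_relative_position = 4, 4, 2
range_q = torch.arange(length_q)
range_k = torch.arange(length_k)
distance = range_k[None, :] - range_q[:, None]  # (length_q, length_k) pairwise offsets
# Clamp to [-max, max] and shift to [0, 2*max] so the result can index an embedding table.
index = torch.clamp(distance, -max_relative_position, max_relative_position) + max_relative_position
print(index)
# tensor([[2, 3, 4, 4],
#         [1, 2, 3, 4],
#         [0, 1, 2, 3],
#         [0, 0, 1, 2]])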
base_encoder.py DELETED
@@ -1,68 +0,0 @@
- from abc import ABC, abstractmethod
-
- import torch
- import torch.nn as nn
-
-
- class BaseVisionTower(nn.Module):
-     def __init__(self, vision_tower_name, vision_tower_cfg, delay_load=False):
-         super().__init__()
-
-         self.is_loaded = False
-
-         self.vision_tower_name = vision_tower_name
-         self.delay_load = delay_load
-
-     @abstractmethod
-     def load_model(self, device_map=None):
-         raise NotImplementedError("Subclasses must implement load_model")
-
-     @abstractmethod
-     def _forward(self, images):
-         raise NotImplementedError("Subclasses must implement forward")
-
-     def forward(self, images):
-         if type(images) is list:
-             image_features = [self._forward(image.unsqueeze(0)) for image in images]
-         else:
-             image_features = self._forward(images)
-
-         return image_features
-
-     @property
-     def dummy_feature(self):
-         return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
-
-     @property
-     def dtype(self):
-         # Dynamically infer the dtype from the first parameter, if not explicitly specified
-         if hasattr(self.vision_tower, "dtype"):
-             return self.vision_tower.dtype
-         else:
-             params = list(self.vision_tower.parameters())
-             return (
-                 params[0].dtype if len(params) > 0 else torch.float32
-             )  # Default to torch.float32 if no parameters
-
-     @property
-     def device(self):
-         # Dynamically infer the device from the first parameter, if not explicitly specified
-         if hasattr(self.vision_tower, "device"):
-             return self.vision_tower.device
-         else:
-             params = list(self.vision_tower.parameters())
-             return (
-                 params[0].device if len(params) > 0 else torch.device("cpu")
-             )  # Default to CPU if no parameters
-     @property
-     def config(self):
-         if self.is_loaded:
-             return self.vision_tower.config
-         else:
-             return self.cfg_only
-     @property
-     def hidden_size(self):
-         try:
-             return self.config.hidden_size
-         except:
-             return self._hidden_size
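BaseVisionTower leaves only load_model and _forward abstract; the dtype/device/config/hidden_size plumbing is inherited. A hypothetical minimal subclass, assuming the BaseVisionTower above is importable and using the SiglipVisionModel class from transformers (all names below are illustrative, not from this repository):

import torch
from transformers import SiglipVisionModel


class ToySigLipTower(BaseVisionTower):  # BaseVisionTower as defined in the deleted file above
    def load_model(self, device_map=None):
        # Populate self.vision_tower so the inherited dtype/device/config properties work.
        self.vision_tower = SiglipVisionModel.from_pretrained(
            self.vision_tower_name, device_map=device_map
        )
        self.vision_tower.requires_grad_(False)
        self.is_loaded = True

    def _forward(self, images):
        with torch.no_grad():
            return self.vision_tower(images).last_hidden_state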
builder.py DELETED
@@ -1,17 +0,0 @@
- import os
- from .siglip_encoder import SigLipVisionTower
-
-
- def build_vision_tower(vision_tower_cfg, **kwargs):
-
-     vision_tower = getattr(vision_tower_cfg, "mm_vision_tower", getattr(vision_tower_cfg, "vision_tower", None))
-     is_absolute_path_exists = os.path.exists(vision_tower)
-     use_s2 = getattr(vision_tower_cfg, "s2", False)
-
-     #print(getattr(vision_tower_cfg, "vision_tower", None))
-     return SigLipVisionTower(vision_tower, vision_tower_cfg=vision_tower_cfg, **kwargs)
-     if getattr(vision_tower_cfg, "vision_tower", None) and "siglip" in getattr(vision_tower_cfg, "vision_tower", None).lower():
-         #print('*************\n')
-         return SigLipVisionTower(vision_tower, vision_tower_cfg=vision_tower_cfg, **kwargs)
-
-     raise ValueError(f"Unknown vision tower: {vision_tower}")
config.json CHANGED
@@ -4,7 +4,7 @@
   ],
   "auto_map": {
     "AutoConfig": "llava_qwen.LlavaQwenConfig",
-    "AutoModel": "llava_qwen.LlavaQwenForCausalLM"
+    "AutoModelForCausalLM": "llava_qwen.LlavaQwenForCausalLM"
   },
   "attention_dropout": 0.0,
   "bos_token_id": 151643,
@@ -202,5 +202,7 @@
   "use_pos_skipping": false,
   "use_sliding_window": false,
   "vision_tower_pretrained": null,
-  "vocab_size": 152064
+  "vocab_size": 152064,
+  "enable_chunk_prefill": false,
+  "prefill_config": {}
   }
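With "AutoModelForCausalLM" now registered in auto_map, the custom LlavaQwenForCausalLM implementation can be resolved through the standard Auto classes when remote code is trusted. A minimal loading sketch; the repository id is a placeholder, since the repo name is not captured on this page:

from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "BAAI/<this-repository>"  # placeholder, not taken from this page
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)            # -> llava_qwen.LlavaQwenConfig
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)   # -> llava_qwen.LlavaQwenForCausalLM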
configuration_qwen2.py DELETED
@@ -1,169 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- """ Qwen2 model configuration"""
16
-
17
- from transformers.configuration_utils import PretrainedConfig
18
- from transformers.utils import logging
19
-
20
-
21
- logger = logging.get_logger(__name__)
22
-
23
- QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
24
- "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json",
25
- }
26
-
27
-
28
- class Qwen2Config(PretrainedConfig):
29
- r"""
30
- This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a
31
- Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
32
- with the defaults will yield a similar configuration to that of
33
- Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).
34
-
35
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
36
- documentation from [`PretrainedConfig`] for more information.
37
-
38
-
39
- Args:
40
- vocab_size (`int`, *optional*, defaults to 151936):
41
- Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the
42
- `inputs_ids` passed when calling [`Qwen2Model`]
43
- hidden_size (`int`, *optional*, defaults to 4096):
44
- Dimension of the hidden representations.
45
- intermediate_size (`int`, *optional*, defaults to 22016):
46
- Dimension of the MLP representations.
47
- num_hidden_layers (`int`, *optional*, defaults to 32):
48
- Number of hidden layers in the Transformer encoder.
49
- num_attention_heads (`int`, *optional*, defaults to 32):
50
- Number of attention heads for each attention layer in the Transformer encoder.
51
- num_key_value_heads (`int`, *optional*, defaults to 32):
52
- This is the number of key_value heads that should be used to implement Grouped Query Attention. If
53
- `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
54
- `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
55
- converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
56
- by meanpooling all the original heads within that group. For more details checkout [this
57
- paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
58
- hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
59
- The non-linear activation function (function or string) in the decoder.
60
- max_position_embeddings (`int`, *optional*, defaults to 32768):
61
- The maximum sequence length that this model might ever be used with.
62
- initializer_range (`float`, *optional*, defaults to 0.02):
63
- The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
64
- rms_norm_eps (`float`, *optional*, defaults to 1e-06):
65
- The epsilon used by the rms normalization layers.
66
- use_cache (`bool`, *optional*, defaults to `True`):
67
- Whether or not the model should return the last key/values attentions (not used by all models). Only
68
- relevant if `config.is_decoder=True`.
69
- tie_word_embeddings (`bool`, *optional*, defaults to `False`):
70
- Whether the model's input and output word embeddings should be tied.
71
- rope_theta (`float`, *optional*, defaults to 10000.0):
72
- The base period of the RoPE embeddings.
73
- use_sliding_window (`bool`, *optional*, defaults to `False`):
74
- Whether to use sliding window attention.
75
- sliding_window (`int`, *optional*, defaults to 4096):
76
- Sliding window attention (SWA) window size. If not specified, will default to `4096`.
77
- max_window_layers (`int`, *optional*, defaults to 28):
78
- The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
79
- attention_dropout (`float`, *optional*, defaults to 0.0):
80
- The dropout ratio for the attention probabilities.
81
-
82
- ```python
83
- >>> from transformers import Qwen2Model, Qwen2Config
84
-
85
- >>> # Initializing a Qwen2 style configuration
86
- >>> configuration = Qwen2Config()
87
-
88
- >>> # Initializing a model from the Qwen2-7B style configuration
89
- >>> model = Qwen2Model(configuration)
90
-
91
- >>> # Accessing the model configuration
92
- >>> configuration = model.config
93
- ```"""
94
-
95
- model_type = "qwen2"
96
- keys_to_ignore_at_inference = ["past_key_values"]
97
-
98
- def __init__(
99
- self,
100
- vocab_size=151936,
101
- hidden_size=4096,
102
- intermediate_size=22016,
103
- num_hidden_layers=32,
104
- num_attention_heads=32,
105
- num_key_value_heads=32,
106
- hidden_act="silu",
107
- max_position_embeddings=32768,
108
- initializer_range=0.02,
109
- rms_norm_eps=1e-6,
110
- use_cache=True,
111
- tie_word_embeddings=False,
112
- rope_theta=10000.0,
113
- use_sliding_window=False,
114
- sliding_window=4096,
115
- rope_scaling=None,
116
- max_window_layers=28,
117
- attention_dropout=0.0,
118
- beacon_window=1024,
119
- beacon_stride=1024,
120
- beacon_attn="full-coverage",
121
- beacon_ratio=[2,4,8,16,32],
122
- beacon_ratio_mix="step-random",
123
- beacon_param=[],
124
- beacon_embed_init="eos",
125
- beacon_sink_size=0,
126
- beacon_attend_prev=True,
127
- beacon_pos="interleave",
128
- beacon_parallel_window=1,
129
- **kwargs,
130
- ):
131
- self.vocab_size = vocab_size
132
- self.max_position_embeddings = max_position_embeddings
133
- self.hidden_size = hidden_size
134
- self.intermediate_size = intermediate_size
135
- self.num_hidden_layers = num_hidden_layers
136
- self.num_attention_heads = num_attention_heads
137
- self.use_sliding_window = use_sliding_window
138
- self.sliding_window = sliding_window
139
- self.max_window_layers = max_window_layers
140
- self.rope_scaling = rope_scaling
141
-
142
- # for backward compatibility
143
- if num_key_value_heads is None:
144
- num_key_value_heads = num_attention_heads
145
-
146
- self.num_key_value_heads = num_key_value_heads
147
- self.hidden_act = hidden_act
148
- self.initializer_range = initializer_range
149
- self.rms_norm_eps = rms_norm_eps
150
- self.use_cache = use_cache
151
- self.rope_theta = rope_theta
152
- self.attention_dropout = attention_dropout
153
-
154
- self.beacon_window = beacon_window
155
- self.beacon_stride = beacon_stride
156
- self.beacon_attn = beacon_attn
157
- self.beacon_ratio = beacon_ratio
158
- self.beacon_ratio_mix = beacon_ratio_mix
159
- self.beacon_param = beacon_param
160
- self.beacon_embed_init = beacon_embed_init
161
- self.beacon_sink_size = beacon_sink_size
162
- self.beacon_attend_prev = beacon_attend_prev
163
- self.beacon_pos = beacon_pos
164
- self.beacon_parallel_window = beacon_parallel_window
165
-
166
- super().__init__(
167
- tie_word_embeddings=tie_word_embeddings,
168
- **kwargs,
169
- )
llava_arch.py CHANGED
@@ -1,17 +1,3 @@
1
- # Copyright 2023 Haotian Liu
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
  from abc import ABC, abstractmethod
16
  import importlib.util
17
  import os.path as osp
@@ -26,25 +12,6 @@ import torch.nn.functional as F
26
  from .multimodal_encoder.builder import build_vision_tower
27
  from .multimodal_resampler.builder import build_vision_resampler
28
  from .multimodal_projector.builder import build_vision_projector
29
- # except ModuleNotFoundError:
30
- # spec = importlib.util.spec_from_file_location(
31
- # "builder",
32
- # osp.join(osp.dirname(__file__), "builder.py"),
33
- # )
34
- # builder = importlib.util.module_from_spec(spec)
35
- # spec.loader.exec_module(builder)
36
- # build_vision_tower = getattr(
37
- # builder,
38
- # "build_vision_tower",
39
- # )
40
- # build_vision_resampler = getattr(
41
- # builder,
42
- # "build_vision_resampler",
43
- # )
44
- # build_vision_projector = getattr(
45
- # builder,
46
- # "build_vision_projector",
47
- # )
48
 
49
  from transformers import AutoTokenizer
50
 
@@ -59,7 +26,6 @@ import torch.nn.functional as F
59
  import pdb
60
 
61
  class LlavaMetaModel:
62
-
63
  def __init__(self, config):
64
  super(LlavaMetaModel, self).__init__(config)
65
 
@@ -72,31 +38,13 @@ class LlavaMetaModel:
72
  if "unpad" in getattr(config, "mm_patch_merge_type", ""):
73
  self.image_newline = nn.Parameter(torch.empty(config.hidden_size, dtype=self.dtype))
74
 
75
- # self.llm_tokenizer = AutoTokenizer.from_pretrained(config._name_or_path)
76
  self.hidden_size=config.hidden_size
77
- # print(config)
78
- # exit(0)
79
-
80
- # self.text_tokenizer = T5Tokenizer.from_pretrained('google-t5/t5-small')
81
- ##############################################################################
82
- # self.text_select_model = T5EncoderModel.from_pretrained('google-t5/t5-small')
83
-
84
- # self.text_gamma=0.75
85
-
86
- ###############################################################################
87
  self.text_mlp=nn.Sequential(
88
  nn.Linear(config.hidden_size,config.hidden_size),
89
  nn.GELU(),
90
  )
91
  self.sae=SiglipAE()
92
- #self.sae.load_state_dict(torch.load('/share/LXRlxr0_0/code/videoxl2/videoxl2/longva/longva/model/encoder.pth'),strict=False)
93
-
94
- ###############################################################################
95
- # self.vision_select=nn.Parameter(
96
- # torch.randn((4, self.config.hidden_size), dtype=self.dtype)
97
- # )
98
- ##############################################################################
99
-
100
  def get_vision_tower(self):
101
  vision_tower = getattr(self, "vision_tower", None)
102
  if type(vision_tower) is list:
@@ -147,22 +95,6 @@ class LlavaMetaModel:
147
 
148
  self.sae=SiglipAE()
149
  self.sae.load_state_dict(torch.load('/share/LXRlxr0_0/code/videoxl2/videoxl2/longva/longva/model/encoder.pth'),strict=False)
150
- ##############################################################################
151
- # self.vision_select=nn.Parameter(
152
- # torch.randn((30, self.config.hidden_size), dtype=self.dtype)
153
- # )
154
-
155
- # #self.text_tokenizer = T5Tokenizer.from_pretrained('google-t5/t5-small')
156
- # self.text_select_model = T5EncoderModel.from_pretrained('google-t5/t5-small')
157
-
158
- # self.text_mlp=nn.Sequential(
159
- # nn.Linear(512,self.config.hidden_size),
160
- # nn.GELU(),
161
- # # nn.Linear(config.hidden_size,config.hidden_size),
162
- # # nn.GELU(),
163
- # )
164
- ##############################################################################
165
-
166
 
167
  if getattr(self, "mm_projector", None) is None:
168
  self.mm_projector = build_vision_projector(self.config, vision_cfg=vision_tower.config)
@@ -185,15 +117,7 @@ class LlavaMetaModel:
185
  rank0_print(f"Loaded mm projector weights from {pretrain_mm_mlp_adapter}. Incompatible keys: {incompatible_keys}")
186
  incompatible_keys = self.vision_resampler.load_state_dict(get_w(mm_projector_weights, "vision_resampler"), strict=False)
187
  rank0_print(f"Loaded vision resampler weights from {pretrain_mm_mlp_adapter}. Incompatible keys: {incompatible_keys}")
188
-
189
-
190
- # self.vision_select.data = mm_projector_weights["model.vision_select"]
191
-
192
- # self.text_mlp.load_state_dict(get_w(mm_projector_weights, "text_mlp"))
193
-
194
- # self.text_select_model.load_state_dict(get_w(mm_projector_weights, "text_select_model"),strict=False)
195
- #self.vision_tower.load_state_dict(get_w(mm_projector_weights, "vision_tower"),strict=False)
196
-
197
  def unpad_image(tensor, original_size):
198
  """
199
  Unpads a PyTorch tensor of a padded and resized image.
@@ -283,25 +207,30 @@ class LlavaMetaForCausalLM(ABC):
283
  return torch.repeat_interleave(image_features, repeats=4, dim=0)
284
 
285
  def add_video(self, video_features):
286
- if video_features.size(0)<4:
 
 
 
287
  last_feature = video_features[-1:]
288
-
289
- repeated_features = last_feature.repeat(4 - video_features.size(0), 1,1,1)
 
 
290
  expanded_x = torch.cat([video_features, repeated_features], dim=0)
291
  return expanded_x
292
-
293
- repeat_counts = torch.ones(video_features.size(0), dtype=torch.long, device=video_features.device)
294
 
295
- sum_counts=torch.sum(repeat_counts)
296
- if sum_counts % 4!=0:
297
- padding_size = 4 - (sum_counts % 4)
298
- random_indices = torch.randperm(repeat_counts.size(0))[:padding_size].to(video_features.device)
299
- repeat_counts[random_indices] += 1
300
-
301
- expanded_x = torch.repeat_interleave(video_features, repeat_counts, dim=0)
 
 
302
 
303
- return expanded_x
304
-
305
  def encode_multimodals(self, videos_or_images, video_idx_in_batch, split_sizes=None):
306
  if self.config.enable_chunk_prefill:
307
  chunk_size_for_vision_tower = self.config.prefill_config['chunk_size_for_vision_tower']
@@ -351,28 +280,27 @@ class LlavaMetaForCausalLM(ABC):
351
  torch.cuda.empty_cache()
352
 
353
  chunk_size = chunk_size_for_vision_tower
354
- print(f'chunk_size: {chunk_size}')
355
  all_feat_list = []
356
  for idx, feat in enumerate(per_videos_or_images_features):
357
  for i in range(0, feat.shape[0], chunk_size):
358
- batched_feat = feat[i:i+chunk_size]
359
- batched_feat=self.interpolate(batched_feat) # torch.Size([187, 1152, 24, 24])
360
  if idx in video_idx_in_batch:
361
- batched_feat = self.add_video(batched_feat) # torch.Size([188, 1152, 24, 24])
362
  else:
363
  batched_feat = self.add_image(batched_feat)
364
 
365
  bc,ch,h,w = batched_feat.shape
366
  batched_feat = batched_feat.view(bc//4,ch,4,h,w)
367
 
368
- batched_feat=self.get_model().sae(batched_feat).squeeze(2)
369
  batched_feat = batched_feat.permute(0, 2, 3, 1).contiguous().flatten(1, 2)
 
370
  batched_feat = self.get_model().mm_projector(batched_feat)
371
-
372
-
373
  batched_feat = self.get_2dPool(batched_feat)
374
  all_feat_list.append(batched_feat)
375
-
376
  feat = torch.cat(all_feat_list, dim=0)
377
  # peak_memory_allocated = torch.cuda.max_memory_allocated()
378
  # print(f"sae 显存峰值: {peak_memory_allocated / (1024**3):.2f} GB") # 转换为GB
@@ -380,10 +308,8 @@ class LlavaMetaForCausalLM(ABC):
380
  del per_videos_or_images_features
381
  del all_feat_list
382
  torch.cuda.empty_cache()
383
-
384
  all_videos_or_images_features.append(feat)
385
  return all_videos_or_images_features
386
-
387
 
388
  def interpolate(self,image_features):
389
  b, num_tokens, dim = image_features.shape
@@ -673,7 +599,7 @@ class LlavaMetaForCausalLM(ABC):
673
 
674
  # Truncate sequences to max length as image embeddings can make the sequence longer
675
  tokenizer_model_max_length = getattr(self.config, "tokenizer_model_max_length", None)
676
- # NOTE: qmh 注释
677
  # new_input_embeds = [x[:tokenizer_model_max_length] for x, modality in zip(new_input_embeds, modalities)]
678
  # new_labels = [x[:tokenizer_model_max_length] for x, modality in zip(new_labels, modalities)]
679
 
@@ -771,5 +697,4 @@ class LlavaMetaForCausalLM(ABC):
771
  for p in self.get_input_embeddings().parameters():
772
  p.requires_grad = False
773
  for p in self.get_output_embeddings().parameters():
774
- p.requires_grad = False
775
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from abc import ABC, abstractmethod
2
  import importlib.util
3
  import os.path as osp
 
12
  from .multimodal_encoder.builder import build_vision_tower
13
  from .multimodal_resampler.builder import build_vision_resampler
14
  from .multimodal_projector.builder import build_vision_projector
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  from transformers import AutoTokenizer
17
 
 
26
  import pdb
27
 
28
  class LlavaMetaModel:
 
29
  def __init__(self, config):
30
  super(LlavaMetaModel, self).__init__(config)
31
 
 
38
  if "unpad" in getattr(config, "mm_patch_merge_type", ""):
39
  self.image_newline = nn.Parameter(torch.empty(config.hidden_size, dtype=self.dtype))
40
 
 
41
  self.hidden_size=config.hidden_size
 
 
 
 
 
 
 
 
 
 
42
  self.text_mlp=nn.Sequential(
43
  nn.Linear(config.hidden_size,config.hidden_size),
44
  nn.GELU(),
45
  )
46
  self.sae=SiglipAE()
47
+
 
 
 
 
 
 
 
48
  def get_vision_tower(self):
49
  vision_tower = getattr(self, "vision_tower", None)
50
  if type(vision_tower) is list:
 
95
 
96
  self.sae=SiglipAE()
97
  self.sae.load_state_dict(torch.load('/share/LXRlxr0_0/code/videoxl2/videoxl2/longva/longva/model/encoder.pth'),strict=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
  if getattr(self, "mm_projector", None) is None:
100
  self.mm_projector = build_vision_projector(self.config, vision_cfg=vision_tower.config)
 
117
  rank0_print(f"Loaded mm projector weights from {pretrain_mm_mlp_adapter}. Incompatible keys: {incompatible_keys}")
118
  incompatible_keys = self.vision_resampler.load_state_dict(get_w(mm_projector_weights, "vision_resampler"), strict=False)
119
  rank0_print(f"Loaded vision resampler weights from {pretrain_mm_mlp_adapter}. Incompatible keys: {incompatible_keys}")
120
+
 
 
 
 
 
 
 
 
121
  def unpad_image(tensor, original_size):
122
  """
123
  Unpads a PyTorch tensor of a padded and resized image.
 
207
  return torch.repeat_interleave(image_features, repeats=4, dim=0)
208
 
209
  def add_video(self, video_features):
210
+ # Current batch size
211
+ current_batch_size = video_features.size(0)
212
+ # Handle cases where the batch size is less than 4
213
+ if current_batch_size < 4:
214
  last_feature = video_features[-1:]
215
+ # Calculate how many times the last feature needs to be repeated
216
+ num_repeats = 4 - current_batch_size
217
+ repeated_features = last_feature.repeat(num_repeats, 1, 1, 1)
218
+ # Concatenate original features with repeated last feature
219
  expanded_x = torch.cat([video_features, repeated_features], dim=0)
220
  return expanded_x
 
 
221
 
222
+ # Handle cases where the batch size is 4 or greater, but not a multiple of 4
223
+ if current_batch_size % 4 != 0:
224
+ last_feature = video_features[-1:]
225
+ # Calculate how many features are needed to reach the next multiple of 4
226
+ padding_size = 4 - (current_batch_size % 4)
227
+ repeated_features = last_feature.repeat(padding_size, 1, 1, 1)
228
+ # Concatenate original features with repeated last feature
229
+ expanded_x = torch.cat([video_features, repeated_features], dim=0)
230
+ return expanded_x
231
 
232
+ # If the batch size is already a multiple of 4, return as is
233
+ return video_features
234
  def encode_multimodals(self, videos_or_images, video_idx_in_batch, split_sizes=None):
235
  if self.config.enable_chunk_prefill:
236
  chunk_size_for_vision_tower = self.config.prefill_config['chunk_size_for_vision_tower']
 
280
  torch.cuda.empty_cache()
281
 
282
  chunk_size = chunk_size_for_vision_tower
283
+ # print(f'chunk_size: {chunk_size}')
284
  all_feat_list = []
285
  for idx, feat in enumerate(per_videos_or_images_features):
286
  for i in range(0, feat.shape[0], chunk_size):
287
+ batched_feat = feat[i:i+chunk_size] # chunk_size = 48, batched_feat.shape=[48, 729, 1152]
288
+ batched_feat=self.interpolate(batched_feat) # after interpolation: batched_feat.shape=[48, 1152, 24, 24]
289
  if idx in video_idx_in_batch:
290
+ batched_feat = self.add_video(batched_feat) # pad the first dimension up to a multiple of 4
291
  else:
292
  batched_feat = self.add_image(batched_feat)
293
 
294
  bc,ch,h,w = batched_feat.shape
295
  batched_feat = batched_feat.view(bc//4,ch,4,h,w)
296
 
297
+ batched_feat = self.get_model().sae(batched_feat).squeeze(2)
298
  batched_feat = batched_feat.permute(0, 2, 3, 1).contiguous().flatten(1, 2)
299
+
300
  batched_feat = self.get_model().mm_projector(batched_feat)
 
 
301
  batched_feat = self.get_2dPool(batched_feat)
302
  all_feat_list.append(batched_feat)
303
+
304
  feat = torch.cat(all_feat_list, dim=0)
305
  # peak_memory_allocated = torch.cuda.max_memory_allocated()
306
  # print(f"sae 显存峰值: {peak_memory_allocated / (1024**3):.2f} GB") # 转换为GB
 
308
  del per_videos_or_images_features
309
  del all_feat_list
310
  torch.cuda.empty_cache()
 
311
  all_videos_or_images_features.append(feat)
312
  return all_videos_or_images_features
 
313
 
314
  def interpolate(self,image_features):
315
  b, num_tokens, dim = image_features.shape
 
599
 
600
  # Truncate sequences to max length as image embeddings can make the sequence longer
601
  tokenizer_model_max_length = getattr(self.config, "tokenizer_model_max_length", None)
602
+ # NOTE: qmh
603
  # new_input_embeds = [x[:tokenizer_model_max_length] for x, modality in zip(new_input_embeds, modalities)]
604
  # new_labels = [x[:tokenizer_model_max_length] for x, modality in zip(new_labels, modalities)]
605
 
 
697
  for p in self.get_input_embeddings().parameters():
698
  p.requires_grad = False
699
  for p in self.get_output_embeddings().parameters():
700
+ p.requires_grad = False
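Note on the add_video rewrite shown above: the old version padded by repeat_interleave at randomly chosen indices, while the new version repeats the last frame until the frame count is a multiple of 4, which is the group size the SAE consumes. A standalone sketch of that padding rule (tensor shapes are illustrative, not tied to the repository):

import torch

def pad_frames_to_multiple_of_4(video_features: torch.Tensor) -> torch.Tensor:
    # (-n) % 4 covers both branches above: fewer than 4 frames pad up to 4,
    # otherwise pad up to the next multiple of 4; exact multiples pass through.
    pad = (-video_features.size(0)) % 4
    if pad == 0:
        return video_features
    repeated = video_features[-1:].repeat(pad, 1, 1, 1)
    return torch.cat([video_features, repeated], dim=0)

x = torch.randn(6, 1152, 24, 24)                  # 6 frames
assert pad_frames_to_multiple_of_4(x).shape[0] == 8

The new rule is deterministic, whereas the old repeat_interleave duplicated randomly chosen frames.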
 
llava_qwen.py CHANGED
@@ -11,8 +11,6 @@
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
14
-
15
-
16
  from typing import List, Optional, Tuple, Union, Dict
17
  import torch
18
  import torch.nn as nn
@@ -21,9 +19,9 @@ import transformers
21
  from transformers import AutoConfig, AutoModelForCausalLM, LlamaConfig, LlamaModel, LlamaForCausalLM
22
  from transformers.modeling_outputs import CausalLMOutputWithPast
23
  from transformers.generation.utils import GenerateOutput
24
- from .llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
25
- # from longva.longva.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
26
  from .modeling_qwen2 import Qwen2Config, Qwen2Model, Qwen2ForCausalLM
 
27
  import pdb
28
  import time
29
  import random
@@ -35,7 +33,671 @@ import PIL
35
  from decord import VideoReader, cpu
36
  from .conversation import conv_templates, SeparatorStyle
37
  from .constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IMAGE_TOKEN
38
- from .mm_utils import tokenizer_image_token, load_video
39
 
40
 
41
  class LlavaQwenConfig(Qwen2Config):
@@ -518,7 +1180,6 @@ class LlavaQwenForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
518
  )
519
 
520
  if inputs_embeds is None:
521
- pdb.set_trace()
522
  (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes, time_embedding)
523
 
524
  if self.config.enable_chunk_prefill:
@@ -600,8 +1261,6 @@ class LlavaQwenForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
600
  **kwargs,
601
  ) -> Union[GenerateOutput, torch.LongTensor]:
602
 
603
-
604
-
605
  position_ids = kwargs.pop("position_ids", None)
606
  attention_mask = kwargs.pop("attention_mask", None)
607
 
@@ -664,9 +1323,14 @@ class LlavaQwenForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
664
  prompt = conv.get_prompt()
665
  input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.model.device)
666
 
 
 
 
 
 
667
  # prepare video input
668
  frames, timestamps = load_video(video_path, max_num_frames, fps=sample_fps, max_fps=max_sample_fps)
669
- print(f'video has loaded, extratc {len(frames)} frames.')
670
 
671
  time_stamps=[]
672
  token_frames_sum=(len(timestamps)+3)//4
 
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
 
 
14
  from typing import List, Optional, Tuple, Union, Dict
15
  import torch
16
  import torch.nn as nn
 
19
  from transformers import AutoConfig, AutoModelForCausalLM, LlamaConfig, LlamaModel, LlamaForCausalLM
20
  from transformers.modeling_outputs import CausalLMOutputWithPast
21
  from transformers.generation.utils import GenerateOutput
22
+ # from .llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
 
23
  from .modeling_qwen2 import Qwen2Config, Qwen2Model, Qwen2ForCausalLM
24
+ # from transformers import Qwen2Config, Qwen2Model, Qwen2ForCausalLM
25
  import pdb
26
  import time
27
  import random
 
33
  from decord import VideoReader, cpu
34
  from .conversation import conv_templates, SeparatorStyle
35
  from .constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IMAGE_TOKEN
36
+ from .mm_utils import tokenizer_image_token, load_video, KeywordsStoppingCriteria, get_anyres_image_grid_shape
37
+ import math
38
+ import re
39
+ from .vision_tower_builder import build_vision_tower
40
+ from .vision_resampler_builder import build_vision_resampler
41
+ from .vision_projector_builder import build_vision_projector
42
+ from .utils import rank0_print
43
+ from .sae import SiglipAE
44
+ import numpy as np
45
+ import pdb
46
+ from abc import ABC, abstractmethod
47
+
48
+ class LlavaMetaModel:
49
+ def __init__(self, config):
50
+ super(LlavaMetaModel, self).__init__(config)
51
+
52
+ if hasattr(config, "mm_vision_tower"):
53
+ delay_load = getattr(config, "delay_load", False)
54
+ self.vision_tower = build_vision_tower(config, delay_load=delay_load)
55
+ self.vision_resampler = build_vision_resampler(config, vision_tower=self.vision_tower)
56
+ self.mm_projector = build_vision_projector(config, vision_cfg=self.vision_tower.config)
57
+
58
+ if "unpad" in getattr(config, "mm_patch_merge_type", ""):
59
+ self.image_newline = nn.Parameter(torch.empty(config.hidden_size, dtype=self.dtype))
60
+
61
+ self.hidden_size=config.hidden_size
62
+ self.text_mlp=nn.Sequential(
63
+ nn.Linear(config.hidden_size,config.hidden_size),
64
+ nn.GELU(),
65
+ )
66
+ self.sae=SiglipAE()
67
+
68
+ def get_vision_tower(self):
69
+ vision_tower = getattr(self, "vision_tower", None)
70
+ if type(vision_tower) is list:
71
+ vision_tower = vision_tower[0]
72
+ return vision_tower
73
+
74
+ def initialize_vision_modules(self, model_args, fsdp=None):
75
+ vision_tower = model_args.vision_tower
76
+ mm_vision_select_layer = model_args.mm_vision_select_layer
77
+ mm_vision_select_feature = model_args.mm_vision_select_feature
78
+ pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
79
+ mm_patch_merge_type = model_args.mm_patch_merge_type
80
+
81
+ self.config.mm_vision_tower = vision_tower
82
+ self.config.vision_tower_pretrained = getattr(model_args, "vision_tower_pretrained", "")
83
+
84
+ if self.get_vision_tower() is None:
85
+ vision_tower = build_vision_tower(model_args)
86
+ vision_resampler = build_vision_resampler(model_args, vision_tower=vision_tower)
87
+ for k, v in vision_resampler.config.items():
88
+ setattr(self.config, k, v)
89
+
90
+ if fsdp is not None and len(fsdp) > 0:
91
+ self.vision_tower = [vision_tower]
92
+ self.vision_resampler = [vision_resampler]
93
+ else:
94
+ self.vision_tower = vision_tower
95
+ self.vision_resampler = vision_resampler
96
+ else:
97
+ if fsdp is not None and len(fsdp) > 0:
98
+ vision_resampler = self.vision_resampler[0]
99
+ vision_tower = self.vision_tower[0]
100
+ else:
101
+ vision_resampler = self.vision_resampler
102
+ vision_tower = self.vision_tower
103
+ vision_tower.load_model()
104
+
105
+ # In case it is frozen by LoRA
106
+ for p in self.vision_resampler.parameters():
107
+ p.requires_grad = True
108
+
109
+ self.config.use_mm_proj = True
110
+ self.config.mm_projector_type = getattr(model_args, "mm_projector_type", "linear")
111
+ self.config.mm_hidden_size = getattr(vision_resampler, "hidden_size", vision_tower.hidden_size)
112
+ self.config.mm_vision_select_layer = mm_vision_select_layer
113
+ self.config.mm_vision_select_feature = mm_vision_select_feature
114
+ self.config.mm_patch_merge_type = mm_patch_merge_type
115
+
116
+ self.sae=SiglipAE()
117
+ self.sae.load_state_dict(torch.load('/share/LXRlxr0_0/code/videoxl2/videoxl2/longva/longva/model/encoder.pth'),strict=False)
118
+
119
+ if getattr(self, "mm_projector", None) is None:
120
+ self.mm_projector = build_vision_projector(self.config, vision_cfg=vision_tower.config)
121
+
122
+ if "unpad" in mm_patch_merge_type:
123
+ embed_std = 1 / torch.sqrt(torch.tensor(self.config.hidden_size, dtype=self.dtype))
124
+ self.image_newline = nn.Parameter(torch.randn(self.config.hidden_size, dtype=self.dtype) * embed_std)
125
+ else:
126
+ # In case it is frozen by LoRA
127
+ for p in self.mm_projector.parameters():
128
+ p.requires_grad = True
129
+
130
+ if pretrain_mm_mlp_adapter is not None:
131
+ mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location="cpu")
132
+
133
+ def get_w(weights, keyword):
134
+ return {k.split(keyword + ".")[1]: v for k, v in weights.items() if keyword in k}
135
+
136
+ incompatible_keys = self.mm_projector.load_state_dict(get_w(mm_projector_weights, "mm_projector"))
137
+ rank0_print(f"Loaded mm projector weights from {pretrain_mm_mlp_adapter}. Incompatible keys: {incompatible_keys}")
138
+ incompatible_keys = self.vision_resampler.load_state_dict(get_w(mm_projector_weights, "vision_resampler"), strict=False)
139
+ rank0_print(f"Loaded vision resampler weights from {pretrain_mm_mlp_adapter}. Incompatible keys: {incompatible_keys}")
140
+
141
+ def unpad_image(tensor, original_size):
142
+ """
143
+ Unpads a PyTorch tensor of a padded and resized image.
144
+
145
+ Args:
146
+ tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format.
147
+ original_size (tuple): The original size of the image (height, width).
148
+
149
+ Returns:
150
+ torch.Tensor: The unpadded image tensor.
151
+ """
152
+ original_width, original_height = original_size
153
+ current_height, current_width = tensor.shape[1:]
154
+
155
+ # Compute aspect ratios
156
+ original_aspect_ratio = original_width / original_height
157
+ current_aspect_ratio = current_width / current_height
158
+
159
+ # Determine padding size and direction
160
+ if original_aspect_ratio > current_aspect_ratio:
161
+ # Padding was added to the height
162
+ scale_factor = current_width / original_width
163
+ new_height = int(original_height * scale_factor)
164
+ padding = (current_height - new_height) // 2
165
+ unpadded_tensor = tensor[:, padding : current_height - padding, :]
166
+ else:
167
+ # Padding was added to the width
168
+ scale_factor = current_height / original_height
169
+ new_width = int(original_width * scale_factor)
170
+ padding = (current_width - new_width) // 2
171
+ unpadded_tensor = tensor[:, :, padding : current_width - padding]
172
+
173
+ return unpadded_tensor
174
+
175
+ class LlavaMetaForCausalLM(ABC):
176
+ @abstractmethod
177
+ def get_model(self):
178
+ pass
179
+
180
+ def get_vision_tower(self):
181
+ return self.get_model().get_vision_tower()
182
+
183
+ def get_2dPool(self, image_feature):
184
+ height = width = self.get_vision_tower().num_patches_per_side
185
+ num_frames, num_tokens, num_dim = image_feature.shape
186
+ image_feature = image_feature.view(num_frames, height, width, -1)
187
+ image_feature = image_feature.permute(0, 3, 1, 2).contiguous()
188
+ # image_feature = nn.functional.max_pool2d(image_feature, self.config.mm_spatial_pool_stride)
189
+ if self.config.mm_spatial_pool_mode == "average":
190
+ image_feature = nn.functional.avg_pool2d(image_feature, self.config.mm_spatial_pool_stride)
191
+ elif self.config.mm_spatial_pool_mode == "max":
192
+ image_feature = nn.functional.max_pool2d(image_feature, self.config.mm_spatial_pool_stride)
193
+ else:
194
+ raise ValueError(f"Unexpected mm_spatial_pool_mode: {self.config.mm_spatial_pool_mode}")
195
+ image_feature = image_feature.permute(0, 2, 3, 1)
196
+ image_feature = image_feature.view(num_frames, -1, num_dim)
197
+ return image_feature
198
+
199
+ def encode_images(self, images):
200
+ image_features = self.get_model().get_vision_tower()(images)
201
+ #image_features = self.get_model().vision_resampler(image_features, images=images)
202
+ image_features = self.get_model().mm_projector(image_features)
203
+ image_features = self.get_model().vision_resampler(image_features, images=images)
204
+ return image_features
205
+
206
+ def add_image(self, image_features):
207
+ return torch.repeat_interleave(image_features, repeats=4, dim=0)
208
+
209
+ def add_video(self, video_features):
210
+ # Current batch size
211
+ current_batch_size = video_features.size(0)
212
+ # Handle cases where the batch size is less than 4
213
+ if current_batch_size < 4:
214
+ last_feature = video_features[-1:]
215
+ # Calculate how many times the last feature needs to be repeated
216
+ num_repeats = 4 - current_batch_size
217
+ repeated_features = last_feature.repeat(num_repeats, 1, 1, 1)
218
+ # Concatenate original features with repeated last feature
219
+ expanded_x = torch.cat([video_features, repeated_features], dim=0)
220
+ return expanded_x
221
+
222
+ # Handle cases where the batch size is 4 or greater, but not a multiple of 4
223
+ if current_batch_size % 4 != 0:
224
+ last_feature = video_features[-1:]
225
+ # Calculate how many features are needed to reach the next multiple of 4
226
+ padding_size = 4 - (current_batch_size % 4)
227
+ repeated_features = last_feature.repeat(padding_size, 1, 1, 1)
228
+ # Concatenate original features with repeated last feature
229
+ expanded_x = torch.cat([video_features, repeated_features], dim=0)
230
+ return expanded_x
231
+
232
+ # If the batch size is already a multiple of 4, return as is
233
+ return video_features
234
+ def encode_multimodals(self, videos_or_images, video_idx_in_batch, split_sizes=None):
235
+ if self.config.enable_chunk_prefill:
236
+ chunk_size_for_vision_tower = self.config.prefill_config['chunk_size_for_vision_tower']
237
+ else:
238
+ chunk_size_for_vision_tower = 100000
239
+ # pdb.set_trace()
240
+ # Define the maximum batch size (1024 frames)
241
+ max_batch_size = chunk_size_for_vision_tower
242
+ # print(f'max_batch_size: {max_batch_size}')
243
+ num_frames = videos_or_images.shape[0]
244
+ # Initialize a list to store the features from each batch
245
+ videos_or_images_features = []
246
+
247
+ videos_or_images_features = torch.empty((num_frames, 729, 1152), device=self.get_model().device, dtype=self.get_model().dtype)
248
+
249
+ # Split videos_or_images into smaller batches if num_frames > max_batch_size
250
+ current_idx = 0
251
+ if num_frames > max_batch_size:
252
+ # Calculate the number of batches needed
253
+ num_batches = (num_frames + max_batch_size - 1) // max_batch_size
254
+ for i in range(num_batches):
255
+ start_idx = i * max_batch_size
256
+ end_idx = min((i + 1) * max_batch_size, num_frames)
257
+
258
+ # Process each batch separately
259
+ batch_videos_or_images = videos_or_images[start_idx:end_idx]
260
+ batch_features = self.get_model().get_vision_tower()(batch_videos_or_images)
261
+ # videos_or_images_features.append(batch_features)
262
+
263
+ videos_or_images_features[current_idx:current_idx + batch_features.shape[0]] = batch_features
264
+ # Update the current index for the next batch
265
+ current_idx += batch_features.shape[0]
266
+ # peak_memory_allocated = torch.cuda.max_memory_allocated()
267
+ # print(f"vision encoder 显存峰值: {peak_memory_allocated / (1024**3):.2f} GB") # 转换为GB
268
+
269
+ # Concatenate the features of all batches
270
+ # videos_or_images_features = torch.cat(videos_or_images_features, dim=0)
271
+ else:
272
+ videos_or_images_features = self.get_model().get_vision_tower()(videos_or_images)
273
+
274
+ per_videos_or_images_features = torch.split(videos_or_images_features, split_sizes, dim=0)
275
+ all_videos_or_images_features = []
276
+
277
+ # peak_memory_allocated = torch.cuda.max_memory_allocated()
278
+ # print(f"vision encoder 显存峰值: {peak_memory_allocated / (1024**3):.2f} GB") # 转换为GB
279
+ del videos_or_images_features
280
+ torch.cuda.empty_cache()
281
+
282
+ chunk_size = chunk_size_for_vision_tower
283
+ # print(f'chunk_size: {chunk_size}')
284
+ all_feat_list = []
285
+ for idx, feat in enumerate(per_videos_or_images_features):
286
+ for i in range(0, feat.shape[0], chunk_size):
287
+ batched_feat = feat[i:i+chunk_size] # chunk_size = 48, batched_feat.shape=[48, 729, 1152]
288
+ batched_feat=self.interpolate(batched_feat) # after interpolation: batched_feat.shape=[48, 1152, 24, 24]
289
+ if idx in video_idx_in_batch:
290
+ batched_feat = self.add_video(batched_feat) # pad the first dimension up to a multiple of 4
291
+ else:
292
+ batched_feat = self.add_image(batched_feat)
293
+
294
+ bc,ch,h,w = batched_feat.shape
295
+ batched_feat = batched_feat.view(bc//4,ch,4,h,w)
296
+
297
+ batched_feat = self.get_model().sae(batched_feat).squeeze(2)
298
+ batched_feat = batched_feat.permute(0, 2, 3, 1).contiguous().flatten(1, 2)
299
+
300
+ batched_feat = self.get_model().mm_projector(batched_feat)
301
+ batched_feat = self.get_2dPool(batched_feat)
302
+ all_feat_list.append(batched_feat)
303
+
304
+ feat = torch.cat(all_feat_list, dim=0)
305
+ # peak_memory_allocated = torch.cuda.max_memory_allocated()
306
+ # print(f"sae 显存峰值: {peak_memory_allocated / (1024**3):.2f} GB") # 转换为GB
307
+
308
+ del per_videos_or_images_features
309
+ del all_feat_list
310
+ torch.cuda.empty_cache()
311
+ all_videos_or_images_features.append(feat)
312
+ return all_videos_or_images_features
313
+
314
+ def interpolate(self,image_features):
315
+ b, num_tokens, dim = image_features.shape
316
+
317
+ #print(str(image_features.shape)+' i\n')
318
+
319
+ target_h = target_w = int(576**0.5)
320
+ h = w = int(num_tokens**0.5)
321
+
322
+ image_features = image_features.view(b, h, w, dim)
323
+ image_features = image_features.permute(0, 3, 1, 2).contiguous()
324
+
325
+ chunk_size = 24
326
+ chunks = torch.split(image_features, chunk_size, dim=0)
327
+ interpolated_chunks = []
328
+ for chunk in chunks:
329
+ interpolated_chunk = F.interpolate(
330
+ chunk.to(torch.float32),
331
+ size=(target_h, target_w),
332
+ mode="bilinear",
333
+ align_corners=False,
334
+ ).to(chunk.dtype)
335
+ interpolated_chunks.append(interpolated_chunk)
336
+ image_features = torch.cat(interpolated_chunks, dim=0)
337
+ del interpolated_chunks
338
+
339
+ del chunks
340
+
341
+ return image_features
342
+
343
+ def prepare_inputs_labels_for_multimodal(self, input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities=["image"], image_sizes=None,time_embedding=None):
344
+
345
+ vision_tower = self.get_vision_tower()
346
+ if vision_tower is None or images is None or input_ids.shape[1] == 1:
347
+ return input_ids, position_ids, attention_mask, past_key_values, None, labels
348
+
349
+ if type(images) is list or images.ndim == 5:
350
+ if type(images) is list:
351
+ images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images]
352
+
353
+ video_idx_in_batch = []
354
+ for _ in range(len(modalities)):
355
+ if modalities[_] == "video":
356
+ video_idx_in_batch.append(_)
357
+
358
+ images_list = []
359
+ for image in images:
360
+ if image.ndim == 4:
361
+ images_list.append(image)
362
+ else:
363
+ images_list.append(image.unsqueeze(0))
364
+ #print(len(images_list),images_list[0].shape)
365
+
366
+ concat_images = torch.cat([image for image in images_list], dim=0)
367
+ split_sizes = [image.shape[0] for image in images_list]
368
+
369
+ image_features = self.encode_multimodals(concat_images, video_idx_in_batch, split_sizes) #16,144,3584
370
+
371
+ mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat")
372
+ image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square")
373
+
374
+ visual_drop_score=[]
375
+ new_image_features=[]
376
+
377
+ if mm_patch_merge_type == "flat":
378
+
379
+ if image_features[0].ndim>2:
380
+ image_features = [x.flatten(0, 1) for x in image_features]
381
+ elif mm_patch_merge_type== "unires":
382
+ #print('unires')
383
+ for image_idx, image_feature in enumerate(image_features):
384
+ # rank0_print(f"Initial feature size : {image_feature.shape}")
385
+ if image_idx in video_idx_in_batch: # video operations
386
+ #print(image_feature.shape)
387
+ image_feature = image_feature.flatten(0, 1)
388
+
389
+ elif image_feature.shape[0] > 1:
390
+ # base image feature is never used in unires
391
+ base_image_feature = image_feature[0]
392
+ image_feature = image_feature[1:]
393
+
394
+ height = width = self.get_vision_tower().num_patches_per_side
395
+ assert height * width == base_image_feature.shape[0]
396
+
397
+ kernel_size = mm_patch_merge_type.split("avgpool")[-1].split("x")[-1]
398
+ kernel_size = 2
399
+ image_feature = image_feature.view(image_feature.shape[0], height, width, -1) # [4, 24, 24, 4096]
400
+ image_feature = image_feature.permute(0, 3, 1, 2).contiguous() # [4, 4096, 24, 24]
401
+ image_feature = nn.functional.avg_pool2d(image_feature,kernel_size) # [4, 4096, 12, 12]
402
+ image_feature = image_feature.flatten(2, 3) # [4, 4096, 144]
403
+ image_feature = image_feature.permute(0, 2, 1).contiguous() # [4, 144, 4096]
404
+
405
+ #print(image_feature.shape)
406
+ image_feature = image_feature.flatten(0, 1)
407
+
408
+ else:
409
+
410
+ image_feature = image_feature[0]
411
+
412
+ new_image_features.append(image_feature)
413
+
414
+ image_features = new_image_features
415
+
416
+ elif mm_patch_merge_type.startswith("spatial"):
417
+ new_image_features = []
418
+ for image_idx, image_feature in enumerate(image_features):
419
+ # FIXME: now assume the image is square, and split to 2x2 patches
420
+ # num_patches = h * w, where h = w = sqrt(num_patches)
421
+ # currently image_feature is a tensor of shape (4, num_patches, hidden_size)
422
+ # we want to first unflatten it to (2, 2, h, w, hidden_size)
423
+ if image_idx in video_idx_in_batch: # video operations
424
+ if "unpad" in mm_patch_merge_type:
425
+ # image_feature = image_feature.permute(2, 0, 1).contiguous()
426
+ # image_feature = torch.cat((image_feature, self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)), dim=-1)
427
+ # image_feature = image_feature.permute(1, 2, 0).contiguous()
428
+ image_feature = image_feature.flatten(0, 1)
429
+ image_feature = torch.cat((image_feature, self.model.image_newline[None].to(image_feature.device)), dim=0)
430
+
431
+ elif image_feature.shape[0] > 1: # multi patches and multi images operations
432
+ base_image_feature = image_feature[0]
433
+ image_feature = image_feature[1:]
434
+ height = width = self.get_vision_tower().num_patches_per_side
435
+ assert height * width == base_image_feature.shape[0]
436
+
437
+ if "anyres_max" in image_aspect_ratio:
438
+ matched_anyres_max_num_patches = re.match(r"anyres_max_(\d+)", image_aspect_ratio)
439
+ if matched_anyres_max_num_patches:
440
+ max_num_patches = int(matched_anyres_max_num_patches.group(1))
441
+
442
+ if image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio:
443
+ if hasattr(self.get_vision_tower(), "image_size"):
444
+ vision_tower_image_size = self.get_vision_tower().image_size
445
+ else:
446
+ raise ValueError("vision_tower_image_size is not found in the vision tower.")
447
+ num_patch_width, num_patch_height = get_anyres_image_grid_shape(image_sizes[image_idx], self.config.image_grid_pinpoints, vision_tower_image_size)
448
+ image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
449
+ else:
450
+ image_feature = image_feature.view(2, 2, height, width, -1)
451
+
452
+ if "maxpool2x2" in mm_patch_merge_type:
453
+ image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
454
+ image_feature = image_feature.flatten(1, 2).flatten(2, 3)
455
+ image_feature = nn.functional.max_pool2d(image_feature, 2)
456
+ image_feature = image_feature.flatten(1, 2).transpose(0, 1)
457
+ elif "unpad" in mm_patch_merge_type and "anyres_max" in image_aspect_ratio and matched_anyres_max_num_patches:
458
+ unit = image_feature.shape[2]
459
+ image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
460
+ image_feature = image_feature.flatten(1, 2).flatten(2, 3)
461
+ image_feature = unpad_image(image_feature, image_sizes[image_idx])
462
+ c, h, w = image_feature.shape
463
+ times = math.sqrt(h * w / (max_num_patches * unit**2))
464
+ if times > 1.1:
465
+ image_feature = image_feature[None]
466
+ image_feature = nn.functional.interpolate(image_feature, [int(h // times), int(w // times)], mode="bilinear")[0]
467
+ image_feature = torch.cat((image_feature, self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)), dim=-1)
468
+ image_feature = image_feature.flatten(1, 2).transpose(0, 1)
469
+ elif "unpad" in mm_patch_merge_type:
470
+ image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
471
+ image_feature = image_feature.flatten(1, 2).flatten(2, 3)
472
+ image_feature = unpad_image(image_feature, image_sizes[image_idx])
473
+ image_feature = torch.cat((image_feature, self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)), dim=-1)
474
+ image_feature = image_feature.flatten(1, 2).transpose(0, 1)
475
+ else:
476
+ image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
477
+ image_feature = image_feature.flatten(0, 3)
478
+ if "nobase" in mm_patch_merge_type:
479
+ pass
480
+ else:
481
+ image_feature = torch.cat((base_image_feature, image_feature), dim=0)
482
+ else: # single image operations
483
+ image_feature = image_feature[0]
484
+ if "unpad" in mm_patch_merge_type:
485
+ image_feature = torch.cat((image_feature, self.model.image_newline[None]), dim=0)
486
+
487
+ new_image_features.append(image_feature)
488
+ image_features = new_image_features
489
+ else:
490
+ raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}")
491
+ else:
492
+ error_message = """
493
+ Something is wrong with the input shape. Most likely, you did not wrap the image or video input in a list:
494
+ This is correct:
495
+ model.generate(input_ids, images=[video_tensor], modalities=["video"], **gen_kwargs)
496
+ model.generate(input_ids, images=[image_tensor], modalities=["image"], **gen_kwargs)
497
+ This is wrong:
498
+ model.generate(input_ids, images=video_tensor, modalities=["video"], **gen_kwargs)
499
+ model.generate(input_ids, images=image_tensor, modalities=["image"], **gen_kwargs)
500
+ """
501
+ raise ValueError(error_message)
502
+
503
+ #print(time_embedding[0].shape)
504
+ #video_token_indices=[]
505
+ for image_idx, image_feature in enumerate(image_features):
506
+ if time_embedding[image_idx] is not None:
507
+ mask = (time_embedding[image_idx] == 151654)
508
+ indices = torch.nonzero(mask).squeeze()
509
+
510
+ embed_token=self.get_model().embed_tokens(time_embedding[image_idx])
511
+ embed_token[indices]=image_features[image_idx]
512
+
513
+ #video_token_indices.append(indices)
514
+
515
+ image_features[image_idx]=embed_token
516
+
517
+ if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr(self.config, "mm_use_im_start_end", False):
518
+ raise NotImplementedError
519
+
520
+ # Let's just add dummy tensors if they do not exist,
521
+ # it is a headache to deal with None all the time.
522
+ # But it is not ideal, and if you have a better idea,
523
+ # please open an issue / submit a PR, thanks.
524
+ _labels = labels
525
+ _position_ids = position_ids
526
+ _attention_mask = attention_mask
527
+ if attention_mask is None:
528
+ attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
529
+ else:
530
+ attention_mask = attention_mask.bool()
531
+ if position_ids is None:
532
+ position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
533
+ if labels is None:
534
+ labels = torch.full_like(input_ids, IGNORE_INDEX)
535
+
536
+ # remove the padding using attention_mask -- FIXME
537
+ _input_ids = input_ids
538
+ input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
539
+ labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
540
+
541
+ new_input_embeds = []
542
+ new_labels = []
543
+ cur_image_idx = 0
544
+
545
+ for batch_idx, cur_input_ids in enumerate(input_ids):
546
+ num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
547
+ #print(num_images)
548
+ if num_images>=2:
549
+ print(num_images,input_ids)
550
+ if num_images == 0:
551
+ cur_image_features = image_features[cur_image_idx]
552
+ cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
553
+ cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
554
+ new_input_embeds.append(cur_input_embeds)
555
+ new_labels.append(labels[batch_idx])
556
+ cur_image_idx += 1
557
+ continue
558
+
559
+ image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
560
+ #print(image_token_indices) #[-1, 14, 236]
561
+ cur_input_ids_noim = []
562
+ cur_labels = labels[batch_idx]
563
+
564
+ # print(cur_input_ids)
565
+ # print(labels[batch_idx])
566
+
567
+ cur_labels_noim = []
568
+ for i in range(len(image_token_indices) - 1):
569
+ cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1 : image_token_indices[i + 1]])
570
+ cur_labels_noim.append(cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]])
571
+ split_sizes = [x.shape[0] for x in cur_labels_noim]
572
+
573
+ #print(torch.cat(cur_input_ids_noim).shape,torch.cat(cur_input_ids_noim))
574
+
575
+ cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
576
+ cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
577
+ cur_new_input_embeds = []
578
+ cur_new_labels = []
579
+
580
+ for i in range(num_images + 1):
581
+ cur_new_input_embeds.append(cur_input_embeds_no_im[i])
582
+ cur_new_labels.append(cur_labels_noim[i])
583
+ if i < num_images:
584
+ ##############
585
+ cur_image_features = image_features[cur_image_idx]
586
+ cur_image_idx += 1
587
+ cur_new_input_embeds.append(cur_image_features)
588
+ cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))
589
+
590
+ cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]
591
+
592
+ # import pdb; pdb.set_trace()
593
+ cur_new_input_embeds = torch.cat(cur_new_input_embeds)
594
+
595
+ cur_new_labels = torch.cat(cur_new_labels)
596
+
597
+ new_input_embeds.append(cur_new_input_embeds)
598
+ new_labels.append(cur_new_labels)
599
+
600
+ # Truncate sequences to max length as image embeddings can make the sequence longer
601
+ tokenizer_model_max_length = getattr(self.config, "tokenizer_model_max_length", None)
602
+ # NOTE: qmh
603
+ # new_input_embeds = [x[:tokenizer_model_max_length] for x, modality in zip(new_input_embeds, modalities)]
604
+ # new_labels = [x[:tokenizer_model_max_length] for x, modality in zip(new_labels, modalities)]
605
+
606
+ # TODO: Hard code for control loss spike
607
+ # if tokenizer_model_max_length is not None:
608
+ # new_input_embeds = [x[:4096] if modality != "video" else x[:tokenizer_model_max_length] for x, modality in zip(new_input_embeds, modalities)]
609
+ # new_labels = [x[:4096] if modality != "video" else x[:tokenizer_model_max_length] for x, modality in zip(new_labels, modalities)]
610
+
611
+ # Combine them
612
+ max_len = max(x.shape[0] for x in new_input_embeds)
613
+ batch_size = len(new_input_embeds)
614
+
615
+ new_input_embeds_padded = []
616
+ new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
617
+ attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
618
+ position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
619
+
620
+ for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
621
+ cur_len = cur_new_embed.shape[0]
622
+ if getattr(self.config, "tokenizer_padding_side", "right") == "left":
623
+ new_input_embeds_padded.append(torch.cat((torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device), cur_new_embed), dim=0))
624
+ if cur_len > 0:
625
+ new_labels_padded[i, -cur_len:] = cur_new_labels
626
+ attention_mask[i, -cur_len:] = True
627
+ position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
628
+ else:
629
+ new_input_embeds_padded.append(torch.cat((cur_new_embed, torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0))
630
+ if cur_len > 0:
631
+ new_labels_padded[i, :cur_len] = cur_new_labels
632
+ attention_mask[i, :cur_len] = True
633
+ position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
634
+
635
+ new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
636
+
637
+ if _labels is None:
638
+ new_labels = None
639
+ else:
640
+ new_labels = new_labels_padded
641
+
642
+ if _attention_mask is None:
643
+ attention_mask = None
644
+ else:
645
+ attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
646
+
647
+ if _position_ids is None:
648
+ position_ids = None
649
+ if getattr(self.config, "use_pos_skipping", False) and self.training:
650
+ position_ids = torch.arange(new_input_embeds.size(1), device=new_input_embeds.device).unsqueeze(0).to(new_input_embeds.device)
651
+ split_position = random.randint(0, new_input_embeds.size(1))
652
+ left_add = random.randint(0, self.config.pos_skipping_range)
653
+ right_add = random.randint(left_add, self.config.pos_skipping_range)
654
+ position_ids[:, :split_position] += left_add
655
+ position_ids[:, split_position:] += right_add
656
+ # import pdb; pdb.set_trace()
657
+ return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
658
+
659
+ def initialize_vision_tokenizer(self, model_args, tokenizer):
660
+ if model_args.mm_use_im_patch_token:
661
+ tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
662
+ self.resize_token_embeddings(len(tokenizer))
663
+
664
+ if model_args.mm_use_im_start_end:
665
+ num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
666
+ self.resize_token_embeddings(len(tokenizer))
667
+
668
+ if num_new_tokens > 0:
669
+ input_embeddings = self.get_input_embeddings().weight.data
670
+ output_embeddings = self.get_output_embeddings().weight.data
671
+
672
+ input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
673
+ output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
674
+
675
+ input_embeddings[-num_new_tokens:] = input_embeddings_avg
676
+ output_embeddings[-num_new_tokens:] = output_embeddings_avg
677
+
678
+ if model_args.tune_mm_mlp_adapter:
679
+ for p in self.get_input_embeddings().parameters():
680
+ p.requires_grad = True
681
+ for p in self.get_output_embeddings().parameters():
682
+ p.requires_grad = False
683
+
684
+ if model_args.pretrain_mm_mlp_adapter:
685
+ mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location="cpu")
686
+ embed_tokens_weight = mm_projector_weights["model.embed_tokens.weight"]
687
+ assert num_new_tokens == 2
688
+ if input_embeddings.shape == embed_tokens_weight.shape:
689
+ input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
690
+ elif embed_tokens_weight.shape[0] == num_new_tokens:
691
+ input_embeddings[-num_new_tokens:] = embed_tokens_weight
692
+ else:
693
+ raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")
694
+
695
+ elif model_args.mm_use_im_patch_token:
696
+ if model_args.tune_mm_mlp_adapter:
697
+ for p in self.get_input_embeddings().parameters():
698
+ p.requires_grad = False
699
+ for p in self.get_output_embeddings().parameters():
700
+ p.requires_grad = False
701
 
702
 
703
  class LlavaQwenConfig(Qwen2Config):
 
1180
  )
1181
 
1182
  if inputs_embeds is None:
 
1183
  (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes, time_embedding)
1184
 
1185
  if self.config.enable_chunk_prefill:
 
1261
  **kwargs,
1262
  ) -> Union[GenerateOutput, torch.LongTensor]:
1263
 
 
 
1264
  position_ids = kwargs.pop("position_ids", None)
1265
  attention_mask = kwargs.pop("attention_mask", None)
1266
 
 
1323
  prompt = conv.get_prompt()
1324
  input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.model.device)
1325
 
1326
+ stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
1327
+ keywords = [stop_str]
1328
+ stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
1329
+ generation_config["stopping_criteria"] = [stopping_criteria]
1330
+
1331
  # prepare video input
1332
  frames, timestamps = load_video(video_path, max_num_frames, fps=sample_fps, max_fps=max_sample_fps)
1333
+ print(f'video has loaded, extract {len(frames)} frames.')
1334
 
1335
  time_stamps=[]
1336
  token_frames_sum=(len(timestamps)+3)//4
modeling_qwen2.py CHANGED
@@ -503,10 +503,12 @@ class Qwen2FlashAttention2(Qwen2Attention):
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
+        key_position_ids: Optional[torch.LongTensor] = None,
         past_key_value: Optional[Cache] = None,
         output_attentions: bool = False,
         use_cache: bool = False,
         cache_position: Optional[torch.LongTensor] = None,
+        blocks_positions=None,
     ):
         bsz, q_len, _ = hidden_states.size()
 
multimodal_encoder/.ipynb_checkpoints/base_encoder-checkpoint.py DELETED
@@ -1,68 +0,0 @@
-from abc import ABC, abstractmethod
-
-import torch
-import torch.nn as nn
-
-
-class BaseVisionTower(nn.Module):
-    def __init__(self, vision_tower_name, vision_tower_cfg, delay_load=False):
-        super().__init__()
-
-        self.is_loaded = False
-
-        self.vision_tower_name = vision_tower_name
-        self.delay_load = delay_load
-
-    @abstractmethod
-    def load_model(self, device_map=None):
-        raise NotImplementedError("Subclasses must implement load_model")
-
-    @abstractmethod
-    def _forward(self, images):
-        raise NotImplementedError("Subclasses must implement forward")
-
-    def forward(self, images):
-        if type(images) is list:
-            image_features = [self._forward(image.unsqueeze(0)) for image in images]
-        else:
-            image_features = self._forward(images)
-
-        return image_features
-
-    @property
-    def dummy_feature(self):
-        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
-
-    @property
-    def dtype(self):
-        # Dynamically infer the dtype from the first parameter, if not explicitly specified
-        if hasattr(self.vision_tower, "dtype"):
-            return self.vision_tower.dtype
-        else:
-            params = list(self.vision_tower.parameters())
-            return (
-                params[0].dtype if len(params) > 0 else torch.float32
-            )  # Default to torch.float32 if no parameters
-
-    @property
-    def device(self):
-        # Dynamically infer the device from the first parameter, if not explicitly specified
-        if hasattr(self.vision_tower, "device"):
-            return self.vision_tower.device
-        else:
-            params = list(self.vision_tower.parameters())
-            return (
-                params[0].device if len(params) > 0 else torch.device("cpu")
-            )  # Default to CPU if no parameters
-    @property
-    def config(self):
-        if self.is_loaded:
-            return self.vision_tower.config
-        else:
-            return self.cfg_only
-    @property
-    def hidden_size(self):
-        try:
-            return self.config.hidden_size
-        except:
-            return self._hidden_size
multimodal_encoder/.ipynb_checkpoints/builder-checkpoint.py DELETED
@@ -1,29 +0,0 @@
-import os
-from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2
-from .siglip_encoder import SigLipVisionTower
-# from .eva_clip.eva_clip_encoder import EvaClipVisionTower
-# from .dev_eva_clip.eva_vit import EvaViTWrapper
-
-
-def build_vision_tower(vision_tower_cfg, **kwargs):
-
-    vision_tower = getattr(vision_tower_cfg, "mm_vision_tower", getattr(vision_tower_cfg, "vision_tower", None))
-    is_absolute_path_exists = os.path.exists(vision_tower)
-    use_s2 = getattr(vision_tower_cfg, "s2", False)
-
-    #print(getattr(vision_tower_cfg, "vision_tower", None))
-    return SigLipVisionTower(vision_tower, vision_tower_cfg=vision_tower_cfg, **kwargs)
-    if getattr(vision_tower_cfg, "vision_tower", None) and "siglip" in getattr(vision_tower_cfg, "vision_tower", None).lower():
-        #print('*************\n')
-        return SigLipVisionTower(vision_tower, vision_tower_cfg=vision_tower_cfg, **kwargs)
-    if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower:
-        if use_s2:
-            return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs)
-        else:
-            return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
-    # elif "internal-eva" in vision_tower.lower() or "eva02" in vision_tower.lower():
-    #     return EvaClipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
-    # elif vision_tower in ["EVA-CLIP-8B", "EVA-CLIP-8B-plus"]:
-    #     return EvaViTWrapper(vision_tower, args=vision_tower_cfg, **kwargs)
-
-    raise ValueError(f"Unknown vision tower: {vision_tower}")
multimodal_encoder/.ipynb_checkpoints/clip_encoder-checkpoint.py DELETED
@@ -1,179 +0,0 @@
-import torch
-import torch.nn as nn
-from longva.longva.utils import rank0_print
-from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
-
-try:
-    from s2wrapper import forward as multiscale_forward
-except:
-    pass
-
-
-class CLIPVisionTower(nn.Module):
-    def __init__(self, vision_tower, args, delay_load=False):
-        super().__init__()
-
-        self.is_loaded = False
-
-        self.vision_tower_name = vision_tower
-        self.select_layer = args.mm_vision_select_layer
-        self.select_feature = getattr(args, "mm_vision_select_feature", "patch")
-
-        if not delay_load:
-            rank0_print(f"Loading vision tower: {vision_tower}")
-            self.load_model()
-        elif getattr(args, "unfreeze_mm_vision_tower", False):
-            # TODO: better detector is needed.
-            rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.")
-            self.load_model()
-        elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts:
-            rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.")
-            self.load_model()
-        else:
-            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
-
-    def load_model(self, device_map=None):
-        if self.is_loaded:
-            rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name))
-            return
-
-        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
-        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
-        self.vision_tower.requires_grad_(False)
-
-        self.is_loaded = True
-
-    def feature_select(self, image_forward_outs):
-        select_feature_type = self.select_feature
-
-        if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]:
-            select_every_k_layer = len(image_forward_outs.hidden_states) // 4
-            image_features = torch.cat([image_forward_outs.hidden_states[i] for i in range(select_every_k_layer + self.select_layer, len(image_forward_outs.hidden_states), select_every_k_layer)], dim=-1)
-            select_feature_type = select_feature_type.replace("slicefour_", "")
-        elif self.select_feature in ["slice_m25811_f6_patch", "slice_m25811_f6_cls_patch"]:
-            select_layers = [-2, -5, -8, -11, 6]
-            image_features = torch.cat([image_forward_outs.hidden_states[i] for i in select_layers], dim=-1)
-            select_feature_type = select_feature_type.replace("slice_m25811_f6_", "")
-        else:
-            image_features = image_forward_outs.hidden_states[self.select_layer]
-
-        if select_feature_type == "patch":
-            image_features = image_features[:, 1:]
-        elif select_feature_type == "cls_patch":
-            image_features = image_features
-        else:
-            raise ValueError(f"Unexpected select feature: {select_feature_type}")
-        return image_features
-
-    def forward(self, images):
-        if type(images) is list:
-            image_features = []
-            for image in images:
-                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
-                #print('image_feature before select ',image_forward_out.hidden_states[-1].shape)
-                image_feature = self.feature_select(image_forward_out).to(image.dtype)
-                #print('image_feature after select ',image_feature.shape)
-                image_features.append(image_feature)
-        else:
-            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
-            #print('image_feature before select ',image_forward_outs.hidden_states[-1].shape)
-            image_features = self.feature_select(image_forward_outs).to(images.dtype)
-            #print('image_feature after select ',image_features.shape)
-
-        return image_features
-
-    @property
-    def dummy_feature(self):
-        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
-
-    @property
-    def dtype(self):
-        return self.vision_tower.dtype
-
-    @property
-    def device(self):
-        return self.vision_tower.device
-
-    @property
-    def config(self):
-        if self.is_loaded:
-            return self.vision_tower.config
-        else:
-            return self.cfg_only
-
-    @property
-    def hidden_size(self):
-        _hidden_size = self.config.hidden_size
-        if "slicefour" in self.select_feature:
-            _hidden_size *= 4
-        if "slice_m25811_f6" in self.select_feature:
-            _hidden_size *= 5
-        return _hidden_size
-
-    @property
-    def num_patches_per_side(self):
-        return self.config.image_size // self.config.patch_size
-
-    @property
-    def num_patches(self):
-        _num_patches = (self.config.image_size // self.config.patch_size) ** 2
-        if "cls_patch" in self.select_feature:
-            _num_patches += 1
-        return _num_patches
-
-    @property
-    def image_size(self):
-        return self.config.image_size
-
-
-class CLIPVisionTowerS2(CLIPVisionTower):
-    def __init__(self, vision_tower, args, delay_load=False):
-
-        self.s2_scales = getattr(args, "s2_scales", "336,672,1008")
-        self.s2_scales = list(map(int, self.s2_scales.split(",")))
-        self.s2_scales.sort()
-        self.s2_split_size = self.s2_scales[0]
-        self.s2_image_size = self.s2_scales[-1]
-
-        super().__init__(vision_tower, args, delay_load)
-
-        # change resize/crop size in preprocessing to the largest image size in s2_scale
-        if not delay_load or getattr(args, "unfreeze_mm_vision_tower", False):
-            self.image_processor.size["shortest_edge"] = self.s2_image_size
-            self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size
-
-    def load_model(self, device_map=None):
-        if self.is_loaded:
-            rank0_print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name))
-            return
-
-        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
-        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
-        self.vision_tower.requires_grad_(False)
-
-        self.image_processor.size["shortest_edge"] = self.s2_image_size
-        self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size
-
-        self.is_loaded = True
-
-    @torch.no_grad()
-    def forward_feature(self, images):
-        image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
-        image_features = self.feature_select(image_forward_outs).to(images.dtype)
-        return image_features
-
-    @torch.no_grad()
-    def forward(self, images):
-        if type(images) is list:
-            image_features = []
-            for image in images:
-                image_feature = multiscale_forward(self.forward_feature, image.unsqueeze(0), img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True)
-                image_features.append(image_feature)
-        else:
-            image_features = multiscale_forward(self.forward_feature, images, img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True)
-
-        return image_features
-
-    @property
-    def hidden_size(self):
-        return self.config.hidden_size * len(self.s2_scales)
multimodal_encoder/.ipynb_checkpoints/siglip_encoder-checkpoint.py DELETED
@@ -1,151 +0,0 @@
-import torch
-import torch.nn.functional as F
-from torch import nn
-from typing import Optional, Tuple, Union, Dict
-from PIL import Image
-from functools import partial, reduce
-from transformers import SiglipImageProcessor, SiglipVisionConfig, SiglipVisionModel
-
-from .base_encoder import BaseVisionTower
-import torch.distributed as dist
-# --data_path /share/shuyan/video_traindata/anno/\{cinepine_order\}.json \
-# --image_folder /share/shuyan/video_traindata/Bunny-v1_0-data/finetune/images \
-# --video_folder /share/shuyan/video_traindata \
-def rank0_print(*args):
-    if dist.is_initialized():
-        if dist.get_rank() == 0:
-            print(f"Rank {dist.get_rank()}: ", *args)
-    else:
-        print(*args)
-
-
-from transformers.image_processing_utils import BatchFeature, get_size_dict
-from transformers.image_transforms import (
-    convert_to_rgb,
-    normalize,
-    rescale,
-    resize,
-    to_channel_dimension_format,
-)
-from transformers.image_utils import (
-    ChannelDimension,
-    PILImageResampling,
-    to_numpy_array,
-)
-class SigLipImageProcessor:
-    def __init__(self, image_mean=(0.5, 0.5, 0.5), image_std=(0.5, 0.5, 0.5), size=(384, 384), crop_size: Dict[str, int] = None, resample=PILImageResampling.BICUBIC, rescale_factor=1 / 255, data_format=ChannelDimension.FIRST):
-        crop_size = crop_size if crop_size is not None else {"height": 384, "width": 384}
-        crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
-
-        self.image_mean = image_mean
-        self.image_std = image_std
-        self.size = size
-        self.resample = resample
-        self.rescale_factor = rescale_factor
-        self.data_format = data_format
-        self.crop_size = crop_size
-
-    def preprocess(self, images, return_tensors):
-        if isinstance(images, Image.Image):
-            images = [images]
-        else:
-            # to adapt video data
-            images = [to_numpy_array(image) for image in images]
-        assert isinstance(images, list)
-
-        transforms = [
-            convert_to_rgb,
-            to_numpy_array,
-            partial(resize, size=self.size, resample=self.resample, data_format=self.data_format),
-            partial(rescale, scale=self.rescale_factor, data_format=self.data_format),
-            partial(normalize, mean=self.image_mean, std=self.image_std, data_format=self.data_format),
-            partial(to_channel_dimension_format, channel_dim=self.data_format, input_channel_dim=self.data_format),
-        ]
-
-        images = reduce(lambda x, f: [*map(f, x)], transforms, images)
-
-        data = {"pixel_values": images}
-
-        return BatchFeature(data=data, tensor_type=return_tensors)
-
-class SigLipVisionTower(BaseVisionTower):
-    def __init__(self, vision_tower_name, vision_tower_cfg, delay_load=False):
-        super(SigLipVisionTower, self).__init__(vision_tower_name, vision_tower_cfg, delay_load)
-
-        model_path = "google/siglip-so400m-patch14-384"
-        base_model_name, res, interp = model_path, 384, 576
-        self.vision_tower_name = base_model_name
-        self._image_size = res if res is not None else 512
-        self.unfreeze_mm_vision_tower = getattr(vision_tower_cfg, "unfreeze_mm_vision_tower", False)
-
-        if not delay_load:
-            rank0_print(f"Loading vision tower: {vision_tower_name}")
-            self.load_model()
-        elif getattr(vision_tower_cfg, "unfreeze_mm_vision_tower", False):
-            # TODO: better detector is needed.
-            rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.")
-            self.load_model()
-        elif hasattr(vision_tower_cfg, "mm_tunable_parts") and "mm_vision_tower" in vision_tower_cfg.mm_tunable_parts:
-            rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.")
-            self.load_model()
-        else:
-            self.cfg_only = self.config
-
-    def load_model(self, device_map=None):
-        self.vision_model = "siglip"
-        # clip_model, processor = create_model_from_pretrained(self.vision_tower_name)
-        self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name)
-
-        # self.vision_tower = clip_model.visual.trunk
-        self.vision_tower.output_tokens = True
-
-        self._hidden_size = self.vision_tower.config.hidden_size
-
-        self.image_processor = SigLipImageProcessor()
-
-        del self.vision_tower.vision_model.encoder.layers[-1:]
-        self.vision_tower.vision_model.head = nn.Identity()
-
-        self.vision_tower.requires_grad_(self.unfreeze_mm_vision_tower)
-        self.is_loaded = True
-
-    def _forward(self, images):
-        with torch.set_grad_enabled(self.unfreeze_mm_vision_tower):
-            image_features = self.vision_tower.forward(
-                images.to(device=self.device, dtype=self.dtype),
-                output_hidden_states=True,
-            ).hidden_states[-1]
-            return image_features
-    @property
-    def dummy_feature(self):
-        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
-
-    @property
-    def dtype(self):
-        for p in self.vision_tower.parameters():
-            return p.dtype
-
-    @property
-    def device(self):
-        for p in self.vision_tower.parameters():
-            return p.device
-
-    @property
-    def hidden_size(self):
-        return self.config.hidden_size
-
-    @property
-    def num_patches(self):
-        return (336 // 14) ** 2
-
-    @property
-    def num_patches_per_side(self):
-        #return self.config.image_size // self.config.patch_size
-        return 336 // 14
-        #return 27
-        # return self.model_config["vision_cfg"]["image_size"] // self.model_config["vision_cfg"]["patch_size"]
-
-    @property
-    def image_size(self):
-        return 384
-        #return self.config.image_size
multimodal_encoder/__pycache__/base_encoder.cpython-310.pyc DELETED
Binary file (2.62 kB)
 
multimodal_encoder/__pycache__/builder.cpython-310.pyc DELETED
Binary file (697 Bytes)
 
multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc DELETED
Binary file (6.53 kB)
 
multimodal_encoder/__pycache__/siglip_encoder.cpython-310.pyc DELETED
Binary file (5.81 kB)
 
multimodal_encoder/base_encoder.py DELETED
@@ -1,68 +0,0 @@
-from abc import ABC, abstractmethod
-
-import torch
-import torch.nn as nn
-
-
-class BaseVisionTower(nn.Module):
-    def __init__(self, vision_tower_name, vision_tower_cfg, delay_load=False):
-        super().__init__()
-
-        self.is_loaded = False
-
-        self.vision_tower_name = vision_tower_name
-        self.delay_load = delay_load
-
-    @abstractmethod
-    def load_model(self, device_map=None):
-        raise NotImplementedError("Subclasses must implement load_model")
-
-    @abstractmethod
-    def _forward(self, images):
-        raise NotImplementedError("Subclasses must implement forward")
-
-    def forward(self, images):
-        if type(images) is list:
-            image_features = [self._forward(image.unsqueeze(0)) for image in images]
-        else:
-            image_features = self._forward(images)
-
-        return image_features
-
-    @property
-    def dummy_feature(self):
-        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
-
-    @property
-    def dtype(self):
-        # Dynamically infer the dtype from the first parameter, if not explicitly specified
-        if hasattr(self.vision_tower, "dtype"):
-            return self.vision_tower.dtype
-        else:
-            params = list(self.vision_tower.parameters())
-            return (
-                params[0].dtype if len(params) > 0 else torch.float32
-            )  # Default to torch.float32 if no parameters
-
-    @property
-    def device(self):
-        # Dynamically infer the device from the first parameter, if not explicitly specified
-        if hasattr(self.vision_tower, "device"):
-            return self.vision_tower.device
-        else:
-            params = list(self.vision_tower.parameters())
-            return (
-                params[0].device if len(params) > 0 else torch.device("cpu")
-            )  # Default to CPU if no parameters
-    @property
-    def config(self):
-        if self.is_loaded:
-            return self.vision_tower.config
-        else:
-            return self.cfg_only
-    @property
-    def hidden_size(self):
-        try:
-            return self.config.hidden_size
-        except:
-            return self._hidden_size
multimodal_encoder/builder.py DELETED
@@ -1,20 +0,0 @@
-import os
-from .siglip_encoder import SigLipVisionTower
-# from .eva_clip.eva_clip_encoder import EvaClipVisionTower
-# from .dev_eva_clip.eva_vit import EvaViTWrapper
-
-
-def build_vision_tower(vision_tower_cfg, **kwargs):
-
-    vision_tower = getattr(vision_tower_cfg, "mm_vision_tower", getattr(vision_tower_cfg, "vision_tower", None))
-    is_absolute_path_exists = os.path.exists(vision_tower)
-    use_s2 = getattr(vision_tower_cfg, "s2", False)
-
-    #print(getattr(vision_tower_cfg, "vision_tower", None))
-    return SigLipVisionTower(vision_tower, vision_tower_cfg=vision_tower_cfg, **kwargs)
-    if getattr(vision_tower_cfg, "vision_tower", None) and "siglip" in getattr(vision_tower_cfg, "vision_tower", None).lower():
-        #print('*************\n')
-        return SigLipVisionTower(vision_tower, vision_tower_cfg=vision_tower_cfg, **kwargs)
-
-
-    raise ValueError(f"Unknown vision tower: {vision_tower}")
multimodal_projector/__pycache__/builder.cpython-310.pyc DELETED
Binary file (2.4 kB)
 
multimodal_projector/__pycache__/pooler_projector.cpython-310.pyc DELETED
Binary file (1.47 kB)
 
multimodal_projector/pooler_projector.py DELETED
@@ -1,33 +0,0 @@
-import torch
-import torch.nn as nn
-
-import math
-
-from transformers.models.clip.modeling_clip import CLIPVisionModel
-
-
-class PoolerProjector(nn.Module):
-    def __init__(self, config, vision_cfg):
-        super().__init__()
-        self._config = config
-        self.hw = vision_cfg.image_size // vision_cfg.patch_size
-
-        self.conv_pool = nn.Conv2d(config.mm_hidden_size, config.hidden_size, kernel_size=2, stride=2)
-
-        self.proj = nn.Sequential(
-            nn.GELU(),
-            nn.Linear(config.hidden_size, config.hidden_size),
-        )
-
-    def forward(self, x, *args, **kwargs):
-        height = width = self.hw
-        assert height * width == x.shape[1]
-        x = x.view(x.shape[0], height, width, -1).permute(0, 3, 1, 2)
-        x = self.conv_pool(x)
-        x = x.flatten(2).transpose(1, 2)
-        x = self.proj(x)
-        return x
-
-    @property
-    def config(self):
-        return {"mm_projector_type": "pooler"}
multimodal_resampler/__pycache__/builder.cpython-310.pyc DELETED
Binary file (1.45 kB)
 
multimodal_resampler/__pycache__/masked_drop.cpython-310.pyc DELETED
Binary file (2.47 kB)
 
multimodal_resampler/__pycache__/perceiver.cpython-310.pyc DELETED
Binary file (4.86 kB)
 
multimodal_resampler/__pycache__/qformer.cpython-310.pyc DELETED
Binary file (32.7 kB)
 
multimodal_resampler/__pycache__/spatial_pool.cpython-310.pyc DELETED
Binary file (1.9 kB)
 
multimodal_resampler/builder.py DELETED
@@ -1,34 +0,0 @@
-import torch
-
-from .masked_drop import MaskedDrop
-from .spatial_pool import SpatialPool
-from .perceiver import PerceiverResampler
-from .qformer import Qformer
-
-
-class IdentityMap(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, x, *args, **kwargs):
-        return x
-
-    @property
-    def config(self):
-        return {"mm_resampler_type": None}
-
-
-def build_vision_resampler(model_args, delay_load=False, **kwargs):
-    resampler_type = getattr(model_args, "mm_resampler_type", None)
-    if resampler_type == "masked_drop":
-        return MaskedDrop(model_args)
-    elif resampler_type == "spatial_pool":
-        return SpatialPool(model_args, **kwargs)
-    elif resampler_type == "perceiver":
-        return PerceiverResampler(model_args, **kwargs)
-    elif resampler_type == "qformer":
-        return Qformer(model_args, **kwargs)
-    elif resampler_type is None:
-        return IdentityMap()
-
-    raise ValueError(f"Unknown resampler type: {resampler_type}")
multimodal_resampler/masked_drop.py DELETED
@@ -1,80 +0,0 @@
-import torch
-import torch.nn as nn
-
-import random
-
-
-class MaskedDrop(nn.Module):
-    def __init__(self, model_args):
-        super().__init__()
-
-        self.mode = model_args.mm_mask_drop_mode
-        self.skip_percentage = model_args.mm_mask_drop_skip_percentage
-        self.ratio = model_args.mm_mask_drop_ratio
-        self.ratio_upper = model_args.mm_mask_drop_ratio_upper
-        self.ratio_lower = model_args.mm_mask_drop_ratio_lower
-
-    def forward(self, image_features, *args, **kwargs):
-
-        if not self.training:
-            return image_features
-
-        if self.skip_percentage > random.random():
-            return image_features
-
-        masked_features = []
-
-        for image_feature in image_features:
-            num_tokens = image_feature.shape[0]
-            if self.mode == "fixed":
-                num_keep = int(num_tokens * self.ratio)
-                masked_features.append(self.random_masking(image_feature.unsqueeze(0), num_keep)[0][0])
-            elif self.mode == "range":
-                num_keep = int(num_tokens * random.uniform(self.ratio_lower, self.ratio_upper))
-                masked_features.append(self.random_masking(image_feature.unsqueeze(0), num_keep)[0])
-            elif self.mode == "cls_only":
-                masked_features.append(image_feature[0:1])
-            else:
-                raise ValueError(f"Unexpected masked drop mode: {self.mode}")
-
-        if self.mode not in ["range"] and (type(image_features) is not list or self.mode in ["cls_only"]):
-            masked_features = torch.stack(masked_features, dim=0)
-
-        return masked_features
-
-    @property
-    def config(self):
-        return {
-            "mm_resampler_type": "masked_drop",
-            "mm_mask_drop_mode": self.mode,
-            "mm_mask_drop_skip_percentage": self.skip_percentage,
-            "mm_mask_drop_ratio": self.ratio,
-            "mm_mask_drop_ratio_upper": self.ratio_upper,
-            "mm_mask_drop_ratio_lower": self.ratio_lower,
-        }
-
-    def random_masking(self, x, len_keep):
-        """
-        Perform per-sample random masking by per-sample shuffling.
-        Per-sample shuffling is done by argsort random noise.
-        x: [N, L, D], sequence
-        """
-        N, L, D = x.shape  # batch, length, dim
-
-        noise = torch.rand(N, L, device=x.device)  # noise in [0, 1]
-
-        # sort noise for each sample
-        ids_shuffle = torch.argsort(noise, dim=1)  # ascend: small is keep, large is remove
-        ids_restore = torch.argsort(ids_shuffle, dim=1)
-
-        # keep the first subset
-        ids_keep = ids_shuffle[:, :len_keep]
-        x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D))
-
-        # generate the binary mask: 0 is keep, 1 is remove
-        mask = torch.ones([N, L], device=x.device)
-        mask[:, :len_keep] = 0
-        # unshuffle to get the binary mask
-        mask = torch.gather(mask, dim=1, index=ids_restore)
-
-        return x_masked, mask, ids_restore
multimodal_resampler/perceiver.py DELETED
@@ -1,155 +0,0 @@
-"""
-Taken from https://github.com/lucidrains/flamingo-pytorch
-"""
-
-import torch
-from einops import rearrange, repeat
-
-try:
-    from einops_exts import rearrange_many
-except:
-    pass
-
-from torch import einsum, nn
-
-
-def exists(val):
-    return val is not None
-
-
-def FeedForward(dim, mult=4):
-    inner_dim = int(dim * mult)
-    return nn.Sequential(
-        nn.LayerNorm(dim),
-        nn.Linear(dim, inner_dim, bias=False),
-        nn.GELU(),
-        nn.Linear(inner_dim, dim, bias=False),
-    )
-
-
-class PerceiverAttention(nn.Module):
-    def __init__(self, *, dim, dim_head=64, heads=8):
-        super().__init__()
-        self.scale = dim_head**-0.5
-        self.heads = heads
-        inner_dim = dim_head * heads
-
-        self.norm_media = nn.LayerNorm(dim)
-        self.norm_latents = nn.LayerNorm(dim)
-
-        self.to_q = nn.Linear(dim, inner_dim, bias=False)
-        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
-        self.to_out = nn.Linear(inner_dim, dim, bias=False)
-
-    def forward(self, x, latents):
-        """
-        Args:
-            x (torch.Tensor): image features
-                shape (b, T, n1, D)
-            latent (torch.Tensor): latent features
-                shape (b, T, n2, D)
-        """
-        x = self.norm_media(x)
-        latents = self.norm_latents(latents)
-
-        h = self.heads
-
-        q = self.to_q(latents)
-        kv_input = torch.cat((x, latents), dim=-2)
-        k, v = self.to_kv(kv_input).chunk(2, dim=-1)
-        q, k, v = rearrange_many((q, k, v), "b t n (h d) -> b h t n d", h=h)
-        q = q * self.scale
-
-        # attention
-        sim = einsum("... i d, ... j d -> ... i j", q, k)
-        sim = sim - sim.amax(dim=-1, keepdim=True).detach()
-        attn = sim.softmax(dim=-1)
-
-        out = einsum("... i j, ... j d -> ... i d", attn, v)
-        out = rearrange(out, "b h t n d -> b t n (h d)", h=h)
-        return self.to_out(out)
-
-
-class PerceiverResamplerModule(nn.Module):
-    def __init__(
-        self,
-        *,
-        dim,
-        depth=6,
-        dim_head=64,
-        heads=8,
-        num_latents=64,
-        max_num_media=None,
-        max_num_frames=None,
-        ff_mult=4,
-    ):
-        super().__init__()
-        self.latents = nn.Parameter(torch.randn(num_latents, dim))
-        self.frame_embs = nn.Parameter(torch.randn(max_num_frames, dim)) if exists(max_num_frames) else None
-        self.media_time_embs = nn.Parameter(torch.randn(max_num_media, 1, dim)) if exists(max_num_media) else None
-
-        self.layers = nn.ModuleList([])
-        for _ in range(depth):
-            self.layers.append(
-                nn.ModuleList(
-                    [
-                        PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
-                        FeedForward(dim=dim, mult=ff_mult) if ff_mult > 0 else nn.Identity(),
-                    ]
-                )
-            )
-
-        self.norm = nn.LayerNorm(dim)
-
-    def forward(self, x):
-        """
-        Args:
-            x (torch.Tensor): image features
-                shape (b, T, F, v, D)
-        Returns:
-            shape (b, T, n, D) where n is self.num_latents
-        """
-        b, T, F, v = x.shape[:4]
-
-        # frame and media time embeddings
-        if exists(self.frame_embs):
-            frame_embs = repeat(self.frame_embs[:F], "F d -> b T F v d", b=b, T=T, v=v)
-            x = x + frame_embs
-        x = rearrange(x, "b T F v d -> b T (F v) d")  # flatten the frame and spatial dimensions
-        if exists(self.media_time_embs):
-            x = x + self.media_time_embs[:T]
-
-        # blocks
-        latents = repeat(self.latents, "n d -> b T n d", b=b, T=T)
-        for attn, ff in self.layers:
-            latents = attn(x, latents) + latents
-            latents = ff(latents) + latents
-        return self.norm(latents)
-
-
-class PerceiverResampler(nn.Module):
-    def __init__(self, model_args, vision_tower):
-        super().__init__()
-
-        self.depth = model_args.mm_perceiver_depth
-        self.num_latents = model_args.mm_perceiver_latents
-        self.ff_mult = model_args.mm_perceiver_ff_mult
-        self.pretrained = model_args.mm_perceiver_pretrained
-
-        self.perceiver = PerceiverResamplerModule(dim=vision_tower.hidden_size, depth=self.depth, num_latents=self.num_latents, ff_mult=self.ff_mult)
-
-        if self.pretrained is not None:
-            self.load_state_dict(torch.load(self.pretrained))
-
-    def forward(self, image_features, *args, **kwargs):
-        return self.perceiver(image_features[:, None, None]).squeeze(1)
-
-    @property
-    def config(self):
-        return {
-            "mm_resampler_type": "perceiver",
-            "mm_perceiver_depth": self.depth,
-            "mm_perceiver_latents": self.num_latents,
-            "mm_perceiver_ff_mult": self.ff_mult,
-            "mm_perceiver_pretrained": self.pretrained,
-        }
multimodal_resampler/qformer.py DELETED
@@ -1,1160 +0,0 @@
1
- """
2
- * Copyright (c) 2023, salesforce.com, inc.
3
- * All rights reserved.
4
- * SPDX-License-Identifier: BSD-3-Clause
5
- * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
- * By Junnan Li
7
- * Based on huggingface code base
8
- * https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert
9
- """
10
-
11
- import math
12
- import os
13
- import warnings
14
- from dataclasses import dataclass
15
- from typing import Optional, Tuple, Dict, Any
16
-
17
- import torch
18
- from torch import Tensor, device, dtype, nn
19
- import torch.utils.checkpoint
20
- from torch import nn
21
- from torch.nn import CrossEntropyLoss
22
- import torch.nn.functional as F
23
-
24
- from transformers.activations import ACT2FN
25
- from transformers.file_utils import (
26
- ModelOutput,
27
- )
28
- from transformers.modeling_outputs import (
29
- BaseModelOutputWithPastAndCrossAttentions,
30
- BaseModelOutputWithPoolingAndCrossAttentions,
31
- CausalLMOutputWithCrossAttentions,
32
- MaskedLMOutput,
33
- MultipleChoiceModelOutput,
34
- NextSentencePredictorOutput,
35
- QuestionAnsweringModelOutput,
36
- SequenceClassifierOutput,
37
- TokenClassifierOutput,
38
- )
39
- from transformers.modeling_utils import (
40
- PreTrainedModel,
41
- apply_chunking_to_forward,
42
- find_pruneable_heads_and_indices,
43
- prune_linear_layer,
44
- )
45
- from transformers.utils import logging
46
- from transformers.models.bert.configuration_bert import BertConfig
47
-
48
- logger = logging.get_logger(__name__)
49
-
50
-
51
- def disabled_train(self, mode=True):
52
- """Overwrite model.train with this function to make sure train/eval mode
53
- does not change anymore."""
54
- return self
55
-
56
-
57
- class BertEmbeddings(nn.Module):
58
- """Construct the embeddings from word and position embeddings."""
59
-
60
- def __init__(self, config):
61
- super().__init__()
62
- self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
63
- self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
64
-
65
- # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
66
- # any TensorFlow checkpoint file
67
- self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
68
- self.dropout = nn.Dropout(config.hidden_dropout_prob)
69
-
70
- # position_ids (1, len position emb) is contiguous in memory and exported when serialized
71
- self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
72
- self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
73
-
74
- self.config = config
75
-
76
- def forward(
77
- self,
78
- input_ids=None,
79
- position_ids=None,
80
- query_embeds=None,
81
- past_key_values_length=0,
82
- ):
83
- if input_ids is not None:
84
- seq_length = input_ids.size()[1]
85
- else:
86
- seq_length = 0
87
-
88
- if position_ids is None:
89
- position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone()
90
-
91
- if input_ids is not None:
92
- embeddings = self.word_embeddings(input_ids)
93
- if self.position_embedding_type == "absolute":
94
- position_embeddings = self.position_embeddings(position_ids)
95
- embeddings = embeddings + position_embeddings
96
-
97
- if query_embeds is not None:
98
- embeddings = torch.cat((query_embeds, embeddings), dim=1)
99
- else:
100
- embeddings = query_embeds
101
-
102
- embeddings = self.LayerNorm(embeddings)
103
- embeddings = self.dropout(embeddings)
104
- return embeddings
105
-
106
-
107
- class BertSelfAttention(nn.Module):
108
- def __init__(self, config, is_cross_attention):
109
- super().__init__()
110
- self.config = config
111
- if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
112
- raise ValueError("The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads))
113
-
114
- self.num_attention_heads = config.num_attention_heads
115
- self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
116
- self.all_head_size = self.num_attention_heads * self.attention_head_size
117
-
118
- self.query = nn.Linear(config.hidden_size, self.all_head_size)
119
- if is_cross_attention:
120
- self.key = nn.Linear(config.encoder_width, self.all_head_size)
121
- self.value = nn.Linear(config.encoder_width, self.all_head_size)
122
- else:
123
- self.key = nn.Linear(config.hidden_size, self.all_head_size)
124
- self.value = nn.Linear(config.hidden_size, self.all_head_size)
125
-
126
- self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
127
- self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
128
- if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
129
- self.max_position_embeddings = config.max_position_embeddings
130
- self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
131
- self.save_attention = False
132
-
133
- def save_attn_gradients(self, attn_gradients):
134
- self.attn_gradients = attn_gradients
135
-
136
- def get_attn_gradients(self):
137
- return self.attn_gradients
138
-
139
- def save_attention_map(self, attention_map):
140
- self.attention_map = attention_map
141
-
142
- def get_attention_map(self):
143
- return self.attention_map
144
-
145
- def transpose_for_scores(self, x):
146
- new_x_shape = x.size()[:-1] + (
147
- self.num_attention_heads,
148
- self.attention_head_size,
149
- )
150
- x = x.view(*new_x_shape)
151
- return x.permute(0, 2, 1, 3)
152
-
153
- def forward(
154
- self,
155
- hidden_states,
156
- attention_mask=None,
157
- head_mask=None,
158
- encoder_hidden_states=None,
159
- encoder_attention_mask=None,
160
- past_key_value=None,
161
- output_attentions=False,
162
- ):
163
-
164
- # If this is instantiated as a cross-attention module, the keys
165
- # and values come from an encoder; the attention mask needs to be
166
- # such that the encoder's padding tokens are not attended to.
167
- is_cross_attention = encoder_hidden_states is not None
168
-
169
- if is_cross_attention:
170
- key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
171
- value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
172
- attention_mask = encoder_attention_mask
173
- elif past_key_value is not None:
174
- key_layer = self.transpose_for_scores(self.key(hidden_states))
175
- value_layer = self.transpose_for_scores(self.value(hidden_states))
176
- key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
177
- value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
178
- else:
179
- key_layer = self.transpose_for_scores(self.key(hidden_states))
180
- value_layer = self.transpose_for_scores(self.value(hidden_states))
181
-
182
- mixed_query_layer = self.query(hidden_states)
183
-
184
- query_layer = self.transpose_for_scores(mixed_query_layer)
185
-
186
- past_key_value = (key_layer, value_layer)
187
-
188
- # Take the dot product between "query" and "key" to get the raw attention scores.
189
- attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
190
-
191
- if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
192
- seq_length = hidden_states.size()[1]
193
- position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
194
- position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
195
- distance = position_ids_l - position_ids_r
196
- positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
197
- positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
198
-
199
- if self.position_embedding_type == "relative_key":
200
- relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
201
- attention_scores = attention_scores + relative_position_scores
202
- elif self.position_embedding_type == "relative_key_query":
203
- relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
204
- relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
205
- attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
206
-
207
- attention_scores = attention_scores / math.sqrt(self.attention_head_size)
208
- if attention_mask is not None:
209
- # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
210
- attention_scores = attention_scores + attention_mask
211
-
212
- # Normalize the attention scores to probabilities.
213
- attention_probs = nn.Softmax(dim=-1)(attention_scores)
214
-
215
- if is_cross_attention and self.save_attention:
216
- self.save_attention_map(attention_probs)
217
- attention_probs.register_hook(self.save_attn_gradients)
218
-
219
- # This is actually dropping out entire tokens to attend to, which might
220
- # seem a bit unusual, but is taken from the original Transformer paper.
221
- attention_probs_dropped = self.dropout(attention_probs)
222
-
223
- # Mask heads if we want to
224
- if head_mask is not None:
225
- attention_probs_dropped = attention_probs_dropped * head_mask
226
-
227
- context_layer = torch.matmul(attention_probs_dropped, value_layer)
228
-
229
- context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
230
- new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
231
- context_layer = context_layer.view(*new_context_layer_shape)
232
-
233
- outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
234
-
235
- outputs = outputs + (past_key_value,)
236
- return outputs
237
-
238
-
239
- class BertSelfOutput(nn.Module):
240
- def __init__(self, config):
241
- super().__init__()
242
- self.dense = nn.Linear(config.hidden_size, config.hidden_size)
243
- self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
244
- self.dropout = nn.Dropout(config.hidden_dropout_prob)
245
-
246
- def forward(self, hidden_states, input_tensor):
247
- hidden_states = self.dense(hidden_states)
248
- hidden_states = self.dropout(hidden_states)
249
- hidden_states = self.LayerNorm(hidden_states + input_tensor)
250
- return hidden_states
251
-
252
-
253
- class BertAttention(nn.Module):
254
- def __init__(self, config, is_cross_attention=False):
255
- super().__init__()
256
- self.self = BertSelfAttention(config, is_cross_attention)
257
- self.output = BertSelfOutput(config)
258
- self.pruned_heads = set()
259
-
260
- def prune_heads(self, heads):
261
- if len(heads) == 0:
262
- return
263
- heads, index = find_pruneable_heads_and_indices(
264
- heads,
265
- self.self.num_attention_heads,
266
- self.self.attention_head_size,
267
- self.pruned_heads,
268
- )
269
-
270
- # Prune linear layers
271
- self.self.query = prune_linear_layer(self.self.query, index)
272
- self.self.key = prune_linear_layer(self.self.key, index)
273
- self.self.value = prune_linear_layer(self.self.value, index)
274
- self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
275
-
276
- # Update hyper params and store pruned heads
277
- self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
278
- self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
279
- self.pruned_heads = self.pruned_heads.union(heads)
280
-
281
- def forward(
282
- self,
283
- hidden_states,
284
- attention_mask=None,
285
- head_mask=None,
286
- encoder_hidden_states=None,
287
- encoder_attention_mask=None,
288
- past_key_value=None,
289
- output_attentions=False,
290
- ):
291
- self_outputs = self.self(
292
- hidden_states,
293
- attention_mask,
294
- head_mask,
295
- encoder_hidden_states,
296
- encoder_attention_mask,
297
- past_key_value,
298
- output_attentions,
299
- )
300
- attention_output = self.output(self_outputs[0], hidden_states)
301
-
302
- outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
303
- return outputs
304
-
305
-
306
- class BertIntermediate(nn.Module):
307
- def __init__(self, config):
308
- super().__init__()
309
- self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
310
- if isinstance(config.hidden_act, str):
311
- self.intermediate_act_fn = ACT2FN[config.hidden_act]
312
- else:
313
- self.intermediate_act_fn = config.hidden_act
314
-
315
- def forward(self, hidden_states):
316
- hidden_states = self.dense(hidden_states)
317
- hidden_states = self.intermediate_act_fn(hidden_states)
318
- return hidden_states
319
-
320
-
321
- class BertOutput(nn.Module):
322
- def __init__(self, config):
323
- super().__init__()
324
- self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
325
- self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
326
- self.dropout = nn.Dropout(config.hidden_dropout_prob)
327
-
328
- def forward(self, hidden_states, input_tensor):
329
- hidden_states = self.dense(hidden_states)
330
- hidden_states = self.dropout(hidden_states)
331
- hidden_states = self.LayerNorm(hidden_states + input_tensor)
332
- return hidden_states
333
-
334
-
335
- class BertLayer(nn.Module):
336
- def __init__(self, config, layer_num):
337
- super().__init__()
338
- self.config = config
339
- self.chunk_size_feed_forward = config.chunk_size_feed_forward
340
- self.seq_len_dim = 1
341
- self.attention = BertAttention(config)
342
- self.layer_num = layer_num
343
- if self.config.add_cross_attention and layer_num % self.config.cross_attention_freq == 0:
344
- self.crossattention = BertAttention(config, is_cross_attention=self.config.add_cross_attention)
345
- self.has_cross_attention = True
346
- else:
347
- self.has_cross_attention = False
348
- self.intermediate = BertIntermediate(config)
349
- self.output = BertOutput(config)
350
-
351
- self.intermediate_query = BertIntermediate(config)
352
- self.output_query = BertOutput(config)
353
-
354
- def forward(
355
- self,
356
- hidden_states,
357
- attention_mask=None,
358
- head_mask=None,
359
- encoder_hidden_states=None,
360
- encoder_attention_mask=None,
361
- past_key_value=None,
362
- output_attentions=False,
363
- query_length=0,
364
- ):
365
- # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
366
- self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
367
- self_attention_outputs = self.attention(
368
- hidden_states,
369
- attention_mask,
370
- head_mask,
371
- output_attentions=output_attentions,
372
- past_key_value=self_attn_past_key_value,
373
- )
374
- attention_output = self_attention_outputs[0]
375
- outputs = self_attention_outputs[1:-1]
376
-
377
- present_key_value = self_attention_outputs[-1]
378
-
379
- if query_length > 0:
380
- query_attention_output = attention_output[:, :query_length, :]
381
-
382
- if self.has_cross_attention:
383
- assert encoder_hidden_states is not None, "encoder_hidden_states must be given for cross-attention layers"
384
- cross_attention_outputs = self.crossattention(
385
- query_attention_output,
386
- attention_mask,
387
- head_mask,
388
- encoder_hidden_states,
389
- encoder_attention_mask,
390
- output_attentions=output_attentions,
391
- )
392
- query_attention_output = cross_attention_outputs[0]
393
- outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
394
-
395
- layer_output = apply_chunking_to_forward(
396
- self.feed_forward_chunk_query,
397
- self.chunk_size_feed_forward,
398
- self.seq_len_dim,
399
- query_attention_output,
400
- )
401
- if attention_output.shape[1] > query_length:
402
- layer_output_text = apply_chunking_to_forward(
403
- self.feed_forward_chunk,
404
- self.chunk_size_feed_forward,
405
- self.seq_len_dim,
406
- attention_output[:, query_length:, :],
407
- )
408
- layer_output = torch.cat([layer_output, layer_output_text], dim=1)
409
- else:
410
- layer_output = apply_chunking_to_forward(
411
- self.feed_forward_chunk,
412
- self.chunk_size_feed_forward,
413
- self.seq_len_dim,
414
- attention_output,
415
- )
416
- outputs = (layer_output,) + outputs
417
-
418
- outputs = outputs + (present_key_value,)
419
-
420
- return outputs
421
-
422
- def feed_forward_chunk(self, attention_output):
423
- intermediate_output = self.intermediate(attention_output)
424
- layer_output = self.output(intermediate_output, attention_output)
425
- return layer_output
426
-
427
- def feed_forward_chunk_query(self, attention_output):
428
- intermediate_output = self.intermediate_query(attention_output)
429
- layer_output = self.output_query(intermediate_output, attention_output)
430
- return layer_output
431
-
432
-
433
- class BertEncoder(nn.Module):
434
- def __init__(self, config):
435
- super().__init__()
436
- self.config = config
437
- self.layer = nn.ModuleList([BertLayer(config, i) for i in range(config.num_hidden_layers)])
438
-
439
- def forward(
440
- self,
441
- hidden_states,
442
- attention_mask=None,
443
- head_mask=None,
444
- encoder_hidden_states=None,
445
- encoder_attention_mask=None,
446
- past_key_values=None,
447
- use_cache=None,
448
- output_attentions=False,
449
- output_hidden_states=False,
450
- return_dict=True,
451
- query_length=0,
452
- ):
453
- all_hidden_states = () if output_hidden_states else None
454
- all_self_attentions = () if output_attentions else None
455
- all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
456
-
457
- next_decoder_cache = () if use_cache else None
458
-
459
- for i in range(self.config.num_hidden_layers):
460
- layer_module = self.layer[i]
461
- if output_hidden_states:
462
- all_hidden_states = all_hidden_states + (hidden_states,)
463
-
464
- layer_head_mask = head_mask[i] if head_mask is not None else None
465
- past_key_value = past_key_values[i] if past_key_values is not None else None
466
-
467
- if getattr(self.config, "gradient_checkpointing", False) and self.training:
468
-
469
- if use_cache:
470
- logger.warn("`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
471
- use_cache = False
472
-
473
- def create_custom_forward(module):
474
- def custom_forward(*inputs):
475
- return module(*inputs, past_key_value, output_attentions, query_length)
476
-
477
- return custom_forward
478
-
479
- layer_outputs = torch.utils.checkpoint.checkpoint(
480
- create_custom_forward(layer_module),
481
- hidden_states,
482
- attention_mask,
483
- layer_head_mask,
484
- encoder_hidden_states,
485
- encoder_attention_mask,
486
- )
487
- else:
488
- layer_outputs = layer_module(
489
- hidden_states,
490
- attention_mask,
491
- layer_head_mask,
492
- encoder_hidden_states,
493
- encoder_attention_mask,
494
- past_key_value,
495
- output_attentions,
496
- query_length,
497
- )
498
-
499
- hidden_states = layer_outputs[0]
500
- if use_cache:
501
- next_decoder_cache += (layer_outputs[-1],)
502
- if output_attentions:
503
- all_self_attentions = all_self_attentions + (layer_outputs[1],)
504
- all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
505
-
506
- if output_hidden_states:
507
- all_hidden_states = all_hidden_states + (hidden_states,)
508
-
509
- if not return_dict:
510
- return tuple(
511
- v
512
- for v in [
513
- hidden_states,
514
- next_decoder_cache,
515
- all_hidden_states,
516
- all_self_attentions,
517
- all_cross_attentions,
518
- ]
519
- if v is not None
520
- )
521
- return BaseModelOutputWithPastAndCrossAttentions(
522
- last_hidden_state=hidden_states,
523
- past_key_values=next_decoder_cache,
524
- hidden_states=all_hidden_states,
525
- attentions=all_self_attentions,
526
- cross_attentions=all_cross_attentions,
527
- )
528
-
529
-
530
- class BertPooler(nn.Module):
531
- def __init__(self, config):
532
- super().__init__()
533
- self.dense = nn.Linear(config.hidden_size, config.hidden_size)
534
- self.activation = nn.Tanh()
535
-
536
- def forward(self, hidden_states):
537
- # We "pool" the model by simply taking the hidden state corresponding
538
- # to the first token.
539
- first_token_tensor = hidden_states[:, 0]
540
- pooled_output = self.dense(first_token_tensor)
541
- pooled_output = self.activation(pooled_output)
542
- return pooled_output
543
-
544
-
545
- class BertPredictionHeadTransform(nn.Module):
546
- def __init__(self, config):
547
- super().__init__()
548
- self.dense = nn.Linear(config.hidden_size, config.hidden_size)
549
- if isinstance(config.hidden_act, str):
550
- self.transform_act_fn = ACT2FN[config.hidden_act]
551
- else:
552
- self.transform_act_fn = config.hidden_act
553
- self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
554
-
555
- def forward(self, hidden_states):
556
- hidden_states = self.dense(hidden_states)
557
- hidden_states = self.transform_act_fn(hidden_states)
558
- hidden_states = self.LayerNorm(hidden_states)
559
- return hidden_states
560
-
561
-
562
- class BertLMPredictionHead(nn.Module):
563
- def __init__(self, config):
564
- super().__init__()
565
- self.transform = BertPredictionHeadTransform(config)
566
-
567
- # The output weights are the same as the input embeddings, but there is
568
- # an output-only bias for each token.
569
- self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
570
-
571
- self.bias = nn.Parameter(torch.zeros(config.vocab_size))
572
-
573
- # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
574
- self.decoder.bias = self.bias
575
-
576
- def forward(self, hidden_states):
577
- hidden_states = self.transform(hidden_states)
578
- hidden_states = self.decoder(hidden_states)
579
- return hidden_states
580
-
581
-
582
- class BertOnlyMLMHead(nn.Module):
583
- def __init__(self, config):
584
- super().__init__()
585
- self.predictions = BertLMPredictionHead(config)
586
-
587
- def forward(self, sequence_output):
588
- prediction_scores = self.predictions(sequence_output)
589
- return prediction_scores
590
-
591
-
592
- class BertPreTrainedModel(PreTrainedModel):
593
- """
594
- An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
595
- models.
596
- """
597
-
598
- config_class = BertConfig
599
- base_model_prefix = "bert"
600
- _keys_to_ignore_on_load_missing = [r"position_ids"]
601
-
602
- def _init_weights(self, module):
603
- """Initialize the weights"""
604
- if isinstance(module, (nn.Linear, nn.Embedding)):
605
- # Slightly different from the TF version which uses truncated_normal for initialization
606
- # cf https://github.com/pytorch/pytorch/pull/5617
607
- module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
608
- elif isinstance(module, nn.LayerNorm):
609
- module.bias.data.zero_()
610
- module.weight.data.fill_(1.0)
611
- if isinstance(module, nn.Linear) and module.bias is not None:
612
- module.bias.data.zero_()
613
-
614
-
615
- class BertModel(BertPreTrainedModel):
616
- """
617
- The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
618
- cross-attention is added between the self-attention layers, following the architecture described in `Attention is
619
- all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
620
- Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
621
- argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
622
- input to the forward pass.
623
- """
624
-
625
- def __init__(self, config, add_pooling_layer=False):
626
- super().__init__(config)
627
- self.config = config
628
-
629
- self.embeddings = BertEmbeddings(config)
630
-
631
- self.encoder = BertEncoder(config)
632
-
633
- self.pooler = BertPooler(config) if add_pooling_layer else None
634
-
635
- self.init_weights()
636
-
637
- def get_input_embeddings(self):
638
- return self.embeddings.word_embeddings
639
-
640
- def set_input_embeddings(self, value):
641
- self.embeddings.word_embeddings = value
642
-
643
- def _prune_heads(self, heads_to_prune):
644
- """
645
- Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
646
- class PreTrainedModel
647
- """
648
- for layer, heads in heads_to_prune.items():
649
- self.encoder.layer[layer].attention.prune_heads(heads)
650
-
651
- def get_extended_attention_mask(
652
- self,
653
- attention_mask: Tensor,
654
- input_shape: Tuple[int],
655
- device: device,
656
- is_decoder: bool,
657
- has_query: bool = False,
658
- ) -> Tensor:
659
- """
660
- Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
661
-
662
- Arguments:
663
- attention_mask (:obj:`torch.Tensor`):
664
- Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
665
- input_shape (:obj:`Tuple[int]`):
666
- The shape of the input to the model.
667
- device: (:obj:`torch.device`):
668
- The device of the input to the model.
669
-
670
- Returns:
671
- :obj:`torch.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`.
672
- """
673
- # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
674
- # ourselves in which case we just need to make it broadcastable to all heads.
675
- if attention_mask.dim() == 3:
676
- extended_attention_mask = attention_mask[:, None, :, :]
677
- elif attention_mask.dim() == 2:
678
- # Provided a padding mask of dimensions [batch_size, seq_length]
679
- # - if the model is a decoder, apply a causal mask in addition to the padding mask
680
- # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
681
- if is_decoder:
682
- batch_size, seq_length = input_shape
683
-
684
- seq_ids = torch.arange(seq_length, device=device)
685
- causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
686
-
687
- # add a prefix ones mask to the causal mask
688
- # causal and attention masks must have same type with pytorch version < 1.3
689
- causal_mask = causal_mask.to(attention_mask.dtype)
690
-
691
- if causal_mask.shape[1] < attention_mask.shape[1]:
692
- prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
693
- if has_query: # UniLM style attention mask
694
- causal_mask = torch.cat(
695
- [
696
- torch.zeros(
697
- (batch_size, prefix_seq_len, seq_length),
698
- device=device,
699
- dtype=causal_mask.dtype,
700
- ),
701
- causal_mask,
702
- ],
703
- axis=1,
704
- )
705
- causal_mask = torch.cat(
706
- [
707
- torch.ones(
708
- (batch_size, causal_mask.shape[1], prefix_seq_len),
709
- device=device,
710
- dtype=causal_mask.dtype,
711
- ),
712
- causal_mask,
713
- ],
714
- axis=-1,
715
- )
716
- extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
717
- else:
718
- extended_attention_mask = attention_mask[:, None, None, :]
719
- else:
720
- raise ValueError("Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(input_shape, attention_mask.shape))
721
-
722
- # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
723
- # masked positions, this operation will create a tensor which is 0.0 for
724
- # positions we want to attend and -10000.0 for masked positions.
725
- # Since we are adding it to the raw scores before the softmax, this is
726
- # effectively the same as removing these entirely.
727
- extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility
728
- extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
729
- return extended_attention_mask
730
-
731
- def forward(
732
- self,
733
- input_ids=None,
734
- attention_mask=None,
735
- position_ids=None,
736
- head_mask=None,
737
- query_embeds=None,
738
- encoder_hidden_states=None,
739
- encoder_attention_mask=None,
740
- past_key_values=None,
741
- use_cache=None,
742
- output_attentions=None,
743
- output_hidden_states=None,
744
- return_dict=None,
745
- is_decoder=False,
746
- ):
747
- r"""
748
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
749
- Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
750
- the model is configured as a decoder.
751
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
752
- Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
753
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
754
- - 1 for tokens that are **not masked**,
755
- - 0 for tokens that are **masked**.
756
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
757
- Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
758
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
759
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
760
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
761
- use_cache (:obj:`bool`, `optional`):
762
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
763
- decoding (see :obj:`past_key_values`).
764
- """
765
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
766
- output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
767
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
768
-
769
- # use_cache = use_cache if use_cache is not None else self.config.use_cache
770
-
771
- if input_ids is None:
772
- assert query_embeds is not None, "You have to specify query_embeds when input_ids is None"
773
-
774
- # past_key_values_length
775
- past_key_values_length = past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0
776
-
777
- query_length = query_embeds.shape[1] if query_embeds is not None else 0
778
-
779
- embedding_output = self.embeddings(
780
- input_ids=input_ids,
781
- position_ids=position_ids,
782
- query_embeds=query_embeds,
783
- past_key_values_length=past_key_values_length,
784
- )
785
-
786
- input_shape = embedding_output.size()[:-1]
787
- batch_size, seq_length = input_shape
788
- device = embedding_output.device
789
-
790
- if attention_mask is None:
791
- attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
792
-
793
- # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
794
- # ourselves in which case we just need to make it broadcastable to all heads.
795
- if is_decoder:
796
- extended_attention_mask = self.get_extended_attention_mask(
797
- attention_mask,
798
- input_ids.shape,
799
- device,
800
- is_decoder,
801
- has_query=(query_embeds is not None),
802
- )
803
- else:
804
- extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device, is_decoder)
805
-
806
- # If a 2D or 3D attention mask is provided for the cross-attention
807
- # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
808
- if encoder_hidden_states is not None:
809
- if type(encoder_hidden_states) == list:
810
- encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
811
- else:
812
- (
813
- encoder_batch_size,
814
- encoder_sequence_length,
815
- _,
816
- ) = encoder_hidden_states.size()
817
- encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
818
-
819
- if type(encoder_attention_mask) == list:
820
- encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
821
- elif encoder_attention_mask is None:
822
- encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
823
- encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
824
- else:
825
- encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
826
- else:
827
- encoder_extended_attention_mask = None
828
-
829
- # Prepare head mask if needed
830
- # 1.0 in head_mask indicate we keep the head
831
- # attention_probs has shape bsz x n_heads x N x N
832
- # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
833
- # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
834
- head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
835
-
836
- encoder_outputs = self.encoder(
837
- embedding_output,
838
- attention_mask=extended_attention_mask,
839
- head_mask=head_mask,
840
- encoder_hidden_states=encoder_hidden_states,
841
- encoder_attention_mask=encoder_extended_attention_mask,
842
- past_key_values=past_key_values,
843
- use_cache=use_cache,
844
- output_attentions=output_attentions,
845
- output_hidden_states=output_hidden_states,
846
- return_dict=return_dict,
847
- query_length=query_length,
848
- )
849
- sequence_output = encoder_outputs[0]
850
- pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
851
-
852
- if not return_dict:
853
- return (sequence_output, pooled_output) + encoder_outputs[1:]
854
-
855
- return BaseModelOutputWithPoolingAndCrossAttentions(
856
- last_hidden_state=sequence_output,
857
- pooler_output=pooled_output,
858
- past_key_values=encoder_outputs.past_key_values,
859
- hidden_states=encoder_outputs.hidden_states,
860
- attentions=encoder_outputs.attentions,
861
- cross_attentions=encoder_outputs.cross_attentions,
862
- )
863
-
864
-
865
- class BertLMHeadModel(BertPreTrainedModel):
866
-
867
- _keys_to_ignore_on_load_unexpected = [r"pooler"]
868
- _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
869
-
870
- def __init__(self, config):
871
- super().__init__(config)
872
-
873
- self.bert = BertModel(config, add_pooling_layer=False)
874
- self.cls = BertOnlyMLMHead(config)
875
-
876
- self.init_weights()
877
-
878
- def get_output_embeddings(self):
879
- return self.cls.predictions.decoder
880
-
881
- def set_output_embeddings(self, new_embeddings):
882
- self.cls.predictions.decoder = new_embeddings
883
-
884
- def forward(
885
- self,
886
- input_ids=None,
887
- attention_mask=None,
888
- position_ids=None,
889
- head_mask=None,
890
- query_embeds=None,
891
- encoder_hidden_states=None,
892
- encoder_attention_mask=None,
893
- labels=None,
894
- past_key_values=None,
895
- use_cache=True,
896
- output_attentions=None,
897
- output_hidden_states=None,
898
- return_dict=None,
899
- return_logits=False,
900
- is_decoder=True,
901
- reduction="mean",
902
- ):
903
- r"""
904
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
905
- Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
906
- the model is configured as a decoder.
907
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
908
- Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
909
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
910
- - 1 for tokens that are **not masked**,
911
- - 0 for tokens that are **masked**.
912
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
913
- Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
914
- ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
915
- ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]``
916
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
917
- Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
918
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
919
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
920
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
921
- use_cache (:obj:`bool`, `optional`):
922
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
923
- decoding (see :obj:`past_key_values`).
924
- Returns:
925
- Example::
926
- >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
927
- >>> import torch
928
- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
929
- >>> config = BertConfig.from_pretrained("bert-base-cased")
930
- >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
931
- >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
932
- >>> outputs = model(**inputs)
933
- >>> prediction_logits = outputs.logits
934
- """
935
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
936
- if labels is not None:
937
- use_cache = False
938
- if past_key_values is not None:
939
- query_embeds = None
940
-
941
- outputs = self.bert(
942
- input_ids,
943
- attention_mask=attention_mask,
944
- position_ids=position_ids,
945
- head_mask=head_mask,
946
- query_embeds=query_embeds,
947
- encoder_hidden_states=encoder_hidden_states,
948
- encoder_attention_mask=encoder_attention_mask,
949
- past_key_values=past_key_values,
950
- use_cache=use_cache,
951
- output_attentions=output_attentions,
952
- output_hidden_states=output_hidden_states,
953
- return_dict=return_dict,
954
- is_decoder=is_decoder,
955
- )
956
-
957
- sequence_output = outputs[0]
958
- if query_embeds is not None:
959
- sequence_output = outputs[0][:, query_embeds.shape[1] :, :]
960
-
961
- prediction_scores = self.cls(sequence_output)
962
-
963
- if return_logits:
964
- return prediction_scores[:, :-1, :].contiguous()
965
-
966
- lm_loss = None
967
- if labels is not None:
968
- # we are doing next-token prediction; shift prediction scores and input ids by one
969
- shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
970
- labels = labels[:, 1:].contiguous()
971
- loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1)
972
- lm_loss = loss_fct(
973
- shifted_prediction_scores.view(-1, self.config.vocab_size),
974
- labels.view(-1),
975
- )
976
- if reduction == "none":
977
- lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)
978
-
979
- if not return_dict:
980
- output = (prediction_scores,) + outputs[2:]
981
- return ((lm_loss,) + output) if lm_loss is not None else output
982
-
983
- return CausalLMOutputWithCrossAttentions(
984
- loss=lm_loss,
985
- logits=prediction_scores,
986
- past_key_values=outputs.past_key_values,
987
- hidden_states=outputs.hidden_states,
988
- attentions=outputs.attentions,
989
- cross_attentions=outputs.cross_attentions,
990
- )
991
-
992
- def prepare_inputs_for_generation(self, input_ids, query_embeds, past=None, attention_mask=None, **model_kwargs):
993
- # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
994
- if attention_mask is None:
995
- attention_mask = input_ids.new_ones(input_ids.shape)
996
- query_mask = input_ids.new_ones(query_embeds.shape[:-1])
997
- attention_mask = torch.cat([query_mask, attention_mask], dim=-1)
998
-
999
- # cut decoder_input_ids if past is used
1000
- if past is not None:
1001
- input_ids = input_ids[:, -1:]
1002
-
1003
- return {
1004
- "input_ids": input_ids,
1005
- "query_embeds": query_embeds,
1006
- "attention_mask": attention_mask,
1007
- "past_key_values": past,
1008
- "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
1009
- "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
1010
- "is_decoder": True,
1011
- }
1012
-
1013
- def _reorder_cache(self, past, beam_idx):
1014
- reordered_past = ()
1015
- for layer_past in past:
1016
- reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
1017
- return reordered_past
1018
-
1019
-
1020
- class BertForMaskedLM(BertPreTrainedModel):
1021
-
1022
- _keys_to_ignore_on_load_unexpected = [r"pooler"]
1023
- _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
1024
-
1025
- def __init__(self, config):
1026
- super().__init__(config)
1027
-
1028
- self.bert = BertModel(config, add_pooling_layer=False)
1029
- self.cls = BertOnlyMLMHead(config)
1030
-
1031
- self.init_weights()
1032
-
1033
- def get_output_embeddings(self):
1034
- return self.cls.predictions.decoder
1035
-
1036
- def set_output_embeddings(self, new_embeddings):
1037
- self.cls.predictions.decoder = new_embeddings
1038
-
1039
- def forward(
1040
- self,
1041
- input_ids=None,
1042
- attention_mask=None,
1043
- position_ids=None,
1044
- head_mask=None,
1045
- query_embeds=None,
1046
- encoder_hidden_states=None,
1047
- encoder_attention_mask=None,
1048
- labels=None,
1049
- output_attentions=None,
1050
- output_hidden_states=None,
1051
- return_dict=None,
1052
- return_logits=False,
1053
- is_decoder=False,
1054
- ):
1055
- r"""
1056
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
1057
- Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
1058
- config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
1059
- (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
1060
- """
1061
-
1062
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1063
-
1064
- outputs = self.bert(
1065
- input_ids,
1066
- attention_mask=attention_mask,
1067
- position_ids=position_ids,
1068
- head_mask=head_mask,
1069
- query_embeds=query_embeds,
1070
- encoder_hidden_states=encoder_hidden_states,
1071
- encoder_attention_mask=encoder_attention_mask,
1072
- output_attentions=output_attentions,
1073
- output_hidden_states=output_hidden_states,
1074
- return_dict=return_dict,
1075
- is_decoder=is_decoder,
1076
- )
1077
-
1078
- if query_embeds is not None:
1079
- sequence_output = outputs[0][:, query_embeds.shape[1] :, :]
1080
- prediction_scores = self.cls(sequence_output)
1081
-
1082
- if return_logits:
1083
- return prediction_scores
1084
-
1085
- masked_lm_loss = None
1086
- if labels is not None:
1087
- loss_fct = CrossEntropyLoss() # -100 index = padding token
1088
- masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
1089
-
1090
- if not return_dict:
1091
- output = (prediction_scores,) + outputs[2:]
1092
- return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
1093
-
1094
- return MaskedLMOutput(
1095
- loss=masked_lm_loss,
1096
- logits=prediction_scores,
1097
- hidden_states=outputs.hidden_states,
1098
- attentions=outputs.attentions,
1099
- )
1100
-
1101
-
1102
- class Qformer(nn.Module):
1103
- def __init__(self, model_args, vision_tower):
1104
- super().__init__()
1105
-
1106
- self.depth = model_args.mm_qformer_depth
1107
- self.num_latents = model_args.mm_qformer_latents
1108
- self.pretrained = model_args.mm_qformer_pretrained
1109
-
1110
- self.Qformer, self.query_tokens, self.ln_vision = self.build_Qformer(vision_tower.hidden_size, self.depth, self.num_latents)
1111
-
1112
- if self.pretrained is not None:
1113
- pretrained_dict = torch.load(self.pretrained, map_location="cpu")["model"]
1114
- pretrained_dict = {k: v for k, v in pretrained_dict.items() if not k.startswith("t5_proj")}
1115
- self.load_state_dict(pretrained_dict)
1116
-
1117
- def build_Qformer(self, vision_width, cross_attention_freq, num_query_token):
1118
- encoder_config = BertConfig.from_pretrained("bert-base-uncased")
1119
- encoder_config.encoder_width = vision_width
1120
- # insert cross-attention layer every other block
1121
- encoder_config.add_cross_attention = True
1122
- encoder_config.cross_attention_freq = cross_attention_freq
1123
- encoder_config.query_length = num_query_token
1124
- Qformer = BertLMHeadModel(config=encoder_config)
1125
- query_tokens = nn.Parameter(torch.zeros(1, num_query_token, encoder_config.hidden_size))
1126
- query_tokens.data.normal_(mean=0.0, std=encoder_config.initializer_range)
1127
- Qformer.cls = None
1128
- Qformer.bert.embeddings.word_embeddings = None
1129
- Qformer.bert.embeddings.position_embeddings = None
1130
- for layer in Qformer.bert.encoder.layer:
1131
- layer.output = None
1132
- layer.intermediate = None
1133
- return Qformer, query_tokens, nn.LayerNorm(vision_width)
1134
-
1135
- def forward(self, image_features, *args, **kwargs):
1136
- x = self.ln_vision(image_features)
1137
- image_atts = torch.ones(x.size()[:-1], dtype=torch.long).to(x.device)
1138
-
1139
- query_tokens = self.query_tokens.expand(x.shape[0], -1, -1)
1140
- query_output = self.Qformer.bert(
1141
- query_embeds=query_tokens,
1142
- encoder_hidden_states=x,
1143
- encoder_attention_mask=image_atts,
1144
- return_dict=True,
1145
- )
1146
-
1147
- return query_output.last_hidden_state
1148
-
1149
- @property
1150
- def hidden_size(self):
1151
- return 768
1152
-
1153
- @property
1154
- def config(self):
1155
- return {
1156
- "mm_resampler_type": "qformer",
1157
- "mm_qformer_depth": self.depth,
1158
- "mm_qformer_latents": self.num_latents,
1159
- "mm_qformer_pretrained": self.pretrained,
1160
- }
 
 
 
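The Qformer resampler deleted above compresses a variable-length sequence of vision-tower features into a fixed set of learned query tokens: `query_tokens` is expanded per batch element and cross-attends to the `ln_vision`-normalized image features inside `Qformer.bert`. A minimal stand-in sketch of that idea, using `torch.nn.MultiheadAttention` in place of the BERT stack (the class name and dimensions below are hypothetical, chosen only for illustration):

```python
import torch
import torch.nn as nn

class TinyQueryResampler(nn.Module):
    """Stand-in for the removed Q-Former: learned queries cross-attend to vision features."""
    def __init__(self, vision_width=1152, hidden=768, num_queries=32, heads=8):
        super().__init__()
        self.ln_vision = nn.LayerNorm(vision_width)           # mirrors ln_vision
        self.proj = nn.Linear(vision_width, hidden)            # match widths for the attention
        self.query_tokens = nn.Parameter(torch.zeros(1, num_queries, hidden))
        nn.init.normal_(self.query_tokens, std=0.02)           # mirrors query_tokens init
        self.attn = nn.MultiheadAttention(hidden, heads, batch_first=True)

    def forward(self, image_features):                         # (B, N, vision_width)
        x = self.proj(self.ln_vision(image_features))          # (B, N, hidden)
        q = self.query_tokens.expand(x.shape[0], -1, -1)       # (B, num_queries, hidden)
        out, _ = self.attn(q, x, x)                            # queries attend to patch tokens
        return out                                             # fixed length: (B, num_queries, hidden)

feats = torch.randn(2, 576, 1152)                              # e.g. a 24x24 patch grid
print(TinyQueryResampler()(feats).shape)                       # torch.Size([2, 32, 768])
```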
 
 
 
 
 
 
 
 
 
 
 
 
sae.py CHANGED
@@ -1,8 +1,1440 @@
1
  import torch
2
-
3
- from .sae_utils import SamePadConv3d,Normalize,SiLU,TemporalAttention,AttnBlock3D,MultiHeadAttention3D,TemporalAttention_lin
4
  import torch.nn as nn
5
  import pdb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  class SiglipAE(nn.Module):
8
  def __init__(self):
@@ -34,12 +1466,4 @@ class SiglipAE(nn.Module):
34
 
35
  x=self.encoder(x)
36
  return x
37
- # image=torch.randn(1,1152,4,24,24).to('cuda')
38
-
39
-
40
- # model = SiglipAE().to('cuda')
41
- # model.load_state_dict(torch.load('encoder.pth'),strict=False)
42
-
43
- # image=model(image)
44
 
45
- # print(image.shape)
 
1
  import torch
 
 
2
  import torch.nn as nn
3
  import pdb
4
+ import math
5
+ from transformers.activations import ACT2FN
6
+ from einops import rearrange, reduce, repeat
7
+ from inspect import isfunction
8
+ import math
9
+ import torch.nn.functional as F
10
+ from torch import nn, einsum
11
+ from einops import rearrange, repeat
12
+ from typing import Optional, Any
13
+
14
+ try:
15
+ import xformers
16
+ import xformers.ops
17
+
18
+ XFORMERS_IS_AVAILBLE = True
19
+ except:
20
+ XFORMERS_IS_AVAILBLE = False
21
+
22
+ import importlib
23
+ import numpy as np
24
+ import cv2, os
25
+ import torch.distributed as dist
26
+
27
+
28
+ def count_params(model, verbose=False):
29
+ total_params = sum(p.numel() for p in model.parameters())
30
+ if verbose:
31
+ print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
32
+ return total_params
33
+
34
+
35
+ def check_istarget(name, para_list):
36
+ """
37
+ name: full name of source para
38
+ para_list: partial name of target para
39
+ """
40
+ istarget = False
41
+ for para in para_list:
42
+ if para in name:
43
+ return True
44
+ return istarget
45
+
46
+
47
+ def instantiate_from_config(config):
48
+ if not "target" in config:
49
+ if config == "__is_first_stage__":
50
+ return None
51
+ elif config == "__is_unconditional__":
52
+ return None
53
+ raise KeyError("Expected key `target` to instantiate.")
54
+
55
+ return get_obj_from_str(config["target"])(**config.get("params", dict()))
56
+
57
+
58
+ def get_obj_from_str(string, reload=False):
59
+ module, cls = string.rsplit(".", 1)
60
+ if reload:
61
+ module_imp = importlib.import_module(module)
62
+ importlib.reload(module_imp)
63
+ return getattr(importlib.import_module(module, package=None), cls)
64
+
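`instantiate_from_config` expects a dict with a dotted `target` path plus optional `params`, and `get_obj_from_str` resolves that path with `importlib`. A quick sanity check of the same lookup logic (the config entry below is made up purely for illustration):

```python
import importlib

config = {"target": "torch.nn.SiLU", "params": {}}     # hypothetical config entry

# Same resolution that get_obj_from_str performs: split the dotted path, import, getattr.
module, cls = config["target"].rsplit(".", 1)
obj = getattr(importlib.import_module(module), cls)(**config.get("params", {}))
print(obj)   # SiLU()
```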
65
+
66
+ def load_npz_from_dir(data_dir):
67
+ data = [
68
+ np.load(os.path.join(data_dir, data_name))["arr_0"]
69
+ for data_name in os.listdir(data_dir)
70
+ ]
71
+ data = np.concatenate(data, axis=0)
72
+ return data
73
+
74
+
75
+ def load_npz_from_paths(data_paths):
76
+ data = [np.load(data_path)["arr_0"] for data_path in data_paths]
77
+ data = np.concatenate(data, axis=0)
78
+ return data
79
+
80
+
81
+ def resize_numpy_image(image, max_resolution=512 * 512, resize_short_edge=None):
82
+ h, w = image.shape[:2]
83
+ if resize_short_edge is not None:
84
+ k = resize_short_edge / min(h, w)
85
+ else:
86
+ k = max_resolution / (h * w)
87
+ k = k**0.5
88
+ h = int(np.round(h * k / 64)) * 64
89
+ w = int(np.round(w * k / 64)) * 64
90
+ image = cv2.resize(image, (w, h), interpolation=cv2.INTER_LANCZOS4)
91
+ return image
92
+
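`resize_numpy_image` picks a scale factor `k` so the output area lands near `max_resolution` (or so the short edge matches `resize_short_edge`), then snaps both sides to multiples of 64 before the LANCZOS resize. The arithmetic in isolation, without the `cv2` call (the input size is an arbitrary example):

```python
import numpy as np

h, w, max_resolution = 720, 1280, 512 * 512
k = (max_resolution / (h * w)) ** 0.5        # uniform scale toward the target area
new_h = int(np.round(h * k / 64)) * 64       # snap each side to a multiple of 64
new_w = int(np.round(w * k / 64)) * 64
print(new_h, new_w, new_h * new_w)           # 384 704 270336 (close to 262144)
```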
93
+
94
+ def setup_dist(args):
95
+ if dist.is_initialized():
96
+ return
97
+ torch.cuda.set_device(args.local_rank)
98
+ torch.distributed.init_process_group("nccl", init_method="env://")
99
+
100
+
101
+ # adopted from
102
+ # https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
103
+ # and
104
+ # https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
105
+ # and
106
+ # https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
107
+ #
108
+ # thanks!
109
+
110
+ import torch.nn as nn
111
+ import math
112
+ from inspect import isfunction
113
+ import torch
114
+ from torch import nn
115
+ import torch.distributed as dist
116
+
117
+
118
+ def gather_data(data, return_np=True):
119
+ """gather data from multiple processes to one list"""
120
+ data_list = [torch.zeros_like(data) for _ in range(dist.get_world_size())]
121
+ dist.all_gather(data_list, data) # gather not supported with NCCL
122
+ if return_np:
123
+ data_list = [data.cpu().numpy() for data in data_list]
124
+ return data_list
125
+
126
+
127
+ def autocast(f):
128
+ def do_autocast(*args, **kwargs):
129
+ with torch.cuda.amp.autocast(
130
+ enabled=True,
131
+ dtype=torch.get_autocast_gpu_dtype(),
132
+ cache_enabled=torch.is_autocast_cache_enabled(),
133
+ ):
134
+ return f(*args, **kwargs)
135
+
136
+ return do_autocast
137
+
138
+
139
+ def extract_into_tensor(a, t, x_shape):
140
+ b, *_ = t.shape
141
+ out = a.gather(-1, t)
142
+ return out.reshape(b, *((1,) * (len(x_shape) - 1)))
143
+
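`extract_into_tensor` gathers one scalar per batch element from a 1-D schedule `a` at the timesteps `t`, then reshapes the result so it broadcasts against an image- or video-shaped tensor. A minimal shape check (the schedule values are arbitrary):

```python
import torch

a = torch.linspace(0.9, 0.1, steps=1000)         # per-timestep coefficients
t = torch.tensor([0, 500, 999])                  # one timestep per batch element
x = torch.randn(3, 4, 16, 32, 32)                # (B, C, T, H, W) latent

coef = a.gather(-1, t).reshape(3, 1, 1, 1, 1)    # same gather + reshape as the helper
print(coef.shape, (coef * x).shape)              # torch.Size([3, 1, 1, 1, 1]) torch.Size([3, 4, 16, 32, 32])
```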
144
+
145
+ def noise_like(shape, device, repeat=False):
146
+ repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(
147
+ shape[0], *((1,) * (len(shape) - 1))
148
+ )
149
+ noise = lambda: torch.randn(shape, device=device)
150
+ return repeat_noise() if repeat else noise()
151
+
152
+
153
+ def default(val, d):
154
+ if exists(val):
155
+ return val
156
+ return d() if isfunction(d) else d
157
+
158
+
159
+ def exists(val):
160
+ return val is not None
161
+
162
+
163
+ def identity(*args, **kwargs):
164
+ return nn.Identity()
165
+
166
+
167
+ def uniq(arr):
168
+ return {el: True for el in arr}.keys()
169
+
170
+
171
+ def mean_flat(tensor):
172
+ """
173
+ Take the mean over all non-batch dimensions.
174
+ """
175
+ return tensor.mean(dim=list(range(1, len(tensor.shape))))
176
+
177
+
178
+ def ismap(x):
179
+ if not isinstance(x, torch.Tensor):
180
+ return False
181
+ return (len(x.shape) == 4) and (x.shape[1] > 3)
182
+
183
+
184
+ def isimage(x):
185
+ if not isinstance(x, torch.Tensor):
186
+ return False
187
+ return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1)
188
+
189
+
190
+ def max_neg_value(t):
191
+ return -torch.finfo(t.dtype).max
192
+
193
+
194
+ def shape_to_str(x):
195
+ shape_str = "x".join([str(x) for x in x.shape])
196
+ return shape_str
197
+
198
+
199
+ def init_(tensor):
200
+ dim = tensor.shape[-1]
201
+ std = 1 / math.sqrt(dim)
202
+ tensor.uniform_(-std, std)
203
+ return tensor
204
+
205
+
206
+
207
+ def disabled_train(self, mode=True):
208
+ """Overwrite model.train with this function to make sure train/eval mode
209
+ does not change anymore."""
210
+ return self
211
+
212
+
213
+ def zero_module(module):
214
+ """
215
+ Zero out the parameters of a module and return it.
216
+ """
217
+ for p in module.parameters():
218
+ p.detach().zero_()
219
+ return module
220
+
221
+
222
+ def scale_module(module, scale):
223
+ """
224
+ Scale the parameters of a module and return it.
225
+ """
226
+ for p in module.parameters():
227
+ p.detach().mul_(scale)
228
+ return module
229
+
230
+
231
+ def conv_nd(dims, *args, **kwargs):
232
+ """
233
+ Create a 1D, 2D, or 3D convolution module.
234
+ """
235
+ if dims == 1:
236
+ return nn.Conv1d(*args, **kwargs)
237
+ elif dims == 2:
238
+ return nn.Conv2d(*args, **kwargs)
239
+ elif dims == 3:
240
+ return nn.Conv3d(*args, **kwargs)
241
+ raise ValueError(f"unsupported dimensions: {dims}")
242
+
243
+
244
+ def linear(*args, **kwargs):
245
+ """
246
+ Create a linear module.
247
+ """
248
+ return nn.Linear(*args, **kwargs)
249
+
250
+
251
+ def avg_pool_nd(dims, *args, **kwargs):
252
+ """
253
+ Create a 1D, 2D, or 3D average pooling module.
254
+ """
255
+ if dims == 1:
256
+ return nn.AvgPool1d(*args, **kwargs)
257
+ elif dims == 2:
258
+ return nn.AvgPool2d(*args, **kwargs)
259
+ elif dims == 3:
260
+ return nn.AvgPool3d(*args, **kwargs)
261
+ raise ValueError(f"unsupported dimensions: {dims}")
262
+
263
+
264
+ def nonlinearity(type="silu"):
265
+ if type == "silu":
266
+ return nn.SiLU()
267
+ elif type == "leaky_relu":
268
+ return nn.LeakyReLU()
269
+
270
+
271
+ class GroupNormSpecific(nn.GroupNorm):
272
+ def forward(self, x):
273
+ if x.dtype == torch.float16 or x.dtype == torch.bfloat16:
274
+ return super().forward(x).type(x.dtype)
275
+ else:
276
+ return super().forward(x.float()).type(x.dtype)
277
+
278
+
279
+ def normalization(channels, num_groups=32):
280
+ """
281
+ Make a standard normalization layer.
282
+ :param channels: number of input channels.
283
+ :return: an nn.Module for normalization.
284
+ """
285
+ return GroupNormSpecific(num_groups, channels)
286
+
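`normalization` builds a 32-group GroupNorm (via `GroupNormSpecific`, which only adjusts the compute dtype so half-precision activations keep their dtype on the way out). A small float32 usage sketch with arbitrary channel counts:

```python
import torch
import torch.nn as nn

gn = nn.GroupNorm(num_groups=32, num_channels=256)   # the layer normalization(256) wraps
x = torch.randn(2, 256, 4, 24, 24)                   # (B, C, T, H, W); GroupNorm accepts any trailing dims
y = gn(x)
print(y.shape, round(y.mean().item(), 4))            # shape preserved, overall mean ~0
```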
287
+
288
+ class HybridConditioner(nn.Module):
289
+
290
+ def __init__(self, c_concat_config, c_crossattn_config):
291
+ super().__init__()
292
+ self.concat_conditioner = instantiate_from_config(c_concat_config)
293
+ self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)
294
+
295
+ def forward(self, c_concat, c_crossattn):
296
+ c_concat = self.concat_conditioner(c_concat)
297
+ c_crossattn = self.crossattn_conditioner(c_crossattn)
298
+ return {"c_concat": [c_concat], "c_crossattn": [c_crossattn]}
299
+
300
+ def exists(val):
301
+ return val is not None
302
+
303
+
304
+ def uniq(arr):
305
+ return {el: True for el in arr}.keys()
306
+
307
+
308
+ def default(val, d):
309
+ if exists(val):
310
+ return val
311
+ return d() if isfunction(d) else d
312
+
313
+
314
+ def max_neg_value(t):
315
+ return -torch.finfo(t.dtype).max
316
+
317
+
318
+ def init_(tensor):
319
+ dim = tensor.shape[-1]
320
+ std = 1 / math.sqrt(dim)
321
+ tensor.uniform_(-std, std)
322
+ return tensor
323
+
324
+
325
+ # feedforward
326
+ class GEGLU(nn.Module):
327
+ def __init__(self, dim_in, dim_out):
328
+ super().__init__()
329
+ self.proj = nn.Linear(dim_in, dim_out * 2)
330
+
331
+ def forward(self, x):
332
+ x, gate = self.proj(x).chunk(2, dim=-1)
333
+ return x * F.gelu(gate)
334
+
335
+
336
+ class FeedForward(nn.Module):
337
+ def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
338
+ super().__init__()
339
+ inner_dim = int(dim * mult)
340
+ dim_out = default(dim_out, dim)
341
+ project_in = (
342
+ nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU())
343
+ if not glu
344
+ else GEGLU(dim, inner_dim)
345
+ )
346
+
347
+ self.net = nn.Sequential(
348
+ project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
349
+ )
350
+
351
+ def forward(self, x):
352
+ return self.net(x)
353
+
354
+
355
+ def zero_module(module):
356
+ """
357
+ Zero out the parameters of a module and return it.
358
+ """
359
+ for p in module.parameters():
360
+ p.detach().zero_()
361
+ return module
362
+
363
+
364
+ def Normalize(in_channels, num_groups=32):
365
+ return torch.nn.GroupNorm(
366
+ num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True
367
+ )
368
+
369
+
370
+ class RelativePosition(nn.Module):
371
+ """https://github.com/evelinehong/Transformer_Relative_Position_PyTorch/blob/master/relative_position.py"""
372
+
373
+ def __init__(self, num_units, max_relative_position):
374
+ super().__init__()
375
+ self.num_units = num_units
376
+ self.max_relative_position = max_relative_position
377
+ self.embeddings_table = nn.Parameter(
378
+ torch.Tensor(max_relative_position * 2 + 1, num_units)
379
+ )
380
+ nn.init.xavier_uniform_(self.embeddings_table)
381
+
382
+ def forward(self, length_q, length_k):
383
+ device = self.embeddings_table.device
384
+ range_vec_q = torch.arange(length_q, device=device)
385
+ range_vec_k = torch.arange(length_k, device=device)
386
+ distance_mat = range_vec_k[None, :] - range_vec_q[:, None]
387
+ distance_mat_clipped = torch.clamp(
388
+ distance_mat, -self.max_relative_position, self.max_relative_position
389
+ )
390
+ final_mat = distance_mat_clipped + self.max_relative_position
391
+ # final_mat = torch.LongTensor(final_mat).to(self.embeddings_table.device)
392
+ # final_mat = torch.tensor(final_mat, device=self.embeddings_table.device, dtype=torch.long)
393
+ final_mat = final_mat.long()
394
+ embeddings = self.embeddings_table[final_mat]
395
+ return embeddings
396
+
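`RelativePosition` builds a `(length_q, length_k)` table of clipped frame offsets and uses it to index a learned embedding table of size `2 * max_relative_position + 1`. The indexing logic in isolation (sizes chosen only for illustration):

```python
import torch

length_q = length_k = 6
max_rel = 4
range_q = torch.arange(length_q)
range_k = torch.arange(length_k)
distance = range_k[None, :] - range_q[:, None]           # signed frame offsets
index = distance.clamp(-max_rel, max_rel) + max_rel      # shift into [0, 2 * max_rel]

table = torch.randn(2 * max_rel + 1, 64)                 # (2*max_relative_position+1, num_units)
emb = table[index]                                       # (length_q, length_k, num_units)
print(index[0].tolist(), emb.shape)                      # [4, 5, 6, 7, 8, 8] torch.Size([6, 6, 64])
```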
397
+
398
+ class TemporalCrossAttention(nn.Module):
399
+ def __init__(
400
+ self,
401
+ query_dim,
402
+ context_dim=None,
403
+ heads=8,
404
+ dim_head=64,
405
+ dropout=0.0,
406
+ temporal_length=None, # For relative positional representation and image-video joint training.
407
+ image_length=None, # For image-video joint training.
408
+ use_relative_position=False, # whether use relative positional representation in temporal attention.
409
+ img_video_joint_train=False, # For image-video joint training.
410
+ use_tempoal_causal_attn=False,
411
+ bidirectional_causal_attn=False,
412
+ tempoal_attn_type=None,
413
+ joint_train_mode="same_batch",
414
+ **kwargs,
415
+ ):
416
+ super().__init__()
417
+ inner_dim = dim_head * heads
418
+ context_dim = default(context_dim, query_dim)
419
+ self.context_dim = context_dim
420
+
421
+ self.scale = dim_head**-0.5
422
+ self.heads = heads
423
+ self.temporal_length = temporal_length
424
+ self.use_relative_position = use_relative_position
425
+ self.img_video_joint_train = img_video_joint_train
426
+ self.bidirectional_causal_attn = bidirectional_causal_attn
427
+ self.joint_train_mode = joint_train_mode
428
+ assert joint_train_mode in ["same_batch", "diff_batch"]
429
+ self.tempoal_attn_type = tempoal_attn_type
430
+
431
+ if bidirectional_causal_attn:
432
+ assert use_tempoal_causal_attn
433
+ if tempoal_attn_type:
434
+ assert tempoal_attn_type in ["sparse_causal", "sparse_causal_first"]
435
+ assert not use_tempoal_causal_attn
436
+ assert not (
437
+ img_video_joint_train and (self.joint_train_mode == "same_batch")
438
+ )
439
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
440
+ self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
441
+ self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
442
+
443
+ assert not (
444
+ img_video_joint_train
445
+ and (self.joint_train_mode == "same_batch")
446
+ and use_tempoal_causal_attn
447
+ )
448
+ if img_video_joint_train:
449
+ if self.joint_train_mode == "same_batch":
450
+ mask = torch.ones(
451
+ [1, temporal_length + image_length, temporal_length + image_length]
452
+ )
453
+ # mask[:, image_length:, :] = 0
454
+ # mask[:, :, image_length:] = 0
455
+ mask[:, temporal_length:, :] = 0
456
+ mask[:, :, temporal_length:] = 0
457
+ self.mask = mask
458
+ else:
459
+ self.mask = None
460
+ elif use_tempoal_causal_attn:
461
+ # normal causal attn
462
+ self.mask = torch.tril(torch.ones([1, temporal_length, temporal_length]))
463
+ elif tempoal_attn_type == "sparse_causal":
464
+ # all frames interact with only the `prev` & self frame
465
+ mask1 = torch.tril(
466
+ torch.ones([1, temporal_length, temporal_length])
467
+ ).bool() # true indicates keeping
468
+ mask2 = torch.zeros(
469
+ [1, temporal_length, temporal_length]
470
+ ) # initialize to same shape with mask1
471
+ mask2[:, 2:temporal_length, : temporal_length - 2] = torch.tril(
472
+ torch.ones([1, temporal_length - 2, temporal_length - 2])
473
+ )
474
+ mask2 = (1 - mask2).bool() # false indicates masking
475
+ self.mask = mask1 & mask2
476
+ elif tempoal_attn_type == "sparse_causal_first":
477
+ # all frames interact with only the `first` & self frame
478
+ mask1 = torch.tril(
479
+ torch.ones([1, temporal_length, temporal_length])
480
+ ).bool() # true indicates keeping
481
+ mask2 = torch.zeros([1, temporal_length, temporal_length])
482
+ mask2[:, 2:temporal_length, 1 : temporal_length - 1] = torch.tril(
483
+ torch.ones([1, temporal_length - 2, temporal_length - 2])
484
+ )
485
+ mask2 = (1 - mask2).bool() # false indicates masking
486
+ self.mask = mask1 & mask2
487
+ else:
488
+ self.mask = None
489
+
490
+ if use_relative_position:
491
+ assert temporal_length is not None
492
+ self.relative_position_k = RelativePosition(
493
+ num_units=dim_head, max_relative_position=temporal_length
494
+ )
495
+ self.relative_position_v = RelativePosition(
496
+ num_units=dim_head, max_relative_position=temporal_length
497
+ )
498
+
499
+ self.to_out = nn.Sequential(
500
+ nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
501
+ )
502
+
503
+ nn.init.constant_(self.to_q.weight, 0)
504
+ nn.init.constant_(self.to_k.weight, 0)
505
+ nn.init.constant_(self.to_v.weight, 0)
506
+ nn.init.constant_(self.to_out[0].weight, 0)
507
+ nn.init.constant_(self.to_out[0].bias, 0)
508
+
509
+ def forward(self, x, context=None, mask=None):
510
+ # if context is None:
511
+ # print(f'[Temp Attn] x={x.shape},context=None')
512
+ # else:
513
+ # print(f'[Temp Attn] x={x.shape},context={context.shape}')
514
+
515
+ nh = self.heads
516
+ out = x
517
+ q = self.to_q(out)
518
+ # if context is not None:
519
+ # print(f'temporal context 1 ={context.shape}')
520
+ # print(f'x={x.shape}')
521
+ context = default(context, x)
522
+ # print(f'temporal context 2 ={context.shape}')
523
+ k = self.to_k(context)
524
+ v = self.to_v(context)
525
+ # print(f'q ={q.shape},k={k.shape}')
526
+
527
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=nh), (q, k, v))
528
+ sim = einsum("b i d, b j d -> b i j", q, k) * self.scale
529
+
530
+ if self.use_relative_position:
531
+ len_q, len_k, len_v = q.shape[1], k.shape[1], v.shape[1]
532
+ k2 = self.relative_position_k(len_q, len_k)
533
+ sim2 = einsum("b t d, t s d -> b t s", q, k2) * self.scale # TODO check
534
+ sim += sim2
535
+ # print('mask',mask)
536
+ if exists(self.mask):
537
+ if mask is None:
538
+ mask = self.mask.to(sim.device)
539
+ else:
540
+ mask = self.mask.to(sim.device).bool() & mask # .to(sim.device)
541
+ else:
542
+ mask = mask
543
+ # if self.img_video_joint_train:
544
+ # # process mask (make mask same shape with sim)
545
+ # c, h, w = mask.shape
546
+ # c, t, s = sim.shape
547
+ # # assert(h == w and t == s),f"mask={mask.shape}, sim={sim.shape}, h={h}, w={w}, t={t}, s={s}"
548
+
549
+ # if h > t:
550
+ # mask = mask[:, :t, :]
551
+ # elif h < t: # pad zeros to mask (no attention) only initial mask =1 area compute weights
552
+ # mask_ = torch.zeros([c,t,w]).to(mask.device)
553
+ # mask_[:, :h, :] = mask
554
+ # mask = mask_
555
+ # c, h, w = mask.shape
556
+ # if w > s:
557
+ # mask = mask[:, :, :s]
558
+ # elif w < s: # pad zeros to mask
559
+ # mask_ = torch.zeros([c,h,s]).to(mask.device)
560
+ # mask_[:, :, :w] = mask
561
+ # mask = mask_
562
+
563
+ # max_neg_value = -torch.finfo(sim.dtype).max
564
+ # sim = sim.float().masked_fill(mask == 0, max_neg_value)
565
+ if mask is not None:
566
+ max_neg_value = -1e9
567
+ sim = sim + (1 - mask.float()) * max_neg_value # mask: 1 = attend, 0 = masked out via additive -1e9
568
+ # print('sim after masking: ', sim)
569
+
570
+ # if torch.isnan(sim).any() or torch.isinf(sim).any() or (not sim.any()):
571
+ # print(f'sim [after masking], isnan={torch.isnan(sim).any()}, isinf={torch.isinf(sim).any()}, allzero={not sim.any()}')
572
+
573
+ attn = sim.softmax(dim=-1)
574
+ # print('attn after softmax: ', attn)
575
+ # if torch.isnan(attn).any() or torch.isinf(attn).any() or (not attn.any()):
576
+ # print(f'attn [after softmax], isnan={torch.isnan(attn).any()}, isinf={torch.isinf(attn).any()}, allzero={not attn.any()}')
577
+
578
+ # attn = torch.where(torch.isnan(attn), torch.full_like(attn,0), attn)
579
+ # if torch.isinf(attn.detach()).any():
580
+ # import pdb;pdb.set_trace()
581
+ # if torch.isnan(attn.detach()).any():
582
+ # import pdb;pdb.set_trace()
583
+ out = einsum("b i j, b j d -> b i d", attn, v)
584
+
585
+ if self.bidirectional_causal_attn:
586
+ mask_reverse = torch.triu(
587
+ torch.ones(
588
+ [1, self.temporal_length, self.temporal_length], device=sim.device
589
+ )
590
+ )
591
+ sim_reverse = sim.float().masked_fill(mask_reverse == 0, max_neg_value)
592
+ attn_reverse = sim_reverse.softmax(dim=-1)
593
+ out_reverse = einsum("b i j, b j d -> b i d", attn_reverse, v)
594
+ out += out_reverse
595
+
596
+ if self.use_relative_position:
597
+ v2 = self.relative_position_v(len_q, len_v)
598
+ out2 = einsum("b t s, t s d -> b t d", attn, v2) # TODO check
599
+ out += out2 # TODO check: add first or merge heads first? Here the relative-position term is computed on the split-head tensors, then the heads are merged.
600
+ out = rearrange(out, "(b h) n d -> b n (h d)", h=nh) # merge head
601
+ return self.to_out(out)
602
+
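`TemporalCrossAttention` is meant to run after the spatial grid has been folded into the batch dimension, so each spatial location attends over the `t` frames, and the optional causal mask is a lower-triangular `(1, t, t)` tensor. A shape-level sketch of that setup on plain tensors (not the module itself; sizes are illustrative and the q/k/v projections are omitted):

```python
import torch
from einops import rearrange

b, c, t, h, w = 2, 64, 8, 16, 16
x = torch.randn(b, c, t, h, w)

tokens = rearrange(x, "b c t h w -> (b h w) t c")   # one temporal sequence per spatial location
causal = torch.tril(torch.ones(1, t, t))            # 1 = may attend, 0 = future frame

sim = torch.einsum("b i d, b j d -> b i j", tokens, tokens) * (c ** -0.5)
sim = sim + (1 - causal) * -1e9                     # same additive masking used above
attn = sim.softmax(dim=-1)
out = torch.einsum("b i j, b j d -> b i d", attn, tokens)
print(out.shape)                                    # torch.Size([512, 8, 64])
```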
603
+
604
+ class SpatialSelfAttention(nn.Module):
605
+ def __init__(self, in_channels):
606
+ super().__init__()
607
+ self.in_channels = in_channels
608
+
609
+ self.norm = Normalize(in_channels)
610
+ self.q = torch.nn.Conv2d(
611
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
612
+ )
613
+ self.k = torch.nn.Conv2d(
614
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
615
+ )
616
+ self.v = torch.nn.Conv2d(
617
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
618
+ )
619
+ self.proj_out = torch.nn.Conv2d(
620
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
621
+ )
622
+
623
+ def forward(self, x):
624
+ h_ = x
625
+ h_ = self.norm(h_)
626
+ q = self.q(h_)
627
+ k = self.k(h_)
628
+ v = self.v(h_)
629
+
630
+ # compute attention
631
+ b, c, h, w = q.shape
632
+ q = rearrange(q, "b c h w -> b (h w) c")
633
+ k = rearrange(k, "b c h w -> b c (h w)")
634
+ w_ = torch.einsum("bij,bjk->bik", q, k)
635
+
636
+ w_ = w_ * (int(c) ** (-0.5))
637
+ w_ = torch.nn.functional.softmax(w_, dim=2)
638
+
639
+ # attend to values
640
+ v = rearrange(v, "b c h w -> b c (h w)")
641
+ w_ = rearrange(w_, "b i j -> b j i")
642
+ h_ = torch.einsum("bij,bjk->bik", v, w_)
643
+ h_ = rearrange(h_, "b c (h w) -> b c h w", h=h)
644
+ h_ = self.proj_out(h_)
645
+
646
+ return x + h_
647
+
648
+
649
+ class CrossAttention(nn.Module):
650
+ def __init__(
651
+ self,
652
+ query_dim,
653
+ context_dim=None,
654
+ heads=8,
655
+ dim_head=64,
656
+ dropout=0.0,
657
+ sa_shared_kv=False,
658
+ shared_type="only_first",
659
+ **kwargs,
660
+ ):
661
+ super().__init__()
662
+ inner_dim = dim_head * heads
663
+ context_dim = default(context_dim, query_dim)
664
+ self.sa_shared_kv = sa_shared_kv
665
+ assert shared_type in [
666
+ "only_first",
667
+ "all_frames",
668
+ "first_and_prev",
669
+ "only_prev",
670
+ "full",
671
+ "causal",
672
+ "full_qkv",
673
+ ]
674
+ self.shared_type = shared_type
675
+
676
+ self.scale = dim_head**-0.5
677
+ self.heads = heads
678
+ self.dim_head = dim_head
679
+
680
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
681
+ self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
682
+ self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
683
+
684
+ self.to_out = nn.Sequential(
685
+ nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
686
+ )
687
+ self.attention_op: Optional[Any] = None
688
+
689
+ def forward(self, x, context=None, mask=None):
690
+ h = self.heads
691
+ b = x.shape[0]
692
+
693
+ q = self.to_q(x)
694
+ context = default(context, x)
695
+ k = self.to_k(context)
696
+ v = self.to_v(context)
697
+ if self.sa_shared_kv:
698
+ if self.shared_type == "only_first":
699
+ k, v = map(
700
+ lambda xx: rearrange(xx[0].unsqueeze(0), "b n c -> (b n) c")
701
+ .unsqueeze(0)
702
+ .repeat(b, 1, 1),
703
+ (k, v),
704
+ )
705
+ else:
706
+ raise NotImplementedError
707
+
708
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=h), (q, k, v))
709
+
710
+ sim = einsum("b i d, b j d -> b i j", q, k) * self.scale
711
+
712
+ if exists(mask):
713
+ mask = rearrange(mask, "b ... -> b (...)")
714
+ max_neg_value = -torch.finfo(sim.dtype).max
715
+ mask = repeat(mask, "b j -> (b h) () j", h=h)
716
+ sim.masked_fill_(~mask, max_neg_value)
717
+
718
+ # attention, what we cannot get enough of
719
+ attn = sim.softmax(dim=-1)
720
+
721
+ out = einsum("b i j, b j d -> b i d", attn, v)
722
+ out = rearrange(out, "(b h) n d -> b n (h d)", h=h)
723
+ return self.to_out(out)
724
+
725
+ def efficient_forward(self, x, context=None, mask=None):
726
+ q = self.to_q(x)
727
+ context = default(context, x)
728
+ k = self.to_k(context)
729
+ v = self.to_v(context)
730
+
731
+ b, _, _ = q.shape
732
+ q, k, v = map(
733
+ lambda t: t.unsqueeze(3)
734
+ .reshape(b, t.shape[1], self.heads, self.dim_head)
735
+ .permute(0, 2, 1, 3)
736
+ .reshape(b * self.heads, t.shape[1], self.dim_head)
737
+ .contiguous(),
738
+ (q, k, v),
739
+ )
740
+ # actually compute the attention, what we cannot get enough of
741
+ out = xformers.ops.memory_efficient_attention(
742
+ q, k, v, attn_bias=None, op=self.attention_op
743
+ )
744
+
745
+ if exists(mask):
746
+ raise NotImplementedError
747
+ out = (
748
+ out.unsqueeze(0)
749
+ .reshape(b, self.heads, out.shape[1], self.dim_head)
750
+ .permute(0, 2, 1, 3)
751
+ .reshape(b, out.shape[1], self.heads * self.dim_head)
752
+ )
753
+ return self.to_out(out)
754
+
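`CrossAttention.forward` follows the standard pattern: project to q/k/v, fold the heads into the batch dimension with `rearrange`, score with `einsum`, softmax, aggregate, then merge the heads back. The same dance on plain tensors (a sketch, not the module; the projections are omitted and the dimensions are arbitrary):

```python
import torch
from einops import rearrange

heads, dim_head = 8, 64
q = torch.randn(2, 100, heads * dim_head)        # queries, e.g. image tokens
k = v = torch.randn(2, 77, heads * dim_head)     # context, e.g. text tokens

q, k, v = (rearrange(x, "b n (h d) -> (b h) n d", h=heads) for x in (q, k, v))
sim = torch.einsum("b i d, b j d -> b i j", q, k) * dim_head ** -0.5
out = torch.einsum("b i j, b j d -> b i d", sim.softmax(dim=-1), v)
out = rearrange(out, "(b h) n d -> b n (h d)", h=heads)
print(out.shape)                                 # torch.Size([2, 100, 512])
```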
755
+
756
+ class VideoSpatialCrossAttention(CrossAttention):
757
+ def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0):
758
+ super().__init__(query_dim, context_dim, heads, dim_head, dropout)
759
+
760
+ def forward(self, x, context=None, mask=None):
761
+ b, c, t, h, w = x.shape
762
+ if context is not None:
763
+ context = context.repeat(t, 1, 1)
764
+ x = super().forward(spatial_attn_reshape(x), context=context) + x
765
+ return spatial_attn_reshape_back(x, b, h)
766
+
767
+
768
+ def spatial_attn_reshape(x):
769
+ return rearrange(x, "b c t h w -> (b t) (h w) c")
770
+
771
+
772
+ def spatial_attn_reshape_back(x, b, h):
773
+ return rearrange(x, "(b t) (h w) c -> b c t h w", b=b, h=h)
774
+
775
+
776
+ def temporal_attn_reshape(x):
777
+ return rearrange(x, "b c t h w -> (b h w) t c")
778
+
779
+
780
+ def temporal_attn_reshape_back(x, b, h, w):
781
+ return rearrange(x, "(b h w) t c -> b c t h w", b=b, h=h, w=w)
782
+
783
+
784
+ def local_spatial_temporal_attn_reshape(x, window_size):
785
+ B, C, T, H, W = x.shape
786
+ NH = H // window_size
787
+ NW = W // window_size
788
+ # x = x.view(B, C, T, NH, window_size, NW, window_size)
789
+ # tokens = x.permute(0, 1, 2, 3, 5, 4, 6).contiguous()
790
+ # tokens = tokens.view(-1, window_size, window_size, C)
791
+ x = rearrange(
792
+ x,
793
+ "b c t (nh wh) (nw ww) -> b c t nh wh nw ww",
794
+ nh=NH,
795
+ nw=NW,
796
+ wh=window_size,
797
+ ww=window_size,
798
+ ).contiguous() # # B, C, T, NH, NW, window_size, window_size
799
+ x = rearrange(
800
+ x, "b c t nh wh nw ww -> (b nh nw) (t wh ww) c"
801
+ ) # (B, NH, NW) (T, window_size, window_size) C
802
+ return x
803
+
804
+
805
+ def local_spatial_temporal_attn_reshape_back(x, window_size, b, h, w, t):
806
+ B, L, C = x.shape
807
+ NH = h // window_size
808
+ NW = w // window_size
809
+ x = rearrange(
810
+ x,
811
+ "(b nh nw) (t wh ww) c -> b c t nh wh nw ww",
812
+ b=b,
813
+ nh=NH,
814
+ nw=NW,
815
+ t=t,
816
+ wh=window_size,
817
+ ww=window_size,
818
+ )
819
+ x = rearrange(x, "b c t nh wh nw ww -> b c t (nh wh) (nw ww)")
820
+ return x
821
+
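`local_spatial_temporal_attn_reshape` turns each `window_size x window_size` spatial patch, taken across all `t` frames, into one token sequence, and the `_back` variant restores the 5D layout. A round-trip check using the same `rearrange` patterns (shapes are illustrative):

```python
import torch
from einops import rearrange

b, c, t, H, W, win = 1, 8, 4, 16, 16, 8
x = torch.randn(b, c, t, H, W)

# forward: (B*NH*NW, T*win*win, C) -- one sequence per spatial window
tok = rearrange(x, "b c t (nh wh) (nw ww) -> (b nh nw) (t wh ww) c", wh=win, ww=win)
# backward: restore the original (B, C, T, H, W) layout
y = rearrange(tok, "(b nh nw) (t wh ww) c -> b c t (nh wh) (nw ww)",
              b=b, nh=H // win, nw=W // win, t=t, wh=win, ww=win)
print(tok.shape, torch.equal(x, y))              # torch.Size([4, 256, 8]) True
```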
822
+
823
+ class SpatialTemporalTransformer(nn.Module):
824
+ """
825
+ Transformer block for video-like data (5D tensor).
826
+ First, project the input (aka embedding) with NO reshape.
827
+ Then apply standard transformer action.
828
+ The 5D -> 3D reshape operation will be done in the specific attention module.
829
+ """
830
+
831
+ def __init__(
832
+ self,
833
+ in_channels,
834
+ n_heads,
835
+ d_head,
836
+ depth=1,
837
+ dropout=0.0,
838
+ context_dim=None,
839
+ # Temporal stuff
840
+ temporal_length=None,
841
+ image_length=None,
842
+ use_relative_position=True,
843
+ img_video_joint_train=False,
844
+ cross_attn_on_tempoal=False,
845
+ temporal_crossattn_type="selfattn",
846
+ order="stst",
847
+ temporalcrossfirst=False,
848
+ split_stcontext=False,
849
+ temporal_context_dim=None,
850
+ **kwargs,
851
+ ):
852
+ super().__init__()
853
+
854
+ self.in_channels = in_channels
855
+ inner_dim = n_heads * d_head
856
+
857
+ self.norm = Normalize(in_channels)
858
+ self.proj_in = nn.Conv3d(
859
+ in_channels, inner_dim, kernel_size=1, stride=1, padding=0
860
+ )
861
+
862
+ self.transformer_blocks = nn.ModuleList(
863
+ [
864
+ BasicTransformerBlockST(
865
+ inner_dim,
866
+ n_heads,
867
+ d_head,
868
+ dropout=dropout,
869
+ # cross attn
870
+ context_dim=context_dim,
871
+ # temporal attn
872
+ temporal_length=temporal_length,
873
+ image_length=image_length,
874
+ use_relative_position=use_relative_position,
875
+ img_video_joint_train=img_video_joint_train,
876
+ temporal_crossattn_type=temporal_crossattn_type,
877
+ order=order,
878
+ temporalcrossfirst=temporalcrossfirst,
879
+ split_stcontext=split_stcontext,
880
+ temporal_context_dim=temporal_context_dim,
881
+ **kwargs,
882
+ )
883
+ for d in range(depth)
884
+ ]
885
+ )
886
+
887
+ self.proj_out = zero_module(
888
+ nn.Conv3d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
889
+ )
890
+
891
+ def forward(self, x, context=None, temporal_context=None, **kwargs):
892
+ # note: if no context is given, cross-attention defaults to self-attention
893
+ assert x.dim() == 5, f"x shape = {x.shape}"
894
+ b, c, t, h, w = x.shape
895
+ x_in = x
896
+
897
+ x = self.norm(x)
898
+ x = self.proj_in(x)
899
+
900
+ for block in self.transformer_blocks:
901
+ x = block(x, context=context, temporal_context=temporal_context, **kwargs)
902
+
903
+ x = self.proj_out(x)
904
+ return x + x_in
905
+
906
+
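A minimal sketch of the wrapper pattern used by SpatialTemporalTransformer, with an Identity stand-in for the inner transformer blocks; the class and channel sizes below are my own and only illustrate why the module is an identity mapping at initialisation (proj_out is zero-initialised):

    # Illustrative sketch (not the repo's implementation): GroupNorm, 1x1x1 Conv3d in,
    # inner blocks, zero-initialised 1x1x1 Conv3d out, then a residual connection.
    import torch
    import torch.nn as nn

    class TinySTWrapper(nn.Module):
        def __init__(self, in_channels=32, inner_dim=64):
            super().__init__()
            self.norm = nn.GroupNorm(8, in_channels)
            self.proj_in = nn.Conv3d(in_channels, inner_dim, 1)
            self.blocks = nn.Identity()                     # stand-in for BasicTransformerBlockST
            self.proj_out = nn.Conv3d(inner_dim, in_channels, 1)
            nn.init.zeros_(self.proj_out.weight)
            nn.init.zeros_(self.proj_out.bias)

        def forward(self, x):
            return self.proj_out(self.blocks(self.proj_in(self.norm(x)))) + x

    x = torch.randn(2, 32, 4, 8, 8)                          # (B, C, T, H, W)
    assert torch.allclose(TinySTWrapper()(x), x)             # identity mapping at init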
907
+ class STAttentionBlock2(nn.Module):
908
+ def __init__(
909
+ self,
910
+ channels,
911
+ num_heads=1,
912
+ num_head_channels=-1,
913
+ use_checkpoint=False, # not used, only used in ResBlock
914
+ use_new_attention_order=False, # QKVAttention or QKVAttentionLegacy
915
+ temporal_length=16, # used in relative positional representation.
916
+ image_length=8, # used for image-video joint training.
917
+ use_relative_position=False, # whether to use relative positional representations in the temporal attention.
918
+ img_video_joint_train=False,
919
+ # norm_type="groupnorm",
920
+ attn_norm_type="group",
921
+ use_tempoal_causal_attn=False,
922
+ ):
923
+ """
924
+ version 1: guided_diffusion implemented version
925
+ version 2: remove args input argument
926
+ """
927
+ super().__init__()
928
+
929
+ if num_head_channels == -1:
930
+ self.num_heads = num_heads
931
+ else:
932
+ assert (
933
+ channels % num_head_channels == 0
934
+ ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
935
+ self.num_heads = channels // num_head_channels
936
+ self.use_checkpoint = use_checkpoint
937
+
938
+ self.temporal_length = temporal_length
939
+ self.image_length = image_length
940
+ self.use_relative_position = use_relative_position
941
+ self.img_video_joint_train = img_video_joint_train
942
+ self.attn_norm_type = attn_norm_type
943
+ assert self.attn_norm_type in ["group", "no_norm"]
944
+ self.use_tempoal_causal_attn = use_tempoal_causal_attn
945
+
946
+ if self.attn_norm_type == "group":
947
+ self.norm_s = normalization(channels)
948
+ self.norm_t = normalization(channels)
949
+
950
+ self.qkv_s = conv_nd(1, channels, channels * 3, 1)
951
+ self.qkv_t = conv_nd(1, channels, channels * 3, 1)
952
+
953
+ if self.img_video_joint_train:
954
+ mask = torch.ones(
955
+ [1, temporal_length + image_length, temporal_length + image_length]
956
+ )
957
+ mask[:, temporal_length:, :] = 0
958
+ mask[:, :, temporal_length:] = 0
959
+ self.register_buffer("mask", mask)
960
+ else:
961
+ self.mask = None
962
+
963
+ if use_new_attention_order:
964
+ # split qkv before split heads
965
+ self.attention_s = QKVAttention(self.num_heads)
966
+ self.attention_t = QKVAttention(self.num_heads)
967
+ else:
968
+ # split heads before split qkv
969
+ self.attention_s = QKVAttentionLegacy(self.num_heads)
970
+ self.attention_t = QKVAttentionLegacy(self.num_heads)
971
+
972
+ if use_relative_position:
973
+ self.relative_position_k = RelativePosition(
974
+ num_units=channels // self.num_heads,
975
+ max_relative_position=temporal_length,
976
+ )
977
+ self.relative_position_v = RelativePosition(
978
+ num_units=channels // self.num_heads,
979
+ max_relative_position=temporal_length,
980
+ )
981
+
982
+ self.proj_out_s = zero_module(
983
+ conv_nd(1, channels, channels, 1)
984
+ ) # conv_dim, in_channels, out_channels, kernel_size
985
+ self.proj_out_t = zero_module(
986
+ conv_nd(1, channels, channels, 1)
987
+ ) # conv_dim, in_channels, out_channels, kernel_size
988
+
989
+ def forward(self, x, mask=None):
990
+ b, c, t, h, w = x.shape
991
+
992
+ # spatial
993
+ out = rearrange(x, "b c t h w -> (b t) c (h w)")
994
+ if self.attn_norm_type == "no_norm":
995
+ qkv = self.qkv_s(out)
996
+ else:
997
+ qkv = self.qkv_s(self.norm_s(out))
998
+ out = self.attention_s(qkv)
999
+ out = self.proj_out_s(out)
1000
+ out = rearrange(out, "(b t) c (h w) -> b c t h w", b=b, h=h)
1001
+ x = x + out # avoid modifying the input tensor in place
1002
+
1003
+ # temporal
1004
+ out = rearrange(x, "b c t h w -> (b h w) c t")
1005
+ if self.attn_norm_type == "no_norm":
1006
+ qkv = self.qkv_t(out)
1007
+ else:
1008
+ qkv = self.qkv_t(self.norm_t(out))
1009
+
1010
+ # relative positional embedding
1011
+ if self.use_relative_position:
1012
+ len_q = qkv.size()[-1]
1013
+ len_k, len_v = len_q, len_q
1014
+ k_rp = self.relative_position_k(len_q, len_k)
1015
+ v_rp = self.relative_position_v(len_q, len_v) # [T,T,head_dim]
1016
+ out = self.attention_t(
1017
+ qkv,
1018
+ rp=(k_rp, v_rp),
1019
+ mask=self.mask,
1020
+ use_tempoal_causal_attn=self.use_tempoal_causal_attn,
1021
+ )
1022
+ else:
1023
+ out = self.attention_t(
1024
+ qkv,
1025
+ rp=None,
1026
+ mask=self.mask,
1027
+ use_tempoal_causal_attn=self.use_tempoal_causal_attn,
1028
+ )
1029
+
1030
+ out = self.proj_out_t(out)
1031
+ out = rearrange(out, "(b h w) c t -> b c t h w", b=b, h=h, w=w)
1032
+
1033
+ return x + out
1034
+
1035
+
1036
+ class QKVAttentionLegacy(nn.Module):
1037
+ """
1038
+ A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping
1039
+ """
1040
+
1041
+ def __init__(self, n_heads):
1042
+ super().__init__()
1043
+ self.n_heads = n_heads
1044
+
1045
+ def forward(self, qkv, rp=None, mask=None):
1046
+ """
1047
+ Apply QKV attention.
1048
+
1049
+ :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
1050
+ :return: an [N x (H * C) x T] tensor after attention.
1051
+ """
1052
+ if rp is not None or mask is not None:
1053
+ raise NotImplementedError
1054
+ bs, width, length = qkv.shape
1055
+ assert width % (3 * self.n_heads) == 0
1056
+ ch = width // (3 * self.n_heads)
1057
+ q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
1058
+ scale = 1 / math.sqrt(math.sqrt(ch))
1059
+ weight = torch.einsum(
1060
+ "bct,bcs->bts", q * scale, k * scale
1061
+ ) # More stable with f16 than dividing afterwards
1062
+ weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
1063
+ a = torch.einsum("bts,bcs->bct", weight, v)
1064
+ return a.reshape(bs, -1, length)
1065
+
1066
+ @staticmethod
1067
+ def count_flops(model, _x, y):
1068
+ return count_flops_attn(model, _x, y)
1069
+
1070
+
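For reference, a hedged sketch of the channel packing QKVAttentionLegacy expects, where heads are split before q/k/v; all sizes are illustrative:

    # Illustrative sketch: the "legacy" layout groups channels per head as
    # [q_h0 | k_h0 | v_h0 | q_h1 | k_h1 | v_h1 | ...] along the channel axis.
    import torch

    n_heads, ch, length, bs = 2, 4, 6, 3
    q = torch.randn(bs, n_heads * ch, length)
    k = torch.randn(bs, n_heads * ch, length)
    v = torch.randn(bs, n_heads * ch, length)

    qkv_legacy = torch.cat(
        [torch.cat([q[:, i*ch:(i+1)*ch], k[:, i*ch:(i+1)*ch], v[:, i*ch:(i+1)*ch]], dim=1)
         for i in range(n_heads)], dim=1)
    # the same unpacking QKVAttentionLegacy.forward performs:
    q2, k2, v2 = qkv_legacy.reshape(bs * n_heads, ch * 3, length).split(ch, dim=1)
    assert torch.equal(q2.reshape(bs, n_heads * ch, length), q)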
1071
+ class QKVAttention(nn.Module):
1072
+ """
1073
+ A module which performs QKV attention and splits in a different order.
1074
+ """
1075
+
1076
+ def __init__(self, n_heads):
1077
+ super().__init__()
1078
+ self.n_heads = n_heads
1079
+
1080
+ def forward(self, qkv, rp=None, mask=None, use_tempoal_causal_attn=False):
1081
+ """
1082
+ Apply QKV attention.
1083
+
1084
+ :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
1085
+ :return: an [N x (H * C) x T] tensor after attention.
1086
+ """
1087
+ bs, width, length = qkv.shape
1088
+ assert width % (3 * self.n_heads) == 0
1089
+ ch = width // (3 * self.n_heads)
1090
+ # print('qkv', qkv.size())
1091
+ qkv=qkv.contiguous()
1092
+ q, k, v = qkv.chunk(3, dim=1)
1093
+ scale = 1 / math.sqrt(math.sqrt(ch))
1094
+ # print('bs, self.n_heads, ch, length', bs, self.n_heads, ch, length)
1095
+
1096
+ weight = torch.einsum(
1097
+ "bct,bcs->bts",
1098
+ (q * scale).view(bs * self.n_heads, ch, length),
1099
+ (k * scale).view(bs * self.n_heads, ch, length),
1100
+ ) # More stable with f16 than dividing afterwards
1101
+ # weight:[b,t,s] b=bs*n_heads*T
1102
+
1103
+ if rp is not None:
1104
+ k_rp, v_rp = rp # [length, length, head_dim] [8, 8, 48]
1105
+ weight2 = torch.einsum(
1106
+ "bct,tsc->bst", (q * scale).view(bs * self.n_heads, ch, length), k_rp
1107
+ )
1108
+ weight += weight2
1109
+
1110
+ if use_tempoal_causal_attn:
1111
+ # weight = torch.tril(weight)
1112
+ assert mask is None, f"Not implemented for merging two masks!"
1113
+ mask = torch.tril(torch.ones_like(weight)) # build the causal mask on the same device/dtype as the attention weights
1114
+ else:
1115
+ if mask is not None: # only keep upper-left matrix
1116
+ # process mask
1117
+ c, t, _ = weight.shape
1118
+
1119
+ if mask.shape[-1] > t:
1120
+ mask = mask[:, :t, :t]
1121
+ elif mask.shape[-1] < t: # pad the mask with zeros (masked out) up to the current temporal length
1122
+ mask_ = torch.zeros([c, t, t]).to(mask.device)
1123
+ t_ = mask.shape[-1]
1124
+ mask_[:, :t_, :t_] = mask
1125
+ mask = mask_
1126
+ else:
1127
+ assert (
1128
+ weight.shape[-1] == mask.shape[-1]
1129
+ ), f"weight={weight.shape}, mask={mask.shape}"
1130
+
1131
+ if mask is not None:
1132
+ INF = -1e8 # float('-inf')
1133
+ weight = weight.float().masked_fill(mask == 0, INF)
1134
+
1135
+ weight = F.softmax(weight.float(), dim=-1).type(
1136
+ weight.dtype
1137
+ ) # [256, 8, 8] [b, t, t] b=bs*n_heads*h*w,t=nframes
1138
+ # weight = F.softmax(weight, dim=-1)#[256, 8, 8] [b, t, t] b=bs*n_heads*h*w,t=nframes
1139
+ a = torch.einsum(
1140
+ "bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length)
1141
+ ) # [256, 48, 8] [b, head_dim, t]
1142
+
1143
+ if rp is not None:
1144
+ a2 = torch.einsum("bts,tsc->btc", weight, v_rp).transpose(1, 2) # btc->bct
1145
+ a += a2
1146
+
1147
+ return a.reshape(bs, -1, length)
1148
+
1149
+
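A quick numeric check (sizes are arbitrary) that the ch**-0.25 scaling applied to both q and k above matches the usual 1/sqrt(ch) attention scaling:

    # Illustrative check: scaling q and k each by 1/ch**0.25 before the dot product equals
    # dividing the logits by sqrt(ch) afterwards, but keeps fp16 intermediates smaller.
    import math
    import torch

    ch, length = 32, 5
    q = torch.randn(4, ch, length)
    k = torch.randn(4, ch, length)
    scale = 1 / math.sqrt(math.sqrt(ch))
    w1 = torch.einsum("bct,bcs->bts", q * scale, k * scale)
    w2 = torch.einsum("bct,bcs->bts", q, k) / math.sqrt(ch)
    assert torch.allclose(w1, w2, atol=1e-5)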
1150
+ def silu(x):
1151
+ # swish
1152
+ return x * torch.sigmoid(x)
1153
+
1154
+
1155
+ class SiLU(nn.Module):
1156
+ def __init__(self):
1157
+ super(SiLU, self).__init__()
1158
+
1159
+ def forward(self, x):
1160
+ return silu(x)
1161
+
1162
+
1163
+ def Normalize(in_channels, norm_type="group"):
1164
+ assert norm_type in ["group", "batch", "layer"]
1165
+ if norm_type == "group":
1166
+ return torch.nn.GroupNorm(
1167
+ num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
1168
+ )
1169
+ elif norm_type == "batch":
1170
+ return torch.nn.SyncBatchNorm(in_channels)
1171
+ elif norm_type == "layer":
1172
+ return nn.LayerNorm(in_channels)
1173
+
1174
+ class SamePadConv3d(nn.Module):
1175
+ def __init__(
1176
+ self,
1177
+ in_channels,
1178
+ out_channels,
1179
+ kernel_size,
1180
+ stride=1,
1181
+ bias=True,
1182
+ padding_type="replicate",
1183
+ ):
1184
+ super().__init__()
1185
+ if isinstance(kernel_size, int):
1186
+ kernel_size = (kernel_size,) * 3
1187
+ if isinstance(stride, int):
1188
+ stride = (stride,) * 3
1189
+
1190
+ # assumes that the input shape is divisible by stride
1191
+ total_pad = tuple([k - s for k, s in zip(kernel_size, stride)])
1192
+ pad_input = []
1193
+ for p in total_pad[::-1]: # reverse since F.pad starts from last dim
1194
+ pad_input.append((p // 2 + p % 2, p // 2))
1195
+ pad_input = sum(pad_input, tuple())
1196
+
1197
+ self.pad_input = pad_input
1198
+ self.padding_type = padding_type
1199
+
1200
+ self.conv = nn.Conv3d(
1201
+ in_channels, out_channels, kernel_size, stride=stride, padding=0, bias=bias
1202
+ )
1203
+
1204
+ def forward(self, x):
1205
+ tp=x.dtype
1206
+ x = x.float()
1207
+
1208
+ # apply the padding
1209
+ x_padded = F.pad(x, self.pad_input, mode=self.padding_type)
1210
+
1211
+ # cast the result back to the original dtype (e.g. BFloat16) if needed
1212
+ x_padded = x_padded.to(tp)
1213
+
1214
+ return self.conv(x_padded)
1215
+
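A short sketch of how the "same" padding amounts computed above are ordered for F.pad, which pads starting from the last dimension; the kernel and stride values are illustrative:

    # Illustrative sketch: for each of (T, H, W) the layer pads kernel - stride elements,
    # putting the extra element on the front, and passes the tuple in reverse (W, H, T) order.
    import torch
    import torch.nn.functional as F

    kernel_size, stride = (3, 3, 3), (1, 1, 1)
    total_pad = tuple(k - s for k, s in zip(kernel_size, stride))   # (2, 2, 2)
    pad_input = []
    for p in total_pad[::-1]:                                       # last dim first
        pad_input.append((p // 2 + p % 2, p // 2))
    pad_input = sum(pad_input, tuple())                             # (1, 1, 1, 1, 1, 1)

    x = torch.randn(1, 4, 8, 16, 16)
    x_padded = F.pad(x, pad_input, mode="replicate")
    assert x_padded.shape == (1, 4, 10, 18, 18)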
1216
+ class TemporalAttention(nn.Module):
1217
+ def __init__(
1218
+ self,
1219
+ channels,
1220
+ num_heads=1,
1221
+ num_head_channels=-1,
1222
+ max_temporal_length=64,
1223
+ ):
1224
+ """
1225
+ a clean multi-head temporal attention
1226
+ """
1227
+ super().__init__()
1228
+
1229
+ if num_head_channels == -1:
1230
+ self.num_heads = num_heads
1231
+ else:
1232
+ assert (
1233
+ channels % num_head_channels == 0
1234
+ ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
1235
+ self.num_heads = channels // num_head_channels
1236
+
1237
+ self.norm = Normalize(channels)
1238
+ self.qkv = zero_module(conv_nd(1, channels, channels * 3, 1))
1239
+ self.attention = QKVAttention(self.num_heads)
1240
+ self.relative_position_k = RelativePosition(
1241
+ num_units=channels // self.num_heads,
1242
+ max_relative_position=max_temporal_length,
1243
+ )
1244
+ self.relative_position_v = RelativePosition(
1245
+ num_units=channels // self.num_heads,
1246
+ max_relative_position=max_temporal_length,
1247
+ )
1248
+ self.proj_out = zero_module(
1249
+ conv_nd(1, channels, channels, 1)
1250
+ ) # conv_dim, in_channels, out_channels, kernel_size
1251
+
1252
+ def forward(self, x, mask=None):
1253
+ b, c, t, h, w = x.shape
1254
+ out = rearrange(x, "b c t h w -> (b h w) c t")
1255
+ # torch.Size([4608, 1152, 2])1
1256
+ # torch.Size([4608, 3456, 2])2
1257
+ # torch.Size([4608, 1152, 2])3
1258
+ # torch.Size([4608, 1152, 2])4
1259
+ #print(out.shape,end='1\n')
1260
+ qkv = self.qkv(self.norm(out))
1261
+ #print(qkv.shape,end='2\n')
1262
+
1263
+ len_q = qkv.size()[-1]
1264
+ len_k, len_v = len_q, len_q
1265
+
1266
+ k_rp = self.relative_position_k(len_q, len_k)
1267
+ v_rp = self.relative_position_v(len_q, len_v) # [T,T,head_dim]
1268
+ out = self.attention(qkv, rp=(k_rp, v_rp))
1269
+ #print(out.shape,end='3\n')
1270
+ out = self.proj_out(out)
1271
+ #print(out.shape,end='4\n')
1272
+ out = rearrange(out, "(b h w) c t -> b c t h w", b=b, h=h, w=w)
1273
+
1274
+ return x + out
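A small round-trip sketch of the temporal token layout used above: every spatial position becomes an independent length-T sequence (sizes are illustrative):

    # Illustrative sketch: (B, C, T, H, W) -> B*H*W sequences of T tokens and back.
    import torch
    from einops import rearrange

    b, c, t, h, w = 2, 16, 8, 6, 6
    x = torch.randn(b, c, t, h, w)
    seq = rearrange(x, "b c t h w -> (b h w) c t")
    assert seq.shape == (b * h * w, c, t)
    x_back = rearrange(seq, "(b h w) c t -> b c t h w", b=b, h=h, w=w)
    assert torch.equal(x_back, x)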
1275
+ class TemporalAttention_lin(nn.Module):
1276
+ def __init__(
1277
+ self,
1278
+ channels,
1279
+ num_heads=8,
1280
+ num_head_channels=-1,
1281
+ max_temporal_length=64,
1282
+ ):
1283
+ """
1284
+ a clean multi-head temporal attention
1285
+ """
1286
+ super().__init__()
1287
+
1288
+ if num_head_channels == -1:
1289
+ self.num_heads = num_heads
1290
+ else:
1291
+ assert (
1292
+ channels % num_head_channels == 0
1293
+ ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
1294
+ self.num_heads = channels // num_head_channels
1295
+
1296
+ self.norm = nn.LayerNorm(channels)
1297
+ #self.norm = Normalize(channels)
1298
+ #self.qkv = zero_module(conv_nd(1, channels, channels * 3, 1))
1299
+ self.qkv = nn.Linear(channels, channels * 3)
1300
+ self.attention = QKVAttention(self.num_heads)
1301
+ self.relative_position_k = RelativePosition(
1302
+ num_units=channels // self.num_heads,
1303
+ max_relative_position=max_temporal_length,
1304
+ )
1305
+ self.relative_position_v = RelativePosition(
1306
+ num_units=channels // self.num_heads,
1307
+ max_relative_position=max_temporal_length,
1308
+ )
1309
+ self.proj_out = nn.Linear(channels, channels)
1310
+
1311
+ def forward(self, x, mask=None):
1312
+ b, c, t, h, w = x.shape
1313
+ out = rearrange(x, "b c t h w -> (b h w) t c")
1314
+ # torch.Size([4608, 1152, 2])1
1315
+ # torch.Size([4608, 3456, 2])2
1316
+ # torch.Size([4608, 1152, 2])3
1317
+ # torch.Size([4608, 1152, 2])4
1318
+ #print(out.shape,end='1\n')
1319
+ qkv = self.qkv(self.norm(out)).transpose(-1, -2)
1320
+ #print(qkv.shape,end='2\n')
1321
+
1322
+ len_q = qkv.size()[-1]
1323
+ len_k, len_v = len_q, len_q
1324
+
1325
+ k_rp = self.relative_position_k(len_q, len_k)
1326
+ v_rp = self.relative_position_v(len_q, len_v) # [T,T,head_dim]
1327
+
1328
+ out = self.attention(qkv, rp=(k_rp, v_rp))
1329
+
1330
+ out = self.proj_out(out.transpose(-1, -2)).transpose(-1, -2)
1331
+
1332
+ #print(out.shape,end='4\n')
1333
+ out = rearrange(out, "(b h w) c t -> b c t h w", b=b, h=h, w=w)
1334
+
1335
+ return x + out
1336
+
1337
+ class AttnBlock3D(nn.Module):
1338
+ def __init__(self, in_channels):
1339
+ super().__init__()
1340
+ self.in_channels = in_channels
1341
+
1342
+ self.norm = Normalize(in_channels)
1343
+ self.q = torch.nn.Conv3d(
1344
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
1345
+ )
1346
+ self.k = torch.nn.Conv3d(
1347
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
1348
+ )
1349
+ self.v = torch.nn.Conv3d(
1350
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
1351
+ )
1352
+ self.proj_out = torch.nn.Conv3d(
1353
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
1354
+ )
1355
+
1356
+ def forward(self, x):
1357
+ h_ = x
1358
+ # self.norm.to(x.device)
1359
+ # self.norm.to(x.dtype)
1360
+ h_ = self.norm(h_)
1361
+ q = self.q(h_)
1362
+ k = self.k(h_)
1363
+ v = self.v(h_)
1364
+
1365
+ b, c, t, h, w = q.shape
1366
+ # q = q.reshape(b,c,h*w) # bcl
1367
+ # q = q.permute(0,2,1) # bcl -> blc l=hw
1368
+ # k = k.reshape(b,c,h*w) # bcl
1369
+ q = rearrange(q, "b c t h w -> (b t) (h w) c") # blc
1370
+ k = rearrange(k, "b c t h w -> (b t) c (h w)") # bcl
1371
+
1372
+ w_ = torch.bmm(q, k) # b,l,l
1373
+ w_ = w_ * (int(c) ** (-0.5))
1374
+ w_ = torch.nn.functional.softmax(w_, dim=2)
1375
+
1376
+ # v = v.reshape(b,c,h*w)
1377
+ v = rearrange(v, "b c t h w -> (b t) c (h w)") # bcl
1378
+
1379
+ # attend to values
1380
+ w_ = w_.permute(0, 2, 1) # bll
1381
+ h_ = torch.bmm(v, w_) # bcl
1382
+
1383
+ # h_ = h_.reshape(b,c,h,w)
1384
+ h_ = rearrange(h_, "(b t) c (h w) -> b c t h w", b=b, h=h)
1385
+
1386
+ h_ = self.proj_out(h_)
1387
+
1388
+ return x + h_
1389
+
1390
+ class MultiHeadAttention3D(nn.Module):
1391
+ def __init__(self, in_channels, num_heads=8):
1392
+ super().__init__()
1393
+ self.in_channels = in_channels
1394
+ self.num_heads = num_heads
1395
+ self.head_dim = in_channels // num_heads
1396
+
1397
+ assert self.head_dim * num_heads == in_channels, "in_channels must be divisible by num_heads"
1398
+
1399
+ self.norm = nn.LayerNorm(in_channels)
1400
+ self.q_linear = nn.Linear(in_channels, in_channels)
1401
+ self.k_linear = nn.Linear(in_channels, in_channels)
1402
+ self.v_linear = nn.Linear(in_channels, in_channels)
1403
+ self.proj_out = nn.Linear(in_channels, in_channels)
1404
+
1405
+ def forward(self, x):
1406
+ b, c, t, h, w = x.shape
1407
+ #print(x.shape)
1408
+ # Normalize and reshape input
1409
+ h_ = rearrange(x, "b c t h w -> (b t) (h w) c")
1410
+ h_ = self.norm(h_)
1411
+
1412
+ # Linear projections
1413
+ q = self.q_linear(h_)
1414
+ k = self.k_linear(h_)
1415
+ v = self.v_linear(h_)
1416
+
1417
+ # Reshape to multi-head
1418
+ q = rearrange(q, "b l (h d) -> b h l d", h=self.num_heads)
1419
+ k = rearrange(k, "b l (h d) -> b h l d", h=self.num_heads)
1420
+ v = rearrange(v, "b l (h d) -> b h l d", h=self.num_heads)
1421
+
1422
+ # Scaled Dot-Product Attention
1423
+ scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
1424
+ attn = F.softmax(scores, dim=-1)
1425
+
1426
+ # Apply attention to values
1427
+ out = torch.matmul(attn, v)
1428
+ out = rearrange(out, "b h l d -> b l (h d)")
1429
+
1430
+ # Project back to original dimension
1431
+ out = self.proj_out(out)
1432
+
1433
+ # Reshape back to original shape
1434
+ out = rearrange(out, "(b t) (h w) c -> b c t h w", b=b, h=h, t=t)
1435
+ #print(out.shape)
1436
+ return x + out
1437
+
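A shape-only sketch (sizes are illustrative) showing that MultiHeadAttention3D attends within each frame only; temporal mixing is left to the temporal attention blocks above:

    # Illustrative sketch: the 5D tensor is split into one independent sequence of H*W
    # spatial tokens per frame, so frames never exchange information in this block.
    import torch
    from einops import rearrange

    b, c, t, h, w = 2, 32, 4, 6, 6
    x = torch.randn(b, c, t, h, w)
    tokens = rearrange(x, "b c t h w -> (b t) (h w) c")
    assert tokens.shape == (b * t, h * w, c)    # one sequence per frame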
1438
 
1439
  class SiglipAE(nn.Module):
1440
  def __init__(self):
 
1466
 
1467
  x=self.encoder(x)
1468
  return x
 
 
 
 
 
 
 
1469
 
 
sae_utils.py DELETED
@@ -1,302 +0,0 @@
1
- import math
2
- import torch
3
- import torch.nn as nn
4
- from transformers.activations import ACT2FN
5
- from .attention_temporal_videoae import *
6
- from einops import rearrange, reduce, repeat
7
-
8
- try:
9
- import xformers
10
- import xformers.ops as xops
11
-
12
- XFORMERS_IS_AVAILBLE = True
13
- except:
14
- XFORMERS_IS_AVAILBLE = False
15
-
16
- def silu(x):
17
- # swish
18
- return x * torch.sigmoid(x)
19
-
20
-
21
- class SiLU(nn.Module):
22
- def __init__(self):
23
- super(SiLU, self).__init__()
24
-
25
- def forward(self, x):
26
- return silu(x)
27
-
28
-
29
- def Normalize(in_channels, norm_type="group"):
30
- assert norm_type in ["group", "batch",'layer']
31
- if norm_type == "group":
32
- return torch.nn.GroupNorm(
33
- num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
34
- )
35
- elif norm_type == "batch":
36
- return torch.nn.SyncBatchNorm(in_channels)
37
- elif norm_type == "layer":
38
- return nn.LayerNorm(in_channels)
39
-
40
- class SamePadConv3d(nn.Module):
41
- def __init__(
42
- self,
43
- in_channels,
44
- out_channels,
45
- kernel_size,
46
- stride=1,
47
- bias=True,
48
- padding_type="replicate",
49
- ):
50
- super().__init__()
51
- if isinstance(kernel_size, int):
52
- kernel_size = (kernel_size,) * 3
53
- if isinstance(stride, int):
54
- stride = (stride,) * 3
55
-
56
- # assumes that the input shape is divisible by stride
57
- total_pad = tuple([k - s for k, s in zip(kernel_size, stride)])
58
- pad_input = []
59
- for p in total_pad[::-1]: # reverse since F.pad starts from last dim
60
- pad_input.append((p // 2 + p % 2, p // 2))
61
- pad_input = sum(pad_input, tuple())
62
-
63
- self.pad_input = pad_input
64
- self.padding_type = padding_type
65
-
66
- self.conv = nn.Conv3d(
67
- in_channels, out_channels, kernel_size, stride=stride, padding=0, bias=bias
68
- )
69
-
70
- def forward(self, x):
71
- tp=x.dtype
72
- x = x.float()
73
-
74
- # apply the padding
75
- x_padded = F.pad(x, self.pad_input, mode=self.padding_type)
76
-
77
- # cast the result back to the original dtype (e.g. BFloat16) if needed
78
- x_padded = x_padded.to(tp)
79
-
80
- return self.conv(x_padded)
81
-
82
- class TemporalAttention(nn.Module):
83
- def __init__(
84
- self,
85
- channels,
86
- num_heads=1,
87
- num_head_channels=-1,
88
- max_temporal_length=64,
89
- ):
90
- """
91
- a clean multi-head temporal attention
92
- """
93
- super().__init__()
94
-
95
- if num_head_channels == -1:
96
- self.num_heads = num_heads
97
- else:
98
- assert (
99
- channels % num_head_channels == 0
100
- ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
101
- self.num_heads = channels // num_head_channels
102
-
103
- self.norm = Normalize(channels)
104
- self.qkv = zero_module(conv_nd(1, channels, channels * 3, 1))
105
- self.attention = QKVAttention(self.num_heads)
106
- self.relative_position_k = RelativePosition(
107
- num_units=channels // self.num_heads,
108
- max_relative_position=max_temporal_length,
109
- )
110
- self.relative_position_v = RelativePosition(
111
- num_units=channels // self.num_heads,
112
- max_relative_position=max_temporal_length,
113
- )
114
- self.proj_out = zero_module(
115
- conv_nd(1, channels, channels, 1)
116
- ) # conv_dim, in_channels, out_channels, kernel_size
117
-
118
- def forward(self, x, mask=None):
119
- b, c, t, h, w = x.shape
120
- out = rearrange(x, "b c t h w -> (b h w) c t")
121
- # torch.Size([4608, 1152, 2])1
122
- # torch.Size([4608, 3456, 2])2
123
- # torch.Size([4608, 1152, 2])3
124
- # torch.Size([4608, 1152, 2])4
125
- #print(out.shape,end='1\n')
126
- qkv = self.qkv(self.norm(out))
127
- #print(qkv.shape,end='2\n')
128
-
129
- len_q = qkv.size()[-1]
130
- len_k, len_v = len_q, len_q
131
-
132
- k_rp = self.relative_position_k(len_q, len_k)
133
- v_rp = self.relative_position_v(len_q, len_v) # [T,T,head_dim]
134
- out = self.attention(qkv, rp=(k_rp, v_rp))
135
- #print(out.shape,end='3\n')
136
- out = self.proj_out(out)
137
- #print(out.shape,end='4\n')
138
- out = rearrange(out, "(b h w) c t -> b c t h w", b=b, h=h, w=w)
139
-
140
- return x + out
141
- class TemporalAttention_lin(nn.Module):
142
- def __init__(
143
- self,
144
- channels,
145
- num_heads=8,
146
- num_head_channels=-1,
147
- max_temporal_length=64,
148
- ):
149
- """
150
- a clean multi-head temporal attention
151
- """
152
- super().__init__()
153
-
154
- if num_head_channels == -1:
155
- self.num_heads = num_heads
156
- else:
157
- assert (
158
- channels % num_head_channels == 0
159
- ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
160
- self.num_heads = channels // num_head_channels
161
-
162
- self.norm = nn.LayerNorm(channels)
163
- #self.norm = Normalize(channels)
164
- #self.qkv = zero_module(conv_nd(1, channels, channels * 3, 1))
165
- self.qkv = nn.Linear(channels, channels * 3)
166
- self.attention = QKVAttention(self.num_heads)
167
- self.relative_position_k = RelativePosition(
168
- num_units=channels // self.num_heads,
169
- max_relative_position=max_temporal_length,
170
- )
171
- self.relative_position_v = RelativePosition(
172
- num_units=channels // self.num_heads,
173
- max_relative_position=max_temporal_length,
174
- )
175
- self.proj_out = nn.Linear(channels, channels)
176
-
177
- def forward(self, x, mask=None):
178
- b, c, t, h, w = x.shape
179
- out = rearrange(x, "b c t h w -> (b h w) t c")
180
- # torch.Size([4608, 1152, 2])1
181
- # torch.Size([4608, 3456, 2])2
182
- # torch.Size([4608, 1152, 2])3
183
- # torch.Size([4608, 1152, 2])4
184
- #print(out.shape,end='1\n')
185
- qkv = self.qkv(self.norm(out)).transpose(-1, -2)
186
- #print(qkv.shape,end='2\n')
187
-
188
- len_q = qkv.size()[-1]
189
- len_k, len_v = len_q, len_q
190
-
191
- k_rp = self.relative_position_k(len_q, len_k)
192
- v_rp = self.relative_position_v(len_q, len_v) # [T,T,head_dim]
193
-
194
- out = self.attention(qkv, rp=(k_rp, v_rp))
195
-
196
- out = self.proj_out(out.transpose(-1, -2)).transpose(-1, -2)
197
-
198
- #print(out.shape,end='4\n')
199
- out = rearrange(out, "(b h w) c t -> b c t h w", b=b, h=h, w=w)
200
-
201
- return x + out
202
-
203
- class AttnBlock3D(nn.Module):
204
- def __init__(self, in_channels):
205
- super().__init__()
206
- self.in_channels = in_channels
207
-
208
- self.norm = Normalize(in_channels)
209
- self.q = torch.nn.Conv3d(
210
- in_channels, in_channels, kernel_size=1, stride=1, padding=0
211
- )
212
- self.k = torch.nn.Conv3d(
213
- in_channels, in_channels, kernel_size=1, stride=1, padding=0
214
- )
215
- self.v = torch.nn.Conv3d(
216
- in_channels, in_channels, kernel_size=1, stride=1, padding=0
217
- )
218
- self.proj_out = torch.nn.Conv3d(
219
- in_channels, in_channels, kernel_size=1, stride=1, padding=0
220
- )
221
-
222
- def forward(self, x):
223
- h_ = x
224
- # self.norm.to(x.device)
225
- # self.norm.to(x.dtype)
226
- h_ = self.norm(h_)
227
- q = self.q(h_)
228
- k = self.k(h_)
229
- v = self.v(h_)
230
-
231
- b, c, t, h, w = q.shape
232
- # q = q.reshape(b,c,h*w) # bcl
233
- # q = q.permute(0,2,1) # bcl -> blc l=hw
234
- # k = k.reshape(b,c,h*w) # bcl
235
- q = rearrange(q, "b c t h w -> (b t) (h w) c") # blc
236
- k = rearrange(k, "b c t h w -> (b t) c (h w)") # bcl
237
-
238
- w_ = torch.bmm(q, k) # b,l,l
239
- w_ = w_ * (int(c) ** (-0.5))
240
- w_ = torch.nn.functional.softmax(w_, dim=2)
241
-
242
- # v = v.reshape(b,c,h*w)
243
- v = rearrange(v, "b c t h w -> (b t) c (h w)") # bcl
244
-
245
- # attend to values
246
- w_ = w_.permute(0, 2, 1) # bll
247
- h_ = torch.bmm(v, w_) # bcl
248
-
249
- # h_ = h_.reshape(b,c,h,w)
250
- h_ = rearrange(h_, "(b t) c (h w) -> b c t h w", b=b, h=h)
251
-
252
- h_ = self.proj_out(h_)
253
-
254
- return x + h_
255
-
256
- class MultiHeadAttention3D(nn.Module):
257
- def __init__(self, in_channels, num_heads=8):
258
- super().__init__()
259
- self.in_channels = in_channels
260
- self.num_heads = num_heads
261
- self.head_dim = in_channels // num_heads
262
-
263
- assert self.head_dim * num_heads == in_channels, "in_channels must be divisible by num_heads"
264
-
265
- self.norm = nn.LayerNorm(in_channels)
266
- self.q_linear = nn.Linear(in_channels, in_channels)
267
- self.k_linear = nn.Linear(in_channels, in_channels)
268
- self.v_linear = nn.Linear(in_channels, in_channels)
269
- self.proj_out = nn.Linear(in_channels, in_channels)
270
-
271
- def forward(self, x):
272
- b, c, t, h, w = x.shape
273
- #print(x.shape)
274
- # Normalize and reshape input
275
- h_ = rearrange(x, "b c t h w -> (b t) (h w) c")
276
- h_ = self.norm(h_)
277
-
278
- # Linear projections
279
- q = self.q_linear(h_)
280
- k = self.k_linear(h_)
281
- v = self.v_linear(h_)
282
-
283
- # Reshape to multi-head
284
- q = rearrange(q, "b l (h d) -> b h l d", h=self.num_heads)
285
- k = rearrange(k, "b l (h d) -> b h l d", h=self.num_heads)
286
- v = rearrange(v, "b l (h d) -> b h l d", h=self.num_heads)
287
-
288
- # Scaled Dot-Product Attention
289
- scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
290
- attn = F.softmax(scores, dim=-1)
291
-
292
- # Apply attention to values
293
- out = torch.matmul(attn, v)
294
- out = rearrange(out, "b h l d -> b l (h d)")
295
-
296
- # Project back to original dimension
297
- out = self.proj_out(out)
298
-
299
- # Reshape back to original shape
300
- out = rearrange(out, "(b t) (h w) c -> b c t h w", b=b, h=h, t=t)
301
- #print(out.shape)
302
- return x + out
 
siglip_encoder.py DELETED
@@ -1,154 +0,0 @@
1
- import torch
2
- import torch.nn.functional as F
3
- from torch import nn
4
- from typing import Optional, Tuple, Union, Dict
5
- from PIL import Image
6
- from functools import partial, reduce
7
- from transformers import SiglipImageProcessor, SiglipVisionConfig, SiglipVisionModel
8
-
9
- from .base_encoder import BaseVisionTower
10
- import torch.distributed as dist
11
- # --data_path /share/shuyan/video_traindata/anno/\{cinepine_order\}.json \
12
- # --image_folder /share/shuyan/video_traindata/Bunny-v1_0-data/finetune/images \
13
- # --video_folder /share/shuyan/video_traindata \
14
- def rank0_print(*args):
15
- if dist.is_initialized():
16
- if dist.get_rank() == 0:
17
- print(f"Rank {dist.get_rank()}: ", *args)
18
- else:
19
- print(*args)
20
-
21
-
22
- from transformers.image_processing_utils import BatchFeature, get_size_dict
23
- from transformers.image_transforms import (
24
- convert_to_rgb,
25
- normalize,
26
- rescale,
27
- resize,
28
- to_channel_dimension_format,
29
- )
30
- from transformers.image_utils import (
31
- ChannelDimension,
32
- PILImageResampling,
33
- to_numpy_array,
34
- )
35
- class SigLipImageProcessor:
36
- def __init__(self, image_mean=(0.5, 0.5, 0.5), image_std=(0.5, 0.5, 0.5), size=(384, 384), crop_size: Dict[str, int] = None, resample=PILImageResampling.BICUBIC, rescale_factor=1 / 255, data_format=ChannelDimension.FIRST):
37
- crop_size = crop_size if crop_size is not None else {"height": 384, "width": 384}
38
- crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
39
-
40
- self.image_mean = image_mean
41
- self.image_std = image_std
42
- self.size = size
43
- self.resample = resample
44
- self.rescale_factor = rescale_factor
45
- self.data_format = data_format
46
- self.crop_size = crop_size
47
-
48
- def preprocess(self, images, return_tensors):
49
- if isinstance(images, Image.Image):
50
- images = [images]
51
- else:
52
- # to adapt video data
53
- images = [to_numpy_array(image) for image in images]
54
- assert isinstance(images, list)
55
-
56
- transforms = [
57
- convert_to_rgb,
58
- to_numpy_array,
59
- partial(resize, size=self.size, resample=self.resample, data_format=self.data_format),
60
- partial(rescale, scale=self.rescale_factor, data_format=self.data_format),
61
- partial(normalize, mean=self.image_mean, std=self.image_std, data_format=self.data_format),
62
- partial(to_channel_dimension_format, channel_dim=self.data_format, input_channel_dim=self.data_format),
63
- ]
64
-
65
- images = reduce(lambda x, f: [*map(f, x)], transforms, images)
66
-
67
- data = {"pixel_values": images}
68
-
69
- return BatchFeature(data=data, tensor_type=return_tensors)
70
-
71
- class SigLipVisionTower(BaseVisionTower):
72
- def __init__(self, vision_tower_name, vision_tower_cfg, delay_load=False):
73
- super(SigLipVisionTower, self).__init__(vision_tower_name, vision_tower_cfg, delay_load)
74
-
75
- # model_path = "google/siglip-so400m-patch14-384"
76
- # base_model_name, res, interp = model_path, 384, 576
77
- # self.vision_tower_name = base_model_name
78
- self.vision_tower_name, res, interp = vision_tower_name, 384, 576
79
- self._image_size = res if res is not None else 512
80
- self.unfreeze_mm_vision_tower = getattr(vision_tower_cfg, "unfreeze_mm_vision_tower", False)
81
-
82
- if not delay_load:
83
- rank0_print(f"Loading vision tower: {vision_tower_name}")
84
- self.load_model()
85
- elif getattr(vision_tower_cfg, "unfreeze_mm_vision_tower", False):
86
- # TODO: better detector is needed.
87
- rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.")
88
- self.load_model()
89
- elif hasattr(vision_tower_cfg, "mm_tunable_parts") and "mm_vision_tower" in vision_tower_cfg.mm_tunable_parts:
90
- rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.")
91
- self.load_model()
92
- else:
93
- self.cfg_only = self.config
94
-
95
- def load_model(self, device_map=None):
96
- self.vision_model = "siglip"
97
- # clip_model, processor = create_model_from_pretrained(self.vision_tower_name)
98
- print(self.vision_tower_name)
99
- self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name)
100
-
101
- # self.vision_tower = clip_model.visual.trunk
102
- self.vision_tower.output_tokens = True
103
-
104
- self._hidden_size = self.vision_tower.config.hidden_size
105
-
106
- self.image_processor = SigLipImageProcessor()
107
-
108
- del self.vision_tower.vision_model.encoder.layers[-1:]
109
- self.vision_tower.vision_model.head = nn.Identity()
110
-
111
- self.vision_tower.requires_grad_(self.unfreeze_mm_vision_tower)
112
-
113
- self.is_loaded = True
114
-
115
- def _forward(self, images):
116
- with torch.set_grad_enabled(self.unfreeze_mm_vision_tower):
117
- image_features = self.vision_tower.forward(
118
- images.to(device=self.device, dtype=self.dtype),
119
- output_hidden_states=True,
120
- ).hidden_states[-1]
121
- return image_features
122
- @property
123
- def dummy_feature(self):
124
- return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
125
-
126
- @property
127
- def dtype(self):
128
- for p in self.vision_tower.parameters():
129
- return p.dtype
130
-
131
- @property
132
- def device(self):
133
- for p in self.vision_tower.parameters():
134
- return p.device
135
-
136
- @property
137
- def hidden_size(self):
138
- return self.config.hidden_size
139
-
140
- @property
141
- def num_patches(self):
142
- return (336 // 14) ** 2
143
-
144
- @property
145
- def num_patches_per_side(self):
146
- #return self.config.image_size // self.config.patch_size
147
- return 336//14
148
- #return 27
149
- # return self.model_config["vision_cfg"]["image_size"] // self.model_config["vision_cfg"]["patch_size"]
150
-
151
- @property
152
- def image_size(self):
153
- return 384
154
- #return self.config.image_size
 
utils_encoder.py DELETED
@@ -1,296 +0,0 @@
1
- import importlib
2
- import numpy as np
3
- import cv2, os
4
- import torch
5
- import torch.distributed as dist
6
-
7
-
8
- def count_params(model, verbose=False):
9
- total_params = sum(p.numel() for p in model.parameters())
10
- if verbose:
11
- print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
12
- return total_params
13
-
14
-
15
- def check_istarget(name, para_list):
16
- """
17
- name: full name of source para
18
- para_list: partial name of target para
19
- """
20
- istarget = False
21
- for para in para_list:
22
- if para in name:
23
- return True
24
- return istarget
25
-
26
-
27
- def instantiate_from_config(config):
28
- if not "target" in config:
29
- if config == "__is_first_stage__":
30
- return None
31
- elif config == "__is_unconditional__":
32
- return None
33
- raise KeyError("Expected key `target` to instantiate.")
34
-
35
- return get_obj_from_str(config["target"])(**config.get("params", dict()))
36
-
37
-
38
- def get_obj_from_str(string, reload=False):
39
- module, cls = string.rsplit(".", 1)
40
- if reload:
41
- module_imp = importlib.import_module(module)
42
- importlib.reload(module_imp)
43
- return getattr(importlib.import_module(module, package=None), cls)
44
-
45
-
46
- def load_npz_from_dir(data_dir):
47
- data = [
48
- np.load(os.path.join(data_dir, data_name))["arr_0"]
49
- for data_name in os.listdir(data_dir)
50
- ]
51
- data = np.concatenate(data, axis=0)
52
- return data
53
-
54
-
55
- def load_npz_from_paths(data_paths):
56
- data = [np.load(data_path)["arr_0"] for data_path in data_paths]
57
- data = np.concatenate(data, axis=0)
58
- return data
59
-
60
-
61
- def resize_numpy_image(image, max_resolution=512 * 512, resize_short_edge=None):
62
- h, w = image.shape[:2]
63
- if resize_short_edge is not None:
64
- k = resize_short_edge / min(h, w)
65
- else:
66
- k = max_resolution / (h * w)
67
- k = k**0.5
68
- h = int(np.round(h * k / 64)) * 64
69
- w = int(np.round(w * k / 64)) * 64
70
- image = cv2.resize(image, (w, h), interpolation=cv2.INTER_LANCZOS4)
71
- return image
72
-
73
-
74
- def setup_dist(args):
75
- if dist.is_initialized():
76
- return
77
- torch.cuda.set_device(args.local_rank)
78
- torch.distributed.init_process_group("nccl", init_method="env://")
79
-
80
-
81
- # adopted from
82
- # https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
83
- # and
84
- # https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
85
- # and
86
- # https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
87
- #
88
- # thanks!
89
-
90
- import torch.nn as nn
91
- import math
92
- from inspect import isfunction
93
- import torch
94
- from torch import nn
95
- import torch.distributed as dist
96
-
97
-
98
- def gather_data(data, return_np=True):
99
- """gather data from multiple processes to one list"""
100
- data_list = [torch.zeros_like(data) for _ in range(dist.get_world_size())]
101
- dist.all_gather(data_list, data) # gather not supported with NCCL
102
- if return_np:
103
- data_list = [data.cpu().numpy() for data in data_list]
104
- return data_list
105
-
106
-
107
- def autocast(f):
108
- def do_autocast(*args, **kwargs):
109
- with torch.cuda.amp.autocast(
110
- enabled=True,
111
- dtype=torch.get_autocast_gpu_dtype(),
112
- cache_enabled=torch.is_autocast_cache_enabled(),
113
- ):
114
- return f(*args, **kwargs)
115
-
116
- return do_autocast
117
-
118
-
119
- def extract_into_tensor(a, t, x_shape):
120
- b, *_ = t.shape
121
- out = a.gather(-1, t)
122
- return out.reshape(b, *((1,) * (len(x_shape) - 1)))
123
-
124
-
125
- def noise_like(shape, device, repeat=False):
126
- repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(
127
- shape[0], *((1,) * (len(shape) - 1))
128
- )
129
- noise = lambda: torch.randn(shape, device=device)
130
- return repeat_noise() if repeat else noise()
131
-
132
-
133
- def default(val, d):
134
- if exists(val):
135
- return val
136
- return d() if isfunction(d) else d
137
-
138
-
139
- def exists(val):
140
- return val is not None
141
-
142
-
143
- def identity(*args, **kwargs):
144
- return nn.Identity()
145
-
146
-
147
- def uniq(arr):
148
- return {el: True for el in arr}.keys()
149
-
150
-
151
- def mean_flat(tensor):
152
- """
153
- Take the mean over all non-batch dimensions.
154
- """
155
- return tensor.mean(dim=list(range(1, len(tensor.shape))))
156
-
157
-
158
- def ismap(x):
159
- if not isinstance(x, torch.Tensor):
160
- return False
161
- return (len(x.shape) == 4) and (x.shape[1] > 3)
162
-
163
-
164
- def isimage(x):
165
- if not isinstance(x, torch.Tensor):
166
- return False
167
- return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1)
168
-
169
-
170
- def max_neg_value(t):
171
- return -torch.finfo(t.dtype).max
172
-
173
-
174
- def shape_to_str(x):
175
- shape_str = "x".join([str(x) for x in x.shape])
176
- return shape_str
177
-
178
-
179
- def init_(tensor):
180
- dim = tensor.shape[-1]
181
- std = 1 / math.sqrt(dim)
182
- tensor.uniform_(-std, std)
183
- return tensor
184
-
185
-
186
- # ckpt = torch.utils.checkpoint.checkpoint
187
-
188
-
189
- # def checkpoint(func, inputs, params, flag):
190
- # """
191
- # Evaluate a function without caching intermediate activations, allowing for
192
- # reduced memory at the expense of extra compute in the backward pass.
193
- # :param func: the function to evaluate.
194
- # :param inputs: the argument sequence to pass to `func`.
195
- # :param params: a sequence of parameters `func` depends on but does not
196
- # explicitly take as arguments.
197
- # :param flag: if False, disable gradient checkpointing.
198
- # """
199
- # if flag:
200
- # return ckpt(func, *inputs)
201
- # else:
202
- # return func(*inputs)
203
-
204
-
205
- def disabled_train(self, mode=True):
206
- """Overwrite model.train with this function to make sure train/eval mode
207
- does not change anymore."""
208
- return self
209
-
210
-
211
- def zero_module(module):
212
- """
213
- Zero out the parameters of a module and return it.
214
- """
215
- for p in module.parameters():
216
- p.detach().zero_()
217
- return module
218
-
219
-
220
- def scale_module(module, scale):
221
- """
222
- Scale the parameters of a module and return it.
223
- """
224
- for p in module.parameters():
225
- p.detach().mul_(scale)
226
- return module
227
-
228
-
229
- def conv_nd(dims, *args, **kwargs):
230
- """
231
- Create a 1D, 2D, or 3D convolution module.
232
- """
233
- if dims == 1:
234
- return nn.Conv1d(*args, **kwargs)
235
- elif dims == 2:
236
- return nn.Conv2d(*args, **kwargs)
237
- elif dims == 3:
238
- return nn.Conv3d(*args, **kwargs)
239
- raise ValueError(f"unsupported dimensions: {dims}")
240
-
241
-
242
- def linear(*args, **kwargs):
243
- """
244
- Create a linear module.
245
- """
246
- return nn.Linear(*args, **kwargs)
247
-
248
-
249
- def avg_pool_nd(dims, *args, **kwargs):
250
- """
251
- Create a 1D, 2D, or 3D average pooling module.
252
- """
253
- if dims == 1:
254
- return nn.AvgPool1d(*args, **kwargs)
255
- elif dims == 2:
256
- return nn.AvgPool2d(*args, **kwargs)
257
- elif dims == 3:
258
- return nn.AvgPool3d(*args, **kwargs)
259
- raise ValueError(f"unsupported dimensions: {dims}")
260
-
261
-
262
- def nonlinearity(type="silu"):
263
- if type == "silu":
264
- return nn.SiLU()
265
- elif type == "leaky_relu":
266
- return nn.LeakyReLU()
267
-
268
-
269
- class GroupNormSpecific(nn.GroupNorm):
270
- def forward(self, x):
271
- if x.dtype == torch.float16 or x.dtype == torch.bfloat16:
272
- return super().forward(x).type(x.dtype)
273
- else:
274
- return super().forward(x.float()).type(x.dtype)
275
-
276
-
277
- def normalization(channels, num_groups=32):
278
- """
279
- Make a standard normalization layer.
280
- :param channels: number of input channels.
281
- :return: an nn.Module for normalization.
282
- """
283
- return GroupNormSpecific(num_groups, channels)
284
-
285
-
286
- class HybridConditioner(nn.Module):
287
-
288
- def __init__(self, c_concat_config, c_crossattn_config):
289
- super().__init__()
290
- self.concat_conditioner = instantiate_from_config(c_concat_config)
291
- self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)
292
-
293
- def forward(self, c_concat, c_crossattn):
294
- c_concat = self.concat_conditioner(c_concat)
295
- c_crossattn = self.crossattn_conditioner(c_crossattn)
296
- return {"c_concat": [c_concat], "c_crossattn": [c_crossattn]}
 
multimodal_projector/builder.py → vision_projector_builder.py RENAMED
@@ -1,8 +1,36 @@
1
  import torch
2
  import torch.nn as nn
3
  import re
 
4
 
5
- from .pooler_projector import PoolerProjector
6
 
7
 
8
  class IdentityMap(nn.Module):
 
1
  import torch
2
  import torch.nn as nn
3
  import re
4
+ import math
5
+ from transformers.models.clip.modeling_clip import CLIPVisionModel
6
+
7
+
8
+ class PoolerProjector(nn.Module):
9
+ def __init__(self, config, vision_cfg):
10
+ super().__init__()
11
+ self._config = config
12
+ self.hw = vision_cfg.image_size // vision_cfg.patch_size
13
+
14
+ self.conv_pool = nn.Conv2d(config.mm_hidden_size, config.hidden_size, kernel_size=2, stride=2)
15
+
16
+ self.proj = nn.Sequential(
17
+ nn.GELU(),
18
+ nn.Linear(config.hidden_size, config.hidden_size),
19
+ )
20
+
21
+ def forward(self, x, *args, **kwargs):
22
+ height = width = self.hw
23
+ assert height * width == x.shape[1]
24
+ x = x.view(x.shape[0], height, width, -1).permute(0, 3, 1, 2)
25
+ x = self.conv_pool(x)
26
+ x = x.flatten(2).transpose(1, 2)
27
+ x = self.proj(x)
28
+ return x
29
+
30
+ @property
31
+ def config(self):
32
+ return {"mm_projector_type": "pooler"}
33
 
 
34
 
35
 
36
  class IdentityMap(nn.Module):
multimodal_resampler/spatial_pool.py → vision_resampler_builder.py RENAMED
@@ -43,3 +43,26 @@ class SpatialPool(nn.Module):
43
  @property
44
  def hidden_size(self):
45
  return self.out_channels
 
43
  @property
44
  def hidden_size(self):
45
  return self.out_channels
46
+
47
+
48
+
49
+ class IdentityMap(torch.nn.Module):
50
+ def __init__(self):
51
+ super().__init__()
52
+
53
+ def forward(self, x, *args, **kwargs):
54
+ return x
55
+
56
+ @property
57
+ def config(self):
58
+ return {"mm_resampler_type": None}
59
+
60
+
61
+ def build_vision_resampler(model_args, delay_load=False, **kwargs):
62
+ resampler_type = getattr(model_args, "mm_resampler_type", None)
63
+ if resampler_type == "spatial_pool":
64
+ return SpatialPool(model_args, **kwargs)
65
+ elif resampler_type is None:
66
+ return IdentityMap()
67
+
68
+ raise ValueError(f"Unknown resampler type: {resampler_type}")
multimodal_encoder/siglip_encoder.py → vision_tower_builder.py RENAMED
@@ -1,24 +1,13 @@
1
- import torch
2
- import torch.nn.functional as F
3
- from torch import nn
4
  from typing import Optional, Tuple, Union, Dict
5
  from PIL import Image
6
  from functools import partial, reduce
7
  from transformers import SiglipImageProcessor, SiglipVisionConfig, SiglipVisionModel
8
-
9
- from .base_encoder import BaseVisionTower
10
  import torch.distributed as dist
11
- # --data_path /share/shuyan/video_traindata/anno/\{cinepine_order\}.json \
12
- # --image_folder /share/shuyan/video_traindata/Bunny-v1_0-data/finetune/images \
13
- # --video_folder /share/shuyan/video_traindata \
14
- def rank0_print(*args):
15
- if dist.is_initialized():
16
- if dist.get_rank() == 0:
17
- print(f"Rank {dist.get_rank()}: ", *args)
18
- else:
19
- print(*args)
20
-
21
-
22
  from transformers.image_processing_utils import BatchFeature, get_size_dict
23
  from transformers.image_transforms import (
24
  convert_to_rgb,
@@ -32,6 +21,78 @@ from transformers.image_utils import (
32
  PILImageResampling,
33
  to_numpy_array,
34
  )
 
35
  class SigLipImageProcessor:
36
  def __init__(self, image_mean=(0.5, 0.5, 0.5), image_std=(0.5, 0.5, 0.5), size=(384, 384), crop_size: Dict[str, int] = None, resample=PILImageResampling.BICUBIC, rescale_factor=1 / 255, data_format=ChannelDimension.FIRST):
37
  crop_size = crop_size if crop_size is not None else {"height": 384, "width": 384}
@@ -151,4 +212,18 @@ class SigLipVisionTower(BaseVisionTower):
151
  @property
152
  def image_size(self):
153
  return 384
154
- #return self.config.image_size
 
1
+ import os
 
 
2
  from typing import Optional, Tuple, Union, Dict
3
  from PIL import Image
4
  from functools import partial, reduce
5
  from transformers import SiglipImageProcessor, SiglipVisionConfig, SiglipVisionModel
 
 
6
  import torch.distributed as dist
7
+ from abc import ABC, abstractmethod
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
 
 
 
 
 
 
 
11
  from transformers.image_processing_utils import BatchFeature, get_size_dict
12
  from transformers.image_transforms import (
13
  convert_to_rgb,
 
21
  PILImageResampling,
22
  to_numpy_array,
23
  )
24
+
25
+ def rank0_print(*args):
26
+ if dist.is_initialized():
27
+ if dist.get_rank() == 0:
28
+ print(f"Rank {dist.get_rank()}: ", *args)
29
+ else:
30
+ print(*args)
31
+
32
+
33
+ class BaseVisionTower(nn.Module):
34
+ def __init__(self, vision_tower_name, vision_tower_cfg, delay_load=False):
35
+ super().__init__()
36
+
37
+ self.is_loaded = False
38
+
39
+ self.vision_tower_name = vision_tower_name
40
+ self.delay_load = delay_load
41
+
42
+ @abstractmethod
43
+ def load_model(self, device_map=None):
44
+ raise NotImplementedError("Subclasses must implement load_model")
45
+
46
+ @abstractmethod
47
+ def _forward(self, images):
48
+ raise NotImplementedError("Subclasses must implement forward")
49
+
50
+ def forward(self, images):
51
+ if type(images) is list:
52
+ image_features = [self._forward(image.unsqueeze(0)) for image in images]
53
+ else:
54
+ image_features = self._forward(images)
55
+
56
+ return image_features
57
+
58
+ @property
59
+ def dummy_feature(self):
60
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
61
+
62
+ @property
63
+ def dtype(self):
64
+ # Dynamically infer the dtype from the first parameter, if not explicitly specified
65
+ if hasattr(self.vision_tower, "dtype"):
66
+ return self.vision_tower.dtype
67
+ else:
68
+ params = list(self.vision_tower.parameters())
69
+ return (
70
+ params[0].dtype if len(params) > 0 else torch.float32
71
+ ) # Default to torch.float32 if no parameters
72
+
73
+ @property
74
+ def device(self):
75
+ # Dynamically infer the device from the first parameter, if not explicitly specified
76
+ if hasattr(self.vision_tower, "device"):
77
+ return self.vision_tower.device
78
+ else:
79
+ params = list(self.vision_tower.parameters())
80
+ return (
81
+ params[0].device if len(params) > 0 else torch.device("cpu")
82
+ ) # Default to CPU if no parameters
83
+ @property
84
+ def config(self):
85
+ if self.is_loaded:
86
+ return self.vision_tower.config
87
+ else:
88
+ return self.cfg_only
89
+ @property
90
+ def hidden_size(self):
91
+ try:
92
+ return self.config.hidden_size
93
+ except:
94
+ return self._hidden_size
95
+
96
  class SigLipImageProcessor:
97
  def __init__(self, image_mean=(0.5, 0.5, 0.5), image_std=(0.5, 0.5, 0.5), size=(384, 384), crop_size: Dict[str, int] = None, resample=PILImageResampling.BICUBIC, rescale_factor=1 / 255, data_format=ChannelDimension.FIRST):
98
  crop_size = crop_size if crop_size is not None else {"height": 384, "width": 384}
 
212
  @property
213
  def image_size(self):
214
  return 384
215
+
216
+ def build_vision_tower(vision_tower_cfg, **kwargs):
217
+
218
+ vision_tower = getattr(vision_tower_cfg, "mm_vision_tower", getattr(vision_tower_cfg, "vision_tower", None))
219
+ is_absolute_path_exists = os.path.exists(vision_tower)
220
+ use_s2 = getattr(vision_tower_cfg, "s2", False)
221
+
222
+ # NOTE: only the SigLip tower is wired up in this repo, so the builder always
+ # returns a SigLipVisionTower for the configured checkpoint; other tower types
+ # would need to be added here before a fallback error.
+ return SigLipVisionTower(vision_tower, vision_tower_cfg=vision_tower_cfg, **kwargs)