BAAI /

3v324v23 committed on
Commit a5ce455 · 1 Parent(s): 8e7f18b
Files changed (12)
  1. attention_temporal_videoae.py +1314 -0
  2. base_encoder.py +68 -0
  3. builder.py +17 -0
  4. llava_arch.py +76 -52
  5. llava_qwen.py +44 -24
  6. mm_utils.py +18 -14
  7. modeling_qwen2.py +4 -1
  8. sae.py +45 -0
  9. sae_utils.py +302 -0
  10. siglip_encoder.py +154 -0
  11. utils.py +166 -0
  12. utils_encoder.py +296 -0
attention_temporal_videoae.py ADDED
@@ -0,0 +1,1314 @@
1
+ from inspect import isfunction
2
+ import math
3
+ import torch
4
+ import torch as th
5
+ import torch.nn.functional as F
6
+ from torch import nn, einsum
7
+ from einops import rearrange, repeat
8
+ from typing import Optional, Any
9
+
10
+ try:
11
+ import xformers
12
+ import xformers.ops
13
+
14
+ XFORMERS_IS_AVAILBLE = True
15
+ except:
16
+ XFORMERS_IS_AVAILBLE = False
17
+
18
+ from .utils_encoder import (
19
+ conv_nd,
20
+ zero_module,
21
+ normalization,
22
+ )
23
+
24
+
25
+ def exists(val):
26
+ return val is not None
27
+
28
+
29
+ def uniq(arr):
30
+ return {el: True for el in arr}.keys()
31
+
32
+
33
+ def default(val, d):
34
+ if exists(val):
35
+ return val
36
+ return d() if isfunction(d) else d
37
+
38
+
39
+ def max_neg_value(t):
40
+ return -torch.finfo(t.dtype).max
41
+
42
+
43
+ def init_(tensor):
44
+ dim = tensor.shape[-1]
45
+ std = 1 / math.sqrt(dim)
46
+ tensor.uniform_(-std, std)
47
+ return tensor
48
+
49
+
50
+ # feedforward
51
+ class GEGLU(nn.Module):
52
+ def __init__(self, dim_in, dim_out):
53
+ super().__init__()
54
+ self.proj = nn.Linear(dim_in, dim_out * 2)
55
+
56
+ def forward(self, x):
57
+ x, gate = self.proj(x).chunk(2, dim=-1)
58
+ return x * F.gelu(gate)
59
+
60
+
61
+ class FeedForward(nn.Module):
62
+ def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
63
+ super().__init__()
64
+ inner_dim = int(dim * mult)
65
+ dim_out = default(dim_out, dim)
66
+ project_in = (
67
+ nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU())
68
+ if not glu
69
+ else GEGLU(dim, inner_dim)
70
+ )
71
+
72
+ self.net = nn.Sequential(
73
+ project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
74
+ )
75
+
76
+ def forward(self, x):
77
+ return self.net(x)
78
+
79
+
80
+ def zero_module(module):
81
+ """
82
+ Zero out the parameters of a module and return it.
83
+ """
84
+ for p in module.parameters():
85
+ p.detach().zero_()
86
+ return module
87
+
88
+
89
+ def Normalize(in_channels, num_groups=32):
90
+ return torch.nn.GroupNorm(
91
+ num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True
92
+ )
93
+
94
+
95
+ # ---------------------------------------------------------------------------------------------------
96
+ class RelativePosition(nn.Module):
97
+ """https://github.com/evelinehong/Transformer_Relative_Position_PyTorch/blob/master/relative_position.py"""
98
+
99
+ def __init__(self, num_units, max_relative_position):
100
+ super().__init__()
101
+ self.num_units = num_units
102
+ self.max_relative_position = max_relative_position
103
+ self.embeddings_table = nn.Parameter(
104
+ th.Tensor(max_relative_position * 2 + 1, num_units)
105
+ )
106
+ nn.init.xavier_uniform_(self.embeddings_table)
107
+
108
+ def forward(self, length_q, length_k):
109
+ device = self.embeddings_table.device
110
+ range_vec_q = th.arange(length_q, device=device)
111
+ range_vec_k = th.arange(length_k, device=device)
112
+ distance_mat = range_vec_k[None, :] - range_vec_q[:, None]
113
+ distance_mat_clipped = th.clamp(
114
+ distance_mat, -self.max_relative_position, self.max_relative_position
115
+ )
116
+ final_mat = distance_mat_clipped + self.max_relative_position
117
+ # final_mat = th.LongTensor(final_mat).to(self.embeddings_table.device)
118
+ # final_mat = th.tensor(final_mat, device=self.embeddings_table.device, dtype=torch.long)
119
+ final_mat = final_mat.long()
120
+ embeddings = self.embeddings_table[final_mat]
121
+ return embeddings
122
+
123
+
124
+ class TemporalCrossAttention(nn.Module):
125
+ def __init__(
126
+ self,
127
+ query_dim,
128
+ context_dim=None,
129
+ heads=8,
130
+ dim_head=64,
131
+ dropout=0.0,
132
+ temporal_length=None, # For relative positional representation and image-video joint training.
133
+ image_length=None, # For image-video joint training.
134
+ use_relative_position=False, # whether to use relative positional representation in temporal attention.
135
+ img_video_joint_train=False, # For image-video joint training.
136
+ use_tempoal_causal_attn=False,
137
+ bidirectional_causal_attn=False,
138
+ tempoal_attn_type=None,
139
+ joint_train_mode="same_batch",
140
+ **kwargs,
141
+ ):
142
+ super().__init__()
143
+ inner_dim = dim_head * heads
144
+ context_dim = default(context_dim, query_dim)
145
+ self.context_dim = context_dim
146
+
147
+ self.scale = dim_head**-0.5
148
+ self.heads = heads
149
+ self.temporal_length = temporal_length
150
+ self.use_relative_position = use_relative_position
151
+ self.img_video_joint_train = img_video_joint_train
152
+ self.bidirectional_causal_attn = bidirectional_causal_attn
153
+ self.joint_train_mode = joint_train_mode
154
+ assert joint_train_mode in ["same_batch", "diff_batch"]
155
+ self.tempoal_attn_type = tempoal_attn_type
156
+
157
+ if bidirectional_causal_attn:
158
+ assert use_tempoal_causal_attn
159
+ if tempoal_attn_type:
160
+ assert tempoal_attn_type in ["sparse_causal", "sparse_causal_first"]
161
+ assert not use_tempoal_causal_attn
162
+ assert not (
163
+ img_video_joint_train and (self.joint_train_mode == "same_batch")
164
+ )
165
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
166
+ self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
167
+ self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
168
+
169
+ assert not (
170
+ img_video_joint_train
171
+ and (self.joint_train_mode == "same_batch")
172
+ and use_tempoal_causal_attn
173
+ )
174
+ if img_video_joint_train:
175
+ if self.joint_train_mode == "same_batch":
176
+ mask = torch.ones(
177
+ [1, temporal_length + image_length, temporal_length + image_length]
178
+ )
179
+ # mask[:, image_length:, :] = 0
180
+ # mask[:, :, image_length:] = 0
181
+ mask[:, temporal_length:, :] = 0
182
+ mask[:, :, temporal_length:] = 0
183
+ self.mask = mask
184
+ else:
185
+ self.mask = None
186
+ elif use_tempoal_causal_attn:
187
+ # normal causal attn
188
+ self.mask = torch.tril(torch.ones([1, temporal_length, temporal_length]))
189
+ elif tempoal_attn_type == "sparse_causal":
190
+ # all frames interact with only the `prev` & self frame
191
+ mask1 = torch.tril(
192
+ torch.ones([1, temporal_length, temporal_length])
193
+ ).bool() # true indicates keeping
194
+ mask2 = torch.zeros(
195
+ [1, temporal_length, temporal_length]
196
+ ) # initialize to same shape with mask1
197
+ mask2[:, 2:temporal_length, : temporal_length - 2] = torch.tril(
198
+ torch.ones([1, temporal_length - 2, temporal_length - 2])
199
+ )
200
+ mask2 = (1 - mask2).bool() # false indicates masking
201
+ self.mask = mask1 & mask2
202
+ elif tempoal_attn_type == "sparse_causal_first":
203
+ # all frames interact with only the `first` & self frame
204
+ mask1 = torch.tril(
205
+ torch.ones([1, temporal_length, temporal_length])
206
+ ).bool() # true indicates keeping
207
+ mask2 = torch.zeros([1, temporal_length, temporal_length])
208
+ mask2[:, 2:temporal_length, 1 : temporal_length - 1] = torch.tril(
209
+ torch.ones([1, temporal_length - 2, temporal_length - 2])
210
+ )
211
+ mask2 = (1 - mask2).bool() # false indicates masking
212
+ self.mask = mask1 & mask2
213
+ else:
214
+ self.mask = None
215
+
216
+ if use_relative_position:
217
+ assert temporal_length is not None
218
+ self.relative_position_k = RelativePosition(
219
+ num_units=dim_head, max_relative_position=temporal_length
220
+ )
221
+ self.relative_position_v = RelativePosition(
222
+ num_units=dim_head, max_relative_position=temporal_length
223
+ )
224
+
225
+ self.to_out = nn.Sequential(
226
+ nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
227
+ )
228
+
229
+ nn.init.constant_(self.to_q.weight, 0)
230
+ nn.init.constant_(self.to_k.weight, 0)
231
+ nn.init.constant_(self.to_v.weight, 0)
232
+ nn.init.constant_(self.to_out[0].weight, 0)
233
+ nn.init.constant_(self.to_out[0].bias, 0)
234
+
235
+ def forward(self, x, context=None, mask=None):
236
+ # if context is None:
237
+ # print(f'[Temp Attn] x={x.shape},context=None')
238
+ # else:
239
+ # print(f'[Temp Attn] x={x.shape},context={context.shape}')
240
+
241
+ nh = self.heads
242
+ out = x
243
+ q = self.to_q(out)
244
+ # if context is not None:
245
+ # print(f'temporal context 1 ={context.shape}')
246
+ # print(f'x={x.shape}')
247
+ context = default(context, x)
248
+ # print(f'temporal context 2 ={context.shape}')
249
+ k = self.to_k(context)
250
+ v = self.to_v(context)
251
+ # print(f'q ={q.shape},k={k.shape}')
252
+
253
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=nh), (q, k, v))
254
+ sim = einsum("b i d, b j d -> b i j", q, k) * self.scale
255
+
256
+ if self.use_relative_position:
257
+ len_q, len_k, len_v = q.shape[1], k.shape[1], v.shape[1]
258
+ k2 = self.relative_position_k(len_q, len_k)
259
+ sim2 = einsum("b t d, t s d -> b t s", q, k2) * self.scale # TODO check
260
+ sim += sim2
261
+ # print('mask',mask)
262
+ if exists(self.mask):
263
+ if mask is None:
264
+ mask = self.mask.to(sim.device)
265
+ else:
266
+ mask = self.mask.to(sim.device).bool() & mask # .to(sim.device)
267
+ else:
268
+ mask = mask
269
+ # if self.img_video_joint_train:
270
+ # # process mask (make mask same shape with sim)
271
+ # c, h, w = mask.shape
272
+ # c, t, s = sim.shape
273
+ # # assert(h == w and t == s),f"mask={mask.shape}, sim={sim.shape}, h={h}, w={w}, t={t}, s={s}"
274
+
275
+ # if h > t:
276
+ # mask = mask[:, :t, :]
277
+ # elif h < t: # pad zeros to mask (no attention) only initial mask =1 area compute weights
278
+ # mask_ = torch.zeros([c,t,w]).to(mask.device)
279
+ # mask_[:, :h, :] = mask
280
+ # mask = mask_
281
+ # c, h, w = mask.shape
282
+ # if w > s:
283
+ # mask = mask[:, :, :s]
284
+ # elif w < s: # pad zeros to mask
285
+ # mask_ = torch.zeros([c,h,s]).to(mask.device)
286
+ # mask_[:, :, :w] = mask
287
+ # mask = mask_
288
+
289
+ # max_neg_value = -torch.finfo(sim.dtype).max
290
+ # sim = sim.float().masked_fill(mask == 0, max_neg_value)
291
+ if mask is not None:
292
+ max_neg_value = -1e9
293
+ sim = sim + (1 - mask.float()) * max_neg_value # in mask: 1 = keep, 0 = masked out
294
+ # print('sim after masking: ', sim)
295
+
296
+ # if torch.isnan(sim).any() or torch.isinf(sim).any() or (not sim.any()):
297
+ # print(f'sim [after masking], isnan={torch.isnan(sim).any()}, isinf={torch.isinf(sim).any()}, allzero={not sim.any()}')
298
+
299
+ attn = sim.softmax(dim=-1)
300
+ # print('attn after softmax: ', attn)
301
+ # if torch.isnan(attn).any() or torch.isinf(attn).any() or (not attn.any()):
302
+ # print(f'attn [after softmax], isnan={torch.isnan(attn).any()}, isinf={torch.isinf(attn).any()}, allzero={not attn.any()}')
303
+
304
+ # attn = torch.where(torch.isnan(attn), torch.full_like(attn,0), attn)
305
+ # if torch.isinf(attn.detach()).any():
306
+ # import pdb;pdb.set_trace()
307
+ # if torch.isnan(attn.detach()).any():
308
+ # import pdb;pdb.set_trace()
309
+ out = einsum("b i j, b j d -> b i d", attn, v)
310
+
311
+ if self.bidirectional_causal_attn:
312
+ mask_reverse = torch.triu(
313
+ torch.ones(
314
+ [1, self.temporal_length, self.temporal_length], device=sim.device
315
+ )
316
+ )
317
+ sim_reverse = sim.float().masked_fill(mask_reverse == 0, max_neg_value)
318
+ attn_reverse = sim_reverse.softmax(dim=-1)
319
+ out_reverse = einsum("b i j, b j d -> b i d", attn_reverse, v)
320
+ out += out_reverse
321
+
322
+ if self.use_relative_position:
323
+ v2 = self.relative_position_v(len_q, len_v)
324
+ out2 = einsum("b t s, t s d -> b t d", attn, v2) # TODO check
325
+ out += out2 # TODO check: add first or merge heads first? Here the relative-position term is computed on the split-head tensors, then the heads are merged.
326
+ out = rearrange(out, "(b h) n d -> b n (h d)", h=nh) # merge head
327
+ return self.to_out(out)
328
+
329
+
330
+ # ---------------------------------------------------------------------------------------------------
331
+
332
+
333
+ class SpatialSelfAttention(nn.Module):
334
+ def __init__(self, in_channels):
335
+ super().__init__()
336
+ self.in_channels = in_channels
337
+
338
+ self.norm = Normalize(in_channels)
339
+ self.q = torch.nn.Conv2d(
340
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
341
+ )
342
+ self.k = torch.nn.Conv2d(
343
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
344
+ )
345
+ self.v = torch.nn.Conv2d(
346
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
347
+ )
348
+ self.proj_out = torch.nn.Conv2d(
349
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
350
+ )
351
+
352
+ def forward(self, x):
353
+ h_ = x
354
+ h_ = self.norm(h_)
355
+ q = self.q(h_)
356
+ k = self.k(h_)
357
+ v = self.v(h_)
358
+
359
+ # compute attention
360
+ b, c, h, w = q.shape
361
+ q = rearrange(q, "b c h w -> b (h w) c")
362
+ k = rearrange(k, "b c h w -> b c (h w)")
363
+ w_ = torch.einsum("bij,bjk->bik", q, k)
364
+
365
+ w_ = w_ * (int(c) ** (-0.5))
366
+ w_ = torch.nn.functional.softmax(w_, dim=2)
367
+
368
+ # attend to values
369
+ v = rearrange(v, "b c h w -> b c (h w)")
370
+ w_ = rearrange(w_, "b i j -> b j i")
371
+ h_ = torch.einsum("bij,bjk->bik", v, w_)
372
+ h_ = rearrange(h_, "b c (h w) -> b c h w", h=h)
373
+ h_ = self.proj_out(h_)
374
+
375
+ return x + h_
376
+
377
+
378
+ class CrossAttention(nn.Module):
379
+ def __init__(
380
+ self,
381
+ query_dim,
382
+ context_dim=None,
383
+ heads=8,
384
+ dim_head=64,
385
+ dropout=0.0,
386
+ sa_shared_kv=False,
387
+ shared_type="only_first",
388
+ **kwargs,
389
+ ):
390
+ super().__init__()
391
+ inner_dim = dim_head * heads
392
+ context_dim = default(context_dim, query_dim)
393
+ self.sa_shared_kv = sa_shared_kv
394
+ assert shared_type in [
395
+ "only_first",
396
+ "all_frames",
397
+ "first_and_prev",
398
+ "only_prev",
399
+ "full",
400
+ "causal",
401
+ "full_qkv",
402
+ ]
403
+ self.shared_type = shared_type
404
+
405
+ self.scale = dim_head**-0.5
406
+ self.heads = heads
407
+ self.dim_head = dim_head
408
+
409
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
410
+ self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
411
+ self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
412
+
413
+ self.to_out = nn.Sequential(
414
+ nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
415
+ )
416
+ self.attention_op: Optional[Any] = None
417
+
418
+ def forward(self, x, context=None, mask=None):
419
+ h = self.heads
420
+ b = x.shape[0]
421
+
422
+ q = self.to_q(x)
423
+ context = default(context, x)
424
+ k = self.to_k(context)
425
+ v = self.to_v(context)
426
+ if self.sa_shared_kv:
427
+ if self.shared_type == "only_first":
428
+ k, v = map(
429
+ lambda xx: rearrange(xx[0].unsqueeze(0), "b n c -> (b n) c")
430
+ .unsqueeze(0)
431
+ .repeat(b, 1, 1),
432
+ (k, v),
433
+ )
434
+ else:
435
+ raise NotImplementedError
436
+
437
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=h), (q, k, v))
438
+
439
+ sim = einsum("b i d, b j d -> b i j", q, k) * self.scale
440
+
441
+ if exists(mask):
442
+ mask = rearrange(mask, "b ... -> b (...)")
443
+ max_neg_value = -torch.finfo(sim.dtype).max
444
+ mask = repeat(mask, "b j -> (b h) () j", h=h)
445
+ sim.masked_fill_(~mask, max_neg_value)
446
+
447
+ # attention, what we cannot get enough of
448
+ attn = sim.softmax(dim=-1)
449
+
450
+ out = einsum("b i j, b j d -> b i d", attn, v)
451
+ out = rearrange(out, "(b h) n d -> b n (h d)", h=h)
452
+ return self.to_out(out)
453
+
454
+ def efficient_forward(self, x, context=None, mask=None):
455
+ q = self.to_q(x)
456
+ context = default(context, x)
457
+ k = self.to_k(context)
458
+ v = self.to_v(context)
459
+
460
+ b, _, _ = q.shape
461
+ q, k, v = map(
462
+ lambda t: t.unsqueeze(3)
463
+ .reshape(b, t.shape[1], self.heads, self.dim_head)
464
+ .permute(0, 2, 1, 3)
465
+ .reshape(b * self.heads, t.shape[1], self.dim_head)
466
+ .contiguous(),
467
+ (q, k, v),
468
+ )
469
+ # actually compute the attention, what we cannot get enough of
470
+ out = xformers.ops.memory_efficient_attention(
471
+ q, k, v, attn_bias=None, op=self.attention_op
472
+ )
473
+
474
+ if exists(mask):
475
+ raise NotImplementedError
476
+ out = (
477
+ out.unsqueeze(0)
478
+ .reshape(b, self.heads, out.shape[1], self.dim_head)
479
+ .permute(0, 2, 1, 3)
480
+ .reshape(b, out.shape[1], self.heads * self.dim_head)
481
+ )
482
+ return self.to_out(out)
483
+
484
+
485
+ class VideoSpatialCrossAttention(CrossAttention):
486
+ def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0):
487
+ super().__init__(query_dim, context_dim, heads, dim_head, dropout)
488
+
489
+ def forward(self, x, context=None, mask=None):
490
+ b, c, t, h, w = x.shape
491
+ if context is not None:
492
+ context = context.repeat(t, 1, 1)
493
+ x = super().forward(spatial_attn_reshape(x), context=context) + x
494
+ return spatial_attn_reshape_back(x, b, h)
495
+
496
+
497
+ # class BasicTransformerBlockST(nn.Module):
498
+ # def __init__(
499
+ # self,
500
+ # # Spatial Stuff
501
+ # dim,
502
+ # n_heads,
503
+ # d_head,
504
+ # dropout=0.0,
505
+ # context_dim=None,
506
+ # gated_ff=True,
507
+ # checkpoint=True,
508
+ # # Temporal Stuff
509
+ # temporal_length=None,
510
+ # image_length=None,
511
+ # use_relative_position=True,
512
+ # img_video_joint_train=False,
513
+ # cross_attn_on_tempoal=False,
514
+ # temporal_crossattn_type="selfattn",
515
+ # order="stst",
516
+ # temporalcrossfirst=False,
517
+ # temporal_context_dim=None,
518
+ # split_stcontext=False,
519
+ # local_spatial_temporal_attn=False,
520
+ # window_size=2,
521
+ # random_t=False,
522
+ # **kwargs,
523
+ # ):
524
+ # super().__init__()
525
+ # # Self attention
526
+ # self.attn1 = CrossAttention(
527
+ # query_dim=dim,
528
+ # heads=n_heads,
529
+ # dim_head=d_head,
530
+ # dropout=dropout,
531
+ # **kwargs,
532
+ # )
533
+ # self.attn2 = CrossAttention(
534
+ # query_dim=dim,
535
+ # context_dim=context_dim,
536
+ # heads=n_heads,
537
+ # dim_head=d_head,
538
+ # dropout=dropout,
539
+ # **kwargs,
540
+ # )
541
+ # if XFORMERS_IS_AVAILBLE:
542
+ # self.attn1.forward = self.attn1.efficient_forward
543
+ # self.attn2.forward = self.attn2.efficient_forward
544
+
545
+ # self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
546
+ # # cross attention if context is not None
547
+
548
+ # self.norm1 = nn.LayerNorm(dim)
549
+ # self.norm2 = nn.LayerNorm(dim)
550
+ # self.norm3 = nn.LayerNorm(dim)
551
+ # self.checkpoint = checkpoint
552
+ # self.order = order
553
+ # assert self.order in ["stst", "sstt", "st_parallel"]
554
+ # self.temporalcrossfirst = temporalcrossfirst
555
+ # self.split_stcontext = split_stcontext
556
+ # self.local_spatial_temporal_attn = local_spatial_temporal_attn
557
+ # if self.local_spatial_temporal_attn:
558
+ # assert self.order == "stst"
559
+ # assert self.order == "stst"
560
+ # self.window_size = window_size
561
+ # if not split_stcontext:
562
+ # temporal_context_dim = context_dim
563
+ # # Temporal attention
564
+ # assert temporal_crossattn_type in ["selfattn", "crossattn", "skip"]
565
+ # self.temporal_crossattn_type = temporal_crossattn_type
566
+ # self.attn1_tmp = TemporalCrossAttention(
567
+ # query_dim=dim,
568
+ # heads=n_heads,
569
+ # dim_head=d_head,
570
+ # dropout=dropout,
571
+ # temporal_length=temporal_length,
572
+ # image_length=image_length,
573
+ # use_relative_position=use_relative_position,
574
+ # img_video_joint_train=img_video_joint_train,
575
+ # **kwargs,
576
+ # )
577
+ # self.attn2_tmp = TemporalCrossAttention(
578
+ # query_dim=dim,
579
+ # heads=n_heads,
580
+ # dim_head=d_head,
581
+ # dropout=dropout,
582
+ # # cross attn
583
+ # context_dim=(
584
+ # temporal_context_dim if temporal_crossattn_type == "crossattn" else None
585
+ # ),
586
+ # # temporal attn
587
+ # temporal_length=temporal_length,
588
+ # image_length=image_length,
589
+ # use_relative_position=use_relative_position,
590
+ # img_video_joint_train=img_video_joint_train,
591
+ # **kwargs,
592
+ # )
593
+ # self.norm4 = nn.LayerNorm(dim)
594
+ # self.norm5 = nn.LayerNorm(dim)
595
+ # self.random_t = random_t
596
+ # # self.norm1_tmp = nn.LayerNorm(dim)
597
+ # # self.norm2_tmp = nn.LayerNorm(dim)
598
+
599
+ # ##############################################################################################################################################
600
+ # def forward(
601
+ # self,
602
+ # x,
603
+ # context=None,
604
+ # temporal_context=None,
605
+ # no_temporal_attn=None,
606
+ # attn_mask=None,
607
+ # **kwargs,
608
+ # ):
609
+ # # print(f'no_temporal_attn={no_temporal_attn}')
610
+
611
+ # if not self.split_stcontext:
612
+ # # st cross attention use the same context vector
613
+ # temporal_context = context.detach().clone()
614
+
615
+ # if context is None and temporal_context is None:
616
+ # # self-attention models
617
+ # if no_temporal_attn:
618
+ # raise NotImplementedError
619
+ # return checkpoint(
620
+ # self._forward_nocontext, (x), self.parameters(), self.checkpoint
621
+ # )
622
+ # else:
623
+ # # cross-attention models
624
+ # if no_temporal_attn:
625
+ # forward_func = self._forward_no_temporal_attn
626
+ # else:
627
+ # forward_func = self._forward
628
+ # inputs = (
629
+ # (x, context, temporal_context)
630
+ # if temporal_context is not None
631
+ # else (x, context)
632
+ # )
633
+ # return checkpoint(forward_func, inputs, self.parameters(), self.checkpoint)
634
+ # # if attn_mask is not None:
635
+ # # return checkpoint(self._forward, (x, context, temporal_context, attn_mask), self.parameters(), self.checkpoint)
636
+ # # return checkpoint(self._forward, (x, context, temporal_context), self.parameters(), self.checkpoint)
637
+
638
+ # def _forward(
639
+ # self,
640
+ # x,
641
+ # context=None,
642
+ # temporal_context=None,
643
+ # mask=None,
644
+ # no_temporal_attn=None,
645
+ # ):
646
+ # assert x.dim() == 5, f"x shape = {x.shape}"
647
+ # b, c, t, h, w = x.shape
648
+
649
+ # if self.order in ["stst", "sstt"]:
650
+ # x = self._st_cross_attn(
651
+ # x,
652
+ # context,
653
+ # temporal_context=temporal_context,
654
+ # order=self.order,
655
+ # mask=mask,
656
+ # ) # no_temporal_attn=no_temporal_attn,
657
+ # elif self.order == "st_parallel":
658
+ # x = self._st_cross_attn_parallel(
659
+ # x,
660
+ # context,
661
+ # temporal_context=temporal_context,
662
+ # order=self.order,
663
+ # ) # no_temporal_attn=no_temporal_attn,
664
+ # else:
665
+ # raise NotImplementedError
666
+
667
+ # x = self.ff(self.norm3(x)) + x
668
+ # if (no_temporal_attn is None) or (not no_temporal_attn):
669
+ # x = rearrange(x, "(b h w) t c -> b c t h w", b=b, h=h, w=w) # 3d -> 5d
670
+ # elif no_temporal_attn:
671
+ # x = rearrange(x, "(b t) (h w) c -> b c t h w", b=b, h=h, w=w) # 3d -> 5d
672
+ # return x
673
+
674
+ # def _forward_no_temporal_attn(
675
+ # self,
676
+ # x,
677
+ # context=None,
678
+ # temporal_context=None,
679
+ # ):
680
+ # # temporary implementation :(
681
+ # # because checkpoint does not support non-tensor inputs currently.
682
+ # assert x.dim() == 5, f"x shape = {x.shape}"
683
+ # b, c, t, h, w = x.shape
684
+
685
+ # if self.order in ["stst", "sstt"]:
686
+ # # x = self._st_cross_attn(x, context, temporal_context=temporal_context, order=self.order, no_temporal_attn=True,)
687
+ # # mask = torch.zeros([1, t, t], device=x.device).bool() if context is None else torch.zeros([1, context.shape[1], t], device=x.device).bool()
688
+ # mask = torch.zeros([1, t, t], device=x.device).bool()
689
+ # x = self._st_cross_attn(
690
+ # x,
691
+ # context,
692
+ # temporal_context=temporal_context,
693
+ # order=self.order,
694
+ # mask=mask,
695
+ # )
696
+ # elif self.order == "st_parallel":
697
+ # x = self._st_cross_attn_parallel(
698
+ # x,
699
+ # context,
700
+ # temporal_context=temporal_context,
701
+ # order=self.order,
702
+ # no_temporal_attn=True,
703
+ # )
704
+ # else:
705
+ # raise NotImplementedError
706
+
707
+ # x = self.ff(self.norm3(x)) + x
708
+ # x = rearrange(x, "(b h w) t c -> b c t h w", b=b, h=h, w=w) # 3d -> 5d
709
+ # # x = rearrange(x, '(b t) (h w) c -> b c t h w', b=b,h=h,w=w) # 3d -> 5d
710
+ # return x
711
+
712
+ # def _forward_nocontext(self, x, no_temporal_attn=None):
713
+ # assert x.dim() == 5, f"x shape = {x.shape}"
714
+ # b, c, t, h, w = x.shape
715
+
716
+ # if self.order in ["stst", "sstt"]:
717
+ # x = self._st_cross_attn(
718
+ # x, order=self.order, no_temporal_attn=no_temporal_attn
719
+ # )
720
+ # elif self.order == "st_parallel":
721
+ # x = self._st_cross_attn_parallel(
722
+ # x, order=self.order, no_temporal_attn=no_temporal_attn
723
+ # )
724
+ # else:
725
+ # raise NotImplementedError
726
+
727
+ # x = self.ff(self.norm3(x)) + x
728
+ # x = rearrange(x, "(b h w) t c -> b c t h w", b=b, h=h, w=w) # 3d -> 5d
729
+
730
+ # return x
731
+
732
+ # ##############################################################################################################################################
733
+
734
+ # def _st_cross_attn(
735
+ # self, x, context=None, temporal_context=None, order="stst", mask=None
736
+ # ): # no_temporal_attn=None,
737
+ # b, c, t, h, w = x.shape
738
+ # # if context is not None:
739
+ # # print(f'[_st_cross_attn input] x={x.shape}, context={context.shape}')
740
+ # # else:
741
+ # # print(f'[_st_cross_attn input] x={x.shape}')
742
+
743
+ # if order == "stst":
744
+ # # spatial self attention
745
+ # x = rearrange(x, "b c t h w -> (b t) (h w) c")
746
+ # # print(f'before attn1,x={x.shape}')
747
+
748
+ # x = self.attn1(self.norm1(x)) + x
749
+ # x = rearrange(x, "(b t) (h w) c -> b c t h w", b=b, h=h)
750
+
751
+ # # temporal self attention
752
+ # # if (no_temporal_attn is None) or (not no_temporal_attn):
753
+ # if self.local_spatial_temporal_attn:
754
+ # x = local_spatial_temporal_attn_reshape(x, window_size=self.window_size)
755
+ # else:
756
+ # x = rearrange(x, "b c t h w -> (b h w) t c")
757
+ # x = self.attn1_tmp(self.norm4(x), mask=mask) + x
758
+
759
+ # if self.local_spatial_temporal_attn:
760
+ # x = local_spatial_temporal_attn_reshape_back(
761
+ # x, window_size=self.window_size, b=b, h=h, w=w, t=t
762
+ # )
763
+ # else:
764
+ # x = rearrange(x, "(b h w) t c -> b c t h w", b=b, h=h, w=w) # 3d -> 5d
765
+
766
+ # # spatial cross attention
767
+ # x = rearrange(x, "b c t h w -> (b t) (h w) c")
768
+ # # print(f'before attn2, x={x.shape}')
769
+ # # if context is not None:
770
+ # # print(f'[before attn2] context={context.shape}')
771
+ # if context is not None:
772
+ # if self.random_t:
773
+ # context_ = []
774
+ # for i in range(context.shape[0]):
775
+ # context_.append(context[i].unsqueeze(0).repeat(t, 1, 1))
776
+ # context_ = torch.cat(context_, dim=0)
777
+ # else:
778
+ # if context.shape[0] == t: # img captions no_temporal_attn or
779
+ # context_ = context
780
+ # else:
781
+ # # repeat conditions with t times
782
+ # context_ = []
783
+ # for i in range(context.shape[0]):
784
+ # context_.append(context[i].unsqueeze(0).repeat(t, 1, 1))
785
+ # context_ = torch.cat(context_, dim=0)
786
+ # else:
787
+ # context_ = None
788
+
789
+ # # if context_ is not None:
790
+ # # print(f'[before attn2] x={x.shape}, context_={context_.shape}')
791
+ # # else:
792
+ # # print(f'[before attn2] x={x.shape}')
793
+
794
+ # x = self.attn2(self.norm2(x), context=context_) + x
795
+
796
+ # # temporal cross attention
797
+ # # if (no_temporal_attn is None) or (not no_temporal_attn):
798
+ # x = rearrange(x, "(b t) (h w) c -> b c t h w", b=b, h=h)
799
+ # x = rearrange(x, "b c t h w -> (b h w) t c")
800
+ # if self.temporal_crossattn_type == "crossattn":
801
+ # # tmporal cross attention
802
+ # if temporal_context is not None:
803
+ # # print(f'STATTN context={context.shape}, temporal_context={temporal_context.shape}')
804
+ # temporal_context = torch.cat(
805
+ # [context, temporal_context], dim=1
806
+ # ) # blc
807
+ # # print(f'STATTN after concat temporal_context={temporal_context.shape}')
808
+ # temporal_context = temporal_context.repeat(h * w, 1, 1)
809
+ # # print(f'after repeat temporal_context={temporal_context.shape}')
810
+ # else:
811
+ # temporal_context = context[0:1, ...].repeat(h * w, 1, 1)
812
+ # # print(f'STATTN after concat x={x.shape}')
813
+ # x = (
814
+ # self.attn2_tmp(self.norm5(x), context=temporal_context, mask=mask)
815
+ # + x
816
+ # )
817
+ # elif self.temporal_crossattn_type == "selfattn":
818
+ # # temporal self attention
819
+ # x = self.attn2_tmp(self.norm5(x), context=None, mask=mask) + x
820
+ # elif self.temporal_crossattn_type == "skip":
821
+ # # no temporal cross and self attention
822
+ # pass
823
+ # else:
824
+ # raise NotImplementedError
825
+
826
+ # elif order == "sstt":
827
+ # # spatial self attention
828
+ # x = rearrange(x, "b c t h w -> (b t) (h w) c")
829
+ # x = self.attn1(self.norm1(x)) + x
830
+
831
+ # # spatial cross attention
832
+ # context_ = context.repeat(t, 1, 1) if context is not None else None
833
+ # x = self.attn2(self.norm2(x), context=context_) + x
834
+ # x = rearrange(x, "(b t) (h w) c -> b c t h w", b=b, h=h)
835
+
836
+ # if (no_temporal_attn is None) or (not no_temporal_attn):
837
+ # if self.temporalcrossfirst:
838
+ # # temporal cross attention
839
+ # if self.temporal_crossattn_type == "crossattn":
840
+ # # if temporal_context is not None:
841
+ # temporal_context = context.repeat(h * w, 1, 1)
842
+ # x = (
843
+ # self.attn2_tmp(
844
+ # self.norm5(x), context=temporal_context, mask=mask
845
+ # )
846
+ # + x
847
+ # )
848
+ # elif self.temporal_crossattn_type == "selfattn":
849
+ # x = self.attn2_tmp(self.norm5(x), context=None, mask=mask) + x
850
+ # elif self.temporal_crossattn_type == "skip":
851
+ # pass
852
+ # else:
853
+ # raise NotImplementedError
854
+ # # temporal self attention
855
+ # x = rearrange(x, "b c t h w -> (b h w) t c")
856
+ # x = self.attn1_tmp(self.norm4(x), mask=mask) + x
857
+ # else:
858
+ # # temporal self attention
859
+ # x = rearrange(x, "b c t h w -> (b h w) t c")
860
+ # x = self.attn1_tmp(self.norm4(x), mask=mask) + x
861
+ # # temporal cross attention
862
+ # if self.temporal_crossattn_type == "crossattn":
863
+ # if temporal_context is not None:
864
+ # temporal_context = context.repeat(h * w, 1, 1)
865
+ # x = (
866
+ # self.attn2_tmp(
867
+ # self.norm5(x), context=temporal_context, mask=mask
868
+ # )
869
+ # + x
870
+ # )
871
+ # elif self.temporal_crossattn_type == "selfattn":
872
+ # x = self.attn2_tmp(self.norm5(x), context=None, mask=mask) + x
873
+ # elif self.temporal_crossattn_type == "skip":
874
+ # pass
875
+ # else:
876
+ # raise NotImplementedError
877
+ # else:
878
+ # raise NotImplementedError
879
+
880
+ # return x
881
+
882
+ # def _st_cross_attn_parallel(
883
+ # self, x, context=None, temporal_context=None, order="sst", no_temporal_attn=None
884
+ # ):
885
+ # """order: x -> Self Attn -> Cross Attn -> attn_s
886
+ # x -> Temp Self Attn -> attn_t
887
+ # x' = x + attn_s + attn_t
888
+ # """
889
+ # if no_temporal_attn is not None:
890
+ # raise NotImplementedError
891
+
892
+ # B, C, T, H, W = x.shape
893
+ # # spatial self attention
894
+ # h = x
895
+ # h = rearrange(h, "b c t h w -> (b t) (h w) c")
896
+ # h = self.attn1(self.norm1(h)) + h
897
+ # # spatial cross
898
+ # # context_ = context.repeat(T, 1, 1) if context is not None else None
899
+ # if context is not None:
900
+ # context_ = []
901
+ # for i in range(context.shape[0]):
902
+ # context_.append(context[i].unsqueeze(0).repeat(T, 1, 1))
903
+ # context_ = torch.cat(context_, dim=0)
904
+ # else:
905
+ # context_ = None
906
+
907
+ # h = self.attn2(self.norm2(h), context=context_) + h
908
+ # h = rearrange(h, "(b t) (h w) c -> b c t h w", b=B, h=H)
909
+
910
+ # # temporal self
911
+ # h2 = x
912
+ # h2 = rearrange(h2, "b c t h w -> (b h w) t c")
913
+ # h2 = self.attn1_tmp(self.norm4(h2)) # + h2
914
+ # h2 = rearrange(h2, "(b h w) t c -> b c t h w", b=B, h=H, w=W)
915
+ # out = h + h2
916
+ # return rearrange(out, "b c t h w -> (b h w) t c")
917
+
918
+ ##############################################################################################################################################
919
+
920
+
921
+ def spatial_attn_reshape(x):
922
+ return rearrange(x, "b c t h w -> (b t) (h w) c")
923
+
924
+
925
+ def spatial_attn_reshape_back(x, b, h):
926
+ return rearrange(x, "(b t) (h w) c -> b c t h w", b=b, h=h)
927
+
928
+
929
+ def temporal_attn_reshape(x):
930
+ return rearrange(x, "b c t h w -> (b h w) t c")
931
+
932
+
933
+ def temporal_attn_reshape_back(x, b, h, w):
934
+ return rearrange(x, "(b h w) t c -> b c t h w", b=b, h=h, w=w)
935
+
936
+
937
+ def local_spatial_temporal_attn_reshape(x, window_size):
938
+ B, C, T, H, W = x.shape
939
+ NH = H // window_size
940
+ NW = W // window_size
941
+ # x = x.view(B, C, T, NH, window_size, NW, window_size)
942
+ # tokens = x.permute(0, 1, 2, 3, 5, 4, 6).contiguous()
943
+ # tokens = tokens.view(-1, window_size, window_size, C)
944
+ x = rearrange(
945
+ x,
946
+ "b c t (nh wh) (nw ww) -> b c t nh wh nw ww",
947
+ nh=NH,
948
+ nw=NW,
949
+ wh=window_size,
950
+ ww=window_size,
951
+ ).contiguous() # # B, C, T, NH, NW, window_size, window_size
952
+ x = rearrange(
953
+ x, "b c t nh wh nw ww -> (b nh nw) (t wh ww) c"
954
+ ) # (B, NH, NW) (T, window_size, window_size) C
955
+ return x
956
+
957
+
958
+ def local_spatial_temporal_attn_reshape_back(x, window_size, b, h, w, t):
959
+ B, L, C = x.shape
960
+ NH = h // window_size
961
+ NW = w // window_size
962
+ x = rearrange(
963
+ x,
964
+ "(b nh nw) (t wh ww) c -> b c t nh wh nw ww",
965
+ b=b,
966
+ nh=NH,
967
+ nw=NW,
968
+ t=t,
969
+ wh=window_size,
970
+ ww=window_size,
971
+ )
972
+ x = rearrange(x, "b c t nh wh nw ww -> b c t (nh wh) (nw ww)")
973
+ return x
974
+
975
+
976
+ class SpatialTemporalTransformer(nn.Module):
977
+ """
978
+ Transformer block for video-like data (5D tensor).
979
+ First, project the input (aka embedding) with NO reshape.
980
+ Then apply standard transformer action.
981
+ The 5D -> 3D reshape operation will be done in the specific attention module.
982
+ """
983
+
984
+ def __init__(
985
+ self,
986
+ in_channels,
987
+ n_heads,
988
+ d_head,
989
+ depth=1,
990
+ dropout=0.0,
991
+ context_dim=None,
992
+ # Temporal stuff
993
+ temporal_length=None,
994
+ image_length=None,
995
+ use_relative_position=True,
996
+ img_video_joint_train=False,
997
+ cross_attn_on_tempoal=False,
998
+ temporal_crossattn_type="selfattn",
999
+ order="stst",
1000
+ temporalcrossfirst=False,
1001
+ split_stcontext=False,
1002
+ temporal_context_dim=None,
1003
+ **kwargs,
1004
+ ):
1005
+ super().__init__()
1006
+
1007
+ self.in_channels = in_channels
1008
+ inner_dim = n_heads * d_head
1009
+
1010
+ self.norm = Normalize(in_channels)
1011
+ self.proj_in = nn.Conv3d(
1012
+ in_channels, inner_dim, kernel_size=1, stride=1, padding=0
1013
+ )
1014
+
1015
+ self.transformer_blocks = nn.ModuleList(
1016
+ [
1017
+ BasicTransformerBlockST(
1018
+ inner_dim,
1019
+ n_heads,
1020
+ d_head,
1021
+ dropout=dropout,
1022
+ # cross attn
1023
+ context_dim=context_dim,
1024
+ # temporal attn
1025
+ temporal_length=temporal_length,
1026
+ image_length=image_length,
1027
+ use_relative_position=use_relative_position,
1028
+ img_video_joint_train=img_video_joint_train,
1029
+ temporal_crossattn_type=temporal_crossattn_type,
1030
+ order=order,
1031
+ temporalcrossfirst=temporalcrossfirst,
1032
+ split_stcontext=split_stcontext,
1033
+ temporal_context_dim=temporal_context_dim,
1034
+ **kwargs,
1035
+ )
1036
+ for d in range(depth)
1037
+ ]
1038
+ )
1039
+
1040
+ self.proj_out = zero_module(
1041
+ nn.Conv3d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
1042
+ )
1043
+
1044
+ def forward(self, x, context=None, temporal_context=None, **kwargs):
1045
+ # note: if no context is given, cross-attention defaults to self-attention
1046
+ assert x.dim() == 5, f"x shape = {x.shape}"
1047
+ b, c, t, h, w = x.shape
1048
+ x_in = x
1049
+
1050
+ x = self.norm(x)
1051
+ x = self.proj_in(x)
1052
+
1053
+ for block in self.transformer_blocks:
1054
+ x = block(x, context=context, temporal_context=temporal_context, **kwargs)
1055
+
1056
+ x = self.proj_out(x)
1057
+ return x + x_in
1058
+
1059
+
1060
+ # ---------------------------------------------------------------------------------------------------
1061
+
1062
+
1063
+ class STAttentionBlock2(nn.Module):
1064
+ def __init__(
1065
+ self,
1066
+ channels,
1067
+ num_heads=1,
1068
+ num_head_channels=-1,
1069
+ use_checkpoint=False, # not used, only used in ResBlock
1070
+ use_new_attention_order=False, # QKVAttention or QKVAttentionLegacy
1071
+ temporal_length=16, # used in relative positional representation.
1072
+ image_length=8, # used for image-video joint training.
1073
+ use_relative_position=False, # whether to use relative positional representation in temporal attention.
1074
+ img_video_joint_train=False,
1075
+ # norm_type="groupnorm",
1076
+ attn_norm_type="group",
1077
+ use_tempoal_causal_attn=False,
1078
+ ):
1079
+ """
1080
+ version 1: guided_diffusion implemented version
1081
+ version 2: remove args input argument
1082
+ """
1083
+ super().__init__()
1084
+
1085
+ if num_head_channels == -1:
1086
+ self.num_heads = num_heads
1087
+ else:
1088
+ assert (
1089
+ channels % num_head_channels == 0
1090
+ ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
1091
+ self.num_heads = channels // num_head_channels
1092
+ self.use_checkpoint = use_checkpoint
1093
+
1094
+ self.temporal_length = temporal_length
1095
+ self.image_length = image_length
1096
+ self.use_relative_position = use_relative_position
1097
+ self.img_video_joint_train = img_video_joint_train
1098
+ self.attn_norm_type = attn_norm_type
1099
+ assert self.attn_norm_type in ["group", "no_norm"]
1100
+ self.use_tempoal_causal_attn = use_tempoal_causal_attn
1101
+
1102
+ if self.attn_norm_type == "group":
1103
+ self.norm_s = normalization(channels)
1104
+ self.norm_t = normalization(channels)
1105
+
1106
+ self.qkv_s = conv_nd(1, channels, channels * 3, 1)
1107
+ self.qkv_t = conv_nd(1, channels, channels * 3, 1)
1108
+
1109
+ if self.img_video_joint_train:
1110
+ mask = th.ones(
1111
+ [1, temporal_length + image_length, temporal_length + image_length]
1112
+ )
1113
+ mask[:, temporal_length:, :] = 0
1114
+ mask[:, :, temporal_length:] = 0
1115
+ self.register_buffer("mask", mask)
1116
+ else:
1117
+ self.mask = None
1118
+
1119
+ if use_new_attention_order:
1120
+ # split qkv before split heads
1121
+ self.attention_s = QKVAttention(self.num_heads)
1122
+ self.attention_t = QKVAttention(self.num_heads)
1123
+ else:
1124
+ # split heads before split qkv
1125
+ self.attention_s = QKVAttentionLegacy(self.num_heads)
1126
+ self.attention_t = QKVAttentionLegacy(self.num_heads)
1127
+
1128
+ if use_relative_position:
1129
+ self.relative_position_k = RelativePosition(
1130
+ num_units=channels // self.num_heads,
1131
+ max_relative_position=temporal_length,
1132
+ )
1133
+ self.relative_position_v = RelativePosition(
1134
+ num_units=channels // self.num_heads,
1135
+ max_relative_position=temporal_length,
1136
+ )
1137
+
1138
+ self.proj_out_s = zero_module(
1139
+ conv_nd(1, channels, channels, 1)
1140
+ ) # conv_dim, in_channels, out_channels, kernel_size
1141
+ self.proj_out_t = zero_module(
1142
+ conv_nd(1, channels, channels, 1)
1143
+ ) # conv_dim, in_channels, out_channels, kernel_size
1144
+
1145
+ def forward(self, x, mask=None):
1146
+ b, c, t, h, w = x.shape
1147
+
1148
+ # spatial
1149
+ out = rearrange(x, "b c t h w -> (b t) c (h w)")
1150
+ if self.attn_norm_type == "no_norm":
1151
+ qkv = self.qkv_s(out)
1152
+ else:
1153
+ qkv = self.qkv_s(self.norm_s(out))
1154
+ out = self.attention_s(qkv)
1155
+ out = self.proj_out_s(out)
1156
+ out = rearrange(out, "(b t) c (h w) -> b c t h w", b=b, h=h)
1157
+ x += out
1158
+
1159
+ # temporal
1160
+ out = rearrange(x, "b c t h w -> (b h w) c t")
1161
+ if self.attn_norm_type == "no_norm":
1162
+ qkv = self.qkv_t(out)
1163
+ else:
1164
+ qkv = self.qkv_t(self.norm_t(out))
1165
+
1166
+ # relative positional embedding
1167
+ if self.use_relative_position:
1168
+ len_q = qkv.size()[-1]
1169
+ len_k, len_v = len_q, len_q
1170
+ k_rp = self.relative_position_k(len_q, len_k)
1171
+ v_rp = self.relative_position_v(len_q, len_v) # [T,T,head_dim]
1172
+ out = self.attention_t(
1173
+ qkv,
1174
+ rp=(k_rp, v_rp),
1175
+ mask=self.mask,
1176
+ use_tempoal_causal_attn=self.use_tempoal_causal_attn,
1177
+ )
1178
+ else:
1179
+ out = self.attention_t(
1180
+ qkv,
1181
+ rp=None,
1182
+ mask=self.mask,
1183
+ use_tempoal_causal_attn=self.use_tempoal_causal_attn,
1184
+ )
1185
+
1186
+ out = self.proj_out_t(out)
1187
+ out = rearrange(out, "(b h w) c t -> b c t h w", b=b, h=h, w=w)
1188
+
1189
+ return x + out
1190
+
1191
+
1192
+ # ---------------------------------------------------------------------------------------------------------------
1193
+
1194
+
1195
+ class QKVAttentionLegacy(nn.Module):
1196
+ """
1197
+ A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping
1198
+ """
1199
+
1200
+ def __init__(self, n_heads):
1201
+ super().__init__()
1202
+ self.n_heads = n_heads
1203
+
1204
+ def forward(self, qkv, rp=None, mask=None):
1205
+ """
1206
+ Apply QKV attention.
1207
+
1208
+ :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
1209
+ :return: an [N x (H * C) x T] tensor after attention.
1210
+ """
1211
+ if rp is not None or mask is not None:
1212
+ raise NotImplementedError
1213
+ bs, width, length = qkv.shape
1214
+ assert width % (3 * self.n_heads) == 0
1215
+ ch = width // (3 * self.n_heads)
1216
+ q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
1217
+ scale = 1 / math.sqrt(math.sqrt(ch))
1218
+ weight = th.einsum(
1219
+ "bct,bcs->bts", q * scale, k * scale
1220
+ ) # More stable with f16 than dividing afterwards
1221
+ weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
1222
+ a = th.einsum("bts,bcs->bct", weight, v)
1223
+ return a.reshape(bs, -1, length)
1224
+
1225
+ @staticmethod
1226
+ def count_flops(model, _x, y):
1227
+ return count_flops_attn(model, _x, y)
1228
+
1229
+
1230
+ # ---------------------------------------------------------------------------------------------------------------
1231
+
1232
+
1233
+ class QKVAttention(nn.Module):
1234
+ """
1235
+ A module which performs QKV attention and splits in a different order.
1236
+ """
1237
+
1238
+ def __init__(self, n_heads):
1239
+ super().__init__()
1240
+ self.n_heads = n_heads
1241
+
1242
+ def forward(self, qkv, rp=None, mask=None, use_tempoal_causal_attn=False):
1243
+ """
1244
+ Apply QKV attention.
1245
+
1246
+ :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
1247
+ :return: an [N x (H * C) x T] tensor after attention.
1248
+ """
1249
+ bs, width, length = qkv.shape
1250
+ assert width % (3 * self.n_heads) == 0
1251
+ ch = width // (3 * self.n_heads)
1252
+ # print('qkv', qkv.size())
1253
+ qkv = qkv.contiguous()
1254
+ q, k, v = qkv.chunk(3, dim=1)
1255
+ scale = 1 / math.sqrt(math.sqrt(ch))
1256
+ # print('bs, self.n_heads, ch, length', bs, self.n_heads, ch, length)
1257
+
1258
+ weight = th.einsum(
1259
+ "bct,bcs->bts",
1260
+ (q * scale).view(bs * self.n_heads, ch, length),
1261
+ (k * scale).view(bs * self.n_heads, ch, length),
1262
+ ) # More stable with f16 than dividing afterwards
1263
+ # weight:[b,t,s] b=bs*n_heads*T
1264
+
1265
+ if rp is not None:
1266
+ k_rp, v_rp = rp # [length, length, head_dim] [8, 8, 48]
1267
+ weight2 = th.einsum(
1268
+ "bct,tsc->bst", (q * scale).view(bs * self.n_heads, ch, length), k_rp
1269
+ )
1270
+ weight += weight2
1271
+
1272
+ if use_tempoal_causal_attn:
1273
+ # weight = torch.tril(weight)
1274
+ assert mask is None, f"Not implemented for merging two masks!"
1275
+ mask = torch.tril(torch.ones(weight.shape, device=weight.device))
1276
+ else:
1277
+ if mask is not None: # only keep upper-left matrix
1278
+ # process mask
1279
+ c, t, _ = weight.shape
1280
+
1281
+ if mask.shape[-1] > t:
1282
+ mask = mask[:, :t, :t]
1283
+ elif mask.shape[-1] < t: # pad ones
1284
+ mask_ = th.zeros([c, t, t]).to(mask.device)
1285
+ t_ = mask.shape[-1]
1286
+ mask_[:, :t_, :t_] = mask
1287
+ mask = mask_
1288
+ else:
1289
+ assert (
1290
+ weight.shape[-1] == mask.shape[-1]
1291
+ ), f"weight={weight.shape}, mask={mask.shape}"
1292
+
1293
+ if mask is not None:
1294
+ INF = -1e8 # float('-inf')
1295
+ weight = weight.float().masked_fill(mask == 0, INF)
1296
+
1297
+ weight = F.softmax(weight.float(), dim=-1).type(
1298
+ weight.dtype
1299
+ ) # [256, 8, 8] [b, t, t] b=bs*n_heads*h*w,t=nframes
1300
+ # weight = F.softmax(weight, dim=-1)#[256, 8, 8] [b, t, t] b=bs*n_heads*h*w,t=nframes
1301
+ a = th.einsum(
1302
+ "bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length)
1303
+ ) # [256, 48, 8] [b, head_dim, t]
1304
+
1305
+ if rp is not None:
1306
+ a2 = th.einsum("bts,tsc->btc", weight, v_rp).transpose(1, 2) # btc->bct
1307
+ a += a2
1308
+
1309
+ return a.reshape(bs, -1, length)
1310
+
1311
+
1312
+ # ---------------------------------------------------------------------------------------------------------------
1313
+
1314
+ # ---------------------------------------------------------------------------------------------------------------
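A minimal usage sketch for the TemporalCrossAttention module defined above. The import path is hypothetical (the file uses relative imports such as from .utils_encoder import ..., so it has to be imported through its package), and the shapes follow the rearrange patterns in forward(): tokens of shape (batch*height*width, frames, channels). SpatialTemporalTransformer is not exercised here because it references BasicTransformerBlockST, whose definition is commented out in this file. Since the q/k/v and output projections are zero-initialized, a freshly constructed module returns zeros.

import torch

# hypothetical import path; adjust to the actual package layout
from llava.model.multimodal_encoder.attention_temporal_videoae import TemporalCrossAttention

temporal_length = 8
attn = TemporalCrossAttention(
    query_dim=320,
    heads=8,
    dim_head=40,
    temporal_length=temporal_length,
    use_relative_position=True,      # adds the RelativePosition bias to logits and values
    use_tempoal_causal_attn=True,    # lower-triangular mask over the frame axis
)

x = torch.randn(2 * 16 * 16, temporal_length, 320)   # (b*h*w, t, c)
out = attn(x)                                        # (b*h*w, t, c); all zeros at init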
base_encoder.py ADDED
@@ -0,0 +1,68 @@
1
+ from abc import ABC, abstractmethod
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+
7
+ class BaseVisionTower(nn.Module):
8
+ def __init__(self, vision_tower_name, vision_tower_cfg, delay_load=False):
9
+ super().__init__()
10
+
11
+ self.is_loaded = False
12
+
13
+ self.vision_tower_name = vision_tower_name
14
+ self.delay_load = delay_load
15
+
16
+ @abstractmethod
17
+ def load_model(self, device_map=None):
18
+ raise NotImplementedError("Subclasses must implement load_model")
19
+
20
+ @abstractmethod
21
+ def _forward(self, images):
22
+ raise NotImplementedError("Subclasses must implement forward")
23
+
24
+ def forward(self, images):
25
+ if type(images) is list:
26
+ image_features = [self._forward(image.unsqueeze(0)) for image in images]
27
+ else:
28
+ image_features = self._forward(images)
29
+
30
+ return image_features
31
+
32
+ @property
33
+ def dummy_feature(self):
34
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
35
+
36
+ @property
37
+ def dtype(self):
38
+ # Dynamically infer the dtype from the first parameter, if not explicitly specified
39
+ if hasattr(self.vision_tower, "dtype"):
40
+ return self.vision_tower.dtype
41
+ else:
42
+ params = list(self.vision_tower.parameters())
43
+ return (
44
+ params[0].dtype if len(params) > 0 else torch.float32
45
+ ) # Default to torch.float32 if no parameters
46
+
47
+ @property
48
+ def device(self):
49
+ # Dynamically infer the device from the first parameter, if not explicitly specified
50
+ if hasattr(self.vision_tower, "device"):
51
+ return self.vision_tower.device
52
+ else:
53
+ params = list(self.vision_tower.parameters())
54
+ return (
55
+ params[0].device if len(params) > 0 else torch.device("cpu")
56
+ ) # Default to CPU if no parameters
57
+ @property
58
+ def config(self):
59
+ if self.is_loaded:
60
+ return self.vision_tower.config
61
+ else:
62
+ return self.cfg_only
63
+ @property
64
+ def hidden_size(self):
65
+ try:
66
+ return self.config.hidden_size
67
+ except:
68
+ return self._hidden_size
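BaseVisionTower above leaves load_model and _forward abstract and expects subclasses to populate self.vision_tower, which the dtype, device, config and hidden_size properties read from. A minimal sketch of that contract with a stand-in encoder; DummyVisionTower and its conv layer are hypothetical, whereas a real subclass such as SigLipVisionTower (siglip_encoder.py in this commit) loads a pretrained checkpoint instead.

import torch
import torch.nn as nn
# assumes BaseVisionTower (base_encoder.py above) is importable from its package

class DummyVisionTower(BaseVisionTower):
    def __init__(self, vision_tower_name, vision_tower_cfg, delay_load=False):
        super().__init__(vision_tower_name, vision_tower_cfg, delay_load)
        self._hidden_size = 64           # fallback returned by the hidden_size property
        if not delay_load:
            self.load_model()

    def load_model(self, device_map=None):
        # stand-in for loading a real encoder checkpoint
        self.vision_tower = nn.Conv2d(3, self._hidden_size, kernel_size=16, stride=16)
        self.is_loaded = True

    def _forward(self, images):
        # (B, 3, H, W) -> (B, num_patches, hidden_size)
        return self.vision_tower(images).flatten(2).transpose(1, 2)

tower = DummyVisionTower("dummy", vision_tower_cfg=None)
feats = tower(torch.randn(2, 3, 64, 64))   # forward() dispatches to _forward -> (2, 16, 64)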
builder.py ADDED
@@ -0,0 +1,17 @@
1
+ import os
2
+ from .siglip_encoder import SigLipVisionTower
3
+
4
+
5
+ def build_vision_tower(vision_tower_cfg, **kwargs):
6
+
7
+ vision_tower = getattr(vision_tower_cfg, "mm_vision_tower", getattr(vision_tower_cfg, "vision_tower", None))
8
+ is_absolute_path_exists = os.path.exists(vision_tower)
9
+ use_s2 = getattr(vision_tower_cfg, "s2", False)
10
+
11
+ #print(getattr(vision_tower_cfg, "vision_tower", None))
12
+ return SigLipVisionTower(vision_tower, vision_tower_cfg=vision_tower_cfg, **kwargs)
13
+ if getattr(vision_tower_cfg, "vision_tower", None) and "siglip" in getattr(vision_tower_cfg, "vision_tower", None).lower():
14
+ #print('*************\n')
15
+ return SigLipVisionTower(vision_tower, vision_tower_cfg=vision_tower_cfg, **kwargs)
16
+
17
+ raise ValueError(f"Unknown vision tower: {vision_tower}")
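As committed, build_vision_tower always returns a SigLipVisionTower: the unconditional return short-circuits the function, so the siglip name check and the ValueError below it are unreachable, and is_absolute_path_exists / use_s2 are computed but unused. A usage sketch with a hypothetical config object; only the mm_vision_tower / vision_tower attribute is actually read, and the checkpoint name is an assumption.

from types import SimpleNamespace

cfg = SimpleNamespace(
    mm_vision_tower="google/siglip-so400m-patch14-384",   # assumed checkpoint name
    s2=False,
)
vision_tower = build_vision_tower(cfg)   # SigLipVisionTower from siglip_encoder.py (added in this commit)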
llava_arch.py CHANGED
@@ -14,25 +14,48 @@
14
 
15
 
16
  from abc import ABC, abstractmethod
17
-
 
18
  import math
19
  import re
20
  import time
21
  import torch
22
  import torch.nn as nn
23
  import torch.nn.functional as F
24
- from .multimodal_encoder.builder import build_vision_tower
25
- from .multimodal_resampler.builder import build_vision_resampler
26
- from .multimodal_projector.builder import build_vision_projector
 
 
 
 
27
  from transformers import AutoTokenizer
28
 
29
- from longva.longva.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
30
 
31
- from longva.longva.mm_utils import get_anyres_image_grid_shape
32
- from longva.longva.utils import rank0_print
33
  import random
34
  from .sae import SiglipAE
35
- from .WindowTimeToTokenAttention import WindowTimeToTokenAttention
36
  import numpy as np
37
  import torch.nn.functional as F
38
  import pdb
@@ -281,15 +304,13 @@ class LlavaMetaForCausalLM(ABC):
281
  return expanded_x
282
 
283
  def encode_multimodals(self, videos_or_images, video_idx_in_batch, split_sizes=None):
284
- #################################################################################
285
- # if videos_or_images.shape[0] > 360:
286
- # random_indices = np.random.choice(videos_or_images.shape[0], size=360, replace=False)
287
- # videos_or_images = videos_or_images[random_indices]
288
- # split_sizes=videos_or_images.shape[0]
289
-
290
- #################################################################################
291
  # Define the maximum batch size (1024 frames)
292
- max_batch_size = 60
293
  num_frames = videos_or_images.shape[0]
294
  # Initialize a list to store the features from each batch
295
  videos_or_images_features = []
@@ -312,47 +333,49 @@ class LlavaMetaForCausalLM(ABC):
312
  else:
313
  videos_or_images_features = self.get_model().get_vision_tower()(videos_or_images)
314
 
315
- per_videos_or_images_features = torch.split(videos_or_images_features, split_sizes, dim=0) # tuple, (dim_1, 576, 4096)
316
  all_videos_or_images_features = []
317
-
318
-
 
 
 
 
 
 
 
319
  for idx, feat in enumerate(per_videos_or_images_features):
320
- #print(feat.shape,end='1\n')
321
- feat=self.interpolate(feat)
322
- #######################################################
323
- if idx in video_idx_in_batch:
324
- feat=self.add_video(feat)
325
- else:
326
- feat=self.add_image(feat)
 
 
 
327
 
328
- bc,ch,h,w=feat.shape
329
-
330
- feat = feat.view(bc//4,ch,4,h,w)
331
- if bc//4>24:
332
- chunk_size = 24
333
- chunks = torch.split(feat, chunk_size, dim=0)
334
- interpolated_chunks = []
335
- for chunk in chunks:
336
- interpolated_chunk=self.get_model().sae(chunk).squeeze(2)
337
- interpolated_chunks.append(interpolated_chunk)
338
- feat = torch.cat(interpolated_chunks, dim=0)
339
- del interpolated_chunks
340
- del chunks
341
- else:
342
- feat=self.get_model().sae(feat).squeeze(2)
343
- feat = feat.permute(0, 2, 3, 1).contiguous().flatten(1, 2)
344
- #print(feat.shape,end='3\n')
345
- feat = self.get_model().mm_projector(feat)
346
- #print(feat.shape,end='4\n')
347
- # Post pooling
348
- if idx in video_idx_in_batch:
349
- #print('************************',idx,video_idx_in_batch)
350
- feat = self.get_2dPool(feat)
351
- all_videos_or_images_features.append(feat)
352
-
353
  del per_videos_or_images_features
 
 
 
 
354
  return all_videos_or_images_features
355
- ########################################################
 
356
  def interpolate(self,image_features):
357
  b, num_tokens, dim = image_features.shape
358
 
@@ -383,6 +406,7 @@ class LlavaMetaForCausalLM(ABC):
383
  return image_features
384
 
385
  def prepare_inputs_labels_for_multimodal(self, input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities=["image"], image_sizes=None,time_embedding=None):
 
386
  vision_tower = self.get_vision_tower()
387
  if vision_tower is None or images is None or input_ids.shape[1] == 1:
388
  return input_ids, position_ids, attention_mask, past_key_values, None, labels
 
14
 
15
 
16
  from abc import ABC, abstractmethod
17
+ import importlib.util
18
+ import os.path as osp
19
  import math
20
  import re
21
  import time
22
  import torch
23
  import torch.nn as nn
24
  import torch.nn.functional as F
25
+
26
+ try:
27
+ from .builder import build_vision_tower
28
+ from .builder import build_vision_resampler
29
+ from .builder import build_vision_projector
30
+ except ModuleNotFoundError:
31
+ spec = importlib.util.spec_from_file_location(
32
+ "builder",
33
+ osp.join(osp.dirname(__file__), "builder.py"),
34
+ )
35
+ builder = importlib.util.module_from_spec(spec)
36
+ spec.loader.exec_module(builder)
37
+ build_vision_tower = getattr(
38
+ builder,
39
+ "build_vision_tower",
40
+ )
41
+ build_vision_resampler = getattr(
42
+ builder,
43
+ "build_vision_resampler",
44
+ )
45
+ build_vision_projector = getattr(
46
+ builder,
47
+ "build_vision_projector",
48
+ )
49
+
50
+
51
  from transformers import AutoTokenizer
52
 
53
+ from .constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
54
 
55
+ from .mm_utils import get_anyres_image_grid_shape
56
+ from .utils import rank0_print
57
  import random
58
  from .sae import SiglipAE
 
59
  import numpy as np
60
  import torch.nn.functional as F
61
  import pdb
 
304
  return expanded_x
305
 
306
  def encode_multimodals(self, videos_or_images, video_idx_in_batch, split_sizes=None):
307
+ pdb.set_trace()
308
+ if self.config.enable_chunk_prefill:
309
+ chunk_size_for_vision_tower = self.config.prefill_config['chunk_size_for_vision_tower']
310
+ else:
311
+ chunk_size_for_vision_tower = 100000
 
 
312
  # Define the maximum number of frames processed by the vision tower in one chunk
313
+ max_batch_size = chunk_size_for_vision_tower
314
  num_frames = videos_or_images.shape[0]
315
  # Initialize a list to store the features from each batch
316
  videos_or_images_features = []
 
333
  else:
334
  videos_or_images_features = self.get_model().get_vision_tower()(videos_or_images)
335
 
336
+ per_videos_or_images_features = torch.split(videos_or_images_features, split_sizes, dim=0)
337
  all_videos_or_images_features = []
338
+
339
+ peak_memory_allocated = torch.cuda.max_memory_allocated()
340
+ print(f"vision encoder 显存峰值: {peak_memory_allocated / (1024**3):.2f} GB") # 转换为GB
341
+
342
+ del videos_or_images_features
343
+ torch.cuda.empty_cache()
344
+
345
+ chunk_size = chunk_size_for_vision_tower
346
+ all_feat_list = []
347
  for idx, feat in enumerate(per_videos_or_images_features):
348
+ for i in range(0, feat.shape[0], chunk_size):
349
+ batched_feat = feat[i:i+chunk_size]
350
+ batched_feat=self.interpolate(batched_feat) # torch.Size([187, 1152, 24, 24])
351
+ if idx in video_idx_in_batch:
352
+ batched_feat = self.add_video(batched_feat) # torch.Size([188, 1152, 24, 24])
353
+ else:
354
+ batched_feat = self.add_image(batched_feat)
355
+
356
+ bc,ch,h,w = batched_feat.shape
357
+ batched_feat = batched_feat.view(bc//4,ch,4,h,w)
358
+
359
+ batched_feat=self.get_model().sae(batched_feat).squeeze(2)
360
+ batched_feat = batched_feat.permute(0, 2, 3, 1).contiguous().flatten(1, 2)
361
+ batched_feat = self.get_model().mm_projector(batched_feat)
362
+
363
+
364
+ batched_feat = self.get_2dPool(batched_feat)
365
+ all_feat_list.append(batched_feat)
366
+
367
+ feat = torch.cat(all_feat_list, dim=0)
368
+ peak_memory_allocated = torch.cuda.max_memory_allocated()
369
+ print(f"sae 显存峰值: {peak_memory_allocated / (1024**3):.2f} GB") # 转换为GB
370
371
  del per_videos_or_images_features
372
+ del all_feat_list
373
+ torch.cuda.empty_cache()
374
+
375
+ all_videos_or_images_features.append(feat)
376
  return all_videos_or_images_features
377
+
378
+
379
  def interpolate(self,image_features):
380
  b, num_tokens, dim = image_features.shape
381
 
 
406
  return image_features
407
 
408
  def prepare_inputs_labels_for_multimodal(self, input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities=["image"], image_sizes=None,time_embedding=None):
409
+ pdb.set_trace()
410
  vision_tower = self.get_vision_tower()
411
  if vision_tower is None or images is None or input_ids.shape[1] == 1:
412
  return input_ids, position_ids, attention_mask, past_key_values, None, labels
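Note: the chunked encode_multimodals path above bounds vision-tower peak memory by encoding at most chunk_size_for_vision_tower frames per call. A minimal sketch of that pattern (hypothetical helper name; vision_tower stands in for self.get_model().get_vision_tower(), chunk_size for config.prefill_config['chunk_size_for_vision_tower']):

import torch

def encode_in_chunks(frames, vision_tower, chunk_size):
    # Encode `chunk_size` frames at a time and concatenate the features,
    # so peak activation memory is independent of the clip length.
    outputs = []
    for start in range(0, frames.shape[0], chunk_size):
        outputs.append(vision_tower(frames[start:start + chunk_size]))
    return torch.cat(outputs, dim=0)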
llava_qwen.py CHANGED
@@ -21,7 +21,7 @@ import transformers
21
  from transformers import AutoConfig, AutoModelForCausalLM, LlamaConfig, LlamaModel, LlamaForCausalLM
22
  from transformers.modeling_outputs import CausalLMOutputWithPast
23
  from transformers.generation.utils import GenerateOutput
24
- from longva.longva.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
25
  from .modeling_qwen2 import Qwen2Config, Qwen2Model, Qwen2ForCausalLM
26
  import pdb
27
  import time
@@ -211,6 +211,7 @@ class LlavaQwenForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
211
  time_token_end_indices=None,
212
  block_size_chosed=None,
213
  prev_blocks_num=None,
 
214
  ) -> Union[Tuple, CausalLMOutputWithPast]:
215
 
216
  block_size = block_size_chosed
@@ -218,7 +219,6 @@ class LlavaQwenForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
218
  visual_token_end_pos = visual_token_end_pos
219
  visual_len = visual_token_end_pos - visual_token_start_pos
220
  num_blocks = (frames_num + block_size * 4 - 1) // (block_size * 4)
221
- # print(f'block_size: {block_size}, num_blocks: {num_blocks}')
222
 
223
  # streaming inps
224
  blocks_positions = [[(0, 0, visual_token_start_pos)]]
@@ -254,10 +254,10 @@ class LlavaQwenForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
254
  suffix_embeds = full_inputs_embeds[:, visual_token_end_pos:, :]
255
  num_visual_tokens = visual_embeds.size(1)
256
 
257
- all_past_key_values = [[] for _ in range(len(self.model.layers))] # assumes the model exposes a `layers` attribute
258
  prefix_past_key_values = []
259
 
260
- torch.cuda.reset_peak_memory_stats()
261
 
262
  if prefix_embeds.size(1) > 0:
263
  pkv = self.process_block(prefix_embeds, bsz=bsz, device=device)
@@ -288,16 +288,15 @@ class LlavaQwenForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
288
 
289
  block_streaming_past_key_values_part1 = prefix_past_key_values
290
  position_ids_part1 = torch.arange(0, prefix_past_key_values[0][0].size(2), dtype=torch.long, device=device)
291
- block_streaming_past_key_values_part2 = [[] for _ in range(len(self.model.layers))] # per-layer KV storage
292
  position_ids_part2 = torch.tensor([], dtype=torch.long, device=device)
293
  block_streaming_past_key_values_part3=None
294
  position_ids_part3 = None
295
 
296
  query_position_ids = None
297
  for idx, single_block in enumerate(blocks_positions[:]):
298
- if idx == 0:
299
- continue
300
- if idx <= prev_blocks_num:
301
  continue
302
 
303
  b_start, _, _ = single_block[0]
@@ -312,13 +311,15 @@ class LlavaQwenForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
312
  true_block_length = b_end - b_start
313
 
314
  block_streaming_past_key_values_part3 = [tmp[-prev_blocks_num:] for tmp in all_past_key_values]
315
- # block_streaming_past_key_values_part3 = [
316
- # [
317
- # (t[0].to(device=device), t[1].to(device=device))
318
- # for t in sublist
319
- # ]
320
- # for sublist in block_streaming_past_key_values_part3
321
- # ]
 
 
322
 
323
  block_streaming_past_key_values = self.cat_history_kvs(block_streaming_past_key_values_part1, block_streaming_past_key_values_part2, block_streaming_past_key_values_part3)
324
 
@@ -337,8 +338,11 @@ class LlavaQwenForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
337
  key_this_block, val_this_block = pkv[i]
338
  key_this_block = key_this_block[:,:,length_before_chunk:,:]
339
  val_this_block = val_this_block[:,:,length_before_chunk:,:]
340
- all_past_key_values[i].append( (key_this_block, val_this_block) )
341
- # all_past_key_values[i].append( (key_this_block.to('cpu'), val_this_block.to('cpu')) )
 
 
 
342
 
343
  time_keys_list = []
344
  time_vals_list = []
@@ -371,6 +375,9 @@ class LlavaQwenForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
371
  values = torch.cat([pkv[1].to(device=device) for pkv in layer_pkvs], dim=2)
372
  merged_pkv.append((keys, values))
373
 
 
 
 
374
 
375
  pkv = merged_pkv
376
  del block_streaming_past_key_values
@@ -383,6 +390,8 @@ class LlavaQwenForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
383
  # TODO: bi-decoding acceleration
384
  mixed_prefill_past_key_values = pkv
385
  prefill_len = visual_token_end_pos
 
 
386
 
387
  # Process suffix
388
  if suffix_embeds.size(1) > 0:
@@ -404,6 +413,8 @@ class LlavaQwenForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
404
  return_dict=return_dict,
405
  # blocks_positions=None,
406
  )
 
 
407
  del mixed_prefill_past_key_values
408
  torch.cuda.empty_cache()
409
 
@@ -508,12 +519,17 @@ class LlavaQwenForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
508
  )
509
 
510
  if inputs_embeds is None:
 
511
  (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes, time_embedding)
512
 
513
- if self.config.enable_sparse:
514
- block_size_chosed = self.config.sparse_config['block_size_chosed']
515
- prev_blocks_num = self.config.sparse_config['prev_blocks_num']
516
- if self.config.sparse_mode=='streaming':
 
 
 
 
517
  return self.forward_streaming(
518
  input_ids=input_ids,
519
  attention_mask=attention_mask,
@@ -533,10 +549,11 @@ class LlavaQwenForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
533
  frames_num=frames_num,
534
  time_token_indices=time_token_indices,
535
  time_token_end_indices=time_token_end_indices,
536
- block_size_chosed=block_size_chosed,
537
- prev_blocks_num=prev_blocks_num,
 
538
  )
539
- elif self.config.sparse_mode=='mask':
540
  return self.forward_mask(
541
  input_ids=input_ids,
542
  attention_mask=attention_mask,
@@ -584,6 +601,8 @@ class LlavaQwenForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
584
  **kwargs,
585
  ) -> Union[GenerateOutput, torch.LongTensor]:
586
 
 
 
587
  position_ids = kwargs.pop("position_ids", None)
588
  attention_mask = kwargs.pop("attention_mask", None)
589
 
@@ -631,6 +650,7 @@ class LlavaQwenForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
631
  sample_fps=1,
632
  max_sample_fps=4,
633
  generation_config={}):
 
634
 
635
  # prepare text input
636
  conv = conv_templates["qwen_1_5"].copy()
 
21
  from transformers import AutoConfig, AutoModelForCausalLM, LlamaConfig, LlamaModel, LlamaForCausalLM
22
  from transformers.modeling_outputs import CausalLMOutputWithPast
23
  from transformers.generation.utils import GenerateOutput
24
+ from .llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
25
  from .modeling_qwen2 import Qwen2Config, Qwen2Model, Qwen2ForCausalLM
26
  import pdb
27
  import time
 
211
  time_token_end_indices=None,
212
  block_size_chosed=None,
213
  prev_blocks_num=None,
214
+ offload: Optional[bool] = None,
215
  ) -> Union[Tuple, CausalLMOutputWithPast]:
216
 
217
  block_size = block_size_chosed
 
219
  visual_token_end_pos = visual_token_end_pos
220
  visual_len = visual_token_end_pos - visual_token_start_pos
221
  num_blocks = (frames_num + block_size * 4 - 1) // (block_size * 4)
 
222
 
223
  # streaming inps
224
  blocks_positions = [[(0, 0, visual_token_start_pos)]]
 
254
  suffix_embeds = full_inputs_embeds[:, visual_token_end_pos:, :]
255
  num_visual_tokens = visual_embeds.size(1)
256
 
257
+ all_past_key_values = [[] for _ in range(len(self.model.layers))]
258
  prefix_past_key_values = []
259
 
260
+ # torch.cuda.reset_peak_memory_stats()
261
 
262
  if prefix_embeds.size(1) > 0:
263
  pkv = self.process_block(prefix_embeds, bsz=bsz, device=device)
 
288
 
289
  block_streaming_past_key_values_part1 = prefix_past_key_values
290
  position_ids_part1 = torch.arange(0, prefix_past_key_values[0][0].size(2), dtype=torch.long, device=device)
291
+ block_streaming_past_key_values_part2 = [[] for _ in range(len(self.model.layers))]
292
  position_ids_part2 = torch.tensor([], dtype=torch.long, device=device)
293
  block_streaming_past_key_values_part3=None
294
  position_ids_part3 = None
295
 
296
  query_position_ids = None
297
  for idx, single_block in enumerate(blocks_positions[:]):
298
+
299
+ if idx == 0 or idx <= prev_blocks_num:
 
300
  continue
301
 
302
  b_start, _, _ = single_block[0]
 
311
  true_block_length = b_end - b_start
312
 
313
  block_streaming_past_key_values_part3 = [tmp[-prev_blocks_num:] for tmp in all_past_key_values]
314
+
315
+ if offload:
316
+ block_streaming_past_key_values_part3 = [
317
+ [
318
+ (t[0].to(device=device), t[1].to(device=device))
319
+ for t in sublist
320
+ ]
321
+ for sublist in block_streaming_past_key_values_part3
322
+ ]
323
 
324
  block_streaming_past_key_values = self.cat_history_kvs(block_streaming_past_key_values_part1, block_streaming_past_key_values_part2, block_streaming_past_key_values_part3)
325
 
 
338
  key_this_block, val_this_block = pkv[i]
339
  key_this_block = key_this_block[:,:,length_before_chunk:,:]
340
  val_this_block = val_this_block[:,:,length_before_chunk:,:]
341
+
342
+ if offload:
343
+ all_past_key_values[i].append( (key_this_block.to('cpu'), val_this_block.to('cpu')) )
344
+ else:
345
+ all_past_key_values[i].append( (key_this_block, val_this_block) )
346
 
347
  time_keys_list = []
348
  time_vals_list = []
 
375
  values = torch.cat([pkv[1].to(device=device) for pkv in layer_pkvs], dim=2)
376
  merged_pkv.append((keys, values))
377
 
378
+ peak_memory_allocated = torch.cuda.max_memory_allocated()
379
+ print(f"prefill 显存峰值: {peak_memory_allocated / (1024**3):.2f} GB") # 转换为GB
380
+
381
 
382
  pkv = merged_pkv
383
  del block_streaming_past_key_values
 
390
  # TODO: bi-decoding acceleration
391
  mixed_prefill_past_key_values = pkv
392
  prefill_len = visual_token_end_pos
393
+
394
+ # torch.cuda.reset_peak_memory_stats()
395
 
396
  # Process suffix
397
  if suffix_embeds.size(1) > 0:
 
413
  return_dict=return_dict,
414
  # blocks_positions=None,
415
  )
416
+ peak_memory_allocated = torch.cuda.max_memory_allocated()
417
+ print(f"decoding 显存峰值: {peak_memory_allocated / (1024**3):.2f} GB") # 转换为GB
418
  del mixed_prefill_past_key_values
419
  torch.cuda.empty_cache()
420
 
 
519
  )
520
 
521
  if inputs_embeds is None:
522
+ pdb.set_trace()
523
  (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes, time_embedding)
524
 
525
+ if self.config.enable_chunk_prefill:
526
+
527
+ prefill_mode = self.config.prefill_config['chunk_prefill_mode']
528
+ chunk_size = self.config.prefill_config['chunk_size']
529
+ step_size = self.config.prefill_config['step_size']
530
+ offload = self.config.prefill_config['offload']
531
+
532
+ if prefill_mode=='streaming':
533
  return self.forward_streaming(
534
  input_ids=input_ids,
535
  attention_mask=attention_mask,
 
549
  frames_num=frames_num,
550
  time_token_indices=time_token_indices,
551
  time_token_end_indices=time_token_end_indices,
552
+ block_size_chosed=chunk_size,
553
+ prev_blocks_num=chunk_size - step_size,
554
+ offload=offload,
555
  )
556
+ elif prefill_mode=='mask':
557
  return self.forward_mask(
558
  input_ids=input_ids,
559
  attention_mask=attention_mask,
 
601
  **kwargs,
602
  ) -> Union[GenerateOutput, torch.LongTensor]:
603
 
604
+
605
+
606
  position_ids = kwargs.pop("position_ids", None)
607
  attention_mask = kwargs.pop("attention_mask", None)
608
 
 
650
  sample_fps=1,
651
  max_sample_fps=4,
652
  generation_config={}):
653
+ pdb.set_trace()
654
 
655
  # prepare text input
656
  conv = conv_templates["qwen_1_5"].copy()
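For reference, a configuration sketch that exercises the chunked-prefill branch added above. The key names match what forward() reads; the values are illustrative assumptions, not defaults shipped with this commit:

prefill_settings = {
    "enable_chunk_prefill": True,
    "prefill_config": {
        "chunk_prefill_mode": "streaming",   # or "mask"
        "chunk_size": 4,                     # passed on as block_size_chosed
        "step_size": 1,                      # prev_blocks_num = chunk_size - step_size = 3
        "offload": False,                    # True moves finished KV blocks to CPU
        "chunk_size_for_vision_tower": 96,   # frames per vision-tower batch (see llava_arch.py)
    },
}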
mm_utils.py CHANGED
@@ -419,6 +419,7 @@ class KeywordsStoppingCriteria(StoppingCriteria):
419
 
420
  from decord import VideoReader, cpu
421
  def load_video(video_path, max_frames_num, fps=1, max_fps=4):
 
422
  if isinstance(video_path, str):
423
  vr = VideoReader(video_path, ctx=cpu(0))
424
  else:
@@ -431,22 +432,25 @@ def load_video(video_path, max_frames_num, fps=1, max_fps=4):
431
  return None, None, []
432
 
433
  video_fps = fps
434
- step = round(avg_fps_from_decord / video_fps) if video_fps > 0 and avg_fps_from_decord > 0 else 1
435
- frame_idx = [i for i in range(0, total_frame_num, step)]
436
-
437
  fps_upbound = max_fps
438
  frames_upbound = max_frames_num
439
-
440
- if fps_upbound is not None:
441
- higher_fps = min(frames_upbound//len(frame_idx), fps_upbound)
442
- if higher_fps > video_fps:
443
- higher_steps = round(avg_fps_from_decord / higher_fps)
444
- frame_idx = [i for i in range(0, total_frame_num, higher_steps)]
445
-
446
- if frames_upbound > 0:
447
- if len(frame_idx) > frames_upbound:
448
- uniform_sampled_frames = np.linspace(0, total_frame_num - 1, frames_upbound, dtype=int)
449
- frame_idx = uniform_sampled_frames.tolist()
450
 
451
  timestamps = [round(idx / avg_fps_from_decord, 1) for idx in frame_idx]
452
  video = vr.get_batch(frame_idx).asnumpy()
 
419
 
420
  from decord import VideoReader, cpu
421
  def load_video(video_path, max_frames_num, fps=1, max_fps=4):
422
+
423
  if isinstance(video_path, str):
424
  vr = VideoReader(video_path, ctx=cpu(0))
425
  else:
 
432
  return None, None, []
433
 
434
  video_fps = fps
 
 
 
435
  fps_upbound = max_fps
436
  frames_upbound = max_frames_num
437
+ if fps is not None:
438
+ step = round(avg_fps_from_decord / video_fps) if video_fps > 0 and avg_fps_from_decord > 0 else 1
439
+ frame_idx = [i for i in range(0, total_frame_num, step)]
440
+
441
+ if fps_upbound is not None:
442
+ higher_fps = min(frames_upbound//len(frame_idx), fps_upbound)
443
+ if higher_fps > video_fps:
444
+ higher_steps = round(avg_fps_from_decord / higher_fps)
445
+ frame_idx = [i for i in range(0, total_frame_num, higher_steps)]
446
+
447
+ if frames_upbound > 0:
448
+ if len(frame_idx) > frames_upbound:
449
+ uniform_sampled_frames = np.linspace(0, total_frame_num - 1, frames_upbound, dtype=int)
450
+ frame_idx = uniform_sampled_frames.tolist()
451
+ else: # fall back to uniform sampling when no fps is given
452
+ uniform_sampled_frames = np.linspace(0, total_frame_num - 1, frames_upbound, dtype=int)
453
+ frame_idx = uniform_sampled_frames.tolist()
454
 
455
  timestamps = [round(idx / avg_fps_from_decord, 1) for idx in frame_idx]
456
  video = vr.get_batch(frame_idx).asnumpy()
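A worked example of the rewritten frame-selection logic, using hypothetical clip numbers (10 minutes at 30 fps, fps=1, max_fps=4, max_frames_num=128):

import numpy as np

avg_fps_from_decord, total_frame_num = 30, 18000
fps, max_fps, max_frames_num = 1, 4, 128

step = round(avg_fps_from_decord / fps)                       # 30 -> one frame per second
frame_idx = list(range(0, total_frame_num, step))             # 600 candidate frames
higher_fps = min(max_frames_num // len(frame_idx), max_fps)   # 0 here, so the fps boost is skipped
if len(frame_idx) > max_frames_num:                           # cap at 128 uniformly spread frames
    frame_idx = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int).tolist()
timestamps = [round(i / avg_fps_from_decord, 1) for i in frame_idx]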
modeling_qwen2.py CHANGED
@@ -688,7 +688,10 @@ class Qwen2SdpaAttention(Qwen2Attention):
688
 
689
  try:
690
  query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids, key_position_ids)
691
- except:
 
 
 
692
  pdb.set_trace()
693
  key_states = repeat_kv(key_states, self.num_key_value_groups)
694
  value_states = repeat_kv(value_states, self.num_key_value_groups)
 
688
 
689
  try:
690
  query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids, key_position_ids)
691
+ except Exception as e:
692
+ print(e)
693
+ import traceback
694
+ traceback.print_exc()
695
  pdb.set_trace()
696
  key_states = repeat_kv(key_states, self.num_key_value_groups)
697
  value_states = repeat_kv(value_states, self.num_key_value_groups)
sae.py ADDED
@@ -0,0 +1,45 @@
1
+ import torch
2
+
3
+ from .sae_utils import SamePadConv3d,Normalize,SiLU,TemporalAttention,AttnBlock3D,MultiHeadAttention3D,TemporalAttention_lin
4
+ import torch.nn as nn
5
+ import pdb
6
+
7
+ class SiglipAE(nn.Module):
8
+ def __init__(self):
9
+ super().__init__()
10
+ temporal_stride=2
11
+ norm_type = "group"
12
+
13
+ self.temporal_encoding = nn.Parameter(torch.randn((4,1152)))
14
+ #self.vision_tower=SigLipVisionTower('google/siglip-so400m-patch14-384')
15
+ self.encoder=nn.Sequential(
16
+ AttnBlock3D(1152),
17
+ TemporalAttention(1152),
18
+
19
+ SamePadConv3d(1152,1152,kernel_size=3,stride=(temporal_stride, 1, 1),padding_type="replicate"),
20
+
21
+ AttnBlock3D(1152),
22
+ TemporalAttention(1152),
23
+
24
+ SamePadConv3d(1152,1152,kernel_size=3,stride=(temporal_stride, 1, 1),padding_type="replicate"),
25
+
26
+ )
27
+ def forward(self, x):
28
+ b_,c_,t_,h_,w_=x.shape
29
+
30
+ temporal_encoding = self.temporal_encoding.unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
31
+ temporal_encoding = temporal_encoding.expand(b_, -1, -1, h_, w_) # (B, T, C, H, W)
32
+ temporal_encoding = temporal_encoding.permute(0, 2, 1, 3, 4) # (B, C, T, H, W)
33
+ x = x + temporal_encoding
34
+
35
+ x=self.encoder(x)
36
+ return x
37
+ # image=torch.randn(1,1152,4,24,24).to('cuda')
38
+
39
+
40
+ # model = SiglipAE().to('cuda')
41
+ # model.load_state_dict(torch.load('encoder.pth'),strict=False)
42
+
43
+ # image=model(image)
44
+
45
+ # print(image.shape)
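Shape walk-through (assuming the (B, 1152, 4, 24, 24) feature groups built in llava_arch.py): the two stride-2 temporal convolutions in SiglipAE compress 4 frames into a single temporal step, which the caller then squeezes away.

import torch
x = torch.randn(2, 1152, 4, 24, 24)   # a group of 4 SigLip feature maps
y = SiglipAE()(x)                     # torch.Size([2, 1152, 1, 24, 24])
tokens = y.squeeze(2)                 # (2, 1152, 24, 24), the shape consumed in encode_multimodals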
sae_utils.py ADDED
@@ -0,0 +1,302 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ from transformers.activations import ACT2FN
5
+ from .attention_temporal_videoae import *
6
+ from einops import rearrange, reduce, repeat
7
+
8
+ try:
9
+ import xformers
10
+ import xformers.ops as xops
11
+
12
+ XFORMERS_IS_AVAILBLE = True
13
+ except ImportError:
14
+ XFORMERS_IS_AVAILBLE = False
15
+
16
+ def silu(x):
17
+ # swish
18
+ return x * torch.sigmoid(x)
19
+
20
+
21
+ class SiLU(nn.Module):
22
+ def __init__(self):
23
+ super(SiLU, self).__init__()
24
+
25
+ def forward(self, x):
26
+ return silu(x)
27
+
28
+
29
+ def Normalize(in_channels, norm_type="group"):
30
+ assert norm_type in ["group", "batch",'layer']
31
+ if norm_type == "group":
32
+ return torch.nn.GroupNorm(
33
+ num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
34
+ )
35
+ elif norm_type == "batch":
36
+ return torch.nn.SyncBatchNorm(in_channels)
37
+ elif norm_type == "layer":
38
+ return nn.LayerNorm(in_channels)
39
+
40
+ class SamePadConv3d(nn.Module):
41
+ def __init__(
42
+ self,
43
+ in_channels,
44
+ out_channels,
45
+ kernel_size,
46
+ stride=1,
47
+ bias=True,
48
+ padding_type="replicate",
49
+ ):
50
+ super().__init__()
51
+ if isinstance(kernel_size, int):
52
+ kernel_size = (kernel_size,) * 3
53
+ if isinstance(stride, int):
54
+ stride = (stride,) * 3
55
+
56
+ # assumes that the input shape is divisible by stride
57
+ total_pad = tuple([k - s for k, s in zip(kernel_size, stride)])
58
+ pad_input = []
59
+ for p in total_pad[::-1]: # reverse since F.pad starts from last dim
60
+ pad_input.append((p // 2 + p % 2, p // 2))
61
+ pad_input = sum(pad_input, tuple())
62
+
63
+ self.pad_input = pad_input
64
+ self.padding_type = padding_type
65
+
66
+ self.conv = nn.Conv3d(
67
+ in_channels, out_channels, kernel_size, stride=stride, padding=0, bias=bias
68
+ )
69
+
70
+ def forward(self, x):
71
+ tp=x.dtype
72
+ x = x.float()
73
+
74
+ # apply the padding in float32
75
+ x_padded = F.pad(x, self.pad_input, mode=self.padding_type)
76
+
77
+ # cast the padded result back to the original dtype (e.g. BFloat16)
78
+ x_padded = x_padded.to(tp)
79
+
80
+ return self.conv(x_padded)
81
+
82
+ class TemporalAttention(nn.Module):
83
+ def __init__(
84
+ self,
85
+ channels,
86
+ num_heads=1,
87
+ num_head_channels=-1,
88
+ max_temporal_length=64,
89
+ ):
90
+ """
91
+ a clean multi-head temporal attention
92
+ """
93
+ super().__init__()
94
+
95
+ if num_head_channels == -1:
96
+ self.num_heads = num_heads
97
+ else:
98
+ assert (
99
+ channels % num_head_channels == 0
100
+ ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
101
+ self.num_heads = channels // num_head_channels
102
+
103
+ self.norm = Normalize(channels)
104
+ self.qkv = zero_module(conv_nd(1, channels, channels * 3, 1))
105
+ self.attention = QKVAttention(self.num_heads)
106
+ self.relative_position_k = RelativePosition(
107
+ num_units=channels // self.num_heads,
108
+ max_relative_position=max_temporal_length,
109
+ )
110
+ self.relative_position_v = RelativePosition(
111
+ num_units=channels // self.num_heads,
112
+ max_relative_position=max_temporal_length,
113
+ )
114
+ self.proj_out = zero_module(
115
+ conv_nd(1, channels, channels, 1)
116
+ ) # conv_dim, in_channels, out_channels, kernel_size
117
+
118
+ def forward(self, x, mask=None):
119
+ b, c, t, h, w = x.shape
120
+ out = rearrange(x, "b c t h w -> (b h w) c t")
121
+ # torch.Size([4608, 1152, 2])1
122
+ # torch.Size([4608, 3456, 2])2
123
+ # torch.Size([4608, 1152, 2])3
124
+ # torch.Size([4608, 1152, 2])4
125
+ #print(out.shape,end='1\n')
126
+ qkv = self.qkv(self.norm(out))
127
+ #print(qkv.shape,end='2\n')
128
+
129
+ len_q = qkv.size()[-1]
130
+ len_k, len_v = len_q, len_q
131
+
132
+ k_rp = self.relative_position_k(len_q, len_k)
133
+ v_rp = self.relative_position_v(len_q, len_v) # [T,T,head_dim]
134
+ out = self.attention(qkv, rp=(k_rp, v_rp))
135
+ #print(out.shape,end='3\n')
136
+ out = self.proj_out(out)
137
+ #print(out.shape,end='4\n')
138
+ out = rearrange(out, "(b h w) c t -> b c t h w", b=b, h=h, w=w)
139
+
140
+ return x + out
141
+ class TemporalAttention_lin(nn.Module):
142
+ def __init__(
143
+ self,
144
+ channels,
145
+ num_heads=8,
146
+ num_head_channels=-1,
147
+ max_temporal_length=64,
148
+ ):
149
+ """
150
+ a clean multi-head temporal attention
151
+ """
152
+ super().__init__()
153
+
154
+ if num_head_channels == -1:
155
+ self.num_heads = num_heads
156
+ else:
157
+ assert (
158
+ channels % num_head_channels == 0
159
+ ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
160
+ self.num_heads = channels // num_head_channels
161
+
162
+ self.norm = nn.LayerNorm(channels)
163
+ #self.norm = Normalize(channels)
164
+ #self.qkv = zero_module(conv_nd(1, channels, channels * 3, 1))
165
+ self.qkv = nn.Linear(channels, channels * 3)
166
+ self.attention = QKVAttention(self.num_heads)
167
+ self.relative_position_k = RelativePosition(
168
+ num_units=channels // self.num_heads,
169
+ max_relative_position=max_temporal_length,
170
+ )
171
+ self.relative_position_v = RelativePosition(
172
+ num_units=channels // self.num_heads,
173
+ max_relative_position=max_temporal_length,
174
+ )
175
+ self.proj_out = nn.Linear(channels, channels)
176
+
177
+ def forward(self, x, mask=None):
178
+ b, c, t, h, w = x.shape
179
+ out = rearrange(x, "b c t h w -> (b h w) t c")
180
+ # torch.Size([4608, 1152, 2])1
181
+ # torch.Size([4608, 3456, 2])2
182
+ # torch.Size([4608, 1152, 2])3
183
+ # torch.Size([4608, 1152, 2])4
184
+ #print(out.shape,end='1\n')
185
+ qkv = self.qkv(self.norm(out)).transpose(-1, -2)
186
+ #print(qkv.shape,end='2\n')
187
+
188
+ len_q = qkv.size()[-1]
189
+ len_k, len_v = len_q, len_q
190
+
191
+ k_rp = self.relative_position_k(len_q, len_k)
192
+ v_rp = self.relative_position_v(len_q, len_v) # [T,T,head_dim]
193
+
194
+ out = self.attention(qkv, rp=(k_rp, v_rp))
195
+
196
+ out = self.proj_out(out.transpose(-1, -2)).transpose(-1, -2)
197
+
198
+ #print(out.shape,end='4\n')
199
+ out = rearrange(out, "(b h w) c t -> b c t h w", b=b, h=h, w=w)
200
+
201
+ return x + out
202
+
203
+ class AttnBlock3D(nn.Module):
204
+ def __init__(self, in_channels):
205
+ super().__init__()
206
+ self.in_channels = in_channels
207
+
208
+ self.norm = Normalize(in_channels)
209
+ self.q = torch.nn.Conv3d(
210
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
211
+ )
212
+ self.k = torch.nn.Conv3d(
213
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
214
+ )
215
+ self.v = torch.nn.Conv3d(
216
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
217
+ )
218
+ self.proj_out = torch.nn.Conv3d(
219
+ in_channels, in_channels, kernel_size=1, stride=1, padding=0
220
+ )
221
+
222
+ def forward(self, x):
223
+ h_ = x
224
+ # self.norm.to(x.device)
225
+ # self.norm.to(x.dtype)
226
+ h_ = self.norm(h_)
227
+ q = self.q(h_)
228
+ k = self.k(h_)
229
+ v = self.v(h_)
230
+
231
+ b, c, t, h, w = q.shape
232
+ # q = q.reshape(b,c,h*w) # bcl
233
+ # q = q.permute(0,2,1) # bcl -> blc l=hw
234
+ # k = k.reshape(b,c,h*w) # bcl
235
+ q = rearrange(q, "b c t h w -> (b t) (h w) c") # blc
236
+ k = rearrange(k, "b c t h w -> (b t) c (h w)") # bcl
237
+
238
+ w_ = torch.bmm(q, k) # b,l,l
239
+ w_ = w_ * (int(c) ** (-0.5))
240
+ w_ = torch.nn.functional.softmax(w_, dim=2)
241
+
242
+ # v = v.reshape(b,c,h*w)
243
+ v = rearrange(v, "b c t h w -> (b t) c (h w)") # bcl
244
+
245
+ # attend to values
246
+ w_ = w_.permute(0, 2, 1) # bll
247
+ h_ = torch.bmm(v, w_) # bcl
248
+
249
+ # h_ = h_.reshape(b,c,h,w)
250
+ h_ = rearrange(h_, "(b t) c (h w) -> b c t h w", b=b, h=h)
251
+
252
+ h_ = self.proj_out(h_)
253
+
254
+ return x + h_
255
+
256
+ class MultiHeadAttention3D(nn.Module):
257
+ def __init__(self, in_channels, num_heads=8):
258
+ super().__init__()
259
+ self.in_channels = in_channels
260
+ self.num_heads = num_heads
261
+ self.head_dim = in_channels // num_heads
262
+
263
+ assert self.head_dim * num_heads == in_channels, "in_channels must be divisible by num_heads"
264
+
265
+ self.norm = nn.LayerNorm(in_channels)
266
+ self.q_linear = nn.Linear(in_channels, in_channels)
267
+ self.k_linear = nn.Linear(in_channels, in_channels)
268
+ self.v_linear = nn.Linear(in_channels, in_channels)
269
+ self.proj_out = nn.Linear(in_channels, in_channels)
270
+
271
+ def forward(self, x):
272
+ b, c, t, h, w = x.shape
273
+ #print(x.shape)
274
+ # Normalize and reshape input
275
+ h_ = rearrange(x, "b c t h w -> (b t) (h w) c")
276
+ h_ = self.norm(h_)
277
+
278
+ # Linear projections
279
+ q = self.q_linear(h_)
280
+ k = self.k_linear(h_)
281
+ v = self.v_linear(h_)
282
+
283
+ # Reshape to multi-head
284
+ q = rearrange(q, "b l (h d) -> b h l d", h=self.num_heads)
285
+ k = rearrange(k, "b l (h d) -> b h l d", h=self.num_heads)
286
+ v = rearrange(v, "b l (h d) -> b h l d", h=self.num_heads)
287
+
288
+ # Scaled Dot-Product Attention
289
+ scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
290
+ attn = F.softmax(scores, dim=-1)
291
+
292
+ # Apply attention to values
293
+ out = torch.matmul(attn, v)
294
+ out = rearrange(out, "b h l d -> b l (h d)")
295
+
296
+ # Project back to original dimension
297
+ out = self.proj_out(out)
298
+
299
+ # Reshape back to original shape
300
+ out = rearrange(out, "(b t) (h w) c -> b c t h w", b=b, h=h, t=t)
301
+ #print(out.shape)
302
+ return x + out
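SamePadConv3d derives asymmetric "same" padding from kernel_size - stride, so with kernel 3 the stride-2 temporal conv halves T while H and W are preserved. A small sketch with the sizes used elsewhere in this commit:

import torch
conv = SamePadConv3d(1152, 1152, kernel_size=3, stride=(2, 1, 1), padding_type="replicate")
print(conv.pad_input)                  # (1, 1, 1, 1, 1, 0): (W, W, H, H, T_front, T_back)
x = torch.randn(1, 1152, 4, 24, 24)
y = conv(x)                            # torch.Size([1, 1152, 2, 24, 24])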
siglip_encoder.py ADDED
@@ -0,0 +1,154 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from torch import nn
4
+ from typing import Optional, Tuple, Union, Dict
5
+ from PIL import Image
6
+ from functools import partial, reduce
7
+ from transformers import SiglipImageProcessor, SiglipVisionConfig, SiglipVisionModel
8
+
9
+ from .base_encoder import BaseVisionTower
10
+ import torch.distributed as dist
11
+ # --data_path /share/shuyan/video_traindata/anno/\{cinepine_order\}.json \
12
+ # --image_folder /share/shuyan/video_traindata/Bunny-v1_0-data/finetune/images \
13
+ # --video_folder /share/shuyan/video_traindata \
14
+ def rank0_print(*args):
15
+ if dist.is_initialized():
16
+ if dist.get_rank() == 0:
17
+ print(f"Rank {dist.get_rank()}: ", *args)
18
+ else:
19
+ print(*args)
20
+
21
+
22
+ from transformers.image_processing_utils import BatchFeature, get_size_dict
23
+ from transformers.image_transforms import (
24
+ convert_to_rgb,
25
+ normalize,
26
+ rescale,
27
+ resize,
28
+ to_channel_dimension_format,
29
+ )
30
+ from transformers.image_utils import (
31
+ ChannelDimension,
32
+ PILImageResampling,
33
+ to_numpy_array,
34
+ )
35
+ class SigLipImageProcessor:
36
+ def __init__(self, image_mean=(0.5, 0.5, 0.5), image_std=(0.5, 0.5, 0.5), size=(384, 384), crop_size: Dict[str, int] = None, resample=PILImageResampling.BICUBIC, rescale_factor=1 / 255, data_format=ChannelDimension.FIRST):
37
+ crop_size = crop_size if crop_size is not None else {"height": 384, "width": 384}
38
+ crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
39
+
40
+ self.image_mean = image_mean
41
+ self.image_std = image_std
42
+ self.size = size
43
+ self.resample = resample
44
+ self.rescale_factor = rescale_factor
45
+ self.data_format = data_format
46
+ self.crop_size = crop_size
47
+
48
+ def preprocess(self, images, return_tensors):
49
+ if isinstance(images, Image.Image):
50
+ images = [images]
51
+ else:
52
+ # to adapt video data
53
+ images = [to_numpy_array(image) for image in images]
54
+ assert isinstance(images, list)
55
+
56
+ transforms = [
57
+ convert_to_rgb,
58
+ to_numpy_array,
59
+ partial(resize, size=self.size, resample=self.resample, data_format=self.data_format),
60
+ partial(rescale, scale=self.rescale_factor, data_format=self.data_format),
61
+ partial(normalize, mean=self.image_mean, std=self.image_std, data_format=self.data_format),
62
+ partial(to_channel_dimension_format, channel_dim=self.data_format, input_channel_dim=self.data_format),
63
+ ]
64
+
65
+ images = reduce(lambda x, f: [*map(f, x)], transforms, images)
66
+
67
+ data = {"pixel_values": images}
68
+
69
+ return BatchFeature(data=data, tensor_type=return_tensors)
70
+
71
+ class SigLipVisionTower(BaseVisionTower):
72
+ def __init__(self, vision_tower_name, vision_tower_cfg, delay_load=False):
73
+ super(SigLipVisionTower, self).__init__(vision_tower_name, vision_tower_cfg, delay_load)
74
+
75
+ # model_path = "google/siglip-so400m-patch14-384"
76
+ # base_model_name, res, interp = model_path, 384, 576
77
+ # self.vision_tower_name = base_model_name
78
+ self.vision_tower_name, res, interp = vision_tower_name, 384, 576
79
+ self._image_size = res if res is not None else 512
80
+ self.unfreeze_mm_vision_tower = getattr(vision_tower_cfg, "unfreeze_mm_vision_tower", False)
81
+
82
+ if not delay_load:
83
+ rank0_print(f"Loading vision tower: {vision_tower_name}")
84
+ self.load_model()
85
+ elif getattr(vision_tower_cfg, "unfreeze_mm_vision_tower", False):
86
+ # TODO: better detector is needed.
87
+ rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.")
88
+ self.load_model()
89
+ elif hasattr(vision_tower_cfg, "mm_tunable_parts") and "mm_vision_tower" in vision_tower_cfg.mm_tunable_parts:
90
+ rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.")
91
+ self.load_model()
92
+ else:
93
+ self.cfg_only = self.config
94
+
95
+ def load_model(self, device_map=None):
96
+ self.vision_model = "siglip"
97
+ # clip_model, processor = create_model_from_pretrained(self.vision_tower_name)
98
+ print(self.vision_tower_name)
99
+ self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name)
100
+
101
+ # self.vision_tower = clip_model.visual.trunk
102
+ self.vision_tower.output_tokens = True
103
+
104
+ self._hidden_size = self.vision_tower.config.hidden_size
105
+
106
+ self.image_processor = SigLipImageProcessor()
107
+
108
+ del self.vision_tower.vision_model.encoder.layers[-1:]
109
+ self.vision_tower.vision_model.head = nn.Identity()
110
+
111
+ self.vision_tower.requires_grad_(self.unfreeze_mm_vision_tower)
112
+
113
+ self.is_loaded = True
114
+
115
+ def _forward(self, images):
116
+ with torch.set_grad_enabled(self.unfreeze_mm_vision_tower):
117
+ image_features = self.vision_tower.forward(
118
+ images.to(device=self.device, dtype=self.dtype),
119
+ output_hidden_states=True,
120
+ ).hidden_states[-1]
121
+ return image_features
122
+ @property
123
+ def dummy_feature(self):
124
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
125
+
126
+ @property
127
+ def dtype(self):
128
+ for p in self.vision_tower.parameters():
129
+ return p.dtype
130
+
131
+ @property
132
+ def device(self):
133
+ for p in self.vision_tower.parameters():
134
+ return p.device
135
+
136
+ @property
137
+ def hidden_size(self):
138
+ return self.config.hidden_size
139
+
140
+ @property
141
+ def num_patches(self):
142
+ return (336 // 14) ** 2
143
+
144
+ @property
145
+ def num_patches_per_side(self):
146
+ #return self.config.image_size // self.config.patch_size
147
+ return 336//14
148
+ #return 27
149
+ # return self.model_config["vision_cfg"]["image_size"] // self.model_config["vision_cfg"]["patch_size"]
150
+
151
+ @property
152
+ def image_size(self):
153
+ return 384
154
+ #return self.config.image_size
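Usage sketch for the processor and tower above, assuming the google/siglip-so400m-patch14-384 checkpoint referenced in the comments (27x27 patch grid, hidden size 1152); the file name and vision_tower_cfg are hypothetical:

from PIL import Image
processor = SigLipImageProcessor()
pixel_values = processor.preprocess(Image.open("frame.jpg"), return_tensors="pt")["pixel_values"]
# pixel_values: torch.Size([1, 3, 384, 384])
# tower = SigLipVisionTower("google/siglip-so400m-patch14-384", vision_tower_cfg)
# tower._forward(pixel_values)  # penultimate-layer patch tokens, (1, 729, 1152) for this checkpoint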
utils.py ADDED
@@ -0,0 +1,166 @@
1
+ import datetime
2
+ import logging
3
+ import logging.handlers
4
+ import os
5
+ import sys
6
+ import numpy as np
7
+
8
+ import requests
9
+
10
+ from .constants import LOGDIR
11
+
12
+ server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
13
+ moderation_msg = "I am sorry. Your input may violate our content moderation guidelines. Please avoid using harmful or offensive content."
14
+
15
+ handler = None
16
+
17
+ import torch.distributed as dist
18
+
19
+ try:
20
+ import av
21
+ except ImportError:
22
+ print("Please install pyav to use video processing functions.")
23
+
24
+
25
+ def process_video_with_pyav(video_file, data_args):
26
+ container = av.open(video_file)
27
+ stream = container.streams.video[0]
28
+ total_frame_num = stream.frames
29
+ avg_fps = round(stream.average_rate / data_args.video_fps)
30
+ frame_idx = [i for i in range(0, total_frame_num, avg_fps)]
31
+ if data_args.frames_upbound > 0:
32
+ if len(frame_idx) > data_args.frames_upbound:
33
+ uniform_sampled_frames = np.linspace(0, total_frame_num - 1, data_args.frames_upbound, dtype=int)
34
+ frame_idx = uniform_sampled_frames.tolist()
35
+
36
+ video_frames = []
37
+ for index, frame in enumerate(container.decode(video=0)):
38
+ if index in frame_idx:
39
+ video_frames.append(frame.to_rgb().to_ndarray())
40
+ if len(video_frames) == len(frame_idx): # Stop decoding once we have all needed frames
41
+ break
42
+
43
+ video = np.stack(video_frames)
44
+ return video
45
+
46
+
47
+ def rank0_print(*args):
48
+ if dist.is_initialized():
49
+ if dist.get_rank() == 0:
50
+ print(f"Rank {dist.get_rank()}: ", *args)
51
+ else:
52
+ print(*args)
53
+
54
+
55
+ def build_logger(logger_name, logger_filename):
56
+ global handler
57
+
58
+ formatter = logging.Formatter(
59
+ fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
60
+ datefmt="%Y-%m-%d %H:%M:%S",
61
+ )
62
+
63
+ # Set the format of root handlers
64
+ if not logging.getLogger().handlers:
65
+ logging.basicConfig(level=logging.INFO)
66
+ logging.getLogger().handlers[0].setFormatter(formatter)
67
+
68
+ # Redirect stdout and stderr to loggers
69
+ stdout_logger = logging.getLogger("stdout")
70
+ stdout_logger.setLevel(logging.INFO)
71
+ sl = StreamToLogger(stdout_logger, logging.INFO)
72
+ sys.stdout = sl
73
+
74
+ stderr_logger = logging.getLogger("stderr")
75
+ stderr_logger.setLevel(logging.ERROR)
76
+ sl = StreamToLogger(stderr_logger, logging.ERROR)
77
+ sys.stderr = sl
78
+
79
+ # Get logger
80
+ logger = logging.getLogger(logger_name)
81
+ logger.setLevel(logging.INFO)
82
+
83
+ # Add a file handler for all loggers
84
+ if handler is None:
85
+ os.makedirs(LOGDIR, exist_ok=True)
86
+ filename = os.path.join(LOGDIR, logger_filename)
87
+ handler = logging.handlers.TimedRotatingFileHandler(filename, when="D", utc=True)
88
+ handler.setFormatter(formatter)
89
+
90
+ for name, item in logging.root.manager.loggerDict.items():
91
+ if isinstance(item, logging.Logger):
92
+ item.addHandler(handler)
93
+
94
+ return logger
95
+
96
+
97
+ class StreamToLogger(object):
98
+ """
99
+ Fake file-like stream object that redirects writes to a logger instance.
100
+ """
101
+
102
+ def __init__(self, logger, log_level=logging.INFO):
103
+ self.terminal = sys.stdout
104
+ self.logger = logger
105
+ self.log_level = log_level
106
+ self.linebuf = ""
107
+
108
+ def __getattr__(self, attr):
109
+ return getattr(self.terminal, attr)
110
+
111
+ def write(self, buf):
112
+ temp_linebuf = self.linebuf + buf
113
+ self.linebuf = ""
114
+ for line in temp_linebuf.splitlines(True):
115
+ # From the io.TextIOWrapper docs:
116
+ # On output, if newline is None, any '\n' characters written
117
+ # are translated to the system default line separator.
118
+ # By default sys.stdout.write() expects '\n' newlines and then
119
+ # translates them so this is still cross platform.
120
+ if line[-1] == "\n":
121
+ self.logger.log(self.log_level, line.rstrip())
122
+ else:
123
+ self.linebuf += line
124
+
125
+ def flush(self):
126
+ if self.linebuf != "":
127
+ self.logger.log(self.log_level, self.linebuf.rstrip())
128
+ self.linebuf = ""
129
+
130
+
131
+ def disable_torch_init():
132
+ """
133
+ Disable the redundant torch default initialization to accelerate model creation.
134
+ """
135
+ import torch
136
+
137
+ setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
138
+ setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
139
+
140
+
141
+ def violates_moderation(text):
142
+ """
143
+ Check whether the text violates OpenAI moderation API.
144
+ """
145
+ url = "https://api.openai.com/v1/moderations"
146
+ headers = {"Content-Type": "application/json", "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]}
147
+ text = text.replace("\n", "")
148
+ data = "{" + '"input": ' + f'"{text}"' + "}"
149
+ data = data.encode("utf-8")
150
+ try:
151
+ ret = requests.post(url, headers=headers, data=data, timeout=5)
152
+ flagged = ret.json()["results"][0]["flagged"]
153
+ except requests.exceptions.RequestException as e:
154
+ print(f"######################### Moderation Error: {e} #########################")
155
+ flagged = False
156
+ except KeyError as e:
157
+ print(f"######################### Moderation Error: {e} #########################")
158
+ flagged = False
159
+
160
+ return flagged
161
+
162
+
163
+ def pretty_print_semaphore(semaphore):
164
+ if semaphore is None:
165
+ return "None"
166
+ return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})"
utils_encoder.py ADDED
@@ -0,0 +1,296 @@
1
+ import importlib
2
+ import numpy as np
3
+ import cv2, os
4
+ import torch
5
+ import torch.distributed as dist
6
+
7
+
8
+ def count_params(model, verbose=False):
9
+ total_params = sum(p.numel() for p in model.parameters())
10
+ if verbose:
11
+ print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
12
+ return total_params
13
+
14
+
15
+ def check_istarget(name, para_list):
16
+ """
17
+ name: full name of source para
18
+ para_list: partial name of target para
19
+ """
20
+ istarget = False
21
+ for para in para_list:
22
+ if para in name:
23
+ return True
24
+ return istarget
25
+
26
+
27
+ def instantiate_from_config(config):
28
+ if not "target" in config:
29
+ if config == "__is_first_stage__":
30
+ return None
31
+ elif config == "__is_unconditional__":
32
+ return None
33
+ raise KeyError("Expected key `target` to instantiate.")
34
+
35
+ return get_obj_from_str(config["target"])(**config.get("params", dict()))
36
+
37
+
38
+ def get_obj_from_str(string, reload=False):
39
+ module, cls = string.rsplit(".", 1)
40
+ if reload:
41
+ module_imp = importlib.import_module(module)
42
+ importlib.reload(module_imp)
43
+ return getattr(importlib.import_module(module, package=None), cls)
44
+
45
+
46
+ def load_npz_from_dir(data_dir):
47
+ data = [
48
+ np.load(os.path.join(data_dir, data_name))["arr_0"]
49
+ for data_name in os.listdir(data_dir)
50
+ ]
51
+ data = np.concatenate(data, axis=0)
52
+ return data
53
+
54
+
55
+ def load_npz_from_paths(data_paths):
56
+ data = [np.load(data_path)["arr_0"] for data_path in data_paths]
57
+ data = np.concatenate(data, axis=0)
58
+ return data
59
+
60
+
61
+ def resize_numpy_image(image, max_resolution=512 * 512, resize_short_edge=None):
62
+ h, w = image.shape[:2]
63
+ if resize_short_edge is not None:
64
+ k = resize_short_edge / min(h, w)
65
+ else:
66
+ k = max_resolution / (h * w)
67
+ k = k**0.5
68
+ h = int(np.round(h * k / 64)) * 64
69
+ w = int(np.round(w * k / 64)) * 64
70
+ image = cv2.resize(image, (w, h), interpolation=cv2.INTER_LANCZOS4)
71
+ return image
72
+
73
+
74
+ def setup_dist(args):
75
+ if dist.is_initialized():
76
+ return
77
+ torch.cuda.set_device(args.local_rank)
78
+ torch.distributed.init_process_group("nccl", init_method="env://")
79
+
80
+
81
+ # adopted from
82
+ # https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
83
+ # and
84
+ # https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
85
+ # and
86
+ # https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
87
+ #
88
+ # thanks!
89
+
90
+ import torch.nn as nn
91
+ import math
92
+ from inspect import isfunction
93
+ import torch
94
+ from torch import nn
95
+ import torch.distributed as dist
96
+
97
+
98
+ def gather_data(data, return_np=True):
99
+ """gather data from multiple processes to one list"""
100
+ data_list = [torch.zeros_like(data) for _ in range(dist.get_world_size())]
101
+ dist.all_gather(data_list, data) # gather not supported with NCCL
102
+ if return_np:
103
+ data_list = [data.cpu().numpy() for data in data_list]
104
+ return data_list
105
+
106
+
107
+ def autocast(f):
108
+ def do_autocast(*args, **kwargs):
109
+ with torch.cuda.amp.autocast(
110
+ enabled=True,
111
+ dtype=torch.get_autocast_gpu_dtype(),
112
+ cache_enabled=torch.is_autocast_cache_enabled(),
113
+ ):
114
+ return f(*args, **kwargs)
115
+
116
+ return do_autocast
117
+
118
+
119
+ def extract_into_tensor(a, t, x_shape):
120
+ b, *_ = t.shape
121
+ out = a.gather(-1, t)
122
+ return out.reshape(b, *((1,) * (len(x_shape) - 1)))
123
+
124
+
125
+ def noise_like(shape, device, repeat=False):
126
+ repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(
127
+ shape[0], *((1,) * (len(shape) - 1))
128
+ )
129
+ noise = lambda: torch.randn(shape, device=device)
130
+ return repeat_noise() if repeat else noise()
131
+
132
+
133
+ def default(val, d):
134
+ if exists(val):
135
+ return val
136
+ return d() if isfunction(d) else d
137
+
138
+
139
+ def exists(val):
140
+ return val is not None
141
+
142
+
143
+ def identity(*args, **kwargs):
144
+ return nn.Identity()
145
+
146
+
147
+ def uniq(arr):
148
+ return {el: True for el in arr}.keys()
149
+
150
+
151
+ def mean_flat(tensor):
152
+ """
153
+ Take the mean over all non-batch dimensions.
154
+ """
155
+ return tensor.mean(dim=list(range(1, len(tensor.shape))))
156
+
157
+
158
+ def ismap(x):
159
+ if not isinstance(x, torch.Tensor):
160
+ return False
161
+ return (len(x.shape) == 4) and (x.shape[1] > 3)
162
+
163
+
164
+ def isimage(x):
165
+ if not isinstance(x, torch.Tensor):
166
+ return False
167
+ return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1)
168
+
169
+
170
+ def max_neg_value(t):
171
+ return -torch.finfo(t.dtype).max
172
+
173
+
174
+ def shape_to_str(x):
175
+ shape_str = "x".join([str(x) for x in x.shape])
176
+ return shape_str
177
+
178
+
179
+ def init_(tensor):
180
+ dim = tensor.shape[-1]
181
+ std = 1 / math.sqrt(dim)
182
+ tensor.uniform_(-std, std)
183
+ return tensor
184
+
185
+
186
+ # ckpt = torch.utils.checkpoint.checkpoint
187
+
188
+
189
+ # def checkpoint(func, inputs, params, flag):
190
+ # """
191
+ # Evaluate a function without caching intermediate activations, allowing for
192
+ # reduced memory at the expense of extra compute in the backward pass.
193
+ # :param func: the function to evaluate.
194
+ # :param inputs: the argument sequence to pass to `func`.
195
+ # :param params: a sequence of parameters `func` depends on but does not
196
+ # explicitly take as arguments.
197
+ # :param flag: if False, disable gradient checkpointing.
198
+ # """
199
+ # if flag:
200
+ # return ckpt(func, *inputs)
201
+ # else:
202
+ # return func(*inputs)
203
+
204
+
205
+ def disabled_train(self, mode=True):
206
+ """Overwrite model.train with this function to make sure train/eval mode
207
+ does not change anymore."""
208
+ return self
209
+
210
+
211
+ def zero_module(module):
212
+ """
213
+ Zero out the parameters of a module and return it.
214
+ """
215
+ for p in module.parameters():
216
+ p.detach().zero_()
217
+ return module
218
+
219
+
220
+ def scale_module(module, scale):
221
+ """
222
+ Scale the parameters of a module and return it.
223
+ """
224
+ for p in module.parameters():
225
+ p.detach().mul_(scale)
226
+ return module
227
+
228
+
229
+ def conv_nd(dims, *args, **kwargs):
230
+ """
231
+ Create a 1D, 2D, or 3D convolution module.
232
+ """
233
+ if dims == 1:
234
+ return nn.Conv1d(*args, **kwargs)
235
+ elif dims == 2:
236
+ return nn.Conv2d(*args, **kwargs)
237
+ elif dims == 3:
238
+ return nn.Conv3d(*args, **kwargs)
239
+ raise ValueError(f"unsupported dimensions: {dims}")
240
+
241
+
242
+ def linear(*args, **kwargs):
243
+ """
244
+ Create a linear module.
245
+ """
246
+ return nn.Linear(*args, **kwargs)
247
+
248
+
249
+ def avg_pool_nd(dims, *args, **kwargs):
250
+ """
251
+ Create a 1D, 2D, or 3D average pooling module.
252
+ """
253
+ if dims == 1:
254
+ return nn.AvgPool1d(*args, **kwargs)
255
+ elif dims == 2:
256
+ return nn.AvgPool2d(*args, **kwargs)
257
+ elif dims == 3:
258
+ return nn.AvgPool3d(*args, **kwargs)
259
+ raise ValueError(f"unsupported dimensions: {dims}")
260
+
261
+
262
+ def nonlinearity(type="silu"):
263
+ if type == "silu":
264
+ return nn.SiLU()
265
+ elif type == "leaky_relu":
266
+ return nn.LeakyReLU()
267
+
268
+
269
+ class GroupNormSpecific(nn.GroupNorm):
270
+ def forward(self, x):
271
+ if x.dtype == torch.float16 or x.dtype == torch.bfloat16:
272
+ return super().forward(x).type(x.dtype)
273
+ else:
274
+ return super().forward(x.float()).type(x.dtype)
275
+
276
+
277
+ def normalization(channels, num_groups=32):
278
+ """
279
+ Make a standard normalization layer.
280
+ :param channels: number of input channels.
281
+ :return: an nn.Module for normalization.
282
+ """
283
+ return GroupNormSpecific(num_groups, channels)
284
+
285
+
286
+ class HybridConditioner(nn.Module):
287
+
288
+ def __init__(self, c_concat_config, c_crossattn_config):
289
+ super().__init__()
290
+ self.concat_conditioner = instantiate_from_config(c_concat_config)
291
+ self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)
292
+
293
+ def forward(self, c_concat, c_crossattn):
294
+ c_concat = self.concat_conditioner(c_concat)
295
+ c_crossattn = self.crossattn_conditioner(c_crossattn)
296
+ return {"c_concat": [c_concat], "c_crossattn": [c_crossattn]}