# BAAI/Video-XL-2 -- sae.py
import importlib
import math
import os
from inspect import isfunction
from typing import Optional, Any

import cv2
import numpy as np
import torch
import torch.distributed as dist
import torch.nn.functional as F
from torch import nn, einsum
from einops import rearrange, repeat
from transformers.activations import ACT2FN

try:
    import xformers
    import xformers.ops

    XFORMERS_IS_AVAILBLE = True
except ImportError:
    XFORMERS_IS_AVAILBLE = False
def count_params(model, verbose=False):
total_params = sum(p.numel() for p in model.parameters())
if verbose:
print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
return total_params
def check_istarget(name, para_list):
"""
name: full name of source para
para_list: partial name of target para
"""
istarget = False
for para in para_list:
if para in name:
return True
return istarget
def instantiate_from_config(config):
if not "target" in config:
if config == "__is_first_stage__":
return None
elif config == "__is_unconditional__":
return None
raise KeyError("Expected key `target` to instantiate.")
return get_obj_from_str(config["target"])(**config.get("params", dict()))
def get_obj_from_str(string, reload=False):
module, cls = string.rsplit(".", 1)
if reload:
module_imp = importlib.import_module(module)
importlib.reload(module_imp)
return getattr(importlib.import_module(module, package=None), cls)
def load_npz_from_dir(data_dir):
data = [
np.load(os.path.join(data_dir, data_name))["arr_0"]
for data_name in os.listdir(data_dir)
]
data = np.concatenate(data, axis=0)
return data
def load_npz_from_paths(data_paths):
data = [np.load(data_path)["arr_0"] for data_path in data_paths]
data = np.concatenate(data, axis=0)
return data
def resize_numpy_image(image, max_resolution=512 * 512, resize_short_edge=None):
h, w = image.shape[:2]
if resize_short_edge is not None:
k = resize_short_edge / min(h, w)
else:
k = max_resolution / (h * w)
k = k**0.5
h = int(np.round(h * k / 64)) * 64
w = int(np.round(w * k / 64)) * 64
image = cv2.resize(image, (w, h), interpolation=cv2.INTER_LANCZOS4)
return image
def setup_dist(args):
if dist.is_initialized():
return
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group("nccl", init_method="env://")
# adopted from
# https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
# and
# https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
# and
# https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
#
# thanks!
def gather_data(data, return_np=True):
"""gather data from multiple processes to one list"""
data_list = [torch.zeros_like(data) for _ in range(dist.get_world_size())]
dist.all_gather(data_list, data) # gather not supported with NCCL
if return_np:
data_list = [data.cpu().numpy() for data in data_list]
return data_list
def autocast(f):
def do_autocast(*args, **kwargs):
with torch.cuda.amp.autocast(
enabled=True,
dtype=torch.get_autocast_gpu_dtype(),
cache_enabled=torch.is_autocast_cache_enabled(),
):
return f(*args, **kwargs)
return do_autocast
def extract_into_tensor(a, t, x_shape):
b, *_ = t.shape
out = a.gather(-1, t)
return out.reshape(b, *((1,) * (len(x_shape) - 1)))
def noise_like(shape, device, repeat=False):
repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(
shape[0], *((1,) * (len(shape) - 1))
)
noise = lambda: torch.randn(shape, device=device)
return repeat_noise() if repeat else noise()
def default(val, d):
if exists(val):
return val
return d() if isfunction(d) else d
def exists(val):
return val is not None
def identity(*args, **kwargs):
return nn.Identity()
def uniq(arr):
return {el: True for el in arr}.keys()
def mean_flat(tensor):
"""
Take the mean over all non-batch dimensions.
"""
return tensor.mean(dim=list(range(1, len(tensor.shape))))
def ismap(x):
if not isinstance(x, torch.Tensor):
return False
return (len(x.shape) == 4) and (x.shape[1] > 3)
def isimage(x):
if not isinstance(x, torch.Tensor):
return False
return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1)
def max_neg_value(t):
return -torch.finfo(t.dtype).max
def shape_to_str(x):
shape_str = "x".join([str(x) for x in x.shape])
return shape_str
def init_(tensor):
dim = tensor.shape[-1]
std = 1 / math.sqrt(dim)
tensor.uniform_(-std, std)
return tensor
def disabled_train(self, mode=True):
"""Overwrite model.train with this function to make sure train/eval mode
does not change anymore."""
return self
def zero_module(module):
"""
Zero out the parameters of a module and return it.
"""
for p in module.parameters():
p.detach().zero_()
return module
def scale_module(module, scale):
"""
Scale the parameters of a module and return it.
"""
for p in module.parameters():
p.detach().mul_(scale)
return module
def conv_nd(dims, *args, **kwargs):
"""
Create a 1D, 2D, or 3D convolution module.
"""
if dims == 1:
return nn.Conv1d(*args, **kwargs)
elif dims == 2:
return nn.Conv2d(*args, **kwargs)
elif dims == 3:
return nn.Conv3d(*args, **kwargs)
raise ValueError(f"unsupported dimensions: {dims}")
def linear(*args, **kwargs):
"""
Create a linear module.
"""
return nn.Linear(*args, **kwargs)
def avg_pool_nd(dims, *args, **kwargs):
"""
Create a 1D, 2D, or 3D average pooling module.
"""
if dims == 1:
return nn.AvgPool1d(*args, **kwargs)
elif dims == 2:
return nn.AvgPool2d(*args, **kwargs)
elif dims == 3:
return nn.AvgPool3d(*args, **kwargs)
raise ValueError(f"unsupported dimensions: {dims}")
def nonlinearity(type="silu"):
if type == "silu":
return nn.SiLU()
elif type == "leaky_relu":
return nn.LeakyReLU()
class GroupNormSpecific(nn.GroupNorm):
def forward(self, x):
if x.dtype == torch.float16 or x.dtype == torch.bfloat16:
return super().forward(x).type(x.dtype)
else:
return super().forward(x.float()).type(x.dtype)
def normalization(channels, num_groups=32):
"""
Make a standard normalization layer.
:param channels: number of input channels.
:return: an nn.Module for normalization.
"""
return GroupNormSpecific(num_groups, channels)
class HybridConditioner(nn.Module):
def __init__(self, c_concat_config, c_crossattn_config):
super().__init__()
self.concat_conditioner = instantiate_from_config(c_concat_config)
self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)
def forward(self, c_concat, c_crossattn):
c_concat = self.concat_conditioner(c_concat)
c_crossattn = self.crossattn_conditioner(c_crossattn)
return {"c_concat": [c_concat], "c_crossattn": [c_crossattn]}
# feedforward
class GEGLU(nn.Module):
def __init__(self, dim_in, dim_out):
super().__init__()
self.proj = nn.Linear(dim_in, dim_out * 2)
def forward(self, x):
x, gate = self.proj(x).chunk(2, dim=-1)
return x * F.gelu(gate)
class FeedForward(nn.Module):
def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
super().__init__()
inner_dim = int(dim * mult)
dim_out = default(dim_out, dim)
project_in = (
nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU())
if not glu
else GEGLU(dim, inner_dim)
)
self.net = nn.Sequential(
project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
)
def forward(self, x):
return self.net(x)
def Normalize(in_channels, num_groups=32):
return torch.nn.GroupNorm(
num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True
)
class RelativePosition(nn.Module):
"""https://github.com/evelinehong/Transformer_Relative_Position_PyTorch/blob/master/relative_position.py"""
def __init__(self, num_units, max_relative_position):
super().__init__()
self.num_units = num_units
self.max_relative_position = max_relative_position
self.embeddings_table = nn.Parameter(
torch.Tensor(max_relative_position * 2 + 1, num_units)
)
nn.init.xavier_uniform_(self.embeddings_table)
def forward(self, length_q, length_k):
device = self.embeddings_table.device
range_vec_q = torch.arange(length_q, device=device)
range_vec_k = torch.arange(length_k, device=device)
distance_mat = range_vec_k[None, :] - range_vec_q[:, None]
distance_mat_clipped = torch.clamp(
distance_mat, -self.max_relative_position, self.max_relative_position
)
final_mat = distance_mat_clipped + self.max_relative_position
# final_mat = torch.LongTensor(final_mat).to(self.embeddings_table.device)
# final_mat = torch.tensor(final_mat, device=self.embeddings_table.device, dtype=torch.long)
final_mat = final_mat.long()
embeddings = self.embeddings_table[final_mat]
return embeddings
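# Illustrative shape check for RelativePosition (not used by the model itself; the
# sizes below are arbitrary). Each (query, key) offset pair gets a num_units-dim embedding.
def _demo_relative_position():
    rel = RelativePosition(num_units=48, max_relative_position=16)
    emb = rel(length_q=8, length_k=8)
    assert emb.shape == (8, 8, 48)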
class TemporalCrossAttention(nn.Module):
def __init__(
self,
query_dim,
context_dim=None,
heads=8,
dim_head=64,
dropout=0.0,
temporal_length=None, # For relative positional representation and image-video joint training.
image_length=None, # For image-video joint training.
        use_relative_position=False,  # whether to use relative positional representation in temporal attention.
img_video_joint_train=False, # For image-video joint training.
use_tempoal_causal_attn=False,
bidirectional_causal_attn=False,
tempoal_attn_type=None,
joint_train_mode="same_batch",
**kwargs,
):
super().__init__()
inner_dim = dim_head * heads
context_dim = default(context_dim, query_dim)
self.context_dim = context_dim
self.scale = dim_head**-0.5
self.heads = heads
self.temporal_length = temporal_length
self.use_relative_position = use_relative_position
self.img_video_joint_train = img_video_joint_train
self.bidirectional_causal_attn = bidirectional_causal_attn
self.joint_train_mode = joint_train_mode
assert joint_train_mode in ["same_batch", "diff_batch"]
self.tempoal_attn_type = tempoal_attn_type
if bidirectional_causal_attn:
assert use_tempoal_causal_attn
if tempoal_attn_type:
assert tempoal_attn_type in ["sparse_causal", "sparse_causal_first"]
assert not use_tempoal_causal_attn
assert not (
img_video_joint_train and (self.joint_train_mode == "same_batch")
)
self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
assert not (
img_video_joint_train
and (self.joint_train_mode == "same_batch")
and use_tempoal_causal_attn
)
if img_video_joint_train:
if self.joint_train_mode == "same_batch":
mask = torch.ones(
[1, temporal_length + image_length, temporal_length + image_length]
)
# mask[:, image_length:, :] = 0
# mask[:, :, image_length:] = 0
mask[:, temporal_length:, :] = 0
mask[:, :, temporal_length:] = 0
self.mask = mask
else:
self.mask = None
elif use_tempoal_causal_attn:
# normal causal attn
self.mask = torch.tril(torch.ones([1, temporal_length, temporal_length]))
elif tempoal_attn_type == "sparse_causal":
# all frames interact with only the `prev` & self frame
mask1 = torch.tril(
torch.ones([1, temporal_length, temporal_length])
).bool() # true indicates keeping
mask2 = torch.zeros(
[1, temporal_length, temporal_length]
) # initialize to same shape with mask1
mask2[:, 2:temporal_length, : temporal_length - 2] = torch.tril(
torch.ones([1, temporal_length - 2, temporal_length - 2])
)
mask2 = (1 - mask2).bool() # false indicates masking
self.mask = mask1 & mask2
elif tempoal_attn_type == "sparse_causal_first":
# all frames interact with only the `first` & self frame
mask1 = torch.tril(
torch.ones([1, temporal_length, temporal_length])
).bool() # true indicates keeping
mask2 = torch.zeros([1, temporal_length, temporal_length])
mask2[:, 2:temporal_length, 1 : temporal_length - 1] = torch.tril(
torch.ones([1, temporal_length - 2, temporal_length - 2])
)
mask2 = (1 - mask2).bool() # false indicates masking
self.mask = mask1 & mask2
else:
self.mask = None
if use_relative_position:
assert temporal_length is not None
self.relative_position_k = RelativePosition(
num_units=dim_head, max_relative_position=temporal_length
)
self.relative_position_v = RelativePosition(
num_units=dim_head, max_relative_position=temporal_length
)
self.to_out = nn.Sequential(
nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
)
nn.init.constant_(self.to_q.weight, 0)
nn.init.constant_(self.to_k.weight, 0)
nn.init.constant_(self.to_v.weight, 0)
nn.init.constant_(self.to_out[0].weight, 0)
nn.init.constant_(self.to_out[0].bias, 0)
def forward(self, x, context=None, mask=None):
# if context is None:
# print(f'[Temp Attn] x={x.shape},context=None')
# else:
# print(f'[Temp Attn] x={x.shape},context={context.shape}')
nh = self.heads
out = x
q = self.to_q(out)
# if context is not None:
# print(f'temporal context 1 ={context.shape}')
# print(f'x={x.shape}')
context = default(context, x)
# print(f'temporal context 2 ={context.shape}')
k = self.to_k(context)
v = self.to_v(context)
# print(f'q ={q.shape},k={k.shape}')
q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=nh), (q, k, v))
sim = einsum("b i d, b j d -> b i j", q, k) * self.scale
if self.use_relative_position:
len_q, len_k, len_v = q.shape[1], k.shape[1], v.shape[1]
k2 = self.relative_position_k(len_q, len_k)
sim2 = einsum("b t d, t s d -> b t s", q, k2) * self.scale # TODO check
sim += sim2
# print('mask',mask)
if exists(self.mask):
if mask is None:
mask = self.mask.to(sim.device)
else:
mask = self.mask.to(sim.device).bool() & mask # .to(sim.device)
else:
mask = mask
# if self.img_video_joint_train:
# # process mask (make mask same shape with sim)
# c, h, w = mask.shape
# c, t, s = sim.shape
# # assert(h == w and t == s),f"mask={mask.shape}, sim={sim.shape}, h={h}, w={w}, t={t}, s={s}"
# if h > t:
# mask = mask[:, :t, :]
# elif h < t: # pad zeros to mask (no attention) only initial mask =1 area compute weights
# mask_ = torch.zeros([c,t,w]).to(mask.device)
# mask_[:, :h, :] = mask
# mask = mask_
# c, h, w = mask.shape
# if w > s:
# mask = mask[:, :, :s]
# elif w < s: # pad zeros to mask
# mask_ = torch.zeros([c,h,s]).to(mask.device)
# mask_[:, :, :w] = mask
# mask = mask_
# max_neg_value = -torch.finfo(sim.dtype).max
# sim = sim.float().masked_fill(mask == 0, max_neg_value)
        if mask is not None:
            max_neg_value = -1e9
            sim = sim + (1 - mask.float()) * max_neg_value  # mask == 1 keeps, mask == 0 masks out
# print('sim after masking: ', sim)
# if torch.isnan(sim).any() or torch.isinf(sim).any() or (not sim.any()):
# print(f'sim [after masking], isnan={torch.isnan(sim).any()}, isinf={torch.isinf(sim).any()}, allzero={not sim.any()}')
attn = sim.softmax(dim=-1)
# print('attn after softmax: ', attn)
# if torch.isnan(attn).any() or torch.isinf(attn).any() or (not attn.any()):
# print(f'attn [after softmax], isnan={torch.isnan(attn).any()}, isinf={torch.isinf(attn).any()}, allzero={not attn.any()}')
# attn = torch.where(torch.isnan(attn), torch.full_like(attn,0), attn)
# if torch.isinf(attn.detach()).any():
# import pdb;pdb.set_trace()
# if torch.isnan(attn.detach()).any():
# import pdb;pdb.set_trace()
out = einsum("b i j, b j d -> b i d", attn, v)
        if self.bidirectional_causal_attn:
            mask_reverse = torch.triu(
                torch.ones(
                    [1, self.temporal_length, self.temporal_length], device=sim.device
                )
            )
            # use the same large negative constant as above (`max_neg_value` would
            # otherwise resolve to the module-level helper function here)
            sim_reverse = sim.float().masked_fill(mask_reverse == 0, -1e9)
            attn_reverse = sim_reverse.softmax(dim=-1)
            out_reverse = einsum("b i j, b j d -> b i d", attn_reverse, v)
            out += out_reverse
if self.use_relative_position:
v2 = self.relative_position_v(len_q, len_v)
            out2 = einsum("b t s, t s d -> b t d", attn, v2)  # TODO check
            out += out2  # TODO check: add the relative-position term on the per-head tensors first, then merge heads
        out = rearrange(out, "(b h) n d -> b n (h d)", h=nh)  # merge heads
return self.to_out(out)
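# Illustrative shape check for TemporalCrossAttention (hypothetical sizes). Note that
# to_q/to_k/to_v/to_out are zero-initialized above, so a freshly built block returns zeros.
def _demo_temporal_cross_attention():
    attn = TemporalCrossAttention(
        query_dim=320, heads=8, dim_head=40,
        temporal_length=16, use_relative_position=True,
    )
    x = torch.randn(2, 16, 320)  # (batch * h * w, temporal_length, query_dim)
    out = attn(x)
    assert out.shape == (2, 16, 320)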
class SpatialSelfAttention(nn.Module):
def __init__(self, in_channels):
super().__init__()
self.in_channels = in_channels
self.norm = Normalize(in_channels)
self.q = torch.nn.Conv2d(
in_channels, in_channels, kernel_size=1, stride=1, padding=0
)
self.k = torch.nn.Conv2d(
in_channels, in_channels, kernel_size=1, stride=1, padding=0
)
self.v = torch.nn.Conv2d(
in_channels, in_channels, kernel_size=1, stride=1, padding=0
)
self.proj_out = torch.nn.Conv2d(
in_channels, in_channels, kernel_size=1, stride=1, padding=0
)
def forward(self, x):
h_ = x
h_ = self.norm(h_)
q = self.q(h_)
k = self.k(h_)
v = self.v(h_)
# compute attention
b, c, h, w = q.shape
q = rearrange(q, "b c h w -> b (h w) c")
k = rearrange(k, "b c h w -> b c (h w)")
w_ = torch.einsum("bij,bjk->bik", q, k)
w_ = w_ * (int(c) ** (-0.5))
w_ = torch.nn.functional.softmax(w_, dim=2)
# attend to values
v = rearrange(v, "b c h w -> b c (h w)")
w_ = rearrange(w_, "b i j -> b j i")
h_ = torch.einsum("bij,bjk->bik", v, w_)
h_ = rearrange(h_, "b c (h w) -> b c h w", h=h)
h_ = self.proj_out(h_)
return x + h_
class CrossAttention(nn.Module):
def __init__(
self,
query_dim,
context_dim=None,
heads=8,
dim_head=64,
dropout=0.0,
sa_shared_kv=False,
shared_type="only_first",
**kwargs,
):
super().__init__()
inner_dim = dim_head * heads
context_dim = default(context_dim, query_dim)
self.sa_shared_kv = sa_shared_kv
assert shared_type in [
"only_first",
"all_frames",
"first_and_prev",
"only_prev",
"full",
"causal",
"full_qkv",
]
self.shared_type = shared_type
self.scale = dim_head**-0.5
self.heads = heads
self.dim_head = dim_head
self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
self.to_out = nn.Sequential(
nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
)
self.attention_op: Optional[Any] = None
def forward(self, x, context=None, mask=None):
h = self.heads
b = x.shape[0]
q = self.to_q(x)
context = default(context, x)
k = self.to_k(context)
v = self.to_v(context)
if self.sa_shared_kv:
if self.shared_type == "only_first":
k, v = map(
lambda xx: rearrange(xx[0].unsqueeze(0), "b n c -> (b n) c")
.unsqueeze(0)
.repeat(b, 1, 1),
(k, v),
)
else:
raise NotImplementedError
q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=h), (q, k, v))
sim = einsum("b i d, b j d -> b i j", q, k) * self.scale
if exists(mask):
mask = rearrange(mask, "b ... -> b (...)")
max_neg_value = -torch.finfo(sim.dtype).max
mask = repeat(mask, "b j -> (b h) () j", h=h)
sim.masked_fill_(~mask, max_neg_value)
# attention, what we cannot get enough of
attn = sim.softmax(dim=-1)
out = einsum("b i j, b j d -> b i d", attn, v)
out = rearrange(out, "(b h) n d -> b n (h d)", h=h)
return self.to_out(out)
def efficient_forward(self, x, context=None, mask=None):
q = self.to_q(x)
context = default(context, x)
k = self.to_k(context)
v = self.to_v(context)
b, _, _ = q.shape
q, k, v = map(
lambda t: t.unsqueeze(3)
.reshape(b, t.shape[1], self.heads, self.dim_head)
.permute(0, 2, 1, 3)
.reshape(b * self.heads, t.shape[1], self.dim_head)
.contiguous(),
(q, k, v),
)
# actually compute the attention, what we cannot get enough of
out = xformers.ops.memory_efficient_attention(
q, k, v, attn_bias=None, op=self.attention_op
)
if exists(mask):
raise NotImplementedError
out = (
out.unsqueeze(0)
.reshape(b, self.heads, out.shape[1], self.dim_head)
.permute(0, 2, 1, 3)
.reshape(b, out.shape[1], self.heads * self.dim_head)
)
return self.to_out(out)
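# Illustrative shape check for CrossAttention with an external context (hypothetical
# sizes resembling a text-conditioned setup; not tied to any particular checkpoint).
def _demo_cross_attention():
    attn = CrossAttention(query_dim=320, context_dim=768, heads=8, dim_head=40)
    x = torch.randn(2, 64, 320)        # queries: (batch, query tokens, query_dim)
    context = torch.randn(2, 77, 768)  # context: (batch, context tokens, context_dim)
    out = attn(x, context=context)
    assert out.shape == (2, 64, 320)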
class VideoSpatialCrossAttention(CrossAttention):
def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0):
super().__init__(query_dim, context_dim, heads, dim_head, dropout)
    def forward(self, x, context=None, mask=None):
        b, c, t, h, w = x.shape
        if context is not None:
            context = context.repeat(t, 1, 1)
        x = spatial_attn_reshape(x)  # (b t) (h w) c
        x = super().forward(x, context=context) + x  # residual add in the reshaped layout
        return spatial_attn_reshape_back(x, b, h)
def spatial_attn_reshape(x):
return rearrange(x, "b c t h w -> (b t) (h w) c")
def spatial_attn_reshape_back(x, b, h):
return rearrange(x, "(b t) (h w) c -> b c t h w", b=b, h=h)
def temporal_attn_reshape(x):
return rearrange(x, "b c t h w -> (b h w) t c")
def temporal_attn_reshape_back(x, b, h, w):
return rearrange(x, "(b h w) t c -> b c t h w", b=b, h=h, w=w)
def local_spatial_temporal_attn_reshape(x, window_size):
B, C, T, H, W = x.shape
NH = H // window_size
NW = W // window_size
# x = x.view(B, C, T, NH, window_size, NW, window_size)
# tokens = x.permute(0, 1, 2, 3, 5, 4, 6).contiguous()
# tokens = tokens.view(-1, window_size, window_size, C)
x = rearrange(
x,
"b c t (nh wh) (nw ww) -> b c t nh wh nw ww",
nh=NH,
nw=NW,
wh=window_size,
ww=window_size,
).contiguous() # # B, C, T, NH, NW, window_size, window_size
x = rearrange(
x, "b c t nh wh nw ww -> (b nh nw) (t wh ww) c"
) # (B, NH, NW) (T, window_size, window_size) C
return x
def local_spatial_temporal_attn_reshape_back(x, window_size, b, h, w, t):
B, L, C = x.shape
NH = h // window_size
NW = w // window_size
x = rearrange(
x,
"(b nh nw) (t wh ww) c -> b c t nh wh nw ww",
b=b,
nh=NH,
nw=NW,
t=t,
wh=window_size,
ww=window_size,
)
x = rearrange(x, "b c t nh wh nw ww -> b c t (nh wh) (nw ww)")
return x
class SpatialTemporalTransformer(nn.Module):
"""
Transformer block for video-like data (5D tensor).
First, project the input (aka embedding) with NO reshape.
Then apply standard transformer action.
The 5D -> 3D reshape operation will be done in the specific attention module.
"""
def __init__(
self,
in_channels,
n_heads,
d_head,
depth=1,
dropout=0.0,
context_dim=None,
# Temporal stuff
temporal_length=None,
image_length=None,
use_relative_position=True,
img_video_joint_train=False,
cross_attn_on_tempoal=False,
temporal_crossattn_type="selfattn",
order="stst",
temporalcrossfirst=False,
split_stcontext=False,
temporal_context_dim=None,
**kwargs,
):
super().__init__()
self.in_channels = in_channels
inner_dim = n_heads * d_head
self.norm = Normalize(in_channels)
self.proj_in = nn.Conv3d(
in_channels, inner_dim, kernel_size=1, stride=1, padding=0
)
        # NOTE: BasicTransformerBlockST is referenced here but not defined in this file;
        # it is expected to be provided by the surrounding codebase.
        self.transformer_blocks = nn.ModuleList(
            [
BasicTransformerBlockST(
inner_dim,
n_heads,
d_head,
dropout=dropout,
# cross attn
context_dim=context_dim,
# temporal attn
temporal_length=temporal_length,
image_length=image_length,
use_relative_position=use_relative_position,
img_video_joint_train=img_video_joint_train,
temporal_crossattn_type=temporal_crossattn_type,
order=order,
temporalcrossfirst=temporalcrossfirst,
split_stcontext=split_stcontext,
temporal_context_dim=temporal_context_dim,
**kwargs,
)
for d in range(depth)
]
)
self.proj_out = zero_module(
nn.Conv3d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
)
def forward(self, x, context=None, temporal_context=None, **kwargs):
# note: if no context is given, cross-attention defaults to self-attention
assert x.dim() == 5, f"x shape = {x.shape}"
b, c, t, h, w = x.shape
x_in = x
x = self.norm(x)
x = self.proj_in(x)
for block in self.transformer_blocks:
x = block(x, context=context, temporal_context=temporal_context, **kwargs)
x = self.proj_out(x)
return x + x_in
class STAttentionBlock2(nn.Module):
def __init__(
self,
channels,
num_heads=1,
num_head_channels=-1,
use_checkpoint=False, # not used, only used in ResBlock
use_new_attention_order=False, # QKVAttention or QKVAttentionLegacy
temporal_length=16, # used in relative positional representation.
image_length=8, # used for image-video joint training.
        use_relative_position=False,  # whether to use relative positional representation in temporal attention.
img_video_joint_train=False,
# norm_type="groupnorm",
attn_norm_type="group",
use_tempoal_causal_attn=False,
):
"""
version 1: guided_diffusion implemented version
version 2: remove args input argument
"""
super().__init__()
if num_head_channels == -1:
self.num_heads = num_heads
else:
assert (
channels % num_head_channels == 0
), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
self.num_heads = channels // num_head_channels
self.use_checkpoint = use_checkpoint
self.temporal_length = temporal_length
self.image_length = image_length
self.use_relative_position = use_relative_position
self.img_video_joint_train = img_video_joint_train
self.attn_norm_type = attn_norm_type
assert self.attn_norm_type in ["group", "no_norm"]
self.use_tempoal_causal_attn = use_tempoal_causal_attn
if self.attn_norm_type == "group":
self.norm_s = normalization(channels)
self.norm_t = normalization(channels)
self.qkv_s = conv_nd(1, channels, channels * 3, 1)
self.qkv_t = conv_nd(1, channels, channels * 3, 1)
if self.img_video_joint_train:
mask = torch.ones(
[1, temporal_length + image_length, temporal_length + image_length]
)
mask[:, temporal_length:, :] = 0
mask[:, :, temporal_length:] = 0
self.register_buffer("mask", mask)
else:
self.mask = None
if use_new_attention_order:
# split qkv before split heads
self.attention_s = QKVAttention(self.num_heads)
self.attention_t = QKVAttention(self.num_heads)
else:
# split heads before split qkv
self.attention_s = QKVAttentionLegacy(self.num_heads)
self.attention_t = QKVAttentionLegacy(self.num_heads)
if use_relative_position:
self.relative_position_k = RelativePosition(
num_units=channels // self.num_heads,
max_relative_position=temporal_length,
)
self.relative_position_v = RelativePosition(
num_units=channels // self.num_heads,
max_relative_position=temporal_length,
)
self.proj_out_s = zero_module(
conv_nd(1, channels, channels, 1)
) # conv_dim, in_channels, out_channels, kernel_size
self.proj_out_t = zero_module(
conv_nd(1, channels, channels, 1)
) # conv_dim, in_channels, out_channels, kernel_size
def forward(self, x, mask=None):
b, c, t, h, w = x.shape
# spatial
out = rearrange(x, "b c t h w -> (b t) c (h w)")
if self.attn_norm_type == "no_norm":
qkv = self.qkv_s(out)
else:
qkv = self.qkv_s(self.norm_s(out))
out = self.attention_s(qkv)
out = self.proj_out_s(out)
out = rearrange(out, "(b t) c (h w) -> b c t h w", b=b, h=h)
        x = x + out  # avoid in-place addition on the input tensor
# temporal
out = rearrange(x, "b c t h w -> (b h w) c t")
if self.attn_norm_type == "no_norm":
qkv = self.qkv_t(out)
else:
qkv = self.qkv_t(self.norm_t(out))
# relative positional embedding
if self.use_relative_position:
len_q = qkv.size()[-1]
len_k, len_v = len_q, len_q
k_rp = self.relative_position_k(len_q, len_k)
v_rp = self.relative_position_v(len_q, len_v) # [T,T,head_dim]
out = self.attention_t(
qkv,
rp=(k_rp, v_rp),
mask=self.mask,
use_tempoal_causal_attn=self.use_tempoal_causal_attn,
)
else:
out = self.attention_t(
qkv,
rp=None,
mask=self.mask,
use_tempoal_causal_attn=self.use_tempoal_causal_attn,
)
out = self.proj_out_t(out)
out = rearrange(out, "(b h w) c t -> b c t h w", b=b, h=h, w=w)
return x + out
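# `QKVAttentionLegacy.count_flops` below calls `count_flops_attn`, which is not defined
# in this file. The helper below follows the guided-diffusion implementation referenced
# in the attribution comments above; it is only needed when profiling with `thop`.
def count_flops_attn(model, _x, y):
    """
    A counter for the `thop` package to count the operations in an attention operation.
    """
    b, c, *spatial = y[0].shape
    num_spatial = int(np.prod(spatial))
    # Two matmuls with the same cost: one for the attention weights,
    # one for combining the value vectors.
    matmul_ops = 2 * b * (num_spatial**2) * c
    model.total_ops += torch.DoubleTensor([matmul_ops])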
class QKVAttentionLegacy(nn.Module):
"""
    A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping.
"""
def __init__(self, n_heads):
super().__init__()
self.n_heads = n_heads
def forward(self, qkv, rp=None, mask=None):
"""
Apply QKV attention.
:param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
:return: an [N x (H * C) x T] tensor after attention.
"""
if rp is not None or mask is not None:
raise NotImplementedError
bs, width, length = qkv.shape
assert width % (3 * self.n_heads) == 0
ch = width // (3 * self.n_heads)
q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
scale = 1 / math.sqrt(math.sqrt(ch))
weight = torch.einsum(
"bct,bcs->bts", q * scale, k * scale
) # More stable with f16 than dividing afterwards
weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
a = torch.einsum("bts,bcs->bct", weight, v)
return a.reshape(bs, -1, length)
@staticmethod
def count_flops(model, _x, y):
return count_flops_attn(model, _x, y)
class QKVAttention(nn.Module):
"""
A module which performs QKV attention and splits in a different order.
"""
def __init__(self, n_heads):
super().__init__()
self.n_heads = n_heads
def forward(self, qkv, rp=None, mask=None, use_tempoal_causal_attn=False):
"""
Apply QKV attention.
:param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
:return: an [N x (H * C) x T] tensor after attention.
"""
bs, width, length = qkv.shape
assert width % (3 * self.n_heads) == 0
ch = width // (3 * self.n_heads)
        qkv = qkv.contiguous()
        q, k, v = qkv.chunk(3, dim=1)
scale = 1 / math.sqrt(math.sqrt(ch))
# print('bs, self.n_heads, ch, length', bs, self.n_heads, ch, length)
weight = torch.einsum(
"bct,bcs->bts",
(q * scale).view(bs * self.n_heads, ch, length),
(k * scale).view(bs * self.n_heads, ch, length),
) # More stable with f16 than dividing afterwards
# weight:[b,t,s] b=bs*n_heads*T
if rp is not None:
k_rp, v_rp = rp # [length, length, head_dim] [8, 8, 48]
weight2 = torch.einsum(
"bct,tsc->bst", (q * scale).view(bs * self.n_heads, ch, length), k_rp
)
weight += weight2
        if use_tempoal_causal_attn:
            # weight = torch.tril(weight)
            assert mask is None, "Not implemented for merging two masks!"
            mask = torch.tril(torch.ones(weight.shape, device=weight.device))
else:
if mask is not None: # only keep upper-left matrix
# process mask
c, t, _ = weight.shape
if mask.shape[-1] > t:
mask = mask[:, :t, :t]
elif mask.shape[-1] < t: # pad ones
mask_ = torch.zeros([c, t, t]).to(mask.device)
t_ = mask.shape[-1]
mask_[:, :t_, :t_] = mask
mask = mask_
else:
assert (
weight.shape[-1] == mask.shape[-1]
), f"weight={weight.shape}, mask={mask.shape}"
if mask is not None:
INF = -1e8 # float('-inf')
weight = weight.float().masked_fill(mask == 0, INF)
weight = F.softmax(weight.float(), dim=-1).type(
weight.dtype
) # [256, 8, 8] [b, t, t] b=bs*n_heads*h*w,t=nframes
# weight = F.softmax(weight, dim=-1)#[256, 8, 8] [b, t, t] b=bs*n_heads*h*w,t=nframes
a = torch.einsum(
"bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length)
) # [256, 48, 8] [b, head_dim, t]
if rp is not None:
a2 = torch.einsum("bts,tsc->btc", weight, v_rp).transpose(1, 2) # btc->bct
a += a2
return a.reshape(bs, -1, length)
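# Illustrative shape check for QKVAttention (arbitrary sizes): the input packs Q, K and V
# along the channel dimension and the output has the heads merged back together.
def _demo_qkv_attention():
    n_heads, ch, t = 4, 16, 8
    attn = QKVAttention(n_heads)
    qkv = torch.randn(2, 3 * n_heads * ch, t)  # [N, 3 * H * C, T]
    out = attn(qkv)
    assert out.shape == (2, n_heads * ch, t)   # [N, H * C, T]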
def silu(x):
# swish
return x * torch.sigmoid(x)
class SiLU(nn.Module):
def __init__(self):
super(SiLU, self).__init__()
def forward(self, x):
return silu(x)
def Normalize(in_channels, norm_type="group"):
    # Overrides the earlier Normalize defined above; for the default norm_type the
    # behaviour (GroupNorm with 32 groups, eps=1e-6) is identical.
    assert norm_type in ["group", "batch", "layer"]
    if norm_type == "group":
        return torch.nn.GroupNorm(
            num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
        )
    elif norm_type == "batch":
        return torch.nn.SyncBatchNorm(in_channels)
    elif norm_type == "layer":
        return nn.LayerNorm(in_channels)
class SamePadConv3d(nn.Module):
def __init__(
self,
in_channels,
out_channels,
kernel_size,
stride=1,
bias=True,
padding_type="replicate",
):
super().__init__()
if isinstance(kernel_size, int):
kernel_size = (kernel_size,) * 3
if isinstance(stride, int):
stride = (stride,) * 3
# assumes that the input shape is divisible by stride
total_pad = tuple([k - s for k, s in zip(kernel_size, stride)])
pad_input = []
for p in total_pad[::-1]: # reverse since F.pad starts from last dim
pad_input.append((p // 2 + p % 2, p // 2))
pad_input = sum(pad_input, tuple())
self.pad_input = pad_input
self.padding_type = padding_type
self.conv = nn.Conv3d(
in_channels, out_channels, kernel_size, stride=stride, padding=0, bias=bias
)
    def forward(self, x):
        orig_dtype = x.dtype
        x = x.float()
        # apply the "same" padding computed in __init__
        x_padded = F.pad(x, self.pad_input, mode=self.padding_type)
        # cast back to the original dtype (e.g. BFloat16) if needed
        x_padded = x_padded.to(orig_dtype)
        return self.conv(x_padded)
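# Illustrative shape check for SamePadConv3d (arbitrary sizes): with "same" padding the
# output temporal/spatial sizes are the input sizes divided by the corresponding stride.
def _demo_same_pad_conv3d():
    conv = SamePadConv3d(8, 8, kernel_size=3, stride=(2, 1, 1))
    x = torch.randn(1, 8, 4, 16, 16)     # (B, C, T, H, W)
    y = conv(x)
    assert y.shape == (1, 8, 2, 16, 16)  # T halved, H/W preserved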
class TemporalAttention(nn.Module):
def __init__(
self,
channels,
num_heads=1,
num_head_channels=-1,
max_temporal_length=64,
):
"""
a clean multi-head temporal attention
"""
super().__init__()
if num_head_channels == -1:
self.num_heads = num_heads
else:
assert (
channels % num_head_channels == 0
), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
self.num_heads = channels // num_head_channels
self.norm = Normalize(channels)
self.qkv = zero_module(conv_nd(1, channels, channels * 3, 1))
self.attention = QKVAttention(self.num_heads)
self.relative_position_k = RelativePosition(
num_units=channels // self.num_heads,
max_relative_position=max_temporal_length,
)
self.relative_position_v = RelativePosition(
num_units=channels // self.num_heads,
max_relative_position=max_temporal_length,
)
self.proj_out = zero_module(
conv_nd(1, channels, channels, 1)
) # conv_dim, in_channels, out_channels, kernel_size
def forward(self, x, mask=None):
b, c, t, h, w = x.shape
out = rearrange(x, "b c t h w -> (b h w) c t")
# torch.Size([4608, 1152, 2])1
# torch.Size([4608, 3456, 2])2
# torch.Size([4608, 1152, 2])3
# torch.Size([4608, 1152, 2])4
#print(out.shape,end='1\n')
qkv = self.qkv(self.norm(out))
#print(qkv.shape,end='2\n')
len_q = qkv.size()[-1]
len_k, len_v = len_q, len_q
k_rp = self.relative_position_k(len_q, len_k)
v_rp = self.relative_position_v(len_q, len_v) # [T,T,head_dim]
out = self.attention(qkv, rp=(k_rp, v_rp))
#print(out.shape,end='3\n')
out = self.proj_out(out)
#print(out.shape,end='4\n')
out = rearrange(out, "(b h w) c t -> b c t h w", b=b, h=h, w=w)
return x + out
class TemporalAttention_lin(nn.Module):
def __init__(
self,
channels,
num_heads=8,
num_head_channels=-1,
max_temporal_length=64,
):
"""
a clean multi-head temporal attention
"""
super().__init__()
if num_head_channels == -1:
self.num_heads = num_heads
else:
assert (
channels % num_head_channels == 0
), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
self.num_heads = channels // num_head_channels
self.norm = nn.LayerNorm(channels)
#self.norm = Normalize(channels)
#self.qkv = zero_module(conv_nd(1, channels, channels * 3, 1))
self.qkv = nn.Linear(channels, channels * 3)
self.attention = QKVAttention(self.num_heads)
self.relative_position_k = RelativePosition(
num_units=channels // self.num_heads,
max_relative_position=max_temporal_length,
)
self.relative_position_v = RelativePosition(
num_units=channels // self.num_heads,
max_relative_position=max_temporal_length,
)
self.proj_out = nn.Linear(channels, channels)
def forward(self, x, mask=None):
b, c, t, h, w = x.shape
out = rearrange(x, "b c t h w -> (b h w) t c")
# torch.Size([4608, 1152, 2])1
# torch.Size([4608, 3456, 2])2
# torch.Size([4608, 1152, 2])3
# torch.Size([4608, 1152, 2])4
#print(out.shape,end='1\n')
qkv = self.qkv(self.norm(out)).transpose(-1, -2)
#print(qkv.shape,end='2\n')
len_q = qkv.size()[-1]
len_k, len_v = len_q, len_q
k_rp = self.relative_position_k(len_q, len_k)
v_rp = self.relative_position_v(len_q, len_v) # [T,T,head_dim]
out = self.attention(qkv, rp=(k_rp, v_rp))
out = self.proj_out(out.transpose(-1, -2)).transpose(-1, -2)
#print(out.shape,end='4\n')
out = rearrange(out, "(b h w) c t -> b c t h w", b=b, h=h, w=w)
return x + out
class AttnBlock3D(nn.Module):
def __init__(self, in_channels):
super().__init__()
self.in_channels = in_channels
self.norm = Normalize(in_channels)
self.q = torch.nn.Conv3d(
in_channels, in_channels, kernel_size=1, stride=1, padding=0
)
self.k = torch.nn.Conv3d(
in_channels, in_channels, kernel_size=1, stride=1, padding=0
)
self.v = torch.nn.Conv3d(
in_channels, in_channels, kernel_size=1, stride=1, padding=0
)
self.proj_out = torch.nn.Conv3d(
in_channels, in_channels, kernel_size=1, stride=1, padding=0
)
def forward(self, x):
h_ = x
# self.norm.to(x.device)
# self.norm.to(x.dtype)
h_ = self.norm(h_)
q = self.q(h_)
k = self.k(h_)
v = self.v(h_)
b, c, t, h, w = q.shape
# q = q.reshape(b,c,h*w) # bcl
# q = q.permute(0,2,1) # bcl -> blc l=hw
# k = k.reshape(b,c,h*w) # bcl
q = rearrange(q, "b c t h w -> (b t) (h w) c") # blc
k = rearrange(k, "b c t h w -> (b t) c (h w)") # bcl
w_ = torch.bmm(q, k) # b,l,l
w_ = w_ * (int(c) ** (-0.5))
w_ = torch.nn.functional.softmax(w_, dim=2)
# v = v.reshape(b,c,h*w)
v = rearrange(v, "b c t h w -> (b t) c (h w)") # bcl
# attend to values
w_ = w_.permute(0, 2, 1) # bll
h_ = torch.bmm(v, w_) # bcl
# h_ = h_.reshape(b,c,h,w)
h_ = rearrange(h_, "(b t) c (h w) -> b c t h w", b=b, h=h)
h_ = self.proj_out(h_)
return x + h_
class MultiHeadAttention3D(nn.Module):
def __init__(self, in_channels, num_heads=8):
super().__init__()
self.in_channels = in_channels
self.num_heads = num_heads
self.head_dim = in_channels // num_heads
assert self.head_dim * num_heads == in_channels, "in_channels must be divisible by num_heads"
self.norm = nn.LayerNorm(in_channels)
self.q_linear = nn.Linear(in_channels, in_channels)
self.k_linear = nn.Linear(in_channels, in_channels)
self.v_linear = nn.Linear(in_channels, in_channels)
self.proj_out = nn.Linear(in_channels, in_channels)
def forward(self, x):
b, c, t, h, w = x.shape
#print(x.shape)
# Normalize and reshape input
h_ = rearrange(x, "b c t h w -> (b t) (h w) c")
h_ = self.norm(h_)
# Linear projections
q = self.q_linear(h_)
k = self.k_linear(h_)
v = self.v_linear(h_)
# Reshape to multi-head
q = rearrange(q, "b l (h d) -> b h l d", h=self.num_heads)
k = rearrange(k, "b l (h d) -> b h l d", h=self.num_heads)
v = rearrange(v, "b l (h d) -> b h l d", h=self.num_heads)
# Scaled Dot-Product Attention
scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
attn = F.softmax(scores, dim=-1)
# Apply attention to values
out = torch.matmul(attn, v)
out = rearrange(out, "b h l d -> b l (h d)")
# Project back to original dimension
out = self.proj_out(out)
# Reshape back to original shape
out = rearrange(out, "(b t) (h w) c -> b c t h w", b=b, h=h, t=t)
#print(out.shape)
return x + out
class SiglipAE(nn.Module):
    def __init__(self):
        super().__init__()
        temporal_stride = 2
        norm_type = "group"
        self.temporal_encoding = nn.Parameter(torch.randn((4, 1152)))
        # self.vision_tower = SigLipVisionTower('google/siglip-so400m-patch14-384')
        self.encoder = nn.Sequential(
            AttnBlock3D(1152),
            TemporalAttention(1152),
            SamePadConv3d(1152, 1152, kernel_size=3, stride=(temporal_stride, 1, 1), padding_type="replicate"),
            AttnBlock3D(1152),
            TemporalAttention(1152),
            SamePadConv3d(1152, 1152, kernel_size=3, stride=(temporal_stride, 1, 1), padding_type="replicate"),
        )

    def forward(self, x):
        b_, c_, t_, h_, w_ = x.shape
        temporal_encoding = self.temporal_encoding.unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
        temporal_encoding = temporal_encoding.expand(b_, -1, -1, h_, w_)  # (B, T, C, H, W)
        temporal_encoding = temporal_encoding.permute(0, 2, 1, 3, 4)  # (B, C, T, H, W)
        x = x + temporal_encoding
        x = self.encoder(x)
        return x
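if __name__ == "__main__":
    # Minimal smoke test (illustrative only): the encoder expects SigLIP-like features of
    # shape (B, 1152, T, H, W) with T = 4 to match the learned temporal encoding; the two
    # stride-2 temporal convolutions then compress T from 4 down to 1.
    model = SiglipAE()
    x = torch.randn(1, 1152, 4, 8, 8)
    with torch.no_grad():
        y = model(x)
    print(y.shape)  # expected: torch.Size([1, 1152, 1, 8, 8])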