WishArdently committed
Commit 63dc69f · verified · 1 Parent(s): c37dd2c

Upload InternVideo2Stage2VideoEncoder

Files changed (4)
  1. config.json +3 -3
  2. config.py +7 -2
  3. model.py +25 -9
  4. model.safetensors +1 -1
config.json CHANGED
@@ -102,7 +102,7 @@
   "num_frames": 8,
   "only_mask": true,
   "patch_size": 14,
-  "pretrained": "/home/linanxi/InternVideo/checkpoints/InternVideo2-stage2_1b-224p-f4/InternVideo2-stage2_1b-224p-f4.pt",
+  "pretrained": "/home/bingxing2/home/scx7l3k/linanxi/workspace/low_level/Encoders/InternVideo2-stage2_1b-224p-f4.pt",
   "sep_image_video_pos_embed": true,
   "tubelet_size": 1,
   "use_checkpoint": false,
@@ -156,10 +156,10 @@
   "tokenizer": null,
   "torch_dtype": "float16",
   "train_file": "available_corpus[\"pretrain_example_data_1B\"]",
-  "transformers_version": "4.47.0",
+  "transformers_version": "4.42.4",
   "use_bf16": true,
   "use_flash_sdp": false,
-  "use_half_precision": true,
+  "use_half_precision": false,
   "use_mem_efficient_sdp": false,
   "wandb": {
     "enable": false,
config.py CHANGED
@@ -58,7 +58,7 @@ class InternVideo2Config(PretrainedConfig):
         evaluate=False,
         deep_fusion=False,
         evaluation=None,
-        use_half_precision=True,
+        use_half_precision=False,
         use_bf16=True,
         gradient_checkpointing=True,
         use_flash_sdp=False,
@@ -132,7 +132,7 @@ class InternVideo2Config(PretrainedConfig):
             "clip_norm_type": "l2",
             "clip_return_layer": 6,
             "clip_student_return_interval": 1,
-            "pretrained": "/home/linanxi/InternVideo/checkpoints/InternVideo2-stage2_1b-224p-f4/InternVideo2-stage2_1b-224p-f4.pt",
+            "pretrained": "/home/bingxing2/home/scx7l3k/linanxi/workspace/low_level/Encoders/InternVideo2-stage2_1b-224p-f4.pt",
             "use_checkpoint": False,
             "checkpoint_num": 40,
             "use_flash_attn": True,
@@ -233,3 +233,8 @@ class InternVideo2Config(PretrainedConfig):
             "enable": True,
             "stage": 1
         })
+    def set_num_frames(self, num_frames):
+        # keep every copy of num_frames in the config in sync
+        self.num_frames = num_frames
+        self.inputs.video_input.num_frames = num_frames
+        self.model.vision_encoder.num_frames = num_frames
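
Beyond the changed defaults, the only functional addition to config.py is the `set_num_frames` helper, which keeps the top-level `num_frames`, `inputs.video_input.num_frames`, and `model.vision_encoder.num_frames` in sync. A small sketch of how it might be called before building the encoder, assuming the bundled config.py is importable from the repo root:

from config import InternVideo2Config

cfg = InternVideo2Config()
cfg.set_num_frames(4)  # update all three copies of num_frames at once

assert cfg.num_frames == 4
assert cfg.inputs.video_input.num_frames == 4
assert cfg.model.vision_encoder.num_frames == 4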
model.py CHANGED
@@ -3,8 +3,11 @@ from transformers import PretrainedConfig, PreTrainedModel, AutoModel, AutoConfi
 from config import InternVideo2Config as config
 import warnings
 import torch
+# from transformers.utils import logging
 warnings.filterwarnings("ignore")
 
+# logging.set_verbosity_error()
+
 # model_config = config()
 # model = IV2S2(model_config)
 # print(model)
@@ -15,24 +18,37 @@ class InternVideo2Stage2VideoEncoder(PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.config = config
-        self.model = IV2S2(config).half().to(config.device)
+        # build the underlying IV2S2 encoder on the configured device in float16
+        self.model = IV2S2(self.config).to(config.device).to(torch.float16)
 
     def forward(self, x: torch.tensor):
         """forward pass
         Args:
-            x (torch.tensor): Shape (B, N, C, H, W) or (N, C, H, W)
+            x (torch.tensor): Shape (B, N, C, H, W) or (B, C, H, W)
         Returns:
-            torch.tensor: Shape (B*N, hidden_size)
+            torch.tensor: Shape (B, N, hidden_size) or (B, hidden_size)
         """
-        # x: Shape(B, C, N, H, W)
-        # output: Shape(B, N*98, hidden_size)
+        if len(x.shape) == 5 and x.shape[1] > 8:
+            # The pretrained weights only support up to 8 input frames, so longer
+            # clips are encoded in chunks of 8 frames and concatenated along dim 1.
+            T = x.shape[1]
+            embs = torch.cat([self.forward(x[:, i:i+8, :, :, :]) for i in range(0, T, 8)], dim=1)
+            return embs
+
+        image = False
         if len(x.shape) == 4:
-            x = x.unsqueeze(0)
+            x = x.unsqueeze(1)
+            image = True
         B, N, C, H, W = x.shape
-        x = x.permute(0, 2, 1, 3, 4) # Shape(B, C, N, H, W)
+        # x = x.permute(0, 2, 1, 3, 4)  # no longer needed; encode_vision takes (B, N, C, H, W)
         output = self.model.encode_vision(x)
-        pooled_vision_embeds = output[1]
-        return pooled_vision_embeds
+        pooled_vision_embeds = output[1]             # Shape (B, N*256 + 1, hidden_size)
+        output = pooled_vision_embeds[:, :256*N, :]  # Shape (B, N*256, hidden_size)
+        output = output.reshape(B, N, 256, -1)       # Shape (B, N, 256, hidden_size)
+        output = output.mean(dim=2)                  # mean-pool patch tokens -> (B, N, hidden_size)
+        if image:
+            output = output.squeeze(1)
+        return output
 
 if __name__ == "__main__":
     model_config = config()
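
The reworked forward now accepts either `(B, N, C, H, W)` clips or `(B, C, H, W)` images, splits clips longer than 8 frames into chunks of 8 (the limit imposed by the pretrained weights), mean-pools the 256 patch tokens of each frame, and returns `(B, N, hidden_size)` for video or `(B, hidden_size)` for images. A rough usage sketch; the repo id is a placeholder, and it assumes the repo's auto_map routes AutoModel to InternVideo2Stage2VideoEncoder and that `config.device` points at a GPU (the encoder is cast to float16 in `__init__`):

import torch
from transformers import AutoModel

# Placeholder repo id; trust_remote_code loads the custom model.py from the repo.
model = AutoModel.from_pretrained("WishArdently/InternVideo2Stage2VideoEncoder", trust_remote_code=True)
model.eval()

device = model.config.device  # the encoder was moved here and cast to float16 in __init__
video = torch.randn(2, 16, 3, 224, 224, dtype=torch.float16, device=device)  # (B, N, C, H, W), N > 8
image = torch.randn(2, 3, 224, 224, dtype=torch.float16, device=device)      # (B, C, H, W)

with torch.no_grad():
    video_emb = model(video)  # (2, 16, hidden_size): two chunks of 8 frames, concatenated
    image_emb = model(image)  # (2, hidden_size)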
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ad189b4ab3a2e80495bc7c9997d6e7a3408faf3a11c40da99c553cecb52c42a4
+oid sha256:611b74750f429e7d50ee53c0df0d05a524c6b55961a8cff7da57ae8e8cb7fb82
 size 2104856154