from sacred import Experiment
ex = Experiment("VLMo")
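# Illustrative usage (Sacred CLI convention; the entry-point script name is an
# assumption, not taken from this file): any config value below can be
# overridden from the command line, e.g.
#   python run.py with exp_name=vlmo_base num_gpus=8 per_gpu_batchsize=16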
def _loss_names(d):
    ret = {
        "itm": 0,  # image-text matching loss
        "itc": 0,  # image-text contrastive loss
        "caption": 0,  # image captioning loss
        "mvlm": 0,  # masked vision-language modeling loss
        "textmlm": 0,  # text-only masked language modeling
        "imagemlm": 0,  # image-only masked language modeling
        "vqa": 0,  # visual question answering loss
        "nlvr2": 0,  # NLVR2 visual reasoning loss
        "irtr": 0,  # image-text retrieval fine-tuning loss
    }
    ret.update(d)
    return ret
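# Example: _loss_names({"itm": 1, "itc": 1}) returns the dict above with the
# ITM and ITC losses enabled (weight 1) and every other loss disabled (0).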
@ex.config
def config():
    exp_name = "vlmo"
    seed = 1
    datasets = ["coco", "vg", "sbu", "gcc"]  # dataset names; definitions are in vlmo/datamodules/__init__.py  # noqa
    loss_names = _loss_names({"itm": 0, "itc": 0, "mvlm": 0})  # training losses and their weights
    batch_size = 1024  # desired effective batch size; the PL trainer accumulates gradients to reach it
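    # Illustrative arithmetic (assumed accumulation rule, common in PL setups):
    #   grad_accum_steps = batch_size // (per_gpu_batchsize * num_gpus * num_nodes)
    # With the defaults below (4 * 1 * 1), that is 1024 // 4 = 256 steps.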
    # BEiT-v3 setting
    encoder_layers = 12  # number of layers in the backbone
    encoder_embed_dim = 768  # hidden size of the backbone encoder
    out_embed_dim = 768  # hidden size of the output embedding
    beit_version = "base"  # model size: base(0.4B)|large(1B)|huge(10B)
    beit3_vl_layers = 3  # number of layers in the vl_backbone
    deepnorm_init = True  # use DeepNorm-style initialization
    share_layer = False  # whether to share weights across layers within the backbone
    share_attn = False  # whether to share attention weights across layers
    one_attn = False  # whether to share attention weights between vision and language
    # Image setting
    train_transform_keys = ["square_transform_randaug"]  # train transforms: see vlmo/transforms/__init__.py
    val_transform_keys = ["square_transform"]  # test transforms: see vlmo/transforms/__init__.py
    image_size = 224  # input image size
    reclip_image_size = None  # reclip image size
    patch_size = 16  # patch size
    draw_false_image = 0  # number of negative images to draw per sample
    image_only = False  # only input image
    text_only = False  # only input text
    # Video setting; a non-None video_num_frm means video input
    video_num_frm = None
    # Visual tokenizer setting, based on BEiT-v2
    tokenizer_model = "beit2_visual_tokenizer"
    codebook_size = 8192
    codebook_dim = 32
    visual_mask_size = 14
    visual_mask_num = 80
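    # Consistency note: image_size=224 with patch_size=16 gives a 224 / 16 = 14
    # patch grid, matching visual_mask_size; visual_mask_num=80 then masks
    # 80 of the 14 * 14 = 196 patches (~40%).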
    # Text setting
    lang = 'cn'  # language for zero-shot ImageNet testing: cn|en
    vqav2_label_size = 3129
    max_text_len = 40  # maximum text length in tokens (for Chinese BERT, roughly characters)
    max_text_len_of_initckpt = 196
    tokenizer_type = "BertTokenizer"  # tokenizer for Chinese text
    vocab_size = 21128
    tokenizer = "./vocab.txt"
    whole_word_masking = True
    mlm_prob = 0.15  # text mask ratio for MLM
    draw_false_text = 0  # number of negative texts to draw per sample
    mvlm_prob = 0.50  # mask ratio for the vision-language MLM task
    mask_ratio = 0  # FLIP-style mask ratio for image patches
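    # Illustrative: with max_text_len=40 and mlm_prob=0.15, about 0.15 * 40 = 6
    # tokens are masked per sequence; mvlm_prob=0.50 masks roughly half the
    # tokens for the vision-language MLM task.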
    # Caption setting
    cap_onlytext = False  # default: caption from image to text
    # imagemlm setting
    split_data_for_imagemlm = False  # if True, split a batch into two parts and use the first part for imagemlm
    # itc setting
    itc_mask = False  # use masked tokens for ITC
    aggregate_nodes = -1  # number of nodes to aggregate for compute_itc; -1 means all nodes
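    # Note (an assumption from the name, not verified against compute_itc):
    # aggregate_nodes likely bounds how many nodes' features are all-gathered
    # before the ITC similarity matrix is built; -1 gathers from all nodes,
    # maximizing in-batch negatives at the cost of memory.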
    # Transformer setting
    model_arch = "vlmo_base_patch16"
    drop_path_rate = 0.1
    # Downstream setting
    get_recall_metric = False
    get_recall_rerank_metric = False
    get_zeroshot_metric = False
    get_muge_feat = False
    get_f30k_feat = False
    k_test = 32
    # PL Trainer setting
    resume_from = None
    fast_dev_run = False
    val_check_interval = 1.0
    test_only = False
    use_sharded_training = False
    resume_during_training = False
    save_top_k = 10
    every_n_train_steps = 2000  # checkpoint every N training steps
    log_metric_steps = 100  # log metrics every N steps
    # The params below vary with the environment
    use_pcache = False  # data storage backend: pcache or NAS
    pcache_root = ""
    # main_site: pcache://multimodalproxyi-pool.cz50c.alipay.com:39999/mnt/
    # public_cloud: pcache://pcache_public_cloud.pcache.local:39999/mnt/abc7c88079a60b45ddfce7afa40720b7/
    gpu_env = "main_site"  # public_cloud or main_site
    data_root = ""  # data root for the data list
    log_dir = "result"
    per_gpu_batchsize = 4  # set this manually per environment, e.g. with per_gpu_batchsize=#
    num_gpus = 1
    num_nodes = 1
    load_path = ""
    num_workers = 8
    precision = 16
    local_run = True
    flash_attn = False
    deepspeed_config = None  # e.g. "ds_config.json"
    coalesce_backbone = False
    mask_data = "v+l"  # 'v+l': input for the imagemlm+textmlm tasks; 'vl': input for the mvlm task
    communication_benchmark = False
    checkpoint_activations = False
    # dataset setting
    single_cap = True  # whether each image has only one caption
    random_one = False  # whether to randomly pick one caption from the caption list
    # ITC setting
    itc_feats_name = "cls_vlffn_feats"  # features used for the ITC loss
    itc_distill = ""
    itc_distill_dim = 1024
    itc_teacher_weights = ""
    # muP training setting
    mup = False
    base_encoder_embed_dim = 1
    delta_encoder_embed_dim = 2
    mup_encoder_attention_heads = 1
    base_encoder_ffn_embed_dim = 1
    delta_encoder_ffn_embed_dim = 2
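    # Note (assumption, following the muP convention of Yang et al.): the
    # base_*/delta_* values describe base and delta model shapes from which
    # per-parameter width multipliers are derived, so optimizer and init
    # scales transfer when encoder_embed_dim is scaled up.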
    # atorch
    atorch_config = None
    compile_op = False
    optimizer_state_shard_save = False
    model_state_shard_save = False
    # itc loss
    local_loss = False
    use_dual_softmax = False
    num_frames = 1
    # ----------------------- LMM pretraining config -----------------------
    # norm setting
    deepnorm = False
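

# ---------------------------------------------------------------------------
# Hypothetical example (not part of the original file): Sacred named configs
# are the usual way to override the defaults above per task. The task name and
# the values below are illustrative assumptions, not the repo's actual
# settings; they only show the override mechanism.
@ex.named_config
def task_pretrain_itm_itc():
    exp_name = "vlmo_pretrain_itm_itc"
    loss_names = _loss_names({"itm": 1, "itc": 1, "mvlm": 1})
    batch_size = 1024
    max_text_len = 40
# Usage sketch: python run.py with task_pretrain_itm_itc num_gpus=8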