from sacred import Experiment

ex = Experiment("VLMo")


def _loss_names(d):
    ret = {
        "itm": 0,  # image-text matching loss
        "itc": 0,  # image-text contrastive loss
        "caption": 0,  # image captioning loss
        "mvlm": 0,  # masked vision-language modeling loss
        "textmlm": 0,  # text-only masked language modeling
        "imagemlm": 0,  # image-only masked language modeling
        "vqa": 0,  # visual question answering loss
        "nlvr2": 0,  # NLVR2 visual reasoning loss
        "irtr": 0,  # image-text retrieval fine-tuning
    }
    ret.update(d)
    return ret
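
# For example (illustrative only): _loss_names({"itm": 1, "itc": 1}) returns the
# full dictionary with "itm" and "itc" set to 1 and every other loss left at 0,
# so a task config only needs to name the losses it enables.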


@ex.config
def config():
    exp_name = "vlmo"
    seed = 1
    datasets = ["coco", "vg", "sbu", "gcc"]  # dataset names; definitions can be found in vlmo/datamodules/__init__.py  # noqa
    loss_names = _loss_names({"itm": 0, "itc": 0, "mvlm": 0})  # training loss
    batch_size = 1024  # the desired effective batch size; the PL trainer accumulates gradients to reach it.
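    # Illustrative note (an assumption about how the trainer consumes these
    # values): the number of gradient-accumulation steps is roughly
    #   batch_size // (per_gpu_batchsize * num_gpus * num_nodes)
    # using the per-GPU and node settings defined further below.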

    # BEiT-v3 setting
    encoder_layers = 12  # number of layers in the backbone
    encoder_embed_dim = 768  # hidden size of the backbone encoder
    out_embed_dim = 768  # hidden size of the output embedding
    beit_version = "base"  # model size: base(0.4B)|large(1B)|huge(10B)
    beit3_vl_layers = 3  # number of layers in the vl_backbone
    deepnorm_init = True  # init method
    share_layer = False  # whether to share weights across layers within the backbone
    share_attn = False  # whether to share attention weights across different layers
    one_attn = False  # whether to share attention weights between vision and language

    # Image setting
    train_transform_keys = ["square_transform_randaug"]  # train transform: refer to vlmo/transforms/__init__.py
    val_transform_keys = ["square_transform"]  # test transform: refer to vlmo/transforms/__init__.py
    image_size = 224  # image size
    reclip_image_size = None  # reclip image size
    patch_size = 16  # patch size
    draw_false_image = 0  # number of negative images to draw
    image_only = False  # use image input only
    text_only = False  # use text input only

    # Video setting; a non-None video_num_frm means video input
    video_num_frm = None

    # Visual tokenizer setting based on beit2
    tokenizer_model = "beit2_visual_tokenizer"
    codebook_size = 8192
    codebook_dim = 32
    visual_mask_size = 14
    visual_mask_num = 80

    # Text Setting
    lang = 'cn'  # language for zero-shot imagenet testing: cn|en
    vqav2_label_size = 3129
    max_text_len = 40  # maximum text length (characters for the Chinese tokenizer)
    max_text_len_of_initckpt = 196
    tokenizer_type = "BertTokenizer"  # Chinese text
    vocab_size = 21128
    tokenizer = "./vocab.txt"
    whole_word_masking = True
    mlm_prob = 0.15  # language mask ratio
    draw_false_text = 0
    mvlm_prob = 0.50  # mask ratio for the vision-language mlm task
    mask_ratio = 0  # FLIP: mask ratio for the image
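
    # A minimal sketch (an assumption that the HuggingFace `transformers` package
    # backs the text tokenizer; kept as comments so Sacred does not capture it):
    #   from transformers import BertTokenizer
    #   bert_tokenizer = BertTokenizer(vocab_file=tokenizer)  # loads "./vocab.txt"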

    # cap setting
    cap_onlytext = False  # default: caption from image to text

    # imagemlm setting
    split_data_for_imagemlm = False  # if True, split a batch into two parts and use the first part for imagemlm.

    # itc setting
    itc_mask = False  # whether itc uses masked tokens
    aggregate_nodes = -1  # number of nodes to aggregate over in compute_itc; the default -1 means all nodes

    # Transformer Setting
    model_arch = "vlmo_base_patch16"
    drop_path_rate = 0.1

    # Downstream Setting
    get_recall_metric = False
    get_recall_rerank_metric = False
    get_zeroshot_metric = False
    get_muge_feat = False
    get_f30k_feat = False
    k_test = 32

    # PL Trainer Setting
    resume_from = None
    fast_dev_run = False
    val_check_interval = 1.0
    test_only = False
    use_sharded_training = False
    resume_during_training = False
    save_top_k = 10
    every_n_train_steps = 2000  # checkpoint saving interval, in steps
    log_metric_steps = 100  # metric logging interval, in steps

    # the params below vary with the environment
    use_pcache = False  # data storage method: pcache or nas
    pcache_root = ""
    # main_site: pcache://multimodalproxyi-pool.cz50c.alipay.com:39999/mnt/
    # public_cloud: pcache://pcache_public_cloud.pcache.local:39999/mnt/abc7c88079a60b45ddfce7afa40720b7/
    gpu_env = "main_site"  # public_cloud or main_site
    data_root = ""  # data root for data list


    log_dir = "result"
    per_gpu_batchsize = 4  # you should set this manually with per_gpu_batchsize=#
    num_gpus = 1
    num_nodes = 1
    load_path = ""
    num_workers = 8
    precision = 16
    local_run = True
    flash_attn = False
    deepspeed_config = None  # "ds_config.json"
    coalesce_backbone = False
    mask_data = "v+l"  # 'v+l': choose inputs for the imagemlm+textmlm tasks; 'vl': choose inputs for the mvlm task.
    communication_benchmark = False
    checkpoint_activations = False

    # dataset setting
    single_cap = True  # whether each sample has only one caption
    random_one = False  # whether to randomly pick one caption from the caption list

    # ITC setting
    itc_feats_name = "cls_vlffn_feats"  # features used for the itc loss
    itc_distill = ""
    itc_distill_dim = 1024
    itc_teacher_weights = ""

    # mup training setting
    mup = False
    base_encoder_embed_dim = 1
    delta_encoder_embed_dim = 2
    mup_encoder_attention_heads = 1
    base_encoder_ffn_embed_dim = 1
    delta_encoder_ffn_embed_dim = 2

    # atorch
    atorch_config = None
    compile_op = False
    optimizer_state_shard_save = False
    model_state_shard_save = False

    # itc loss
    local_loss = False
    use_dual_softmax = False

    num_frames = 1
    # ----------------------- LMM pretraining config -----------------------

    # norm setting
    deepnorm = False
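

# A minimal, hypothetical sketch of how an individual task could override the
# defaults above via a Sacred named config. The function name, experiment name,
# and field values here are illustrative assumptions, not settings shipped with
# this config.
@ex.named_config
def task_itc_itm_example():
    exp_name = "vlmo_itc_itm_example"  # hypothetical experiment name
    loss_names = _loss_names({"itm": 1, "itc": 1})  # enable only ITM + ITC losses
    batch_size = 1024
    max_text_len = 40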