# ------------------------------------------------------------------------
# Semantic SAM
# Copyright (c) MicroSoft, Inc. and its affiliates.
# Modified from OpenSeed https://github.com/IDEA-Research/OpenSeed by Feng Li.
# ------------------------------------------------------------------------
#
# Experiment configuration: Swin backbone ("swin_large" pretrained weights),
# "sam_train" as the only training set, language encoder disabled.
# Layout: global task settings, MODEL, global TEST, one section per dataset
# (SAM, COCO, VLP, ADE20K, REF, SUN, SCAN, BDD, CITY, PSACAL_PART), plus the
# shared INPUT / DATASETS / DATALOADER / SOLVER sections.

##################
# Task settings
##################
WEIGHT: ''
PORT: 53711
VERBOSE: true
OUTPUT_DIR: '../../data/output/test'

# misc
LOADER:
  JOINT: True
  KEY_DATASET: 'coco'

# model
MODEL:
  NAME: interactive_mask_dino
  HEAD: general_head
  MASK_ON: false
  KEYPOINT_ON: false
  LOAD_PROPOSALS: false
  DIM_PROJ: 512
  BACKBONE_DIM: 768
  BACKGROUND: False
  WEIGHTS: ''
  TEXT:
    ARCH: noencoder  # no language encoder when training on SA-1B data only
    NAME: transformer
    TOKENIZER: clip
    CONTEXT_LENGTH: 18  # 77
    WIDTH: 512
    HEADS: 8
    LAYERS: 12  # 6
    # NOTE(review): key is spelled AUTOGRESSIVE (sic); kept unchanged because
    # consumers look it up by name -- confirm before renaming.
    AUTOGRESSIVE: True
  BACKBONE:
    NAME: swin
    PRETRAINED: 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth'
    LOAD_PRETRAINED: true
    SWIN:
      PRETRAIN_IMG_SIZE: 384
      PATCH_SIZE: 4
      EMBED_DIM: 192
      DEPTHS: [ 2, 2, 18, 2 ]
      NUM_HEADS: [ 6, 12, 24, 48 ]
      WINDOW_SIZE: 12
      MLP_RATIO: 4.0
      QKV_BIAS: true
      QK_SCALE: ~  # `~` is YAML null
      DROP_RATE: 0.0
      ATTN_DROP_RATE: 0.0
      DROP_PATH_RATE: 0.3
      APE: false
      PATCH_NORM: true
      USE_CHECKPOINT: false
      OUT_FEATURES: [ 'res2', 'res3', 'res4', 'res5' ]
  ENCODER:
    NAME: encoder_deform
    IGNORE_VALUE: 255
    NUM_CLASSES: 1
    LOSS_WEIGHT: 1.0
    CONVS_DIM: 256
    MASK_DIM: 256
    NORM: "GN"
    IN_FEATURES: [ "res2", "res3", "res4", "res5" ]
    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: [ "res3", "res4", "res5" ]
    COMMON_STRIDE: 4
    TRANSFORMER_ENC_LAYERS: 6
    TOTAL_NUM_FEATURE_LEVELS: 4
    NUM_FEATURE_LEVELS: 3
    FEATURE_ORDER: "low2high"
  DECODER:
    NAME: interactive_mask_dino
    TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
    MASK: True
    BOX: True
    PART: True
    GROUNDING:
      ENABLED: False
      MAX_LEN: 5
      TEXT_WEIGHT: 2.0
      CLASS_WEIGHT: 0.5
    CAPTION:
      ENABLED: False
      PHRASE_PROB: 0.0
      SIM_THRES: 0.95
    CAPTIONING:
      ENABLED: False
      STEP: 50
    RETRIEVAL:
      ENABLED: False
      DIM_IMG: 768
      ENSEMBLE: True
    OPENIMAGE:
      ENABLED: False
      NEGATIVE_SAMPLES: 5
      # Nested here (not a sibling of DECODER.GROUNDING above); a second
      # GROUNDING key directly under DECODER would be a duplicate key.
      GROUNDING:
        ENABLED: False
        MAX_LEN: 5
    DEEP_SUPERVISION: True
    NO_OBJECT_WEIGHT: 0.1
    CLASS_WEIGHT: 4.0
    MASK_WEIGHT: 5.0
    DICE_WEIGHT: 5.0
    BOX_WEIGHT: 5.0
    GIOU_WEIGHT: 2.0
    IOU_WEIGHT: 1.0
    COST_CLASS_WEIGHT: 4.0
    COST_DICE_WEIGHT: 5.0
    COST_MASK_WEIGHT: 5.0
    COST_BOX_WEIGHT: 5.0
    COST_GIOU_WEIGHT: 2.0
    HIDDEN_DIM: 256
    NUM_OBJECT_QUERIES: 0
    NHEADS: 8
    DROPOUT: 0.0
    DIM_FEEDFORWARD: 2048
    ENC_LAYERS: 0
    PRE_NORM: False
    ENFORCE_INPUT_PROJ: False
    SIZE_DIVISIBILITY: 32
    DEC_LAYERS: 9  # 9 decoder layers, add one for the loss on learnable query
    TRAIN_NUM_POINTS: 12544
    OVERSAMPLE_RATIO: 3.0
    IMPORTANCE_SAMPLE_RATIO: 0.75
    TWO_STAGE: False
    INITIALIZE_BOX_TYPE: 'no'
    DN: seg
    DN_NOISE_SCALE: 0.4
    DN_NUM: 100
    INITIAL_PRED: False
    LEARN_TGT: False
    TOTAL_NUM_FEATURE_LEVELS: 4
    SEMANTIC_CE_LOSS: False
    PANO_BOX_LOSS: False
    # Per-dataset switches; only SAM is turned on in this configuration.
    COCO: False
    O365: False
    SAM: True
    PASCAL: False
    RE_POINT: True
    NUM_INTERACTIVE_TOKENS: 6
    MAX_NUM_INSTANCE: 60
    # Decoder-level test options (distinct from the top-level TEST section).
    TEST:
      SEMANTIC_ON: True
      INSTANCE_ON: True
      PANOPTIC_ON: True
      BOX_INTERACTIVE: False
      CLASSIFICATION_ON: False
      OVERLAP_THRESHOLD: 0.8
      OBJECT_MASK_THRESHOLD: 0.25
      SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
      # NOTE(review): key is spelled TEST_FOUCUS_ON_BOX (sic); kept unchanged
      # because consumers look it up by name -- confirm before renaming.
      TEST_FOUCUS_ON_BOX: False
      PANO_TRANSFORM_EVAL: True
      PANO_TEMPERATURE: 0.06

# Global evaluation settings
TEST:
  EVAL_PERIOD: 500000
  PRECISE_BN:
    NUM_ITER: 1
    ENABLED: False
  AUG:
    ENABLED: False

# Settings for the 'sam' dataset
SAM:
  INPUT:
    MIN_SIZE_TEST: 800
    MAX_SIZE_TEST: 1333
    IMAGE_SIZE: 1024
    MIN_SCALE: 0.99
    MAX_SCALE: 1.01
    DATASET_MAPPER_NAME: "sam"
    IGNORE_VALUE: 255
    COLOR_AUG_SSD: False
    SIZE_DIVISIBILITY: 32
    RANDOM_FLIP: "horizontal"
    MASK_FORMAT: "polygon"
    FORMAT: "RGB"
    CROP:
      ENABLED: True
  DATASET:
    DATASET: 'sam'
  TEST:
    DETECTIONS_PER_IMAGE: 100
    NAME: coco_eval
    IOU_TYPE: ['bbox', 'segm']
    USE_MULTISCALE: false
    BATCH_SIZE_TOTAL: 8
    MODEL_FILE: ''
    AUG:
      ENABLED: False
  TRAIN:
    BATCH_SIZE_TOTAL: 1
    BATCH_SIZE_PER_GPU: 1
    SHUFFLE: true
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: False
    NUM_WORKERS: 4
    LOAD_PROPOSALS: False
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: True

# Settings for the 'coco' dataset
COCO:
  INPUT:
    MIN_SIZE_TEST: 800
    MAX_SIZE_TEST: 1333
    IMAGE_SIZE: 1024
    MIN_SCALE: 0.1
    MAX_SCALE: 2.0
    DATASET_MAPPER_NAME: "coco_interactive_panoptic_lsj"
    IGNORE_VALUE: 255
    COLOR_AUG_SSD: False
    SIZE_DIVISIBILITY: 32
    RANDOM_FLIP: "horizontal"
    MASK_FORMAT: "polygon"
    FORMAT: "RGB"
    CROP:
      ENABLED: True
  DATASET:
    DATASET: 'coco'
  TEST:
    DETECTIONS_PER_IMAGE: 100
    NAME: coco_eval
    IOU_TYPE: ['bbox', 'segm']
    USE_MULTISCALE: false
    BATCH_SIZE_TOTAL: 1
    MODEL_FILE: ''
    AUG:
      ENABLED: False
  TRAIN:
    BATCH_SIZE_TOTAL: 1
    BATCH_SIZE_PER_GPU: 1
    SHUFFLE: true
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: False
    NUM_WORKERS: 2
    LOAD_PROPOSALS: False
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: True

# Settings for the "vlpretrain" dataset mapper
VLP:
  INPUT:
    IMAGE_SIZE: 224
    DATASET_MAPPER_NAME: "vlpretrain"
    IGNORE_VALUE: 255
    COLOR_AUG_SSD: False
    SIZE_DIVISIBILITY: 32
    MASK_FORMAT: "polygon"
    FORMAT: "RGB"
    CROP:
      ENABLED: True
  TRAIN:
    BATCH_SIZE_TOTAL: 2
    BATCH_SIZE_PER_GPU: 2
  TEST:
    BATCH_SIZE_TOTAL: 256
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: False
    NUM_WORKERS: 16
    LOAD_PROPOSALS: False
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: True

# Global input normalization (per-channel mean / std)
INPUT:
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]

DATASETS:
  TRAIN: ["sam_train"]
  # interactive segmentation evaluation.
  TEST: ["coco_2017_val_panoptic_with_sem_seg_interactive_jointboxpoint"]
  # TEST: ["sam_minival"]
  CLASS_CONCAT: false
  SIZE_DIVISIBILITY: 32
  PROPOSAL_FILES_TRAIN: []

# Global data-loader settings (the dataset sections above carry their own)
DATALOADER:
  FILTER_EMPTY_ANNOTATIONS: False
  NUM_WORKERS: 16
  LOAD_PROPOSALS: False
  SAMPLER_TRAIN: "TrainingSampler"
  ASPECT_RATIO_GROUPING: True

# Detectron2 training config for optimizer and lr scheduler
SOLVER:
  BASE_LR_END: 0.0
  MOMENTUM: 0.9
  NESTEROV: False
  CHECKPOINT_PERIOD: 5000
  IMS_PER_BATCH: 1
  REFERENCE_WORLD_SIZE: 0
  BIAS_LR_FACTOR: 1.0
  # NOTE(review): YAML has no `None` literal; loaders typically read this as
  # the string 'None', not null. Value kept unchanged -- confirm that this is
  # what the consumer expects before switching to `null`.
  WEIGHT_DECAY_BIAS: None
  # original
  BASE_LR: 0.0001
  STEPS: [327778, 355092]
  MAX_ITER: 368750
  GAMMA: 0.1
  WARMUP_FACTOR: 1.0
  WARMUP_ITERS: 10
  WARMUP_METHOD: "linear"
  WEIGHT_DECAY: 0.05
  OPTIMIZER: "ADAMW"
  LR_SCHEDULER_NAME: "WarmupMultiStepLR"
  LR_MULTIPLIER:
    backbone: 0.1
    lang_encoder: 0.1
  WEIGHT_DECAY_NORM: 0.0
  WEIGHT_DECAY_EMBED: 0.0
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 0.01
    NORM_TYPE: 2.0
  AMP:
    ENABLED: True

# Evaluation Dataset
ADE20K:
  INPUT:
    MIN_SIZE_TRAIN: [320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280]
    MIN_SIZE_TRAIN_SAMPLING: "choice"
    MIN_SIZE_TEST: 640
    MAX_SIZE_TRAIN: 2560
    MAX_SIZE_TEST: 2560
    MASK_FORMAT: "polygon"
    CROP:
      ENABLED: True
      TYPE: "absolute"
      SIZE: [640, 640]
      SINGLE_CATEGORY_MAX_AREA: 1.0
    IGNORE_VALUE: 255
    COLOR_AUG_SSD: True
    SIZE_DIVISIBILITY: 640  # used in dataset mapper
    DATASET_MAPPER_NAME: "mask_former_panoptic"
    FORMAT: "RGB"
  DATASET:
    DATASET: 'ade'
  TRAIN:
    ASPECT_RATIO_GROUPING: true
    BATCH_SIZE_TOTAL: 16
    BATCH_SIZE_PER_GPU: 2
    SHUFFLE: true
  TEST:
    DETECTIONS_PER_IMAGE: 100
    NAME: coco_eval
    IOU_TYPE: ['bbox', 'segm']
    USE_MULTISCALE: false
    BATCH_SIZE_TOTAL: 8
    MODEL_FILE: ''
    AUG:
      ENABLED: False
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: False
    NUM_WORKERS: 8
    LOAD_PROPOSALS: False
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: True

# Earlier ADE20K variant, kept commented out for reference.
#ADE20K:
#  INPUT:
#    MIN_SIZE_TRAIN: 640
#    MIN_SIZE_TRAIN_SAMPLING: "choice"
#    MIN_SIZE_TEST: 640
#    MAX_SIZE_TRAIN: 2560
#    MAX_SIZE_TEST: 2560
#    MASK_FORMAT: "polygon"
#    CROP:
#      ENABLED: True
#      TYPE: "absolute"
#      SIZE: (640, 640)
#      SINGLE_CATEGORY_MAX_AREA: 1.0
#    COLOR_AUG_SSD: True
#    SIZE_DIVISIBILITY: 640  # used in dataset mapper
#    DATASET_MAPPER_NAME: "mask_former_panoptic"
#    FORMAT: "RGB"
#  DATASET:
#    DATASET: 'ade'
#  TEST:
#    BATCH_SIZE_TOTAL: 8

REF:
  INPUT:
    PIXEL_MEAN: [123.675, 116.280, 103.530]
    PIXEL_STD: [58.395, 57.120, 57.375]
    MIN_SIZE_TEST: 512
    MAX_SIZE_TEST: 1024
    FORMAT: "RGB"
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: False
    NUM_WORKERS: 0
    LOAD_PROPOSALS: False
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: False
  TEST:
    BATCH_SIZE_TOTAL: 8

SUN:
  INPUT:
    PIXEL_MEAN: [123.675, 116.280, 103.530]
    PIXEL_STD: [58.395, 57.120, 57.375]
    MIN_SIZE_TEST: 512
    MAX_SIZE_TEST: 1024
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: False
    NUM_WORKERS: 0
    LOAD_PROPOSALS: False
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: False
  TEST:
    BATCH_SIZE_TOTAL: 8

SCAN:
  INPUT:
    PIXEL_MEAN: [123.675, 116.280, 103.530]
    PIXEL_STD: [58.395, 57.120, 57.375]
    MIN_SIZE_TEST: 512
    MAX_SIZE_TEST: 1024
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: False
    NUM_WORKERS: 0
    LOAD_PROPOSALS: False
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: False
  TEST:
    BATCH_SIZE_TOTAL: 8

BDD:
  INPUT:
    PIXEL_MEAN: [123.675, 116.280, 103.530]
    PIXEL_STD: [58.395, 57.120, 57.375]
    MIN_SIZE_TEST: 800
    MAX_SIZE_TEST: 1333
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: False
    NUM_WORKERS: 0
    LOAD_PROPOSALS: False
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: False
  TEST:
    BATCH_SIZE_TOTAL: 8

CITY:
  INPUT:
    MIN_SIZE_TRAIN: [ 512, 614, 716, 819, 921, 1024, 1126, 1228, 1331, 1433, 1536, 1638, 1740, 1843, 1945, 2048 ]
    MIN_SIZE_TRAIN_SAMPLING: "choice"
    MIN_SIZE_TEST: 1024
    MAX_SIZE_TRAIN: 4096
    MAX_SIZE_TEST: 2048
    CROP:
      ENABLED: True
      TYPE: "absolute"
      SIZE: [ 512, 1024 ]
      SINGLE_CATEGORY_MAX_AREA: 1.0
    IGNORE_VALUE: 255
    COLOR_AUG_SSD: True
    SIZE_DIVISIBILITY: -1
    FORMAT: "RGB"
    DATASET_MAPPER_NAME: "mask_former_panoptic"
    MASK_FORMAT: "polygon"
  TEST:
    EVAL_PERIOD: 5000
    BATCH_SIZE_TOTAL: 1
    AUG:
      ENABLED: False
      MIN_SIZES: [ 512, 768, 1024, 1280, 1536, 1792 ]
      MAX_SIZE: 4096
      FLIP: True
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: True
    NUM_WORKERS: 2
    LOAD_PROPOSALS: False
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: True
  TRAIN:
    ASPECT_RATIO_GROUPING: true
    BATCH_SIZE_TOTAL: 2
    BATCH_SIZE_PER_GPU: 2
    SHUFFLE: true

# NOTE(review): section key is spelled PSACAL_PART (sic; the mapper below is
# "pascal_part_lsj"). Kept unchanged because consumers look it up by name --
# confirm before renaming.
PSACAL_PART:
  INPUT:
    MIN_SIZE_TEST: 800
    MAX_SIZE_TEST: 1333
    IMAGE_SIZE: 1024
    MIN_SCALE: 0.1
    MAX_SCALE: 2.0
    DATASET_MAPPER_NAME: "pascal_part_lsj"
    IGNORE_VALUE: 255
    COLOR_AUG_SSD: False
    SIZE_DIVISIBILITY: 32
    RANDOM_FLIP: "horizontal"
    MASK_FORMAT: "polygon"
    FORMAT: "RGB"
    CROP:
      ENABLED: True
  # Nested under PSACAL_PART; a second top-level MODEL key would collide with
  # the main MODEL section above.
  MODEL:
    MASK_ON: True
    KEYPOINT_ON: False
    LOAD_PROPOSALS: False
  # DATASET:
  #   DATASET: 'coco'
  TEST:
    DETECTIONS_PER_IMAGE: 100
    NAME: coco_eval
    IOU_TYPE: ['bbox', 'segm']
    USE_MULTISCALE: false
    BATCH_SIZE_TOTAL: 8
    MODEL_FILE: ''
    AUG:
      ENABLED: False
  TRAIN:
    BATCH_SIZE_TOTAL: 1
    BATCH_SIZE_PER_GPU: 1
    SHUFFLE: true
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: False
    NUM_WORKERS: 2
    LOAD_PROPOSALS: False
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: True