expt_dir: experiments
expt_name: instruct_v2v
trainer_args:
  max_epochs: 10
  accelerator: "gpu"
  devices: [0]
  limit_train_batches: 2048
  limit_val_batches: 1
  # strategy: "ddp"
  strategy: "deepspeed_stage_2"
  accumulate_grad_batches: 256
  check_val_every_n_epoch: 5
diffusion:
  target: pl_trainer.instruct_p2p_video.InstructP2PVideoTrainerTemporal
  params:
    beta_schedule_args:
      beta_schedule: scaled_linear
      num_train_timesteps: 1000
      beta_start: 0.00085
      beta_end: 0.012
      clip_sample: false
      thresholding: false
    prediction_type: epsilon
    loss_fn: l2
    optim_args:
      lr: 1e-5
    unet_init_weights: 
      - pretrained_models/instruct_pix2pix/diffusion_pytorch_model.bin
      - pretrained_models/Motion_Module/mm_sd_v15.ckpt
    vae_init_weights: pretrained_models/instruct_pix2pix/vqvae.ckpt
    text_model_init_weights: pretrained_models/instruct_pix2pix/text.ckpt
    scale_factor: 0.18215
    guidance_scale: 5 # not used
    ddim_sampling_steps: 20
    text_cfg: 7.5
    img_cfg: 1.2
    cond_image_dropout: 0.1
    prompt_type: edit_prompt
unet:
  target: modules.video_unet_temporal.unet.UNet3DConditionModel
  params:
    in_channels: 8
    out_channels: 4
    act_fn: silu
    attention_head_dim: 8
    block_out_channels: 
      - 320
      - 640
      - 1280
      - 1280
    cross_attention_dim: 768
    down_block_types: 
      - CrossAttnDownBlock3D
      - CrossAttnDownBlock3D
      - CrossAttnDownBlock3D
      - DownBlock3D
    up_block_types: 
      - UpBlock3D
      - CrossAttnUpBlock3D
      - CrossAttnUpBlock3D
      - CrossAttnUpBlock3D
    downsample_padding: 1
    layers_per_block: 2
    mid_block_scale_factor: 1
    norm_eps: 1e-05
    norm_num_groups: 32
    sample_size: 64
    use_motion_module: true
    motion_module_resolutions:
    - 1
    - 2
    - 4
    - 8
    motion_module_mid_block: false
    motion_module_decoder_only: false
    motion_module_type: Vanilla
    motion_module_kwargs:
      num_attention_heads: 8
      num_transformer_block: 1
      attention_block_types:
      - Temporal_Self
      - Temporal_Self
      temporal_position_encoding: true
      temporal_position_encoding_max_len: 32
      temporal_attention_dim_div: 1
vae:
  target: modules.kl_autoencoder.autoencoder.AutoencoderKL
  params:
    embed_dim: 4
    ddconfig:
      double_z: true
      z_channels: 4
      resolution: 256
      in_channels: 3
      out_ch: 3
      ch: 128
      ch_mult:
        - 1
        - 2
        - 4
        - 4
      num_res_blocks: 2
      attn_resolutions: []
      dropout: 0.0
    lossconfig:
      target: torch.nn.Identity
text_model:
  target: modules.openclip.modules.FrozenCLIPEmbedder
  params:
    freeze: true
data:
  batch_size: 1
  val_batch_size: 1
  train:
    target: dataset.videoP2P.VideoPromptToPromptMotionAug
    params:
      root_dirs: 
        - video_ptp/raw_generated
        - video_ptp/raw_generated_webvid
      num_frames: 16
      zoom_ratio: 0.2
      max_zoom: 1.25
      translation_ratio: 0.7
      translation_range: [0, 0.2]
  val:
    target: dataset.videoP2P.VideoPromptToPromptMotionAug
    params:
      root_dirs: 
        - video_ptp/raw_generated
      num_frames: 16
      zoom_ratio: 0.2
      max_zoom: 1.25
      translation_ratio: 0.7
      translation_range: [0, 0.2]
callbacks:
  - target: pytorch_lightning.callbacks.ModelCheckpoint
    params:
      dirpath: "${expt_dir}/${expt_name}"
      filename: "{epoch:04d}"
      monitor: epoch
      mode: max
      save_top_k: 5
      save_last: true
  - target: callbacks.instruct_p2p_video.InstructP2PLogger
    params:
      max_num_images: 1
    require_wandb: true