diffusion:
  target: pl_trainer.instruct_p2p_video.InstructP2PVideoTrainerTemporalText
  params:
    beta_schedule_args:
      beta_schedule: scaled_linear
      num_train_timesteps: 1000
      beta_start: 0.00085
      beta_end: 0.012
      clip_sample: false
      thresholding: false
    prediction_type: epsilon
    loss_fn: l2
    optim_args:
      lr: 1e-5
    unet_init_weights: #! 注意一下, 完全可以从iv2v的ckpt开始train
      - unet/diffusion_pytorch_model.safetensors # iclight, unet, sf tensor
      - pretrained_models/Motion_Module/mm_sd_v15.ckpt # motion module, 推测加载的是animatediff的
      - pretrained_models/iclight/iclight_sd15_fc.safetensors # iclight lora weights
    base_path: /mnt/petrelfs/fangye/.cache/huggingface/hub/models--stablediffusionapi--realistic-vision-v51/snapshots/19e3643d7d963c156d01537188ec08f0b79a514a
    # vae_init_weights: pretrained_models/instruct_pix2pix/vqvae.ckpt
    # text_model_init_weights: pretrained_models/instruct_pix2pix/text.ckpt #! 这两个可以直接设为None, 从from_pretrained中加载
    scale_factor: 0.18215
    guidance_scale: 5 # not used
    ddim_sampling_steps: 20
    text_cfg: 7.5
    img_cfg: 1.2
    cond_image_dropout: 0.1
    prompt_type: edit_prompt
    hdr_train: True
unet:
  target: modules.video_unet_temporal.unet.UNet3DConditionModel
  params:
    in_channels: 4 #! change:8->12 iclight 改为12 注意一下...
    out_channels: 4
    act_fn: silu
    attention_head_dim: 8
    block_out_channels: 
      - 320
      - 640
      - 1280
      - 1280
    cross_attention_dim: 768
    down_block_types: 
      - CrossAttnDownBlock3D
      - CrossAttnDownBlock3D
      - CrossAttnDownBlock3D
      - DownBlock3D
    up_block_types: 
      - UpBlock3D
      - CrossAttnUpBlock3D
      - CrossAttnUpBlock3D
      - CrossAttnUpBlock3D
    downsample_padding: 1
    layers_per_block: 2
    mid_block_scale_factor: 1
    norm_eps: 1e-05
    norm_num_groups: 32
    sample_size: 64
    use_motion_module: true #!!! 这边test iclight的时候可以不用motion module 即False
    motion_module_resolutions:
    - 1
    - 2
    - 4
    - 8
    motion_module_mid_block: false
    motion_module_decoder_only: false
    motion_module_type: Vanilla
    motion_module_kwargs:
      num_attention_heads: 8
      num_transformer_block: 1
      attention_block_types:
      - Temporal_Self
      - Temporal_Self
      temporal_position_encoding: true
      temporal_position_encoding_max_len: 32
      temporal_attention_dim_div: 1
text_model:
  target: modules.openclip.modules.FrozenCLIPEmbedder
  params:
    freeze: true