diffusion:
  target: pl_trainer.instruct_p2p_video.InstructP2PVideoTrainerTemporal
  params:
    beta_schedule_args:
      beta_schedule: scaled_linear
      num_train_timesteps: 1000
      beta_start: 0.00085
      beta_end: 0.012
      clip_sample: false
      thresholding: false
      prediction_type: epsilon
    loss_fn: l2
    unet_init_weights:
      - pretrained_models/instruct_pix2pix/diffusion_pytorch_model.bin
      - pretrained_models/Motion_Module/mm_sd_v15.ckpt
    vae_init_weights: pretrained_models/instruct_pix2pix/vqvae.ckpt
    text_model_init_weights: pretrained_models/instruct_pix2pix/text.ckpt
    optim_args:
      lr: 1e-5
    scale_factor: 0.18215
    guidance_scale: 5 # not used
    ddim_sampling_steps: 20
    text_cfg: 7.5
    img_cfg: 1.2
    cond_image_dropout: 0.1
    prompt_type: edit_prompt

unet:
  target: modules.video_unet_temporal.unet.UNet3DConditionModel
  params:
    in_channels: 8
    out_channels: 4
    act_fn: silu
    attention_head_dim: 8
    block_out_channels:
      - 320
      - 640
      - 1280
      - 1280
    cross_attention_dim: 768
    down_block_types:
      - CrossAttnDownBlock3D
      - CrossAttnDownBlock3D
      - CrossAttnDownBlock3D
      - DownBlock3D
    up_block_types:
      - UpBlock3D
      - CrossAttnUpBlock3D
      - CrossAttnUpBlock3D
      - CrossAttnUpBlock3D
    downsample_padding: 1
    layers_per_block: 2
    mid_block_scale_factor: 1
    norm_eps: 1e-05
    norm_num_groups: 32
    sample_size: 64
    use_motion_module: true
    motion_module_resolutions:
      - 1
      - 2
      - 4
      - 8
    motion_module_mid_block: false
    motion_module_decoder_only: false
    motion_module_type: Vanilla
    motion_module_kwargs:
      num_attention_heads: 8
      num_transformer_block: 1
      attention_block_types:
        - Temporal_Self
        - Temporal_Self
      temporal_position_encoding: true
      temporal_position_encoding_max_len: 32
      temporal_attention_dim_div: 1

vae:
  target: modules.kl_autoencoder.autoencoder.AutoencoderKL
  params:
    embed_dim: 4
    ddconfig:
      double_z: true
      z_channels: 4
      resolution: 256
      in_channels: 3
      out_ch: 3
      ch: 128
      ch_mult:
        - 1
        - 2
        - 4
        - 4
      num_res_blocks: 2
      attn_resolutions: []
      dropout: 0.0
    lossconfig:
      target: torch.nn.Identity

text_model:
  target: modules.openclip.modules.FrozenCLIPEmbedder
  params:
    freeze: true
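
# Note: each `target:` / `params:` pair above follows the common
# instantiate-from-config convention: `target` is a dotted Python import path
# and `params` are the constructor's keyword arguments. A minimal sketch of
# such a loader is shown below as a comment; `instantiate_from_config` is a
# hypothetical helper name, and PyYAML is assumed to be available:
#
#   import importlib
#   import yaml
#
#   def instantiate_from_config(cfg):
#       # Split "pkg.module.ClassName" into module path and class name,
#       # import the module, and call the class with `params` as kwargs.
#       module_path, cls_name = cfg["target"].rsplit(".", 1)
#       cls = getattr(importlib.import_module(module_path), cls_name)
#       return cls(**cfg.get("params", {}))
#
#   with open("config.yaml") as f:
#       config = yaml.safe_load(f)
#   vae = instantiate_from_config(config["vae"])
#
# Swapping a component (e.g. a different VAE) then only requires editing the
# corresponding `target`/`params` block, not the training code.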