# Arguments for the `openai_unet` model type. This entry is also named as the
# `super_cfg` of `openai_unet_dual_context`.
# Booleans use the canonical lowercase `true`/`false` spelling so that every
# YAML 1.1/1.2 loader resolves them identically.
openai_unet_sd:
  type: openai_unet
  args:
    image_size: null  # unused, per the original author's note
    in_channels: 4
    out_channels: 4
    model_channels: 320
    attention_resolutions: [4, 2, 1]
    num_res_blocks: [2, 2, 2, 2]
    channel_mult: [1, 2, 4, 4]
    # Disabled option, kept for reference (when true, it converts the
    # self-attention to a cross-attention layer):
    # disable_self_attentions: [false, false, false, false]
    num_heads: 8
    use_spatial_transformer: true
    transformer_depth: 1
    context_dim: 768
    use_checkpoint: true
    legacy: false

# Entry `openai_unet_dual_context`: names `openai_unet_sd` as its `super_cfg`
# and sets its own `type`; it declares no `args` of its own.
# NOTE(review): presumably the config loader starts from the `super_cfg` entry
# and applies the keys given here on top of it — confirm in the loader.
openai_unet_dual_context:
  super_cfg: openai_unet_sd
  type: openai_unet_dual_context

###########################
# Cleaned-up code version #
###########################

# Arguments for the `openai_unet_2d` model type. `openai_unet_vd` refers to
# this entry as `MODEL(openai_unet_2d)` in its `unet_image_cfg` argument.
# Booleans are written in canonical lowercase throughout; the original mixed
# `true` and `True` within this mapping.
openai_unet_2d:
  type: openai_unet_2d
  args:
    input_channels: 4
    model_channels: 320
    output_channels: 4
    num_noattn_blocks: [2, 2, 2, 2]
    channel_mult: [1, 2, 4, 4]
    with_attn: [true, true, true, false]
    num_heads: 8
    context_dim: 768
    use_checkpoint: true

# Arguments for the `openai_unet_0d` model type. The values match those of
# `openai_unet_2d` except that input/output channels are 768 instead of 4.
# Booleans are written in canonical lowercase throughout; the original mixed
# `true` and `True` within this mapping.
openai_unet_0d:
  type: openai_unet_0d
  args:
    input_channels: 768
    model_channels: 320
    output_channels: 768
    num_noattn_blocks: [2, 2, 2, 2]
    channel_mult: [1, 2, 4, 4]
    with_attn: [true, true, true, false]
    num_heads: 8
    context_dim: 768
    use_checkpoint: true

# Arguments for the `openai_unet_0dmd` model type. The values match those of
# `openai_unet_0d` with one additional list, `second_dim`. `openai_unet_vd`
# refers to this entry as `MODEL(openai_unet_0dmd)` in its `unet_text_cfg`
# argument.
# Booleans are written in canonical lowercase throughout; the original mixed
# `true` and `True` within this mapping.
openai_unet_0dmd:
  type: openai_unet_0dmd
  args:
    input_channels: 768
    model_channels: 320
    output_channels: 768
    num_noattn_blocks: [2, 2, 2, 2]
    channel_mult: [1, 2, 4, 4]
    second_dim: [4, 4, 4, 4]
    with_attn: [true, true, true, false]
    num_heads: 8
    context_dim: 768
    use_checkpoint: true

# Composite entry: the args shown here point at other entries of this file
# (`openai_unet_2d` and `openai_unet_0dmd`) using a `MODEL(<name>)` notation.
# NOTE(review): `MODEL(...)` is not YAML syntax — it parses as a plain string
# and is presumably resolved by the config loader; confirm before quoting or
# reformatting these values.
openai_unet_vd:
  type: openai_unet_vd
  args:
    unet_image_cfg: MODEL(openai_unet_2d)
    unet_text_cfg: MODEL(openai_unet_0dmd)