adaface-neurips committed on
Commit 9bcd020 · 1 Parent(s): c75a857

update code and gradio version

README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 😻
4
  colorFrom: indigo
5
  colorTo: red
6
  sdk: gradio
7
- sdk_version: 5.0.2
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
 
4
  colorFrom: indigo
5
  colorTo: red
6
  sdk: gradio
7
+ sdk_version: 5.30.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
adaface/adaface_wrapper.py CHANGED
@@ -30,8 +30,8 @@ class AdaFaceWrapper(nn.Module):
30
  use_840k_vae=False, use_ds_text_encoder=False,
31
  main_unet_filepath=None, unet_types=None, extra_unet_dirpaths=None, unet_weights_in_ensemble=None,
32
  enable_static_img_suffix_embs=None, unet_uses_attn_lora=False,
33
- attn_lora_layer_names=['q', 'k', 'v', 'out'], shrink_cross_attn=False, q_lora_updates_query=False,
34
- device='cuda', is_training=False):
35
  '''
36
  pipeline_name: "text2img", "text2imgxl", "img2img", "text2img3", "flux", or None.
37
  If None, it's used only as a face encoder, and the unet and vae are
@@ -52,7 +52,7 @@ class AdaFaceWrapper(nn.Module):
52
  self.q_lora_updates_query = q_lora_updates_query
53
  self.use_lcm = use_lcm
54
  self.subject_string = subject_string
55
- self.shrink_cross_attn = shrink_cross_attn
56
 
57
  self.default_scheduler_name = default_scheduler_name
58
  self.num_inference_steps = num_inference_steps if not use_lcm else 4
@@ -64,6 +64,7 @@ class AdaFaceWrapper(nn.Module):
64
  self.unet_weights_in_ensemble = unet_weights_in_ensemble
65
  self.device = device
66
  self.is_training = is_training
 
67
 
68
  if negative_prompt is None:
69
  self.negative_prompt = \
@@ -99,6 +100,7 @@ class AdaFaceWrapper(nn.Module):
99
  self.adaface_ckpt_paths,
100
  self.adaface_encoder_cfg_scales,
101
  self.enabled_encoders,
 
102
  num_static_img_suffix_embs=4)
103
 
104
  self.id2ada_prompt_encoder.to(self.device)
@@ -189,10 +191,10 @@ class AdaFaceWrapper(nn.Module):
189
  pipeline.unet = unet_ensemble
190
 
191
  print(f"Loaded pipeline from {self.base_model_path}.")
192
- if not remove_unet and (self.unet_uses_attn_lora or self.shrink_cross_attn):
193
  unet2 = self.load_unet_lora_weights(pipeline.unet, use_attn_lora=self.unet_uses_attn_lora,
194
  attn_lora_layer_names=self.attn_lora_layer_names,
195
- shrink_cross_attn=self.shrink_cross_attn,
196
  q_lora_updates_query=self.q_lora_updates_query)
197
 
198
  pipeline.unet = unet2
@@ -294,12 +296,11 @@ class AdaFaceWrapper(nn.Module):
294
  def load_unet_loras(self, unet, unet_lora_modules_state_dict,
295
  use_attn_lora=True, use_ffn_lora=False,
296
  attn_lora_layer_names=['q', 'k', 'v', 'out'],
297
- shrink_cross_attn=False, cross_attn_shrink_factor=0.5,
298
  q_lora_updates_query=False):
299
  attn_capture_procs, attn_opt_modules = \
300
  set_up_attn_processors(unet, use_attn_lora=True, attn_lora_layer_names=attn_lora_layer_names,
301
  lora_rank=192, lora_scale_down=8,
302
- cross_attn_shrink_factor=cross_attn_shrink_factor,
303
  q_lora_updates_query=q_lora_updates_query)
304
  # up_blocks.3.resnets.[1~2].conv1, conv2, conv_shortcut. [12] matches 1 or 2.
305
  if use_ffn_lora:
@@ -343,16 +344,17 @@ class AdaFaceWrapper(nn.Module):
343
  print(f"Loaded {len(unet_lora_modules_state_dict)} LoRA weights on the UNet:\n{unet_lora_modules.keys()}")
344
  self.outfeat_capture_blocks.append(unet.up_blocks[3])
345
 
346
- # If shrink_cross_attn is True and use_attn_lora is False, we load all these params from ckpt,
347
  # but since we set use_attn_lora to False, attn loras won't be used during inference nonetheless.
348
  set_lora_and_capture_flags(unet, None, self.attn_capture_procs, self.outfeat_capture_blocks,
349
  use_attn_lora, use_ffn_lora, 'recon_loss', capture_ca_activations=False,
350
- shrink_cross_attn=shrink_cross_attn)
 
351
 
352
  return unet
353
 
354
  def load_unet_lora_weights(self, unet, use_attn_lora=True, attn_lora_layer_names=['q', 'k', 'v', 'out'],
355
- shrink_cross_attn=False, q_lora_updates_query=False):
356
  unet_lora_weight_found = False
357
  if isinstance(self.adaface_ckpt_paths, str):
358
  adaface_ckpt_paths = [self.adaface_ckpt_paths]
@@ -360,7 +362,7 @@ class AdaFaceWrapper(nn.Module):
360
  adaface_ckpt_paths = self.adaface_ckpt_paths
361
 
362
  for adaface_ckpt_path in adaface_ckpt_paths:
363
- ckpt_dict = torch.load(adaface_ckpt_path, map_location='cpu')
364
  if 'unet_lora_modules' in ckpt_dict:
365
  unet_lora_modules_state_dict = ckpt_dict['unet_lora_modules']
366
  print(f"{len(unet_lora_modules_state_dict)} LoRA weights found in {adaface_ckpt_path}.")
@@ -379,7 +381,7 @@ class AdaFaceWrapper(nn.Module):
379
  unet_ = self.load_unet_loras(unet_, unet_lora_modules_state_dict,
380
  use_attn_lora=use_attn_lora,
381
  attn_lora_layer_names=attn_lora_layer_names,
382
- shrink_cross_attn=shrink_cross_attn,
383
  q_lora_updates_query=q_lora_updates_query)
384
  unet.unets[i] = unet_
385
  print(f"Loaded LoRA processors on UNetEnsemble of {len(unet.unets)} UNets.")
@@ -387,7 +389,7 @@ class AdaFaceWrapper(nn.Module):
387
  unet = self.load_unet_loras(unet, unet_lora_modules_state_dict,
388
  use_attn_lora=use_attn_lora,
389
  attn_lora_layer_names=attn_lora_layer_names,
390
- shrink_cross_attn=shrink_cross_attn,
391
  q_lora_updates_query=q_lora_updates_query)
392
 
393
  return unet
@@ -612,8 +614,9 @@ class AdaFaceWrapper(nn.Module):
612
  # Scan prompt and replace tokens in self.placeholder_token_ids
613
  # with the corresponding image embeddings.
614
  prompt_tokens = self.pipeline.tokenizer.tokenize(prompt)
 
615
  prompt_embeds2 = prompt_embeds.clone()
616
- if alt_prompt_embed_type == 'img':
617
  if self.img_prompt_embs is None:
618
  print("Unable to find img_prompt_embs. Either prepare_adaface_embeddings() hasn't been called, or faceless images were used.")
619
  return prompt_embeds
@@ -628,17 +631,18 @@ class AdaFaceWrapper(nn.Module):
628
  breakpoint()
629
 
630
  repl_tokens = {}
 
631
  for i in range(len(prompt_tokens)):
632
  if prompt_tokens[i] in self.all_placeholder_tokens:
633
  encoder_idx = next((i for i, sublist in enumerate(self.encoder_placeholder_tokens) \
634
  if prompt_tokens[i] in sublist), 0)
635
- alt_prompt_emb_weight = alt_prompt_emb_weights[encoder_idx]
636
- prompt_embeds2[:, i] = prompt_embeds2[:, i] * (1 - alt_prompt_emb_weight) \
637
  + repl_embeddings[:, self.all_placeholder_tokens.index(prompt_tokens[i])] * alt_prompt_emb_weight
638
  repl_tokens[prompt_tokens[i]] = 1
639
 
640
  repl_token_count = len(repl_tokens)
641
- if np.all(np.array(alt_prompt_emb_weights) == 1):
642
  print(f"Replaced {repl_token_count} tokens with {alt_prompt_embed_type} embeddings.")
643
  else:
644
  print(f"Mixed {repl_token_count} tokens with {alt_prompt_embed_type} embeddings, weight {alt_prompt_emb_weights}.")
@@ -650,7 +654,7 @@ class AdaFaceWrapper(nn.Module):
650
  placeholder_tokens_pos='append',
651
  ablate_prompt_only_placeholders=False,
652
  ablate_prompt_no_placeholders=False,
653
- ablate_prompt_embed_type='ada', # 'ada', 'ada-nonmix', 'img'
654
  nonmix_prompt_emb_weight=0,
655
  repeat_prompt_for_each_encoder=True,
656
  device=None, verbose=False):
@@ -678,14 +682,25 @@ class AdaFaceWrapper(nn.Module):
678
  prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, negative_pooled_prompt_embeds_ = \
679
  self.diffusers_encode_prompts(prompt, plain_prompt, negative_prompt, device)
680
 
681
- if ablate_prompt_embed_type != 'ada':
682
  alt_prompt_embed_type = ablate_prompt_embed_type
683
- alt_prompt_emb_weights = (1, 1)
684
  elif nonmix_prompt_emb_weight > 0:
685
  alt_prompt_embed_type = 'ada-nonmix'
686
- alt_prompt_emb_weights = (nonmix_prompt_emb_weight, nonmix_prompt_emb_weight)
 
 
687
  else:
688
- alt_prompt_emb_weights = (0, 0)
 
689
 
690
  if sum(alt_prompt_emb_weights) > 0:
691
  prompt_embeds_ = self.mix_ada_embs_with_other_embs(prompt, prompt_embeds_,
 
30
  use_840k_vae=False, use_ds_text_encoder=False,
31
  main_unet_filepath=None, unet_types=None, extra_unet_dirpaths=None, unet_weights_in_ensemble=None,
32
  enable_static_img_suffix_embs=None, unet_uses_attn_lora=False,
33
+ attn_lora_layer_names=['q', 'k', 'v', 'out'], normalize_cross_attn=False, q_lora_updates_query=False,
34
+ device='cuda', is_training=False, is_on_hf_space=False):
35
  '''
36
  pipeline_name: "text2img", "text2imgxl", "img2img", "text2img3", "flux", or None.
37
  If None, it's used only as a face encoder, and the unet and vae are
 
52
  self.q_lora_updates_query = q_lora_updates_query
53
  self.use_lcm = use_lcm
54
  self.subject_string = subject_string
55
+ self.normalize_cross_attn = normalize_cross_attn
56
 
57
  self.default_scheduler_name = default_scheduler_name
58
  self.num_inference_steps = num_inference_steps if not use_lcm else 4
 
64
  self.unet_weights_in_ensemble = unet_weights_in_ensemble
65
  self.device = device
66
  self.is_training = is_training
67
+ self.is_on_hf_space = is_on_hf_space
68
 
69
  if negative_prompt is None:
70
  self.negative_prompt = \
 
100
  self.adaface_ckpt_paths,
101
  self.adaface_encoder_cfg_scales,
102
  self.enabled_encoders,
103
+ is_on_hf_space=self.is_on_hf_space,
104
  num_static_img_suffix_embs=4)
105
 
106
  self.id2ada_prompt_encoder.to(self.device)
 
191
  pipeline.unet = unet_ensemble
192
 
193
  print(f"Loaded pipeline from {self.base_model_path}.")
194
+ if not remove_unet and (self.unet_uses_attn_lora or self.normalize_cross_attn):
195
  unet2 = self.load_unet_lora_weights(pipeline.unet, use_attn_lora=self.unet_uses_attn_lora,
196
  attn_lora_layer_names=self.attn_lora_layer_names,
197
+ normalize_cross_attn=self.normalize_cross_attn,
198
  q_lora_updates_query=self.q_lora_updates_query)
199
 
200
  pipeline.unet = unet2
 
296
  def load_unet_loras(self, unet, unet_lora_modules_state_dict,
297
  use_attn_lora=True, use_ffn_lora=False,
298
  attn_lora_layer_names=['q', 'k', 'v', 'out'],
299
+ normalize_cross_attn=False,
300
  q_lora_updates_query=False):
301
  attn_capture_procs, attn_opt_modules = \
302
  set_up_attn_processors(unet, use_attn_lora=True, attn_lora_layer_names=attn_lora_layer_names,
303
  lora_rank=192, lora_scale_down=8,
 
304
  q_lora_updates_query=q_lora_updates_query)
305
  # up_blocks.3.resnets.[1~2].conv1, conv2, conv_shortcut. [12] matches 1 or 2.
306
  if use_ffn_lora:
 
344
  print(f"Loaded {len(unet_lora_modules_state_dict)} LoRA weights on the UNet:\n{unet_lora_modules.keys()}")
345
  self.outfeat_capture_blocks.append(unet.up_blocks[3])
346
 
347
+ # If normalize_cross_attn is True and use_attn_lora is False, we load all these params from ckpt,
348
  # but since we set use_attn_lora to False, attn loras won't be used during inference nonetheless.
349
  set_lora_and_capture_flags(unet, None, self.attn_capture_procs, self.outfeat_capture_blocks,
350
  use_attn_lora, use_ffn_lora, 'recon_loss', capture_ca_activations=False,
351
+ normalize_cross_attn=normalize_cross_attn, mix_attn_mats_in_batch=False,
352
+ res_hidden_states_gradscale=0)
353
 
354
  return unet
355
 
356
  def load_unet_lora_weights(self, unet, use_attn_lora=True, attn_lora_layer_names=['q', 'k', 'v', 'out'],
357
+ normalize_cross_attn=False, q_lora_updates_query=False):
358
  unet_lora_weight_found = False
359
  if isinstance(self.adaface_ckpt_paths, str):
360
  adaface_ckpt_paths = [self.adaface_ckpt_paths]
 
362
  adaface_ckpt_paths = self.adaface_ckpt_paths
363
 
364
  for adaface_ckpt_path in adaface_ckpt_paths:
365
+ ckpt_dict = torch.load(adaface_ckpt_path, map_location='cpu', weights_only=False)
366
  if 'unet_lora_modules' in ckpt_dict:
367
  unet_lora_modules_state_dict = ckpt_dict['unet_lora_modules']
368
  print(f"{len(unet_lora_modules_state_dict)} LoRA weights found in {adaface_ckpt_path}.")
 
381
  unet_ = self.load_unet_loras(unet_, unet_lora_modules_state_dict,
382
  use_attn_lora=use_attn_lora,
383
  attn_lora_layer_names=attn_lora_layer_names,
384
+ normalize_cross_attn=normalize_cross_attn,
385
  q_lora_updates_query=q_lora_updates_query)
386
  unet.unets[i] = unet_
387
  print(f"Loaded LoRA processors on UNetEnsemble of {len(unet.unets)} UNets.")
 
389
  unet = self.load_unet_loras(unet, unet_lora_modules_state_dict,
390
  use_attn_lora=use_attn_lora,
391
  attn_lora_layer_names=attn_lora_layer_names,
392
+ normalize_cross_attn=normalize_cross_attn,
393
  q_lora_updates_query=q_lora_updates_query)
394
 
395
  return unet
 
614
  # Scan prompt and replace tokens in self.placeholder_token_ids
615
  # with the corresponding image embeddings.
616
  prompt_tokens = self.pipeline.tokenizer.tokenize(prompt)
617
+ # prompt_embeds are the ada embeddings.
618
  prompt_embeds2 = prompt_embeds.clone()
619
+ if alt_prompt_embed_type.startswith('img'):
620
  if self.img_prompt_embs is None:
621
  print("Unable to find img_prompt_embs. Either prepare_adaface_embeddings() hasn't been called, or faceless images were used.")
622
  return prompt_embeds
 
631
  breakpoint()
632
 
633
  repl_tokens = {}
634
+ ada_emb_weight = alt_prompt_emb_weights[0]
635
  for i in range(len(prompt_tokens)):
636
  if prompt_tokens[i] in self.all_placeholder_tokens:
637
  encoder_idx = next((i for i, sublist in enumerate(self.encoder_placeholder_tokens) \
638
  if prompt_tokens[i] in sublist), 0)
639
+ alt_prompt_emb_weight = alt_prompt_emb_weights[encoder_idx + 1]
640
+ prompt_embeds2[:, i] = prompt_embeds2[:, i] * ada_emb_weight \
641
  + repl_embeddings[:, self.all_placeholder_tokens.index(prompt_tokens[i])] * alt_prompt_emb_weight
642
  repl_tokens[prompt_tokens[i]] = 1
643
 
644
  repl_token_count = len(repl_tokens)
645
+ if ada_emb_weight == 0:
646
  print(f"Replaced {repl_token_count} tokens with {alt_prompt_embed_type} embeddings.")
647
  else:
648
  print(f"Mixed {repl_token_count} tokens with {alt_prompt_embed_type} embeddings, weight {alt_prompt_emb_weights}.")
 
654
  placeholder_tokens_pos='append',
655
  ablate_prompt_only_placeholders=False,
656
  ablate_prompt_no_placeholders=False,
657
+ ablate_prompt_embed_type='ada', # 'ada', 'ada-nonmix', 'img', 'img1', 'img2'.
658
  nonmix_prompt_emb_weight=0,
659
  repeat_prompt_for_each_encoder=True,
660
  device=None, verbose=False):
 
682
  prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, negative_pooled_prompt_embeds_ = \
683
  self.diffusers_encode_prompts(prompt, plain_prompt, negative_prompt, device)
684
 
685
+ if ablate_prompt_embed_type.startswith('img'):
686
  alt_prompt_embed_type = ablate_prompt_embed_type
687
+ if alt_prompt_embed_type == 'img1':
688
+ # The mixing weights of ada, img1, and img2 are 0, 1, and 0.
689
+ alt_prompt_emb_weights = (0, 1, 0)
690
+ elif alt_prompt_embed_type == 'img2':
691
+ # The mixing weights of ada, img1, and img2 are 0, 0, and 1.
692
+ alt_prompt_emb_weights = (0, 0, 1)
693
+ else:
694
+ # The mixing weights of ada, img1, and img2 are 0, 1, and 1.
695
+ alt_prompt_emb_weights = (0, 1, 1)
696
  elif nonmix_prompt_emb_weight > 0:
697
  alt_prompt_embed_type = 'ada-nonmix'
698
+ # The mixing weight of ada is 1 - nonmix_prompt_emb_weight, instead of 1 - nonmix_prompt_emb_weight * 2.
699
+ # It means ada is mixed by this weight with both img1 and img2.
700
+ alt_prompt_emb_weights = (1 - nonmix_prompt_emb_weight, nonmix_prompt_emb_weight, nonmix_prompt_emb_weight)
701
  else:
702
+ # Don't change the prompt embeddings. So we set all the mixing weights to 0.
703
+ alt_prompt_emb_weights = (0, 0, 0)
704
 
705
  if sum(alt_prompt_emb_weights) > 0:
706
  prompt_embeds_ = self.mix_ada_embs_with_other_embs(prompt, prompt_embeds_,
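
Editor's note on the adaface_wrapper.py hunks above: the alternate-prompt mixing weights change from a per-encoder pair to a 3-tuple (w_ada, w_img1, w_img2), so the ada embedding's own weight is explicit instead of being implied as 1 - w, and torch.load now passes weights_only=False so checkpoints holding non-tensor Python objects keep loading under newer PyTorch defaults. A minimal sketch of the new mixing rule, with illustrative names rather than the repo's exact helpers:

import torch

# Sketch only (not part of the commit): mirrors the (w_ada, w_img1, w_img2)
# weight scheme used by mix_ada_embs_with_other_embs; names are illustrative.
def mix_token_embedding(ada_emb: torch.Tensor, repl_emb: torch.Tensor,
                        encoder_idx: int, weights=(0, 1, 1)) -> torch.Tensor:
    # weights[0] scales the ada embedding of a placeholder token;
    # weights[1 + encoder_idx] scales the replacement (image-prompt) embedding
    # from that encoder (index 0 is consistentID, 1 is arc2face in the default app setup).
    return ada_emb * weights[0] + repl_emb * weights[1 + encoder_idx]

# 'img1' ablation          -> weights (0, 1, 0): keep only encoder-0 image embeddings.
# 'ada-nonmix', weight 0.3 -> weights (0.7, 0.3, 0.3): blend ada with both encoders.
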
adaface/diffusers_attn_lora_capture.py CHANGED
@@ -4,7 +4,6 @@ import torch.nn.functional as F
4
  from typing import Optional, Tuple, Dict, Any
5
  from diffusers.models.attention_processor import Attention, AttnProcessor2_0
6
  from diffusers.utils import logging, is_torch_version, deprecate
7
- from diffusers.utils.torch_utils import fourier_filter
8
  # UNet is a diffusers PeftAdapterMixin instance.
9
  from diffusers.loaders.peft import PeftAdapterMixin
10
  from peft import LoraConfig, get_peft_model
@@ -12,7 +11,6 @@ import peft.tuners.lora as peft_lora
12
  from peft.tuners.lora.dora import DoraLinearLayer
13
  from einops import rearrange
14
  import math, re
15
- import numpy as np
16
  from peft.tuners.tuners_utils import BaseTunerLayer
17
 
18
 
@@ -28,7 +26,7 @@ class ScaleGrad(torch.autograd.Function):
28
  ctx.save_for_backward(alpha_, debug)
29
  output = input_
30
  if debug:
31
- print(f"input: {input_.abs().mean().item()}")
32
  return output
33
 
34
  @staticmethod
@@ -38,7 +36,7 @@ class ScaleGrad(torch.autograd.Function):
38
  if ctx.needs_input_grad[0]:
39
  grad_output2 = grad_output * alpha_
40
  if debug:
41
- print(f"grad_output2: {grad_output2.abs().mean().item()}")
42
  else:
43
  grad_output2 = None
44
  return grad_output2, None, None
@@ -77,36 +75,11 @@ def split_indices_by_instance(indices, as_dict=False):
77
  indices_by_instance = { uib.item(): indices_N[indices_B == uib] for uib in unique_indices_B }
78
  return indices_by_instance
79
 
80
- # If do_sum, returned emb_attns is 3D. Otherwise 4D.
81
- # indices are applied on the first 2 dims of attn_mat.
82
- def sel_emb_attns_by_indices(attn_mat, indices, all_token_weights=None, do_sum=True, do_mean=False):
83
- indices_by_instance = split_indices_by_instance(indices)
84
-
85
- # emb_attns[0]: [1, 9, 8, 64]
86
- # 8: 8 attention heads. Last dim 64: number of image tokens.
87
- emb_attns = [ attn_mat[inst_indices].unsqueeze(0) for inst_indices in indices_by_instance ]
88
- if all_token_weights is not None:
89
- # all_token_weights: [4, 77].
90
- # token_weights_by_instance[0]: [1, 9, 1, 1].
91
- token_weights = [ all_token_weights[inst_indices].reshape(1, -1, 1, 1) for inst_indices in indices_by_instance ]
92
- else:
93
- token_weights = [ 1 ] * len(indices_by_instance)
94
-
95
- # Apply token weights.
96
- emb_attns = [ emb_attns[i] * token_weights[i] for i in range(len(indices_by_instance)) ]
97
-
98
- # sum among K_subj_i subj embeddings -> [1, 8, 64]
99
- if do_sum:
100
- emb_attns = [ emb_attns[i].sum(dim=1) for i in range(len(indices_by_instance)) ]
101
- elif do_mean:
102
- emb_attns = [ emb_attns[i].mean(dim=1) for i in range(len(indices_by_instance)) ]
103
-
104
- emb_attns = torch.cat(emb_attns, dim=0)
105
- return emb_attns
106
-
107
  # Slow implementation equivalent to F.scaled_dot_product_attention.
108
- def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0,
109
- shrink_cross_attn=False, cross_attn_shrink_factor=0.5,
 
 
110
  is_causal=False, scale=None, enable_gqa=False) -> torch.Tensor:
111
  B, L, S = query.size(0), query.size(-2), key.size(-2)
112
  scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
@@ -128,21 +101,39 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.
128
  key = key.repeat_interleave(query.size(-3)//key.size(-3), -3)
129
  value = value.repeat_interleave(query.size(-3)//value.size(-3), -3)
130
 
131
- attn_weight = query @ key.transpose(-2, -1) * scale_factor
132
 
133
- if shrink_cross_attn:
134
- cross_attn_scale = cross_attn_shrink_factor
135
- else:
136
- cross_attn_scale = 1
137
-
138
- # attn_bias: [1, 1, 4096, 77], the same size as a single-head attn_weight.
139
- attn_weight += attn_bias
140
- attn_score = attn_weight
141
- attn_weight = torch.softmax(attn_weight, dim=-1)
142
- # NOTE: After scaling, the "probabilities" of the subject embeddings will sum to < 1.
143
- # But this is intended, as we want to scale down the impact of the subject embeddings
144
- # in the computed attention output tensors.
145
- attn_weight = attn_weight * cross_attn_scale
146
  attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
147
  output = attn_weight @ value
148
  return output, attn_score, attn_weight
@@ -156,23 +147,25 @@ class AttnProcessor_LoRA_Capture(nn.Module):
156
  def __init__(self, capture_ca_activations: bool = False, enable_lora: bool = False,
157
  lora_uses_dora=True, lora_proj_layers=None,
158
  lora_rank: int = 192, lora_alpha: float = 16,
159
- cross_attn_shrink_factor: float = 0.5,
160
  q_lora_updates_query=False, attn_proc_idx=-1):
161
  super().__init__()
162
 
163
  self.global_enable_lora = enable_lora
164
  self.attn_proc_idx = attn_proc_idx
165
  # reset_attn_cache_and_flags() sets the local (call-specific) self.enable_lora flag.
166
- # By default, shrink_cross_attn is False. Later in layers 22, 23, 24 it will be set to True.
167
- self.reset_attn_cache_and_flags(capture_ca_activations, False, enable_lora)
168
  self.lora_rank = lora_rank
169
  self.lora_alpha = lora_alpha
170
  self.lora_scale = self.lora_alpha / self.lora_rank
171
- self.cross_attn_shrink_factor = cross_attn_shrink_factor
172
  self.q_lora_updates_query = q_lora_updates_query
173
 
174
  self.to_q_lora = self.to_k_lora = self.to_v_lora = self.to_out_lora = None
175
  if self.global_enable_lora:
176
  for lora_layer_name, lora_proj_layer in lora_proj_layers.items():
177
  if lora_layer_name == 'q':
178
  self.to_q_lora = peft_lora.Linear(lora_proj_layer, 'default', r=lora_rank, lora_alpha=lora_alpha,
@@ -188,9 +181,10 @@ class AttnProcessor_LoRA_Capture(nn.Module):
188
  use_dora=lora_uses_dora, lora_dropout=0.1)
189
 
190
  # LoRA layers can be enabled/disabled dynamically.
191
- def reset_attn_cache_and_flags(self, capture_ca_activations, shrink_cross_attn, enable_lora):
192
  self.capture_ca_activations = capture_ca_activations
193
- self.shrink_cross_attn = shrink_cross_attn
 
194
  self.cached_activations = {}
195
  # Only enable LoRA for the next call(s) if global_enable_lora is set to True.
196
  self.enable_lora = enable_lora and self.global_enable_lora
@@ -312,11 +306,14 @@ class AttnProcessor_LoRA_Capture(nn.Module):
312
  breakpoint()
313
 
314
  # the output of sdp = (batch, num_heads, seq_len, head_dim)
315
- if is_cross_attn and (self.capture_ca_activations or self.shrink_cross_attn):
316
  hidden_states, attn_score, attn_prob = \
317
  scaled_dot_product_attention(query, key, value, attn_mask=attention_mask,
318
- dropout_p=0.0, shrink_cross_attn=self.shrink_cross_attn,
319
- cross_attn_shrink_factor=self.cross_attn_shrink_factor)
320
  else:
321
  # Use the faster implementation of scaled_dot_product_attention
322
  # when not capturing the activations or suppressing the subject attention.
@@ -452,7 +449,7 @@ def CrossAttnUpBlock2D_forward_capture(
452
  # Adapted from ConsistentIDPipeline:set_ip_adapter().
453
  # attn_lora_layer_names: candidates are subsets of ['q', 'k', 'v', 'out'].
454
  def set_up_attn_processors(unet, use_attn_lora, attn_lora_layer_names=['q', 'k', 'v', 'out'],
455
- lora_rank=192, lora_scale_down=8, cross_attn_shrink_factor=0.5,
456
  q_lora_updates_query=False):
457
  attn_procs = {}
458
  attn_capture_procs = {}
@@ -502,7 +499,6 @@ def set_up_attn_processors(unet, use_attn_lora, attn_lora_layer_names=['q', 'k',
502
  lora_uses_dora=True, lora_proj_layers=lora_proj_layers,
503
  # LoRA up is initialized to 0. So no need to worry that the LoRA output may be too large.
504
  lora_rank=lora_rank, lora_alpha=lora_rank // lora_scale_down,
505
- cross_attn_shrink_factor=cross_attn_shrink_factor,
506
  q_lora_updates_query=q_lora_updates_query, attn_proc_idx=attn_proc_idx)
507
 
508
  attn_proc_idx += 1
@@ -513,6 +509,11 @@ def set_up_attn_processors(unet, use_attn_lora, attn_lora_layer_names=['q', 'k',
513
  attn_capture_procs[name] = attn_capture_proc
514
 
515
  if use_attn_lora:
516
  for subname, module in attn_capture_proc.named_modules():
517
  if isinstance(module, peft_lora.LoraLayer):
518
  # ModuleDict doesn't allow "." in the key.
@@ -537,7 +538,7 @@ def set_up_attn_processors(unet, use_attn_lora, attn_lora_layer_names=['q', 'k',
537
  return attn_capture_procs, attn_opt_modules
538
 
539
  # NOTE: cross-attn layers are included in the returned lora_modules.
540
- def set_up_ffn_loras(unet, target_modules_pat, lora_uses_dora=False, lora_rank=192, lora_alpha=16):
541
  # target_modules_pat = 'up_blocks.3.resnets.[12].conv[a-z0-9_]+'
542
  # up_blocks.3.resnets.[1~2].conv1, conv2, conv_shortcut
543
  # Cannot set to conv.+ as it will match added adapter module names, including
@@ -592,15 +593,18 @@ def set_up_ffn_loras(unet, target_modules_pat, lora_uses_dora=False, lora_rank=1
592
  def set_lora_and_capture_flags(unet, unet_lora_modules, attn_capture_procs,
593
  outfeat_capture_blocks, res_hidden_states_gradscale_blocks,
594
  use_attn_lora, use_ffn_lora, ffn_lora_adapter_name, capture_ca_activations,
595
- shrink_cross_attn, res_hidden_states_gradscale):
596
  # For attn capture procs, capture_ca_activations and use_attn_lora are set in reset_attn_cache_and_flags().
597
- for attn_capture_proc in attn_capture_procs:
598
- attn_capture_proc.reset_attn_cache_and_flags(capture_ca_activations, shrink_cross_attn, enable_lora=use_attn_lora)
 
599
  # outfeat_capture_blocks only contains the last up block, up_blocks[3].
600
  # It contains 3 FFN layers. We want to capture their output features.
601
  for block in outfeat_capture_blocks:
602
  block.capture_outfeats = capture_ca_activations
603
 
 
 
604
  for block in res_hidden_states_gradscale_blocks:
605
  block.res_hidden_states_gradscale = res_hidden_states_gradscale
606
 
@@ -639,6 +643,7 @@ def get_captured_activations(capture_ca_activations, attn_capture_procs, outfeat
639
  block.cached_outfeats = {}
640
  block.capture_outfeats = False
641
 
 
642
  for layer_idx in captured_layer_indices:
643
  # Subtract 22 to ca_layer_idx to match the layer index in up_blocks[3].cached_outfeats.
644
  # 23, 24 -> 1, 2 (!! not 0, 1 !!)
 
4
  from typing import Optional, Tuple, Dict, Any
5
  from diffusers.models.attention_processor import Attention, AttnProcessor2_0
6
  from diffusers.utils import logging, is_torch_version, deprecate
 
7
  # UNet is a diffusers PeftAdapterMixin instance.
8
  from diffusers.loaders.peft import PeftAdapterMixin
9
  from peft import LoraConfig, get_peft_model
 
11
  from peft.tuners.lora.dora import DoraLinearLayer
12
  from einops import rearrange
13
  import math, re
 
14
  from peft.tuners.tuners_utils import BaseTunerLayer
15
 
16
 
 
26
  ctx.save_for_backward(alpha_, debug)
27
  output = input_
28
  if debug:
29
+ print(f"input: {input_.abs().mean().detach().item()}")
30
  return output
31
 
32
  @staticmethod
 
36
  if ctx.needs_input_grad[0]:
37
  grad_output2 = grad_output * alpha_
38
  if debug:
39
+ print(f"grad_output2: {grad_output2.abs().mean().detach().item()}")
40
  else:
41
  grad_output2 = None
42
  return grad_output2, None, None
 
75
  indices_by_instance = { uib.item(): indices_N[indices_B == uib] for uib in unique_indices_B }
76
  return indices_by_instance
77
 
78
  # Slow implementation equivalent to F.scaled_dot_product_attention.
79
+ def scaled_dot_product_attention(query, key, value, cross_attn_scale_factor,
80
+ attn_mask=None, dropout_p=0.0,
81
+ subj_indices=None, normalize_cross_attn=False,
82
+ mix_attn_mats_in_batch=False,
83
  is_causal=False, scale=None, enable_gqa=False) -> torch.Tensor:
84
  B, L, S = query.size(0), query.size(-2), key.size(-2)
85
  scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
 
101
  key = key.repeat_interleave(query.size(-3)//key.size(-3), -3)
102
  value = value.repeat_interleave(query.size(-3)//value.size(-3), -3)
103
 
104
+ attn_score = query @ key.transpose(-2, -1) * scale_factor
105
 
106
+ # attn_bias: [1, 1, 4096, 77], the same size as a single-head attn_score.
107
+ attn_score += attn_bias
108
+ if mix_attn_mats_in_batch:
109
+ # The instances in the batch are [sc, mc]. We average their attn scores,
110
+ # and apply to both instances.
111
+ # attn_score: [2, 8, 4096, 77] -> [1, 8, 4096, 77] -> [2, 8, 4096, 77].
112
+ # If BLOCK_SIZE > 1, attn_score.shape[0] = 2 * BLOCK_SIZE.
113
+ if attn_score.shape[0] %2 != 0:
114
+ breakpoint()
115
+ attn_score_sc, attn_score_mc = attn_score.chunk(2, dim=0)
116
+ # Cut off the grad flow from the SC instance to the MC instance.
117
+ attn_score = (attn_score_sc + attn_score_mc.detach()) / 2
118
+ attn_score = attn_score.repeat(2, 1, 1, 1)
119
+ elif normalize_cross_attn:
120
+ if subj_indices is None:
121
+ breakpoint()
122
+ subj_indices_B, subj_indices_N = subj_indices
123
+ subj_attn_score = attn_score[subj_indices_B, :, :, subj_indices_N]
124
+ # Normalize the attention score of the subject tokens to have mean 0 across tokens,
125
+ # so that positive and negative scores are balanced.
126
+ subj_attn_score = subj_attn_score - subj_attn_score.mean(dim=2, keepdim=True).detach()
127
+ # cross_attn_scale is a learnable parameter, so the score will be scaled appropriately.
128
+ # Scale up the BP'ed gradient to cross_attn_scale_factor by 10x.
129
+ ca_scale_grad_scaler = gen_gradient_scaler(10)
130
+ subj_attn_score = subj_attn_score * ca_scale_grad_scaler(cross_attn_scale_factor)
131
+ attn_score2 = attn_score.clone()
132
+ attn_score2[subj_indices_B, :, :, subj_indices_N] = subj_attn_score
133
+ attn_score = attn_score2
134
+ # Otherwise, do nothing to attn_score.
135
+
136
+ attn_weight = torch.softmax(attn_score, dim=-1)
137
  attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
138
  output = attn_weight @ value
139
  return output, attn_score, attn_weight
 
147
  def __init__(self, capture_ca_activations: bool = False, enable_lora: bool = False,
148
  lora_uses_dora=True, lora_proj_layers=None,
149
  lora_rank: int = 192, lora_alpha: float = 16,
 
150
  q_lora_updates_query=False, attn_proc_idx=-1):
151
  super().__init__()
152
 
153
  self.global_enable_lora = enable_lora
154
  self.attn_proc_idx = attn_proc_idx
155
  # reset_attn_cache_and_flags() sets the local (call-specific) self.enable_lora flag.
156
+ # By default, normalize_cross_attn is False. Later in layers 22, 23, 24 it will be set to True.
157
+ self.reset_attn_cache_and_flags(capture_ca_activations, False, False, enable_lora)
158
  self.lora_rank = lora_rank
159
  self.lora_alpha = lora_alpha
160
  self.lora_scale = self.lora_alpha / self.lora_rank
 
161
  self.q_lora_updates_query = q_lora_updates_query
162
 
163
  self.to_q_lora = self.to_k_lora = self.to_v_lora = self.to_out_lora = None
164
  if self.global_enable_lora:
165
+ # enable_lora = True iff this is a cross-attn layer in the last 3 up blocks.
166
+ # Since we only use cross_attn_scale_factor on cross-attn layers,
167
+ # we only use cross_attn_scale_factor when enable_lora is True.
168
+ self.cross_attn_scale_factor = nn.Parameter(torch.tensor(0.8), requires_grad=True)
169
  for lora_layer_name, lora_proj_layer in lora_proj_layers.items():
170
  if lora_layer_name == 'q':
171
  self.to_q_lora = peft_lora.Linear(lora_proj_layer, 'default', r=lora_rank, lora_alpha=lora_alpha,
 
181
  use_dora=lora_uses_dora, lora_dropout=0.1)
182
 
183
  # LoRA layers can be enabled/disabled dynamically.
184
+ def reset_attn_cache_and_flags(self, capture_ca_activations, normalize_cross_attn, mix_attn_mats_in_batch, enable_lora):
185
  self.capture_ca_activations = capture_ca_activations
186
+ self.normalize_cross_attn = normalize_cross_attn
187
+ self.mix_attn_mats_in_batch = mix_attn_mats_in_batch
188
  self.cached_activations = {}
189
  # Only enable LoRA for the next call(s) if global_enable_lora is set to True.
190
  self.enable_lora = enable_lora and self.global_enable_lora
 
306
  breakpoint()
307
 
308
  # the output of sdp = (batch, num_heads, seq_len, head_dim)
309
+ if is_cross_attn and (self.capture_ca_activations or self.normalize_cross_attn):
310
  hidden_states, attn_score, attn_prob = \
311
  scaled_dot_product_attention(query, key, value, attn_mask=attention_mask,
312
+ dropout_p=0.0, subj_indices=subj_indices,
313
+ normalize_cross_attn=self.normalize_cross_attn,
314
+ cross_attn_scale_factor=self.cross_attn_scale_factor,
315
+ mix_attn_mats_in_batch=self.mix_attn_mats_in_batch)
316
+
317
  else:
318
  # Use the faster implementation of scaled_dot_product_attention
319
  # when not capturing the activations or suppressing the subject attention.
 
449
  # Adapted from ConsistentIDPipeline:set_ip_adapter().
450
  # attn_lora_layer_names: candidates are subsets of ['q', 'k', 'v', 'out'].
451
  def set_up_attn_processors(unet, use_attn_lora, attn_lora_layer_names=['q', 'k', 'v', 'out'],
452
+ lora_rank=192, lora_scale_down=8,
453
  q_lora_updates_query=False):
454
  attn_procs = {}
455
  attn_capture_procs = {}
 
499
  lora_uses_dora=True, lora_proj_layers=lora_proj_layers,
500
  # LoRA up is initialized to 0. So no need to worry that the LoRA output may be too large.
501
  lora_rank=lora_rank, lora_alpha=lora_rank // lora_scale_down,
 
502
  q_lora_updates_query=q_lora_updates_query, attn_proc_idx=attn_proc_idx)
503
 
504
  attn_proc_idx += 1
 
509
  attn_capture_procs[name] = attn_capture_proc
510
 
511
  if use_attn_lora:
512
+ cross_attn_scale_factor_name = name + "_cross_attn_scale_factor"
513
+ # Put cross_attn_scale_factor in attn_opt_modules, so that we can optimize and save/load it.
514
+ attn_opt_modules[cross_attn_scale_factor_name] = attn_capture_proc.cross_attn_scale_factor
515
+
516
+ # Put LoRA layers in attn_opt_modules, so that we can optimize and save/load them.
517
  for subname, module in attn_capture_proc.named_modules():
518
  if isinstance(module, peft_lora.LoraLayer):
519
  # ModuleDict doesn't allow "." in the key.
 
538
  return attn_capture_procs, attn_opt_modules
539
 
540
  # NOTE: cross-attn layers are included in the returned lora_modules.
541
+ def set_up_ffn_loras(unet, target_modules_pat, lora_uses_dora=True, lora_rank=192, lora_alpha=16):
542
  # target_modules_pat = 'up_blocks.3.resnets.[12].conv[a-z0-9_]+'
543
  # up_blocks.3.resnets.[1~2].conv1, conv2, conv_shortcut
544
  # Cannot set to conv.+ as it will match added adapter module names, including
 
593
  def set_lora_and_capture_flags(unet, unet_lora_modules, attn_capture_procs,
594
  outfeat_capture_blocks, res_hidden_states_gradscale_blocks,
595
  use_attn_lora, use_ffn_lora, ffn_lora_adapter_name, capture_ca_activations,
596
+ normalize_cross_attn, mix_attn_mats_in_batch, res_hidden_states_gradscale):
597
  # For attn capture procs, capture_ca_activations and use_attn_lora are set in reset_attn_cache_and_flags().
598
+ for i, attn_capture_proc in enumerate(attn_capture_procs):
599
+ attn_capture_proc.reset_attn_cache_and_flags(capture_ca_activations, normalize_cross_attn, mix_attn_mats_in_batch,
600
+ enable_lora=use_attn_lora)
601
  # outfeat_capture_blocks only contains the last up block, up_blocks[3].
602
  # It contains 3 FFN layers. We want to capture their output features.
603
  for block in outfeat_capture_blocks:
604
  block.capture_outfeats = capture_ca_activations
605
 
606
+ # res_hidden_states_gradscale_blocks contain the second to the last up blocks, up_blocks[1:].
607
+ # It's only used to set res_hidden_states_gradscale, and doesn't capture anything.
608
  for block in res_hidden_states_gradscale_blocks:
609
  block.res_hidden_states_gradscale = res_hidden_states_gradscale
610
 
 
643
  block.cached_outfeats = {}
644
  block.capture_outfeats = False
645
 
646
+
647
  for layer_idx in captured_layer_indices:
648
  # Subtract 22 to ca_layer_idx to match the layer index in up_blocks[3].cached_outfeats.
649
  # 23, 24 -> 1, 2 (!! not 0, 1 !!)
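
The diffusers_attn_lora_capture.py hunks drop the fixed shrink_cross_attn / cross_attn_shrink_factor scaling in favor of normalize_cross_attn with a learnable per-processor cross_attn_scale_factor, plus an optional mix_attn_mats_in_batch mode that averages the sc/mc halves of a batch. A simplified sketch of the normalization idea, assuming a single attention map; the per-instance subject-token indexing and the 10x gradient scaler on the scale factor are simplified or omitted here:

import torch

# Sketch only (not part of the commit); shapes and names are illustrative.
def normalize_subject_scores(attn_score: torch.Tensor,
                             subj_token_idx: torch.Tensor,
                             cross_attn_scale: torch.Tensor) -> torch.Tensor:
    # attn_score: [B, heads, L_query, L_text] pre-softmax cross-attn scores.
    # subj_token_idx: 1-D long tensor of subject placeholder token columns.
    # cross_attn_scale: learnable scalar, e.g. nn.Parameter(torch.tensor(0.8)).
    score = attn_score.clone()
    subj = score[..., subj_token_idx]                  # [B, heads, L_query, n_subj]
    # Zero-center the subject-token scores over query positions so positive and
    # negative logits balance, then rescale them with the learnable factor.
    subj = subj - subj.mean(dim=-2, keepdim=True).detach()
    score[..., subj_token_idx] = subj * cross_attn_scale
    return torch.softmax(score, dim=-1)

In the committed processors the scale factor is an nn.Parameter on each AttnProcessor_LoRA_Capture, registered under the name + "_cross_attn_scale_factor" key in attn_opt_modules so it is optimized and saved/loaded alongside the LoRA layers.
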
adaface/face_id_to_ada_prompt.py CHANGED
@@ -26,7 +26,7 @@ def create_id2ada_prompt_encoder(adaface_encoder_types, adaface_ckpt_paths=None,
26
  if adaface_encoder_type == 'arc2face':
27
  id2ada_prompt_encoder = \
28
  Arc2Face_ID2AdaPrompt(adaface_ckpt_path=adaface_ckpt_path,
29
- *args, **kwargs)
30
  elif adaface_encoder_type == 'consistentID':
31
  id2ada_prompt_encoder = \
32
  ConsistentID_ID2AdaPrompt(pipe=None,
@@ -64,6 +64,7 @@ class FaceID2AdaPrompt(nn.Module):
64
  # i.e., 6 for arc2face and 1 for consistentID.
65
  self.out_id_embs_cfg_scale = kwargs.get('out_id_embs_cfg_scale', -1)
66
  self.is_training = kwargs.get('is_training', False)
 
67
  # extend_prompt2token_proj_attention_multiplier is an integer >= 1.
68
  # TODO: extend_prompt2token_proj_attention_multiplier should be a list of integers.
69
  self.extend_prompt2token_proj_attention_multiplier = kwargs.get('extend_prompt2token_proj_attention_multiplier', 1)
@@ -603,9 +604,13 @@ class Arc2Face_ID2AdaPrompt(FaceID2AdaPrompt):
603
  '''
604
  # Use the same model as ID2AdaPrompt does.
605
  # FaceAnalysis will try to find the ckpt in: models/insightface/models/antelopev2.
606
- # Note there are two "models" in the path.
 
607
  self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
608
- providers=['CPUExecutionProvider'])
609
  self.face_app.prepare(ctx_id=0, det_size=(512, 512))
610
  print(f'Arc2Face Face encoder loaded on CPU.')
611
 
@@ -642,7 +647,6 @@ class Arc2Face_ID2AdaPrompt(FaceID2AdaPrompt):
642
 
643
  def _apply(self, fn):
644
  super()._apply(fn) # Call the parent _apply to handle parameters and buffers
645
- return
646
  # A dirty hack to get the device of the model, passed from
647
  # parent.model.to(self.root_device) => parent._apply(convert) => module._apply(fn)
648
  test_tensor = torch.zeros(1) # Create a test tensor
@@ -651,22 +655,24 @@ class Arc2Face_ID2AdaPrompt(FaceID2AdaPrompt):
651
  # No need to reload face_app on the same device.
652
  if device == self.device:
653
  return
654
 
655
  if str(device) == 'cpu':
656
  self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
657
- providers=['CPUExecutionProvider'])
658
  self.face_app.prepare(ctx_id=0, det_size=(512, 512))
659
  else:
660
  device_id = device.index
661
  self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
662
  providers=['CUDAExecutionProvider'],
663
- provider_options=[{"device_id": device_id,
664
- "cudnn_conv_algo_search": "HEURISTIC",
665
- "gpu_mem_limit": 2 * 1024**3
666
- }])
667
  self.face_app.prepare(ctx_id=device_id, det_size=(512, 512))
668
 
669
- self.device = device
670
  print(f'Arc2Face Face encoder reloaded on {device}.')
671
  return
672
 
@@ -739,8 +745,8 @@ class ConsistentID_ID2AdaPrompt(FaceID2AdaPrompt):
739
  # but diffusers will call .to(dtype) in .from_single_file(),
740
  # and at that moment, the consistentID specific modules are not loaded yet.
741
  pipe = ConsistentIDPipeline.from_single_file(base_model_path)
742
- pipe.load_ConsistentID_model(consistentID_weight_path="./models/ConsistentID/ConsistentID-v1.bin",
743
- bise_net_weight_path="./models/ConsistentID/BiSeNet_pretrained_for_ConsistentID.pth")
744
  pipe.to(dtype=self.dtype)
745
  # Since the passed-in pipe is None, this should be called during inference,
746
  # when the teacher ConsistentIDPipeline is not initialized.
@@ -791,7 +797,6 @@ class ConsistentID_ID2AdaPrompt(FaceID2AdaPrompt):
791
 
792
  def _apply(self, fn):
793
  super()._apply(fn) # Call the parent _apply to handle parameters and buffers
794
- return
795
  # A dirty hack to get the device of the model, passed from
796
  # parent.model.to(self.root_device) => parent._apply(convert) => module._apply(fn)
797
  test_tensor = torch.zeros(1) # Create a test tensor
@@ -800,6 +805,11 @@ class ConsistentID_ID2AdaPrompt(FaceID2AdaPrompt):
800
  # No need to reload face_app on the same device.
801
  if device == self.device:
802
  return
803
 
804
  if str(device) == 'cpu':
805
  self.face_app = FaceAnalysis(name='buffalo_l', root='models/insightface',
@@ -809,13 +819,10 @@ class ConsistentID_ID2AdaPrompt(FaceID2AdaPrompt):
809
  device_id = device.index
810
  self.face_app = FaceAnalysis(name='buffalo_l', root='models/insightface',
811
  providers=['CUDAExecutionProvider'],
812
- provider_options=[{"device_id": device_id,
813
- "cudnn_conv_algo_search": "HEURISTIC",
814
- "gpu_mem_limit": 2 * 1024**3
815
- }])
816
  self.face_app.prepare(ctx_id=device_id, det_size=(512, 512))
817
 
818
- self.device = device
819
  self.pipe.face_app = self.face_app
820
  print(f'ConsistentID Face encoder reloaded on {device}.')
821
 
@@ -1277,7 +1284,7 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
1277
  # No faces are found in the images, so return None embeddings.
1278
  # We don't want to return an all-zero embedding, which is useless.
1279
  if num_available_id_vecs == 0:
1280
- return None, [0]
1281
 
1282
  # If id2ada_prompt_encoders are ["arc2face", "consistentID"], then
1283
  # during inference, we average across the batch dim.
 
26
  if adaface_encoder_type == 'arc2face':
27
  id2ada_prompt_encoder = \
28
  Arc2Face_ID2AdaPrompt(adaface_ckpt_path=adaface_ckpt_path,
29
+ *args, **kwargs)
30
  elif adaface_encoder_type == 'consistentID':
31
  id2ada_prompt_encoder = \
32
  ConsistentID_ID2AdaPrompt(pipe=None,
 
64
  # i.e., 6 for arc2face and 1 for consistentID.
65
  self.out_id_embs_cfg_scale = kwargs.get('out_id_embs_cfg_scale', -1)
66
  self.is_training = kwargs.get('is_training', False)
67
+ self.is_on_hf_space = kwargs.get('is_on_hf_space', False)
68
  # extend_prompt2token_proj_attention_multiplier is an integer >= 1.
69
  # TODO: extend_prompt2token_proj_attention_multiplier should be a list of integers.
70
  self.extend_prompt2token_proj_attention_multiplier = kwargs.get('extend_prompt2token_proj_attention_multiplier', 1)
 
604
  '''
605
  # Use the same model as ID2AdaPrompt does.
606
  # FaceAnalysis will try to find the ckpt in: models/insightface/models/antelopev2.
607
+ # Note there's a second "model" in the path.
608
+ # Note DO use CUDAExecutionProvider during training and CPUExecutionProvider during inference.
609
+ # Otherwise, CPUExecutionProvider will hang DDP training,
610
+ # and CUDAExecutionProvider will cause OOM on huggingface spaces.
611
+ self.onnx_providers = ['CUDAExecutionProvider'] if self.is_training else ['CPUExecutionProvider']
612
  self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
613
+ providers=self.onnx_providers)
614
  self.face_app.prepare(ctx_id=0, det_size=(512, 512))
615
  print(f'Arc2Face Face encoder loaded on CPU.')
616
 
 
647
 
648
  def _apply(self, fn):
649
  super()._apply(fn) # Call the parent _apply to handle parameters and buffers
 
650
  # A dirty hack to get the device of the model, passed from
651
  # parent.model.to(self.root_device) => parent._apply(convert) => module._apply(fn)
652
  test_tensor = torch.zeros(1) # Create a test tensor
 
655
  # No need to reload face_app on the same device.
656
  if device == self.device:
657
  return
658
+ self.device = device
659
+
660
+ if self.is_on_hf_space and self.face_app is not None:
661
+ print(f'On HF space. Arc2Face Face encoder already loaded on cpu.')
662
+ return
663
 
664
  if str(device) == 'cpu':
665
  self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
666
+ providers=['CPUExecutionProvider'])
667
  self.face_app.prepare(ctx_id=0, det_size=(512, 512))
668
  else:
669
  device_id = device.index
670
  self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
671
  providers=['CUDAExecutionProvider'],
672
+ provider_options=[{'device_id': device_id,
673
+ 'cudnn_conv_algo_search': 'HEURISTIC'}])
 
 
674
  self.face_app.prepare(ctx_id=device_id, det_size=(512, 512))
675
 
 
676
  print(f'Arc2Face Face encoder reloaded on {device}.')
677
  return
678
 
 
745
  # but diffusers will call .to(dtype) in .from_single_file(),
746
  # and at that moment, the consistentID specific modules are not loaded yet.
747
  pipe = ConsistentIDPipeline.from_single_file(base_model_path)
748
+ pipe.load_ConsistentID_model(consistentID_weight_path="models/ConsistentID/ConsistentID-v1.bin",
749
+ bise_net_weight_path="models/ConsistentID/BiSeNet_pretrained_for_ConsistentID.pth")
750
  pipe.to(dtype=self.dtype)
751
  # Since the passed-in pipe is None, this should be called during inference,
752
  # when the teacher ConsistentIDPipeline is not initialized.
 
797
 
798
  def _apply(self, fn):
799
  super()._apply(fn) # Call the parent _apply to handle parameters and buffers
 
800
  # A dirty hack to get the device of the model, passed from
801
  # parent.model.to(self.root_device) => parent._apply(convert) => module._apply(fn)
802
  test_tensor = torch.zeros(1) # Create a test tensor
 
805
  # No need to reload face_app on the same device.
806
  if device == self.device:
807
  return
808
+ self.device = device
809
+
810
+ if self.is_on_hf_space and self.face_app is not None:
811
+ print(f'On HF space. ConsistentID Face encoder already loaded on cpu.')
812
+ return
813
 
814
  if str(device) == 'cpu':
815
  self.face_app = FaceAnalysis(name='buffalo_l', root='models/insightface',
 
819
  device_id = device.index
820
  self.face_app = FaceAnalysis(name='buffalo_l', root='models/insightface',
821
  providers=['CUDAExecutionProvider'],
822
+ provider_options=[{'device_id': device_id,
823
+ 'cudnn_conv_algo_search': 'HEURISTIC'}])
 
 
824
  self.face_app.prepare(ctx_id=device_id, det_size=(512, 512))
825
 
 
826
  self.pipe.face_app = self.face_app
827
  print(f'ConsistentID Face encoder reloaded on {device}.')
828
 
 
1284
  # No faces are found in the images, so return None embeddings.
1285
  # We don't want to return an all-zero embedding, which is useless.
1286
  if num_available_id_vecs == 0:
1287
+ return None, None, [0]
1288
 
1289
  # If id2ada_prompt_encoders are ["arc2face", "consistentID"], then
1290
  # during inference, we average across the batch dim.
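
The face_id_to_ada_prompt.py hunks mostly decide where the insightface detector runs: CUDAExecutionProvider during training (the CPU provider can hang DDP), CPUExecutionProvider for inference on HF Spaces (the CUDA provider can OOM there), and no reload at all when the wrapper is moved between devices on a Space. A condensed sketch of that provider choice, using the same FaceAnalysis arguments as the diff (illustrative, not the exact class method):

from insightface.app import FaceAnalysis

# Sketch only (not part of the commit).
def build_face_app(is_training: bool, device_id: int = 0) -> FaceAnalysis:
    if is_training:
        # Training path: ONNX Runtime on the GPU that hosts this DDP rank.
        face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
                                providers=['CUDAExecutionProvider'],
                                provider_options=[{'device_id': device_id,
                                                   'cudnn_conv_algo_search': 'HEURISTIC'}])
        face_app.prepare(ctx_id=device_id, det_size=(512, 512))
    else:
        # Inference / HF Space path: keep the detector on CPU.
        face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
                                providers=['CPUExecutionProvider'])
        face_app.prepare(ctx_id=0, det_size=(512, 512))
    return face_app
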
adaface/unet_teachers.py CHANGED
@@ -62,46 +62,41 @@ class UNetTeacher(nn.Module):
62
  # t: the initial t. We will sample additional (num_denoising_steps - 1) smaller t.
63
  # same_t_noise_across_instances: when sampling t and noise, use the same t and noise for all instances.
64
  def forward(self, ddpm_model, x_start, noise, t, teacher_context, negative_context=None,
65
- num_denoising_steps=1, same_t_noise_across_instances=False,
66
  global_t_lb=0, global_t_ub=1000):
67
  assert num_denoising_steps <= 10
68
 
69
- if self.p_uses_cfg > 0:
70
  self.uses_cfg = np.random.rand() < self.p_uses_cfg
71
- if self.uses_cfg:
72
- # Randomly sample a cfg_scale from cfg_scale_range.
73
- self.cfg_scale = np.random.uniform(*self.cfg_scale_range)
74
- if self.cfg_scale == 1:
75
- self.uses_cfg = False
76
-
77
- if self.uses_cfg:
78
- print(f"Teacher samples CFG scale {self.cfg_scale:.1f}.")
79
- if negative_context is not None:
80
- negative_context = negative_context[:1].repeat(x_start.shape[0], 1, 1)
81
-
82
- # if negative_context is None, then teacher_context is a combination of
83
- # (one or multiple if unet_ensemble) pos_context and neg_context.
84
- # If negative_context is not None, then teacher_context is only pos_context.
85
- else:
86
- self.cfg_scale = 1
87
- print("Teacher does not use CFG.")
88
-
89
- # If negative_context is None, then teacher_context is a combination of
90
- # (one or multiple if unet_ensemble) pos_context and neg_context.
91
- # Since not uses_cfg, we only need pos_context.
92
- # If negative_context is not None, then teacher_context is only pos_context.
93
- if negative_context is None:
94
- teacher_context = self.extract_pos_context(teacher_context, x_start.shape[0])
95
  else:
96
  # p_uses_cfg = 0. Never use CFG.
97
  self.uses_cfg = False
98
- # In this case, the student only passes pos_context to the teacher,
99
- # so no need to split teacher_context into pos_context and neg_context.
100
- # self.cfg_scale will be accessed by the student,
101
- # so we need to make sure it is always set correctly,
102
- # in case someday we want to switch from CFG to non-CFG during runtime.
103
  self.cfg_scale = 1
104
 
 
105
  is_context_doubled = 2 if (self.uses_cfg and negative_context is None) else 1
106
  if self.name == 'unet_ensemble':
107
  # teacher_context is a list of teacher contexts.
@@ -199,14 +194,20 @@ class UNetTeacher(nn.Module):
199
  teacher_pos_contexts = []
200
  # teacher_context is a list of teacher contexts.
201
  for teacher_context_i in teacher_context:
202
- pos_context, neg_context = torch.chunk(teacher_context_i, 2, dim=0)
203
- if pos_context.shape[0] != BS:
204
- breakpoint()
205
  teacher_pos_contexts.append(pos_context)
206
  teacher_context = teacher_pos_contexts
207
  else:
208
- pos_context, neg_context = torch.chunk(teacher_context, 2, dim=0)
209
- if pos_context.shape[0] != BS:
210
  breakpoint()
211
  teacher_context = pos_context
212
 
 
62
  # t: the initial t. We will sample additional (num_denoising_steps - 1) smaller t.
63
  # same_t_noise_across_instances: when sampling t and noise, use the same t and noise for all instances.
64
  def forward(self, ddpm_model, x_start, noise, t, teacher_context, negative_context=None,
65
+ num_denoising_steps=1, force_uses_cfg=False, same_t_noise_across_instances=False,
66
  global_t_lb=0, global_t_ub=1000):
67
  assert num_denoising_steps <= 10
68
 
69
+ # force_uses_cfg overrides p_uses_cfg.
70
+ if force_uses_cfg > 0:
71
+ self.uses_cfg = True
72
+ elif self.p_uses_cfg > 0:
73
  self.uses_cfg = np.random.rand() < self.p_uses_cfg
74
  else:
75
  # p_uses_cfg = 0. Never use CFG.
76
  self.uses_cfg = False
 
77
  self.cfg_scale = 1
78
 
79
+ if self.uses_cfg:
80
+ # Randomly sample a cfg_scale from cfg_scale_range.
81
+ self.cfg_scale = np.random.uniform(*self.cfg_scale_range)
82
+ print(f"Teacher samples CFG scale {self.cfg_scale:.1f}.")
83
+ if negative_context is not None:
84
+ negative_context = negative_context[:1].repeat(x_start.shape[0], 1, 1)
85
+
86
+ # if negative_context is None, then teacher_context is a combination of
87
+ # (one or multiple if unet_ensemble) pos_context and neg_context.
88
+ # If negative_context is not None, then teacher_context is only pos_context.
89
+ else:
90
+ self.cfg_scale = 1
91
+ print("Teacher does not use CFG.")
92
+
93
+ # If negative_context is None, then teacher_context is either a combination of
94
+ # (one or multiple if unet_ensemble) pos_context and neg_context, or only pos_context.
95
+ # Since not uses_cfg, we only need pos_context.
96
+ # If negative_context is not None, then teacher_context is only pos_context.
97
+ if negative_context is None:
98
+ teacher_context = self.extract_pos_context(teacher_context, x_start.shape[0])
99
+
100
  is_context_doubled = 2 if (self.uses_cfg and negative_context is None) else 1
101
  if self.name == 'unet_ensemble':
102
  # teacher_context is a list of teacher contexts.
 
194
  teacher_pos_contexts = []
195
  # teacher_context is a list of teacher contexts.
196
  for teacher_context_i in teacher_context:
197
+ if teacher_context_i.shape[0] == BS * 2:
198
+ pos_context, neg_context = torch.chunk(teacher_context_i, 2, dim=0)
199
+ elif teacher_context_i.shape[0] == BS:
200
+ pos_context = teacher_context_i
201
+ else:
202
+ breakpoint()
203
  teacher_pos_contexts.append(pos_context)
204
  teacher_context = teacher_pos_contexts
205
  else:
206
+ if teacher_context.shape[0] == BS * 2:
207
+ pos_context, neg_context = torch.chunk(teacher_context, 2, dim=0)
208
+ elif teacher_context.shape[0] == BS:
209
+ pos_context = teacher_context
210
+ else:
211
  breakpoint()
212
  teacher_context = pos_context
213
 
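
Two behavioral changes in unet_teachers.py are worth calling out: forward() gains a force_uses_cfg flag that overrides p_uses_cfg (cfg_scale is still sampled from cfg_scale_range), and the pos/neg context split now tolerates a teacher_context that already contains only positive contexts. A minimal sketch of the more tolerant split, with illustrative names:

import torch

# Sketch only (not part of the commit).
def extract_pos_context(teacher_context: torch.Tensor, BS: int) -> torch.Tensor:
    # teacher_context may be [pos; neg] stacked along the batch dim (2*BS rows)
    # or already positive-only (BS rows); anything else is a caller bug.
    if teacher_context.shape[0] == BS * 2:
        pos_context, _neg_context = torch.chunk(teacher_context, 2, dim=0)
    elif teacher_context.shape[0] == BS:
        pos_context = teacher_context
    else:
        raise ValueError(f"Unexpected context batch size {teacher_context.shape[0]} for BS={BS}")
    return pos_context

(The committed code drops into breakpoint() instead of raising, which is the repo's usual debugging convention.)
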
adaface/util.py CHANGED
@@ -48,7 +48,7 @@ def perturb_tensor(ts, perturb_std, perturb_std_is_relative=True, keep_norm=Fals
48
  ts = ts + noise
49
 
50
  if verbose:
51
- print(f"Correlations between new and original tensors: {F.cosine_similarity(ts.flatten(), orig_ts.flatten(), dim=0).item():.03f}")
52
 
53
  return ts
54
 
@@ -69,7 +69,7 @@ def calc_stats(emb_name, embeddings, mean_dim=-1):
69
  # Compute it manually.
70
  l2_loss = ((embeddings - emb_mean) ** 2).mean().sqrt()
71
  norms = torch.norm(embeddings, dim=1).detach().cpu().numpy()
72
- print("L1: %.4f, L2: %.4f" %(l1_loss.item(), l2_loss.item()))
73
  print("Norms: min: %.4f, max: %.4f, mean: %.4f, std: %.4f" %(norms.min(), norms.max(), norms.mean(), norms.std()))
74
 
75
 
@@ -80,7 +80,7 @@ class ScaleGrad(torch.autograd.Function):
80
  ctx.save_for_backward(alpha_, debug)
81
  output = input_
82
  if debug:
83
- print(f"input: {input_.abs().mean().item()}")
84
  return output
85
 
86
  @staticmethod
@@ -90,7 +90,7 @@ class ScaleGrad(torch.autograd.Function):
90
  if ctx.needs_input_grad[0]:
91
  grad_output2 = grad_output * alpha_
92
  if debug:
93
- print(f"grad_output2: {grad_output2.abs().mean().item()}")
94
  else:
95
  grad_output2 = None
96
  return grad_output2, None, None
@@ -232,8 +232,8 @@ def create_consistentid_pipeline(base_model_path="models/sd15-dste8-vae.safetens
232
  # consistentID specific modules are still in fp32. Will be converted to fp16
233
  # later with .to(device, torch_dtype) by the caller.
234
  pipe.load_ConsistentID_model(
235
- consistentID_weight_path="./models/ConsistentID/ConsistentID-v1.bin",
236
- bise_net_weight_path="./models/ConsistentID/BiSeNet_pretrained_for_ConsistentID.pth",
237
  )
238
  # Avoid passing dtype to ConsistentIDPipeline.from_single_file(),
239
  # because we've overloaded .to() to convert consistentID specific modules as well,
 
48
  ts = ts + noise
49
 
50
  if verbose:
51
+ print(f"Correlations between new and original tensors: {F.cosine_similarity(ts.flatten(), orig_ts.flatten(), dim=0).detach().item():.03f}")
52
 
53
  return ts
54
 
 
69
  # Compute it manually.
70
  l2_loss = ((embeddings - emb_mean) ** 2).mean().sqrt()
71
  norms = torch.norm(embeddings, dim=1).detach().cpu().numpy()
72
+ print("L1: %.4f, L2: %.4f" %(l1_loss.detach().item(), l2_loss.detach().item()))
73
  print("Norms: min: %.4f, max: %.4f, mean: %.4f, std: %.4f" %(norms.min(), norms.max(), norms.mean(), norms.std()))
74
 
75
 
 
80
  ctx.save_for_backward(alpha_, debug)
81
  output = input_
82
  if debug:
83
+ print(f"input: {input_.abs().mean().detach().item()}")
84
  return output
85
 
86
  @staticmethod
 
90
  if ctx.needs_input_grad[0]:
91
  grad_output2 = grad_output * alpha_
92
  if debug:
93
+ print(f"grad_output2: {grad_output2.abs().mean().detach().item()}")
94
  else:
95
  grad_output2 = None
96
  return grad_output2, None, None
 
232
  # consistentID specific modules are still in fp32. Will be converted to fp16
233
  # later with .to(device, torch_dtype) by the caller.
234
  pipe.load_ConsistentID_model(
235
+ consistentID_weight_path="models/ConsistentID/ConsistentID-v1.bin",
236
+ bise_net_weight_path="models/ConsistentID/BiSeNet_pretrained_for_ConsistentID.pth",
237
  )
238
  # Avoid passing dtype to ConsistentIDPipeline.from_single_file(),
239
  # because we've overloaded .to() to convert consistentID specific modules as well,
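
util.py only touches debug output and checkpoint paths: the ScaleGrad debug prints now call .detach().item() before logging, and the ConsistentID weight paths lose their leading "./". For context, here is a minimal gradient scaler in the spirit of ScaleGrad / gen_gradient_scaler (the helper used in the attention hunk above to boost the gradient to cross_attn_scale_factor by 10x); this is a sketch, not the repo's exact implementation, which saves alpha via save_for_backward and supports a debug flag:

import torch

# Sketch only (not part of the commit).
class ScaleGradSketch(torch.autograd.Function):
    # Identity in the forward pass; multiplies the incoming gradient by alpha
    # in the backward pass, so downstream modules see unchanged activations
    # while upstream parameters receive scaled gradients.
    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output * ctx.alpha, None

def gen_gradient_scaler(alpha: float):
    return lambda x: ScaleGradSketch.apply(x, alpha)
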
app.py CHANGED
@@ -20,14 +20,14 @@ def str2bool(v):
20
  else:
21
  raise argparse.ArgumentTypeError("Boolean value expected.")
22
 
23
- def is_running_on_spaces():
24
  return os.getenv("SPACE_ID") is not None
25
 
26
  import argparse
27
  parser = argparse.ArgumentParser()
28
  parser.add_argument("--adaface_encoder_types", type=str, nargs="+", default=["consistentID", "arc2face"],
29
  choices=["arc2face", "consistentID"], help="Type(s) of the ID2Ada prompt encoders")
30
- parser.add_argument('--adaface_ckpt_path', type=str, default='models/adaface/VGGface2_HQ_masks2025-03-06T03-31-21_zero3-ada-1000.pt',
31
  help="Path to the checkpoint of the ID2Ada prompt encoders")
32
  # If adaface_encoder_cfg_scales is not specified, the weights will be set to 6.0 (consistentID) and 1.0 (arc2face).
33
  parser.add_argument('--adaface_encoder_cfg_scales', type=float, nargs="+", default=[6.0, 1.0],
@@ -75,6 +75,16 @@ MAX_SEED = np.iinfo(np.int32).max
75
  global adaface
76
  adaface = None
77
 
 
78
  if not args.test_ui_only:
79
  adaface = AdaFaceWrapper(pipeline_name="text2img", base_model_path=base_model_path,
80
  adaface_encoder_types=args.adaface_encoder_types,
@@ -84,9 +94,10 @@ if not args.test_ui_only:
84
  unet_types=None, extra_unet_dirpaths=None, unet_weights_in_ensemble=None,
85
  unet_uses_attn_lora=args.unet_uses_attn_lora,
86
  attn_lora_layer_names=args.attn_lora_layer_names,
87
- shrink_cross_attn=False,
88
  q_lora_updates_query=args.q_lora_updates_query,
89
- device='cpu')
 
90
 
91
  def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
92
  if randomize_seed:
@@ -114,18 +125,7 @@ def generate_image(image_paths, image_paths2, guidance_scale, perturb_std,
114
 
115
  global adaface, args
116
 
117
- if is_running_on_spaces():
118
- device = 'cuda:0'
119
- else:
120
- if args.gpu is None:
121
- device = "cuda"
122
- else:
123
- device = f"cuda:{args.gpu}"
124
-
125
- print(f"Device: {device}")
126
-
127
- adaface.to(device)
128
- args.device = device
129
 
130
  if image_paths is None or len(image_paths) == 0:
131
  raise gr.Error(f"Cannot find any input face image! Please upload a face image.")
@@ -255,16 +255,17 @@ def check_prompt_and_model_type(prompt, model_style_type, adaface_encoder_cfg_sc
255
  print(f"Switching to the base model type: {model_style_type}.")
256
 
257
  adaface = AdaFaceWrapper(pipeline_name="text2img", base_model_path=model_style_type2base_model_path[model_style_type],
258
- adaface_encoder_types=args.adaface_encoder_types,
259
- adaface_ckpt_paths=args.adaface_ckpt_path,
260
- adaface_encoder_cfg_scales=args.adaface_encoder_cfg_scales,
261
- enabled_encoders=args.enabled_encoders,
262
- unet_types=None, extra_unet_dirpaths=None, unet_weights_in_ensemble=None,
263
- unet_uses_attn_lora=args.unet_uses_attn_lora,
264
- attn_lora_layer_names=args.attn_lora_layer_names,
265
- shrink_cross_attn=False,
266
- q_lora_updates_query=args.q_lora_updates_query,
267
- device='cpu')
 
268
 
269
  if adaface_encoder_cfg_scale1 != args.adaface_encoder_cfg_scales[0]:
270
  args.adaface_encoder_cfg_scales[0] = adaface_encoder_cfg_scale1
@@ -370,12 +371,13 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
370
  "portrait, night view of tokyo street, neon light",
371
  "portrait, playing guitar on a boat, ocean waves",
372
  "portrait, with a passion for reading, curled up with a book in a cozy nook near a window, front view",
373
- "portrait, celebrating new year, fireworks",
374
  "portrait, running pose in a park",
375
  "portrait, in space suit, space helmet, walking on mars",
376
  "portrait, in superman costume, the sky ablaze with hues of orange and purple",
377
  "in a wheelchair",
378
- "on a horse"
 
379
  ])
380
 
381
  highlight_face = gr.Checkbox(label="Highlight face", value=False,
 
20
  else:
21
  raise argparse.ArgumentTypeError("Boolean value expected.")
22
 
23
+ def is_running_on_hf_space():
24
  return os.getenv("SPACE_ID") is not None
25
 
26
  import argparse
27
  parser = argparse.ArgumentParser()
28
  parser.add_argument("--adaface_encoder_types", type=str, nargs="+", default=["consistentID", "arc2face"],
29
  choices=["arc2face", "consistentID"], help="Type(s) of the ID2Ada prompt encoders")
30
+ parser.add_argument('--adaface_ckpt_path', type=str, default='models/adaface/VGGface2_HQ_masks2025-05-22T17-51-19_zero3-ada-1000.pt',
31
  help="Path to the checkpoint of the ID2Ada prompt encoders")
32
  # If adaface_encoder_cfg_scales is not specified, the weights will be set to 6.0 (consistentID) and 1.0 (arc2face).
33
  parser.add_argument('--adaface_encoder_cfg_scales', type=float, nargs="+", default=[6.0, 1.0],
 
75
  global adaface
76
  adaface = None
77
 
78
+ if is_running_on_hf_space():
79
+ args.device = 'cuda:0'
80
+ is_on_hf_space = True
81
+ else:
82
+ if args.gpu is None:
83
+ args.device = "cuda"
84
+ else:
85
+ args.device = f"cuda:{args.gpu}"
86
+ is_on_hf_space = False
87
+
88
  if not args.test_ui_only:
89
  adaface = AdaFaceWrapper(pipeline_name="text2img", base_model_path=base_model_path,
90
  adaface_encoder_types=args.adaface_encoder_types,
 
94
  unet_types=None, extra_unet_dirpaths=None, unet_weights_in_ensemble=None,
95
  unet_uses_attn_lora=args.unet_uses_attn_lora,
96
  attn_lora_layer_names=args.attn_lora_layer_names,
97
+ normalize_cross_attn=False,
98
  q_lora_updates_query=args.q_lora_updates_query,
99
+ device='cpu',
100
+ is_on_hf_space=is_on_hf_space)
101
 
102
  def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
103
  if randomize_seed:
 
125
 
126
  global adaface, args
127
 
128
+ adaface.to(args.device)
129
 
130
  if image_paths is None or len(image_paths) == 0:
131
  raise gr.Error(f"Cannot find any input face image! Please upload a face image.")
 
255
  print(f"Switching to the base model type: {model_style_type}.")
256
 
257
  adaface = AdaFaceWrapper(pipeline_name="text2img", base_model_path=model_style_type2base_model_path[model_style_type],
258
+ adaface_encoder_types=args.adaface_encoder_types,
259
+ adaface_ckpt_paths=args.adaface_ckpt_path,
260
+ adaface_encoder_cfg_scales=args.adaface_encoder_cfg_scales,
261
+ enabled_encoders=args.enabled_encoders,
262
+ unet_types=None, extra_unet_dirpaths=None, unet_weights_in_ensemble=None,
263
+ unet_uses_attn_lora=args.unet_uses_attn_lora,
264
+ attn_lora_layer_names=args.attn_lora_layer_names,
265
+ normalize_cross_attn=False,
266
+ q_lora_updates_query=args.q_lora_updates_query,
267
+ device='cpu',
268
+ is_on_hf_space=is_on_hf_space)
269
 
270
  if adaface_encoder_cfg_scale1 != args.adaface_encoder_cfg_scales[0]:
271
  args.adaface_encoder_cfg_scales[0] = adaface_encoder_cfg_scale1
 
371
  "portrait, night view of tokyo street, neon light",
372
  "portrait, playing guitar on a boat, ocean waves",
373
  "portrait, with a passion for reading, curled up with a book in a cozy nook near a window, front view",
374
+ "portrait, celebrating new year alone, fireworks",
375
  "portrait, running pose in a park",
376
  "portrait, in space suit, space helmet, walking on mars",
377
  "portrait, in superman costume, the sky ablaze with hues of orange and purple",
378
  "in a wheelchair",
379
+ "on a horse",
380
+ "on a bike",
381
  ])
382
 
383
  highlight_face = gr.Checkbox(label="Highlight face", value=False,
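
Finally, app.py moves device selection out of generate_image() and into startup: on a HF Space (detected via SPACE_ID) the device is pinned to cuda:0 and is_on_hf_space is threaded into AdaFaceWrapper; locally, --gpu picks the CUDA index. A compact sketch of the resolution logic mirrored from the diff (the surrounding wiring is illustrative):

import os

# Sketch only (not part of the commit).
def resolve_device(gpu_index=None):
    # HF Spaces set SPACE_ID; pin to the single visible GPU there.
    if os.getenv("SPACE_ID") is not None:
        return "cuda:0", True
    device = "cuda" if gpu_index is None else f"cuda:{gpu_index}"
    return device, False

args_device, is_on_hf_space = resolve_device(gpu_index=None)
# The wrapper is still constructed on CPU (device='cpu', is_on_hf_space=is_on_hf_space)
# and only moved with adaface.to(args.device) inside generate_image().
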