Commit 7fe715c · Parent(s): 44542f2 · committed by adaface-neurips

Fix bug of not really extending CLIP, extend from 97 to 147

Files changed (2):
  1. adaface/adaface_wrapper.py  +12 -14
  2. app.py  +1 -1
adaface/adaface_wrapper.py CHANGED
@@ -117,14 +117,6 @@ class AdaFaceWrapper(nn.Module):
         else:
             vae = None
 
-        if self.use_ds_text_encoder:
-            # The dreamshaper v7 finetuned text encoder follows the prompt slightly better than the original text encoder.
-            # https://huggingface.co/Lykon/DreamShaper/tree/main/text_encoder
-            text_encoder = CLIPTextModel.from_pretrained("models/diffusers/ds_text_encoder",
-                                                          torch_dtype=torch.float16)
-        else:
-            text_encoder = None
-
         remove_unet = False
 
         if self.pipeline_name == "img2img":
@@ -202,6 +194,13 @@
 
             pipeline.unet = unet2
 
+        if self.use_ds_text_encoder:
+            # The dreamshaper v7 finetuned text encoder follows the prompt slightly better than the original text encoder.
+            # https://huggingface.co/Lykon/DreamShaper/tree/main/text_encoder
+            pipeline.text_encoder = CLIPTextModel.from_pretrained("models/diffusers/ds_text_encoder",
+                                                                  torch_dtype=torch.float16)
+            print("Replaced the text encoder with the DreamShaper text encoder.")
+
         # Extending prompt length is for SD 1.5 only.
         if (self.pipeline_name == "text2img") and (self.max_prompt_length > 77):
             # pipeline.text_encoder.text_model.embeddings.position_embedding.weight: [77, 768] -> [max_length, 768]
@@ -210,20 +209,19 @@
             # a larger max_position_embeddings, and set ignore_mismatched_sizes=True,
             # then the old position embeddings won't be loaded from the pretrained ckpt,
             # leading to degenerated performance.
-            EL = self.max_prompt_length - 77
+            # max_prompt_length <= 77 + 70 = 147.
+            EL = min(self.max_prompt_length - 77, 70)
             # position_embedding.weight: [77, 768] -> [max_length, 768]
             new_position_embedding = extend_nn_embedding(pipeline.text_encoder.text_model.embeddings.position_embedding,
                                                          pipeline.text_encoder.text_model.embeddings.position_embedding.weight[-EL:])
             pipeline.text_encoder.text_model.embeddings.position_embedding = new_position_embedding
             pipeline.text_encoder.text_model.embeddings.position_ids = torch.arange(self.max_prompt_length).unsqueeze(0)
-
+            pipeline.text_encoder.text_model.config.max_position_embeddings = self.max_prompt_length
+            pipeline.tokenizer.model_max_length = self.max_prompt_length
+
         if self.use_840k_vae:
             pipeline.vae = vae
             print("Replaced the VAE with the 840k-step VAE.")
-
-        if self.use_ds_text_encoder:
-            pipeline.text_encoder = text_encoder
-            print("Replaced the text encoder with the DreamShaper text encoder.")
 
         if remove_unet:
            # Remove unet and vae to release RAM. Only keep tokenizer and text_encoder.
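For context, below is a minimal standalone sketch of the extension logic this commit fixes. It assumes `pipeline` is an already loaded SD 1.5 pipeline with `text_encoder` and `tokenizer` attributes, and it assumes `extend_nn_embedding` (whose implementation is not part of this diff) simply builds a larger nn.Embedding whose trailing rows are initialized from the given extra weights; the helper shown is a hypothetical reimplementation, not the repository's actual one.

import torch
import torch.nn as nn

def extend_nn_embedding(old_embedding: nn.Embedding, extra_weight: torch.Tensor) -> nn.Embedding:
    # Hypothetical helper: copy the pretrained rows, then append rows initialized from extra_weight.
    old_weight = old_embedding.weight.data
    new_embedding = nn.Embedding(old_weight.shape[0] + extra_weight.shape[0], old_weight.shape[1],
                                 dtype=old_weight.dtype, device=old_weight.device)
    new_embedding.weight.data[:old_weight.shape[0]] = old_weight
    new_embedding.weight.data[old_weight.shape[0]:] = extra_weight.data
    return new_embedding

def extend_clip_prompt_length(pipeline, max_prompt_length=147):
    # SD 1.5's CLIP text encoder ships with 77 positions; the commit caps the extension
    # at 70 extra rows, so max_prompt_length <= 77 + 70 = 147.
    EL = min(max_prompt_length - 77, 70)
    emb = pipeline.text_encoder.text_model.embeddings
    emb.position_embedding = extend_nn_embedding(emb.position_embedding,
                                                 emb.position_embedding.weight[-EL:])
    emb.position_ids = torch.arange(max_prompt_length,
                                    device=emb.position_embedding.weight.device).unsqueeze(0)
    # These two assignments are the actual bug fix: without them the text encoder config and
    # the tokenizer still truncate prompts at 77 tokens, so the extended positions were never used.
    pipeline.text_encoder.text_model.config.max_position_embeddings = max_prompt_length
    pipeline.tokenizer.model_max_length = max_prompt_length

In other words, extending the position embedding alone is not enough; the config and tokenizer limits must be raised as well, which is what the added lines in this hunk do.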
app.py CHANGED
@@ -55,7 +55,7 @@ parser.add_argument('--show_ablate_prompt_embed_type', type=str2bool, nargs="?",
 parser.add_argument('--extra_save_dir', type=str, default=None, help="Directory to save the generated images")
 parser.add_argument('--test_ui_only', type=str2bool, nargs="?", const=True, default=False,
                     help="Only test the UI layout, and skip loadding the adaface model")
-parser.add_argument('--max_prompt_length', type=int, default=97,
+parser.add_argument('--max_prompt_length', type=int, default=147,
                     help="Maximum length of the prompt. If > 77, the CLIP text encoder will be extended.")
 parser.add_argument('--gpu', type=int, default=None)
 parser.add_argument('--ip', type=str, default="0.0.0.0")
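As a follow-up, a hypothetical smoke test for the new 147-token default, assuming the pipeline has already been extended as in the sketch above; the example prompt and the expected 768-dim hidden size (SD 1.5) are assumptions, not part of this commit.

import torch

long_prompt = "a portrait photo of a person, " * 40   # far beyond 77 CLIP tokens
tokens = pipeline.tokenizer(long_prompt, padding="max_length", truncation=True,
                            max_length=pipeline.tokenizer.model_max_length,
                            return_tensors="pt")
input_ids = tokens.input_ids.to(pipeline.text_encoder.device)
with torch.no_grad():
    prompt_embeds = pipeline.text_encoder(input_ids)[0]
print(prompt_embeds.shape)   # expected [1, 147, 768] rather than [1, 77, 768]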