Commit 7fe715c · Parent(s): 44542f2 · committed by adaface-neurips

Fix bug of not really extending CLIP, extend from 97 to 147

Files changed (2):
  1. adaface/adaface_wrapper.py  +12 -14
  2. app.py  +1 -1
adaface/adaface_wrapper.py CHANGED
@@ -117,14 +117,6 @@ class AdaFaceWrapper(nn.Module):
         else:
             vae = None
 
-        if self.use_ds_text_encoder:
-            # The dreamshaper v7 finetuned text encoder follows the prompt slightly better than the original text encoder.
-            # https://huggingface.co/Lykon/DreamShaper/tree/main/text_encoder
-            text_encoder = CLIPTextModel.from_pretrained("models/diffusers/ds_text_encoder",
-                                                          torch_dtype=torch.float16)
-        else:
-            text_encoder = None
-
         remove_unet = False
 
         if self.pipeline_name == "img2img":
@@ -202,6 +194,13 @@
 
             pipeline.unet = unet2
 
+        if self.use_ds_text_encoder:
+            # The dreamshaper v7 finetuned text encoder follows the prompt slightly better than the original text encoder.
+            # https://huggingface.co/Lykon/DreamShaper/tree/main/text_encoder
+            pipeline.text_encoder = CLIPTextModel.from_pretrained("models/diffusers/ds_text_encoder",
+                                                                  torch_dtype=torch.float16)
+            print("Replaced the text encoder with the DreamShaper text encoder.")
+
         # Extending prompt length is for SD 1.5 only.
         if (self.pipeline_name == "text2img") and (self.max_prompt_length > 77):
             # pipeline.text_encoder.text_model.embeddings.position_embedding.weight: [77, 768] -> [max_length, 768]
@@ -210,20 +209,19 @@
             # a larger max_position_embeddings, and set ignore_mismatched_sizes=True,
             # then the old position embeddings won't be loaded from the pretrained ckpt,
             # leading to degenerated performance.
-            EL = self.max_prompt_length - 77
+            # max_prompt_length <= 77 + 70 = 147.
+            EL = min(self.max_prompt_length - 77, 70)
             # position_embedding.weight: [77, 768] -> [max_length, 768]
             new_position_embedding = extend_nn_embedding(pipeline.text_encoder.text_model.embeddings.position_embedding,
                                                          pipeline.text_encoder.text_model.embeddings.position_embedding.weight[-EL:])
             pipeline.text_encoder.text_model.embeddings.position_embedding = new_position_embedding
             pipeline.text_encoder.text_model.embeddings.position_ids = torch.arange(self.max_prompt_length).unsqueeze(0)
-
+            pipeline.text_encoder.text_model.config.max_position_embeddings = self.max_prompt_length
+            pipeline.tokenizer.model_max_length = self.max_prompt_length
+
         if self.use_840k_vae:
             pipeline.vae = vae
             print("Replaced the VAE with the 840k-step VAE.")
-
-        if self.use_ds_text_encoder:
-            pipeline.text_encoder = text_encoder
-            print("Replaced the text encoder with the DreamShaper text encoder.")
 
         if remove_unet:
            # Remove unet and vae to release RAM. Only keep tokenizer and text_encoder.
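For context, below is a minimal standalone sketch of the extension logic this commit fixes. It assumes `pipeline` is an already loaded SD 1.5 pipeline with `text_encoder` and `tokenizer` attributes, and it assumes `extend_nn_embedding` (whose implementation is not part of this diff) simply builds a larger nn.Embedding whose trailing rows are initialized from the given extra weights; the helper shown is a hypothetical reimplementation, not the repository's actual one.

import torch
import torch.nn as nn

def extend_nn_embedding(old_embedding: nn.Embedding, extra_weight: torch.Tensor) -> nn.Embedding:
    # Hypothetical helper: copy the pretrained rows, then append rows initialized from extra_weight.
    old_weight = old_embedding.weight.data
    new_embedding = nn.Embedding(old_weight.shape[0] + extra_weight.shape[0], old_weight.shape[1],
                                 dtype=old_weight.dtype, device=old_weight.device)
    new_embedding.weight.data[:old_weight.shape[0]] = old_weight
    new_embedding.weight.data[old_weight.shape[0]:] = extra_weight.data
    return new_embedding

def extend_clip_prompt_length(pipeline, max_prompt_length=147):
    # SD 1.5's CLIP text encoder ships with 77 positions; the commit caps the extension
    # at 70 extra rows, so max_prompt_length <= 77 + 70 = 147.
    EL = min(max_prompt_length - 77, 70)
    emb = pipeline.text_encoder.text_model.embeddings
    emb.position_embedding = extend_nn_embedding(emb.position_embedding,
                                                 emb.position_embedding.weight[-EL:])
    emb.position_ids = torch.arange(max_prompt_length,
                                    device=emb.position_embedding.weight.device).unsqueeze(0)
    # These two assignments are the actual bug fix: without them the text encoder config and
    # the tokenizer still truncate prompts at 77 tokens, so the extended positions were never used.
    pipeline.text_encoder.text_model.config.max_position_embeddings = max_prompt_length
    pipeline.tokenizer.model_max_length = max_prompt_length

In other words, extending the position embedding alone is not enough; the config and tokenizer limits must be raised as well, which is what the added lines in this hunk do.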
app.py CHANGED
@@ -55,7 +55,7 @@ parser.add_argument('--show_ablate_prompt_embed_type', type=str2bool, nargs="?",
 parser.add_argument('--extra_save_dir', type=str, default=None, help="Directory to save the generated images")
 parser.add_argument('--test_ui_only', type=str2bool, nargs="?", const=True, default=False,
                     help="Only test the UI layout, and skip loadding the adaface model")
-parser.add_argument('--max_prompt_length', type=int, default=97,
+parser.add_argument('--max_prompt_length', type=int, default=147,
                     help="Maximum length of the prompt. If > 77, the CLIP text encoder will be extended.")
 parser.add_argument('--gpu', type=int, default=None)
 parser.add_argument('--ip', type=str, default="0.0.0.0")
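As a follow-up, a hypothetical smoke test for the new 147-token default, assuming the pipeline has already been extended as in the sketch above; the example prompt and the expected 768-dim hidden size (SD 1.5) are assumptions, not part of this commit.

import torch

long_prompt = "a portrait photo of a person, " * 40   # far beyond 77 CLIP tokens
tokens = pipeline.tokenizer(long_prompt, padding="max_length", truncation=True,
                            max_length=pipeline.tokenizer.model_max_length,
                            return_tensors="pt")
input_ids = tokens.input_ids.to(pipeline.text_encoder.device)
with torch.no_grad():
    prompt_embeds = pipeline.text_encoder(input_ids)[0]
print(prompt_embeds.shape)   # expected [1, 147, 768] rather than [1, 77, 768]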