Spaces: Running on Zero
Update model.py
model.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright (c)
+# Copyright (c) 2025 Jaerin Lee
 
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -55,7 +55,7 @@ from typing import Tuple, List, Literal, Optional, Union
 from tqdm import tqdm
 from PIL import Image
 
-from util import gaussian_lowpass, blend, get_panorama_views, shift_to_mask_bbox_center
+from util import load_model, gaussian_lowpass, blend, get_panorama_views, shift_to_mask_bbox_center
 
 
 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
@@ -73,7 +73,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
     return noise_cfg
 
 
-class StableMultiDiffusionSDXLPipeline(nn.Module):
+class SemanticDrawSDXLPipeline(nn.Module):
     def __init__(
         self,
         device: torch.device,
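For context, the body of `rescale_noise_cfg` named in the hunk header above is elided by the diff. A sketch matching the upstream diffusers helper it was copied from, which implements the guidance-rescale trick of §3.4 in arXiv:2305.08891:

```python
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    # Rescale the CFG output toward the std of the text-conditioned
    # prediction to counteract overexposure at high guidance scales.
    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
    # Blend the rescaled and original CFG outputs by `guidance_rescale`.
    return guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
```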
@@ -93,7 +93,7 @@ class StableMultiDiffusionSDXLPipeline(nn.Module):
         has_i2t: bool = True,
         lora_weight: float = 1.0,
     ) -> None:
-        r"""Stabilized
+        r"""Stabilized regionally assigned texts-to-image generation for fast sampling.
 
         Accelerated region-based text-to-image synthesis with Latent Consistency
         Model while preserving mask fidelity and quality.
@@ -131,7 +131,7 @@ class StableMultiDiffusionSDXLPipeline(nn.Module):
         default_preprocess_mask_cover_alpha (float): Optional preprocessing
             where each mask covered by other masks is reduced in its alpha
             value by this specified factor.
-        t_index_list (List[int]): The default scheduling for
+        t_index_list (List[int]): The default timestep schedule for the scheduler.
         mask_type (Literal['discrete', 'semi-continuous', 'continuous']):
             defines the mask quantization modes. Details in the codes of
             `self.process_mask`. Basically, this (subtly) controls the
@@ -170,10 +170,10 @@ class StableMultiDiffusionSDXLPipeline(nn.Module):
             model_key = hf_key
             lora_ckpt = 'sdxl_lightning_4step_lora.safetensors'
 
-            self.pipe =
+            self.pipe = load_model(model_key, 'xl', self.device, self.dtype)
             self.pipe.load_lora_weights(hf_hub_download(lightning_repo, lora_ckpt), adapter_name='lightning')
             self.pipe.set_adapters(["lightning"], adapter_weights=[lora_weight])
-
+            self.pipe.fuse_lora()
         else:
             model_key = 'stabilityai/stable-diffusion-xl-base-1.0'
             variant = 'fp16'
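`load_model` is pulled in by the updated `util` import above; its body is not shown in this diff. A minimal sketch of the equivalent stock diffusers setup, assuming `lightning_repo` is `'ByteDance/SDXL-Lightning'` (the repository that hosts `sdxl_lightning_4step_lora.safetensors`):

```python
import torch
from diffusers import StableDiffusionXLPipeline
from huggingface_hub import hf_hub_download

# Load the base SDXL pipeline in fp16.
pipe = StableDiffusionXLPipeline.from_pretrained(
    'stabilityai/stable-diffusion-xl-base-1.0',
    torch_dtype=torch.float16,
    variant='fp16',
).to('cuda')

# Attach the SDXL-Lightning 4-step LoRA as a named adapter.
pipe.load_lora_weights(
    hf_hub_download('ByteDance/SDXL-Lightning', 'sdxl_lightning_4step_lora.safetensors'),
    adapter_name='lightning',
)
pipe.set_adapters(['lightning'], adapter_weights=[1.0])
pipe.fuse_lora()  # the new line in this commit: bake the adapter into the weights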
@@ -212,7 +212,7 @@ class StableMultiDiffusionSDXLPipeline(nn.Module):
         self.vae_scale_factor = self.pipe.vae_scale_factor
 
         # Prepare white background for bootstrapping.
-
+        self.get_white_background(1024, 1024)
 
         print(f'[INFO] Model is loaded!')
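`get_white_background` is defined elsewhere in the file; presumably it caches the VAE latent of a plain white canvas for the bootstrapping step used later in `__call__`. A hypothetical sketch (the function body and names here are assumed, not from the file):

```python
import torch

def get_white_background(pipe, height: int = 1024, width: int = 1024) -> torch.Tensor:
    # Hypothetical: encode a pure-white image (1.0 in [-1, 1] space) once
    # and cache the scaled latent for reuse during bootstrapping.
    white = torch.ones(1, 3, height, width, dtype=pipe.dtype, device=pipe.device)
    latent = pipe.vae.encode(white).latent_dist.mean
    return latent * pipe.vae.config.scaling_factor
```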
@@ -691,7 +691,7 @@ class StableMultiDiffusionSDXLPipeline(nn.Module):
             25, 37], the masks are split into binary masks whose values are
             greater than these levels. This results in gradual increase of mask
             region as the timesteps increase. Details are described in our
-            paper
+            paper.
 
         On the Three Modes of `mask_type`:
             `self.mask_type` is predefined at the initialization stage of this
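The threshold scheme this docstring describes can be pictured with a small sketch (the function name and threshold values are illustrative, not from the file):

```python
from typing import List

import torch

def quantize_mask(soft_mask: torch.Tensor, levels: List[float]) -> torch.Tensor:
    # soft_mask: (1, H, W) in [0, 1]; returns (T, 1, H, W), one binary mask
    # per threshold level, so lower thresholds at later timesteps gradually
    # enlarge the active region.
    return torch.stack([(soft_mask > level).float() for level in levels])
```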
@@ -949,6 +949,9 @@ class StableMultiDiffusionSDXLPipeline(nn.Module):
         boostrap_mix_steps: Optional[float] = None,
         bootstrap_leak_sensitivity: Optional[float] = None,
         preprocess_mask_cover_alpha: Optional[float] = None,
+        # SDXL Pipeline setting.
+        guidance_rescale: float = 0.7,
+        output_type = 'pil',
     ) -> Image.Image:
         r"""Arbitrary-size image generation from multiple pairs of (regional)
         text prompt-mask pairs.
@@ -957,7 +960,7 @@ class StableMultiDiffusionSDXLPipeline(nn.Module):
 
         Example:
             >>> device = torch.device('cuda:0')
-            >>> smd =
+            >>> smd = SemanticDrawSDXLPipeline(device)
             >>> prompts = {... specify prompts}
             >>> masks = {... specify mask tensors}
             >>> height, width = masks.shape[-2:]
@@ -1046,7 +1049,7 @@ class StableMultiDiffusionSDXLPipeline(nn.Module):
 
         # prompts is None: return background.
         # masks is None but prompts is not None: return prompts
-        # masks is not None and prompts is not None: Do
+        # masks is not None and prompts is not None: Do SemanticDraw.
 
         if prompts is None or (isinstance(prompts, (list, tuple, str)) and len(prompts) == 0):
             if background is None and background_prompt is not None:
@@ -1157,27 +1160,22 @@ class StableMultiDiffusionSDXLPipeline(nn.Module):
 
         # SDXL pipeline settings.
         batch_size = 1
-        output_type = 'pil'
-
-        guidance_rescale = 0.7
-
-        prompt_2 = None
-        device = self.device
         num_images_per_prompt = 1
-        negative_prompt_2 = None
 
         original_size = (height, width)
         target_size = (height, width)
         crops_coords_top_left = (0, 0)
-        negative_crops_coords_top_left = (0, 0)
         negative_original_size = None
         negative_target_size = None
-
-        negative_pooled_prompt_embeds = None
-        text_encoder_lora_scale = None
-
+        negative_crops_coords_top_left = (0, 0)
+
+        prompt_2 = None
+        negative_prompt_2 = None
         prompt_embeds = None
         negative_prompt_embeds = None
+        pooled_prompt_embeds = None
+        negative_pooled_prompt_embeds = None
+        text_encoder_lora_scale = None
 
         (
             prompt_embeds,
@@ -1187,7 +1185,7 @@ class StableMultiDiffusionSDXLPipeline(nn.Module):
         ) = self.encode_prompt(
             prompt=prompts,
             prompt_2=prompt_2,
-            device=device,
+            device=self.device,
             num_images_per_prompt=num_images_per_prompt,
             do_classifier_free_guidance=do_classifier_free_guidance,
             negative_prompt=negative_prompts,
@@ -1199,30 +1197,6 @@ class StableMultiDiffusionSDXLPipeline(nn.Module):
             lora_scale=text_encoder_lora_scale,
         )
 
-        add_text_embeds = pooled_prompt_embeds
-        if self.text_encoder_2 is None:
-            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
-        else:
-            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
-
-        add_time_ids = self._get_add_time_ids(
-            original_size,
-            crops_coords_top_left,
-            target_size,
-            dtype=prompt_embeds.dtype,
-            text_encoder_projection_dim=text_encoder_projection_dim,
-        )
-        if negative_original_size is not None and negative_target_size is not None:
-            negative_add_time_ids = self._get_add_time_ids(
-                negative_original_size,
-                negative_crops_coords_top_left,
-                negative_target_size,
-                dtype=prompt_embeds.dtype,
-                text_encoder_projection_dim=text_encoder_projection_dim,
-            )
-        else:
-            negative_add_time_ids = add_time_ids
-
         if has_background:
             # First channel is background prompt text embeds. Background prompt itself is not used for generation.
             s = prompt_strengths
@@ -1248,10 +1222,26 @@ class StableMultiDiffusionSDXLPipeline(nn.Module):
                     assert fu.shape[0] == 1 and fe.shape == num_prompts
                     fu = fu.repeat(num_prompts, 1, 1)
                 negative_prompt_embeds = torch.lerp(bu, fu, s)  # (n, 77, 1024)
+
+            be = pooled_prompt_embeds[:1]
+            fe = pooled_prompt_embeds[1:]
+            pooled_prompt_embeds = torch.lerp(be, fe, s[..., 0])  # (p, 1280)
+
+            if negative_pooled_prompt_embeds is not None:
+                bu = negative_pooled_prompt_embeds[:1]
+                fu = negative_pooled_prompt_embeds[1:]
+                if num_prompts > num_nprompts:
+                    # # negative prompts = 1; # prompts > 1.
+                    assert fu.shape[0] == 1 and fe.shape == num_prompts
+                    fu = fu.repeat(num_prompts, 1)
+                negative_pooled_prompt_embeds = torch.lerp(bu, fu, s[..., 0])  # (n, 1280)
         elif negative_prompt_embeds is not None and num_prompts > num_nprompts:
             # # negative prompts = 1; # prompts > 1.
             assert negative_prompt_embeds.shape[0] == 1 and prompt_embeds.shape[0] == num_prompts
             negative_prompt_embeds = negative_prompt_embeds.repeat(num_prompts, 1, 1)
+
+            assert negative_pooled_prompt_embeds.shape[0] == 1 and pooled_prompt_embeds.shape[0] == num_prompts
+            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(num_prompts, 1)
         # assert negative_prompt_embeds.shape[0] == prompt_embeds.shape[0] == num_prompts
         if num_masks > num_prompts:
             assert masks.shape[0] == num_masks and num_prompts == 1
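The pooled-embedding interpolation added in this hunk mirrors the per-token lerp right above it: the background embedding is blended toward each foreground embedding by a per-prompt strength. A shape sketch with illustrative tensors:

```python
import torch

be = torch.randn(1, 1280)      # background pooled embedding
fe = torch.randn(3, 1280)      # three foreground pooled embeddings
s = torch.rand(3, 1)           # per-prompt strengths in [0, 1]
mixed = torch.lerp(be, fe, s)  # (3, 1280): be * (1 - s) + fe * s, broadcast over prompts
```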
@@ -1259,6 +1249,34 @@ class StableMultiDiffusionSDXLPipeline(nn.Module):
             if negative_prompt_embeds is not None:
                 negative_prompt_embeds = negative_prompt_embeds.repeat(num_masks, 1, 1)
 
+            pooled_prompt_embeds = pooled_prompt_embeds.repeat(num_masks, 1)
+            if negative_pooled_prompt_embeds is not None:
+                negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(num_masks, 1)
+
+        add_text_embeds = pooled_prompt_embeds
+        if self.text_encoder_2 is None:
+            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+        else:
+            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+        add_time_ids = self._get_add_time_ids(
+            original_size,
+            crops_coords_top_left,
+            target_size,
+            dtype=prompt_embeds.dtype,
+            text_encoder_projection_dim=text_encoder_projection_dim,
+        )
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+
         # SDXL pipeline settings.
         if do_classifier_free_guidance:
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
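The `_get_add_time_ids` call this hunk relocates builds SDXL's micro-conditioning vector. In diffusers it amounts to concatenating six integers that the UNet embeds alongside the pooled text embeddings (values below are illustrative defaults):

```python
import torch

original_size = (1024, 1024)
crops_coords_top_left = (0, 0)
target_size = (1024, 1024)
add_time_ids = torch.tensor([list(original_size + crops_coords_top_left + target_size)])
# tensor([[1024, 1024, 0, 0, 1024, 1024]])
```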
@@ -1266,19 +1284,25 @@ class StableMultiDiffusionSDXLPipeline(nn.Module):
             add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
         del negative_prompt_embeds, negative_pooled_prompt_embeds, negative_add_time_ids
 
-        prompt_embeds = prompt_embeds.to(device)
-        add_text_embeds = add_text_embeds.to(device)
-        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.to(self.device)
+        add_text_embeds = add_text_embeds.to(self.device)
+        add_time_ids = add_time_ids.to(self.device).repeat(batch_size * num_images_per_prompt, 1)
 
 
         ### Run
 
         # Latent initialization.
+        noise = torch.randn((1, self.unet.config.in_channels, h, w), dtype=self.dtype, device=self.device)
         if self.timesteps[0] < 999 and has_background:
-
+            latent = self.scheduler_add_noise(bg_latent, noise, 0, initial=True)
         else:
-
-
+            noise = torch.randn((1, self.unet.config.in_channels, h, w), dtype=self.dtype, device=self.device)
+            latent = noise * self.scheduler.init_noise_sigma
+
+        if has_background:
+            noise_bg_latents = [
+                self.scheduler_add_noise(bg_latent, noise, i, initial=True) for i in range(len(self.timesteps))
+            ] + [bg_latent]
 
         # Tiling (if needed).
         if height > tile_size or width > tile_size:
@@ -1287,9 +1311,9 @@ class StableMultiDiffusionSDXLPipeline(nn.Module):
             tile_masks = tile_masks.to(self.device)
         else:
             views = [(0, h, 0, w)]
-            tile_masks =
-        value = torch.zeros_like(
-        count_all = torch.zeros_like(
+            tile_masks = latent.new_ones((1, 1, h, w))
+        value = torch.zeros_like(latent)
+        count_all = torch.zeros_like(latent)
 
         with torch.autocast('cuda'):
             for i, t in enumerate(tqdm(self.timesteps)):
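`get_panorama_views` (imported from `util`) supplies the sliding windows used when the canvas exceeds `tile_size`; its body is not shown in this diff. A hypothetical stand-in showing the shape of a view:

```python
from typing import List, Tuple

def get_views(h: int, w: int, tile: int = 128, stride: int = 64) -> List[Tuple[int, int, int, int]]:
    # Hypothetical: enumerate overlapping (h_start, h_end, w_start, w_end)
    # windows over the latent grid, always including the far edge.
    th, tw = min(tile, h), min(tile, w)
    hs_list = list(range(0, h - th, stride)) + [h - th]
    ws_list = list(range(0, w - tw, stride)) + [w - tw]
    return [(hs, hs + th, ws, ws + tw) for hs in hs_list for ws in ws_list]
```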
@@ -1300,7 +1324,7 @@ class StableMultiDiffusionSDXLPipeline(nn.Module):
                 count_all.zero_()
                 for j, (h_start, h_end, w_start, w_end) in enumerate(views):
                     fg_mask_ = fg_mask[..., h_start:h_end, w_start:w_end]
-
+                    latent_ = latent[..., h_start:h_end, w_start:w_end].repeat(num_masks, 1, 1, 1)
 
                     # Additional arguments for the SDXL pipeline.
                     add_time_ids_input = add_time_ids.clone()
@@ -1312,16 +1336,16 @@ class StableMultiDiffusionSDXLPipeline(nn.Module):
                     if i < bootstrap_steps:
                        mix_ratio = min(1, max(0, boostrap_mix_steps - i))
                         # Treat the first foreground latent as the background latent if one does not exist.
-
+                        bg_latent_ = noise_bg_latents[i][..., h_start:h_end, w_start:w_end] if has_background else latent_[:1]
                         white_ = white[..., h_start:h_end, w_start:w_end]
-                        white_ = self.scheduler_add_noise(white_,
-
-
+                        white_ = self.scheduler_add_noise(white_, noise[..., h_start:h_end, w_start:w_end], i, initial=True)
+                        bg_latent_ = mix_ratio * white_ + (1.0 - mix_ratio) * bg_latent_
+                        latent_ = (1.0 - fg_mask_) * bg_latent_ + fg_mask_ * latent_
 
                         # Centering.
-
+                        latent_ = shift_to_mask_bbox_center(latent_, fg_mask_, reverse=True)
 
-                    latent_model_input = torch.cat([
+                    latent_model_input = torch.cat([latent_] * 2) if do_classifier_free_guidance else latent_
                     latent_model_input = self.scheduler_scale_model_input(latent_model_input, i)
 
                     # Perform one step of the reverse diffusion.
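The bootstrap schedule in this hunk decays linearly with the step index: with `boostrap_mix_steps = 2.5`, the white-canvas mix ratio over steps i = 0, 1, 2, 3 comes out to 1.0, 1.0, 0.5, 0.0, so early steps draw each region against a clean white background and later steps hand over to the real latents:

```python
for i in range(4):
    print(i, min(1.0, max(0.0, 2.5 - i)))  # 1.0, 1.0, 0.5, 0.0
```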
@@ -1341,33 +1365,32 @@ class StableMultiDiffusionSDXLPipeline(nn.Module):
                     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
 
                     if do_classifier_free_guidance and guidance_rescale > 0.0:
-                        # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
                         noise_pred = rescale_noise_cfg(noise_pred, noise_pred_cond, guidance_rescale=guidance_rescale)
 
-
+                    latent_ = self.scheduler_step(noise_pred, i, latent_)
 
                     if i < bootstrap_steps:
                         # Uncentering.
-
+                        latent_ = shift_to_mask_bbox_center(latent_, fg_mask_)
 
                         # Remove leakage (optional).
-                        leak = (
+                        leak = (latent_ - bg_latent_).pow(2).mean(dim=1, keepdim=True)
                         leak_sigmoid = torch.sigmoid(leak / bootstrap_leak_sensitivity) * 2 - 1
                         fg_mask_ = fg_mask_ * leak_sigmoid
 
                     # Mix the latents.
                     fg_mask_ = fg_mask_ * tile_masks[:, j:j + 1, h_start:h_end, w_start:w_end]
-                    value[..., h_start:h_end, w_start:w_end] += (fg_mask_ *
+                    value[..., h_start:h_end, w_start:w_end] += (fg_mask_ * latent_).sum(dim=0, keepdim=True)
                     count_all[..., h_start:h_end, w_start:w_end] += fg_mask_.sum(dim=0, keepdim=True)
 
-
+                latent = torch.where(count_all > 0, value / count_all, value)
                 bg_mask = (1 - count_all).clip_(0, 1)  # (T, 1, h, w)
                 if has_background:
-
+                    latent = (1 - bg_mask) * latent + bg_mask * noise_bg_latents[i + 1]  # bg_latent
 
                 # Noise is added after mixing.
                 if i < len(self.timesteps) - 1:
-
+                    latent = self.scheduler_add_noise(latent, None, i + 1)
 
             if not output_type == "latent":
                 # make sure the VAE is in float32 mode, as it overflows in float16
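The `value`/`count_all` accumulation completed in this hunk is the standard MultiDiffusion aggregation: every region's denoised latent is averaged per pixel, weighted by its mask coverage. A compact sketch of the same rule:

```python
import torch

def aggregate(latents: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:
    # latents: (N, C, h, w) region latents; masks: (N, 1, h, w) soft masks.
    value = (masks * latents).sum(dim=0, keepdim=True)
    count = masks.sum(dim=0, keepdim=True)
    # Where no mask covers a pixel, fall back to the raw accumulator (zero).
    return torch.where(count > 0, value / count, value)
```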
@@ -1375,7 +1398,7 @@ class StableMultiDiffusionSDXLPipeline(nn.Module):
 
                 if needs_upcasting:
                     self.upcast_vae()
-
+                    latent = latent.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
 
                 # unscale/denormalize the latents
                 # denormalize with the mean and std if available and not None
@@ -1383,22 +1406,22 @@ class StableMultiDiffusionSDXLPipeline(nn.Module):
                 has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
                 if has_latents_mean and has_latents_std:
                     latents_mean = (
-                        torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(
+                        torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latent.device, latent.dtype)
                     )
                     latents_std = (
-                        torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(
+                        torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latent.device, latent.dtype)
                     )
-
+                    latent = latent * latents_std / self.vae.config.scaling_factor + latents_mean
                 else:
-
+                    latent = latent / self.vae.config.scaling_factor
 
-                image = self.vae.decode(
+                image = self.vae.decode(latent, return_dict=False)[0]
 
                 # cast back to fp16 if needed
                 if needs_upcasting:
                     self.vae.to(dtype=torch.float16)
             else:
-                image =
+                image = latent
 
             # Return PIL Image.
             image = image[0].clip_(-1, 1) * 0.5 + 0.5
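The denormalization this hunk completes simply inverts the VAE's training-time latent normalization before decoding. Gathered into a single hypothetical helper for clarity (the function name is illustrative, not from the file):

```python
import torch
from diffusers import AutoencoderKL

def denormalize_and_decode(vae: AutoencoderKL, latent: torch.Tensor) -> torch.Tensor:
    cfg = vae.config
    if getattr(cfg, 'latents_mean', None) is not None and getattr(cfg, 'latents_std', None) is not None:
        mean = torch.tensor(cfg.latents_mean).view(1, 4, 1, 1).to(latent.device, latent.dtype)
        std = torch.tensor(cfg.latents_std).view(1, 4, 1, 1).to(latent.device, latent.dtype)
        latent = latent * std / cfg.scaling_factor + mean  # undo z = (x - mean) / std * sf
    else:
        latent = latent / cfg.scaling_factor               # undo z = x * sf
    return vae.decode(latent, return_dict=False)[0]
```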
@@ -1407,4 +1430,4 @@ class StableMultiDiffusionSDXLPipeline(nn.Module):
             image = blend(image, background[0], fg_mask)
         else:
             image = T.ToPILImage()(image)
-        return image
+        return image