alexnasa committed · Commit a0fd130 · verified · 1 Parent(s): 873eb28

Update app.py

Files changed (1):
  1. app.py +519 -372

app.py CHANGED
@@ -1,431 +1,578 @@
- #!/usr/bin/env python
- # Out-of-Focus v1.0 — Zero GPU-ready edition
- # -------------------------------------------------------------
- # 0. Imports (⚠️ keep `import spaces` FIRST)
- # -------------------------------------------------------------
- import warnings, os, gc, math, argparse, pickle
- warnings.filterwarnings("ignore")

- import spaces                      # ← mandatory for Zero GPU
- import torch, torchvision
  import torch.nn.functional as F
  import numpy as np
-
- from typing import Optional, Dict, Any
  from PIL import Image
- from diffusers import (DiffusionPipeline, DDIMInverseScheduler,
-                        DDIMScheduler, AutoencoderKL)
- from diffusers.models.attention_processor import (
-     Attention, AttnProcessor2_0
- )
  from safetensors.torch import load_file
  from huggingface_hub import hf_hub_download
- import gradio as gr
-
- # -------------------------------------------------------------
- # 1. Globals (initialised lazily inside the GPU context)
- # -------------------------------------------------------------
- PIPE: Optional[DiffusionPipeline] = None
- INVERSE_SCHEDULER: Optional[DDIMInverseScheduler] = None
- SCHEDULER: Optional[DDIMScheduler] = None
- TORCH_DTYPE = torch.float16        # H100/A100 FP16 slice
-
- # your existing state dictionaries / sliders
- weights: Dict[str, Dict[int, Dict[int, float]]] = {}
- res_list, foreground_mask = [], None
- heighest_resolution, signal_value, blur_value = -1, 2.0, None
- allowed_res_max = 1.0
- guidance_scale_value, num_inference_steps = 7.5, 10
- max_scale_value = 16
- res_range_min, res_range_max = 128, 1024
-
- # -------------------------------------------------------------
- # 2. Lazy pipeline loader (runs inside GPU context)
- # -------------------------------------------------------------
- def _get_pipeline() -> tuple[DiffusionPipeline,
-                              DDIMInverseScheduler,
-                              DDIMScheduler]:
-     """Initialise Stable Diffusion + schedulers on first call."""
-     global PIPE, INVERSE_SCHEDULER, SCHEDULER
-
-     if PIPE is None:               # first GPU call ➜ download
-         model_id = "runwayml/stable-diffusion-v1-5"
-         vae_folder = "vae"
-         resadapter_name = "resadapter_v2_sd1.5"
-
-         PIPE = DiffusionPipeline.from_pretrained(
-             model_id, torch_dtype=TORCH_DTYPE
-         ).to("cuda")
-
-         # external VAE
-         PIPE.vae = AutoencoderKL.from_pretrained(
-             model_id, subfolder=vae_folder, torch_dtype=TORCH_DTYPE
-         ).to("cuda")
-
-         # Res-Adapter LoRA + Norm weights
-         lora_path = hf_hub_download(
-             "jiaxiangc/res-adapter",
-             subfolder=resadapter_name,
-             filename="pytorch_lora_weights.safetensors"
-         )
-         norm_path = hf_hub_download(
-             "jiaxiangc/res-adapter",
-             subfolder=resadapter_name,
-             filename="diffusion_pytorch_model.safetensors"
-         )
-         PIPE.load_lora_weights(lora_path, adapter_name="res_adapter")
-         PIPE.set_adapters(["res_adapter"], adapter_weights=[1.0])
-         PIPE.unet.load_state_dict(load_file(norm_path), strict=False)

-         # schedulers
-         INVERSE_SCHEDULER = DDIMInverseScheduler.from_pretrained(
-             model_id, subfolder="scheduler"
-         )
-         SCHEDULER = DDIMScheduler.from_pretrained(
-             model_id, subfolder="scheduler"
-         )
-     return PIPE, INVERSE_SCHEDULER, SCHEDULER
-
- # -------------------------------------------------------------
- # 3. Helper functions (unchanged from your original)
- # -------------------------------------------------------------
- def save_state_to_file(state):           # … unchanged
      filename = "state.pkl"
      with open(filename, "wb") as f:
          pickle.dump(state, f)
      return filename

- def load_state_from_file(filename):      # … unchanged
      with open(filename, "rb") as f:
-         return pickle.load(f)

  def weight_population(layer_type, resolution, depth, value):
-     global heighest_resolution
      if layer_type not in weights:
          weights[layer_type] = {}
      if resolution not in weights[layer_type]:
          weights[layer_type][resolution] = {}
      if resolution > heighest_resolution:
          heighest_resolution = resolution
      weights[layer_type][resolution][depth] = value

- def resize_image_with_aspect(img, res_min=128, res_max=1024):
-     w, h = img.size
-     if w < res_min or h < res_min:
-         s = max(res_min / w, res_min / h)
-     elif w > res_max or h > res_max:
-         s = min(res_max / w, res_max / h)
      else:
-         s = 1
-     return img.resize(
-         (int(w * s), int(h * s)), Image.Resampling.LANCZOS
-     )

- def adjust_ends(vals, delta):
-     # helpers used by update_scale
-     for i in range(len(vals)):
-         if (delta > 0 and vals[i + 1] == 1.0) or (
-             delta < 0 and vals[i] > 0.0
-         ):
-             vals[i] += delta
-             break
-     for i in range(len(vals) - 1, -1, -1):
-         if (delta > 0 and vals[i - 1] == 1.0) or (
-             delta < 0 and vals[i] > 0.0
-         ):
-             vals[i] += delta
-             break
-     return vals

- def update_scale(scale):
-     global weights
-     values_flat = []
-     for _, d in weights.items():
-         for _, v in d.items():
-             for _ in v:
-                 values_flat.append(1.0)
-     for _ in range(scale, max_scale_value):
-         adjust_ends(values_flat, -0.5)
-     idx = 0
-     for k1, d in weights.items():
-         for k2 in d:
-             for k3 in d[k2]:
-                 weights[k1][k2][k3] = values_flat[idx]
-                 idx += 1
-
- # -------------------------------------------------------------
- # 4. Custom attention processor (unchanged)
- # -------------------------------------------------------------
  class AttnReplaceProcessor(AttnProcessor2_0):
-     def __init__(self, replace_all, layer_type,
-                  layer_count, blur_sigma=None):
          super().__init__()
          self.replace_all = replace_all
          self.layer_type = layer_type
          self.layer_count = layer_count
          self.blur_sigma = blur_sigma

      def __call__(
-         self, attn: Attention, hidden_states: torch.Tensor,
-         encoder_hidden_states: Optional[torch.Tensor] = None,
-         attention_mask: Optional[torch.Tensor] = None,
-         temb: Optional[torch.Tensor] = None, *args, **kwargs
-     ) -> torch.Tensor:
-
-         dim2 = hidden_states.shape[1]
-         is_cross = encoder_hidden_states is not None
          residual = hidden_states

-         # (norms & projections identical to original code)
-         # --- omitted for brevity, copy your original implementation ---
-         # replace attention values when self.replace_all is True
-         #     using global `weights`
-         # --------------------------------------------------------------

-         return hidden_states          # after residual & rescale

  def replace_attention_processor(unet, clear=False, blur_sigma=None):
-     attn_count = 0
      for name, module in unet.named_modules():
          if "attn1" in name and "to" not in name:
              layer_type = name.split(".")[0].split("_")[0]
-             attn_count += 1
-             module.processor = AttnReplaceProcessor(
-                 not clear, layer_type, attn_count, blur_sigma
-             )
-
- # -------------------------------------------------------------
- # 5. GPU-bound functions
- # -------------------------------------------------------------
- @spaces.GPU(duration=120)          # 2 min quota
- def reconstruct(input_img: Image.Image, caption: str):
-     """
-     Reconstruct the input image & latents.
-     Returns: (np_image, caption, slider_val, meta_state)
-     """
-     pipe, inv_sched, sched = _get_pipeline()
-
-     img = resize_image_with_aspect(input_img,
-                                    res_range_min, res_range_max)
-     transform = torchvision.transforms.ToTensor()
-     loaded = transform(img).half().to("cuda").unsqueeze(0)
-     if loaded.shape[1] == 4:       # drop alpha
-         loaded = loaded[:, :3, :, :]

-     with torch.no_grad():
-         enc = pipe.vae.encode(loaded * 2 - 1)
-         real_latents = pipe.vae.config.scaling_factor * \
-                        enc.latent_dist.sample()

-     # inversion pass
-     inv_sched.set_timesteps(num_inference_steps, device="cuda")
-     latents = real_latents.clone()
-     inversed_latents = [latents]

-     def store_latent(_, step, __, cb_kwargs):
-         if step != num_inference_steps - 1:
-             inversed_latents.append(cb_kwargs["latents"])
-         return cb_kwargs
-
-     replace_attention_processor(pipe.unet, True)
-     pipe.scheduler = inv_sched
-     pipe(prompt=caption,
-          guidance_scale=1.0,
-          output_type="latent",
-          num_inference_steps=num_inference_steps,
-          latents=latents,
-          callback_on_step_end=store_latent,
-          callback_on_step_end_tensor_inputs=["latents"])
-
-     real_initial = inversed_latents[-1]
-     # forward synthesis with CFG
-     sched.set_timesteps(num_inference_steps, device="cuda")
-     replace_attention_processor(pipe.unet, True)
-
-     def adjust_latent(_, step, __, cb_kwargs):
-         cb_kwargs["latents"] = inversed_latents[
-             len(sched.timesteps) - 1 - step
-         ].detach()
-         return cb_kwargs
-
-     latents = pipe(prompt=caption,
-                    guidance_scale=guidance_scale_value,
-                    output_type="latent",
-                    num_inference_steps=num_inference_steps,
-                    latents=real_initial,
-                    callback_on_step_end=adjust_latent,
-                    callback_on_step_end_tensor_inputs=["latents"])[0]
-
-     image = pipe.vae.decode(
-         latents / pipe.vae.config.scaling_factor, return_dict=False
-     )[0]
-     img_np = image.squeeze(0).float().permute(1, 2, 0).detach().cpu()
-     img_np = ((img_np / 2 + 0.5).clamp(0, 1).numpy() * 255).astype(np.uint8)
-
-     update_scale(12)               # initial cross-attn value
-
-     pipe.to("cpu"); torch.cuda.empty_cache()
-     return img_np, caption, 12, [caption, real_initial.detach(),
-                                  inversed_latents, weights]
-
- @spaces.GPU(duration=120)          # 2 min quota
- def apply_prompt(meta_data: Any, new_prompt: str):
-     """
-     Re-generate the image using stored latents + new prompt.
-     """
-     pipe, _, sched = _get_pipeline()
-     caption, real_latents, inversed, _ = meta_data
-
-     steps = len(inversed)
-     sched.set_timesteps(steps, device="cuda")
-
-     initial = torch.cat([real_latents] * 2)
-     def adjust_latent(_, step, __, cb_kwargs):
          replace_attention_processor(pipe.unet)
-         delta = inversed[len(sched.timesteps) - 1 - step].detach()
-         cb_kwargs["latents"][1] += delta - cb_kwargs["latents"][0]
-         cb_kwargs["latents"][0] = delta
-         return cb_kwargs
-
-     latents = pipe(
-         prompt=[caption, new_prompt],
-         negative_prompt=["", ""],
-         guidance_scale=guidance_scale_value,
-         output_type="latent",
-         num_inference_steps=steps,
-         latents=initial,
-         callback_on_step_end=adjust_latent,
-         callback_on_step_end_tensor_inputs=["latents"]
-     )[0][1]
-
-     replace_attention_processor(pipe.unet, True)
-     image = pipe.vae.decode(
-         latents.unsqueeze(0) / pipe.vae.config.scaling_factor,
-         return_dict=False
-     )[0]
-     img_np = image.squeeze(0).float().permute(1, 2, 0).detach().cpu()
-     img_np = ((img_np / 2 + 0.5).clamp(0, 1).numpy() * 255).astype(np.uint8)
-
-     pipe.to("cpu"); torch.cuda.empty_cache()
-     return img_np
-
- # -------------------------------------------------------------
- # 6. Lightweight CPU callbacks
- # -------------------------------------------------------------
  def on_image_change(filepath):
-     fname = os.path.splitext(os.path.basename(filepath))[0]
-     if fname in ["example1", "example3", "example4"]:
-         meta = load_state_from_file(f"assets/{fname}-turbo.pkl")
          global weights
-         _, _, _, weights = meta
          global num_inference_steps
          num_inference_steps = 10
-         scale_val = 8 if fname == "example1" else 6 if fname == "example3" else 13
-         new_prompt = {
-             "example1": "a photo of a tree, summer, colourful",
-             "example3": ("a realistic photo of a female warrior, flowing "
-                          "dark purple or black hair, bronze shoulder armour, "
-                          "leather chest piece, sky background with clouds"),
-             "example4": ("a photo of plastic bottle on some sand, beach "
-                          "background, sky background")
-         }[fname]
-         update_scale(scale_val)
-         img = apply_prompt(meta, new_prompt)
-         return filepath, img, meta, num_inference_steps, scale_val, scale_val
-     return None
-
- def update_step(val):
      global num_inference_steps
-     num_inference_steps = val

- # -------------------------------------------------------------
- # 7. Gradio UI (unchanged layout)
- # -------------------------------------------------------------
  with gr.Blocks(analytics_enabled=False) as demo:
      gr.Markdown(
-         """<div style="text-align:center">
-         <img src="https://github.com/user-attachments/assets/55a38e74-ab93-4d80-91c8-0fa6130af45a">
-         <h1>Out of Focus v1.0 Turbo (Zero GPU)</h1>
-         <p>Prompt-based image reconstruction & manipulation.</p></div>"""
      )
-
      with gr.Row():
          with gr.Column():
-             example_in = gr.Image(type="filepath", visible=False)
-             img_in = gr.Image(type="pil",
-                               label="Upload Source Image")
-             steps = gr.Slider(minimum=5, maximum=50, step=5,
-                               value=num_inference_steps,
-                               label="Steps")
-             prompt_box = gr.Textbox(label="Prompt")
-             recon_btn = gr.Button("Reconstruct")
          with gr.Column():
-             recon_img = gr.Image(type="pil", label="Result")
-             inv_slider = gr.Slider(minimum=0, maximum=9, step=1,
-                                    value=7, visible=False)
-             xattn = gr.Slider(minimum=0, maximum=max_scale_value,
-                               step=1, value=max_scale_value,
-                               label="Cross-Attention Influence")
-             new_box = gr.Textbox(label="New Prompt", interactive=False)
-             apply_btn = gr.Button("Generate Vision",
-                                   variant="primary", interactive=False)
-
-     gr.Examples(
-         examples=[
-             ["assets/example4.png",
-              "a photo of plastic bottle on a rock, mountain background, sky background",
-              "a photo of plastic bottle on some sand, beach background, sky background",
-              13],
-             ["assets/example1.png",
-              "a photo of a tree, spring, foggy",
-              "a photo of a tree, summer, colourful",
-              8],
-             ["assets/example3.png",
-              ("a digital illustration of a female warrior, flowing "
-               "dark purple or black hair, bronze shoulder armour, "
-               "leather chest piece, sky background with clouds"),
-              ("a realistic photo of a female warrior, flowing "
-               "dark purple or black hair, bronze shoulder armour, "
-               "leather chest piece, sky background with clouds"),
-              6],
-         ],
-         inputs=[example_in, prompt_box, new_box, xattn],
-     )
-
-     meta_state = gr.State()
-
-     example_in.change(
-         on_image_change,
-         inputs=example_in,
-         outputs=[img_in, recon_img, meta_state,
-                  steps, inv_slider, xattn]
-     ).then(lambda: gr.update(interactive=True),
-            outputs=[apply_btn, new_box])
-
-     steps.release(update_step, inputs=steps)
-     xattn.release(update_scale, inputs=xattn)
-
-     recon_btn.click(
-         reconstruct,
-         inputs=[img_in, prompt_box],
-         outputs=[recon_img, new_box, xattn, meta_state]
-     ).then(lambda: gr.update(interactive=True),
-            outputs=[recon_btn, new_box, apply_btn])
-
-     recon_btn.click(lambda: gr.update(interactive=False),
-                     outputs=[recon_btn, apply_btn])
-
-     apply_btn.click(apply_prompt,
-                     inputs=[meta_state, new_box],
-                     outputs=recon_img)
-
- # -------------------------------------------------------------
- # 8. Launch
- # -------------------------------------------------------------
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--share", action="store_true",
-                         help="Enable public Gradio sharing")
-     args = parser.parse_args()
      demo.queue()
-     demo.launch(share=args.share, inbrowser=True)
+ import warnings

+ warnings.filterwarnings("ignore")
+ from diffusers import DiffusionPipeline, DDIMInverseScheduler, DDIMScheduler, AutoencoderKL
+ import torch
+ from typing import Optional
+ from tqdm import tqdm
+ from diffusers.models.attention_processor import Attention, AttnProcessor2_0
+ import torchvision
+ import torch.nn as nn
  import torch.nn.functional as F
+ import gc
+ import gradio as gr
  import numpy as np
+ import os
+ import pickle
+ import argparse
  from PIL import Image
+ import requests
+ import math
+ import torch
  from safetensors.torch import load_file
  from huggingface_hub import hf_hub_download
+ from diffusers import DiffusionPipeline
+ import spaces

+ def save_state_to_file(state):
      filename = "state.pkl"
      with open(filename, "wb") as f:
          pickle.dump(state, f)
      return filename

+
+ def load_state_from_file(filename):
      with open(filename, "rb") as f:
+         state = pickle.load(f)
+     return state
+
+ guidance_scale_value = 7.5
+ num_inference_steps = 10
+ weights = {}
+ res_list = []
+ foreground_mask = None
+ heighest_resolution = -1
+ signal_value = 2.0
+ blur_value = None
+ allowed_res_max = 1.0

+
+ def load_pipeline():
+
+     model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+     vae_model_id = "madebyollin/sdxl-vae-fp16-fix"
+     vae_folder = ""
+     guidance_scale_value = 7.5
+     resadapter_model_name = "resadapter_v2_sdxl"
+     res_range_min = 256
+     res_range_max = 1536
+
+     torch_dtype = torch.float16
+
+     # torch_dtype = torch.float16
+     pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch_dtype).to("cuda")
+     pipe.vae = AutoencoderKL.from_pretrained(vae_model_id, subfolder=vae_folder, torch_dtype=torch_dtype).to("cuda")
+     pipe.load_lora_weights(
+         hf_hub_download(repo_id="jiaxiangc/res-adapter", subfolder=resadapter_model_name, filename="pytorch_lora_weights.safetensors"),
+         adapter_name="res_adapter",
+     )  # load lora weights
+     pipe.set_adapters(["res_adapter"], adapter_weights=[1.0])
+     pipe.unet.load_state_dict(
+         load_file(hf_hub_download(repo_id="jiaxiangc/res-adapter", subfolder=resadapter_model_name, filename="diffusion_pytorch_model.safetensors")),
+         strict=False,
+     )  # load norm weights
+
+     inverse_scheduler = DDIMInverseScheduler.from_pretrained(model_id, subfolder="scheduler")
+     scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler")
+
+     return pipe, inverse_scheduler, scheduler
  def weight_population(layer_type, resolution, depth, value):
+     # Check if layer_type exists, if not, create it
      if layer_type not in weights:
          weights[layer_type] = {}
+
+     # Check if resolution exists under layer_type, if not, create it
      if resolution not in weights[layer_type]:
          weights[layer_type][resolution] = {}
+
+     global heighest_resolution
      if resolution > heighest_resolution:
          heighest_resolution = resolution
+
+     # Add/Modify the value at the specified depth (which can be a string)
      weights[layer_type][resolution][depth] = value

+ def resize_image_with_aspect(image, res_range_min=128, res_range_max=1024):
+     # Get the original width and height of the image
+     width, height = image.size
+
+     # Determine the scaling factor to maintain the aspect ratio
+     scaling_factor = 1
+     if width < res_range_min or height < res_range_min:
+         scaling_factor = max(res_range_min / width, res_range_min / height)
+     elif width > res_range_max or height > res_range_max:
+         scaling_factor = min(res_range_max / width, res_range_max / height)
+
+     # Calculate the new dimensions
+     new_width = int(width * scaling_factor)
+     new_height = int(height * scaling_factor)
+
+     print(f'{new_width}-{new_height}')
+
+     # Resize the image with the new dimensions while maintaining the aspect ratio
+     resized_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+
+     return resized_image
+
+ @spaces.GPU()
+ def reconstruct(input_img, caption):
+
+     pipe, inverse_scheduler, scheduler = load_pipeline()
+
+     global weights
+     weights = {}
+
+     prompt = caption
+
+     img = input_img
+
+     img = resize_image_with_aspect(img, res_range_min, res_range_max)
+
+     transform = torchvision.transforms.Compose([
+         torchvision.transforms.ToTensor()
+     ])
+
+     if torch_dtype == torch.float16:
+         loaded_image = transform(img).half().to("cuda").unsqueeze(0)
      else:
+         loaded_image = transform(img).to("cuda").unsqueeze(0)

+     if loaded_image.shape[1] == 4:
+         loaded_image = loaded_image[:,:3,:,:]
+
+     with torch.no_grad():
+         encoded_image = pipe.vae.encode(loaded_image*2 - 1)
+         real_image_latents = pipe.vae.config.scaling_factor * encoded_image.latent_dist.sample()
+
+
+     # notice we disabled the CFG here by setting guidance scale as 1
+     guidance_scale = 1.0
+     inverse_scheduler.set_timesteps(num_inference_steps, device="cuda")
+     timesteps = inverse_scheduler.timesteps
+
+     latents = real_image_latents
+
+     inversed_latents = [latents]
+
+     def store_latent(pipe, step, timestep, callback_kwargs):
+         latents = callback_kwargs["latents"]
+
+         with torch.no_grad():
+             if step != num_inference_steps - 1:
+                 inversed_latents.append(latents)
+
+         return callback_kwargs
+
+     with torch.no_grad():
+
+         replace_attention_processor(pipe.unet, True)
+
+         pipe.scheduler = inverse_scheduler
+         latents = pipe(prompt=prompt,
+                        guidance_scale = guidance_scale,
+                        output_type="latent",
+                        return_dict=False,
+                        num_inference_steps=num_inference_steps,
+                        latents=latents,
+                        callback_on_step_end=store_latent,
+                        callback_on_step_end_tensor_inputs=["latents"],)[0]
+
+     # initial state
+     real_image_initial_latents = latents
+
+     guidance_scale = guidance_scale_value
+     scheduler.set_timesteps(num_inference_steps, device="cuda")
+     timesteps = scheduler.timesteps
+
+     def adjust_latent(pipe, step, timestep, callback_kwargs):
+
+         with torch.no_grad():
+             callback_kwargs["latents"] = inversed_latents[len(timesteps) - 1 - step].detach()
+
+         return callback_kwargs
+
+     with torch.no_grad():
+
+         replace_attention_processor(pipe.unet, True)
+
+         intermediate_values = real_image_initial_latents.clone()
+
+         pipe.scheduler = scheduler
+         intermediate_values = pipe(prompt=prompt,
+                                    guidance_scale = guidance_scale,
+                                    output_type="latent",
+                                    return_dict=False,
+                                    num_inference_steps=num_inference_steps,
+                                    latents=intermediate_values,
+                                    callback_on_step_end=adjust_latent,
+                                    callback_on_step_end_tensor_inputs=["latents"],)[0]
+
+     image = pipe.vae.decode(intermediate_values / pipe.vae.config.scaling_factor, return_dict=False)[0]
+     image_np = image.squeeze(0).float().permute(1, 2, 0).detach().cpu()
+     image_np = (image_np / 2 + 0.5).clamp(0, 1).numpy()
+     image_np = (image_np * 255).astype(np.uint8)
+
+     update_scale(12)
+
+     return image_np, caption, 12, [caption, real_image_initial_latents.detach(), inversed_latents, weights]

  class AttnReplaceProcessor(AttnProcessor2_0):
+
+     def __init__(self, replace_all, layer_type, layer_count, blur_sigma=None):
          super().__init__()
          self.replace_all = replace_all
          self.layer_type = layer_type
          self.layer_count = layer_count
+         self.weight_populated = False
          self.blur_sigma = blur_sigma

      def __call__(
+         self,
+         attn: Attention,
+         hidden_states: torch.FloatTensor,
+         encoder_hidden_states: Optional[torch.FloatTensor] = None,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         temb: Optional[torch.FloatTensor] = None,
+         *args,
+         **kwargs,
+     ) -> torch.FloatTensor:
+
+
+         dimension_squared = hidden_states.shape[1]
+
+         is_cross = not encoder_hidden_states is None
+
          residual = hidden_states
+         if attn.spatial_norm is not None:
+             hidden_states = attn.spatial_norm(hidden_states, temb)
+
+         input_ndim = hidden_states.ndim
+
+         if input_ndim == 4:
+             batch_size, channel, height, width = hidden_states.shape
+             hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+         batch_size, sequence_length, _ = (
+             hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+         )
+
+         if attention_mask is not None:
+             attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+             # scaled_dot_product_attention expects attention_mask shape to be
+             # (batch, heads, source_length, target_length)
+             attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+         if attn.group_norm is not None:
+             hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+         query = attn.to_q(hidden_states)
+
+         if encoder_hidden_states is None:
+             encoder_hidden_states = hidden_states
+         elif attn.norm_cross:
+             encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

+         key = attn.to_k(encoder_hidden_states)
+         value = attn.to_v(encoder_hidden_states)
+
+         inner_dim = key.shape[-1]
+         head_dim = inner_dim // attn.heads
+
+         query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+         key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+         value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+         height = width = math.isqrt(query.shape[2])
+
+
+         if self.replace_all:
+             weight_value = weights[self.layer_type][dimension_squared][self.layer_count]
+
+             ucond_attn_scores, attn_scores = query.chunk(2)
+             attn_scores[1].copy_(weight_value * attn_scores[0] + (1.0 - weight_value) * attn_scores[1])
+             ucond_attn_scores[1].copy_(weight_value * ucond_attn_scores[0] + (1.0 - weight_value) * ucond_attn_scores[1])
+
+
+             ucond_attn_scores, attn_scores = key.chunk(2)
+             attn_scores[1].copy_(weight_value * attn_scores[0] + (1.0 - weight_value) * attn_scores[1])
+             ucond_attn_scores[1].copy_(weight_value * ucond_attn_scores[0] + (1.0 - weight_value) * ucond_attn_scores[1])
+         else:
+             weight_population(self.layer_type, dimension_squared, self.layer_count, 1.0)
+
+
+         hidden_states = F.scaled_dot_product_attention(
+             query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False,
+         )

+         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+         hidden_states = hidden_states.to(query.dtype)
+
+         # linear proj
+         hidden_states = attn.to_out[0](hidden_states)
+         # dropout
+         hidden_states = attn.to_out[1](hidden_states)
+
+         if input_ndim == 4:
+             hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+         if attn.residual_connection:
+             hidden_states = hidden_states + residual
+
+         hidden_states = hidden_states / attn.rescale_output_factor
+
+         return hidden_states

  def replace_attention_processor(unet, clear=False, blur_sigma=None):
+     attention_count = 0
+
+
      for name, module in unet.named_modules():
          if "attn1" in name and "to" not in name:
              layer_type = name.split(".")[0].split("_")[0]
+             attention_count += 1
+
+             if not clear:
+                 if layer_type == "down":
+                     module.processor = AttnReplaceProcessor(True, layer_type, attention_count, blur_sigma=blur_sigma)
+                 elif layer_type == "mid":
+                     module.processor = AttnReplaceProcessor(True, layer_type, attention_count, blur_sigma=blur_sigma)
+                 elif layer_type == "up":
+                     module.processor = AttnReplaceProcessor(True, layer_type, attention_count, blur_sigma=blur_sigma)
+
+             else:
+                 module.processor = AttnReplaceProcessor(False, layer_type, attention_count, blur_sigma=blur_sigma)

+ @spaces.GPU()
+ def apply_prompt(meta_data, new_prompt):

+     pipe, inverse_scheduler, scheduler = load_pipeline()
+
+     caption, real_image_initial_latents, inversed_latents, _ = meta_data
+     negative_prompt = ""
+
+     inference_steps = len(inversed_latents)
+
+     guidance_scale = guidance_scale_value
+     scheduler.set_timesteps(inference_steps, device="cuda")
+     timesteps = scheduler.timesteps
+
+     initial_latents = torch.cat([real_image_initial_latents] * 2)
+
+     def adjust_latent(pipe, step, timestep, callback_kwargs):
+         replace_attention_processor(pipe.unet)
+
+         with torch.no_grad():
+             callback_kwargs["latents"][1] = callback_kwargs["latents"][1] + (inversed_latents[len(timesteps) - 1 - step].detach() - callback_kwargs["latents"][0])
+             callback_kwargs["latents"][0] = inversed_latents[len(timesteps) - 1 - step].detach()
+
+         return callback_kwargs
+
+
+     with torch.no_grad():

          replace_attention_processor(pipe.unet)
+
+         pipe.scheduler = scheduler
+         latents = pipe(prompt=[caption, new_prompt],
+                        negative_prompt=[negative_prompt, negative_prompt],
+                        guidance_scale = guidance_scale,
+                        output_type="latent",
+                        return_dict=False,
+                        num_inference_steps=num_inference_steps,
+                        latents=initial_latents,
+                        callback_on_step_end=adjust_latent,
+                        callback_on_step_end_tensor_inputs=["latents"],)[0]
+
+     replace_attention_processor(pipe.unet, True)
+
+     image = pipe.vae.decode(latents[1].unsqueeze(0) / pipe.vae.config.scaling_factor, return_dict=False)[0]
+     image_np = image.squeeze(0).float().permute(1, 2, 0).detach().cpu()
+     image_np = (image_np / 2 + 0.5).clamp(0, 1).numpy()
+     image_np = (image_np * 255).astype(np.uint8)
+
+     return image_np
+
+
  def on_image_change(filepath):
+     # Extract the filename without extension
+     filename = os.path.splitext(os.path.basename(filepath))[0]
+
+     if filename in ["example1", "example3", "example4"]:
+
+         meta_data_raw = load_state_from_file(f"assets/{filename}-turbo.pkl")
+
          global weights
+         _, _, _, weights = meta_data_raw
+
          global num_inference_steps
          num_inference_steps = 10
+         scale_value = 7
+
+         if filename == "example1":
+             scale_value = 8
+             new_prompt = "a photo of a tree, summer, colourful"
+
+         elif filename == "example3":
+             scale_value = 6
+             new_prompt = "a realistic photo of a female warrior, flowing dark purple or black hair, bronze shoulder armour, leather chest piece, sky background with clouds"
+
+         elif filename == "example4":
+             scale_value = 13
+             new_prompt = "a photo of plastic bottle on some sand, beach background, sky background"
+
+         update_scale(scale_value)
+         img = apply_prompt(meta_data_raw, new_prompt)
+
+         return filepath, img, meta_data_raw, num_inference_steps, scale_value, scale_value
+
+
+ def update_value(value, layer_type, resolution, depth):
+     global weights
+     weights[layer_type][resolution][depth] = value
+
+
+ def update_step(value):
      global num_inference_steps
+     num_inference_steps = value
+
+ def adjust_ends(values, adjustment):
+     # Forward loop to adjust the first valid element from the left
+     for i in range(len(values)):
+         if (adjustment > 0 and values[i + 1] == 1.0) or (adjustment < 0 and values[i] > 0.0):
+             values[i] = values[i] + adjustment
+             break
+
+     # Backward loop to adjust the first valid element from the right
+     for i in range(len(values)-1, -1, -1):
+         if (adjustment > 0 and values[i - 1] == 1.0) or (adjustment < 0 and values[i] > 0.0):
+             values[i] = values[i] + adjustment
+             break
+
+     return values
+
+ max_scale_value = 16
+
+ def update_scale(scale):
+     global weights
+
+     value_count = 0
+
+     for outer_key, inner_dict in weights.items():
+         for inner_key, values in inner_dict.items():
+             for _, value in enumerate(values):
+                 value_count += 1
+
+     list_values = [1.0] * value_count
+
+     for _ in range(scale, max_scale_value):
+         adjust_ends(list_values, -0.5)
+
+     value_index = 0
+
+     for outer_key, inner_dict in weights.items():
+         for inner_key, values in inner_dict.items():
+             for idx, value in enumerate(values):
+
+                 weights[outer_key][inner_key][value] = list_values[value_index]
+                 value_index += 1
+
+
+ if __name__ == "__main__":
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--share", action="store_true", help="Enable sharing of the Gradio interface")
+     args = parser.parse_args()
+
+     num_inference_steps = 10
+
+     model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+     vae_model_id = "madebyollin/sdxl-vae-fp16-fix"
+     vae_folder = ""
+     guidance_scale_value = 7.5
+     resadapter_model_name = "resadapter_v2_sdxl"
+     res_range_min = 256
+     res_range_max = 1536
+
+     torch_dtype = torch.float16

      with gr.Blocks(analytics_enabled=False) as demo:
          gr.Markdown(
+             """
+             <div style="text-align: center;">
+                 <div style="display: flex; justify-content: center;">
+                     <img src="https://github.com/user-attachments/assets/55a38e74-ab93-4d80-91c8-0fa6130af45a" alt="Logo">
+                 </div>
+                 <h1>Out of Focus v1.0 Turbo</h1>
+                 <p style="font-size:16px;">Out of AI presents a flexible tool to manipulate your images. This is our first version of Image modification tool through prompt manipulation by reconstruction through diffusion inversion process</p>
+             </div>
+             <br>
+             <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
+                 <a href="https://www.buymeacoffee.com/outofai" target="_blank"><img src="https://img.shields.io/badge/-buy_me_a%C2%A0coffee-red?logo=buy-me-a-coffee" alt="Buy Me A Coffee"></a> &ensp;
+                 <a href="https://twitter.com/OutofAi" target="_blank"><img src="https://img.shields.io/twitter/url/https/twitter.com/cloudposse.svg?style=social&label=Out"></a>
+             </div>
+             """
          )
          with gr.Row():
              with gr.Column():
+
+                 with gr.Row():
+                     example_input = gr.Image(type="filepath", visible=False)
+                     image_input = gr.Image(type="pil", label="Upload Source Image")
+                 steps_slider = gr.Slider(minimum=5, maximum=50, step=5, value=num_inference_steps, label="Steps", info="Number of inference steps required to reconstruct and modify the image")
+                 prompt_input = gr.Textbox(label="Prompt", info="Give an initial prompt in details, describing the image")
+                 reconstruct_button = gr.Button("Reconstruct")
              with gr.Column():
+
+                 with gr.Row():
+                     reconstructed_image = gr.Image(type="pil", label="Reconstructed")
+                     invisible_slider = gr.Slider(minimum=0, maximum=9, step=1, value=7, visible=False)
+                 interpolate_slider = gr.Slider(minimum=0, maximum=max_scale_value, step=1, value=max_scale_value, label="Cross-Attention Influence", info="Scales the related influence the source image has on the target image")
+                 new_prompt_input = gr.Textbox(label="New Prompt", interactive=False, info="Manipulate the image by changing the prompt or adding words at the end; swap words instead of adding or removing them for better results")
+
+                 with gr.Row():
+                     apply_button = gr.Button("Generate Vision", variant="primary", interactive=False)
+
+         with gr.Row():
+             show_case = gr.Examples(
+                 examples=[
+                     ["assets/example4.png", "a photo of plastic bottle on a rock, mountain background, sky background", "a photo of plastic bottle on some sand, beach background, sky background", 13],
+                     ["assets/example1.png", "a photo of a tree, spring, foggy", "a photo of a tree, summer, colourful", 8],
+                     [
+                         "assets/example3.png",
+                         "a digital illustration of a female warrior, flowing dark purple or black hair, bronze shoulder armour, leather chest piece, sky background with clouds",
+                         "a realistic photo of a female warrior, flowing dark purple or black hair, bronze shoulder armour, leather chest piece, sky background with clouds",
+                         6,
+                     ],
+                 ],
+                 inputs=[example_input, prompt_input, new_prompt_input, interpolate_slider],
+                 label=None,
+             )
+
+         meta_data = gr.State()
+
+         example_input.change(fn=on_image_change, inputs=example_input, outputs=[image_input, reconstructed_image, meta_data, steps_slider, invisible_slider, interpolate_slider]).then(lambda: gr.update(interactive=True), outputs=apply_button).then(
+             lambda: gr.update(interactive=True), outputs=new_prompt_input
+         )
+         steps_slider.release(update_step, inputs=steps_slider)
+         interpolate_slider.release(update_scale, inputs=interpolate_slider)
+
+         value_trigger = True
+
+         def triggered():
+             global value_trigger
+             value_trigger = not value_trigger
+             return value_trigger
+
+         reconstruct_button.click(reconstruct, inputs=[image_input, prompt_input], outputs=[reconstructed_image, new_prompt_input, interpolate_slider, meta_data]).then(lambda: gr.update(interactive=True), outputs=reconstruct_button).then(lambda: gr.update(interactive=True), outputs=new_prompt_input).then(
+             lambda: gr.update(interactive=True), outputs=apply_button
+         )
+
+         reconstruct_button.click(lambda: gr.update(interactive=False), outputs=reconstruct_button)
+
+         reconstruct_button.click(lambda: gr.update(interactive=False), outputs=apply_button)
+
+         apply_button.click(apply_prompt, inputs=[meta_data, new_prompt_input], outputs=reconstructed_image)
+
          demo.queue()
+         demo.launch(share=args.share, inbrowser=True)
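
Note on the mechanism shown above: the new AttnReplaceProcessor.__call__ blends the self-attention queries and keys of the edited sample toward those of the source sample, using the per-layer weight stored in the global weights dict (which the "Cross-Attention Influence" slider adjusts through update_scale). A minimal, self-contained sketch of that interpolation idea, using hypothetical toy tensors rather than the Space's real batch layout, could look like this:

    import torch

    def blend_toward_source(src: torch.Tensor, tgt: torch.Tensor, weight: float) -> torch.Tensor:
        # weight = 1.0 copies the source attention features outright;
        # weight = 0.0 leaves the target's own features untouched.
        return weight * src + (1.0 - weight) * tgt

    # toy shapes: (heads, tokens, head_dim) for one source/target pair
    src_q = torch.randn(8, 64, 40)
    tgt_q = torch.randn(8, 64, 40)
    blended_q = blend_toward_source(src_q, tgt_q, weight=0.75)

In app.py itself the blend is applied in place on the chunked query and key tensors, for both the unconditional and conditional halves of the batch. To try the updated script locally, the __main__ block parses a --share flag, so running python app.py --share should queue and launch the Gradio demo, assuming the Space's dependencies are installed.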