Spaces: Running on Zero

Update app.py

app.py CHANGED
Removed lines (old version of app.py):

@@ -1,7 +1,6 @@
-from diffusers import

@@ -14,193 +13,205 @@ import gradio as gr
-from PIL import Image
-import requests
-import math
-import torch
-from safetensors.torch import load_file
-from huggingface_hub import hf_hub_download
-from diffusers import DiffusionPipeline
-import spaces
-    with open(filename,
-        pickle.dump(state, f)
-@spaces.GPU()
-    with open(filename,
-        state = pickle.load(f)
-    return state
-    res_list = []
-    foreground_mask = None
-    heighest_resolution = -1
-    signal_value = 2.0
-    blur_value = None
-    allowed_res_max = 1.0
-# Device configuration
-device = "cuda"
-print(f"Using device: {device}")
-@spaces.GPU()
-def weight_population(layer_type, resolution, depth, value):
-    # Check if layer_type exists, if not, create it
-    if layer_type not in weights:
-        weights[layer_type] = {}
-    # Check if resolution exists under layer_type, if not, create it
-    if resolution not in weights[layer_type]:
-        weights[layer_type][resolution] = {}
-    if resolution > heighest_resolution:
-        heighest_resolution = resolution
-    # Add/Modify the value at the specified depth (which can be a string)
-    weights[layer_type][resolution][depth] = value
-def resize_image_with_aspect(image, res_range_min=128, res_range_max=1024):
-    # Get the original width and height of the image
-    width, height = image.size
-    # Determine the scaling factor to maintain the aspect ratio
-    scaling_factor = 1
-    if width < res_range_min or height < res_range_min:
-        scaling_factor = max(res_range_min / width, res_range_min / height)
-    elif width > res_range_max or height > res_range_max:
-        scaling_factor = min(res_range_max / width, res_range_max / height)
-    # Calculate the new dimensions
-    new_width = int(width * scaling_factor)
-    new_height = int(height * scaling_factor)
-    # Resize the image with the new dimensions while maintaining the aspect ratio
-    resized_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
-    return resized_image
-    transform = torchvision.transforms.Compose([
-        torchvision.transforms.ToTensor()
-    ])
-    else:
-        loaded_image = transform(img).to(device).unsqueeze(0)
-    loaded_image = loaded_image[:,:3,:,:]
-    with torch.no_grad():
-        encoded_image = pipe.vae.encode(loaded_image*2 - 1)
-        real_image_latents = pipe.vae.config.scaling_factor * encoded_image.latent_dist.sample()
-    inverse_scheduler.set_timesteps(num_inference_steps, device=device)
-    timesteps = inverse_scheduler.timesteps
-    inversed_latents = [latents]
-        if step != num_inference_steps - 1:
-            inversed_latents.append(latents)
-        callback_on_step_end_tensor_inputs=["latents"],)[0]
-    real_image_initial_latents = latents
-    guidance_scale = guidance_scale_value
-    scheduler.set_timesteps(num_inference_steps, device=device)
-    timesteps = scheduler.timesteps
-        callback_kwargs["latents"] = inversed_latents[len(timesteps) - 1 - step].detach()
-    with torch.no_grad():
-        guidance_scale = guidance_scale,
-        output_type="latent",
-        return_dict=False,
-        num_inference_steps=num_inference_steps,
-        latents=intermediate_values,
-        callback_on_step_end=adjust_latent,
-        callback_on_step_end_tensor_inputs=["latents"],)[0]
-    image_np = image.squeeze(0).float().permute(1, 2, 0).detach().cpu()
-    image_np = (image_np / 2 + 0.5).clamp(0, 1).numpy()
-    image_np = (image_np * 255).astype(np.uint8)
-    update_scale(12)
-@spaces.GPU()
-    def __init__(self, replace_all,
-        self.
-        self.layer_count = layer_count
-        self.weight_populated = False
-        self.blur_sigma = blur_sigma

@@ -213,31 +224,20 @@ class AttnReplaceProcessor(AttnProcessor2_0):
-        dimension_squared = hidden_states.shape[1]
-        residual = hidden_states
-        if attn.spatial_norm is not None:
-            hidden_states = attn.spatial_norm(hidden_states, temb)
-        batch_size,
-        if attention_mask is not None:
-            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-            # scaled_dot_product_attention expects attention_mask shape to be
-            # (batch, heads, source_length, target_length)
-            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

@@ -251,43 +251,27 @@ class AttnReplaceProcessor(AttnProcessor2_0):
-        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        if self.replace_all:
-            weight_value = weights[self.layer_type][dimension_squared][self.layer_count]
-            ucond_attn_scores, attn_scores = query.chunk(2)
-            attn_scores[1].copy_(weight_value * attn_scores[0] + (1.0 - weight_value) * attn_scores[1])
-            ucond_attn_scores[1].copy_(weight_value * ucond_attn_scores[0] + (1.0 - weight_value) * ucond_attn_scores[1])
-            ucond_attn_scores, attn_scores = key.chunk(2)
-            attn_scores[1].copy_(weight_value * attn_scores[0] + (1.0 - weight_value) * attn_scores[1])
-            ucond_attn_scores[1].copy_(weight_value * ucond_attn_scores[0] + (1.0 - weight_value) * ucond_attn_scores[1])
-        else:
-            weight_population(self.layer_type, dimension_squared, self.layer_count, 1.0)
-        hidden_states =
-        hidden_states =
-        # linear proj
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)

@@ -299,296 +283,289 @@ class AttnReplaceProcessor(AttnProcessor2_0):
-def replace_attention_processor(unet, clear=False, blur_sigma=None):
-    attention_count = 0
-    for name, module in unet.named_modules():
-        if "attn1" in name and "to" not in name:
-            layer_type = name.split(".")[0].split("_")[0]
-            attention_count += 1
-            if not clear:
-                if layer_type == "down":
-                    module.processor = AttnReplaceProcessor(True, layer_type, attention_count, blur_sigma=blur_sigma)
-                elif layer_type == "mid":
-                    module.processor = AttnReplaceProcessor(True, layer_type, attention_count, blur_sigma=blur_sigma)
-                elif layer_type == "up":
-                    module.processor = AttnReplaceProcessor(True, layer_type, attention_count, blur_sigma=blur_sigma)
-            else:
-                module.processor = AttnReplaceProcessor(False, layer_type, attention_count, blur_sigma=blur_sigma)
-@spaces.GPU()
-    timesteps = scheduler.timesteps
-        return callback_kwargs
-    replace_attention_processor(pipe.unet)
-    pipe.scheduler = scheduler
-    latents = pipe(prompt=[caption, new_prompt],
-        negative_prompt=[negative_prompt, negative_prompt],
-        guidance_scale = guidance_scale,
-        output_type="latent",
-        return_dict=False,
-        num_inference_steps=num_inference_steps,
-        latents=initial_latents,
-        callback_on_step_end=adjust_latent,
-        callback_on_step_end_tensor_inputs=["latents"],)[0]
-    replace_attention_processor(pipe.unet, True)
-    image = pipe.vae.decode(latents[1].unsqueeze(0) / pipe.vae.config.scaling_factor, return_dict=False)[0]
-    image_np = image.squeeze(0).float().permute(1, 2, 0).detach().cpu()
-    image_np = (image_np / 2 + 0.5).clamp(0, 1).numpy()
-    image_np = (image_np * 255).astype(np.uint8)
-    return image_np
-@spaces.GPU()
-def on_image_change(filepath):
-    # Extract the filename without extension
-    filename = os.path.splitext(os.path.basename(filepath))[0]
-    if filename in ["example1", "example3", "example4"]:
-        meta_data_raw = load_state_from_file(f"assets/{filename}-turbo.pkl")
-        num_inference_steps =
-            scale_value =
-            scale_value =
-            scale_value =
-def update_value(value, layer_type, resolution, depth):
-    weights[
-def
-    for i in range(len(values)):
-        if (adjustment > 0 and values[i + 1] == 1.0) or (adjustment < 0 and values[i] > 0.0):
-            values[i] = values[i] + adjustment
-            break
-    return values
-max_scale_value = 16
-@spaces.GPU()
-def update_scale(scale):
-    value_count = 0
-        for inner_key
-    list_values = [1.0] * value_count
-    for _ in range(scale, max_scale_value):
-        adjust_ends(list_values, -0.5)
-    value_index = 0
-    for outer_key, inner_dict in weights.items():
-        for inner_key, values in inner_dict.items():
-            for idx, value in enumerate(values):
-                weights[outer_key][inner_key][value] = list_values[value_index]
-                value_index += 1
-@spaces.GPU()
-def load_pipeline():
-    model_id = "runwayml/stable-diffusion-v1-5"
-    vae_model_id = "runwayml/stable-diffusion-v1-5"
-    vae_folder = "vae"
-    guidance_scale_value = 7.5
-    resadapter_model_name = "resadapter_v2_sd1.5"
-    res_range_min = 128
-    res_range_max = 1024
-    torch_dtype = torch.float16
-    # torch_dtype = torch.float16
-    pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
-    pipe.vae = AutoencoderKL.from_pretrained(vae_model_id, subfolder=vae_folder, torch_dtype=torch_dtype).to(device)
-    pipe.load_lora_weights(
-        hf_hub_download(repo_id="jiaxiangc/res-adapter", subfolder=resadapter_model_name, filename="pytorch_lora_weights.safetensors"),
-        adapter_name="res_adapter",
-    ) # load lora weights
-    pipe.set_adapters(["res_adapter"], adapter_weights=[1.0])
-    pipe.unet.load_state_dict(
-        load_file(hf_hub_download(repo_id="jiaxiangc/res-adapter", subfolder=resadapter_model_name, filename="diffusion_pytorch_model.safetensors")),
-        strict=False,
-    ) # load norm weights
-    inverse_scheduler = DDIMInverseScheduler.from_pretrained(model_id, subfolder="scheduler")
-    scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler")
-    return pipe, inverse_scheduler, scheduler
-parser = argparse.ArgumentParser()
-parser.add_argument("--share", action="store_true", help="Enable sharing of the Gradio interface")
-args = parser.parse_args()
-num_inference_steps = 10
-# model_id = "stabilityai/stable-diffusion-xl-base-1.0"
-# vae_model_id = "madebyollin/sdxl-vae-fp16-fix"
-# vae_folder = ""
-# guidance_scale_value = 7.5
-# resadapter_model_name = "resadapter_v2_sdxl"
-# res_range_min = 256
-# res_range_max = 1536
-model_id = "runwayml/stable-diffusion-v1-5"
-vae_model_id = "runwayml/stable-diffusion-v1-5"
-vae_folder = "vae"
-guidance_scale_value = 7.5
-resadapter_model_name = "resadapter_v2_sd1.5"
-res_range_min = 128
-res_range_max = 1024
-torch_dtype = torch.float16
-with gr.Blocks(
-        <h1>Out of Focus
-        <a href="https://twitter.com/OutofAi" target="_blank"><img src="https://img.shields.io/twitter/url/https/twitter.com/cloudposse.svg?style=social&label=
-    example_input.change(
-    interpolate_slider.release(update_scale, inputs=interpolate_slider)
-    reconstruct_button.click(
-        lambda: gr.update(interactive=True),
-    reconstruct_button.click(
-    demo.launch(share=args.share
Updated app.py (unchanged context and added lines):

@@ -1,7 +1,6 @@
 import warnings
 warnings.filterwarnings("ignore")
+from diffusers import StableDiffusionPipeline, DDIMInverseScheduler, DDIMScheduler
 import torch
 from typing import Optional
 from tqdm import tqdm
@@ -14,193 +13,205 @@ import gradio as gr
 import numpy as np
 import os
 import pickle
+from transformers import CLIPImageProcessor
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 import argparse

+weights = {
+    'down': {
+        4096: 0.0,
+        1024: 1.0,
+        256: 1.0,
+    },
+    'mid': {
+        64: 1.0,
+    },
+    'up': {
+        256: 1.0,
+        1024: 1.0,
+        4096: 0.0,
+    }
+}
+num_inference_steps = 10
+model_id = "stabilityai/stable-diffusion-2-1-base"
+
+pipe = StableDiffusionPipeline.from_pretrained(model_id).to("cuda")
+inverse_scheduler = DDIMInverseScheduler.from_pretrained(model_id, subfolder="scheduler")
+scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler")
+
+safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker").to("cuda")
+feature_extractor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+should_stop = False
+
 def save_state_to_file(state):
     filename = "state.pkl"
+    with open(filename, 'wb') as f:
+        pickle.dump(state, f)
     return filename

 def load_state_from_file(filename):
+    with open(filename, 'rb') as f:
+        state = pickle.load(f)
+    return state

+def stop_reconstruct():
+    global should_stop
+    should_stop = True

+def reconstruct(input_img, caption):

+    img = input_img

+    cond_prompt_embeds = pipe.encode_prompt(prompt=caption, device="cuda", num_images_per_prompt=1, do_classifier_free_guidance=False)[0]
+    uncond_prompt_embeds = pipe.encode_prompt(prompt="", device="cuda", num_images_per_prompt=1, do_classifier_free_guidance=False)[0]

+    prompt_embeds_combined = torch.cat([uncond_prompt_embeds, cond_prompt_embeds])
+
+    transform = torchvision.transforms.Compose([
+        torchvision.transforms.Resize((512, 512)),
+        torchvision.transforms.ToTensor()
+    ])
+
+    loaded_image = transform(img).to("cuda").unsqueeze(0)
+
+    if loaded_image.shape[1] == 4:
+        loaded_image = loaded_image[:,:3,:,:]
+
+    with torch.no_grad():
+        encoded_image = pipe.vae.encode(loaded_image*2 - 1)
+        real_image_latents = pipe.vae.config.scaling_factor * encoded_image.latent_dist.sample()
+
+    guidance_scale = 1
+    inverse_scheduler.set_timesteps(num_inference_steps, device="cuda")
+    timesteps = inverse_scheduler.timesteps
+
+    latents = real_image_latents
+
+    inversed_latents = []
+
+    with torch.no_grad():
+
+        replace_attention_processor(pipe.unet, True)
+
+        for i, t in tqdm(enumerate(timesteps), total=len(timesteps), desc="Inference steps"):

+            inversed_latents.append(latents)

+            latent_model_input = torch.cat([latents] * 2)

+            noise_pred = pipe.unet(
+                latent_model_input,
+                t,
+                encoder_hidden_states=prompt_embeds_combined,
+                cross_attention_kwargs=None,
+                return_dict=False,
+            )[0]

+            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

+            latents = inverse_scheduler.step(noise_pred, t, latents, return_dict=False)[0]

+    # initial state
+    real_image_initial_latents = latents

+    W_values = uncond_prompt_embeds.repeat(num_inference_steps, 1, 1)
+    QT = nn.Parameter(W_values.clone())

+    guidance_scale = 7.5
+    scheduler.set_timesteps(num_inference_steps, device="cuda")
+    timesteps = scheduler.timesteps

+    optimizer = torch.optim.AdamW([QT], lr=0.008)

+    pipe.vae.eval()
+    pipe.vae.requires_grad_(False)
+    pipe.unet.eval()
+    pipe.unet.requires_grad_(False)

+    last_loss = 1

+    for epoch in range(50):
+        gc.collect()
+        torch.cuda.empty_cache()

+        if last_loss < 0.02:
+            break
+        elif last_loss < 0.03:
+            for param_group in optimizer.param_groups:
+                param_group['lr'] = 0.003
+        elif last_loss < 0.035:
+            for param_group in optimizer.param_groups:
+                param_group['lr'] = 0.006

+        intermediate_values = real_image_initial_latents.clone()

+        for i in range(num_inference_steps):
+            latents = intermediate_values.detach().clone()

+            t = timesteps[i]

+            prompt_embeds = torch.cat([QT[i].unsqueeze(0), cond_prompt_embeds.detach()])

+            latent_model_input = torch.cat([latents] * 2)

+            noise_pred_model = pipe.unet(
+                latent_model_input,
+                t,
+                encoder_hidden_states=prompt_embeds,
+                cross_attention_kwargs=None,
+                return_dict=False,
+            )[0]

+            noise_pred_uncond, noise_pred_text = noise_pred_model.chunk(2)
+            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

+            intermediate_values = scheduler.step(noise_pred, t, latents, return_dict=False)[0]

+        loss = F.mse_loss(inversed_latents[len(timesteps) - 1 - i].detach(), intermediate_values, reduction="mean")
+        last_loss = loss
+
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        global should_stop
+        if should_stop:
+            should_stop = False
+            break
+
+        image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor, return_dict=False)[0]
+        image = (image / 2.0 + 0.5).clamp(0.0, 1.0)
+        safety_checker_input = feature_extractor(image, return_tensors="pt", do_rescale=False).to("cuda")
+        image = safety_checker(images=[image], clip_input=safety_checker_input.pixel_values.to("cuda"))[0]
+        image_np = image[0].squeeze(0).float().permute(1,2,0).detach().cpu().numpy()
+        image_np = (image_np * 255).astype(np.uint8)
+
+        yield image_np, caption, [caption, real_image_initial_latents, QT]
+
+    image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor, return_dict=False)[0]
+    image = (image / 2.0 + 0.5).clamp(0.0, 1.0)
+    safety_checker_input = feature_extractor(image, return_tensors="pt", do_rescale=False).to("cuda")
+    image = safety_checker(images=[image], clip_input=safety_checker_input.pixel_values.to("cuda"))[0]
+    image_np = image[0].squeeze(0).float().permute(1,2,0).detach().cpu().numpy()
+    image_np = (image_np * 255).astype(np.uint8)
+
+    yield image_np, caption, [caption, real_image_initial_latents, QT]
+
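Note on the reconstruction step above: reconstruct() first records a DDIM-inversion trajectory for the source image, then optimises one learnable "null-text" embedding per timestep (the QT parameter) with AdamW and an MSE loss so that re-denoising from the inverted latent lands back on the recorded source latent. The toy snippet below mirrors only that optimisation pattern; toy_denoise_step, the tensor shapes and the random targets are stand-ins and are not the Space's UNet, VAE or scheduler.

    # Toy sketch of per-timestep embedding optimisation (all names here are stand-ins).
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    num_steps = 10
    x_start = torch.randn(1, 4, 64, 64)    # stand-in for the DDIM-inverted latent
    x_target = torch.randn(1, 4, 64, 64)   # stand-in for the recorded source-image latent

    QT = nn.Parameter(torch.zeros(num_steps, 77, 768))   # one learnable embedding per step
    optimizer = torch.optim.AdamW([QT], lr=0.008)

    def toy_denoise_step(x, emb):
        # placeholder for pipe.unet(...) followed by scheduler.step(...)
        return x - 0.1 * torch.tanh(emb.mean()) * x

    for epoch in range(20):
        x = x_start.clone()
        for i in range(num_steps):
            x = toy_denoise_step(x, QT[i])   # each step is driven by its own embedding
        loss = F.mse_loss(x, x_target)       # pull the re-denoised latent toward the target
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()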
 class AttnReplaceProcessor(AttnProcessor2_0):

+    def __init__(self, replace_all, weight):
         super().__init__()
         self.replace_all = replace_all
+        self.weight = weight

     def __call__(
         self,
@@ -213,31 +224,20 @@ class AttnReplaceProcessor(AttnProcessor2_0):
         **kwargs,
     ) -> torch.FloatTensor:

+        residual = hidden_states

         is_cross = not encoder_hidden_states is None

         input_ndim = hidden_states.ndim

         if input_ndim == 4:
             batch_size, channel, height, width = hidden_states.shape
             hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

+        batch_size, _, _ = (
             hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
         )

         if attn.group_norm is not None:
             hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

@@ -251,43 +251,27 @@ class AttnReplaceProcessor(AttnProcessor2_0):
         key = attn.to_k(encoder_hidden_states)
         value = attn.to_v(encoder_hidden_states)

+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)

+        attention_scores = attn.scale * torch.bmm(query, key.transpose(-1, -2))

+        dimension_squared = hidden_states.shape[1]

+        if not is_cross and (self.replace_all):
+            ucond_attn_scores_src, ucond_attn_scores_dst, attn_scores_src, attn_scores_dst = attention_scores.chunk(4)
+            attn_scores_dst.copy_(self.weight[dimension_squared] * attn_scores_src + (1.0 - self.weight[dimension_squared]) * attn_scores_dst)
+            ucond_attn_scores_dst.copy_(self.weight[dimension_squared] * ucond_attn_scores_src + (1.0 - self.weight[dimension_squared]) * ucond_attn_scores_dst)

+        attention_probs = attention_scores.softmax(dim=-1)
+        del attention_scores

+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+        del attention_probs

         hidden_states = attn.to_out[0](hidden_states)

         if input_ndim == 4:
             hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
@@ -299,296 +283,289 @@ class AttnReplaceProcessor(AttnProcessor2_0):

         return hidden_states

+def replace_attention_processor(unet, clear = False):
+
+    for name, module in unet.named_modules():
+        if 'attn1' in name and 'to' not in name:
+            layer_type = name.split('.')[0].split('_')[0]
+
+            if not clear:
+                if layer_type == 'down':
+                    module.processor = AttnReplaceProcessor(True, weights['down'])
+                elif layer_type == 'mid':
+                    module.processor = AttnReplaceProcessor(True, weights['mid'])
+                elif layer_type == 'up':
+                    module.processor = AttnReplaceProcessor(True, weights['up'])
+            else:
+                module.processor = AttnReplaceProcessor(False, 0.0)

 def apply_prompt(meta_data, new_prompt):

+    caption, real_image_initial_latents, QT = meta_data
+
+    inference_steps = len(QT)
+
+    cond_prompt_embeds = pipe.encode_prompt(prompt=caption, device="cuda", num_images_per_prompt=1, do_classifier_free_guidance=False)[0]
+    # uncond_prompt_embeds = pipe.encode_prompt(prompt=caption, device="cuda", num_images_per_prompt=1, do_classifier_free_guidance=False)[0]
+    new_prompt_embeds = pipe.encode_prompt(prompt=new_prompt, device="cuda", num_images_per_prompt=1, do_classifier_free_guidance=False)[0]
+
+    guidance_scale = 7.5
+    scheduler.set_timesteps(inference_steps, device="cuda")
+    timesteps = scheduler.timesteps

+    latents = torch.cat([real_image_initial_latents] * 2)

+    with torch.no_grad():
+        replace_attention_processor(pipe.unet)

+        for i, t in tqdm(enumerate(timesteps), total=len(timesteps), desc="Inference steps"):

+            modified_prompt_embeds = torch.cat([QT[i].unsqueeze(0), QT[i].unsqueeze(0), cond_prompt_embeds, new_prompt_embeds])
+            latent_model_input = torch.cat([latents] * 2)

+            noise_pred = pipe.unet(
+                latent_model_input,
+                t,
+                encoder_hidden_states=modified_prompt_embeds,
+                cross_attention_kwargs=None,
+                return_dict=False,
+            )[0]

+            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            latents = scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+    replace_attention_processor(pipe.unet, True)
+
+    image = pipe.vae.decode(latents[1].unsqueeze(0) / pipe.vae.config.scaling_factor, return_dict=False)[0]
+    image = (image / 2.0 + 0.5).clamp(0.0, 1.0)
+    safety_checker_input = feature_extractor(image, return_tensors="pt", do_rescale=False).to("cuda")
+    image = safety_checker(images=[image], clip_input=safety_checker_input.pixel_values.to("cuda"))[0]
+    image_np = image[0].squeeze(0).float().permute(1,2,0).detach().cpu().numpy()
+    image_np = (image_np * 255).astype(np.uint8)

+    return image_np


+def on_image_change(filepath):
+    # Extract the filename without extension
+    filename = os.path.splitext(os.path.basename(filepath))[0]
+
+    # Check if the filename is "example1" or "example2"
+    if filename in ["example1", "example2", "example3", "example4"]:
+        meta_data_raw = load_state_from_file(f"assets/{filename}.pkl")
+        _, _, QT_raw = meta_data_raw

         global num_inference_steps
+        num_inference_steps = len(QT_raw)
         scale_value = 7
+        new_prompt = ""

         if filename == "example1":
+            scale_value = 7
             new_prompt = "a photo of a tree, summer, colourful"
+
+        elif filename == "example2":
+            scale_value = 8
+            new_prompt = "a photo of a panda, two ears, white background"

         elif filename == "example3":
+            scale_value = 7
             new_prompt = "a realistic photo of a female warrior, flowing dark purple or black hair, bronze shoulder armour, leather chest piece, sky background with clouds"
+
         elif filename == "example4":
+            scale_value = 7
             new_prompt = "a photo of plastic bottle on some sand, beach background, sky background"

         update_scale(scale_value)
         img = apply_prompt(meta_data_raw, new_prompt)
+
     return filepath, img, meta_data_raw, num_inference_steps, scale_value, scale_value

+def update_value(value, key, res):
     global weights
+    weights[key][res] = value

 def update_step(value):
     global num_inference_steps
     num_inference_steps = value

+def update_scale(scale):
+    values = [1.0] * 7

+    if scale == 9:
+        return values
+
+    reduction_steps = (9 - scale) * 0.5
+
+    for i in range(4): # There are 4 positions to reduce symmetrically
+        if reduction_steps >= 1:
+            values[i] = 0.0
+            values[-(i + 1)] = 0.0
+            reduction_steps -= 1
+        elif reduction_steps > 0:
+            values[i] = 0.5
+            values[-(i + 1)] = 0.5
             break

     global weights
+    index = 0

     for outer_key, inner_dict in weights.items():
+        for inner_key in inner_dict:
+            inner_dict[inner_key] = values[index]
+            index += 1

+    return weights['down'][4096], weights['down'][1024], weights['down'][256], weights['mid'][64], weights['up'][256], weights['up'][1024], weights['up'][4096]
+

+with gr.Blocks() as demo:
     gr.Markdown(
+    '''
     <div style="text-align: center;">
         <div style="display: flex; justify-content: center;">
             <img src="https://github.com/user-attachments/assets/55a38e74-ab93-4d80-91c8-0fa6130af45a" alt="Logo">
         </div>
+        <h1>Out of Focus 1.0</h1>
         <p style="font-size:16px;">Out of AI presents a flexible tool to manipulate your images. This is our first version of Image modification tool through prompt manipulation by reconstruction through diffusion inversion process</p>
     </div>
     <br>
     <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
         <a href="https://www.buymeacoffee.com/outofai" target="_blank"><img src="https://img.shields.io/badge/-buy_me_a%C2%A0coffee-red?logo=buy-me-a-coffee" alt="Buy Me A Coffee"></a>
+        <a href="https://twitter.com/OutofAi" target="_blank"><img src="https://img.shields.io/twitter/url/https/twitter.com/cloudposse.svg?style=social&label=Ashleigh%20Watson"></a>
+        <a href="https://twitter.com/banterless_ai" target="_blank"><img src="https://img.shields.io/twitter/url/https/twitter.com/cloudposse.svg?style=social&label=Alex%20Nasa"></a>
     </div>
+    <br>
+    <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
+        <p style="display: flex;gap: 6px;">
+            <a href="https://huggingface.co/spaces/fffiloni/OutofFocus?duplicate=true">
+                <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-md.svg" alt="Duplicate this Space">
+            </a> to skip the queue and enjoy faster inference on the GPU of your choice
+        </p>
+    </div>
+    '''
+    )
     with gr.Row():
+        with gr.Column():
+
+            with gr.Row():
+                example_input = gr.Image(height=512, width=512, type="filepath", visible=False)
+                image_input = gr.Image(height=512, width=512, type="pil", label="Upload Source Image")
+            steps_slider = gr.Slider(minimum=5, maximum=25, step=5, value=num_inference_steps, label="Steps", info="Number of inference steps required to reconstruct and modify the image")
+            prompt_input = gr.Textbox(label="Prompt", info="Give an initial prompt in details, describing the image")
+            reconstruct_button = gr.Button("Reconstruct")
+            stop_button = gr.Button("Stop", variant="stop", interactive=False)
+        with gr.Column():
+            reconstructed_image = gr.Image(type="pil", label="Reconstructed")
+
+            with gr.Row():
+                invisible_slider = gr.Slider(minimum=0, maximum=9, step=1, value=7, visible=False)
+                interpolate_slider = gr.Slider(minimum=0, maximum=9, step=1, value=7, label="Cross-Attention Influence", info="Scales the related influence the source image has on the target image")
+            with gr.Row():
+                new_prompt_input = gr.Textbox(label="New Prompt", interactive=False, info="Manipulate the image by changing the prompt or word addition at the end, achieve the best results by swapping words instead of adding or removing in between")
+            with gr.Row():
+                apply_button = gr.Button("Generate Vision", variant="primary", interactive=False)
+            with gr.Row():
+                with gr.Accordion(label="Advanced Options", open=False):
+                    gr.Markdown(
+                    '''
+                    <div style="text-align: center;">
+                        <h1>Weight Adjustment</h1>
+                        <p style="font-size:16px;">Specific Cross-Attention Influence weights can be manually modified for given resolutions (1.0 = Fully Source Attn 0.0 = Fully Target Attn)</p>
+                    </div>
+                    '''
+                    )
+                    down_slider_4096 = gr.Number(minimum=0.0, maximum=1.0, step=0.1, value=weights['down'][4096], label="Self-Attn Down 64x64")
+                    down_slider_1024 = gr.Number(minimum=0.0, maximum=1.0, step=0.1, value=weights['down'][1024], label="Self-Attn Down 32x32")
+                    down_slider_256 = gr.Number(minimum=0.0, maximum=1.0, step=0.1, value=weights['down'][256], label="Self-Attn Down 16x16")
+                    mid_slider_64 = gr.Number(minimum=0.0, maximum=1.0, step=0.1, value=weights['mid'][64], label="Self-Attn Mid 8x8")
+                    up_slider_256 = gr.Number(minimum=0.0, maximum=1.0, step=0.1, value=weights['up'][256], label="Self-Attn Up 16x16")
+                    up_slider_1024 = gr.Number(minimum=0.0, maximum=1.0, step=0.1, value=weights['up'][1024], label="Self-Attn Up 32x32")
+                    up_slider_4096 = gr.Number(minimum=0.0, maximum=1.0, step=0.1, value=weights['up'][4096], label="Self-Attn Up 64x64")
+
+    with gr.Row():
+        show_case = gr.Examples(
+            examples=[
+                ["assets/example4.png", "a photo of plastic bottle on a rock, mountain background, sky background", "a photo of plastic bottle on some sand, beach background, sky background"],
+                ["assets/example1.png", "a photo of a tree, spring, foggy", "a photo of a tree, summer, colourful"],
+                ["assets/example2.png", "a photo of a cat, two ears, white background", "a photo of a panda, two ears, white background"],
+                ["assets/example3.png", "a digital illustration of a female warrior, flowing dark purple or black hair, bronze shoulder armour, leather chest piece, sky background with clouds", "a realistic photo of a female warrior, flowing dark purple or black hair, bronze shoulder armour, leather chest piece, sky background with clouds"],
+
+            ],
+            inputs=[example_input, prompt_input, new_prompt_input],
+            label=None
+        )

     meta_data = gr.State()

+    example_input.change(
+        fn=on_image_change,
+        inputs=example_input,
+        outputs=[image_input, reconstructed_image, meta_data, steps_slider, invisible_slider, interpolate_slider]
+    ).then(
+        lambda: gr.update(interactive=True),
+        outputs=apply_button
+    ).then(
+        lambda: gr.update(interactive=True),
+        outputs=new_prompt_input
     )
     steps_slider.release(update_step, inputs=steps_slider)
+    interpolate_slider.release(update_scale, inputs=interpolate_slider, outputs=[down_slider_4096, down_slider_1024, down_slider_256, mid_slider_64, up_slider_256, up_slider_1024, up_slider_4096 ])
+    invisible_slider.change(update_scale, inputs=invisible_slider, outputs=[down_slider_4096, down_slider_1024, down_slider_256, mid_slider_64, up_slider_256, up_slider_1024, up_slider_4096 ])
+
+    up_slider_4096.change(update_value, inputs=[up_slider_4096, gr.State('up'), gr.State(4096)])
+    up_slider_1024.change(update_value, inputs=[up_slider_1024, gr.State('up'), gr.State(1024)])
+    up_slider_256.change(update_value, inputs=[up_slider_256, gr.State('up'), gr.State(256)])
+
+    down_slider_4096.change(update_value, inputs=[down_slider_4096, gr.State('down'), gr.State(4096)])
+    down_slider_1024.change(update_value, inputs=[down_slider_1024, gr.State('down'), gr.State(1024)])
+    down_slider_256.change(update_value, inputs=[down_slider_256, gr.State('down'), gr.State(256)])
+
+    mid_slider_64.change(update_value, inputs=[mid_slider_64, gr.State('mid'), gr.State(64)])
+
+    reconstruct_button.click(reconstruct, inputs=[image_input, prompt_input], outputs=[reconstructed_image, new_prompt_input, meta_data]).then(
+        lambda: gr.update(interactive=True),
+        outputs=reconstruct_button
+    ).then(
+        lambda: gr.update(interactive=True),
+        outputs=new_prompt_input
+    ).then(
+        lambda: gr.update(interactive=True),
+        outputs=apply_button
+    ).then(
+        lambda: gr.update(interactive=False),
+        outputs=stop_button
+    )

+    reconstruct_button.click(
+        lambda: gr.update(interactive=False),
+        outputs=reconstruct_button
+    )

+    reconstruct_button.click(
+        lambda: gr.update(interactive=True),
+        outputs=stop_button
     )

+    reconstruct_button.click(
+        lambda: gr.update(interactive=False),
+        outputs=apply_button
+    )

+    stop_button.click(
+        lambda: gr.update(interactive=False),
+        outputs=stop_button
+    )

     apply_button.click(apply_prompt, inputs=[meta_data, new_prompt_input], outputs=reconstructed_image)
+    stop_button.click(stop_reconstruct)

+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--share", action="store_true")
+    args = parser.parse_args()
     demo.queue()
+    demo.launch(share=args.share)