Modified examples

- app.py (+79 -7)
- bounded_attention.py (+18 -16)
app.py
CHANGED
@@ -19,6 +19,30 @@ MIN_SIZE = 0.01
 WHITE = 255
 COLORS = ["red", "blue", "green", "orange", "purple", "turquoise", "olive"]
 
+PROMPT1 = "3D Pixar animation of a cute unicorn and a pink hedgehog and a nerdy owl traveling in a magical forest"
+PROMPT2 = "science fiction movie poster with an astronaut and a robot and a green alien and a spaceship"
+PROMPT3 = "a golden retriever and a german shepherd and a boston terrier and an english bulldog and a border collie in a pool"
+EXAMPLE_BOXES = {
+    PROMPT1 : [
+        [0.35, 0.4, 0.65, 0.9],
+        [0, 0.6, 0.3, 0.9],
+        [0.7, 0.55, 1, 0.85]
+    ],
+    PROMPT2: [
+        [0.4, 0.45, 0.6, 0.95],
+        [0.2, 0.3, 0.4, 0.85],
+        [0.6, 0.3, 0.8, 0.85],
+        [0.1, 0, 0.9, 0.3]
+    ],
+    PROMPT3: [
+        [0, 0.5, 0.2, 0.8],
+        [0.2, 0.2, 0.4, 0.5],
+        [0.4, 0.5, 0.6, 0.8],
+        [0.6, 0.2, 0.8, 0.5],
+        [0.8, 0.5, 1, 0.8]
+    ],
+}
+
 CSS = """
 #paper-info a {
     color:#008AD7;
@@ -88,8 +112,11 @@ ADVANCED_OPTION_DESCRIPTION = """
 <div class="tooltip">Final step size ⓘ
 <span class="tooltiptext">The final step size of the linear step size scheduler when performing guidance.</span>
 </div>
+<div class="tooltip">First refinement step ⓘ
+<span class="tooltiptext">The timestep from which subject mask refinement is performed.</span>
+</div>
 <div class="tooltip">Number of self-attention clusters per subject ⓘ
-<span class="tooltiptext">
+<span class="tooltiptext">The number of clusters computed when clustering the self-attention maps (#clusters = #subject x #clusters_per_subject). Changing this value might improve semantics (adherence to the prompt), especially when the subjects exceed their bounding boxes.</span>
 </div>
 <div class="tooltip">Cross-attention loss scale factor ⓘ
 <span class="tooltiptext">The scale factor of the cross-attention loss term. Increasing it will improve semantic control (adherence to the prompt), but may reduce image quality.</span>
@@ -120,6 +147,7 @@ def inference(
     num_tokens,
     init_step_size,
     final_step_size,
+    first_refinement_step,
     num_clusters_per_subject,
     cross_loss_scale,
     self_loss_scale,
@@ -158,6 +186,7 @@ def inference(
         start_step_size=init_step_size,
         end_step_size=final_step_size,
         loss_stopping_value=loss_threshold,
+        min_clustering_step=first_refinement_step,
         num_clusters_per_box=num_clusters_per_subject,
         max_resolution=32,
     )
@@ -174,6 +203,7 @@ def generate(
     num_tokens,
     init_step_size,
     final_step_size,
+    first_refinement_step,
     num_clusters_per_subject,
     cross_loss_scale,
     self_loss_scale,
@@ -185,6 +215,7 @@ def generate(
     seed,
     boxes,
 ):
+    print('boxes in generate', boxes)
     subject_token_indices = convert_token_indices(subject_token_indices, nested=True)
     if len(boxes) != len(subject_token_indices):
         raise gr.Error("""
@@ -198,8 +229,8 @@ def generate(
 
     images = inference(
         boxes, prompts, subject_token_indices, filter_token_indices, num_tokens, init_step_size,
-        final_step_size, num_clusters_per_subject, cross_loss_scale, self_loss_scale,
-        num_iterations, loss_threshold, num_guidance_steps, seed)
+        final_step_size, first_refinement_step, num_clusters_per_subject, cross_loss_scale, self_loss_scale,
+        classifier_free_guidance_scale, num_iterations, loss_threshold, num_guidance_steps, seed)
 
     return images
 
@@ -251,6 +282,17 @@ def clear(batch_size):
     return [[], None, None, None]
 
 
+def build_example_layout(prompt, *args):
+    boxes = EXAMPLE_BOXES[prompt]
+
+    composite = draw_boxes(boxes, is_sketch=True)
+    sketchpad = {"background": None, "layers": [], "composite": composite}
+
+    layout_image = draw_boxes(boxes)
+
+    return boxes, sketchpad, layout_image
+
+
 def main():
     nltk.download("averaged_perceptron_tagger")
 
@@ -303,6 +345,7 @@ def main():
                 num_guidance_steps = gr.Slider(minimum=5, maximum=20, step=1, value=8, label="Number of timesteps to perform guidance")
                 init_step_size = gr.Slider(minimum=0, maximum=50, step=0.5, value=30, label="Initial step size")
                 final_step_size = gr.Slider(minimum=0, maximum=20, step=0.5, value=15, label="Final step size")
+                first_refinement_step = gr.Slider(minimum=0, maximum=50, step=1, value=15, label="The timestep from which to start refining the subject masks")
                 num_clusters_per_subject = gr.Slider(minimum=0, maximum=5, step=0.5, value=3, label="Number of clusters per subject")
                 cross_loss_scale = gr.Slider(minimum=0, maximum=2, step=0.1, value=1, label="Cross-attention loss scale factor")
                 self_loss_scale = gr.Slider(minimum=0, maximum=2, step=0.1, value=1, label="Self-attention loss scale factor")
@@ -331,7 +374,7 @@ def main():
             fn=generate,
             inputs=[
                 prompt, subject_token_indices, filter_token_indices, num_tokens,
-                init_step_size, final_step_size, num_clusters_per_subject, cross_loss_scale, self_loss_scale,
+                init_step_size, final_step_size, first_refinement_step, num_clusters_per_subject, cross_loss_scale, self_loss_scale,
                 classifier_free_guidance_scale, batch_size, num_iterations, loss_threshold, num_guidance_steps,
                 seed,
                 boxes,
@@ -342,11 +385,40 @@ def main():
 
         with gr.Column():
             gr.Examples(
+                #examples=[
+                #    ["a ginger kitten and a gray puppy in a yard", "2,3;6,7", "1,4,5,8,9", "10"],
+                #    ["a realistic photo of a highway with a semi trailer and a concrete mixer and a helicopter", "9,10;13,14;17", "1,4,5,7,8,11,12,15,16", "17"],
+                #],
+                #inputs=[prompt, subject_token_indices, filter_token_indices, num_tokens],
                 examples=[
-                    ["a ginger kitten and a gray puppy in a yard", "2,3;6,7", "1,4,5,8,9", "10"],
-                    ["a realistic photo of a highway with a semi trailer and a concrete mixer and a helicopter", "9,10;13,14;17", "1,4,5,7,8,11,12,15,16", "17"],
+                    [
+                        PROMPT1, "7,8,17;11,12,17;15,16,17", "5,6,9,10,13,14,18,19", "21",
+                        25, 18, 3, 1, 1,
+                        7.5, 1, 5, 0.2, 8,
+                        286,
+                    ],
+                    [
+                        PROMPT2, "7;10;13,14;17", "5,6,8,9,11,12,15,16", "17",
+                        18, 12, 3, 1, 1,
+                        7.5, 1, 5, 0.2, 8,
+                        216,
+                    ],
+                    [
+                        PROMPT3, "2,3;6,7;10,11;14,15;18,19", "1,4,5,8,9,12,13,16,17,20,21", "22",
+                        18, 12, 3, 1, 1,
+                        7.5, 1, 5, 0.2, 8,
+                        156,
+                    ],
+                ],
+                fn=build_example_layout,
+                inputs=[
+                    prompt, subject_token_indices, filter_token_indices, num_tokens,
+                    init_step_size, final_step_size, num_clusters_per_subject, cross_loss_scale, self_loss_scale,
+                    classifier_free_guidance_scale, batch_size, num_iterations, loss_threshold, num_guidance_steps,
+                    seed,
                 ],
-                inputs=[prompt, subject_token_indices, filter_token_indices, num_tokens],
+                outputs=[boxes, sketchpad, layout_image],
+                run_on_click=True,
             )
 
         gr.HTML(FOOTNOTE)
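
The core of this commit is the gr.Examples rewiring: example rows alone can only prefill input components, so the commit adds build_example_layout and run_on_click=True to also regenerate the boxes state, the sketchpad composite, and the layout image when a row is clicked. Below is a minimal, self-contained sketch of that Gradio pattern; the components and callback are illustrative stand-ins, not the Space's actual ones.

# Sketch of gr.Examples + run_on_click: clicking a row first fills `inputs`
# from the row, then calls `fn` and routes its return values to `outputs`.
import gradio as gr

def build_layout(prompt, *args):
    # Stand-in for build_example_layout: derive extra state from the prompt.
    return f"boxes for: {prompt}"

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    layout = gr.Textbox(label="Layout")
    gr.Examples(
        examples=[["a ginger kitten and a gray puppy in a yard"]],
        inputs=[prompt],     # prefilled directly from the clicked row
        fn=build_layout,     # invoked because run_on_click=True
        outputs=[layout],    # receives build_layout's return value
        run_on_click=True,
    )

if __name__ == "__main__":
    demo.launch()

Note that build_example_layout takes (prompt, *args) because gr.Examples passes every value listed in inputs to fn, while only the prompt is needed to look up EXAMPLE_BOXES.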
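For reading the example rows: subject token indices are packed as semicolon-separated groups, one group per subject, which generate then splits with convert_token_indices(..., nested=True) and checks against the number of boxes. The helper itself is not part of this diff, so the following parser is a hypothetical reconstruction based only on the row format and that length check.

# Hypothetical re-implementation for illustration; the Space's actual
# convert_token_indices is not shown in this commit.
def convert_token_indices(text: str, nested: bool = False):
    if nested:
        # "7,8,17;11,12,17;15,16,17" -> [[7, 8, 17], [11, 12, 17], [15, 16, 17]]
        return [[int(i) for i in group.split(",")] for group in text.split(";")]
    # "5,6,9,10" -> [5, 6, 9, 10]
    return [int(i) for i in text.split(",")]

Under that reading, PROMPT1's "7,8,17;11,12,17;15,16,17" describes three subjects, matching the three boxes it maps to in EXAMPLE_BOXES.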
bounded_attention.py
CHANGED
@@ -38,6 +38,7 @@ class BoundedAttention(injection_utils.AttentionBase):
         start_step_size=30,
         end_step_size=10,
         loss_stopping_value=0.2,
+        min_clustering_step=15,
         cross_mask_threshold=0.2,
         self_mask_threshold=0.2,
         delta_refine_mask_steps=5,
@@ -73,6 +74,7 @@ class BoundedAttention(injection_utils.AttentionBase):
         self.start_step_size = start_step_size
         self.step_size_coef = (end_step_size - start_step_size) / max_guidance_iter
         self.loss_stopping_value = loss_stopping_value
+        self.min_clustering_step = min_clustering_step
         self.cross_mask_threshold = cross_mask_threshold
         self.self_mask_threshold = self_mask_threshold
 
@@ -367,7 +369,7 @@ class BoundedAttention(injection_utils.AttentionBase):
 
     def _obtain_masks(self, resolution, return_boxes=False, return_existing=False, batch_size=None, device=None):
         return_boxes = return_boxes or (return_existing and self.self_masks is None)
-        if return_boxes or self.cur_step < self.
+        if return_boxes or self.cur_step < self.min_clustering_step:
             masks = self._convert_boxes_to_masks(resolution, device=device).unsqueeze(0)
             if batch_size is not None:
                 masks = masks.expand(batch_size, *masks.shape[1:])
@@ -406,21 +408,6 @@ class BoundedAttention(injection_utils.AttentionBase):
         self_masks = F.interpolate(self_masks, resolution, mode='nearest-exact')
         return self_masks.flatten(start_dim=2).bool()
 
-    def _cluster_self_maps(self):  # b s n
-        self_maps = self._compute_maps(self.mean_self_map)  # b n m
-        if self.pca_rank is not None:
-            dtype = self_maps.dtype
-            _, _, eigen_vectors = torch.pca_lowrank(self_maps.float(), self.pca_rank)
-            self_maps = torch.matmul(self_maps, eigen_vectors.to(dtype=dtype))
-
-        clustering_results = self.clustering(self_maps, centers=self.centers)
-        self.clustering.num_init = 1  # clustering is deterministic after the first time
-        self.centers = clustering_results.centers
-        clusters = clustering_results.labels
-        num_clusters = self.clustering.n_clusters
-        self._save_maps(clusters / num_clusters, f'clusters')
-        return num_clusters, clusters
-
     def _build_self_masks(self):
         c, clusters = self._cluster_self_maps()  # b n
         cluster_masks = torch.stack([(clusters == cluster_index) for cluster_index in range(c)], dim=2)  # b n c
@@ -444,6 +431,21 @@ class BoundedAttention(injection_utils.AttentionBase):
         self._save_maps(self_masks, 'self_masks')
         return self_masks
 
+    def _cluster_self_maps(self):  # b s n
+        self_maps = self._compute_maps(self.mean_self_map)  # b n m
+        if self.pca_rank is not None:
+            dtype = self_maps.dtype
+            _, _, eigen_vectors = torch.pca_lowrank(self_maps.float(), self.pca_rank)
+            self_maps = torch.matmul(self_maps, eigen_vectors.to(dtype=dtype))
+
+        clustering_results = self.clustering(self_maps, centers=self.centers)
+        self.clustering.num_init = 1  # clustering is deterministic after the first time
+        self.centers = clustering_results.centers
+        clusters = clustering_results.labels
+        num_clusters = self.clustering.n_clusters
+        self._save_maps(clusters / num_clusters, f'clusters')
+        return num_clusters, clusters
+
     def _obtain_cross_masks(self, resolution, scale=10):
         maps = self._compute_maps(self.mean_cross_map, resolution=resolution)  # b n k
         maps = F.sigmoid(scale * (maps - self.cross_mask_threshold))
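
The "Final step size" tooltip in app.py refers to a linear step size scheduler, and the context lines above show how it is parameterized: step_size_coef = (end_step_size - start_step_size) / max_guidance_iter. A small sketch of what that schedule evaluates to (the function name is mine):

# Linear guidance step-size schedule implied by start_step_size and
# step_size_coef in BoundedAttention; illustrative helper, not repo code.
def guidance_step_size(start: float, end: float,
                       max_guidance_iter: int, cur_iter: int) -> float:
    coef = (end - start) / max_guidance_iter  # matches self.step_size_coef
    return start + coef * cur_iter

# With the demo's slider defaults (initial 30, final 15, 8 guidance steps):
# guidance_step_size(30, 15, 8, 0) -> 30.0
# guidance_step_size(30, 15, 8, 8) -> 15.0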
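
The relocated _cluster_self_maps depends on a self.clustering object that never appears in this diff. As a rough illustration of the technique it implements (reduce the averaged self-attention rows with torch.pca_lowrank, then cluster them), here is a standalone sketch that substitutes a plain k-means for the repo's clustering helper:

# Illustration only: pairs the pca_lowrank reduction visible in the diff
# with Lloyd's k-means standing in for the repo's `self.clustering`.
import torch

def cluster_self_maps(self_maps: torch.Tensor, num_clusters: int,
                      pca_rank: int | None = None, iters: int = 10) -> torch.Tensor:
    """self_maps: (n, m) rows of averaged self-attention; returns (n,) labels."""
    self_maps = self_maps.float()
    if pca_rank is not None:
        # Same reduction as _cluster_self_maps: project onto top PCA directions.
        _, _, eigen_vectors = torch.pca_lowrank(self_maps, pca_rank)
        self_maps = self_maps @ eigen_vectors
    # Plain k-means: assign each row to its nearest center, recompute centers.
    centers = self_maps[torch.randperm(self_maps.shape[0])[:num_clusters]].clone()
    for _ in range(iters):
        labels = torch.cdist(self_maps, centers).argmin(dim=1)  # (n,)
        for k in range(num_clusters):
            members = self_maps[labels == k]
            if len(members) > 0:
                centers[k] = members.mean(dim=0)
    return labels

In the actual class the cluster centers are cached across denoising steps (self.centers) and num_init drops to 1 after the first call, so later steps reuse and refine the first clustering rather than restarting it.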