Spaces: Running on Zero
automated caption added
Browse files
- app.py +69 -2
- requirements.txt +1 -0
app.py CHANGED
@@ -23,6 +23,8 @@ from safetensors.torch import load_file
 from huggingface_hub import hf_hub_download
 from diffusers import DiffusionPipeline
 import spaces
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from qwen_vl_utils import process_vision_info

 def save_state_to_file(state):
     filename = "state.pkl"
@@ -76,6 +78,20 @@ def load_pipeline():
     scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler")

     return pipe, inverse_scheduler, scheduler
+
+def load_qwen():
+
+    # default: Load the model on the available device(s)
+    vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        "Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
+    )
+
+    # default processor
+    processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+
+    return vlm_model, processor
+
+
 def weight_population(layer_type, resolution, depth, value):
     # Check if layer_type exists, if not, create it
     if layer_type not in weights:
@@ -117,7 +133,7 @@ def resize_image_with_aspect(image, res_range_min=128, res_range_max=1024):
 @spaces.GPU()
 def reconstruct(input_img, caption):

-    pipe, inverse_scheduler, scheduler
+    pipe, inverse_scheduler, scheduler = load_pipeline()
     pipe.to("cuda")

     global weights
@@ -355,7 +371,7 @@ def replace_attention_processor(unet, clear=False, blur_sigma=None):
 @spaces.GPU()
 def apply_prompt(meta_data, new_prompt):

-    pipe, _, scheduler = load_pipeline()
+    pipe, _, scheduler, _, _ = load_pipeline()
     pipe.to("cuda")

     caption, real_latents_cpu, inversed_latents_cpu, saved_weights = meta_data
@@ -416,8 +432,56 @@ def apply_prompt(meta_data, new_prompt):

     return image_np

+
+@spaces.GPU
+def choose_caption(input_image):
+
+    vlm_model, processor = load_qwen()
+
+    image = input_image.convert("RGB")
+
+    # Wrap the image and instruction into chat messages
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": "Please describe this image in **very** **short** caption."},
+            ],
+        }
+    ]
+
+    # 1) Turn messages into vision_inputs + (maybe) video_inputs
+    image_inputs, video_inputs = process_vision_info(messages)
+
+    # 2) Build text prompt from messages
+    text_prompt = processor.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+
+    # 3) Tokenize both text and vision
+    inputs = processor(
+        text=[text_prompt],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    ).to("cuda")
+
+    out_ids = vlm_model.generate(**inputs, max_new_tokens=32)
+    orig_ids = [out_ids[i][len(inputs.input_ids[i]):] for i in range(len(out_ids))]
+    caption = processor.batch_decode(orig_ids, skip_special_tokens=True)[0]
+
+    return caption
+
+
 @spaces.GPU(duration=30)
 def on_image_change(filepath):
+
+    pipe, inverse_scheduler, scheduler = load_pipeline()
+
     filename = os.path.splitext(os.path.basename(filepath))[0]

     if filename not in ["example1","example3","example4"]:
@@ -581,6 +645,9 @@ with gr.Blocks(analytics_enabled=False) as demo:
     example_input.change(fn=on_image_change, inputs=example_input, outputs=[image_input, reconstructed_image, meta_data, steps_slider, invisible_slider, interpolate_slider]).then(lambda: gr.update(interactive=True), outputs=apply_button).then(
         lambda: gr.update(interactive=True), outputs=new_prompt_input
     )
+
+    image_input.change(fn=choose_caption, inputs=image_input, outputs=[prompt_input])
+
     steps_slider.release(update_step, inputs=steps_slider)
     interpolate_slider.release(update_scale, inputs=interpolate_slider)

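For reference, a minimal standalone sketch of the captioning path this commit adds, mirroring load_qwen() and choose_caption() from the diff above but outside the Gradio app. The image path "example.jpg" is a hypothetical placeholder, and a CUDA device is assumed (as in the Space's @spaces.GPU path); this is a sketch, not part of the commit.

# Minimal sketch (not in the commit): the same Qwen2.5-VL captioning flow as choose_caption().
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

image = Image.open("example.jpg").convert("RGB")  # hypothetical input image
messages = [{"role": "user", "content": [
    {"type": "image", "image": image},
    {"type": "text", "text": "Please describe this image in **very** **short** caption."},
]}]

# Same three steps as the diff: vision inputs -> chat template -> joint tokenization.
image_inputs, video_inputs = process_vision_info(messages)
text_prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[text_prompt], images=image_inputs, videos=video_inputs,
                   padding=True, return_tensors="pt").to("cuda")

# Generate a short caption and strip the prompt tokens from the output.
out_ids = vlm_model.generate(**inputs, max_new_tokens=32)
trimmed = [out_ids[i][len(inputs.input_ids[i]):] for i in range(len(out_ids))]
print(processor.batch_decode(trimmed, skip_special_tokens=True)[0])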
requirements.txt CHANGED
@@ -7,3 +7,4 @@ transformers
 gradio
 accelerate
 PEFT
+qwen-vl-utils
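Note: qwen-vl-utils is the helper package that supplies process_vision_info imported in app.py above; the transformers dependency already listed in this file (visible in the hunk header) must be a release recent enough to ship Qwen2_5_VLForConditionalGeneration.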