Spaces: Running on Zero
automated caption added
Browse files
- app.py +69 -2
- requirements.txt +1 -0
app.py CHANGED
@@ -23,6 +23,8 @@ from safetensors.torch import load_file
 from huggingface_hub import hf_hub_download
 from diffusers import DiffusionPipeline
 import spaces
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from qwen_vl_utils import process_vision_info

 def save_state_to_file(state):
     filename = "state.pkl"
@@ -76,6 +78,20 @@ def load_pipeline():
     scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler")

     return pipe, inverse_scheduler, scheduler
+
+def load_qwen():
+
+    # default: Load the model on the available device(s)
+    vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        "Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
+    )
+
+    # default processor
+    processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+
+    return vlm_model, processor
+
+
 def weight_population(layer_type, resolution, depth, value):
     # Check if layer_type exists, if not, create it
     if layer_type not in weights:
@@ -117,7 +133,7 @@ def resize_image_with_aspect(image, res_range_min=128, res_range_max=1024):
 @spaces.GPU()
 def reconstruct(input_img, caption):

-    pipe, inverse_scheduler, scheduler
+    pipe, inverse_scheduler, scheduler = load_pipeline()
     pipe.to("cuda")

     global weights
@@ -355,7 +371,7 @@ def replace_attention_processor(unet, clear=False, blur_sigma=None):
 @spaces.GPU()
 def apply_prompt(meta_data, new_prompt):

-    pipe, _, scheduler = load_pipeline()
+    pipe, _, scheduler, _, _ = load_pipeline()
     pipe.to("cuda")

     caption, real_latents_cpu, inversed_latents_cpu, saved_weights = meta_data
@@ -416,8 +432,56 @@ def apply_prompt(meta_data, new_prompt):

     return image_np

+
+@spaces.GPU
+def choose_caption(input_image):
+
+    vlm_model, processor = load_qwen()
+
+    image = input_image.convert("RGB")
+
+    # Wrap the image and instruction into chat messages
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": "Please describe this image in **very** **short** caption."},
+            ],
+        }
+    ]
+
+    # 1) Turn messages into vision_inputs + (maybe) video_inputs
+    image_inputs, video_inputs = process_vision_info(messages)
+
+    # 2) Build text prompt from messages
+    text_prompt = processor.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+
+    # 3) Tokenize both text and vision
+    inputs = processor(
+        text=[text_prompt],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    ).to("cuda")
+
+    out_ids = vlm_model.generate(**inputs, max_new_tokens=32)
+    orig_ids = [out_ids[i][len(inputs.input_ids[i]):] for i in range(len(out_ids))]
+    caption = processor.batch_decode(orig_ids, skip_special_tokens=True)[0]
+
+    return caption
+
+
 @spaces.GPU(duration=30)
 def on_image_change(filepath):
+
+    pipe, inverse_scheduler, scheduler = load_pipeline()
+
     filename = os.path.splitext(os.path.basename(filepath))[0]

     if filename not in ["example1","example3","example4"]:
@@ -581,6 +645,9 @@ with gr.Blocks(analytics_enabled=False) as demo:
     example_input.change(fn=on_image_change, inputs=example_input, outputs=[image_input, reconstructed_image, meta_data, steps_slider, invisible_slider, interpolate_slider]).then(lambda: gr.update(interactive=True), outputs=apply_button).then(
         lambda: gr.update(interactive=True), outputs=new_prompt_input
     )
+
+    image_input.change(fn=choose_caption, inputs=image_input, outputs=[prompt_input])
+
     steps_slider.release(update_step, inputs=steps_slider)
     interpolate_slider.release(update_scale, inputs=interpolate_slider)

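For reference, a minimal standalone sketch of the captioning path this commit adds, mirroring load_qwen() and choose_caption() from the diff above but outside the Gradio app. The image path "example.jpg" is a hypothetical placeholder, and a CUDA device is assumed (as in the Space's @spaces.GPU path); this is a sketch, not part of the commit.

# Minimal sketch (not in the commit): the same Qwen2.5-VL captioning flow as choose_caption().
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

image = Image.open("example.jpg").convert("RGB")  # hypothetical input image
messages = [{"role": "user", "content": [
    {"type": "image", "image": image},
    {"type": "text", "text": "Please describe this image in **very** **short** caption."},
]}]

# Same three steps as the diff: vision inputs -> chat template -> joint tokenization.
image_inputs, video_inputs = process_vision_info(messages)
text_prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[text_prompt], images=image_inputs, videos=video_inputs,
                   padding=True, return_tensors="pt").to("cuda")

# Generate a short caption and strip the prompt tokens from the output.
out_ids = vlm_model.generate(**inputs, max_new_tokens=32)
trimmed = [out_ids[i][len(inputs.input_ids[i]):] for i in range(len(out_ids))]
print(processor.batch_decode(trimmed, skip_special_tokens=True)[0])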
requirements.txt CHANGED
@@ -7,3 +7,4 @@ transformers
 gradio
 accelerate
 PEFT
+qwen-vl-utils
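Note: qwen-vl-utils is the helper package that supplies process_vision_info imported in app.py above; the transformers dependency already listed in this file (visible in the hunk header) must be a release recent enough to ship Qwen2_5_VLForConditionalGeneration.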