alexnasa committed
Commit 5bc94d4 · 1 Parent(s): f2de35c

automated caption added

Files changed (2):
  1. app.py +69 -2
  2. requirements.txt +1 -0
app.py CHANGED
@@ -23,6 +23,8 @@ from safetensors.torch import load_file
 from huggingface_hub import hf_hub_download
 from diffusers import DiffusionPipeline
 import spaces
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from qwen_vl_utils import process_vision_info
 
 def save_state_to_file(state):
     filename = "state.pkl"
@@ -76,6 +78,20 @@ def load_pipeline():
     scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler")
 
     return pipe, inverse_scheduler, scheduler
+
+def load_qwen():
+
+    # default: Load the model on the available device(s)
+    vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        "Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
+    )
+
+    # default processer
+    processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+
+    return vlm_model, processor
+
+
 def weight_population(layer_type, resolution, depth, value):
     # Check if layer_type exists, if not, create it
     if layer_type not in weights:
@@ -117,7 +133,7 @@ def resize_image_with_aspect(image, res_range_min=128, res_range_max=1024):
 @spaces.GPU()
 def reconstruct(input_img, caption):
 
-    pipe, inverse_scheduler, scheduler = load_pipeline()
+    pipe, inverse_scheduler, scheduler= load_pipeline()
     pipe.to("cuda")
 
     global weights
@@ -355,7 +371,7 @@ def replace_attention_processor(unet, clear=False, blur_sigma=None):
 @spaces.GPU()
 def apply_prompt(meta_data, new_prompt):
 
-    pipe, _, scheduler = load_pipeline()
+    pipe, _, scheduler, _, _ = load_pipeline()
     pipe.to("cuda")
 
     caption, real_latents_cpu, inversed_latents_cpu, saved_weights = meta_data
@@ -416,8 +432,56 @@ def apply_prompt(meta_data, new_prompt):
 
     return image_np
 
+
+@spaces.GPU
+def choose_caption(input_image):
+
+    vlm_model, processor = load_qwen()
+
+    image = input_image.convert("RGB")
+
+    # (b) Wrap into messages
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": "Please describe this image in **very** **short** caption."},
+            ],
+        }
+    ]
+
+    # 1) Turn messages → vision_inputs + (maybe) video_inputs
+    image_inputs, video_inputs = process_vision_info(messages)
+
+    # 2) Build text prompt from messages
+    text_prompt = processor.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+
+    # 3) Tokenize both text and vision
+    inputs = processor(
+        text=[text_prompt],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    ).to("cuda")
+
+    out_ids = vlm_model.generate(**inputs, max_new_tokens=32)
+    orig_ids = [out_ids[i][len(inputs.input_ids[i]):] for i in range(len(out_ids))]
+    caption = processor.batch_decode(orig_ids, skip_special_tokens=True)[0]
+
+    return caption
+
+
 @spaces.GPU(duration=30)
 def on_image_change(filepath):
+
+    pipe, inverse_scheduler, scheduler = load_pipeline()
+
     filename = os.path.splitext(os.path.basename(filepath))[0]
 
     if filename not in ["example1","example3","example4"]:
@@ -581,6 +645,9 @@ with gr.Blocks(analytics_enabled=False) as demo:
     example_input.change(fn=on_image_change, inputs=example_input, outputs=[image_input, reconstructed_image, meta_data, steps_slider, invisible_slider, interpolate_slider]).then(lambda: gr.update(interactive=True), outputs=apply_button).then(
        lambda: gr.update(interactive=True), outputs=new_prompt_input
     )
+
+    image_input.change(fn=choose_caption, inputs=image_input, outputs=[prompt_input])
+
     steps_slider.release(update_step, inputs=steps_slider)
     interpolate_slider.release(update_scale, inputs=interpolate_slider)
 
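Two notes on the new captioning path: choose_caption calls load_qwen() on every invocation, so the 7B VLM is re-instantiated for each caption request, and load_pipeline() still returns the three-tuple (pipe, inverse_scheduler, scheduler), so the five-element unpack this commit introduces in apply_prompt would raise a ValueError when that handler fires. For reference, a minimal standalone sketch of the same caption flow outside Gradio; the model id and prompt mirror app.py, while the module-level load and the example.jpg path are placeholders for this sketch, not part of the commit:

from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Load once at import time so repeated captions reuse the same weights
# (the commit instead loads inside its @spaces.GPU-decorated handler).
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

def caption_image(path):
    image = Image.open(path).convert("RGB")
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Please describe this image in **very** **short** caption."},
        ],
    }]
    # Chat template for the text side, pixel extraction for the vision side.
    text_prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_prompt], images=image_inputs, videos=video_inputs,
        padding=True, return_tensors="pt",
    ).to(model.device)
    # Generate, then strip the prompt tokens before decoding the caption.
    out_ids = model.generate(**inputs, max_new_tokens=32)
    new_ids = out_ids[:, inputs.input_ids.shape[1]:]
    return processor.batch_decode(new_ids, skip_special_tokens=True)[0]

print(caption_image("example.jpg"))  # placeholder image path
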
requirements.txt CHANGED
@@ -7,3 +7,4 @@ transformers
 gradio
 accelerate
 PEFT
+qwen-vl-utils
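
qwen-vl-utils is the helper package that supplies process_vision_info, imported at the top of app.py. Qwen2_5_VLForConditionalGeneration additionally requires a recent transformers release (older versions do not ship the Qwen2.5-VL architecture), and the existing transformers entry is unpinned, so a quick environment check may be worthwhile; a minimal one, assuming a standard pip install:

# Both imports fail fast with ImportError on an outdated environment.
from transformers import Qwen2_5_VLForConditionalGeneration  # needs a recent transformers release
from qwen_vl_utils import process_vision_info  # supplied by the new qwen-vl-utils pin
print("caption dependencies resolved")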