Update inference_coz_single.py

inference_coz_single.py CHANGED (+59 -75)

@@ -25,66 +25,77 @@ def resize_and_center_crop(img: Image.Image, size: int) -> Image.Image:
 # Helper: Generate a single VLM prompt for recursive_multiscale
 # -------------------------------------------------------------------
 def _generate_vlm_prompt(
-    vlm_model,
-    vlm_processor,
-    process_vision_info,
+    vlm_model: Qwen2_5_VLForConditionalGeneration,
+    vlm_processor: AutoProcessor,
+    process_vision_info,   # this is your helper that turns “messages” → image_inputs / video_inputs
+    prev_pil: Image.Image,     # <– pass PIL instead of path
+    zoomed_pil: Image.Image,   # <– pass PIL instead of path
     device: str = "cuda"
 ) -> str:
     """
-    Given two
-    Returns a string like “cat on sofa, pet, indoor, living room”, etc.
+    Given two PIL.Image inputs:
+      - prev_pil: the “full” image at the previous recursion.
+      - zoomed_pil: the cropped+resized (zoom) image for this step.
+    Returns a single “recursive_multiscale” prompt string.
     """
+
+    # (1) System message
     message_text = (
         "The second image is a zoom-in of the first image. "
         "Based on this knowledge, what is in the second image? "
         "Give me a set of words."
     )

     # (2) Build the two-image “chat” payload
+    #
+    # Instead of passing a filename, we pass the actual PIL.Image.
+    # The processor’s `process_vision_info` should know how to turn
+    # a message of the form {"type":"image","image": PIL_IMAGE} into tensors.
     messages = [
         {"role": "system", "content": message_text},
         {
             "role": "user",
             "content": [
-                {"type": "image", "image":
-                {"type": "image", "image":
+                {"type": "image", "image": prev_pil},
+                {"type": "image", "image": zoomed_pil},
             ],
         },
     ]

-    # (3)
+    # (3) Now run the “chat” through the VL processor
+    #
+    # - `apply_chat_template` will build the tokenized prompt (without running it yet).
+    # - `process_vision_info` should inspect the same `messages` list and return
+    #   `image_inputs` and `video_inputs` (tensors) for any attached PIL images.
     text = vlm_processor.apply_chat_template(
         messages,
+        tokenize=False,
+        add_generation_prompt=True
     )
     image_inputs, video_inputs = process_vision_info(messages)
+
     inputs = vlm_processor(
         text=[text],
         images=image_inputs,
         videos=video_inputs,
         padding=True,
         return_tensors="pt",
     ).to(device)

-    # (4) Generate
+    # (4) Generate and decode
     generated = vlm_model.generate(**inputs, max_new_tokens=128)
-    # strip off the prompt tokens from each generated sequence:
     trimmed = [
-        out_ids[len(in_ids)
+        out_ids[len(in_ids):]
+        for in_ids, out_ids in zip(inputs.input_ids, generated)
     ]
     out_text = vlm_processor.batch_decode(
         trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )[0]

-    # (5) Return exactly the bare words (no extra “,” if no additional user prompt)
     return out_text.strip()


+
 # -------------------------------------------------------------------
 # Main Function: recursive_multiscale_sr (with multiple centers)
 # -------------------------------------------------------------------
@@ -203,88 +214,61 @@ def recursive_multiscale_sr(
     ###############################
     # 6. Prepare the very first “full” image
     ###############################
-    # 6.1 Load + center crop → first_image
+    # (6.1) Load + center crop → first_image (512×512)
     img0 = Image.open(input_png_path).convert("RGB")
     img0 = resize_and_center_crop(img0, process_size)

-    img0.save(prev_path)
+    # Note: we no longer need to write “prev.png” to disk. Just keep it in memory.
+    prev_pil = img0.copy()

-    # We will maintain lists of PIL outputs and prompts:
     sr_pil_list: list[Image.Image] = []
-    prompt_list:
+    prompt_list: list[str] = []

-    ###############################
-    # 7. Recursion loop (now up to rec_num times)
-    ###############################
     for rec in range(rec_num):
-        # (A)
+        # (A) Compute low-res crop window on prev_pil
+        w, h = prev_pil.size          # (512×512)
+        new_w, new_h = w // upscale, h // upscale

-        # (1) Compute the “low-res” window size:
-        new_w, new_h = w // upscale, h // upscale  # e.g. 128×128 for upscale=4
-        # (2) Map normalized center → pixel center, then clamp so crop stays in bounds:
         cx_norm, cy_norm = centers[rec]
         cx = int(cx_norm * w)
         cy = int(cy_norm * h)
-        half_w = new_w // 2
-        top = cy - half_h
-        # clamp left ∈ [0, w - new_w], top ∈ [0, h - new_h]
-        left = max(0, min(left, w - new_w))
-        top = max(0, min(top, h - new_h))
-        right = left + new_w
-        bottom = top + new_h
+        half_w, half_h = new_w // 2, new_h // 2
+
+        left = max(0, min(cx - half_w, w - new_w))
+        top = max(0, min(cy - half_h, h - new_h))
+        right, bottom = left + new_w, top + new_h

         cropped = prev_pil.crop((left, top, right, bottom))

-        # (B)
-        zoom_path = os.path.join(td, f"step{rec+1}_zoom.png")
-        zoomed.save(zoom_path)
+        # (B) Upsample that crop back to (512×512)
+        zoomed_pil = cropped.resize((w, h), Image.BICUBIC)

-        # (C) Generate
+        # (C) Generate VLM prompt by passing PILs directly:
         prompt_tag = _generate_vlm_prompt(
             vlm_model=vlm_model,
             vlm_processor=vlm_processor,
             process_vision_info=process_vision_info,
+            prev_pil=prev_pil,      # <– PIL
+            zoomed_pil=zoomed_pil,  # <– PIL
             device=device,
         )
-        # (By default, no extra user prompt is appended.)

-        # (D) Prepare
+        # (D) Prepare “zoomed_pil” → tensor in [−1, 1]
         to_tensor = transforms.ToTensor()
-        lq = to_tensor(
+        lq = to_tensor(zoomed_pil).unsqueeze(0).to(device)   # (1,3,512,512)
         lq = (lq * 2.0) - 1.0

-        # (E)
+        # (E) Run SR inference
         with torch.no_grad():
             out_tensor = model_test(lq, prompt=prompt_tag)[0]
             out_tensor = out_tensor.clamp(-1.0, 1.0).cpu()
-        # back to PIL in [0,1]:
         out_pil = transforms.ToPILImage()((out_tensor * 0.5) + 0.5)

-        # (F)
-        out_pil.save(out_path)
-        prev_path = out_path
+        # (F) Bookkeeping: set prev_pil = out_pil for next iteration
+        prev_pil = out_pil

-        # (G) Append
+        # (G) Append to results
         sr_pil_list.append(out_pil)
         prompt_list.append(prompt_tag)

-    # end for(rec)
-
-    ###############################
-    # 8. Return the SR outputs & prompts
-    ###############################
-    # The list sr_pil_list = [ SR1, SR2, …, SR_rec_num ] in order.
     return sr_pil_list, prompt_list
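
The rewritten loop folds the old center-and-clamp arithmetic into one expression per axis. A small self-contained restatement of that window computation, with a quick check at the edge of the frame; the function name exists only for this example.

# Standalone restatement of the clamped crop-window arithmetic used in the loop.
def crop_window(w: int, h: int, cx_norm: float, cy_norm: float, upscale: int):
    new_w, new_h = w // upscale, h // upscale          # low-res window size
    cx, cy = int(cx_norm * w), int(cy_norm * h)        # normalized → pixel center
    half_w, half_h = new_w // 2, new_h // 2
    left = max(0, min(cx - half_w, w - new_w))         # clamp so the box stays inside
    top = max(0, min(cy - half_h, h - new_h))
    return left, top, left + new_w, top + new_h

# A center near the right edge is clamped so the crop never leaves the image:
print(crop_window(512, 512, 0.95, 0.5, upscale=4))     # (384, 192, 512, 320)
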
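
Steps (D) and (E) rely on a fixed value-range convention: `ToTensor` produces [0, 1], the SR model consumes [-1, 1], and the output is mapped back before `ToPILImage`. A minimal sketch of that round trip, with a dummy tensor standing in for the model output.

import torch
from PIL import Image
from torchvision import transforms

img = Image.new("RGB", (512, 512), color=(120, 60, 30))   # stand-in for zoomed_pil
lq = transforms.ToTensor()(img).unsqueeze(0)               # (1, 3, 512, 512) in [0, 1]
lq = lq * 2.0 - 1.0                                        # to [-1, 1] for the SR model

out_tensor = lq[0].clamp(-1.0, 1.0).cpu()                  # stand-in for model_test(...)[0]
out_pil = transforms.ToPILImage()(out_tensor * 0.5 + 0.5)  # back to [0, 1], then PIL
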