Example Usage


def vlm_query(prompt, image):
  # Send a text prompt plus a PIL image to the VLM and return the decoded
  # response string, or None if preprocessing/generation raised.
  # NOTE(review): this is a module-level function but it references `self`
  # and `device`, which are not defined in this scope -- presumably pasted
  # from a class method; confirm against the original class context.
  inputs = self.processor(text=prompt, images=image, return_tensors="pt").to(device)
  try:
      # `.module` implies the model is wrapped (e.g. DataParallel/DDP) -- TODO confirm
      generated_ids = self.model.module.generate(**inputs, max_new_tokens=256)
      # batch_decode returns a list; [0] takes the single generated sequence.
      response = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
      return response
  except Exception as e:
      # Best-effort: log and signal failure to the caller instead of raising.
      print(f"Error in vlm_query: {e}")
      return None

def score_box(bbox, img_url, full_screenshot):
  """Crop ``bbox`` out of ``full_screenshot`` and ask the VLM whether the
  crop contains a single, valid, visually complete UI element.

  Args:
      bbox: (x1, y1, x2, y2) pixel coordinates of the candidate element.
      img_url: image reference embedded in the chat message content.
      full_screenshot: PIL image of the full screenshot.

  Returns:
      (response, is_valid): the raw VLM response text (or None on any
      failure) and True iff the model's final "Conclusion" contains "yes".
  """
  x1, y1, x2, y2 = bbox
  try:
      element_image = full_screenshot.crop((x1, y1, x2, y2)).convert("RGB")
  except Exception as e:
      print(f"Error cropping image: {e}")
      return None, False
  # Degenerate (zero-area) boxes would divide by zero below; reject early.
  if min(element_image.size) == 0:
      return None, False
  # The model needs each side to be at least 28 px (presumably the
  # Qwen2.5-VL patch size -- TODO confirm); upscale small crops.
  # BUG FIX: the original scaled the short side to 28 via int() truncation
  # (yielding 27 or 28) and then re-checked `<= 28`, so every upscaled crop
  # was rejected and the resize was dead code. Round up so the short side
  # lands on >= 28, and only reject crops that are still strictly too small.
  if element_image.size[0] <= 28 or element_image.size[1] <= 28:
      scaling_ratio = 28 / min(element_image.size)
      new_size = (
          max(28, round(element_image.width * scaling_ratio)),
          max(28, round(element_image.height * scaling_ratio)),
      )
      element_image = element_image.resize(new_size, Image.Resampling.LANCZOS)
  if element_image.size[0] < 28 or element_image.size[1] < 28:
      return None, False
  messages = [
      {
          "role": "user",
          "content": [
              {"type": "image_url", "image_url": img_url},
              {
                  "type": "text",
                  "text": """Analyze the provided cropped image from a screenshot to determine whether it contains a single, valid, and visually complete UI element.

                  Criteria for validity:
                  - The image must contain exactly one UI element.
                  - The element must be entirely visible within the cropped area, with no significant cut-off parts.
                  - The image should not consist solely of background, empty space, or meaningless fragments.

                  Response format:
                  Conclude with your final determination in a dedicated section:

                  Conclusion
                  Yes (if the image contains a single, valid, and complete UI element)
                  No (if it does not meet the criteria)"""
              }
          ]
      }
  ]
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
  response = vlm_query(text, element_image)
  if response is None:
      return None, False
  # The prompt asks for a trailing "Conclusion" section; scan only the last
  # two lines so a "yes" elsewhere in the reasoning does not count.
  last_lines = response.strip().split('\n')[-2:]
  conclusion = " ".join(last_lines).lower()
  return response, "yes" in conclusion
Downloads last month
1,394
Safetensors
Model size
8.29B params
Tensor type
BF16
·
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Model tree for likaixin/UI-Box-Scorer-Qwen2.5-VL-7B

Quantizations
2 models