Example Usage
def vlm_query(prompt, image):
    """Run the vision-language model on a prompt plus an image.

    Args:
        prompt: Chat-templated text prompt (see ``processor.apply_chat_template``).
        image: PIL image passed to the processor alongside the prompt.

    Returns:
        The decoded model response string, or ``None`` if generation fails.
    """
    # NOTE(review): the original referenced self.processor / self.model inside
    # a plain function where no `self` exists (NameError at runtime). score_box
    # uses a module-level `processor`, so the same module-level objects are
    # used here — confirm against the surrounding script.
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
    try:
        # `.module` implies the model is wrapped (e.g. DataParallel) — kept as-is.
        generated_ids = model.module.generate(**inputs, max_new_tokens=256)
        response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return response
    except Exception as e:
        # Best-effort: report the failure and let the caller handle None.
        print(f"Error in vlm_query: {e}")
        return None
def score_box(bbox, img_url, full_screenshot):
    """Crop a bounding box from a screenshot and ask the VLM whether the crop
    contains a single, valid, visually complete UI element.

    Args:
        bbox: ``(x1, y1, x2, y2)`` pixel coordinates of the element.
        img_url: Image URL embedded in the chat message content.
        full_screenshot: PIL image of the full screenshot.

    Returns:
        ``(response, is_valid)`` — the raw VLM response (``None`` on any
        failure) and a bool that is True when the model's concluding lines
        contain "yes".
    """
    x1, y1, x2, y2 = bbox
    try:
        element_image = full_screenshot.crop((x1, y1, x2, y2)).convert("RGB")
    except Exception as e:
        print(f"Error cropping image: {e}")
        return None, False

    # Upscale tiny crops so the shorter side reaches the 28-px minimum
    # (presumably what the vision encoder expects — TODO confirm).
    if min(element_image.size) < 28:
        scaling_ratio = 28 / min(element_image.size)
        # round() instead of int(): int() truncates, so float error like
        # 27.9999... would land below the 28-px target.
        new_size = (
            round(element_image.width * scaling_ratio),
            round(element_image.height * scaling_ratio),
        )
        element_image = element_image.resize(new_size, Image.Resampling.LANCZOS)
        # BUG FIX: the original re-checked `<= 28` after resizing, which always
        # failed because the resize targets exactly 28 px — every small crop
        # was rejected. A strict `< 28` lets successfully upscaled crops pass.
        if min(element_image.size) < 28:
            return None, False

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": img_url},
                {
                    "type": "text",
                    "text": """Analyze the provided cropped image from a screenshot to determine whether it contains a single, valid, and visually complete UI element.
Criteria for validity:
- The image must contain exactly one UI element.
- The element must be entirely visible within the cropped area, with no significant cut-off parts.
- The image should not consist solely of background, empty space, or meaningless fragments.
Response format:
Conclude with your final determination in a dedicated section:
Conclusion
Yes (if the image contains a single, valid, and complete UI element)
No (if it does not meet the criteria)""",
                },
            ],
        }
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    response = vlm_query(text, element_image)
    if response is None:
        return None, False

    # The prompt asks for a trailing "Conclusion" section, so only the last
    # two lines are scanned for the yes/no verdict.
    last_lines = response.strip().split('\n')[-2:]
    conclusion = " ".join(last_lines).lower()
    return response, "yes" in conclusion
- Downloads last month: 1,394
Inference Providers
NEW
This model isn't deployed by any Inference Provider.
Ask for provider support