Hcompany
/

Holo1-3B

Image-Text-to-Text

text-generation-inference

Model card Files Files and versions Community

Holo1-3B / localization.py

plcedoz38's picture

navigation example

530f5d3 3 days ago

history blame contribute delete

1.6 kB

	import json
	from typing import Any, Literal

	from pydantic import BaseModel


	def get_localization_prompt(image: Any, instruction: str) -> list[dict[str, Any]]:
	    """Build the chat messages asking for a click position on a GUI image.

	    Args:
	        image: The GUI screenshot. It is stored unchanged under the
	            ``"image"`` key of the first content part.
	        instruction: Description of the element to localize. It is appended
	            to the fixed guidelines, separated by a newline.

	    Returns:
	        A one-element list holding a single ``"user"`` message whose content
	        is an image part followed by a text part.
	    """
	    # Adjacent literals are concatenated at compile time; the resulting
	    # prompt text is identical to the original single-line string.
	    guidelines = (
	        "Localize an element on the GUI image according to my instructions "
	        "and output a click position as Click(x, y) with x num pixels from "
	        "the left edge and y num pixels from the top edge."
	    )

	    image_part = {"type": "image", "image": image}
	    text_part = {"type": "text", "text": f"{guidelines}\n{instruction}"}

	    return [
	        {
	            "role": "user",
	            "content": [image_part, text_part],
	        }
	    ]


	class ClickAction(BaseModel):
	    """Click at specific coordinates on the screen.

	    The JSON schema of this model is what the structured-output prompt in
	    this file serializes into its system message.

	    NOTE(review): pydantic only copies the attribute docstrings below into
	    the generated JSON schema when ``use_attribute_docstrings`` is enabled in
	    the model config; confirm whether the descriptions are meant to reach
	    the prompt before changing that.
	    """

	    # Fixed discriminator: the only accepted value is "click".
	    action: Literal["click"] = "click"
	    x: int
	    """The x coordinate, number of pixels from the left edge."""
	    y: int
	    """The y coordinate, number of pixels from the top edge."""


	def get_localization_prompt_structured_output(image: Any, instruction: str) -> list[dict[str, Any]]:
	    """Build the chat messages asking for a click position as JSON.

	    Args:
	        image: The GUI screenshot. It is stored unchanged under the
	            ``"image"`` key of the first user content part.
	        instruction: Description of the element to localize. It is appended
	            to the fixed guidelines, separated by a newline.

	    Returns:
	        A two-element list: a ``"system"`` message whose content is the
	        JSON-encoded, single-item list containing the ``ClickAction`` JSON
	        schema, followed by a ``"user"`` message made of an image part and
	        a text part.
	    """
	    # Adjacent literals are concatenated at compile time; the resulting
	    # prompt text is identical to the original single-line string.
	    guidelines = (
	        "Localize an element on the GUI image according to my instructions "
	        "and output a click position. You must output a valid JSON format."
	    )

	    # The schema is wrapped in a list before serialization, as in the
	    # original prompt format.
	    system_message = {
	        "role": "system",
	        "content": json.dumps([ClickAction.model_json_schema()]),
	    }
	    user_message = {
	        "role": "user",
	        "content": [
	            {"type": "image", "image": image},
	            {"type": "text", "text": f"{guidelines}\n{instruction}"},
	        ],
	    }

	    return [system_message, user_message]