|
import json |
|
from typing import Any, Literal |
|
|
|
from pydantic import BaseModel |
|
|
|
|
|
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
    """Build a single-turn chat prompt asking for a click position on a GUI image.

    Args:
        image: The GUI image; it is placed unchanged under the ``"image"`` key
            of the first content part.
        instruction: Text describing what to localize; it is appended on a new
            line after the fixed guidelines sentence.

    Returns:
        A one-element list containing a ``"user"`` message whose content is
        the image part followed by the text part.
    """
    guidelines = (
        "Localize an element on the GUI image according to my instructions and "
        "output a click position as Click(x, y) with x num pixels from the left "
        "edge and y num pixels from the top edge."
    )

    # The image part comes first, then the guidelines + instruction text.
    image_part: dict[str, Any] = {"type": "image", "image": image}
    text_part: dict[str, Any] = {
        "type": "text",
        "text": f"{guidelines}\n{instruction}",
    }

    user_message: dict[str, Any] = {
        "role": "user",
        "content": [image_part, text_part],
    }
    return [user_message]
|
|
|
|
|
class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # Action tag: "click" is the only permitted value and also the default.
    action: Literal["click"] = "click"

    # The x coordinate, number of pixels from the left edge.
    x: int

    # The y coordinate, number of pixels from the top edge.
    y: int
|
|
|
|
|
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build a two-message chat prompt that requests a click position as JSON.

    Args:
        image: The GUI image; it is placed unchanged under the ``"image"`` key
            of the user message's first content part.
        instruction: Text describing what to localize; it is appended on a new
            line after the fixed guidelines sentence.

    Returns:
        A two-element list: a ``"system"`` message whose content is the
        JSON-serialized, single-item list holding the ``ClickAction`` JSON
        schema, followed by a ``"user"`` message with the image part and the
        text part.
    """
    guidelines = (
        "Localize an element on the GUI image according to my instructions and "
        "output a click position. You must output a valid JSON format."
    )

    # The schema is wrapped in a list before serialization, as the system content.
    schema_payload = json.dumps([ClickAction.model_json_schema()])
    system_message: dict[str, Any] = {"role": "system", "content": schema_payload}

    image_part: dict[str, Any] = {"type": "image", "image": image}
    text_part: dict[str, Any] = {
        "type": "text",
        "text": f"{guidelines}\n{instruction}",
    }
    user_message: dict[str, Any] = {
        "role": "user",
        "content": [image_part, text_part],
    }

    return [system_message, user_message]
|
|