File size: 6,163 Bytes
498ffec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# SYSTEM PROMPT
# Skeleton for judge system prompts: a fixed opening sentence followed by a
# role description substituted for ``{role_description}``.
DEFAULT_SYSTEM_PROMPT_FORMAT = (
    "You are an expert evaluator of web agent. "
    "{role_description}"
)

# Role description for judging the progress made by a THOUGHT/ACTION pair.
PROGRESS_WITHOUT_CHECKLIST_ROLE = (
    "Your task is to assess how helpful a given agent's THOUGHT and ACTION is "
    "in making progress toward the user's goal, "
    "based on the current state of the webpage."
)
# The with-checklist role currently uses exactly the same wording.
PROGRESS_WITH_CHECKLIST_ROLE = PROGRESS_WITHOUT_CHECKLIST_ROLE

# Role description for judging whether an ACTION is grounded.
GROUNDING_ROLE = (
    "Your task is to assess whether the ACTION taken by the agent is properly grounded, "
    "based on agent's THOUGHT and the current state of the webpage."
)

# USER PROMPT
# Generic user-message skeleton: four markdown sections, each followed by the
# placeholder that fills it. The final empty entry makes the joined text end
# with a newline.
DEFAULT_USER_PROMPT_FORMAT = "\n".join(
    [
        "# Action space:",
        "{action_space}",
        "",
        "# Task Description",
        "{task_description}",
        "",
        "# Given Information",
        "{input_information}",
        "",
        "# Output Format",
        "{output_format}",
        "",
    ]
)


# Self-contained judge prompt (role statement included) for the variant named
# "without checklist"; only ``{input_information}`` is substituted.
# NOTE(review): the task description still refers to "each item in the
# checklist" even though this is the WO_CHECKLIST variant -- confirm whether
# that wording is intentional before changing the prompt text.
JUDGE_OURS_WO_CHECKLIST_USER_PROMPT_FORMAT = (
    # Role statement.
    "You are an expert evaluator of web agent. "
    "Your task is to assess how helpful a given agent's THOUGHT and ACTION is "
    "in making progress toward the user's goal, "
    "based on the current state of the webpage.\n"
    "\n"
    # Task description (a single paragraph).
    "# Task Description\n"
    "Evaluate how well the agent’s THOUGHT and ACTION satisfy each item in the checklist "
    "using the task instruction, trajectory (including previously completed steps), "
    "current webpage state, and the agent’s latest response. "
    "Start by writing a concise paragraph summarizing the agent’s overall performance. "
    "Refer to the reasoning provided in the trajectory, "
    "and discuss whether the THOUGHT is appropriate and the ACTION moves the task forward.\n"
    "\n"
    # Slot for the caller-provided inputs.
    "# Given Information\n"
    "{input_information}\n"
)


# Self-contained judge prompt (role statement included) that asks for an
# overall summary followed by a Yes / In Progress / No label per checklist
# item; only ``{input_information}`` is substituted.
JUDGE_OURS_USER_PROMPT_FORMAT = (
    # Role statement.
    "You are an expert evaluator of web agent. "
    "Your task is to assess how helpful a given agent's THOUGHT and ACTION is "
    "in making progress toward the user's goal, "
    "based on the current state of the webpage.\n"
    "\n"
    # Task description: summary paragraph first, then per-item labels.
    "# Task Description\n"
    "Evaluate how well the agent’s THOUGHT and ACTION satisfy each item in the checklist "
    "using the task instruction, trajectory (including previously completed steps), "
    "current webpage state, and the agent’s latest response. "
    "Start by writing a concise paragraph summarizing the agent’s overall performance. "
    "Refer to the reasoning provided in the trajectory, "
    "and discuss whether the THOUGHT is appropriate and the ACTION moves the task forward.\n"
    "Then, assess each checklist item individually using the following labels:\n"
    "- Yes: The item is fully and clearly satisfied, "
    "either in the current response or previously completed.\n"
    "- In Progress: There is meaningful partial progress toward completing the item.\n"
    "- No: The item is not satisfied due to ambiguity, insufficient evidence, "
    "or lack of progress.\n"
    "\n"
    # Slot for the caller-provided inputs.
    "# Given Information\n"
    "{input_information}\n"
)


# Full BT-modeling user prompt: agent role, task description, the given
# information (instruction, trajectory, URL, AXTree), the checklist, and an
# open "Agent's Response" header at the very end.
# This text matches the "user" entry composed for
# JUDGE_OURS_BT_MODELING_PROMPT_TEMPLATE from the base prompt and fragments.
JUDGE_OURS_BT_MODELING_USER_PROMPT_FORMAT = "\n".join(
    (
        "You are an expert web agent that browses internet via GUI actions. Your task is to achieve the user's goal described in the user instruction.",
        "",
        "# Task Description",
        "Generate the most appropriate GUI action to achieve the user's goal. When choosing your action, consider the current webpage state and the checklist which can be interpreted as subtasks.",
        "",
        "# Given Information",
        "## User Instruction",
        "{intent}",
        "",
        "## Trajectory",
        "{trajectory}",
        "",
        "## Current State",
        "### Current URL",
        "{current_url}",
        "",
        "### AXTREE",
        "Note: [bid] is the unique alpha-numeric identifier at the beginning of lines for each element in the AXTree. Always use bid to refer to elements in your actions.",
        "{text_observation}",
        "",
        "## Checklist",
        "{checklist}",
        "",
        "## Agent's Response",
        "",  # keeps the trailing newline after the last header
    )
)

# Shared opening of the BT-modeling user prompts: agent role, task
# description, and the given information up to the AXTree observation.
# The BT-modeling templates append the optional fragments (image, checklist)
# and the "Agent's Response" header to this text.
JUDGE_OURS_BT_MODELING_BASE_PROMPT = (
    # Role statement.
    "You are an expert web agent that browses internet via GUI actions. "
    "Your task is to achieve the user's goal described in the user instruction.\n"
    "\n"
    "# Task Description\n"
    "Generate the most appropriate GUI action to achieve the user's goal. "
    "When choosing your action, consider the current webpage state "
    "and the checklist which can be interpreted as subtasks.\n"
    "\n"
    "# Given Information\n"
    "## User Instruction\n"
    "{intent}\n"
    "\n"
    "## Trajectory\n"
    "{trajectory}\n"
    "\n"
    "## Current State\n"
    "### Current URL\n"
    "{current_url}\n"
    "\n"
    "### AXTREE\n"
    "Note: [bid] is the unique alpha-numeric identifier at the beginning of lines "
    "for each element in the AXTree. "
    "Always use bid to refer to elements in your actions.\n"
    "{text_observation}\n"
)

# Optional fragment for the multimodal templates: a blank line, the screenshot
# header, then the literal <IMAGE_PLACEHOLDER> marker on its own line.
JUDGE_OURS_IMAGE_INPUT = (
    "\n"
    "### Image Screenshot\n"
    "<IMAGE_PLACEHOLDER>\n"
)

# Optional fragment for the checklist templates: a blank line, the checklist
# header, then the ``{checklist}`` placeholder on its own line.
JUDGE_OURS_WITH_CHECKLIST = (
    "\n"
    "## Checklist\n"
    "{checklist}\n"
)

# Layout of the assistant turn in the BT-modeling templates: a leading blank
# line, then the THOUGHT line and the ACTION line.
BT_MODELING_RESPONSE_FORMAT = (
    "\n"
    "THOUGHT: {thought}\n"
    "ACTION: {action}\n"
)

## PROMPT TEMPLATE
def _system_prompt(role_description):
    """Fill the default system prompt skeleton with *role_description*."""
    return DEFAULT_SYSTEM_PROMPT_FORMAT.format(role_description=role_description)


# Judges built on the generic skeletons: a role-specific system prompt plus
# the still-unformatted generic user prompt. Each template is its own dict
# object, even where two of them hold the same text.
JUDGE_GROUNDING_PROMPT_TEMPLATE = dict(
    system=_system_prompt(GROUNDING_ROLE),
    user=DEFAULT_USER_PROMPT_FORMAT,
)

JUDGE_LIKERT_SCALE_PROMPT_TEMPLATE = dict(
    system=_system_prompt(PROGRESS_WITHOUT_CHECKLIST_ROLE),
    user=DEFAULT_USER_PROMPT_FORMAT,
)

JUDGE_THREE_CLASS_PROMPT_TEMPLATE = dict(
    system=_system_prompt(PROGRESS_WITHOUT_CHECKLIST_ROLE),
    user=DEFAULT_USER_PROMPT_FORMAT,
)

JUDGE_WITH_CHECKLIST_PROMPT_TEMPLATE = dict(
    system=_system_prompt(PROGRESS_WITH_CHECKLIST_ROLE),
    user=DEFAULT_USER_PROMPT_FORMAT,
)

# "Ours" judges: the system prompt is an empty string because the user prompt
# text already opens with the evaluator role statement.
JUDGE_OURS_PROMPT_TEMPLATE = dict(
    system="",
    user=JUDGE_OURS_USER_PROMPT_FORMAT,
)

JUDGE_OURS_WO_CHECKLIST_PROMPT_TEMPLATE = dict(
    system="",
    user=JUDGE_OURS_WO_CHECKLIST_USER_PROMPT_FORMAT,
)

# Header that closes every BT-modeling user prompt.
_AGENT_RESPONSE_HEADER = "\n## Agent's Response\n"

# BT-modeling templates: the "user" text is the base prompt, followed by the
# optional image and checklist fragments (in that order), followed by the
# response header; the "assistant" text is the THOUGHT/ACTION layout.
# These templates have no "system" entry.

# Text-only input, with checklist.
JUDGE_OURS_BT_MODELING_PROMPT_TEMPLATE = dict(
    user="".join(
        (
            JUDGE_OURS_BT_MODELING_BASE_PROMPT,
            JUDGE_OURS_WITH_CHECKLIST,
            _AGENT_RESPONSE_HEADER,
        )
    ),
    assistant=BT_MODELING_RESPONSE_FORMAT,
)

# Screenshot placeholder plus checklist.
JUDGE_OURS_BT_MODELING_MULTIMODAL_PROMPT_TEMPLATE = dict(
    user="".join(
        (
            JUDGE_OURS_BT_MODELING_BASE_PROMPT,
            JUDGE_OURS_IMAGE_INPUT,
            JUDGE_OURS_WITH_CHECKLIST,
            _AGENT_RESPONSE_HEADER,
        )
    ),
    assistant=BT_MODELING_RESPONSE_FORMAT,
)

# Text-only input, no checklist.
JUDGE_OURS_BT_MODELING_WO_CHECKLIST_PROMPT_TEMPLATE = dict(
    user="".join(
        (
            JUDGE_OURS_BT_MODELING_BASE_PROMPT,
            _AGENT_RESPONSE_HEADER,
        )
    ),
    assistant=BT_MODELING_RESPONSE_FORMAT,
)

# Screenshot placeholder, no checklist.
JUDGE_OURS_BT_MODELING_MULTIMODAL_WO_CHECKLIST_PROMPT_TEMPLATE = dict(
    user="".join(
        (
            JUDGE_OURS_BT_MODELING_BASE_PROMPT,
            JUDGE_OURS_IMAGE_INPUT,
            _AGENT_RESPONSE_HEADER,
        )
    ),
    assistant=BT_MODELING_RESPONSE_FORMAT,
)