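"""Checklist evaluation agent.

Scores how well a Web Agent's generated checklist matches a reference checklist
(1-5) using an LLM judge, then parses and averages the sampled scores.
"""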
import re
from langchain_openai import ChatOpenAI
from .agent import BaseAgent
SYSTEM_PROMPT = "You are an expert evaluator. Your task is to assess how well a Web Agent’s generated checklist aligns with the reference checklist for a given user instruction."
USER_PROMPT = """# Task Description
Use the provided task description, evaluation criteria, and both checklists to assign a score from 1 to 5. Justify your rating with a brief explanation that considers both content overlap and logical structure.
## Score Criteria
- 5: Checklist covers all subgoals, is correct and clearly expressed
- 4: Minor omissions or phrasing issues but mostly accurate and complete
- 3: Partially matches, but with noticeable gaps or errors
- 2: Incomplete or includes incorrect steps
- 1: Mostly irrelevant, incorrect, or missing the task goal
## User Instruction:
{intent}
## Reference Checklist:
{gt_checklist}
## Agent’s Generated Checklist:
{generated_checklist}
# Output Format
Your response should be in the following format:
REASON: [Write 2–4 sentences explaining how well the generated checklist matches the reference. Mention specific matches, omissions, errors, or strengths.]
SCORE: [1–5]
"""


class ChecklistEvalAgent(BaseAgent):
    def __init__(self, agent_config: dict):
        super().__init__(agent_config)
        self._setup()

    def prepare_message(self, model_input: dict, prompt_type=None):
        # prompt_type is accepted for interface compatibility but unused here;
        # the default lets generate_response call this with model_input only.
        message = [
            {
                "role": "system",
                "content": SYSTEM_PROMPT
            },
            {
                "role": "user",
                "content": USER_PROMPT.format(
                    intent=model_input["intent"],
                    gt_checklist=model_input["gt_checklist"],
                    generated_checklist=model_input["generated_checklist"]
                )
            }
        ]
        return message

    def generate_response(self, model_input: dict):
        total_cost = 0
        response_list = []
        # prepare message
        message = self.prepare_message(model_input)
        # n sampling
        for _ in range(self.num_generate):
            response, cost = self.generate_with_retry(message, ["SCORE"])
            response_list.append(response)
            total_cost += cost
        return response_list, total_cost


def parsing_score(response: str):
    score = response.split("SCORE:")[-1].split("\n")[0].strip()
    match = re.search(r'\d+', score)
    if match:
        return int(match.group())
    return None
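
# Illustrative examples of the parsing behavior (assumed judge outputs):
#   parsing_score("REASON: Covers all subgoals clearly.\nSCORE: 5")  -> 5
#   parsing_score("response without a score line")                   -> None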


def average_score(scores: list[int]):
    if len(scores) == 0:
        return 0
    return sum(scores) / len(scores)


def get_score(results: list[dict]):
    score_list = []
    for result in results:
        tmp_scores = [parsing_score(response) for response in result["response"]]
        scores = [score for score in tmp_scores if score is not None]
        result["score_list"] = scores
        final_score = average_score(scores)
        result["score"] = final_score
        score_list.append(final_score)
    return results, score_list
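

# --- Usage sketch ---
# A minimal, illustrative driver. The agent_config keys below are hypothetical
# placeholders; the real keys depend on BaseAgent's _setup() and are not shown
# in this file.
if __name__ == "__main__":
    agent = ChecklistEvalAgent({
        "model_name": "gpt-4o",   # hypothetical config key
        "num_generate": 3,        # hypothetical config key for n sampling
    })
    model_input = {
        "intent": "Book the cheapest one-way flight from SFO to JFK next Monday.",
        "gt_checklist": "1. Open the flight search page\n2. Set one-way, SFO to JFK\n3. Sort results by price",
        "generated_checklist": "1. Go to the flights tab\n2. Enter SFO and JFK\n3. Pick the cheapest fare",
    }
    responses, cost = agent.generate_response(model_input)
    results, score_list = get_score([{"response": responses}])
    print(f"score={results[0]['score']}, samples={results[0]['score_list']}, cost={cost}")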