import re

from langchain_openai import ChatOpenAI

from .agent import BaseAgent

SYSTEM_PROMPT = "You are an expert evaluator. Your task is to assess how well a Web Agent’s generated checklist aligns with the reference checklist for a given user instruction."

USER_PROMPT = """# Task Description
Use the provided task description, evaluation criteria, and both checklists to assign a score from 1 to 5. Justify your rating with a brief explanation that considers both content overlap and logical structure.

## Score Criteria
- 5: Checklist covers all subgoals, is correct and clearly expressed
- 4: Minor omissions or phrasing issues but mostly accurate and complete
- 3: Partially matches, but with noticeable gaps or errors
- 2: Incomplete or includes incorrect steps
- 1: Mostly irrelevant, incorrect, or missing the task goal

## User Instruction:
{intent}

## Reference Checklist:
{gt_checklist}

## Agent’s Generated Checklist:
{generated_checklist}

# Output Format
Your response should be in the following format:
REASON: [Write 2–4 sentences explaining how well the generated checklist matches the reference. Mention specific matches, omissions, errors, or strengths.]
SCORE: [1–5]
"""


class ChecklistEvalAgent(BaseAgent):
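    """LLM-as-judge agent that rates a Web Agent's generated checklist
    against a reference checklist on a 1-5 scale."""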
    def __init__(self, agent_config: dict):
        super().__init__(agent_config)
        self._setup()
    
    def prepare_message(self, model_input: dict, prompt_type=None):
        # `prompt_type` is unused here; it defaults to None so the
        # single-argument call in generate_response stays valid.
        message = [
            {
                "role": "system",
                "content": SYSTEM_PROMPT
            },
            {
                "role": "user",
                "content": USER_PROMPT.format(
                    intent=model_input["intent"],
                    gt_checklist=model_input["gt_checklist"],
                    generated_checklist=model_input["generated_checklist"]
                )
            }
        ]
        return message
    
    def generate_response(self, model_input: dict):
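        """Sample the judge `num_generate` times for one input and return
        the list of raw responses plus the accumulated API cost."""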
        total_cost = 0
        response_list = []
        # prepare message
        message = self.prepare_message(model_input)

        # n sampling
        for _ in range(self.num_generate):
            response, cost = self.generate_with_retry(message, ["SCORE"])
            response_list.append(response)
            total_cost += cost

        return response_list, total_cost

def parsing_score(response: str):
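    """Extract the integer rating from the text after the last "SCORE:"
    marker in a judge response; return None if no digit is found there."""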
    score = response.split("SCORE:")[-1].split("\n")[0].strip()
    match = re.search(r'\d+', score)
    
    if match:
        return int(match.group())
    else:
        return None

def average_score(scores: list[int]):
    if len(scores) == 0:
        return 0
    return sum(scores) / len(scores)

def get_score(results: list[dict]):
    """Attach per-response scores and their average to each result dict,
    and collect the averaged scores in a flat list."""
    score_list = []
    for result in results:
        tmp_scores = [parsing_score(response) for response in result["response"]]
        scores = [score for score in tmp_scores if score is not None]
        result["score_list"] = scores
        final_score = average_score(scores)
        result["score"] = final_score
        score_list.append(final_score)

    return results, score_list
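

# Usage sketch (illustrative only; the config keys and model name below are
# assumptions about what BaseAgent expects, not taken from this module):
#
#   agent = ChecklistEvalAgent(agent_config={"model": "gpt-4o", "num_generate": 3})
#   responses, cost = agent.generate_response({
#       "intent": "Book a table for two at 7pm.",
#       "gt_checklist": "1. Open the reservations page ...",
#       "generated_checklist": "1. Navigate to the booking form ...",
#   })
#   results, scores = get_score([{"response": responses}])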