Kunal Pai committed · Commit 0581f43 · 1 Parent(s): 23572e6

Refactor benchmark functions to use updated client.predict API and improve prompt clarity

bench/benchmarking_hle.py CHANGED
@@ -101,15 +101,15 @@ def benchmark_hle(num_samples=20, categories=None):
             # Send query to agent
             try:
                 start_time = time.time()
-                response = client.predict(
-                    messages=[{"role": "user", "content": prompt}],
-                    api_name="/run"
+                response, history = client.predict(
+                    message={"text": prompt, "files": []},
+                    api_name="/chat"
                 )
                 end_time = time.time()
 
                 target_answer_phrase = sample.get('answer', '').strip()
 
-                agent_final_response_content = get_last_assistant_content(response)
+                agent_final_response_content = get_last_assistant_content(history)
 
                 is_correct = False
 
@@ -125,7 +125,7 @@ def benchmark_hle(num_samples=20, categories=None):
                     "category": category,
                     "input": prompt,
                     "target_output": sample.get('answer', ''),
-                    "agent_full_response": response,
+                    "agent_full_response": history,
                     "agent_final_response": agent_final_response_content,
                     "response_time": end_time - start_time,
                     "is_correct": is_correct
bench/benchmarking_paper_reviews.py CHANGED
@@ -64,25 +64,26 @@ def benchmark_paper_reviews(
     for idx, row in df.iterrows():
         paper_id = row[id_col]
         title = row["Title"]
-        prompt = "Create THREE agents with different personalities, expertise, and review styles. " \
+        prompt = "Create THREE agents with relevant personalities, expertise, and review styles. " \
                  "Each agent should provide a review of the paper, and recommend Accept/Reject for ICLR 2023. " \
                  "The review should be detailed and include strengths and weaknesses. " \
-                 "You can use ArxivTool and WikipediaTool to get more information. " \
+                 "You MUST use ArxivTool and WikipediaTool to get more information about novelty and correctness. " \
+                 "GIVE A FINAL DECISION in the form of \"FINAL DECISION: <Accept/Reject>\". " \
                  "The paper title is: " + title + "\n\n" + row[text_col]
         print(f"[{idx+1}/{len(df)}] Paper ID: {paper_id}")
 
         try:
             start = time.time()
-            resp = client.predict(
-                messages=[{"role":"user","content": prompt}],
-                api_name="/run"
+            resp, history = client.predict(
+                message={"text": prompt, "files": []},
+                api_name="/chat"
             )
             elapsed = time.time() - start
 
             result = {
                 "paper_id": paper_id,
                 "prompt_snippet": prompt[:200],
-                "agent_review": resp,
+                "agent_review": history,
                 "ground_truth": row["Decision"],
                 "response_time": elapsed
             }
@@ -91,7 +92,7 @@ def benchmark_paper_reviews(
             with open(out_path, "a") as f:
                 f.write(json.dumps(result) + "\n")
 
-            print(f" → {elapsed:.2f}s, review length {len(resp)} chars")
+            print(f" → {elapsed:.2f}s, review length {len(history)} chars")
             results.append(result)
 
             # small delay
@@ -105,6 +106,6 @@ def benchmark_paper_reviews(
 if __name__ == "__main__":
     # example usage: adjust path & sample count as needed
     benchmark_paper_reviews(
-        csv_path="ICLR_2023.csv",
+        csv_path="bench/data/ICLR_2023.csv",
         num_samples=1
     )
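
The tightened prompt asks each review to end with a machine-checkable line of the form "FINAL DECISION: <Accept/Reject>". A hedged sketch of how that line could be parsed and compared against the ground-truth Decision column; extract_final_decision is a hypothetical helper for illustration, not something this commit adds:

    import re

    def extract_final_decision(review_text):
        # Hypothetical helper (not in this commit): pull Accept/Reject out of the
        # "FINAL DECISION: <Accept/Reject>" line the updated prompt asks for.
        match = re.search(r"FINAL DECISION:\s*<?\s*(Accept|Reject)\s*>?",
                          review_text, re.IGNORECASE)
        return match.group(1).capitalize() if match else None

    # Example: compare the parsed decision with the row's ground-truth label.
    decision = extract_final_decision("... FINAL DECISION: Accept")
    assert decision == "Accept"
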