Kunal Pai committed · Commit 0581f43 · Parent: 23572e6

Refactor benchmark functions to use updated client.predict API and improve prompt clarity
bench/benchmarking_hle.py
CHANGED
@@ -101,15 +101,15 @@ def benchmark_hle(num_samples=20, categories=None):
         # Send query to agent
         try:
             start_time = time.time()
-            response = client.predict(
-
-                api_name="/
+            response, history = client.predict(
+                message={"text": prompt, "files": []},
+                api_name="/chat"
             )
             end_time = time.time()
 
             target_answer_phrase = sample.get('answer', '').strip()
 
-            agent_final_response_content = get_last_assistant_content(
+            agent_final_response_content = get_last_assistant_content(history)
 
             is_correct = False
 
@@ -125,7 +125,7 @@ def benchmark_hle(num_samples=20, categories=None):
                 "category": category,
                 "input": prompt,
                 "target_output": sample.get('answer', ''),
-                "agent_full_response":
+                "agent_full_response": history,
                 "agent_final_response": agent_final_response_content,
                 "response_time": end_time - start_time,
                 "is_correct": is_correct
bench/benchmarking_paper_reviews.py
CHANGED
@@ -64,25 +64,26 @@ def benchmark_paper_reviews(
     for idx, row in df.iterrows():
         paper_id = row[id_col]
         title = row["Title"]
-        prompt = "Create THREE agents with
+        prompt = "Create THREE agents with relevant personalities, expertise, and review styles. " \
             "Each agent should provide a review of the paper, and recommend Accept/Reject for ICLR 2023. " \
             "The review should be detailed and include strengths and weaknesses. " \
-            "You
+            "You MUST use ArxivTool and WikipediaTool to get more information about novelty and correctness. " \
+            "GIVE A FINAL DECISION in the form of \"FINAL DECISION: <Accept/Reject>\". " \
             "The paper title is: " + title + "\n\n" + row[text_col]
         print(f"[{idx+1}/{len(df)}] Paper ID: {paper_id}")
 
         try:
             start = time.time()
-            resp = client.predict(
-
-                api_name="/
+            resp, history = client.predict(
+                message={"text": prompt, "files": []},
+                api_name="/chat"
             )
             elapsed = time.time() - start
 
             result = {
                 "paper_id": paper_id,
                 "prompt_snippet": prompt[:200],
-                "agent_review":
+                "agent_review": history,
                 "ground_truth": row["Decision"],
                 "response_time": elapsed
             }
@@ -91,7 +92,7 @@ def benchmark_paper_reviews(
             with open(out_path, "a") as f:
                 f.write(json.dumps(result) + "\n")
 
-            print(f" → {elapsed:.2f}s, review length {len(
+            print(f" → {elapsed:.2f}s, review length {len(history)} chars")
             results.append(result)
 
             # small delay
@@ -105,6 +106,6 @@ def benchmark_paper_reviews(
 if __name__ == "__main__":
     # example usage: adjust path & sample count as needed
     benchmark_paper_reviews(
-        csv_path="ICLR_2023.csv",
+        csv_path="bench/data/ICLR_2023.csv",
         num_samples=1
     )
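The reworked prompt pins the verdict to a fixed "FINAL DECISION: <Accept/Reject>" string, which makes the agent's recommendation machine-checkable against the "ground_truth" field written to the results file. A minimal sketch of how a consumer of those results could extract it; extract_final_decision is illustrative and not part of this commit:

import re

def extract_final_decision(review_text):
    # Pull the verdict from a review that follows the mandated
    # 'FINAL DECISION: <Accept/Reject>' format; None if the agent ignored it.
    match = re.search(r"FINAL DECISION:\s*(Accept|Reject)", review_text, re.IGNORECASE)
    return match.group(1).capitalize() if match else None

assert extract_final_decision("... FINAL DECISION: Accept") == "Accept"
assert extract_final_decision("no verdict given") is None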