Kunal Pai committed · Commit 0581f43 · 1 Parent(s): 23572e6

Refactor benchmark functions to use updated client.predict API and improve prompt clarity

bench/benchmarking_hle.py CHANGED
@@ -101,15 +101,15 @@ def benchmark_hle(num_samples=20, categories=None):
             # Send query to agent
             try:
                 start_time = time.time()
-                response = client.predict(
-                    messages=[{"role": "user", "content": prompt}],
-                    api_name="/run"
+                response, history = client.predict(
+                    message={"text": prompt, "files": []},
+                    api_name="/chat"
                 )
                 end_time = time.time()
 
                 target_answer_phrase = sample.get('answer', '').strip()
 
-                agent_final_response_content = get_last_assistant_content(response)
+                agent_final_response_content = get_last_assistant_content(history)
 
                 is_correct = False
 
@@ -125,7 +125,7 @@ def benchmark_hle(num_samples=20, categories=None):
                     "category": category,
                     "input": prompt,
                     "target_output": sample.get('answer', ''),
-                    "agent_full_response": response,
+                    "agent_full_response": history,
                     "agent_final_response": agent_final_response_content,
                     "response_time": end_time - start_time,
                     "is_correct": is_correct
bench/benchmarking_paper_reviews.py CHANGED
@@ -64,25 +64,26 @@ def benchmark_paper_reviews(
     for idx, row in df.iterrows():
         paper_id = row[id_col]
         title = row["Title"]
-        prompt = "Create THREE agents with different personalities, expertise, and review styles. " \
+        prompt = "Create THREE agents with relevant personalities, expertise, and review styles. " \
                  "Each agent should provide a review of the paper, and recommend Accept/Reject for ICLR 2023. " \
                  "The review should be detailed and include strengths and weaknesses. " \
-                 "You can use ArxivTool and WikipediaTool to get more information. " \
+                 "You MUST use ArxivTool and WikipediaTool to get more information about novelty and correctness. " \
+                 "GIVE A FINAL DECISION in the form of \"FINAL DECISION: <Accept/Reject>\". " \
                  "The paper title is: " + title + "\n\n" + row[text_col]
         print(f"[{idx+1}/{len(df)}] Paper ID: {paper_id}")
 
         try:
             start = time.time()
-            resp = client.predict(
-                messages=[{"role":"user","content": prompt}],
-                api_name="/run"
+            resp, history = client.predict(
+                message={"text": prompt, "files": []},
+                api_name="/chat"
             )
             elapsed = time.time() - start
 
             result = {
                 "paper_id": paper_id,
                 "prompt_snippet": prompt[:200],
-                "agent_review": resp,
+                "agent_review": history,
                 "ground_truth": row["Decision"],
                 "response_time": elapsed
             }
@@ -91,7 +92,7 @@ def benchmark_paper_reviews(
             with open(out_path, "a") as f:
                 f.write(json.dumps(result) + "\n")
 
-            print(f" → {elapsed:.2f}s, review length {len(resp)} chars")
+            print(f" → {elapsed:.2f}s, review length {len(history)} chars")
             results.append(result)
 
             # small delay
@@ -105,6 +106,6 @@ def benchmark_paper_reviews(
 if __name__ == "__main__":
     # example usage: adjust path & sample count as needed
     benchmark_paper_reviews(
-        csv_path="ICLR_2023.csv",
+        csv_path="bench/data/ICLR_2023.csv",
         num_samples=1
     )
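
The tightened prompt asks each review to end with a machine-checkable line of the form "FINAL DECISION: <Accept/Reject>". A hedged sketch of how that line could be parsed and compared against the ground-truth Decision column; extract_final_decision is a hypothetical helper for illustration, not something this commit adds:

    import re

    def extract_final_decision(review_text):
        # Hypothetical helper (not in this commit): pull Accept/Reject out of the
        # "FINAL DECISION: <Accept/Reject>" line the updated prompt asks for.
        match = re.search(r"FINAL DECISION:\s*<?\s*(Accept|Reject)\s*>?",
                          review_text, re.IGNORECASE)
        return match.group(1).capitalize() if match else None

    # Example: compare the parsed decision with the row's ground-truth label.
    decision = extract_final_decision("... FINAL DECISION: Accept")
    assert decision == "Accept"
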