from gradio_client import Client import pandas as pd import json import time import os from datetime import datetime def get_last_assistant_content(resp): """ Return the last assistant utterance from the response object produced by `client.predict`. """ if isinstance(resp, tuple): resp = resp[0] if not isinstance(resp, list): return "" for turn in reversed(resp): if turn.get("role") != "assistant": continue if turn.get("content"): return turn["content"] fr = turn.get("function_response", {}) out = fr.get("result", {}).get("output") if out: return out cont = turn.get("content") if isinstance(cont, dict): parts = cont.get("parts", []) if parts and parts[0].get("text"): return parts[0]["text"] return "" def benchmark_paper_reviews( csv_path, id_col="ID", text_col="concatenated_text", num_samples=None, output_dir="results" ): """ Benchmark agent performance on paper reviews. Args: csv_path: path to the pipe‑separated CSV of papers + existing reviews id_col: name of the column containing unique paper IDs text_col: name of the column containing the full paper text num_samples: if set, randomly sample this many papers output_dir: where to write the JSONL results """ # load CSV df = pd.read_csv(csv_path, sep="|") if num_samples: df = df.sample(num_samples, random_state=42).reset_index(drop=True) # prepare output os.makedirs(output_dir, exist_ok=True) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") out_path = os.path.join(output_dir, f"paper_review_benchmark_{timestamp}.jsonl") # init client client = Client("http://127.0.0.1:7860/") results = [] for idx, row in df.iterrows(): paper_id = row[id_col] title = row["Title"] prompt = "Create THREE agents with different personalities, expertise, and review styles. " \ "Each agent should provide a review of the paper, and recommend Accept/Reject for ICLR 2023. " \ "The review should be detailed and include strengths and weaknesses. " \ "You can use ArxivTool and WikipediaTool to get more information. " \ "The paper title is: " + title + "\n\n" + row[text_col] print(f"[{idx+1}/{len(df)}] Paper ID: {paper_id}") try: start = time.time() resp = client.predict( messages=[{"role":"user","content": prompt}], api_name="/run" ) elapsed = time.time() - start result = { "paper_id": paper_id, "prompt_snippet": prompt[:200], "agent_review": resp, "ground_truth": row["Decision"], "response_time": elapsed } # write immediately with open(out_path, "a") as f: f.write(json.dumps(result) + "\n") print(f" → {elapsed:.2f}s, review length {len(resp)} chars") results.append(result) # small delay time.sleep(1) except Exception as e: print(f" Error on {paper_id}: {e}") print(f"\nDone. Results written to {out_path}") return results if __name__ == "__main__": # example usage: adjust path & sample count as needed benchmark_paper_reviews( csv_path="ICLR_2023.csv", num_samples=1 )