from gradio_client import Client
import pandas as pd
import json
import time
import os
from datetime import datetime
def get_last_assistant_content(resp):
    """
    Return the last assistant utterance from the response object
    produced by `client.predict`.
    """
    # client.predict may return a tuple; the chat history is its first element
    if isinstance(resp, tuple):
        resp = resp[0]
    if not isinstance(resp, list):
        return ""
    for turn in reversed(resp):
        if turn.get("role") != "assistant":
            continue
        cont = turn.get("content")
        # content may be a plain string...
        if isinstance(cont, str) and cont:
            return cont
        # ...or a dict of parts, e.g. {"parts": [{"text": "..."}]};
        # check this before returning, so we extract the text rather
        # than handing back the raw dict
        if isinstance(cont, dict):
            parts = cont.get("parts", [])
            if parts and parts[0].get("text"):
                return parts[0]["text"]
        # otherwise fall back to a tool/function response payload
        fr = turn.get("function_response", {})
        out = fr.get("result", {}).get("output")
        if out:
            return out
    return ""
def benchmark_paper_reviews(
    csv_path,
    id_col="ID",
    text_col="concatenated_text",
    num_samples=None,
    output_dir="results",
):
    """
    Benchmark agent performance on paper reviews.

    Args:
        csv_path: path to the pipe-separated CSV of papers + existing reviews
        id_col: name of the column containing unique paper IDs
        text_col: name of the column containing the full paper text
        num_samples: if set, randomly sample this many papers
        output_dir: where to write the JSONL results
    """
    # load CSV
    df = pd.read_csv(csv_path, sep="|")
    if num_samples:
        df = df.sample(num_samples, random_state=42).reset_index(drop=True)

    # prepare output
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = os.path.join(output_dir, f"paper_review_benchmark_{timestamp}.jsonl")

    # init client (pointing at the locally running Gradio app)
    client = Client("http://127.0.0.1:7860/")
    results = []
    for idx, row in df.iterrows():
        paper_id = row[id_col]
        title = row["Title"]
        prompt = (
            "Create THREE agents with different personalities, expertise, and review styles. "
            "Each agent should provide a review of the paper, and recommend Accept/Reject for ICLR 2023. "
            "The review should be detailed and include strengths and weaknesses. "
            "You can use ArxivTool and WikipediaTool to get more information. "
            "The paper title is: " + title + "\n\n" + row[text_col]
        )
        print(f"[{idx + 1}/{len(df)}] Paper ID: {paper_id}")
        try:
            start = time.time()
            resp = client.predict(
                messages=[{"role": "user", "content": prompt}],
                api_name="/run",
            )
            elapsed = time.time() - start

            # keep only the final assistant message, not the whole chat history
            review = get_last_assistant_content(resp)
            result = {
                "paper_id": paper_id,
                "prompt_snippet": prompt[:200],
                "agent_review": review,
                "ground_truth": row["Decision"],
                "response_time": elapsed,
            }

            # write immediately so partial runs are not lost
            with open(out_path, "a") as f:
                f.write(json.dumps(result) + "\n")

            print(f" → {elapsed:.2f}s, review length {len(review)} chars")
            results.append(result)

            # small delay to avoid hammering the endpoint
            time.sleep(1)
        except Exception as e:
            print(f" Error on {paper_id}: {e}")

    print(f"\nDone. Results written to {out_path}")
    return results
if __name__ == "__main__":
    # example usage: adjust path & sample count as needed
    benchmark_paper_reviews(
        csv_path="ICLR_2023.csv",
        num_samples=1,
    )
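# Each line of the output JSONL is one self-contained record, roughly
# (values illustrative):
#   {"paper_id": "abc123", "prompt_snippet": "Create THREE agents...",
#    "agent_review": "...", "ground_truth": "Accept", "response_time": 42.1}
#
# To benchmark a hosted Space instead of a local app, pass its id to the
# client, e.g. Client("your-username/your-space") (hypothetical name).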