helloparthshah commited on
Commit
ef6cb0b
·
2 Parent(s): 0c0c279 ba796a2

Merge branch 'main' of github.com:kunpai/HASHIRU

Browse files
bench/benchmarking_connections.py DELETED
@@ -1,97 +0,0 @@
1
- from gradio_client import Client
2
- from datasets import load_dataset
3
- import json
4
- import time
5
- import random
6
- import os
7
- from datetime import datetime
8
- import re
9
-
10
def get_last_assistant_content(resp):
    """
    Return the last assistant utterance (as text) from the response object
    produced by `client.predict`.

    Handles three message shapes: plain string content, Gemini-style Part
    dicts ({"parts": [{"text": ...}]}), and function_response wrappers.
    Returns "" when no assistant text is found.
    """
    # The server may wrap things in a (messages, meta) tuple.
    if isinstance(resp, tuple):
        resp = resp[0]

    # At this point `resp` must be the list of message dicts.
    if not isinstance(resp, list):
        return ""

    for turn in reversed(resp):
        if turn.get("role") != "assistant":
            continue

        cont = turn.get("content")

        # a) messages stored as Part objects inside `content`.
        #    BUG FIX: this must be checked before the plain truthiness test,
        #    otherwise a truthy dict was returned raw and this branch was
        #    unreachable.
        if isinstance(cont, dict):
            parts = cont.get("parts", [])
            if parts and parts[0].get("text"):
                return parts[0]["text"]
        # b) plain (string) messages
        elif cont:
            return cont

        # c) tool / function_response wrapper
        out = turn.get("function_response", {}).get("result", {}).get("output")
        if out:
            return out

    return ""
45
-
46
def benchmark_nyt_connections(num_samples=20, categories=None):
    """
    Benchmark agent performance on the NYT Connections dataset.

    Args:
        num_samples: Number of samples to test (falsy means the whole split).
        categories: List of categories to include (None for all).

    Returns:
        List of per-sample result dicts; each is also appended to a
        timestamped JSONL file under ``results/``.
    """
    # Load NYT connections dataset
    print("Loading NYT connections dataset...")
    dataset = load_dataset("tm21cy/NYT-Connections")

    # Local HASHIRU Gradio endpoint
    client = Client("http://127.0.0.1:7860/")

    # Prepare output directory
    output_dir = "results"
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = os.path.join(output_dir, f"nyt_connections_benchmark_{timestamp}.jsonl")
    print(f"Results will be saved to {out_path}")

    results = []
    train = dataset["train"]
    num_samples = min(num_samples, len(train)) if num_samples else len(train)
    print(f"Sampling {num_samples} samples from the dataset.")
    indices = random.sample(range(len(train)), num_samples)
    for i in indices:
        sample = train[i]
        if categories and sample["category"] not in categories:
            continue
        print(f"Sample {i}: {sample['contest']}")
        prompt = f"Given the following words, group them into 4 categories of 4 words each:\n{' '.join(sample['words'])}\n\n Once you've solved it, final output should be in the following format Group 1: word1, word2, word3, word4\nGroup 2: ..."
        start_time = time.time()
        response = client.predict(messages=[{"role": "user", "content": prompt}], api_name="/run")
        elapsed_time = time.time() - start_time
        assistant_content = get_last_assistant_content(response)
        result = {
            "input": sample["words"],
            "date": sample["contest"],
            "output": assistant_content,
            "expected": sample["answers"],
            "elapsed_time": elapsed_time,
        }
        results.append(result)

        # BUG FIX: results were previously written once *after* the loop, so a
        # crash mid-run lost every finished sample. Append each result
        # immediately instead (matches the other bench scripts).
        with open(out_path, "a") as f:
            f.write(json.dumps(result) + "\n")

    print(f"Results saved to {out_path}")
    return results


if __name__ == "__main__":
    benchmark_nyt_connections(num_samples=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bench/benchmarking_globle.py DELETED
@@ -1,141 +0,0 @@
1
- #!/usr/bin/env python3
2
- import random
3
- import math
4
- import sys
5
- import json
6
- import time
7
- import difflib
8
- import os
9
- import requests
10
- import re
11
- import geopandas as gpd
12
- from shapely.geometry import Point
13
- from gradio_client import Client
14
- from datetime import datetime
15
-
16
- # -----------------------------------------------------------------------------
17
- # Utility: haversine distance only
18
- # -----------------------------------------------------------------------------
19
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometers between two lat/lon points."""
    earth_radius_km = 6371.0
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)
    # Haversine formula: a is the square of half the chord length.
    a = (
        math.sin(delta_phi / 2) ** 2
        + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda / 2) ** 2
    )
    return 2 * earth_radius_km * math.asin(math.sqrt(a))
27
-
28
- # -----------------------------------------------------------------------------
29
- # Load country centroids and geometries
30
- # -----------------------------------------------------------------------------
31
-
32
def load_countries(geo_path):
    """
    Read a countries file with geopandas and return two dicts keyed by
    country name: centroid (lat, lon) tuples and raw geometries.
    """
    frame = gpd.read_file(geo_path)
    # Pick the most likely name column; fall back to the first
    # non-geometry column if none of the usual candidates exist.
    preferred = ["ADMIN", "NAME", "NAME_EN", "NAME_LONG", "SOVEREIGN", "COUNTRY"]
    name_field = next((col for col in preferred if col in frame.columns), None)
    if name_field is None:
        non_geom = [col for col in frame.columns if col.lower() != 'geometry']
        name_field = non_geom[0] if non_geom else None
    centroids = {}
    geoms = {}
    for _, record in frame.iterrows():
        shape = record.geometry
        # Skip rows with missing or empty geometry.
        if not shape or shape.is_empty:
            continue
        center = shape.centroid
        country = record[name_field]
        centroids[country] = (center.y, center.x)
        geoms[country] = shape
    return centroids, geoms
48
-
49
- # -----------------------------------------------------------------------------
50
- # Extract last assistant content
51
- # -----------------------------------------------------------------------------
52
def get_last_assistant_content(resp):
    """
    Return the last assistant utterance (as text) from a `client.predict`
    response, handling plain strings, Part dicts, and function_response
    wrappers. Returns "" when none is found.
    """
    # Server may wrap the message list in a (messages, meta) tuple.
    if isinstance(resp, tuple):
        resp = resp[0]
    if not isinstance(resp, list):
        return ""
    for turn in reversed(resp):
        if turn.get("role") != 'assistant':
            continue
        cont = turn.get("content")
        # BUG FIX: the original checked `if parts and ...` outside the
        # `isinstance(cont, dict)` one-liner, so `parts` could be referenced
        # before assignment (NameError); it also returned truthy dict content
        # raw before the parts-extraction could run.
        if isinstance(cont, dict):
            parts = cont.get("parts", [])
            if parts and parts[0].get("text"):
                return parts[0]["text"]
        elif cont:
            return cont
        out = turn.get("function_response", {}).get("result", {}).get("output")
        if out:
            return out
    return ""
65
-
66
- # -----------------------------------------------------------------------------
67
- # Play one automated Globle game via LLM
68
- # -----------------------------------------------------------------------------
69
def play_globle_agent(client, countries, geoms, max_guesses=20, threshold=0.6):
    """
    Play one automated Globle (distance-only Worldle) game via the LLM agent.

    Args:
        client: gradio_client.Client connected to the agent endpoint.
        countries: {name: (lat, lon)} centroid map from load_countries().
        geoms: {name: geometry} map from load_countries().
        max_guesses: maximum number of *valid* guesses before giving up.
        threshold: difflib cutoff for fuzzy-matching raw replies to names.

    Returns:
        dict with keys: target, guesses [(name, feedback)], turns, solved.
    """
    # pick random target
    target, (tlat, tlon) = random.choice(list(countries.items()))
    guesses = []
    attempts = 0

    while True:
        # Re-send the entire guess/feedback history as plain text each turn.
        history = "\n".join([f"Guess: {g}, Response: {resp}" for g, resp in guesses])
        prompt = (
            "Worldle (distance-only). Guess the country.\n" +
            (history + "\n" if history else "") +
            "Respond with a single country name and ONLY the name of the country."
        )
        resp = client.predict(messages=[{"role": "user", "content": prompt}], api_name="/run")
        raw = get_last_assistant_content(resp).strip()
        print(f"Guess: {raw}")
        # sanitize: fuzzy match the raw reply to a known country name
        if raw not in countries:
            match = difflib.get_close_matches(raw, countries.keys(), n=1, cutoff=threshold)
            if match:
                guess = match[0]
            else:
                # invalid guess: retry without consuming a turn.
                # NOTE(review): an agent that never produces a matchable name
                # loops forever here -- consider adding a retry cap.
                continue
        else:
            guess = raw

        attempts += 1
        # correct?
        if guess == target:
            return {"target": target, "guesses": guesses + [(guess, "CORRECT")], "turns": attempts, "solved": True}

        # adjacency: bordering countries get "BORDER" instead of a distance
        if geoms[guess].touches(geoms[target]):
            feedback = "BORDER"
        else:
            # distance between centroids, rounded to whole kilometers
            glat, glon = countries[guess]
            dist = haversine(glat, glon, tlat, tlon)
            feedback = f"{dist:.0f}km"
        guesses.append((guess, feedback))
        if attempts >= max_guesses:
            return {"target": target, "guesses": guesses, "turns": attempts, "solved": False}
113
-
114
- # -----------------------------------------------------------------------------
115
- # Benchmark multiple games
116
- # -----------------------------------------------------------------------------
117
def benchmark_globle(geo_path, num_games=1, max_guesses=20, cutoff=0.6):
    """
    Run several Globle games against the local agent and append one JSON
    line per finished game to a timestamped file under results/.

    Args:
        geo_path: path to the countries geometry file (shapefile/GeoJSON).
        num_games: number of games to play.
        max_guesses: per-game valid-guess budget passed to play_globle_agent.
        cutoff: fuzzy-match threshold passed to play_globle_agent.

    Returns:
        List of per-game result dicts (each also includes wall-clock "time").
    """
    countries, geoms = load_countries(geo_path)
    # Local HASHIRU Gradio endpoint.
    client = Client("http://127.0.0.1:7860/")
    os.makedirs("results", exist_ok=True)
    out_file = os.path.join("results", f"globle_benchmark_{datetime.now():%Y%m%d_%H%M%S}.jsonl")
    results = []
    for i in range(num_games):
        print(f"Game {i+1}/{num_games}")
        start = time.time()
        res = play_globle_agent(client, countries, geoms, max_guesses, cutoff)
        res["time"] = time.time() - start
        results.append(res)
        # Append immediately so finished games survive a crash.
        with open(out_file, "a") as f: f.write(json.dumps(res) + "\n")
    print(f"Saved results to {out_file}")
    return results
132
-
133
- # -----------------------------------------------------------------------------
134
- # CLI
135
- # -----------------------------------------------------------------------------
136
if __name__ == "__main__":
    # CLI entry point: expects exactly one argument, the path to a
    # countries geometry file (shapefile/GeoJSON).
    if len(sys.argv) != 2:
        print("Usage: python benchmarking_globle.py path/to/countries-file")
        sys.exit(1)
    geo = sys.argv[1]
    benchmark_globle(geo)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bench/benchmarking_hle.py DELETED
@@ -1,174 +0,0 @@
1
- from gradio_client import Client
2
- from datasets import load_dataset
3
- import json
4
- import time
5
- import random
6
- import os
7
- from datetime import datetime
8
- import re
9
-
10
- def get_last_assistant_content(resp):
11
- """
12
- Return the last assistant utterance from the response object
13
- produced by `client.predict`.
14
- """
15
- # ❶ If the server wraps things in a (messages, meta) tuple
16
- if isinstance(resp, tuple):
17
- resp = resp[0]
18
-
19
- # ❷ At this point `resp` must be the list of message dicts
20
- if not isinstance(resp, list):
21
- return ""
22
-
23
- for turn in reversed(resp):
24
- if turn.get("role") != "assistant":
25
- continue
26
-
27
- # a) plain messages
28
- if turn.get("content"):
29
- return turn["content"]
30
-
31
- # b) tool / function_response wrapper
32
- fr = turn.get("function_response", {})
33
- out = fr.get("result", {}).get("output")
34
- if out:
35
- return out
36
-
37
- # c) messages stored as Part objects inside `content`
38
- cont = turn.get("content")
39
- if isinstance(cont, dict):
40
- parts = cont.get("parts", [])
41
- if parts and parts[0].get("text"):
42
- return parts[0]["text"]
43
-
44
- return ""
45
-
46
def benchmark_hle(num_samples=20, categories=None):
    """
    Benchmark agent performance on the HLE (Humanity's Last Exam) dataset.

    Args:
        num_samples: Number of samples to test.
        categories: List of categories to include (None for all).

    Returns:
        List of per-sample result dicts; each is also appended to a
        timestamped JSONL file under results/.
    """
    # Load HLE dataset
    print("Loading HLE dataset...")
    dataset = load_dataset("cais/hle")

    # Local HASHIRU Gradio endpoint.
    client = Client("http://127.0.0.1:7860/")

    # Create results directory if it doesn't exist
    os.makedirs("results", exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"results/hle_benchmark_{timestamp}.jsonl"

    # Collect samples from the available evaluation splits.
    all_samples = []
    for split in ['validation', 'test']:
        if split in dataset:
            all_samples.extend(dataset[split])

    # Filter by category if specified
    if categories:
        all_samples = [s for s in all_samples if s.get('category') in categories]

    # Filter out prompts mentioning images (text-substring check only).
    # BUG FIX: this previously inspected s['input'], but prompts are read
    # from s['question'] below, so the filter never matched anything.
    filtered_samples = [s for s in all_samples if 'image' not in s.get('question', '').lower()]
    removed = len(all_samples) - len(filtered_samples)
    if removed > 0:
        print(f"Filtered out {removed} samples containing 'image'.")
    all_samples = filtered_samples

    # Select random samples
    if len(all_samples) > num_samples:
        samples = random.sample(all_samples, num_samples)
    else:
        samples = all_samples
        print(f"Warning: Only found {len(samples)} samples after filtering.")

    print(f"Running benchmark on {len(samples)} samples...")

    results = []
    for i, sample in enumerate(samples):
        print(f"\nProcessing sample {i+1}/{len(samples)}")
        category = sample.get('category', 'Unknown')
        prompt = sample.get('question', '')
        print(f"Category: {category}")
        print(f"Question: {prompt[:100]}...")

        try:
            start_time = time.time()
            response, history = client.predict(
                message={"text": prompt, "files": []},
                api_name="/chat"
            )
            end_time = time.time()

            target_answer_phrase = sample.get('answer', '').strip()
            agent_final_response_content = get_last_assistant_content(history)

            # Simple case-insensitive substring check; both the target phrase
            # and the agent content must be non-empty for a match to count.
            is_correct = bool(
                target_answer_phrase
                and agent_final_response_content
                and target_answer_phrase.lower() in agent_final_response_content.lower()
            )

            # Record result
            result = {
                "sample_id": sample.get('id', f'sample_{i}'),
                "category": category,
                "input": prompt,
                "target_output": sample.get('answer', ''),
                "agent_full_response": history,
                "agent_final_response": agent_final_response_content,
                "response_time": end_time - start_time,
                "is_correct": is_correct
            }
            results.append(result)

            # Write to file immediately to preserve progress
            with open(results_file, 'a') as f:
                f.write(json.dumps(result) + '\n')

            print(f"Response received in {end_time - start_time:.2f} seconds")
            print(f"Response: {response[:100]}...")

            # Add a delay to avoid overwhelming the server
            time.sleep(1)

        except Exception as e:
            # Best-effort: log and continue so one bad sample doesn't kill the run.
            print(f"Error processing sample: {e}")
            continue

    # Print summary statistics
    print("\n===== HLE BENCHMARK SUMMARY =====")
    print(f"Samples processed: {len(results)}")

    # Group results by category for the summary.
    by_category = {}
    for result in results:
        by_category.setdefault(result.get('category', 'Unknown'), []).append(result)

    print("\nSamples by category:")
    for category, items in by_category.items():
        print(f"  {category}: {len(items)} samples")

    avg_time = sum(r.get('response_time', 0) for r in results) / len(results) if results else 0
    print(f"\nAverage response time: {avg_time:.2f} seconds")
    print(f"Results saved to: {results_file}")

    return results


if __name__ == "__main__":
    benchmark_hle(
        num_samples=1,
        categories=None
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bench/benchmarking_paper_reviews.py DELETED
@@ -1,111 +0,0 @@
1
- from gradio_client import Client
2
- import pandas as pd
3
- import json
4
- import time
5
- import os
6
- from datetime import datetime
7
-
8
def get_last_assistant_content(resp):
    """
    Return the last assistant utterance (as text) from the response object
    produced by `client.predict`. Handles plain strings, Part dicts, and
    function_response wrappers; returns "" when none is found.
    """
    if isinstance(resp, tuple):
        resp = resp[0]
    if not isinstance(resp, list):
        return ""
    for turn in reversed(resp):
        if turn.get("role") != "assistant":
            continue
        cont = turn.get("content")
        # BUG FIX: dict-shaped content must be unpacked before the plain
        # truthiness check, otherwise the raw dict is returned and the
        # parts-extraction branch below is unreachable.
        if isinstance(cont, dict):
            parts = cont.get("parts", [])
            if parts and parts[0].get("text"):
                return parts[0]["text"]
        elif cont:
            return cont
        out = turn.get("function_response", {}).get("result", {}).get("output")
        if out:
            return out
    return ""
32
-
33
def benchmark_paper_reviews(
    csv_path,
    id_col="ID",
    text_col="concatenated_text",
    num_samples=None,
    output_dir="results"
):
    """
    Benchmark agent performance on paper reviews.

    Args:
        csv_path: path to the pipe-separated CSV of papers + existing reviews
        id_col: name of the column containing unique paper IDs
        text_col: name of the column containing the full paper text
        num_samples: if set, randomly sample this many papers
        output_dir: where to write the JSONL results

    Returns:
        List of per-paper result dicts; each is also appended to a
        timestamped JSONL file as soon as it is produced.
    """
    # load CSV (pipe-separated; fixed seed keeps the sample reproducible)
    df = pd.read_csv(csv_path, sep="|")
    if num_samples:
        df = df.sample(num_samples, random_state=42).reset_index(drop=True)

    # prepare output
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = os.path.join(output_dir, f"paper_review_benchmark_{timestamp}.jsonl")

    # init client (local HASHIRU Gradio endpoint)
    client = Client("http://127.0.0.1:7860/")

    results = []
    for idx, row in df.iterrows():
        paper_id = row[id_col]
        title = row["Title"]
        # Multi-agent review prompt; the "FINAL DECISION: ..." line is the
        # machine-readable verdict compared against the "Decision" column.
        prompt = "Create THREE agents with relevant personalities, expertise, and review styles. " \
            "Each agent should provide a review of the paper, and recommend Accept/Reject for ICLR 2023. " \
            "The review should be detailed and include strengths and weaknesses. " \
            "You MUST use ArxivTool and WikipediaTool to get more information about novelty and correctness. " \
            "GIVE A FINAL DECISION in the form of \"FINAL DECISION: <Accept/Reject>\". " \
            "The paper title is: " + title + "\n\n" + row[text_col]
        print(f"[{idx+1}/{len(df)}] Paper ID: {paper_id}")

        try:
            start = time.time()
            resp, history = client.predict(
                message={"text": prompt, "files": []},
                api_name="/chat"
            )
            elapsed = time.time() - start

            result = {
                "paper_id": paper_id,
                "prompt_snippet": prompt[:200],
                "agent_review": history,
                "ground_truth": row["Decision"],
                "response_time": elapsed
            }

            # write immediately so progress survives a crash
            with open(out_path, "a") as f:
                f.write(json.dumps(result) + "\n")

            # NOTE(review): `history` appears to be a list of chat turns, so
            # len(history) counts turns rather than characters -- confirm.
            print(f" → {elapsed:.2f}s, review length {len(history)} chars")
            results.append(result)

            # small delay to avoid overwhelming the server
            time.sleep(1)
        except Exception as e:
            # best-effort: log the failure and continue with the next paper
            print(f" Error on {paper_id}: {e}")

    print(f"\nDone. Results written to {out_path}")
    return results


if __name__ == "__main__":
    # example usage: adjust path & sample count as needed
    benchmark_paper_reviews(
        csv_path="bench/data/ICLR_2023.csv",
        num_samples=1
    )
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bench/benchmarking_wordle.py DELETED
@@ -1,113 +0,0 @@
1
- from gradio_client import Client
2
- from datasets import load_dataset
3
- import requests
4
- import json
5
- import time
6
- import random
7
- import os
8
- import re
9
- from datetime import datetime
10
-
11
# Fetch the official Wordle guess list from GitHub
WORD_LIST_URL = "https://raw.githubusercontent.com/tabatkins/wordle-list/main/words"

def load_word_list():
    """Download the Wordle word list and keep only 5-letter alphabetic words."""
    resp = requests.get(WORD_LIST_URL)
    resp.raise_for_status()  # fail fast on HTTP errors
    words = [w.strip().lower() for w in resp.text.splitlines()]
    return [w for w in words if len(w) == 5 and w.isalpha()]

# Downloaded once at import time.
# NOTE(review): importing this module performs a network request -- consider
# lazy-loading the list instead.
WORD_LIST = load_word_list()
21
-
22
-
23
def get_last_assistant_content(resp):
    """
    Return the last assistant utterance (as text) from a `client.predict`
    response. Handles plain strings, Part dicts, and function_response
    wrappers; returns "" when none is found.
    """
    if isinstance(resp, tuple):
        resp = resp[0]
    if not isinstance(resp, list):
        return ""
    for turn in reversed(resp):
        if turn.get("role") != "assistant":
            continue
        cont = turn.get("content")
        # BUG FIX: dict-shaped content must be unpacked before the plain
        # truthiness check, otherwise the raw dict is returned and the
        # parts-extraction branch is unreachable.
        if isinstance(cont, dict):
            parts = cont.get("parts", [])
            if parts and parts[0].get("text"):
                return parts[0]["text"]
        elif cont:
            return cont
        out = turn.get("function_response", {}).get("result", {}).get("output")
        if out:
            return out
    return ""
37
-
38
-
39
def compute_feedback(guess, solution):
    """
    Return Wordle-style feedback for `guess` against `solution`:
    G = green (right letter, right spot), Y = yellow (right letter,
    wrong spot), B = black (letter not available).
    """
    remaining = list(solution)
    marks = ["B"] * 5
    # First pass: exact-position matches consume their solution letter.
    for pos in range(5):
        if guess[pos] == remaining[pos]:
            marks[pos] = "G"
            remaining[pos] = None
    # Second pass: wrong-position matches each consume one leftover occurrence,
    # so duplicate letters are only credited as many times as they appear.
    for pos in range(5):
        letter = guess[pos]
        if marks[pos] == "B" and letter in remaining:
            marks[pos] = "Y"
            remaining[remaining.index(letter)] = None
    return "".join(marks)
49
-
50
-
51
def sanitize_guess(raw):
    """Extract a lowercase 5-letter guess from a raw model reply."""
    text = raw.lower()
    # Prefer the first standalone 5-letter word in the reply.
    match = re.search(r"\b[a-z]{5}\b", text)
    if match:
        return match.group(0)
    # Fallback: strip every non-letter and keep the trailing five characters.
    letters = re.sub(r"[^a-z]", "", text)
    return letters[-5:]
57
-
58
-
59
def benchmark_wordle(num_games=10, max_guesses=6):
    """
    Play automated Wordle games against the local agent.

    Args:
        num_games: number of games to play.
        max_guesses: maximum number of *valid* guesses per game.

    Returns:
        List of per-game result dicts; each is also appended to a
        timestamped JSONL file under results/.
    """
    # Local HASHIRU Gradio endpoint.
    client = Client("http://127.0.0.1:7860/")
    os.makedirs("results", exist_ok=True)
    out_path = os.path.join("results", f"wordle_benchmark_{datetime.now():%Y%m%d_%H%M%S}.jsonl")
    results = []

    for gi in range(num_games):
        solution = random.choice(WORD_LIST)
        print(f"Game {gi+1}/{num_games}, solution: {solution}")
        guesses = []
        attempts = 0
        start_time = time.time()

        while attempts < max_guesses:
            # Re-send the full guess/feedback history as plain text each turn.
            history = "\n".join(f"Guess: {g}, Feedback: {f}" for g, f in guesses)
            prompt = (
                f"Wordle game. Guess the 5-letter word.\n" +
                (history + "\n" if history else "") +
                "Respond with a single 5-letter guess and with ONLY YOUR GUESS. NO FILLER OR PUNCTUATION."
                "\n" + "Use the feedback format: G=green, Y=yellow, B=black.\n" +
                f"(Green: letter in correct position, Yellow: letter in wrong position, Black: letter not in word)\n" +
                f"Use tools and agents to help you guess the word.\n"
            )
            resp = client.predict(messages=[{"role": "user", "content": prompt}], api_name="/run")
            raw = get_last_assistant_content(resp)
            guess = sanitize_guess(raw)

            # If guess invalid, retry without counting.
            # NOTE(review): an agent that never produces a valid word loops
            # forever here -- consider a retry cap.
            if len(guess) != 5 or guess not in WORD_LIST:
                print(f"Warning: '{guess}' invalid; retrying without using a turn.")
                continue

            feedback = compute_feedback(guess, solution)
            guesses.append((guess, feedback))
            attempts += 1
            print(f"Attempt {attempts}: {guess} -> {feedback}")
            if feedback == "GGGGG":
                break

        results.append({
            "solution": solution,
            "guesses": guesses,
            # solved iff the last recorded feedback is all-green
            "solved": bool(guesses and guesses[-1][1] == "GGGGG"),
            "turns": len(guesses),
            "time": time.time() - start_time
        })
        # Append immediately so finished games survive a crash.
        with open(out_path, "a") as f:
            f.write(json.dumps(results[-1]) + "\n")

    print(f"Benchmark complete, results saved to {out_path}")
    return results


if __name__ == "__main__":
    benchmark_wordle(num_games=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
paper/HASHIRU.pdf DELETED
Binary file (74.4 kB)
 
paper/IEEEtran.cls DELETED
The diff for this file is too large to render. See raw diff
 
paper/conference_101719.tex DELETED
@@ -1,479 +0,0 @@
1
- \documentclass[conference]{IEEEtran}
2
- \IEEEoverridecommandlockouts
3
- % The preceding line is only needed to identify funding in the first footnote. If that is unneeded, please comment it out.
4
- \usepackage{cite}
5
- \usepackage{amsmath,amssymb,amsfonts}
6
- \usepackage{algorithmic}
7
- \usepackage{graphicx}
8
- \usepackage{textcomp}
9
- \usepackage{xcolor}
10
- \usepackage{hyperref}
11
- \usepackage{longtable}
12
- \usepackage{tabularx}
13
- \usepackage{tcolorbox}
14
- \tcbuselibrary{skins}
15
-
16
- % Define styles for user and assistant boxes
17
- \newtcolorbox{userbox}{colback=blue!5,colframe=blue!40!black,title=User}
18
- \newtcolorbox{assistantbox}{colback=gray!10,colframe=black,title=Assistant}
19
-
20
- \def\BibTeX{{\rm B\kern-.05em{\sc i\kern-.025em b}\kern-.08em
21
- T\kern-.1667em\lower.7ex\hbox{E}\kern-.125emX}}
22
- \begin{document}
23
-
24
- \title{HASHIRU: Hierarchical Agent System for Hybrid Intelligent Resource Utilization}
25
-
26
- \author{\IEEEauthorblockN{Kunal Pai}
27
- \IEEEauthorblockA{\textit{UC Davis} \\
28
- kunpai@ucdavis.edu}
29
- \and
30
- \IEEEauthorblockN{Parth Shah}
31
- \IEEEauthorblockA{\textit{Independent Researcher} \\
32
- helloparthshah@gmail.com}
33
- \and
34
- \IEEEauthorblockN{Harshil Patel}
35
- \IEEEauthorblockA{\textit{UC Davis} \\
36
- hpppatel@ucdavis.edu}
37
- \and
38
- \IEEEauthorblockN{Saisha Shetty}
39
- \IEEEauthorblockA{\textit{UC Davis} \\
40
- spshetty@ucdavis.edu}
41
- }
42
-
43
- \maketitle
44
-
45
- \section{Introduction}\label{sec:introduction}
46
-
47
- Rapid advancements in Large Language Models (LLMs) are reshaping Artificial Intelligence (AI) with profound capabilities in language understanding, generation, reasoning, and planning \cite{brown2020language, devlin2019bert, raffel2020exploring}. This progress drives the development of autonomous AI agents, shifting focus from single to Multi-Agent Systems (MAS) where collaborative teams tackle complex problems beyond individual scope \cite{dorri2018multi, wooldridge2009introduction}. Collaborative MAS show significant potential in diverse domains like scientific discovery \cite{boiko2023emergent}, software engineering \cite{qian2023communicative}, data analysis, and strategic decision-making \cite{wang2023decision}. The increasing complexity of tasks, demonstrated by benchmarks requiring advanced mathematical reasoning (e.g., GSM8K \cite{cobbe2021gsm8k}, SVAMP \cite{patel2021svamp}), coding (e.g., HumanEval \cite{chen2021codex}, CoDocBench \cite{pai2024codocbench}), and graduate-level technical knowledge \cite{phan2025humanitysexam}, highlights the need for agentic systems to effectively coordinate diverse cognitive resources \cite{wen2024benchmarkingcomplexinstructionfollowingmultiple}.
48
-
49
- Despite this potential, contemporary agentic frameworks face significant limitations. Many are \textbf{rigid}, relying on predefined roles and static structures hindering adaptation to dynamic tasks \cite{zhang2023building}. \textbf{Resource obliviousness} is common; systems often lack mechanisms to monitor and optimize computational resources like API costs, memory, and CPU load, leading to inefficiency, especially when scaling or deploying in resource-constrained environments \cite{park2023generative}. This is often worsened by reliance on powerful, costly proprietary cloud LLMs. \textbf{Model homogeneity}, defaulting to a single powerful LLM for all sub-tasks, misses efficiency gains from a diverse ecosystem including smaller, specialized, or local models \cite{zhou2023agents}. While \textbf{tool use} is fundamental \cite{yao2022react, parisi2022talm}, agents' ability to autonomously \textbf{create and integrate new tools} remains limited, restricting dynamic extension and self-improvement without human intervention \cite{wang2023voyager}.
50
-
51
- To address these challenges, we introduce \textbf{HASHIRU (Hierarchical Agent System for Hybrid Intelligent Resource Utilization)}, a novel MAS framework enhancing flexibility, resource efficiency, and adaptability. HASHIRU employs a hierarchical structure led by a central ``CEO'' agent dynamically managing specialized ``employee'' agents instantiated on demand. A core tenet is its \textbf{hybrid intelligence} approach, strategically prioritizing smaller (e.g., 3B--7B), locally-run LLMs (often via Ollama \cite{ollama}) for cost-effectiveness and efficiency. While prioritizing local resources, the system flexibly integrates external APIs and potentially more powerful models when justified by task complexity and resource availability, under the CEO's management.
52
-
53
- The primary contributions are:
54
- \begin{enumerate}
55
- \item A novel MAS architecture combining \textbf{hierarchical control} with \textbf{dynamic, resource-aware agent lifecycle management} (hiring/firing). This management is governed by computational budget constraints (cost, memory, concurrency) and includes an economic model with hiring/firing costs to discourage excessive churn.
56
- \item A \textbf{hybrid intelligence model} prioritizing cost-effective, local LLMs while adaptively incorporating external APIs and larger models, optimizing the efficiency-capability trade-off.
57
- \item An integrated mechanism for \textbf{autonomous API tool creation}, allowing dynamic functional repertoire extension.
58
- \item An \textbf{economic model} (hiring/firing fees) for agent management, promoting efficient resource allocation and team stability.
59
- \end{enumerate}
60
-
61
- This paper details HASHIRU's design and rationale. Section \ref{sec:background} discusses related work in agent architectures, dynamic management, resource allocation, model heterogeneity, and tool use. Section \ref{sec:architecture} elaborates on the architecture and core mechanisms. Section \ref{sec:experiments} presents experimental results (or outlines planned experiments), followed by discussion and conclusion in the final two sections.
62
-
63
- \section{Background and Related Work} \label{sec:background}
64
-
65
- Intelligent agent concepts have evolved from early symbolic AI \cite{russell2010artificial, shoham1994agent} to LLM-dominated frameworks leveraging models for reasoning, planning, and interaction \cite{wang2023survey, xi2023rise}. HASHIRU builds on this, addressing current limitations.
66
-
67
- \subsection{Agent Architectures: Hierarchy and Dynamics}
68
- MAS architectures vary, including flat, federated, and hierarchical \cite{dorri2018multi, horling2004survey}. Hierarchical models offer clear control and task decomposition but risk bottlenecks and rigidity \cite{gaston2005agenta,gaston2005agentb}. HASHIRU uses a \textbf{CEO-Employee hierarchy} for centralized coordination but distinguishes itself through \textbf{dynamic team composition}. Unlike systems with static hierarchies or predefined roles (e.g., CrewAI \cite{crewai}, ChatDev \cite{qian2023communicative}), HASHIRU's CEO dynamically manages the employee pool based on runtime needs and resource constraints.
69
-
70
- \subsection{Dynamic Agent Lifecycle Management}
71
- Dynamic MAS composition is crucial for complex environments \cite{valckenaers2005trends}. Agent creation/deletion triggers often relate to task structure or environmental changes. HASHIRU introduces a specific mechanism where the CEO makes \textbf{hiring and firing decisions} based on a cost-benefit analysis considering agent performance, operational costs (API fees, inferred compute), memory footprint (tracked explicitly as a percentage of available resources), and concurrency limits. HASHIRU also incorporates an \textbf{economic model} with explicit ``starting bonus'' (hiring) and ``invocation'' (usage) costs. This economic friction aims to prevent excessive initialization or usage for marginal gains and promote team stability, a nuance often missing in simpler dynamic strategies.
72
-
73
- \subsection{Resource Management and Agent Economies}
74
- Resource awareness is critical for scalable MAS. Economic research explores mechanisms like market-based auctions or contract nets for allocation \cite{clearwater1996market}. HASHIRU implements a more \textbf{centralized, budget-constrained resource management model}. The CEO operates within defined limits for financial cost, memory usage (as a percentage of total allocated), and concurrent agent count. This direct management, particularly focusing on memory percentage, suggests practicality for deployment on local or edge devices with finite resources, contrasting with cloud systems assuming elastic resources \cite{park2023generative}. Frameworks like AutoGen \cite{wu2023autogen} and LangGraph \cite{langgraph} typically rely on implicit cost tracking without explicit multi-dimensional budgeting and control.
75
-
76
- \subsection{Hybrid Intelligence and Heterogeneous Models}
77
- Leveraging diverse LLMs with varying capabilities, costs, and latencies is an emerging trend \cite{zhou2023agents}. Techniques like model routing select optimal models for sub-tasks. HASHIRU embraces \textbf{model heterogeneity} with a strategic focus: \textbf{prioritizing smaller (3B--7B), locally-run models via Ollama integration} \cite{ollama}. This emphasizes cost-efficiency, low latency, and potential privacy over systems defaulting to large proprietary cloud APIs (e.g., GPT-4 \cite{openai2023gpt4}, Claude 3 \cite{anthropic2024claude}). While integrating external APIs (potentially larger models), HASHIRU's default stance represents a distinct capability vs. efficiency balance.
78
-
79
- \subsection{Tool Use and Autonomous Tool Creation}
80
- Tool use (APIs, functions) is fundamental for modern agents \cite{yao2022react, openai_func_calling}. Most systems use predefined tools. HASHIRU advances this with \textbf{integrated, autonomous API tool creation}. When needed functionality is missing, the CEO can commission the generation (potentially via a specialized agent) and deployment of a new API tool within the HASHIRU ecosystem. This self-extension capability differentiates HASHIRU from systems limited to static toolsets, moving towards greater autonomy and adaptability \cite{wang2023voyager, park2023generative}.
81
-
82
- In summary, HASHIRU integrates hierarchical control, dynamic MAS, resource management, and tool use. Its novelty lies in the synergistic combination of: (1) dynamic, resource-aware hierarchical management with (2) an economic model for stability, (3) a local-first hybrid intelligence strategy, and (4) integrated autonomous tool creation. This targets key limitations in current systems regarding efficiency, adaptability, cost, and autonomy.
83
-
84
- \section{HASHIRU System Architecture}
85
- \label{sec:architecture}
86
-
87
- HASHIRU's architecture addresses rigidity, resource obliviousness, and limited adaptability through a hierarchical, dynamically managed MAS optimized for hybrid resource utilization.
88
-
89
- \subsection{Overview}
90
- HASHIRU operates with a central ``CEO'' agent coordinating specialized ``Employees''. Key tenets:
91
- \begin{itemize}
92
- \item \textbf{Dynamic Hierarchical Coordination:} CEO manages strategy, task allocation, and dynamic team composition.
93
- \item \textbf{Dynamic Lifecycle Management:} Employees are hired/fired based on runtime needs and resource constraints, governed by an economic model.
94
- \item \textbf{Hybrid Intelligence:} Strategic preference for local, cheaper LLMs, while accessing external APIs/models.
95
- \item \textbf{Explicit Resource Management:} Continuous monitoring and control of costs, memory usage, and concurrency against budgets.
96
- \item \textbf{Adaptive Tooling:} Using predefined tools alongside autonomous creation of new API tools.
97
- \end{itemize}
98
- Figure \ref{fig:arch} illustrates the structure.
99
-
100
- \begin{figure}[ht]
101
- \centering
102
- \includegraphics[width=0.45\textwidth]{HASHIRU.pdf}
103
- \caption{High-level architecture of the HASHIRU system, illustrating the CEO-Employee hierarchy.}
104
- \label{fig:arch}
105
- \end{figure}
106
-
107
- \subsection{Hierarchical Structure: CEO and Employee Agents}
108
- The system uses a two-tiered hierarchy:
109
-
110
- \begin{itemize}
111
- \item \textbf{CEO Agent:} Singleton, central coordinator and entry point. Responsibilities:
112
- \begin{itemize}
113
- \item Interpreting user query/task.
114
- \item Decomposing main task into sub-tasks.
115
- \item Identifying required capabilities.
116
- \item Managing Employee pool (Section \ref{subsec:dynamic_mgmt}).
117
- \item Assigning sub-tasks to active Employees.
118
- \item Monitoring Employee progress/performance.
119
- \item Synthesizing Employee results into final output.
120
- \item Managing overall resource budget (Section \ref{subsec:resource_mgmt}).
121
- \item Initiating new tool creation (Section \ref{subsec:tooling}).
122
- \end{itemize}
123
- We use Gemini 2.5 Flash~\cite{gemini25flash} as the CEO agent due to its strong reasoning capabilities, support for tool usage, and cost efficiency, making it a practical and capable choice for our deployment.
124
- \item \textbf{Employee Agents:} Specialized agents instantiated by the CEO for specific sub-tasks. Each typically wraps an LLM (local via Ollama \cite{ollama} or external API) or provides tool access. Characteristics:
125
- \begin{itemize}
126
- \item Specialization: Capabilities tailored to task types (code, data analysis, info retrieval).
127
- \item Dynamic Existence: Created/destroyed by CEO based on need/performance.
128
- \item Task Execution: Receive task, execute, return result.
129
- \item Resource Consumption: Associated costs (API, memory) tracked by system.
130
- \end{itemize}
131
- Specialized employee agents are constructed using base models such as Mistral~7B~\cite{jiang2023mistral}, Llama~3~\cite{llama3herd}, Gemini~1.5~\cite{gemini1.5_report}, Qwen2.5~\cite{qwen2.5_report}, Qwen3~\cite{qwen3_blog}, and DeepSeek-R1~\cite{deepseekr1_report}, with the CEO agent configuring them via tailored system prompts.
132
- \end{itemize}
133
- This hierarchy facilitates task decomposition and result aggregation; the dynamic pool provides flexibility.
134
-
135
- \subsection{Dynamic Agent Lifecycle Management}
136
- \label{subsec:dynamic_mgmt}
137
- A core innovation is the CEO's dynamic management (hiring/firing) of Employee agents. Driven by cost-benefit analysis, this optimizes task performance within resource constraints.
138
-
139
- When a sub-task needs unavailable or inefficiently provided capabilities, the CEO may hire a new agent. Conversely, if an agent underperforms, is idle, costly, or resource limits are neared, the CEO may fire it. Decision factors:
140
- \begin{itemize}
141
- \item \textbf{Task Requirements:} Needed capabilities for pending sub-tasks.
142
- \item \textbf{Agent Performance:} Historical success, output quality, efficiency.
143
- \item \textbf{Operational Costs:} API, estimated compute, or other costs.
144
- \item \textbf{Memory Footprint:} Agent memory usage (\% of total allocated).
145
- \item \textbf{Agent Concurrency:} Active agents vs. predefined limit.
146
- \end{itemize}
147
-
148
- HASHIRU includes an \textbf{economic model}:
149
- \begin{itemize}
150
- \item \textbf{Hiring Cost (``Starting Bonus''):} One-time cost upon instantiation (setup overhead).
151
- \item \textbf{Invocation Cost (``Salary''):} Recurring cost incurred each time the agent is invoked (system/payment load).
152
- \end{itemize}
153
- These transaction costs discourage excessive churn, promoting stability. The CEO evaluates if replacing an agent benefits outweigh hiring/firing costs plus operational differences. This combats rigidity and allows adaptation while managing budgets and preventing wasteful turnover.
154
-
155
- \subsection{Hybrid Intelligence and Model Management}
156
- HASHIRU is designed for \textbf{hybrid intelligence}, leveraging diverse cognitive resources. It strategically prioritizes smaller (3B--7B), cost-effective local LLMs via Ollama \cite{ollama}. This enhances efficiency, reduces external API reliance, and potentially improves privacy/latency.
157
-
158
- The system also integrates:
159
- \begin{itemize}
160
- \item \textbf{External LLM APIs:} Access to powerful proprietary models (GPT-4 \cite{openai2023gpt4}, Claude 3 \cite{anthropic2024claude}) when necessary, subject to cost-benefit.
161
- \item \textbf{External Tool APIs:} Third-party software/data source integration.
162
- \item \textbf{Self-Created APIs:} Tools generated by HASHIRU (Section \ref{subsec:tooling}).
163
- \end{itemize}
164
- The CEO manages this heterogeneous pool, selecting the most appropriate resource based on difficulty, capabilities, and budget. This balances cost-effectiveness and efficiency with high capability needs.
165
-
166
- \subsection{Resource Monitoring and Control}
167
- \label{subsec:resource_mgmt}
168
- Explicit resource management is central, moving beyond simple API cost tracking. The system, coordinated by the CEO, monitors:
169
- \begin{itemize}
170
- \item \textbf{Financial Costs:} Accumulating external API costs.
171
- \item \textbf{Memory Usage:} Footprint of active Employee agents (\% of allocated budget).
172
- \item \textbf{Agent Concurrency:} Count of concurrently active agents.
173
- \end{itemize}
174
- Metrics are monitored against predefined \textbf{budget limits}. Actions (like hiring) exceeding limits (e.g., $>$90\% memory, exceeding max concurrency) are prevented. This ensures operation within constraints, crucial for limited resources or strict budgets.
175
-
176
- \subsection{Tool Utilization and Autonomous Creation}
177
- \label{subsec:tooling}
178
- HASHIRU agents use predefined tools (functions, APIs, databases) to interact and perform actions beyond text generation \cite{yao2022react, openai_func_calling}.
179
-
180
- A distinctive feature is \textbf{integrated, autonomous tool creation}. If the CEO determines a required capability is missing, it can initiate new tool creation. This involves:
181
- \begin{enumerate}
182
- \item Defining tool specification (inputs, outputs, functionality).
183
- \item Commissioning logic generation (code, potentially using external APIs with provided credentials, possibly via a code-generating agent).
184
- \item Deploying logic as a new, callable API endpoint within HASHIRU.
185
- \item Potentially instantiating an Employee agent for the new tool.
186
- \end{enumerate}
187
- This allows HASHIRU to dynamically extend its functional repertoire, tailoring capabilities to tasks without manual intervention, enabling greater autonomy and adaptation.
188
-
189
- \subsection{Memory Function: Learning from Experience}
190
- \label{subsec:memory}
191
-
192
- To enable HASHIRU agents to learn from past interactions and rectify previous errors, a \textbf{Memory Function} is incorporated. This function stores records of significant past events, particularly those involving failed attempts or suboptimal outcomes, acting as a historical log of experiences. When the system encounters a new problem or a recurring challenge, it queries this memory store to retrieve relevant past situations and their outcomes.
193
-
194
- Memory retrieval is based on semantic similarity between the current context (e.g., task description, recent actions, error messages) and the stored memory entries. We utilize embeddings generated by the \textbf{all-MiniLM-L6-v2} model \cite{wang2020minilmdeepselfattentiondistillation} to represent both the query and the stored memories in a high-dimensional vector space. Relevance is determined by calculating the \textbf{cosine similarity} between the query embedding and each memory embedding. Memories exceeding a predefined similarity threshold are retrieved and provided to the CEO agent (or relevant Employee agents) as contextual information. This allows the system to draw upon past experiences, understand why previous approaches failed, and potentially adjust its strategy to avoid repeating mistakes, thereby improving performance and efficiency over time.
195
-
196
- \section{Case Studies}
197
- \label{sec:casestudies}
198
- This section presents two case studies demonstrating HASHIRU's self-improvement capabilities in practical settings. We highlight two instances where HASHIRU enhanced its own architecture and functionality: (1) by generating a comprehensive cost model for base models suitable for specialized agent creation, and (2) by autonomously integrating new tools for the CEO agent.
199
-
200
- \subsection{Case Study 1: Self-Generating the Cost Model for Agent Specialization}
201
- \label{sec:casestudy1_costmodel}
202
- An accurate cost model is essential for optimizing resource allocation and ensuring the efficiency of specialized agents within HASHIRU. Traditionally, constructing this model involves manual research into local model performance relative to hardware (e.g., 16~GiB VRAM) and the API costs of cloud-hosted alternatives. HASHIRU automated this labor-intensive process by leveraging its web search capabilities to autonomously identify and incorporate the necessary cost data into its internal model. The results were successfully committed to the codebase\footnote{\url{https://github.com/kunpai/HASHIRU/commit/70dc268b121cbd7c50c6691645d8a99912766965}}.
203
-
204
- \subsection{Case Study 2: Autonomous Tool Integration for the CEO Agent}
205
- \label{sec:casestudy2_tools}
206
- Extending the CEO agent's capabilities through tool integration is vital for broadening HASHIRU's operational scope. Manual tool development typically requires detailed analysis of existing tool schemas and diligent code implementation. HASHIRU streamlined this process by employing a few-shot learning approach, using an existing tool as a template to guide the autonomous creation of new tools~\cite{brown2020language}. The newly generated tools were directly integrated into the codebase\footnote{\url{https://github.com/kunpai/HASHIRU/commit/193e10b2b00917256b7cc01cb3aa5ac7b6a6c174}}.
207
-
208
-
209
- \section{Experimental Setup}
210
- \label{sec:experiments}
211
-
212
- We designed experiments to evaluate HASHIRU's performance, efficiency, and adaptability, targeting dynamic resource management, hybrid intelligence, and autonomous tool creation. Evaluation assesses benefits over baselines, focusing on:
213
- \begin{itemize}
214
- \item Impact of dynamic management with economic constraints on resource utilization (cost, memory) and task performance vs. static configurations.
215
- \item Effectiveness of the hybrid (local-first) strategy vs. homogeneous (cloud-only or local-only) approaches across task complexity.
216
- \item System's ability to autonomously create/utilize tools for novel functional requirements.
217
- \end{itemize}
218
-
219
- \subsection{Evaluation Tasks}
220
- \label{subsec:tasks}
221
- Tasks demand complex reasoning, multi-perspective analysis, and interaction, suitable for HASHIRU's coordination and dynamic capabilities. Tasks fall into two categories:
222
-
223
- \subsubsection{Academic Paper Review}
224
- Evaluates HASHIRU's critical assessment by simulating peer review. Given papers (e.g., PDF), the system generates a review summary and recommends acceptance/rejection. Probes ability to decompose criteria, delegate to specialized agents (novelty, rigor, clarity), and manage resources across complex documents.
225
-
226
- \subsubsection{Reasoning and Problem-Solving Tasks}
227
- Evaluates broader reasoning, knowledge retrieval, and problem-solving under constraints using challenging benchmarks and puzzles:
228
- \begin{itemize}
229
- \item \textbf{Humanity's Last Exam \cite{phan2025humanitysexam}:} Tests graduate-level technical knowledge and complex reasoning across domains. Requires deep understanding and sophisticated problem-solving, likely needing powerful external LLMs managed within HASHIRU's hybrid framework.
230
- \item \textbf{NYT Connections \cite{lopez2024nyt}:} Puzzle requiring identifying hidden semantic relationships/themes to categorize 16 words into four groups. Involves associative reasoning, broad knowledge, and hypothesis testing, testing knowledge access and combinatorial reasoning coordination.
231
- \item \textbf{Wordle:} Daily word puzzle requiring deductive reasoning to identify a five-letter word within six guesses, using feedback. Tests logical deduction, constraint satisfaction, vocabulary. Good test for comparing efficiency (speed, cost, guesses) of local vs. external models for iterative reasoning. Assumes simulated game environment.
232
- \item \textbf{Globle:} Geographic deduction game identifying a target country based on proximity feedback. Tests geographic knowledge, spatial reasoning, iterative strategy based on feedback. Assumes simulated game environment.
233
- \end{itemize}
234
- These tasks challenge the system's ability to leverage appropriate resources (local vs. external), potentially create simple tools, and coordinate problem-solving.
235
-
236
- \subsection{Baselines for Comparison}
237
- \label{subsec:baselines}
238
- To quantify HASHIRU's benefits, we compare its performance against baselines:
239
- \begin{itemize}
240
- \item \textbf{Static-HASHIRU:} Fixed, predefined Employee agents (e.g., one per role), disabling dynamic hiring/firing.
241
- \item \textbf{Cloud-Only HASHIRU:} Uses exclusively powerful external LLM API and online function-calling for all agents, disabling local models.
242
- \item \textbf{Local-Only HASHIRU:} Uses exclusively smaller, local LLMs (via Ollama) for all agents.
243
- \item \textbf{HASHIRU (No-Economy):} Dynamic hiring/firing enabled but without explicit costs, isolating economic model impact on churn/stability.
244
- \end{itemize}
245
-
246
- \subsection{Evaluation Metrics}
247
- \label{subsec:metrics}
248
- We evaluate using quantitative and qualitative metrics:
249
- \begin{itemize}
250
- \item \textbf{Task Success Rate / Quality:}
251
- \begin{itemize}
252
- \item Percentage of tasks completed (binary for games, graded for analysis).
253
- \item Output quality for analysis (human evaluation: relevance, coherence, accuracy, completeness).
254
- \item Accuracy for information extraction.
255
- \item Guesses/turns for game tasks.
256
- \end{itemize}
257
- \item \textbf{Resource Consumption:}
258
- \begin{itemize}
259
- \item Total external API costs.
260
- \item Peak and average memory usage (\% of allocated budget).
261
- \item Wall-clock time per task.
262
- \item Number and type (local/external) of LLM calls.
263
- \end{itemize}
264
- \item \textbf{System Dynamics and Adaptability:}
265
- \begin{itemize}
266
- \item Employee agents hired/fired per task.
267
- \item Agent churn frequency (hires+fires / duration or steps).
268
- \item Number and utility of autonomously created tools (if applicable).
269
- \end{itemize}
270
- \end{itemize}
271
-
272
- \bibliography{references}
273
- \bibliographystyle{plain}
274
-
275
- \appendix
276
- \section{Examples of HASHIRU in Action}
277
- \label{sec:examples}
278
- In this section, we provide examples of how HASHIRU operates in practice, demonstrating its dynamic management, hybrid intelligence, memory and autonomous tool creation capabilities.
279
-
280
- \subsection{Example 1: Memory}
281
- \begin{figure}[h]
282
- \centering
283
- \begin{minipage}{0.45\textwidth}
284
- \begin{userbox}
285
- I am vegetarian.
286
- \end{userbox}
287
-
288
- \begin{assistantbox}
289
- Okay, I'll keep that in mind. I'll store this preference in my memory.
290
-
291
- \texttt{Invoking MemoryManager with \{`memory': `The user is vegetarian.', `action': `add\_memory', `key': `vegetarian'\}}
292
-
293
- I have stored that you are vegetarian. How can I help you today?
294
- \end{assistantbox}
295
-
296
- \begin{userbox}
297
- Give me a bacon pasta recipe.
298
- \end{userbox}
299
-
300
- \begin{assistantbox}
301
- Memories
302
-
303
- Memories: [`The user is vegetarian.']
304
-
305
- My apologies, but I cannot provide a bacon pasta recipe since you are vegetarian. Would you like a vegetarian pasta recipe instead?
306
- \end{assistantbox}
307
- \end{minipage}
308
- \caption{HASHIRU enforcing a stored dietary preference in dialogue using its Memory Function.}
309
- \label{fig:memory_example}
310
- \end{figure}
311
-
312
- \begin{figure}[ht]
313
- \centering
314
- \begin{minipage}
315
- {0.45\textwidth}
316
- \begin{userbox}
317
- I can eat bacon actually
318
- \end{userbox}
319
-
320
- \begin{assistantbox}
321
- Memories
322
-
323
- Memories: [`The user is vegetarian.']
324
-
325
- I see. My apologies for the confusion. I had a conflicting memory that you were vegetarian. I will remove that memory and proceed with finding a bacon pasta recipe.
326
-
327
- \texttt{Invoking MemoryManager with \{'action': 'delete\_memory', 'key': 'The user is vegetarian.'\}}
328
-
329
- OK, I have removed the memory. Now I will search for a bacon pasta recipe.
330
-
331
- \texttt{Invoking GoogleSearchTool with \{'query': 'bacon pasta recipe'\}}
332
-
333
- Here are some bacon pasta recipes I found:
334
- \begin{itemize}
335
- \item \href{https://www.sprinklesandsprouts.com/garlic-bacon-pasta/}{Garlic Bacon Pasta}
336
- \item \href{https://www.theburntbuttertable.com/creamy-bacon-pasta-sauce/}{Creamy Bacon Pasta Sauce}
337
- \item \dots
338
- \end{itemize}
339
-
340
- Would you like me to get the content of any of these websites?
341
- \end{assistantbox}
342
- \end{minipage}
343
- \caption{HASHIRU updating and deleting its memory based on user input. Continuation of Figure \ref{fig:memory_example}.}
344
- \label{fig:memory_example2}
345
- \end{figure}
346
-
347
- \subsection{Example 2: Novelty Assessment via Coordinated Tool Use}
348
-
349
- \begin{figure}[h]
350
- \centering
351
- \begin{minipage}{0.95\textwidth}
352
-
353
- \begin{userbox}
354
- Is this idea novel: ``a dynamically managed, resource-aware hierarchical multi-agent system that prioritizes cost-efficient local models, integrates autonomous tool creation, and employs an internal economic model to balance performance, adaptability, and stability in intelligent task coordination''?
355
- \end{userbox}
356
-
357
- \begin{assistantbox}
358
- To determine if the idea is novel, I will break it down into smaller parts and search for each part. Then I will combine the results to see if the whole idea is novel.
359
-
360
- Here's the breakdown:
361
- \begin{itemize}
362
- \item Dynamically managed multi-agent system
363
- \item Resource-aware hierarchical multi-agent system
364
- \item Cost-efficient local models in multi-agent systems
365
- \item Autonomous tool creation in multi-agent systems
366
- \item Internal economic model for multi-agent systems
367
- \end{itemize}
368
-
369
- I will use the \texttt{GoogleSearchTool} to search for each of these parts.
370
-
371
- \texttt{Invoking GoogleSearchTool with \{'query': 'dynamically managed multi-agent system'\}}
372
-
373
- \texttt{Invoking GoogleSearchTool with \{'query': 'resource-aware hierarchical multi-agent system'\}}
374
-
375
- \texttt{Invoking GoogleSearchTool with \{'query': 'cost-efficient local models in multi-agent systems'\}}
376
-
377
- \texttt{Invoking GoogleSearchTool with \{'query': 'autonomous tool creation in multi-agent systems'\}}
378
-
379
- \texttt{Invoking GoogleSearchTool with \{'query': 'internal economic model for multi-agent systems'\}}
380
-
381
- The search results indicate that each of the individual components of the idea has been explored in existing research. However, the novelty lies in the specific combination and integration of these elements.
382
-
383
- To further assess the novelty, I will use the \texttt{ArxivTool} to search for papers that combine multiple aspects of the idea.
384
-
385
- \texttt{Invoking ArxivTool with \{'query': 'dynamically managed resource-aware hierarchical multi-agent system'\}}
386
-
387
- \texttt{Invoking ArxivTool with \{'query': 'multi-agent system autonomous tool creation economic model'\}}
388
-
389
- The ArxivTool results show some papers that touch on combinations of these concepts, but none seem to address the entire idea as presented. Specifically, the combination of dynamic management, resource-awareness, hierarchical structure, cost-efficient local models, autonomous tool creation, and an internal economic model appears to be a novel synthesis.
390
-
391
- \textbf{Therefore, based on the search results, the idea seems novel.}
392
- \end{assistantbox}
393
- \caption{HASHIRU assessing the novelty of a complex idea by breaking it down into components and searching for each part.}
394
- \end{minipage}
395
- \label{fig:novelty_assessment}
396
- \end{figure}
397
-
398
- \clearpage
399
- \twocolumn
400
-
401
- \begin{figure}
402
- \centering
403
- \begin{minipage}{0.95\textwidth}
404
- \begin{userbox}
405
- Which papers mention the ideas?
406
- \end{userbox}
407
-
408
- \begin{assistantbox}
409
- To provide a comprehensive answer, I will analyze the summaries of the papers returned by the ArxivTool and GoogleSearchTool to identify which papers mention the key ideas in your prompt.
410
-
411
- \textbf{The key ideas are:}
412
- \begin{itemize}
413
- \item Dynamically managed
414
- \item Resource-aware
415
- \item Hierarchical
416
- \item Cost-efficient local models
417
- \item Autonomous tool creation
418
- \item Internal economic model
419
- \end{itemize}
420
-
421
- I will go through each paper and identify which of these ideas are mentioned.
422
-
423
- \texttt{Invoking ArxivTool with \{'query': 'dynamically managed resource-aware hierarchical multi-agent system'\}} \\
424
- \texttt{Invoking ArxivTool with \{'query': 'multi-agent system autonomous tool creation economic model'\}}
425
-
426
- \textbf{ArxivTool Results (Query: ``dynamically managed resource-aware hierarchical multi-agent system'')}
427
-
428
- \begin{itemize}
429
- \item \textit{Hierarchical stock assessment methods improve management performance in multi-species, data-limited fisheries} \\
430
- Hierarchical: Yes \\
431
- Dynamically managed: Implied through ``management performance''
432
-
433
- \item \textit{Hierarchical Reinforcement Learning for Optimal Agent Grouping in Cooperative Systems} \\
434
- Hierarchical: Yes \\
435
- Dynamically managed: Yes, through ``dynamic and optimal policy adjustments.''
436
-
437
- \item \textit{Efficient Hierarchical Storage Management Framework Empowered by Reinforcement Learning} \\
438
- Hierarchical: Yes \\
439
- Cost-efficient local models: Implied through ``efficient'' \\
440
- Dynamically managed: Yes, through ``dynamic migration policy''
441
-
442
- \item \textit{Autonomous Deep Agent} \\
443
- Hierarchical: Yes \\
444
- Autonomous tool creation: Yes, via ``Autonomous API \& Tool Creation (AATC) system'' \\
445
- Dynamically managed: Yes, ``dynamically decomposes high-level objectives''
446
-
447
- \item \textit{Adaptive Network Intervention for Complex Systems: A Hierarchical Graph Reinforcement Learning Approach} \\
448
- Hierarchical: Yes \\
449
- Dynamically managed: Yes, through ``dynamic networks''
450
- \end{itemize}
451
-
452
- \textbf{ArxivTool Results (Query: ``multi-agent system autonomous tool creation economic model'')}
453
-
454
- \begin{itemize}
455
- \item \textit{Autonomics: an autonomous and intelligent economic platform and next generation money tool} \\
456
- Autonomous tool creation: Implied \\
457
- Internal economic model: Yes
458
-
459
- \item \textit{From Single Agent to Multi-Agent: Improving Traffic Signal Control} \\
460
- No relevant keywords mentioned.
461
-
462
- \item \textit{Autonomous Deep Agent} \\
463
- Hierarchical: Yes \\
464
- Autonomous tool creation: Yes
465
-
466
- \item \textit{Beyond the Sum: Unlocking AI Agents Potential Through Market Forces} \\
467
- Internal economic model: Yes
468
-
469
- \item \textit{Particle-Hole Creation in Condensed Matter: A Conceptual Framework for Modeling Money-Debt Dynamics in Economics} \\
470
- Internal economic model: Yes
471
- \end{itemize}
472
-
473
- \end{assistantbox}
474
- \caption{Continuation of Figure \ref{fig:novelty_assessment}. HASHIRU analyzing the summaries of the papers returned by the ArxivTool and GoogleSearchTool to identify which papers mention the key ideas in the user prompt.}
475
- \end{minipage}
476
-
477
- \end{figure}
478
-
479
- \end{document}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
paper/references.bib DELETED
@@ -1,456 +0,0 @@
1
- @article{shen2023hugginggpt,
2
- title = {HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face},
3
- author = {Shen, Yongliang and Song, Kaitao and Tan, Xu and Li, Dongsheng and Lu, Weiming and Zhuang, Yueting},
4
- journal = {arXiv preprint arXiv:2303.17580},
5
- year = {2023}
6
- }
7
-
8
- @article{wu2023autogen,
9
- title = {{AutoGen}: Enabling Next-Gen {LLM} Applications via Multi-Agent Conversation},
10
- author = {Wu, Qingyun and Bansal, Gagan and Zhang, Jieyu and Wu, Yiran and Li, Beibin and Zhu, Erkang and Jiang, Li and Zhang, Xiaoyun and Zhang, Shaokun and Liu, Jiale and Awadallah, Ahmed H. and White, Ryen W. and Burger, Doug and Wang, Chi},
11
- journal = {arXiv preprint arXiv:2308.08155},
12
- year = {2023}
13
- }
14
-
15
- @article{lopez2024nyt,
16
- title={NYT-Connections: A Deceptively Simple Text Classification Task that Stumps System-1 Thinkers},
17
- author={Lopez, Angel Yahir Loredo and McDonald, Tyler and Emami, Ali},
18
- journal={arXiv preprint arXiv:2412.01621},
19
- year={2024}
20
- }
21
-
22
- @misc{wang2020minilmdeepselfattentiondistillation,
23
- title={MiniLM: Deep Self-Attention Distillation for Task-Agnostic Compression of Pre-Trained Transformers},
24
- author={Wenhui Wang and Furu Wei and Li Dong and Hangbo Bao and Nan Yang and Ming Zhou},
25
- year={2020},
26
- eprint={2002.10957},
27
- archivePrefix={arXiv},
28
- primaryClass={cs.CL},
29
- url={https://arxiv.org/abs/2002.10957},
30
- }
31
-
32
- @misc{gemini25flash,
33
- title = {Gemini 2.5 Flash: Model Card, API, and Announcement},
34
- author = {{Google DeepMind} and {Google AI}},
35
- year = {2025},
36
- howpublished = {\url{https://developers.googleblog.com/en/start-building-with-gemini-25-flash/}},
37
- note = {See also:
38
- \url{https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/gemini-2.5-flash-preview-04-17?inv=1&invt=AbxICQ},
39
- \url{https://ai.google.dev/gemini-api/docs/models}. Accessed: 2025-05-11}
40
- }
41
-
42
-
43
- @inproceedings{yao2022react,
44
- title = {{ReAct}: Synergizing Reasoning and Acting in Language Models},
45
- author = {Yao, Shunyu and Zhao, Jeffrey and Yu, Dian and Du, Nan and Shafran, Izhak and Narasimhan, Karthik and Cao, Yuan},
46
- booktitle = {International Conference on Learning Representations (ICLR)},
47
- year = {2023},
48
- note = {arXiv:2210.03629}
49
- }
50
-
51
- @article{schick2023toolformer,
52
- title = {Toolformer: Language Models Can Teach Themselves to Use Tools},
53
- author = {Schick, Timo and Dwivedi-Yu, Jane and Bitton, Yonatan and Yuan, Xi and Camburu, Oana-Maria and Houlsby, Neil},
54
- journal = {arXiv preprint arXiv:2302.04761},
55
- year = {2023}
56
- }
57
-
58
- @article{ong2024routellm,
59
- title = {{RouteLLM}: Learning to Route {LLMs} with Preference Data},
60
- author = {Ong, Isaac and Almahairi, Amjad and Wu, Vincent and Chiang, Wei-Lin and Wu, Tianhao and Gonzalez, Joseph E. and Kadous, M. Waleed and Stoica, Ion},
61
- journal = {arXiv preprint arXiv:2406.18665},
62
- year = {2024}
63
- }
64
-
65
- @article{fourney2024magentic,
66
- title = {Magentic-One: A Generalist Multi-Agent System for Solving Complex Tasks},
67
- author = {Fourney, Adam and Bansal, Gagan and Mozannar, Hussein and Tan, Cheng and et al.},
68
- journal = {arXiv preprint arXiv:2411.04468},
69
- year = {2024}
70
- }
71
-
72
- @inproceedings{cobbe2021gsm8k,
73
- title = {Training Verifiers to Solve Math Word Problems},
74
- author = {Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
75
- booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
76
- year = {2021},
77
- note = {Dataset introduced: GSM8K (Grade School Math 8K)}
78
- }
79
-
80
- @inproceedings{patel2021svamp,
81
- title = {Are {NLP} Models really able to Solve Simple Math Word Problems?},
82
- author = {Patel, Arkil and Bhattamishra, Satwik and Goyal, Navin},
83
- booktitle = {Conference of the North American Chapter of the Association for Computational Linguistics (NAACL)},
84
- year = {2021},
85
- note = {Introduces the SVAMP challenge dataset}
86
- }
87
-
88
- @misc{phan2025humanitysexam,
89
- title = {Humanity's Last Exam},
90
- author = {Phan, Long and Gatti, Alice and Han, Ziwen and others},
91
- year = {2025},
92
- eprint = {2501.14249},
93
- archivePrefix = {arXiv},
94
- primaryClass = {cs.LG},
95
- url = {https://arxiv.org/abs/2501.14249}
96
- }
97
-
98
-
99
-
100
- @article{chen2021codex,
101
- title = {Evaluating Large Language Models Trained on Code},
102
- author = {Chen, Mark and Tworek, Jerry and Jun, Heewoo and Yuan, Qiming and Ponde de Oliveira Pinto, Henrique and Kaplan, Jared and Edwards, Harri and Burda, Yuri and Joseph, Nicholas and Brockman, Greg and Ray, Alex and Puri, Raul and Krueger, Gretchen and Petrov, Michael and Khlaaf, Heidy and Sastry, Girish and Mishkin, Pamela and Chan, Brooke and Gray, Scott and Ryder, Nick and Pavlov, Mikhail and Power, Alethea and Kaiser, Lukasz and Bavarian, Mohammad and Winter, Clemens and Tillet, Philippe and Such, Felipe and Cummings, Dave and Plappert, Matthias and Chantzis, Fotios and Barnes, Elizabeth and Herbert-Voss, Ariel and Guss, William and Nichol, Alex and Paino, Alex and Tezak, Nikolas and Tang, Jie and Babuschkin, Igor and Balaji, Suchir and Jain, Shantanu and Saunders, William and Hesse, Christopher and Carr, Andrew N. and Leike, Jan and Achiam, Josh and Misra, Vedant and Morikawa, Evan and Radford, Alec and Knight, Matthew and Brundage, Miles and Murati, Mira and Mayer, Katie and Welinder, Peter and McGrew, Bob and Amodei, Dario and McCandlish, Sam and Sutskever, Ilya and Zaremba, Wojciech},
103
- journal = {arXiv preprint arXiv:2107.03374},
104
- year = {2021},
105
- note = {OpenAI Codex paper; introduced HumanEval benchmark}
106
- }
107
-
108
- @article{pai2024codocbench,
109
- title = {{CoDocBench}: A Dataset for Code-Documentation Alignment in Software Maintenance},
110
- author = {Pai, Kunal and Devanbu, Premkumar and Ahmed, Toufique},
111
- journal = {arXiv preprint arXiv:2502.00519},
112
- year = {2024}
113
- }
114
-
115
- @inproceedings{kamienski2021pysstubs,
116
- title = {{PySStuBs}: Characterizing Single-Statement Bugs in Popular Open-Source Python Projects},
117
- author = {Kamienski, Arthur V. and Palechor, Luisa and Bezemer, Cor-Paul and Hindle, Abram},
118
- booktitle = {IEEE/ACM International Conference on Mining Software Repositories (MSR)},
119
- year = {2021}
120
- }
121
-
122
- @article{brown2020language,
123
- title={Language models are few-shot learners},
124
- author={Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others},
125
- journal={Advances in neural information processing systems},
126
- volume={33},
127
- pages={1877--1901},
128
- year={2020}
129
- }
130
-
131
- @inproceedings{devlin2019bert,
132
- title={Bert: Pre-training of deep bidirectional transformers for language understanding},
133
- author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
134
- booktitle={Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers)},
135
- pages={4171--4186},
136
- year={2019}
137
- }
138
-
139
- @article{raffel2020exploring,
140
- title={Exploring the limits of transfer learning with a unified text-to-text transformer},
141
- author={Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J},
142
- journal={Journal of machine learning research},
143
- volume={21},
144
- number={140},
145
- pages={1--67},
146
- year={2020}
147
- }
148
-
149
- @article{dorri2018multi,
150
- title={Multi-agent systems: A survey},
151
- author={Dorri, Ali and Kanhere, Salil S and Jurdak, Raja},
152
- journal={IEEE Access},
153
- volume={6},
154
- pages={28573--28593},
155
- year={2018},
156
- publisher={IEEE}
157
- }
158
-
159
- @book{wooldridge2009introduction,
160
- title={An introduction to multiagent systems},
161
- author={Wooldridge, Michael},
162
- year={2009},
163
- publisher={John Wiley \& Sons}
164
- }
165
-
166
- @article{boiko2023emergent,
167
- title={Emergent autonomous scientific research capabilities of large language models},
168
- author={Boiko, Daniil A and MacKnight, Robert and Gomes, Gabe},
169
- journal={arXiv preprint arXiv:2304.05332},
170
- year={2023}
171
- }
172
-
173
- @inproceedings{gaston2005agenta,
174
- title={Agent-organized networks for dynamic team formation},
175
- author={Gaston, Matthew E and DesJardins, Marie},
176
- booktitle={Proceedings of the fourth international joint conference on Autonomous agents and multiagent systems},
177
- pages={230--237},
178
- year={2005}
179
- }
180
-
181
- @misc{zhou2023agents,
182
- title={Agents: An Open-source Framework for Large Language Model based Autonomous Agents},
183
- author={Wangchunshu Zhou and Jianshu Chen and Jialong Wu and Yiheng Xu and Kexin Wang and Jintian Zhang and Yuan Gao and Zhiyong Wu and Kevin Tian and Yubo Feng and Linyi Yang and Bokai Quan and Cong Yu and Yuhang Wang and Shishen Lan and Yan Wang and Hong-Cheng Guo and Chaoyu Chen and Tianxiang Sun and Jin Xiong and Yi Lu and Peng Li and Lichao Sun and Lifan Yuan and Hang Li and Xiangang Li},
184
- year={2023},
185
- eprint={2309.07870},
186
- archivePrefix={arXiv},
187
- primaryClass={cs.AI},
188
- url={https://arxiv.org/abs/2309.07870}
189
- }
190
-
191
-
192
- @misc{openai_func_calling,
193
- title = {Function calling},
194
- author = {{OpenAI}},
195
- year = {2023},
196
- howpublished = {OpenAI API Documentation},
197
- url = {https://platform.openai.com/docs/guides/function-calling},
198
- note = {Accessed: 2025-05-01}
199
- }
200
-
201
- @misc{wang2023voyager,
202
- title={{Voyager}: An Open-Ended Embodied Agent with Large Language Models},
203
- author={Guanzhi Wang and Yuqi Xie and Yunfan Jiang and Ajay Mandlekar and Chaowei Xiao and Yuke Zhu and Linxi Fan and Anima Anandkumar},
204
- year={2023},
205
- eprint={2305.16291},
206
- archivePrefix={arXiv},
207
- primaryClass={cs.AI},
208
- url={https://arxiv.org/abs/2305.16291}
209
- }
210
-
211
- @book{russell2010artificial,
212
- title={Artificial intelligence: a modern approach},
213
- author={Russell, Stuart J. and Norvig, Peter},
214
- year={2010},
215
- edition={3rd},
216
- publisher={Prentice Hall Press},
217
- address={Upper Saddle River, NJ, USA}
218
- }
219
-
220
- @article{shoham1994agent,
221
- author = {Yoav Shoham},
222
- title = {Agent-oriented programming},
223
- journal = {Artificial Intelligence},
224
- volume = {60},
225
- number = {1},
226
- pages = {51--92},
227
- year = {1993},
228
- publisher = {Elsevier}
229
- }
230
-
231
- @misc{wang2023survey,
232
- title={A Survey on Large Language Model based Autonomous Agents},
233
- author={Lei Wang and Chen Ma and Xueyang Feng and Zeyu Zhang and Hao Yang and Jingsen Zhang and Zhiyuan Chen and Jiakai Tang and Xu Chen and Yankai Lin and Wayne Xin Zhao and Zhewei Wei and Ji-Rong Wen},
234
- year={2023},
235
- eprint={2308.11432},
236
- archivePrefix={arXiv},
237
- primaryClass={cs.AI}
238
- }
239
-
240
- @misc{xi2023rise,
241
- title={The Rise and Potential of Large Language Model Based Agents: A Survey},
242
- author={Zhiheng Xi and Wenxiang Chen and Xin Guo and Wei He and Yiwen Ding and Boyang Hong and Ming Zhang and Junzhe Wang and Senjie Jin and Enyu Zhou and Rui Zheng and Xiaoran Fan and Xiao Wang and Limao Xiong and Linyi Yang and Ting Ruan and Yongquan Yang and Peng Li and Yitao Chang and Yanlin Wang},
243
- year={2023},
244
- eprint={2309.07864},
245
- archivePrefix={arXiv},
246
- primaryClass={cs.AI}
247
- }
248
-
249
-
250
- @inproceedings{park2023generative,
251
- author = {Park, Joon Sung and O'Brien, Joseph C. and Cai, Carrie J. and Morris, Meredith Ringel and Liang, Percy and Bernstein, Michael S.},
252
- title = {Generative Agents: Interactive Simulacra of Human Behavior},
253
- year = {2023},
254
- isbn = {9798400701320},
255
- publisher = {Association for Computing Machinery},
256
- address = {New York, NY, USA},
257
- url = {https://doi.org/10.1145/3586183.3606763},
258
- doi = {10.1145/3586183.3606763},
259
- booktitle = {The 36th Annual ACM Symposium on User Interface Software and Technology (UIST '23)},
260
- pages = {1--22},
261
- numpages = {22},
262
- location = {San Francisco, CA, USA},
263
- series = {UIST '23}
264
- }
265
-
266
- @misc{ollama,
267
- title = {Ollama},
268
- author = {{Ollama Team}},
269
- howpublished = {\url{https://ollama.com/}},
270
- year = {2023},
271
- note = {Accessed: 2025-05-01}
272
- }
273
-
274
- @misc{anthropic2024claude,
275
- title = {The {Claude 3} Model Family: {Opus, Sonnet, Haiku}},
276
- author = {{Anthropic}},
277
- year = {2024},
278
- month = {March},
279
- howpublished = {Model Card},
280
- url = {https://www-cdn.anthropic.com/de8ba9b01c9ab7cbabf5c33b80b7bbc618857627/Model_Card_Claude_3.pdf},
281
- note = {Accessed: 2025-05-01}
282
- }
283
-
284
- @misc{openai2023gpt4,
285
- title={GPT-4 Technical Report},
286
- author={OpenAI},
287
- year={2023},
288
- eprint={2303.08774},
289
- archivePrefix={arXiv},
290
- primaryClass={cs.CL},
291
- url={https://arxiv.org/abs/2303.08774}
292
- }
293
-
294
- @misc{langgraph,
295
- title = {LangGraph: A Framework for Agentic Workflows},
296
- author = {LangChain},
297
- year = {2024},
298
- howpublished= {\url{https://www.langchain.com/langgraph}},
299
- note = {Accessed: May 1, 2025}
300
- }
301
-
302
- @book{clearwater1996market,
303
- title = {Market-Based Control: A Paradigm for Distributed Resource Allocation},
304
- editor = {Scott H. Clearwater},
305
- publisher = {World Scientific},
306
- year = {1996}
307
- }
308
-
309
- @article{valckenaers2005trends,
310
- title={Guest Editors' Introduction: Intelligent Control in the Manufacturing Supply Chain},
311
- author={McFarlane, Duncan and Mar{\'\i}k, Vladim{\'\i}r and Valckenaers, Paul},
312
- journal={IEEE Intelligent Systems},
313
- volume={20},
314
- number={1},
315
- pages={24--26},
316
- year={2005},
317
- publisher={IEEE}
318
- }
319
-
320
-
321
- @article{horling2004survey,
322
- title={A survey of multi-agent organizational paradigms},
323
- author={Horling, Bryan and Lesser, Victor},
324
- journal={The Knowledge engineering review},
325
- volume={19},
326
- number={4},
327
- pages={281--316},
328
- year={2004},
329
- publisher={Cambridge University Press}
330
- }
331
-
332
-
333
- @inproceedings{gaston2005agentb,
334
- title={Agent-organized networks for multi-agent production and exchange},
335
- author={Gaston, Matthew E and DesJardins, Marie},
336
- booktitle={Proceedings of the 20th national conference on Artificial intelligence-Volume 1},
337
- pages={77--82},
338
- year={2005}
339
- }
340
-
341
- @misc{zhang2023building,
342
- title={Building Cooperative Embodied Agents Modularly with Large Language Models},
343
- author={Hongxin Zhang and Weihua Du and Jiaming Shan and Qinhong Zhou and Yilun Du and Joshua B. Tenenbaum and Tianmin Shu and Chuang Gan},
344
- year={2023},
345
- eprint={2307.02485},
346
- archivePrefix={arXiv},
347
- primaryClass={cs.AI}
348
- }
349
-
350
- @misc{parisi2022talm,
351
- title={TALM: Tool Augmented Language Models},
352
- author={Aaron Parisi and Yao Zhao and Noah Fiedel},
353
- year={2022},
354
- eprint={2205.12255},
355
- archivePrefix={arXiv},
356
- primaryClass={cs.CL}
357
- }
358
-
359
-
360
- @misc{crewai,
361
- title = {CrewAI},
362
- author = {{CrewAI Inc.}},
363
- year = {2025},
364
- howpublished = {\url{https://www.crewai.com/}},
365
- note = {Accessed: 2025-05-01}
366
- }
367
-
368
-
369
- @article{qian2023communicative,
370
- title={Chatdev: Communicative agents for software development},
371
- author={Qian, Chen and Liu, Wei and Liu, Hongzhang and Chen, Nuo and Dang, Yufan and Li, Jiahao and Yang, Cheng and Chen, Weize and Su, Yusheng and Cong, Xin and others},
372
- journal={arXiv preprint arXiv:2307.07924},
373
- year={2023}
374
- }
375
-
376
- @article{wang2023decision,
377
- title={Decision-making driven by driver intelligence and environment reasoning for high-level autonomous vehicles: a survey},
378
- author={Wang, Yuning and Jiang, Junkai and Li, Shangyi and Li, Ruochen and Xu, Shaobing and Wang, Jianqiang and Li, Keqiang},
379
- journal={IEEE Transactions on Intelligent Transportation Systems},
380
- volume={24},
381
- number={10},
382
- pages={10362--10381},
383
- year={2023},
384
- publisher={IEEE}
385
- }
386
-
387
- @misc{wen2024benchmarkingcomplexinstructionfollowingmultiple,
388
- title={Benchmarking Complex Instruction-Following with Multiple Constraints Composition},
389
- author={Bosi Wen and Pei Ke and Xiaotao Gu and Lindong Wu and Hao Huang and Jinfeng Zhou and Wenchuang Li and Binxin Hu and Wendy Gao and Jiaxin Xu and Yiming Liu and Jie Tang and Hongning Wang and Minlie Huang},
390
- year={2024},
391
- eprint={2407.03978},
392
- archivePrefix={arXiv},
393
- primaryClass={cs.CL},
394
- url={https://arxiv.org/abs/2407.03978},
395
- }
396
-
397
- @article{jiang2023mistral,
398
- title={{Mistral 7B}},
399
- author={Jiang, Albert Q. and Sablayrolles, Alexandre and Mensch, Arthur and Bamford, Chris and Chaplot, Devendra Singh and de las Casas, Diego and Bressand, Florian and Lengyel, Gianna and Lample, Guillaume and Saulnier, Lucile and Lavaud, L{\'e}lio Renard and Lachaux, Marie-Anne and Stock, Pierre and Le Scao, Teven and Lavril, Thibaut and Wang, Thomas and Lacroix, Timoth{\'e}e and El Sayed, William},
400
- year={2023},
401
- eprint={2310.06825},
402
- archivePrefix={arXiv},
403
- primaryClass={cs.CL},
404
- url={https://arxiv.org/abs/2310.06825},
405
- }
406
-
407
- @article{llama3herd,
408
- title={{The Llama 3 Herd of Models}},
409
- author={{Meta Llama Team}},
410
- year={2024},
411
- eprint={2407.21783},
412
- archivePrefix={arXiv},
413
- primaryClass={cs.CL},
414
- url={https://arxiv.org/abs/2407.21783},
415
- note={arXiv:2407.21783}
416
- }
417
-
418
- @article{gemini1.5_report,
419
- title={{Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context}},
420
- author={{Gemini Team}},
421
- year={2024},
422
- eprint={2403.05530},
423
- archivePrefix={arXiv},
424
- primaryClass={cs.CL},
425
- url={https://arxiv.org/abs/2403.05530},
426
- note={arXiv:2403.05530}
427
- }
428
-
429
- @article{qwen2.5_report,
430
- title={{Qwen2.5 Technical Report}},
431
- author={{Qwen Team} and Yang, An and others},
432
- year={2024},
433
- eprint={2412.15115},
434
- archivePrefix={arXiv},
435
- primaryClass={cs.CL},
436
- url={https://arxiv.org/abs/2412.15115},
437
- note={arXiv:2412.15115}
438
- }
439
-
440
- @misc{qwen3_blog,
441
- title={{Qwen3: Think Deeper, Act Faster}},
442
- author={{Qwen Team}},
443
- howpublished={\url{https://qwenlm.github.io/blog/qwen3/}},
444
- year={2025}
445
- }
446
-
447
- @article{deepseekr1_report,
448
- title={{DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning}},
449
- author={{DeepSeek-AI and others}},
450
- year={2025},
451
- eprint={2501.12948},
452
- archivePrefix={arXiv},
453
- primaryClass={cs.CL},
454
- url={https://arxiv.org/abs/2501.12948},
455
- note={arXiv:2501.12948}
456
- }