helloparthshah commited on
Commit
ef6cb0b
·
2 Parent(s): 0c0c279 ba796a2

Merge branch 'main' of github.com:kunpai/HASHIRU

Browse files
bench/benchmarking_connections.py DELETED
@@ -1,97 +0,0 @@
1
- from gradio_client import Client
2
- from datasets import load_dataset
3
- import json
4
- import time
5
- import random
6
- import os
7
- from datetime import datetime
8
- import re
9
-
10
def get_last_assistant_content(resp):
    """
    Return the last assistant utterance (as text) from the response object
    produced by `client.predict`.

    Handles three message shapes: plain string content, Gemini-style Part
    dicts ({"parts": [{"text": ...}]}), and function_response wrappers.
    Returns "" when no assistant text is found.
    """
    # The server may wrap things in a (messages, meta) tuple.
    if isinstance(resp, tuple):
        resp = resp[0]

    # At this point `resp` must be the list of message dicts.
    if not isinstance(resp, list):
        return ""

    for turn in reversed(resp):
        if turn.get("role") != "assistant":
            continue

        cont = turn.get("content")

        # a) messages stored as Part objects inside `content`.
        #    BUG FIX: this must be checked before the plain truthiness test,
        #    otherwise a truthy dict was returned raw and this branch was
        #    unreachable.
        if isinstance(cont, dict):
            parts = cont.get("parts", [])
            if parts and parts[0].get("text"):
                return parts[0]["text"]
        # b) plain (string) messages
        elif cont:
            return cont

        # c) tool / function_response wrapper
        out = turn.get("function_response", {}).get("result", {}).get("output")
        if out:
            return out

    return ""
45
-
46
def benchmark_nyt_connections(num_samples=20, categories=None):
    """
    Benchmark agent performance on the NYT Connections dataset.

    Args:
        num_samples: Number of samples to test (falsy means the whole split).
        categories: List of categories to include (None for all).

    Returns:
        List of per-sample result dicts; each is also appended to a
        timestamped JSONL file under ``results/``.
    """
    # Load NYT connections dataset
    print("Loading NYT connections dataset...")
    dataset = load_dataset("tm21cy/NYT-Connections")

    # Local HASHIRU Gradio endpoint
    client = Client("http://127.0.0.1:7860/")

    # Prepare output directory
    output_dir = "results"
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = os.path.join(output_dir, f"nyt_connections_benchmark_{timestamp}.jsonl")
    print(f"Results will be saved to {out_path}")

    results = []
    train = dataset["train"]
    num_samples = min(num_samples, len(train)) if num_samples else len(train)
    print(f"Sampling {num_samples} samples from the dataset.")
    indices = random.sample(range(len(train)), num_samples)
    for i in indices:
        sample = train[i]
        if categories and sample["category"] not in categories:
            continue
        print(f"Sample {i}: {sample['contest']}")
        prompt = f"Given the following words, group them into 4 categories of 4 words each:\n{' '.join(sample['words'])}\n\n Once you've solved it, final output should be in the following format Group 1: word1, word2, word3, word4\nGroup 2: ..."
        start_time = time.time()
        response = client.predict(messages=[{"role": "user", "content": prompt}], api_name="/run")
        elapsed_time = time.time() - start_time
        assistant_content = get_last_assistant_content(response)
        result = {
            "input": sample["words"],
            "date": sample["contest"],
            "output": assistant_content,
            "expected": sample["answers"],
            "elapsed_time": elapsed_time,
        }
        results.append(result)

        # BUG FIX: results were previously written once *after* the loop, so a
        # crash mid-run lost every finished sample. Append each result
        # immediately instead (matches the other bench scripts).
        with open(out_path, "a") as f:
            f.write(json.dumps(result) + "\n")

    print(f"Results saved to {out_path}")
    return results


if __name__ == "__main__":
    benchmark_nyt_connections(num_samples=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bench/benchmarking_globle.py DELETED
@@ -1,141 +0,0 @@
1
- #!/usr/bin/env python3
2
- import random
3
- import math
4
- import sys
5
- import json
6
- import time
7
- import difflib
8
- import os
9
- import requests
10
- import re
11
- import geopandas as gpd
12
- from shapely.geometry import Point
13
- from gradio_client import Client
14
- from datetime import datetime
15
-
16
- # -----------------------------------------------------------------------------
17
- # Utility: haversine distance only
18
- # -----------------------------------------------------------------------------
19
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometers between two lat/lon points."""
    earth_radius_km = 6371.0
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)
    # Haversine formula: a is the square of half the chord length.
    a = (
        math.sin(delta_phi / 2) ** 2
        + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda / 2) ** 2
    )
    return 2 * earth_radius_km * math.asin(math.sqrt(a))
27
-
28
- # -----------------------------------------------------------------------------
29
- # Load country centroids and geometries
30
- # -----------------------------------------------------------------------------
31
-
32
def load_countries(geo_path):
    """
    Read a countries file with geopandas and return two dicts keyed by
    country name: centroid (lat, lon) tuples and raw geometries.
    """
    frame = gpd.read_file(geo_path)
    # Pick the most likely name column; fall back to the first
    # non-geometry column if none of the usual candidates exist.
    preferred = ["ADMIN", "NAME", "NAME_EN", "NAME_LONG", "SOVEREIGN", "COUNTRY"]
    name_field = next((col for col in preferred if col in frame.columns), None)
    if name_field is None:
        non_geom = [col for col in frame.columns if col.lower() != 'geometry']
        name_field = non_geom[0] if non_geom else None
    centroids = {}
    geoms = {}
    for _, record in frame.iterrows():
        shape = record.geometry
        # Skip rows with missing or empty geometry.
        if not shape or shape.is_empty:
            continue
        center = shape.centroid
        country = record[name_field]
        centroids[country] = (center.y, center.x)
        geoms[country] = shape
    return centroids, geoms
48
-
49
- # -----------------------------------------------------------------------------
50
- # Extract last assistant content
51
- # -----------------------------------------------------------------------------
52
def get_last_assistant_content(resp):
    """
    Return the last assistant utterance (as text) from a `client.predict`
    response, handling plain strings, Part dicts, and function_response
    wrappers. Returns "" when none is found.
    """
    # Server may wrap the message list in a (messages, meta) tuple.
    if isinstance(resp, tuple):
        resp = resp[0]
    if not isinstance(resp, list):
        return ""
    for turn in reversed(resp):
        if turn.get("role") != 'assistant':
            continue
        cont = turn.get("content")
        # BUG FIX: the original checked `if parts and ...` outside the
        # `isinstance(cont, dict)` one-liner, so `parts` could be referenced
        # before assignment (NameError); it also returned truthy dict content
        # raw before the parts-extraction could run.
        if isinstance(cont, dict):
            parts = cont.get("parts", [])
            if parts and parts[0].get("text"):
                return parts[0]["text"]
        elif cont:
            return cont
        out = turn.get("function_response", {}).get("result", {}).get("output")
        if out:
            return out
    return ""
65
-
66
- # -----------------------------------------------------------------------------
67
- # Play one automated Globle game via LLM
68
- # -----------------------------------------------------------------------------
69
def play_globle_agent(client, countries, geoms, max_guesses=20, threshold=0.6):
    """
    Play one automated Globle (distance-only Worldle) game via the LLM agent.

    Args:
        client: gradio_client.Client connected to the agent endpoint.
        countries: {name: (lat, lon)} centroid map from load_countries().
        geoms: {name: geometry} map from load_countries().
        max_guesses: maximum number of *valid* guesses before giving up.
        threshold: difflib cutoff for fuzzy-matching raw replies to names.

    Returns:
        dict with keys: target, guesses [(name, feedback)], turns, solved.
    """
    # pick random target
    target, (tlat, tlon) = random.choice(list(countries.items()))
    guesses = []
    attempts = 0

    while True:
        # Re-send the entire guess/feedback history as plain text each turn.
        history = "\n".join([f"Guess: {g}, Response: {resp}" for g, resp in guesses])
        prompt = (
            "Worldle (distance-only). Guess the country.\n" +
            (history + "\n" if history else "") +
            "Respond with a single country name and ONLY the name of the country."
        )
        resp = client.predict(messages=[{"role": "user", "content": prompt}], api_name="/run")
        raw = get_last_assistant_content(resp).strip()
        print(f"Guess: {raw}")
        # sanitize: fuzzy match the raw reply to a known country name
        if raw not in countries:
            match = difflib.get_close_matches(raw, countries.keys(), n=1, cutoff=threshold)
            if match:
                guess = match[0]
            else:
                # invalid guess: retry without consuming a turn.
                # NOTE(review): an agent that never produces a matchable name
                # loops forever here -- consider adding a retry cap.
                continue
        else:
            guess = raw

        attempts += 1
        # correct?
        if guess == target:
            return {"target": target, "guesses": guesses + [(guess, "CORRECT")], "turns": attempts, "solved": True}

        # adjacency: bordering countries get "BORDER" instead of a distance
        if geoms[guess].touches(geoms[target]):
            feedback = "BORDER"
        else:
            # distance between centroids, rounded to whole kilometers
            glat, glon = countries[guess]
            dist = haversine(glat, glon, tlat, tlon)
            feedback = f"{dist:.0f}km"
        guesses.append((guess, feedback))
        if attempts >= max_guesses:
            return {"target": target, "guesses": guesses, "turns": attempts, "solved": False}
113
-
114
- # -----------------------------------------------------------------------------
115
- # Benchmark multiple games
116
- # -----------------------------------------------------------------------------
117
def benchmark_globle(geo_path, num_games=1, max_guesses=20, cutoff=0.6):
    """
    Run several Globle games against the local agent and append one JSON
    line per finished game to a timestamped file under results/.

    Args:
        geo_path: path to the countries geometry file (shapefile/GeoJSON).
        num_games: number of games to play.
        max_guesses: per-game valid-guess budget passed to play_globle_agent.
        cutoff: fuzzy-match threshold passed to play_globle_agent.

    Returns:
        List of per-game result dicts (each also includes wall-clock "time").
    """
    countries, geoms = load_countries(geo_path)
    # Local HASHIRU Gradio endpoint.
    client = Client("http://127.0.0.1:7860/")
    os.makedirs("results", exist_ok=True)
    out_file = os.path.join("results", f"globle_benchmark_{datetime.now():%Y%m%d_%H%M%S}.jsonl")
    results = []
    for i in range(num_games):
        print(f"Game {i+1}/{num_games}")
        start = time.time()
        res = play_globle_agent(client, countries, geoms, max_guesses, cutoff)
        res["time"] = time.time() - start
        results.append(res)
        # Append immediately so finished games survive a crash.
        with open(out_file, "a") as f: f.write(json.dumps(res) + "\n")
    print(f"Saved results to {out_file}")
    return results
132
-
133
- # -----------------------------------------------------------------------------
134
- # CLI
135
- # -----------------------------------------------------------------------------
136
if __name__ == "__main__":
    # CLI entry point: expects exactly one argument, the path to a
    # countries geometry file (shapefile/GeoJSON).
    if len(sys.argv) != 2:
        print("Usage: python benchmarking_globle.py path/to/countries-file")
        sys.exit(1)
    geo = sys.argv[1]
    benchmark_globle(geo)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bench/benchmarking_hle.py DELETED
@@ -1,174 +0,0 @@
1
- from gradio_client import Client
2
- from datasets import load_dataset
3
- import json
4
- import time
5
- import random
6
- import os
7
- from datetime import datetime
8
- import re
9
-
10
- def get_last_assistant_content(resp):
11
- """
12
- Return the last assistant utterance from the response object
13
- produced by `client.predict`.
14
- """
15
- # ❶ If the server wraps things in a (messages, meta) tuple
16
- if isinstance(resp, tuple):
17
- resp = resp[0]
18
-
19
- # ❷ At this point `resp` must be the list of message dicts
20
- if not isinstance(resp, list):
21
- return ""
22
-
23
- for turn in reversed(resp):
24
- if turn.get("role") != "assistant":
25
- continue
26
-
27
- # a) plain messages
28
- if turn.get("content"):
29
- return turn["content"]
30
-
31
- # b) tool / function_response wrapper
32
- fr = turn.get("function_response", {})
33
- out = fr.get("result", {}).get("output")
34
- if out:
35
- return out
36
-
37
- # c) messages stored as Part objects inside `content`
38
- cont = turn.get("content")
39
- if isinstance(cont, dict):
40
- parts = cont.get("parts", [])
41
- if parts and parts[0].get("text"):
42
- return parts[0]["text"]
43
-
44
- return ""
45
-
46
def benchmark_hle(num_samples=20, categories=None):
    """
    Benchmark agent performance on the HLE (Humanity's Last Exam) dataset.

    Args:
        num_samples: Number of samples to test.
        categories: List of categories to include (None for all).

    Returns:
        List of per-sample result dicts; each is also appended to a
        timestamped JSONL file under results/.
    """
    # Load HLE dataset
    print("Loading HLE dataset...")
    dataset = load_dataset("cais/hle")

    # Local HASHIRU Gradio endpoint.
    client = Client("http://127.0.0.1:7860/")

    # Create results directory if it doesn't exist
    os.makedirs("results", exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"results/hle_benchmark_{timestamp}.jsonl"

    # Collect samples from the available evaluation splits.
    all_samples = []
    for split in ['validation', 'test']:
        if split in dataset:
            all_samples.extend(dataset[split])

    # Filter by category if specified
    if categories:
        all_samples = [s for s in all_samples if s.get('category') in categories]

    # Filter out prompts mentioning images (text-substring check only).
    # BUG FIX: this previously inspected s['input'], but prompts are read
    # from s['question'] below, so the filter never matched anything.
    filtered_samples = [s for s in all_samples if 'image' not in s.get('question', '').lower()]
    removed = len(all_samples) - len(filtered_samples)
    if removed > 0:
        print(f"Filtered out {removed} samples containing 'image'.")
    all_samples = filtered_samples

    # Select random samples
    if len(all_samples) > num_samples:
        samples = random.sample(all_samples, num_samples)
    else:
        samples = all_samples
        print(f"Warning: Only found {len(samples)} samples after filtering.")

    print(f"Running benchmark on {len(samples)} samples...")

    results = []
    for i, sample in enumerate(samples):
        print(f"\nProcessing sample {i+1}/{len(samples)}")
        category = sample.get('category', 'Unknown')
        prompt = sample.get('question', '')
        print(f"Category: {category}")
        print(f"Question: {prompt[:100]}...")

        try:
            start_time = time.time()
            response, history = client.predict(
                message={"text": prompt, "files": []},
                api_name="/chat"
            )
            end_time = time.time()

            target_answer_phrase = sample.get('answer', '').strip()
            agent_final_response_content = get_last_assistant_content(history)

            # Simple case-insensitive substring check; both the target phrase
            # and the agent content must be non-empty for a match to count.
            is_correct = bool(
                target_answer_phrase
                and agent_final_response_content
                and target_answer_phrase.lower() in agent_final_response_content.lower()
            )

            # Record result
            result = {
                "sample_id": sample.get('id', f'sample_{i}'),
                "category": category,
                "input": prompt,
                "target_output": sample.get('answer', ''),
                "agent_full_response": history,
                "agent_final_response": agent_final_response_content,
                "response_time": end_time - start_time,
                "is_correct": is_correct
            }
            results.append(result)

            # Write to file immediately to preserve progress
            with open(results_file, 'a') as f:
                f.write(json.dumps(result) + '\n')

            print(f"Response received in {end_time - start_time:.2f} seconds")
            print(f"Response: {response[:100]}...")

            # Add a delay to avoid overwhelming the server
            time.sleep(1)

        except Exception as e:
            # Best-effort: log and continue so one bad sample doesn't kill the run.
            print(f"Error processing sample: {e}")
            continue

    # Print summary statistics
    print("\n===== HLE BENCHMARK SUMMARY =====")
    print(f"Samples processed: {len(results)}")

    # Group results by category for the summary.
    by_category = {}
    for result in results:
        by_category.setdefault(result.get('category', 'Unknown'), []).append(result)

    print("\nSamples by category:")
    for category, items in by_category.items():
        print(f"  {category}: {len(items)} samples")

    avg_time = sum(r.get('response_time', 0) for r in results) / len(results) if results else 0
    print(f"\nAverage response time: {avg_time:.2f} seconds")
    print(f"Results saved to: {results_file}")

    return results


if __name__ == "__main__":
    benchmark_hle(
        num_samples=1,
        categories=None
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bench/benchmarking_paper_reviews.py DELETED
@@ -1,111 +0,0 @@
1
- from gradio_client import Client
2
- import pandas as pd
3
- import json
4
- import time
5
- import os
6
- from datetime import datetime
7
-
8
def get_last_assistant_content(resp):
    """
    Return the last assistant utterance (as text) from the response object
    produced by `client.predict`. Handles plain strings, Part dicts, and
    function_response wrappers; returns "" when none is found.
    """
    if isinstance(resp, tuple):
        resp = resp[0]
    if not isinstance(resp, list):
        return ""
    for turn in reversed(resp):
        if turn.get("role") != "assistant":
            continue
        cont = turn.get("content")
        # BUG FIX: dict-shaped content must be unpacked before the plain
        # truthiness check, otherwise the raw dict is returned and the
        # parts-extraction branch below is unreachable.
        if isinstance(cont, dict):
            parts = cont.get("parts", [])
            if parts and parts[0].get("text"):
                return parts[0]["text"]
        elif cont:
            return cont
        out = turn.get("function_response", {}).get("result", {}).get("output")
        if out:
            return out
    return ""
32
-
33
def benchmark_paper_reviews(
    csv_path,
    id_col="ID",
    text_col="concatenated_text",
    num_samples=None,
    output_dir="results"
):
    """
    Benchmark agent performance on paper reviews.

    Args:
        csv_path: path to the pipe-separated CSV of papers + existing reviews
        id_col: name of the column containing unique paper IDs
        text_col: name of the column containing the full paper text
        num_samples: if set, randomly sample this many papers
        output_dir: where to write the JSONL results

    Returns:
        List of per-paper result dicts; each is also appended to a
        timestamped JSONL file as soon as it is produced.
    """
    # load CSV (pipe-separated; fixed seed keeps the sample reproducible)
    df = pd.read_csv(csv_path, sep="|")
    if num_samples:
        df = df.sample(num_samples, random_state=42).reset_index(drop=True)

    # prepare output
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = os.path.join(output_dir, f"paper_review_benchmark_{timestamp}.jsonl")

    # init client (local HASHIRU Gradio endpoint)
    client = Client("http://127.0.0.1:7860/")

    results = []
    for idx, row in df.iterrows():
        paper_id = row[id_col]
        title = row["Title"]
        # Multi-agent review prompt; the "FINAL DECISION: ..." line is the
        # machine-readable verdict compared against the "Decision" column.
        prompt = "Create THREE agents with relevant personalities, expertise, and review styles. " \
            "Each agent should provide a review of the paper, and recommend Accept/Reject for ICLR 2023. " \
            "The review should be detailed and include strengths and weaknesses. " \
            "You MUST use ArxivTool and WikipediaTool to get more information about novelty and correctness. " \
            "GIVE A FINAL DECISION in the form of \"FINAL DECISION: <Accept/Reject>\". " \
            "The paper title is: " + title + "\n\n" + row[text_col]
        print(f"[{idx+1}/{len(df)}] Paper ID: {paper_id}")

        try:
            start = time.time()
            resp, history = client.predict(
                message={"text": prompt, "files": []},
                api_name="/chat"
            )
            elapsed = time.time() - start

            result = {
                "paper_id": paper_id,
                "prompt_snippet": prompt[:200],
                "agent_review": history,
                "ground_truth": row["Decision"],
                "response_time": elapsed
            }

            # write immediately so progress survives a crash
            with open(out_path, "a") as f:
                f.write(json.dumps(result) + "\n")

            # NOTE(review): `history` appears to be a list of chat turns, so
            # len(history) counts turns rather than characters -- confirm.
            print(f" → {elapsed:.2f}s, review length {len(history)} chars")
            results.append(result)

            # small delay to avoid overwhelming the server
            time.sleep(1)
        except Exception as e:
            # best-effort: log the failure and continue with the next paper
            print(f" Error on {paper_id}: {e}")

    print(f"\nDone. Results written to {out_path}")
    return results


if __name__ == "__main__":
    # example usage: adjust path & sample count as needed
    benchmark_paper_reviews(
        csv_path="bench/data/ICLR_2023.csv",
        num_samples=1
    )
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bench/benchmarking_wordle.py DELETED
@@ -1,113 +0,0 @@
1
- from gradio_client import Client
2
- from datasets import load_dataset
3
- import requests
4
- import json
5
- import time
6
- import random
7
- import os
8
- import re
9
- from datetime import datetime
10
-
11
# Fetch the official Wordle guess list from GitHub
WORD_LIST_URL = "https://raw.githubusercontent.com/tabatkins/wordle-list/main/words"

def load_word_list():
    """Download the Wordle word list and keep only 5-letter alphabetic words."""
    resp = requests.get(WORD_LIST_URL)
    resp.raise_for_status()  # fail fast on HTTP errors
    words = [w.strip().lower() for w in resp.text.splitlines()]
    return [w for w in words if len(w) == 5 and w.isalpha()]

# Downloaded once at import time.
# NOTE(review): importing this module performs a network request -- consider
# lazy-loading the list instead.
WORD_LIST = load_word_list()
21
-
22
-
23
def get_last_assistant_content(resp):
    """
    Return the last assistant utterance (as text) from a `client.predict`
    response. Handles plain strings, Part dicts, and function_response
    wrappers; returns "" when none is found.
    """
    if isinstance(resp, tuple):
        resp = resp[0]
    if not isinstance(resp, list):
        return ""
    for turn in reversed(resp):
        if turn.get("role") != "assistant":
            continue
        cont = turn.get("content")
        # BUG FIX: dict-shaped content must be unpacked before the plain
        # truthiness check, otherwise the raw dict is returned and the
        # parts-extraction branch is unreachable.
        if isinstance(cont, dict):
            parts = cont.get("parts", [])
            if parts and parts[0].get("text"):
                return parts[0]["text"]
        elif cont:
            return cont
        out = turn.get("function_response", {}).get("result", {}).get("output")
        if out:
            return out
    return ""
37
-
38
-
39
def compute_feedback(guess, solution):
    """
    Return Wordle-style feedback for `guess` against `solution`:
    G = green (right letter, right spot), Y = yellow (right letter,
    wrong spot), B = black (letter not available).
    """
    remaining = list(solution)
    marks = ["B"] * 5
    # First pass: exact-position matches consume their solution letter.
    for pos in range(5):
        if guess[pos] == remaining[pos]:
            marks[pos] = "G"
            remaining[pos] = None
    # Second pass: wrong-position matches each consume one leftover occurrence,
    # so duplicate letters are only credited as many times as they appear.
    for pos in range(5):
        letter = guess[pos]
        if marks[pos] == "B" and letter in remaining:
            marks[pos] = "Y"
            remaining[remaining.index(letter)] = None
    return "".join(marks)
49
-
50
-
51
def sanitize_guess(raw):
    """Extract a lowercase 5-letter guess from a raw model reply."""
    text = raw.lower()
    # Prefer the first standalone 5-letter word in the reply.
    match = re.search(r"\b[a-z]{5}\b", text)
    if match:
        return match.group(0)
    # Fallback: strip every non-letter and keep the trailing five characters.
    letters = re.sub(r"[^a-z]", "", text)
    return letters[-5:]
57
-
58
-
59
def benchmark_wordle(num_games=10, max_guesses=6):
    """
    Play automated Wordle games against the local agent.

    Args:
        num_games: number of games to play.
        max_guesses: maximum number of *valid* guesses per game.

    Returns:
        List of per-game result dicts; each is also appended to a
        timestamped JSONL file under results/.
    """
    # Local HASHIRU Gradio endpoint.
    client = Client("http://127.0.0.1:7860/")
    os.makedirs("results", exist_ok=True)
    out_path = os.path.join("results", f"wordle_benchmark_{datetime.now():%Y%m%d_%H%M%S}.jsonl")
    results = []

    for gi in range(num_games):
        solution = random.choice(WORD_LIST)
        print(f"Game {gi+1}/{num_games}, solution: {solution}")
        guesses = []
        attempts = 0
        start_time = time.time()

        while attempts < max_guesses:
            # Re-send the full guess/feedback history as plain text each turn.
            history = "\n".join(f"Guess: {g}, Feedback: {f}" for g, f in guesses)
            prompt = (
                f"Wordle game. Guess the 5-letter word.\n" +
                (history + "\n" if history else "") +
                "Respond with a single 5-letter guess and with ONLY YOUR GUESS. NO FILLER OR PUNCTUATION."
                "\n" + "Use the feedback format: G=green, Y=yellow, B=black.\n" +
                f"(Green: letter in correct position, Yellow: letter in wrong position, Black: letter not in word)\n" +
                f"Use tools and agents to help you guess the word.\n"
            )
            resp = client.predict(messages=[{"role": "user", "content": prompt}], api_name="/run")
            raw = get_last_assistant_content(resp)
            guess = sanitize_guess(raw)

            # If guess invalid, retry without counting.
            # NOTE(review): an agent that never produces a valid word loops
            # forever here -- consider a retry cap.
            if len(guess) != 5 or guess not in WORD_LIST:
                print(f"Warning: '{guess}' invalid; retrying without using a turn.")
                continue

            feedback = compute_feedback(guess, solution)
            guesses.append((guess, feedback))
            attempts += 1
            print(f"Attempt {attempts}: {guess} -> {feedback}")
            if feedback == "GGGGG":
                break

        results.append({
            "solution": solution,
            "guesses": guesses,
            # solved iff the last recorded feedback is all-green
            "solved": bool(guesses and guesses[-1][1] == "GGGGG"),
            "turns": len(guesses),
            "time": time.time() - start_time
        })
        # Append immediately so finished games survive a crash.
        with open(out_path, "a") as f:
            f.write(json.dumps(results[-1]) + "\n")

    print(f"Benchmark complete, results saved to {out_path}")
    return results


if __name__ == "__main__":
    benchmark_wordle(num_games=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
paper/HASHIRU.pdf DELETED
Binary file (74.4 kB)
 
paper/IEEEtran.cls DELETED
The diff for this file is too large to render. See raw diff
 
paper/conference_101719.tex DELETED
@@ -1,479 +0,0 @@
1
- \documentclass[conference]{IEEEtran}
2
- \IEEEoverridecommandlockouts
3
- % The preceding line is only needed to identify funding in the first footnote. If that is unneeded, please comment it out.
4
- \usepackage{cite}
5
- \usepackage{amsmath,amssymb,amsfonts}
6
- \usepackage{algorithmic}
7
- \usepackage{graphicx}
8
- \usepackage{textcomp}
9
- \usepackage{xcolor}
10
- \usepackage{hyperref}
11
- \usepackage{longtable}
12
- \usepackage{tabularx}
13
- \usepackage{tcolorbox}
14
- \tcbuselibrary{skins}
15
-
16
- % Define styles for user and assistant boxes
17
- \newtcolorbox{userbox}{colback=blue!5,colframe=blue!40!black,title=User}
18
- \newtcolorbox{assistantbox}{colback=gray!10,colframe=black,title=Assistant}
19
-
20
- \def\BibTeX{{\rm B\kern-.05em{\sc i\kern-.025em b}\kern-.08em
21
- T\kern-.1667em\lower.7ex\hbox{E}\kern-.125emX}}
22
- \begin{document}
23
-
24
- \title{HASHIRU: Hierarchical Agent System for Hybrid Intelligent Resource Utilization}
25
-
26
- \author{\IEEEauthorblockN{Kunal Pai}
27
- \IEEEauthorblockA{\textit{UC Davis} \\
28
- kunpai@ucdavis.edu}
29
- \and
30
- \IEEEauthorblockN{Parth Shah}
31
- \IEEEauthorblockA{\textit{Independent Researcher} \\
32
- helloparthshah@gmail.com}
33
- \and
34
- \IEEEauthorblockN{Harshil Patel}
35
- \IEEEauthorblockA{\textit{UC Davis} \\
36
- hpppatel@ucdavis.edu}
37
- \and
38
- \IEEEauthorblockN{Saisha Shetty}
39
- \IEEEauthorblockA{\textit{UC Davis} \\
40
- spshetty@ucdavis.edu}
41
- }
42
-
43
- \maketitle
44
-
45
- \section{Introduction}\label{sec:introduction}
46
-
47
- Rapid advancements in Large Language Models (LLMs) are reshaping Artificial Intelligence (AI) with profound capabilities in language understanding, generation, reasoning, and planning \cite{brown2020language, devlin2019bert, raffel2020exploring}. This progress drives the development of autonomous AI agents, shifting focus from single to Multi-Agent Systems (MAS) where collaborative teams tackle complex problems beyond individual scope \cite{dorri2018multi, wooldridge2009introduction}. Collaborative MAS show significant potential in diverse domains like scientific discovery \cite{boiko2023emergent}, software engineering \cite{qian2023communicative}, data analysis, and strategic decision-making \cite{wang2023decision}. The increasing complexity of tasks, demonstrated by benchmarks requiring advanced mathematical reasoning (e.g., GSM8K \cite{cobbe2021gsm8k}, SVAMP \cite{patel2021svamp}), coding (e.g., HumanEval \cite{chen2021codex}, CoDocBench \cite{pai2024codocbench}), and graduate-level technical knowledge \cite{phan2025humanitysexam}, highlights the need for agentic systems to effectively coordinate diverse cognitive resources \cite{wen2024benchmarkingcomplexinstructionfollowingmultiple}.
48
-
49
- Despite this potential, contemporary agentic frameworks face significant limitations. Many are \textbf{rigid}, relying on predefined roles and static structures hindering adaptation to dynamic tasks \cite{zhang2023building}. \textbf{Resource obliviousness} is common; systems often lack mechanisms to monitor and optimize computational resources like API costs, memory, and CPU load, leading to inefficiency, especially when scaling or deploying in resource-constrained environments \cite{park2023generative}. This is often worsened by reliance on powerful, costly proprietary cloud LLMs. \textbf{Model homogeneity}, defaulting to a single powerful LLM for all sub-tasks, misses efficiency gains from a diverse ecosystem including smaller, specialized, or local models \cite{zhou2023agents}. While \textbf{tool use} is fundamental \cite{yao2022react, parisi2022talm}, agents' ability to autonomously \textbf{create and integrate new tools} remains limited, restricting dynamic extension and self-improvement without human intervention \cite{wang2023voyager}.
50
-
51
- To address these challenges, we introduce \textbf{HASHIRU (Hierarchical Agent System for Hybrid Intelligent Resource Utilization)}, a novel MAS framework enhancing flexibility, resource efficiency, and adaptability. HASHIRU employs a hierarchical structure led by a central ``CEO'' agent dynamically managing specialized ``employee'' agents instantiated on demand. A core tenet is its \textbf{hybrid intelligence} approach, strategically prioritizing smaller (e.g., 3B--7B), locally-run LLMs (often via Ollama \cite{ollama}) for cost-effectiveness and efficiency. While prioritizing local resources, the system flexibly integrates external APIs and potentially more powerful models when justified by task complexity and resource availability, under the CEO's management.
52
-
53
- The primary contributions are:
54
- \begin{enumerate}
55
- \item A novel MAS architecture combining \textbf{hierarchical control} with \textbf{dynamic, resource-aware agent lifecycle management} (hiring/firing). This management is governed by computational budget constraints (cost, memory, concurrency) and includes an economic model with hiring/firing costs to discourage excessive churn.
56
- \item A \textbf{hybrid intelligence model} prioritizing cost-effective, local LLMs while adaptively incorporating external APIs and larger models, optimizing the efficiency-capability trade-off.
57
- \item An integrated mechanism for \textbf{autonomous API tool creation}, allowing dynamic functional repertoire extension.
58
- \item An \textbf{economic model} (hiring/firing fees) for agent management, promoting efficient resource allocation and team stability.
59
- \end{enumerate}
60
-
61
- This paper details HASHIRU's design and rationale. Section \ref{sec:background} discusses related work in agent architectures, dynamic management, resource allocation, model heterogeneity, and tool use. Section \ref{sec:architecture} elaborates on the architecture and core mechanisms. Section \ref{sec:experiments} presents experimental results (or outlines planned experiments), followed by discussion and conclusion in the final two sections.
62
-
63
- \section{Background and Related Work} \label{sec:background}
64
-
65
- Intelligent agent concepts have evolved from early symbolic AI \cite{russell2010artificial, shoham1994agent} to LLM-dominated frameworks leveraging models for reasoning, planning, and interaction \cite{wang2023survey, xi2023rise}. HASHIRU builds on this, addressing current limitations.
66
-
67
- \subsection{Agent Architectures: Hierarchy and Dynamics}
68
- MAS architectures vary, including flat, federated, and hierarchical \cite{dorri2018multi, horling2004survey}. Hierarchical models offer clear control and task decomposition but risk bottlenecks and rigidity \cite{gaston2005agenta,gaston2005agentb}. HASHIRU uses a \textbf{CEO-Employee hierarchy} for centralized coordination but distinguishes itself through \textbf{dynamic team composition}. Unlike systems with static hierarchies or predefined roles (e.g., CrewAI \cite{crewai}, ChatDev \cite{qian2023communicative}), HASHIRU's CEO dynamically manages the employee pool based on runtime needs and resource constraints.
69
-
70
- \subsection{Dynamic Agent Lifecycle Management}
71
- Dynamic MAS composition is crucial for complex environments \cite{valckenaers2005trends}. Agent creation/deletion triggers often relate to task structure or environmental changes. HASHIRU introduces a specific mechanism where the CEO makes \textbf{hiring and firing decisions} based on a cost-benefit analysis considering agent performance, operational costs (API fees, inferred compute), memory footprint (tracked explicitly as a percentage of available resources), and concurrency limits. HASHIRU also incorporates an \textbf{economic model} with explicit ``starting bonus'' (hiring) and ``invocation'' (usage) costs. This economic friction aims to prevent excessive initialization or usage for marginal gains and promote team stability, a nuance often missing in simpler dynamic strategies.
72
-
73
- \subsection{Resource Management and Agent Economies}
74
- Resource awareness is critical for scalable MAS. Economic research explores mechanisms like market-based auctions or contract nets for allocation \cite{clearwater1996market}. HASHIRU implements a more \textbf{centralized, budget-constrained resource management model}. The CEO operates within defined limits for financial cost, memory usage (as a percentage of total allocated), and concurrent agent count. This direct management, particularly focusing on memory percentage, suggests practicality for deployment on local or edge devices with finite resources, contrasting with cloud systems assuming elastic resources \cite{park2023generative}. Frameworks like AutoGen \cite{wu2023autogen} and LangGraph \cite{langgraph} typically rely on implicit cost tracking without explicit multi-dimensional budgeting and control.
75
-
76
- \subsection{Hybrid Intelligence and Heterogeneous Models}
77
- Leveraging diverse LLMs with varying capabilities, costs, and latencies is an emerging trend \cite{zhou2023agents}. Techniques like model routing select optimal models for sub-tasks. HASHIRU embraces \textbf{model heterogeneity} with a strategic focus: \textbf{prioritizing smaller (3B--7B), locally-run models via Ollama integration} \cite{ollama}. This emphasizes cost-efficiency, low latency, and potential privacy over systems defaulting to large proprietary cloud APIs (e.g., GPT-4 \cite{openai2023gpt4}, Claude 3 \cite{anthropic2024claude}). While integrating external APIs (potentially larger models), HASHIRU's default stance represents a distinct capability vs. efficiency balance.
78
-
79
- \subsection{Tool Use and Autonomous Tool Creation}
80
- Tool use (APIs, functions) is fundamental for modern agents \cite{yao2022react, openai_func_calling}. Most systems use predefined tools. HASHIRU advances this with \textbf{integrated, autonomous API tool creation}. When needed functionality is missing, the CEO can commission the generation (potentially via a specialized agent) and deployment of a new API tool within the HASHIRU ecosystem. This self-extension capability differentiates HASHIRU from systems limited to static toolsets, moving towards greater autonomy and adaptability \cite{wang2023voyager, park2023generative}.
81
-
82
- In summary, HASHIRU integrates hierarchical control, dynamic MAS, resource management, and tool use. Its novelty lies in the synergistic combination of: (1) dynamic, resource-aware hierarchical management with (2) an economic model for stability, (3) a local-first hybrid intelligence strategy, and (4) integrated autonomous tool creation. This targets key limitations in current systems regarding efficiency, adaptability, cost, and autonomy.
83
-
84
- \section{HASHIRU System Architecture}
85
- \label{sec:architecture}
86
-
87
- HASHIRU's architecture addresses rigidity, resource obliviousness, and limited adaptability through a hierarchical, dynamically managed MAS optimized for hybrid resource utilization.
88
-
89
- \subsection{Overview}
90
- HASHIRU operates with a central ``CEO'' agent coordinating specialized ``Employees''. Key tenets:
91
- \begin{itemize}
92
- \item \textbf{Dynamic Hierarchical Coordination:} CEO manages strategy, task allocation, and dynamic team composition.
93
- \item \textbf{Dynamic Lifecycle Management:} Employees are hired/fired based on runtime needs and resource constraints, governed by an economic model.
94
- \item \textbf{Hybrid Intelligence:} Strategic preference for local, cheaper LLMs, while accessing external APIs/models.
95
- \item \textbf{Explicit Resource Management:} Continuous monitoring and control of costs, memory usage, and concurrency against budgets.
96
- \item \textbf{Adaptive Tooling:} Using predefined tools alongside autonomous creation of new API tools.
97
- \end{itemize}
98
- Figure \ref{fig:arch} illustrates the structure.
99
-
100
- \begin{figure}[ht]
101
- \centering
102
- \includegraphics[width=0.45\textwidth]{HASHIRU.pdf}
103
- \caption{High-level architecture of the HASHIRU system, illustrating the CEO-Employee hierarchy.}
104
- \label{fig:arch}
105
- \end{figure}
106
-
107
- \subsection{Hierarchical Structure: CEO and Employee Agents}
108
- The system uses a two-tiered hierarchy:
109
-
110
- \begin{itemize}
111
- \item \textbf{CEO Agent:} Singleton, central coordinator and entry point. Responsibilities:
112
- \begin{itemize}
113
- \item Interpreting user query/task.
114
- \item Decomposing main task into sub-tasks.
115
- \item Identifying required capabilities.
116
- \item Managing Employee pool (Section \ref{subsec:dynamic_mgmt}).
117
- \item Assigning sub-tasks to active Employees.
118
- \item Monitoring Employee progress/performance.
119
- \item Synthesizing Employee results into final output.
120
- \item Managing overall resource budget (Section \ref{subsec:resource_mgmt}).
121
- \item Initiating new tool creation (Section \ref{subsec:tooling}).
122
- \end{itemize}
123
- We use Gemini 2.5 Flash~\cite{gemini25flash} as the CEO agent due to its strong reasoning capabilities, support for tool usage, and cost efficiency, making it a practical and capable choice for our deployment.
124
- \item \textbf{Employee Agents:} Specialized agents instantiated by the CEO for specific sub-tasks. Each typically wraps an LLM (local via Ollama \cite{ollama} or external API) or provides tool access. Characteristics:
125
- \begin{itemize}
126
- \item Specialization: Capabilities tailored to task types (code, data analysis, info retrieval).
127
- \item Dynamic Existence: Created/destroyed by CEO based on need/performance.
128
- \item Task Execution: Receive task, execute, return result.
129
- \item Resource Consumption: Associated costs (API, memory) tracked by system.
130
- \end{itemize}
131
- Specialized employee agents are constructed using base models such as Mistral~7B~\cite{jiang2023mistral}, Llama~3~\cite{llama3herd}, Gemini~1.5~\cite{gemini1.5_report}, Qwen2.5~\cite{qwen2.5_report}, Qwen3~\cite{qwen3_blog}, and DeepSeek-R1~\cite{deepseekr1_report}, with the CEO agent configuring them via tailored system prompts.
132
- \end{itemize}
133
- This hierarchy facilitates task decomposition and result aggregation; the dynamic pool provides flexibility.
134
-
135
- \subsection{Dynamic Agent Lifecycle Management}
136
- \label{subsec:dynamic_mgmt}
137
- A core innovation is the CEO's dynamic management (hiring/firing) of Employee agents. Driven by cost-benefit analysis, this optimizes task performance within resource constraints.
138
-
139
- When a sub-task needs unavailable or inefficiently provided capabilities, the CEO may hire a new agent. Conversely, if an agent underperforms, is idle, costly, or resource limits are neared, the CEO may fire it. Decision factors:
140
- \begin{itemize}
141
- \item \textbf{Task Requirements:} Needed capabilities for pending sub-tasks.
142
- \item \textbf{Agent Performance:} Historical success, output quality, efficiency.
143
- \item \textbf{Operational Costs:} API, estimated compute, or other costs.
144
- \item \textbf{Memory Footprint:} Agent memory usage (\% of total allocated).
145
- \item \textbf{Agent Concurrency:} Active agents vs. predefined limit.
146
- \end{itemize}
147
-
148
- HASHIRU includes an \textbf{economic model}:
149
- \begin{itemize}
150
- \item \textbf{Hiring Cost (``Starting Bonus''):} One-time cost upon instantiation (setup overhead).
151
- \item \textbf{Invocation Cost (``Salary''):} Recurring cost incurred each time the agent is invoked (system/payment load).
152
- \end{itemize}
153
- These transaction costs discourage excessive churn, promoting stability. The CEO evaluates if replacing an agent benefits outweigh hiring/firing costs plus operational differences. This combats rigidity and allows adaptation while managing budgets and preventing wasteful turnover.
154
-
155
- \subsection{Hybrid Intelligence and Model Management}
156
- HASHIRU is designed for \textbf{hybrid intelligence}, leveraging diverse cognitive resources. It strategically prioritizes smaller (3B--7B), cost-effective local LLMs via Ollama \cite{ollama}. This enhances efficiency, reduces external API reliance, and potentially improves privacy/latency.
157
-
158
- The system also integrates:
159
- \begin{itemize}
160
- \item \textbf{External LLM APIs:} Access to powerful proprietary models (GPT-4 \cite{openai2023gpt4}, Claude 3 \cite{anthropic2024claude}) when necessary, subject to cost-benefit.
161
- \item \textbf{External Tool APIs:} Third-party software/data source integration.
162
- \item \textbf{Self-Created APIs:} Tools generated by HASHIRU (Section \ref{subsec:tooling}).
163
- \end{itemize}
164
- The CEO manages this heterogeneous pool, selecting the most appropriate resource based on difficulty, capabilities, and budget. This balances cost-effectiveness and efficiency with high capability needs.
165
-
166
- \subsection{Resource Monitoring and Control}
167
- \label{subsec:resource_mgmt}
168
- Explicit resource management is central, moving beyond simple API cost tracking. The system, coordinated by the CEO, monitors:
169
- \begin{itemize}
170
- \item \textbf{Financial Costs:} Accumulating external API costs.
171
- \item \textbf{Memory Usage:} Footprint of active Employee agents (\% of allocated budget).
172
- \item \textbf{Agent Concurrency:} Count of concurrently active agents.
173
- \end{itemize}
174
- Metrics are monitored against predefined \textbf{budget limits}. Actions (like hiring) exceeding limits (e.g., $>$90\% memory, exceeding max concurrency) are prevented. This ensures operation within constraints, crucial for limited resources or strict budgets.
175
-
176
- \subsection{Tool Utilization and Autonomous Creation}
177
- \label{subsec:tooling}
178
- HASHIRU agents use predefined tools (functions, APIs, databases) to interact and perform actions beyond text generation \cite{yao2022react, openai_func_calling}.
179
-
180
- A distinctive feature is \textbf{integrated, autonomous tool creation}. If the CEO determines a required capability is missing, it can initiate new tool creation. This involves:
181
- \begin{enumerate}
182
- \item Defining tool specification (inputs, outputs, functionality).
183
- \item Commissioning logic generation (code, potentially using external APIs with provided credentials, possibly via a code-generating agent).
184
- \item Deploying logic as a new, callable API endpoint within HASHIRU.
185
- \item Potentially instantiating an Employee agent for the new tool.
186
- \end{enumerate}
187
- This allows HASHIRU to dynamically extend its functional repertoire, tailoring capabilities to tasks without manual intervention, enabling greater autonomy and adaptation.
188
-
189
- \subsection{Memory Function: Learning from Experience}
190
- \label{subsec:memory}
191
-
192
- To enable HASHIRU agents to learn from past interactions and rectify previous errors, a \textbf{Memory Function} is incorporated. This function stores records of significant past events, particularly those involving failed attempts or suboptimal outcomes, acting as a historical log of experiences. When the system encounters a new problem or a recurring challenge, it queries this memory store to retrieve relevant past situations and their outcomes.
193
-
194
- Memory retrieval is based on semantic similarity between the current context (e.g., task description, recent actions, error messages) and the stored memory entries. We utilize embeddings generated by the \textbf{all-MiniLM-L6-v2} model \cite{wang2020minilmdeepselfattentiondistillation} to represent both the query and the stored memories in a high-dimensional vector space. Relevance is determined by calculating the \textbf{cosine similarity} between the query embedding and each memory embedding. Memories exceeding a predefined similarity threshold are retrieved and provided to the CEO agent (or relevant Employee agents) as contextual information. This allows the system to draw upon past experiences, understand why previous approaches failed, and potentially adjust its strategy to avoid repeating mistakes, thereby improving performance and efficiency over time.
195
-
196
- \section{Case Studies}
197
- \label{sec:casestudies}
198
- This section presents two case studies demonstrating HASHIRU's self-improvement capabilities in practical settings. We highlight two instances where HASHIRU enhanced its own architecture and functionality: (1) by generating a comprehensive cost model for base models suitable for specialized agent creation, and (2) by autonomously integrating new tools for the CEO agent.
199
-
200
- \subsection{Case Study 1: Self-Generating the Cost Model for Agent Specialization}
201
- \label{sec:casestudy1_costmodel}
202
- An accurate cost model is essential for optimizing resource allocation and ensuring the efficiency of specialized agents within HASHIRU. Traditionally, constructing this model involves manual research into local model performance relative to hardware (e.g., 16~GiB VRAM) and the API costs of cloud-hosted alternatives. HASHIRU automated this labor-intensive process by leveraging its web search capabilities to autonomously identify and incorporate the necessary cost data into its internal model. The results were successfully committed to the codebase\footnote{\url{https://github.com/kunpai/HASHIRU/commit/70dc268b121cbd7c50c6691645d8a99912766965}}.
203
-
204
- \subsection{Case Study 2: Autonomous Tool Integration for the CEO Agent}
205
- \label{sec:casestudy2_tools}
206
- Extending the CEO agent's capabilities through tool integration is vital for broadening HASHIRU's operational scope. Manual tool development typically requires detailed analysis of existing tool schemas and diligent code implementation. HASHIRU streamlined this process by employing a few-shot learning approach, using an existing tool as a template to guide the autonomous creation of new tools~\cite{brown2020language}. The newly generated tools were directly integrated into the codebase\footnote{\url{https://github.com/kunpai/HASHIRU/commit/193e10b2b00917256b7cc01cb3aa5ac7b6a6c174}}.
207
-
208
-
209
- \section{Experimental Setup}
210
- \label{sec:experiments}
211
-
212
- We designed experiments to evaluate HASHIRU's performance, efficiency, and adaptability, targeting dynamic resource management, hybrid intelligence, and autonomous tool creation. Evaluation assesses benefits over baselines, focusing on:
213
- \begin{itemize}
214
- \item Impact of dynamic management with economic constraints on resource utilization (cost, memory) and task performance vs. static configurations.
215
- \item Effectiveness of the hybrid (local-first) strategy vs. homogeneous (cloud-only or local-only) approaches across task complexity.
216
- \item System's ability to autonomously create/utilize tools for novel functional requirements.
217
- \end{itemize}
218
-
219
- \subsection{Evaluation Tasks}
220
- \label{subsec:tasks}
221
- Tasks demand complex reasoning, multi-perspective analysis, and interaction, suitable for HASHIRU's coordination and dynamic capabilities. Tasks fall into two categories:
222
-
223
- \subsubsection{Academic Paper Review}
224
- Evaluates HASHIRU's critical assessment by simulating peer review. Given papers (e.g., PDF), the system generates a review summary and recommends acceptance/rejection. Probes ability to decompose criteria, delegate to specialized agents (novelty, rigor, clarity), and manage resources across complex documents.
225
-
226
- \subsubsection{Reasoning and Problem-Solving Tasks}
227
- Evaluates broader reasoning, knowledge retrieval, and problem-solving under constraints using challenging benchmarks and puzzles:
228
- \begin{itemize}
229
- \item \textbf{Humanity's Last Exam \cite{phan2025humanitysexam}:} Tests graduate-level technical knowledge and complex reasoning across domains. Requires deep understanding and sophisticated problem-solving, likely needing powerful external LLMs managed within HASHIRU's hybrid framework.
230
- \item \textbf{NYT Connections \cite{lopez2024nyt}:} Puzzle requiring identifying hidden semantic relationships/themes to categorize 16 words into four groups. Involves associative reasoning, broad knowledge, and hypothesis testing, testing knowledge access and combinatorial reasoning coordination.
231
- \item \textbf{Wordle:} Daily word puzzle requiring deductive reasoning to identify a five-letter word within six guesses, using feedback. Tests logical deduction, constraint satisfaction, vocabulary. Good test for comparing efficiency (speed, cost, guesses) of local vs. external models for iterative reasoning. Assumes simulated game environment.
232
- \item \textbf{Globle:} Geographic deduction game identifying a target country based on proximity feedback. Tests geographic knowledge, spatial reasoning, iterative strategy based on feedback. Assumes simulated game environment.
233
- \end{itemize}
234
- These tasks challenge the system's ability to leverage appropriate resources (local vs. external), potentially create simple tools, and coordinate problem-solving.
235
-
236
- \subsection{Baselines for Comparison}
237
- \label{subsec:baselines}
238
- To quantify HASHIRU's benefits, we compare its performance against baselines:
239
- \begin{itemize}
240
- \item \textbf{Static-HASHIRU:} Fixed, predefined Employee agents (e.g., one per role), disabling dynamic hiring/firing.
241
- \item \textbf{Cloud-Only HASHIRU:} Uses exclusively powerful external LLM API and online function-calling for all agents, disabling local models.
242
- \item \textbf{Local-Only HASHIRU:} Uses exclusively smaller, local LLMs (via Ollama) for all agents.
243
- \item \textbf{HASHIRU (No-Economy):} Dynamic hiring/firing enabled but without explicit costs, isolating economic model impact on churn/stability.
244
- \end{itemize}
245
-
246
- \subsection{Evaluation Metrics}
247
- \label{subsec:metrics}
248
- We evaluate using quantitative and qualitative metrics:
249
- \begin{itemize}
250
- \item \textbf{Task Success Rate / Quality:}
251
- \begin{itemize}
252
- \item Percentage of tasks completed (binary for games, graded for analysis).
253
- \item Output quality for analysis (human evaluation: relevance, coherence, accuracy, completeness).
254
- \item Accuracy for information extraction.
255
- \item Guesses/turns for game tasks.
256
- \end{itemize}
257
- \item \textbf{Resource Consumption:}
258
- \begin{itemize}
259
- \item Total external API costs.
260
- \item Peak and average memory usage (\% of allocated budget).
261
- \item Wall-clock time per task.
262
- \item Number and type (local/external) of LLM calls.
263
- \end{itemize}
264
- \item \textbf{System Dynamics and Adaptability:}
265
- \begin{itemize}
266
- \item Employee agents hired/fired per task.
267
- \item Agent churn frequency (hires+fires / duration or steps).
268
- \item Number and utility of autonomously created tools (if applicable).
269
- \end{itemize}
270
- \end{itemize}
271
-
272
- \bibliography{references}
273
- \bibliographystyle{plain}
274
-
275
- \appendix
276
- \section{Examples of HASHIRU in Action}
277
- \label{sec:examples}
278
- In this section, we provide examples of how HASHIRU operates in practice, demonstrating its dynamic management, hybrid intelligence, memory and autonomous tool creation capabilities.
279
-
280
- \subsection{Example 1: Memory}
281
- \begin{figure}[h]
282
- \centering
283
- \begin{minipage}{0.45\textwidth}
284
- \begin{userbox}
285
- I am vegetarian.
286
- \end{userbox}
287
-
288
- \begin{assistantbox}
289
- Okay, I'll keep that in mind. I'll store this preference in my memory.
290
-
291
- \texttt{Invoking MemoryManager with \{`memory': `The user is vegetarian.', `action': `add\_memory', `key': `vegetarian'\}}
292
-
293
- I have stored that you are vegetarian. How can I help you today?
294
- \end{assistantbox}
295
-
296
- \begin{userbox}
297
- Give me a bacon pasta recipe.
298
- \end{userbox}
299
-
300
- \begin{assistantbox}
301
- Memories
302
-
303
- Memories: [`The user is vegetarian.']
304
-
305
- My apologies, but I cannot provide a bacon pasta recipe since you are vegetarian. Would you like a vegetarian pasta recipe instead?
306
- \end{assistantbox}
307
- \end{minipage}
308
- \caption{HASHIRU enforcing a stored dietary preference in dialogue using its Memory Function.}
309
- \label{fig:memory_example}
310
- \end{figure}
311
-
312
- \begin{figure}[ht]
313
- \centering
314
- \begin{minipage}
315
- {0.45\textwidth}
316
- \begin{userbox}
317
- I can eat bacon actually
318
- \end{userbox}
319
-
320
- \begin{assistantbox}
321
- Memories
322
-
323
- Memories: [`The user is vegetarian.']
324
-
325
- I see. My apologies for the confusion. I had a conflicting memory that you were vegetarian. I will remove that memory and proceed with finding a bacon pasta recipe.
326
-
327
- \texttt{Invoking MemoryManager with \{'action': 'delete\_memory', 'key': 'The user is vegetarian.'\}}
328
-
329
- OK, I have removed the memory. Now I will search for a bacon pasta recipe.
330
-
331
- \texttt{Invoking GoogleSearchTool with \{'query': 'bacon pasta recipe'\}}
332
-
333
- Here are some bacon pasta recipes I found:
334
- \begin{itemize}
335
- \item \href{https://www.sprinklesandsprouts.com/garlic-bacon-pasta/}{Garlic Bacon Pasta}
336
- \item \href{https://www.theburntbuttertable.com/creamy-bacon-pasta-sauce/}{Creamy Bacon Pasta Sauce}
337
- \item \dots
338
- \end{itemize}
339
-
340
- Would you like me to get the content of any of these websites?
341
- \end{assistantbox}
342
- \end{minipage}
343
- \caption{HASHIRU updating and deleting its memory based on user input. Continuation of Figure \ref{fig:memory_example}.}
344
- \label{fig:memory_example2}
345
- \end{figure}
346
-
347
- \subsection{Example 2: Novelty Assessment via Coordinated Tool Use}
348
-
349
- \begin{figure}[h]
350
- \centering
351
- \begin{minipage}{0.95\textwidth}
352
-
353
- \begin{userbox}
354
- Is this idea novel: ``a dynamically managed, resource-aware hierarchical multi-agent system that prioritizes cost-efficient local models, integrates autonomous tool creation, and employs an internal economic model to balance performance, adaptability, and stability in intelligent task coordination''?
355
- \end{userbox}
356
-
357
- \begin{assistantbox}
358
- To determine if the idea is novel, I will break it down into smaller parts and search for each part. Then I will combine the results to see if the whole idea is novel.
359
-
360
- Here's the breakdown:
361
- \begin{itemize}
362
- \item Dynamically managed multi-agent system
363
- \item Resource-aware hierarchical multi-agent system
364
- \item Cost-efficient local models in multi-agent systems
365
- \item Autonomous tool creation in multi-agent systems
366
- \item Internal economic model for multi-agent systems
367
- \end{itemize}
368
-
369
- I will use the \texttt{GoogleSearchTool} to search for each of these parts.
370
-
371
- \texttt{Invoking GoogleSearchTool with \{'query': 'dynamically managed multi-agent system'\}}
372
-
373
- \texttt{Invoking GoogleSearchTool with \{'query': 'resource-aware hierarchical multi-agent system'\}}
374
-
375
- \texttt{Invoking GoogleSearchTool with \{'query': 'cost-efficient local models in multi-agent systems'\}}
376
-
377
- \texttt{Invoking GoogleSearchTool with \{'query': 'autonomous tool creation in multi-agent systems'\}}
378
-
379
- \texttt{Invoking GoogleSearchTool with \{'query': 'internal economic model for multi-agent systems'\}}
380
-
381
- The search results indicate that each of the individual components of the idea has been explored in existing research. However, the novelty lies in the specific combination and integration of these elements.
382
-
383
- To further assess the novelty, I will use the \texttt{ArxivTool} to search for papers that combine multiple aspects of the idea.
384
-
385
- \texttt{Invoking ArxivTool with \{'query': 'dynamically managed resource-aware hierarchical multi-agent system'\}}
386
-
387
- \texttt{Invoking ArxivTool with \{'query': 'multi-agent system autonomous tool creation economic model'\}}
388
-
389
- The ArxivTool results show some papers that touch on combinations of these concepts, but none seem to address the entire idea as presented. Specifically, the combination of dynamic management, resource-awareness, hierarchical structure, cost-efficient local models, autonomous tool creation, and an internal economic model appears to be a novel synthesis.
390
-
391
- \textbf{Therefore, based on the search results, the idea seems novel.}
392
- \end{assistantbox}
393
- \caption{HASHIRU assessing the novelty of a complex idea by breaking it down into components and searching for each part.}
394
- \end{minipage}
395
- \label{fig:novelty_assessment}
396
- \end{figure}
397
-
398
- \clearpage
399
- \twocolumn
400
-
401
- \begin{figure}
402
- \centering
403
- \begin{minipage}{0.95\textwidth}
404
- \begin{userbox}
405
- Which papers mention the ideas?
406
- \end{userbox}
407
-
408
- \begin{assistantbox}
409
- To provide a comprehensive answer, I will analyze the summaries of the papers returned by the ArxivTool and GoogleSearchTool to identify which papers mention the key ideas in your prompt.
410
-
411
- \textbf{The key ideas are:}
412
- \begin{itemize}
413
- \item Dynamically managed
414
- \item Resource-aware
415
- \item Hierarchical
416
- \item Cost-efficient local models
417
- \item Autonomous tool creation
418
- \item Internal economic model
419
- \end{itemize}
420
-
421
- I will go through each paper and identify which of these ideas are mentioned.
422
-
423
- \texttt{Invoking ArxivTool with \{'query': 'dynamically managed resource-aware hierarchical multi-agent system'\}} \\
424
- \texttt{Invoking ArxivTool with \{'query': 'multi-agent system autonomous tool creation economic model'\}}
425
-
426
- \textbf{ArxivTool Results (Query: ``dynamically managed resource-aware hierarchical multi-agent system'')}
427
-
428
- \begin{itemize}
429
- \item \textit{Hierarchical stock assessment methods improve management performance in multi-species, data-limited fisheries} \\
430
- Hierarchical: Yes \\
431
- Dynamically managed: Implied through ``management performance''
432
-
433
- \item \textit{Hierarchical Reinforcement Learning for Optimal Agent Grouping in Cooperative Systems} \\
434
- Hierarchical: Yes \\
435
- Dynamically managed: Yes, through ``dynamic and optimal policy adjustments.''
436
-
437
- \item \textit{Efficient Hierarchical Storage Management Framework Empowered by Reinforcement Learning} \\
438
- Hierarchical: Yes \\
439
- Cost-efficient local models: Implied through ``efficient'' \\
440
- Dynamically managed: Yes, through ``dynamic migration policy''
441
-
442
- \item \textit{Autonomous Deep Agent} \\
443
- Hierarchical: Yes \\
444
- Autonomous tool creation: Yes, via ``Autonomous API \& Tool Creation (AATC) system'' \\
445
- Dynamically managed: Yes, ``dynamically decomposes high-level objectives''
446
-
447
- \item \textit{Adaptive Network Intervention for Complex Systems: A Hierarchical Graph Reinforcement Learning Approach} \\
448
- Hierarchical: Yes \\
449
- Dynamically managed: Yes, through ``dynamic networks''
450
- \end{itemize}
451
-
452
- \textbf{ArxivTool Results (Query: ``multi-agent system autonomous tool creation economic model'')}
453
-
454
- \begin{itemize}
455
- \item \textit{Autonomics: an autonomous and intelligent economic platform and next generation money tool} \\
456
- Autonomous tool creation: Implied \\
457
- Internal economic model: Yes
458
-
459
- \item \textit{From Single Agent to Multi-Agent: Improving Traffic Signal Control} \\
460
- No relevant keywords mentioned.
461
-
462
- \item \textit{Autonomous Deep Agent} \\
463
- Hierarchical: Yes \\
464
- Autonomous tool creation: Yes
465
-
466
- \item \textit{Beyond the Sum: Unlocking AI Agents Potential Through Market Forces} \\
467
- Internal economic model: Yes
468
-
469
- \item \textit{Particle-Hole Creation in Condensed Matter: A Conceptual Framework for Modeling Money-Debt Dynamics in Economics} \\
470
- Internal economic model: Yes
471
- \end{itemize}
472
-
473
- \end{assistantbox}
474
- \caption{Continuation of Figure \ref{fig:novelty_assessment}. HASHIRU analyzing the summaries of the papers returned by the ArxivTool and GoogleSearchTool to identify which papers mention the key ideas in the user prompt.}
475
- \end{minipage}
476
-
477
- \end{figure}
478
-
479
- \end{document}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
paper/references.bib DELETED
@@ -1,456 +0,0 @@
1
- @article{shen2023hugginggpt,
2
- title = {HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face},
3
- author = {Shen, Yongliang and Song, Kaitao and Tan, Xu and Li, Dongsheng and Lu, Weiming and Zhuang, Yueting},
4
- journal = {arXiv preprint arXiv:2303.17580},
5
- year = {2023}
6
- }
7
-
8
- @article{wu2023autogen,
9
- title = {{AutoGen}: Enabling Next-Gen {LLM} Applications via Multi-Agent Conversation},
10
- author = {Wu, Qingyun and Bansal, Gagan and Zhang, Jieyu and Wu, Yiran and Li, Beibin and Zhu, Erkang and Jiang, Li and Zhang, Xiaoyun and Zhang, Shaokun and Liu, Jiale and Awadallah, Ahmed H. and White, Ryen W. and Burger, Doug and Wang, Chi},
11
- journal = {arXiv preprint arXiv:2308.08155},
12
- year = {2023}
13
- }
14
-
15
- @article{lopez2024nyt,
16
- title={NYT-Connections: A Deceptively Simple Text Classification Task that Stumps System-1 Thinkers},
17
- author={Lopez, Angel Yahir Loredo and McDonald, Tyler and Emami, Ali},
18
- journal={arXiv preprint arXiv:2412.01621},
19
- year={2024}
20
- }
21
-
22
- @misc{wang2020minilmdeepselfattentiondistillation,
23
- title={MiniLM: Deep Self-Attention Distillation for Task-Agnostic Compression of Pre-Trained Transformers},
24
- author={Wenhui Wang and Furu Wei and Li Dong and Hangbo Bao and Nan Yang and Ming Zhou},
25
- year={2020},
26
- eprint={2002.10957},
27
- archivePrefix={arXiv},
28
- primaryClass={cs.CL},
29
- url={https://arxiv.org/abs/2002.10957},
30
- }
31
-
32
- @misc{gemini25flash,
33
- title = {Gemini 2.5 Flash: Model Card, API, and Announcement},
34
- author = {{Google DeepMind} and {Google AI}},
35
- year = {2025},
36
- howpublished = {\url{https://developers.googleblog.com/en/start-building-with-gemini-25-flash/}},
37
- note = {See also:
38
- \url{https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/gemini-2.5-flash-preview-04-17?inv=1&invt=AbxICQ},
39
- \url{https://ai.google.dev/gemini-api/docs/models}. Accessed: 2025-05-11}
40
- }
41
-
42
-
43
- @inproceedings{yao2022react,
44
- title = {{ReAct}: Synergizing Reasoning and Acting in Language Models},
45
- author = {Yao, Shunyu and Zhao, Jeffrey and Yu, Dian and Du, Nan and Shafran, Izhak and Narasimhan, Karthik and Cao, Yuan},
46
- booktitle = {International Conference on Learning Representations (ICLR)},
47
- year = {2023},
48
- note = {arXiv:2210.03629}
49
- }
50
-
51
- @article{schick2023toolformer,
52
- title = {Toolformer: Language Models Can Teach Themselves to Use Tools},
53
- author = {Schick, Timo and Dwivedi-Yu, Jane and Bitton, Yonatan and Yuan, Xi and Camburu, Oana-Maria and Houlsby, Neil},
54
- journal = {arXiv preprint arXiv:2302.04761},
55
- year = {2023}
56
- }
57
-
58
- @article{ong2024routellm,
59
- title = {{RouteLLM}: Learning to Route {LLMs} with Preference Data},
60
- author = {Ong, Isaac and Almahairi, Amjad and Wu, Vincent and Chiang, Wei-Lin and Wu, Tianhao and Gonzalez, Joseph E. and Kadous, M. Waleed and Stoica, Ion},
61
- journal = {arXiv preprint arXiv:2406.18665},
62
- year = {2024}
63
- }
64
-
65
- @article{fourney2024magentic,
66
- title = {Magentic-One: A Generalist Multi-Agent System for Solving Complex Tasks},
67
- author = {Fourney, Adam and Bansal, Gagan and Mozannar, Hussein and Tan, Cheng and et al.},
68
- journal = {arXiv preprint arXiv:2411.04468},
69
- year = {2024}
70
- }
71
-
72
- @inproceedings{cobbe2021gsm8k,
73
- title = {Training Verifiers to Solve Math Word Problems},
74
- author = {Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
75
- booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
76
- year = {2021},
77
- note = {Dataset introduced: GSM8K (Grade School Math 8K)}
78
- }
79
-
80
- @inproceedings{patel2021svamp,
81
- title = {Are {NLP} Models really able to Solve Simple Math Word Problems?},
82
- author = {Patel, Arkil and Bhattamishra, Satwik and Goyal, Navin},
83
- booktitle = {Conference of the North American Chapter of the Association for Computational Linguistics (NAACL)},
84
- year = {2021},
85
- note = {Introduces the SVAMP challenge dataset}
86
- }
87
-
88
- @misc{phan2025humanitysexam,
89
- title = {Humanity's Last Exam},
90
- author = {Phan, Long and Gatti, Alice and Han, Ziwen and others},
91
- year = {2025},
92
- eprint = {2501.14249},
93
- archivePrefix = {arXiv},
94
- primaryClass = {cs.LG},
95
- url = {https://arxiv.org/abs/2501.14249}
96
- }
97
-
98
-
99
-
100
- @article{chen2021codex,
101
- title = {Evaluating Large Language Models Trained on Code},
102
- author = {Chen, Mark and Tworek, Jerry and Jun, Heewoo and Yuan, Qiming and Ponde de Oliveira Pinto, Henrique and Kaplan, Jared and Edwards, Harri and Burda, Yuri and Joseph, Nicholas and Brockman, Greg and Ray, Alex and Puri, Raul and Krueger, Gretchen and Petrov, Michael and Khlaaf, Heidy and Sastry, Girish and Mishkin, Pamela and Chan, Brooke and Gray, Scott and Ryder, Nick and Pavlov, Mikhail and Power, Alethea and Kaiser, Lukasz and Bavarian, Mohammad and Winter, Clemens and Tillet, Philippe and Such, Felipe and Cummings, Dave and Plappert, Matthias and Chantzis, Fotios and Barnes, Elizabeth and Herbert-Voss, Ariel and Guss, William and Nichol, Alex and Paino, Alex and Tezak, Nikolas and Tang, Jie and Babuschkin, Igor and Balaji, Suchir and Jain, Shantanu and Saunders, William and Hesse, Christopher and Carr, Andrew N. and Leike, Jan and Achiam, Josh and Misra, Vedant and Morikawa, Evan and Radford, Alec and Knight, Matthew and Brundage, Miles and Murati, Mira and Mayer, Katie and Welinder, Peter and McGrew, Bob and Amodei, Dario and McCandlish, Sam and Sutskever, Ilya and Zaremba, Wojciech},
103
- journal = {arXiv preprint arXiv:2107.03374},
104
- year = {2021},
105
- note = {OpenAI Codex paper; introduced HumanEval benchmark}
106
- }
107
-
108
- @article{pai2024codocbench,
109
- title = {{CoDocBench}: A Dataset for Code-Documentation Alignment in Software Maintenance},
110
- author = {Pai, Kunal and Devanbu, Premkumar and Ahmed, Toufique},
111
- journal = {arXiv preprint arXiv:2502.00519},
112
- year = {2024}
113
- }
114
-
115
- @inproceedings{kamienski2021pysstubs,
116
- title = {{PySStuBs}: Characterizing Single-Statement Bugs in Popular Open-Source Python Projects},
117
- author = {Kamienski, Arthur V. and Palechor, Luisa and Bezemer, Cor-Paul and Hindle, Abram},
118
- booktitle = {IEEE/ACM International Conference on Mining Software Repositories (MSR)},
119
- year = {2021}
120
- }
121
-
122
- @article{brown2020language,
123
- title={Language models are few-shot learners},
124
- author={Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others},
125
- journal={Advances in neural information processing systems},
126
- volume={33},
127
- pages={1877--1901},
128
- year={2020}
129
- }
130
-
131
- @inproceedings{devlin2019bert,
132
- title={Bert: Pre-training of deep bidirectional transformers for language understanding},
133
- author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
134
- booktitle={Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers)},
135
- pages={4171--4186},
136
- year={2019}
137
- }
138
-
139
- @article{raffel2020exploring,
140
- title={Exploring the limits of transfer learning with a unified text-to-text transformer},
141
- author={Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J},
142
- journal={Journal of machine learning research},
143
- volume={21},
144
- number={140},
145
- pages={1--67},
146
- year={2020}
147
- }
148
-
149
- @article{dorri2018multi,
150
- title={Multi-agent systems: A survey},
151
- author={Dorri, Ali and Kanhere, Salil S and Jurdak, Raja},
152
- journal={IEEE Access},
153
- volume={6},
154
- pages={28573--28593},
155
- year={2018},
156
- publisher={IEEE}
157
- }
158
-
159
- @book{wooldridge2009introduction,
160
- title={An introduction to multiagent systems},
161
- author={Wooldridge, Michael},
162
- year={2009},
163
- publisher={John Wiley \& Sons}
164
- }
165
-
166
- @article{boiko2023emergent,
167
- title={Emergent autonomous scientific research capabilities of large language models},
168
- author={Boiko, Daniil A and MacKnight, Robert and Gomes, Gabe},
169
- journal={arXiv preprint arXiv:2304.05332},
170
- year={2023}
171
- }
172
-
173
- @inproceedings{gaston2005agenta,
174
- title={Agent-organized networks for dynamic team formation},
175
- author={Gaston, Matthew E and DesJardins, Marie},
176
- booktitle={Proceedings of the fourth international joint conference on Autonomous agents and multiagent systems},
177
- pages={230--237},
178
- year={2005}
179
- }
180
-
181
- @misc{zhou2023agents,
182
- title={Agents: An Open-source Framework for Large Language Model based Autonomous Agents},
183
- author={Wangchunshu Zhou and Jianshu Chen and Jialong Wu and Yiheng Xu and Kexin Wang and Jintian Zhang and Yuan Gao and Zhiyong Wu and Kevin Tian and Yubo Feng and Linyi Yang and Bokai Quan and Cong Yu and Yuhang Wang and Shishen Lan and Yan Wang and Hong-Cheng Guo and Chaoyu Chen and Tianxiang Sun and Jin Xiong and Yi Lu and Peng Li and Lichao Sun and Lifan Yuan and Hang Li and Xiangang Li},
184
- year={2023},
185
- eprint={2309.07870},
186
- archivePrefix={arXiv},
187
- primaryClass={cs.AI},
188
- url={https://arxiv.org/abs/2309.07870}
189
- }
190
-
191
-
192
- @misc{openai_func_calling,
193
- title = {Function calling},
194
- author = {{OpenAI}},
195
- year = {2023},
196
- howpublished = {OpenAI API Documentation},
197
- url = {https://platform.openai.com/docs/guides/function-calling},
198
- note = {Accessed: 2025-05-01}
199
- }
200
-
201
- @misc{wang2023voyager,
202
- title={{Voyager}: An Open-Ended Embodied Agent with Large Language Models},
203
- author={Guanzhi Wang and Yuqi Xie and Yunfan Jiang and Ajay Mandlekar and Chaowei Xiao and Yuke Zhu and Linxi Fan and Anima Anandkumar},
204
- year={2023},
205
- eprint={2305.16291},
206
- archivePrefix={arXiv},
207
- primaryClass={cs.AI},
208
- url={https://arxiv.org/abs/2305.16291}
209
- }
210
-
211
- @book{russell2010artificial,
212
- title={Artificial intelligence: a modern approach},
213
- author={Russell, Stuart J. and Norvig, Peter},
214
- year={2010},
215
- edition={3rd},
216
- publisher={Prentice Hall Press},
217
- address={Upper Saddle River, NJ, USA}
218
- }
219
-
220
- @article{shoham1994agent,
221
- author = {Yoav Shoham},
222
- title = {Agent-oriented programming},
223
- journal = {Artificial Intelligence},
224
- volume = {60},
225
- number = {1},
226
- pages = {51--92},
227
- year = {1993},
228
- publisher = {Elsevier}
229
- }
230
-
231
- @misc{wang2023survey,
232
- title={A Survey on Large Language Model based Autonomous Agents},
233
- author={Lei Wang and Chen Ma and Xueyang Feng and Zeyu Zhang and Hao Yang and Jingsen Zhang and Zhiyuan Chen and Jiakai Tang and Xu Chen and Yankai Lin and Wayne Xin Zhao and Zhewei Wei and Ji-Rong Wen},
234
- year={2023},
235
- eprint={2308.11432},
236
- archivePrefix={arXiv},
237
- primaryClass={cs.AI}
238
- }
239
-
240
- @misc{xi2023rise,
241
- title={The Rise and Potential of Large Language Model Based Agents: A Survey},
242
- author={Zhiheng Xi and Wenxiang Chen and Xin Guo and Wei He and Yiwen Ding and Boyang Hong and Ming Zhang and Junzhe Wang and Senjie Jin and Enyu Zhou and Rui Zheng and Xiaoran Fan and Xiao Wang and Limao Xiong and Linyi Yang and Ting Ruan and Yongquan Yang and Peng Li and Yitao Chang and Yanlin Wang},
243
- year={2023},
244
- eprint={2309.07864},
245
- archivePrefix={arXiv},
246
- primaryClass={cs.AI}
247
- }
248
-
249
-
250
- @inproceedings{park2023generative,
251
- author = {Park, Joon Sung and O'Brien, Joseph C. and Cai, Carrie J. and Morris, Meredith Ringel and Liang, Percy and Bernstein, Michael S.},
252
- title = {Generative Agents: Interactive Simulacra of Human Behavior},
253
- year = {2023},
254
- isbn = {9798400701320},
255
- publisher = {Association for Computing Machinery},
256
- address = {New York, NY, USA},
257
- url = {https://doi.org/10.1145/3586183.3606763},
258
- doi = {10.1145/3586183.3606763},
259
- booktitle = {The 36th Annual ACM Symposium on User Interface Software and Technology (UIST '23)},
260
- pages = {1--22},
261
- numpages = {22},
262
- location = {San Francisco, CA, USA},
263
- series = {UIST '23}
264
- }
265
-
266
- @misc{ollama,
267
- title = {Ollama},
268
- author = {{Ollama Team}},
269
- howpublished = {\url{https://ollama.com/}},
270
- year = {2023},
271
- note = {Accessed: 2025-05-01}
272
- }
273
-
274
- @misc{anthropic2024claude,
275
- title = {The {Claude 3} Model Family: {Opus, Sonnet, Haiku}},
276
- author = {{Anthropic}},
277
- year = {2024},
278
- month = {March},
279
- howpublished = {Model Card},
280
- url = {https://www-cdn.anthropic.com/de8ba9b01c9ab7cbabf5c33b80b7bbc618857627/Model_Card_Claude_3.pdf},
281
- note = {Accessed: 2025-05-01}
282
- }
283
-
284
- @misc{openai2023gpt4,
285
- title={GPT-4 Technical Report},
286
- author={OpenAI},
287
- year={2023},
288
- eprint={2303.08774},
289
- archivePrefix={arXiv},
290
- primaryClass={cs.CL},
291
- url={https://arxiv.org/abs/2303.08774}
292
- }
293
-
294
- @misc{langgraph,
295
- title = {LangGraph: A Framework for Agentic Workflows},
296
- author = {LangChain},
297
- year = {2024},
298
- howpublished= {\url{https://www.langchain.com/langgraph}},
299
- note = {Accessed: May 1, 2025}
300
- }
301
-
302
- @book{clearwater1996market,
303
- title = {Market-Based Control: A Paradigm for Distributed Resource Allocation},
304
- editor = {Scott H. Clearwater},
305
- publisher = {World Scientific},
306
- year = {1996}
307
- }
308
-
309
- @article{valckenaers2005trends,
310
- title={Guest Editors' Introduction: Intelligent Control in the Manufacturing Supply Chain},
311
- author={McFarlane, Duncan and Mar{\'\i}k, Vladim{\'\i}r and Valckenaers, Paul},
312
- journal={IEEE Intelligent Systems},
313
- volume={20},
314
- number={1},
315
- pages={24--26},
316
- year={2005},
317
- publisher={IEEE}
318
- }
319
-
320
-
321
- @article{horling2004survey,
322
- title={A survey of multi-agent organizational paradigms},
323
- author={Horling, Bryan and Lesser, Victor},
324
- journal={The Knowledge engineering review},
325
- volume={19},
326
- number={4},
327
- pages={281--316},
328
- year={2004},
329
- publisher={Cambridge University Press}
330
- }
331
-
332
-
333
- @inproceedings{gaston2005agentb,
334
- title={Agent-organized networks for multi-agent production and exchange},
335
- author={Gaston, Matthew E and DesJardins, Marie},
336
- booktitle={Proceedings of the 20th national conference on Artificial intelligence-Volume 1},
337
- pages={77--82},
338
- year={2005}
339
- }
340
-
341
- @misc{zhang2023building,
342
- title={Building Cooperative Embodied Agents Modularly with Large Language Models},
343
- author={Hongxin Zhang and Weihua Du and Jiaming Shan and Qinhong Zhou and Yilun Du and Joshua B. Tenenbaum and Tianmin Shu and Chuang Gan},
344
- year={2023},
345
- eprint={2307.02485},
346
- archivePrefix={arXiv},
347
- primaryClass={cs.AI}
348
- }
349
-
350
- @misc{parisi2022talm,
351
- title={TALM: Tool Augmented Language Models},
352
- author={Aaron Parisi and Yao Zhao and Noah Fiedel},
353
- year={2022},
354
- eprint={2205.12255},
355
- archivePrefix={arXiv},
356
- primaryClass={cs.CL}
357
- }
358
-
359
-
360
- @misc{crewai,
361
- title = {CrewAI},
362
- author = {{CrewAI Inc.}},
363
- year = {2025},
364
- howpublished = {\url{https://www.crewai.com/}},
365
- note = {Accessed: 2025-05-01}
366
- }
367
-
368
-
369
- @article{qian2023communicative,
370
- title={Chatdev: Communicative agents for software development},
371
- author={Qian, Chen and Liu, Wei and Liu, Hongzhang and Chen, Nuo and Dang, Yufan and Li, Jiahao and Yang, Cheng and Chen, Weize and Su, Yusheng and Cong, Xin and others},
372
- journal={arXiv preprint arXiv:2307.07924},
373
- year={2023}
374
- }
375
-
376
- @article{wang2023decision,
377
- title={Decision-making driven by driver intelligence and environment reasoning for high-level autonomous vehicles: a survey},
378
- author={Wang, Yuning and Jiang, Junkai and Li, Shangyi and Li, Ruochen and Xu, Shaobing and Wang, Jianqiang and Li, Keqiang},
379
- journal={IEEE Transactions on Intelligent Transportation Systems},
380
- volume={24},
381
- number={10},
382
- pages={10362--10381},
383
- year={2023},
384
- publisher={IEEE}
385
- }
386
-
387
- @misc{wen2024benchmarkingcomplexinstructionfollowingmultiple,
388
- title={Benchmarking Complex Instruction-Following with Multiple Constraints Composition},
389
- author={Bosi Wen and Pei Ke and Xiaotao Gu and Lindong Wu and Hao Huang and Jinfeng Zhou and Wenchuang Li and Binxin Hu and Wendy Gao and Jiaxin Xu and Yiming Liu and Jie Tang and Hongning Wang and Minlie Huang},
390
- year={2024},
391
- eprint={2407.03978},
392
- archivePrefix={arXiv},
393
- primaryClass={cs.CL},
394
- url={https://arxiv.org/abs/2407.03978},
395
- }
396
-
397
- @article{jiang2023mistral,
398
- title={{Mistral 7B}},
399
- author={Jiang, Albert Q. and Sablayrolles, Alexandre and Mensch, Arthur and Bamford, Chris and Chaplot, Devendra Singh and de las Casas, Diego and Bressand, Florian and Lengyel, Gianna and Lample, Guillaume and Saulnier, Lucile and Lavaud, L{\'e}lio Renard and Lachaux, Marie-Anne and Stock, Pierre and Le Scao, Teven and Lavril, Thibaut and Wang, Thomas and Lacroix, Timoth{\'e}e and El Sayed, William},
400
- year={2023},
401
- eprint={2310.06825},
402
- archivePrefix={arXiv},
403
- primaryClass={cs.CL},
404
- url={https://arxiv.org/abs/2310.06825},
405
- }
406
-
407
- @article{llama3herd,
408
- title={{The Llama 3 Herd of Models}},
409
- author={{Meta Llama Team}},
410
- year={2024},
411
- eprint={2407.21783},
412
- archivePrefix={arXiv},
413
- primaryClass={cs.CL},
414
- url={https://arxiv.org/abs/2407.21783},
415
- note={arXiv:2407.21783}
416
- }
417
-
418
- @article{gemini1.5_report,
419
- title={{Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context}},
420
- author={{Gemini Team}},
421
- year={2024},
422
- eprint={2403.05530},
423
- archivePrefix={arXiv},
424
- primaryClass={cs.CL},
425
- url={https://arxiv.org/abs/2403.05530},
426
- note={arXiv:2403.05530}
427
- }
428
-
429
- @article{qwen2.5_report,
430
- title={{Qwen2.5 Technical Report}},
431
- author={{Qwen Team} and Yang, An and others},
432
- year={2024},
433
- eprint={2412.15115},
434
- archivePrefix={arXiv},
435
- primaryClass={cs.CL},
436
- url={https://arxiv.org/abs/2412.15115},
437
- note={arXiv:2412.15115}
438
- }
439
-
440
- @misc{qwen3_blog,
441
- title={{Qwen3: Think Deeper, Act Faster}},
442
- author={{Qwen Team}},
443
- howpublished={\url{https://qwenlm.github.io/blog/qwen3/}},
444
- year={2025}
445
- }
446
-
447
- @article{deepseekr1_report,
448
- title={{DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning}},
449
- author={{DeepSeek-AI and others}},
450
- year={2025},
451
- eprint={2501.12948},
452
- archivePrefix={arXiv},
453
- primaryClass={cs.CL},
454
- url={https://arxiv.org/abs/2501.12948},
455
- note={arXiv:2501.12948}
456
- }