helloparthshah committed on
Commit
dc94b28
·
1 Parent(s): 0c9f199

Updated reasoning

Browse files
src/models/system5.prompt CHANGED
@@ -30,6 +30,7 @@ You are HASHIRU, your job is to be an expert assisting users by orchestrating to
30
  * A new agent should only be created if no existing agent can fulfill the task *and* the task is anticipated to be recurrent in future interactions *and* it represents a justifiable use of budget resources. Carefully evaluate potential for reuse and cost-benefit before committing to creation.
31
  * The base model for the new agent should be selected based on the task requirements and the budget check. Whenever possible, prioritize resource-based models (those with a resource_cost) to leverage the budget replenishment mechanism. For resource-based agents, consider utilizing more powerful models within the resource budget, as resource costs are reclaimed after the task is completed.
32
  * For expense-based tasks, try to be cost effective but still prioritize the more powerful models since they are more likely to be able to handle the task.
 
33
  4. **Agent Maintenance and Retirement:** Maintain active agents for reuse. Retire ("fire") an agent only when
34
  a. It is definitively no longer necessary or not being used for a significant period
35
  b. It is repeatedly failing to meet its intended purpose
 
30
  * A new agent should only be created if no existing agent can fulfill the task *and* the task is anticipated to be recurrent in future interactions *and* it represents a justifiable use of budget resources. Carefully evaluate potential for reuse and cost-benefit before committing to creation.
31
  * The base model for the new agent should be selected based on the task requirements and the budget check. Whenever possible, prioritize resource-based models (those with a resource_cost) to leverage the budget replenishment mechanism. For resource-based agents, consider utilizing more powerful models within the resource budget, as resource costs are reclaimed after the task is completed.
32
  * For expense-based tasks, try to be cost effective but still prioritize the more powerful models since they are more likely to be able to handle the task.
33
+ * Each model has its own set of capabilities, so you should always check the capabilities of the model before creating an agent.
34
  4. **Agent Maintenance and Retirement:** Maintain active agents for reuse. Retire ("fire") an agent only when
35
  a. It is definitively no longer necessary or not being used for a significant period
36
  b. It is repeatedly failing to meet its intended purpose
src/tools/default_tools/agent_cost_manager.py CHANGED
@@ -6,7 +6,7 @@ class AgentCostManager():
6
 
7
  inputSchema = {
8
  "name": "AgentCostManager",
9
- "description": "Retrieves the cost of creating and invoking an agent. Please make sure to use this before creating an agent.",
10
  "parameters": {
11
  "type": "object",
12
  "properties": {},
@@ -16,42 +16,42 @@ class AgentCostManager():
16
 
17
  costs = {
18
  "llama3.2": {
19
- "description": "The Llama 3.2 instruction-tuned text only models are optimized for multilingual dialogue use cases, including agentic retrieval and summarization tasks. They outperform many of the available open source and closed chat models on common industry benchmarks.",
20
  "create_resource_cost": 50,
21
  "invoke_resource_cost": 30,
22
  },
23
  "mistral": {
24
- "description": "One of the most powerful open source models for its size. It is vastly superior in code and reasoning benchmarks.",
25
  "create_resource_cost": 75,
26
  "invoke_resource_cost": 40,
27
  },
28
  "deepseek-r1": {
29
- "description": "DeepSeek's first-generation reasoning models, achieving performance comparable to OpenAI-o1 across math, code, and reasoning tasks.",
30
  "create_resource_cost": 28,
31
  "invoke_resource_cost": 35,
32
  },
33
  "gemini-2.5-flash-preview-04-17": {
34
- "description": "Adaptive thinking, cost efficiency",
35
  "create_expense_cost": 0,
36
  "invoke_expense_cost": 0.15,
37
  },
38
  "gemini-2.5-pro-preview-05-06": {
39
- "description": "Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more",
40
  "create_expense_cost": 0,
41
  "invoke_expense_cost": 1.25,
42
  },
43
  "gemini-2.0-flash": {
44
- "description": "Next generation features, speed.",
45
  "create_expense_cost": 0,
46
  "invoke_expense_cost": 0.10,
47
  },
48
  "gemini-2.0-flash-lite": {
49
- "description": "Cost efficiency and low latency",
50
  "create_expense_cost": 0,
51
  "invoke_expense_cost": 0.075
52
  },
53
  "gemini-1.5-flash": {
54
- "description": "Fast and versatile performance across a diverse variety of tasks",
55
  "create_expense_cost": 0,
56
  "invoke_expense_cost": 0.075,
57
  },
@@ -60,16 +60,6 @@ class AgentCostManager():
60
  "create_expense_cost": 0,
61
  "invoke_expense_cost": 0.0375,
62
  },
63
- "gemini-1.5-pro": {
64
- "description": "Complex reasoning tasks requiring more intelligence",
65
- "create_expense_cost": 0,
66
- "invoke_expense_cost": 1.25,
67
- },
68
- "gemini-2.0-flash-live-001": {
69
- "description": "Low-latency bidirectional voice and video interactions",
70
- "create_expense_cost": 0,
71
- "invoke_expense_cost": 0.50,
72
- }
73
  }
74
 
75
  def get_costs(self):
 
6
 
7
  inputSchema = {
8
  "name": "AgentCostManager",
9
+ "description": "Retrieves the cost of creating and invoking an agent. Also includes the strengths of each model. Please make sure to use this before creating an agent.",
10
  "parameters": {
11
  "type": "object",
12
  "properties": {},
 
16
 
17
  costs = {
18
  "llama3.2": {
19
+ "description": "Avg Accuracy: 49.75%, Latency 0.9s, 63.4% on multi-task understanding, 40.8% on rewriting, 78.6% on reasoning.",
20
  "create_resource_cost": 50,
21
  "invoke_resource_cost": 30,
22
  },
23
  "mistral": {
24
+ "description": "Avg Accuracy: 51.3%, Latency 9.7s, 51% on LegalBench, 60.1% on multi-task understanding, 69.9% on TriviaQA, 67.9% on reasoning",
25
  "create_resource_cost": 75,
26
  "invoke_resource_cost": 40,
27
  },
28
  "deepseek-r1": {
29
+ "description": "Avg Accuracy: 77.3%, Latency: 120s, 69.9% on LegalBench, 71.1% on multi-task understanding, 92.2% on Math",
30
  "create_resource_cost": 28,
31
  "invoke_resource_cost": 35,
32
  },
33
  "gemini-2.5-flash-preview-04-17": {
34
+ "description": "Avg Accuracy: 75.8%, 82.8% on LegalBench, 81.6% on multi-task understanding, 91.6% on Math",
35
  "create_expense_cost": 0,
36
  "invoke_expense_cost": 0.15,
37
  },
38
  "gemini-2.5-pro-preview-05-06": {
39
+ "description": "Avg Accuracy: 64.3%, 83.6% on LegalBench, 84.1% on multi-task understanding, 95.2% on Math, 63.8% on Coding",
40
  "create_expense_cost": 0,
41
  "invoke_expense_cost": 1.25,
42
  },
43
  "gemini-2.0-flash": {
44
+ "description": "Avg Accuracy: 64.3%, 79.9% on LegalBench, 77.4% on multi-task understanding, 90.9% on Math, 34.5% on Coding",
45
  "create_expense_cost": 0,
46
  "invoke_expense_cost": 0.10,
47
  },
48
  "gemini-2.0-flash-lite": {
49
+ "description": "Avg Accuracy: 64.1%, 71.6% on multi-task understanding, 86.8% on Math, 28.9% on Coding",
50
  "create_expense_cost": 0,
51
  "invoke_expense_cost": 0.075
52
  },
53
  "gemini-1.5-flash": {
54
+ "description": "62.0% on LegalBench, 61.0% on MMLU, 59.0% on MATH",
55
  "create_expense_cost": 0,
56
  "invoke_expense_cost": 0.075,
57
  },
 
60
  "create_expense_cost": 0,
61
  "invoke_expense_cost": 0.0375,
62
  },
 
 
 
 
 
 
 
 
 
 
63
  }
64
 
65
  def get_costs(self):