saisha09 committed on
Commit 800db6b · 1 Parent(s): 82a76a7

updated cost_benefit

Files changed (1)
  1. cost_benefit.py +33 -128
cost_benefit.py CHANGED
@@ -3,143 +3,48 @@ import subprocess
  import time
  import requests

- # Model info with both ollama and API usage
- MODEL_INFO = {
-     "mistral": {
-         "size": 7,
-         "token_cost": 0.002,
-         "use_api": False
-     },
-     "llama": {
-         "size": 13,
-         "token_cost": 0.0025,
-         "use_api": False
-     },
-     "deepseek": {
-         "size": 1.3,
-         "token_cost": 0.0015,
-         "use_api": False,
-         "api": ".."  # Example API
-     },
-     "gemini": {
-         "size": 15,
-         "token_cost": 0.003,
-         "use_api": False,
-         "api": ".."
-     }
- }
-
- def run_model_ollama(model, prompt):
-     try:
-         start = time.time()
-         result = subprocess.run(
-             ["ollama", "run", model],
-             input=prompt.encode(),
-             stdout=subprocess.PIPE,
-             stderr=subprocess.PIPE,
-             timeout=60
-         )
-         end = time.time()
-     except Exception:
-         return None
-
-     output = result.stdout.decode().strip()
-     duration = end - start
-     token_count = len(output.split())
-     tokens_per_sec = token_count / duration if duration > 0 else 0
-     latency_ms = duration * 1000
-     token_cost = MODEL_INFO[model]["token_cost"] * token_count
-
-     return {
-         "model": model,
-         "latency_ms": latency_ms,
-         "tokens_sec": tokens_per_sec,
-         "token_cost": token_cost,
-         "output": output
-     }
-
- def run_model_api(model, prompt):
-     try:
-         start = time.time()
-         response = requests.post(
-             MODEL_INFO[model]["api"],
-             json={"prompt": prompt},
-             timeout=60
-         )
-         end = time.time()
-         response.raise_for_status()
-         output = response.json().get("response", "")  # Adjust key as needed
-     except Exception:
-         return None
-
-     duration = end - start
-     token_count = len(output.split())
-     tokens_per_sec = token_count / duration if duration > 0 else 0
-     latency_ms = duration * 1000
-     token_cost = MODEL_INFO[model]["token_cost"] * token_count
-
-     return {
-         "model": model,
-         "latency_ms": latency_ms,
-         "tokens_sec": tokens_per_sec,
-         "token_cost": token_cost,
-         "output": output
-     }
-
- def run_model(model, prompt):
-     if MODEL_INFO[model].get("use_api", False):
-         return run_model_api(model, prompt)
-     else:
-         return run_model_ollama(model, prompt)
-
- def get_best_model(prompt, weights, models=None):
-     if models is None:
-         models = list(MODEL_INFO.keys())
-
-     results = []
-     for model in models:
-         res = run_model(model, prompt)
-         if not res:
-             continue
-
-         size = MODEL_INFO[model]["size"]
-         cost_score = 1 + weights["w_lat"] * res["latency_ms"] + \
-             weights["w_size"] * size + weights["w_token_cost"] * res["token_cost"]
-         benefit_score = weights["w_speed"] * res["tokens_sec"]
-         decision_score = benefit_score / cost_score
-
-         res["decision_score"] = decision_score
-         results.append(res)
-
-     if not results:
-         return "No models succeeded."
-
-     return max(results, key=lambda x: x["decision_score"])
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser(description="Choose best model for a task")
-     parser.add_argument('--prompt', required=True, help='The task or question to ask the models')
-     parser.add_argument('--latency', type=int, default=3, help='Priority for latency (1–5)')
-     parser.add_argument('--size', type=int, default=3, help='Priority for model size (1–5)')
-     parser.add_argument('--cost', type=int, default=3, help='Priority for token cost (1–5)')
-     parser.add_argument('--speed', type=int, default=3, help='Priority for tokens/sec (1–5)')
-     args = parser.parse_args()
-
-     weights = {
-         "w_lat": 0.002 * args.latency,
-         "w_size": 0.1 * args.size,
-         "w_token_cost": 100 * args.cost,
-         "w_speed": 0.01 * args.speed
-     }
-
-     best = get_best_model(args.prompt, weights)
-
-     if isinstance(best, str):
-         print(best)
-     else:
-         print(f"\nBest Model: {best['model']}")
-         print(f"Decision Score: {round(best['decision_score'], 4)}")
-         print(f"Latency (ms): {round(best['latency_ms'], 2)}")
-         print(f"Tokens/sec: {round(best['tokens_sec'], 2)}")
-         print(f"Token Cost ($): {round(best['token_cost'], 5)}")
-         print(f"\nOutput:\n{best['output']}")
+ def get_best_model(weights: dict, runtime_env: str) -> dict:
+     # placeholders
+     models = {
+         "llama3.2": {"size": 2.5, "token_cost": 0.0001, "speed": 30},
+         "mistral": {"size": 4.2, "token_cost": 0.0002, "speed": 50},
+         "gemini-2.0-flash": {"size": 6.1, "token_cost": 0.0005, "speed": 60},
+         "gemini-2.5-pro-preview-03-25": {"size": 8.2, "token_cost": 0.002, "speed": 45}
+     }
+
+     penalty = {
+         "gpu": 1.0,
+         "cpu-local": 2.0,
+         "cloud-only": 1.5
+     }
+
+     best_model = None
+     best_score = float("-inf")  # Track max score
+
+     for model, metrics in models.items():
+         p = penalty.get(runtime_env, 2.0)
+
+         cost_score = (
+             weights["w_size"] * metrics["size"] * p +
+             weights["w_token_cost"] * metrics["token_cost"] * p +
+             weights["w_speed"] * (100 - metrics["speed"])
+         )
+         benefit_score = weights["w_speed"] * metrics["speed"]
+
+         decision_score = benefit_score / cost_score if cost_score != 0 else 0
+
+         if decision_score > best_score:
+             best_score = decision_score
+             best_model = model
+
+     if not best_model:
+         return "No suitable model found"
+
+     return {
+         "model": best_model,
+         "score": best_score,
+         "token_cost": models[best_model]["token_cost"],
+         "tokens_sec": models[best_model]["speed"],
+         "output": f"Sample output from {best_model}"
+     }
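
Usage note: the new get_best_model() takes only a weights dict and a runtime_env string and no longer reads CLI arguments. The following is a minimal calling sketch, not part of the commit. The weight keys ("w_size", "w_token_cost", "w_speed") and the runtime_env values ("gpu", "cpu-local", "cloud-only") come from the function above; the numeric weight values loosely mirror the defaults of the removed __main__ block and, together with the print statements, are illustrative assumptions.

# Illustrative usage sketch (not part of the commit).
# Assumes cost_benefit.py is importable as a module; weight values are
# assumptions loosely based on the removed CLI defaults (0.1*3, 100*3, 0.01*3).
from cost_benefit import get_best_model

weights = {
    "w_size": 0.3,        # weight on model size (scaled by the runtime penalty)
    "w_token_cost": 300,  # weight on per-token cost (scaled by the runtime penalty)
    "w_speed": 0.03,      # weight on speed; appears in both cost and benefit terms
}

result = get_best_model(weights, runtime_env="cpu-local")

if isinstance(result, dict):
    print(f"Best model: {result['model']}")
    print(f"Decision score: {round(result['score'], 4)}")
    print(f"Token cost: {result['token_cost']}")
    print(f"Tokens/sec: {result['tokens_sec']}")
else:
    # get_best_model() returns a plain string when no model qualifies
    print(result)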