saisha09 committed on
Commit
838a833
·
1 Parent(s): f8d05a7

Add cost_benefit.py for model selection using Ollama

Files changed (1)
  1. tools/cost_benefit.py +91 -0
tools/cost_benefit.py ADDED
 @@ -0,0 +1,91 @@
+import argparse
+import subprocess
+import time
+
+# Model info (placeholder values; more models to be added)
+MODEL_INFO = {
+    "mistral": {"size": 7, "token_cost": 0.002},
+    "llama": {"size": 13, "token_cost": 0.0025},
+    "deepseek": {"size": 1.3, "token_cost": 0.0015},
+    "gemini": {"size": 15, "token_cost": 0.003}
+}
+
+def run_model_ollama(model, prompt):
+    try:
+        start = time.time()
+        result = subprocess.run(
+            ["ollama", "run", model],
+            input=prompt.encode(),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            timeout=60
+        )
+        end = time.time()
+    except Exception:
+        return None
+
+    output = result.stdout.decode().strip()
+    duration = end - start  # Wall-clock generation time in seconds
+    token_count = len(output.split())  # Approximate number of tokens generated
+    tokens_per_sec = token_count / duration if duration > 0 else 0  # Tokens generated per second
+    latency_ms = duration * 1000
+    token_cost = MODEL_INFO[model]["token_cost"] * token_count  # Cost of all the tokens generated
+
+    return {
+        "model": model,
+        "latency_ms": latency_ms,
+        "tokens_sec": tokens_per_sec,
+        "token_cost": token_cost,
+        "output": output
+    }
+
+def get_best_model(prompt, weights, models=("mistral", "llama", "deepseek", "gemini")):
+    results = []
+    for model in models:
+        res = run_model_ollama(model, prompt)
+        if not res:
+            continue
+
+        # Weighted cost; the 1.0 baseline keeps the denominator positive
+        size = MODEL_INFO[model]["size"]
+        cost_score = 1.0 + weights["w_lat"] * res["latency_ms"] + \
+            weights["w_size"] * size + weights["w_token_cost"] * res["token_cost"]
+        benefit_score = weights["w_speed"] * res["tokens_sec"]
+        decision_score = benefit_score / cost_score
+
+        res["decision_score"] = decision_score
+        results.append(res)
+
+    if not results:
+        return None
+
+    best = max(results, key=lambda x: x["decision_score"])
+    return best
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Choose the best Ollama model for a task")
+    parser.add_argument('--prompt', required=True, help='The task or question to ask the models')
+    parser.add_argument('--latency', type=int, default=3, help='Priority for latency (1-5)')
+    parser.add_argument('--size', type=int, default=3, help='Priority for model size (1-5)')
+    parser.add_argument('--cost', type=int, default=3, help='Priority for token cost (1-5)')
+    parser.add_argument('--speed', type=int, default=3, help='Priority for tokens/sec (1-5)')
+    args = parser.parse_args()
+
+    # Scale weights from priorities; these scalings are placeholders and can be redefined
+    weights = {
+        "w_lat": 0.002 * args.latency,
+        "w_size": 0.1 * args.size,
+        "w_token_cost": 100 * args.cost,
+        "w_speed": 0.01 * args.speed
+    }
+
+    best = get_best_model(args.prompt, weights)
+    if best is None:
+        raise SystemExit("No models succeeded.")
+
+    print(f"\nBest Model: {best['model']}")
+    print(f"Decision Score: {round(best['decision_score'], 4)}")
+    print(f"Latency (ms): {round(best['latency_ms'], 2)}")
+    print(f"Tokens/sec: {round(best['tokens_sec'], 2)}")
+    print(f"Token Cost ($): {round(best['token_cost'], 5)}")
+    print(f"\nOutput:\n{best['output']}")
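For reference, a minimal usage sketch (not part of the commit). It assumes the four listed models have already been pulled locally with `ollama pull <model>`, that the repo root is on the import path, and it reuses the same priority-to-weight scaling the `__main__` block applies; the example prompt is hypothetical. The script can also be run directly, e.g. `python tools/cost_benefit.py --prompt "Explain recursion in one paragraph." --latency 5 --speed 4`.

# Usage sketch (hypothetical): drive the selector as a library instead of via the CLI.
from tools.cost_benefit import get_best_model

# Mirror the priority-to-weight scaling from the __main__ block, with every
# priority left at its default of 3.
weights = {
    "w_lat": 0.002 * 3,       # latency priority
    "w_size": 0.1 * 3,        # model size priority
    "w_token_cost": 100 * 3,  # token cost priority
    "w_speed": 0.01 * 3,      # tokens/sec priority
}

best = get_best_model("Explain recursion in one paragraph.", weights)
if best is not None:
    print(best["model"], round(best["decision_score"], 4))
else:
    print("No models succeeded.")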