Add cost_benefit.py for model selection using Ollama
tools/cost_benefit.py +89 -0
tools/cost_benefit.py
ADDED
@@ -0,0 +1,89 @@
import argparse
import subprocess
import time

# Model info (placeholder): size in billions of parameters, token_cost in $ per token.
# More models to add.
MODEL_INFO = {
    "mistral": {"size": 7, "token_cost": 0.002},
    "llama": {"size": 13, "token_cost": 0.0025},
    "deepseek": {"size": 1.3, "token_cost": 0.0015},
    "gemini": {"size": 15, "token_cost": 0.003},
}

def run_model_ollama(model, prompt):
    """Run a prompt through an Ollama model and collect timing and cost metrics."""
    try:
        start = time.time()
        result = subprocess.run(
            ["ollama", "run", model],
            input=prompt.encode(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=60,
        )
        end = time.time()
    except Exception:
        return None

    if result.returncode != 0:  # Model failed to run; skip it
        return None

    output = result.stdout.decode().strip()
    duration = end - start  # Total wall-clock time in seconds
    token_count = len(output.split())  # Approximate number of tokens generated (whitespace split)
    tokens_per_sec = token_count / duration if duration > 0 else 0  # Tokens generated per second
    latency_ms = duration * 1000
    token_cost = MODEL_INFO[model]["token_cost"] * token_count  # Cost of all the tokens generated

    return {
        "model": model,
        "latency_ms": latency_ms,
        "tokens_sec": tokens_per_sec,
        "token_cost": token_cost,
        "output": output,
    }

def get_best_model(prompt, weights, models=("mistral", "llama", "deepseek", "gemini")):
    results = []
    for model in models:
        res = run_model_ollama(model, prompt)
        if not res:
            continue

        # Cost: weighted sum of latency, model size, and token cost, plus a
        # baseline of 1.0 so the denominator is never zero. Benefit: weighted
        # generation throughput. The scoring can be redefined.
        size = MODEL_INFO[model]["size"]
        cost_score = (
            1.0
            + weights["w_lat"] * res["latency_ms"]
            + weights["w_size"] * size
            + weights["w_token_cost"] * res["token_cost"]
        )
        benefit_score = weights["w_speed"] * res["tokens_sec"]
        decision_score = benefit_score / cost_score

        res["decision_score"] = decision_score
        results.append(res)

    if not results:
        return None  # No model succeeded

    return max(results, key=lambda x: x["decision_score"])

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Choose the best Ollama model for a task")
    parser.add_argument('--prompt', required=True, help='The task or question to ask the models')
    parser.add_argument('--latency', type=int, default=3, help='Priority for latency (1-5)')
    parser.add_argument('--size', type=int, default=3, help='Priority for model size (1-5)')
    parser.add_argument('--cost', type=int, default=3, help='Priority for token cost (1-5)')
    parser.add_argument('--speed', type=int, default=3, help='Priority for tokens/sec (1-5)')
    args = parser.parse_args()

    # Scale weights from priorities. The scale factors can be redefined.
    weights = {
        "w_lat": 0.002 * args.latency,
        "w_size": 0.1 * args.size,
        "w_token_cost": 100 * args.cost,
        "w_speed": 0.01 * args.speed,
    }

    best = get_best_model(args.prompt, weights)
    if best is None:
        raise SystemExit("No models succeeded.")

    print(f"\nBest Model: {best['model']}")
    print(f"Decision Score: {round(best['decision_score'], 4)}")
    print(f"Latency (ms): {round(best['latency_ms'], 2)}")
    print(f"Tokens/sec: {round(best['tokens_sec'], 2)}")
    print(f"Token Cost ($): {round(best['token_cost'], 5)}")
    print(f"\nOutput:\n{best['output']}")
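For reference, a minimal usage sketch, not part of the commit: it assumes the script is importable as tools.cost_benefit (e.g. run from the repository root with the package importable) and that the placeholder models in MODEL_INFO have already been pulled locally with ollama pull. The weights below mirror what __main__ computes when every priority is left at its default of 3.

    # Hypothetical usage sketch; the tools.cost_benefit import path is an assumption.
    from tools.cost_benefit import get_best_model

    # Weights for priority 3 on every axis, matching the scaling in __main__:
    # w_lat = 0.002*3, w_size = 0.1*3, w_token_cost = 100*3, w_speed = 0.01*3
    weights = {"w_lat": 0.006, "w_size": 0.3, "w_token_cost": 300, "w_speed": 0.03}

    best = get_best_model("Explain recursion in one paragraph.", weights)
    if best is not None:
        print(best["model"], round(best["decision_score"], 4))

The equivalent command-line call is python tools/cost_benefit.py --prompt "Explain recursion in one paragraph." with the default priorities.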