Commit: api link placeholder added to cost_benefit.py

cost_benefit.py  CHANGED  (+82 -26)
@@ -1,14 +1,32 @@
 import argparse
 import subprocess
 import time
+import requests
 
-# Model info
-# More models to add
+# Model info with both ollama and API usage
 MODEL_INFO = {
-    "mistral": {
-        "size": 7,
-        "token_cost": 0.002
-    }
+    "mistral": {
+        "size": 7,
+        "token_cost": 0.002,
+        "use_api": False
+    },
+    "llama": {
+        "size": 13,
+        "token_cost": 0.0025,
+        "use_api": False
+    },
+    "deepseek": {
+        "size": 1.3,
+        "token_cost": 0.0015,
+        "use_api": False,
+        "api": ".."  # Example API
+    },
+    "gemini": {
+        "size": 15,
+        "token_cost": 0.003,
+        "use_api": False,
+        "api": ".."
+    }
 }
 
 def run_model_ollama(model, prompt):
@@ -22,15 +40,43 @@ def run_model_ollama(model, prompt):
             timeout=60
         )
         end = time.time()
-    except Exception
+    except Exception:
         return None
 
     output = result.stdout.decode().strip()
-    duration = end - start
-    token_count = len(output.split())
-    tokens_per_sec = token_count / duration if duration > 0 else 0
-    latency_ms = duration * 1000
-    token_cost = MODEL_INFO[model]["token_cost"] * token_count
+    duration = end - start
+    token_count = len(output.split())
+    tokens_per_sec = token_count / duration if duration > 0 else 0
+    latency_ms = duration * 1000
+    token_cost = MODEL_INFO[model]["token_cost"] * token_count
+
+    return {
+        "model": model,
+        "latency_ms": latency_ms,
+        "tokens_sec": tokens_per_sec,
+        "token_cost": token_cost,
+        "output": output
+    }
+
+def run_model_api(model, prompt):
+    try:
+        start = time.time()
+        response = requests.post(
+            MODEL_INFO[model]["api"],
+            json={"prompt": prompt},
+            timeout=60
+        )
+        end = time.time()
+        response.raise_for_status()
+        output = response.json().get("response", "")  # Adjust key as needed
+    except Exception:
+        return None
+
+    duration = end - start
+    token_count = len(output.split())
+    tokens_per_sec = token_count / duration if duration > 0 else 0
+    latency_ms = duration * 1000
+    token_cost = MODEL_INFO[model]["token_cost"] * token_count
 
     return {
         "model": model,
@@ -40,16 +86,24 @@ def run_model_ollama(model, prompt):
         "output": output
     }
 
-def get_best_model(prompt, weights, models=["mistral", "llama", "deepseek", "gemini"]):
+def run_model(model, prompt):
+    if MODEL_INFO[model].get("use_api", False):
+        return run_model_api(model, prompt)
+    else:
+        return run_model_ollama(model, prompt)
+
+def get_best_model(prompt, weights, models=None):
+    if models is None:
+        models = list(MODEL_INFO.keys())
+
     results = []
     for model in models:
-        res = run_model_ollama(model, prompt)
+        res = run_model(model, prompt)
         if not res:
             continue
 
-        #Redefine
         size = MODEL_INFO[model]["size"]
-        cost_score = weights["w_lat"] * res["latency_ms"] + \
+        cost_score = 1 + weights["w_lat"] * res["latency_ms"] + \
                      weights["w_size"] * size + weights["w_token_cost"] * res["token_cost"]
         benefit_score = weights["w_speed"] * res["tokens_sec"]
         decision_score = benefit_score / cost_score
@@ -60,11 +114,10 @@ def get_best_model(prompt, weights, models=["mistral", "llama", "deepseek", "gem
     if not results:
         return "No models succeeded."
 
-    best = max(results, key=lambda x: x["decision_score"])
-    return best
+    return max(results, key=lambda x: x["decision_score"])
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Choose best
+    parser = argparse.ArgumentParser(description="Choose best model for a task")
     parser.add_argument('--prompt', required=True, help='The task or question to ask the models')
     parser.add_argument('--latency', type=int, default=3, help='Priority for latency (1-5)')
     parser.add_argument('--size', type=int, default=3, help='Priority for model size (1-5)')
@@ -72,7 +125,6 @@ if __name__ == "__main__":
     parser.add_argument('--speed', type=int, default=3, help='Priority for tokens/sec (1-5)')
    args = parser.parse_args()
 
-    # Scale weights from priority. Can be redefined
     weights = {
         "w_lat": 0.002 * args.latency,
         "w_size": 0.1 * args.size,
@@ -81,9 +133,13 @@ if __name__ == "__main__":
     }
 
     best = get_best_model(args.prompt, weights)
-    print(f"\nBest Model: {best['model']}")
-    print(f"Decision Score: {round(best['decision_score'], 4)}")
-    print(f"Latency (ms): {round(best['latency_ms'], 2)}")
-    print(f"Tokens/sec: {round(best['tokens_sec'], 2)}")
-    print(f"Token Cost ($): {round(best['token_cost'], 5)}")
-    print(f"\nOutput:\n{best['output']}")
+
+    if isinstance(best, str):
+        print(best)
+    else:
+        print(f"\nBest Model: {best['model']}")
+        print(f"Decision Score: {round(best['decision_score'], 4)}")
+        print(f"Latency (ms): {round(best['latency_ms'], 2)}")
+        print(f"Tokens/sec: {round(best['tokens_sec'], 2)}")
+        print(f"Token Cost ($): {round(best['token_cost'], 5)}")
+        print(f"\nOutput:\n{best['output']}")