# hashiruAI / cost_benefit.py
# (Header reconstructed from Hugging Face file-viewer chrome: author saisha09,
#  commit a997e09 "api link placeholder added to cost_benefit.py", 4.36 kB.)
import argparse
import subprocess
import time
import requests
# Model info with both ollama and API usage
# Registry of candidate models scored by get_best_model(). Per-entry fields:
#   size       - model size (presumably billions of parameters — confirm);
#                used as a cost penalty in scoring
#   token_cost - dollar cost per generated token, multiplied by the word count
#                of the output
#   use_api    - True to POST to the "api" URL, False to run locally via Ollama
#   api        - HTTP endpoint URL; ".." is a placeholder and is only read
#                when use_api is True
MODEL_INFO = {
    "mistral": {
        "size": 7,
        "token_cost": 0.002,
        "use_api": False
    },
    "llama": {
        "size": 13,
        "token_cost": 0.0025,
        "use_api": False
    },
    "deepseek": {
        "size": 1.3,
        "token_cost": 0.0015,
        "use_api": False,
        "api": ".." # Example API
    },
    "gemini": {
        "size": 15,
        "token_cost": 0.003,
        "use_api": False,
        "api": ".."
    }
}
def run_model_ollama(model, prompt):
    """Run `prompt` through a local Ollama model and collect timing/cost metrics.

    Args:
        model: Key into MODEL_INFO naming the Ollama model to invoke.
        prompt: Text piped to the model on stdin.

    Returns:
        Dict with keys "model", "latency_ms", "tokens_sec", "token_cost",
        "output", or None if the subprocess cannot be spawned, times out,
        or exits with a non-zero status.
    """
    try:
        start = time.time()
        result = subprocess.run(
            ["ollama", "run", model],
            input=prompt.encode(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=60
        )
        end = time.time()
    except (subprocess.TimeoutExpired, OSError):
        # OSError covers "ollama" not being installed (FileNotFoundError) and
        # other spawn failures. Anything else is a real bug and should raise,
        # not be silently converted into "model failed".
        return None
    if result.returncode != 0:
        # A non-zero exit used to be scored as a success with empty output
        # (0 tokens, 0 cost), skewing model selection; treat it as a failure.
        return None
    output = result.stdout.decode().strip()
    duration = end - start
    # Crude token estimate: whitespace-separated words of the output.
    token_count = len(output.split())
    tokens_per_sec = token_count / duration if duration > 0 else 0
    latency_ms = duration * 1000
    token_cost = MODEL_INFO[model]["token_cost"] * token_count
    return {
        "model": model,
        "latency_ms": latency_ms,
        "tokens_sec": tokens_per_sec,
        "token_cost": token_cost,
        "output": output
    }
def run_model_api(model, prompt):
    """POST `prompt` to the model's configured HTTP endpoint and collect metrics.

    Args:
        model: Key into MODEL_INFO; its "api" entry is the endpoint URL.
        prompt: Text sent as the JSON body {"prompt": ...}.

    Returns:
        Dict with keys "model", "latency_ms", "tokens_sec", "token_cost",
        "output", or None on any request/response failure.
    """
    try:
        start = time.time()
        response = requests.post(
            MODEL_INFO[model]["api"],
            json={"prompt": prompt},
            timeout=60
        )
        end = time.time()
        response.raise_for_status()
        output = response.json().get("response", "")  # Adjust key as needed
    except (requests.RequestException, ValueError, KeyError):
        # RequestException: connection/timeout/HTTP-status errors.
        # ValueError: response body is not valid JSON.
        # KeyError: model has no "api" URL configured (e.g. mistral/llama).
        # The old bare `except Exception` also hid genuine programming bugs.
        return None
    duration = end - start
    # Crude token estimate: whitespace-separated words of the output.
    token_count = len(output.split())
    tokens_per_sec = token_count / duration if duration > 0 else 0
    latency_ms = duration * 1000
    token_cost = MODEL_INFO[model]["token_cost"] * token_count
    return {
        "model": model,
        "latency_ms": latency_ms,
        "tokens_sec": tokens_per_sec,
        "token_cost": token_cost,
        "output": output
    }
def run_model(model, prompt):
    """Dispatch `prompt` to the HTTP-API runner or the local Ollama runner.

    The choice is driven by the model's "use_api" flag in MODEL_INFO
    (missing flag means local Ollama). Returns whatever the chosen
    runner returns: a metrics dict, or None on failure.
    """
    use_api = MODEL_INFO[model].get("use_api", False)
    runner = run_model_api if use_api else run_model_ollama
    return runner(model, prompt)
def get_best_model(prompt, weights, models=None):
    """Score each candidate model on `prompt` and return the best one.

    Each model's decision score is benefit / cost: benefit rewards
    throughput (tokens/sec, scaled by w_speed), while cost penalizes
    latency (w_lat), model size (w_size) and dollar token cost
    (w_token_cost), plus a constant 1 so the denominator never hits zero.

    Args:
        prompt: Text to send to every candidate model.
        weights: Dict with keys "w_lat", "w_size", "w_token_cost", "w_speed".
        models: Optional iterable of MODEL_INFO keys; defaults to all models.

    Returns:
        The winning result dict (with "decision_score" added), or the
        string "No models succeeded." when every model failed.
    """
    candidates = MODEL_INFO.keys() if models is None else models
    scored = []
    for name in candidates:
        metrics = run_model(name, prompt)
        if not metrics:
            continue  # model failed; leave it out of the ranking
        penalty = (
            1
            + weights["w_lat"] * metrics["latency_ms"]
            + weights["w_size"] * MODEL_INFO[name]["size"]
            + weights["w_token_cost"] * metrics["token_cost"]
        )
        benefit = weights["w_speed"] * metrics["tokens_sec"]
        metrics["decision_score"] = benefit / penalty
        scored.append(metrics)
    if not scored:
        return "No models succeeded."
    return max(scored, key=lambda m: m["decision_score"])
def _main():
    """CLI entry point: parse priority flags, rank the models, print the winner."""
    parser = argparse.ArgumentParser(description="Choose best model for a task")
    parser.add_argument('--prompt', required=True, help='The task or question to ask the models')
    parser.add_argument('--latency', type=int, default=3, help='Priority for latency (1–5)')
    parser.add_argument('--size', type=int, default=3, help='Priority for model size (1–5)')
    parser.add_argument('--cost', type=int, default=3, help='Priority for token cost (1–5)')
    parser.add_argument('--speed', type=int, default=3, help='Priority for tokens/sec (1–5)')
    args = parser.parse_args()

    # Map the 1-5 user priorities onto the raw weight scales used by
    # get_best_model's cost/benefit formula.
    weights = {
        "w_lat": 0.002 * args.latency,
        "w_size": 0.1 * args.size,
        "w_token_cost": 100 * args.cost,
        "w_speed": 0.01 * args.speed
    }

    best = get_best_model(args.prompt, weights)
    if isinstance(best, str):
        # get_best_model signals total failure with a message string.
        print(best)
        return
    print(f"\nBest Model: {best['model']}")
    print(f"Decision Score: {round(best['decision_score'], 4)}")
    print(f"Latency (ms): {round(best['latency_ms'], 2)}")
    print(f"Tokens/sec: {round(best['tokens_sec'], 2)}")
    print(f"Token Cost ($): {round(best['token_cost'], 5)}")
    print(f"\nOutput:\n{best['output']}")

if __name__ == "__main__":
    _main()