import argparse
import subprocess
import time

import requests
# Model info with both ollama and API usage
MODEL_INFO = {
    "mistral": {
        "size": 7,
        "token_cost": 0.002,
        "use_api": False
    },
    "llama": {
        "size": 13,
        "token_cost": 0.0025,
        "use_api": False
    },
    "deepseek": {
        "size": 1.3,
        "token_cost": 0.0015,
        "use_api": False,
        "api": ".."  # Example API endpoint
    },
    "gemini": {
        "size": 15,
        "token_cost": 0.003,
        "use_api": False,
        "api": ".."  # Example API endpoint
    }
}
def run_model_ollama(model, prompt):
    # Run the model locally through the Ollama CLI and time the call.
    try:
        start = time.time()
        result = subprocess.run(
            ["ollama", "run", model],
            input=prompt.encode(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=60
        )
        end = time.time()
    except Exception:
        return None
    if result.returncode != 0:
        return None
    output = result.stdout.decode().strip()
    duration = end - start
    token_count = len(output.split())  # rough token estimate: whitespace-separated words
    tokens_per_sec = token_count / duration if duration > 0 else 0
    latency_ms = duration * 1000
    token_cost = MODEL_INFO[model]["token_cost"] * token_count
    return {
        "model": model,
        "latency_ms": latency_ms,
        "tokens_sec": tokens_per_sec,
        "token_cost": token_cost,
        "output": output
    }
def run_model_api(model, prompt):
    # Call the model through its HTTP endpoint and time the round trip.
    try:
        start = time.time()
        response = requests.post(
            MODEL_INFO[model]["api"],
            json={"prompt": prompt},
            timeout=60
        )
        end = time.time()
        response.raise_for_status()
        output = response.json().get("response", "")  # Adjust key to match the API's schema
    except Exception:
        return None
    duration = end - start
    token_count = len(output.split())  # rough token estimate: whitespace-separated words
    tokens_per_sec = token_count / duration if duration > 0 else 0
    latency_ms = duration * 1000
    token_cost = MODEL_INFO[model]["token_cost"] * token_count
    return {
        "model": model,
        "latency_ms": latency_ms,
        "tokens_sec": tokens_per_sec,
        "token_cost": token_cost,
        "output": output
    }
def run_model(model, prompt):
    # Dispatch to the API runner or the local Ollama runner based on the model's config.
    if MODEL_INFO[model].get("use_api", False):
        return run_model_api(model, prompt)
    else:
        return run_model_ollama(model, prompt)
def get_best_model(prompt, weights, models=None):
    # Run the prompt on each candidate model and score it as benefit / cost.
    if models is None:
        models = list(MODEL_INFO.keys())
    results = []
    for model in models:
        res = run_model(model, prompt)
        if not res:
            continue
        size = MODEL_INFO[model]["size"]
        # Cost grows with latency, model size, and token cost; the constant 1 avoids division by zero.
        cost_score = 1 + weights["w_lat"] * res["latency_ms"] + \
            weights["w_size"] * size + weights["w_token_cost"] * res["token_cost"]
        benefit_score = weights["w_speed"] * res["tokens_sec"]
        decision_score = benefit_score / cost_score
        res["decision_score"] = decision_score
        results.append(res)
    if not results:
        return "No models succeeded."
    return max(results, key=lambda x: x["decision_score"])
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Choose the best model for a task")
    parser.add_argument('--prompt', required=True, help='The task or question to ask the models')
    parser.add_argument('--latency', type=int, default=3, help='Priority for latency (1-5)')
    parser.add_argument('--size', type=int, default=3, help='Priority for model size (1-5)')
    parser.add_argument('--cost', type=int, default=3, help='Priority for token cost (1-5)')
    parser.add_argument('--speed', type=int, default=3, help='Priority for tokens/sec (1-5)')
    args = parser.parse_args()
    # Scale the 1-5 priorities into weights of comparable magnitude.
    weights = {
        "w_lat": 0.002 * args.latency,
        "w_size": 0.1 * args.size,
        "w_token_cost": 100 * args.cost,
        "w_speed": 0.01 * args.speed
    }
    best = get_best_model(args.prompt, weights)
    if isinstance(best, str):
        print(best)
    else:
        print(f"\nBest Model: {best['model']}")
        print(f"Decision Score: {round(best['decision_score'], 4)}")
        print(f"Latency (ms): {round(best['latency_ms'], 2)}")
        print(f"Tokens/sec: {round(best['tokens_sec'], 2)}")
        print(f"Token Cost ($): {round(best['token_cost'], 5)}")
        print(f"\nOutput:\n{best['output']}")