import argparse
import subprocess
import time
import requests

# Model metadata: approximate size (billions of parameters), per-token cost in
# USD, and whether to call a remote API endpoint instead of the local ollama CLI.
MODEL_INFO = {
    "mistral": {
        "size": 7,
        "token_cost": 0.002,
        "use_api": False
    },
    "llama": {
        "size": 13,
        "token_cost": 0.0025,
        "use_api": False
    },
    "deepseek": {
        "size": 1.3,
        "token_cost": 0.0015,
        "use_api": False,
        "api": ".."  # Example API
    },
    "gemini": {
        "size": 15,
        "token_cost": 0.003,
        "use_api": False,
        "api": ".."
    }
}

def run_model_ollama(model, prompt):
    """Run the prompt through a local model via the ollama CLI and return
    timing/cost metrics plus the output, or None on failure."""
    try:
        start = time.time()
        result = subprocess.run(
            ["ollama", "run", model],
            input=prompt.encode(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=60
        )
        end = time.time()
    except Exception:
        return None

    # A non-zero exit code means ollama failed; skip this model rather than
    # scoring an empty response.
    if result.returncode != 0:
        return None

    output = result.stdout.decode().strip()
    duration = end - start
    # Rough token count: whitespace-separated words stand in for real tokens
    token_count = len(output.split())
    tokens_per_sec = token_count / duration if duration > 0 else 0
    latency_ms = duration * 1000
    token_cost = MODEL_INFO[model]["token_cost"] * token_count

    return {
        "model": model,
        "latency_ms": latency_ms,
        "tokens_sec": tokens_per_sec,
        "token_cost": token_cost,
        "output": output
    }

def run_model_api(model, prompt):
    """Send the prompt to the model's HTTP API endpoint and return the same
    metrics dict as run_model_ollama, or None on failure."""
    try:
        start = time.time()
        response = requests.post(
            MODEL_INFO[model]["api"],
            json={"prompt": prompt},
            timeout=60
        )
        end = time.time()
        response.raise_for_status()
        output = response.json().get("response", "")  # Adjust key as needed
    except Exception:
        return None

    duration = end - start
    token_count = len(output.split())
    tokens_per_sec = token_count / duration if duration > 0 else 0
    latency_ms = duration * 1000
    token_cost = MODEL_INFO[model]["token_cost"] * token_count

    return {
        "model": model,
        "latency_ms": latency_ms,
        "tokens_sec": tokens_per_sec,
        "token_cost": token_cost,
        "output": output
    }

def run_model(model, prompt):
    """Dispatch to the API runner or the local ollama runner based on the model's config."""
    if MODEL_INFO[model].get("use_api", False):
        return run_model_api(model, prompt)
    else:
        return run_model_ollama(model, prompt)

def get_best_model(prompt, weights, models=None):
    """Run the prompt on every candidate model and return the result dict with
    the highest decision score, or an error string if no model succeeds."""
    if models is None:
        models = list(MODEL_INFO.keys())

    results = []
    for model in models:
        res = run_model(model, prompt)
        if not res:
            continue

        # Score = weighted benefit (throughput) divided by weighted cost
        # (latency, model size, dollar cost of the response); higher is better.
        size = MODEL_INFO[model]["size"]
        cost_score = 1 + weights["w_lat"] * res["latency_ms"] + \
                     weights["w_size"] * size + weights["w_token_cost"] * res["token_cost"]
        benefit_score = weights["w_speed"] * res["tokens_sec"]
        decision_score = benefit_score / cost_score

        res["decision_score"] = decision_score
        results.append(res)

    if not results:
        return "No models succeeded."

    return max(results, key=lambda x: x["decision_score"])
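
# Illustrative scoring example with the default priorities (all 3); the run
# below is hypothetical, purely to show how the score is computed:
#   weights: w_lat = 0.006, w_size = 0.3, w_token_cost = 300, w_speed = 0.03
#   a 7B model ($0.002/token) that returns 100 words in 2 s
#   (latency 2000 ms, 50 tokens/sec, $0.20 total) gets:
#     cost_score     = 1 + 0.006*2000 + 0.3*7 + 300*0.20 = 75.1
#     benefit_score  = 0.03 * 50 = 1.5
#     decision_score = 1.5 / 75.1 ~= 0.02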

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Choose the best model for a task")
    parser.add_argument('--prompt', required=True, help='The task or question to ask the models')
    parser.add_argument('--latency', type=int, default=3, help='Priority for latency (1–5)')
    parser.add_argument('--size', type=int, default=3, help='Priority for model size (1–5)')
    parser.add_argument('--cost', type=int, default=3, help='Priority for token cost (1–5)')
    parser.add_argument('--speed', type=int, default=3, help='Priority for tokens/sec (1–5)')
    args = parser.parse_args()

    # Convert the 1-5 priority flags into the weights used by the decision score
    weights = {
        "w_lat": 0.002 * args.latency,
        "w_size": 0.1 * args.size,
        "w_token_cost": 100 * args.cost,
        "w_speed": 0.01 * args.speed
    }

    best = get_best_model(args.prompt, weights)

    if isinstance(best, str):
        print(best)
    else:
        print(f"\nBest Model: {best['model']}")
        print(f"Decision Score: {round(best['decision_score'], 4)}")
        print(f"Latency (ms): {round(best['latency_ms'], 2)}")
        print(f"Tokens/sec: {round(best['tokens_sec'], 2)}")
        print(f"Token Cost ($): {round(best['token_cost'], 5)}")
        print(f"\nOutput:\n{best['output']}")