import asyncio
import os
import time
from datetime import datetime, timedelta

import pandas as pd

from languages import languages
from models import models
from tasks import tasks


async def evaluate():
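    # Read run configuration from environment variables.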
    n_sentences = int(os.environ.get("N_SENTENCES", 20))
    max_languages = int(os.environ.get("MAX_LANGUAGES", 150))
    single_model = os.environ.get("SINGLE_MODEL")
    test_mode = os.environ.get("TEST", "").lower() in ("1", "true", "yes")

    models_df = pd.DataFrame(models)
    languages_df = pd.DataFrame(languages)
    top_languages = languages_df.head(max_languages)
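
    # Optionally restrict the run to a single model given via SINGLE_MODEL.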
    if single_model:
        models_df = models_df[models_df["id"] == single_model]
        if len(models_df) == 0:
            print(f"Error: Model '{single_model}' not found. Available models:")
            for model_id in pd.DataFrame(models)["id"]:
                print(f" {model_id}")
            return pd.DataFrame()

    print(
        f"Starting evaluation: {len(models_df)} models, {len(top_languages)} languages, "
        f"{n_sentences} sentences per task"
    )
    if test_mode:
        print("TEST MODE: Skipping results loading/saving")
    start_time = time.time()
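
    # Load previous results (if any) so already-scored combinations can be skipped.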
    result_columns = ["model", "bcp_47", "task", "metric", "origin", "score"]
    if test_mode:
        old_results = pd.DataFrame(columns=result_columns)
    else:
        try:
            old_results = pd.read_json("results.json")
            if old_results.empty:
                old_results = pd.DataFrame(columns=result_columns)
        except FileNotFoundError:
            old_results = pd.DataFrame(columns=result_columns)
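
    # Enumerate every (model, language, task) combination that each model supports.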
    combis = [
        (model, lang.bcp_47, task_name)
        for model in models_df["id"]
        for lang in top_languages.itertuples()
        for task_name, task in tasks.items()
        if task_name in models_df[models_df["id"] == model]["tasks"].iloc[0]
    ]
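
    # Keep only combinations that have no stored score yet.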
    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
    combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
    combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
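
    # Expand each remaining combination into one task per evaluation sentence.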
    all_tasks = []
    for i in range(n_sentences):
        for model, bcp_47, task_name in combis.itertuples(index=False):
            all_tasks.append((tasks[task_name], model, bcp_47, i))

    print(f"Running {len(all_tasks)} evaluation tasks...")
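
    # Run all evaluation coroutines concurrently; any exception aborts the whole run.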
    try:
        results = await asyncio.gather(
            *[task_func(model, bcp_47, sentence_nr) for task_func, model, bcp_47, sentence_nr in all_tasks],
            return_exceptions=False,
        )
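
        # Tasks may return a single record or a list of records; flatten into one list.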
        valid_results = []
        for r in results:
            if isinstance(r, list):
                valid_results.extend(r)
            else:
                valid_results.append(r)

        print(f"Completed: {len(valid_results)} valid results")
    except Exception as e:
        print("EVALUATION STOPPED - API Error occurred:")
        print(f"Error type: {type(e).__name__}")
        print(f"Error message: {e}")
        return pd.DataFrame()
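
    # If any results came back, aggregate them and (unless in test mode) persist them to disk.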
    if valid_results:
        results_df = pd.DataFrame(valid_results)
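
        # Average scores across sentences for each (model, language, task, metric, origin).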
        results_df = (
            results_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
            .agg({"score": "mean"})
            .reset_index()
        )

        if not test_mode:
            args = dict(orient="records", indent=2, force_ascii=False)
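
            # Merge with previous results; drop_duplicates keeps the earlier (existing) entry.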
            if not old_results.empty:
                results_df = pd.concat([old_results, results_df])
                results_df = results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])

            results_df = results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
            results_df.to_json("results.json", **args)

            models_df.to_json("models.json", **args)
            languages_df.to_json("languages.json", **args)
        else:
            print("TEST MODE: Skipping results saving")

        elapsed = time.time() - start_time
        print(f"Evaluation completed in {timedelta(seconds=int(elapsed))}")

        return results_df

    return pd.DataFrame()


if __name__ == "__main__":
    results = asyncio.run(evaluate())