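"""Evaluation runner: evaluates every configured model on every task for the top
languages and aggregates the scores.

Configuration via environment variables:
    N_SENTENCES    number of sentences per task (default: 20)
    MAX_LANGUAGES  number of top languages to evaluate (default: 150)
    SINGLE_MODEL   run only the model with this id (default: all models)
    TEST           if set to 1/true/yes, skip loading and saving results
"""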
import asyncio
import pandas as pd
import time
from datetime import datetime, timedelta
from models import models
from tasks import tasks
from languages import languages
import os
async def evaluate():
    # Configuration - easily adjustable defaults
    n_sentences = int(os.environ.get("N_SENTENCES", 20))  # Default: 20 sentences per task
    max_languages = int(os.environ.get("MAX_LANGUAGES", 150))  # Default: 150 top languages
    single_model = os.environ.get("SINGLE_MODEL")  # Optional: run only one specific model
    test_mode = os.environ.get("TEST", "").lower() in ("1", "true", "yes")  # Optional: skip results loading/saving
    models_df = pd.DataFrame(models)
    languages_df = pd.DataFrame(languages)
    top_languages = languages_df.head(max_languages)
    # Filter to single model if specified
    if single_model:
        models_df = models_df[models_df["id"] == single_model]
        if len(models_df) == 0:
            print(f"Error: Model '{single_model}' not found. Available models:")
            for model_id in pd.DataFrame(models)["id"]:
                print(f"  {model_id}")
            return pd.DataFrame()
    print(f"Starting evaluation: {len(models_df)} models, {len(top_languages)} languages, {n_sentences} sentences per task")
    if test_mode:
        print("TEST MODE: Skipping results loading/saving")
    start_time = time.time()
    # Load existing results to avoid re-evaluation (skip in test mode)
    if test_mode:
        old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
    else:
        try:
            old_results = pd.read_json("results.json")
            if old_results.empty:
                old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
        except FileNotFoundError:
            old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
    # Get all combinations that need evaluation
    combis = [
        (model, lang.bcp_47, task_name)
        for model in models_df["id"]
        for lang in top_languages.itertuples()
        for task_name, task in tasks.items()
        if task_name in models_df[models_df["id"] == model]["tasks"].iloc[0]
    ]
    # Filter out already evaluated combinations
    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
    combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
    combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
    # Create all evaluation tasks
    all_tasks = []
    for i in range(n_sentences):
        for model, bcp_47, task_name in combis.itertuples(index=False):
            all_tasks.append((tasks[task_name], model, bcp_47, i))
    print(f"Running {len(all_tasks)} evaluation tasks...")
    # Run all tasks with simple asyncio.gather, but stop on first error
    try:
        results = await asyncio.gather(
            *[task_func(model, bcp_47, sentence_nr) for task_func, model, bcp_47, sentence_nr in all_tasks],
            return_exceptions=False,  # This will raise on the first exception
        )
        # Process results - no exceptions should reach here
        valid_results = []
        for r in results:
            if isinstance(r, list):
                valid_results.extend(r)
            else:
                valid_results.append(r)
        print(f"Completed: {len(valid_results)} valid results")
    except Exception as e:
        print("EVALUATION STOPPED - API error occurred:")
        print(f"Error type: {type(e).__name__}")
        print(f"Error message: {e}")
        return pd.DataFrame()
    # Save results (skip in test mode)
    if valid_results:
        results_df = pd.DataFrame(valid_results)
        # Aggregate results
        results_df = (
            results_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
            .agg({"score": "mean"})
            .reset_index()
        )
        if not test_mode:
            args = dict(orient="records", indent=2, force_ascii=False)
            # Merge with existing results
            if not old_results.empty:
                results_df = pd.concat([old_results, results_df])
                results_df = results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
            results_df = results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
            results_df.to_json("results.json", **args)
            # Save model and language info
            models_df.to_json("models.json", **args)
            languages_df.to_json("languages.json", **args)
        else:
            print("TEST MODE: Skipping results saving")
        elapsed = time.time() - start_time
        print(f"Evaluation completed in {timedelta(seconds=int(elapsed))}")
        return results_df
    return pd.DataFrame()
if __name__ == "__main__":
    results = asyncio.run(evaluate())
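# Example invocations (assuming this file is saved as main.py; adjust the
# filename and model id to your setup):
#   N_SENTENCES=5 MAX_LANGUAGES=10 TEST=1 python main.py
#   SINGLE_MODEL=<model-id> python main.py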