Spaces:
Running
Running
[ | |
{ | |
"model_name": "claude-3-7-sonnet-20250219", | |
"results": { | |
"mmlu_results": [], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 11.0833 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 10.5 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 7.75 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 15.0 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "claude-3-5-sonnet-20241022", | |
"results": { | |
"mmlu_results": [ | |
{ | |
"category": "Average", | |
"score": 0.6958 | |
}, | |
{ | |
"category": "Biology", | |
"score": 0.8667 | |
}, | |
{ | |
"category": "Business", | |
"score": 0.803 | |
}, | |
{ | |
"category": "Chemistry", | |
"score": 0.7579 | |
}, | |
{ | |
"category": "Computer Science", | |
"score": 0.7059 | |
}, | |
{ | |
"category": "Economics", | |
"score": 0.7887 | |
}, | |
{ | |
"category": "Engineering", | |
"score": 0.5625 | |
}, | |
{ | |
"category": "Health", | |
"score": 0.6618 | |
}, | |
{ | |
"category": "History", | |
"score": 0.6552 | |
}, | |
{ | |
"category": "Law", | |
"score": 0.4944 | |
}, | |
{ | |
"category": "Math", | |
"score": 0.7788 | |
}, | |
{ | |
"category": "Other", | |
"score": 0.6494 | |
}, | |
{ | |
"category": "Philosophy", | |
"score": 0.5476 | |
}, | |
{ | |
"category": "Physics", | |
"score": 0.7523 | |
}, | |
{ | |
"category": "Psychology", | |
"score": 0.7164 | |
} | |
], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 10.6667 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 10.0 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 9.25 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 12.75 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "gemini-2.0-flash", | |
"results": { | |
"mmlu_results": [ | |
{ | |
"category": "Average", | |
"score": 0.7247 | |
}, | |
{ | |
"category": "Biology", | |
"score": 0.85 | |
}, | |
{ | |
"category": "Business", | |
"score": 0.8182 | |
}, | |
{ | |
"category": "Chemistry", | |
"score": 0.7895 | |
}, | |
{ | |
"category": "Computer Science", | |
"score": 0.7353 | |
}, | |
{ | |
"category": "Economics", | |
"score": 0.8169 | |
}, | |
{ | |
"category": "Engineering", | |
"score": 0.6 | |
}, | |
{ | |
"category": "Health", | |
"score": 0.75 | |
}, | |
{ | |
"category": "History", | |
"score": 0.5517 | |
}, | |
{ | |
"category": "Law", | |
"score": 0.5281 | |
}, | |
{ | |
"category": "Math", | |
"score": 0.8673 | |
}, | |
{ | |
"category": "Other", | |
"score": 0.6364 | |
}, | |
{ | |
"category": "Philosophy", | |
"score": 0.6429 | |
}, | |
{ | |
"category": "Physics", | |
"score": 0.7982 | |
}, | |
{ | |
"category": "Psychology", | |
"score": 0.7612 | |
} | |
], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 9.8333 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 5.5 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 6.75 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 17.25 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "gpt-4o", | |
"results": { | |
"mmlu_results": [ | |
{ | |
"category": "Average", | |
"score": 0.6758 | |
}, | |
{ | |
"category": "Biology", | |
"score": 0.8667 | |
}, | |
{ | |
"category": "Business", | |
"score": 0.7424 | |
}, | |
{ | |
"category": "Chemistry", | |
"score": 0.6842 | |
}, | |
{ | |
"category": "Computer Science", | |
"score": 0.6176 | |
}, | |
{ | |
"category": "Economics", | |
"score": 0.7887 | |
}, | |
{ | |
"category": "Engineering", | |
"score": 0.5625 | |
}, | |
{ | |
"category": "Health", | |
"score": 0.7794 | |
}, | |
{ | |
"category": "History", | |
"score": 0.5517 | |
}, | |
{ | |
"category": "Law", | |
"score": 0.5393 | |
}, | |
{ | |
"category": "Math", | |
"score": 0.7788 | |
}, | |
{ | |
"category": "Other", | |
"score": 0.5974 | |
}, | |
{ | |
"category": "Philosophy", | |
"score": 0.5476 | |
}, | |
{ | |
"category": "Physics", | |
"score": 0.6881 | |
}, | |
{ | |
"category": "Psychology", | |
"score": 0.7164 | |
} | |
], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 8.9167 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 6.75 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 6.75 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 13.25 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "qwen-max-2025-01-25", | |
"results": { | |
"mmlu_results": [], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 8.6667 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 7.25 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 4.5 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 14.25 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "gemini-1.5-flash", | |
"results": { | |
"mmlu_results": [ | |
{ | |
"category": "Average", | |
"score": 0.5592 | |
}, | |
{ | |
"category": "Biology", | |
"score": 0.75 | |
}, | |
{ | |
"category": "Business", | |
"score": 0.7121 | |
}, | |
{ | |
"category": "Chemistry", | |
"score": 0.6947 | |
}, | |
{ | |
"category": "Computer Science", | |
"score": 0.5 | |
}, | |
{ | |
"category": "Economics", | |
"score": 0.7183 | |
}, | |
{ | |
"category": "Engineering", | |
"score": 0.4 | |
}, | |
{ | |
"category": "Health", | |
"score": 0.5 | |
}, | |
{ | |
"category": "History", | |
"score": 0.4483 | |
}, | |
{ | |
"category": "Law", | |
"score": 0.2584 | |
}, | |
{ | |
"category": "Math", | |
"score": 0.8319 | |
}, | |
{ | |
"category": "Other", | |
"score": 0.3506 | |
}, | |
{ | |
"category": "Philosophy", | |
"score": 0.3571 | |
}, | |
{ | |
"category": "Physics", | |
"score": 0.6514 | |
}, | |
{ | |
"category": "Psychology", | |
"score": 0.6567 | |
} | |
], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 7.8333 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 4.75 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 3.75 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 15.0 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "DeepSeek-V3", | |
"results": { | |
"mmlu_results": [ | |
{ | |
"category": "Average", | |
"score": 0.6633 | |
}, | |
{ | |
"category": "Biology", | |
"score": 0.8167 | |
}, | |
{ | |
"category": "Business", | |
"score": 0.8182 | |
}, | |
{ | |
"category": "Chemistry", | |
"score": 0.6947 | |
}, | |
{ | |
"category": "Computer Science", | |
"score": 0.7353 | |
}, | |
{ | |
"category": "Economics", | |
"score": 0.7887 | |
}, | |
{ | |
"category": "Engineering", | |
"score": 0.5875 | |
}, | |
{ | |
"category": "Health", | |
"score": 0.6471 | |
}, | |
{ | |
"category": "History", | |
"score": 0.4828 | |
}, | |
{ | |
"category": "Law", | |
"score": 0.3596 | |
}, | |
{ | |
"category": "Math", | |
"score": 0.8584 | |
}, | |
{ | |
"category": "Other", | |
"score": 0.5455 | |
}, | |
{ | |
"category": "Philosophy", | |
"score": 0.5476 | |
}, | |
{ | |
"category": "Physics", | |
"score": 0.6881 | |
}, | |
{ | |
"category": "Psychology", | |
"score": 0.7164 | |
} | |
], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 7.5 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 5.25 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 5.0 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 12.25 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "Meta-Llama-3.3-70B-Instruct", | |
"results": { | |
"mmlu_results": [ | |
{ | |
"category": "Average", | |
"score": 0.5139 | |
}, | |
{ | |
"category": "Biology", | |
"score": 0.7333 | |
}, | |
{ | |
"category": "Business", | |
"score": 0.5303 | |
}, | |
{ | |
"category": "Chemistry", | |
"score": 0.5895 | |
}, | |
{ | |
"category": "Computer Science", | |
"score": 0.3824 | |
}, | |
{ | |
"category": "Economics", | |
"score": 0.6338 | |
}, | |
{ | |
"category": "Engineering", | |
"score": 0.4875 | |
}, | |
{ | |
"category": "Health", | |
"score": 0.5735 | |
}, | |
{ | |
"category": "History", | |
"score": 0.4138 | |
}, | |
{ | |
"category": "Law", | |
"score": 0.3146 | |
}, | |
{ | |
"category": "Math", | |
"score": 0.6018 | |
}, | |
{ | |
"category": "Other", | |
"score": 0.3377 | |
}, | |
{ | |
"category": "Philosophy", | |
"score": 0.4524 | |
}, | |
{ | |
"category": "Physics", | |
"score": 0.5321 | |
}, | |
{ | |
"category": "Psychology", | |
"score": 0.6119 | |
} | |
], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 7.0833 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 4.5 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 5.25 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 11.5 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "claude-3-5-haiku-20241022", | |
"results": { | |
"mmlu_results": [ | |
{ | |
"category": "Average", | |
"score": 0.5198 | |
}, | |
{ | |
"category": "Biology", | |
"score": 0.75 | |
}, | |
{ | |
"category": "Business", | |
"score": 0.5758 | |
}, | |
{ | |
"category": "Chemistry", | |
"score": 0.5579 | |
}, | |
{ | |
"category": "Computer Science", | |
"score": 0.4412 | |
}, | |
{ | |
"category": "Economics", | |
"score": 0.6901 | |
}, | |
{ | |
"category": "Engineering", | |
"score": 0.4125 | |
}, | |
{ | |
"category": "Health", | |
"score": 0.5882 | |
}, | |
{ | |
"category": "History", | |
"score": 0.5172 | |
}, | |
{ | |
"category": "Law", | |
"score": 0.2472 | |
}, | |
{ | |
"category": "Math", | |
"score": 0.6018 | |
}, | |
{ | |
"category": "Other", | |
"score": 0.3636 | |
}, | |
{ | |
"category": "Philosophy", | |
"score": 0.4048 | |
}, | |
{ | |
"category": "Physics", | |
"score": 0.5596 | |
}, | |
{ | |
"category": "Psychology", | |
"score": 0.5672 | |
} | |
], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 6.5 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 5.0 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 3.75 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 10.75 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "Gen2B/HyGPT-10b-it", | |
"results": { | |
"mmlu_results": [], | |
"unified_exam_results": [ | |
{ | |
"category": "Armenian language and literature", | |
"score": 4.5 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 4.25 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 3.0 | |
}, | |
{ | |
"category": "Average", | |
"score": 3.9167 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "google/gemma-2-9b-it", | |
"results": { | |
"mmlu_results": [], | |
"unified_exam_results": [ | |
{ | |
"category": "Armenian language and literature", | |
"score": 3.25 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 1.75 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 2.0 | |
}, | |
{ | |
"category": "Average", | |
"score": 2.3333 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "google/gemma-3n-E2B-it", | |
"results": { | |
"mmlu_results": [], | |
"unified_exam_results": [ | |
{ | |
"category": "Armenian language and literature", | |
"score": 2.25 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 1.5 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 4.25 | |
}, | |
{ | |
"category": "Average", | |
"score": 2.6667 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "google/gemma-3n-E4B-it", | |
"results": { | |
"mmlu_results": [], | |
"unified_exam_results": [ | |
{ | |
"category": "Armenian language and literature", | |
"score": 2.75 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 2.0 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 5.5 | |
}, | |
{ | |
"category": "Average", | |
"score": 3.4167 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "mistralai/Mistral-Small-3.1-24B-Instruct-2503", | |
"results": { | |
"mmlu_results": [], | |
"unified_exam_results": [ | |
{ | |
"category": "Armenian language and literature", | |
"score": 6.25 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 5.0 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 12.5 | |
}, | |
{ | |
"category": "Average", | |
"score": 7.9167 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "Qwen/Qwen3-32B", | |
"results": { | |
"mmlu_results": [], | |
"unified_exam_results": [ | |
{ | |
"category": "Armenian language and literature", | |
"score": 4.75 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 3.5 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 14.0 | |
}, | |
{ | |
"category": "Average", | |
"score": 7.4167 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "Qwen/QwQ-32B", | |
"results": { | |
"mmlu_results": [], | |
"unified_exam_results": [ | |
{ | |
"category": "Armenian language and literature", | |
"score": 2.5 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 2.5 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 10.5 | |
}, | |
{ | |
"category": "Average", | |
"score": 5.1667 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "gemini-2.5-flash", | |
"results": { | |
"mmlu_results": [], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 10.75 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 9.5 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 8.25 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 14.5 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "gemini-2.5-pro", | |
"results": { | |
"mmlu_results": [], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 12.75 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 11.25 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 11.75 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 15.25 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "gpt-4.1-2025-04-14", | |
"results": { | |
"mmlu_results": [], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 9.6667 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 8.0 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 8.25 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 12.75 | |
} | |
] | |
} | |
}, | |
{ | |
"model_name": "claude-sonnet-4-20250514", | |
"results": { | |
"mmlu_results": [], | |
"unified_exam_results": [ | |
{ | |
"category": "Average", | |
"score": 10.0833 | |
}, | |
{ | |
"category": "Armenian history", | |
"score": 7.0 | |
}, | |
{ | |
"category": "Armenian language and literature", | |
"score": 7.5 | |
}, | |
{ | |
"category": "Mathematics", | |
"score": 15.75 | |
} | |
] | |
} | |
} | |
] |