{ "results": { "average_score": 7.642553191489362, "speed": 17.560391916863864, "contamination_score": 0, "execution_time": 837.737567, "errors": [], "scores_by_category": [ { "category": "Paraphrasing", "average_score": 10.0, "count": 6 }, { "category": "Entity Extraction", "average_score": 9.8, "count": 5 }, { "category": "Coding", "average_score": 9.666666666666666, "count": 3 }, { "category": "Instruction Following", "average_score": 9.571428571428571, "count": 7 }, { "category": "Sentiment Analysis", "average_score": 8.88888888888889, "count": 9 }, { "category": "Reading Comprehension", "average_score": 8.823529411764707, "count": 17 }, { "category": "General Knowledge", "average_score": 8.634920634920634, "count": 63 }, { "category": "Dialect Detection", "average_score": 8.272727272727273, "count": 11 }, { "category": "Summarization", "average_score": 8.25, "count": 8 }, { "category": "MMLU", "average_score": 8.099173553719009, "count": 121 }, { "category": "Transliteration", "average_score": 7.666666666666667, "count": 6 }, { "category": "Writing (incl Dialects)", "average_score": 7.545454545454546, "count": 22 }, { "category": "Translation (incl Dialects)", "average_score": 7.388888888888889, "count": 36 }, { "category": "Reasoning & Math", "average_score": 7.209302325581396, "count": 43 }, { "category": "Function Calling", "average_score": 7.0, "count": 3 }, { "category": "Trust & Safety", "average_score": 7.0, "count": 30 }, { "category": "Diacritization", "average_score": 6.833333333333333, "count": 12 }, { "category": "Hallucination", "average_score": 6.666666666666667, "count": 3 }, { "category": "Structuring", "average_score": 6.0, "count": 3 }, { "category": "Long Context", "average_score": 5.5, "count": 4 }, { "category": "Arabic Language & Grammar", "average_score": 5.470588235294118, "count": 17 }, { "category": "RAG QA", "average_score": 5.414634146341464, "count": 41 } ], "scores_by_format": [ { "format": "Short Answer", "average_score": 8.0, "count": 5 }, { "format": "MCQ", "average_score": 7.842794759825328, "count": 229 }, { "format": "Generation", "average_score": 7.526315789473684, "count": 228 }, { "format": "Fill-in-the-blank", "average_score": 5.0, "count": 8 } ] }, "config": { "model": "openai/gpt-4.1-nano", "model_sha": "na", "submitted_time": "2025-05-11 08:07:11", "likes": -1, "params": 999, "license": "closed", "model_source": "API", "model_category": "Large" } }