{ "results": { "average_score": 8.77659574468085, "speed": 12.59830004953974, "contamination_score": 0, "execution_time": 1313.828051, "errors": [], "scores_by_category": [ { "category": "Paraphrasing", "average_score": 10.0, "count": 6 }, { "category": "Coding", "average_score": 10.0, "count": 3 }, { "category": "Sentiment Analysis", "average_score": 10.0, "count": 9 }, { "category": "Hallucination", "average_score": 10.0, "count": 3 }, { "category": "Reading Comprehension", "average_score": 10.0, "count": 17 }, { "category": "Entity Extraction", "average_score": 9.6, "count": 5 }, { "category": "MMLU", "average_score": 9.338842975206612, "count": 121 }, { "category": "General Knowledge", "average_score": 9.19047619047619, "count": 63 }, { "category": "Trust & Safety", "average_score": 8.766666666666667, "count": 30 }, { "category": "Diacritization", "average_score": 8.75, "count": 12 }, { "category": "Long Context", "average_score": 8.75, "count": 4 }, { "category": "Structuring", "average_score": 8.666666666666666, "count": 3 }, { "category": "Function Calling", "average_score": 8.666666666666666, "count": 3 }, { "category": "Reasoning & Math", "average_score": 8.581395348837209, "count": 43 }, { "category": "Transliteration", "average_score": 8.166666666666666, "count": 6 }, { "category": "Instruction Following", "average_score": 8.142857142857142, "count": 7 }, { "category": "Summarization", "average_score": 8.125, "count": 8 }, { "category": "Writing (incl Dialects)", "average_score": 8.0, "count": 22 }, { "category": "RAG QA", "average_score": 7.975609756097561, "count": 41 }, { "category": "Arabic Language & Grammar", "average_score": 7.764705882352941, "count": 17 }, { "category": "Dialect Detection", "average_score": 7.636363636363637, "count": 11 }, { "category": "Translation (incl Dialects)", "average_score": 7.611111111111111, "count": 36 } ], "scores_by_format": [ { "format": "Short Answer", "average_score": 10.0, "count": 5 }, { "format": "MCQ", "average_score": 9.222707423580786, "count": 229 }, { "format": "Generation", "average_score": 8.346491228070175, "count": 228 }, { "format": "Fill-in-the-blank", "average_score": 7.5, "count": 8 } ] }, "config": { "model": "openai/gpt-4.1-mini", "model_sha": "na", "submitted_time": "2025-05-11 08:26:34", "likes": -1, "params": 999, "license": "closed", "model_source": "API", "model_category": "Large" } }