{ "results": { "average_score": 9.019148936170213, "speed": 15.2136124607292, "contamination_score": 0.00, "execution_time": 1054.581878, "errors": [], "scores_by_category": [ { "category": "Hallucination", "average_score": 10.0, "count": 3 }, { "category": "Long Context", "average_score": 10.0, "count": 4 }, { "category": "Sentiment Analysis", "average_score": 10.0, "count": 9 }, { "category": "Reading Comprehension", "average_score": 10.0, "count": 17 }, { "category": "Paraphrasing", "average_score": 9.833333333333334, "count": 6 }, { "category": "Trust & Safety", "average_score": 9.666666666666666, "count": 30 }, { "category": "Coding", "average_score": 9.666666666666666, "count": 3 }, { "category": "Reasoning & Math", "average_score": 9.604651162790697, "count": 43 }, { "category": "MMLU", "average_score": 9.50413223140496, "count": 121 }, { "category": "General Knowledge", "average_score": 9.444444444444445, "count": 63 }, { "category": "Entity Extraction", "average_score": 9.4, "count": 5 }, { "category": "Function Calling", "average_score": 9.333333333333334, "count": 3 }, { "category": "Instruction Following", "average_score": 9.285714285714286, "count": 7 }, { "category": "Dialect Detection", "average_score": 9.090909090909092, "count": 11 }, { "category": "Arabic Language & Grammar", "average_score": 8.705882352941176, "count": 17 }, { "category": "Structuring", "average_score": 8.666666666666666, "count": 3 }, { "category": "Writing (incl Dialects)", "average_score": 8.545454545454545, "count": 22 }, { "category": "Transliteration", "average_score": 8.5, "count": 6 }, { "category": "Diacritization", "average_score": 8.5, "count": 12 }, { "category": "Translation (incl Dialects)", "average_score": 8.38888888888889, "count": 36 }, { "category": "Summarization", "average_score": 8.375, "count": 8 }, { "category": "RAG QA", "average_score": 6.073170731707317, "count": 41 } ], "scores_by_format": [ { "format": "Fill-in-the-blank", "average_score": 10.0, "count": 8 }, { "format": "Short Answer", "average_score": 10.0, "count": 5 }, { "format": "MCQ", "average_score": 9.532751091703057, "count": 229 }, { "format": "Generation", "average_score": 8.447368421052632, "count": 228 } ] }, "config": { "model": "openai/gpt-4o", "model_sha": "na", "submitted_time": "2025-05-11 15:45:37", "likes": -1, "params": 999, "license": "closed", "model_source": "API", "model_category": "Large" } }