from dataclasses import dataclass from enum import Enum @dataclass class EvalDimension: metric: str col_name: str # Select your tasks here # --------------------------------------------------- class EvalDimensions(Enum): d0 = EvalDimension("speed", "Speed (words/sec)") d1 = EvalDimension("contamination_score", "Contamination Score") d2 = EvalDimension("paraphrasing", "Paraphrasing") d3 = EvalDimension("sentiment analysis", "Sentiment Analysis") d4 = EvalDimension("coding", "Coding") d5 = EvalDimension("function calling", "Function Calling") d6 = EvalDimension("rag qa", "RAG QA") d7 = EvalDimension("reading comprehension", "Reading Comprehension") d8 = EvalDimension("entity extraction", "Entity Extraction") d9 = EvalDimension("summarization", "Summarization") d10 = EvalDimension("long context", "Long Context") d11 = EvalDimension("mmlu", "MMLU") d12 = EvalDimension("arabic language & grammar", "Arabic Language & Grammar") d13 = EvalDimension("general knowledge", "General Knowledge") d14 = EvalDimension("translation (incl dialects)", "Translation (incl Dialects)") d15 = EvalDimension("trust & safety","Trust & Safety") d16 = EvalDimension("writing (incl dialects)", "Writing (incl Dialects)") d17 = EvalDimension("dialect detection", "Dialect Detection") d18 = EvalDimension("reasoning & math", "Reasoning & Math") d19 = EvalDimension("diacritization", "Diacritization") d20 = EvalDimension("instruction following", "Instruction Following") d21 = EvalDimension("transliteration", "Transliteration") d22 = EvalDimension("structuring", "Structuring") d23 = EvalDimension("hallucination", "Hallucination") NUM_FEWSHOT = 0 # Change with your few shot # --------------------------------------------------- # Your leaderboard name TITLE = """

""" # What does your leaderboard evaluate? INTRODUCTION_TEXT = """

Arabic Broad Leaderboard (ABL) - The first comprehensive Leaderboard for Arabic LLMs

ABL is the official Leaderboard of Arabic Broad Benchmark (ABB). With advanced features and innovative visualizations, we provide the community with a comprehensive view of the capabilities of Arabic models, showcasing their speed, diverse skills while also defending against benchmarking contamination. The benchmark consists of 450 high quality human-validated questions sampled from 63 Arabic benchmarking datasets, evaluating 22 categories and skills. Find more details in the about Tab. """ # Which evaluations are you running? how can people reproduce what you have? LLM_BENCHMARKS_TEXT = f""" ## FAQ ### What is the difference betweem ABL and ABB? ABL is the Leaderboard which uses ABB benchmarking dataset and code in the backend to produce the results you see here ### What can I learn more about ABL and ABB? Feel free to read the following resources ABB Page: ABL blog post: ### How can I reproduce the results? You can easily run the ABB benchmarking code using the following command on Google Collab or your own infratructure. ### What is the Benchmark Score? ### What is the Contamination Score? ### What is the Speed? ### Why I am not allowed to submit models more than 15B parameters? """ EVALUATION_QUEUE_TEXT = """ """ CITATION_BUTTON_LABEL = "Copy the following snippet to cite the Leaderboard" CITATION_BUTTON_TEXT = r""" @misc{ABL, author = {SILMA AI Team}, title = {Arabic Broad Leaderboard}, year = {2025}, publisher = {SILMA.AI}, howpublished = "{\url{https://huggingface.co/spaces/silma-ai/Arabic-LLM-Broad-Leaderboard}}" } """ FOOTER_TEXT = """