karimouda committed
Commit aaf9571 · Parent: 5c67778

Prelaunch changes
app.py CHANGED
@@ -105,7 +105,7 @@ def init_leaderboard(dataframe):
         show_fullscreen_button=False,
         interactive=False,
         column_widths=[30,50,40,150,60,60,60],
-        max_height=420,
+        max_height=450,
         elem_classes="leaderboard_col_style",
         show_search="filter",
         max_chars=None
@@ -344,11 +344,11 @@ with demo:
     with gr.Row():
         gr.Markdown("# Submit your model", elem_classes="markdown-text")
     with gr.Column():
-        gr.Markdown("### Please confirm that you understand and accept the conditions below before submitting your model.")
-        prereqs_checkboxes = gr.CheckboxGroup(["I have successfully run the ABB benchmark script on my model using my own infrastructure and I am NOT using the Leaderboard for testing purposes",
-            "I understand that my account/org have only one submission per month",
+        gr.Markdown("### Please confirm that you understand and accept the conditions below before submitting your model:")
+        prereqs_checkboxes = gr.CheckboxGroup(["I have successfully run the ABB benchmark script on my model using my own infrastructure, and I am not using the Leaderboard for testing purposes",
+            "I understand that my account/org has only one submission per month",
             "I understand that I can't submit models more than 15B parameters (learn more in the FAQ)",
-            "I understand that submitting contaminated models or models to test the contamination score will lead to action from our side including banning. We also reserve the right to delete any model we think is contaminated without notice."],
+            "I understand that submitting contaminated models, or models intended to test the contamination score, may result in actions from our side, including banning. We also reserve the right to delete any model we deem contaminated without prior notice"],
            label=None, info=None,
            elem_classes="submit_prereq_checkboxes_container",
            container=False)
@@ -415,7 +415,7 @@ with demo:
             row_count=5,
         )
 
-        with gr.TabItem("📝 FAQ", elem_id="llm-benchmark-tab-about", id=6):
+        with gr.TabItem("📝 FAQ", elem_id="llm-benchmark-tab-faq", id=6):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
        with gr.Row():
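As a side note on the prerequisite checkboxes changed above: a minimal sketch, assuming standard Gradio Blocks APIs, of how a `gr.CheckboxGroup` like `prereqs_checkboxes` can gate a submit button until every condition is accepted. The button, handler, and shortened prerequisite strings below are illustrative, not the Space's actual wiring.

```python
import gradio as gr

# Shortened, hypothetical stand-ins for the Space's prerequisite strings.
PREREQS = [
    "I ran the ABB benchmark script on my own infrastructure",
    "I understand the one-submission-per-month limit",
    "I understand the 15B-parameter limit",
    "I understand the contamination policy",
]

def toggle_submit(selected):
    # Enable the button only once every prerequisite box is ticked.
    return gr.update(interactive=len(selected) == len(PREREQS))

with gr.Blocks() as demo:
    prereqs_checkboxes = gr.CheckboxGroup(PREREQS, container=False)
    submit_button = gr.Button("Submit Eval", interactive=False)
    prereqs_checkboxes.change(toggle_submit, inputs=prereqs_checkboxes, outputs=submit_button)

demo.launch()
```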
requests/Qwen/Qwen3-4B_eval_request.json DELETED
@@ -1 +0,0 @@
-{"model": "Qwen/Qwen3-4B", "model_sha": "82d62bb073771e7a1ea59435f548908540217d1f", "status": "PENDING", "submitted_time": "2025-05-11T06:25:32Z", "likes": 174, "params": 4.022, "license": "apache-2.0"}
 
 
src/about.py CHANGED
@@ -57,42 +57,104 @@ TITLE = """<div ><img class='abl_header_image' src='https://huggingface.co/space
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 <h1 style='width: 100%;text-align: center;' id="space-title">Arabic Broad Leaderboard (ABL) - The first comprehensive Leaderboard for Arabic LLMs</h1>
-ABL is the official Leaderboard of <a href='https://huggingface.co/datasets/silma-ai/arabic-broad-benchmark' target='_blank'>Arabic Broad Benchmark (ABB)</a>.
-With advanced features and innovative visualizations, we provide the community with a comprehensive view of the capabilities of Arabic models, showcasing their speed, diverse skills while also defending against benchmarking contamination.
-The benchmark consists of <b>450 high quality human-validated questions</b> sampled from <b>63 Arabic benchmarking datasets</b>, evaluating <b>22 categories and skills</b>.
-Find more details in the about Tab.
-
-
+ABL, the official leaderboard of the <a href='https://huggingface.co/datasets/silma-ai/arabic-broad-benchmark' target='_blank'>Arabic Broad Benchmark (ABB)</a>,
+is a next-generation leaderboard offering innovative visualizations, analytical capabilities, model skill breakdowns, speed comparisons, and contamination detection mechanisms. ABL provides the community with an unprecedented ability to study the capabilities of Arabic models and choose the right model for the right task. Find more details in the FAQ section.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-
-## FAQ
-
-### What is the difference betweem ABL and ABB?
-
-ABL is the Leaderboard which uses ABB benchmarking dataset and code in the backend to produce the results you see here
-
-
-### What can I learn more about ABL and ABB?
-
-Feel free to read the following resources
-ABB Page:
-ABL blog post:
-
-### How can I reproduce the results?
-
-You can easily run the ABB benchmarking code using the following command on Google Collab or your own infratructure.
-
-### What is the Benchmark Score?
-
-### What is the Contamination Score?
-
-### What is the Speed?
-
-### Why I am not allowed to submit models more than 15B parameters?
-
+# FAQ
+
+---
+
+## What is the Benchmark Score?
+
+* The benchmark score is calculated by taking the average of all individual question scores.
+* Each question is scored from 0 to 10 using a mix of LLM-as-judge and manual rules, depending on the question type.
+* Please refer to the ABB page below for more information about the scoring rules and the dataset:
+
+https://huggingface.co/datasets/silma-ai/arabic-broad-benchmark#scoring-rules
+
+---
+
+## What is the Contamination Score?
+
+* The contamination score measures the probability that a model has been trained on the ABB benchmarking data to boost its scores on ABL.
+* After testing each model on ABL, we run our private algorithm to detect contamination and arrive at a score.
+* Contaminated models will show a red sign and a number above zero in the Contamination Score column.
+* Any model showing signs of contamination will be deleted instantly from the leaderboard.
+
+---
+
+## What is the Speed?
+
+* Speed shows how fast the model was during testing, using the "words per second" metric.
+* The score is calculated by dividing the number of words generated by the model during the entire test by the time taken (in seconds) for the model to complete testing.
+* Please note that we use the same GPU (A100) and a batch size of 1 for all models (in the same size category) to ensure a fair comparison.
+
+---
+
+## What does Size mean?
+
+* Models below 3.5B parameters are considered Nano.
+* Models between 3.5B and 10B parameters are considered Small.
+* Models between 10B and 30B parameters are considered Medium.
+* Models above 30B parameters are considered Large.
+
+---
+
+## What does Source mean?
+
+* API: Closed models tested via an API.
+* Hugging Face: Open models downloaded and tested from Hugging Face via the `transformers` library.
+
+---
+
+## How can I reproduce the results?
+
+You can easily reproduce the results of any model by following the steps on the ABB page below:
+
+https://huggingface.co/datasets/silma-ai/arabic-broad-benchmark#how-to-use-abb-to-benchmark-a-model
+
+---
+
+## I tested a model and got a slightly different score. Why is that?
+
+* ABB is partially dependent on an external LLM-as-judge (GPT-4.1).
+* LLMs are nondeterministic by nature and will not always produce the same scores on every run.
+* That said, according to our testing, such variations are always within a +/-1% range.
+
+---
+
+## I have seen an answer which seems correct to me but is getting a zero score. Why is that?
+
+* First, LLM scoring is not always consistent, and it sometimes gives a wrong score to an answer; based on our testing, this is very rare.
+* Second, we also have fixed rules in place to penalize models; for example, when a model answers in another language or answers in two languages, we give a score of zero.
+* In general, both fixed rules and LLM inconsistencies apply to all models in the same way, which we consider fair.
+
+---
+
+## Why am I not allowed to submit models with more than 15B parameters?
+
+* Models above 15B parameters don't fit on a single GPU and require multiple GPUs, which we can't always guarantee to provision in an automated manner.
+* We also know that most community models are below 15B parameters.
+* As an exception, we can accept requests from organizations on a case-by-case basis.
+* Finally, we will always make sure to include larger models when they have high adoption in the community.
+
+---
+
+## How can I learn more about ABL and ABB?
+
+Feel free to read through the following resources:
+
+* **ABB Page**: https://huggingface.co/datasets/silma-ai/arabic-broad-benchmark
+* **ABL blog post**: Coming soon...
+
+---
+
+## How can I contact the benchmark maintainers?
+
+You can contact us via benchmark@silma.ai
 """
 
@@ -104,7 +166,7 @@ CITATION_BUTTON_LABEL = "Copy the following snippet to cite the Leaderboard"
 CITATION_BUTTON_TEXT = r"""
 
 @misc{ABL,
-  author = {SILMA AI Team},
+  author = {SILMA.AI Team},
   title = {Arabic Broad Leaderboard},
   year = {2025},
   publisher = {SILMA.AI},
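The new FAQ text defines three quantities in prose: the benchmark score (mean of per-question 0-10 scores), Speed (total words generated divided by total seconds), and the Size buckets. A minimal sketch of those formulas, assuming only the definitions stated above; the function names are illustrative, not ABB's actual code, and the bucket boundaries are assumed inclusive on the lower end.

```python
def benchmark_score(question_scores: list[float]) -> float:
    """Average of the per-question scores, each on a 0-10 scale."""
    return sum(question_scores) / len(question_scores)

def speed_wps(total_words_generated: int, total_seconds: float) -> float:
    """Words per second: words produced over the whole test / wall-clock seconds."""
    return total_words_generated / total_seconds

def size_category(params_in_billions: float) -> str:
    """Size buckets as stated in the FAQ."""
    if params_in_billions < 3.5:
        return "Nano"
    if params_in_billions < 10:
        return "Small"
    if params_in_billions < 30:
        return "Medium"
    return "Large"

assert size_category(4.022) == "Small"  # e.g. Qwen3-4B at ~4.02B params
```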
src/display/css_html_js.py CHANGED
@@ -2,7 +2,7 @@ custom_css = """
 
 .abl_desc_text {
     font-size: 16px !important;
-    margin-bottom: 5px;
+    margin-bottom: 0px;
 }
 
 #models-to-add-text {
@@ -155,6 +155,14 @@ border-radius: 10px;
 .prose *{
     color:unset;
 }
+
+#llm-benchmark-tab-submit{
+    margin-top:20px;
+}
+
+#llm-benchmark-tab-faq{
+    margin-top:20px;
+}
 """
 
 get_window_url_params = """
src/submission/submit.py CHANGED
@@ -89,7 +89,7 @@ def add_new_eval(
     progress(0.3, desc=f"Checking model size")
     model_size = get_model_size(model_info=model_info)#, precision=precision
 
-    if model_size>150:##********************CHANGE
+    if model_size>15:
         yield styled_error("We currently accept community-submitted models up to 15 billion parameters only. If you represent an organization then please contact us at benchmark@silma.ai")
         return
 
@@ -113,7 +113,7 @@ def add_new_eval(
     progress(0.6, desc=f"Checking last submission date")
     previous_user_submissions = USERS_TO_SUBMISSION_DATES.get(user_name)
 
-    if False and previous_user_submissions:
+    if previous_user_submissions:
 
         previous_user_submission_dates = [datetime.strptime(date.replace("T"," ").split(" ")[0], "%Y-%m-%d") for date in previous_user_submissions]
         previous_user_submission_dates.sort(reverse=True)
@@ -132,7 +132,7 @@ def add_new_eval(
 
     # Check for duplicate submission
     if f"{model}" in REQUESTED_MODELS: #_{revision}_{precision}
-        yield styled_warning("This model has been already submitted.")
+        yield styled_warning("This model has already been submitted.")
        return
 
     # Seems good, creating the eval
@@ -186,7 +186,7 @@
 
     if queue_len == 0:
         queue_data = []
-    elif queue_len >= 10:##********************CHANGE
+    elif queue_len >= 1:
         yield styled_warning("The evaluation queue is full at the moment. Please try again in one hour")
         return
 
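Taken together, the hunks above enable three gates on submission: a 15B parameter cap, the previously disabled one-submission-per-month check (the `if False and ...` guard is dropped), and a queue that now rejects new work once a single item is pending (`queue_len >= 1`). A standalone sketch of that gating logic; the 30-day window is an assumption, since the actual date comparison sits outside the shown hunks, and `check_submission` is a hypothetical helper, not the Space's real function.

```python
from datetime import datetime, timedelta

MAX_PARAMS_B = 15   # community cap, in billions of parameters
MAX_QUEUE_LEN = 1   # queue rejects new submissions once one item is pending

def check_submission(model_size_b: float,
                     previous_dates: list[str] | None,
                     queue_len: int) -> str | None:
    """Return an error message if any gate fails, else None."""
    if model_size_b > MAX_PARAMS_B:
        return "We currently accept community-submitted models up to 15 billion parameters only."

    if previous_dates:
        # Same parsing as submit.py: keep only the date part of the ISO timestamp.
        parsed = [datetime.strptime(d.replace("T", " ").split(" ")[0], "%Y-%m-%d")
                  for d in previous_dates]
        parsed.sort(reverse=True)
        if datetime.now() - parsed[0] < timedelta(days=30):  # assumed 30-day window
            return "Only one submission per account/org per month."

    if queue_len >= MAX_QUEUE_LEN:
        return "The evaluation queue is full at the moment."

    return None

print(check_submission(20.0, None, 0))  # -> the size-cap error message
```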