Prelaunch changes
- app.py +6 -6
- requests/Qwen/Qwen3-4B_eval_request.json +0 -1
- src/about.py +82 -20
- src/display/css_html_js.py +9 -1
- src/submission/submit.py +4 -4
app.py
CHANGED
@@ -105,7 +105,7 @@ def init_leaderboard(dataframe):
         show_fullscreen_button=False,
         interactive=False,
         column_widths=[30,50,40,150,60,60,60],
-        max_height=
+        max_height=450,
         elem_classes="leaderboard_col_style",
         show_search="filter",
         max_chars=None
@@ -344,11 +344,11 @@ with demo:
     with gr.Row():
         gr.Markdown("# Submit your model", elem_classes="markdown-text")
     with gr.Column():
-        gr.Markdown("### Please confirm that you understand and accept the conditions below before submitting your model
-        prereqs_checkboxes = gr.CheckboxGroup(["I have successfully run the ABB benchmark script on my model using my own infrastructure and I am
-        "I understand that my account/org
+        gr.Markdown("### Please confirm that you understand and accept the conditions below before submitting your model:")
+        prereqs_checkboxes = gr.CheckboxGroup(["I have successfully run the ABB benchmark script on my model using my own infrastructure, and I am not using the Leaderboard for testing purposes",
+        "I understand that my account/org has only one submission per month",
         "I understand that I can't submit models more than 15B parameters (learn more in the FAQ)",
-        "I understand that submitting contaminated models or models to test the contamination score
+        "I understand that submitting contaminated models, or models intended to test the contamination score, may result in actions from our side, including banning. We also reserve the right to delete any model we deem contaminated without prior notice"],
         label=None, info=None,
         elem_classes="submit_prereq_checkboxes_container",
         container=False)
@@ -415,7 +415,7 @@ with demo:
         row_count=5,
     )

-    with gr.TabItem("📝 FAQ", elem_id="llm-benchmark-tab-
+    with gr.TabItem("📝 FAQ", elem_id="llm-benchmark-tab-faq", id=6):
         gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

     with gr.Row():
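As a side note on the prerequisite checkboxes added above: the following is a minimal, hypothetical sketch of how a `gr.CheckboxGroup` like this can gate a submit handler. It is not the Space's actual wiring; `demo_sketch`, `on_submit`, the shortened checkbox texts, and the plain status output are invented for illustration.

```python
import gradio as gr

# Abbreviated stand-ins for the real prerequisite texts in app.py
PREREQS = [
    "I have run the ABB benchmark script on my own infrastructure",
    "I understand the one-submission-per-month limit",
    "I understand the 15B parameter limit",
    "I understand the contamination policy",
]

def on_submit(checked, model_name):
    # Refuse the request unless every prerequisite box is ticked.
    if set(checked) != set(PREREQS):
        return "Please accept all conditions before submitting."
    return f"Submission received for {model_name}."

with gr.Blocks() as demo_sketch:
    prereqs = gr.CheckboxGroup(PREREQS, label=None, container=False)
    model_name = gr.Textbox(label="Model name (e.g. org/model)")
    status = gr.Markdown()
    gr.Button("Submit").click(on_submit, inputs=[prereqs, model_name], outputs=status)

# demo_sketch.launch()
```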
requests/Qwen/Qwen3-4B_eval_request.json
DELETED
@@ -1 +0,0 @@
-{"model": "Qwen/Qwen3-4B", "model_sha": "82d62bb073771e7a1ea59435f548908540217d1f", "status": "PENDING", "submitted_time": "2025-05-11T06:25:32Z", "likes": 174, "params": 4.022, "license": "apache-2.0"}
src/about.py
CHANGED
@@ -57,42 +57,104 @@ TITLE = """<div ><img class='abl_header_image' src='https://huggingface.co/space
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 <h1 style='width: 100%;text-align: center;' id="space-title">Arabic Broad Leaderboard (ABL) - The first comprehensive Leaderboard for Arabic LLMs</h1>
-ABL
-
-The benchmark consists of <b>450 high quality human-validated questions</b> sampled from <b>63 Arabic benchmarking datasets</b>, evaluating <b>22 categories and skills</b>.
-Find more details in the about Tab.
-
-
+ABL, the official leaderboard of the <a href='https://huggingface.co/datasets/silma-ai/arabic-broad-benchmark' target='_blank'>Arabic Broad Benchmark (ABB)</a>
+is a next-generation leaderboard offering innovative visualizations, analytical capabilities, model skill breakdowns, speed comparisons, and contamination detection mechanisms. ABL provides the community with an unprecedented ability to study the capabilities of Arabic models and choose the right model for the right task. Find more details in the FAQ section.
 """

 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
+# FAQ
+
+---
+
+## What is the Benchmark Score?
+
+* The benchmark score is calculated by taking the average of all individual question scores.
+* Each question is scored from 0 to 10 using a mix of LLM-as-judge and manual rules, depending on the question type.
+* Please refer to the ABB page below for more information about the scoring rules and the dataset:
+
+https://huggingface.co/datasets/silma-ai/arabic-broad-benchmark#scoring-rules
+
+---
+
+## What is the Contamination Score?
+
+* The contamination score is a score measuring the probability that a model has been trained using the ABB benchmarking data to boost its scores on ABL.
+* After testing each model on ABL, we run our private algorithm to detect contamination and arrive at a score.
+* Contaminated models will show a red sign and a number above zero in the Contamination Score column.
+* Any model showing signs of contamination will be deleted instantly from the leaderboard.
+
+---
+
+## What is the Speed?
+
+* Speed shows how fast the model was during testing, using the "words per second" metric.
+* The score is calculated by dividing the number of words generated by the model during the entire test by the time taken (in seconds) for the model to complete testing.
+* Please note that we use the same GPU (A100) and a batch size of 1 for all models (in the same size category) to ensure a fair comparison.
+
+---
+
+## What does Size mean?
+
+* Models below 3.5B parameters are considered Nano.
+* Models between 3.5B and 10B parameters are considered Small.
+* Models between 10B and 30B parameters are considered Medium.
+* Models above 30B parameters are considered Large.
+
+---
+
+## What does Source mean?
+
+* API: Closed models tested via an API.
+* Hugging Face: Open models downloaded and tested from Hugging Face via the `transformers` library.
+
+---
+
+## How can I reproduce the results?
+
+You can easily reproduce the results of any model by following the steps on the ABB page below:
+
+https://huggingface.co/datasets/silma-ai/arabic-broad-benchmark#how-to-use-abb-to-benchmark-a-model
+
+---
+
+## I tested a model and got a slightly different score. Why is that?
+
+* ABB is partially dependent on an external LLM-as-judge (GPT-4.1).
+* LLMs are random in nature and will not always produce the same scores on every run.
+* That said, according to our testing, such variations are always within a +/-1% range.
+
+---
+
+## I have seen an answer which seems correct to me but is getting a zero score. Why is that?
+
+* First, LLM scoring is not always consistent, and sometimes it gives a wrong score to an answer, but based on our testing, this is very rare.
+* Second, we also have fixed rules in place to penalize models; for example, when a model answers in another language or answers in two languages, we give a score of zero.
+* In general, both fixed rules and LLM inconsistencies are applied to all models in the same way, which we consider fair.
+
+---
+
+## Why am I not allowed to submit models with more than 15B parameters?
+
+* Models above 15B parameters don't fit into a single GPU and require provisioning of multiple GPUs, which we can't always guarantee to provision in an automated manner.
+* We also know that most community models are below 15B parameters.
+* As an exception, we can accept requests from organizations on a case-by-case basis.
+* Finally, we will always make sure to include larger models when they have high adoption from the community.
+
+---
+
+## How can I learn more about ABL and ABB?
+
+Feel free to read through the following resources:
+
+* **ABB Page**: https://huggingface.co/datasets/silma-ai/arabic-broad-benchmark
+* **ABL blog post**: Coming soon...
+
+---
+
+## How can I contact the benchmark maintainers?
+
+You can contact us via benchmark@silma.ai
+
 """

@@ -104,7 +166,7 @@ CITATION_BUTTON_LABEL = "Copy the following snippet to cite the Leaderboard"
 CITATION_BUTTON_TEXT = r"""

 @misc{ABL,
-    author = {SILMA
+    author = {SILMA.AI Team},
     title = {Arabic Broad Leaderboard},
     year = {2025},
     publisher = {SILMA.AI},
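To make the metric definitions in the new FAQ concrete, here is an illustrative-only sketch of the three formulas it describes: the average of per-question 0-10 scores, the "words per second" speed, and the size buckets. The function names and sample numbers are invented, and this code is not part of the leaderboard.

```python
# Illustrative sketch of the FAQ formulas above; not part of the Space's code.

def benchmark_score(question_scores):
    # Average of all individual question scores (each scored 0-10).
    return sum(question_scores) / len(question_scores)

def speed_wps(total_words_generated, total_seconds):
    # Words generated over the whole test divided by total time in seconds.
    return total_words_generated / total_seconds

def size_category(params_billions):
    # Buckets from the "What does Size mean?" section.
    if params_billions < 3.5:
        return "Nano"
    if params_billions < 10:
        return "Small"
    if params_billions < 30:
        return "Medium"
    return "Large"

print(benchmark_score([10, 7, 0, 9]))       # 6.5
print(round(speed_wps(54_000, 3_600), 1))   # 15.0 words per second
print(size_category(4.022))                 # "Small"
```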
src/display/css_html_js.py
CHANGED
@@ -2,7 +2,7 @@ custom_css = """

 .abl_desc_text {
 font-size: 16px !important;
-margin-bottom:
+margin-bottom: 0px;
 }

 #models-to-add-text {
@@ -155,6 +155,14 @@ border-radius: 10px;
 .prose *{
 color:unset;
 }
+
+#llm-benchmark-tab-submit{
+margin-top:20px;
+}
+
+#llm-benchmark-tab-faq{
+margin-top:20px;
+}
 """

 get_window_url_params = """
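The new `#llm-benchmark-tab-submit` and `#llm-benchmark-tab-faq` rules work because Gradio exposes a component's `elem_id` as the HTML id that `custom_css` can target. Below is a minimal standalone sketch of that mechanism under assumed names (`demo_sketch`, placeholder tab content); it is not the actual app.py layout.

```python
import gradio as gr

# The elem_id set on a component becomes its HTML id, so the selector
# #llm-benchmark-tab-faq below matches the TabItem defined in the layout.
custom_css = """
#llm-benchmark-tab-faq { margin-top: 20px; }
"""

with gr.Blocks(css=custom_css) as demo_sketch:
    with gr.Tabs():
        with gr.TabItem("📝 FAQ", elem_id="llm-benchmark-tab-faq", id=6):
            gr.Markdown("FAQ content goes here")

# demo_sketch.launch()
```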
src/submission/submit.py
CHANGED
@@ -89,7 +89,7 @@ def add_new_eval(
     progress(0.3, desc=f"Checking model size")
     model_size = get_model_size(model_info=model_info)#, precision=precision

-    if model_size>
+    if model_size>15:
         yield styled_error("We currently accept community-submitted models up to 15 billion parameters only. If you represent an organization then please contact us at benchmark@silma.ai")
         return

@@ -113,7 +113,7 @@ def add_new_eval(
     progress(0.6, desc=f"Checking last submission date")
     previous_user_submissions = USERS_TO_SUBMISSION_DATES.get(user_name)

-    if
+    if previous_user_submissions:

         previous_user_submission_dates = [datetime.strptime(date.replace("T"," ").split(" ")[0], "%Y-%m-%d") for date in previous_user_submissions]
         previous_user_submission_dates.sort(reverse=True)
@@ -132,7 +132,7 @@ def add_new_eval(

     # Check for duplicate submission
     if f"{model}" in REQUESTED_MODELS: #_{revision}_{precision}
-        yield styled_warning("This model has been
+        yield styled_warning("This model has already been submitted.")
         return

     # Seems good, creating the eval
@@ -186,7 +186,7 @@ def add_new_eval(

     if queue_len == 0:
         queue_data = []
-    elif queue_len >=
+    elif queue_len >= 1:
         yield styled_warning("The evaluation queue is full at the moment. Please try again in one hour")
         return

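For context on the `previous_user_submissions` guard above: the diff only shows the guard and the date parsing, so the sketch below fills in a plausible "one submission per month" comparison. The helper name and the 30-day window are assumptions, not the repository's actual logic.

```python
from datetime import datetime

def can_submit_again(previous_user_submissions, now=None):
    # Hypothetical helper; submit.py's real check is not shown in this diff.
    if not previous_user_submissions:
        return True
    now = now or datetime.now()
    # Same parsing as the diff: keep only the date part of the ISO timestamp.
    dates = [
        datetime.strptime(d.replace("T", " ").split(" ")[0], "%Y-%m-%d")
        for d in previous_user_submissions
    ]
    dates.sort(reverse=True)
    # Assume "one submission per month" means 30+ days since the latest one.
    return (now - dates[0]).days >= 30

print(can_submit_again(["2025-05-11T06:25:32Z"], now=datetime(2025, 6, 15)))  # True, 35 days later
```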