karimouda committed
Commit aaf9571 · Parent: 5c67778

Prelaunch changes
app.py CHANGED
@@ -105,7 +105,7 @@ def init_leaderboard(dataframe):
         show_fullscreen_button=False,
         interactive=False,
         column_widths=[30,50,40,150,60,60,60],
-        max_height=420,
+        max_height=450,
         elem_classes="leaderboard_col_style",
         show_search="filter",
         max_chars=None
@@ -344,11 +344,11 @@ with demo:
     with gr.Row():
         gr.Markdown("# Submit your model", elem_classes="markdown-text")
     with gr.Column():
-        gr.Markdown("### Please confirm that you understand and accept the conditions below before submitting your model.")
-        prereqs_checkboxes = gr.CheckboxGroup(["I have successfully run the ABB benchmark script on my model using my own infrastructure and I am NOT using the Leaderboard for testing purposes",
-            "I understand that my account/org have only one submission per month",
+        gr.Markdown("### Please confirm that you understand and accept the conditions below before submitting your model:")
+        prereqs_checkboxes = gr.CheckboxGroup(["I have successfully run the ABB benchmark script on my model using my own infrastructure, and I am not using the Leaderboard for testing purposes",
+            "I understand that my account/org has only one submission per month",
             "I understand that I can't submit models more than 15B parameters (learn more in the FAQ)",
-            "I understand that submitting contaminated models or models to test the contamination score will lead to action from our side including banning. We also reserve the right to delete any model we think is contaminated without notice."],
+            "I understand that submitting contaminated models, or models intended to test the contamination score, may result in actions from our side, including banning. We also reserve the right to delete any model we deem contaminated without prior notice"],
            label=None, info=None,
            elem_classes="submit_prereq_checkboxes_container",
            container=False)
@@ -415,7 +415,7 @@ with demo:
             row_count=5,
         )
 
-        with gr.TabItem("📝 FAQ", elem_id="llm-benchmark-tab-about", id=6):
+        with gr.TabItem("📝 FAQ", elem_id="llm-benchmark-tab-faq", id=6):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
        with gr.Row():
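As a side note on the prerequisite checkboxes changed above: a minimal sketch, assuming standard Gradio Blocks APIs, of how a `gr.CheckboxGroup` like `prereqs_checkboxes` can gate a submit button until every condition is accepted. The button, handler, and shortened prerequisite strings below are illustrative, not the Space's actual wiring.

```python
import gradio as gr

# Shortened, hypothetical stand-ins for the Space's prerequisite strings.
PREREQS = [
    "I ran the ABB benchmark script on my own infrastructure",
    "I understand the one-submission-per-month limit",
    "I understand the 15B-parameter limit",
    "I understand the contamination policy",
]

def toggle_submit(selected):
    # Enable the button only once every prerequisite box is ticked.
    return gr.update(interactive=len(selected) == len(PREREQS))

with gr.Blocks() as demo:
    prereqs_checkboxes = gr.CheckboxGroup(PREREQS, container=False)
    submit_button = gr.Button("Submit Eval", interactive=False)
    prereqs_checkboxes.change(toggle_submit, inputs=prereqs_checkboxes, outputs=submit_button)

demo.launch()
```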
requests/Qwen/Qwen3-4B_eval_request.json DELETED
@@ -1 +0,0 @@
-{"model": "Qwen/Qwen3-4B", "model_sha": "82d62bb073771e7a1ea59435f548908540217d1f", "status": "PENDING", "submitted_time": "2025-05-11T06:25:32Z", "likes": 174, "params": 4.022, "license": "apache-2.0"}
 
 
src/about.py CHANGED
@@ -57,42 +57,104 @@ TITLE = """<div ><img class='abl_header_image' src='https://huggingface.co/space
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 <h1 style='width: 100%;text-align: center;' id="space-title">Arabic Broad Leaderboard (ABL) - The first comprehensive Leaderboard for Arabic LLMs</h1>
-ABL is the official Leaderboard of <a href='https://huggingface.co/datasets/silma-ai/arabic-broad-benchmark' target='_blank'>Arabic Broad Benchmark (ABB)</a>.
-With advanced features and innovative visualizations, we provide the community with a comprehensive view of the capabilities of Arabic models, showcasing their speed, diverse skills while also defending against benchmarking contamination.
-The benchmark consists of <b>450 high quality human-validated questions</b> sampled from <b>63 Arabic benchmarking datasets</b>, evaluating <b>22 categories and skills</b>.
-Find more details in the about Tab.
-
-
+ABL, the official leaderboard of the <a href='https://huggingface.co/datasets/silma-ai/arabic-broad-benchmark' target='_blank'>Arabic Broad Benchmark (ABB)</a>,
+is a next-generation leaderboard offering innovative visualizations, analytical capabilities, model skill breakdowns, speed comparisons, and contamination detection mechanisms. ABL provides the community with an unprecedented ability to study the capabilities of Arabic models and choose the right model for the right task. Find more details in the FAQ section.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-
-## FAQ
-
-### What is the difference betweem ABL and ABB?
-
-ABL is the Leaderboard which uses ABB benchmarking dataset and code in the backend to produce the results you see here
-
-
-### What can I learn more about ABL and ABB?
-
-Feel free to read the following resources
-ABB Page:
-ABL blog post:
-
-### How can I reproduce the results?
-
-You can easily run the ABB benchmarking code using the following command on Google Collab or your own infratructure.
-
-### What is the Benchmark Score?
-
-### What is the Contamination Score?
-
-### What is the Speed?
-
-### Why I am not allowed to submit models more than 15B parameters?
-
+# FAQ
+
+---
+
+## What is the Benchmark Score?
+
+* The benchmark score is calculated by taking the average of all individual question scores.
+* Each question is scored from 0 to 10 using a mix of LLM-as-judge and manual rules, depending on the question type.
+* Please refer to the ABB page below for more information about the scoring rules and the dataset:
+
+https://huggingface.co/datasets/silma-ai/arabic-broad-benchmark#scoring-rules
+
+---
+
+## What is the Contamination Score?
+
+* The contamination score measures the probability that a model has been trained on the ABB benchmarking data to boost its scores on ABL.
+* After testing each model on ABL, we run our private algorithm to detect contamination and arrive at a score.
+* Contaminated models will show a red sign and a number above zero in the Contamination Score column.
+* Any model showing signs of contamination will be deleted instantly from the leaderboard.
+
+---
+
+## What is the Speed?
+
+* Speed shows how fast the model was during testing, using the "words per second" metric.
+* The score is calculated by dividing the number of words generated by the model during the entire test by the time taken (in seconds) for the model to complete testing.
+* Please note that we use the same GPU (A100) and a batch size of 1 for all models (in the same size category) to ensure a fair comparison.
+
+---
+
+## What does Size mean?
+
+* Models below 3.5B parameters are considered Nano.
+* Models between 3.5B and 10B parameters are considered Small.
+* Models between 10B and 30B parameters are considered Medium.
+* Models above 30B parameters are considered Large.
+
+---
+
+## What does Source mean?
+
+* API: Closed models tested via an API.
+* Hugging Face: Open models downloaded and tested from Hugging Face via the `transformers` library.
+
+---
+
+## How can I reproduce the results?
+
+You can easily reproduce the results of any model by following the steps on the ABB page below:
+
+https://huggingface.co/datasets/silma-ai/arabic-broad-benchmark#how-to-use-abb-to-benchmark-a-model
+
+---
+
+## I tested a model and got a slightly different score. Why is that?
+
+* ABB is partially dependent on an external LLM-as-judge (GPT-4.1).
+* LLMs are nondeterministic by nature and will not always produce the same scores on every run.
+* That said, according to our testing, such variations are always within a +/-1% range.
+
+---
+
+## I have seen an answer which seems correct to me but is getting a zero score. Why is that?
+
+* First, LLM scoring is not always consistent, and it sometimes gives a wrong score to an answer; based on our testing, this is very rare.
+* Second, we also have fixed rules in place to penalize models; for example, when a model answers in another language or answers in two languages, we give a score of zero.
+* In general, both fixed rules and LLM inconsistencies apply to all models in the same way, which we consider fair.
+
+---
+
+## Why am I not allowed to submit models with more than 15B parameters?
+
+* Models above 15B parameters don't fit on a single GPU and require multiple GPUs, which we can't always guarantee to provision in an automated manner.
+* We also know that most community models are below 15B parameters.
+* As an exception, we can accept requests from organizations on a case-by-case basis.
+* Finally, we will always make sure to include larger models when they have high adoption in the community.
+
+---
+
+## How can I learn more about ABL and ABB?
+
+Feel free to read through the following resources:
+
+* **ABB Page**: https://huggingface.co/datasets/silma-ai/arabic-broad-benchmark
+* **ABL blog post**: Coming soon...
+
+---
+
+## How can I contact the benchmark maintainers?
+
+You can contact us via benchmark@silma.ai
 """
 
@@ -104,7 +166,7 @@ CITATION_BUTTON_LABEL = "Copy the following snippet to cite the Leaderboard"
 CITATION_BUTTON_TEXT = r"""
 
 @misc{ABL,
-  author = {SILMA AI Team},
+  author = {SILMA.AI Team},
   title = {Arabic Broad Leaderboard},
   year = {2025},
   publisher = {SILMA.AI},
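The new FAQ text defines three quantities in prose: the benchmark score (mean of per-question 0-10 scores), Speed (total words generated divided by total seconds), and the Size buckets. A minimal sketch of those formulas, assuming only the definitions stated above; the function names are illustrative, not ABB's actual code, and the bucket boundaries are assumed inclusive on the lower end.

```python
def benchmark_score(question_scores: list[float]) -> float:
    """Average of the per-question scores, each on a 0-10 scale."""
    return sum(question_scores) / len(question_scores)

def speed_wps(total_words_generated: int, total_seconds: float) -> float:
    """Words per second: words produced over the whole test / wall-clock seconds."""
    return total_words_generated / total_seconds

def size_category(params_in_billions: float) -> str:
    """Size buckets as stated in the FAQ."""
    if params_in_billions < 3.5:
        return "Nano"
    if params_in_billions < 10:
        return "Small"
    if params_in_billions < 30:
        return "Medium"
    return "Large"

assert size_category(4.022) == "Small"  # e.g. Qwen3-4B at ~4.02B params
```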
src/display/css_html_js.py CHANGED
@@ -2,7 +2,7 @@ custom_css = """
 
 .abl_desc_text {
     font-size: 16px !important;
-    margin-bottom: 5px;
+    margin-bottom: 0px;
 }
 
 #models-to-add-text {
@@ -155,6 +155,14 @@ border-radius: 10px;
 .prose *{
     color:unset;
 }
+
+#llm-benchmark-tab-submit{
+    margin-top:20px;
+}
+
+#llm-benchmark-tab-faq{
+    margin-top:20px;
+}
 """
 
 get_window_url_params = """
src/submission/submit.py CHANGED
@@ -89,7 +89,7 @@ def add_new_eval(
     progress(0.3, desc=f"Checking model size")
     model_size = get_model_size(model_info=model_info)#, precision=precision
 
-    if model_size>150:##********************CHANGE
+    if model_size>15:
         yield styled_error("We currently accept community-submitted models up to 15 billion parameters only. If you represent an organization then please contact us at benchmark@silma.ai")
         return
 
@@ -113,7 +113,7 @@ def add_new_eval(
     progress(0.6, desc=f"Checking last submission date")
     previous_user_submissions = USERS_TO_SUBMISSION_DATES.get(user_name)
 
-    if False and previous_user_submissions:
+    if previous_user_submissions:
 
         previous_user_submission_dates = [datetime.strptime(date.replace("T"," ").split(" ")[0], "%Y-%m-%d") for date in previous_user_submissions]
         previous_user_submission_dates.sort(reverse=True)
@@ -132,7 +132,7 @@ def add_new_eval(
 
     # Check for duplicate submission
     if f"{model}" in REQUESTED_MODELS: #_{revision}_{precision}
-        yield styled_warning("This model has been already submitted.")
+        yield styled_warning("This model has already been submitted.")
        return
 
     # Seems good, creating the eval
@@ -186,7 +186,7 @@
 
     if queue_len == 0:
         queue_data = []
-    elif queue_len >= 10:##********************CHANGE
+    elif queue_len >= 1:
         yield styled_warning("The evaluation queue is full at the moment. Please try again in one hour")
         return
 
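Taken together, the hunks above enable three gates on submission: a 15B parameter cap, the previously disabled one-submission-per-month check (the `if False and ...` guard is dropped), and a queue that now rejects new work once a single item is pending (`queue_len >= 1`). A standalone sketch of that gating logic; the 30-day window is an assumption, since the actual date comparison sits outside the shown hunks, and `check_submission` is a hypothetical helper, not the Space's real function.

```python
from datetime import datetime, timedelta

MAX_PARAMS_B = 15   # community cap, in billions of parameters
MAX_QUEUE_LEN = 1   # queue rejects new submissions once one item is pending

def check_submission(model_size_b: float,
                     previous_dates: list[str] | None,
                     queue_len: int) -> str | None:
    """Return an error message if any gate fails, else None."""
    if model_size_b > MAX_PARAMS_B:
        return "We currently accept community-submitted models up to 15 billion parameters only."

    if previous_dates:
        # Same parsing as submit.py: keep only the date part of the ISO timestamp.
        parsed = [datetime.strptime(d.replace("T", " ").split(" ")[0], "%Y-%m-%d")
                  for d in previous_dates]
        parsed.sort(reverse=True)
        if datetime.now() - parsed[0] < timedelta(days=30):  # assumed 30-day window
            return "Only one submission per account/org per month."

    if queue_len >= MAX_QUEUE_LEN:
        return "The evaluation queue is full at the moment."

    return None

print(check_submission(20.0, None, 0))  # -> the size-cap error message
```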