karimouda committed
Commit ca48878 · 1 Parent(s): 98e8d9a

Top size + Top by Skill

Files changed (5)
  1. app.py +87 -13
  2. src/about.py +36 -3
  3. src/display/utils.py +8 -5
  4. src/leaderboard/read_evals.py +24 -10
  5. src/populate.py +2 -0
app.py CHANGED
@@ -59,22 +59,86 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
+def hide_skill_columns(dataframe, exceptions=[]):
+    return dataframe[[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default or c.name in exceptions]]
+
+
+def perform_cell_formatting(dataframe):
+    return dataframe.style.format({'Contamination Score': "{:.2f}",'Benchmark Score': "{:.2f}",'Speed (words/sec)': "{:.2f}"}).apply(
+        lambda rows: [
+            "background-color: red;" if (value >0) else "background-color: green;" for value in rows
+        ],
+        subset=["Contamination Score"],
+    )
+
 def init_leaderboard(dataframe):
-    #if dataframe is None or dataframe.empty:
-    #raise ValueError("Leaderboard DataFrame is empty or None.")
-    dataframe = dataframe[[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default]]
+
+    dataframe = hide_skill_columns(dataframe)
+
 
-    styler = dataframe.style.format({'Contamination Score': "{:.2f}",'Benchmark Score': "{:.2f}",'Speed (words/sec)': "{:.2f}"}).apply(
-        lambda rows: [
-            "background-color: red;" if (value >0) else "background-color: green;" for value in rows
-        ],
-        subset=["Contamination Score"],
+    styler = perform_cell_formatting(dataframe)
 
+    return gr.Dataframe(
+        value=styler,
+        datatype="markdown",
+        wrap=True,
+        show_fullscreen_button=False,
+        interactive=False,
+        column_widths=[30,50,50,150,60,60,60],
+        max_height=420,
+        elem_classes="leaderboard_col_style"
     )
 
 
-
-    return gr.Dataframe(
+def init_skill_leaderboard(dataframe):
+
+    ## create selector for model skills, based on the selector filter the dataframe
+    skills = ['MMLU', 'General Knowledge', 'Reasoning & Math', 'Translation (incl Dialects)', 'Trust & Safety', 'Writing (incl Dialects)', 'RAG QA', 'Reading Comprehension', 'Arabic Language & Grammar', 'Diacritization', 'Dialect Detection', 'Sentiment Analysis', 'Summarization', 'Instruction Following', 'Transliteration', 'Paraphrasing', 'Entity Extraction', 'Long Context', 'Coding', 'Hallucination', 'Function Calling', 'Structuring']
+
+    skills_dropdown = gr.Dropdown(choices=skills, label="Select Skill", value=skills[0])
+
+    def filter_dataframe(skill):
+        filtered_df = dataframe.sort_values(by=[skill], ascending=False).reset_index(drop=True)
+        filtered_df = hide_skill_columns(filtered_df, exceptions=[skill])
+        filtered_df["Rank"] = range(1, len(filtered_df) + 1)
+        styler = perform_cell_formatting(filtered_df)
+        return gr.Dataframe(
+            value=styler,
+            datatype="markdown",
+            wrap=True,
+            show_fullscreen_button=False,
+            interactive=False,
+            column_widths=[30,50,50,150,60,60,60,80],
+            max_height=420,
+            elem_classes="leaderboard_col_style"
+        )
+
+    leaderboard_by_skill = filter_dataframe(skills[0])
+    skills_dropdown.change(filter_dataframe, inputs=skills_dropdown, outputs=leaderboard_by_skill)
+    return leaderboard_by_skill
+
+
+def init_size_leaderboard(dataframe):
+
+    dataframe = hide_skill_columns(dataframe)
+
+    size_keys = ["Large","Medium","Small","Nano"]
+    size_names = ["Large (More than 30B Parameter)","Medium (~30B)","Small (~10B)","Nano (~3B)"]
+    sizes_dropdown = gr.Dropdown(choices=size_names, label="Select Model Size", value=size_names[0])
+
+    def filter_dataframe(size_name):
+        ##map size name to size key
+        size_name_mapped_to_key = size_keys[size_names.index(size_name)]
+        ##slice array from 0 to index of size
+        size_list = size_keys[size_keys.index(size_name_mapped_to_key):]
+        filtered_df = dataframe[dataframe["Category"].isin(size_list)].reset_index(drop=True)
+        filtered_df["Rank"] = range(1, len(filtered_df) + 1)
+        styler = perform_cell_formatting(filtered_df)
+        return gr.Dataframe(
             value=styler,
             datatype="markdown",
             wrap=True,
@@ -85,6 +149,9 @@ def init_leaderboard(dataframe):
             elem_classes="leaderboard_col_style"
         )
 
+    leaderboard_by_skill = filter_dataframe(size_names[0])
+    sizes_dropdown.change(filter_dataframe, inputs=sizes_dropdown, outputs=leaderboard_by_skill)
+    return leaderboard_by_skill
 
 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -92,13 +159,20 @@ with demo:
     gr.HTML(INTRODUCTION_TEXT, elem_classes="abl_desc_text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 Arabic LLM Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
+        with gr.TabItem("🏅 Leaderboard - Top Models", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+        with gr.TabItem("🏅 Top by Size", elem_id="llm-benchmark-tab-size", id=1):
+            leaderboard = init_size_leaderboard(LEADERBOARD_DF)
+
+        with gr.TabItem("🏅 Top by Skill", elem_id="llm-benchmark-tab-skills", id=2):
+            leaderboard = init_skill_leaderboard(LEADERBOARD_DF)
 
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-about", id=4):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-submit", id=5):
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
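
Both new tabs rely on the same Gradio pattern: the table is rendered once by calling the filter function directly inside the Blocks context, and the dropdown's change event calls the same function again, returning a fresh gr.Dataframe that replaces the rendered one. A minimal, self-contained sketch of that pattern (the skill names and scores below are toy placeholders, not the leaderboard's real schema):

import gradio as gr
import pandas as pd

# Toy data standing in for LEADERBOARD_DF; the real column set differs.
data = pd.DataFrame({
    "Model": ["model-x", "model-y"],
    "Skill A": [71.2, 64.5],
    "Skill B": [58.9, 80.3],
})

def filter_by_skill(skill):
    # Re-rank by the selected skill and return a new Dataframe component;
    # returning a component from the handler replaces the one built at load time.
    ranked = data.sort_values(by=[skill], ascending=False).reset_index(drop=True)
    ranked.insert(0, "Rank", range(1, len(ranked) + 1))
    return gr.Dataframe(value=ranked, interactive=False)

with gr.Blocks() as demo:
    skills_dropdown = gr.Dropdown(choices=["Skill A", "Skill B"], value="Skill A", label="Select Skill")
    table = filter_by_skill("Skill A")  # initial render, same trick as filter_dataframe(skills[0]) above
    skills_dropdown.change(filter_by_skill, inputs=skills_dropdown, outputs=table)

demo.launch()

In the commit itself, filter_dataframe additionally re-applies hide_skill_columns (keeping only the selected skill column) and perform_cell_formatting before returning the component.
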
src/about.py CHANGED
@@ -12,6 +12,39 @@ class EvalDimension:
 class EvalDimensions(Enum):
     d0 = EvalDimension("speed", "Speed (words/sec)")
     d1 = EvalDimension("contamination_score", "Contamination Score")
+    d2 = EvalDimension("paraphrasing", "Paraphrasing")
+    d3 = EvalDimension("sentiment analysis", "Sentiment Analysis")
+    d4 = EvalDimension("coding", "Coding")
+    d5 = EvalDimension("function calling", "Function Calling")
+    d6 = EvalDimension("rag qa", "RAG QA")
+    d7 = EvalDimension("reading comprehension", "Reading Comprehension")
+    d8 = EvalDimension("entity extraction", "Entity Extraction")
+    d9 = EvalDimension("summarization", "Summarization")
+    d10 = EvalDimension("long context", "Long Context")
+    d11 = EvalDimension("mmlu", "MMLU")
+    d12 = EvalDimension("arabic language & grammar", "Arabic Language & Grammar")
+    d13 = EvalDimension("general knowledge", "General Knowledge")
+    d14 = EvalDimension("translation (incl dialects)", "Translation (incl Dialects)")
+    d15 = EvalDimension("trust & safety","Trust & Safety")
+    d16 = EvalDimension("writing (incl dialects)", "Writing (incl Dialects)")
+    d17 = EvalDimension("dialect detection", "Dialect Detection")
+    d18 = EvalDimension("reasoning & math", "Reasoning & Math")
+    d19 = EvalDimension("diacritization", "Diacritization")
+    d20 = EvalDimension("instruction following", "Instruction Following")
+    d21 = EvalDimension("transliteration", "Transliteration")
+    d22 = EvalDimension("structuring", "Structuring")
+    d23 = EvalDimension("hallucination", "Hallucination")
+
+
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -23,10 +56,10 @@ TITLE = """<div ><img class='abl_header_image' src='https://huggingface.co/space
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-<h1 style='width: 100%;text-align: center;' id="space-title">Arabic Board Leaderboard (ABL) - The first comprehensive Leaderboard for Arabic LLMs</h1>
-ABL is the official Leaderboard of <a href='https://huggingface.co/datasets/silma-ai/arabic-broad-benchmark' target='_blank'>Arabic Board Benchmark (ABB)</a>.
+<h1 style='width: 100%;text-align: center;' id="space-title">Arabic Broad Leaderboard (ABL) - The first comprehensive Leaderboard for Arabic LLMs</h1>
+ABL is the official Leaderboard of <a href='https://huggingface.co/datasets/silma-ai/arabic-broad-benchmark' target='_blank'>Arabic Broad Benchmark (ABB)</a>.
 With advanced features and innovative visualizations, we provide the community with a comprehensive view of the capabilities of Arabic models, showcasing their speed, diverse skills while also defending against benchmarking contamination.
-The benchmark consists of <b>450</b> high quality questions sampled from <b>63</b> Arabic benchmarking datasets, evaluating <b>22 categories and skills</b>.
+The benchmark consists of <b>450 high quality human-validated questions</b> sampled from <b>63 Arabic benchmarking datasets</b>, evaluating <b>22 categories and skills</b>.
 Find more details in the about Tab.
 
src/display/utils.py CHANGED
@@ -30,20 +30,23 @@ auto_eval_column_dict.append(["model_category", ColumnContent, ColumnContent("Ca
 
 
 #auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)])
 #Scores
 auto_eval_column_dict.append(["average_score", ColumnContent, ColumnContent("Benchmark Score", "number", True)])
 for eval_dim in EvalDimensions:
-    auto_eval_column_dict.append([eval_dim.name, ColumnContent, ColumnContent(eval_dim.value.col_name, "number", True)])
+    if eval_dim.value.metric in ["speed", "contamination_score"]:
+        auto_eval_column_dict.append([eval_dim.name, ColumnContent, ColumnContent(eval_dim.value.col_name, "number", True)])
+    else:
+        auto_eval_column_dict.append([eval_dim.name, ColumnContent, ColumnContent(eval_dim.value.col_name, "number", False)])
 # Model information
 
 #auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 #auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 #auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
 #auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Popularity (Likes)", "number", False)])
+#auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("License", "str", False)])
+#auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+#auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Popularity (Likes)", "number", False)])
 #auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 #auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
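
The net effect of this change: the skill columns still exist in the dataframe, but only the identity columns plus Benchmark Score, Speed and Contamination Score remain visible by default, and the "Top by Skill" tab re-surfaces exactly one skill column through the exceptions argument of hide_skill_columns. A small sketch of that selection rule (the ColumnContent fields below are an assumption trimmed to what matters here, not the project's exact dataclass):

from dataclasses import dataclass

@dataclass
class ColumnContent:  # assumed minimal shape; the real class carries more fields
    name: str
    type: str
    displayed_by_default: bool
    never_hidden: bool = False

columns = [
    ColumnContent("Model Name", "markdown", True, never_hidden=True),
    ColumnContent("Benchmark Score", "number", True),
    ColumnContent("Speed (words/sec)", "number", True),
    ColumnContent("Contamination Score", "number", True),
    ColumnContent("Coding", "number", False),          # skill columns are now hidden by default
    ColumnContent("Summarization", "number", False),
]

def visible_columns(cols, exceptions=[]):
    # Same selection rule as hide_skill_columns() in app.py.
    return [c.name for c in cols if c.displayed_by_default or c.name in exceptions]

print(visible_columns(columns))                          # main leaderboard view
print(visible_columns(columns, exceptions=["Coding"]))   # "Top by Skill" view for Coding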
 
src/leaderboard/read_evals.py CHANGED
@@ -28,9 +28,9 @@ class EvalResult:
     model_category: str = "" #Nano, Small, Medium, Large
     #weight_type: WeightType = WeightType.Original # Original or Adapter
     #architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
+    #license: str = "?"
+    #likes: int = 0
+    #num_params: int = 0
     date: str = "" # submission date of request file
     still_on_hub: bool = False
 
@@ -81,6 +81,16 @@
         results["speed"] = results_obj.get("speed")
         results["contamination_score"] = results_obj.get("contamination_score")
 
+        scores_by_category = results_obj.get("scores_by_category")
+
+        for category_obj in scores_by_category:
+            category = category_obj["category"]
+            average_score = category_obj["average_score"]
+            results[category.lower()] = average_score
+
+
         return self(
             eval_name=result_key,
             full_model=full_model,
@@ -88,9 +98,9 @@
             model=model,
             model_source=config.get("model_source", ""),
             model_category=config.get("model_category", ""),
-            num_params=config.get("params", 0),
-            license=config.get("license", "?"),
-            likes=config.get("likes", -1),
+            #num_params=config.get("params", 0),
+            #license=config.get("license", "?"),
+            #likes=config.get("likes", -1),
             results=results,
             #precision=precision,
             #revision= config.get("model_sha", ""),
@@ -128,15 +138,19 @@
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             #AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average_score.name: average_score,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
+            #AutoEvalColumn.license.name: self.license,
+            #AutoEvalColumn.likes.name: self.likes,
+            #AutoEvalColumn.params.name: self.num_params,
             #AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
 
         for eval_dim in EvalDimensions:
             dimension_name = eval_dim.value.col_name
-            dimension_value = self.results[eval_dim.value.metric]
+            try:
+                dimension_value = self.results[eval_dim.value.metric]
+            except KeyError:
+                dimension_value = 0
+
             if dimension_name == "Contamination Score":
                 dimension_value = 0 if dimension_value < 0 else round(dimension_value,2)
156
 
src/populate.py CHANGED
@@ -24,6 +24,8 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
 
     df.insert(0, "Rank", range(1, len(df) + 1))
     df = df[cols].round(decimals=2)
+    print("###############\n\n\n\n\n\n###############")
+
     print(df)
 