Top size + Top by Skill
- app.py +87 -13
- src/about.py +36 -3
- src/display/utils.py +8 -5
- src/leaderboard/read_evals.py +24 -10
- src/populate.py +2 -0
app.py
CHANGED
@@ -59,22 +59,86 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
+def hide_skill_columns(dataframe, exceptions=[]):
+    return dataframe[[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default or c.name in exceptions]]
+
+
+def perform_cell_formatting(dataframe):
+    return dataframe.style.format({'Contamination Score': "{:.2f}",'Benchmark Score': "{:.2f}",'Speed (words/sec)': "{:.2f}"}).apply(
+        lambda rows: [
+            "background-color: red;" if (value >0) else "background-color: green;" for value in rows
+        ],
+        subset=["Contamination Score"],
+    )
+
 def init_leaderboard(dataframe):
+
+    dataframe = hide_skill_columns(dataframe)
+
 
-        lambda rows: [
-            "background-color: red;" if (value >0) else "background-color: green;" for value in rows
-        ],
-        subset=["Contamination Score"],
+    styler = perform_cell_formatting(dataframe)
 
+    return gr.Dataframe(
+        value=styler,
+        datatype="markdown",
+        wrap=True,
+        show_fullscreen_button=False,
+        interactive=False,
+        column_widths=[30,50,50,150,60,60,60],
+        max_height=420,
+        elem_classes="leaderboard_col_style"
     )
 
 
+def init_skill_leaderboard(dataframe):
+
+    ## create selector for model skills, based on the selector filter the dataframe
+    skills = ['MMLU', 'General Knowledge', 'Reasoning & Math', 'Translation (incl Dialects)', 'Trust & Safety', 'Writing (incl Dialects)', 'RAG QA', 'Reading Comprehension', 'Arabic Language & Grammar', 'Diacritization', 'Dialect Detection', 'Sentiment Analysis', 'Summarization', 'Instruction Following', 'Transliteration', 'Paraphrasing', 'Entity Extraction', 'Long Context', 'Coding', 'Hallucination', 'Function Calling', 'Structuring']
+
+    skills_dropdown = gr.Dropdown(choices=skills, label="Select Skill", value=skills[0])
+
+    def filter_dataframe(skill):
+        filtered_df = dataframe.sort_values(by=[skill], ascending=False).reset_index(drop=True)
+        filtered_df = hide_skill_columns(filtered_df, exceptions=[skill])
+        filtered_df["Rank"] = range(1, len(filtered_df) + 1)
+        styler = perform_cell_formatting(filtered_df)
+        return gr.Dataframe(
+            value=styler,
+            datatype="markdown",
+            wrap=True,
+            show_fullscreen_button=False,
+            interactive=False,
+            column_widths=[30,50,50,150,60,60,60,80],
+            max_height=420,
+            elem_classes="leaderboard_col_style"
+        )
+
+    leaderboard_by_skill = filter_dataframe(skills[0])
+    skills_dropdown.change(filter_dataframe, inputs=skills_dropdown, outputs=leaderboard_by_skill)
+    return leaderboard_by_skill
+
+
+def init_size_leaderboard(dataframe):
+
+    dataframe = hide_skill_columns(dataframe)
+
+    size_keys = ["Large","Medium","Small","Nano"]
+    size_names = ["Large (More than 30B Parameter)","Medium (~30B)","Small (~10B)","Nano (~3B)"]
+    sizes_dropdown = gr.Dropdown(choices=size_names, label="Select Model Size", value=size_names[0])
+
+    def filter_dataframe(size_name):
+        ## map size name to size key
+        size_name_mapped_to_key = size_keys[size_names.index(size_name)]
+        ## slice the size keys from the selected size down to the smallest size
+        size_list = size_keys[size_keys.index(size_name_mapped_to_key):]
+        filtered_df = dataframe[dataframe["Category"].isin(size_list)].reset_index(drop=True)
+        filtered_df["Rank"] = range(1, len(filtered_df) + 1)
+        styler = perform_cell_formatting(filtered_df)
+        return gr.Dataframe(
             value=styler,
             datatype="markdown",
             wrap=True,
@@ -85,6 +149,9 @@ def init_leaderboard(dataframe):
             elem_classes="leaderboard_col_style"
         )
 
+    leaderboard_by_skill = filter_dataframe(size_names[0])
+    sizes_dropdown.change(filter_dataframe, inputs=sizes_dropdown, outputs=leaderboard_by_skill)
+    return leaderboard_by_skill
 
 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -92,13 +159,20 @@ with demo:
     gr.HTML(INTRODUCTION_TEXT, elem_classes="abl_desc_text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅
+        with gr.TabItem("🏅 Leaderboard - Top Models", elem_id="llm-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+        with gr.TabItem("🏅 Top by Size", elem_id="llm-benchmark-tab-size", id=1):
+            leaderboard = init_size_leaderboard(LEADERBOARD_DF)
+
+        with gr.TabItem("🏅 Top by Skill", elem_id="llm-benchmark-tab-skills", id=2):
+            leaderboard = init_skill_leaderboard(LEADERBOARD_DF)
+
 
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-about", id=4):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-submit", id=5):
         with gr.Column():
             with gr.Row():
                 gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
src/about.py
CHANGED
@@ -12,6 +12,39 @@ class EvalDimension:
 class EvalDimensions(Enum):
     d0 = EvalDimension("speed", "Speed (words/sec)")
     d1 = EvalDimension("contamination_score", "Contamination Score")
+    d2 = EvalDimension("paraphrasing", "Paraphrasing")
+    d3 = EvalDimension("sentiment analysis", "Sentiment Analysis")
+    d4 = EvalDimension("coding", "Coding")
+    d5 = EvalDimension("function calling", "Function Calling")
+    d6 = EvalDimension("rag qa", "RAG QA")
+    d7 = EvalDimension("reading comprehension", "Reading Comprehension")
+    d8 = EvalDimension("entity extraction", "Entity Extraction")
+    d9 = EvalDimension("summarization", "Summarization")
+    d10 = EvalDimension("long context", "Long Context")
+    d11 = EvalDimension("mmlu", "MMLU")
+    d12 = EvalDimension("arabic language & grammar", "Arabic Language & Grammar")
+    d13 = EvalDimension("general knowledge", "General Knowledge")
+    d14 = EvalDimension("translation (incl dialects)", "Translation (incl Dialects)")
+    d15 = EvalDimension("trust & safety","Trust & Safety")
+    d16 = EvalDimension("writing (incl dialects)", "Writing (incl Dialects)")
+    d17 = EvalDimension("dialect detection", "Dialect Detection")
+    d18 = EvalDimension("reasoning & math", "Reasoning & Math")
+    d19 = EvalDimension("diacritization", "Diacritization")
+    d20 = EvalDimension("instruction following", "Instruction Following")
+    d21 = EvalDimension("transliteration", "Transliteration")
+    d22 = EvalDimension("structuring", "Structuring")
+    d23 = EvalDimension("hallucination", "Hallucination")
+
+
+
+
+
+
+
+
+
+
+
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -23,10 +56,10 @@ TITLE = """<div ><img class='abl_header_image' src='https://huggingface.co/space
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-<h1 style='width: 100%;text-align: center;' id="space-title">Arabic
-ABL is the official Leaderboard of <a href='https://huggingface.co/datasets/silma-ai/arabic-broad-benchmark' target='_blank'>Arabic
+<h1 style='width: 100%;text-align: center;' id="space-title">Arabic Broad Leaderboard (ABL) - The first comprehensive Leaderboard for Arabic LLMs</h1>
+ABL is the official Leaderboard of <a href='https://huggingface.co/datasets/silma-ai/arabic-broad-benchmark' target='_blank'>Arabic Broad Benchmark (ABB)</a>.
 With advanced features and innovative visualizations, we provide the community with a comprehensive view of the capabilities of Arabic models, showcasing their speed, diverse skills while also defending against benchmarking contamination.
-The benchmark consists of <b>450
+The benchmark consists of <b>450 high quality human-validated questions</b> sampled from <b>63 Arabic benchmarking datasets</b>, evaluating <b>22 categories and skills</b>.
 Find more details in the about Tab.
 
 
src/display/utils.py
CHANGED
@@ -30,20 +30,23 @@ auto_eval_column_dict.append(["model_category", ColumnContent, ColumnContent("Ca
 
 
 #auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)])
 #Scores
 auto_eval_column_dict.append(["average_score", ColumnContent, ColumnContent("Benchmark Score", "number", True)])
 for eval_dim in EvalDimensions:
+    if eval_dim.value.metric in ["speed", "contamination_score"]:
+        auto_eval_column_dict.append([eval_dim.name, ColumnContent, ColumnContent(eval_dim.value.col_name, "number", True)])
+    else:
+        auto_eval_column_dict.append([eval_dim.name, ColumnContent, ColumnContent(eval_dim.value.col_name, "number", False)])
 # Model information
 
 #auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 #auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 #auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
 #auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Popularity (Likes)", "number", False)])
+#auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("License", "str", False)])
+#auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+#auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Popularity (Likes)", "number", False)])
 #auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 #auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
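
Note on the utils.py changes: every EvalDimensions entry now becomes a numeric column, but only the "speed" and "contamination_score" dimensions get displayed_by_default=True; the per-skill columns are created hidden and only surface through hide_skill_columns(..., exceptions=[skill]) in app.py. A simplified illustration of that interaction, using a plain dataclass rather than the repo's ColumnContent/AutoEvalColumn machinery (the column names below are examples):

    from dataclasses import dataclass

    @dataclass
    class Column:
        name: str
        displayed_by_default: bool

    columns = [
        Column("Rank", True),
        Column("Model Name", True),
        Column("Benchmark Score", True),
        Column("Speed (words/sec)", True),
        Column("Contamination Score", True),
        Column("MMLU", False),    # skill columns are hidden by default
        Column("Coding", False),
    ]

    def visible_columns(cols, exceptions=()):
        # keep default columns plus any explicitly requested skill column
        return [c.name for c in cols if c.displayed_by_default or c.name in exceptions]

    print(visible_columns(columns))                       # main leaderboard view
    print(visible_columns(columns, exceptions=["MMLU"]))  # "Top by Skill" view for MMLU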
src/leaderboard/read_evals.py
CHANGED
@@ -28,9 +28,9 @@ class EvalResult:
     model_category: str = "" #Nano, Small, Medium, Large
     #weight_type: WeightType = WeightType.Original # Original or Adapter
     #architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
+    #license: str = "?"
+    #likes: int = 0
+    #num_params: int = 0
     date: str = "" # submission date of request file
     still_on_hub: bool = False
 
@@ -81,6 +81,16 @@
         results["speed"] = results_obj.get("speed")
         results["contamination_score"] = results_obj.get("contamination_score")
 
+        scores_by_category = results_obj.get("scores_by_category")
+
+        for category_obj in scores_by_category:
+            category = category_obj["category"]
+            average_score = category_obj["average_score"]
+            results[category.lower()] = average_score
+
+
+
+
         return self(
             eval_name=result_key,
             full_model=full_model,
@@ -88,9 +98,9 @@
             model=model,
             model_source=config.get("model_source", ""),
             model_category=config.get("model_category", ""),
-            num_params=config.get("params", 0),
-            license=config.get("license", "?"),
-            likes=config.get("likes", -1),
+            #num_params=config.get("params", 0),
+            #license=config.get("license", "?"),
+            #likes=config.get("likes", -1),
             results=results,
             #precision=precision,
             #revision= config.get("model_sha", ""),
@@ -128,15 +138,19 @@
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             #AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average_score.name: average_score,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
+            #AutoEvalColumn.license.name: self.license,
+            #AutoEvalColumn.likes.name: self.likes,
+            #AutoEvalColumn.params.name: self.num_params,
             #AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
 
         for eval_dim in EvalDimensions:
             dimension_name = eval_dim.value.col_name
+            try:
+                dimension_value = self.results[eval_dim.value.metric]
+            except KeyError:
+                dimension_value = 0
+
             if dimension_name == "Contamination Score":
                 dimension_value = 0 if dimension_value < 0 else round(dimension_value,2)
 
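
Note on the read_evals.py changes: the parser now flattens the per-category scores into the results dict, keyed by the lowercased category name so that the keys line up with the EvalDimension metric strings added in src/about.py (e.g. "mmlu", "rag qa"); the try/except KeyError in to_dict then defaults any dimension missing from a result file to 0. A hypothetical example of the results JSON shape this assumes (field names are taken from the diff; the exact file layout and values are assumptions):

    # Hypothetical example of one model's results object
    results_obj = {
        "speed": 42.7,
        "contamination_score": 0.0,
        "scores_by_category": [
            {"category": "MMLU", "average_score": 61.3},
            {"category": "Coding", "average_score": 48.0},
        ],
    }

    results = {
        "speed": results_obj.get("speed"),
        "contamination_score": results_obj.get("contamination_score"),
    }
    # Flatten per-category scores; lowercasing makes the keys match the
    # EvalDimension metric names defined in src/about.py
    for category_obj in results_obj.get("scores_by_category"):
        results[category_obj["category"].lower()] = category_obj["average_score"]

    print(results)  # {'speed': 42.7, 'contamination_score': 0.0, 'mmlu': 61.3, 'coding': 48.0}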
src/populate.py
CHANGED
@@ -24,6 +24,8 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
 
     df.insert(0, "Rank", range(1, len(df) + 1))
     df = df[cols].round(decimals=2)
+    print("###############\n\n\n\n\n\n###############")
+
     print(df)
 
 