rzanoli committed on
Commit 6b09246 · 1 Parent(s): 56e849d

Add theoretical performance of a model that scores the highest on every individual task
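
The value added by this commit is simply the mean of the per-task maxima across all leaderboard rows (see mean_of_max_per_field in the app.py diff below). A minimal sketch with made-up scores for two of the ten task columns, just to illustrate the arithmetic:

import pandas as pd

# Hypothetical toy scores (illustration only; the real leaderboard uses ten task columns).
toy = pd.DataFrame({"TE": [70.0, 80.0],   # best TE score: 80.0
                    "SA": [60.0, 55.0]})  # best SA score: 60.0

theoretical_max = toy[["TE", "SA"]].max().mean()  # (80.0 + 60.0) / 2 = 70.0
print(f"{theoretical_max:.2f}")                   # -> 70.00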

Files changed (3)
  1. app.py +95 -11
  2. src/display/utils.py +1 -0
  3. src/leaderboard/read_evals.py +4 -1
app.py CHANGED

@@ -17,6 +17,32 @@ import plotly.express as px
 import plotly.graph_objects as go
 
 
+def mean_of_max_per_field(df):
+    """
+    Compute the maximum for each field and then the mean of those maxima.
+
+    Args:
+        df (pd.DataFrame): DataFrame with columns TE, SA, HS, AT, WIC, FAQ, LS, SU, NER, REL
+
+    Returns:
+        float: mean of the per-field maximum values
+    """
+    fields = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
+
+    # Check that all the required columns exist in the DataFrame
+    missing = [f for f in fields if f not in df.columns]
+    if missing:
+        raise ValueError(f"The following columns are missing from the DataFrame: {missing}")
+
+    # Compute the maximum for each field
+    max_values = df[fields].max()
+
+    # Compute the mean of the maxima
+    mean_max = max_values.mean()
+
+    return mean_max
+
+
 def line_chart(dataframe):
     # Split the data based on IS_FS
     df_true = dataframe[dataframe['IS_FS'] == True]
@@ -44,7 +70,7 @@ def line_chart(dataframe):
         x=x_true,
         y=y_true,
         mode='markers',  # markers only, no text
-        name='5-Few-Shot',
+        name='5-Shot',
         marker=dict(color='red', size=10),
         hovertemplate='<b>%{customdata}</b><br>#Params: %{x}<br>Performance: %{y}<extra></extra>',
         customdata=labels_true  # all the information shown on hover
@@ -78,6 +104,8 @@ def line_chart(dataframe):
 
 
 
+
+
 # Define task metadata (icons, names, descriptions)
 TASK_METADATA_MULTIPLECHOICE = {
     "TE": {"icon": "📊", "name": "Textual Entailment", "tooltip": ""},
@@ -109,6 +137,8 @@ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
 
+    #print("????????????????????????????????", mean_of_max_per_field(dataframe))
+
     sorted_dataframe = dataframe.sort_values(by="Avg. Comb. Perf. ⬆️", ascending=False)
 
     sorted_dataframe = sorted_dataframe.reset_index(drop=True)
@@ -168,10 +198,10 @@ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
+            ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS)"),
             #ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)",
             #             default=[["0️⃣", "0️⃣"]]),
-            # ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
+            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=100, default=[0, 100], label="Select the number of parameters (B)"),
         ],
         #filter_columns=[
         #    ColumnFilter("IS_FS", type="checkbox", default=False, label="5-Few-Shot")
@@ -195,13 +225,46 @@ def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=None):
     sorted_dataframe = sorted_dataframe.reset_index(drop=True)
     sorted_dataframe["rank"] = sorted_dataframe.index + 1
 
-    # add the crown next to the model name if the rank is 1
-    sorted_dataframe["Model"] = sorted_dataframe.apply(
-        lambda row: f"{row['Model']} 🥇" if row["rank"] == 1 else
-                    (f"{row['Model']} 🥈" if row["rank"] == 2 else
-                     (f"{row['Model']} 🥉" if row["rank"] == 3 else row["Model"])),
-        axis=1
-    )
+    # Flags tracking whether the medal has already been awarded for each size category and evaluation type
+    large_medal_fs_assigned = False
+    medium_medal_fs_assigned = False
+    small_medal_fs_assigned = False
+
+    large_medal_0shot_assigned = False
+    medium_medal_0shot_assigned = False
+    small_medal_0shot_assigned = False
+
+    # Temporary list holding the new values of the Model column
+    new_model_column = []
+
+    for _, row in sorted_dataframe.iterrows():
+        if row['IS_FS']:  # 5-Few-Shot
+            if row["#Params (B)"] > 30 and not large_medal_fs_assigned:
+                new_model_column.append(f"{row['Model']} 7️⃣0️⃣🅱️🏆")
+                large_medal_fs_assigned = True
+            elif 10 < row["#Params (B)"] <= 30 and not medium_medal_fs_assigned:
+                new_model_column.append(f"{row['Model']} 3️⃣0️⃣🅱️🏆")
+                medium_medal_fs_assigned = True
+            elif row["#Params (B)"] <= 10 and not small_medal_fs_assigned:
+                new_model_column.append(f"{row['Model']} 1️⃣0️⃣🅱️🏆")
+                small_medal_fs_assigned = True
+            else:
+                new_model_column.append(row["Model"])
+        else:  # 0-Shot
+            if row["#Params (B)"] > 30 and not large_medal_0shot_assigned:
+                new_model_column.append(f"{row['Model']} 7️⃣0️⃣🅱️🎖️")
+                large_medal_0shot_assigned = True
+            elif 10 < row["#Params (B)"] <= 30 and not medium_medal_0shot_assigned:
+                new_model_column.append(f"{row['Model']} 3️⃣0️⃣🅱️🎖️")
+                medium_medal_0shot_assigned = True
+            elif row["#Params (B)"] <= 10 and not small_medal_0shot_assigned:
+                new_model_column.append(f"{row['Model']} 1️⃣0️⃣🅱️🎖️")
+                small_medal_0shot_assigned = True
+            else:
+                new_model_column.append(row["Model"])
+
+    # Update the Model column
+    sorted_dataframe["Model"] = new_model_column
 
     pd.set_option('display.max_colwidth', None)
     #print("========================", dataframe['Model'])
@@ -222,7 +285,9 @@ def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=None):
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
+            ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS)"),
+            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=100, default=[0, 100],
+                         label="Select the number of parameters (B)"),
         ],
         bool_checkboxgroup_label="Evaluation Mode",
         interactive=False
@@ -273,6 +338,8 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
 finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 #print(LEADERBOARD_DF.columns.tolist())
 
+theoretical_max_combined_perf = mean_of_max_per_field(LEADERBOARD_DF)
+
 # Prepare the main interface
 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -306,6 +373,22 @@ with demo:
                 hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['rank', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
             )
 
+            gr.HTML(
+                f"""
+                <div style="
+                    border: 2px solid #1f77b4;
+                    border-radius: 10px;
+                    padding: 10px;
+                    background-color: #f0f8ff;
+                    font-weight: bold;
+                    font-size: 14px;
+                    display: inline-block;
+                ">
+                Theoretical performance of a model that scores the highest on every individual task: <span style="color:#d62728; font-size:18px;">{theoretical_max_combined_perf:.2f}</span>
+                </div>
+                """
+            )
+
         with gr.TabItem("📈 Charts"):
             #gr.Plot(value=line_chart(LEADERBOARD_DF), label="Andamento di esempio")
             #gr.Plot(value=line_chart_interactive_test(), label="Andamento interattivo")
@@ -319,6 +402,7 @@ with demo:
         with gr.TabItem("║", interactive=False):
            gr.Markdown("", elem_classes="markdown-text")
 
+
        # Task-specific leaderboards
        for task, metadata in TASK_METADATA_MULTIPLECHOICE.items():
src/display/utils.py CHANGED

@@ -48,6 +48,7 @@ auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B
 auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
 auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+#auto_eval_column_dict.append(["submitted_time", ColumnContent, ColumnContent("Submitted time", "date", False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
src/leaderboard/read_evals.py CHANGED

@@ -7,6 +7,7 @@ from dataclasses import dataclass, field
 import dateutil
 import numpy as np
 from typing import Dict, Union
+from datetime import datetime
 
 #from get_model_info import num_params
 from src.display.formatting import make_clickable_model
@@ -23,6 +24,7 @@ class EvalResult:
     org: str
     model: str
     revision: str  # commit hash, "" if main
+    #submitted_time: datetime
     results: Dict[str, Union[float, int]]  # float or int
     average_CPS: float
     is_5fewshot: bool
@@ -119,7 +121,8 @@ class EvalResult:
             still_on_hub=still_on_hub,
             architecture=architecture,
             num_params=num_params,
-            rank = 0
+            rank = 0,
+            #submitted_time=config.get("submitted_time", ""),
         )
 
     '''