Commit f07d235 · 1 Parent(s): 42d6492 · karimouda committed

Fixing source/category

app.py CHANGED
@@ -69,11 +69,13 @@ def init_leaderboard(dataframe):
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        search_columns=[AutoEvalColumn.model.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+
+            ColumnFilter(AutoEvalColumn.model_source.name, type="checkboxgroup", label="Model Source"),
+            ColumnFilter(AutoEvalColumn.model_category.name, type="checkboxgroup", label="Model Category"),
+            #ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
             ColumnFilter(
                 AutoEvalColumn.params.name,
                 type="slider",
@@ -81,12 +83,12 @@ def init_leaderboard(dataframe):
                 max=150,
                 label="Select the number of parameters (B)",
             ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
+            #ColumnFilter(
+            #    AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+            #),
         ],
         bool_checkboxgroup_label="Hide models",
-        interactive=False,
+        interactive=True,
     )
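The two new checkbox filters key off the "Source" and "Category" column labels that src/display/utils.py now defines (AutoEvalColumn.model_source.name and AutoEvalColumn.model_category.name resolve to those labels). A minimal sketch, not part of the commit, of the dataframe shape those filters expect, using the values from the result files further down:

import pandas as pd

# Hypothetical leaderboard dataframe; column labels mirror src/display/utils.py,
# values mirror the two result files in this commit.
df = pd.DataFrame(
    {
        "Model": ["Qwen/Qwen2.5-0.5B-Instruct", "openai-community/gpt2"],
        "Source": ["Hugging Face", "Hugging Face"],   # drives the "Model Source" checkboxgroup
        "Category": ["Nano", "Nano"],                 # drives the "Model Category" checkboxgroup
        "Average ⬆️": [6.0, 1.0],
    }
)
print(df[df["Category"] == "Nano"])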
 
results/Qwen/Qwen2.5-0.5B-Instruct_results_2025-04-21 16:50:28.595317.json CHANGED
@@ -1,14 +1,8 @@
 {
     "results": {
-        "average": {
-            "score": 3.2
-        },
-        "speed": {
-            "spq": 0.18969469807280515
-        },
-        "contamination": {
-            "score": 0
-        },
+        "average_score": 6.0,
+        "speed": 5,
+        "contamination_score": 0,
         "execution_time": 88.587424,
         "errors": [],
         "scores_by_category": [
@@ -34,6 +28,8 @@
     "config": {
         "model": "Qwen/Qwen2.5-0.5B-Instruct",
         "model_sha": "7ae557604adf67be50417f59c2c2f167def9a775",
+        "model_source": "Hugging Face",
+        "model_category": "Nano",
         "submitted_time": "2025-04-21T14:43:01Z",
         "likes": 310,
         "params": 0.494,
results/openai-community/gpt2_results_2025-04-21 16:59:47.547731.json CHANGED
@@ -1,14 +1,8 @@
 {
     "results": {
-        "average": {
-            "score": 1.0
-        },
-        "speed": {
-            "spq": 1.1064065631691649
-        },
-        "contamination": {
-            "score": 0
-        },
+        "average_score": 1.0,
+        "speed": 1.1064065631691649,
+        "contamination_score": 0,
         "execution_time": 516.691865,
         "errors": [],
         "scores_by_category": [
@@ -34,6 +28,8 @@
     "config": {
         "model": "openai-community/gpt2",
        "model_sha": "607a30d783dfa663caf39e06633721c8d4cfcd7e",
+        "model_source": "Hugging Face",
+        "model_category": "Nano",
         "submitted_time": "2025-04-21T14:50:23Z",
         "likes": 2679,
         "params": 0.137,
src/about.py CHANGED
@@ -2,18 +2,16 @@ from dataclasses import dataclass
 from enum import Enum

 @dataclass
-class Task:
-    benchmark: str
+class EvalDimension:
     metric: str
     col_name: str


 # Select your tasks here
 # ---------------------------------------------------
-class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("speed", "spq", "Speed")
-    task1 = Task("contamination", "score", "Contamination")
+class EvalDimensions(Enum):
+    d0 = EvalDimension("speed", "Speed")
+    d1 = EvalDimension("contamination_score", "Contamination Score")

 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
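Read together, the refactored block reads roughly as follows (a sketch reconstructed from the hunk above, not the full module): metric is the key looked up in a result file's "results" dict and col_name is the label shown on the leaderboard.

from dataclasses import dataclass
from enum import Enum

@dataclass
class EvalDimension:
    metric: str    # key in the result file, e.g. "contamination_score"
    col_name: str  # column label shown on the leaderboard

class EvalDimensions(Enum):
    d0 = EvalDimension("speed", "Speed")
    d1 = EvalDimension("contamination_score", "Contamination Score")

for dim in EvalDimensions:
    print(dim.name, dim.value.metric, dim.value.col_name)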
src/display/utils.py CHANGED
@@ -3,7 +3,7 @@ from enum import Enum

 import pandas as pd

-from src.about import Tasks
+from src.about import EvalDimensions

 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -23,14 +23,19 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["model_source", ColumnContent, ColumnContent("Source", "str", True, False)])
+auto_eval_column_dict.append(["model_category", ColumnContent, ColumnContent("Category", "str", True, False)])
+
+
+#auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+for eval_dim in EvalDimensions:
+    auto_eval_column_dict.append([eval_dim.name, ColumnContent, ColumnContent(eval_dim.value.col_name, "number", True)])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+
+#auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 #auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 #auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
 #auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
@@ -40,6 +45,7 @@ auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️"
 #auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 #auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])

+
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

@@ -108,5 +114,5 @@ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+BENCHMARK_COLS = [t.value.col_name for t in EvalDimensions]
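A small sketch of what the loop above contributes once EvalDimensions is imported: one numeric column per dimension, and BENCHMARK_COLS built from the same labels (assumes it is run from the repo root so src.about resolves).

from src.about import EvalDimensions

# One leaderboard column per evaluation dimension, keyed by the enum member name.
dim_columns = {d.name: d.value.col_name for d in EvalDimensions}
print(dim_columns)  # {'d0': 'Speed', 'd1': 'Contamination Score'}

BENCHMARK_COLS = [d.value.col_name for d in EvalDimensions]
print(BENCHMARK_COLS)  # ['Speed', 'Contamination Score']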
 
src/leaderboard/read_evals.py CHANGED
@@ -8,7 +8,7 @@ import dateutil
 import numpy as np

 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, Tasks#, ModelType, Precision, WeightType
+from src.display.utils import AutoEvalColumn, EvalDimensions#, ModelType, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub


@@ -24,7 +24,8 @@ class EvalResult:
     results: dict
     #precision: Precision = Precision.Unknown
     #model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    model_source: str = "" # HF, private, ...
+    model_source: str = "" # HF, API, ...
+    model_category: str = "" #Nano, Small, Medium, Large
     #weight_type: WeightType = WeightType.Original # Original or Adapter
     #architecture: str = "Unknown"
     license: str = "?"
@@ -86,16 +87,17 @@ class EvalResult:
             org=org,
             model=model,
             model_source=config.get("model_source", ""),
+            model_category=config.get("model_category", ""),
             results=results,
             #precision=precision,
-            revision= config.get("model_sha", ""),
+            #revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             #architecture=architecture
         )

     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
+        request_file = get_request_file_for_model(requests_path, self.full_model) #, self.precision.value.name
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
@@ -107,34 +109,35 @@
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
+            print(f"Could not find request file for {self.org}/{self.model}") # with precision {self.precision.value.name}

     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = self.results["average"]
+        average_score = self.results["average_score"]
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             #AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_source.name: self.model_source.value.name,
+            AutoEvalColumn.model_source.name: self.model_source,
+            AutoEvalColumn.model_category.name: self.model_category,
             #AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
             #AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             #AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             #AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
+            AutoEvalColumn.average.name: average_score,
             AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            #AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }

-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+        for eval_dim in EvalDimensions:
+            data_dict[eval_dim.value.col_name] = self.results[eval_dim.value.metric]

         return data_dict


-def get_request_file_for_model(requests_path, model_name, precision):
+def get_request_file_for_model(requests_path, model_name): #,precision
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
     request_files = os.path.join(
         requests_path,
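As a closing sketch, this is how the EvalDimensions loop in to_dict() maps a flat results dict onto leaderboard columns, using the Qwen values from the result file earlier in this commit (assumes it is run from the repo root so src.about resolves):

from src.about import EvalDimensions

results = {"average_score": 6.0, "speed": 5, "contamination_score": 0}

row = {"Average ⬆️": results["average_score"]}
for eval_dim in EvalDimensions:
    row[eval_dim.value.col_name] = results[eval_dim.value.metric]

print(row)  # {'Average ⬆️': 6.0, 'Speed': 5, 'Contamination Score': 0}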