Fixing source/category
app.py
CHANGED
@@ -69,11 +69,13 @@ def init_leaderboard(dataframe):
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.model.name…
+        search_columns=[AutoEvalColumn.model.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
-            …
-            ColumnFilter(AutoEvalColumn.…
+
+            ColumnFilter(AutoEvalColumn.model_source.name, type="checkboxgroup", label="Model Source"),
+            ColumnFilter(AutoEvalColumn.model_category.name, type="checkboxgroup", label="Model Category"),
+            #ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
             ColumnFilter(
                 AutoEvalColumn.params.name,
                 type="slider",
@@ -81,12 +83,12 @@ def init_leaderboard(dataframe):
                 max=150,
                 label="Select the number of parameters (B)",
             ),
-            ColumnFilter(
-                …
-            ),
+            #ColumnFilter(
+            #    AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+            #),
         ],
         bool_checkboxgroup_label="Hide models",
-        interactive=…
+        interactive=True,
     )
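For context, a sketch of what init_leaderboard looks like once this change is applied. The arguments not touched by the diff (value, datatype, select_columns, and the slider's min value) are assumed to match the standard leaderboard template and are marked as such:

from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns

from src.display.utils import AutoEvalColumn, fields


def init_leaderboard(dataframe):
    # Source / Category become checkbox-group filters; the params slider is kept,
    # and the precision / still_on_hub filters are commented out by this commit.
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.model.name],
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.model_source.name, type="checkboxgroup", label="Model Source"),
            ColumnFilter(AutoEvalColumn.model_category.name, type="checkboxgroup", label="Model Category"),
            ColumnFilter(
                AutoEvalColumn.params.name,
                type="slider",
                min=0.01,  # assumed template default; this line is not shown in the hunk
                max=150,
                label="Select the number of parameters (B)",
            ),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=True,
    )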
results/Qwen/Qwen2.5-0.5B-Instruct_results_2025-04-21 16:50:28.595317.json
CHANGED
@@ -1,14 +1,8 @@
 {
   "results": {
-    "…
-    …
-    …
-    "speed": {
-      "spq": 0.18969469807280515
-    },
-    "contamination": {
-      "score": 0
-    },
+    "average_score": 6.0,
+    "speed": 5,
+    "contamination_score": 0,
     "execution_time": 88.587424,
     "errors": [],
     "scores_by_category": [
@@ -34,6 +28,8 @@
   "config": {
     "model": "Qwen/Qwen2.5-0.5B-Instruct",
     "model_sha": "7ae557604adf67be50417f59c2c2f167def9a775",
+    "model_source": "Hugging Face",
+    "model_category": "Nano",
     "submitted_time": "2025-04-21T14:43:01Z",
     "likes": 310,
     "params": 0.494,
results/openai-community/gpt2_results_2025-04-21 16:59:47.547731.json
CHANGED
@@ -1,14 +1,8 @@
 {
   "results": {
-    "…
-    …
-    …
-    "speed": {
-      "spq": 1.1064065631691649
-    },
-    "contamination": {
-      "score": 0
-    },
+    "average_score": 1.0,
+    "speed": 1.1064065631691649,
+    "contamination_score": 0,
     "execution_time": 516.691865,
     "errors": [],
     "scores_by_category": [
@@ -34,6 +28,8 @@
   "config": {
     "model": "openai-community/gpt2",
    "model_sha": "607a30d783dfa663caf39e06633721c8d4cfcd7e",
+    "model_source": "Hugging Face",
+    "model_category": "Nano",
     "submitted_time": "2025-04-21T14:50:23Z",
     "likes": 2679,
     "params": 0.137,
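Both result files now share the same flattened schema. A minimal sketch of reading one of them back, assuming "config" sits at the top level of the file as in the template (the path is the Qwen file changed above):

import json

# Load one updated result file and pull out the flattened scores plus the new
# config fields that feed the Source / Category columns.
path = "results/Qwen/Qwen2.5-0.5B-Instruct_results_2025-04-21 16:50:28.595317.json"

with open(path) as f:
    data = json.load(f)

results = data["results"]
config = data["config"]

print(results["average_score"], results["speed"], results["contamination_score"])  # 6.0 5 0
print(config["model_source"], config["model_category"])  # Hugging Face Nano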
src/about.py
CHANGED
@@ -2,18 +2,16 @@ from dataclasses import dataclass
 from enum import Enum
 
 @dataclass
-class …
-    benchmark: str
+class EvalDimension:
     metric: str
     col_name: str
 
 
 # Select your tasks here
 # ---------------------------------------------------
-class …
-    …
-    …
-    task1 = Task("contamination", "score", "Contamination")
+class EvalDimensions(Enum):
+    d0 = EvalDimension("speed", "Speed")
+    d1 = EvalDimension("contamination_score", "Contamination Score")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
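Put together, the reworked src/about.py reduces to the following sketch (comments added here for clarity):

from dataclasses import dataclass
from enum import Enum


@dataclass
class EvalDimension:
    metric: str    # key inside the "results" block of a result file
    col_name: str  # column header shown on the leaderboard


class EvalDimensions(Enum):
    d0 = EvalDimension("speed", "Speed")
    d1 = EvalDimension("contamination_score", "Contamination Score")


# Consumers iterate over the enum to map result keys to display columns:
for dim in EvalDimensions:
    print(dim.name, dim.value.metric, "->", dim.value.col_name)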
src/display/utils.py
CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-from src.about import …
+from src.about import EvalDimensions
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -23,14 +23,19 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["…
+auto_eval_column_dict.append(["model_source", ColumnContent, ColumnContent("Source", "str", True, False)])
+auto_eval_column_dict.append(["model_category", ColumnContent, ColumnContent("Category", "str", True, False)])
+
+
+#auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-for …
-    auto_eval_column_dict.append([…
+for eval_dim in EvalDimensions:
+    auto_eval_column_dict.append([eval_dim.name, ColumnContent, ColumnContent(eval_dim.value.col_name, "number", True)])
 # Model information
-…
+
+#auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 #auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 #auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
 #auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
@@ -40,6 +45,7 @@ auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️"…
 #auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 #auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
+
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
@@ -108,5 +114,5 @@ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in …
+BENCHMARK_COLS = [t.value.col_name for t in EvalDimensions]
 
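A quick sanity check of what the EvalDimensions-driven pieces evaluate to (a sketch; run from the repo root so src/ is importable):

from src.about import EvalDimensions

# Score columns appended by the new loop above: one per dimension, keyed by the
# enum member name and displayed under value.col_name.
score_columns = {dim.name: dim.value.col_name for dim in EvalDimensions}
print(score_columns)   # {'d0': 'Speed', 'd1': 'Contamination Score'}

# The rewritten BENCHMARK_COLS at the bottom of the file:
BENCHMARK_COLS = [t.value.col_name for t in EvalDimensions]
print(BENCHMARK_COLS)  # ['Speed', 'Contamination Score']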
src/leaderboard/read_evals.py
CHANGED
@@ -8,7 +8,7 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, …
+from src.display.utils import AutoEvalColumn, EvalDimensions #, ModelType, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub
 
 
@@ -24,7 +24,8 @@ class EvalResult:
     results: dict
     #precision: Precision = Precision.Unknown
     #model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    model_source: str = "" # HF, …
+    model_source: str = "" # HF, API, ...
+    model_category: str = "" #Nano, Small, Medium, Large
     #weight_type: WeightType = WeightType.Original # Original or Adapter
     #architecture: str = "Unknown"
     license: str = "?"
@@ -86,16 +87,17 @@ class EvalResult:
             org=org,
             model=model,
             model_source=config.get("model_source", ""),
+            model_category=config.get("model_category", ""),
             results=results,
             #precision=precision,
-            revision= config.get("model_sha", ""),
+            #revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             #architecture=architecture
         )
 
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model…
+        request_file = get_request_file_for_model(requests_path, self.full_model) #, self.precision.value.name
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
@@ -107,34 +109,35 @@ class EvalResult:
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}…
+            print(f"Could not find request file for {self.org}/{self.model}") # with precision {self.precision.value.name}
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        …
+        average_score = self.results["average_score"]
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             #AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_source.name: self.model_source…
+            AutoEvalColumn.model_source.name: self.model_source,
+            AutoEvalColumn.model_category.name: self.model_category,
             #AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
             #AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             #AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             #AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: …
+            AutoEvalColumn.average.name: average_score,
             AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            #AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
 
-        for …
-            data_dict[…
+        for eval_dim in EvalDimensions:
+            data_dict[eval_dim.value.col_name] = self.results[eval_dim.value.metric]
 
         return data_dict
 
 
-def get_request_file_for_model(requests_path, model_name…
+def get_request_file_for_model(requests_path, model_name): #,precision
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
     request_files = os.path.join(
         requests_path,
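And a sketch of how the new to_dict() pieces combine for the Qwen result above, using the flattened "results" block (only the columns touched by this commit are shown):

from src.about import EvalDimensions

# Flattened "results" block from the Qwen file changed in this commit.
results = {"average_score": 6.0, "speed": 5, "contamination_score": 0}

row = {
    "Source": "Hugging Face",                  # AutoEvalColumn.model_source
    "Category": "Nano",                        # AutoEvalColumn.model_category
    "Average ⬆️": results["average_score"],    # AutoEvalColumn.average
}
# Same loop as in to_dict(): one leaderboard column per evaluation dimension.
for eval_dim in EvalDimensions:
    row[eval_dim.value.col_name] = results[eval_dim.value.metric]

print(row)  # {'Source': 'Hugging Face', 'Category': 'Nano', 'Average ⬆️': 6.0, 'Speed': 5, 'Contamination Score': 0}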