Jerrycool committed on
Commit b41aa3c · verified · 1 Parent(s): 142f8de

Update app.py

Files changed (1)
  1. app.py +115 -95
app.py CHANGED
@@ -1,10 +1,9 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
-
-from src.about import (
+# Removed Hugging Face Hub imports as they are not needed for the simplified leaderboard
+# from huggingface_hub import snapshot_download, HfApi
+from src.about import ( # Assuming these still exist and are relevant for other tabs
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
     EVALUATION_QUEUE_TEXT,
@@ -12,112 +11,124 @@ from src.about import (
     LLM_BENCHMARKS_TEXT,
     TITLE,
 )
-from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision
-)
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
-
-
+from src.display.css_html_js import custom_css # Keep custom CSS
+# Removed utils imports related to the old leaderboard
+# from src.display.utils import (...)
+from src.envs import REPO_ID # Keep if needed for restart_space or other functions
+# Removed constants related to old data paths and repos if not needed elsewhere
+# from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+# Removed old data processing functions
+# from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval # Keep submission logic
+
+# --- New Elo Leaderboard Configuration ---
+INITIAL_MODELS = [
+    "gpt-4o-mini", "gpt-4o", "gemini-2.0-flash", "deepseek-v3",
+    "gemini-2.0-pro", "o3-mini", "deepseek-r1", "gemini-2.5-pro"
+]
+CATEGORIES = ["MLE-Lite", "Tabular", "NLP", "CV"]
+DEFAULT_ELO = 1200
+
+# Placeholder data structure for Elo scores per category
+# *** MODIFY THE SCORES HERE AS NEEDED ***
+elo_data = {
+    category: pd.DataFrame({
+        "Model": INITIAL_MODELS,
+        "Elo Score": [DEFAULT_ELO] * len(INITIAL_MODELS)
+    }) for category in CATEGORIES
+}
+# Example: How to set specific scores for a category
+# elo_data["NLP"] = pd.DataFrame({
+#     "Model": INITIAL_MODELS,
+#     "Elo Score": [1300, 1450, 1250, 1350, 1400, 1150, 1320, 1500] # Example scores
+# })
+
+# --- Helper function to update leaderboard ---
+def update_leaderboard(category):
+    """Returns the DataFrame for the selected category."""
+    df = elo_data.get(category)
+    if df is None:
+        # Return default if category not found (shouldn't happen with radio)
+        return elo_data[CATEGORIES[0]]
+    return df
+
+# --- Mock/Placeholder functions/data for other tabs ---
+# Since we removed the snapshot download, the original queue fetching will fail.
+# Provide empty DataFrames or mock data if you want the queue display to work without the original data source.
+# This is a placeholder - replace with actual data loading if needed for the submission tab.
+print("Warning: Evaluation queue data fetching is disabled/mocked due to leaderboard changes.")
+finished_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"])
+running_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"])
+pending_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"])
+EVAL_COLS = ["Model", "Status", "Requested", "Started"] # Define for the dataframe headers
+EVAL_TYPES = ["str", "str", "str", "str"] # Define for the dataframe types
+
+# --- Keep restart function if relevant ---
+# Assuming HfApi is initialized elsewhere or REPO_ID is sufficient
+# api = HfApi() # Example initialization, adjust as needed
 def restart_space():
-    API.restart_space(repo_id=REPO_ID)
-
-### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-
-
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
-def init_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
-        raise ValueError("Leaderboard DataFrame is empty or None.")
-    return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=False,
-    )
-
-
+    print(f"Attempting to restart space: {REPO_ID}")
+    # Replace with your actual space restart mechanism if needed
+    # try:
+    #     api.restart_space(repo_id=REPO_ID)
+    #     print("Space restart request sent.")
+    # except Exception as e:
+    #     print(f"Failed to restart space: {e}")
+
+# --- Gradio App Definition ---
 demo = gr.Blocks(css=custom_css)
+
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
+            with gr.Column():
+                gr.Markdown("## Model Elo Rankings") # New title for the section
+                category_selector = gr.Radio(
+                    choices=CATEGORIES,
+                    label="Select Category",
+                    value=CATEGORIES[0], # Default selection
+                    interactive=True,
+                    container=False, # Make radio buttons horizontal if possible with CSS
+                )
+                leaderboard_df_component = gr.Dataframe(
+                    value=update_leaderboard(CATEGORIES[0]), # Initial value
+                    headers=["Model", "Elo Score"],
+                    datatype=["str", "number"],
+                    interactive=False,
+                    row_count=(len(INITIAL_MODELS), "fixed"), # Fixed row count
+                    col_count=(2, "fixed"), # Fixed column count
+                )
+                # Link the radio button change to the update function
                category_selector.change(
+                    fn=update_leaderboard,
+                    inputs=category_selector,
+                    outputs=leaderboard_df_component
+                )

         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+            # --- This section remains largely unchanged, but relies on potentially missing data ---
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
                 with gr.Column():
+                    # Displaying queue tables with potentially empty/mock data
                     with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", # Length might be 0
                         open=False,
                     ):
                         with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
+                            finished_eval_table = gr.components.Dataframe(
                                 value=finished_eval_queue_df,
                                 headers=EVAL_COLS,
                                 datatype=EVAL_TYPES,
                                 row_count=5,
-                            )
+                            )
                     with gr.Accordion(
                         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
                         open=False,
@@ -129,7 +140,6 @@ with demo:
                                 datatype=EVAL_TYPES,
                                 row_count=5,
                             )
-
                     with gr.Accordion(
                         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                         open=False,
@@ -141,31 +151,35 @@ with demo:
                                 datatype=EVAL_TYPES,
                                 row_count=5,
                             )
+
             with gr.Row():
                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
             with gr.Row():
+                # Submission form - kept as is
                 with gr.Column():
                     model_name_textbox = gr.Textbox(label="Model name")
                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                    # Using simple strings for dropdowns now, adjust if ModelType/Precision/WeightType classes are still needed
                     model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                        # choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown], # Original
+                        choices=["Type A", "Type B", "Type C"], # Example choices, replace if needed
                         label="Model type",
                         multiselect=False,
                         value=None,
                         interactive=True,
                     )
-
                 with gr.Column():
                     precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                        # choices=[i.value.name for i in Precision if i != Precision.Unknown], # Original
+                        choices=["float16", "bfloat16", "float32", "int8"], # Example choices
                         label="Precision",
                         multiselect=False,
                         value="float16",
                         interactive=True,
                     )
                     weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
+                        # choices=[i.value.name for i in WeightType], # Original
+                        choices=["Original", "Adapter", "Delta"], # Example choices
                         label="Weights type",
                         multiselect=False,
                         value="Original",
@@ -175,6 +189,8 @@

             submit_button = gr.Button("Submit Eval")
             submission_result = gr.Markdown()
+
+            # Keep submission logic attached
             submit_button.click(
                 add_new_eval,
                 [
@@ -198,7 +214,11 @@
                 show_copy_button=True,
             )

-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+# --- Keep scheduler if relevant ---
+# scheduler = BackgroundScheduler()
+# scheduler.add_job(restart_space, "interval", seconds=1800) # Restart every 30 mins
+# scheduler.start()
+
+# --- Launch the app ---
+# demo.queue(default_concurrency_limit=40).launch() # Original launch
+demo.launch() # Simpler launch for testing
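
For reference, the category-driven leaderboard added in this commit reduces to a small Gradio pattern: a dict of per-category DataFrames, a lookup helper, and a gr.Radio change event that swaps the value of a gr.Dataframe. The following is a minimal, self-contained sketch of that pattern; the category names mirror the commit, but the model list is shortened and the Elo scores are illustrative placeholders, not values from this commit.

import gradio as gr
import pandas as pd

# Illustrative placeholders only -- the commit itself seeds every category with a default of 1200.
CATEGORIES = ["MLE-Lite", "Tabular", "NLP", "CV"]
MODELS = ["gpt-4o-mini", "gpt-4o", "deepseek-v3", "gemini-2.5-pro"]
elo_data = {
    category: pd.DataFrame({"Model": MODELS, "Elo Score": [1200] * len(MODELS)})
    for category in CATEGORIES
}

def update_leaderboard(category):
    # Sort so the highest-rated model appears first.
    return elo_data[category].sort_values("Elo Score", ascending=False)

with gr.Blocks() as demo:
    selector = gr.Radio(choices=CATEGORIES, value=CATEGORIES[0], label="Select Category")
    table = gr.Dataframe(
        value=update_leaderboard(CATEGORIES[0]),
        headers=["Model", "Elo Score"],
        datatype=["str", "number"],
        interactive=False,
    )
    # Swap the table contents whenever a different category is selected.
    selector.change(fn=update_leaderboard, inputs=selector, outputs=table)

if __name__ == "__main__":
    demo.launch()

Returning a plain pandas DataFrame from the event handler is enough for gr.Dataframe to re-render; no extra state component is needed.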