natolambert committed
Commit 75fed94 · 1 Parent(s): 9a9d913

download button and style

Files changed (5)
  1. .gitignore +4 -1
  2. app.py +57 -18
  3. leaderboard/md.py +4 -2
  4. leaderboard/retired-app.py +133 -77
  5. leaderboard/utils.py +1 -0
.gitignore CHANGED
@@ -16,4 +16,7 @@ evals/
  .gradio/
  .evals/
  __pycache__/*
- *.pyc
+ *.pyc
+
+ # saved data automatically
+ leaderboard/current-rbv2-data.csv
app.py CHANGED
@@ -3,17 +3,17 @@ from pathlib import Path
3
 
4
  import gradio as gr
5
  import numpy as np
 
6
  from datasets import load_dataset
7
  from huggingface_hub import HfApi, snapshot_download
8
- import pandas as pd
9
 
10
- from leaderboard.constants import example_counts, length_categories, subset_mapping
11
  from leaderboard.css import custom_css
12
  from leaderboard.md import *
13
  from leaderboard.utils import load_all_data
14
 
15
  #######################################################
16
- # Setup #
17
  #######################################################
18
  api = HfApi()
19
 
@@ -40,6 +40,7 @@ repo = snapshot_download(
40
  # Load Data #
41
  ###########################################
42
 
 
43
  def avg_over_rewardbench_v2(dataframe_core):
44
  domain_cols = ["factuality", "precise if", "math", "safety", "chat", "ties"]
45
  domain_weights = [1, 1, 1, 1, 1, 1]
@@ -59,9 +60,19 @@ def avg_over_rewardbench_v2(dataframe_core):
59
  new_df = new_df[keep_columns]
60
 
61
  # TODO: update domain_cols and comment this out if final dataset version changes names
62
- new_df = new_df.rename(columns={"factuality": "Factuality", "precise if": "Precise IF", "math": "Math", "safety": "Safety", "chat": "Focus", "ties": "Ties"})
63
  return new_df
64
 
 
65
  def avg_over_rewardbench(dataframe_core, dataframe_prefs):
66
  """
67
  Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
@@ -130,6 +141,7 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
130
  new_df = new_df[keep_columns]
131
  return new_df
132
 
 
133
  def prep_df(df):
134
  # add column to 0th entry with count (column name itself empty)
135
  df.insert(0, "", range(1, 1 + len(df)))
@@ -147,8 +159,10 @@ def prep_df(df):
147
 
148
  return df
149
 
 
150
  # get v1 data
151
- rb_orig_snapshot = pd.read_csv("leaderboard/final-rbv1-data.csv")
 
152
  # rename column "Unnamed: 0" to ""
153
  rb_orig_snapshot = rb_orig_snapshot.rename(columns={"Unnamed: 0": ""})
154
  # rb_orig_snapshot = rb_orig_snapshot.drop(columns=["Unnamed: 0", ''])
@@ -158,20 +172,20 @@ rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_v
158
  rewardbench_data_avg_intermediate = avg_over_rewardbench_v2(rewardbench_data.copy())
159
 
160
  # Prepare RBv1 scores for merging
161
- rb_v1_scores_to_merge = rb_orig_snapshot[['Model', 'Score']].copy()
162
 
163
  # if " ⚠️" in rb_v1_scores_to_merge["Model"].values, shorten the model name without it
164
  rb_v1_scores_to_merge["Model"] = rb_v1_scores_to_merge["Model"].str.replace(" ⚠️", "", regex=False)
165
 
166
- rb_v1_scores_to_merge.rename(columns={'Score': 'RBv1'}, inplace=True)
167
  # rename rb_v1 "Model" to "model"
168
- rb_v1_scores_to_merge.rename(columns={'Model': 'model'}, inplace=True)
169
 
170
  # Merge RBv1 scores into the v2 data
171
- rewardbench_data_avg = pd.merge(rewardbench_data_avg_intermediate, rb_v1_scores_to_merge, on='model', how='left')
172
 
173
  # Drop any models with only RBv1 scores and no v2 scores
174
- rewardbench_data_avg = rewardbench_data_avg.dropna(subset=['average'])
175
 
176
  # Sort by the v2 average
177
  rewardbench_data_avg = rewardbench_data_avg.sort_values(by="average", ascending=False)
@@ -183,9 +197,13 @@ rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average":
183
 
184
  # Ensure RBv1 is the last column if it's not already (merge usually places it at the end of non-key columns)
185
  # If 'RBv1' is present and not last, move it to be the last column.
186
- if 'RBv1' in rewardbench_data_avg.columns:
187
- rbv1_col = rewardbench_data_avg.pop('RBv1')
188
- rewardbench_data_avg['RBv1'] = rbv1_col
189
 
190
  col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
191
  col_types_rewardbench_v1 = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rb_orig_snapshot.columns) - 1)
@@ -217,6 +235,7 @@ def random_sample(r: gr.Request, subset):
217
  markdown_text = "\n\n".join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
218
  return markdown_text
219
 
 
220
  # Duplicating because they use global variables with gradio setup
221
  def random_sample_v1(r: gr.Request, subset):
222
  if subset is None or subset == []:
@@ -233,6 +252,7 @@ def random_sample_v1(r: gr.Request, subset):
233
  markdown_text = "\n\n".join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
234
  return markdown_text
235
 
 
236
  color_map = {
237
  "Generative": "#7497db",
238
  "Custom Classifier": "#E8ECF2",
@@ -240,6 +260,7 @@ color_map = {
240
  "DPO": "#75809c",
241
  }
242
 
 
243
  def color_model_type_column(df, color_map):
244
  """
245
  Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
@@ -264,6 +285,7 @@ def color_model_type_column(df, color_map):
264
 
265
  return df.style.applymap(apply_color, subset=["Model Type"]).format(format_dict, na_rep="")
266
 
 
267
  def regex_table(dataframe, regex, filter_button, style=True):
268
  """
269
  Takes a model name as a regex, then returns only the rows that has that in it.
@@ -327,6 +349,7 @@ def regex_table(dataframe, regex, filter_button, style=True):
327
 
328
  return data
329
 
 
330
  # import ipdb; ipdb.set_trace()
331
 
332
  total_models = len(
@@ -382,13 +405,21 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
382
  label="Model Search (delimit with , )",
383
  placeholder="Model Search (delimit with , )",
384
  show_label=False,
 
385
  )
386
  model_types_1 = gr.CheckboxGroup(
387
  ["Seq. Classifiers", "Custom Classifiers", "Generative", "RBv1"],
388
  value=["Seq. Classifiers", "Custom Classifiers", "Generative"],
389
- label="Model Types",
390
  show_label=False,
391
- # info="Which model types to include.",
392
  )
393
  with gr.Row():
394
  # reference data
@@ -430,7 +461,6 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
430
  gr.Markdown(CAPTION_V1.format(str(total_models_v1)))
431
  with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
432
  with gr.TabItem("Leaderboard"):
433
- pass
434
  with gr.Row():
435
  search_1_v1 = gr.Textbox(
436
  label="Model Search (delimit with , )",
@@ -444,6 +474,14 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
444
  show_label=False,
445
  # info="Which model types to include.",
446
  )
447
  with gr.Row():
448
  # reference data
449
  rewardbench_table_hidden_v1 = gr.Dataframe(
@@ -479,9 +517,10 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
479
 
480
  button_data_v1.click(fn=random_sample_v1, inputs=[subset_selector_v1], outputs=[sample_display_v1])
481
 
482
-
483
  search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
484
- search_1_v1.change(regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1)
485
 
486
  model_types_1.change(
487
  regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table
 
3
 
4
  import gradio as gr
5
  import numpy as np
6
+ import pandas as pd
7
  from datasets import load_dataset
8
  from huggingface_hub import HfApi, snapshot_download
 
9
 
10
+ from leaderboard.constants import example_counts, subset_mapping
11
  from leaderboard.css import custom_css
12
  from leaderboard.md import *
13
  from leaderboard.utils import load_all_data
14
 
15
  #######################################################
16
+ # Setup #
17
  #######################################################
18
  api = HfApi()
19
 
 
40
  # Load Data #
41
  ###########################################
42
 
43
+
44
  def avg_over_rewardbench_v2(dataframe_core):
45
  domain_cols = ["factuality", "precise if", "math", "safety", "chat", "ties"]
46
  domain_weights = [1, 1, 1, 1, 1, 1]
 
60
  new_df = new_df[keep_columns]
61
 
62
  # TODO: update domain_cols and comment this out if final dataset version changes names
63
+ new_df = new_df.rename(
64
+ columns={
65
+ "factuality": "Factuality",
66
+ "precise if": "Precise IF",
67
+ "math": "Math",
68
+ "safety": "Safety",
69
+ "chat": "Focus",
70
+ "ties": "Ties",
71
+ }
72
+ )
73
  return new_df
74
 
75
+
76
  def avg_over_rewardbench(dataframe_core, dataframe_prefs):
77
  """
78
  Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
 
141
  new_df = new_df[keep_columns]
142
  return new_df
143
 
144
+
145
  def prep_df(df):
146
  # add column to 0th entry with count (column name itself empty)
147
  df.insert(0, "", range(1, 1 + len(df)))
 
159
 
160
  return df
161
 
162
+
163
  # get v1 data
164
+ orig_data_path = "leaderboard/final-rbv1-data.csv"
165
+ rb_orig_snapshot = pd.read_csv(orig_data_path)
166
  # rename column "Unnamed: 0" to ""
167
  rb_orig_snapshot = rb_orig_snapshot.rename(columns={"Unnamed: 0": ""})
168
  # rb_orig_snapshot = rb_orig_snapshot.drop(columns=["Unnamed: 0", ''])
 
172
  rewardbench_data_avg_intermediate = avg_over_rewardbench_v2(rewardbench_data.copy())
173
 
174
  # Prepare RBv1 scores for merging
175
+ rb_v1_scores_to_merge = rb_orig_snapshot[["Model", "Score"]].copy()
176
 
177
  # if " ⚠️" in rb_v1_scores_to_merge["Model"].values, shorten the model name without it
178
  rb_v1_scores_to_merge["Model"] = rb_v1_scores_to_merge["Model"].str.replace(" ⚠️", "", regex=False)
179
 
180
+ rb_v1_scores_to_merge.rename(columns={"Score": "RBv1"}, inplace=True)
181
  # rename rb_v1 "Model" to "model"
182
+ rb_v1_scores_to_merge.rename(columns={"Model": "model"}, inplace=True)
183
 
184
  # Merge RBv1 scores into the v2 data
185
+ rewardbench_data_avg = pd.merge(rewardbench_data_avg_intermediate, rb_v1_scores_to_merge, on="model", how="left")
186
 
187
  # Drop any models with only RBv1 scores and no v2 scores
188
+ rewardbench_data_avg = rewardbench_data_avg.dropna(subset=["average"])
189
 
190
  # Sort by the v2 average
191
  rewardbench_data_avg = rewardbench_data_avg.sort_values(by="average", ascending=False)
 
197
 
198
  # Ensure RBv1 is the last column if it's not already (merge usually places it at the end of non-key columns)
199
  # If 'RBv1' is present and not last, move it to be the last column.
200
+ if "RBv1" in rewardbench_data_avg.columns:
201
+ rbv1_col = rewardbench_data_avg.pop("RBv1")
202
+ rewardbench_data_avg["RBv1"] = rbv1_col
203
+
204
+ # save rewardbench_data_avg as csv to src/current-rbv2-data.csv
205
+ v2_data_path = "leaderboard/current-rbv2-data.csv"
206
+ rewardbench_data_avg.to_csv(v2_data_path, index=False)
207
 
208
  col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
209
  col_types_rewardbench_v1 = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rb_orig_snapshot.columns) - 1)
 
235
  markdown_text = "\n\n".join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
236
  return markdown_text
237
 
238
+
239
  # Duplicating because they use global variables with gradio setup
240
  def random_sample_v1(r: gr.Request, subset):
241
  if subset is None or subset == []:
 
252
  markdown_text = "\n\n".join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
253
  return markdown_text
254
 
255
+
256
  color_map = {
257
  "Generative": "#7497db",
258
  "Custom Classifier": "#E8ECF2",
 
260
  "DPO": "#75809c",
261
  }
262
 
263
+
264
  def color_model_type_column(df, color_map):
265
  """
266
  Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
 
285
 
286
  return df.style.applymap(apply_color, subset=["Model Type"]).format(format_dict, na_rep="")
287
 
288
+
289
  def regex_table(dataframe, regex, filter_button, style=True):
290
  """
291
  Takes a model name as a regex, then returns only the rows that has that in it.
 
349
 
350
  return data
351
 
352
+
353
  # import ipdb; ipdb.set_trace()
354
 
355
  total_models = len(
 
405
  label="Model Search (delimit with , )",
406
  placeholder="Model Search (delimit with , )",
407
  show_label=False,
408
+ scale=8,
409
  )
410
  model_types_1 = gr.CheckboxGroup(
411
  ["Seq. Classifiers", "Custom Classifiers", "Generative", "RBv1"],
412
  value=["Seq. Classifiers", "Custom Classifiers", "Generative"],
 
413
  show_label=False,
414
+ scale=8,
415
+ )
416
+ # narrow, non-expanding download button
417
+ gr.DownloadButton(
418
+ label="Download CSV",
419
+ value=v2_data_path,
420
+ size="sm", # shorter height / padding
421
+ scale=0, # ← **width stays just big enough for the text**
422
+ min_width=140, # (optional) guarantee it doesn’t collapse
423
  )
424
  with gr.Row():
425
  # reference data
 
461
  gr.Markdown(CAPTION_V1.format(str(total_models_v1)))
462
  with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
463
  with gr.TabItem("Leaderboard"):
 
464
  with gr.Row():
465
  search_1_v1 = gr.Textbox(
466
  label="Model Search (delimit with , )",
 
474
  show_label=False,
475
  # info="Which model types to include.",
476
  )
477
+ # narrow, non-expanding download button
478
+ gr.DownloadButton(
479
+ label="Download CSV",
480
+ value=orig_data_path,
481
+ size="sm", # shorter height / padding
482
+ scale=0, # ← **width stays just big enough for the text**
483
+ min_width=140, # (optional) guarantee it doesn’t collapse
484
+ )
485
  with gr.Row():
486
  # reference data
487
  rewardbench_table_hidden_v1 = gr.Dataframe(
 
517
 
518
  button_data_v1.click(fn=random_sample_v1, inputs=[subset_selector_v1], outputs=[sample_display_v1])
519
 
 
520
  search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
521
+ search_1_v1.change(
522
+ regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
523
+ )
524
 
525
  model_types_1.change(
526
  regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table
leaderboard/md.py CHANGED
@@ -108,9 +108,11 @@ TOP_TEXT = """# RewardBench: Evaluating Reward Models

  CAPTION_V2 = f"""The *new version* of RewardBench that is based on unseen human data and designed to be substantially more difficult!

- [Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench-v2-v0) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-v2-results) | [Paper (TODO)](TODO) | Total models: {{}} | Last restart (PST): {current_time}"""
+ [Code](https://github.com/allenai/reward-bench) | [Eval. Dataset v2](https://huggingface.co/datasets/allenai/reward-bench-v2) | [Results v2](https://huggingface.co/datasets/allenai/reward-bench-v2-results) | [Paper (TODO)](TODO) | Total models: {{}} | Last restart (PST): {current_time}"""

- CAPTION_V1 = """The original RewardBench -- the first reward model evaluation.
+ CAPTION_V1 = f"""The original RewardBench -- the first reward model evaluation.
+
+ [Code](https://github.com/allenai/reward-bench) | [Eval. Dataset v1](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results v1](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper v1](https://arxiv.org/abs/2403.13787) | Total models: {{}} | * Unverified models | ⚠️ Dataset Contamination | Last restart (PST): {current_time}

  **Note**: This leaderboard is frozen and will not be updated. The final version of the evaluation results are available in the source for this application.

leaderboard/retired-app.py CHANGED
@@ -1,14 +1,14 @@
1
- import gradio as gr
2
  import os
3
- from huggingface_hub import HfApi, snapshot_download
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
  from datasets import load_dataset
6
- from src.utils import load_all_data
7
- from src.md import ABOUT_TEXT, TOP_TEXT
8
- from src.plt import plot_avg_correlation
9
- from src.constants import subset_mapping, length_categories, example_counts
10
  from src.css import custom_css
11
- import numpy as np
 
12
 
13
  api = HfApi()
14
 
@@ -18,16 +18,18 @@ evals_repo = "allenai/reward-bench-results"
18
  eval_set_repo = "allenai/reward-bench"
19
  repo_dir_rewardbench = "./evals/rewardbench/"
20
 
 
21
  def restart_space():
22
  api.restart_space(repo_id="allenai/reward-bench", token=COLLAB_TOKEN)
23
 
 
24
  print("Pulling evaluation results")
25
  repo = snapshot_download(
26
  local_dir=repo_dir_rewardbench,
27
  ignore_patterns=["pref-sets-scores/*", "eval-set-scores/*"],
28
  repo_id=evals_repo,
29
  use_auth_token=COLLAB_TOKEN,
30
- tqdm_class=None,
31
  etag_timeout=30,
32
  repo_type="dataset",
33
  )
@@ -50,13 +52,19 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
50
  # for main subsets, keys in subset_mapping, take the weighted avg by example_counts and store for the models
51
  for subset, sub_subsets in subset_mapping.items():
52
  subset_cols = [col for col in new_df.columns if col in sub_subsets]
53
- sub_data = new_df[subset_cols].values # take the relevant column values
54
- sub_counts = [example_counts[s] for s in subset_cols] # take the example counts
55
- new_df[subset] = np.average(sub_data, axis=1, weights=sub_counts) # take the weighted average
56
  # new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)
57
 
58
  data_cols = list(subset_mapping.keys())
59
- keep_columns = ["model",] + ["model_type"] + data_cols
60
  # keep_columns = ["model", "average"] + subsets
61
  new_df = new_df[keep_columns]
62
 
@@ -78,7 +86,7 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
78
  # new_df.at[i, "Prior Sets (0.5 weight)"] = dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets (0.5 weight)"].values[0]
79
  else:
80
  values.append(np.nan)
81
-
82
  new_df["Prior Sets (0.5 weight)"] = values
83
 
84
  # add total average
@@ -95,6 +103,7 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
95
  new_df = new_df[keep_columns]
96
  return new_df
97
 
 
98
  def expand_subsets(dataframe):
99
  # TODO need to modify data/ script to do this
100
  pass
@@ -106,7 +115,7 @@ def length_bias_check(dataframe):
106
  Then, take the average of the three buckets as "average"
107
  """
108
  new_df = dataframe.copy()
109
- existing_subsets = new_df.columns[3:] # model, model_type, average
110
  final_subsets = ["Length Bias", "Neutral", "Terse Bias"]
111
  # new data is empty list dict for each final subset
112
  new_data = {s: [] for s in final_subsets}
@@ -135,17 +144,17 @@ def length_bias_check(dataframe):
135
  return new_df
136
 
137
 
138
-
139
- rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by='average', ascending=False)
140
- rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by='Terse Bias', ascending=False)
141
- prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by='average', ascending=False)
142
  # prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
143
 
144
- rewardbench_data_avg = avg_over_rewardbench(rewardbench_data, prefs_data).sort_values(by='average', ascending=False)
 
145
 
146
  def prep_df(df):
147
  # add column to 0th entry with count (column name itself empty)
148
- df.insert(0, '', range(1, 1 + len(df)))
149
 
150
  # replace "model" with "Model" and "model_type" with "Model Type" and "average" with "Average"
151
  df = df.rename(columns={"model": "Model", "model_type": "Model Type", "average": "Average"})
@@ -154,12 +163,13 @@ def prep_df(df):
154
  if "Model Type" in df.columns:
155
  # get model_types that have generative in them
156
  mask = df["Model Type"].str.contains("generative", case=False, na=False)
157
-
158
  # set these values to "Generative"
159
  df.loc[mask, "Model Type"] = "Generative"
160
 
161
  return df
162
 
 
163
  # add count column to all dataframes
164
  rewardbench_data = prep_df(rewardbench_data)
165
  rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
@@ -172,18 +182,20 @@ rewardbench_data_length = prep_df(rewardbench_data_length)
172
  prefs_data = prep_df(prefs_data)
173
 
174
  col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
175
- col_types_rewardbench_avg = ["number"] + ["markdown"]+ ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
176
  cols_rewardbench_data_length = ["markdown"] + ["number"] * (len(rewardbench_data_length.columns) - 1)
177
  col_types_prefs = ["number"] + ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
178
  # col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)
179
 
180
  # for showing random samples
181
  eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")
 
 
182
  def random_sample(r: gr.Request, subset):
183
  if subset is None or subset == []:
184
  sample_index = np.random.randint(0, len(eval_set) - 1)
185
  sample = eval_set[sample_index]
186
- else: # filter by subsets (can be list)
187
  if isinstance(subset, str):
188
  subset = [subset]
189
  # filter down dataset to only include the subset(s)
@@ -191,9 +203,10 @@ def random_sample(r: gr.Request, subset):
191
  sample_index = np.random.randint(0, len(eval_set_filtered) - 1)
192
  sample = eval_set_filtered[sample_index]
193
 
194
- markdown_text = '\n\n'.join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
195
  return markdown_text
196
 
 
197
  subsets = eval_set.unique("subset")
198
 
199
  color_map = {
@@ -202,6 +215,8 @@ color_map = {
202
  "Seq. Classifier": "#ffcd75",
203
  "DPO": "#75809c",
204
  }
 
 
205
  def color_model_type_column(df, color_map):
206
  """
207
  Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
@@ -213,17 +228,19 @@ def color_model_type_column(df, color_map):
213
  Returns:
214
  pd.Styler: The styled DataFrame.
215
  """
 
216
  # Function to apply color based on the model type
217
  def apply_color(val):
218
  color = color_map.get(val, "default") # Default color if not specified in color_map
219
- return f'background-color: {color}'
220
-
221
  # Format for different columns
222
- format_dict = {col: "{:.1f}" for col in df.columns if col not in ['Average', 'Model', 'Model Type']}
223
- format_dict['Average'] = "{:.2f}"
224
- format_dict[''] = "{:d}"
 
 
225
 
226
- return df.style.applymap(apply_color, subset=['Model Type']).format(format_dict, na_rep='')
227
 
228
  def regex_table(dataframe, regex, filter_button, style=True):
229
  """
@@ -232,18 +249,18 @@ def regex_table(dataframe, regex, filter_button, style=True):
232
  # Split regex statement by comma and trim whitespace around regexes
233
  regex_list = [x.strip() for x in regex.split(",")]
234
  # Join the list into a single regex pattern with '|' acting as OR
235
- combined_regex = '|'.join(regex_list)
236
 
237
  # remove internal ai2 data
238
  dataframe = dataframe[~dataframe["Model"].str.contains("ai2", case=False, na=False)]
239
-
240
  # if filter_button, remove all rows with "ai2" in the model name
241
  update_scores = False
242
  if isinstance(filter_button, list) or isinstance(filter_button, str):
243
- if "Prior Sets" not in filter_button and 'Prior Sets (0.5 weight)' in dataframe.columns:
244
  update_scores = True
245
  # remove the column "Prior Sets (0.5 weight)" from the outputted table
246
- dataframe = dataframe.drop(columns=['Prior Sets (0.5 weight)'])
247
  if "Seq. Classifiers" not in filter_button:
248
  dataframe = dataframe[~dataframe["Model Type"].str.contains("Seq. Classifier", case=False, na=False)]
249
  if "DPO" not in filter_button:
@@ -261,12 +278,12 @@ def regex_table(dataframe, regex, filter_button, style=True):
261
  # if "Prior Sets (0.5 weight)" in data.columns:
262
  # data["Prior Sets (0.5 weight)"] = np.nan
263
  # sort array by Score column
264
- data = data.sort_values(by='Score', ascending=False)
265
 
266
  data.reset_index(drop=True, inplace=True)
267
 
268
  # replace column '' with count/rank
269
- data[''] = np.arange(1, 1 + len(data))
270
 
271
  # if Score exists, round to 2 decimals
272
  if "Score" in data.columns:
@@ -277,7 +294,7 @@ def regex_table(dataframe, regex, filter_button, style=True):
277
  for col in data.columns:
278
  if col not in ["", "Model", "Model Type", "Score", "Average"]:
279
  # replace any data[col].values == '' with np.nan
280
- data[col] = data[col].replace('', np.nan)
281
  data[col] = np.round(np.array(data[col].values).astype(float), 1)
282
  if style:
283
  # apply color
@@ -285,9 +302,14 @@ def regex_table(dataframe, regex, filter_button, style=True):
285
 
286
  return data
287
 
 
288
  # import ipdb; ipdb.set_trace()
289
 
290
- total_models = len(regex_table(rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"], style=False).values)
291
 
292
  with gr.Blocks(css=custom_css) as app:
293
  # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
@@ -298,21 +320,26 @@ with gr.Blocks(css=custom_css) as app:
298
  # search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
299
  # filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
300
  # img = gr.Image(value="https://private-user-images.githubusercontent.com/10695622/310698241-24ed272a-0844-451f-b414-fde57478703e.png", width=500)
301
- gr.Markdown("""
 
302
  ![](file/src/logo.png)
303
- """)
 
304
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
305
  with gr.TabItem("πŸ† RewardBench Leaderboard"):
306
  with gr.Row():
307
- search_1 = gr.Textbox(label="Model Search (delimit with , )",
308
- placeholder="Model Search (delimit with , )",
309
- show_label=False)
310
- model_types_1 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "Prior Sets"],
311
- value=["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
312
- label="Model Types",
313
- show_label=False,
314
- # info="Which model types to include.",
315
- )
316
  with gr.Row():
317
  # reference data
318
  rewardbench_table_hidden = gr.Dataframe(
@@ -322,22 +349,31 @@ with gr.Blocks(css=custom_css) as app:
322
  visible=False,
323
  )
324
  rewardbench_table = gr.Dataframe(
325
- regex_table(rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"]),
326
  datatype=col_types_rewardbench_avg,
327
  headers=rewardbench_data_avg.columns.tolist(),
328
  elem_id="rewardbench_dataframe_avg",
329
  height=1000,
330
  )
331
-
332
  with gr.TabItem("πŸ” RewardBench - Detailed"):
333
  with gr.Row():
334
- search_2 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
335
- model_types_2 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
336
- value=["Seq. Classifiers", "DPO", "Generative", "Custom Classifiers"],
337
- label="Model Types",
338
- show_label=False,
339
- # info="Which model types to include."
340
- )
341
  with gr.Row():
342
  # ref data
343
  rewardbench_table_detailed_hidden = gr.Dataframe(
@@ -347,7 +383,9 @@ with gr.Blocks(css=custom_css) as app:
347
  visible=False,
348
  )
349
  rewardbench_table_detailed = gr.Dataframe(
350
- regex_table(rewardbench_data.copy(), "", ["Seq. Classifiers", "DPO", "Generative", "Custom Classifiers"]),
 
 
351
  datatype=col_types_rewardbench,
352
  headers=rewardbench_data.columns.tolist(),
353
  elem_id="rewardbench_dataframe",
@@ -371,13 +409,18 @@ with gr.Blocks(css=custom_css) as app:
371
  # )
372
  with gr.TabItem("Prior Test Sets"):
373
  with gr.Row():
374
- search_3 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
375
- model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
376
- value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
377
- label="Model Types",
378
- show_label=False,
379
- # info="Which model types to include.",
380
- )
381
  with gr.Row():
382
  PREF_SET_TEXT = """
383
  For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets). Only the subsets Anthropic Helpful, Anthropic HHH, Stanford SHP, and OpenAI's Summarize data are used in the leaderboard ranking.
@@ -399,7 +442,6 @@ with gr.Blocks(css=custom_css) as app:
399
  height=1000,
400
  )
401
 
402
-
403
  with gr.TabItem("About"):
404
  with gr.Row():
405
  gr.Markdown(ABOUT_TEXT)
@@ -407,8 +449,10 @@ with gr.Blocks(css=custom_css) as app:
407
  with gr.TabItem("Dataset Viewer"):
408
  with gr.Row():
409
  # loads one sample
410
- gr.Markdown("""## Random Dataset Sample Viewer
411
- Warning, refusals, XSTest, and donotanswer datasets have sensitive content.""")
 
 
412
  subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
413
  button = gr.Button("Show Random Sample")
414
 
@@ -423,13 +467,25 @@ Warning, refusals, XSTest, and donotanswer datasets have sensitive content.""")
423
  # gr.Plot(plot)
424
 
425
  search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
426
- search_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
427
  # search.change(regex_table, inputs=[rewardbench_table_len_hidden, search, filter_button], outputs=rewardbench_table_len)
428
- search_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
429
-
430
- model_types_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
431
- model_types_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
432
- model_types_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
433
 
434
  with gr.Row():
435
  with gr.Accordion("πŸ“š Citation", open=False):
@@ -457,6 +513,6 @@ Warning, refusals, XSTest, and donotanswer datasets have sensitive content.""")
457
  # pref_sets_table.update(data_prefs)
458
 
459
  scheduler = BackgroundScheduler()
460
- scheduler.add_job(restart_space, "interval", seconds=10800) # restarted every 3h
461
  scheduler.start()
462
- app.launch(allowed_paths=['src/']) # had .queue() before launch before... not sure if that's necessary
 
 
1
  import os
2
+
3
+ import gradio as gr
4
+ import numpy as np
5
  from apscheduler.schedulers.background import BackgroundScheduler
6
  from datasets import load_dataset
7
+ from huggingface_hub import HfApi, snapshot_download
8
+ from src.constants import example_counts, length_categories, subset_mapping
 
 
9
  from src.css import custom_css
10
+ from src.md import ABOUT_TEXT, TOP_TEXT
11
+ from src.utils import load_all_data
12
 
13
  api = HfApi()
14
 
 
18
  eval_set_repo = "allenai/reward-bench"
19
  repo_dir_rewardbench = "./evals/rewardbench/"
20
 
21
+
22
  def restart_space():
23
  api.restart_space(repo_id="allenai/reward-bench", token=COLLAB_TOKEN)
24
 
25
+
26
  print("Pulling evaluation results")
27
  repo = snapshot_download(
28
  local_dir=repo_dir_rewardbench,
29
  ignore_patterns=["pref-sets-scores/*", "eval-set-scores/*"],
30
  repo_id=evals_repo,
31
  use_auth_token=COLLAB_TOKEN,
32
+ tqdm_class=None,
33
  etag_timeout=30,
34
  repo_type="dataset",
35
  )
 
52
  # for main subsets, keys in subset_mapping, take the weighted avg by example_counts and store for the models
53
  for subset, sub_subsets in subset_mapping.items():
54
  subset_cols = [col for col in new_df.columns if col in sub_subsets]
55
+ sub_data = new_df[subset_cols].values # take the relevant column values
56
+ sub_counts = [example_counts[s] for s in subset_cols] # take the example counts
57
+ new_df[subset] = np.average(sub_data, axis=1, weights=sub_counts) # take the weighted average
58
  # new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)
59
 
60
  data_cols = list(subset_mapping.keys())
61
+ keep_columns = (
62
+ [
63
+ "model",
64
+ ]
65
+ + ["model_type"]
66
+ + data_cols
67
+ )
68
  # keep_columns = ["model", "average"] + subsets
69
  new_df = new_df[keep_columns]
70
 
 
86
  # new_df.at[i, "Prior Sets (0.5 weight)"] = dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets (0.5 weight)"].values[0]
87
  else:
88
  values.append(np.nan)
89
+
90
  new_df["Prior Sets (0.5 weight)"] = values
91
 
92
  # add total average
 
103
  new_df = new_df[keep_columns]
104
  return new_df
105
 
106
+
107
  def expand_subsets(dataframe):
108
  # TODO need to modify data/ script to do this
109
  pass
 
115
  Then, take the average of the three buckets as "average"
116
  """
117
  new_df = dataframe.copy()
118
+ existing_subsets = new_df.columns[3:] # model, model_type, average
119
  final_subsets = ["Length Bias", "Neutral", "Terse Bias"]
120
  # new data is empty list dict for each final subset
121
  new_data = {s: [] for s in final_subsets}
 
144
  return new_df
145
 
146
 
147
+ rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by="average", ascending=False)
148
+ rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by="Terse Bias", ascending=False)
149
+ prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by="average", ascending=False)
 
150
  # prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
151
 
152
+ rewardbench_data_avg = avg_over_rewardbench(rewardbench_data, prefs_data).sort_values(by="average", ascending=False)
153
+
154
 
155
  def prep_df(df):
156
  # add column to 0th entry with count (column name itself empty)
157
+ df.insert(0, "", range(1, 1 + len(df)))
158
 
159
  # replace "model" with "Model" and "model_type" with "Model Type" and "average" with "Average"
160
  df = df.rename(columns={"model": "Model", "model_type": "Model Type", "average": "Average"})
 
163
  if "Model Type" in df.columns:
164
  # get model_types that have generative in them
165
  mask = df["Model Type"].str.contains("generative", case=False, na=False)
166
+
167
  # set these values to "Generative"
168
  df.loc[mask, "Model Type"] = "Generative"
169
 
170
  return df
171
 
172
+
173
  # add count column to all dataframes
174
  rewardbench_data = prep_df(rewardbench_data)
175
  rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
 
182
  prefs_data = prep_df(prefs_data)
183
 
184
  col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
185
+ col_types_rewardbench_avg = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
186
  cols_rewardbench_data_length = ["markdown"] + ["number"] * (len(rewardbench_data_length.columns) - 1)
187
  col_types_prefs = ["number"] + ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
188
  # col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)
189
 
190
  # for showing random samples
191
  eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")
192
+
193
+
194
  def random_sample(r: gr.Request, subset):
195
  if subset is None or subset == []:
196
  sample_index = np.random.randint(0, len(eval_set) - 1)
197
  sample = eval_set[sample_index]
198
+ else: # filter by subsets (can be list)
199
  if isinstance(subset, str):
200
  subset = [subset]
201
  # filter down dataset to only include the subset(s)
 
203
  sample_index = np.random.randint(0, len(eval_set_filtered) - 1)
204
  sample = eval_set_filtered[sample_index]
205
 
206
+ markdown_text = "\n\n".join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
207
  return markdown_text
208
 
209
+
210
  subsets = eval_set.unique("subset")
211
 
212
  color_map = {
 
215
  "Seq. Classifier": "#ffcd75",
216
  "DPO": "#75809c",
217
  }
218
+
219
+
220
  def color_model_type_column(df, color_map):
221
  """
222
  Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
 
228
  Returns:
229
  pd.Styler: The styled DataFrame.
230
  """
231
+
232
  # Function to apply color based on the model type
233
  def apply_color(val):
234
  color = color_map.get(val, "default") # Default color if not specified in color_map
235
+ return f"background-color: {color}"
236
+
237
  # Format for different columns
238
+ format_dict = {col: "{:.1f}" for col in df.columns if col not in ["Average", "Model", "Model Type"]}
239
+ format_dict["Average"] = "{:.2f}"
240
+ format_dict[""] = "{:d}"
241
+
242
+ return df.style.applymap(apply_color, subset=["Model Type"]).format(format_dict, na_rep="")
243
 
 
244
 
245
  def regex_table(dataframe, regex, filter_button, style=True):
246
  """
 
249
  # Split regex statement by comma and trim whitespace around regexes
250
  regex_list = [x.strip() for x in regex.split(",")]
251
  # Join the list into a single regex pattern with '|' acting as OR
252
+ combined_regex = "|".join(regex_list)
253
 
254
  # remove internal ai2 data
255
  dataframe = dataframe[~dataframe["Model"].str.contains("ai2", case=False, na=False)]
256
+
257
  # if filter_button, remove all rows with "ai2" in the model name
258
  update_scores = False
259
  if isinstance(filter_button, list) or isinstance(filter_button, str):
260
+ if "Prior Sets" not in filter_button and "Prior Sets (0.5 weight)" in dataframe.columns:
261
  update_scores = True
262
  # remove the column "Prior Sets (0.5 weight)" from the outputted table
263
+ dataframe = dataframe.drop(columns=["Prior Sets (0.5 weight)"])
264
  if "Seq. Classifiers" not in filter_button:
265
  dataframe = dataframe[~dataframe["Model Type"].str.contains("Seq. Classifier", case=False, na=False)]
266
  if "DPO" not in filter_button:
 
278
  # if "Prior Sets (0.5 weight)" in data.columns:
279
  # data["Prior Sets (0.5 weight)"] = np.nan
280
  # sort array by Score column
281
+ data = data.sort_values(by="Score", ascending=False)
282
 
283
  data.reset_index(drop=True, inplace=True)
284
 
285
  # replace column '' with count/rank
286
+ data[""] = np.arange(1, 1 + len(data))
287
 
288
  # if Score exists, round to 2 decimals
289
  if "Score" in data.columns:
 
294
  for col in data.columns:
295
  if col not in ["", "Model", "Model Type", "Score", "Average"]:
296
  # replace any data[col].values == '' with np.nan
297
+ data[col] = data[col].replace("", np.nan)
298
  data[col] = np.round(np.array(data[col].values).astype(float), 1)
299
  if style:
300
  # apply color
 
302
 
303
  return data
304
 
305
+
306
  # import ipdb; ipdb.set_trace()
307
 
308
+ total_models = len(
309
+ regex_table(
310
+ rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"], style=False
311
+ ).values
312
+ )
313
 
314
  with gr.Blocks(css=custom_css) as app:
315
  # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
 
320
  # search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
321
  # filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
322
  # img = gr.Image(value="https://private-user-images.githubusercontent.com/10695622/310698241-24ed272a-0844-451f-b414-fde57478703e.png", width=500)
323
+ gr.Markdown(
324
+ """
325
  ![](file/src/logo.png)
326
+ """
327
+ )
328
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
329
  with gr.TabItem("πŸ† RewardBench Leaderboard"):
330
  with gr.Row():
331
+ search_1 = gr.Textbox(
332
+ label="Model Search (delimit with , )",
333
+ placeholder="Model Search (delimit with , )",
334
+ show_label=False,
335
+ )
336
+ model_types_1 = gr.CheckboxGroup(
337
+ ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "Prior Sets"],
338
+ value=["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
339
+ label="Model Types",
340
+ show_label=False,
341
+ # info="Which model types to include.",
342
+ )
343
  with gr.Row():
344
  # reference data
345
  rewardbench_table_hidden = gr.Dataframe(
 
349
  visible=False,
350
  )
351
  rewardbench_table = gr.Dataframe(
352
+ regex_table(
353
+ rewardbench_data_avg.copy(),
354
+ "",
355
+ ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
356
+ ),
357
  datatype=col_types_rewardbench_avg,
358
  headers=rewardbench_data_avg.columns.tolist(),
359
  elem_id="rewardbench_dataframe_avg",
360
  height=1000,
361
  )
362
+
363
  with gr.TabItem("πŸ” RewardBench - Detailed"):
364
  with gr.Row():
365
+ search_2 = gr.Textbox(
366
+ label="Model Search (delimit with , )",
367
+ show_label=False,
368
+ placeholder="Model Search (delimit with , )",
369
+ )
370
+ model_types_2 = gr.CheckboxGroup(
371
+ ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
372
+ value=["Seq. Classifiers", "DPO", "Generative", "Custom Classifiers"],
373
+ label="Model Types",
374
+ show_label=False,
375
+ # info="Which model types to include."
376
+ )
377
  with gr.Row():
378
  # ref data
379
  rewardbench_table_detailed_hidden = gr.Dataframe(
 
383
  visible=False,
384
  )
385
  rewardbench_table_detailed = gr.Dataframe(
386
+ regex_table(
387
+ rewardbench_data.copy(), "", ["Seq. Classifiers", "DPO", "Generative", "Custom Classifiers"]
388
+ ),
389
  datatype=col_types_rewardbench,
390
  headers=rewardbench_data.columns.tolist(),
391
  elem_id="rewardbench_dataframe",
 
409
  # )
410
  with gr.TabItem("Prior Test Sets"):
411
  with gr.Row():
412
+ search_3 = gr.Textbox(
413
+ label="Model Search (delimit with , )",
414
+ show_label=False,
415
+ placeholder="Model Search (delimit with , )",
416
+ )
417
+ model_types_3 = gr.CheckboxGroup(
418
+ ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
419
+ value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
420
+ label="Model Types",
421
+ show_label=False,
422
+ # info="Which model types to include.",
423
+ )
424
  with gr.Row():
425
  PREF_SET_TEXT = """
426
  For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets). Only the subsets Anthropic Helpful, Anthropic HHH, Stanford SHP, and OpenAI's Summarize data are used in the leaderboard ranking.
 
442
  height=1000,
443
  )
444
 
 
445
  with gr.TabItem("About"):
446
  with gr.Row():
447
  gr.Markdown(ABOUT_TEXT)
 
449
  with gr.TabItem("Dataset Viewer"):
450
  with gr.Row():
451
  # loads one sample
452
+ gr.Markdown(
453
+ """## Random Dataset Sample Viewer
454
+ Warning, refusals, XSTest, and donotanswer datasets have sensitive content."""
455
+ )
456
  subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
457
  button = gr.Button("Show Random Sample")
458
 
 
467
  # gr.Plot(plot)
468
 
469
  search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
470
+ search_2.change(
471
+ regex_table,
472
+ inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2],
473
+ outputs=rewardbench_table_detailed,
474
+ )
475
  # search.change(regex_table, inputs=[rewardbench_table_len_hidden, search, filter_button], outputs=rewardbench_table_len)
476
+ search_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
477
+
478
+ model_types_1.change(
479
+ regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table
480
+ )
481
+ model_types_2.change(
482
+ regex_table,
483
+ inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2],
484
+ outputs=rewardbench_table_detailed,
485
+ )
486
+ model_types_3.change(
487
+ regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table
488
+ )
489
 
490
  with gr.Row():
491
  with gr.Accordion("πŸ“š Citation", open=False):
 
513
  # pref_sets_table.update(data_prefs)
514
 
515
  scheduler = BackgroundScheduler()
516
+ scheduler.add_job(restart_space, "interval", seconds=10800) # restarted every 3h
517
  scheduler.start()
518
+ app.launch(allowed_paths=["src/"]) # had .queue() before launch before... not sure if that's necessary
leaderboard/utils.py CHANGED
@@ -43,6 +43,7 @@ CONTAMINATED_MODELS_V1 = [
  "Ray2333/GRM-Gemma-2B-rewardmodel-ft",
  ]

+
  # From Open LLM Leaderboard
  def model_hyperlink(link, model_name):
  # if model_name is above 50 characters, return first 47 characters and "..."