Spaces:

allenai
/

reward-bench

Running

App Files Files Community

natolambert committed 8 days ago

Commit

75fed94

1 Parent(s): 9a9d913

download button and style

Browse files

Files changed (5) hide show

.gitignore +4 -1
app.py +57 -18
leaderboard/md.py +4 -2
leaderboard/retired-app.py +133 -77
leaderboard/utils.py +1 -0

.gitignore CHANGED Viewed

@@ -16,4 +16,7 @@ evals/
 .gradio/
 .evals/
 __pycache__/*
-*.pyc

 .gradio/
 .evals/
 __pycache__/*
+*.pyc
+# data file written automatically by app.py
+leaderboard/current-rbv2-data.csv

app.py CHANGED Viewed

@@ -3,17 +3,17 @@ from pathlib import Path
 import gradio as gr
 import numpy as np
 from datasets import load_dataset
 from huggingface_hub import HfApi, snapshot_download
-import pandas as pd
-from leaderboard.constants import example_counts, length_categories, subset_mapping
 from leaderboard.css import custom_css
 from leaderboard.md import *
 from leaderboard.utils import load_all_data
 #######################################################
-#                     Setup                           #
 #######################################################
 api = HfApi()
@@ -40,6 +40,7 @@ repo = snapshot_download(
 #                 Load Data               #
 ###########################################
 def avg_over_rewardbench_v2(dataframe_core):
     domain_cols = ["factuality", "precise if", "math", "safety", "chat", "ties"]
     domain_weights = [1, 1, 1, 1, 1, 1]
@@ -59,9 +60,19 @@ def avg_over_rewardbench_v2(dataframe_core):
     new_df = new_df[keep_columns]
     # TODO: update domain_cols and comment this out if final dataset version changes names
-    new_df = new_df.rename(columns={"factuality": "Factuality", "precise if": "Precise IF", "math": "Math", "safety": "Safety", "chat": "Focus", "ties": "Ties"})
     return new_df
 def avg_over_rewardbench(dataframe_core, dataframe_prefs):
     """
     Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
@@ -130,6 +141,7 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
     new_df = new_df[keep_columns]
     return new_df
 def prep_df(df):
     # add column to 0th entry with count (column name itself empty)
     df.insert(0, "", range(1, 1 + len(df)))
@@ -147,8 +159,10 @@ def prep_df(df):
     return df
 # get v1 data
-rb_orig_snapshot = pd.read_csv("leaderboard/final-rbv1-data.csv")
 # rename column "Unnamed: 0" to ""
 rb_orig_snapshot = rb_orig_snapshot.rename(columns={"Unnamed: 0": ""})
 # rb_orig_snapshot = rb_orig_snapshot.drop(columns=["Unnamed: 0", ''])
@@ -158,20 +172,20 @@ rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_v
 rewardbench_data_avg_intermediate = avg_over_rewardbench_v2(rewardbench_data.copy())
 # Prepare RBv1 scores for merging
-rb_v1_scores_to_merge = rb_orig_snapshot[['Model', 'Score']].copy()
 # strip the " ⚠️" marker from the "Model" names, if present
 rb_v1_scores_to_merge["Model"] = rb_v1_scores_to_merge["Model"].str.replace(" ⚠️", "", regex=False)
-rb_v1_scores_to_merge.rename(columns={'Score': 'RBv1'}, inplace=True)
 # rename rb_v1 "Model" to "model"
-rb_v1_scores_to_merge.rename(columns={'Model': 'model'}, inplace=True)
 # Merge RBv1 scores into the v2 data
-rewardbench_data_avg = pd.merge(rewardbench_data_avg_intermediate, rb_v1_scores_to_merge, on='model', how='left')
 # Drop any models with only RBv1 scores and no v2 scores
-rewardbench_data_avg = rewardbench_data_avg.dropna(subset=['average'])
 # Sort by the v2 average
 rewardbench_data_avg = rewardbench_data_avg.sort_values(by="average", ascending=False)
@@ -183,9 +197,13 @@ rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average":
 # Ensure RBv1 is the last column if it's not already (merge usually places it at the end of non-key columns)
 # If 'RBv1' is present and not last, move it to be the last column.
-if 'RBv1' in rewardbench_data_avg.columns:
-    rbv1_col = rewardbench_data_avg.pop('RBv1')
-    rewardbench_data_avg['RBv1'] = rbv1_col
 col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
 col_types_rewardbench_v1 = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rb_orig_snapshot.columns) - 1)
@@ -217,6 +235,7 @@ def random_sample(r: gr.Request, subset):
     markdown_text = "\n\n".join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
     return markdown_text
 # Duplicating because they use global variables with gradio setup
 def random_sample_v1(r: gr.Request, subset):
     if subset is None or subset == []:
@@ -233,6 +252,7 @@ def random_sample_v1(r: gr.Request, subset):
     markdown_text = "\n\n".join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
     return markdown_text
 color_map = {
     "Generative": "#7497db",
     "Custom Classifier": "#E8ECF2",
@@ -240,6 +260,7 @@ color_map = {
     "DPO": "#75809c",
 }
 def color_model_type_column(df, color_map):
     """
     Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
@@ -264,6 +285,7 @@ def color_model_type_column(df, color_map):
     return df.style.applymap(apply_color, subset=["Model Type"]).format(format_dict, na_rep="")
 def regex_table(dataframe, regex, filter_button, style=True):
     """
     Takes a model name as a regex, then returns only the rows that contain it.
@@ -327,6 +349,7 @@ def regex_table(dataframe, regex, filter_button, style=True):
     return data
 # import ipdb; ipdb.set_trace()
 total_models = len(
@@ -382,13 +405,21 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
                             label="Model Search (delimit with , )",
                             placeholder="Model Search (delimit with , )",
                             show_label=False,
                         )
                         model_types_1 = gr.CheckboxGroup(
                             ["Seq. Classifiers", "Custom Classifiers", "Generative", "RBv1"],
                             value=["Seq. Classifiers", "Custom Classifiers", "Generative"],
-                            label="Model Types",
                             show_label=False,
-                            #  info="Which model types to include.",
                         )
                     with gr.Row():
                         # reference data
@@ -430,7 +461,6 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
                 gr.Markdown(CAPTION_V1.format(str(total_models_v1)))
             with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                 with gr.TabItem("Leaderboard"):
-                    pass
                     with gr.Row():
                         search_1_v1 = gr.Textbox(
                             label="Model Search (delimit with , )",
@@ -444,6 +474,14 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
                             show_label=False,
                             #  info="Which model types to include.",
                         )
                     with gr.Row():
                         # reference data
                         rewardbench_table_hidden_v1 = gr.Dataframe(
@@ -479,9 +517,10 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
                     button_data_v1.click(fn=random_sample_v1, inputs=[subset_selector_v1], outputs=[sample_display_v1])
     search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
-    search_1_v1.change(regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1)
     model_types_1.change(
         regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table

 import gradio as gr
 import numpy as np
+import pandas as pd
 from datasets import load_dataset
 from huggingface_hub import HfApi, snapshot_download
+from leaderboard.constants import example_counts, subset_mapping
 from leaderboard.css import custom_css
 from leaderboard.md import *
 from leaderboard.utils import load_all_data
 #######################################################
+#                     Setup                           #
 #######################################################
 api = HfApi()
 #                 Load Data               #
 ###########################################
 def avg_over_rewardbench_v2(dataframe_core):
     domain_cols = ["factuality", "precise if", "math", "safety", "chat", "ties"]
     domain_weights = [1, 1, 1, 1, 1, 1]
     new_df = new_df[keep_columns]
     # TODO: update domain_cols and comment this out if final dataset version changes names
+    new_df = new_df.rename(
+        columns={
+            "factuality": "Factuality",
+            "precise if": "Precise IF",
+            "math": "Math",
+            "safety": "Safety",
+            "chat": "Focus",
+            "ties": "Ties",
+        }
+    )
     return new_df
 def avg_over_rewardbench(dataframe_core, dataframe_prefs):
     """
     Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
     new_df = new_df[keep_columns]
     return new_df
 def prep_df(df):
     # add column to 0th entry with count (column name itself empty)
     df.insert(0, "", range(1, 1 + len(df)))
     return df
 # get v1 data
+orig_data_path = "leaderboard/final-rbv1-data.csv"
+rb_orig_snapshot = pd.read_csv(orig_data_path)
 # rename column "Unnamed: 0" to ""
 rb_orig_snapshot = rb_orig_snapshot.rename(columns={"Unnamed: 0": ""})
 # rb_orig_snapshot = rb_orig_snapshot.drop(columns=["Unnamed: 0", ''])
 rewardbench_data_avg_intermediate = avg_over_rewardbench_v2(rewardbench_data.copy())
 # Prepare RBv1 scores for merging
+rb_v1_scores_to_merge = rb_orig_snapshot[["Model", "Score"]].copy()
 # strip the " ⚠️" marker from the "Model" names, if present
 rb_v1_scores_to_merge["Model"] = rb_v1_scores_to_merge["Model"].str.replace(" ⚠️", "", regex=False)
+rb_v1_scores_to_merge.rename(columns={"Score": "RBv1"}, inplace=True)
 # rename rb_v1 "Model" to "model"
+rb_v1_scores_to_merge.rename(columns={"Model": "model"}, inplace=True)
 # Merge RBv1 scores into the v2 data
+rewardbench_data_avg = pd.merge(rewardbench_data_avg_intermediate, rb_v1_scores_to_merge, on="model", how="left")
 # Drop any models with only RBv1 scores and no v2 scores
+rewardbench_data_avg = rewardbench_data_avg.dropna(subset=["average"])
 # Sort by the v2 average
 rewardbench_data_avg = rewardbench_data_avg.sort_values(by="average", ascending=False)
 # Ensure RBv1 is the last column if it's not already (merge usually places it at the end of non-key columns)
 # If 'RBv1' is present and not last, move it to be the last column.
+if "RBv1" in rewardbench_data_avg.columns:
+    rbv1_col = rewardbench_data_avg.pop("RBv1")
+    rewardbench_data_avg["RBv1"] = rbv1_col
+# save rewardbench_data_avg as csv to leaderboard/current-rbv2-data.csv (the file offered by the download button)
+v2_data_path = "leaderboard/current-rbv2-data.csv"
+rewardbench_data_avg.to_csv(v2_data_path, index=False)
 col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
 col_types_rewardbench_v1 = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rb_orig_snapshot.columns) - 1)
     markdown_text = "\n\n".join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
     return markdown_text
 # Duplicating because they use global variables with gradio setup
 def random_sample_v1(r: gr.Request, subset):
     if subset is None or subset == []:
     markdown_text = "\n\n".join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
     return markdown_text
 color_map = {
     "Generative": "#7497db",
     "Custom Classifier": "#E8ECF2",
     "DPO": "#75809c",
 }
 def color_model_type_column(df, color_map):
     """
     Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
     return df.style.applymap(apply_color, subset=["Model Type"]).format(format_dict, na_rep="")
 def regex_table(dataframe, regex, filter_button, style=True):
     """
     Takes a model name as a regex, then returns only the rows that contain it.
     return data
 # import ipdb; ipdb.set_trace()
 total_models = len(
                             label="Model Search (delimit with , )",
                             placeholder="Model Search (delimit with , )",
                             show_label=False,
+                            scale=8,
                         )
                         model_types_1 = gr.CheckboxGroup(
                             ["Seq. Classifiers", "Custom Classifiers", "Generative", "RBv1"],
                             value=["Seq. Classifiers", "Custom Classifiers", "Generative"],
                             show_label=False,
+                            scale=8,
+                        )
+                        # narrow, non-expanding download button
+                        gr.DownloadButton(
+                            label="Download CSV",
+                            value=v2_data_path,
+                            size="sm",         # shorter height / padding
+                            scale=0,           # ← **width stays just big enough for the text**
+                            min_width=140,     # (optional) guarantee it doesn’t collapse
                         )
                     with gr.Row():
                         # reference data
                 gr.Markdown(CAPTION_V1.format(str(total_models_v1)))
             with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                 with gr.TabItem("Leaderboard"):
                     with gr.Row():
                         search_1_v1 = gr.Textbox(
                             label="Model Search (delimit with , )",
                             show_label=False,
                             #  info="Which model types to include.",
                         )
+                        # narrow, non-expanding download button
+                        gr.DownloadButton(
+                            label="Download CSV",
+                            value=orig_data_path,
+                            size="sm",         # shorter height / padding
+                            scale=0,           # ← **width stays just big enough for the text**
+                            min_width=140,     # (optional) guarantee it doesn’t collapse
+                        )
                     with gr.Row():
                         # reference data
                         rewardbench_table_hidden_v1 = gr.Dataframe(
                     button_data_v1.click(fn=random_sample_v1, inputs=[subset_selector_v1], outputs=[sample_display_v1])
     search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
+    search_1_v1.change(
+        regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
+    )
     model_types_1.change(
         regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table

leaderboard/md.py CHANGED Viewed

@@ -108,9 +108,11 @@ TOP_TEXT = """# RewardBench: Evaluating Reward Models
 CAPTION_V2 = f"""The *new version* of RewardBench that is based on unseen human data and designed to be substantially more difficult!
-[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench-v2-v0) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-v2-results) | [Paper (TODO)](TODO) | Total models: {{}} | Last restart (PST): {current_time}"""
-CAPTION_V1 = """The original RewardBench -- the first reward model evaluation.
 **Note**: This leaderboard is frozen and will not be updated. The final version of the evaluation results are available in the source for this application.

 CAPTION_V2 = f"""The *new version* of RewardBench that is based on unseen human data and designed to be substantially more difficult!
+[Code](https://github.com/allenai/reward-bench) |  [Eval. Dataset v2](https://huggingface.co/datasets/allenai/reward-bench-v2) | [Results v2](https://huggingface.co/datasets/allenai/reward-bench-v2-results) | [Paper (TODO)](TODO) | Total models: {{}} |  Last restart (PST): {current_time}"""
+CAPTION_V1 = f"""The original RewardBench -- the first reward model evaluation.
+[Code](https://github.com/allenai/reward-bench) |  [Eval. Dataset v1](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results v1](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper v1](https://arxiv.org/abs/2403.13787) | Total models: {{}} | * Unverified models | ⚠️ Dataset Contamination |  Last restart (PST): {current_time}
 **Note**: This leaderboard is frozen and will not be updated. The final version of the evaluation results are available in the source for this application.

leaderboard/retired-app.py CHANGED Viewed

@@ -1,14 +1,14 @@
-import gradio as gr
 import os
-from huggingface_hub import HfApi, snapshot_download
 from apscheduler.schedulers.background import BackgroundScheduler
 from datasets import load_dataset
-from src.utils import load_all_data
-from src.md import ABOUT_TEXT, TOP_TEXT
-from src.plt import plot_avg_correlation
-from src.constants import subset_mapping, length_categories, example_counts
 from src.css import custom_css
-import numpy as np
 api = HfApi()
@@ -18,16 +18,18 @@ evals_repo = "allenai/reward-bench-results"
 eval_set_repo = "allenai/reward-bench"
 repo_dir_rewardbench = "./evals/rewardbench/"
 def restart_space():
     api.restart_space(repo_id="allenai/reward-bench", token=COLLAB_TOKEN)
 print("Pulling evaluation results")
 repo = snapshot_download(
     local_dir=repo_dir_rewardbench,
     ignore_patterns=["pref-sets-scores/*", "eval-set-scores/*"],
     repo_id=evals_repo,
     use_auth_token=COLLAB_TOKEN,
-    tqdm_class=None,
     etag_timeout=30,
     repo_type="dataset",
 )
@@ -50,13 +52,19 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
     # for main subsets, keys in subset_mapping, take the weighted avg by example_counts and store for the models
     for subset, sub_subsets in subset_mapping.items():
         subset_cols = [col for col in new_df.columns if col in sub_subsets]
-        sub_data = new_df[subset_cols].values # take the relevant column values
-        sub_counts = [example_counts[s] for s in subset_cols] # take the example counts
-        new_df[subset] = np.average(sub_data, axis=1, weights=sub_counts) # take the weighted average
         # new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)
     data_cols = list(subset_mapping.keys())
-    keep_columns = ["model",] + ["model_type"] + data_cols
     # keep_columns = ["model", "average"] + subsets
     new_df = new_df[keep_columns]
@@ -78,7 +86,7 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
             # new_df.at[i, "Prior Sets (0.5 weight)"] = dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets (0.5 weight)"].values[0]
         else:
             values.append(np.nan)
     new_df["Prior Sets (0.5 weight)"] = values
     # add total average
@@ -95,6 +103,7 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
     new_df = new_df[keep_columns]
     return new_df
 def expand_subsets(dataframe):
     # TODO need to modify data/ script to do this
     pass
@@ -106,7 +115,7 @@ def length_bias_check(dataframe):
     Then, take the average of the three buckets as "average"
     """
     new_df = dataframe.copy()
-    existing_subsets = new_df.columns[3:] # model, model_type, average
     final_subsets = ["Length Bias", "Neutral", "Terse Bias"]
     # new data is empty list dict for each final subset
     new_data = {s: [] for s in final_subsets}
@@ -135,17 +144,17 @@ def length_bias_check(dataframe):
     return new_df
-rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by='average', ascending=False)
-rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by='Terse Bias', ascending=False)
-prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by='average', ascending=False)
 # prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
-rewardbench_data_avg = avg_over_rewardbench(rewardbench_data, prefs_data).sort_values(by='average', ascending=False)
 def prep_df(df):
     # add column to 0th entry with count (column name itself empty)
-    df.insert(0, '', range(1, 1 + len(df)))
     # replace "model" with "Model" and "model_type" with "Model Type" and "average" with "Average"
     df = df.rename(columns={"model": "Model", "model_type": "Model Type", "average": "Average"})
@@ -154,12 +163,13 @@ def prep_df(df):
     if "Model Type" in df.columns:
         # get model_types that have generative in them
         mask = df["Model Type"].str.contains("generative", case=False, na=False)
         # set these values to "Generative"
         df.loc[mask, "Model Type"] = "Generative"
     return df
 # add count column to all dataframes
 rewardbench_data = prep_df(rewardbench_data)
 rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
@@ -172,18 +182,20 @@ rewardbench_data_length = prep_df(rewardbench_data_length)
 prefs_data = prep_df(prefs_data)
 col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
-col_types_rewardbench_avg = ["number"] + ["markdown"]+ ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
 cols_rewardbench_data_length = ["markdown"] + ["number"] * (len(rewardbench_data_length.columns) - 1)
 col_types_prefs = ["number"] + ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
 # col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)
 # for showing random samples
 eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")
 def random_sample(r: gr.Request, subset):
     if subset is None or subset == []:
         sample_index = np.random.randint(0, len(eval_set) - 1)
         sample = eval_set[sample_index]
-    else: # filter by subsets (can be list)
         if isinstance(subset, str):
             subset = [subset]
         # filter down dataset to only include the subset(s)
@@ -191,9 +203,10 @@ def random_sample(r: gr.Request, subset):
         sample_index = np.random.randint(0, len(eval_set_filtered) - 1)
         sample = eval_set_filtered[sample_index]
-    markdown_text = '\n\n'.join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
     return markdown_text
 subsets = eval_set.unique("subset")
 color_map = {
@@ -202,6 +215,8 @@ color_map = {
     "Seq. Classifier": "#ffcd75",
     "DPO": "#75809c",
 }
 def color_model_type_column(df, color_map):
     """
     Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
@@ -213,17 +228,19 @@ def color_model_type_column(df, color_map):
     Returns:
     pd.Styler: The styled DataFrame.
     """
     # Function to apply color based on the model type
     def apply_color(val):
         color = color_map.get(val, "default")  # Default color if not specified in color_map
-        return f'background-color: {color}'
     # Format for different columns
-    format_dict = {col: "{:.1f}" for col in df.columns if col not in ['Average', 'Model', 'Model Type']}
-    format_dict['Average'] = "{:.2f}"
-    format_dict[''] = "{:d}"
-    return df.style.applymap(apply_color, subset=['Model Type']).format(format_dict, na_rep='')
 def regex_table(dataframe, regex, filter_button, style=True):
     """
@@ -232,18 +249,18 @@ def regex_table(dataframe, regex, filter_button, style=True):
     # Split regex statement by comma and trim whitespace around regexes
     regex_list = [x.strip() for x in regex.split(",")]
     # Join the list into a single regex pattern with '|' acting as OR
-    combined_regex = '|'.join(regex_list)
     # remove internal ai2 data
     dataframe = dataframe[~dataframe["Model"].str.contains("ai2", case=False, na=False)]
     # if filter_button, remove all rows with "ai2" in the model name
     update_scores = False
     if isinstance(filter_button, list) or isinstance(filter_button, str):
-        if "Prior Sets" not in filter_button and 'Prior Sets (0.5 weight)' in dataframe.columns:
             update_scores = True
             # remove the column "Prior Sets (0.5 weight)" from the outputted table
-            dataframe = dataframe.drop(columns=['Prior Sets (0.5 weight)'])
         if "Seq. Classifiers" not in filter_button:
             dataframe = dataframe[~dataframe["Model Type"].str.contains("Seq. Classifier", case=False, na=False)]
         if "DPO" not in filter_button:
@@ -261,12 +278,12 @@ def regex_table(dataframe, regex, filter_button, style=True):
         # if "Prior Sets (0.5 weight)" in data.columns:
         # data["Prior Sets (0.5 weight)"] = np.nan
         # sort array by Score column
-        data = data.sort_values(by='Score', ascending=False)
     data.reset_index(drop=True, inplace=True)
     # replace column '' with count/rank
-    data[''] = np.arange(1, 1 + len(data))
     # if Score exists, round to 2 decimals
     if "Score" in data.columns:
@@ -277,7 +294,7 @@ def regex_table(dataframe, regex, filter_button, style=True):
     for col in data.columns:
         if col not in ["", "Model", "Model Type", "Score", "Average"]:
             # replace any data[col].values == '' with np.nan
-            data[col] = data[col].replace('', np.nan)
             data[col] = np.round(np.array(data[col].values).astype(float), 1)
     if style:
         # apply color
@@ -285,9 +302,14 @@ def regex_table(dataframe, regex, filter_button, style=True):
     return data
 # import ipdb; ipdb.set_trace()
-total_models = len(regex_table(rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"], style=False).values)
 with gr.Blocks(css=custom_css) as app:
     # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
@@ -298,21 +320,26 @@ with gr.Blocks(css=custom_css) as app:
             # search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
             # filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
             # img = gr.Image(value="https://private-user-images.githubusercontent.com/10695622/310698241-24ed272a-0844-451f-b414-fde57478703e.png", width=500)
-            gr.Markdown("""
                         ![](file/src/logo.png)
-                        """)
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏆 RewardBench Leaderboard"):
             with gr.Row():
-                search_1 = gr.Textbox(label="Model Search (delimit with , )",
-                                      placeholder="Model Search (delimit with , )",
-                                      show_label=False)
-                model_types_1 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "Prior Sets"],
-                                                 value=["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
-                                                 label="Model Types",
-                                                 show_label=False,
-                                                #  info="Which model types to include.",
-                                                 )
             with gr.Row():
                 # reference data
                 rewardbench_table_hidden = gr.Dataframe(
@@ -322,22 +349,31 @@ with gr.Blocks(css=custom_css) as app:
                     visible=False,
                 )
                 rewardbench_table = gr.Dataframe(
-                    regex_table(rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers",  "Generative"]),
                     datatype=col_types_rewardbench_avg,
                     headers=rewardbench_data_avg.columns.tolist(),
                     elem_id="rewardbench_dataframe_avg",
                     height=1000,
                 )
         with gr.TabItem("🔍 RewardBench - Detailed"):
             with gr.Row():
-                search_2 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
-                model_types_2 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
-                                                 value=["Seq. Classifiers", "DPO",  "Generative", "Custom Classifiers"],
-                                                 label="Model Types",
-                                                 show_label=False,
-                                                #  info="Which model types to include."
-                                                 )
             with gr.Row():
                 # ref data
                 rewardbench_table_detailed_hidden = gr.Dataframe(
@@ -347,7 +383,9 @@ with gr.Blocks(css=custom_css) as app:
                     visible=False,
                 )
                 rewardbench_table_detailed = gr.Dataframe(
-                    regex_table(rewardbench_data.copy(), "", ["Seq. Classifiers", "DPO",  "Generative", "Custom Classifiers"]),
                     datatype=col_types_rewardbench,
                     headers=rewardbench_data.columns.tolist(),
                     elem_id="rewardbench_dataframe",
@@ -371,13 +409,18 @@ with gr.Blocks(css=custom_css) as app:
         #         )
         with gr.TabItem("Prior Test Sets"):
             with gr.Row():
-                search_3 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
-                model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
-                                                 value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
-                                                 label="Model Types",
-                                                 show_label=False,
-                                                #  info="Which model types to include.",
-                                                 )
             with gr.Row():
                 PREF_SET_TEXT = """
                 For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets). Only the subsets Anthropic Helpful, Anthropic HHH, Stanford SHP, and OpenAI's Summarize data are used in the leaderboard ranking.
@@ -399,7 +442,6 @@ with gr.Blocks(css=custom_css) as app:
                     height=1000,
                 )
         with gr.TabItem("About"):
             with gr.Row():
                 gr.Markdown(ABOUT_TEXT)
@@ -407,8 +449,10 @@ with gr.Blocks(css=custom_css) as app:
         with gr.TabItem("Dataset Viewer"):
             with gr.Row():
                 # loads one sample
-                gr.Markdown("""## Random Dataset Sample Viewer
-Warning, refusals, XSTest, and donotanswer datasets have sensitive content.""")
                 subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
                 button = gr.Button("Show Random Sample")
@@ -423,13 +467,25 @@ Warning, refusals, XSTest, and donotanswer datasets have sensitive content.""")
         #         gr.Plot(plot)
     search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
-    search_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
     # search.change(regex_table, inputs=[rewardbench_table_len_hidden, search, filter_button], outputs=rewardbench_table_len)
-    search_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
-    model_types_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
-    model_types_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
-    model_types_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
     with gr.Row():
         with gr.Accordion("📚 Citation", open=False):
@@ -457,6 +513,6 @@ Warning, refusals, XSTest, and donotanswer datasets have sensitive content.""")
 #     pref_sets_table.update(data_prefs)
 scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=10800) # restarted every 3h
 scheduler.start()
-app.launch(allowed_paths=['src/']) # had .queue() before launch before... not sure if that's necessary

 import os
+import gradio as gr
+import numpy as np
 from apscheduler.schedulers.background import BackgroundScheduler
 from datasets import load_dataset
+from huggingface_hub import HfApi, snapshot_download
+from src.constants import example_counts, length_categories, subset_mapping
 from src.css import custom_css
+from src.md import ABOUT_TEXT, TOP_TEXT
+from src.utils import load_all_data
 api = HfApi()
 eval_set_repo = "allenai/reward-bench"
 repo_dir_rewardbench = "./evals/rewardbench/"
 def restart_space():
     """Restart the ``allenai/reward-bench`` Space via the module-level ``HfApi`` client, authenticating with ``COLLAB_TOKEN``."""
     # Registered with the BackgroundScheduler at the bottom of the file (interval of 10800 seconds).
     api.restart_space(repo_id="allenai/reward-bench", token=COLLAB_TOKEN)
 print("Pulling evaluation results")
 repo = snapshot_download(
     local_dir=repo_dir_rewardbench,
     ignore_patterns=["pref-sets-scores/*", "eval-set-scores/*"],
     repo_id=evals_repo,
     use_auth_token=COLLAB_TOKEN,
+    tqdm_class=None,
     etag_timeout=30,
     repo_type="dataset",
 )
     # for main subsets, keys in subset_mapping, take the weighted avg by example_counts and store for the models
     for subset, sub_subsets in subset_mapping.items():
         subset_cols = [col for col in new_df.columns if col in sub_subsets]
+        sub_data = new_df[subset_cols].values  # take the relevant column values
+        sub_counts = [example_counts[s] for s in subset_cols]  # take the example counts
+        new_df[subset] = np.average(sub_data, axis=1, weights=sub_counts)  # take the weighted average
         # new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)
     data_cols = list(subset_mapping.keys())
+    keep_columns = (
+        [
+            "model",
+        ]
+        + ["model_type"]
+        + data_cols
+    )
     # keep_columns = ["model", "average"] + subsets
     new_df = new_df[keep_columns]
             # new_df.at[i, "Prior Sets (0.5 weight)"] = dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets (0.5 weight)"].values[0]
         else:
             values.append(np.nan)
     new_df["Prior Sets (0.5 weight)"] = values
     # add total average
     new_df = new_df[keep_columns]
     return new_df
 def expand_subsets(dataframe):
     """Placeholder stub: ignores ``dataframe`` and implicitly returns ``None``."""
     # NOTE(review): the only call visible in this view is commented out, so the stub appears unused — confirm before removing.
     # TODO need to modify data/ script to do this
     pass
     Then, take the average of the three buckets as "average"
     """
     new_df = dataframe.copy()
+    existing_subsets = new_df.columns[3:]  # model, model_type, average
     final_subsets = ["Length Bias", "Neutral", "Terse Bias"]
     # new data is empty list dict for each final subset
     new_data = {s: [] for s in final_subsets}
     return new_df
+rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by="average", ascending=False)
+rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by="Terse Bias", ascending=False)
+prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by="average", ascending=False)
 # prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
+rewardbench_data_avg = avg_over_rewardbench(rewardbench_data, prefs_data).sort_values(by="average", ascending=False)
 def prep_df(df):
     """Prepare a leaderboard frame for display: add a rank column, rename headers, collapse generative model types.
     The rank column is inserted into the passed-in ``df`` itself (in place); the value
     returned is the result of ``rename`` with "Model" / "Model Type" / "Average" headers.
     """
     # add column to 0th entry with count (column name itself empty)
     # values are 1..len(df) in the frame's current row order, so callers sort before calling
+    df.insert(0, "", range(1, 1 + len(df)))
     # replace "model" with "Model" and "model_type" with "Model Type" and "average" with "Average"
     df = df.rename(columns={"model": "Model", "model_type": "Model Type", "average": "Average"})
     # frames without a model-type column skip the relabelling step entirely
     if "Model Type" in df.columns:
         # get model_types that have generative in them
         # (case-insensitive substring match; missing values count as no match)
         mask = df["Model Type"].str.contains("generative", case=False, na=False)
         # set these values to "Generative"
         df.loc[mask, "Model Type"] = "Generative"
     return df
 # add count column to all dataframes
 rewardbench_data = prep_df(rewardbench_data)
 rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
 prefs_data = prep_df(prefs_data)
 col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
+col_types_rewardbench_avg = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
 cols_rewardbench_data_length = ["markdown"] + ["number"] * (len(rewardbench_data_length.columns) - 1)
 col_types_prefs = ["number"] + ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
 # col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)
 # for showing random samples
 eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")
 def random_sample(r: gr.Request, subset):
     if subset is None or subset == []:
         sample_index = np.random.randint(0, len(eval_set) - 1)
         sample = eval_set[sample_index]
+    else:  # filter by subsets (can be list)
         if isinstance(subset, str):
             subset = [subset]
         # filter down dataset to only include the subset(s)
         sample_index = np.random.randint(0, len(eval_set_filtered) - 1)
         sample = eval_set_filtered[sample_index]
+    markdown_text = "\n\n".join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
     return markdown_text
 subsets = eval_set.unique("subset")
 color_map = {
     "Seq. Classifier": "#ffcd75",
     "DPO": "#75809c",
 }
 def color_model_type_column(df, color_map):
     """
     Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
     Returns:
     pd.Styler: The styled DataFrame.
     """
     # Function to apply color based on the model type
     def apply_color(val):
         color = color_map.get(val, "default")  # Default color if not specified in color_map
+        return f"background-color: {color}"
     # Format for different columns
+    format_dict = {col: "{:.1f}" for col in df.columns if col not in ["Average", "Model", "Model Type"]}
+    format_dict["Average"] = "{:.2f}"
+    format_dict[""] = "{:d}"
+    return df.style.applymap(apply_color, subset=["Model Type"]).format(format_dict, na_rep="")
 def regex_table(dataframe, regex, filter_button, style=True):
     """
     # Split regex statement by comma and trim whitespace around regexes
     regex_list = [x.strip() for x in regex.split(",")]
     # Join the list into a single regex pattern with '|' acting as OR
+    combined_regex = "|".join(regex_list)
     # remove internal ai2 data
     dataframe = dataframe[~dataframe["Model"].str.contains("ai2", case=False, na=False)]
     # if filter_button, remove all rows with "ai2" in the model name
     update_scores = False
     if isinstance(filter_button, list) or isinstance(filter_button, str):
+        if "Prior Sets" not in filter_button and "Prior Sets (0.5 weight)" in dataframe.columns:
             update_scores = True
             # remove the column "Prior Sets (0.5 weight)" from the outputted table
+            dataframe = dataframe.drop(columns=["Prior Sets (0.5 weight)"])
         if "Seq. Classifiers" not in filter_button:
             dataframe = dataframe[~dataframe["Model Type"].str.contains("Seq. Classifier", case=False, na=False)]
         if "DPO" not in filter_button:
         # if "Prior Sets (0.5 weight)" in data.columns:
         # data["Prior Sets (0.5 weight)"] = np.nan
         # sort array by Score column
+        data = data.sort_values(by="Score", ascending=False)
     data.reset_index(drop=True, inplace=True)
     # replace column '' with count/rank
+    data[""] = np.arange(1, 1 + len(data))
     # if Score exists, round to 2 decimals
     if "Score" in data.columns:
     for col in data.columns:
         if col not in ["", "Model", "Model Type", "Score", "Average"]:
             # replace any data[col].values == '' with np.nan
+            data[col] = data[col].replace("", np.nan)
             data[col] = np.round(np.array(data[col].values).astype(float), 1)
     if style:
         # apply color
     return data
 # import ipdb; ipdb.set_trace()
+total_models = len(
+    regex_table(
+        rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"], style=False
+    ).values
+)
 with gr.Blocks(css=custom_css) as app:
     # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
             # search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
             # filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
             # img = gr.Image(value="https://private-user-images.githubusercontent.com/10695622/310698241-24ed272a-0844-451f-b414-fde57478703e.png", width=500)
+            gr.Markdown(
+                """
                         ![](file/src/logo.png)
+                        """
+            )
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏆 RewardBench Leaderboard"):
             with gr.Row():
+                search_1 = gr.Textbox(
+                    label="Model Search (delimit with , )",
+                    placeholder="Model Search (delimit with , )",
+                    show_label=False,
+                )
+                model_types_1 = gr.CheckboxGroup(
+                    ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "Prior Sets"],
+                    value=["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
+                    label="Model Types",
+                    show_label=False,
+                    #  info="Which model types to include.",
+                )
             with gr.Row():
                 # reference data
                 rewardbench_table_hidden = gr.Dataframe(
                     visible=False,
                 )
                 rewardbench_table = gr.Dataframe(
+                    regex_table(
+                        rewardbench_data_avg.copy(),
+                        "",
+                        ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
+                    ),
                     datatype=col_types_rewardbench_avg,
                     headers=rewardbench_data_avg.columns.tolist(),
                     elem_id="rewardbench_dataframe_avg",
                     height=1000,
                 )
         with gr.TabItem("🔍 RewardBench - Detailed"):
             with gr.Row():
+                search_2 = gr.Textbox(
+                    label="Model Search (delimit with , )",
+                    show_label=False,
+                    placeholder="Model Search (delimit with , )",
+                )
+                model_types_2 = gr.CheckboxGroup(
+                    ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
+                    value=["Seq. Classifiers", "DPO", "Generative", "Custom Classifiers"],
+                    label="Model Types",
+                    show_label=False,
+                    #  info="Which model types to include."
+                )
             with gr.Row():
                 # ref data
                 rewardbench_table_detailed_hidden = gr.Dataframe(
                     visible=False,
                 )
                 rewardbench_table_detailed = gr.Dataframe(
+                    regex_table(
+                        rewardbench_data.copy(), "", ["Seq. Classifiers", "DPO", "Generative", "Custom Classifiers"]
+                    ),
                     datatype=col_types_rewardbench,
                     headers=rewardbench_data.columns.tolist(),
                     elem_id="rewardbench_dataframe",
         #         )
         with gr.TabItem("Prior Test Sets"):
             with gr.Row():
+                search_3 = gr.Textbox(
+                    label="Model Search (delimit with , )",
+                    show_label=False,
+                    placeholder="Model Search (delimit with , )",
+                )
+                model_types_3 = gr.CheckboxGroup(
+                    ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
+                    value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
+                    label="Model Types",
+                    show_label=False,
+                    #  info="Which model types to include.",
+                )
             with gr.Row():
                 PREF_SET_TEXT = """
                 For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets). Only the subsets Anthropic Helpful, Anthropic HHH, Stanford SHP, and OpenAI's Summarize data are used in the leaderboard ranking.
                     height=1000,
                 )
         with gr.TabItem("About"):
             with gr.Row():
                 gr.Markdown(ABOUT_TEXT)
         with gr.TabItem("Dataset Viewer"):
             with gr.Row():
                 # loads one sample
+                gr.Markdown(
+                    """## Random Dataset Sample Viewer
+Warning, refusals, XSTest, and donotanswer datasets have sensitive content."""
+                )
                 subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
                 button = gr.Button("Show Random Sample")
         #         gr.Plot(plot)
     search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
+    search_2.change(
+        regex_table,
+        inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2],
+        outputs=rewardbench_table_detailed,
+    )
     # search.change(regex_table, inputs=[rewardbench_table_len_hidden, search, filter_button], outputs=rewardbench_table_len)
+    search_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
+    model_types_1.change(
+        regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table
+    )
+    model_types_2.change(
+        regex_table,
+        inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2],
+        outputs=rewardbench_table_detailed,
+    )
+    model_types_3.change(
+        regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table
+    )
     with gr.Row():
         with gr.Accordion("📚 Citation", open=False):
 #     pref_sets_table.update(data_prefs)
 scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=10800)  # restarted every 3h
 scheduler.start()
+app.launch(allowed_paths=["src/"])  # had .queue() before launch before... not sure if that's necessary

leaderboard/utils.py CHANGED Viewed

@@ -43,6 +43,7 @@ CONTAMINATED_MODELS_V1 = [
     "Ray2333/GRM-Gemma-2B-rewardmodel-ft",
 ]
 # From Open LLM Leaderboard
 def model_hyperlink(link, model_name):
     # if model_name is above 50 characters, return first 47 characters and "..."

     "Ray2333/GRM-Gemma-2B-rewardmodel-ft",
 ]
 # From Open LLM Leaderboard
 def model_hyperlink(link, model_name):
     # if model_name is above 50 characters, return first 47 characters and "..."