root committed
Commit 99399ee · 1 Parent(s): 6dd5c50
Files changed (6)
  1. src/constants.py +60 -0
  2. src/css.py +22 -0
  3. src/logo.png +0 -0
  4. src/md.py +106 -0
  5. src/plt.py +53 -0
  6. src/utils.py +174 -0
src/constants.py ADDED
@@ -0,0 +1,60 @@
+ # reference for length bias categories
+ length_categories = {
+     'alpacaeval-easy': 'True',
+     'alpacaeval-hard': 'True',
+     'alpacaeval-length': 'Neutral',
+     'donotanswer': 'False',
+     'hep-cpp': 'Neutral',
+     'hep-go': 'Neutral',
+     'hep-java': 'Neutral',
+     'hep-js': 'Neutral',
+     'hep-python': 'Neutral',
+     'hep-rust': 'Neutral',
+     'llmbar-adver-GPTInst': 'False',
+     'llmbar-adver-GPTOut': 'Neutral',
+     'llmbar-adver-manual': 'False',
+     'llmbar-adver-neighbor': 'False',
+     'llmbar-natural': 'Neutral',
+     'math-prm': 'Neutral',
+     'mt-bench-easy': 'False',
+     'mt-bench-hard': 'False',
+     'mt-bench-med': 'Neutral',
+     'refusals-dangerous': 'False',
+     'refusals-offensive': 'False',
+     'xstest-should-refuse': 'False',
+     'xstest-should-respond': 'True'
+ }
+
+ example_counts = {
+     "alpacaeval-easy": 100,
+     "alpacaeval-length": 95,
+     "alpacaeval-hard": 95,
+     "mt-bench-easy": 28,
+     "mt-bench-med": 40,
+     "mt-bench-hard": 37,
+     "math-prm": 984,  # actual length 447, upweighted to be equal to code
+     "refusals-dangerous": 100,
+     "refusals-offensive": 100,
+     "llmbar-natural": 100,
+     "llmbar-adver-neighbor": 134,
+     "llmbar-adver-GPTInst": 92,
+     "llmbar-adver-GPTOut": 47,
+     "llmbar-adver-manual": 46,
+     "xstest-should-refuse": 154,
+     "xstest-should-respond": 250,  # note: refuse and respond were accidentally swapped until 9 Sept 2024
+     "donotanswer": 136,
+     "hep-cpp": 164,
+     "hep-go": 164,
+     "hep-java": 164,
+     "hep-js": 164,
+     "hep-python": 164,
+     "hep-rust": 164
+ }
+
+ # note: this order should match the dataframe
+ subset_mapping = {
+     "Chat": ['alpacaeval-easy', 'alpacaeval-hard', 'alpacaeval-length', 'mt-bench-easy', 'mt-bench-med'],
+     "Chat Hard": ['llmbar-adver-GPTInst', 'llmbar-adver-GPTOut', 'llmbar-adver-manual', 'llmbar-adver-neighbor', 'llmbar-natural', 'mt-bench-hard'],
+     "Safety": ['donotanswer', 'refusals-dangerous', 'refusals-offensive', 'xstest-should-refuse', 'xstest-should-respond'],
+     "Reasoning": ["hep-cpp", "hep-go", "hep-java", "hep-js", "hep-python", "hep-rust", "math-prm"]
+ }
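Taken together with the description in `src/md.py`, these constants drive the per-prompt weighting: each section score is the example-count-weighted mean of its subset accuracies. Below is a minimal sketch of that computation, assuming `src/constants.py` is importable; the `subset_scores` dict and its values are hypothetical, purely for illustration.

```python
from constants import example_counts, subset_mapping  # assumes src/ is on the path

def section_scores(subset_scores):
    """Example-count-weighted mean of subset accuracies, per section."""
    scores = {}
    for section, subsets in subset_mapping.items():
        total = sum(example_counts[s] for s in subsets)
        scores[section] = sum(subset_scores[s] * example_counts[s] for s in subsets) / total
    return scores

# hypothetical per-subset accuracies in [0, 1], for illustration only
subset_scores = {s: 0.8 for subs in subset_mapping.values() for s in subs}
print(section_scores(subset_scores))  # every section comes out to 0.8 here
```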
src/css.py ADDED
@@ -0,0 +1,22 @@
+ custom_css = """
+
+ /* Full width space */
+ .gradio-container {
+     max-width: 95%;
+ }
+
+ /* Text style and margins */
+ .markdown-text {
+     font-size: 17px !important;
+ }
+
+ .tab-buttons button {
+     font-size: 20px;
+ }
+
+ h1 {
+     font-size: 32px !important;
+     margin-top: 0px !important;
+ }
+
+ """
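The app that consumes this stylesheet is not part of this commit; the following is only a minimal sketch of how it would be wired into Gradio, and the `demo` layout shown is hypothetical.

```python
import gradio as gr

from src.css import custom_css

# pass the stylesheet to gr.Blocks so the .markdown-text and h1 rules above apply
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("# RewardBench", elem_classes="markdown-text")

if __name__ == "__main__":
    demo.launch()
```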
src/logo.png ADDED
src/md.py ADDED
@@ -0,0 +1,106 @@
+ from datetime import datetime
+ import pytz
+
+ ABOUT_TEXT = """
+ We compute the win percentage for a reward model on hand-curated chosen-rejected pairs for each prompt.
+ A win is when the score for the chosen response is higher than the score for the rejected response.
+
+ Note: Models with (*) after the model name are independently submitted model scores which have not been verified by the RewardBench team.
+
+ ## Overview
+
+ We average over 4 core sections (per-prompt weighting):
+ 1. **Chat**: Includes the easy chat subsets (alpacaeval-easy, alpacaeval-length, alpacaeval-hard, mt-bench-easy, mt-bench-medium)
+ 2. **Chat Hard**: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
+ 3. **Safety**: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
+ 4. **Reasoning**: Includes the code and math subsets (math-prm, hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
+
+ For Reasoning, we increase the weight of the PRM-Math subset so that code and math abilities are weighed equally in the final number, rather than increasing the relevance of code.
+ We add a final column, **Prior Sets**, which includes the test sets [anthropic_helpful](https://huggingface.co/datasets/Anthropic/hh-rlhf), [anthropic_hhh](https://huggingface.co/datasets/HuggingFaceH4/hhh_alignment), [shp](https://huggingface.co/datasets/stanfordnlp/SHP), and [summarize](https://huggingface.co/datasets/openai/summarize_from_feedback).
+ Prior Sets is weighted 0.5x in the final score to avoid gamification by training on the available training sets of Anthropic HH, SHP, and Summarize.
+
+ Once all subsets' weighted averages are computed, the final RewardBench score is the average across the 5 section scores (the 4 core sections plus Prior Sets).
+
+
+ We include multiple types of reward models in this evaluation:
+ 1. **Sequence Classifiers** (Seq. Classifier): A model, normally trained with HuggingFace AutoModelForSequenceClassification, that takes in a prompt and a response and outputs a score.
+ 2. **Custom Classifiers**: Research models with different architectures and training objectives that either take in two inputs at once or generate scores differently (e.g. PairRM and Stanford SteamSHP).
+ 3. **DPO**: Models trained with Direct Preference Optimization (DPO), with modifiers such as `-ref-free` or `-norm` changing how scores are computed. *Note*: This also includes other models trained with implicit rewards, such as those trained with [KTO](https://arxiv.org/abs/2402.01306).
+ 4. **Random**: Random choice baseline.
+ 5. **Generative**: Prompting fine-tuned models to choose between two answers, similar to MT Bench and AlpacaEval.
+
+ All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
+ *Note*: The reference models for DPO models (and other implicit rewards) can be found in two ways.
+ * Click on a specific model in the results and you'll see a key `ref_model`, e.g. [Qwen](https://huggingface.co/datasets/allenai/reward-bench-results/blob/main/eval-set/Qwen/Qwen1.5-72B-Chat.json).
+ * All the reference models are listed in the [evaluation configs](https://github.com/allenai/reward-bench/blob/main/scripts/configs/eval_configs.yaml).
+
+
+ ### Subset Details
+
+ The total number of prompts is 2985, filtered from 5123.
+
+ | Subset | Num. Samples (Pre-filtering, post-filtering) | Description |
+ | :---------- | :-----: | :---------: |
+ | alpacaeval-easy | 805, 100 | Great model vs poor model |
+ | alpacaeval-length | 805, 95 | Good model vs low model, equal length |
+ | alpacaeval-hard | 805, 95 | Great model vs baseline model |
+ | mt-bench-easy | 28, 28 | MT Bench 10s vs 1s |
+ | mt-bench-medium | 45, 40 | MT Bench 9s vs 2-5s |
+ | mt-bench-hard | 45, 37 | MT Bench 7-8s vs 5-6s |
+ | refusals-dangerous | 505, 100 | Dangerous response vs no response |
+ | refusals-offensive | 704, 100 | Offensive response vs no response |
+ | llmbar-natural | 100 | (See [paper](https://arxiv.org/abs/2310.07641)) Manually curated instruction pairs |
+ | llmbar-adver-neighbor | 134 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. off-topic prompt response |
+ | llmbar-adver-GPTInst | 92 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. GPT4-generated off-topic prompt response |
+ | llmbar-adver-GPTOut | 47 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. unhelpful-prompted GPT4 responses |
+ | llmbar-adver-manual | 46 | (See [paper](https://arxiv.org/abs/2310.07641)) Challenge set chosen vs. rejected |
+ | xstest-should-refuse | 450, 154 | False response dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
+ | xstest-should-respond | 450, 250 | False refusal dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
+ | do not answer | 939, 136 | [Prompts which responsible LLMs do not answer](https://huggingface.co/datasets/LibrAI/do-not-answer) |
+ | math-prm | 447 | Human references vs. model error from OpenAI's Let's Verify Step by Step |
+ | hep-cpp | 164 | C++ code revisions (See [dataset](https://huggingface.co/datasets/bigcode/humanevalpack) or [paper](https://arxiv.org/abs/2308.07124)) |
+ | hep-go | 164 | Go code |
+ | hep-java | 164 | Java code |
+ | hep-js | 164 | Javascript code |
+ | hep-python | 164 | Python code |
+ | hep-rust | 164 | Rust code |
+
+ Lengths (mean, std. dev.) include the prompt.
+
+ | subset | length bias | chosen_chars | rejected_chars | chosen_tokens | rejected_tokens | chosen_unique_tokens | rejected_unique_tokens |
+ |-----------------------|-------------|----------------|------------------|-----------------|-------------------|------------------------|--------------------------|
+ | alpacaeval-easy | True | 2283 (1138) | 646 (482) | 591 (303) | 167 (139) | 253 (117) | 83 (46) |
+ | alpacaeval-hard | True | 1590 (769) | 526 (430) | 412 (199) | 137 (117) | 173 (67) | 71 (48) |
+ | alpacaeval-length | Neutral | 2001 (1137) | 2127 (1787) | 511 (283) | 597 (530) | 192 (85) | 189 (99) |
+ | donotanswer | False | 755 (722) | 1389 (695) | 170 (161) | 320 (164) | 104 (82) | 157 (73) |
+ | hep-cpp | Neutral | 709 (341) | 705 (342) | 261 (125) | 259 (125) | 100 (29) | 99 (29) |
+ | hep-go | Neutral | 738 (361) | 734 (361) | 266 (118) | 265 (118) | 100 (29) | 99 (29) |
+ | hep-java | Neutral | 821 (393) | 814 (390) | 263 (123) | 261 (122) | 102 (30) | 102 (30) |
+ | hep-js | Neutral | 677 (341) | 673 (339) | 251 (129) | 250 (128) | 93 (29) | 93 (29) |
+ | hep-python | Neutral | 618 (301) | 616 (300) | 212 (98) | 211 (98) | 86 (26) | 85 (26) |
+ | hep-rust | Neutral | 666 (391) | 660 (391) | 221 (132) | 219 (132) | 95 (29) | 95 (29) |
+ | llmbar-adver-GPTInst | False | 735 (578) | 1623 (1055) | 170 (135) | 377 (245) | 93 (59) | 179 (106) |
+ | llmbar-adver-GPTOut | Neutral | 378 (339) | 359 (319) | 96 (81) | 101 (94) | 60 (45) | 55 (41) |
+ | llmbar-adver-manual | False | 666 (584) | 1139 (866) | 160 (134) | 264 (194) | 92 (63) | 140 (90) |
+ | llmbar-adver-neighbor | False | 287 (297) | 712 (749) | 70 (76) | 173 (175) | 43 (31) | 91 (70) |
+ | llmbar-natural | Neutral | 553 (644) | 530 (597) | 139 (162) | 130 (140) | 75 (71) | 70 (62) |
+ | mt-bench-easy | False | 1563 (720) | 2129 (1520) | 377 (159) | 551 (415) | 166 (55) | 116 (62) |
+ | mt-bench-hard | False | 1225 (499) | 1471 (1016) | 284 (116) | 349 (234) | 131 (45) | 136 (58) |
+ | mt-bench-med | Neutral | 1558 (729) | 1733 (1312) | 377 (170) | 410 (311) | 162 (58) | 145 (88) |
+ | refusals-dangerous | False | 597 (81) | 1828 (547) | 131 (20) | 459 (136) | 90 (12) | 211 (50) |
+ | refusals-offensive | False | 365 (116) | 1092 (1146) | 82 (25) | 299 (278) | 64 (15) | 134 (101) |
+ | xstest-should-refuse | False | 584 (419) | 904 (493) | 129 (89) | 217 (115) | 81 (47) | 116 (53) |
+ | xstest-should-respond | True | 771 (420) | 466 (427) | 189 (105) | 107 (94) | 104 (48) | 67 (48) |
+
+ For more details, see the [dataset](https://huggingface.co/datasets/allenai/reward-bench).
+ """
+
+ # Get the Pacific time zone (handles PST/PDT automatically)
+ pacific_tz = pytz.timezone('America/Los_Angeles')
+ current_time = datetime.now(pacific_tz).strftime("%H:%M %Z, %d %b %Y")
+
+ TOP_TEXT = f"""# RewardBench: Evaluating Reward Models
+ ### Evaluating the capabilities, safety, and pitfalls of reward models
+ [Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787) | Total models: {{}} | * Unverified models | ⚠️ Dataset Contamination | Last restart (PST): {current_time}
+
+ ⚠️ Many of the top models were trained on unintentionally contaminated, AI-generated data; for more information, see this [gist](https://gist.github.com/natolambert/1aed306000c13e0e8c5bc17c1a5dd300)."""
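ABOUT_TEXT describes the final score as the average over the 4 core sections plus a 0.5x-weighted Prior Sets column, but the exact arithmetic lives in the app code rather than this commit. The following is only a hedged sketch of one consistent reading of that description; the function name and the section scores passed in are made up for illustration.

```python
def final_rewardbench_score(chat, chat_hard, safety, reasoning, prior_sets):
    # core sections count fully, Prior Sets at half weight (assumed reading of ABOUT_TEXT)
    weighted_sum = chat + chat_hard + safety + reasoning + 0.5 * prior_sets
    return weighted_sum / 4.5

# illustrative numbers only
print(final_rewardbench_score(95.0, 60.0, 85.0, 88.0, 70.0))  # ~80.7
```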
src/plt.py ADDED
@@ -0,0 +1,53 @@
+ import matplotlib.pyplot as plt
+ import pandas as pd
+ from .utils import undo_hyperlink
+
+ def plot_avg_correlation(df1, df2):
+     """
+     Plots the "average" column for each unique model that appears in both dataframes.
+
+     Parameters:
+     - df1: pandas DataFrame containing columns "model" and "average".
+     - df2: pandas DataFrame containing columns "model" and "average".
+     """
+     # Identify the unique models that appear in both DataFrames
+     common_models = pd.Series(list(set(df1['model']) & set(df2['model'])))
+
+     # Set up the plot
+     plt.figure(figsize=(13, 6), constrained_layout=True)
+
+     # fix the x and y axis ranges
+     plt.xlim(0.475, 0.8)
+     plt.ylim(0.475, 0.8)
+
+     # larger fonts
+     plt.rcParams.update({'font.size': 12, 'axes.labelsize': 14, 'axes.titlesize': 14})
+     # plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)
+     # plt.tight_layout()
+     # plt.margins(0,0)
+
+     for model in common_models:
+         # Filter data for the current model
+         df1_model_data = df1[df1['model'] == model]['average'].values
+         df2_model_data = df2[df2['model'] == model]['average'].values
+
+         # Plotting
+         plt.scatter(df1_model_data, df2_model_data, label=model)
+         m_name = undo_hyperlink(model)
+         if m_name == "No text found":
+             m_name = "Random"
+         # Add a label next to each point, e.g.
+         # plt.text(x[i] + 0.1, y[i] + 0.1, label, ha='left', va='bottom')
+         plt.text(df1_model_data - .005, df2_model_data, m_name, horizontalalignment='right', verticalalignment='center')
+
+     # compute the correlation between the two averages
+     # (a fit line based on corr could be added to the scatter plot here)
+     corr = df1['average'].corr(df2['average'])
+
+     plt.xlabel('HERM Eval. Set Avg.', fontsize=16)
+     plt.ylabel('Pref. Test Sets Avg.', fontsize=16)
+     # plt.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
+     return plt
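A small usage sketch for `plot_avg_correlation`. The toy frames below are invented, and their averages are kept inside the hard-coded 0.475-0.8 axis range so the points are visible; the model cells mimic the hyperlinked strings produced by `utils.model_hyperlink`.

```python
import pandas as pd

from src.plt import plot_avg_correlation

# both frames need "model" and "average" columns (illustrative values only)
df_eval = pd.DataFrame({"model": ["<a href='#'>model-a</a>", "<a href='#'>model-b</a>"],
                        "average": [0.61, 0.74]})
df_prior = pd.DataFrame({"model": ["<a href='#'>model-a</a>", "<a href='#'>model-b</a>"],
                         "average": [0.58, 0.70]})

fig = plot_avg_correlation(df_eval, df_prior)  # returns the pyplot module
fig.savefig("avg_correlation.png", dpi=150)
```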
src/utils.py ADDED
@@ -0,0 +1,174 @@
+ import pandas as pd
+ from pathlib import Path
+ from datasets import load_dataset
+ import numpy as np
+ import os
+ import re
+
+ UNVERIFIED_MODELS = [
+     "nvidia/Nemotron-4-340B-Reward",
+     "nvidia/Llama3-70B-SteerLM-RM",
+     "Cohere May 2024",
+     "google/gemini-1.5-pro-0514",
+     "google/flame-24b-july-2024",
+     "Cohere March 2024",
+     "facebook/Self-taught-Llama-3-70B",
+     "facebook/Self-taught-evaluator-llama3.1-70B",
+     "google/flame-1.0-24B-july-2024",
+     "Salesforce/SFR-LLaMa-3.1-70B-Judge-r",
+     "Salesforce/SFR-nemo-12B-Judge-r",
+     "Salesforce/SFR-LLaMa-3.1-8B-Judge-r",
+     "SF-Foundation/TextEval-OffsetBias-12B",
+     "SF-Foundation/TextEval-Llama3.1-70B",
+     "nvidia/Llama-3.1-Nemotron-70B-Reward",
+ ]
+
+ CONTAMINATED_MODELS = [
+     "Skywork/Skywork-Reward-Gemma-2-27B",
+     "Skywork/Skywork-Critic-Llama-3.1-70B",
+     "LxzGordon/URM-LLaMa-3.1-8B",
+     "Skywork/Skywork-Reward-Llama-3.1-8B",
+     "Ray2333/GRM-Llama3-8B-rewardmodel-ft",
+     "nicolinho/QRM-Llama3.1-8B",
+     "nicolinho/QRM-Llama3-8B",
+     "general-preference/GPM-Llama-3.1-8B",
+     "SF-Foundation/TextEval-Llama3.1-70B",
+     "ZiyiYe/Con-J-Qwen2-7B",
+     "Ray2333/Gemma-2B-rewardmodel-ft",
+     "Ray2333/GRM-Gemma-2B-rewardmodel-ft"
+ ]
+
+ # From Open LLM Leaderboard
+ def model_hyperlink(link, model_name):
+     # if model_name is above 50 characters, keep the first 47 characters and append "..."
+     if len(model_name) > 50:
+         model_name = model_name[:47] + "..."
+     if model_name == "random":
+         output = "random"
+     elif model_name == "Cohere March 2024":
+         output = f'<a target="_blank" href="https://huggingface.co/Cohere" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+     elif "openai" == model_name.split("/")[0]:
+         output = f'<a target="_blank" href="https://huggingface.co/openai" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+     elif "Anthropic" == model_name.split("/")[0]:
+         output = f'<a target="_blank" href="https://huggingface.co/Anthropic" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+     elif "google" == model_name.split("/")[0]:
+         output = f'<a target="_blank" href="https://huggingface.co/google" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+     elif "PoLL" == model_name.split("/")[0]:
+         output = model_name
+     else:
+         output = f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+     if model_name in UNVERIFIED_MODELS:
+         output += " *"
+     if model_name in CONTAMINATED_MODELS:
+         output += " ⚠️"
+     return output
+
+ def undo_hyperlink(html_string):
+     # Regex pattern to match content inside > and <
+     pattern = r'>[^<]+<'
+     match = re.search(pattern, html_string)
+     if match:
+         # Extract the matched text and remove the leading '>' and trailing '<'
+         return match.group(0)[1:-1]
+     else:
+         return "No text found"
+
+
+ # Define a function to fetch and process data
+ def load_all_data(data_repo, subdir: str, subsubsets=False):  # use HF api to pull the git repo
+     dir = Path(data_repo)
+     data_dir = dir / subdir
+     orgs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
+     # get all files within the org sub folders
+     models_results = []
+     for org in orgs:
+         org_dir = data_dir / org
+         files = [f for f in os.listdir(org_dir) if os.path.isfile(os.path.join(org_dir, f))]
+         for file in files:
+             if file.endswith(".json"):
+                 models_results.append(org + "/" + file)
+
+     # create empty dataframe to add all data to
+     df = pd.DataFrame()
+
+     # load all json data in the list models_results one by one to avoid not having the same entries
+     for model in models_results:
+         model_data = load_dataset("json", data_files=data_repo + subdir + "/" + model, split="train")
+         df2 = pd.DataFrame(model_data)
+         # add to df
+         df = pd.concat([df2, df])
+
+     # remove chat_template column
+     df = df.drop(columns=["chat_template"])
+
+     # sort columns alphabetically
+     df = df.reindex(sorted(df.columns), axis=1)
+
+     # move column "model" to the front
+     cols = list(df.columns)
+     cols.insert(0, cols.pop(cols.index('model')))
+     df = df.loc[:, cols]
+
+     # select all columns except "model"
+     cols = df.columns.tolist()
+     cols.remove("model")
+     # if model_type is a column (pref tests may not have it)
+     if "model_type" in cols:
+         cols.remove("model_type")
+     # remove ref_model if in columns
+     if "ref_model" in cols:
+         cols.remove("ref_model")
+     # remove model_beaker from the dataframe
+     if "model_beaker" in cols:
+         cols.remove("model_beaker")
+         df = df.drop(columns=["model_beaker"])
+
+     # remove column xstest (outdated data)
+     if "xstest" in cols:
+         df = df.drop(columns=["xstest"])
+         cols.remove("xstest")
+
+     if "ref_model" in df.columns:
+         df = df.drop(columns=["ref_model"])
+
+     # remove columns anthropic and summarize_prompted (outdated data)
+     if "anthropic" in cols:
+         df = df.drop(columns=["anthropic"])
+         cols.remove("anthropic")
+     if "summarize_prompted" in cols:
+         df = df.drop(columns=["summarize_prompted"])
+         cols.remove("summarize_prompted")
+     # remove pku_better and pku_safer (removed from the leaderboard)
+     if "pku_better" in cols:
+         df = df.drop(columns=["pku_better"])
+         cols.remove("pku_better")
+     if "pku_safer" in cols:
+         df = df.drop(columns=["pku_safer"])
+         cols.remove("pku_safer")
+
+     # convert to score (percent)
+     df[cols] = df[cols] * 100
+     avg = np.nanmean(df[cols].values, axis=1)
+     # add average column
+     df["average"] = avg
+
+     # apply the model_hyperlink function to column "model"
+     df["model"] = df["model"].apply(lambda x: model_hyperlink(f"https://huggingface.co/{x}", x))
+
+     # move the average column to the second position
+     cols = list(df.columns)
+     cols.insert(1, cols.pop(cols.index('average')))
+     df = df.loc[:, cols]
+
+     # move the model_type column to the second position (if present)
+     if "model_type" in cols:
+         cols = list(df.columns)
+         cols.insert(1, cols.pop(cols.index('model_type')))
+         df = df.loc[:, cols]
+
+     # remove models with DPO Ref. Free as type (future work)
+     df = df[~df["model_type"].str.contains("DPO Ref. Free", na=False)]
+
+     return df
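A hedged sketch of how `load_all_data` would be fed: the results repo is the one linked from `src/md.py`, and the `eval-set` subdirectory appears in that file's example link, but the snapshot step and paths below are assumptions since the app entry point is not part of this commit.

```python
from huggingface_hub import snapshot_download

from src.utils import load_all_data

# pull the results dataset locally, then build the leaderboard dataframe;
# load_all_data expects data_repo with a trailing slash so the json paths join cleanly
snapshot_download(
    repo_id="allenai/reward-bench-results",
    repo_type="dataset",
    local_dir="./reward-bench-results/",
)

df = load_all_data("./reward-bench-results/", subdir="eval-set")
print(df[["model", "average"]].head())
```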