natolambert committed on
Commit
ab74236
·
1 Parent(s): 56fcfaf
Files changed (4) hide show
  1. README.md +2 -2
  2. app.py +21 -20
  3. src/md.py +62 -25
  4. src/utils.py +10 -4
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
- title: Rm Benchmark Viewer
3
- emoji: 😻
4
  colorFrom: pink
5
  colorTo: blue
6
  sdk: gradio
 
1
  ---
2
+ title: HERM Leaderboard
3
+ emoji: 📐
4
  colorFrom: pink
5
  colorTo: blue
6
  sdk: gradio
app.py CHANGED
@@ -4,17 +4,16 @@ from huggingface_hub import HfApi, snapshot_download
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
  from datasets import load_dataset
6
  from src.utils import load_all_data
7
- from src.md import ABOUT_TEXT
8
  import numpy as np
9
 
10
  api = HfApi()
11
 
12
  COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
13
- evals_repo = "ai2-adapt-dev/rm-benchmark-results"
14
- prefs_repo = "ai2-adapt-dev/rm-testset-results"
15
  eval_set_repo = "ai2-adapt-dev/rm-benchmark-dev"
16
  repo_dir_herm = "./evals/herm/"
17
- repo_dir_prefs = "./evals/prefs/"
18
 
19
  def restart_space():
20
  api.restart_space(repo_id="ai2-adapt-dev/rm-benchmark-viewer", token=COLLAB_TOKEN)
@@ -29,14 +28,6 @@ repo = snapshot_download(
29
  repo_type="dataset",
30
  )
31
 
32
- repo_pref_sets = snapshot_download(
33
- local_dir=repo_dir_prefs,
34
- repo_id=prefs_repo,
35
- use_auth_token=COLLAB_TOKEN,
36
- tqdm_class=None,
37
- etag_timeout=30,
38
- repo_type="dataset",
39
- )
40
 
41
  def avg_over_herm(dataframe):
42
  """
@@ -126,10 +117,10 @@ def length_bias_check(dataframe):
126
 
127
 
128
 
129
- herm_data = load_all_data(repo_dir_herm).sort_values(by='average', ascending=False)
130
  herm_data_avg = avg_over_herm(herm_data).sort_values(by='average', ascending=False)
131
  herm_data_length = length_bias_check(herm_data).sort_values(by='Terse Bias', ascending=False)
132
- prefs_data = load_all_data(repo_dir_prefs).sort_values(by='average', ascending=False)
133
  # prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
134
 
135
  col_types_herm = ["markdown"] + ["number"] * (len(herm_data.columns) - 1)
@@ -152,7 +143,7 @@ def random_sample(r: gr.Request, subset):
152
  sample_index = np.random.randint(0, len(eval_set_filtered) - 1)
153
  sample = eval_set_filtered[sample_index]
154
 
155
- markdown_text = '\n\n'.join([f"**{key}**:\n{value}" for key, value in sample.items()])
156
  return markdown_text
157
 
158
  subsets = eval_set.unique("subset")
@@ -160,38 +151,48 @@ subsets = eval_set.unique("subset")
160
  with gr.Blocks() as app:
161
  # create tabs for the app, moving the current table to one titled "HERM" and the benchmark_text to a tab called "About"
162
  with gr.Row():
163
- gr.Markdown("# HERM Results Viewer")
164
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
165
- with gr.TabItem("HERM - Overview"):
166
  with gr.Row():
167
  herm_table = gr.Dataframe(
168
  herm_data_avg.values,
169
  datatype=col_types_herm_avg,
170
  headers=herm_data_avg.columns.tolist(),
171
  elem_id="herm_dataframe_avg",
 
172
  )
173
- with gr.TabItem("HERM - Detailed"):
174
  with gr.Row():
175
  herm_table = gr.Dataframe(
176
  herm_data.values,
177
  datatype=col_types_herm,
178
  headers=herm_data.columns.tolist(),
179
  elem_id="herm_dataframe",
 
180
  )
181
- with gr.TabItem("HERM - Length Bias"):
182
  with gr.Row():
183
  herm_table = gr.Dataframe(
184
  herm_data_length.values,
185
  datatype=cols_herm_data_length,
186
  headers=herm_data_length.columns.tolist(),
187
  elem_id="herm_dataframe_length",
 
188
  )
189
- with gr.TabItem("Pref Sets - Overview"):
 
 
 
 
 
 
190
  pref_sets_table = gr.Dataframe(
191
  prefs_data.values,
192
  datatype=col_types_prefs,
193
  headers=prefs_data.columns.tolist(),
194
  elem_id="prefs_dataframe",
 
195
  )
196
 
197
  with gr.TabItem("About"):
 
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
  from datasets import load_dataset
6
  from src.utils import load_all_data
7
+ from src.md import ABOUT_TEXT, TOP_TEXT
8
  import numpy as np
9
 
10
  api = HfApi()
11
 
12
  COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
13
+ evals_repo = "ai2-adapt-dev/HERM-Results"
14
+
15
  eval_set_repo = "ai2-adapt-dev/rm-benchmark-dev"
16
  repo_dir_herm = "./evals/herm/"
 
17
 
18
  def restart_space():
19
  api.restart_space(repo_id="ai2-adapt-dev/rm-benchmark-viewer", token=COLLAB_TOKEN)
 
28
  repo_type="dataset",
29
  )
30
 
 
 
 
 
 
 
 
 
31
 
32
  def avg_over_herm(dataframe):
33
  """
 
117
 
118
 
119
 
120
+ herm_data = load_all_data(repo_dir_herm, subdir="eval-set").sort_values(by='average', ascending=False)
121
  herm_data_avg = avg_over_herm(herm_data).sort_values(by='average', ascending=False)
122
  herm_data_length = length_bias_check(herm_data).sort_values(by='Terse Bias', ascending=False)
123
+ prefs_data = load_all_data(repo_dir_herm, subdir="pref-sets").sort_values(by='average', ascending=False)
124
  # prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
125
 
126
  col_types_herm = ["markdown"] + ["number"] * (len(herm_data.columns) - 1)
 
143
  sample_index = np.random.randint(0, len(eval_set_filtered) - 1)
144
  sample = eval_set_filtered[sample_index]
145
 
146
+ markdown_text = '\n\n'.join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
147
  return markdown_text
148
 
149
  subsets = eval_set.unique("subset")
 
151
  with gr.Blocks() as app:
152
  # create tabs for the app, moving the current table to one titled "HERM" and the benchmark_text to a tab called "About"
153
  with gr.Row():
154
+ gr.Markdown(TOP_TEXT)
155
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
156
+ with gr.TabItem("HERM Eval Set - Overview"):
157
  with gr.Row():
158
  herm_table = gr.Dataframe(
159
  herm_data_avg.values,
160
  datatype=col_types_herm_avg,
161
  headers=herm_data_avg.columns.tolist(),
162
  elem_id="herm_dataframe_avg",
163
+ height=1000,
164
  )
165
+ with gr.TabItem("HERM Eval Set - Detailed"):
166
  with gr.Row():
167
  herm_table = gr.Dataframe(
168
  herm_data.values,
169
  datatype=col_types_herm,
170
  headers=herm_data.columns.tolist(),
171
  elem_id="herm_dataframe",
172
+ height=1000,
173
  )
174
+ with gr.TabItem("HERM Eval Set - Length Bias"):
175
  with gr.Row():
176
  herm_table = gr.Dataframe(
177
  herm_data_length.values,
178
  datatype=cols_herm_data_length,
179
  headers=herm_data_length.columns.tolist(),
180
  elem_id="herm_dataframe_length",
181
+ height=1000,
182
  )
183
+ with gr.TabItem("Known Pref. Sets"):
184
+ with gr.Row():
185
+ PREF_SET_TEXT = """
186
+ For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets).
187
+ """
188
+ gr.Markdown(PREF_SET_TEXT)
189
+ with gr.Row():
190
  pref_sets_table = gr.Dataframe(
191
  prefs_data.values,
192
  datatype=col_types_prefs,
193
  headers=prefs_data.columns.tolist(),
194
  elem_id="prefs_dataframe",
195
+ height=1000,
196
  )
197
 
198
  with gr.TabItem("About"):
src/md.py CHANGED
@@ -2,32 +2,69 @@ ABOUT_TEXT = """
2
  We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
3
  A win is when the score for the chosen response is higher than the score for the rejected response.
4
 
5
- ### Subset summary
6
-
7
- | Subset | Num. Samples (Pre-filtering, post-filtering) | Description |
8
- | :--------------------- | :------------------------------------------: | :---------------------------------------------------------------- |
9
- | alpacaeval-easy | 805, 100 | Great model vs poor model |
10
- | alpacaeval-length | 805, 95 | Good model vs low model, equal length |
11
- | alpacaeval-hard | 805, 95 | Great model vs baseline model |
12
- | mt-bench-easy | 28, 28 | MT Bench 10s vs 1s |
13
- | mt-bench-medium | 45, 40 | MT Bench 9s vs 2-5s |
14
- | mt-bench-hard | 45, 37 | MT Bench 7-8 vs 5-6 |
15
- | refusals-dangerous | 505, 100 | Dangerous response vs no response |
16
- | refusals-offensive | 704, 100 | Offensive response vs no response |
17
- | llmbar-natural | 100 | (See [paper](https://arxiv.org/abs/2310.07641)) Manually curated instruction pairs |
18
- | llmbar-adver-neighbor | 134 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. off-topic prompt response |
19
- | llmbar-adver-GPTInst | 92 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. GPT4 generated off-topic prompt response |
20
- | llmbar-adver-GPTOut | 47 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. unhelpful-prompted GPT4 responses |
21
- | llmbar-adver-manual | 46 | (See [paper](https://arxiv.org/abs/2310.07641)) Challenge set chosen vs. rejected |
22
- | XSTest | 450, 404 | False refusal dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
23
- | do not answer | 939, 136 | [Prompts which responsible LLMs do not answer](https://huggingface.co/datasets/LibrAI/do-not-answer) |
24
- | hep-cpp | 164 | C++ code revisions (See [dataset](https://huggingface.co/datasets/bigcode/humanevalpack) or [paper](https://arxiv.org/abs/2308.07124)) |
25
- | hep-go | 164 | Go code |
26
- | hep-java | 164 | Java code |
27
- | hep-js | 164 | Javascript code |
28
- | hep-python | 164 | Python code |
29
- | hep-rust | 164 | Rust code |
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  For more details, see the [dataset](https://huggingface.co/datasets/ai2-rlhf-collab/rm-benchmark-dev).
33
  """
 
 
 
 
 
 
 
 
 
2
  We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
3
  A win is when the score for the chosen response is higher than the score for the rejected response.
4
 
5
+ ## Subset Summary
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
+ Total number of prompts: 2538, filtered from 4676.
8
+
9
+ | Subset | Num. Samples (Pre-filtering, post-filtering) | Description |
10
+ | :---------- | :-----: | :---------: |
11
+ | alpacaeval-easy | 805, 100 | Great model vs poor model |
12
+ | alpacaeval-length | 805, 95 | Good model vs low model, equal length |
13
+ | alpacaeval-hard | 805, 95 | Great model vs baseline model |
14
+ | mt-bench-easy | 28, 28 | MT Bench 10s vs 1s |
15
+ | mt-bench-medium | 45, 40 | MT Bench 9s vs 2-5s |
16
+ | mt-bench-hard | 45, 37 | MT Bench 7-8 vs 5-6 |
17
+ | refusals-dangerous | 505, 100 | Dangerous response vs no response |
18
+ | refusals-offensive | 704, 100 | Offensive response vs no response |
19
+ | llmbar-natural | 100 | (See [paper](https://arxiv.org/abs/2310.07641)) Manually curated instruction pairs |
20
+ | llmbar-adver-neighbor | 134 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. off-topic prompt response |
21
+ | llmbar-adver-GPTInst | 92 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. GPT4 generated off-topic prompt response |
22
+ | llmbar-adver-GPTOut | 47 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. unhelpful-prompted GPT4 responses |
23
+ | llmbar-adver-manual | 46 | (See [paper](https://arxiv.org/abs/2310.07641)) Challenge set chosen vs. rejected |
24
+ | xstest-should-refuse | 450, 250 | False response dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
25
+ | xstest-should-respond | 450, 154 | False refusal dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
26
+ | do not answer | 939, 136 | [Prompts which responsible LLMs do not answer](https://huggingface.co/datasets/LibrAI/do-not-answer) |
27
+ | hep-cpp | 164 | C++ code revisions (See [dataset](https://huggingface.co/datasets/bigcode/humanevalpack) or [paper](https://arxiv.org/abs/2308.07124)) |
28
+ | hep-go | 164 | Go code |
29
+ | hep-java | 164 | Java code |
30
+ | hep-js | 164 | Javascript code |
31
+ | hep-python | 164 | Python code |
32
+ | hep-rust | 164 | Rust code |
33
+
34
+ Lengths (mean, std. dev.) include the prompt
35
+
36
+ | subset | length bias | chosen_chars | rejected_chars | chosen_tokens | rejected_tokens | chosen_unique_tokens | rejected_unique_tokens |
37
+ |-----------------------|-------------|----------------|------------------|-----------------|-------------------|------------------------|--------------------------|
38
+ | alpacaeval-easy | True | 2283 (1138) | 646 (482) | 591 (303) | 167 (139) | 253 (117) | 83 (46) |
39
+ | alpacaeval-hard | True | 1590 (769) | 526 (430) | 412 (199) | 137 (117) | 173 (67) | 71 (48) |
40
+ | alpacaeval-length | Neutral | 2001 (1137) | 2127 (1787) | 511 (283) | 597 (530) | 192 (85) | 189 (99) |
41
+ | donotanswer | False | 755 (722) | 1389 (695) | 170 (161) | 320 (164) | 104 (82) | 157 (73) |
42
+ | hep-cpp | Neutral | 709 (341) | 705 (342) | 261 (125) | 259 (125) | 100 (29) | 99 (29) |
43
+ | hep-go | Neutral | 738 (361) | 734 (361) | 266 (118) | 265 (118) | 100 (29) | 99 (29) |
44
+ | hep-java | Neutral | 821 (393) | 814 (390) | 263 (123) | 261 (122) | 102 (30) | 102 (30) |
45
+ | hep-js | Neutral | 677 (341) | 673 (339) | 251 (129) | 250 (128) | 93 (29) | 93 (29) |
46
+ | hep-python | Neutral | 618 (301) | 616 (300) | 212 (98) | 211 (98) | 86 (26) | 85 (26) |
47
+ | hep-rust | Neutral | 666 (391) | 660 (391) | 221 (132) | 219 (132) | 95 (29) | 95 (29) |
48
+ | llmbar-adver-GPTInst | False | 735 (578) | 1623 (1055) | 170 (135) | 377 (245) | 93 (59) | 179 (106) |
49
+ | llmbar-adver-GPTOut | Neutral | 378 (339) | 359 (319) | 96 (81) | 101 (94) | 60 (45) | 55 (41) |
50
+ | llmbar-adver-manual | False | 666 (584) | 1139 (866) | 160 (134) | 264 (194) | 92 (63) | 140 (90) |
51
+ | llmbar-adver-neighbor | False | 287 (297) | 712 (749) | 70 (76) | 173 (175) | 43 (31) | 91 (70) |
52
+ | llmbar-natural | Neutral | 553 (644) | 530 (597) | 139 (162) | 130 (140) | 75 (71) | 70 (62) |
53
+ | mt-bench-easy | False | 1563 (720) | 2129 (1520) | 377 (159) | 551 (415) | 166 (55) | 116 (62) |
54
+ | mt-bench-hard | False | 1225 (499) | 1471 (1016) | 284 (116) | 349 (234) | 131 (45) | 136 (58) |
55
+ | mt-bench-med | Neutral | 1558 (729) | 1733 (1312) | 377 (170) | 410 (311) | 162 (58) | 145 (88) |
56
+ | refusals-dangerous | False | 597 (81) | 1828 (547) | 131 (20) | 459 (136) | 90 (12) | 211 (50) |
57
+ | refusals-offensive | False | 365 (116) | 1092 (1146) | 82 (25) | 299 (278) | 64 (15) | 134 (101) |
58
+ | xstest-should-refuse | False | 584 (419) | 904 (493) | 129 (89) | 217 (115) | 81 (47) | 116 (53) |
59
+ | xstest-should-respond | True | 771 (420) | 466 (427) | 189 (105) | 107 (94) | 104 (48) | 67 (48) |
60
 
61
  For more details, see the [dataset](https://huggingface.co/datasets/ai2-rlhf-collab/rm-benchmark-dev).
62
  """
63
+
64
+ TOP_TEXT = """
65
+ # Holistic Evaluation of Reward Models (HERM) from AI2
66
+
67
+ Evaluating the capabilities, safety, and pitfalls of reward models.
68
+
69
+ [Code](https://github.com/allenai/herm) | [Eval. Dataset](https://huggingface.co/datasets/ai2-adapt-dev/rm-benchmark-dev) | [Existing Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/ai2-adapt-dev/HERM-Results) | Paper (coming soon)
70
+ """
src/utils.py CHANGED
@@ -11,9 +11,9 @@ def model_hyperlink(link, model_name):
11
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
12
 
13
  # Define a function to fetch and process data
14
- def load_all_data(data_repo, subsubsets=False): # use HF api to pull the git repo
15
  dir = Path(data_repo)
16
- data_dir = dir / "data"
17
  orgs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
18
  # get all files within the sub folders orgs
19
  models_results = []
@@ -29,7 +29,7 @@ def load_all_data(data_repo, subsubsets=False): # use HF api to pull the git
29
 
30
  # load all json data in the list models_results one by one to avoid not having the same entries
31
  for model in models_results:
32
- model_data = load_dataset("json", data_files=data_repo + "data/" + model, split="train")
33
  df2 = pd.DataFrame(model_data)
34
  # add to df
35
  df = pd.concat([df2, df])
@@ -63,8 +63,14 @@ def load_all_data(data_repo, subsubsets=False): # use HF api to pull the git
63
  cols.insert(1, cols.pop(cols.index('average')))
64
  df = df.loc[:, cols]
65
 
66
- # remove columns xstest (outdated data)
67
  # if xstest is a column
68
  if "xstest" in df.columns:
69
  df = df.drop(columns=["xstest"])
 
 
 
 
 
 
70
  return df
 
11
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
12
 
13
  # Define a function to fetch and process data
14
+ def load_all_data(data_repo, subdir:str, subsubsets=False): # use HF api to pull the git repo
15
  dir = Path(data_repo)
16
+ data_dir = dir / subdir
17
  orgs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
18
  # get all files within the sub folders orgs
19
  models_results = []
 
29
 
30
  # load all json data in the list models_results one by one to avoid not having the same entries
31
  for model in models_results:
32
+ model_data = load_dataset("json", data_files=data_repo + subdir+ "/" + model, split="train")
33
  df2 = pd.DataFrame(model_data)
34
  # add to df
35
  df = pd.concat([df2, df])
 
63
  cols.insert(1, cols.pop(cols.index('average')))
64
  df = df.loc[:, cols]
65
 
66
+ # remove column xstest (outdated data)
67
  # if xstest is a column
68
  if "xstest" in df.columns:
69
  df = df.drop(columns=["xstest"])
70
+
71
+ # remove columns anthropic and summarize_prompted (outdated data)
72
+ if "anthropic" in df.columns:
73
+ df = df.drop(columns=["anthropic"])
74
+ if "summarize_prompted" in df.columns:
75
+ df = df.drop(columns=["summarize_prompted"])
76
  return df