natolambert
committed on
Commit
·
ab74236
1
Parent(s):
56fcfaf
updates
Browse files
README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
colorFrom: pink
|
5 |
colorTo: blue
|
6 |
sdk: gradio
|
|
|
1 |
---
|
2 |
+
title: HERM Leaderboard
|
3 |
+
emoji: 📐
|
4 |
colorFrom: pink
|
5 |
colorTo: blue
|
6 |
sdk: gradio
|
app.py
CHANGED
@@ -4,17 +4,16 @@ from huggingface_hub import HfApi, snapshot_download
|
|
4 |
from apscheduler.schedulers.background import BackgroundScheduler
|
5 |
from datasets import load_dataset
|
6 |
from src.utils import load_all_data
|
7 |
-
from src.md import ABOUT_TEXT
|
8 |
import numpy as np
|
9 |
|
10 |
api = HfApi()
|
11 |
|
12 |
COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
|
13 |
-
evals_repo = "ai2-adapt-dev/
|
14 |
-
|
15 |
eval_set_repo = "ai2-adapt-dev/rm-benchmark-dev"
|
16 |
repo_dir_herm = "./evals/herm/"
|
17 |
-
repo_dir_prefs = "./evals/prefs/"
|
18 |
|
19 |
def restart_space():
|
20 |
api.restart_space(repo_id="ai2-adapt-dev/rm-benchmark-viewer", token=COLLAB_TOKEN)
|
@@ -29,14 +28,6 @@ repo = snapshot_download(
|
|
29 |
repo_type="dataset",
|
30 |
)
|
31 |
|
32 |
-
repo_pref_sets = snapshot_download(
|
33 |
-
local_dir=repo_dir_prefs,
|
34 |
-
repo_id=prefs_repo,
|
35 |
-
use_auth_token=COLLAB_TOKEN,
|
36 |
-
tqdm_class=None,
|
37 |
-
etag_timeout=30,
|
38 |
-
repo_type="dataset",
|
39 |
-
)
|
40 |
|
41 |
def avg_over_herm(dataframe):
|
42 |
"""
|
@@ -126,10 +117,10 @@ def length_bias_check(dataframe):
|
|
126 |
|
127 |
|
128 |
|
129 |
-
herm_data = load_all_data(repo_dir_herm).sort_values(by='average', ascending=False)
|
130 |
herm_data_avg = avg_over_herm(herm_data).sort_values(by='average', ascending=False)
|
131 |
herm_data_length = length_bias_check(herm_data).sort_values(by='Terse Bias', ascending=False)
|
132 |
-
prefs_data = load_all_data(
|
133 |
# prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
|
134 |
|
135 |
col_types_herm = ["markdown"] + ["number"] * (len(herm_data.columns) - 1)
|
@@ -152,7 +143,7 @@ def random_sample(r: gr.Request, subset):
|
|
152 |
sample_index = np.random.randint(0, len(eval_set_filtered) - 1)
|
153 |
sample = eval_set_filtered[sample_index]
|
154 |
|
155 |
-
markdown_text = '\n\n'.join([f"**{key}**:\n{value}" for key, value in sample.items()])
|
156 |
return markdown_text
|
157 |
|
158 |
subsets = eval_set.unique("subset")
|
@@ -160,38 +151,48 @@ subsets = eval_set.unique("subset")
|
|
160 |
with gr.Blocks() as app:
|
161 |
# create tabs for the app, moving the current table to one titled "HERM" and the benchmark_text to a tab called "About"
|
162 |
with gr.Row():
|
163 |
-
gr.Markdown(
|
164 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
165 |
-
with gr.TabItem("HERM - Overview"):
|
166 |
with gr.Row():
|
167 |
herm_table = gr.Dataframe(
|
168 |
herm_data_avg.values,
|
169 |
datatype=col_types_herm_avg,
|
170 |
headers=herm_data_avg.columns.tolist(),
|
171 |
elem_id="herm_dataframe_avg",
|
|
|
172 |
)
|
173 |
-
with gr.TabItem("HERM - Detailed"):
|
174 |
with gr.Row():
|
175 |
herm_table = gr.Dataframe(
|
176 |
herm_data.values,
|
177 |
datatype=col_types_herm,
|
178 |
headers=herm_data.columns.tolist(),
|
179 |
elem_id="herm_dataframe",
|
|
|
180 |
)
|
181 |
-
with gr.TabItem("HERM - Length Bias"):
|
182 |
with gr.Row():
|
183 |
herm_table = gr.Dataframe(
|
184 |
herm_data_length.values,
|
185 |
datatype=cols_herm_data_length,
|
186 |
headers=herm_data_length.columns.tolist(),
|
187 |
elem_id="herm_dataframe_length",
|
|
|
188 |
)
|
189 |
-
with gr.TabItem("Pref Sets
|
|
|
|
|
|
|
|
|
|
|
|
|
190 |
pref_sets_table = gr.Dataframe(
|
191 |
prefs_data.values,
|
192 |
datatype=col_types_prefs,
|
193 |
headers=prefs_data.columns.tolist(),
|
194 |
elem_id="prefs_dataframe",
|
|
|
195 |
)
|
196 |
|
197 |
with gr.TabItem("About"):
|
|
|
4 |
from apscheduler.schedulers.background import BackgroundScheduler
|
5 |
from datasets import load_dataset
|
6 |
from src.utils import load_all_data
|
7 |
+
from src.md import ABOUT_TEXT, TOP_TEXT
|
8 |
import numpy as np
|
9 |
|
10 |
api = HfApi()
|
11 |
|
12 |
COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
|
13 |
+
evals_repo = "ai2-adapt-dev/HERM-Results"
|
14 |
+
|
15 |
eval_set_repo = "ai2-adapt-dev/rm-benchmark-dev"
|
16 |
repo_dir_herm = "./evals/herm/"
|
|
|
17 |
|
18 |
def restart_space():
|
19 |
api.restart_space(repo_id="ai2-adapt-dev/rm-benchmark-viewer", token=COLLAB_TOKEN)
|
|
|
28 |
repo_type="dataset",
|
29 |
)
|
30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
def avg_over_herm(dataframe):
|
33 |
"""
|
|
|
117 |
|
118 |
|
119 |
|
120 |
+
herm_data = load_all_data(repo_dir_herm, subdir="eval-set").sort_values(by='average', ascending=False)
|
121 |
herm_data_avg = avg_over_herm(herm_data).sort_values(by='average', ascending=False)
|
122 |
herm_data_length = length_bias_check(herm_data).sort_values(by='Terse Bias', ascending=False)
|
123 |
+
prefs_data = load_all_data(repo_dir_herm, subdir="pref-sets").sort_values(by='average', ascending=False)
|
124 |
# prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
|
125 |
|
126 |
col_types_herm = ["markdown"] + ["number"] * (len(herm_data.columns) - 1)
|
|
|
143 |
sample_index = np.random.randint(0, len(eval_set_filtered) - 1)
|
144 |
sample = eval_set_filtered[sample_index]
|
145 |
|
146 |
+
markdown_text = '\n\n'.join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
|
147 |
return markdown_text
|
148 |
|
149 |
subsets = eval_set.unique("subset")
|
|
|
151 |
with gr.Blocks() as app:
|
152 |
# create tabs for the app, moving the current table to one titled "HERM" and the benchmark_text to a tab called "About"
|
153 |
with gr.Row():
|
154 |
+
gr.Markdown(TOP_TEXT)
|
155 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
156 |
+
with gr.TabItem("HERM Eval Set - Overview"):
|
157 |
with gr.Row():
|
158 |
herm_table = gr.Dataframe(
|
159 |
herm_data_avg.values,
|
160 |
datatype=col_types_herm_avg,
|
161 |
headers=herm_data_avg.columns.tolist(),
|
162 |
elem_id="herm_dataframe_avg",
|
163 |
+
height=1000,
|
164 |
)
|
165 |
+
with gr.TabItem("HERM Eval Set - Detailed"):
|
166 |
with gr.Row():
|
167 |
herm_table = gr.Dataframe(
|
168 |
herm_data.values,
|
169 |
datatype=col_types_herm,
|
170 |
headers=herm_data.columns.tolist(),
|
171 |
elem_id="herm_dataframe",
|
172 |
+
height=1000,
|
173 |
)
|
174 |
+
with gr.TabItem("HERM Eval Set - Length Bias"):
|
175 |
with gr.Row():
|
176 |
herm_table = gr.Dataframe(
|
177 |
herm_data_length.values,
|
178 |
datatype=cols_herm_data_length,
|
179 |
headers=herm_data_length.columns.tolist(),
|
180 |
elem_id="herm_dataframe_length",
|
181 |
+
height=1000,
|
182 |
)
|
183 |
+
with gr.TabItem("Known Pref. Sets"):
|
184 |
+
with gr.Row():
|
185 |
+
PREF_SET_TEXT = """
|
186 |
+
For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets).
|
187 |
+
"""
|
188 |
+
gr.Markdown(PREF_SET_TEXT)
|
189 |
+
with gr.Row():
|
190 |
pref_sets_table = gr.Dataframe(
|
191 |
prefs_data.values,
|
192 |
datatype=col_types_prefs,
|
193 |
headers=prefs_data.columns.tolist(),
|
194 |
elem_id="prefs_dataframe",
|
195 |
+
height=1000,
|
196 |
)
|
197 |
|
198 |
with gr.TabItem("About"):
|
src/md.py
CHANGED
@@ -2,32 +2,69 @@ ABOUT_TEXT = """
|
|
2 |
We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
|
3 |
A win is when the score for the chosen response is higher than the score for the rejected response.
|
4 |
|
5 |
-
|
6 |
-
|
7 |
-
| Subset | Num. Samples (Pre-filtering, post-filtering) | Description |
|
8 |
-
| :--------------------- | :------------------------------------------: | :---------------------------------------------------------------- |
|
9 |
-
| alpacaeval-easy | 805, 100 | Great model vs poor model |
|
10 |
-
| alpacaeval-length | 805, 95 | Good model vs low model, equal length |
|
11 |
-
| alpacaeval-hard | 805, 95 | Great model vs baseline model |
|
12 |
-
| mt-bench-easy | 28, 28 | MT Bench 10s vs 1s |
|
13 |
-
| mt-bench-medium | 45, 40 | MT Bench 9s vs 2-5s |
|
14 |
-
| mt-bench-hard | 45, 37 | MT Bench 7-8 vs 5-6 |
|
15 |
-
| refusals-dangerous | 505, 100 | Dangerous response vs no response |
|
16 |
-
| refusals-offensive | 704, 100 | Offensive response vs no response |
|
17 |
-
| llmbar-natural | 100 | (See [paper](https://arxiv.org/abs/2310.07641)) Manually curated instruction pairs |
|
18 |
-
| llmbar-adver-neighbor | 134 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. off-topic prompt response |
|
19 |
-
| llmbar-adver-GPTInst | 92 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. GPT4 generated off-topic prompt response |
|
20 |
-
| llmbar-adver-GPTOut | 47 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. unhelpful-prompted GPT4 responses |
|
21 |
-
| llmbar-adver-manual | 46 | (See [paper](https://arxiv.org/abs/2310.07641)) Challenge set chosen vs. rejected |
|
22 |
-
| XSTest | 450, 404 | False refusal dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
|
23 |
-
| do not answer | 939, 136 | [Prompts which responsible LLMs do not answer](https://huggingface.co/datasets/LibrAI/do-not-answer) |
|
24 |
-
| hep-cpp | 164 | C++ code revisions (See [dataset](https://huggingface.co/datasets/bigcode/humanevalpack) or [paper](https://arxiv.org/abs/2308.07124)) |
|
25 |
-
| hep-go | 164 | Go code |
|
26 |
-
| hep-java | 164 | Java code |
|
27 |
-
| hep-js | 164 | Javascript code |
|
28 |
-
| hep-python | 164 | Python code |
|
29 |
-
| hep-rust | 164 | Rust code |
|
30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
For more details, see the [dataset](https://huggingface.co/datasets/ai2-rlhf-collab/rm-benchmark-dev).
|
33 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
|
3 |
A win is when the score for the chosen response is higher than the score for the rejected response.
|
4 |
|
5 |
+
## Subset Summary
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
+
Total number of prompts: 2538, filtered from 4676.
|
8 |
+
|
9 |
+
| Subset | Num. Samples (Pre-filtering, post-filtering) | Description |
|
10 |
+
| :---------- | :-----: | :---------: |
|
11 |
+
| alpacaeval-easy | 805, 100 | Great model vs poor model |
|
12 |
+
| alpacaeval-length | 805, 95 | Good model vs low model, equal length |
|
13 |
+
| alpacaeval-hard | 805, 95 | Great model vs baseline model |
|
14 |
+
| mt-bench-easy | 28, 28 | MT Bench 10s vs 1s |
|
15 |
+
| mt-bench-medium | 45, 40 | MT Bench 9s vs 2-5s |
|
16 |
+
| mt-bench-hard | 45, 37 | MT Bench 7-8 vs 5-6 |
|
17 |
+
| refusals-dangerous | 505, 100 | Dangerous response vs no response |
|
18 |
+
| refusals-offensive | 704, 100 | Offensive response vs no response |
|
19 |
+
| llmbar-natural | 100 | (See [paper](https://arxiv.org/abs/2310.07641)) Manually curated instruction pairs |
|
20 |
+
| llmbar-adver-neighbor | 134 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. off-topic prompt response |
|
21 |
+
| llmbar-adver-GPTInst | 92 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. GPT4 generated off-topic prompt response |
|
22 |
+
| llmbar-adver-GPTOut | 47 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. unhelpful-prompted GPT4 responses |
|
23 |
+
| llmbar-adver-manual | 46 | (See [paper](https://arxiv.org/abs/2310.07641)) Challenge set chosen vs. rejected |
|
24 |
+
| xstest-should-refuse | 450, 250 | False response dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
|
25 |
+
| xstest-should-respond | 450, 154 | False refusal dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
|
26 |
+
| do not answer | 939, 136 | [Prompts which responsible LLMs do not answer](https://huggingface.co/datasets/LibrAI/do-not-answer) |
|
27 |
+
| hep-cpp | 164 | C++ code revisions (See [dataset](https://huggingface.co/datasets/bigcode/humanevalpack) or [paper](https://arxiv.org/abs/2308.07124)) |
|
28 |
+
| hep-go | 164 | Go code |
|
29 |
+
| hep-java | 164 | Java code |
|
30 |
+
| hep-js | 164 | Javascript code |
|
31 |
+
| hep-python | 164 | Python code |
|
32 |
+
| hep-rust | 164 | Rust code |
|
33 |
+
|
34 |
+
Lengths (mean, std. dev.) include the prompt
|
35 |
+
|
36 |
+
| subset | length bias | chosen_chars | rejected_chars | chosen_tokens | rejected_tokens | chosen_unique_tokens | rejected_unique_tokens |
|
37 |
+
|-----------------------|-------------|----------------|------------------|-----------------|-------------------|------------------------|--------------------------|
|
38 |
+
| alpacaeval-easy | True | 2283 (1138) | 646 (482) | 591 (303) | 167 (139) | 253 (117) | 83 (46) |
|
39 |
+
| alpacaeval-hard | True | 1590 (769) | 526 (430) | 412 (199) | 137 (117) | 173 (67) | 71 (48) |
|
40 |
+
| alpacaeval-length | Neutral | 2001 (1137) | 2127 (1787) | 511 (283) | 597 (530) | 192 (85) | 189 (99) |
|
41 |
+
| donotanswer | False | 755 (722) | 1389 (695) | 170 (161) | 320 (164) | 104 (82) | 157 (73) |
|
42 |
+
| hep-cpp | Neutral | 709 (341) | 705 (342) | 261 (125) | 259 (125) | 100 (29) | 99 (29) |
|
43 |
+
| hep-go | Neutral | 738 (361) | 734 (361) | 266 (118) | 265 (118) | 100 (29) | 99 (29) |
|
44 |
+
| hep-java | Neutral | 821 (393) | 814 (390) | 263 (123) | 261 (122) | 102 (30) | 102 (30) |
|
45 |
+
| hep-js | Neutral | 677 (341) | 673 (339) | 251 (129) | 250 (128) | 93 (29) | 93 (29) |
|
46 |
+
| hep-python | Neutral | 618 (301) | 616 (300) | 212 (98) | 211 (98) | 86 (26) | 85 (26) |
|
47 |
+
| hep-rust | Neutral | 666 (391) | 660 (391) | 221 (132) | 219 (132) | 95 (29) | 95 (29) |
|
48 |
+
| llmbar-adver-GPTInst | False | 735 (578) | 1623 (1055) | 170 (135) | 377 (245) | 93 (59) | 179 (106) |
|
49 |
+
| llmbar-adver-GPTOut | Neutral | 378 (339) | 359 (319) | 96 (81) | 101 (94) | 60 (45) | 55 (41) |
|
50 |
+
| llmbar-adver-manual | False | 666 (584) | 1139 (866) | 160 (134) | 264 (194) | 92 (63) | 140 (90) |
|
51 |
+
| llmbar-adver-neighbor | False | 287 (297) | 712 (749) | 70 (76) | 173 (175) | 43 (31) | 91 (70) |
|
52 |
+
| llmbar-natural | Neutral | 553 (644) | 530 (597) | 139 (162) | 130 (140) | 75 (71) | 70 (62) |
|
53 |
+
| mt-bench-easy | False | 1563 (720) | 2129 (1520) | 377 (159) | 551 (415) | 166 (55) | 116 (62) |
|
54 |
+
| mt-bench-hard | False | 1225 (499) | 1471 (1016) | 284 (116) | 349 (234) | 131 (45) | 136 (58) |
|
55 |
+
| mt-bench-med | Neutral | 1558 (729) | 1733 (1312) | 377 (170) | 410 (311) | 162 (58) | 145 (88) |
|
56 |
+
| refusals-dangerous | False | 597 (81) | 1828 (547) | 131 (20) | 459 (136) | 90 (12) | 211 (50) |
|
57 |
+
| refusals-offensive | False | 365 (116) | 1092 (1146) | 82 (25) | 299 (278) | 64 (15) | 134 (101) |
|
58 |
+
| xstest-should-refuse | False | 584 (419) | 904 (493) | 129 (89) | 217 (115) | 81 (47) | 116 (53) |
|
59 |
+
| xstest-should-respond | True | 771 (420) | 466 (427) | 189 (105) | 107 (94) | 104 (48) | 67 (48) |
|
60 |
|
61 |
For more details, see the [dataset](https://huggingface.co/datasets/ai2-rlhf-collab/rm-benchmark-dev).
|
62 |
"""
|
63 |
+
|
64 |
+
TOP_TEXT = """
|
65 |
+
# Holistic Evaluation of Reward Models (HERM) from AI2
|
66 |
+
|
67 |
+
Evaluating the capabilities, safety, and pitfalls of reward models.
|
68 |
+
|
69 |
+
[Code](https://github.com/allenai/herm) | [Eval. Dataset](https://huggingface.co/datasets/ai2-adapt-dev/rm-benchmark-dev) | [Existing Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/ai2-adapt-dev/HERM-Results) | Paper (coming soon)
|
70 |
+
"""
|
src/utils.py
CHANGED
@@ -11,9 +11,9 @@ def model_hyperlink(link, model_name):
|
|
11 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
12 |
|
13 |
# Define a function to fetch and process data
|
14 |
-
def load_all_data(data_repo, subsubsets=False): # use HF api to pull the git repo
|
15 |
dir = Path(data_repo)
|
16 |
-
data_dir = dir /
|
17 |
orgs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
|
18 |
# get all files within the sub folders orgs
|
19 |
models_results = []
|
@@ -29,7 +29,7 @@ def load_all_data(data_repo, subsubsets=False): # use HF api to pull the git
|
|
29 |
|
30 |
# load all json data in the list models_results one by one to avoid not having the same entries
|
31 |
for model in models_results:
|
32 |
-
model_data = load_dataset("json", data_files=data_repo + "
|
33 |
df2 = pd.DataFrame(model_data)
|
34 |
# add to df
|
35 |
df = pd.concat([df2, df])
|
@@ -63,8 +63,14 @@ def load_all_data(data_repo, subsubsets=False): # use HF api to pull the git
|
|
63 |
cols.insert(1, cols.pop(cols.index('average')))
|
64 |
df = df.loc[:, cols]
|
65 |
|
66 |
-
# remove
|
67 |
# if xstest is a column
|
68 |
if "xstest" in df.columns:
|
69 |
df = df.drop(columns=["xstest"])
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
return df
|
|
|
11 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
12 |
|
13 |
# Define a function to fetch and process data
|
14 |
+
def load_all_data(data_repo, subdir:str, subsubsets=False): # use HF api to pull the git repo
|
15 |
dir = Path(data_repo)
|
16 |
+
data_dir = dir / subdir
|
17 |
orgs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
|
18 |
# get all files within the sub folders orgs
|
19 |
models_results = []
|
|
|
29 |
|
30 |
# load all json data in the list models_results one by one to avoid not having the same entries
|
31 |
for model in models_results:
|
32 |
+
model_data = load_dataset("json", data_files=data_repo + subdir+ "/" + model, split="train")
|
33 |
df2 = pd.DataFrame(model_data)
|
34 |
# add to df
|
35 |
df = pd.concat([df2, df])
|
|
|
63 |
cols.insert(1, cols.pop(cols.index('average')))
|
64 |
df = df.loc[:, cols]
|
65 |
|
66 |
+
# remove column xstest (outdated data)
|
67 |
# if xstest is a column
|
68 |
if "xstest" in df.columns:
|
69 |
df = df.drop(columns=["xstest"])
|
70 |
+
|
71 |
+
# remove column anthropic and summarize_prompted (outdated data)
|
72 |
+
if "anthropic" in df.columns:
|
73 |
+
df = df.drop(columns=["anthropic"])
|
74 |
+
if "summarize_prompted" in df.columns:
|
75 |
+
df = df.drop(columns=["summarize_prompted"])
|
76 |
return df
|