Shane
committed on
Commit
·
91cb993
1
Parent(s):
ca662db
changed readme
Browse files
app.py
CHANGED
@@ -120,29 +120,29 @@ with gr.Blocks(css=custom_css) as app:
|
|
120 |
interactive=False,
|
121 |
height=1000,
|
122 |
)
|
123 |
-
with gr.TabItem("Non-Greedy"):
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
with gr.TabItem("About"):
|
147 |
with gr.Row():
|
148 |
gr.Markdown(ABOUT_TEXT)
|
@@ -161,8 +161,8 @@ with gr.Blocks(css=custom_css) as app:
|
|
161 |
|
162 |
search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, category_selector_1], outputs=rewardbench_table)
|
163 |
category_selector_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, category_selector_1], outputs=rewardbench_table)
|
164 |
-
search_2.change(regex_table, inputs=[rewardbench_table_hidden_nongreedy, search_2, category_selector_2], outputs=rewardbench_table_nongreedy)
|
165 |
-
category_selector_2.change(regex_table, inputs=[rewardbench_table_hidden_nongreedy, search_2, category_selector_2], outputs=rewardbench_table_nongreedy)
|
166 |
|
167 |
with gr.Row():
|
168 |
with gr.Accordion("📚 Citation", open=False):
|
|
|
120 |
interactive=False,
|
121 |
height=1000,
|
122 |
)
|
123 |
+
# with gr.TabItem("Non-Greedy"):
|
124 |
+
# with gr.Row():
|
125 |
+
# search_2 = gr.Textbox(label="Model Search (delimit with , )",
|
126 |
+
# # placeholder="Model Search (delimit with , )",
|
127 |
+
# show_label=True)
|
128 |
+
# category_selector_2 = gr.Dropdown(categories, label="Sorted By", value="Average",
|
129 |
+
# multiselect=False, show_label=True, elem_id="category_selector")
|
130 |
+
# with gr.Row():
|
131 |
+
# # reference data
|
132 |
+
# rewardbench_table_hidden_nongreedy = gr.Dataframe(
|
133 |
+
# href_data_nongreedy.values,
|
134 |
+
# datatype=col_types_href_hidden,
|
135 |
+
# headers=href_data_nongreedy.columns.tolist(),
|
136 |
+
# visible=False,
|
137 |
+
# )
|
138 |
+
# rewardbench_table_nongreedy = gr.Dataframe(
|
139 |
+
# regex_table(href_data_nongreedy.copy(), "", "Average"),
|
140 |
+
# datatype=col_types_href,
|
141 |
+
# headers=href_data_nongreedy.columns.tolist(),
|
142 |
+
# elem_id="href_data_nongreedy",
|
143 |
+
# interactive=False,
|
144 |
+
# height=1000,
|
145 |
+
# )
|
146 |
with gr.TabItem("About"):
|
147 |
with gr.Row():
|
148 |
gr.Markdown(ABOUT_TEXT)
|
|
|
161 |
|
162 |
search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, category_selector_1], outputs=rewardbench_table)
|
163 |
category_selector_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, category_selector_1], outputs=rewardbench_table)
|
164 |
+
# search_2.change(regex_table, inputs=[rewardbench_table_hidden_nongreedy, search_2, category_selector_2], outputs=rewardbench_table_nongreedy)
|
165 |
+
# category_selector_2.change(regex_table, inputs=[rewardbench_table_hidden_nongreedy, search_2, category_selector_2], outputs=rewardbench_table_nongreedy)
|
166 |
|
167 |
with gr.Row():
|
168 |
with gr.Accordion("📚 Citation", open=False):
|
src/md.py
CHANGED
@@ -2,9 +2,13 @@ from datetime import datetime
|
|
2 |
import pytz
|
3 |
|
4 |
ABOUT_TEXT = """
|
|
|
5 |
HREF is an evaluation benchmark that evaluates language models' capacity to follow human instructions. It consists of 4,258 instructions covering 11 distinct categories, including Brainstorm, Open QA, Closed QA, Extract, Generation, Rewrite, Summarize, Coding, Classify, Fact Checking or Attributed QA, Multi-Document Synthesis, and Reasoning Over Numerical Data.
|
6 |
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64dff1ddb5cc372803af964d/dSv3U11h936t_q-aiqbkV.png)
|
7 |
|
|
|
|
|
|
|
8 |
## Why HREF
|
9 |
| Benchmark | Size | Evaluation Method | Baseline Model | Judge Model | Task Oriented | Contamination Resistant | Contains Human Reference|
|
10 |
|--------------------|-------|------------|----------------|----------------|----------|------------|-----------|
|
@@ -28,4 +32,8 @@ current_time = datetime.now(pacific_tz).strftime("%H:%M %Z, %d %b %Y")
|
|
28 |
|
29 |
TOP_TEXT = f"""# HREF: Human Reference Guided Evaluation for Instruction Following
|
30 |
[Code]() | [Validation Set]() | [Human Agreement Set]() | [Results]() | [Paper]() | Total models: {{}} | * Unverified models | ⚠️ Dataset Contamination | Last restart (PST): {current_time}
|
|
|
|
|
|
|
31 |
"""
|
|
|
|
2 |
import pytz
|
3 |
|
4 |
ABOUT_TEXT = """
|
5 |
+
## Overview
|
6 |
HREF is an evaluation benchmark that evaluates language models' capacity to follow human instructions. It consists of 4,258 instructions covering 11 distinct categories, including Brainstorm, Open QA, Closed QA, Extract, Generation, Rewrite, Summarize, Coding, Classify, Fact Checking or Attributed QA, Multi-Document Synthesis, and Reasoning Over Numerical Data.
|
7 |
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64dff1ddb5cc372803af964d/dSv3U11h936t_q-aiqbkV.png)
|
8 |
|
9 |
+
## Generation Configuration
|
10 |
+
For reproducibility, we use greedy decoding for all model generation by default. We apply chat templates to the instructions if they are implemented in the model's tokenizer or explicitly recommended by the model's creators. Please contact us if you would like to change this default configuration.
|
11 |
+
|
12 |
## Why HREF
|
13 |
| Benchmark | Size | Evaluation Method | Baseline Model | Judge Model | Task Oriented | Contamination Resistant | Contains Human Reference|
|
14 |
|--------------------|-------|------------|----------------|----------------|----------|------------|-----------|
|
|
|
32 |
|
33 |
TOP_TEXT = f"""# HREF: Human Reference Guided Evaluation for Instruction Following
|
34 |
[Code]() | [Validation Set]() | [Human Agreement Set]() | [Results]() | [Paper]() | Total models: {{}} | * Unverified models | ⚠️ Dataset Contamination | Last restart (PST): {current_time}
|
35 |
+
|
36 |
+
## Contact Us
|
37 |
+
TODO
|
38 |
"""
|
39 |
+
|