Shane committed on
Commit
91cb993
·
1 Parent(s): ca662db

changed readme

Browse files
Files changed (2) hide show
  1. app.py +25 -25
  2. src/md.py +8 -0
app.py CHANGED
@@ -120,29 +120,29 @@ with gr.Blocks(css=custom_css) as app:
120
  interactive=False,
121
  height=1000,
122
  )
123
- with gr.TabItem("Non-Greedy"):
124
- with gr.Row():
125
- search_2 = gr.Textbox(label="Model Search (delimit with , )",
126
- # placeholder="Model Search (delimit with , )",
127
- show_label=True)
128
- category_selector_2 = gr.Dropdown(categories, label="Sorted By", value="Average",
129
- multiselect=False, show_label=True, elem_id="category_selector")
130
- with gr.Row():
131
- # reference data
132
- rewardbench_table_hidden_nongreedy = gr.Dataframe(
133
- href_data_nongreedy.values,
134
- datatype=col_types_href_hidden,
135
- headers=href_data_nongreedy.columns.tolist(),
136
- visible=False,
137
- )
138
- rewardbench_table_nongreedy = gr.Dataframe(
139
- regex_table(href_data_nongreedy.copy(), "", "Average"),
140
- datatype=col_types_href,
141
- headers=href_data_nongreedy.columns.tolist(),
142
- elem_id="href_data_nongreedy",
143
- interactive=False,
144
- height=1000,
145
- )
146
  with gr.TabItem("About"):
147
  with gr.Row():
148
  gr.Markdown(ABOUT_TEXT)
@@ -161,8 +161,8 @@ with gr.Blocks(css=custom_css) as app:
161
 
162
  search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, category_selector_1], outputs=rewardbench_table)
163
  category_selector_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, category_selector_1], outputs=rewardbench_table)
164
- search_2.change(regex_table, inputs=[rewardbench_table_hidden_nongreedy, search_2, category_selector_2], outputs=rewardbench_table_nongreedy)
165
- category_selector_2.change(regex_table, inputs=[rewardbench_table_hidden_nongreedy, search_2, category_selector_2], outputs=rewardbench_table_nongreedy)
166
 
167
  with gr.Row():
168
  with gr.Accordion("📚 Citation", open=False):
 
120
  interactive=False,
121
  height=1000,
122
  )
123
+ # with gr.TabItem("Non-Greedy"):
124
+ # with gr.Row():
125
+ # search_2 = gr.Textbox(label="Model Search (delimit with , )",
126
+ # # placeholder="Model Search (delimit with , )",
127
+ # show_label=True)
128
+ # category_selector_2 = gr.Dropdown(categories, label="Sorted By", value="Average",
129
+ # multiselect=False, show_label=True, elem_id="category_selector")
130
+ # with gr.Row():
131
+ # # reference data
132
+ # rewardbench_table_hidden_nongreedy = gr.Dataframe(
133
+ # href_data_nongreedy.values,
134
+ # datatype=col_types_href_hidden,
135
+ # headers=href_data_nongreedy.columns.tolist(),
136
+ # visible=False,
137
+ # )
138
+ # rewardbench_table_nongreedy = gr.Dataframe(
139
+ # regex_table(href_data_nongreedy.copy(), "", "Average"),
140
+ # datatype=col_types_href,
141
+ # headers=href_data_nongreedy.columns.tolist(),
142
+ # elem_id="href_data_nongreedy",
143
+ # interactive=False,
144
+ # height=1000,
145
+ # )
146
  with gr.TabItem("About"):
147
  with gr.Row():
148
  gr.Markdown(ABOUT_TEXT)
 
161
 
162
  search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, category_selector_1], outputs=rewardbench_table)
163
  category_selector_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, category_selector_1], outputs=rewardbench_table)
164
+ # search_2.change(regex_table, inputs=[rewardbench_table_hidden_nongreedy, search_2, category_selector_2], outputs=rewardbench_table_nongreedy)
165
+ # category_selector_2.change(regex_table, inputs=[rewardbench_table_hidden_nongreedy, search_2, category_selector_2], outputs=rewardbench_table_nongreedy)
166
 
167
  with gr.Row():
168
  with gr.Accordion("📚 Citation", open=False):
src/md.py CHANGED
@@ -2,9 +2,13 @@ from datetime import datetime
2
  import pytz
3
 
4
  ABOUT_TEXT = """
 
5
  HREF is an evaluation benchmark that evaluates language models' capacity to follow human instructions. It consists of 4,258 instructions covering 11 distinct categories, including Brainstorm, Open QA, Closed QA, Extract, Generation, Rewrite, Summarize, Coding, Classify, Fact Checking or Attributed QA, Multi-Document Synthesis, and Reasoning Over Numerical Data.
6
  ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64dff1ddb5cc372803af964d/dSv3U11h936t_q-aiqbkV.png)
7
 
 
 
 
8
  ## Why HREF
9
  | Benchmark | Size | Evaluation Method | Baseline Model | Judge Model | Task Oriented | Contamination Resistant | Contains Human Reference|
10
  |--------------------|-------|------------|----------------|----------------|----------|------------|-----------|
@@ -28,4 +32,8 @@ current_time = datetime.now(pacific_tz).strftime("%H:%M %Z, %d %b %Y")
28
 
29
  TOP_TEXT = f"""# HREF: Human Reference Guided Evaluation for Instruction Following
30
  [Code]() | [Validation Set]() | [Human Agreement Set]() | [Results]() | [Paper]() | Total models: {{}} | * Unverified models | ⚠️ Dataset Contamination | Last restart (PST): {current_time}
 
 
 
31
  """
 
 
2
  import pytz
3
 
4
  ABOUT_TEXT = """
5
+ ## Overview
6
  HREF is an evaluation benchmark that evaluates language models' capacity to follow human instructions. It consists of 4,258 instructions covering 11 distinct categories, including Brainstorm, Open QA, Closed QA, Extract, Generation, Rewrite, Summarize, Coding, Classify, Fact Checking or Attributed QA, Multi-Document Synthesis, and Reasoning Over Numerical Data.
7
  ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64dff1ddb5cc372803af964d/dSv3U11h936t_q-aiqbkV.png)
8
 
9
+ ## Generation Configuration
10
+ For reproducibility, we use greedy decoding for all model generations by default. We apply chat templates to the instructions if they are implemented in the model's tokenizer or explicitly recommended by the model's creators. Please contact us if you would like to change this default configuration.
11
+
12
  ## Why HREF
13
  | Benchmark | Size | Evaluation Method | Baseline Model | Judge Model | Task Oriented | Contamination Resistant | Contains Human Reference|
14
  |--------------------|-------|------------|----------------|----------------|----------|------------|-----------|
 
32
 
33
  TOP_TEXT = f"""# HREF: Human Reference Guided Evaluation for Instruction Following
34
  [Code]() | [Validation Set]() | [Human Agreement Set]() | [Results]() | [Paper]() | Total models: {{}} | * Unverified models | ⚠️ Dataset Contamination | Last restart (PST): {current_time}
35
+
36
+ ## Contact Us
37
+ TODO
38
  """
39
+