callanwu commited on
Commit
404f089
Β·
1 Parent(s): 2c9628e
Files changed (5) hide show
  1. agents_result.jsonl +18 -0
  2. app.py +86 -65
  3. content.py +42 -14
  4. overall.jpg +0 -0
  5. rag_result.jsonl +9 -0
agents_result.jsonl ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"method": "react", "model": "qwen2.5-7b-instruct", "ss_easy": 0.1, "ss_medium": 0.18571428571428572, "ss_hard": 0.09166666666666666, "ms_easy": 0.175, "ms_medium": 0.10714285714285714, "ms_hard": 0.058333333333333334, "overall": 0.11911764705882352}
2
+ {"method": "reflexion", "model": "qwen2.5-7b-instruct", "ss_easy": 0.0875, "ss_medium": 0.25, "ss_hard": 0.11666666666666667, "ms_easy": 0.3, "ms_medium": 0.15714285714285714, "ms_hard": 0.041666666666666664, "overall": 0.15735294117647058}
3
+ {"method": "seeker", "model": "qwen2.5-7b-instruct", "ss_easy": 0.075, "ss_medium": 0.2571428571428571, "ss_hard": 0.125, "ms_easy": 0.1875, "ms_medium": 0.2, "ms_hard": 0.058333333333333334, "overall": 0.15735294117647058}
4
+ {"method": "react", "model": "qwen2.5-14b-instruct", "ss_easy": 0.0875, "ss_medium": 0.32142857142857145, "ss_hard": 0.15, "ms_easy": 0.275, "ms_medium": 0.22857142857142856, "ms_hard": 0.05, "overall": 0.19117647058823528}
5
+ {"method": "reflexion", "model": "qwen2.5-14b-instruct", "ss_easy": 0.1375, "ss_medium": 0.34285714285714286, "ss_hard": 0.15, "ms_easy": 0.3625, "ms_medium": 0.22857142857142856, "ms_hard": 0.058333333333333334, "overall": 0.21323529411764705}
6
+ {"method": "seeker", "model": "qwen2.5-14b-instruct", "ss_easy": 0.0875, "ss_medium": 0.4142857142857143, "ss_hard": 0.23333333333333334, "ms_easy": 0.3, "ms_medium": 0.22857142857142856, "ms_hard": 0.1, "overall": 0.23676470588235293}
7
+ {"method": "react", "model": "qwen2.5-32b-instruct", "ss_easy": 0.1, "ss_medium": 0.35714285714285715, "ss_hard": 0.16666666666666666, "ms_easy": 0.3625, "ms_medium": 0.18571428571428572, "ms_hard": 0.08333333333333333, "overall": 0.21029411764705883}
8
+ {"method": "reflexion", "model": "qwen2.5-32b-instruct", "ss_easy": 0.075, "ss_medium": 0.32857142857142857, "ss_hard": 0.16666666666666666, "ms_easy": 0.3125, "ms_medium": 0.22857142857142856, "ms_hard": 0.058333333333333334, "overall": 0.2}
9
+ {"method": "seeker", "model": "qwen2.5-32b-instruct", "ss_easy": 0.1125, "ss_medium": 0.34285714285714286, "ss_hard": 0.225, "ms_easy": 0.275, "ms_medium": 0.24285714285714285, "ms_hard": 0.1, "overall": 0.2235294117647059}
10
+ {"method": "react", "model": "qwen2.5-72b-instruct", "ss_easy": 0.125, "ss_medium": 0.38571428571428573, "ss_hard": 0.2, "ms_easy": 0.45, "ms_medium": 0.3142857142857143, "ms_hard": 0.1, "overall": 0.2647058823529412}
11
+ {"method": "reflexion", "model": "qwen2.5-72b-instruct", "ss_easy": 0.1375, "ss_medium": 0.44285714285714284, "ss_hard": 0.2833333333333333, "ms_easy": 0.3625, "ms_medium": 0.25, "ms_hard": 0.125, "overall": 0.2735294117647059}
12
+ {"method": "seeker", "model": "qwen2.5-72b-instruct", "ss_easy": 0.15, "ss_medium": 0.4857142857142857, "ss_hard": 0.25833333333333336, "ms_easy": 0.35, "ms_medium": 0.29285714285714287, "ms_hard": 0.15, "overall": 0.2911764705882353}
13
+ {"method": "react", "model": "qwen-plus", "ss_easy": 0.1375, "ss_medium": 0.4, "ss_hard": 0.24166666666666667, "ms_easy": 0.475, "ms_medium": 0.3, "ms_hard": 0.15, "overall": 0.2852941176470588}
14
+ {"method": "reflexion", "model": "qwen-plus", "ss_easy": 0.1, "ss_medium": 0.4857142857142857, "ss_hard": 0.2833333333333333, "ms_easy": 0.35, "ms_medium": 0.2785714285714286, "ms_hard": 0.14166666666666666, "overall": 0.2852941176470588}
15
+ {"method": "seeker", "model": "qwen-plus", "ss_easy": 0.1375, "ss_medium": 0.4714285714285714, "ss_hard": 0.3, "ms_easy": 0.35, "ms_medium": 0.2714285714285714, "ms_hard": 0.15, "overall": 0.2897058823529412}
16
+ {"method": "react", "model": "gpt-4o", "ss_easy": 0.1125, "ss_medium": 0.45, "ss_hard": 0.3, "ms_easy": 0.325, "ms_medium": 0.30714285714285716, "ms_hard": 0.15, "overall": 0.2867647058823529}
17
+ {"method": "reflexion", "model": "gpt-4o", "ss_easy": 0.1375, "ss_medium": 0.5142857142857142, "ss_hard": 0.30833333333333335, "ms_easy": 0.35, "ms_medium": 0.2714285714285714, "ms_hard": 0.16666666666666666, "overall": 0.3029411764705882}
18
+ {"method": "seeker", "model": "gpt-4o", "ss_easy": 0.1, "ss_medium": 0.5, "ss_hard": 0.3, "ms_easy": 0.475, "ms_medium": 0.34285714285714286, "ms_hard": 0.15833333333333333, "overall": 0.3220588235294118}
app.py CHANGED
@@ -1,92 +1,113 @@
1
- import os
2
  import json
3
- import glob
4
- from collections import defaultdict
5
  import pandas as pd
6
  import gradio as gr
7
  from content import *
8
  from css import *
9
- import glob
10
 
11
- ARC = "arc"
12
- HELLASWAG = "hellaswag"
13
- MMLU = "mmlu"
14
- TRUTHFULQA = "truthfulqa"
15
- BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]
16
-
17
- METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
18
-
19
- LANGS = 'ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh'.split(',')
 
 
 
 
 
 
 
 
20
 
21
- LANG_NAME = {
22
- 'ar': 'Arabic',
23
- 'bn': 'Bengali',
24
- 'ca': 'Catalan',
25
- 'da': 'Danish',
26
- 'de': 'German',
27
- 'es': 'Spanish',
28
- 'eu': 'Basque',
29
- 'fr': 'French',
30
- 'gu': 'Gujarati',
31
- 'hi': 'Hindi',
32
- 'hr': 'Croatian',
33
- 'hu': 'Hungarian',
34
- 'hy': 'Armenian',
35
- 'id': 'Indonesian',
36
- 'it': 'Italian',
37
- 'kn': 'Kannada',
38
- 'ml': 'Malayalam',
39
- 'mr': 'Marathi',
40
- 'ne': 'Nepali',
41
- 'nl': 'Dutch',
42
- 'pt': 'Portuguese',
43
- 'ro': 'Romanian',
44
- 'ru': 'Russian',
45
- 'sk': 'Slovak',
46
- 'sr': 'Serbian',
47
- 'sv': 'Swedish',
48
- 'ta': 'Tamil',
49
- 'te': 'Telugu',
50
- 'uk': 'Ukrainian',
51
- 'vi': 'Vietnamese',
52
- 'zh': 'Chinese'
53
  }
 
 
 
 
 
 
 
 
 
54
 
55
- NONE_COL = "None"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
 
 
57
 
58
- COLS = ["Method", "Model" , "SS Easy", "SS Medium", "SS Hard", "MS Easy", "MS Meduium", "MS Hard", "Overall", NONE_COL]
59
- TYPES = ["str", "str", "number", "number", "number", "number", "number", "number", "number", "number" , "number"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
- df = []
62
- row = ["React", "Qwen-plus" , "10.5", "20.6", "30.4", "10.5", "20.6", "30.4", "20", NONE_COL]
 
63
 
64
- df.append(row)
65
- df.append(row)
66
- df.append(row)
67
- df.append(row)
68
- df = pd.DataFrame.from_records(df, columns=COLS)
69
- df = df.sort_values(by=["Method", "Overall"], ascending=False)
70
- df = df[COLS]
71
  demo = gr.Blocks(css=CUSTOM_CSS)
72
  with demo:
73
  gr.HTML(TITLE)
74
  gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
75
  gr.Markdown(HOW_TO, elem_classes="markdown-text")
76
- print(TYPES)
77
- print(df.columns)
78
  with gr.Group():
79
- with gr.Tab("Results: Agent"):
80
  leaderboard_table_test = gr.components.Dataframe(
81
- value=df, datatype=TYPES, interactive=False,
82
- column_widths = ["20%"] * len(df.columns)
83
  )
84
- with gr.Tab("Results: RAG-system"):
85
  leaderboard_table_val = gr.components.Dataframe(
86
- value=df, datatype=TYPES, interactive=False,
87
  column_widths=["20%"]
88
  )
89
-
90
 
91
  gr.Markdown(CREDIT, elem_classes="markdown-text")
92
  gr.Markdown(CITATION, elem_classes="markdown-text")
 
 
1
  import json
 
 
2
  import pandas as pd
3
  import gradio as gr
4
  from content import *
5
  from css import *
 
6
 
7
+ NONE_COL = "Ranking"
8
+
9
+ AGENT_COLS = ["Method", "Model" , "SS Easy", "SS Medium", "SS Hard", "MS Easy", "MS Meduium", "MS Hard", "Overall", NONE_COL]
10
+ AGENT_TYPES = ["str", "str", "number", "number", "number", "number", "number", "number", "number", "number" , "number"]
11
+ model_name_adic = {
12
+ "qwen-plus": "Qwen-Plus",
13
+ "qwen2.5-72b-instruct": "Qwen2.5-72B",
14
+ "qwen2.5-7b-instruct": "Qwen2.5-7B",
15
+ "qwen2.5-14b-instruct": "Qwen2.5-14B",
16
+ "qwen2.5-32b-instruct": "Qwen2.5-32B",
17
+ "gpt-4o": "GPT-4o",
18
+ }
19
+ method_name_adic = {
20
+ "reflexion": "Relfexion",
21
+ "react": "React",
22
+ "seeker": "WebWalker",
23
+ }
24
 
25
+ rag_name_adic = {
26
+ "kimi": "Kimi",
27
+ "mindsearch": "MindSearch",
28
+ "navie": "Navie RAG",
29
+ "o1": "o1",
30
+ "tongyi": "Tongyi",
31
+ "wenxin": "ERNIE",
32
+ "gemini": "Gemini",
33
+ "gemini_search": "Gemini w/ Search",
34
+ "doubao": "Doubao",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  }
36
+ agent_ranking = []
37
+ with open("agents_result.jsonl", "r") as f:
38
+ for line in f:
39
+ item = json.loads(line)
40
+ agent_ranking.append([method_name_adic[item["method"]], model_name_adic[item["model"]], item["overall"]])
41
+ agent_ranking = sorted(agent_ranking, key=lambda x: x[2], reverse=False)
42
+ ranking_dict = {}
43
+ for i, (method, model, score) in enumerate(agent_ranking):
44
+ ranking_dict[method+model] = i
45
 
46
+ agent_df = []
47
+ with open("agents_result.jsonl", "r") as f:
48
+ for line in f:
49
+ item = json.loads(line)
50
+ agent_df.append([method_name_adic[item["method"]], model_name_adic[item["model"]],
51
+ f"{item['ss_easy'] * 100:.2f}",
52
+ f"{item['ss_medium'] * 100:.2f}",
53
+ f"{item['ss_hard'] * 100:.2f}",
54
+ f"{item['ms_easy'] * 100:.2f}",
55
+ f"{item['ms_medium'] * 100:.2f}",
56
+ f"{item['ms_hard'] * 100:.2f}",
57
+ f"{item['overall'] * 100:.2f}",
58
+ ranking_dict[method_name_adic[item["method"]] + model_name_adic[item["model"]]]])
59
+ agent_df = pd.DataFrame.from_records(agent_df, columns=AGENT_COLS)
60
+ agent_df = agent_df.sort_values(by=["Ranking"], ascending=False)
61
+ agent_df = agent_df[AGENT_COLS]
62
 
63
+ RAG_COLS = ["System", "SS Easy", "SS Medium", "SS Hard", "MS Easy", "MS Meduium", "MS Hard", "Overall", NONE_COL]
64
+ RAG_TYPES = ["str", "number", "number", "number", "number", "number", "number", "number", "number" , "number"]
65
 
66
+ rag_ranking = []
67
+ with open("rag_result.jsonl", "r") as f:
68
+ for line in f:
69
+ item = json.loads(line)
70
+ rag_ranking.append([rag_name_adic[item["system"]], item["overall"]])
71
+ rag_ranking = sorted(rag_ranking, key=lambda x: x[1], reverse=False)
72
+ ranking_dict = {}
73
+ for i, (system, score) in enumerate(rag_ranking):
74
+ ranking_dict[system] = i
75
+ rag_df = []
76
+ with open("rag_result.jsonl", "r") as f:
77
+ for line in f:
78
+ item = json.loads(line)
79
+ rag_df.append([rag_name_adic[item["system"]],
80
+ f"{item['ss_easy'] * 100:.2f}",
81
+ f"{item['ss_medium'] * 100:.2f}",
82
+ f"{item['ss_hard'] * 100:.2f}",
83
+ f"{item['ms_easy'] * 100:.2f}",
84
+ f"{item['ms_medium'] * 100:.2f}",
85
+ f"{item['ms_hard'] * 100:.2f}",
86
+ f"{item['overall'] * 100:.2f}",
87
+ ranking_dict[rag_name_adic[item["system"]]]])
88
 
89
+ rag_df = pd.DataFrame.from_records(rag_df, columns=RAG_COLS)
90
+ rag_df = rag_df.sort_values(by=["Ranking"], ascending=False)
91
+ rag_df = rag_df[RAG_COLS]
92
 
 
 
 
 
 
 
 
93
  demo = gr.Blocks(css=CUSTOM_CSS)
94
  with demo:
95
  gr.HTML(TITLE)
96
  gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
97
  gr.Markdown(HOW_TO, elem_classes="markdown-text")
98
+ gr.Markdown("## Leaderboard")
 
99
  with gr.Group():
100
+ with gr.Tab("Results: Agent πŸ€–οΈ"):
101
  leaderboard_table_test = gr.components.Dataframe(
102
+ value=agent_df, datatype=AGENT_TYPES, interactive=False,
103
+ column_widths = ["20%"] * len(agent_df.columns)
104
  )
105
+ with gr.Tab("Results: RAG-system πŸ”"):
106
  leaderboard_table_val = gr.components.Dataframe(
107
+ value=rag_df, datatype=RAG_TYPES, interactive=False,
108
  column_widths=["20%"]
109
  )
110
+ gr.Markdown("SS denotes single-source, and MS denotes multi-source. Easy, Medium, and Hard denote the difficulty level of the question.")
111
 
112
  gr.Markdown(CREDIT, elem_classes="markdown-text")
113
  gr.Markdown(CITATION, elem_classes="markdown-text")
content.py CHANGED
@@ -1,30 +1,58 @@
1
  TITLE = '<h1 align="center" id="space-title">πŸ† WebWalkerQA Leaderboard</h1>'
2
 
3
  INTRO_TEXT = f"""
4
- ## About
5
- This leaderboard shows the performance of models on the WebWalkerQA benchmark. The WebWalkerQA benchmark is a collection of question-answering datasets that test the ability of models to answer questions about web pages.
6
  """
7
 
8
  HOW_TO = f"""
9
- ## How to list your model performance on this leaderboard:
10
- Send a email to [email protected] or jialongwu@seu.edu.cn.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  """
12
 
13
  CREDIT = f"""
14
- ## Credit
15
- To make this website, we use the following resources:
16
- - Evaluation code (EleutherAI's lm_evaluation_harness repo)
17
- - Leaderboard code (Huggingface4's open_llm_leaderboard and repo)
 
18
  """
19
 
20
 
21
  CITATION = f"""
22
- ## Citation
23
- ```
24
- @misc{{lai2023openllmbenchmark,
25
- author = {{Viet Lai and Nghia Trung Ngo and Amir Pouran Ben Veyseh and Franck Dernoncourt and Thien Huu Nguyen}},
26
- title={{Open Multilingual LLM Evaluation Leaderboard}},
27
- year={{2023}}
 
 
 
 
28
  }}
29
  ```
30
  """
 
1
  TITLE = '<h1 align="center" id="space-title">πŸ† WebWalkerQA Leaderboard</h1>'
2
 
3
  INTRO_TEXT = f"""
4
+ ## πŸ“– About
5
+ This leaderboard showcases the performance of models on the **WebWalkerQA benchmark**. WebWalkerQA is a collection of question-answering datasets designed to test models' ability to answer questions about web pages.
6
  """
7
 
8
  HOW_TO = f"""
9
+ ## πŸ—‚οΈ Data
10
+ The WebWalkerQA dataset is available on πŸ€— [Hugging Face](https://huggingface.co/datasets/callanwu/WebWalkerQA). It comprises **680 question-answer pairs**, each linked to a corresponding web page. The benchmark is divided into two key components:
11
+
12
+ - **Agent πŸ€–οΈ**
13
+ - **RAG-system πŸ”**
14
+
15
+ ## πŸš€ How to Submit Your Method
16
+
17
+ ### πŸ“ Submission Steps:
18
+ To list your method's performance on this leaderboard, email **[email protected]** or **[email protected]** with the following:
19
+
20
+ 1. A JSONL file in the format:
21
+ ```jsonl
22
+ {{"question": "question_text", "prediction": "predicted_answer_text"}}
23
+ ```
24
+ 2. Include the following details in your email:
25
+ - **User Name**
26
+ - **Type** (RAG-system or Agent)
27
+ - **Method Name**
28
+
29
+ Your method will be evaluated and added to the leaderboard. For reference, check out the [evaluation code](https://github.com/Alibaba-NLP/WebWalker/src/evaluate.py).
30
+
31
+ We will evaluate the performance of your method and list it on the leaderboard.
32
+
33
+ For reference, you can check the [evaluation code](https://github.com/Alibaba-NLP/WebWalker/src/evaluate.py).
34
  """
35
 
36
  CREDIT = f"""
37
+ ## πŸ™Œ Credit
38
+ This website is built using the following resources:
39
+
40
+ - **Evaluation Code**: Langchain's cot_qa evaluator
41
+ - **Leaderboard Code**: Huggingface4's open_llm_leaderboard
42
  """
43
 
44
 
45
  CITATION = f"""
46
+ ## 🚩Citation
47
+
48
+ If this work is helpful, please kindly cite as:
49
+
50
+ ```bigquery
51
+ @article{{wu2024webwalker,
52
+ title={{WebWalker: Benchmarking LLMs in Web Traversal}},
53
+ author={{Wu, Jialong and others}},
54
+ journal={{arXiv preprint arXiv:2411.02937}},
55
+ year={{2024}}
56
  }}
57
  ```
58
  """
overall.jpg ADDED
rag_result.jsonl ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {"system": "doubao", "ss_easy": 0.45, "ss_medium": 0.15, "ss_hard": 0.18333333333333332, "ms_easy": 0.1375, "ms_medium": 0.08571428571428572, "ms_hard": 0.1, "overall": 0.1676470588235294}
2
+ {"system": "gemini", "ss_easy": 0.125, "ss_medium": 0.07857142857142857, "ss_hard": 0.08333333333333333, "ms_easy": 0.1125, "ms_medium": 0.06428571428571428, "ms_hard": 0.05, "overall": 0.08088235294117647}
3
+ {"system": "gemini_search", "ss_easy": 0.4, "ss_medium": 0.32142857142857145, "ss_hard": 0.2916666666666667, "ms_easy": 0.3, "ms_medium": 0.2357142857142857, "ms_hard": 0.175, "overall": 0.27941176470588236}
4
+ {"system": "mindsearch", "ss_easy": 0.15, "ss_medium": 0.11428571428571428, "ss_hard": 0.10833333333333334, "ms_easy": 0.0875, "ms_medium": 0.12142857142857143, "ms_hard": 0.1, "overall": 0.11323529411764706}
5
+ {"system": "navie", "ss_easy": 0.375, "ss_medium": 0.2571428571428571, "ss_hard": 0.26666666666666666, "ms_easy": 0.1, "ms_medium": 0.14285714285714285, "ms_hard": 0.125, "overall": 0.2073529411764706}
6
+ {"system": "o1", "ss_easy": 0.1625, "ss_medium": 0.1, "ss_hard": 0.09166666666666666, "ms_easy": 0.075, "ms_medium": 0.10714285714285714, "ms_hard": 0.06666666666666667, "overall": 0.09852941176470588}
7
+ {"system": "wenxin", "ss_easy": 0.525, "ss_medium": 0.3, "ss_hard": 0.2833333333333333, "ms_easy": 0.2125, "ms_medium": 0.18571428571428572, "ms_hard": 0.3, "overall": 0.2897058823529412}
8
+ {"system": "tongyi", "ss_easy": 0.4125, "ss_medium": 0.45, "ss_hard": 0.4166666666666667, "ms_easy": 0.4, "ms_medium": 0.4143333, "ms_hard": 0.3416666666666667, "overall": 0.4073}
9
+ {"system": "kimi", "ss_easy": 0.775, "ss_medium": 0.4142857142857143, "ss_hard": 0.4083333333333333, "ms_easy": 0.2625, "ms_medium": 0.2642857142857143, "ms_hard": 0.225, "overall": 0.3735294117647059}