Spaces:
Running
Running
update
Browse files- agents_result.jsonl +18 -0
- app.py +86 -65
- content.py +42 -14
- overall.jpg +0 -0
- rag_result.jsonl +9 -0
agents_result.jsonl
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"method": "react", "model": "qwen2.5-7b-instruct", "ss_easy": 0.1, "ss_medium": 0.18571428571428572, "ss_hard": 0.09166666666666666, "ms_easy": 0.175, "ms_medium": 0.10714285714285714, "ms_hard": 0.058333333333333334, "overall": 0.11911764705882352}
|
2 |
+
{"method": "reflexion", "model": "qwen2.5-7b-instruct", "ss_easy": 0.0875, "ss_medium": 0.25, "ss_hard": 0.11666666666666667, "ms_easy": 0.3, "ms_medium": 0.15714285714285714, "ms_hard": 0.041666666666666664, "overall": 0.15735294117647058}
|
3 |
+
{"method": "seeker", "model": "qwen2.5-7b-instruct", "ss_easy": 0.075, "ss_medium": 0.2571428571428571, "ss_hard": 0.125, "ms_easy": 0.1875, "ms_medium": 0.2, "ms_hard": 0.058333333333333334, "overall": 0.15735294117647058}
|
4 |
+
{"method": "react", "model": "qwen2.5-14b-instruct", "ss_easy": 0.0875, "ss_medium": 0.32142857142857145, "ss_hard": 0.15, "ms_easy": 0.275, "ms_medium": 0.22857142857142856, "ms_hard": 0.05, "overall": 0.19117647058823528}
|
5 |
+
{"method": "reflexion", "model": "qwen2.5-14b-instruct", "ss_easy": 0.1375, "ss_medium": 0.34285714285714286, "ss_hard": 0.15, "ms_easy": 0.3625, "ms_medium": 0.22857142857142856, "ms_hard": 0.058333333333333334, "overall": 0.21323529411764705}
|
6 |
+
{"method": "seeker", "model": "qwen2.5-14b-instruct", "ss_easy": 0.0875, "ss_medium": 0.4142857142857143, "ss_hard": 0.23333333333333334, "ms_easy": 0.3, "ms_medium": 0.22857142857142856, "ms_hard": 0.1, "overall": 0.23676470588235293}
|
7 |
+
{"method": "react", "model": "qwen2.5-32b-instruct", "ss_easy": 0.1, "ss_medium": 0.35714285714285715, "ss_hard": 0.16666666666666666, "ms_easy": 0.3625, "ms_medium": 0.18571428571428572, "ms_hard": 0.08333333333333333, "overall": 0.21029411764705883}
|
8 |
+
{"method": "reflexion", "model": "qwen2.5-32b-instruct", "ss_easy": 0.075, "ss_medium": 0.32857142857142857, "ss_hard": 0.16666666666666666, "ms_easy": 0.3125, "ms_medium": 0.22857142857142856, "ms_hard": 0.058333333333333334, "overall": 0.2}
|
9 |
+
{"method": "seeker", "model": "qwen2.5-32b-instruct", "ss_easy": 0.1125, "ss_medium": 0.34285714285714286, "ss_hard": 0.225, "ms_easy": 0.275, "ms_medium": 0.24285714285714285, "ms_hard": 0.1, "overall": 0.2235294117647059}
|
10 |
+
{"method": "react", "model": "qwen2.5-72b-instruct", "ss_easy": 0.125, "ss_medium": 0.38571428571428573, "ss_hard": 0.2, "ms_easy": 0.45, "ms_medium": 0.3142857142857143, "ms_hard": 0.1, "overall": 0.2647058823529412}
|
11 |
+
{"method": "reflexion", "model": "qwen2.5-72b-instruct", "ss_easy": 0.1375, "ss_medium": 0.44285714285714284, "ss_hard": 0.2833333333333333, "ms_easy": 0.3625, "ms_medium": 0.25, "ms_hard": 0.125, "overall": 0.2735294117647059}
|
12 |
+
{"method": "seeker", "model": "qwen2.5-72b-instruct", "ss_easy": 0.15, "ss_medium": 0.4857142857142857, "ss_hard": 0.25833333333333336, "ms_easy": 0.35, "ms_medium": 0.29285714285714287, "ms_hard": 0.15, "overall": 0.2911764705882353}
|
13 |
+
{"method": "react", "model": "qwen-plus", "ss_easy": 0.1375, "ss_medium": 0.4, "ss_hard": 0.24166666666666667, "ms_easy": 0.475, "ms_medium": 0.3, "ms_hard": 0.15, "overall": 0.2852941176470588}
|
14 |
+
{"method": "reflexion", "model": "qwen-plus", "ss_easy": 0.1, "ss_medium": 0.4857142857142857, "ss_hard": 0.2833333333333333, "ms_easy": 0.35, "ms_medium": 0.2785714285714286, "ms_hard": 0.14166666666666666, "overall": 0.2852941176470588}
|
15 |
+
{"method": "seeker", "model": "qwen-plus", "ss_easy": 0.1375, "ss_medium": 0.4714285714285714, "ss_hard": 0.3, "ms_easy": 0.35, "ms_medium": 0.2714285714285714, "ms_hard": 0.15, "overall": 0.2897058823529412}
|
16 |
+
{"method": "react", "model": "gpt-4o", "ss_easy": 0.1125, "ss_medium": 0.45, "ss_hard": 0.3, "ms_easy": 0.325, "ms_medium": 0.30714285714285716, "ms_hard": 0.15, "overall": 0.2867647058823529}
|
17 |
+
{"method": "reflexion", "model": "gpt-4o", "ss_easy": 0.1375, "ss_medium": 0.5142857142857142, "ss_hard": 0.30833333333333335, "ms_easy": 0.35, "ms_medium": 0.2714285714285714, "ms_hard": 0.16666666666666666, "overall": 0.3029411764705882}
|
18 |
+
{"method": "seeker", "model": "gpt-4o", "ss_easy": 0.1, "ss_medium": 0.5, "ss_hard": 0.3, "ms_easy": 0.475, "ms_medium": 0.34285714285714286, "ms_hard": 0.15833333333333333, "overall": 0.3220588235294118}
|
app.py
CHANGED
@@ -1,92 +1,113 @@
|
|
1 |
-
import os
|
2 |
import json
|
3 |
-
import glob
|
4 |
-
from collections import defaultdict
|
5 |
import pandas as pd
|
6 |
import gradio as gr
|
7 |
from content import *
|
8 |
from css import *
|
9 |
-
import glob
|
10 |
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
'hi': 'Hindi',
|
32 |
-
'hr': 'Croatian',
|
33 |
-
'hu': 'Hungarian',
|
34 |
-
'hy': 'Armenian',
|
35 |
-
'id': 'Indonesian',
|
36 |
-
'it': 'Italian',
|
37 |
-
'kn': 'Kannada',
|
38 |
-
'ml': 'Malayalam',
|
39 |
-
'mr': 'Marathi',
|
40 |
-
'ne': 'Nepali',
|
41 |
-
'nl': 'Dutch',
|
42 |
-
'pt': 'Portuguese',
|
43 |
-
'ro': 'Romanian',
|
44 |
-
'ru': 'Russian',
|
45 |
-
'sk': 'Slovak',
|
46 |
-
'sr': 'Serbian',
|
47 |
-
'sv': 'Swedish',
|
48 |
-
'ta': 'Tamil',
|
49 |
-
'te': 'Telugu',
|
50 |
-
'uk': 'Ukrainian',
|
51 |
-
'vi': 'Vietnamese',
|
52 |
-
'zh': 'Chinese'
|
53 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
|
|
|
|
|
57 |
|
58 |
-
|
59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
|
61 |
-
|
62 |
-
|
|
|
63 |
|
64 |
-
df.append(row)
|
65 |
-
df.append(row)
|
66 |
-
df.append(row)
|
67 |
-
df.append(row)
|
68 |
-
df = pd.DataFrame.from_records(df, columns=COLS)
|
69 |
-
df = df.sort_values(by=["Method", "Overall"], ascending=False)
|
70 |
-
df = df[COLS]
|
71 |
demo = gr.Blocks(css=CUSTOM_CSS)
|
72 |
with demo:
|
73 |
gr.HTML(TITLE)
|
74 |
gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
|
75 |
gr.Markdown(HOW_TO, elem_classes="markdown-text")
|
76 |
-
|
77 |
-
print(df.columns)
|
78 |
with gr.Group():
|
79 |
-
with gr.Tab("Results: Agent"):
|
80 |
leaderboard_table_test = gr.components.Dataframe(
|
81 |
-
value=
|
82 |
-
column_widths = ["20%"] * len(
|
83 |
)
|
84 |
-
with gr.Tab("Results: RAG-system"):
|
85 |
leaderboard_table_val = gr.components.Dataframe(
|
86 |
-
value=
|
87 |
column_widths=["20%"]
|
88 |
)
|
89 |
-
|
90 |
|
91 |
gr.Markdown(CREDIT, elem_classes="markdown-text")
|
92 |
gr.Markdown(CITATION, elem_classes="markdown-text")
|
|
|
|
|
1 |
import json
|
|
|
|
|
2 |
import pandas as pd
|
3 |
import gradio as gr
|
4 |
from content import *
|
5 |
from css import *
|
|
|
6 |
|
7 |
+
NONE_COL = "Ranking"
|
8 |
+
|
9 |
+
AGENT_COLS = ["Method", "Model" , "SS Easy", "SS Medium", "SS Hard", "MS Easy", "MS Meduium", "MS Hard", "Overall", NONE_COL]
|
10 |
+
AGENT_TYPES = ["str", "str", "number", "number", "number", "number", "number", "number", "number", "number" , "number"]
|
11 |
+
model_name_adic = {
|
12 |
+
"qwen-plus": "Qwen-Plus",
|
13 |
+
"qwen2.5-72b-instruct": "Qwen2.5-72B",
|
14 |
+
"qwen2.5-7b-instruct": "Qwen2.5-7B",
|
15 |
+
"qwen2.5-14b-instruct": "Qwen2.5-14B",
|
16 |
+
"qwen2.5-32b-instruct": "Qwen2.5-32B",
|
17 |
+
"gpt-4o": "GPT-4o",
|
18 |
+
}
|
19 |
+
method_name_adic = {
|
20 |
+
"reflexion": "Relfexion",
|
21 |
+
"react": "React",
|
22 |
+
"seeker": "WebWalker",
|
23 |
+
}
|
24 |
|
25 |
+
rag_name_adic = {
|
26 |
+
"kimi": "Kimi",
|
27 |
+
"mindsearch": "MindSearch",
|
28 |
+
"navie": "Navie RAG",
|
29 |
+
"o1": "o1",
|
30 |
+
"tongyi": "Tongyi",
|
31 |
+
"wenxin": "ERNIE",
|
32 |
+
"gemini": "Gemini",
|
33 |
+
"gemini_search": "Gemini w/ Search",
|
34 |
+
"doubao": "Doubao",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
}
|
36 |
+
agent_ranking = []
|
37 |
+
with open("agents_result.jsonl", "r") as f:
|
38 |
+
for line in f:
|
39 |
+
item = json.loads(line)
|
40 |
+
agent_ranking.append([method_name_adic[item["method"]], model_name_adic[item["model"]], item["overall"]])
|
41 |
+
agent_ranking = sorted(agent_ranking, key=lambda x: x[2], reverse=False)
|
42 |
+
ranking_dict = {}
|
43 |
+
for i, (method, model, score) in enumerate(agent_ranking):
|
44 |
+
ranking_dict[method+model] = i
|
45 |
|
46 |
+
agent_df = []
|
47 |
+
with open("agents_result.jsonl", "r") as f:
|
48 |
+
for line in f:
|
49 |
+
item = json.loads(line)
|
50 |
+
agent_df.append([method_name_adic[item["method"]], model_name_adic[item["model"]],
|
51 |
+
f"{item['ss_easy'] * 100:.2f}",
|
52 |
+
f"{item['ss_medium'] * 100:.2f}",
|
53 |
+
f"{item['ss_hard'] * 100:.2f}",
|
54 |
+
f"{item['ms_easy'] * 100:.2f}",
|
55 |
+
f"{item['ms_medium'] * 100:.2f}",
|
56 |
+
f"{item['ms_hard'] * 100:.2f}",
|
57 |
+
f"{item['overall'] * 100:.2f}",
|
58 |
+
ranking_dict[method_name_adic[item["method"]] + model_name_adic[item["model"]]]])
|
59 |
+
agent_df = pd.DataFrame.from_records(agent_df, columns=AGENT_COLS)
|
60 |
+
agent_df = agent_df.sort_values(by=["Ranking"], ascending=False)
|
61 |
+
agent_df = agent_df[AGENT_COLS]
|
62 |
|
63 |
+
RAG_COLS = ["System", "SS Easy", "SS Medium", "SS Hard", "MS Easy", "MS Meduium", "MS Hard", "Overall", NONE_COL]
|
64 |
+
RAG_TYPES = ["str", "number", "number", "number", "number", "number", "number", "number", "number" , "number"]
|
65 |
|
66 |
+
rag_ranking = []
|
67 |
+
with open("rag_result.jsonl", "r") as f:
|
68 |
+
for line in f:
|
69 |
+
item = json.loads(line)
|
70 |
+
rag_ranking.append([rag_name_adic[item["system"]], item["overall"]])
|
71 |
+
rag_ranking = sorted(rag_ranking, key=lambda x: x[1], reverse=False)
|
72 |
+
ranking_dict = {}
|
73 |
+
for i, (system, score) in enumerate(rag_ranking):
|
74 |
+
ranking_dict[system] = i
|
75 |
+
rag_df = []
|
76 |
+
with open("rag_result.jsonl", "r") as f:
|
77 |
+
for line in f:
|
78 |
+
item = json.loads(line)
|
79 |
+
rag_df.append([rag_name_adic[item["system"]],
|
80 |
+
f"{item['ss_easy'] * 100:.2f}",
|
81 |
+
f"{item['ss_medium'] * 100:.2f}",
|
82 |
+
f"{item['ss_hard'] * 100:.2f}",
|
83 |
+
f"{item['ms_easy'] * 100:.2f}",
|
84 |
+
f"{item['ms_medium'] * 100:.2f}",
|
85 |
+
f"{item['ms_hard'] * 100:.2f}",
|
86 |
+
f"{item['overall'] * 100:.2f}",
|
87 |
+
ranking_dict[rag_name_adic[item["system"]]]])
|
88 |
|
89 |
+
rag_df = pd.DataFrame.from_records(rag_df, columns=RAG_COLS)
|
90 |
+
rag_df = rag_df.sort_values(by=["Ranking"], ascending=False)
|
91 |
+
rag_df = rag_df[RAG_COLS]
|
92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
demo = gr.Blocks(css=CUSTOM_CSS)
|
94 |
with demo:
|
95 |
gr.HTML(TITLE)
|
96 |
gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
|
97 |
gr.Markdown(HOW_TO, elem_classes="markdown-text")
|
98 |
+
gr.Markdown("## Leaderboard")
|
|
|
99 |
with gr.Group():
|
100 |
+
with gr.Tab("Results: Agent π€οΈ"):
|
101 |
leaderboard_table_test = gr.components.Dataframe(
|
102 |
+
value=agent_df, datatype=AGENT_TYPES, interactive=False,
|
103 |
+
column_widths = ["20%"] * len(agent_df.columns)
|
104 |
)
|
105 |
+
with gr.Tab("Results: RAG-system π"):
|
106 |
leaderboard_table_val = gr.components.Dataframe(
|
107 |
+
value=rag_df, datatype=RAG_TYPES, interactive=False,
|
108 |
column_widths=["20%"]
|
109 |
)
|
110 |
+
gr.Markdown("SS denotes single-source, and MS denotes multi-source. Easy, Medium, and Hard denote the difficulty level of the question.")
|
111 |
|
112 |
gr.Markdown(CREDIT, elem_classes="markdown-text")
|
113 |
gr.Markdown(CITATION, elem_classes="markdown-text")
|
content.py
CHANGED
@@ -1,30 +1,58 @@
|
|
1 |
TITLE = '<h1 align="center" id="space-title">π WebWalkerQA Leaderboard</h1>'
|
2 |
|
3 |
INTRO_TEXT = f"""
|
4 |
-
## About
|
5 |
-
This leaderboard
|
6 |
"""
|
7 |
|
8 |
HOW_TO = f"""
|
9 |
-
##
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
"""
|
12 |
|
13 |
CREDIT = f"""
|
14 |
-
## Credit
|
15 |
-
|
16 |
-
|
17 |
-
-
|
|
|
18 |
"""
|
19 |
|
20 |
|
21 |
CITATION = f"""
|
22 |
-
## Citation
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
|
|
28 |
}}
|
29 |
```
|
30 |
"""
|
|
|
1 |
TITLE = '<h1 align="center" id="space-title">π WebWalkerQA Leaderboard</h1>'
|
2 |
|
3 |
INTRO_TEXT = f"""
|
4 |
+
## π About
|
5 |
+
This leaderboard showcases the performance of models on the **WebWalkerQA benchmark**. WebWalkerQA is a collection of question-answering datasets designed to test models' ability to answer questions about web pages.
|
6 |
"""
|
7 |
|
8 |
HOW_TO = f"""
|
9 |
+
## ποΈ Data
|
10 |
+
The WebWalkerQA dataset is available on π€ [Hugging Face](https://huggingface.co/datasets/callanwu/WebWalkerQA). It comprises **680 question-answer pairs**, each linked to a corresponding web page. The benchmark is divided into two key components:
|
11 |
+
|
12 |
+
- **Agent π€οΈ**
|
13 |
+
- **RAG-system π**
|
14 |
+
|
15 |
+
## π How to Submit Your Method
|
16 |
+
|
17 |
+
### π Submission Steps:
|
18 |
+
To list your method's performance on this leaderboard, email **[email protected]** or **[email protected]** with the following:
|
19 |
+
|
20 |
+
1. A JSONL file in the format:
|
21 |
+
```jsonl
|
22 |
+
{{"question": "question_text", "prediction": "predicted_answer_text"}}
|
23 |
+
```
|
24 |
+
2. Include the following details in your email:
|
25 |
+
- **User Name**
|
26 |
+
- **Type** (RAG-system or Agent)
|
27 |
+
- **Method Name**
|
28 |
+
|
29 |
+
Your method will be evaluated and added to the leaderboard. For reference, check out the [evaluation code](https://github.com/Alibaba-NLP/WebWalker/src/evaluate.py).
|
30 |
+
|
31 |
+
We will evaluate the performance of your method and list it on the leaderboard.
|
32 |
+
|
33 |
+
For reference, you can check the [evaluation code](https://github.com/Alibaba-NLP/WebWalker/src/evaluate.py).
|
34 |
"""
|
35 |
|
36 |
CREDIT = f"""
|
37 |
+
## π Credit
|
38 |
+
This website is built using the following resources:
|
39 |
+
|
40 |
+
- **Evaluation Code**: Langchain's cot_qa evaluator
|
41 |
+
- **Leaderboard Code**: Huggingface4's open_llm_leaderboard
|
42 |
"""
|
43 |
|
44 |
|
45 |
CITATION = f"""
|
46 |
+
## π©Citation
|
47 |
+
|
48 |
+
If this work is helpful, please kindly cite as:
|
49 |
+
|
50 |
+
```bigquery
|
51 |
+
@article{{wu2024webwalker,
|
52 |
+
title={{WebWalker: Benchmarking LLMs in Web Traversal}},
|
53 |
+
author={{Wu, Jialong and others}},
|
54 |
+
journal={{arXiv preprint arXiv:2411.02937}},
|
55 |
+
year={{2024}}
|
56 |
}}
|
57 |
```
|
58 |
"""
|
overall.jpg
ADDED
rag_result.jsonl
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"system": "doubao", "ss_easy": 0.45, "ss_medium": 0.15, "ss_hard": 0.18333333333333332, "ms_easy": 0.1375, "ms_medium": 0.08571428571428572, "ms_hard": 0.1, "overall": 0.1676470588235294}
|
2 |
+
{"system": "gemini", "ss_easy": 0.125, "ss_medium": 0.07857142857142857, "ss_hard": 0.08333333333333333, "ms_easy": 0.1125, "ms_medium": 0.06428571428571428, "ms_hard": 0.05, "overall": 0.08088235294117647}
|
3 |
+
{"system": "gemini_search", "ss_easy": 0.4, "ss_medium": 0.32142857142857145, "ss_hard": 0.2916666666666667, "ms_easy": 0.3, "ms_medium": 0.2357142857142857, "ms_hard": 0.175, "overall": 0.27941176470588236}
|
4 |
+
{"system": "mindsearch", "ss_easy": 0.15, "ss_medium": 0.11428571428571428, "ss_hard": 0.10833333333333334, "ms_easy": 0.0875, "ms_medium": 0.12142857142857143, "ms_hard": 0.1, "overall": 0.11323529411764706}
|
5 |
+
{"system": "navie", "ss_easy": 0.375, "ss_medium": 0.2571428571428571, "ss_hard": 0.26666666666666666, "ms_easy": 0.1, "ms_medium": 0.14285714285714285, "ms_hard": 0.125, "overall": 0.2073529411764706}
|
6 |
+
{"system": "o1", "ss_easy": 0.1625, "ss_medium": 0.1, "ss_hard": 0.09166666666666666, "ms_easy": 0.075, "ms_medium": 0.10714285714285714, "ms_hard": 0.06666666666666667, "overall": 0.09852941176470588}
|
7 |
+
{"system": "wenxin", "ss_easy": 0.525, "ss_medium": 0.3, "ss_hard": 0.2833333333333333, "ms_easy": 0.2125, "ms_medium": 0.18571428571428572, "ms_hard": 0.3, "overall": 0.2897058823529412}
|
8 |
+
{"system": "tongyi", "ss_easy": 0.4125, "ss_medium": 0.45, "ss_hard": 0.4166666666666667, "ms_easy": 0.4, "ms_medium": 0.4143333, "ms_hard": 0.3416666666666667, "overall": 0.4073}
|
9 |
+
{"system": "kimi", "ss_easy": 0.775, "ss_medium": 0.4142857142857143, "ss_hard": 0.4083333333333333, "ms_easy": 0.2625, "ms_medium": 0.2642857142857143, "ms_hard": 0.225, "overall": 0.3735294117647059}
|