Spaces:

callanwu
/

WebWalkerQALeaderboard

Running

App Files Files Community

callanwu committed 8 days ago

Commit

404f089

1 Parent(s): 2c9628e

update

Browse files

Files changed (5) hide show

agents_result.jsonl +18 -0
app.py +86 -65
content.py +42 -14
overall.jpg +0 -0
rag_result.jsonl +9 -0

agents_result.jsonl ADDED Viewed

	@@ -0,0 +1,18 @@

+{"method": "react", "model": "qwen2.5-7b-instruct", "ss_easy": 0.1, "ss_medium": 0.18571428571428572, "ss_hard": 0.09166666666666666, "ms_easy": 0.175, "ms_medium": 0.10714285714285714, "ms_hard": 0.058333333333333334, "overall": 0.11911764705882352}
+{"method": "reflexion", "model": "qwen2.5-7b-instruct", "ss_easy": 0.0875, "ss_medium": 0.25, "ss_hard": 0.11666666666666667, "ms_easy": 0.3, "ms_medium": 0.15714285714285714, "ms_hard": 0.041666666666666664, "overall": 0.15735294117647058}
+{"method": "seeker", "model": "qwen2.5-7b-instruct", "ss_easy": 0.075, "ss_medium": 0.2571428571428571, "ss_hard": 0.125, "ms_easy": 0.1875, "ms_medium": 0.2, "ms_hard": 0.058333333333333334, "overall": 0.15735294117647058}
+{"method": "react", "model": "qwen2.5-14b-instruct", "ss_easy": 0.0875, "ss_medium": 0.32142857142857145, "ss_hard": 0.15, "ms_easy": 0.275, "ms_medium": 0.22857142857142856, "ms_hard": 0.05, "overall": 0.19117647058823528}
+{"method": "reflexion", "model": "qwen2.5-14b-instruct", "ss_easy": 0.1375, "ss_medium": 0.34285714285714286, "ss_hard": 0.15, "ms_easy": 0.3625, "ms_medium": 0.22857142857142856, "ms_hard": 0.058333333333333334, "overall": 0.21323529411764705}
+{"method": "seeker", "model": "qwen2.5-14b-instruct", "ss_easy": 0.0875, "ss_medium": 0.4142857142857143, "ss_hard": 0.23333333333333334, "ms_easy": 0.3, "ms_medium": 0.22857142857142856, "ms_hard": 0.1, "overall": 0.23676470588235293}
+{"method": "react", "model": "qwen2.5-32b-instruct", "ss_easy": 0.1, "ss_medium": 0.35714285714285715, "ss_hard": 0.16666666666666666, "ms_easy": 0.3625, "ms_medium": 0.18571428571428572, "ms_hard": 0.08333333333333333, "overall": 0.21029411764705883}
+{"method": "reflexion", "model": "qwen2.5-32b-instruct", "ss_easy": 0.075, "ss_medium": 0.32857142857142857, "ss_hard": 0.16666666666666666, "ms_easy": 0.3125, "ms_medium": 0.22857142857142856, "ms_hard": 0.058333333333333334, "overall": 0.2}
+{"method": "seeker", "model": "qwen2.5-32b-instruct", "ss_easy": 0.1125, "ss_medium": 0.34285714285714286, "ss_hard": 0.225, "ms_easy": 0.275, "ms_medium": 0.24285714285714285, "ms_hard": 0.1, "overall": 0.2235294117647059}
+{"method": "react", "model": "qwen2.5-72b-instruct", "ss_easy": 0.125, "ss_medium": 0.38571428571428573, "ss_hard": 0.2, "ms_easy": 0.45, "ms_medium": 0.3142857142857143, "ms_hard": 0.1, "overall": 0.2647058823529412}
+{"method": "reflexion", "model": "qwen2.5-72b-instruct", "ss_easy": 0.1375, "ss_medium": 0.44285714285714284, "ss_hard": 0.2833333333333333, "ms_easy": 0.3625, "ms_medium": 0.25, "ms_hard": 0.125, "overall": 0.2735294117647059}
+{"method": "seeker", "model": "qwen2.5-72b-instruct", "ss_easy": 0.15, "ss_medium": 0.4857142857142857, "ss_hard": 0.25833333333333336, "ms_easy": 0.35, "ms_medium": 0.29285714285714287, "ms_hard": 0.15, "overall": 0.2911764705882353}
+{"method": "react", "model": "qwen-plus", "ss_easy": 0.1375, "ss_medium": 0.4, "ss_hard": 0.24166666666666667, "ms_easy": 0.475, "ms_medium": 0.3, "ms_hard": 0.15, "overall": 0.2852941176470588}
+{"method": "reflexion", "model": "qwen-plus", "ss_easy": 0.1, "ss_medium": 0.4857142857142857, "ss_hard": 0.2833333333333333, "ms_easy": 0.35, "ms_medium": 0.2785714285714286, "ms_hard": 0.14166666666666666, "overall": 0.2852941176470588}
+{"method": "seeker", "model": "qwen-plus", "ss_easy": 0.1375, "ss_medium": 0.4714285714285714, "ss_hard": 0.3, "ms_easy": 0.35, "ms_medium": 0.2714285714285714, "ms_hard": 0.15, "overall": 0.2897058823529412}
+{"method": "react", "model": "gpt-4o", "ss_easy": 0.1125, "ss_medium": 0.45, "ss_hard": 0.3, "ms_easy": 0.325, "ms_medium": 0.30714285714285716, "ms_hard": 0.15, "overall": 0.2867647058823529}
+{"method": "reflexion", "model": "gpt-4o", "ss_easy": 0.1375, "ss_medium": 0.5142857142857142, "ss_hard": 0.30833333333333335, "ms_easy": 0.35, "ms_medium": 0.2714285714285714, "ms_hard": 0.16666666666666666, "overall": 0.3029411764705882}
+{"method": "seeker", "model": "gpt-4o", "ss_easy": 0.1, "ss_medium": 0.5, "ss_hard": 0.3, "ms_easy": 0.475, "ms_medium": 0.34285714285714286, "ms_hard": 0.15833333333333333, "overall": 0.3220588235294118}

app.py CHANGED Viewed

@@ -1,92 +1,113 @@
-import os
 import json
-import glob
-from collections import defaultdict
 import pandas as pd
 import gradio as gr
 from content import *
 from css import *
-import glob
-ARC = "arc"
-HELLASWAG = "hellaswag"
-MMLU = "mmlu"
-TRUTHFULQA = "truthfulqa"
-BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]
-METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
-LANGS = 'ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh'.split(',')
-LANG_NAME = {
-    'ar': 'Arabic',
-    'bn': 'Bengali',
-    'ca': 'Catalan',
-    'da': 'Danish',
-    'de': 'German',
-    'es': 'Spanish',
-    'eu': 'Basque',
-    'fr': 'French',
-    'gu': 'Gujarati',
-    'hi': 'Hindi',
-    'hr': 'Croatian',
-    'hu': 'Hungarian',
-    'hy': 'Armenian',
-    'id': 'Indonesian',
-    'it': 'Italian',
-    'kn': 'Kannada',
-    'ml': 'Malayalam',
-    'mr': 'Marathi',
-    'ne': 'Nepali',
-    'nl': 'Dutch',
-    'pt': 'Portuguese',
-    'ro': 'Romanian',
-    'ru': 'Russian',
-    'sk': 'Slovak',
-    'sr': 'Serbian',
-    'sv': 'Swedish',
-    'ta': 'Tamil',
-    'te': 'Telugu',
-    'uk': 'Ukrainian',
-    'vi': 'Vietnamese',
-    'zh': 'Chinese'
 }
-NONE_COL = "None"
-COLS = ["Method", "Model" , "SS Easy", "SS Medium", "SS Hard", "MS Easy", "MS Meduium", "MS Hard", "Overall", NONE_COL]
-TYPES = ["str", "str", "number", "number", "number", "number", "number", "number", "number", "number" , "number"]
-df = []
-row = ["React", "Qwen-plus" , "10.5", "20.6", "30.4", "10.5", "20.6", "30.4", "20", NONE_COL]
-df.append(row)
-df.append(row)
-df.append(row)
-df.append(row)
-df = pd.DataFrame.from_records(df, columns=COLS)
-df = df.sort_values(by=["Method", "Overall"], ascending=False)
-df = df[COLS]
 demo = gr.Blocks(css=CUSTOM_CSS)
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
     gr.Markdown(HOW_TO, elem_classes="markdown-text")
-    print(TYPES)
-    print(df.columns)
     with gr.Group():
-        with gr.Tab("Results: Agent"):
             leaderboard_table_test = gr.components.Dataframe(
-                value=df, datatype=TYPES, interactive=False,
-                column_widths = ["20%"] * len(df.columns)
             )
-        with gr.Tab("Results: RAG-system"):
             leaderboard_table_val = gr.components.Dataframe(
-                value=df, datatype=TYPES, interactive=False,
                 column_widths=["20%"]
         )
     gr.Markdown(CREDIT, elem_classes="markdown-text")
     gr.Markdown(CITATION, elem_classes="markdown-text")

 import json
 import pandas as pd
 import gradio as gr
 from content import *
 from css import *
+NONE_COL = "Ranking"
+AGENT_COLS = ["Method", "Model" , "SS Easy", "SS Medium", "SS Hard", "MS Easy", "MS Meduium", "MS Hard", "Overall", NONE_COL]
+AGENT_TYPES = ["str", "str", "number", "number", "number", "number", "number", "number", "number", "number" , "number"]
+model_name_adic = {
+    "qwen-plus": "Qwen-Plus",
+    "qwen2.5-72b-instruct": "Qwen2.5-72B",
+    "qwen2.5-7b-instruct": "Qwen2.5-7B",
+    "qwen2.5-14b-instruct": "Qwen2.5-14B",
+    "qwen2.5-32b-instruct": "Qwen2.5-32B",
+    "gpt-4o": "GPT-4o",
+}
+method_name_adic = {
+    "reflexion": "Relfexion",
+    "react": "React",
+    "seeker": "WebWalker",
+}
+rag_name_adic = {
+    "kimi": "Kimi",
+    "mindsearch": "MindSearch",
+    "navie": "Navie RAG",
+    "o1": "o1",
+    "tongyi": "Tongyi",
+    "wenxin": "ERNIE",
+    "gemini": "Gemini",
+    "gemini_search": "Gemini w/ Search",
+    "doubao": "Doubao",
 }
+agent_ranking = []
+with open("agents_result.jsonl", "r") as f:
+    for line in f:
+        item = json.loads(line)
+        agent_ranking.append([method_name_adic[item["method"]], model_name_adic[item["model"]], item["overall"]])
+agent_ranking = sorted(agent_ranking, key=lambda x: x[2], reverse=False)
+ranking_dict = {}
+for i, (method, model, score) in enumerate(agent_ranking):
+    ranking_dict[method+model] = i
+agent_df = []
+with open("agents_result.jsonl", "r") as f:
+    for line in f:
+        item = json.loads(line)
+        agent_df.append([method_name_adic[item["method"]], model_name_adic[item["model"]],
+                         f"{item['ss_easy'] * 100:.2f}",
+                       f"{item['ss_medium'] * 100:.2f}",
+                       f"{item['ss_hard'] * 100:.2f}",
+                       f"{item['ms_easy'] * 100:.2f}",
+                       f"{item['ms_medium'] * 100:.2f}",
+                       f"{item['ms_hard'] * 100:.2f}",
+                       f"{item['overall'] * 100:.2f}",
+                       ranking_dict[method_name_adic[item["method"]] + model_name_adic[item["model"]]]])
+agent_df = pd.DataFrame.from_records(agent_df, columns=AGENT_COLS)
+agent_df = agent_df.sort_values(by=["Ranking"], ascending=False)
+agent_df = agent_df[AGENT_COLS]
+RAG_COLS = ["System", "SS Easy", "SS Medium", "SS Hard", "MS Easy", "MS Meduium", "MS Hard", "Overall", NONE_COL]
+RAG_TYPES = ["str", "number", "number", "number", "number", "number", "number", "number", "number" , "number"]
+rag_ranking = []
+with open("rag_result.jsonl", "r") as f:
+    for line in f:
+        item = json.loads(line)
+        rag_ranking.append([rag_name_adic[item["system"]], item["overall"]])
+rag_ranking = sorted(rag_ranking, key=lambda x: x[1], reverse=False)
+ranking_dict = {}
+for i, (system, score) in enumerate(rag_ranking):
+    ranking_dict[system] = i
+rag_df = []
+with open("rag_result.jsonl", "r") as f:
+    for line in f:
+        item = json.loads(line)
+        rag_df.append([rag_name_adic[item["system"]],
+                       f"{item['ss_easy'] * 100:.2f}",
+                       f"{item['ss_medium'] * 100:.2f}",
+                       f"{item['ss_hard'] * 100:.2f}",
+                       f"{item['ms_easy'] * 100:.2f}",
+                       f"{item['ms_medium'] * 100:.2f}",
+                       f"{item['ms_hard'] * 100:.2f}",
+                       f"{item['overall'] * 100:.2f}",
+                       ranking_dict[rag_name_adic[item["system"]]]])
+rag_df = pd.DataFrame.from_records(rag_df, columns=RAG_COLS)
+rag_df = rag_df.sort_values(by=["Ranking"], ascending=False)
+rag_df = rag_df[RAG_COLS]
 demo = gr.Blocks(css=CUSTOM_CSS)
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
     gr.Markdown(HOW_TO, elem_classes="markdown-text")
+    gr.Markdown("## Leaderboard")
     with gr.Group():
+        with gr.Tab("Results: Agent 🤖️"):
             leaderboard_table_test = gr.components.Dataframe(
+                value=agent_df, datatype=AGENT_TYPES, interactive=False,
+                column_widths = ["20%"] * len(agent_df.columns)
             )
+        with gr.Tab("Results: RAG-system 🔍"):
             leaderboard_table_val = gr.components.Dataframe(
+                value=rag_df, datatype=RAG_TYPES, interactive=False,
                 column_widths=["20%"]
         )
+    gr.Markdown("SS denotes single-source, and MS denotes multi-source. Easy, Medium, and Hard denote the difficulty level of the question.")
     gr.Markdown(CREDIT, elem_classes="markdown-text")
     gr.Markdown(CITATION, elem_classes="markdown-text")

content.py CHANGED Viewed

@@ -1,30 +1,58 @@
 TITLE = '<h1 align="center" id="space-title">🏆 WebWalkerQA Leaderboard</h1>'
 INTRO_TEXT = f"""
-## About
-This leaderboard shows the performance of models on the WebWalkerQA benchmark. The WebWalkerQA benchmark is a collection of question-answering datasets that test the ability of models to answer questions about web pages.
 """
 HOW_TO = f"""
-## How to list your model performance on this leaderboard:
-Send a email to [email protected] or jialongwu@seu.edu.cn.
 """
 CREDIT = f"""
-## Credit
-To make this website, we use the following resources:
-- Evaluation code (EleutherAI's lm_evaluation_harness repo)
-- Leaderboard code (Huggingface4's open_llm_leaderboard and repo)
 """
 CITATION = f"""
-## Citation
-```
-@misc{{lai2023openllmbenchmark,
-    author = {{Viet Lai and Nghia Trung Ngo and Amir Pouran Ben Veyseh and Franck Dernoncourt and Thien Huu Nguyen}},
-    title={{Open Multilingual LLM Evaluation Leaderboard}},
-    year={{2023}}
 }}
 ```
 """

+# HTML heading of the page; app.py passes it to gr.HTML.
 TITLE = '<h1 align="center" id="space-title">🏆 WebWalkerQA Leaderboard</h1>'
+# Markdown "About" paragraph; app.py renders it with gr.Markdown right after the title.
 INTRO_TEXT = f"""
+## 📖 About
+This leaderboard showcases the performance of models on the **WebWalkerQA benchmark**. WebWalkerQA is a collection of question-answering datasets designed to test models' ability to answer questions about web pages.
 """
 HOW_TO = f"""
+## 🗂️ Data
+The WebWalkerQA dataset is available on 🤗 [Hugging Face](https://huggingface.co/datasets/callanwu/WebWalkerQA). It comprises **680 question-answer pairs**, each linked to a corresponding web page. The benchmark is divided into two key components:
+- **Agent 🤖️**
+- **RAG-system 🔍**
+## 🚀 How to Submit Your Method
+### 📝 Submission Steps:
+To list your method's performance on this leaderboard, email **[email protected]** or **[email protected]** with the following:
+1. A JSONL file in the format:
+   ```jsonl
+   {{"question": "question_text", "prediction": "predicted_answer_text"}}
+   ```
+2. Include the following details in your email:
+   - **User Name**
+   - **Type** (RAG-system or Agent)
+   - **Method Name**
+Your method will be evaluated and added to the leaderboard. For reference, check out the [evaluation code](https://github.com/Alibaba-NLP/WebWalker/src/evaluate.py).
+We will evaluate the performance of your method and list it on the leaderboard.
+For reference, you can check the [evaluation code](https://github.com/Alibaba-NLP/WebWalker/src/evaluate.py).
 """
 CREDIT = f"""
+## 🙌 Credit
+This website is built using the following resources:
+- **Evaluation Code**: Langchain's cot_qa evaluator
+- **Leaderboard Code**: Huggingface4's open_llm_leaderboard
 """
 CITATION = f"""
+## 🚩Citation
+If this work is helpful, please kindly cite as:
+```bigquery
+@article{{wu2024webwalker,
+  title={{WebWalker: Benchmarking LLMs in Web Traversal}},
+  author={{Wu, Jialong and others}},
+  journal={{arXiv preprint arXiv:2411.02937}},
+  year={{2024}}
 }}
 ```
 """

overall.jpg ADDED Viewed

rag_result.jsonl ADDED Viewed

	@@ -0,0 +1,9 @@

+{"system": "doubao", "ss_easy": 0.45, "ss_medium": 0.15, "ss_hard": 0.18333333333333332, "ms_easy": 0.1375, "ms_medium": 0.08571428571428572, "ms_hard": 0.1, "overall": 0.1676470588235294}
+{"system": "gemini", "ss_easy": 0.125, "ss_medium": 0.07857142857142857, "ss_hard": 0.08333333333333333, "ms_easy": 0.1125, "ms_medium": 0.06428571428571428, "ms_hard": 0.05, "overall": 0.08088235294117647}
+{"system": "gemini_search", "ss_easy": 0.4, "ss_medium": 0.32142857142857145, "ss_hard": 0.2916666666666667, "ms_easy": 0.3, "ms_medium": 0.2357142857142857, "ms_hard": 0.175, "overall": 0.27941176470588236}
+{"system": "mindsearch", "ss_easy": 0.15, "ss_medium": 0.11428571428571428, "ss_hard": 0.10833333333333334, "ms_easy": 0.0875, "ms_medium": 0.12142857142857143, "ms_hard": 0.1, "overall": 0.11323529411764706}
+{"system": "navie", "ss_easy": 0.375, "ss_medium": 0.2571428571428571, "ss_hard": 0.26666666666666666, "ms_easy": 0.1, "ms_medium": 0.14285714285714285, "ms_hard": 0.125, "overall": 0.2073529411764706}
+{"system": "o1", "ss_easy": 0.1625, "ss_medium": 0.1, "ss_hard": 0.09166666666666666, "ms_easy": 0.075, "ms_medium": 0.10714285714285714, "ms_hard": 0.06666666666666667, "overall": 0.09852941176470588}
+{"system": "wenxin", "ss_easy": 0.525, "ss_medium": 0.3, "ss_hard": 0.2833333333333333, "ms_easy": 0.2125, "ms_medium": 0.18571428571428572, "ms_hard": 0.3, "overall": 0.2897058823529412}
+{"system": "tongyi", "ss_easy": 0.4125, "ss_medium": 0.45, "ss_hard": 0.4166666666666667, "ms_easy": 0.4, "ms_medium": 0.4143333, "ms_hard": 0.3416666666666667, "overall": 0.4073}
+{"system": "kimi", "ss_easy": 0.775, "ss_medium": 0.4142857142857143, "ss_hard": 0.4083333333333333, "ms_easy": 0.2625, "ms_medium": 0.2642857142857143, "ms_hard": 0.225, "overall": 0.3735294117647059}