Spaces:

microsoft
/

MageBench-Leaderboard

Running

File size: 11,830 Bytes

import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns, SearchColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
import os


from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    ModelType,
    fields,
    WeightType,
    Precision
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval


def restart_space():
    """Ask the Hub API client to restart the Space identified by ``REPO_ID``."""
    target_space = REPO_ID
    API.restart_space(repo_id=target_space)

### Space initialisation
# Download a local snapshot of the request-queue dataset, then of the results
# dataset. If either download raises, restart_space() is called and start-up
# continues with the next step.
for dataset_repo, local_path in (
    (QUEUE_REPO, EVAL_REQUESTS_PATH),
    (RESULTS_REPO, EVAL_RESULTS_PATH),
):
    try:
        print(local_path)
        snapshot_download(
            repo_id=dataset_repo,
            local_dir=local_path,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception:
        restart_space()


# LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
import jsonlines

# Read every committed result record from the JSONL file into a list.
# NOTE(review): records presumably look like
#   {"Score": 54, "Name": "MageBench", "BaseModel": "GPT-4o", "Env.": "Sokoban",
#    "Target-research": "Model-Eval-Global", "Subset": "mini", "Link": "xxx"}
# -- confirm against commit_results.jsonl.
with jsonlines.open('commit_results.jsonl') as reader:
    json_list = list(reader)

# Order the records from highest to lowest "Score", then build the table that
# backs the leaderboard.
json_list.sort(key=lambda record: record['Score'], reverse=True)
committed = pd.DataFrame(json_list)

# Load the three evaluation-queue tables (finished, running, pending) from the
# local copy of the requests dataset.
eval_queue_frames = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_frames

def init_leaderboard(dataframe):
    """Build the read-only ``Leaderboard`` component for the results table.

    Args:
        dataframe: pandas DataFrame holding the results to display. The
            component is configured against the columns "Score", "Name",
            "BaseModel", "Env.", "Target-research", "Subset", "Link" and
            "State", so the frame is expected to provide them.

    Returns:
        The configured ``Leaderboard`` component.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or has no rows.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    return Leaderboard(
        value=dataframe,
        select_columns=SelectColumns(
            default_selection=["Score", "Name", "BaseModel", "Env.", "Target-research", "Subset", "Link"],
            cant_deselect=["Score", "Name"],
            label="Select Columns to Display:",
        ),
        search_columns=SearchColumns(
            primary_column="Name",
            secondary_columns=["BaseModel", "Target-research"],
            # The hint names the real secondary column ("BaseModel"); the
            # previous text referred to a non-existent "country" column.
            placeholder="Search by work name or basemodel. To search by base model, type 'BaseModel:<query>'",
            label="Search",
        ),
        filter_columns=[
            ColumnFilter("Target-research", type="checkboxgroup", label="Comparison settings for target researches (Single Selection)"),
            # ColumnFilter("BaseModel", type="dropdown", label="Select the base LMM model that fulfils the task."),
            ColumnFilter("Env.", type="checkboxgroup", label="Environment (Single Selection)"),
            ColumnFilter("Subset", type="checkboxgroup", label="Subset (Single Selection)"),
            ColumnFilter("State", type="checkboxgroup", label="Result state (checked or under-review)"),
            # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
            # ColumnFilter(
            #     AutoEvalColumn.params.name,
            #     type="slider",
            #     min=0.01,
            #     max=150,
            #     label="Select the number of parameters (B)",
            # ),
            # ColumnFilter(
            #     AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
            # ),
        ],
        interactive=False,
    )


# =================test
# Debug aid: print the contents of ./text.txt (the file the submit form
# writes) when it exists, otherwise report that it is missing.
if os.path.exists("./text.txt"):
    # Read through a context manager so the file handle is always closed.
    with open("./text.txt") as submission_file:
        print(submission_file.read())
else:
    print("not exists")


# Assemble the web UI: page header followed by the leaderboard, about and
# submission tabs.
demo = gr.Blocks(css=custom_css)
with demo:
    # Page header: title, introduction video and introduction text.
    gr.HTML(TITLE)
    gr.Video("demo.mp4", elem_id="video-player", label="Introduction Video")
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Leaderboard tab, backed by the committed results table.
        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(committed)

        # About tab: renders LLM_BENCHMARKS_TEXT.
        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        # Submission tab: instructions followed by the submission form.
        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
            with gr.Column(), gr.Row():
                gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
            with gr.Column():
                # Free-text fields of the submission.
                with gr.Row():
                    score_input = gr.Textbox(placeholder="请输入分数", label="Score (float)")
                    name_input = gr.Textbox(placeholder="请输入名称", label="Name (str)")
                    base_model_input = gr.Textbox(placeholder="请输入基模型名称", label="BaseModel (str)")

                # Fixed-choice fields of the submission.
                with gr.Row():
                    env_dropdown = gr.Dropdown(
                        label="Env.", choices=["Sokoban", "Football", "WebUI"], value="Sokoban"
                    )
                    target_research_dropdown = gr.Dropdown(
                        label="Target-research",
                        choices=["Model-Eval-Online", "Model-Eval-Global"],
                        value="Model-Eval-Online",
                    )
                    subset_dropdown = gr.Dropdown(label="Subset", choices=["mini", "all"], value="mini")

                link_input = gr.Textbox(placeholder="请输入链接", label="Link (str)")

                # Trigger button and the area that shows the submission summary.
                submit_button = gr.Button("Submit Eval")
                submission_result = gr.Markdown()
                  
                def submit_eval(score, name, base_model, env, target_research, subset, link):
                    """Format a submission, save it to ``./text.txt`` and return it.

                    Args:
                        score: Value of the "Score" field.
                        name: Value of the "Name" field.
                        base_model: Value of the "BaseModel" field.
                        env: Selected environment.
                        target_research: Selected target-research setting.
                        subset: Selected subset.
                        link: Value of the "Link" field.

                    Returns:
                        The multi-line ``"Label: value"`` summary that was
                        written to the file.
                    """
                    # Handle the submission here; the data could be stored in a
                    # database or processed further.
                    result = (
                        f"Score: {score}\n"
                        f"Name: {name}\n"
                        f"BaseModel: {base_model}\n"
                        f"Env: {env}\n"
                        f"Target-research: {target_research}\n"
                        f"Subset: {subset}\n"
                        f"Link: {link}"
                    )
                    # Write through a context manager so the summary is flushed
                    # and the handle closed before returning (the previous
                    # version never closed the file).
                    with open("./text.txt", "w") as submission_file:
                        submission_file.write(result)
                    return result
                  
                # Run submit_eval on click, feeding it the form values in
                # parameter order and showing its return value in the result area.
                submit_button.click(
                    fn=submit_eval,
                    inputs=[
                        score_input,
                        name_input,
                        base_model_input,
                        env_dropdown,
                        target_research_dropdown,
                        subset_dropdown,
                        link_input,
                    ],
                    outputs=submission_result,
                )
            #     with gr.Column():
            #         with gr.Accordion(
            #             f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
            #             open=False,
            #         ):
            #             with gr.Row():
            #                 finished_eval_table = gr.components.Dataframe(
            #                     value=finished_eval_queue_df,
            #                     headers=EVAL_COLS,
            #                     datatype=EVAL_TYPES,
            #                     row_count=5,
            #                 )
            #         with gr.Accordion(
            #             f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
            #             open=False,
            #         ):
            #             with gr.Row():
            #                 running_eval_table = gr.components.Dataframe(
            #                     value=running_eval_queue_df,
            #                     headers=EVAL_COLS,
            #                     datatype=EVAL_TYPES,
            #                     row_count=5,
            #                 )

            #         with gr.Accordion(
            #             f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
            #             open=False,
            #         ):
            #             with gr.Row():
            #                 pending_eval_table = gr.components.Dataframe(
            #                     value=pending_eval_queue_df,
            #                     headers=EVAL_COLS,
            #                     datatype=EVAL_TYPES,
            #                     row_count=5,
            #                 )
            # with gr.Row():
            #     gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

            # with gr.Row():
            #     with gr.Column():
            #         model_name_textbox = gr.Textbox(label="Model name")
            #         revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
            #         model_type = gr.Dropdown(
            #             choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
            #             label="Model type",
            #             multiselect=False,
            #             value=None,
            #             interactive=True,
            #         )

            #     with gr.Column():
            #         precision = gr.Dropdown(
            #             choices=[i.value.name for i in Precision if i != Precision.Unknown],
            #             label="Precision",
            #             multiselect=False,
            #             value="float16",
            #             interactive=True,
            #         )
            #         weight_type = gr.Dropdown(
            #             choices=[i.value.name for i in WeightType],
            #             label="Weights type",
            #             multiselect=False,
            #             value="Original",
            #             interactive=True,
            #         )
            #         base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

            # submit_button = gr.Button("Submit Eval")
            # submission_result = gr.Markdown()
            # submit_button.click(
            #     add_new_eval,
            #     [
            #         model_name_textbox,
            #         base_model_name_textbox,
            #         revision_name_textbox,
            #         precision,
            #         weight_type,
            #         model_type,
            #     ],
            #     submission_result,
            # )

    # Collapsed-by-default accordion holding the copyable citation text.
    with gr.Row(), gr.Accordion("📙 Citation", open=False):
        citation_button = gr.Textbox(
            label=CITATION_BUTTON_LABEL,
            value=CITATION_BUTTON_TEXT,
            elem_id="citation-button",
            lines=20,
            show_copy_button=True,
        )

# Call restart_space() every 1800 seconds from a background scheduler, then
# enable request queuing and start serving the UI.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, trigger="interval", seconds=1800)
scheduler.start()

queued_demo = demo.queue(default_concurrency_limit=40)
queued_demo.launch()