config: REPO_ID: "mteb/leaderboard" RESULTS_REPO: mteb/results LEADERBOARD_NAME: "MTEB Leaderboard" tasks: BitextMining: icon: "🎌" metric: f1 metric_description: "[F1](https://huggingface.co/spaces/evaluate-metric/f1)" task_description: "Bitext mining is the task of finding parallel sentences in two languages." Classification: icon: "❀️" metric: accuracy metric_description: "[Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)" task_description: "Classification is the task of assigning a label to a text." Clustering: icon: "✨" metric: v_measure metric_description: "Validity Measure (V-measure)" task_description: "Clustering is the task of grouping similar documents together." PairClassification: icon: "🎭" metric: ap metric_description: "Average Precision (AP) based on the models similarity metric (usually cosine)" task_description: "Pair classification is the task of determining whether two texts are similar." Reranking: icon: "πŸ₯ˆ" metric: map metric_description: "Mean Average Precision (MAP)" task_description: "Reranking is the task of reordering a list of documents to improve relevance." Retrieval: icon: "πŸ”Ž" metric: ndcg_at_10 metric_description: "Normalized Discounted Cumulative Gain @ 10 (nDCG@10)" task_description: "Retrieval is the task of finding relevant documents for a query." STS: icon: "☘️" metric: spearman metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)" task_description: "Semantic Textual Similarity is the task of determining how similar two texts are." Summarization: icon: "πŸ“œ" metric: spearman metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)" task_description: "Summarization is the task of generating a summary of a text." InstructionRetrieval: icon: "πŸ”ŽπŸ“‹" metric: "p-MRR" metric_description: "paired mean reciprocal rank (p-MRR)" task_description: "Retrieval w/Instructions is the task of finding relevant documents for a query that has detailed instructions." boards: en: title: English language_long: "English" has_overall: true acronym: null icon: null special_icons: null credits: null tasks: Classification: - AmazonCounterfactualClassification (en) - AmazonPolarityClassification - AmazonReviewsClassification (en) - Banking77Classification - EmotionClassification - ImdbClassification - MassiveIntentClassification (en) - MassiveScenarioClassification (en) - MTOPDomainClassification (en) - MTOPIntentClassification (en) - ToxicConversationsClassification - TweetSentimentExtractionClassification Clustering: - ArxivClusteringP2P - ArxivClusteringS2S - BiorxivClusteringP2P - BiorxivClusteringS2S - MedrxivClusteringP2P - MedrxivClusteringS2S - RedditClustering - RedditClusteringP2P - StackExchangeClustering - StackExchangeClusteringP2P - TwentyNewsgroupsClustering PairClassification: - SprintDuplicateQuestions - TwitterSemEval2015 - TwitterURLCorpus Reranking: - AskUbuntuDupQuestions - MindSmallReranking - SciDocsRR - StackOverflowDupQuestions Retrieval: - ArguAna - ClimateFEVER - CQADupstackRetrieval - DBPedia - FEVER - FiQA2018 - HotpotQA - MSMARCO - NFCorpus - NQ - QuoraRetrieval - SCIDOCS - SciFact - Touche2020 - TRECCOVID STS: - BIOSSES - SICK-R - STS12 - STS13 - STS14 - STS15 - STS16 - STS17 (en-en) - STS22 (en) - STSBenchmark Summarization: - SummEval en-x: title: "English-X" language_long: "117 (Pairs of: English & other language)" has_overall: false acronym: null icon: null special_icons: null credits: null tasks: BitextMining: ['BUCC (de-en)', 'BUCC (fr-en)', 'BUCC (ru-en)', 'BUCC (zh-en)', 'Tatoeba (afr-eng)', 'Tatoeba (amh-eng)', 'Tatoeba (ang-eng)', 'Tatoeba (ara-eng)', 'Tatoeba (arq-eng)', 'Tatoeba (arz-eng)', 'Tatoeba (ast-eng)', 'Tatoeba (awa-eng)', 'Tatoeba (aze-eng)', 'Tatoeba (bel-eng)', 'Tatoeba (ben-eng)', 'Tatoeba (ber-eng)', 'Tatoeba (bos-eng)', 'Tatoeba (bre-eng)', 'Tatoeba (bul-eng)', 'Tatoeba (cat-eng)', 'Tatoeba (cbk-eng)', 'Tatoeba (ceb-eng)', 'Tatoeba (ces-eng)', 'Tatoeba (cha-eng)', 'Tatoeba (cmn-eng)', 'Tatoeba (cor-eng)', 'Tatoeba (csb-eng)', 'Tatoeba (cym-eng)', 'Tatoeba (dan-eng)', 'Tatoeba (deu-eng)', 'Tatoeba (dsb-eng)', 'Tatoeba (dtp-eng)', 'Tatoeba (ell-eng)', 'Tatoeba (epo-eng)', 'Tatoeba (est-eng)', 'Tatoeba (eus-eng)', 'Tatoeba (fao-eng)', 'Tatoeba (fin-eng)', 'Tatoeba (fra-eng)', 'Tatoeba (fry-eng)', 'Tatoeba (gla-eng)', 'Tatoeba (gle-eng)', 'Tatoeba (glg-eng)', 'Tatoeba (gsw-eng)', 'Tatoeba (heb-eng)', 'Tatoeba (hin-eng)', 'Tatoeba (hrv-eng)', 'Tatoeba (hsb-eng)', 'Tatoeba (hun-eng)', 'Tatoeba (hye-eng)', 'Tatoeba (ido-eng)', 'Tatoeba (ile-eng)', 'Tatoeba (ina-eng)', 'Tatoeba (ind-eng)', 'Tatoeba (isl-eng)', 'Tatoeba (ita-eng)', 'Tatoeba (jav-eng)', 'Tatoeba (jpn-eng)', 'Tatoeba (kab-eng)', 'Tatoeba (kat-eng)', 'Tatoeba (kaz-eng)', 'Tatoeba (khm-eng)', 'Tatoeba (kor-eng)', 'Tatoeba (kur-eng)', 'Tatoeba (kzj-eng)', 'Tatoeba (lat-eng)', 'Tatoeba (lfn-eng)', 'Tatoeba (lit-eng)', 'Tatoeba (lvs-eng)', 'Tatoeba (mal-eng)', 'Tatoeba (mar-eng)', 'Tatoeba (max-eng)', 'Tatoeba (mhr-eng)', 'Tatoeba (mkd-eng)', 'Tatoeba (mon-eng)', 'Tatoeba (nds-eng)', 'Tatoeba (nld-eng)', 'Tatoeba (nno-eng)', 'Tatoeba (nob-eng)', 'Tatoeba (nov-eng)', 'Tatoeba (oci-eng)', 'Tatoeba (orv-eng)', 'Tatoeba (pam-eng)', 'Tatoeba (pes-eng)', 'Tatoeba (pms-eng)', 'Tatoeba (pol-eng)', 'Tatoeba (por-eng)', 'Tatoeba (ron-eng)', 'Tatoeba (rus-eng)', 'Tatoeba (slk-eng)', 'Tatoeba (slv-eng)', 'Tatoeba (spa-eng)', 'Tatoeba (sqi-eng)', 'Tatoeba (srp-eng)', 'Tatoeba (swe-eng)', 'Tatoeba (swg-eng)', 'Tatoeba (swh-eng)', 'Tatoeba (tam-eng)', 'Tatoeba (tat-eng)', 'Tatoeba (tel-eng)', 'Tatoeba (tgl-eng)', 'Tatoeba (tha-eng)', 'Tatoeba (tuk-eng)', 'Tatoeba (tur-eng)', 'Tatoeba (tzl-eng)', 'Tatoeba (uig-eng)', 'Tatoeba (ukr-eng)', 'Tatoeba (urd-eng)', 'Tatoeba (uzb-eng)', 'Tatoeba (vie-eng)', 'Tatoeba (war-eng)', 'Tatoeba (wuu-eng)', 'Tatoeba (xho-eng)', 'Tatoeba (yid-eng)', 'Tatoeba (yue-eng)', 'Tatoeba (zsm-eng)'] zh: title: Chinese language_long: Chinese has_overall: true acronym: C-MTEB icon: "πŸ‡¨πŸ‡³" special_icons: Classification: "🧑" credits: "[FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)" tasks: Classification: - AmazonReviewsClassification (zh) - IFlyTek - JDReview - MassiveIntentClassification (zh-CN) - MassiveScenarioClassification (zh-CN) - MultilingualSentiment - OnlineShopping - TNews - Waimai Clustering: - CLSClusteringP2P - CLSClusteringS2S - ThuNewsClusteringP2P - ThuNewsClusteringS2S PairClassification: - Cmnli - Ocnli Reranking: - CMedQAv1 - CMedQAv2 - MMarcoReranking - T2Reranking Retrieval: - CmedqaRetrieval - CovidRetrieval - DuRetrieval - EcomRetrieval - MedicalRetrieval - MMarcoRetrieval - T2Retrieval - VideoRetrieval STS: - AFQMC - ATEC - BQ - LCQMC - PAWSX - QBQTC - STS22 (zh) - STSB da: title: Danish language_long: Danish has_overall: false acronym: null icon: "πŸ‡©πŸ‡°" special_icons: Classification: "🀍" credits: "[Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)" tasks: BitextMining: - BornholmBitextMining Classification: - AngryTweetsClassification - DanishPoliticalCommentsClassification - DKHateClassification - LccSentimentClassification - MassiveIntentClassification (da) - MassiveScenarioClassification (da) - NordicLangClassification - ScalaDaClassification fr: title: French language_long: "French" has_overall: true acronym: "F-MTEB" icon: "πŸ‡«πŸ‡·" special_icons: Classification: "πŸ’™" credits: "[Lyon-NLP](https://github.com/Lyon-NLP): [Gabriel Sequeira](https://github.com/GabrielSequeira), [Imene Kerboua](https://github.com/imenelydiaker), [Wissam Siblini](https://github.com/wissam-sib), [Mathieu Ciancone](https://github.com/MathieuCiancone), [Marion Schaeffer](https://github.com/schmarion)" tasks: Classification: - AmazonReviewsClassification (fr) - MasakhaNEWSClassification (fra) - MassiveIntentClassification (fr) - MassiveScenarioClassification (fr) - MTOPDomainClassification (fr) - MTOPIntentClassification (fr) Clustering: - AlloProfClusteringP2P - AlloProfClusteringS2S - HALClusteringS2S - MLSUMClusteringP2P (fr) - MLSUMClusteringS2S (fr) - MasakhaNEWSClusteringP2P (fra) - MasakhaNEWSClusteringS2S (fra) PairClassification: - OpusparcusPC (fr) - PawsXPairClassification (fr) Reranking: - AlloprofReranking - SyntecReranking Retrieval: - AlloprofRetrieval - BSARDRetrieval - MintakaRetrieval (fr) - SyntecRetrieval - XPQARetrieval (fr) STS: - STS22 (fr) - STSBenchmarkMultilingualSTS (fr) - SICKFr Summarization: - SummEvalFr 'no': title: Norwegian language_long: "Norwegian BokmΓ₯l" has_overall: false acronym: null icon: "πŸ‡³πŸ‡΄" special_icons: Classification: "πŸ’™" credits: "[Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)" tasks: Classification: &id001 - NoRecClassification - NordicLangClassification - NorwegianParliament - MassiveIntentClassification (nb) - MassiveScenarioClassification (nb) - ScalaNbClassification instructions: title: English language_long: "English" has_overall: false acronym: null icon: null credits: "[Orion Weller, FollowIR](https://arxiv.org/abs/2403.15246)" tasks: InstructionRetrieval: - Robust04InstructionRetrieval - News21InstructionRetrieval - Core17InstructionRetrieval law: title: Law language_long: "English, German, Chinese" has_overall: false acronym: null icon: "βš–οΈ" special_icons: null credits: "[Voyage AI](https://www.voyageai.com/)" tasks: Retrieval: - AILACasedocs - AILAStatutes - GerDaLIRSmall - LeCaRDv2 - LegalBenchConsumerContractsQA - LegalBenchCorporateLobbying - LegalQuAD - LegalSummarization longembed: title: LongEmbed language_long: "English" has_overall: false acronym: null icon: "πŸ“š" special_icons: null credits: "[LongEmbed](https://arxiv.org/abs/2404.12096v2)" metric: nDCG@10 (for NarrativeQA, QMSum, SummScreenFD, WikimQA) & nDCG@1 (for passkey and needle) tasks: Retrieval: - LEMBNarrativeQARetrieval - LEMBNeedleRetrieval - LEMBPasskeyRetrieval - LEMBQMSumRetrieval - LEMBSummScreenFDRetrieval - LEMBWikimQARetrieval de: title: German language_long: "German" has_overall: false acronym: null icon: "πŸ‡©πŸ‡ͺ" special_icons: null credits: "[Silvan](https://github.com/slvnwhrl)" tasks: Clustering: - BlurbsClusteringP2P - BlurbsClusteringS2S - TenKGnadClusteringP2P - TenKGnadClusteringS2S pl: title: Polish language_long: Polish has_overall: true acronym: null icon: "πŸ‡΅πŸ‡±" special_icons: Classification: "🀍" credits: "[RafaΕ‚ PoΕ›wiata](https://github.com/rafalposwiata)" tasks: Classification: - AllegroReviews - CBD - MassiveIntentClassification (pl) - MassiveScenarioClassification (pl) - PAC - PolEmo2.0-IN - PolEmo2.0-OUT Clustering: - 8TagsClustering PairClassification: - CDSC-E - PPC - PSC - SICK-E-PL Retrieval: - ArguAna-PL - DBPedia-PL - FiQA-PL - HotpotQA-PL - MSMARCO-PL - NFCorpus-PL - NQ-PL - Quora-PL - SCIDOCS-PL - SciFact-PL - TRECCOVID-PL STS: - CDSC-R - SICK-R-PL - STS22 (pl) ru: title: Russian language_long: "Russian" has_overall: true acronym: null icon: "πŸ‡·πŸ‡Ί" special_icons: null credits: "[Roman Solomatin](https://github.com/Samoed) and SaluteDevices: [Alena Fenogenova](https://github.com/Alenush), [Aleksandr Abramov](https://github.com/Ab1992ao), [Artem Snegirev](https://github.com/artemsnegirev), [Anna Maksimova](https://github.com/anpalmak2003), [Maria Tikhonova](https://github.com/MariyaTikhonova)" tasks: Classification: - GeoreviewClassification (rus-Cyrl) - HeadlineClassification (rus-Cyrl) - InappropriatenessClassification (rus-Cyrl) - KinopoiskClassification (rus-Cyrl) - RuReviewsClassification (rus-Cyrl) - RuSciBenchGRNTIClassification (rus-Cyrl) - RuSciBenchOECDClassification (rus-Cyrl) Clustering: - GeoreviewClusteringP2P (rus-Cyrl) - RuSciBenchGRNTIClusteringP2P (rus-Cyrl) - RuSciBenchOECDClusteringP2P (rus-Cyrl) PairClassification: - TERRa (rus-Cyrl) Reranking: - RuBQReranking (rus-Cyrl) Retrieval: - RiaNewsRetrieval (rus-Cyrl) - RuBQRetrieval (rus-Cyrl) STS: - RUParaPhraserSTS (rus-Cyrl) - RuSTSBenchmarkSTS (rus-Cyrl) se: title: Swedish language_long: Swedish has_overall: false acronym: null icon: "πŸ‡ΈπŸ‡ͺ" special_icons: Classification: "πŸ’›" credits: "[Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)" tasks: Classification: - NoRecClassification - NordicLangClassification - NorwegianParliament - MassiveIntentClassification (nb) - MassiveScenarioClassification (nb) - ScalaNbClassification other-cls: title: "Other Languages" language_long: "47 (Only languages not included in the other tabs)" has_overall: false acronym: null icon: null special_icons: Classification: "πŸ’œπŸ’šπŸ’™" credits: null tasks: Classification: ['AmazonCounterfactualClassification (de)', 'AmazonCounterfactualClassification (ja)', 'AmazonReviewsClassification (de)', 'AmazonReviewsClassification (es)', 'AmazonReviewsClassification (fr)', 'AmazonReviewsClassification (ja)', 'AmazonReviewsClassification (zh)', 'MTOPDomainClassification (de)', 'MTOPDomainClassification (es)', 'MTOPDomainClassification (fr)', 'MTOPDomainClassification (hi)', 'MTOPDomainClassification (th)', 'MTOPIntentClassification (de)', 'MTOPIntentClassification (es)', 'MTOPIntentClassification (fr)', 'MTOPIntentClassification (hi)', 'MTOPIntentClassification (th)', 'MassiveIntentClassification (af)', 'MassiveIntentClassification (am)', 'MassiveIntentClassification (ar)', 'MassiveIntentClassification (az)', 'MassiveIntentClassification (bn)', 'MassiveIntentClassification (cy)', 'MassiveIntentClassification (de)', 'MassiveIntentClassification (el)', 'MassiveIntentClassification (es)', 'MassiveIntentClassification (fa)', 'MassiveIntentClassification (fi)', 'MassiveIntentClassification (fr)', 'MassiveIntentClassification (he)', 'MassiveIntentClassification (hi)', 'MassiveIntentClassification (hu)', 'MassiveIntentClassification (hy)', 'MassiveIntentClassification (id)', 'MassiveIntentClassification (is)', 'MassiveIntentClassification (it)', 'MassiveIntentClassification (ja)', 'MassiveIntentClassification (jv)', 'MassiveIntentClassification (ka)', 'MassiveIntentClassification (km)', 'MassiveIntentClassification (kn)', 'MassiveIntentClassification (ko)', 'MassiveIntentClassification (lv)', 'MassiveIntentClassification (ml)', 'MassiveIntentClassification (mn)', 'MassiveIntentClassification (ms)', 'MassiveIntentClassification (my)', 'MassiveIntentClassification (nl)', 'MassiveIntentClassification (pt)', 'MassiveIntentClassification (ro)', 'MassiveIntentClassification (ru)', 'MassiveIntentClassification (sl)', 'MassiveIntentClassification (sq)', 'MassiveIntentClassification (sw)', 'MassiveIntentClassification (ta)', 'MassiveIntentClassification (te)', 'MassiveIntentClassification (th)', 'MassiveIntentClassification (tl)', 'MassiveIntentClassification (tr)', 'MassiveIntentClassification (ur)', 'MassiveIntentClassification (vi)', 'MassiveIntentClassification (zh-TW)', 'MassiveScenarioClassification (af)', 'MassiveScenarioClassification (am)', 'MassiveScenarioClassification (ar)', 'MassiveScenarioClassification (az)', 'MassiveScenarioClassification (bn)', 'MassiveScenarioClassification (cy)', 'MassiveScenarioClassification (de)', 'MassiveScenarioClassification (el)', 'MassiveScenarioClassification (es)', 'MassiveScenarioClassification (fa)', 'MassiveScenarioClassification (fi)', 'MassiveScenarioClassification (fr)', 'MassiveScenarioClassification (he)', 'MassiveScenarioClassification (hi)', 'MassiveScenarioClassification (hu)', 'MassiveScenarioClassification (hy)', 'MassiveScenarioClassification (id)', 'MassiveScenarioClassification (is)', 'MassiveScenarioClassification (it)', 'MassiveScenarioClassification (ja)', 'MassiveScenarioClassification (jv)', 'MassiveScenarioClassification (ka)', 'MassiveScenarioClassification (km)', 'MassiveScenarioClassification (kn)', 'MassiveScenarioClassification (ko)', 'MassiveScenarioClassification (lv)', 'MassiveScenarioClassification (ml)', 'MassiveScenarioClassification (mn)', 'MassiveScenarioClassification (ms)', 'MassiveScenarioClassification (my)', 'MassiveScenarioClassification (nl)', 'MassiveScenarioClassification (pt)', 'MassiveScenarioClassification (ro)', 'MassiveScenarioClassification (ru)', 'MassiveScenarioClassification (sl)', 'MassiveScenarioClassification (sq)', 'MassiveScenarioClassification (sw)', 'MassiveScenarioClassification (ta)', 'MassiveScenarioClassification (te)', 'MassiveScenarioClassification (th)', 'MassiveScenarioClassification (tl)', 'MassiveScenarioClassification (tr)', 'MassiveScenarioClassification (ur)', 'MassiveScenarioClassification (vi)', 'MassiveScenarioClassification (zh-TW)'] other-sts: title: Other language_long: "Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Russian, Spanish (Only language combos not included in the other tabs)" has_overall: false acronym: null icon: null special_icons: null credits: null tasks: STS: ["STS17 (ar-ar)", "STS17 (en-ar)", "STS17 (en-de)", "STS17 (en-tr)", "STS17 (es-en)", "STS17 (es-es)", "STS17 (fr-en)", "STS17 (it-en)", "STS17 (ko-ko)", "STS17 (nl-en)", "STS22 (ar)", "STS22 (de)", "STS22 (de-en)", "STS22 (de-fr)", "STS22 (de-pl)", "STS22 (es)", "STS22 (es-en)", "STS22 (es-it)", "STS22 (fr)", "STS22 (fr-pl)", "STS22 (it)", "STS22 (pl)", "STS22 (pl-en)", "STS22 (ru)", "STS22 (tr)", "STS22 (zh-en)", "STSBenchmark"] rar-b: title: RAR-b language_long: "English" has_overall: false acronym: null icon: "πŸ“š" special_icons: null credits: "[RAR-b (Xiao et al.)](https://arxiv.org/abs/2404.06347/)" metric: nDCG@10 tasks: Retrieval: - ARCChallenge - AlphaNLI - HellaSwag - PIQA - Quail - RARbCode - RARbMath - SIQA - SpartQA - TempReasonL1 - TempReasonL2Fact - TempReasonL2Pure - TempReasonL3Fact - TempReasonL3Pure - WinoGrande bright: title: BRIGHT language_long: "English" has_overall: false acronym: null icon: "🌟" special_icons: null credits: "[BRIGHT (Hongjin Su, Howard Yen, Mengzhou Xia et al.)](https://brightbenchmark.github.io/)" metric: nDCG@10 tasks: Retrieval: - BrightRetrieval (biology) - BrightRetrieval (earth_science) - BrightRetrieval (economics) - BrightRetrieval (psychology) - BrightRetrieval (robotics) - BrightRetrieval (stackoverflow) - BrightRetrieval (sustainable_living) - BrightRetrieval (pony) - BrightRetrieval (leetcode) - BrightRetrieval (aops) - BrightRetrieval (theoremqa_theorems) - BrightRetrieval (theoremqa_questions)