
[v2] Remove deprecated parameters from MTEB and cli #1773

Merged
merged 15 commits on Jan 15, 2025
21 changes: 1 addition & 20 deletions mteb/cli.py
@@ -374,26 +374,7 @@ def main():
add_create_meta_parser(subparsers)

args = parser.parse_args()

# If no subcommand is provided, default to run with a deprecation warning
if not hasattr(args, "func"):
logger.warning(
"Using `mteb` without a subcommand is deprecated. Use `mteb run` instead.",
DeprecationWarning,
)
# Set default arguments for 'run' if no subcommand is provided
default_args = parser.parse_args(
["run"]
+ list(map(str, args._get_args()))
+ [
f"--{k}" if v is None else f"--{k}={v}"
for k, v in vars(args).items()
if k != "func"
]
)
default_args.func(default_args)
else:
args.func(args)
args.func(args)


if __name__ == "__main__":
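With the fallback removed, running `mteb` without a subcommand no longer falls back to `run` with a deprecation warning; dispatch is a single unconditional `args.func(args)`. Below is a minimal sketch of that dispatch pattern, not the actual mteb parser; the `required=True` flag and the handler body are assumptions for illustration:

```python
import argparse

# Sketch of the post-change dispatch: every subcommand registers a `func`
# default, and main() calls it unconditionally (no implicit `run` fallback).
parser = argparse.ArgumentParser(prog="mteb")
# Assumption: the subcommand is now effectively mandatory.
subparsers = parser.add_subparsers(dest="command", required=True)

run_parser = subparsers.add_parser("run")
run_parser.set_defaults(func=lambda args: print("would run the evaluation"))

args = parser.parse_args(["run"])
args.func(args)  # the single dispatch point kept by this PR
```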
203 changes: 24 additions & 179 deletions mteb/evaluation/MTEB.py
@@ -5,153 +5,71 @@
import os
import traceback
from collections.abc import Iterable
from copy import copy, deepcopy
from copy import deepcopy
from datetime import datetime
from itertools import chain
from pathlib import Path
from time import time
from typing import Any
from typing import TYPE_CHECKING, Any

import datasets
from codecarbon import EmissionsTracker
from sentence_transformers import CrossEncoder, SentenceTransformer

import mteb
from mteb.abstasks.AbsTask import ScoresDict
from mteb.encoder_interface import Encoder
from mteb.model_meta import ModelMeta
from mteb.models import model_meta_from_sentence_transformers

from ..abstasks.AbsTask import AbsTask
from ..abstasks.AbsTaskMultilabelClassification import AbsTaskMultilabelClassification
from ..abstasks.AbsTaskReranking import AbsTaskReranking
from ..load_results.task_results import TaskResult
from ..models.sentence_transformer_wrapper import SentenceTransformerWrapper
from . import LangMapping

if TYPE_CHECKING:
from mteb.benchmarks import Benchmark

logger = logging.getLogger(__name__)


class MTEB:
def __init__(
self,
tasks: Iterable[str | AbsTask] | None = None,
tasks: Iterable[AbsTask | Benchmark],
*,
task_types: list[str] | None = None,
task_categories: list[str] | None = None,
task_langs: list[str] | None = None,
version=None,
err_logs_path: str = "error_logs.txt",
**kwargs,
):
"""Create an Evaluation pipeline, based on the provided tasks.

Args:
tasks: List of tasks to be evaluated.
task_types: Will be deprecated we recommend that you use `mteb.get_tasks()` to filter tasks. List of task types (Clustering, Retrieval..) to be
evaluated. If None, all tasks will be evaluated
task_categories: Will be deprecated we recommend that you use `mteb.get_tasks()` to filter tasks. List of task categories (s2s, p2p..) to be
evaluated. If None, all tasks will be evaluated
task_langs: Will be deprecated we recommend that you use `mteb.get_tasks()` to filter tasks. List of languages to be evaluated. if None, all
languages will be evaluated. ["eng-Latn", "deu_Latn"] will evaluate on all tasks with these languages.
version: Will be deprecated. Version of the benchmark to use. If None, latest is used
tasks: List of tasks or benchmarks to be evaluated, e.g. tasks returned by
`mteb.get_tasks(["task1", "task2"])` or `mteb.get_benchmark("MTEB(eng, classic)")`.
err_logs_path: Path to save error logs.
kwargs: Additional arguments to be passed to the tasks
"""
from mteb.benchmarks import Benchmark

self.deprecation_warning(
task_types, task_categories, task_langs, tasks, version
)

if tasks is not None:
self._tasks = tasks
if isinstance(tasks[0], Benchmark):
self.benchmarks = tasks
self._tasks = list(chain.from_iterable(tasks))
assert (
task_types is None and task_categories is None
), "Cannot specify both `tasks` and `task_types`/`task_categories`"
else:
self._task_types = task_types
self._task_categories = task_categories
self._tasks = None

self._task_langs = task_langs if task_langs is not None else []
if isinstance(self._task_langs, str):
self._task_langs = [self._task_langs]
self.tasks = list(tasks)
if len(self.tasks) > 0 and isinstance(self.tasks[0], Benchmark):
self.benchmarks = tasks
self.tasks = list(chain.from_iterable(self.tasks))

self._extend_lang_code()
self._extend_lang_pairs() # add all possible pairs

self._version = version
self.err_logs_path = err_logs_path

self.last_evaluated_splits = {}

self.select_tasks(**kwargs)

def deprecation_warning(
self, task_types, task_categories, task_langs, tasks, version
):
if task_types is not None:
logger.warning(
"The `task_types` argument is deprecated and will be removed in the next release. "
+ "Please use `tasks = mteb.get_tasks(... task_types = [...])` to filter tasks instead."
)
if task_categories is not None:
logger.warning(
"The `task_categories` argument is deprecated and will be removed in the next release. "
+ "Please use `tasks = mteb.get_tasks(... categories = [...])` to filter tasks instead."
)
if task_langs is not None:
logger.warning(
"The `task_langs` argument is deprecated and will be removed in the next release. "
+ "Please use `tasks = mteb.get_tasks(... languages = [...])` to filter tasks instead. "
+ "Note that this uses 3 letter language codes (ISO 639-3)."
)
if version is not None:
logger.warning(
"The `version` argument is deprecated and will be removed in the next release."
)
task_contains_strings = any(isinstance(x, str) for x in tasks or [])
if task_contains_strings:
logger.warning(
"Passing task names as strings is deprecated and will be removed in the next release. "
+ "Please use `tasks = mteb.get_tasks(tasks=[...])` method to get tasks instead."
)

@property
def available_tasks(self):
return [x.metadata.name for x in self.tasks_cls]
return [x.metadata.name for x in self.tasks]

@property
def available_task_types(self):
# sort the task types
return sorted({x.metadata.type for x in self.tasks_cls})
return sorted({x.metadata.type for x in self.tasks})

@property
def available_task_categories(self):
return {x.metadata.category for x in self.tasks_cls}

def _extend_lang_code(self):
# add all possible language codes
for lang in set(self._task_langs):
if lang in LangMapping.LANG_MAPPING:
self._task_langs += LangMapping.LANG_MAPPING[lang]

def _extend_lang_pairs(self):
# add all possible language pairs
langs = set(self._task_langs)
for x in langs:
if "-" not in x:
for y in langs:
if "-" not in y:
pair = f"{x}-{y}"
if pair not in langs:
self._task_langs.append(pair)
return

def _display_tasks(self, task_list, name=None):
return {x.metadata.category for x in self.tasks}

def _display_tasks(self, task_list: Iterable[AbsTask], name: str | None = None):
from rich.console import Console

# disable logging for other ranks
@@ -215,80 +133,14 @@ def mteb_benchmarks(self):
@classmethod
def mteb_tasks(cls):
"""Get all tasks available in the MTEB."""
instance = cls()
instance._display_tasks(instance.tasks_cls, name="MTEB tasks")
tasks = mteb.get_tasks()
instance = cls(tasks)
instance._display_tasks(tasks, name="MTEB tasks")

def print_selected_tasks(self):
"""Print the selected tasks."""
self._display_tasks(self.tasks, name="Selected tasks")

def select_tasks(self, **kwargs):
"""Select the tasks to be evaluated."""
# Get all existing tasks
# reranking and multiclassClassification subclasses retrieval to share methods, but is an abstract task
tasks_categories_cls = list(AbsTask.__subclasses__()) + [
AbsTaskReranking,
AbsTaskMultilabelClassification,
]
all_task_classes = []
for cat_cls in tasks_categories_cls:
for cls in cat_cls.__subclasses__():
if cat_cls.__name__.startswith("AbsTask") and cls.__name__ not in (
"AbsTaskReranking",
"AbsTaskMultilabelClassification",
):
task = cls(hf_subsets=self._task_langs, **kwargs)
all_task_classes.append(task)

self.tasks_cls = all_task_classes

# If `task_list` is specified, select list of tasks
if self._tasks is not None:
self.tasks = list(
filter(lambda x: (x.metadata.name in self._tasks), self.tasks_cls)
)
if len(self.tasks) != len(self._tasks):
tasks_known = {x.metadata.name for x in self.tasks_cls}
tasks_unknown = {
x for x in self._tasks if isinstance(x, str)
} - tasks_known
if tasks_unknown:
unknown_str, known_str = (
",".join(sorted(tasks_unknown)),
",".join(sorted(tasks_known)),
)
logger.warning(
f"WARNING: Unknown tasks: {unknown_str}. Known tasks: {known_str}."
)
# add task if subclass of mteb.tasks
self.tasks.extend([x for x in self._tasks if isinstance(x, AbsTask)])
return

# Otherwise use filters to select tasks
filtered_tasks = filter(
lambda x: (self._task_types is None)
or (x.metadata.type in self._task_types),
self.tasks_cls,
)
filtered_tasks = filter(
lambda x: (self._task_categories is None)
or (x.metadata.category in self._task_categories),
filtered_tasks,
)
filtered_tasks = filter(
lambda x: (self._version is None) or (x.metadata.version >= self._version),
filtered_tasks,
)
# keep only tasks with at least one language in the filter
filtered_tasks = filter(
lambda x: (not self._task_langs)
or (len(set(x.metadata.eval_langs) & set(self._task_langs)) > 0),
filtered_tasks,
)

# Get final list of tasks
self.tasks = list(filtered_tasks)

def load_tasks_data(self):
"""Load datasets for the selected tasks."""
logger.info(f"\n\n## Loading datasets for {len(self.tasks)} tasks")
@@ -416,13 +268,6 @@ def run(
Returns:
A list of TaskResult objects, one for each task evaluated.
"""
if "batch_size" in kwargs:
logger.warning(
"The `batch_size` argument is deprecated and will be removed in the next release. "
+ "Please use `encode_kwargs = {'batch_size': ...}` to set the batch size instead."
)
encode_kwargs["batch_size"] = kwargs["batch_size"]

# update logging to account for different levels of Verbosity (similar to the command line)

if verbosity == 0:
@@ -455,8 +300,8 @@
self.print_selected_tasks()

evaluation_results = []
original_tasks = (
self.tasks.copy()
original_tasks = deepcopy(
self.tasks
) # save them in case we re-use the object (e.g. for reranking)

# To evaluate missing splits, we keep track of the task name and the corresponding splits.
@@ -665,7 +510,7 @@ def create_model_meta(model: Encoder) -> ModelMeta:
)

# create a copy of the meta to avoid modifying the original object
meta = copy(meta)
meta = deepcopy(meta)
meta.revision = meta.revision or "no_revision_available"
meta.name = meta.name or "no_model_name_available"

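To summarize the API migration this file implies: task filtering moves out of the constructor into `mteb.get_tasks`, and `batch_size` moves into `encode_kwargs`. A hedged sketch based on the replacements named in the removed deprecation warnings; the model and filter values are illustrative:

```python
import mteb
from sentence_transformers import SentenceTransformer

# Before (removed): MTEB(task_types=["Clustering"], task_langs=["eng"])
# After: filter up front, then pass concrete task objects (or a benchmark).
tasks = mteb.get_tasks(task_types=["Clustering"], languages=["eng"])
evaluation = mteb.MTEB(tasks=tasks)

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Before (removed): evaluation.run(model, batch_size=32)
results = evaluation.run(model, encode_kwargs={"batch_size": 32})
```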
2 changes: 1 addition & 1 deletion mteb/overview.py
@@ -124,7 +124,7 @@ def __repr__(self) -> str:
return "MTEBTasks" + super().__repr__()

@staticmethod
def _extract_property_from_task(task, property):
def _extract_property_from_task(task, property: str):
if hasattr(task.metadata, property):
return getattr(task.metadata, property)
elif hasattr(task, property):
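The only change above is the `property: str` annotation, but the helper's lookup order (task metadata first, then the task object itself) is easy to illustrate. A small sketch with stand-in classes; that `MTEBTasks` exposes this static method for direct use is an assumption here:

```python
from mteb.overview import MTEBTasks

class _Meta:
    name = "ExampleTask"

class _Task:
    metadata = _Meta()
    name = "shadowed"  # not reached: metadata is checked first

# Resolves via task.metadata, not the same-named task attribute.
print(MTEBTasks._extract_property_from_task(_Task(), "name"))  # ExampleTask
```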
52 changes: 20 additions & 32 deletions tests/test_benchmark/task_grid.py
@@ -2,14 +2,8 @@

from __future__ import annotations

import mteb
from mteb.abstasks import AbsTask
from mteb.tasks.BitextMining.dan.BornholmskBitextMining import BornholmBitextMining
from mteb.tasks.Classification.multilingual.IndicSentimentClassification import (
IndicSentimentClassification,
)
from mteb.tasks.Clustering.eng.TwentyNewsgroupsClustering import (
TwentyNewsgroupsClusteringFast,
)

from .mock_tasks import (
MockBitextMiningTask,
@@ -39,31 +33,25 @@
MockSummarizationTask,
)

twenty_news = TwentyNewsgroupsClusteringFast()

# downsample to speed up tests
twenty_news.max_document_to_embed = 1000
twenty_news.n_clusters = 2
twenty_news.max_fraction_of_documents_to_embed = None

TASK_TEST_GRID = [
BornholmBitextMining(), # bitext mining + just supplying a task class instead of a string
IndicSentimentClassification( # multi subset loader
hf_subsets=["as"], # we only load one subset here to speed up tests
n_experiments=2, # to speed up the test
),
"TwentyNewsgroupsClustering", # clustering and string instead of class
twenty_news, # fast clustering
"Banking77Classification", # classification
"SciDocsRR", # reranking
"FarsTail", # pair classification
"TwitterHjerneRetrieval", # retrieval
"BrazilianToxicTweetsClassification", # multilabel classification
"FaroeseSTS", # STS
"SummEval", # summarization
"Core17InstructionRetrieval", # instruction reranking
"InstructIR", # instruction retrieval
]
TASK_TEST_GRID = (
mteb.get_tasks(
tasks=[
"BornholmBitextMining", # bitext mining + just supplying a task class instead of a string
"TwentyNewsgroupsClustering", # clustering and string instead of class
"TwentyNewsgroupsClustering.v2", # fast clustering
"Banking77Classification", # classification
"SciDocsRR", # reranking
"FarsTail", # pair classification
"TwitterHjerneRetrieval", # retrieval
"BrazilianToxicTweetsClassification", # multilabel classification
"FaroeseSTS", # STS
"SummEval", # summarization
"Core17InstructionRetrieval", # instruction reranking
"InstructIR", # instruction retrieval
]
)
+ mteb.get_tasks(tasks=["IndicSentimentClassification"], languages=["asm-Beng"])
)

TASK_TEST_GRID_AS_STRING = [
t.metadata.name if isinstance(t, AbsTask) else t for t in TASK_TEST_GRID
]
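Since the grid now comes straight from the task registry, every entry is a concrete `AbsTask`. A hypothetical consumer of the grid, assuming pytest and this module's names; the test and its import path are illustrative, not part of the PR:

```python
import pytest

from mteb.abstasks import AbsTask
from tests.test_benchmark.task_grid import TASK_TEST_GRID  # assumed import path

@pytest.mark.parametrize("task", TASK_TEST_GRID, ids=lambda t: t.metadata.name)
def test_grid_entries_are_tasks(task: AbsTask):
    # Every grid entry should be a loaded task with metadata attached.
    assert isinstance(task, AbsTask)
    assert task.metadata.name
```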