
[v2] Remove deprecated parameters from MTEB and cli #1773

Merged
merged 15 commits on Jan 15, 2025
21 changes: 1 addition & 20 deletions mteb/cli.py
@@ -374,26 +374,7 @@ def main():
add_create_meta_parser(subparsers)

args = parser.parse_args()

# If no subcommand is provided, default to run with a deprecation warning
if not hasattr(args, "func"):
logger.warning(
"Using `mteb` without a subcommand is deprecated. Use `mteb run` instead.",
DeprecationWarning,
)
# Set default arguments for 'run' if no subcommand is provided
default_args = parser.parse_args(
["run"]
+ list(map(str, args._get_args()))
+ [
f"--{k}" if v is None else f"--{k}={v}"
for k, v in vars(args).items()
if k != "func"
]
)
default_args.func(default_args)
else:
args.func(args)
args.func(args)


if __name__ == "__main__":
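With the fallback removed, running `mteb` without a subcommand no longer falls back to `run` with a deprecation warning; dispatch is a single unconditional `args.func(args)`. Below is a minimal sketch of that dispatch pattern, not the actual mteb parser; the `required=True` flag and the handler body are assumptions for illustration:

```python
import argparse

# Sketch of the post-change dispatch: every subcommand registers a `func`
# default, and main() calls it unconditionally (no implicit `run` fallback).
parser = argparse.ArgumentParser(prog="mteb")
# Assumption: the subcommand is now effectively mandatory.
subparsers = parser.add_subparsers(dest="command", required=True)

run_parser = subparsers.add_parser("run")
run_parser.set_defaults(func=lambda args: print("would run the evaluation"))

args = parser.parse_args(["run"])
args.func(args)  # the single dispatch point kept by this PR
```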
203 changes: 24 additions & 179 deletions mteb/evaluation/MTEB.py
@@ -5,153 +5,71 @@
import os
import traceback
from collections.abc import Iterable
from copy import copy, deepcopy
from copy import deepcopy
from datetime import datetime
from itertools import chain
from pathlib import Path
from time import time
from typing import Any
from typing import TYPE_CHECKING, Any

import datasets
from codecarbon import EmissionsTracker
from sentence_transformers import CrossEncoder, SentenceTransformer

import mteb
from mteb.abstasks.AbsTask import ScoresDict
from mteb.encoder_interface import Encoder
from mteb.model_meta import ModelMeta
from mteb.models import model_meta_from_sentence_transformers

from ..abstasks.AbsTask import AbsTask
from ..abstasks.AbsTaskMultilabelClassification import AbsTaskMultilabelClassification
from ..abstasks.AbsTaskReranking import AbsTaskReranking
from ..load_results.task_results import TaskResult
from ..models.sentence_transformer_wrapper import SentenceTransformerWrapper
from . import LangMapping

if TYPE_CHECKING:
from mteb.benchmarks import Benchmark

logger = logging.getLogger(__name__)


class MTEB:
def __init__(
self,
tasks: Iterable[str | AbsTask] | None = None,
tasks: Iterable[AbsTask | Benchmark],
*,
task_types: list[str] | None = None,
task_categories: list[str] | None = None,
task_langs: list[str] | None = None,
version=None,
err_logs_path: str = "error_logs.txt",
**kwargs,
):
"""Create an Evaluation pipeline, based on the provided tasks.

Args:
tasks: List of tasks to be evaluated.
task_types: Will be deprecated we recommend that you use `mteb.get_tasks()` to filter tasks. List of task types (Clustering, Retrieval..) to be
evaluated. If None, all tasks will be evaluated
task_categories: Will be deprecated we recommend that you use `mteb.get_tasks()` to filter tasks. List of task categories (s2s, p2p..) to be
evaluated. If None, all tasks will be evaluated
task_langs: Will be deprecated we recommend that you use `mteb.get_tasks()` to filter tasks. List of languages to be evaluated. if None, all
languages will be evaluated. ["eng-Latn", "deu_Latn"] will evaluate on all tasks with these languages.
version: Will be deprecated. Version of the benchmark to use. If None, latest is used
tasks: List of tasks or benchmarks to be evaluated, e.g. tasks returned by
`mteb.get_tasks(["task1", "task2"])` or `mteb.get_benchmark("MTEB(eng, classic)")`.
err_logs_path: Path to save error logs.
kwargs: Additional arguments to be passed to the tasks
"""
from mteb.benchmarks import Benchmark

self.deprecation_warning(
task_types, task_categories, task_langs, tasks, version
)

if tasks is not None:
self._tasks = tasks
if isinstance(tasks[0], Benchmark):
self.benchmarks = tasks
self._tasks = list(chain.from_iterable(tasks))
assert (
task_types is None and task_categories is None
), "Cannot specify both `tasks` and `task_types`/`task_categories`"
else:
self._task_types = task_types
self._task_categories = task_categories
self._tasks = None

self._task_langs = task_langs if task_langs is not None else []
if isinstance(self._task_langs, str):
self._task_langs = [self._task_langs]
self.tasks = list(tasks)
if len(self.tasks) > 0 and isinstance(self.tasks[0], Benchmark):
self.benchmarks = tasks
self.tasks = list(chain.from_iterable(self.tasks))

self._extend_lang_code()
self._extend_lang_pairs() # add all possible pairs

self._version = version
self.err_logs_path = err_logs_path

self.last_evaluated_splits = {}

self.select_tasks(**kwargs)

def deprecation_warning(
self, task_types, task_categories, task_langs, tasks, version
):
if task_types is not None:
logger.warning(
"The `task_types` argument is deprecated and will be removed in the next release. "
+ "Please use `tasks = mteb.get_tasks(... task_types = [...])` to filter tasks instead."
)
if task_categories is not None:
logger.warning(
"The `task_categories` argument is deprecated and will be removed in the next release. "
+ "Please use `tasks = mteb.get_tasks(... categories = [...])` to filter tasks instead."
)
if task_langs is not None:
logger.warning(
"The `task_langs` argument is deprecated and will be removed in the next release. "
+ "Please use `tasks = mteb.get_tasks(... languages = [...])` to filter tasks instead. "
+ "Note that this uses 3 letter language codes (ISO 639-3)."
)
if version is not None:
logger.warning(
"The `version` argument is deprecated and will be removed in the next release."
)
task_contains_strings = any(isinstance(x, str) for x in tasks or [])
if task_contains_strings:
logger.warning(
"Passing task names as strings is deprecated and will be removed in the next release. "
+ "Please use `tasks = mteb.get_tasks(tasks=[...])` method to get tasks instead."
)

@property
def available_tasks(self):
return [x.metadata.name for x in self.tasks_cls]
return [x.metadata.name for x in self.tasks]

@property
def available_task_types(self):
# sort the task types
return sorted({x.metadata.type for x in self.tasks_cls})
return sorted({x.metadata.type for x in self.tasks})

@property
def available_task_categories(self):
return {x.metadata.category for x in self.tasks_cls}

def _extend_lang_code(self):
# add all possible language codes
for lang in set(self._task_langs):
if lang in LangMapping.LANG_MAPPING:
self._task_langs += LangMapping.LANG_MAPPING[lang]

def _extend_lang_pairs(self):
# add all possible language pairs
langs = set(self._task_langs)
for x in langs:
if "-" not in x:
for y in langs:
if "-" not in y:
pair = f"{x}-{y}"
if pair not in langs:
self._task_langs.append(pair)
return

def _display_tasks(self, task_list, name=None):
return {x.metadata.category for x in self.tasks}

def _display_tasks(self, task_list: Iterable[AbsTask], name: str | None = None):
from rich.console import Console

# disable logging for other ranks
@@ -215,80 +133,14 @@ def mteb_benchmarks(self):
@classmethod
def mteb_tasks(cls):
"""Get all tasks available in the MTEB."""
instance = cls()
instance._display_tasks(instance.tasks_cls, name="MTEB tasks")
tasks = mteb.get_tasks()
instance = cls(tasks)
instance._display_tasks(tasks, name="MTEB tasks")

def print_selected_tasks(self):
"""Print the selected tasks."""
self._display_tasks(self.tasks, name="Selected tasks")

def select_tasks(self, **kwargs):
"""Select the tasks to be evaluated."""
# Get all existing tasks
# reranking and multiclassClassification subclasses retrieval to share methods, but is an abstract task
tasks_categories_cls = list(AbsTask.__subclasses__()) + [
AbsTaskReranking,
AbsTaskMultilabelClassification,
]
all_task_classes = []
for cat_cls in tasks_categories_cls:
for cls in cat_cls.__subclasses__():
if cat_cls.__name__.startswith("AbsTask") and cls.__name__ not in (
"AbsTaskReranking",
"AbsTaskMultilabelClassification",
):
task = cls(hf_subsets=self._task_langs, **kwargs)
all_task_classes.append(task)

self.tasks_cls = all_task_classes

# If `task_list` is specified, select list of tasks
if self._tasks is not None:
self.tasks = list(
filter(lambda x: (x.metadata.name in self._tasks), self.tasks_cls)
)
if len(self.tasks) != len(self._tasks):
tasks_known = {x.metadata.name for x in self.tasks_cls}
tasks_unknown = {
x for x in self._tasks if isinstance(x, str)
} - tasks_known
if tasks_unknown:
unknown_str, known_str = (
",".join(sorted(tasks_unknown)),
",".join(sorted(tasks_known)),
)
logger.warning(
f"WARNING: Unknown tasks: {unknown_str}. Known tasks: {known_str}."
)
# add task if subclass of mteb.tasks
self.tasks.extend([x for x in self._tasks if isinstance(x, AbsTask)])
return

# Otherwise use filters to select tasks
filtered_tasks = filter(
lambda x: (self._task_types is None)
or (x.metadata.type in self._task_types),
self.tasks_cls,
)
filtered_tasks = filter(
lambda x: (self._task_categories is None)
or (x.metadata.category in self._task_categories),
filtered_tasks,
)
filtered_tasks = filter(
lambda x: (self._version is None) or (x.metadata.version >= self._version),
filtered_tasks,
)
# keep only tasks with at least one language in the filter
filtered_tasks = filter(
lambda x: (not self._task_langs)
or (len(set(x.metadata.eval_langs) & set(self._task_langs)) > 0),
filtered_tasks,
)

# Get final list of tasks
self.tasks = list(filtered_tasks)

def load_tasks_data(self):
"""Load datasets for the selected tasks."""
logger.info(f"\n\n## Loading datasets for {len(self.tasks)} tasks")
@@ -416,13 +268,6 @@ def run(
Returns:
A list of TaskResult objects, one for each task evaluated.
"""
if "batch_size" in kwargs:
logger.warning(
"The `batch_size` argument is deprecated and will be removed in the next release. "
+ "Please use `encode_kwargs = {'batch_size': ...}` to set the batch size instead."
)
encode_kwargs["batch_size"] = kwargs["batch_size"]

# update logging to account for different levels of Verbosity (similar to the command line)

if verbosity == 0:
@@ -455,8 +300,8 @@
self.print_selected_tasks()

evaluation_results = []
original_tasks = (
self.tasks.copy()
original_tasks = deepcopy(
self.tasks
) # save them in case we re-use the object (e.g. for reranking)

# To evaluate missing splits, we keep track of the task name and the corresponding splits.
@@ -665,7 +510,7 @@ def create_model_meta(model: Encoder) -> ModelMeta:
)

# create a copy of the meta to avoid modifying the original object
meta = copy(meta)
meta = deepcopy(meta)
meta.revision = meta.revision or "no_revision_available"
meta.name = meta.name or "no_model_name_available"

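To summarize the API migration this file implies: task filtering moves out of the constructor into `mteb.get_tasks`, and `batch_size` moves into `encode_kwargs`. A hedged sketch based on the replacements named in the removed deprecation warnings; the model and filter values are illustrative:

```python
import mteb
from sentence_transformers import SentenceTransformer

# Before (removed): MTEB(task_types=["Clustering"], task_langs=["eng"])
# After: filter up front, then pass concrete task objects (or a benchmark).
tasks = mteb.get_tasks(task_types=["Clustering"], languages=["eng"])
evaluation = mteb.MTEB(tasks=tasks)

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Before (removed): evaluation.run(model, batch_size=32)
results = evaluation.run(model, encode_kwargs={"batch_size": 32})
```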
2 changes: 1 addition & 1 deletion mteb/overview.py
@@ -124,7 +124,7 @@ def __repr__(self) -> str:
return "MTEBTasks" + super().__repr__()

@staticmethod
def _extract_property_from_task(task, property):
def _extract_property_from_task(task, property: str):
if hasattr(task.metadata, property):
return getattr(task.metadata, property)
elif hasattr(task, property):
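The only change above is the `property: str` annotation, but the helper's lookup order (task metadata first, then the task object itself) is easy to illustrate. A small sketch with stand-in classes; that `MTEBTasks` exposes this static method for direct use is an assumption here:

```python
from mteb.overview import MTEBTasks

class _Meta:
    name = "ExampleTask"

class _Task:
    metadata = _Meta()
    name = "shadowed"  # not reached: metadata is checked first

# Resolves via task.metadata, not the same-named task attribute.
print(MTEBTasks._extract_property_from_task(_Task(), "name"))  # ExampleTask
```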
52 changes: 20 additions & 32 deletions tests/test_benchmark/task_grid.py
@@ -2,14 +2,8 @@

from __future__ import annotations

import mteb
from mteb.abstasks import AbsTask
from mteb.tasks.BitextMining.dan.BornholmskBitextMining import BornholmBitextMining
from mteb.tasks.Classification.multilingual.IndicSentimentClassification import (
IndicSentimentClassification,
)
from mteb.tasks.Clustering.eng.TwentyNewsgroupsClustering import (
TwentyNewsgroupsClusteringFast,
)

from .mock_tasks import (
MockBitextMiningTask,
@@ -39,31 +33,25 @@
MockSummarizationTask,
)

twenty_news = TwentyNewsgroupsClusteringFast()

# downsample to speed up tests
twenty_news.max_document_to_embed = 1000
twenty_news.n_clusters = 2
twenty_news.max_fraction_of_documents_to_embed = None

TASK_TEST_GRID = [
BornholmBitextMining(), # bitext mining + just supplying a task class instead of a string
IndicSentimentClassification( # multi subset loader
hf_subsets=["as"], # we only load one subset here to speed up tests
n_experiments=2, # to speed up the test
),
"TwentyNewsgroupsClustering", # clustering and string instead of class
twenty_news, # fast clustering
"Banking77Classification", # classification
"SciDocsRR", # reranking
"FarsTail", # pair classification
"TwitterHjerneRetrieval", # retrieval
"BrazilianToxicTweetsClassification", # multilabel classification
"FaroeseSTS", # STS
"SummEval", # summarization
"Core17InstructionRetrieval", # instruction reranking
"InstructIR", # instruction retrieval
]
TASK_TEST_GRID = (
mteb.get_tasks(
tasks=[
"BornholmBitextMining", # bitext mining + just supplying a task class instead of a string
"TwentyNewsgroupsClustering", # clustering and string instead of class
"TwentyNewsgroupsClustering.v2", # fast clustering
"Banking77Classification", # classification
"SciDocsRR", # reranking
"FarsTail", # pair classification
"TwitterHjerneRetrieval", # retrieval
"BrazilianToxicTweetsClassification", # multilabel classification
"FaroeseSTS", # STS
"SummEval", # summarization
"Core17InstructionRetrieval", # instruction reranking
"InstructIR", # instruction retrieval
]
)
+ mteb.get_tasks(tasks=["IndicSentimentClassification"], languages=["asm-Beng"])
)

TASK_TEST_GRID_AS_STRING = [
t.metadata.name if isinstance(t, AbsTask) else t for t in TASK_TEST_GRID
]
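Since the grid now comes straight from the task registry, every entry is a concrete `AbsTask`. A hypothetical consumer of the grid, assuming pytest and this module's names; the test and its import path are illustrative, not part of the PR:

```python
import pytest

from mteb.abstasks import AbsTask
from tests.test_benchmark.task_grid import TASK_TEST_GRID  # assumed import path

@pytest.mark.parametrize("task", TASK_TEST_GRID, ids=lambda t: t.metadata.name)
def test_grid_entries_are_tasks(task: AbsTask):
    # Every grid entry should be a loaded task with metadata attached.
    assert isinstance(task, AbsTask)
    assert task.metadata.name
```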