fix: Added way more training dataset annotations #1765

Open · wants to merge 18 commits into base: main
2 changes: 1 addition & 1 deletion .github/workflows/model_loading.yml
@@ -21,4 +21,4 @@ jobs:

       - name: Install dependencies and run tests
         run: |
-          make model-load-test
+          make model-load-test BASE_BRANCH=${{ github.event.pull_request.base.ref }}
2 changes: 1 addition & 1 deletion Makefile
@@ -41,5 +41,5 @@ build-docs:
 model-load-test:
 	@echo "--- 🚀 Running model load test ---"
 	pip install ".[dev, speedtask, pylate,gritlm,xformers,model2vec]"
-	python scripts/extract_model_names.py
+	python scripts/extract_model_names.py $(BASE_BRANCH)
 	python tests/test_models/model_loading.py --model_name_file scripts/model_names.txt
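
The workflow now forwards the PR's base branch through `make` to `scripts/extract_model_names.py`, so the script can diff against the actual target branch rather than a hard-coded `main`. A minimal sketch of how such a script might consume the argument; the internals (the git invocation and the `mteb/models/` path filter) are assumptions, not the real script:

```python
# Hypothetical sketch only; the real scripts/extract_model_names.py may differ.
import subprocess
import sys


def changed_model_files(base_branch: str) -> list[str]:
    """List files changed relative to the PR's base branch."""
    result = subprocess.run(
        ["git", "diff", "--name-only", f"origin/{base_branch}...HEAD"],
        capture_output=True,
        text=True,
        check=True,
    )
    # Assumed filter: only model implementation files are of interest.
    return [f for f in result.stdout.splitlines() if f.startswith("mteb/models/")]


if __name__ == "__main__":
    # Fall back to "main" when no base branch is passed (e.g. local runs).
    base = sys.argv[1] if len(sys.argv) > 1 else "main"
    print("\n".join(changed_model_files(base)))
```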
68 changes: 39 additions & 29 deletions mteb/abstasks/AbsTask.py
@@ -5,6 +5,7 @@
 import random
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
+from copy import copy
 from typing import Any

 import datasets
@@ -62,6 +63,7 @@ class AbsTask(ABC):
     dataset: dict[HFSubset, DatasetDict] | None = None  # type: ignore
     data_loaded: bool = False
     is_multilingual: bool = False
+    hf_subsets: list[HFSubset] | None = None

     def __init__(self, seed: int = 42, **kwargs: Any):
         self.save_suffix = kwargs.get("save_suffix", "")
@@ -110,10 +112,13 @@ def evaluate(
         self.dataset: dict[HFSubset, DatasetDict]

         scores = {}
-        hf_subsets = list(self.dataset.keys()) if self.is_multilingual else ["default"]
+        if self.hf_subsets is None:
+            hf_subsets = list(self.dataset.keys())
+        else:
+            hf_subsets = copy(self.hf_subsets)

-        if subsets_to_run is not None:
-            hf_subsets = [s for s in hf_subsets if s in subsets_to_run]
+        if subsets_to_run is not None:  # allow overwrites of pre-filtering
+            hf_subsets = subsets_to_run

         for hf_subset in hf_subsets:
             logger.info(
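
The resolution order is now: an explicit `subsets_to_run` argument overrides any pre-filtering stored in `self.hf_subsets`, which in turn overrides the raw dataset keys. A standalone sketch of that precedence (the helper name is illustrative, not part of the PR):

```python
from copy import copy


def resolve_subsets(dataset_keys, hf_subsets, subsets_to_run):
    """Illustrative helper mirroring the subset selection in evaluate()."""
    if hf_subsets is None:
        selected = list(dataset_keys)  # no pre-filtering: run everything
    else:
        selected = copy(hf_subsets)  # pre-filtered, e.g. by filter_languages()
    if subsets_to_run is not None:  # an explicit request wins over pre-filtering
        selected = subsets_to_run
    return selected


assert resolve_subsets(["en", "de-en"], None, None) == ["en", "de-en"]
assert resolve_subsets(["en", "de-en"], ["en"], None) == ["en"]
assert resolve_subsets(["en", "de-en"], ["en"], ["de-en"]) == ["de-en"]
```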
@@ -218,16 +223,13 @@ def calculate_metadata_metrics(
             )
             descriptive_stats[split][hf_subset_stat] = {}

-            eval_langs = (
-                list(self.metadata.eval_langs.keys())
-                if isinstance(self.metadata.eval_langs, dict)
-                else self.metadata.eval_langs
-            )
-
-            pbar_subsets = tqdm.tqdm(eval_langs, desc="Processing Languages...")
+            pbar_subsets = tqdm.tqdm(
+                self.metadata.hf_subsets_to_langscripts,
+                desc="Processing Languages...",
+            )
             for hf_subset in pbar_subsets:
-                pbar_subsets.set_postfix_str(f"Language: {hf_subset}")
-                logger.info(f"Processing metadata for language {hf_subset}")
+                pbar_subsets.set_postfix_str(f"Huggingface subset: {hf_subset}")
+                logger.info(f"Processing metadata for subset {hf_subset}")
                 split_details = self._calculate_metrics_from_split(split, hf_subset)
                 descriptive_stats[split][hf_subset_stat][hf_subset] = split_details
         else:
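
This hunk and the ones below all lean on `metadata.hf_subsets_to_langscripts`, which normalizes the two shapes `eval_langs` can take and so removes the repeated `isinstance` checks. A sketch of the presumed behaviour; the real property lives on `TaskMetadata`:

```python
# Presumed behaviour of TaskMetadata.hf_subsets_to_langscripts; illustrative,
# see TaskMetadata for the actual implementation.
def hf_subsets_to_langscripts(eval_langs):
    if isinstance(eval_langs, dict):
        # Multilingual task: already a mapping of HF subset -> langscripts.
        return eval_langs
    # Monolingual task: a flat list of langscripts under a "default" subset.
    return {"default": eval_langs}
```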
@@ -252,12 +254,8 @@ def metadata_dict(self) -> dict[str, Any]:
     @property
     def languages(self) -> list[str]:
         """Returns the languages of the task"""
-        # check if self.hf_subsets is set
-        if self.is_multilingual and hasattr(self, "hf_subsets"):
-            assert isinstance(
-                self.metadata.eval_langs, dict
-            ), "eval_langs must be dict for multilingual tasks"
-            eval_langs = self.metadata.eval_langs
+        if self.hf_subsets:
+            eval_langs = self.metadata.hf_subsets_to_langscripts
             languages = []

             for lang in self.hf_subsets:
@@ -275,31 +273,43 @@ def filter_eval_splits(self, eval_splits: list[str] | None) -> AbsTask:
         return self

     def filter_languages(
-        self, languages: list[str] | None, script: list[str] | None = None
+        self,
+        languages: list[str] | None,
+        script: list[str] | None = None,
+        hf_subsets: list[HFSubset] | None = None,
+        exclusive_language_filter: bool = False,
     ) -> AbsTask:
         """Filter the languages of the task.

         Args:
             languages: A list of languages to filter the task by. Can be either a 3-letter language code (e.g. "eng") or also include the
                 script (e.g. "eng-Latn").
-            script: list of scripts to filter the task by. Will be ignored if the language code specifies the script. If None, all scripts are included.
+            script: A list of scripts to filter the task by. Will be ignored if the language code specifies the script. If None, all scripts are included.
+                If the language code does not specify the script, the intersection of the language and script will be used.
+            hf_subsets: A list of huggingface subsets to filter on. This is useful if a dataset has multiple subsets containing the desired
+                language, but you only want to test on one. An example is STS22, which has both "en" and "de-en", and both contain English.
+            exclusive_language_filter: Some datasets contain more than one language, e.g. for STS22 the subset "de-en" contains eng and deu.
+                If exclusive_language_filter is set to False, both will be kept; if set to True, only subsets that contain all of the
+                specified languages will be kept.
         """
         lang_scripts = LanguageScripts.from_languages_and_scripts(languages, script)

         subsets_to_keep = []

-        if not isinstance(self.metadata.eval_langs, dict):
-            self.hf_subsets = self.metadata.eval_langs
-            return self
-
-        for hf_subset, langs in self.metadata.eval_langs.items():
-            for langscript in langs:
-                if lang_scripts.contains_language(
-                    langscript
-                ) or lang_scripts.contains_script(langscript):
-                    subsets_to_keep.append(hf_subset)
-                    break
+        for hf_subset, langs in self.metadata.hf_subsets_to_langscripts.items():
+            if (hf_subsets is not None) and (hf_subset not in hf_subsets):
+                continue
+            if exclusive_language_filter is False:
+                for langscript in langs:
+                    if lang_scripts.contains_language(
+                        langscript
+                    ) or lang_scripts.contains_script(langscript):
+                        subsets_to_keep.append(hf_subset)
+                        break
+
+            if exclusive_language_filter is True and languages:
+                if lang_scripts.contains_languages(langs):
+                    subsets_to_keep.append(hf_subset)

         self.hf_subsets = subsets_to_keep
         return self
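
A usage sketch of the new options, following the STS22 example from the docstring (assuming tasks are fetched via `mteb.get_tasks`; the subset names come from the docstring):

```python
import mteb

# Fetch a fresh instance per filter, since filter_languages mutates the task.
task = mteb.get_tasks(tasks=["STS22"])[0]
# Keep every subset containing English at all -> both "en" and "de-en".
task.filter_languages(languages=["eng"])

task = mteb.get_tasks(tasks=["STS22"])[0]
# Keep only subsets whose languages are *all* requested -> "de-en" is
# dropped because it also contains deu.
task.filter_languages(languages=["eng"], exclusive_language_filter=True)

task = mteb.get_tasks(tasks=["STS22"])[0]
# Restrict the candidate subsets up front, before language matching.
task.filter_languages(languages=["eng"], hf_subsets=["de-en"])
```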
4 changes: 1 addition & 3 deletions mteb/abstasks/MultilingualTask.py
@@ -12,9 +12,7 @@ def __init__(self, hf_subsets: list[str] | None = None, **kwargs):
                 lang for lang in hf_subsets if lang in self.metadata.eval_langs
             ]
         if hf_subsets is not None and len(hf_subsets) > 0:
-            self.hf_subsets = (
-                hf_subsets  # TODO: case where user provides langs not in the dataset
-            )
+            self.hf_subsets = hf_subsets
         else:
             self.hf_subsets = self.metadata.eval_langs
         self.is_multilingual = True
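
The removed TODO is already handled by the comprehension above, which drops unknown subset names before the assignment. A small standalone sketch of that constructor logic (`eval_langs` here is a hypothetical example mapping):

```python
# Stand-in sketch of the simplified __init__ logic; eval_langs is illustrative.
eval_langs = {"en": ["eng-Latn"], "de-en": ["deu-Latn", "eng-Latn"]}


def resolve_init_subsets(hf_subsets):
    if hf_subsets:
        # Unknown subset names are silently dropped (this resolves the TODO).
        hf_subsets = [lang for lang in hf_subsets if lang in eval_langs]
    # Fall back to all subsets when nothing valid was requested.
    return hf_subsets if hf_subsets else eval_langs


assert resolve_init_subsets(["en", "not-a-subset"]) == ["en"]
assert resolve_init_subsets(None) == eval_langs
```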