From 748955c367b5c549f4b8d54945361f5bbc7184f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1rton=20Kardos?=
Date: Wed, 15 Jan 2025 16:26:22 +0100
Subject: [PATCH] fix: Added more Chinese models' `ModelMeta` (#1814)

* Added Multilingual USE models

* Added Moka models

* Added dmeta models

* Added jina-zh

* Added piccolo models
---
 mteb/models/misc_models.py                  |  47 ++++++
 mteb/models/moka_models.py                  | 150 ++++++++++++++++++++
 mteb/models/overview.py                     |   1 +
 mteb/models/piccolo_models.py               |  50 +++++++
 mteb/models/sentence_transformers_models.py | 107 +++++++++++++-
 5 files changed, 354 insertions(+), 1 deletion(-)
 create mode 100644 mteb/models/moka_models.py
 create mode 100644 mteb/models/piccolo_models.py

diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py
index d05461af1..5e8fcae0a 100644
--- a/mteb/models/misc_models.py
+++ b/mteb/models/misc_models.py
@@ -1661,3 +1661,50 @@
     adapted_from="intfloat/e5-mistral-7b-instruct",
     superseded_by=None,
 )
+sbert_chinese_general_v1 = ModelMeta(
+    name="DMetaSoul/sbert-chinese-general-v1",
+    revision="bd27765956bcc2fcf682de0097819947ac10037e",
+    release_date="2022-03-25",
+    languages=["zho-Hans"],
+    loader=None,
+    n_parameters=None,  # Not visible on the repo
+    memory_usage=None,
+    max_tokens=512,
+    embed_dim=128,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_data=False,
+    public_training_code=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/DMetaSoul/sbert-chinese-general-v1",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets={
+        "PAWSX": ["train"],
+        "PawsXPairClassification": ["train"],  # the card does not specify which one
+        # They may have trained on other datasets as well; the card only says:
+        # "trained on semantically similar datasets such as NLI, PAWS-X, PKU-Paraphrase-Bank, and STS."
+    },
+    superseded_by=None,
+)
+dmeta_embedding_zh_small = ModelMeta(
+    name="DMetaSoul/Dmeta-embedding-zh-small",
+    revision="2050d3439a2f68999dd648c1697471acaac37a29",
+    release_date="2024-03-25",
+    languages=["zho-Hans"],
+    loader=None,
+    n_parameters=74.2 * 1e6,
+    memory_usage=None,
+    max_tokens=1024,
+    embed_dim=768,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_data=False,
+    public_training_code=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/DMetaSoul/Dmeta-embedding-zh-small/",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets=None,  # They don't specify
+    superseded_by=None,
+)
diff --git a/mteb/models/moka_models.py b/mteb/models/moka_models.py
new file mode 100644
index 000000000..cf9b96f88
--- /dev/null
+++ b/mteb/models/moka_models.py
@@ -0,0 +1,150 @@
+"""Moka AI's Chinese embedding models"""
+
+from __future__ import annotations
+
+from mteb.model_meta import ModelMeta
+
+sent_trf_training_dataset = {
+    # derived from datasheets
+    "MSMARCO": ["train"],
+    "MSMARCOHardNegatives": ["train"],
+    "NanoMSMARCORetrieval": ["train"],
+    "MSMARCO-PL": ["train"],  # translation, not trained on
+    "NQ": ["train"],
+    "NQHardNegatives": ["train"],
+    "NanoNQRetrieval": ["train"],
+    "NQ-PL": ["train"],  # translation, not trained on
+    # not in MTEB:
+    # "s2orc": ["train"],
+    # "flax-sentence-embeddings/stackexchange_xml": ["train"],
+    # "ms_marco": ["train"],
+    # "gooaq": ["train"],
+    # "yahoo_answers_topics": ["train"],
+    # "code_search_net": ["train"],
+    # "search_qa": ["train"],
+    # "eli5": ["train"],
+    # "snli": ["train"],
+    # "multi_nli": ["train"],
+    # "wikihow": ["train"],
+    # "natural_questions": ["train"],
+    # "trivia_qa": ["train"],
+    # "embedding-data/sentence-compression": ["train"],
+    # "embedding-data/flickr30k-captions": ["train"],
+    # "embedding-data/altlex": ["train"],
+    # "embedding-data/simple-wiki": ["train"],
+    # "embedding-data/QQP": ["train"],
+    # "embedding-data/SPECTER": ["train"],
+    # "embedding-data/PAQ_pairs": ["train"],
+    # "embedding-data/WikiAnswers": ["train"],
+}
+medi_dataset = {
+    **sent_trf_training_dataset,
+    # not in MTEB:
+    # - Super-NI
+    # - KILT (https://arxiv.org/abs/2009.02252)
+    # - MedMCQA (https://proceedings.mlr.press/v174/pal22a/pal22a.pdf)
+}
+m3e_dataset = {
+    **medi_dataset,
+    "AmazonReviewsClassification": ["train"],  # Possibly also test, hard to know
+    "Ocnli": ["train"],
+    "BQ": ["train"],
+    "LCQMC": ["train"],
+    "MIRACLReranking": ["train"],
+    "PAWSX": ["train"],
+    # not in MTEB:
+    # - cmrc2018
+    # - belle_2m
+    # - firefly
+    # - alpaca_gpt4
+    # - zhihu_kol
+    # - hc3_chinese
+    # - amazon_reviews_multi (intersects with AmazonReviewsClassification)
+    # - qa: Encyclopedia QA dataset
+    # - xlsum
+    # - wiki_atomic_edit
+    # - chatmed_consult
+    # - webqa
+    # - dureader_robust
+    # - csl
+    # - lawzhidao
+    # - CINLID
+    # - DuSQL
+    # - Zhuiyi-NL2SQL
+    # - Cspider
+    # - news2016zh
+    # - baike2018qa
+    # - webtext2019zh
+    # - SimCLUE
+    # - SQuAD
+}
+
+m3e_base = ModelMeta(
+    name="moka-ai/m3e-base",
+    languages=["zho-Hans", "eng-Latn"],
+    open_weights=True,
+    revision="764b537a0e50e5c7d64db883f2d2e051cbe3c64c",
+    release_date="2023-06-06",  # first commit
+    n_parameters=102 * 1e6,
+    memory_usage=None,
+    embed_dim=768,
+    # They don't give a specific license, but commercial use is not allowed
+    license="unspecified-noncommercial",
+    max_tokens=512,
+    reference="https://huggingface.co/moka-ai/m3e-base",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
"PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=False, # Not published + public_training_data=False, # They haven't published it yet + training_datasets=m3e_dataset, +) + +m3e_small = ModelMeta( + name="moka-ai/m3e-small", + languages=["zho_Hans", "eng-Latn"], + open_weights=True, + revision="44c696631b2a8c200220aaaad5f987f096e986df", + release_date="2023-06-02", # first commit + n_parameters=None, # Can't be seen on HF page + memory_usage=None, + embed_dim=512, + # They don't give a specific license but commercial use is not allowed + license="unspecified-noncommercial", + max_tokens=512, + reference="https://huggingface.co/moka-ai/m3e-small", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=False, # Not published + public_training_data=False, # They haven't published it yet + training_datasets=m3e_dataset, +) + + +m3e_large = ModelMeta( + name="moka-ai/m3e-large", + languages=["zho_Hans", "eng-Latn"], + open_weights=True, + revision="12900375086c37ba5d83d1e417b21dc7d1d1f388", + release_date="2023-06-21", # first commit + n_parameters=None, # Can't be seen on HF page + memory_usage=None, + embed_dim=768, + # They don't give a specific license but commercial use is not allowed + license="unspecified-noncommercial", + max_tokens=512, + reference="https://huggingface.co/moka-ai/m3e-large", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=False, # Not published + public_training_data=False, # They haven't published it yet + training_datasets=m3e_dataset, +) diff --git a/mteb/models/overview.py b/mteb/models/overview.py index 4e19bed19..f1abb1014 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -29,6 +29,7 @@ llm2vec_models, misc_models, model2vec_models, + moka_models, mxbai_models, no_instruct_sentence_models, nomic_models, diff --git a/mteb/models/piccolo_models.py b/mteb/models/piccolo_models.py new file mode 100644 index 000000000..17ea1fc2a --- /dev/null +++ b/mteb/models/piccolo_models.py @@ -0,0 +1,50 @@ +"""Piccolo Chinese embedding models by SenseNova""" + +from __future__ import annotations + +from mteb.model_meta import ModelMeta + +piccolo_base_zh = ModelMeta( + name="sensenova/piccolo-base-zh", + languages=["zho_Hans"], + open_weights=True, + revision="47c0a63b8f667c3482e05b2fd45577bb19252196", + release_date="2023-09-04", # first commit + n_parameters=None, # can't see on model card + memory_usage=None, + embed_dim=768, + license="mit", + max_tokens=512, + reference="https://huggingface.co/sensenova/piccolo-base-zh", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=False, + public_training_data=False, + training_datasets=None, # They don't specify +) + +piccolo_large_zh_v2 = ModelMeta( + name="sensenova/piccolo-large-zh-v2", + languages=["zho_Hans"], + open_weights=False, # They "temporarily" removed it in may last year + # "Due to certain internal company considerations" + revision="05948c1d889355936bdf9db7d30df57dd78d25a3", + release_date="2024-04-22", # first commit + n_parameters=None, # we don't know because they removed the model + memory_usage=None, + embed_dim=1024, + license="not specified", + max_tokens=512, + 
reference="https://huggingface.co/sensenova/piccolo-large-zh-v2", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=False, + public_training_data=False, + training_datasets=None, # They don't say +) diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index 28349d60d..18b08f16f 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -218,7 +218,7 @@ "NQ": ["test"], "NQHardNegatives": ["test"], "MSMARCO": ["train"], - # Non MTEB sources + # Non MTEB source # "s2orc": ["train"], # "flax-sentence-embeddings/stackexchange_xml": ["train"], # "ms_marco": ["train"], @@ -242,6 +242,82 @@ }, ) +# Source: https://arxiv.org/pdf/1907.04307 +use_multilingual_languages = [ + "ara-Arab", # Arabic + "zho-Hans", # Chinese (Simplified, PRC) + "zho-Hant", # Chinese (Traditional, Taiwan) + "nld-Latn", # Dutch + "eng-Latn", # English + "deu-Latn", # German + "fra-Latn", # French + "ita-Latn", # Italian + "por-Latn", # Portuguese + "spa-Latn", # Spanish + "jpn-Jpan", # Japanese + "kor-Kore", # Korean + "rus-Cyrl", # Russian + "pol-Latn", # Polish + "tha-Thai", # Thai + "tur-Latn", # Turkish +] +use_multilingual_training_data = { + # I'm not certain since they mined this themselves, but I would assume that there is significant overlap + "StackOverflowQARetrieval": ["train", "test"], + # Not in MTEB: + # - SNLI translated to 15 languages (could have intersections with other NLI datasets) + # - Translation pairs: Mined from the internet + # - QA mined from Reddit, StackOverflow, YahooAnswers (could be problematic) +} +distiluse_base_multilingual_cased_v2 = ModelMeta( + name="sentence-transformers/distiluse-base-multilingual-cased-v2", + languages=use_multilingual_languages, + open_weights=True, + revision="dad0fa1ee4fa6e982d3adbce87c73c02e6aee838", + release_date="2021-06-22", # First commit + n_parameters=135 * 1e6, + memory_usage=None, + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from=None, + public_training_code=True, + public_training_data=True, + training_datasets=use_multilingual_training_data, +) + +use_cmlm_multilingual = ModelMeta( + name="sentence-transformers/use-cmlm-multilingual", + languages=paraphrase_langs, + open_weights=True, + revision="6f8ff6583c371cbc4d6d3b93a5e37a888fd54574", + release_date="2022-04-14", # First commit + n_parameters=472 * 1e6, + memory_usage=None, + embed_dim=768, + license="apache-2.0", + max_tokens=256, + reference="https://huggingface.co/sentence-transformers/use-cmlm-multilingual", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + superseded_by=None, + adapted_from="sentence-transformers/LaBSE", + public_training_code=True, + public_training_data=True, + training_datasets={ + # Not in MTEB: + # - SNLI + # - Translation corpus based largely on Uszkoreit et al. 
+    },
+)
+
+
 jina_embeddings_v2_base_en = ModelMeta(
     name="jinaai/jina-embeddings-v2-base-en",
     languages=["eng-Latn"],
@@ -262,6 +338,35 @@
     training_datasets={"allenai/c4": ["train"]},
 )
 
+jina_embeddings_v2_base_zh = ModelMeta(
+    name="jinaai/jina-embeddings-v2-base-zh",
+    languages=["eng-Latn", "zho-Hans"],
+    open_weights=True,
+    revision="c1ff9086a89a1123d7b5eff58055a665db4fb4b9",
+    release_date="2024-01-10",
+    n_parameters=161_000_000,
+    memory_usage=None,
+    embed_dim=768,
+    license="apache-2.0",
+    max_tokens=8192,
+    reference="https://huggingface.co/jinaai/jina-embeddings-v2-base-zh",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
+    superseded_by=None,
+    adapted_from=None,
+    training_datasets={
+        # source: https://arxiv.org/pdf/2402.17016
+        "XNLI": ["train"],
+        "MLSumClusteringS2S": ["train"],
+        "MLSumClusteringP2P": ["train"],
+        # Not in MTEB:
+        # - MQA
+        # - XLSUM
+    },
+)
+
+
 jina_embeddings_v2_small_en = ModelMeta(
     name="jinaai/jina-embeddings-v2-small-en",
     languages=["eng-Latn"],
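
Usage sketch (illustrative, not part of the patch): once a module is imported in mteb/models/overview.py, as this diff does for moka_models, its entries should be resolvable by model name through mteb's registry. A minimal check in Python, assuming the public mteb.get_model_meta helper and the metadata declared above:

import mteb

# Look up one of the newly registered models by its Hugging Face name.
meta = mteb.get_model_meta("moka-ai/m3e-base")

print(meta.languages)   # expected: ["zho-Hans", "eng-Latn"]
print(meta.max_tokens)  # expected: 512

# The declared training datasets let evaluation code flag potential
# train/test contamination; m3e lists PAWSX as a training split above.
print("PAWSX" in (meta.training_datasets or {}))  # expected: True

Since the ModelMeta entries here set loader=None or omit it, the models load through the default Sentence Transformers path; the metadata above mainly drives leaderboard display and contamination checks rather than inference.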