fix: Added more Chinese models' ModelMeta (#1814)
* Added Multilingual USE models

* Added Moka models

* Added dmeta models

* Added jina-zh

* Added piccolo models
x-tabdeveloping authored Jan 15, 2025
1 parent 186cc23 commit 748955c
Showing 5 changed files with 354 additions and 1 deletion.
47 changes: 47 additions & 0 deletions mteb/models/misc_models.py
@@ -1661,3 +1661,50 @@
adapted_from="intfloat/e5-mistral-7b-instruct",
superseded_by=None,
)
sbert_chinese_general_v1 = ModelMeta(
name="DMetaSoul/sbert-chinese-general-v1",
revision="bd27765956bcc2fcf682de0097819947ac10037e",
release_date="2022-03-25",
languages=["zho-Hans"],
loader=None,
n_parameters=None, # Not visible on repo
memory_usage=None,
max_tokens=512,
embed_dim=128,
license="apache-2.0",
open_weights=True,
public_training_data=False,
public_training_code=None,
framework=["PyTorch", "Sentence Transformers"],
reference="https://huggingface.co/DMetaSoul/sbert-chinese-general-v1",
similarity_fn_name="cosine",
use_instructions=None,
training_datasets={
"PAWSX": ["train"],
"PawsXPairClassification": ["train"], # they do not specify which one
# They might have trained on other datasets too, they don't say:
# "trained on semantically similar datasets such as NLI, PAWS-X, PKU-Paraphrase-Bank, and STS."
},
superseded_by=None,
)
dmeta_embedding_zh_small = ModelMeta(
name="DMetaSoul/Dmeta-embedding-zh-small",
revision="2050d3439a2f68999dd648c1697471acaac37a29",
release_date="2024-03-25",
languages=["zho-Hans"],
loader=None,
n_parameters=74.2 * 1e6,
memory_usage=None,
max_tokens=1024,
embed_dim=768,
license="apache-2.0",
open_weights=True,
public_training_data=False,
public_training_code=None,
framework=["PyTorch", "Sentence Transformers"],
reference="https://huggingface.co/DMetaSoul/Dmeta-embedding-zh-small/",
similarity_fn_name="cosine",
use_instructions=None,
training_datasets=None, # They don't specify
superseded_by=None,
)
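
The new entries are plain ModelMeta declarations, so once registered they can be inspected at runtime. A minimal sketch, assuming mteb.get_model_meta is available as in current mteb releases:

import mteb

# Look up one of the newly added entries by model name.
meta = mteb.get_model_meta("DMetaSoul/Dmeta-embedding-zh-small")
print(meta.embed_dim)   # 768
print(meta.max_tokens)  # 1024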
150 changes: 150 additions & 0 deletions mteb/models/moka_models.py
@@ -0,0 +1,150 @@
"""Moka AI's Chinese embedding models"""

from __future__ import annotations

from mteb.model_meta import ModelMeta

sent_trf_training_dataset = {
# derived from datasheets
"MSMARCO": ["train"],
"MSMARCOHardNegatives": ["train"],
"NanoMSMARCORetrieval": ["train"],
"MSMARCO-PL": ["train"], # translation not trained on
"NQ": ["train"],
"NQHardNegatives": ["train"],
"NanoNQRetrieval": ["train"],
"NQ-PL": ["train"], # translation not trained on
# not in MTEB
# "s2orc": ["train"],
# "flax-sentence-embeddings/stackexchange_xml": ["train"],
# "ms_marco": ["train"],
# "gooaq": ["train"],
# "yahoo_answers_topics": ["train"],
# "code_search_net": ["train"],
# "search_qa": ["train"],
# "eli5": ["train"],
# "snli": ["train"],
# "multi_nli": ["train"],
# "wikihow": ["train"],
# "natural_questions": ["train"],
# "trivia_qa": ["train"],
# "embedding-data/sentence-compression": ["train"],
# "embedding-data/flickr30k-captions": ["train"],
# "embedding-data/altlex": ["train"],
# "embedding-data/simple-wiki": ["train"],
# "embedding-data/QQP": ["train"],
# "embedding-data/SPECTER": ["train"],
# "embedding-data/PAQ_pairs": ["train"],
# "embedding-data/WikiAnswers": ["train"],
}
medi_dataset = {
**sent_trf_training_dataset,
# not in MTEB:
# - Super-NI
# - KILT (https://arxiv.org/abs/2009.02252)
# - MedMCQA (https://proceedings.mlr.press/v174/pal22a/pal22a.pdf)
}
m3e_dataset = {
**medi_dataset,
"AmazonReviewsClassification": ["train"], # Possibly also test, hard to know
"Ocnli": ["train"],
"BQ": ["train"],
"LCQMC": ["train"],
"MIRACLReranking": ["train"],
"PAWSX": ["train"],
# not in MTEB:
# - cmrc2018
# - belle_2m
# - firefly
# - alpaca_gpt4
# - zhihu_kol
# - hc3_chinese
# - amazon_reviews_multi (intersects with AmazonReviewsClassification)
# - qa: Encyclopedia QA dataset
# - xlsum
# - wiki_atomic_edit
# - chatmed_consult
# - webqa
# - dureader_robust
# - csl
# - lawzhidao
# - CINLID
# - DuSQL
# - Zhuiyi-NL2SQL
# - Cspider
# - news2016zh
# - baike2018qa
# - webtext2019zh
# - SimCLUE
# - SQuAD
}
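
The three dicts above are layered with ** unpacking, so each level should be a superset of the previous one. A hypothetical sanity check of that invariant (not part of the commit):

# Hypothetical check: the dataset dicts must nest monotonically.
assert set(sent_trf_training_dataset) <= set(medi_dataset) <= set(m3e_dataset)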

m3e_base = ModelMeta(
name="moka-ai/m3e-base",
languages=["zho-Hans", "eng-Latn"],
open_weights=True,
revision="764b537a0e50e5c7d64db883f2d2e051cbe3c64c",
release_date="2023-06-06", # first commit
n_parameters=102 * 1e6,
memory_usage=None,
embed_dim=768,
# They don't give a specific license but commercial use is not allowed
license="unspecified-noncommercial",
max_tokens=512,
reference="https://huggingface.co/moka-ai/m3e-base",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
superseded_by=None,
adapted_from=None,
public_training_code=False, # Not published
public_training_data=False, # They haven't published it yet
training_datasets=m3e_dataset,
)

m3e_small = ModelMeta(
name="moka-ai/m3e-small",
languages=["zho-Hans", "eng-Latn"],
open_weights=True,
revision="44c696631b2a8c200220aaaad5f987f096e986df",
release_date="2023-06-02", # first commit
n_parameters=None, # Can't be seen on HF page
memory_usage=None,
embed_dim=512,
# They don't give a specific license but commercial use is not allowed
license="unspecified-noncommercial",
max_tokens=512,
reference="https://huggingface.co/moka-ai/m3e-small",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
superseded_by=None,
adapted_from=None,
public_training_code=False, # Not published
public_training_data=False, # They haven't published it yet
training_datasets=m3e_dataset,
)


m3e_large = ModelMeta(
name="moka-ai/m3e-large",
languages=["zho-Hans", "eng-Latn"],
open_weights=True,
revision="12900375086c37ba5d83d1e417b21dc7d1d1f388",
release_date="2023-06-21", # first commit
n_parameters=None, # Can't be seen on HF page
memory_usage=None,
embed_dim=768,
# They don't give a specific license but commercial use is not allowed
license="unspecified-noncommercial",
max_tokens=512,
reference="https://huggingface.co/moka-ai/m3e-large",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
superseded_by=None,
adapted_from=None,
public_training_code=False, # Not published
public_training_data=False, # They haven't published it yet
training_datasets=m3e_dataset,
)
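
Since all three m3e entries share the same m3e_dataset dict, a train/test-overlap check against MTEB tasks only needs one pass over it. An illustrative sketch (the filtering logic is hypothetical, not part of mteb):

# Illustrative: list MTEB tasks whose train split the m3e models saw.
seen_in_training = sorted(
    task for task, splits in m3e_dataset.items() if "train" in splits
)
print(seen_in_training)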
1 change: 1 addition & 0 deletions mteb/models/overview.py
@@ -29,6 +29,7 @@
llm2vec_models,
misc_models,
model2vec_models,
moka_models,
mxbai_models,
no_instruct_sentence_models,
nomic_models,
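
For context, overview.py aggregates every ModelMeta declared in the modules imported above into a registry keyed by model name, which is why adding moka_models to the import list is all the registration the new file needs. A simplified sketch of that mechanism (illustrative, not the verbatim implementation):

# Illustrative sketch of how overview.py collects ModelMeta objects.
from mteb.model_meta import ModelMeta
from mteb.models import misc_models, moka_models, piccolo_models  # etc.

model_modules = [misc_models, moka_models, piccolo_models]
MODEL_REGISTRY = {}
for module in model_modules:
    for obj in vars(module).values():
        if isinstance(obj, ModelMeta):
            MODEL_REGISTRY[obj.name] = obj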
50 changes: 50 additions & 0 deletions mteb/models/piccolo_models.py
@@ -0,0 +1,50 @@
"""Piccolo Chinese embedding models by SenseNova"""

from __future__ import annotations

from mteb.model_meta import ModelMeta

piccolo_base_zh = ModelMeta(
name="sensenova/piccolo-base-zh",
languages=["zho-Hans"],
open_weights=True,
revision="47c0a63b8f667c3482e05b2fd45577bb19252196",
release_date="2023-09-04", # first commit
n_parameters=None, # can't see on model card
memory_usage=None,
embed_dim=768,
license="mit",
max_tokens=512,
reference="https://huggingface.co/sensenova/piccolo-base-zh",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
superseded_by=None,
adapted_from=None,
public_training_code=False,
public_training_data=False,
training_datasets=None, # They don't specify
)

piccolo_large_zh_v2 = ModelMeta(
name="sensenova/piccolo-large-zh-v2",
languages=["zho-Hans"],
open_weights=False, # They "temporarily" removed it in May 2024
# "due to certain internal company considerations"
revision="05948c1d889355936bdf9db7d30df57dd78d25a3",
release_date="2024-04-22", # first commit
n_parameters=None, # we don't know because they removed the model
memory_usage=None,
embed_dim=1024,
license="not specified",
max_tokens=512,
reference="https://huggingface.co/sensenova/piccolo-large-zh-v2",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
superseded_by=None,
adapted_from=None,
public_training_code=False,
public_training_data=False,
training_datasets=None, # They don't say
)
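
Because piccolo-large-zh-v2 is registered with open_weights=False, downstream tooling can filter it out when selecting models that can actually be downloaded and run. A hedged sketch, assuming mteb.get_model_metas() exists as in recent mteb versions:

import mteb

# Keep only Chinese-capable models whose weights are still downloadable.
metas = mteb.get_model_metas()  # assumed helper returning all registered ModelMeta objects
runnable_zh = [
    m for m in metas
    if m.open_weights and "zho-Hans" in (m.languages or [])
]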
107 changes: 106 additions & 1 deletion mteb/models/sentence_transformers_models.py
@@ -218,7 +218,7 @@
"NQ": ["test"],
"NQHardNegatives": ["test"],
"MSMARCO": ["train"],
# Non MTEB sources
# "s2orc": ["train"],
# "flax-sentence-embeddings/stackexchange_xml": ["train"],
# "ms_marco": ["train"],
@@ -242,6 +242,82 @@
},
)

# Source: https://arxiv.org/pdf/1907.04307
use_multilingual_languages = [
"ara-Arab", # Arabic
"zho-Hans", # Chinese (Simplified, PRC)
"zho-Hant", # Chinese (Traditional, Taiwan)
"nld-Latn", # Dutch
"eng-Latn", # English
"deu-Latn", # German
"fra-Latn", # French
"ita-Latn", # Italian
"por-Latn", # Portuguese
"spa-Latn", # Spanish
"jpn-Jpan", # Japanese
"kor-Kore", # Korean
"rus-Cyrl", # Russian
"pol-Latn", # Polish
"tha-Thai", # Thai
"tur-Latn", # Turkish
]
use_multilingual_training_data = {
# I'm not certain since they mined this themselves, but I would assume that there is significant overlap
"StackOverflowQARetrieval": ["train", "test"],
# Not in MTEB:
# - SNLI translated to 15 languages (could have intersections with other NLI datasets)
# - Translation pairs: Mined from the internet
# - QA mined from Reddit, StackOverflow, YahooAnswers (could be problematic)
}
distiluse_base_multilingual_cased_v2 = ModelMeta(
name="sentence-transformers/distiluse-base-multilingual-cased-v2",
languages=use_multilingual_languages,
open_weights=True,
revision="dad0fa1ee4fa6e982d3adbce87c73c02e6aee838",
release_date="2021-06-22", # First commit
n_parameters=135 * 1e6,
memory_usage=None,
embed_dim=768,
license="apache-2.0",
max_tokens=512,
reference="https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
superseded_by=None,
adapted_from=None,
public_training_code=True,
public_training_data=True,
training_datasets=use_multilingual_training_data,
)

use_cmlm_multilingual = ModelMeta(
name="sentence-transformers/use-cmlm-multilingual",
languages=paraphrase_langs,
open_weights=True,
revision="6f8ff6583c371cbc4d6d3b93a5e37a888fd54574",
release_date="2022-04-14", # First commit
n_parameters=472 * 1e6,
memory_usage=None,
embed_dim=768,
license="apache-2.0",
max_tokens=256,
reference="https://huggingface.co/sentence-transformers/use-cmlm-multilingual",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
superseded_by=None,
adapted_from="sentence-transformers/LaBSE",
public_training_code=True,
public_training_data=True,
training_datasets={
# Not in MTEB:
# - SNLI
# - Translation corpus based largely on Uszkoreit et al. (2010)
},
)


jina_embeddings_v2_base_en = ModelMeta(
name="jinaai/jina-embeddings-v2-base-en",
languages=["eng-Latn"],
@@ -262,6 +338,35 @@
training_datasets={"allenai/c4": ["train"]},
)

jina_embeddings_v2_base_zh = ModelMeta(
name="jinaai/jina-embeddings-v2-base-zh",
languages=["eng-Latn", "zho-Hans"],
open_weights=True,
revision="c1ff9086a89a1123d7b5eff58055a665db4fb4b9",
release_date="2024-01-10",
n_parameters=161_000_000,
memory_usage=None,
embed_dim=768,
license="apache-2.0",
max_tokens=8192,
reference="https://huggingface.co/jinaai/jina-embeddings-v2-base-zh",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
superseded_by=None,
adapted_from=None,
training_datasets={
# source: https://arxiv.org/pdf/2402.17016
"XNLI": ["train"],
"MLSumClusteringS2S": ["train"],
"MLSumClusteringP2P": ["train"],
# Not in MTEB:
# - MQA
# - XLSUM
},
)


jina_embeddings_v2_small_en = ModelMeta(
name="jinaai/jina-embeddings-v2-small-en",
languages=["eng-Latn"],
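
As a usage note, the jina v2 checkpoints ship custom modeling code for their 8192-token attention, so loading them through sentence-transformers requires trust_remote_code. A sketch, not part of this diff:

from sentence_transformers import SentenceTransformer

# jina-embeddings-v2 models rely on custom remote code for long-context attention.
model = SentenceTransformer(
    "jinaai/jina-embeddings-v2-base-zh", trust_remote_code=True
)
# Bilingual inputs, matching the model's zh/en coverage.
embeddings = model.encode(["样例文本", "an example sentence"])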
