-
Notifications
You must be signed in to change notification settings - Fork 298
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: Added more Chinese models' `ModelMeta` (#1814)
* Added Multilingual USE models
* Added Moka models
* Added dmeta models
* Added jina-zh
* Added piccolo models
- Loading branch information
1 parent
186cc23
commit 748955c
Showing 5 changed files with 354 additions and 1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
"""Moka AI's Chinese embedding models""" | ||
|
||
from __future__ import annotations | ||
|
||
from mteb.model_meta import ModelMeta | ||
|
||
# Base training-data annotation reused by the dataset mappings defined later
# in this module. Each key is a task name and each value is the list of that
# task's splits that were used for training.
sent_trf_training_dataset = {
    # derived from datasheets
    "MSMARCO": ["train"],
    "MSMARCOHardNegatives": ["train"],
    "NanoMSMARCORetrieval": ["train"],
    "MSMARCO-PL": ["train"],  # translation not trained on
    "NQ": ["train"],
    "NQHardNegatives": ["train"],
    "NanoNQRetrieval": ["train"],
    "NQ-PL": ["train"],  # translation not trained on
    # The sources below are not in MTEB, so they are recorded as comments only:
    # "s2orc": ["train"],
    # "flax-sentence-embeddings/stackexchange_xml": ["train"],
    # "ms_marco": ["train"],
    # "gooaq": ["train"],
    # "yahoo_answers_topics": ["train"],
    # "code_search_net": ["train"],
    # "search_qa": ["train"],
    # "eli5": ["train"],
    # "snli": ["train"],
    # "multi_nli": ["train"],
    # "wikihow": ["train"],
    # "natural_questions": ["train"],
    # "trivia_qa": ["train"],
    # "embedding-data/sentence-compression": ["train"],
    # "embedding-data/flickr30k-captions": ["train"],
    # "embedding-data/altlex": ["train"],
    # "embedding-data/simple-wiki": ["train"],
    # "embedding-data/QQP": ["train"],
    # "embedding-data/SPECTER": ["train"],
    # "embedding-data/PAQ_pairs": ["train"],
    # "embedding-data/WikiAnswers": ["train"],
}
# Extends ``sent_trf_training_dataset`` without adding any new keys: the extra
# sources listed below are not in MTEB, so they are only noted in comments.
medi_dataset = {
    **sent_trf_training_dataset,
    # not in MTEB:
    # - Super-NI
    # - KILT (https://arxiv.org/abs/2009.02252)
    # - MedMCQA (https://proceedings.mlr.press/v174/pal22a/pal22a.pdf)
}
# Training-data annotation attached to every m3e model entry in this module.
# Starts from ``medi_dataset`` and adds further tasks; each value is the list
# of splits used for training.
m3e_dataset = {
    **medi_dataset,
    "AmazonReviewsClassification": ["train"],  # Possibly also test, hard to know
    "Ocnli": ["train"],
    "BQ": ["train"],
    "LCQMC": ["train"],
    "MIRACLReranking": ["train"],
    "PAWSX": ["train"],
    # not in MTEB:
    # - cmrc2018
    # - belle_2m
    # - firefly
    # - alpaca_gpt4
    # - zhihu_kol
    # - hc3_chinese
    # - amazon_reviews_multi (intersects with AmazonReviewsClassification)
    # - qa: Encyclopedia QA dataset
    # - xlsum
    # - wiki_atomic_edit
    # - chatmed_consult
    # - webqa
    # - dureader_robust
    # - csl
    # - lawzhidao
    # - CINLID
    # - DuSQL
    # - Zhuiyi-NL2SQL
    # - Cspider
    # - news2016zh
    # - baike2018qa
    # - webtext2019zh
    # - SimCLUE
    # - SQuAD
}
|
||
# Metadata entry for the ``moka-ai/m3e-base`` checkpoint.
m3e_base = ModelMeta(
    name="moka-ai/m3e-base",
    # Underscore-separated language/script codes, matching "zho_Hans".
    languages=["zho_Hans", "eng_Latn"],
    open_weights=True,
    revision="764b537a0e50e5c7d64db883f2d2e051cbe3c64c",
    release_date="2023-06-06",  # first commit
    n_parameters=102_000_000,  # integer count (102M) rather than a float product
    memory_usage=None,
    embed_dim=768,
    # They don't give a specific license but commercial use is not allowed
    license="unspecified-noncommercial",
    max_tokens=512,
    reference="https://huggingface.co/moka-ai/m3e-base",
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=False,
    superseded_by=None,
    adapted_from=None,
    public_training_code=False,  # Not published
    public_training_data=False,  # They haven't published it yet
    training_datasets=m3e_dataset,
)
|
||
# Metadata entry for the ``moka-ai/m3e-small`` checkpoint.
m3e_small = ModelMeta(
    name="moka-ai/m3e-small",
    # Underscore-separated language/script codes, matching "zho_Hans".
    languages=["zho_Hans", "eng_Latn"],
    open_weights=True,
    revision="44c696631b2a8c200220aaaad5f987f096e986df",
    release_date="2023-06-02",  # first commit
    n_parameters=None,  # Can't be seen on HF page
    memory_usage=None,
    embed_dim=512,
    # They don't give a specific license but commercial use is not allowed
    license="unspecified-noncommercial",
    max_tokens=512,
    reference="https://huggingface.co/moka-ai/m3e-small",
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=False,
    superseded_by=None,
    adapted_from=None,
    public_training_code=False,  # Not published
    public_training_data=False,  # They haven't published it yet
    training_datasets=m3e_dataset,
)
|
||
|
||
# Metadata entry for the ``moka-ai/m3e-large`` checkpoint.
m3e_large = ModelMeta(
    name="moka-ai/m3e-large",
    # Underscore-separated language/script codes, matching "zho_Hans".
    languages=["zho_Hans", "eng_Latn"],
    open_weights=True,
    revision="12900375086c37ba5d83d1e417b21dc7d1d1f388",
    release_date="2023-06-21",  # first commit
    n_parameters=None,  # Can't be seen on HF page
    memory_usage=None,
    embed_dim=768,
    # They don't give a specific license but commercial use is not allowed
    license="unspecified-noncommercial",
    max_tokens=512,
    reference="https://huggingface.co/moka-ai/m3e-large",
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=False,
    superseded_by=None,
    adapted_from=None,
    public_training_code=False,  # Not published
    public_training_data=False,  # They haven't published it yet
    training_datasets=m3e_dataset,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
"""Piccolo Chinese embedding models by SenseNova""" | ||
|
||
from __future__ import annotations | ||
|
||
from mteb.model_meta import ModelMeta | ||
|
||
# Metadata entry for the ``sensenova/piccolo-base-zh`` checkpoint.
piccolo_base_zh = ModelMeta(
    name="sensenova/piccolo-base-zh",
    languages=["zho_Hans"],
    open_weights=True,
    revision="47c0a63b8f667c3482e05b2fd45577bb19252196",
    release_date="2023-09-04",  # first commit
    n_parameters=None,  # can't see on model card
    memory_usage=None,
    embed_dim=768,
    license="mit",
    max_tokens=512,
    reference="https://huggingface.co/sensenova/piccolo-base-zh",
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=False,
    superseded_by=None,
    adapted_from=None,
    public_training_code=False,
    public_training_data=False,
    training_datasets=None,  # They don't specify
)
|
||
# Metadata entry for the ``sensenova/piccolo-large-zh-v2`` checkpoint.
piccolo_large_zh_v2 = ModelMeta(
    name="sensenova/piccolo-large-zh-v2",
    languages=["zho_Hans"],
    # They "temporarily" removed the weights in May last year,
    # "Due to certain internal company considerations"
    open_weights=False,
    revision="05948c1d889355936bdf9db7d30df57dd78d25a3",
    release_date="2024-04-22",  # first commit
    n_parameters=None,  # we don't know because they removed the model
    memory_usage=None,
    embed_dim=1024,
    license="not specified",
    max_tokens=512,
    reference="https://huggingface.co/sensenova/piccolo-large-zh-v2",
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=False,
    superseded_by=None,
    adapted_from=None,
    public_training_code=False,
    public_training_data=False,
    training_datasets=None,  # They don't say
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters