Skip to content

Commit

Permalink
sync fork (#7)
Browse files Browse the repository at this point in the history
* fix: Unable to run vectordbbench and cli

fix: remove comma of logging str
fix cli unable to run zilliztech#444

Signed-off-by: yangxuan <[email protected]>

* enhance: Unify optimize and remove ready_to_load

PyMilvus used to be the only client that used ready_to_load.
Now it'll load the collection when creating it, so
this PR removes `ready_to_load` from the client.API

Also this PR enhances optimize and removes the optimize_with_size

Signed-off-by: yangxuan <[email protected]>

* add mongodb client

Signed-off-by: zhuwenxing <[email protected]>

* add mongodb client in readme

Signed-off-by: zhuwenxing <[email protected]>

---------

Signed-off-by: yangxuan <[email protected]>
Signed-off-by: zhuwenxing <[email protected]>
Co-authored-by: yangxuan <[email protected]>
Co-authored-by: zhuwenxing <[email protected]>
  • Loading branch information
3 people authored Jan 16, 2025
1 parent 5ebc3e3 commit 729b5b9
Show file tree
Hide file tree
Showing 31 changed files with 383 additions and 260 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ All the database client supported
| chromadb | `pip install vectordb-bench[chromadb]` |
| awsopensearch | `pip install vectordb-bench[opensearch]` |
| aliyun_opensearch | `pip install vectordb-bench[aliyun_opensearch]` |
| mongodb | `pip install vectordb-bench[mongodb]` |

### Run

Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ memorydb = [ "memorydb" ]
chromadb = [ "chromadb" ]
opensearch = [ "opensearch-py" ]
aliyun_opensearch = [ "alibabacloud_ha3engine_vector", "alibabacloud_searchengine20211025"]
mongodb = [ "pymongo" ]

[project.urls]
"repository" = "https://github.com/zilliztech/VectorDBBench"
Expand Down Expand Up @@ -133,6 +134,7 @@ lint.ignore = [
"RUF017",
"C416",
"PLW0603",
"COM812",
]

# Allow autofix for all enabled rules (when `--fix`) is provided.
Expand Down Expand Up @@ -206,4 +208,3 @@ builtins-ignorelist = [
# "dict", # TODO
# "filter",
]

30 changes: 28 additions & 2 deletions vectordb_bench/backend/clients/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,10 @@ class DB(Enum):
AliyunElasticsearch = "AliyunElasticsearch"
Test = "test"
AliyunOpenSearch = "AliyunOpenSearch"
MongoDB = "MongoDB"

@property
def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912
def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912, C901
"""Import while in use"""
if self == DB.Milvus:
from .milvus.milvus import Milvus
Expand Down Expand Up @@ -129,11 +130,21 @@ def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912

return AliyunOpenSearch

if self == DB.MongoDB:
from .mongodb.mongodb import MongoDB

return MongoDB

if self == DB.Test:
from .test.test import Test

return Test

msg = f"Unknown DB: {self.name}"
raise ValueError(msg)

@property
def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912
def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912, C901
"""Import while in use"""
if self == DB.Milvus:
from .milvus.config import MilvusConfig
Expand Down Expand Up @@ -220,6 +231,16 @@ def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912

return AliyunOpenSearchConfig

if self == DB.MongoDB:
from .mongodb.config import MongoDBConfig

return MongoDBConfig

if self == DB.Test:
from .test.config import TestConfig

return TestConfig

msg = f"Unknown DB: {self.name}"
raise ValueError(msg)

Expand Down Expand Up @@ -292,6 +313,11 @@ def case_config_cls( # noqa: PLR0911

return AliyunOpenSearchIndexConfig

if self == DB.MongoDB:
from .mongodb.config import MongoDBIndexConfig

return MongoDBIndexConfig

# DB.Pinecone, DB.Chroma, DB.Redis
return EmptyDBCaseConfig

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -325,10 +325,7 @@ def need_normalize_cosine(self) -> bool:

return False

def optimize(self):
pass

def optimize_with_size(self, data_size: int):
def optimize(self, data_size: int):
log.info(f"optimize count: {data_size}")
retry_times = 0
while True:
Expand All @@ -340,6 +337,3 @@ def optimize_with_size(self, data_size: int):
if total_count == data_size:
log.info("optimize table finish.")
return

def ready_to_load(self):
"""ready_to_load will be called before load in load cases."""
5 changes: 1 addition & 4 deletions vectordb_bench/backend/clients/alloydb/alloydb.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,10 +149,7 @@ def _drop_table(self):
)
self.conn.commit()

def ready_to_load(self):
pass

def optimize(self):
def optimize(self, data_size: int | None = None):
self._post_insert()

def _post_insert(self):
Expand Down
23 changes: 8 additions & 15 deletions vectordb_bench/backend/clients/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,13 @@ def __init__(
@contextmanager
def init(self) -> None:
"""create and destory connections to database.
Why contextmanager:
In multiprocessing search tasks, vectordbbench might init
totally hundreds of thousands of connections with DB server.
Too many connections may drain local FDs or server connection resources.
If the DB client doesn't have `close()` method, just set the object to None.
Examples:
>>> with self.init():
Expand Down Expand Up @@ -187,9 +194,8 @@ def search_embedding(
"""
raise NotImplementedError

# TODO: remove
@abstractmethod
def optimize(self):
def optimize(self, data_size: int | None = None):
"""optimize will be called between insertion and search in performance cases.
Should be blocked until the vectorDB is ready to be tested on
Expand All @@ -199,16 +205,3 @@ def optimize(self):
Optimize's execution time is limited, the limited time is based on cases.
"""
raise NotImplementedError

def optimize_with_size(self, data_size: int):
self.optimize()

# TODO: remove
@abstractmethod
def ready_to_load(self):
"""ready_to_load will be called before load in load cases.
Should be blocked until the vectorDB is ready to be tested on
heavy load cases.
"""
raise NotImplementedError
11 changes: 4 additions & 7 deletions vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,15 +145,15 @@ def search_embedding(
docvalue_fields=[self.id_col_name],
stored_fields="_none_",
)
log.info(f'Search took: {resp["took"]}')
log.info(f'Search shards: {resp["_shards"]}')
log.info(f'Search hits total: {resp["hits"]["total"]}')
log.info(f"Search took: {resp['took']}")
log.info(f"Search shards: {resp['_shards']}")
log.info(f"Search hits total: {resp['hits']['total']}")
return [int(h["fields"][self.id_col_name][0]) for h in resp["hits"]["hits"]]
except Exception as e:
log.warning(f"Failed to search: {self.index_name} error: {e!s}")
raise e from None

def optimize(self):
def optimize(self, data_size: int | None = None):
"""optimize will be called between insertion and search in performance cases."""
# Call refresh first to ensure that all segments are created
self._refresh_index()
Expand Down Expand Up @@ -194,6 +194,3 @@ def _load_graphs_to_memory(self):
log.info("Calling warmup API to load graphs into memory")
warmup_endpoint = f"/_plugins/_knn/warmup/{self.index_name}"
self.client.transport.perform_request("GET", warmup_endpoint)

def ready_to_load(self):
"""ready_to_load will be called before load in load cases."""
5 changes: 1 addition & 4 deletions vectordb_bench/backend/clients/chroma/chroma.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,7 @@ def init(self) -> None:
def ready_to_search(self) -> bool:
pass

def ready_to_load(self) -> bool:
pass

def optimize(self) -> None:
def optimize(self, data_size: int | None = None):
pass

def insert_embeddings(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def search_embedding(
log.warning(f"Failed to search: {self.indice} error: {e!s}")
raise e from None

def optimize(self):
def optimize(self, data_size: int | None = None):
"""optimize will be called between insertion and search in performance cases."""
assert self.client is not None, "should self.init() first"
self.client.indices.refresh(index=self.indice)
Expand All @@ -158,6 +158,3 @@ def optimize(self):
task_status = self.client.tasks.get(task_id=force_merge_task_id)
if task_status["completed"]:
return

def ready_to_load(self):
"""ready_to_load will be called before load in load cases."""
4 changes: 2 additions & 2 deletions vectordb_bench/backend/clients/memorydb/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ class MemoryDBTypedDict(TypedDict):
show_default=True,
default=False,
help=(
"Cluster Mode Disabled (CMD), use this flag when testing locally on a single node instance.",
" In production, MemoryDB only supports cluster mode (CME)",
"Cluster Mode Disabled (CMD), use this flag when testing locally on a single node instance."
" In production, MemoryDB only supports cluster mode (CME)"
),
),
]
Expand Down
7 changes: 2 additions & 5 deletions vectordb_bench/backend/clients/memorydb/memorydb.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,17 +157,14 @@ def init(self) -> Generator[None, None, None]:
self.conn = self.get_client()
search_param = self.case_config.search_param()
if search_param["ef_runtime"]:
self.ef_runtime_str = f'EF_RUNTIME {search_param["ef_runtime"]}'
self.ef_runtime_str = f"EF_RUNTIME {search_param['ef_runtime']}"
else:
self.ef_runtime_str = ""
yield
self.conn.close()
self.conn = None

def ready_to_load(self) -> bool:
pass

def optimize(self) -> None:
def optimize(self, data_size: int | None = None):
self._post_insert()

def insert_embeddings(
Expand Down
21 changes: 1 addition & 20 deletions vectordb_bench/backend/clients/milvus/milvus.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,26 +138,7 @@ def wait_index():
log.warning(f"{self.name} optimize error: {e}")
raise e from None

def ready_to_load(self):
assert self.col, "Please call self.init() before"
self._pre_load(self.col)

def _pre_load(self, coll: Collection):
try:
if not coll.has_index(index_name=self._index_name):
log.info(f"{self.name} create index")
coll.create_index(
self._vector_field,
self.case_config.index_param(),
index_name=self._index_name,
)
coll.load()
log.info(f"{self.name} load")
except Exception as e:
log.warning(f"{self.name} pre load error: {e}")
raise e from None

def optimize(self):
def optimize(self, data_size: int | None = None):
assert self.col, "Please call self.init() before"
self._optimize()

Expand Down
44 changes: 44 additions & 0 deletions vectordb_bench/backend/clients/mongodb/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from pydantic import BaseModel, SecretStr

from ..api import DBCaseConfig, DBConfig, IndexType, MetricType


class MongoDBConfig(DBConfig, BaseModel):
    """Connection settings for a MongoDB benchmark target.

    Attributes:
        connection_string: MongoDB URI held as a pydantic secret so it is
            masked in reprs/logs; the default is a placeholder template.
        database: Name of the database the benchmark operates on.
    """

    # NOTE(review): placeholder URI — <user>/<password>/<cluster_name> must be
    # supplied by the operator; verify against how the CLI populates this.
    connection_string: SecretStr = "mongodb+srv://<user>:<password>@<cluster_name>.heatl.mongodb.net"
    database: str = "vdb_bench"

    def to_dict(self) -> dict:
        """Return a plain dict with the secret URI revealed for client use."""
        revealed_uri = self.connection_string.get_secret_value()
        return {"connection_string": revealed_uri, "database": self.database}


class MongoDBIndexConfig(BaseModel, DBCaseConfig):
    """Index and search parameters for MongoDB Atlas vector search."""

    index: IndexType = IndexType.HNSW  # MongoDB uses HNSW for vector search
    metric_type: MetricType | None = None
    num_candidates: int | None = 1500  # Default numCandidates for vector search
    exact_search: bool = False  # Whether to use exact (ENN) search

    def parse_metric(self) -> str:
        """Map the generic metric type onto the Atlas similarity name.

        Returns "euclidean" for L2, "dotProduct" for IP, and "cosine" for
        anything else (including an unset metric).
        """
        if self.metric_type == MetricType.L2:
            return "euclidean"
        return "dotProduct" if self.metric_type == MetricType.IP else "cosine"

    def index_param(self) -> dict:
        """Build the vectorSearch index definition sent to MongoDB."""
        vector_field = {
            "type": "vector",
            "similarity": self.parse_metric(),
            "numDimensions": None,  # Will be set in MongoDB class
            "path": "vector",  # Vector field name
        }
        return {"type": "vectorSearch", "fields": [vector_field]}

    def search_param(self) -> dict:
        """Return ANN search knobs; numCandidates is None when exact (ENN)."""
        candidate_count = None if self.exact_search else self.num_candidates
        return {"numCandidates": candidate_count, "exact": self.exact_search}
Loading

0 comments on commit 729b5b9

Please sign in to comment.