Skip to content

Commit

Permalink
sync fork (#7)
Browse files Browse the repository at this point in the history
* fix: Unable to run vectordbbench and cli

fix: remove comma of logging str
fix cli unable to run zilliztech#444

Signed-off-by: yangxuan <[email protected]>

* enhance: Unify optimize and remove ready_to_load

PyMilvus used to be the only client that used ready_to_load.
Now it'll load the collection when creating it, so
this PR removes `ready_to_load` from the client.API

Also this PR enhances optimize and removes the optimize_with_size

Signed-off-by: yangxuan <[email protected]>

* add mongodb client

Signed-off-by: zhuwenxing <[email protected]>

* add mongodb client in readme

Signed-off-by: zhuwenxing <[email protected]>

---------

Signed-off-by: yangxuan <[email protected]>
Signed-off-by: zhuwenxing <[email protected]>
Co-authored-by: yangxuan <[email protected]>
Co-authored-by: zhuwenxing <[email protected]>
  • Loading branch information
3 people authored Jan 16, 2025
1 parent 5ebc3e3 commit 729b5b9
Show file tree
Hide file tree
Showing 31 changed files with 383 additions and 260 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ All the database client supported
| chromadb | `pip install vectordb-bench[chromadb]` |
| awsopensearch | `pip install vectordb-bench[opensearch]` |
| aliyun_opensearch | `pip install vectordb-bench[aliyun_opensearch]` |
| mongodb | `pip install vectordb-bench[mongodb]` |

### Run

Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ memorydb = [ "memorydb" ]
chromadb = [ "chromadb" ]
opensearch = [ "opensearch-py" ]
aliyun_opensearch = [ "alibabacloud_ha3engine_vector", "alibabacloud_searchengine20211025"]
mongodb = [ "pymongo" ]

[project.urls]
"repository" = "https://github.com/zilliztech/VectorDBBench"
Expand Down Expand Up @@ -133,6 +134,7 @@ lint.ignore = [
"RUF017",
"C416",
"PLW0603",
"COM812",
]

# Allow autofix for all enabled rules (when `--fix`) is provided.
Expand Down Expand Up @@ -206,4 +208,3 @@ builtins-ignorelist = [
# "dict", # TODO
# "filter",
]

30 changes: 28 additions & 2 deletions vectordb_bench/backend/clients/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,10 @@ class DB(Enum):
AliyunElasticsearch = "AliyunElasticsearch"
Test = "test"
AliyunOpenSearch = "AliyunOpenSearch"
MongoDB = "MongoDB"

@property
def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912
def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912, C901
"""Import while in use"""
if self == DB.Milvus:
from .milvus.milvus import Milvus
Expand Down Expand Up @@ -129,11 +130,21 @@ def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912

return AliyunOpenSearch

if self == DB.MongoDB:
from .mongodb.mongodb import MongoDB

return MongoDB

if self == DB.Test:
from .test.test import Test

return Test

msg = f"Unknown DB: {self.name}"
raise ValueError(msg)

@property
def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912
def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912, C901
"""Import while in use"""
if self == DB.Milvus:
from .milvus.config import MilvusConfig
Expand Down Expand Up @@ -220,6 +231,16 @@ def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912

return AliyunOpenSearchConfig

if self == DB.MongoDB:
from .mongodb.config import MongoDBConfig

return MongoDBConfig

if self == DB.Test:
from .test.config import TestConfig

return TestConfig

msg = f"Unknown DB: {self.name}"
raise ValueError(msg)

Expand Down Expand Up @@ -292,6 +313,11 @@ def case_config_cls( # noqa: PLR0911

return AliyunOpenSearchIndexConfig

if self == DB.MongoDB:
from .mongodb.config import MongoDBIndexConfig

return MongoDBIndexConfig

# DB.Pinecone, DB.Chroma, DB.Redis
return EmptyDBCaseConfig

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -325,10 +325,7 @@ def need_normalize_cosine(self) -> bool:

return False

def optimize(self):
pass

def optimize_with_size(self, data_size: int):
def optimize(self, data_size: int):
log.info(f"optimize count: {data_size}")
retry_times = 0
while True:
Expand All @@ -340,6 +337,3 @@ def optimize_with_size(self, data_size: int):
if total_count == data_size:
log.info("optimize table finish.")
return

def ready_to_load(self):
"""ready_to_load will be called before load in load cases."""
5 changes: 1 addition & 4 deletions vectordb_bench/backend/clients/alloydb/alloydb.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,10 +149,7 @@ def _drop_table(self):
)
self.conn.commit()

def ready_to_load(self):
pass

def optimize(self):
def optimize(self, data_size: int | None = None):
self._post_insert()

def _post_insert(self):
Expand Down
23 changes: 8 additions & 15 deletions vectordb_bench/backend/clients/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,13 @@ def __init__(
@contextmanager
def init(self) -> None:
"""create and destory connections to database.
Why contextmanager:
In multiprocessing search tasks, vectordbbench might init
totally hundreds of thousands of connections with DB server.
Too many connections may drain local FDs or server connection resources.
If the DB client doesn't have `close()` method, just set the object to None.
Examples:
>>> with self.init():
Expand Down Expand Up @@ -187,9 +194,8 @@ def search_embedding(
"""
raise NotImplementedError

# TODO: remove
@abstractmethod
def optimize(self):
def optimize(self, data_size: int | None = None):
"""optimize will be called between insertion and search in performance cases.
Should be blocked until the vectorDB is ready to be tested on
Expand All @@ -199,16 +205,3 @@ def optimize(self):
Optimize's execution time is limited, the limited time is based on cases.
"""
raise NotImplementedError

def optimize_with_size(self, data_size: int):
self.optimize()

# TODO: remove
@abstractmethod
def ready_to_load(self):
"""ready_to_load will be called before load in load cases.
Should be blocked until the vectorDB is ready to be tested on
heavy load cases.
"""
raise NotImplementedError
11 changes: 4 additions & 7 deletions vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,15 +145,15 @@ def search_embedding(
docvalue_fields=[self.id_col_name],
stored_fields="_none_",
)
log.info(f'Search took: {resp["took"]}')
log.info(f'Search shards: {resp["_shards"]}')
log.info(f'Search hits total: {resp["hits"]["total"]}')
log.info(f"Search took: {resp['took']}")
log.info(f"Search shards: {resp['_shards']}")
log.info(f"Search hits total: {resp['hits']['total']}")
return [int(h["fields"][self.id_col_name][0]) for h in resp["hits"]["hits"]]
except Exception as e:
log.warning(f"Failed to search: {self.index_name} error: {e!s}")
raise e from None

def optimize(self):
def optimize(self, data_size: int | None = None):
"""optimize will be called between insertion and search in performance cases."""
# Call refresh first to ensure that all segments are created
self._refresh_index()
Expand Down Expand Up @@ -194,6 +194,3 @@ def _load_graphs_to_memory(self):
log.info("Calling warmup API to load graphs into memory")
warmup_endpoint = f"/_plugins/_knn/warmup/{self.index_name}"
self.client.transport.perform_request("GET", warmup_endpoint)

def ready_to_load(self):
"""ready_to_load will be called before load in load cases."""
5 changes: 1 addition & 4 deletions vectordb_bench/backend/clients/chroma/chroma.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,7 @@ def init(self) -> None:
def ready_to_search(self) -> bool:
pass

def ready_to_load(self) -> bool:
pass

def optimize(self) -> None:
def optimize(self, data_size: int | None = None):
pass

def insert_embeddings(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def search_embedding(
log.warning(f"Failed to search: {self.indice} error: {e!s}")
raise e from None

def optimize(self):
def optimize(self, data_size: int | None = None):
"""optimize will be called between insertion and search in performance cases."""
assert self.client is not None, "should self.init() first"
self.client.indices.refresh(index=self.indice)
Expand All @@ -158,6 +158,3 @@ def optimize(self):
task_status = self.client.tasks.get(task_id=force_merge_task_id)
if task_status["completed"]:
return

def ready_to_load(self):
"""ready_to_load will be called before load in load cases."""
4 changes: 2 additions & 2 deletions vectordb_bench/backend/clients/memorydb/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ class MemoryDBTypedDict(TypedDict):
show_default=True,
default=False,
help=(
"Cluster Mode Disabled (CMD), use this flag when testing locally on a single node instance.",
" In production, MemoryDB only supports cluster mode (CME)",
"Cluster Mode Disabled (CMD), use this flag when testing locally on a single node instance."
" In production, MemoryDB only supports cluster mode (CME)"
),
),
]
Expand Down
7 changes: 2 additions & 5 deletions vectordb_bench/backend/clients/memorydb/memorydb.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,17 +157,14 @@ def init(self) -> Generator[None, None, None]:
self.conn = self.get_client()
search_param = self.case_config.search_param()
if search_param["ef_runtime"]:
self.ef_runtime_str = f'EF_RUNTIME {search_param["ef_runtime"]}'
self.ef_runtime_str = f"EF_RUNTIME {search_param['ef_runtime']}"
else:
self.ef_runtime_str = ""
yield
self.conn.close()
self.conn = None

def ready_to_load(self) -> bool:
pass

def optimize(self) -> None:
def optimize(self, data_size: int | None = None):
self._post_insert()

def insert_embeddings(
Expand Down
21 changes: 1 addition & 20 deletions vectordb_bench/backend/clients/milvus/milvus.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,26 +138,7 @@ def wait_index():
log.warning(f"{self.name} optimize error: {e}")
raise e from None

def ready_to_load(self):
assert self.col, "Please call self.init() before"
self._pre_load(self.col)

def _pre_load(self, coll: Collection):
try:
if not coll.has_index(index_name=self._index_name):
log.info(f"{self.name} create index")
coll.create_index(
self._vector_field,
self.case_config.index_param(),
index_name=self._index_name,
)
coll.load()
log.info(f"{self.name} load")
except Exception as e:
log.warning(f"{self.name} pre load error: {e}")
raise e from None

def optimize(self):
def optimize(self, data_size: int | None = None):
assert self.col, "Please call self.init() before"
self._optimize()

Expand Down
44 changes: 44 additions & 0 deletions vectordb_bench/backend/clients/mongodb/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from pydantic import BaseModel, SecretStr

from ..api import DBCaseConfig, DBConfig, IndexType, MetricType


class MongoDBConfig(DBConfig, BaseModel):
    """Connection settings for a MongoDB benchmark target.

    Attributes:
        connection_string: MongoDB URI held as a pydantic secret so it is
            masked in reprs/logs; the default is a placeholder template.
        database: Name of the database the benchmark operates on.
    """

    # NOTE(review): placeholder URI — <user>/<password>/<cluster_name> must be
    # supplied by the operator; verify against how the CLI populates this.
    connection_string: SecretStr = "mongodb+srv://<user>:<password>@<cluster_name>.heatl.mongodb.net"
    database: str = "vdb_bench"

    def to_dict(self) -> dict:
        """Return a plain dict with the secret URI revealed for client use."""
        revealed_uri = self.connection_string.get_secret_value()
        return {"connection_string": revealed_uri, "database": self.database}


class MongoDBIndexConfig(BaseModel, DBCaseConfig):
    """Index and search parameters for MongoDB Atlas vector search."""

    index: IndexType = IndexType.HNSW  # MongoDB uses HNSW for vector search
    metric_type: MetricType | None = None
    num_candidates: int | None = 1500  # Default numCandidates for vector search
    exact_search: bool = False  # Whether to use exact (ENN) search

    def parse_metric(self) -> str:
        """Map the generic metric type onto the Atlas similarity name.

        Returns "euclidean" for L2, "dotProduct" for IP, and "cosine" for
        anything else (including an unset metric).
        """
        if self.metric_type == MetricType.L2:
            return "euclidean"
        return "dotProduct" if self.metric_type == MetricType.IP else "cosine"

    def index_param(self) -> dict:
        """Build the vectorSearch index definition sent to MongoDB."""
        vector_field = {
            "type": "vector",
            "similarity": self.parse_metric(),
            "numDimensions": None,  # Will be set in MongoDB class
            "path": "vector",  # Vector field name
        }
        return {"type": "vectorSearch", "fields": [vector_field]}

    def search_param(self) -> dict:
        """Return ANN search knobs; numCandidates is None when exact (ENN)."""
        candidate_count = None if self.exact_search else self.num_candidates
        return {"numCandidates": candidate_count, "exact": self.exact_search}
Loading

0 comments on commit 729b5b9

Please sign in to comment.