From 28a501018885d02cd6ddb30ebb19fc1f784b971f Mon Sep 17 00:00:00 2001
From: Amnon Catav
Date: Thu, 26 Oct 2023 15:07:16 +0300
Subject: [PATCH 1/5] fix metadata filtering

---
 src/canopy/knowledge_base/knowledge_base.py    |  2 +-
 .../knowledge_base/test_knowledge_base.py      | 22 +++++++++++++++++---
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/src/canopy/knowledge_base/knowledge_base.py b/src/canopy/knowledge_base/knowledge_base.py
index 4d0c5ae2..5ca38085 100644
--- a/src/canopy/knowledge_base/knowledge_base.py
+++ b/src/canopy/knowledge_base/knowledge_base.py
@@ -446,7 +446,7 @@ def _query_index(self,
             sparse_vector=query.sparse_values,
             top_k=top_k,
             namespace=query.namespace,
-            metadata_filter=metadata_filter,
+            filter=metadata_filter,
             include_metadata=True,
             _check_return_type=_check_return_type,
             **query_params)
diff --git a/tests/system/knowledge_base/test_knowledge_base.py b/tests/system/knowledge_base/test_knowledge_base.py
index 8095239e..643b81b7 100644
--- a/tests/system/knowledge_base/test_knowledge_base.py
+++ b/tests/system/knowledge_base/test_knowledge_base.py
@@ -71,7 +71,7 @@ def knowledge_base(index_full_name, index_name, chunker, encoder):
     kb = KnowledgeBase(index_name=index_name,
                        record_encoder=encoder,
                        chunker=chunker)
-    kb.create_canopy_index()
+    kb.create_canopy_index(indexed_fields=["my-key"])
 
     return kb
 
@@ -139,6 +139,18 @@ def execute_and_assert_queries(knowledge_base, chunks_to_query):
             f"actual: {q_res.documents}"
 
 
+def assert_query_metadata_filter(knowledge_base: KnowledgeBase,
+                                 metadata_filter: dict,
+                                 num_vectors_expected: int,
+                                 top_k: int = 100):
+    assert top_k > num_vectors_expected, \
+        "the test might return a false positive if top_k is not > num_vectors_expected"
+    query = Query(text="test", top_k=top_k, metadata_filter=metadata_filter)
+    query_results = knowledge_base.query([query])
+    assert len(query_results) == 1
+    assert len(query_results[0].documents) == num_vectors_expected
+
+
 @pytest.fixture(scope="module", autouse=True)
 def teardown_knowledge_base(index_full_name, knowledge_base):
     yield
@@ -162,7 +174,7 @@ def documents(random_texts):
     return [Document(id=f"doc_{i}",
                      text=random_texts[i],
                      source=f"source_{i}",
-                     metadata={"test": i})
+                     metadata={"my-key": f"value-{i}"})
             for i in range(5)]
 
 
@@ -170,7 +182,7 @@ def documents_large():
     return [Document(id=f"doc_{i}_large",
                      text=f"Sample document {i}",
-                     metadata={"test": i})
+                     metadata={"my-key-large": f"value-{i}"})
             for i in range(1000)]
 
 
@@ -249,6 +261,10 @@ def test_query(knowledge_base, encoded_chunks):
     execute_and_assert_queries(knowledge_base, encoded_chunks)
 
 
+def test_query_with_metadata_filter(knowledge_base, encoded_chunks):
+    assert_query_metadata_filter(knowledge_base, {"my-key": "value-1"}, 2)
+
+
 def test_delete_documents(knowledge_base, encoded_chunks):
     chunk_ids = [chunk.id for chunk in encoded_chunks[-4:]]
     doc_ids = set(doc.document_id for doc in encoded_chunks[-4:])

From 4630f968ea05a06627091564e9cbf3d6cba0c8a2 Mon Sep 17 00:00:00 2001
From: Amnon Catav
Date: Thu, 26 Oct 2023 15:17:40 +0300
Subject: [PATCH 2/5] allow reinitialize

---
 src/canopy/tokenizer/tokenizer.py                | 2 --
 tests/unit/stubs/stub_tokenizer.py               | 6 +++++-
 tests/unit/tokenizer/test_tokenizer_singleton.py | 7 ++++---
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/canopy/tokenizer/tokenizer.py b/src/canopy/tokenizer/tokenizer.py
index 050c183a..7e9d53f6 100644
--- a/src/canopy/tokenizer/tokenizer.py
+++ b/src/canopy/tokenizer/tokenizer.py
@@ -21,8 +21,6 @@ def __new__(cls):
     @classmethod
     def initialize(cls, tokenizer_class=DEFAULT_TOKENIZER_CLASS, **kwargs):
-        if cls._initialized:
-            raise ValueError("Tokenizer has already been initialized")
         if not issubclass(tokenizer_class, BaseTokenizer):
             raise ValueError("Invalid tokenizer class provided")
         if issubclass(tokenizer_class, Tokenizer):
diff --git a/tests/unit/stubs/stub_tokenizer.py b/tests/unit/stubs/stub_tokenizer.py
index 82c4f6f9..7cf34189 100644
--- a/tests/unit/stubs/stub_tokenizer.py
+++ b/tests/unit/stubs/stub_tokenizer.py
@@ -5,6 +5,9 @@
 
 
 class StubTokenizer(BaseTokenizer):
 
+    def __init__(self, message_overhead: int = 3):
+        self._message_overhead = message_overhead
+
     def tokenize(self, text: str) -> List[str]:
         return text.split()
 
@@ -14,4 +17,5 @@ def detokenize(self, tokens: List[str]) -> str:
         return " ".join(tokens)
 
     def messages_token_count(self, messages: Messages) -> int:
-        return sum(len(self.tokenize(msg.content)) + 3 for msg in messages)
+        return sum(len(self.tokenize(msg.content)) + self._message_overhead
+                   for msg in messages)
diff --git a/tests/unit/tokenizer/test_tokenizer_singleton.py b/tests/unit/tokenizer/test_tokenizer_singleton.py
index 26e3001e..ea69103e 100644
--- a/tests/unit/tokenizer/test_tokenizer_singleton.py
+++ b/tests/unit/tokenizer/test_tokenizer_singleton.py
@@ -19,9 +19,10 @@ def test_tokenizer_init(reset_tokenizer_singleton):
 
 
 def test_tokenizer_init_already_initialized(reset_tokenizer_singleton):
-    Tokenizer.initialize(StubTokenizer)
-    with pytest.raises(ValueError):
-        Tokenizer.initialize(StubTokenizer)
+    Tokenizer.initialize(StubTokenizer, message_overhead=10)
+    assert isinstance(Tokenizer._tokenizer_instance, StubTokenizer)
+    assert Tokenizer._initialized is True
+    assert Tokenizer._tokenizer_instance._message_overhead == 10
 
 
 def test_tokenizer_init_invalid_same_class(reset_tokenizer_singleton):

From 1a980cac7f8cd2743c014ecfc9ee99e74aa00048 Mon Sep 17 00:00:00 2001
From: Amnon Catav
Date: Thu, 26 Oct 2023 16:07:21 +0300
Subject: [PATCH 3/5] add test case

---
 tests/unit/tokenizer/test_tokenizer_singleton.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/tests/unit/tokenizer/test_tokenizer_singleton.py b/tests/unit/tokenizer/test_tokenizer_singleton.py
index ea69103e..4a365679 100644
--- a/tests/unit/tokenizer/test_tokenizer_singleton.py
+++ b/tests/unit/tokenizer/test_tokenizer_singleton.py
@@ -3,6 +3,10 @@
 from ..stubs.stub_tokenizer import StubTokenizer
 
 
+class StubChildTokenizer(StubTokenizer):
+    pass
+
+
 @pytest.fixture
 def reset_tokenizer_singleton():
     before = Tokenizer._tokenizer_instance.__class__
@@ -18,11 +22,21 @@ def test_tokenizer_init(reset_tokenizer_singleton):
     assert Tokenizer._initialized is True
 
 
-def test_tokenizer_init_already_initialized(reset_tokenizer_singleton):
+def test_tokenizer_init_already_initialized_same_class(reset_tokenizer_singleton):
     Tokenizer.initialize(StubTokenizer, message_overhead=10)
+    tokenizer = Tokenizer()
     assert isinstance(Tokenizer._tokenizer_instance, StubTokenizer)
     assert Tokenizer._initialized is True
     assert Tokenizer._tokenizer_instance._message_overhead == 10
+    assert tokenizer._tokenizer_instance._message_overhead == 10
+
+
+def test_tokenizer_init_already_initialized_different_class(reset_tokenizer_singleton):
+    Tokenizer.initialize(StubChildTokenizer, message_overhead=10)
+    tokenizer = Tokenizer()
+    assert isinstance(Tokenizer._tokenizer_instance, StubChildTokenizer)
+    assert Tokenizer._initialized is True
+    assert isinstance(tokenizer._tokenizer_instance, StubChildTokenizer)
 
 
 def test_tokenizer_init_invalid_same_class(reset_tokenizer_singleton):

From 70b0c663c91f57439eb82cedd24fc5e9574fac9d Mon Sep 17 00:00:00 2001
From: Amnon Catav
Date: Thu, 26 Oct 2023 16:18:25 +0300
Subject: [PATCH 4/5] set test as indexed field for e2e test

---
 tests/e2e/test_app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/e2e/test_app.py b/tests/e2e/test_app.py
index 6bcf191c..15ac99b9 100644
--- a/tests/e2e/test_app.py
+++ b/tests/e2e/test_app.py
@@ -51,7 +51,7 @@ def index_name(testrun_uid):
 def knowledge_base(index_name):
     pinecone.init()
     kb = KnowledgeBase(index_name=index_name)
-    kb.create_canopy_index()
+    kb.create_canopy_index(indexed_fields=["test"])
 
     return kb
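
Usage sketch (not part of the patches): a minimal end-to-end flow for the metadata
filtering this series fixes. The import paths for KnowledgeBase and Query are
assumed, since they do not appear in the diffs, and "demo-index" / "my-key" are
placeholder names; the calls themselves mirror the fixtures and tests above. Only
fields passed to create_canopy_index(indexed_fields=...) are indexed for filtering,
and after patch 1 the query's metadata_filter is forwarded to Pinecone as the
`filter` keyword:

    import pinecone

    from canopy.knowledge_base import KnowledgeBase  # import path assumed
    from canopy.models.data_models import Query      # import path assumed

    pinecone.init()
    kb = KnowledgeBase(index_name="demo-index")

    # Declare the filterable metadata fields up front, as the test fixtures above do.
    kb.create_canopy_index(indexed_fields=["my-key"])

    # With the patch-1 fix, metadata_filter reaches the underlying Pinecone index
    # query as `filter=`, so only documents whose metadata matches are returned.
    query = Query(text="test", top_k=10, metadata_filter={"my-key": "value-1"})
    results = kb.query([query])
    print(results[0].documents)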
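
Likewise, a small sketch of the singleton behavior after patches 2 and 3: with the
already-initialized guard removed, Tokenizer.initialize() may be called again and
simply swaps the class-level tokenizer instance. StubTokenizer and its
message_overhead argument come from the unit-test stubs above; the canopy.tokenizer
import path is assumed:

    from canopy.tokenizer import Tokenizer  # import path assumed
    from tests.unit.stubs.stub_tokenizer import StubTokenizer

    Tokenizer.initialize(StubTokenizer, message_overhead=10)
    assert Tokenizer._tokenizer_instance._message_overhead == 10

    # Re-initializing no longer raises ValueError; the shared instance is replaced,
    # and every Tokenizer() handle sees the new configuration.
    Tokenizer.initialize(StubTokenizer, message_overhead=3)
    tokenizer = Tokenizer()
    assert tokenizer._tokenizer_instance._message_overhead == 3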