This repository has been archived by the owner on Nov 13, 2024. It is now read-only.

Commit 9ef6ad2: Merge branch 'dev' into kwargs-context-to-text
acatav authored Oct 26, 2023
2 parents 85cf6c4 + 21db56a
Showing 6 changed files with 45 additions and 12 deletions.
src/canopy/knowledge_base/knowledge_base.py (1 addition, 1 deletion)

@@ -446,7 +446,7 @@ def _query_index(self,
                 sparse_vector=query.sparse_values,
                 top_k=top_k,
                 namespace=query.namespace,
-                metadata_filter=metadata_filter,
+                filter=metadata_filter,
                 include_metadata=True,
                 _check_return_type=_check_return_type,
                 **query_params)
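
For context: `filter` is the keyword the Pinecone client's `Index.query()` accepts for metadata filtering, so after this rename a `Query.metadata_filter` set by the caller actually reaches the index. Below is a minimal usage sketch assembled only from calls that appear elsewhere in this diff; the index name, metadata values, and import paths are assumptions for illustration, not part of the commit.

import pinecone

from canopy.knowledge_base import KnowledgeBase    # assumed import path
from canopy.models.data_models import Query        # assumed import path
from canopy.tokenizer import Tokenizer             # assumed import path

Tokenizer.initialize()                             # Canopy expects the tokenizer singleton to be set up
pinecone.init()                                    # credentials from the environment, as in tests/e2e
kb = KnowledgeBase(index_name="example-index")     # hypothetical index name
kb.create_canopy_index(indexed_fields=["my-key"])  # mirrors the test fixtures in this commit

# _query_index() now forwards metadata_filter as Index.query(filter=...),
# so only vectors whose metadata matches the filter are scored.
query = Query(text="example question", top_k=5, metadata_filter={"my-key": "value-1"})
results = kb.query([query])
print(results[0].documents)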
src/canopy/tokenizer/tokenizer.py (0 additions, 2 deletions)

@@ -21,8 +21,6 @@ def __new__(cls):

     @classmethod
     def initialize(cls, tokenizer_class=DEFAULT_TOKENIZER_CLASS, **kwargs):
-        if cls._initialized:
-            raise ValueError("Tokenizer has already been initialized")
         if not issubclass(tokenizer_class, BaseTokenizer):
             raise ValueError("Invalid tokenizer class provided")
         if issubclass(tokenizer_class, Tokenizer):
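
Removing this guard makes `Tokenizer.initialize()` re-entrant: calling it on an already initialized singleton now swaps the underlying tokenizer instead of raising `ValueError`, which is what the reworked singleton tests at the bottom of this commit verify. A short sketch of the new behavior, assuming the test-only `StubTokenizer` defined in this repository:

from canopy.tokenizer import Tokenizer                      # assumed import path
from tests.unit.stubs.stub_tokenizer import StubTokenizer   # test-only stub from this repo

Tokenizer.initialize(StubTokenizer, message_overhead=10)    # extra kwargs go to the tokenizer class
assert Tokenizer()._tokenizer_instance._message_overhead == 10

# Before this commit the second call raised ValueError("Tokenizer has already been initialized");
# now it simply rebuilds the singleton's inner tokenizer with the new arguments.
Tokenizer.initialize(StubTokenizer, message_overhead=5)
assert Tokenizer()._tokenizer_instance._message_overhead == 5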
tests/e2e/test_app.py (1 addition, 1 deletion)

@@ -51,7 +51,7 @@ def index_name(testrun_uid):
 def knowledge_base(index_name):
     pinecone.init()
     kb = KnowledgeBase(index_name=index_name)
-    kb.create_canopy_index()
+    kb.create_canopy_index(indexed_fields=["test"])

     return kb

tests/system/knowledge_base/test_knowledge_base.py (19 additions, 3 deletions)

@@ -71,7 +71,7 @@ def knowledge_base(index_full_name, index_name, chunker, encoder):
     kb = KnowledgeBase(index_name=index_name,
                        record_encoder=encoder,
                        chunker=chunker)
-    kb.create_canopy_index()
+    kb.create_canopy_index(indexed_fields=["my-key"])

     return kb

@@ -139,6 +139,18 @@ def execute_and_assert_queries(knowledge_base, chunks_to_query):
                                               f"actual: {q_res.documents}"


+def assert_query_metadata_filter(knowledge_base: KnowledgeBase,
+                                 metadata_filter: dict,
+                                 num_vectors_expected: int,
+                                 top_k: int = 100):
+    assert top_k > num_vectors_expected, \
+        "the test might return false positive if top_k is not > num_vectors_expected"
+    query = Query(text="test", top_k=top_k, metadata_filter=metadata_filter)
+    query_results = knowledge_base.query([query])
+    assert len(query_results) == 1
+    assert len(query_results[0].documents) == num_vectors_expected
+
+
 @pytest.fixture(scope="module", autouse=True)
 def teardown_knowledge_base(index_full_name, knowledge_base):
     yield

@@ -162,15 +174,15 @@ def documents(random_texts):
     return [Document(id=f"doc_{i}",
                      text=random_texts[i],
                      source=f"source_{i}",
-                     metadata={"test": i})
+                     metadata={"my-key": f"value-{i}"})
             for i in range(5)]


 @pytest.fixture
 def documents_large():
     return [Document(id=f"doc_{i}_large",
                      text=f"Sample document {i}",
-                     metadata={"test": i})
+                     metadata={"my-key-large": f"value-{i}"})
             for i in range(1000)]


@@ -249,6 +261,10 @@ def test_query(knowledge_base, encoded_chunks):
     execute_and_assert_queries(knowledge_base, encoded_chunks)


+def test_query_with_metadata_filter(knowledge_base, encoded_chunks):
+    assert_query_metadata_filter(knowledge_base, {"my-key": "value-1"}, 2)
+
+
 def test_delete_documents(knowledge_base, encoded_chunks):
     chunk_ids = [chunk.id for chunk in encoded_chunks[-4:]]
     doc_ids = set(doc.document_id for doc in encoded_chunks[-4:])
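
The new helper and test above exercise only a plain equality filter. Since `Query.metadata_filter` is handed to Pinecone unchanged (see the `filter=` fix in knowledge_base.py above), the richer Pinecone filter operators should pass through the same way; the examples below are a hedged illustration of that, not something these tests assert, and the import path is assumed.

from canopy.models.data_models import Query   # assumed import path

equality   = {"my-key": "value-1"}                        # shorthand for {"my-key": {"$eq": "value-1"}}
membership = {"my-key": {"$in": ["value-1", "value-3"]}}  # match any of several values
combined   = {"$and": [{"my-key": {"$ne": "value-2"}},    # boolean combination of conditions
                       {"my-key": {"$in": ["value-0", "value-1", "value-3"]}}]}

query = Query(text="test", top_k=100, metadata_filter=membership)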
tests/unit/stubs/stub_tokenizer.py (5 additions, 1 deletion)

@@ -5,6 +5,9 @@


 class StubTokenizer(BaseTokenizer):

+    def __init__(self, message_overhead: int = 3):
+        self._message_overhead = message_overhead
+
     def tokenize(self, text: str) -> List[str]:
         return text.split()

@@ -14,4 +17,5 @@ def detokenize(self, tokens: List[str]) -> str:
         return " ".join(tokens)

     def messages_token_count(self, messages: Messages) -> int:
-        return sum(len(self.tokenize(msg.content)) + 3 for msg in messages)
+        return sum(len(self.tokenize(msg.content)) + self._message_overhead
+                   for msg in messages)
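
The hard-coded per-message overhead of 3 tokens is now a constructor argument, which is what lets the singleton tests below pass `message_overhead=10` through `Tokenizer.initialize()`. A small self-contained check of the arithmetic, using a hypothetical stand-in for Canopy's message model since only `.content` is read:

from dataclasses import dataclass

from tests.unit.stubs.stub_tokenizer import StubTokenizer   # test-only stub from this repo

@dataclass
class _Msg:            # hypothetical stand-in; StubTokenizer only reads msg.content
    content: str

messages = [_Msg("hello world"), _Msg("three tokens here")]   # 2 and 3 whitespace-split tokens
assert StubTokenizer().messages_token_count(messages) == (2 + 3) + (3 + 3)                       # default overhead 3
assert StubTokenizer(message_overhead=10).messages_token_count(messages) == (2 + 10) + (3 + 10)  # custom overhead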
tests/unit/tokenizer/test_tokenizer_singleton.py (19 additions, 4 deletions)

@@ -3,6 +3,10 @@
 from ..stubs.stub_tokenizer import StubTokenizer


+class StubChildTokenizer(StubTokenizer):
+    pass
+
+
 @pytest.fixture
 def reset_tokenizer_singleton():
     before = Tokenizer._tokenizer_instance.__class__

@@ -18,10 +22,21 @@ def test_tokenizer_init(reset_tokenizer_singleton):
     assert Tokenizer._initialized is True


-def test_tokenizer_init_already_initialized(reset_tokenizer_singleton):
-    Tokenizer.initialize(StubTokenizer)
-    with pytest.raises(ValueError):
-        Tokenizer.initialize(StubTokenizer)
+def test_tokenizer_init_already_initialized_same_class(reset_tokenizer_singleton):
+    Tokenizer.initialize(StubTokenizer, message_overhead=10)
+    tokenizer = Tokenizer()
+    assert isinstance(Tokenizer._tokenizer_instance, StubTokenizer)
+    assert Tokenizer._initialized is True
+    assert Tokenizer._tokenizer_instance._message_overhead == 10
+    assert tokenizer._tokenizer_instance._message_overhead == 10
+
+
+def test_tokenizer_init_already_initialized_different_class(reset_tokenizer_singleton):
+    Tokenizer.initialize(StubChildTokenizer, message_overhead=10)
+    tokenizer = Tokenizer()
+    assert isinstance(Tokenizer._tokenizer_instance, StubChildTokenizer)
+    assert Tokenizer._initialized is True
+    assert isinstance(tokenizer._tokenizer_instance, StubChildTokenizer)


 def test_tokenizer_init_invalid_same_class(reset_tokenizer_singleton):
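
The validation that survives in `Tokenizer.initialize()` still rejects unusable classes, so only the re-initialization behavior changes here. A hedged sketch based on the check shown in tokenizer.py above (the class name is hypothetical, and the import path is assumed):

import pytest

from canopy.tokenizer import Tokenizer   # assumed import path

class NotATokenizer:                      # hypothetical class that is not a BaseTokenizer subclass
    pass

with pytest.raises(ValueError):           # "Invalid tokenizer class provided"
    Tokenizer.initialize(NotATokenizer)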
