This repository has been archived by the owner on Nov 13, 2024. It is now read-only.

Kb docstrings #126

Merged · 22 commits · Nov 1, 2023
28 changes: 27 additions & 1 deletion src/canopy/knowledge_base/chunker/base.py
@@ -9,10 +9,25 @@
class Chunker(ABC, ConfigurableMixin):

"""
BaseChunker is an abstract class that defines the interface for a chunker.
Base class for chunkers. Chunkers take a document (id, text, ...)
and return a list of KBDocChunks (id, text, document_id, ...)
Chunker is an abstract class that must be subclassed to be used,
also, it extends ConfigurableMixin which means that every subclass of
Chunker could be referenced by a name and configured in a config file.
"""

def chunk_documents(self, documents: List[Document]) -> List[KBDocChunk]:
"""
chunk_documents takes a list of documents and returns a list of KBDocChunks
this method is just a wrapper around chunk_single_document that can be
used to chunk a list of documents.

Args:
documents: list of documents

Returns:
chunks: list of chunks of type KBDocChunks
"""
chunks: List[KBDocChunk] = []
for doc in documents:
chunks.extend(self.chunk_single_document(doc))
Expand All @@ -26,6 +41,17 @@ async def achunk_documents(self, documents: List[Document]) -> List[KBDocChunk]:

@abstractmethod
def chunk_single_document(self, document: Document) -> List[KBDocChunk]:
"""
chunk_single_document takes a document and returns a
list of KBDocChunks, this is the main method
that must be implemented by every subclass of Chunker

Args:
document: list of documents

Returns:
chunks: list of chunks KBDocChunks
"""
pass

@abstractmethod
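For orientation, here is a minimal, hypothetical Chunker subclass that satisfies the interface described above. It is a sketch, not part of this PR: the source/metadata fields and the async counterpart are assumptions based on the surrounding code.

from typing import List

from canopy.knowledge_base.chunker.base import Chunker
from canopy.knowledge_base.models import KBDocChunk
from canopy.models.data_models import Document


class ParagraphChunker(Chunker):
    """Hypothetical example: split a document on blank lines."""

    def chunk_single_document(self, document: Document) -> List[KBDocChunk]:
        paragraphs = [p.strip() for p in document.text.split("\n\n") if p.strip()]
        # The "<document_id>_<i>" id scheme mirrors RecursiveCharacterChunker below.
        return [KBDocChunk(id=f"{document.id}_{i}",
                           document_id=document.id,
                           text=paragraph,
                           source=document.source,       # field assumed from Document
                           metadata=document.metadata)   # field assumed from Document
                for i, paragraph in enumerate(paragraphs)]

    async def achunk_single_document(self, document: Document) -> List[KBDocChunk]:
        # Assumed async counterpart, hinted at by achunk_documents in the base class.
        return self.chunk_single_document(document)

chunk_documents then handles iterating over a list of documents, exactly as the wrapper above describes.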
15 changes: 15 additions & 0 deletions src/canopy/knowledge_base/chunker/markdown.py
@@ -7,12 +7,27 @@


class MarkdownChunker(RecursiveCharacterChunker):
"""
MarkdownChunker is a subclass of RecursiveCharacterChunker that is configured
to chunk markdown documents. It uses RecursiveCharacterTextSplitter to split
the text of the document into chunks, by providing the separators for markdown documents
(also from LangChainTextSplitter, with modifications)
""" # noqa: E501

def __init__(self,
chunk_size: int = 256,
chunk_overlap: int = 0,
keep_separator: bool = True
):
"""
Iniitalizes RecursiveCharacterChunker with the separators for markdown documents.

Args:
chunk_size: size of the chunks. Defaults to 256 tokens.
chunk_overlap: overlap between chunks. Defaults to 0.
keep_separator: whether to keep the separator in the chunk. Defaults to True.

""" # noqa: E501
separators = RecursiveCharacterTextSplitter.get_separators_for_language(
Language.MARKDOWN
)
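A quick usage sketch, purely illustrative: it assumes the global Tokenizer has been initialized beforehand (the underlying chunker constructs one), and that Document requires only id and text.

from canopy.knowledge_base.chunker.markdown import MarkdownChunker
from canopy.models.data_models import Document

chunker = MarkdownChunker(chunk_size=256, chunk_overlap=0, keep_separator=True)
doc = Document(id="readme", text="# Title\n\nIntro paragraph.\n\n## Section\n\nMore text.")
chunks = chunker.chunk_single_document(doc)
for chunk in chunks:
    print(chunk.id, repr(chunk.text[:40]))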
25 changes: 25 additions & 0 deletions src/canopy/knowledge_base/chunker/recursive_character.py
@@ -10,13 +10,28 @@


class RecursiveCharacterChunker(Chunker):
"""
A chunker that splits a document into chunks of a given size, using a recursive character splitter.
A RecursiveCharacterChunker is a derived class of Chunker, which means that it can be referenced by a name
and configured in a config file.
""" # noqa: E501

def __init__(self,
chunk_size: int = 256,
chunk_overlap: int = 0,
separators: Optional[List[str]] = None,
keep_separator: bool = True,
):
"""
RecursiveCharacterTextSplitter is a text splitter from the langchain library.
It splits a text into chunks of a given size, using a recursive character splitter.

Args:
chunk_size: size of the chunks, in tokens
chunk_overlap: overlap between chunks
separators: list of separators to use for splitting the text
keep_separator: whether to keep the separator in the chunk or not
""" # noqa: E501
self._tokenizer = Tokenizer()
self._chunker = RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
@@ -26,6 +41,16 @@ def __init__(self,
keep_separator=keep_separator)

def chunk_single_document(self, document: Document) -> List[KBDocChunk]:
"""
using the RecursiveCharacterTextSplitter, this method takes a document and returns a list of KBDocChunks
Args:
document: document to be chunked

Returns:
chunks: list of chunks KBDocChunks from the document, where text is splitted
evenly using the RecursiveCharacterTextSplitter
""" # noqa: E501
# TODO: check overlap not bigger than max_chunk_size
text_chunks = self._chunker.split_text(document.text)
return [KBDocChunk(id=f"{document.id}_{i}",
document_id=document.id,
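For illustration, a hedged sketch of configuring the chunker directly with custom separators (again assuming the global Tokenizer is already initialized):

from canopy.knowledge_base.chunker.recursive_character import RecursiveCharacterChunker
from canopy.models.data_models import Document

chunker = RecursiveCharacterChunker(
    chunk_size=128,
    chunk_overlap=16,
    separators=["\n\n", "\n", " "],  # try paragraphs first, then lines, then words
)
doc = Document(id="doc1", text="A long document text...")
chunks = chunker.chunk_single_document(doc)
# Each chunk id follows the "<document_id>_<i>" scheme shown in the diff above.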
27 changes: 27 additions & 0 deletions src/canopy/knowledge_base/chunker/token_chunker.py
@@ -7,10 +7,25 @@


class TokenChunker(Chunker):
"""
Simple chunker that splits a document into chunks (group of tokens) of a given size, using a tokenizer.
A TokenChunker is a derived class of Chunker, which means that it can be referenced by a name
and configured in a config file.
""" # noqa: E501

def __init__(self,
max_chunk_size: int = 256,
overlap: int = 30, ):
"""
Using the global tokenizer, will set the class parameters for the TokenChunker.
will check overlap and max_chunk_size.

Args:
max_chunk_size: size of the chunks, in tokens
overlap: overlap between chunks, in tokens
""" # noqa: E501

# TODO: should add check for overlap not bigger than max_chunk_size
if overlap < 0:
cls_name = self.__class__.__name__
raise ValueError(
@@ -28,6 +43,18 @@ def __init__(self,
self._overlap = overlap

def chunk_single_document(self, document: Document) -> List[KBDocChunk]:
"""
This methods takes a document and returns a list of KBDocChunks, where text is splitted
evenly using the tokenizer. Firts the text is tokenized, then the tokens are splitted into chunks
of a given size, with overlap between chunks.
Last chunk is handled such that if the last chunk is smaller than the overlap, it will be removed.

Args:
document: document to be chunked

Returns:
text_chunks: list of chunks KBDocChunks from the document
""" # noqa: E501
tokens = self._tokenizer.tokenize(document.text)
token_chunks = [tokens[i:i + self._chunk_size]
for i in range(0, len(tokens),
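To make the windowing concrete, here is a standalone sketch of the splitting logic the docstring describes, mirroring the list comprehension above; the stride is assumed to be max_chunk_size - overlap.

tokens = list(range(10))           # stand-in for a tokenized document
max_chunk_size, overlap = 4, 1
step = max_chunk_size - overlap    # each window starts 3 tokens after the previous one
token_chunks = [tokens[i:i + max_chunk_size] for i in range(0, len(tokens), step)]
print(token_chunks)  # [[0, 1, 2, 3], [3, 4, 5, 6], [6, 7, 8, 9], [9]]
# Per the docstring, a trailing chunk smaller than the overlap would then be dropped.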
6 changes: 3 additions & 3 deletions src/canopy/knowledge_base/models.py
@@ -6,9 +6,9 @@

from canopy.models.data_models import Document, Query

# TODO 1: consider moving this to pinecone-text
# TODO 2: consider renaming to "Vector" or "DenseVector"
# TODO 3: consider supporting `np.ndarray`
# TODO: (1) consider moving this to pinecone-text
# TODO: (2) consider renaming to "Vector" or "DenseVector"
# TODO: (3) consider supporting `np.ndarray`
VectorValues = List[float]


66 changes: 60 additions & 6 deletions src/canopy/knowledge_base/record_encoder/base.py
@@ -8,9 +8,20 @@

class RecordEncoder(ABC, ConfigurableMixin):
"""
Base class for all encoders. Encoders are used to encode documents' and queries'
text into vectors.
"""
Base class for RecordEncoders. Encodes document chunks and queries to vector representations.
The vector representation may include both dense and sparse values.
Dense values are usually generated by an embedding model, and sparse values usually represent weighted keyword counts.

The RecordEncoder implements separate functions for the encoding of documents and queries.
Some implementations of both sparse and dense encoding are not symmetrical. For example, BM25 sparse
encoders and instruction dense encoders.

Any class the extends RecordEncoder must implement the method responsible for the encoding of a single documents \ queries batch:
- _encode_documents_batch
- _encode_queries_batch

Async encoders are still not supported, but will be added in the future.
""" # noqa: E501

def __init__(self, batch_size: int = 1):
"""
@@ -19,17 +30,38 @@ def __init__(self, batch_size: int = 1):
Args:
batch_size: The number of documents or queries to encode at once.
Defaults to 1.
"""
""" # noqa: E501
self.batch_size = batch_size

# TODO: rename documents to doc_chunks or chunks
@abstractmethod
def _encode_documents_batch(self,
documents: List[KBDocChunk]
) -> List[KBEncodedDocChunk]:
"""
Abstract method for encoding a batch of documents, takes a list of KBDocChunk and returns a list of KBEncodedDocChunk.
For maximal performance, and derived class should try to operate on the entire documents batch in a single operation.

Args:
documents: A list of KBDocChunk to encode.

Returns:
encoded chunks: A list of KBEncodedDocChunk.
""" # noqa: E501
pass

@abstractmethod
def _encode_queries_batch(self, queries: List[Query]) -> List[KBQuery]:
"""
Abstract method for encoding a batch of queries, takes a list of Query and returns a list of KBQuery.
For maximal performance, and derived class should try to operate on the entire batch in a single operation.

Args:
queries: A list of `Query` objects to encode.

Returns:
encoded queries: A list of KBQuery.
""" # noqa: E501
pass

@abstractmethod
@@ -51,17 +83,39 @@ def dimension(self) -> Optional[int]:
"""
Returns:
The dimension of the dense vectors produced by the encoder, if applicable.
"""
""" # noqa: E501
return None

def encode_documents(self, documents: List[KBDocChunk]) -> List[KBEncodedDocChunk]:
"""

Encode documents in batches. Will iterate over batch of documents and encode them using the _encode_documents_batch method.

Args:
documents: A list of KBDocChunk to encode.

Returns:
encoded chunks: A list of KBEncodedDocChunk.

""" # noqa: E501
encoded_docs = []
for batch in self._batch_iterator(documents, self.batch_size):
encoded_docs.extend(self._encode_documents_batch(batch))

return encoded_docs
return encoded_docs # TODO: consider yielding a generator

def encode_queries(self, queries: List[Query]) -> List[KBQuery]:
"""

Encode queries in batches. Will iterate over batch of queries and encode them using the _encode_queries_batch method.

Args:
queries: A list of Query to encode.

Returns:
encoded queries: A list of KBQuery.
""" # noqa: E501

kb_queries = []
for batch in self._batch_iterator(queries, self.batch_size):
kb_queries.extend(self._encode_queries_batch(batch))
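To illustrate the contract, here is a hypothetical toy subclass. It is a sketch under the assumption that _encode_documents_batch, _encode_queries_batch, and their async counterparts are the abstract methods to fill in; the character-sum "embedding" is purely illustrative.

from typing import List

from canopy.knowledge_base.models import KBDocChunk, KBEncodedDocChunk, KBQuery
from canopy.knowledge_base.record_encoder.base import RecordEncoder
from canopy.models.data_models import Query


class ToyRecordEncoder(RecordEncoder):
    """Hypothetical example: a fake 8-dimensional 'embedding' from character codes."""

    def _embed(self, text: str) -> List[float]:
        vec = [0.0] * 8
        for i, ch in enumerate(text):
            vec[i % 8] += ord(ch) / 1000.0
        return vec

    def _encode_documents_batch(self,
                                documents: List[KBDocChunk]
                                ) -> List[KBEncodedDocChunk]:
        # Same construction pattern as DenseRecordEncoder below.
        return [KBEncodedDocChunk(**d.dict(), values=self._embed(d.text))
                for d in documents]

    def _encode_queries_batch(self, queries: List[Query]) -> List[KBQuery]:
        return [KBQuery(**q.dict(), values=self._embed(q.text)) for q in queries]

    async def _aencode_documents_batch(self,
                                       documents: List[KBDocChunk]
                                       ) -> List[KBEncodedDocChunk]:
        # Assumed async counterpart, as seen in the encoders below.
        return self._encode_documents_batch(documents)

    async def _aencode_queries_batch(self, queries: List[Query]) -> List[KBQuery]:
        return self._encode_queries_batch(queries)

encode_documents and encode_queries then handle the batching, as the wrapper methods above show.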
36 changes: 36 additions & 0 deletions src/canopy/knowledge_base/record_encoder/dense.py
@@ -8,26 +8,62 @@


class DenseRecordEncoder(RecordEncoder):
"""
DenseRecordEncoder is a subclass of RecordEncoder that generates dense vector representation of documents chunks and textual queries.
The dense represntation generated by the `DenseRecordEncoder` is a list of floats in a given dimension.
DenseRecordEncoder wraps a BaseDenseEncoder from the `pinecone-text` library to encode the text itself.
for more information about the BaseDenseEncoder see: https://github.com/pinecone-io/pinecone-text
""" # noqa: E501

def __init__(self,
dense_encoder: BaseDenseEncoder,
**kwargs):
"""
Initialize the encoder.

Args:
dense_encoder: A BaseDenseEncoder to encode the text.
**kwargs: Additional arguments to pass to the RecordEncoder.
""" # noqa: E501
super().__init__(**kwargs)
self._dense_encoder = dense_encoder

def _encode_documents_batch(self,
documents: List[KBDocChunk]
) -> List[KBEncodedDocChunk]:
"""
Encode a batch of documents, takes a list of KBDocChunk and returns a list of KBEncodedDocChunk.

Args:
documents: A list of KBDocChunk to encode.
Returns:
encoded chunks: A list of KBEncodedDocChunk, with the `values` field populated by the generated embeddings vector.
""" # noqa: E501
dense_values = self._dense_encoder.encode_documents([d.text for d in documents])
return [KBEncodedDocChunk(**d.dict(), values=v) for d, v in
zip(documents, dense_values)]

def _encode_queries_batch(self, queries: List[Query]) -> List[KBQuery]:
"""
Encode a batch of queries, takes a list of Query and returns a list of KBQuery.
Args:
queries: A list of Query to encode.
Returns:
encoded queries: A list of KBQuery, with the `values` field populated by the generated embeddings vector.
""" # noqa: E501
dense_values = self._dense_encoder.encode_queries([q.text for q in queries])
return [KBQuery(**q.dict(), values=v) for q, v in zip(queries, dense_values)]

@cached_property
def dimension(self) -> int:
"""
The dimension is the length of the vector generated by the `DenseRecordEncoder`
Canopy will run a single word through the encoder to get the dimension, this will also validate that the encoder
is working properly.

Returns:
dimension(int): the dimension of the encoder
""" # noqa: E501
return len(self._dense_encoder.encode_documents(["hello"])[0])

async def _aencode_documents_batch(self,
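A hedged usage sketch with a duck-typed stand-in encoder; a real application would pass a BaseDenseEncoder implementation from pinecone-text, such as the OpenAIEncoder used below.

from typing import List

from canopy.knowledge_base.record_encoder.dense import DenseRecordEncoder


class FakeDenseEncoder:
    """Stand-in for a pinecone-text BaseDenseEncoder (duck-typed for this sketch)."""

    def encode_documents(self, texts: List[str]) -> List[List[float]]:
        return [[float(len(t)), 1.0, 0.5] for t in texts]

    def encode_queries(self, texts: List[str]) -> List[List[float]]:
        return [[float(len(t)), 1.0, 0.5] for t in texts]


encoder = DenseRecordEncoder(dense_encoder=FakeDenseEncoder(), batch_size=2)  # type: ignore[arg-type]
print(encoder.dimension)  # 3, inferred by encoding the single word "hello" as shown above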
24 changes: 24 additions & 0 deletions src/canopy/knowledge_base/record_encoder/openai.py
@@ -13,12 +13,27 @@


class OpenAIRecordEncoder(DenseRecordEncoder):
"""
OpenAIRecordEncoder is a type of DenseRecordEncoder that uses the OpenAI `embeddings` API.
The implementation uses the `OpenAIEncoder` class from the `pinecone-text` library.
For more information about see: https://github.com/pinecone-io/pinecone-text

""" # noqa: E501

def __init__(self,
*,
model_name: str = "text-embedding-ada-002",
batch_size: int = 400,
**kwargs):
"""
Initialize the OpenAIRecordEncoder

Args:
model_name: The name of the OpenAI embeddings model to use for encoding. See https://platform.openai.com/docs/models/embeddings
batch_size: The number of documents or queries to encode at once.
Defaults to 400.
**kwargs: Additional arguments to pass to the underlying `pinecone-text. OpenAIEncoder`.
""" # noqa: E501
encoder = OpenAIEncoder(model_name)
super().__init__(dense_encoder=encoder, batch_size=batch_size, **kwargs)

@@ -29,6 +44,15 @@ def __init__(self,
retry=retry_if_exception_type(OPEN_AI_TRANSIENT_EXCEPTIONS),
)
def encode_documents(self, documents: List[KBDocChunk]) -> List[KBEncodedDocChunk]:
"""
Encode a list of documents, takes a list of KBDocChunk and returns a list of KBEncodedDocChunk.

Args:
documents: A list of KBDocChunk to encode.

Returns:
encoded chunks: A list of KBEncodedDocChunk, with the `values` field populated by the generated embeddings vector.
""" # noqa: E501
return super().encode_documents(documents)

async def _aencode_documents_batch(self,
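Finally, a hedged end-to-end sketch. It assumes OPENAI_API_KEY is set in the environment and that KBDocChunk needs only the fields shown; text-embedding-ada-002 produces 1536-dimensional vectors.

from canopy.knowledge_base.models import KBDocChunk
from canopy.knowledge_base.record_encoder.openai import OpenAIRecordEncoder

encoder = OpenAIRecordEncoder(model_name="text-embedding-ada-002", batch_size=100)
chunks = [KBDocChunk(id="doc1_0", document_id="doc1",
                     text="Canopy is a framework for RAG applications.")]
encoded = encoder.encode_documents(chunks)  # transient OpenAI errors are retried
print(len(encoded[0].values))               # expected: 1536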