This repository has been archived by the owner on Nov 13, 2024. It is now read-only.

Kb docstrings #126

Merged · 22 commits · Nov 1, 2023
28 changes: 27 additions & 1 deletion src/canopy/knowledge_base/chunker/base.py
@@ -9,10 +9,25 @@
class Chunker(ABC, ConfigurableMixin):

"""
BaseChunker is an abstract class that defines the interface for a chunker.
Base class for chunkers. Chunkers take a document (id, text, ...)
and return a list of KBDocChunks (id, text, document_id, ...)
Chunker is an abstract class that must be subclassed to be used,
also, it extends ConfigurableMixin which means that every subclass of
Chunker could be referenced by a name and configured in a config file.
"""

def chunk_documents(self, documents: List[Document]) -> List[KBDocChunk]:
"""
chunk_documents takes a list of documents and returns a list of KBDocChunks
this method is just a wrapper around chunk_single_document that can be
used to chunk a list of documents.

Args:
documents: list of documents

Returns:
chunks: list of chunks of type KBDocChunks
"""
chunks: List[KBDocChunk] = []
for doc in documents:
chunks.extend(self.chunk_single_document(doc))
Expand All @@ -26,6 +41,17 @@ async def achunk_documents(self, documents: List[Document]) -> List[KBDocChunk]:

@abstractmethod
def chunk_single_document(self, document: Document) -> List[KBDocChunk]:
"""
chunk_single_document takes a document and returns a
list of KBDocChunks, this is the main method
that must be implemented by every subclass of Chunker

Args:
document: list of documents

Returns:
chunks: list of chunks KBDocChunks
"""
pass

@abstractmethod
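For orientation, here is a minimal, hypothetical Chunker subclass that satisfies the interface described above. It is a sketch, not part of this PR: the source/metadata fields and the async counterpart are assumptions based on the surrounding code.

from typing import List

from canopy.knowledge_base.chunker.base import Chunker
from canopy.knowledge_base.models import KBDocChunk
from canopy.models.data_models import Document


class ParagraphChunker(Chunker):
    """Hypothetical example: split a document on blank lines."""

    def chunk_single_document(self, document: Document) -> List[KBDocChunk]:
        paragraphs = [p.strip() for p in document.text.split("\n\n") if p.strip()]
        # The "<document_id>_<i>" id scheme mirrors RecursiveCharacterChunker below.
        return [KBDocChunk(id=f"{document.id}_{i}",
                           document_id=document.id,
                           text=paragraph,
                           source=document.source,       # field assumed from Document
                           metadata=document.metadata)   # field assumed from Document
                for i, paragraph in enumerate(paragraphs)]

    async def achunk_single_document(self, document: Document) -> List[KBDocChunk]:
        # Assumed async counterpart, hinted at by achunk_documents in the base class.
        return self.chunk_single_document(document)

chunk_documents then handles iterating over a list of documents, exactly as the wrapper above describes.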
15 changes: 15 additions & 0 deletions src/canopy/knowledge_base/chunker/markdown.py
@@ -7,12 +7,27 @@


class MarkdownChunker(RecursiveCharacterChunker):
"""
MarkdownChunker is a subclass of RecursiveCharacterChunker that is configured
to chunk markdown documents. It uses RecursiveCharacterTextSplitter to split
the text of the document into chunks, by providing the separators for markdown documents
(also from LangChainTextSplitter, with modifications)
""" # noqa: E501

def __init__(self,
chunk_size: int = 256,
chunk_overlap: int = 0,
keep_separator: bool = True
):
"""
Iniitalizes RecursiveCharacterChunker with the separators for markdown documents.

Args:
chunk_size: size of the chunks. Defaults to 256 tokens.
chunk_overlap: overlap between chunks. Defaults to 0.
keep_separator: whether to keep the separator in the chunk. Defaults to True.

""" # noqa: E501
separators = RecursiveCharacterTextSplitter.get_separators_for_language(
Language.MARKDOWN
)
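A quick usage sketch, purely illustrative: it assumes the global Tokenizer has been initialized beforehand (the underlying chunker constructs one), and that Document requires only id and text.

from canopy.knowledge_base.chunker.markdown import MarkdownChunker
from canopy.models.data_models import Document

chunker = MarkdownChunker(chunk_size=256, chunk_overlap=0, keep_separator=True)
doc = Document(id="readme", text="# Title\n\nIntro paragraph.\n\n## Section\n\nMore text.")
chunks = chunker.chunk_single_document(doc)
for chunk in chunks:
    print(chunk.id, repr(chunk.text[:40]))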
25 changes: 25 additions & 0 deletions src/canopy/knowledge_base/chunker/recursive_character.py
@@ -10,13 +10,28 @@


class RecursiveCharacterChunker(Chunker):
"""
A chunker that splits a document into chunks of a given size, using a recursive character splitter.
A RecursiveCharacterChunker is a derived class of Chunker, which means that it can be referenced by a name
and configured in a config file.
""" # noqa: E501

def __init__(self,
chunk_size: int = 256,
chunk_overlap: int = 0,
separators: Optional[List[str]] = None,
keep_separator: bool = True,
):
"""
RecursiveCharacterTextSplitter is a text splitter from the langchain library.
It splits a text into chunks of a given size, using a recursive character splitter.

Args:
chunk_size: size of the chunks, in tokens
chunk_overlap: overlap between chunks
separators: list of separators to use for splitting the text
keep_separator: whether to keep the separator in the chunk or not
""" # noqa: E501
self._tokenizer = Tokenizer()
self._chunker = RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
@@ -26,6 +41,16 @@ def __init__(self,
keep_separator=keep_separator)

def chunk_single_document(self, document: Document) -> List[KBDocChunk]:
"""
using the RecursiveCharacterTextSplitter, this method takes a document and returns a list of KBDocChunks
Args:
document: document to be chunked

Returns:
chunks: list of chunks KBDocChunks from the document, where text is splitted
evenly using the RecursiveCharacterTextSplitter
""" # noqa: E501
# TODO: check overlap not bigger than max_chunk_size
text_chunks = self._chunker.split_text(document.text)
return [KBDocChunk(id=f"{document.id}_{i}",
document_id=document.id,
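For illustration, a hedged sketch of configuring the chunker directly with custom separators (again assuming the global Tokenizer is already initialized):

from canopy.knowledge_base.chunker.recursive_character import RecursiveCharacterChunker
from canopy.models.data_models import Document

chunker = RecursiveCharacterChunker(
    chunk_size=128,
    chunk_overlap=16,
    separators=["\n\n", "\n", " "],  # try paragraphs first, then lines, then words
)
doc = Document(id="doc1", text="A long document text...")
chunks = chunker.chunk_single_document(doc)
# Each chunk id follows the "<document_id>_<i>" scheme shown in the diff above.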
27 changes: 27 additions & 0 deletions src/canopy/knowledge_base/chunker/token_chunker.py
@@ -7,10 +7,25 @@


class TokenChunker(Chunker):
"""
Simple chunker that splits a document into chunks (group of tokens) of a given size, using a tokenizer.
A TokenChunker is a derived class of Chunker, which means that it can be referenced by a name
and configured in a config file.
""" # noqa: E501

def __init__(self,
max_chunk_size: int = 256,
overlap: int = 30, ):
"""
Using the global tokenizer, will set the class parameters for the TokenChunker.
will check overlap and max_chunk_size.

Args:
max_chunk_size: size of the chunks, in tokens
overlap: overlap between chunks, in tokens
""" # noqa: E501

# TODO: should add check for overlap not bigger than max_chunk_size
if overlap < 0:
cls_name = self.__class__.__name__
raise ValueError(
@@ -28,6 +43,18 @@ def __init__(self,
self._overlap = overlap

def chunk_single_document(self, document: Document) -> List[KBDocChunk]:
"""
This methods takes a document and returns a list of KBDocChunks, where text is splitted
evenly using the tokenizer. Firts the text is tokenized, then the tokens are splitted into chunks
of a given size, with overlap between chunks.
Last chunk is handled such that if the last chunk is smaller than the overlap, it will be removed.

Args:
document: document to be chunked

Returns:
text_chunks: list of chunks KBDocChunks from the document
""" # noqa: E501
tokens = self._tokenizer.tokenize(document.text)
token_chunks = [tokens[i:i + self._chunk_size]
for i in range(0, len(tokens),
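To make the windowing concrete, here is a standalone sketch of the splitting logic the docstring describes, mirroring the list comprehension above; the stride is assumed to be max_chunk_size - overlap.

tokens = list(range(10))           # stand-in for a tokenized document
max_chunk_size, overlap = 4, 1
step = max_chunk_size - overlap    # each window starts 3 tokens after the previous one
token_chunks = [tokens[i:i + max_chunk_size] for i in range(0, len(tokens), step)]
print(token_chunks)  # [[0, 1, 2, 3], [3, 4, 5, 6], [6, 7, 8, 9], [9]]
# Per the docstring, a trailing chunk smaller than the overlap would then be dropped.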
6 changes: 3 additions & 3 deletions src/canopy/knowledge_base/models.py
@@ -6,9 +6,9 @@

from canopy.models.data_models import Document, Query

# TODO 1: consider moving this to pinecone-text
# TODO 2: consider renaming to "Vector" or "DenseVector"
# TODO 3: consider supporting `np.ndarray`
# TODO: (1) consider moving this to pinecone-text
# TODO: (2) consider renaming to "Vector" or "DenseVector"
# TODO: (3) consider supporting `np.ndarray`
VectorValues = List[float]


66 changes: 60 additions & 6 deletions src/canopy/knowledge_base/record_encoder/base.py
@@ -8,9 +8,20 @@

class RecordEncoder(ABC, ConfigurableMixin):
"""
Base class for all encoders. Encoders are used to encode documents' and queries'
text into vectors.
"""
Base class for RecordEncoders. Encodes document chunks and queries to vector representations.
The vector representation may include both dense and sparse values.
Dense values are usually generated by an embedding model, and sparse values usually represent weighted keyword counts.

The RecordEncoder implements separate functions for the encoding of documents and queries.
Some implementations of both sparse and dense encoding are not symmetrical. For example, BM25 sparse
encoders and instruction dense encoders.

Any class the extends RecordEncoder must implement the method responsible for the encoding of a single documents \ queries batch:
- _encode_documents_batch
- _encode_queries_batch

Async encoders are still not supported, but will be added in the future.
""" # noqa: E501

def __init__(self, batch_size: int = 1):
"""
@@ -19,17 +30,38 @@ def __init__(self, batch_size: int = 1):
Args:
batch_size: The number of documents or queries to encode at once.
Defaults to 1.
"""
""" # noqa: E501
self.batch_size = batch_size

# TODO: rename documents to doc_chunks or chunks
@abstractmethod
def _encode_documents_batch(self,
documents: List[KBDocChunk]
) -> List[KBEncodedDocChunk]:
"""
Abstract method for encoding a batch of documents, takes a list of KBDocChunk and returns a list of KBEncodedDocChunk.
For maximal performance, and derived class should try to operate on the entire documents batch in a single operation.

Args:
documents: A list of KBDocChunk to encode.

Returns:
encoded chunks: A list of KBEncodedDocChunk.
""" # noqa: E501
pass

@abstractmethod
def _encode_queries_batch(self, queries: List[Query]) -> List[KBQuery]:
"""
Abstract method for encoding a batch of queries, takes a list of Query and returns a list of KBQuery.
For maximal performance, and derived class should try to operate on the entire batch in a single operation.

Args:
queries: A list of `Query` objects to encode.

Returns:
encoded queries: A list of KBQuery.
""" # noqa: E501
pass

@abstractmethod
@@ -51,17 +83,39 @@ def dimension(self) -> Optional[int]:
"""
Returns:
The dimension of the dense vectors produced by the encoder, if applicable.
"""
""" # noqa: E501
return None

def encode_documents(self, documents: List[KBDocChunk]) -> List[KBEncodedDocChunk]:
"""

Encode documents in batches. Will iterate over batch of documents and encode them using the _encode_documents_batch method.

Args:
documents: A list of KBDocChunk to encode.

Returns:
encoded chunks: A list of KBEncodedDocChunk.

""" # noqa: E501
encoded_docs = []
for batch in self._batch_iterator(documents, self.batch_size):
encoded_docs.extend(self._encode_documents_batch(batch))

return encoded_docs
return encoded_docs # TODO: consider yielding a generator

def encode_queries(self, queries: List[Query]) -> List[KBQuery]:
"""

Encode queries in batches. Will iterate over batch of queries and encode them using the _encode_queries_batch method.

Args:
queries: A list of Query to encode.

Returns:
encoded queries: A list of KBQuery.
""" # noqa: E501

kb_queries = []
for batch in self._batch_iterator(queries, self.batch_size):
kb_queries.extend(self._encode_queries_batch(batch))
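To illustrate the contract, here is a hypothetical toy subclass. It is a sketch under the assumption that _encode_documents_batch, _encode_queries_batch, and their async counterparts are the abstract methods to fill in; the character-sum "embedding" is purely illustrative.

from typing import List

from canopy.knowledge_base.models import KBDocChunk, KBEncodedDocChunk, KBQuery
from canopy.knowledge_base.record_encoder.base import RecordEncoder
from canopy.models.data_models import Query


class ToyRecordEncoder(RecordEncoder):
    """Hypothetical example: a fake 8-dimensional 'embedding' from character codes."""

    def _embed(self, text: str) -> List[float]:
        vec = [0.0] * 8
        for i, ch in enumerate(text):
            vec[i % 8] += ord(ch) / 1000.0
        return vec

    def _encode_documents_batch(self,
                                documents: List[KBDocChunk]
                                ) -> List[KBEncodedDocChunk]:
        # Same construction pattern as DenseRecordEncoder below.
        return [KBEncodedDocChunk(**d.dict(), values=self._embed(d.text))
                for d in documents]

    def _encode_queries_batch(self, queries: List[Query]) -> List[KBQuery]:
        return [KBQuery(**q.dict(), values=self._embed(q.text)) for q in queries]

    async def _aencode_documents_batch(self,
                                       documents: List[KBDocChunk]
                                       ) -> List[KBEncodedDocChunk]:
        # Assumed async counterpart, as seen in the encoders below.
        return self._encode_documents_batch(documents)

    async def _aencode_queries_batch(self, queries: List[Query]) -> List[KBQuery]:
        return self._encode_queries_batch(queries)

encode_documents and encode_queries then handle the batching, as the wrapper methods above show.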
36 changes: 36 additions & 0 deletions src/canopy/knowledge_base/record_encoder/dense.py
@@ -8,26 +8,62 @@


class DenseRecordEncoder(RecordEncoder):
"""
DenseRecordEncoder is a subclass of RecordEncoder that generates dense vector representation of documents chunks and textual queries.
The dense represntation generated by the `DenseRecordEncoder` is a list of floats in a given dimension.
DenseRecordEncoder wraps a BaseDenseEncoder from the `pinecone-text` library to encode the text itself.
for more information about the BaseDenseEncoder see: https://github.com/pinecone-io/pinecone-text
""" # noqa: E501

def __init__(self,
dense_encoder: BaseDenseEncoder,
**kwargs):
"""
Initialize the encoder.

Args:
dense_encoder: A BaseDenseEncoder to encode the text.
**kwargs: Additional arguments to pass to the RecordEncoder.
""" # noqa: E501
super().__init__(**kwargs)
self._dense_encoder = dense_encoder

def _encode_documents_batch(self,
documents: List[KBDocChunk]
) -> List[KBEncodedDocChunk]:
"""
Encode a batch of documents, takes a list of KBDocChunk and returns a list of KBEncodedDocChunk.

Args:
documents: A list of KBDocChunk to encode.
Returns:
encoded chunks: A list of KBEncodedDocChunk, with the `values` field populated by the generated embeddings vector.
""" # noqa: E501
dense_values = self._dense_encoder.encode_documents([d.text for d in documents])
return [KBEncodedDocChunk(**d.dict(), values=v) for d, v in
zip(documents, dense_values)]

def _encode_queries_batch(self, queries: List[Query]) -> List[KBQuery]:
"""
Encode a batch of queries, takes a list of Query and returns a list of KBQuery.
Args:
queries: A list of Query to encode.
Returns:
encoded queries: A list of KBQuery, with the `values` field populated by the generated embeddings vector.
""" # noqa: E501
dense_values = self._dense_encoder.encode_queries([q.text for q in queries])
return [KBQuery(**q.dict(), values=v) for q, v in zip(queries, dense_values)]

@cached_property
def dimension(self) -> int:
"""
The dimension is the length of the vector generated by the `DenseRecordEncoder`
Canopy will run a single word through the encoder to get the dimension, this will also validate that the encoder
is working properly.

Returns:
dimension(int): the dimension of the encoder
""" # noqa: E501
return len(self._dense_encoder.encode_documents(["hello"])[0])

async def _aencode_documents_batch(self,
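A hedged usage sketch with a duck-typed stand-in encoder; a real application would pass a BaseDenseEncoder implementation from pinecone-text, such as the OpenAIEncoder used below.

from typing import List

from canopy.knowledge_base.record_encoder.dense import DenseRecordEncoder


class FakeDenseEncoder:
    """Stand-in for a pinecone-text BaseDenseEncoder (duck-typed for this sketch)."""

    def encode_documents(self, texts: List[str]) -> List[List[float]]:
        return [[float(len(t)), 1.0, 0.5] for t in texts]

    def encode_queries(self, texts: List[str]) -> List[List[float]]:
        return [[float(len(t)), 1.0, 0.5] for t in texts]


encoder = DenseRecordEncoder(dense_encoder=FakeDenseEncoder(), batch_size=2)  # type: ignore[arg-type]
print(encoder.dimension)  # 3, inferred by encoding the single word "hello" as shown above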
24 changes: 24 additions & 0 deletions src/canopy/knowledge_base/record_encoder/openai.py
@@ -13,12 +13,27 @@


class OpenAIRecordEncoder(DenseRecordEncoder):
"""
OpenAIRecordEncoder is a type of DenseRecordEncoder that uses the OpenAI `embeddings` API.
The implementation uses the `OpenAIEncoder` class from the `pinecone-text` library.
For more information about see: https://github.com/pinecone-io/pinecone-text

""" # noqa: E501

def __init__(self,
*,
model_name: str = "text-embedding-ada-002",
batch_size: int = 400,
**kwargs):
"""
Initialize the OpenAIRecordEncoder

Args:
model_name: The name of the OpenAI embeddings model to use for encoding. See https://platform.openai.com/docs/models/embeddings
batch_size: The number of documents or queries to encode at once.
Defaults to 400.
**kwargs: Additional arguments to pass to the underlying `pinecone-text. OpenAIEncoder`.
""" # noqa: E501
encoder = OpenAIEncoder(model_name)
super().__init__(dense_encoder=encoder, batch_size=batch_size, **kwargs)

@@ -29,6 +44,15 @@ def __init__(self,
retry=retry_if_exception_type(OPEN_AI_TRANSIENT_EXCEPTIONS),
)
def encode_documents(self, documents: List[KBDocChunk]) -> List[KBEncodedDocChunk]:
"""
Encode a list of documents, takes a list of KBDocChunk and returns a list of KBEncodedDocChunk.

Args:
documents: A list of KBDocChunk to encode.

Returns:
encoded chunks: A list of KBEncodedDocChunk, with the `values` field populated by the generated embeddings vector.
""" # noqa: E501
return super().encode_documents(documents)

async def _aencode_documents_batch(self,
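Finally, a hedged end-to-end sketch. It assumes OPENAI_API_KEY is set in the environment and that KBDocChunk needs only the fields shown; text-embedding-ada-002 produces 1536-dimensional vectors.

from canopy.knowledge_base.models import KBDocChunk
from canopy.knowledge_base.record_encoder.openai import OpenAIRecordEncoder

encoder = OpenAIRecordEncoder(model_name="text-embedding-ada-002", batch_size=100)
chunks = [KBDocChunk(id="doc1_0", document_id="doc1",
                     text="Canopy is a framework for RAG applications.")]
encoded = encoder.encode_documents(chunks)  # transient OpenAI errors are retried
print(len(encoded[0].values))               # expected: 1536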