This repository has been archived by the owner on Nov 13, 2024. It is now read-only.

Configured pydoclint + fixed all issues
Should be working from now on
igiloh-pinecone committed Oct 26, 2023
1 parent c677fe4 commit 62865a8
Showing 4 changed files with 64 additions and 29 deletions.
8 changes: 8 additions & 0 deletions pyproject.toml
@@ -68,5 +68,13 @@ per-file-ignores = [
exclude = ['.venv']
max-line-length = 88

# PyDocLint configuration
style = 'google'
arg-type-hints-in-docstring = false
require-return-section-when-returning-nothing = false
allow-init-docstring = true
check-return-types = false
skip-checking-raises = true

[tool.poetry.scripts]
canopy = "canopy_cli.cli:cli"
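Since this block sits inside the flake8 settings above, pydoclint presumably runs here as a flake8 plugin. As a rough, hypothetical illustration (not taken from the repo) of what the chosen options accept: Google-style sections, argument names without type hints in the docstring (`arg-type-hints-in-docstring = false`), and no `Returns:` section for a function that returns nothing (`require-return-section-when-returning-nothing = false`):

```python
# Hypothetical example, not part of this commit: a Google-style docstring
# that the pydoclint settings above would accept.
def upsert_documents(documents: list, batch_size: int = 100) -> None:
    """Upsert a batch of documents into the index.

    Args:
        documents: The documents to upsert.
        batch_size: Number of documents to send per request.
            Defaults to 100.
    """
    ...
```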
23 changes: 12 additions & 11 deletions src/canopy/knowledge_base/chunker/langchain_text_splitter.py
@@ -38,17 +38,7 @@ def _split_text_with_regex(


class TextSplitter(ABC):
"""Interface for splitting text into chunks.
Args:
chunk_size: Maximum size of chunks to return
chunk_overlap: Overlap in characters between chunks
length_function: Function that measures the length of given chunks
keep_separator: Whether to keep the separator in the chunks
add_start_index: If `True`, includes chunk's start index in metadata
strip_whitespace: If `True`, strips whitespace from the start and end of
every document
"""
"""Interface for splitting text into chunks."""

def __init__(
self,
@@ -59,6 +49,17 @@ def __init__(
add_start_index: bool = False,
strip_whitespace: bool = True,
) -> None:
"""Create a new TextSplitter.
Args:
chunk_size: Maximum size of chunks to return
chunk_overlap: Overlap in characters between chunks
length_function: Function that measures the length of given chunks
keep_separator: Whether to keep the separator in the chunks
add_start_index: If `True`, includes chunk's start index in metadata
strip_whitespace: If `True`, strips whitespace from the start and end of
every document
"""
if chunk_overlap > chunk_size:
raise ValueError(
f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
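A hedged usage sketch of the constructor documented above; `RecursiveCharacterTextSplitter` is assumed to be the concrete subclass vendored in this module (only the abstract `TextSplitter` appears in this diff):

```python
# Hypothetical usage sketch; the concrete subclass name is an assumption.
from canopy.knowledge_base.chunker.langchain_text_splitter import (
    RecursiveCharacterTextSplitter,
)

splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=32)
chunks = splitter.split_text("A long markdown document that needs chunking ...")

# Per the __init__ shown above, an overlap larger than the chunk size is rejected:
# RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=200)  # -> ValueError
```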
52 changes: 38 additions & 14 deletions src/canopy/knowledge_base/knowledge_base.py
@@ -52,16 +52,6 @@ class KnowledgeBase(BaseKnowledgeBase):
When creating a new Canopy service, the user must first create the underlying Pinecone index.
This is a one-time setup process - the index will exist on Pinecone's managed service until it is deleted.
Args:
index_name: The name of the underlying Pinecone index.
record_encoder: An instance of RecordEncoder to use for encoding documents and queries.
Defaults to OpenAIRecordEncoder.
chunker: An instance of Chunker to use for chunking documents. Defaults to MarkdownChunker.
reranker: An instance of Reranker to use for reranking query results. Defaults to TransparentReranker.
default_top_k: The default number of document chunks to return per query. Defaults to 5.
index_params: A dictionary of parameters to pass to the index creation API. Defaults to None.
see https://docs.pinecone.io/docs/python-client#create_index
Example:
>>> from canopy.knowledge_base.knowledge_base import KnowledgeBase
>>> from tokenizer import Tokenizer
@@ -89,6 +79,44 @@ def __init__(self,
reranker: Optional[Reranker] = None,
default_top_k: int = 5,
):
"""
Initialize the knowledge base object.
If the index does not exist, the user must first create it by calling `create_canopy_index()` or the CLI command `canopy new`.
Note: Canopy will add the prefix --canopy to your selected index name.
You can retrieve the full index name knowledge_base.index_name at any time, or find it in the Pinecone console at https://app.pinecone.io/
Example:
create a new index:
>>> from canopy.knowledge_base.knowledge_base import KnowledgeBase
>>> from tokenizer import Tokenizer
>>> Tokenizer.initialize()
>>> kb = KnowledgeBase(index_name="my_index")
>>> kb.create_canopy_index()
In any future interactions,
the user simply needs to connect to the existing index:
>>> kb = KnowledgeBase(index_name="my_index")
>>> kb.connect()
Args:
index_name: The name of the underlying Pinecone index.
record_encoder: An instance of RecordEncoder to use for encoding documents and queries.
Defaults to OpenAIRecordEncoder.
chunker: An instance of Chunker to use for chunking documents. Defaults to MarkdownChunker.
reranker: An instance of Reranker to use for reranking query results. Defaults to TransparentReranker.
default_top_k: The default number of document chunks to return per query. Defaults to 5.
Raises:
ValueError: If default_top_k is not a positive integer.
TypeError: If record_encoder is not an instance of RecordEncoder.
TypeError: If chunker is not an instance of Chunker.
TypeError: If reranker is not an instance of Reranker.
""" # noqa: E501
if default_top_k < 1:
raise ValueError("default_top_k must be greater than 0")

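To make the lifecycle in the docstring concrete, here is a hedged end-to-end sketch; the `canopy.models.data_models.Document` and `canopy.tokenizer.Tokenizer` import paths are assumptions, not part of this diff:

```python
# Hypothetical end-to-end sketch based on the docstring example above.
from canopy.knowledge_base.knowledge_base import KnowledgeBase
from canopy.models.data_models import Document   # assumed import path
from canopy.tokenizer import Tokenizer           # assumed import path

Tokenizer.initialize()

kb = KnowledgeBase(index_name="my_index")
kb.create_canopy_index()   # one-time setup; on later runs, kb.connect() suffices

kb.upsert([Document(id="doc1", text="Pinecone is a fully managed vector database.")])
```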
@@ -243,8 +271,6 @@ def create_canopy_index(self,
For example, you can set the index's number of replicas by passing {"replicas": 2}.
see https://docs.pinecone.io/docs/python-client#create_index
Returns:
None
""" # noqa: E501
# validate inputs
if indexed_fields is None:
@@ -464,8 +490,6 @@ def upsert(self,
Defaults to 100.
show_progress_bar: Whether to show a progress bar while upserting the documents.
Returns:
None
Example:
>>> from canopy.knowledge_base.knowledge_base import KnowledgeBase
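A hedged usage sketch tied to the upsert parameters documented above, reusing the `kb` object from the earlier sketch; the `Document` import path is an assumption:

```python
# Hypothetical sketch: upserting with an explicit batch size and a progress bar.
from canopy.models.data_models import Document  # assumed import path

docs = [Document(id=f"doc-{i}", text=f"Document number {i}") for i in range(250)]

# Documents are sent in batches of `batch_size` (default 100);
# `show_progress_bar=True` reports progress while upserting.
kb.upsert(docs, batch_size=100, show_progress_bar=True)
```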
10 changes: 6 additions & 4 deletions src/canopy/knowledge_base/record_encoder/base.py
@@ -10,14 +10,16 @@ class RecordEncoder(ABC, ConfigurableMixin):
"""
Base class for all encoders. Encoders are used to encode documents' and queries'
text into vectors.
Args:
batch_size: The number of documents or queries to encode at once.
Defaults to 1.
"""

def __init__(self, batch_size: int = 1):
"""
Initialize the encoder.
Args:
batch_size: The number of documents or queries to encode at once.
Defaults to 1.
"""
self.batch_size = batch_size

@abstractmethod
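As a rough illustration of what the `batch_size` parameter controls (a generic batching sketch, not the class's actual implementation):

```python
from typing import List, Sequence, TypeVar

T = TypeVar("T")

def batched(items: Sequence[T], batch_size: int) -> List[Sequence[T]]:
    """Split a sequence into consecutive batches of at most `batch_size` items."""
    return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]

# With the default batch_size=1 each text is encoded on its own; a larger
# batch_size sends several texts to the underlying embedding model per call.
print(batched(["doc a", "doc b", "doc c"], batch_size=2))
# [['doc a', 'doc b'], ['doc c']]
```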
