diff --git a/docs/library.md b/docs/library.md index d916f44f..24d907cd 100644 --- a/docs/library.md +++ b/docs/library.md @@ -19,7 +19,41 @@ The idea behind Canopy library is to provide a framework to build AI application ## Setup -To setup canopy, please follow the instructions [here](../README.md#setup). +0. Set up a virtual environment (optional)
+```bash
+python3 -m venv canopy-env
+source canopy-env/bin/activate
+```
+More about virtual environments [here](https://docs.python.org/3/tutorial/venv.html).
+
+1. Install the package
+```bash
+pip install pinecone-canopy
+```
+
+2. Set up the environment variables
+
+```python
+import os
+
+os.environ["PINECONE_API_KEY"] = "<PINECONE_API_KEY>"
+os.environ["PINECONE_ENVIRONMENT"] = "<PINECONE_ENVIRONMENT>"
+os.environ["OPENAI_API_KEY"] = "<OPENAI_API_KEY>"
+```
+
+<details>
+<summary><b>CLICK HERE</b> for more information about the environment variables</summary>
+
+| Name | Description | How to get it? |
+|-----------------------|-----------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `PINECONE_API_KEY` | The API key for Pinecone. Used to authenticate to Pinecone services to create indexes and to insert, delete and search data | Register or log in to your Pinecone account in the [console](https://app.pinecone.io/). You can access your API key from the "API Keys" section in the sidebar of your dashboard |
+| `PINECONE_ENVIRONMENT`| Determines the Pinecone service cloud environment of your index, e.g. `west1-gcp`, `us-east-1-aws`, etc. | You can find the Pinecone environment next to the API key in the [console](https://app.pinecone.io/) |
+| `OPENAI_API_KEY` | API key for OpenAI. Used to authenticate to OpenAI's services for the embedding and chat APIs | You can find your OpenAI API key [here](https://platform.openai.com/account/api-keys). You may need to log in or register for OpenAI's services |
+</details>
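+
+As a quick sanity check (a minimal sketch, not part of the Canopy API), you can verify that all three variables are set before moving on:
+
+```python
+import os
+
+for var in ("PINECONE_API_KEY", "PINECONE_ENVIRONMENT", "OPENAI_API_KEY"):
+    assert os.environ.get(var), f"{var} is not set"
+```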
+ ## Quickstart @@ -118,22 +152,30 @@ To insert data into the knowledge base, you can create a list of documents and u ```python from canopy.models.data_models import Document -documents = [Document(id="1", text="U2 are an Irish rock band from Dublin, formed in 1976.", source="https://url.com"), - Document(id="2", text="Arctic Monkeys are an English rock band formed in Sheffield in 2002.", source="https://another-url.com", metadata={"my-key": "my-value"})] +documents = [Document(id="1", + text="U2 are an Irish rock band from Dublin, formed in 1976.", + source="https://en.wikipedia.org/wiki/U2"), + Document(id="2", + text="Arctic Monkeys are an English rock band formed in Sheffield in 2002.", + source="https://en.wikipedia.org/wiki/Arctic_Monkeys", + metadata={"my-key": "my-value"})] kb.upsert(documents) ``` Now you can query the knowledge base with the `query` method to find the most similar documents to a given text: ```python -from canopy.models.query_models import Query -results = kb.query([Query("Arctic Monkeys music genre"), +from canopy.models.data_models import Query +results = kb.query([Query(text="Arctic Monkeys music genre"), Query(text="U2 music genre", top_k=10, metadata_filter={"my-key": "my-value"})]) print(results[0].documents[0].text) # output: Arctic Monkeys are an English rock band formed in Sheffield in 2002. + +print(f"score - {results[0].documents[0].score:.4f}") +# output: score - 0.8942 ``` ### Step 4: Create a context engine @@ -153,14 +195,32 @@ context_engine = ContextEngine(kb) Then, you can use the `query` method to retrieve the most relevant context for a given query and token budget: ```python -result = context_engine.query([Query("Arctic Monkeys music genre")], token_budget=100) +import json -print(result.content) -# output: Arctic Monkeys are an English rock band formed in Sheffield in 2002. +result = context_engine.query([Query(text="Arctic Monkeys music genre")], max_context_tokens=100) -print(result.token_count) -# output: 17 +print(json.dumps(json.loads(result.to_text()), indent=2, ensure_ascii=False)) +print(f"\n# tokens in context returned: {result.num_tokens}") ``` +output: +```json +{ + "query": "Arctic Monkeys music genre", + "snippets": [ + { + "source": "https://en.wikipedia.org/wiki/Arctic_Monkeys", + "text": "Arctic Monkeys are an English rock band formed in Sheffield in 2002." + }, + { + "source": "https://en.wikipedia.org/wiki/U2", + "text": "U2 are an Irish rock band from Dublin, formed in 1976." + } + ] +} + +# tokens in context returned: 89 +``` + By default, to handle the token budget constraint, the context engine will use the `StuffingContextBuilder` that will stuff as many documents as possible into the context without exceeding the token budget, by the order they have been retrieved from the knowledge base. @@ -190,8 +250,13 @@ chat_engine = ChatEngine(context_engine) Then, you can start chatting! ```python -chat_engine.chat("what is the genre of Arctic Monkeys band?") -# output: Arctic Monkeys is a rock band. +from canopy.models.data_models import MessageBase + +response = chat_engine.chat(messages=[MessageBase(role="user", content="what is the genre of Arctic Monkeys band?")], stream=False) + +print(response.choices[0].message.content) + +# output: The genre of the Arctic Monkeys band is rock. 
Source: [Wikipedia](https://en.wikipedia.org/wiki/Arctic_Monkeys) ``` diff --git a/src/canopy/knowledge_base/knowledge_base.py b/src/canopy/knowledge_base/knowledge_base.py index 4d0c5ae2..5ca38085 100644 --- a/src/canopy/knowledge_base/knowledge_base.py +++ b/src/canopy/knowledge_base/knowledge_base.py @@ -446,7 +446,7 @@ def _query_index(self, sparse_vector=query.sparse_values, top_k=top_k, namespace=query.namespace, - metadata_filter=metadata_filter, + filter=metadata_filter, include_metadata=True, _check_return_type=_check_return_type, **query_params) diff --git a/src/canopy/knowledge_base/record_encoder/openai.py b/src/canopy/knowledge_base/record_encoder/openai.py index 3a1ec728..ce18a3f8 100644 --- a/src/canopy/knowledge_base/record_encoder/openai.py +++ b/src/canopy/knowledge_base/record_encoder/openai.py @@ -23,6 +23,7 @@ def __init__(self, super().__init__(dense_encoder=encoder, batch_size=batch_size, **kwargs) @retry( + reraise=True, wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(3), retry=retry_if_exception_type(OPEN_AI_TRANSIENT_EXCEPTIONS), diff --git a/src/canopy/llm/openai.py b/src/canopy/llm/openai.py index 812fc5e0..4cb3d43f 100644 --- a/src/canopy/llm/openai.py +++ b/src/canopy/llm/openai.py @@ -30,6 +30,7 @@ def available_models(self): return [k["id"] for k in openai.Model.list().data] @retry( + reraise=True, wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(3), retry=retry_if_exception_type(OPEN_AI_TRANSIENT_EXCEPTIONS), @@ -66,6 +67,7 @@ def streaming_iterator(response): return ChatResponse(**response) @retry( + reraise=True, wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(3), retry=retry_if_exception_type( diff --git a/src/canopy_cli/cli.py b/src/canopy_cli/cli.py index b3ba714d..7fcf8757 100644 --- a/src/canopy_cli/cli.py +++ b/src/canopy_cli/cli.py @@ -64,7 +64,7 @@ def check_service_health(url: str): raise CLIError(msg) -@retry(wait=wait_fixed(5), stop=stop_after_attempt(6)) +@retry(reraise=True, wait=wait_fixed(5), stop=stop_after_attempt(6)) def wait_for_service(chat_service_url: str): check_service_health(chat_service_url) @@ -193,7 +193,7 @@ def new(index_name: str, config: Optional[str]): help=( """ \b - Upload local data files containing documents to the Canopy service. + Upload local data files to the Canopy service. Load all the documents from data file or a directory containing multiple data files. The allowed formats are .jsonl and .parquet. 
@@ -297,6 +297,7 @@ def upsert(index_name: str,
             raise CLIError(msg)
         pbar.update(len(batch))
+    pbar.close()
 
     if failed_docs:
         msg = (
diff --git a/tests/system/knowledge_base/test_knowledge_base.py b/tests/system/knowledge_base/test_knowledge_base.py
index 8095239e..643b81b7 100644
--- a/tests/system/knowledge_base/test_knowledge_base.py
+++ b/tests/system/knowledge_base/test_knowledge_base.py
@@ -71,7 +71,7 @@ def knowledge_base(index_full_name, index_name, chunker, encoder):
     kb = KnowledgeBase(index_name=index_name,
                        record_encoder=encoder,
                        chunker=chunker)
-    kb.create_canopy_index()
+    kb.create_canopy_index(indexed_fields=["my-key"])
 
     return kb
 
@@ -139,6 +139,18 @@ def execute_and_assert_queries(knowledge_base, chunks_to_query):
                                           f"actual: {q_res.documents}"
 
 
+def assert_query_metadata_filter(knowledge_base: KnowledgeBase,
+                                 metadata_filter: dict,
+                                 num_vectors_expected: int,
+                                 top_k: int = 100):
+    assert top_k > num_vectors_expected, \
+        "the test might return a false positive if top_k is not > num_vectors_expected"
+    query = Query(text="test", top_k=top_k, metadata_filter=metadata_filter)
+    query_results = knowledge_base.query([query])
+    assert len(query_results) == 1
+    assert len(query_results[0].documents) == num_vectors_expected
+
+
 @pytest.fixture(scope="module", autouse=True)
 def teardown_knowledge_base(index_full_name, knowledge_base):
     yield
@@ -162,7 +174,7 @@ def documents(random_texts):
     return [Document(id=f"doc_{i}",
                      text=random_texts[i],
                      source=f"source_{i}",
-                     metadata={"test": i})
+                     metadata={"my-key": f"value-{i}"})
             for i in range(5)]
 
 
@@ -170,7 +182,7 @@ def documents_large():
     return [Document(id=f"doc_{i}_large",
                      text=f"Sample document {i}",
-                     metadata={"test": i})
+                     metadata={"my-key-large": f"value-{i}"})
             for i in range(1000)]
 
 
@@ -249,6 +261,10 @@ def test_query(knowledge_base, encoded_chunks):
     execute_and_assert_queries(knowledge_base, encoded_chunks)
 
 
+def test_query_with_metadata_filter(knowledge_base, encoded_chunks):
+    assert_query_metadata_filter(knowledge_base, {"my-key": "value-1"}, 2)
+
+
 def test_delete_documents(knowledge_base, encoded_chunks):
     chunk_ids = [chunk.id for chunk in encoded_chunks[-4:]]
     doc_ids = set(doc.document_id for doc in encoded_chunks[-4:])
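For reference, `reraise=True` in the tenacity `@retry` decorators above controls which exception surfaces once the retries are exhausted. A minimal, self-contained sketch of this tenacity behavior (illustrative only, not Canopy code):

```python
from tenacity import retry, stop_after_attempt, wait_fixed

@retry(reraise=True, wait=wait_fixed(0.1), stop=stop_after_attempt(3))
def flaky():
    # Always fails, so all three attempts are exhausted.
    raise ValueError("transient failure")

try:
    flaky()
except ValueError:
    # With reraise=True, the original exception from the last attempt is
    # re-raised; without it, tenacity raises its own RetryError wrapper instead.
    print("caught the original ValueError")
```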