
Commit

merge remote changes from main
tanmaygupta9 committed Jul 23, 2024
2 parents 447ae06 + 94860af commit 54db619
Showing 15 changed files with 373 additions and 207 deletions.
2 changes: 2 additions & 0 deletions .env.example
@@ -87,6 +87,8 @@ AGENT_CONFIG_PATH='app/config/agent.yml'
# PDF Tool
#############################################
PDF_TOOL_ENABLED="true" # Set to "true" to enable the PDF tool.
PDF_TOOL_LOG_QUERY="false"
PDF_TOOL_LOG_QUERY_PATH='app/tool_constants/query_log'
PDF_TOOL_DATA_PATH='app/tool_constants/pdf_data'
PDF_TOOL_DATABASE='pdf_indexing_1'
PDF_TOOL_EXTRACTION_CONFIG_PATH='app/config/extraction.yml'
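
The two new variables enable optional query logging for the PDF tool. A minimal sketch of how a tool might honour them, assuming a hypothetical log_query helper that appends each query to a file under PDF_TOOL_LOG_QUERY_PATH (the actual logging code is not part of this hunk):

import os
from datetime import datetime, timezone

from app.core.config import settings  # settings fields match the config.py change below


def log_query(query: str) -> None:
    # Hypothetical helper: append the query to a log file when logging is enabled.
    if not settings.PDF_TOOL_LOG_QUERY:
        return
    os.makedirs(settings.PDF_TOOL_LOG_QUERY_PATH, exist_ok=True)
    log_file = os.path.join(settings.PDF_TOOL_LOG_QUERY_PATH, "queries.log")
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(f"{datetime.now(timezone.utc).isoformat()}\t{query}\n")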
4 changes: 3 additions & 1 deletion README.md
@@ -30,6 +30,8 @@ The starter pack is based on the latest technologies for optimal performance, se
* ⬆️ Docker-compose for simple deployments and DX
* 🖍 Linting, tests and pre-commit hooks pre-configured

Note: this is a starter kit - for production deployments, we recommend adding enterprise-grade security functionalities. Especially when using LLMs, be aware of known risks like prompt injection ([read more](https://www.ibm.com/topics/prompt-injection)).

## Quickstart
For a quick setup of AgentKit, use the steps below, where both the backend app and frontend app are run inside a Docker container. More elaborate setup instructions can be found in the [documentation](https://agentkit.infra.x.bcg.com/docs/introduction).

@@ -106,7 +108,7 @@ See [optional feature documentation](docs/docusaurus/docs/advanced/optional_feat
The project spun off from a combination of different templates. One great inspiration is [fastapi-alembic-sqlmodel-async](https://github.com/jonra1993/fastapi-alembic-sqlmodel-async), which provided the foundations for the FastAPI setup. Please check them out!

Great thanks to all the contributors:
[@kaikun213](https://github.com/kaikun213) [@drivian](https://github.com/drivian) [@ielmansouri](https://github.com/ielmansouri) [@mastersplinter](https://github.com/mastersplinter) [@tanmaygupta9](https://github.com/tanmaygupta9) [@sofglide](https://github.com/sofglide) [@harticode](https://github.com/harticode) [@edenbd](https://github.com/edenbd) [@ben-howt](https://github.com/ben-howt) [@carelschw](https://github.com/carelschw) [@gustafvh](https://github.com/gustafvh) [@casper321](https://github.com/casper321) [@modvinden1](https://github.com/modvinden1) [@valerie-jzr](https://github.com/valerie-jzr) [@ispoljari](https://github.com/ispoljari) [@martinthenext](https://github.com/martinthenext)
[@kaikun213](https://github.com/kaikun213) [@drivian](https://github.com/drivian) [@ielmansouri](https://github.com/ielmansouri) [@mastersplinter](https://github.com/mastersplinter) [@tanmaygupta9](https://github.com/tanmaygupta9) [@sofglide](https://github.com/sofglide) [@harticode](https://github.com/harticode) [@edenbd](https://github.com/edenbd) [@ben-howt](https://github.com/ben-howt) [@carelschw](https://github.com/carelschw) [@gustafvh](https://github.com/gustafvh) [@casper321](https://github.com/casper321) [@modvinden1](https://github.com/modvinden1) [@valerie-jzr](https://github.com/valerie-jzr) [@ispoljari](https://github.com/ispoljari) [@martinthenext](https://github.com/martinthenext) [@rkdy](https://github.com/rkdy)

Please read `CONTRIBUTING.md` for more details on how to contribute.
PRs are welcome ❤️
27 changes: 5 additions & 22 deletions backend/app/app/config/agent.yml
@@ -16,54 +16,37 @@ tools: # list of all tools available for the agent
- chain_tool
- image_generation_tool
action_plans: # list of all action plans available for the meta agent
'0':
name: ''
description: Gather new information, summarize and visualize
actions: # each sublist is 1 action step, i.e. add tools as subitems if you want to execute in parallel
- - sql_tool
- entertainer_tool
- - expert_tool
- visualizer_tool
'1':
name: ''
description: Gather new information and summarize
actions:
- - sql_tool
- pdf_tool
- - pdf_tool
- entertainer_tool
- - expert_tool
'2':
name: ''
description: Sufficient information in chat history, visualize
actions:
- - memory
- sql_tool
- - visualizer_tool
'3':
name: ''
description: Sufficient information in chat history, answer question
actions:
- - memory
- expert_tool
'4':
'3':
name: ''
description: Use information from chat history to gather new information, then summarize
actions:
- - memory
- sql_tool
- pdf_tool
- - expert_tool
'5':
'4':
name: ''
description: Clarification for ambiguous questions
actions:
- - clarify_tool
'6':
'5':
name: 'Nested Chain'
description: User requests expert advice and asks to consider multiple options
actions:
- - chain_tool
'7':
'6':
name: ''
description: Generate an image
actions:
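
For reference, a minimal sketch of how the renumbered action plans can be read back from agent.yml, assuming the file keeps the structure shown above (each action plan holds a list of action steps, and each step lists the tools intended to run in parallel):

import yaml

with open("app/config/agent.yml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

for plan_id, plan in config["action_plans"].items():
    print(f"{plan_id}: {plan['description']}")
    for step, tools in enumerate(plan["actions"]):
        # Tools within a single step run in parallel; steps run sequentially.
        print(f"  step {step}: {', '.join(tools)}")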
2 changes: 2 additions & 0 deletions backend/app/app/core/config.py
@@ -212,6 +212,8 @@ def assemble_sql_tool_db_connection(
raise ValueError(v)

PDF_TOOL_ENABLED: bool
PDF_TOOL_LOG_QUERY: bool = False
PDF_TOOL_LOG_QUERY_PATH: str = "app/tool_constants/query_log"
PDF_TOOL_DATA_PATH: str
PDF_TOOL_DATABASE: str

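
The new settings default to logging disabled. A stripped-down sketch of how such fields are typically loaded from the .env file, assuming the surrounding Settings class follows the usual pydantic BaseSettings pattern (only the fields visible in this hunk are reproduced):

from pydantic import BaseSettings


class Settings(BaseSettings):
    # Required fields are read from the environment or the .env file.
    PDF_TOOL_ENABLED: bool
    PDF_TOOL_LOG_QUERY: bool = False
    PDF_TOOL_LOG_QUERY_PATH: str = "app/tool_constants/query_log"
    PDF_TOOL_DATA_PATH: str
    PDF_TOOL_DATABASE: str

    class Config:
        env_file = ".env"


settings = Settings()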
161 changes: 113 additions & 48 deletions backend/app/app/db/vector_db_pdf_ingestion.py
@@ -1,8 +1,10 @@
# -*- coding: utf-8 -*-
import csv
import logging
import os
from typing import Any, List

import psycopg2
from dotenv import load_dotenv
from langchain.document_loaders.base import BaseLoader
from langchain.embeddings import CacheBackedEmbeddings
@@ -12,7 +14,6 @@

from app.core.config import settings
from app.schemas.ingestion_schema import LOADER_DICT, IndexingConfig
from app.schemas.tool_schemas.pdf_tool_schema import MarkdownMetadata
from app.services.chat_agent.helpers.embedding_models import get_embedding_model
from app.utils.config_loader import get_ingestion_configs

@@ -41,6 +42,14 @@ def __init__(self, pipeline_config: IndexingConfig, db_name: str):
user=settings.DATABASE_USER,
password=settings.DATABASE_PASSWORD,
)
self.db_connection = psycopg2.connect(
dbname=db_name,
user=settings.DATABASE_USER,
password=settings.DATABASE_PASSWORD,
host=settings.DATABASE_HOST,
port=settings.DATABASE_PORT,
)
self.db_cursor = self.db_connection.cursor()

def run(
self,
@@ -61,60 +70,116 @@ def run(
return self._load_documents(folder_path=folder_path, collection_name=collection_name)
raise ValueError("folder_path must be provided if load_index is False")

def _file_already_loaded(self, file_path: str, collection_name: str) -> bool:
"""Check if file is already loaded based on its path using direct SQL query."""
try:
query = """
SELECT EXISTS(
SELECT 1
FROM langchain_pg_embedding e
JOIN langchain_pg_collection c on c.uuid = e.collection_id
WHERE c.name = %s AND e.cmetadata->>'source' = %s
);
"""
self.db_cursor.execute(query, (collection_name, file_path))
return self.db_cursor.fetchone()[0]
except Exception as e:
logger.error("Error checking if file is already loaded.")
logger.error(repr(e))
return False

def _load_docs(
self,
dir_path: str,
collection_name: str,
) -> List[Document]:
"""
Using specified PDF miner to convert PDF documents to raw text chunks.
Using specified PDF miner to convert PDF documents into raw text chunks.
Also supports loading .txt (plain text) files and loads files from subfolders.
Only loads files not already in the database.
Fallback: PyPDF
"""
documents = []
for file_name in os.listdir(dir_path):
file_extension = os.path.splitext(file_name)[1].lower()
# Load PDF files
if file_extension == ".pdf":
logger.info(f"Loading {file_name} into vectorstore")
file_path = f"{dir_path}/{file_name}"
try:
loader: Any = self.pdf_loader(file_path) # type: ignore
file_docs = loader.load()
documents.extend(file_docs)
logger.info(f"{file_name} loaded successfully")
except Exception as e:
logger.error(
f"Could not extract text from PDF {file_name} with {self.pipeline_config.pdf_parser}: {repr(e)}"
)
# Load Markdown files
elif file_extension == ".md":
logger.info(f"Loading data from {file_name} as Document...")
file_path = f"{dir_path}/{file_name}"
try:
# Load md files as single document
with open(file_path, "r", encoding="utf-8") as f:
md_file = f.read()

md_doc = Document(
page_content=md_file,
metadata=MarkdownMetadata.parse_obj({"source": file_name, "type": "text"}).dict(),
)

# Further split at token-level, when splits are above chunk_size configuration (rare)
text_splitter = TokenTextSplitter(
chunk_size=self.pipeline_config.tokenizer_chunk_size,
chunk_overlap=self.pipeline_config.tokenizer_chunk_overlap,
)
file_docs = text_splitter.split_documents([md_doc])

documents.extend(file_docs)
if len(file_docs) > 1:
logger.info(
f"Split {file_name} to {len(file_docs)} documents due to "
f"chunk_size: ({self.pipeline_config.tokenizer_chunk_size})"
)
except Exception as e:
logger.error(f"Could not load MD file {file_name}: {repr(e)}")
for root, _, files in os.walk(dir_path):
for file_name in files:
file_extension = os.path.splitext(file_name)[1].lower()
file_path = os.path.join(root, file_name)

if not self._file_already_loaded(file_path, collection_name):
# Load PDF files
if file_extension == ".pdf":
logger.info(f"Loading {file_name} into vectorstore")
try:
loader: Any = self.pdf_loader(file_path) # type: ignore
file_docs = loader.load()
documents.extend(file_docs)
logger.info(f"{file_name} loaded successfully")
except Exception as e:
logger.error(
f"Could not extract text from PDF {file_name} with {self.pipeline_config.pdf_parser}: {repr(e)}" # noqa: E501
)

# Load Markdown or Plain Text files
elif file_extension in (".md", ".txt"):
file_type = "markdown" if file_extension == ".md" else "plain text"
logger.info(f"Loading data from {file_name} as Document ({file_type})...")
try:
with open(file_path, "r", encoding="utf-8") as f:
file_content = f.read()

file_doc = Document(
page_content=file_content,
metadata={"source": file_path, "type": file_type},
)

text_splitter = TokenTextSplitter(
chunk_size=self.pipeline_config.tokenizer_chunk_size,
chunk_overlap=self.pipeline_config.tokenizer_chunk_overlap,
)
file_docs = text_splitter.split_documents([file_doc])

documents.extend(file_docs)
if len(file_docs) > 1:
logger.info(
f"Split {file_name} into {len(file_docs)} documents due to chunk size: ({self.pipeline_config.tokenizer_chunk_size})" # noqa: E501
)
except Exception as e:
logger.error(f"Could not load {file_type} file {file_name}: {repr(e)}")

# Load CSV files
elif file_extension == ".csv":
logger.info(f"Loading data from {file_name} as CSV Document...")
try:
with open(file_path, "r", encoding="utf-8") as f:
csv_reader = csv.DictReader(f)
for row in csv_reader:
text = row["text"]
metadata = {key: value for key, value in row.items() if key != "text"}
metadata["source"] = file_path
metadata["type"] = "csv"

file_doc = Document(
page_content=text,
metadata=metadata,
)

text_splitter = TokenTextSplitter(
chunk_size=self.pipeline_config.tokenizer_chunk_size,
chunk_overlap=self.pipeline_config.tokenizer_chunk_overlap,
)
file_docs = text_splitter.split_documents([file_doc])

documents.extend(file_docs)
if len(file_docs) > 1:
logger.info(
f"Split {file_name} into {len(file_docs)} documents due to chunk size: ({self.pipeline_config.tokenizer_chunk_size})" # noqa: E501
)
except Exception as e:
logger.error(f"Could not load CSV file {file_name}: {repr(e)}")

else:
logger.info(f"File {file_name} already loaded, skipping.")

return documents
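
The CSV branch above turns every row into its own Document: the "text" column becomes the page content and all remaining columns become metadata. A standalone sketch of that conversion, using the same langchain classes as the pipeline (file name and chunk sizes are placeholder values):

import csv

from langchain.schema import Document
from langchain.text_splitter import TokenTextSplitter

documents = []
with open("example.csv", "r", encoding="utf-8") as f:
    for row in csv.DictReader(f):
        # Every column except "text" is carried along as metadata.
        metadata = {key: value for key, value in row.items() if key != "text"}
        metadata["source"] = "example.csv"
        metadata["type"] = "csv"
        documents.append(Document(page_content=row["text"], metadata=metadata))

# Split oversized rows at token level, mirroring the pipeline configuration.
splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=50)
docs = splitter.split_documents(documents)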

@@ -124,7 +189,7 @@ def _load_documents(
collection_name: str,
) -> PGVector:
"""Load documents into vectorstore."""
text_documents = self._load_docs(folder_path)
text_documents = self._load_docs(folder_path, collection_name)
text_splitter = TokenTextSplitter(
chunk_size=self.pipeline_config.tokenizer_chunk_size,
chunk_overlap=self.pipeline_config.tokenizer_chunk_overlap,
@@ -143,7 +208,7 @@ def _load_documents(
documents=docs,
collection_name=collection_name,
connection_string=self.connection_str,
pre_delete_collection=True,
pre_delete_collection=False,
)


3 changes: 3 additions & 0 deletions backend/app/app/services/chat_agent/tools/ExtendedBaseTool.py
@@ -4,6 +4,7 @@

from typing import Any, List, Optional

from box import Box
from langchain.base_language import BaseLanguageModel
from langchain.callbacks.manager import AsyncCallbackManagerForToolRun, CallbackManagerForToolRun
from langchain.schema import BaseMessage
@@ -37,6 +38,8 @@ class ExtendedBaseTool(BaseTool):

image_description_prompt: Optional[str] = None

additional: Optional[Box] = None

@classmethod
def from_config(
cls,
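
The new additional field uses python-box, so extra tool-specific configuration can be carried on the base tool and read with attribute access. A small sketch of the idea (the field names are illustrative and not taken from this diff):

from box import Box

# Arbitrary extra configuration attached to a tool instance.
extra = Box({"max_documents": 5, "rerank": True})

print(extra.max_documents)  # attribute access instead of extra["max_documents"]
print(extra.rerank)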
@@ -10,6 +10,7 @@
from app.schemas.agent_schema import AgentAndToolsConfig
from app.schemas.tool_schema import ToolConfig, ToolInputSchema
from app.services.chat_agent.helpers.llm import get_llm
from app.services.chat_agent.helpers.query_formatting import standard_query_format
from app.services.chat_agent.tools.ExtendedBaseTool import ExtendedBaseTool

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -66,12 +67,12 @@ async def _arun(
**kwargs: Any,
) -> str:
"""Use the tool asynchronously."""
query = kwargs.get(
tool_input_str = kwargs.get(
"query",
args[0],
)
# Only use the latest human message
query = ToolInputSchema.parse_raw(query).latest_human_message

query = standard_query_format(ToolInputSchema.parse_raw(tool_input_str))

try:
messages = [
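
The tool now delegates input formatting to standard_query_format instead of using only the latest human message. Its implementation is not part of this excerpt; the following is a purely illustrative sketch of what such a formatter could look like, assuming ToolInputSchema exposes a latest_human_message field (shown in the old code) and possibly some prior context (the real function may differ):

from app.schemas.tool_schema import ToolInputSchema  # import path shown in this diff


def standard_query_format(tool_input: ToolInputSchema) -> str:
    # Illustrative only: combine the latest user message with any prior context.
    # The actual implementation in the repository may format the input differently.
    query = tool_input.latest_human_message
    history = getattr(tool_input, "chat_history", None)  # assumed optional field
    if history:
        query = f"Chat history:\n{history}\n\nLatest question:\n{query}"
    return query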
