
Commit

merge remote changes from main
tanmaygupta9 committed Jul 23, 2024
2 parents 447ae06 + 94860af commit 54db619
Showing 15 changed files with 373 additions and 207 deletions.
2 changes: 2 additions & 0 deletions .env.example
@@ -87,6 +87,8 @@ AGENT_CONFIG_PATH='app/config/agent.yml'
# PDF Tool
#############################################
PDF_TOOL_ENABLED="true" # Set to "true" to enable the PDF tool.
PDF_TOOL_LOG_QUERY="false"
PDF_TOOL_LOG_QUERY_PATH='app/tool_constants/query_log'
PDF_TOOL_DATA_PATH='app/tool_constants/pdf_data'
PDF_TOOL_DATABASE='pdf_indexing_1'
PDF_TOOL_EXTRACTION_CONFIG_PATH='app/config/extraction.yml'
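
The two new variables enable optional query logging for the PDF tool. A minimal sketch of how a tool might honour them, assuming a hypothetical log_query helper that appends each query to a file under PDF_TOOL_LOG_QUERY_PATH (the actual logging code is not part of this hunk):

import os
from datetime import datetime, timezone

from app.core.config import settings  # settings fields match the config.py change below


def log_query(query: str) -> None:
    # Hypothetical helper: append the query to a log file when logging is enabled.
    if not settings.PDF_TOOL_LOG_QUERY:
        return
    os.makedirs(settings.PDF_TOOL_LOG_QUERY_PATH, exist_ok=True)
    log_file = os.path.join(settings.PDF_TOOL_LOG_QUERY_PATH, "queries.log")
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(f"{datetime.now(timezone.utc).isoformat()}\t{query}\n")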
4 changes: 3 additions & 1 deletion README.md
@@ -30,6 +30,8 @@ The starter pack is based on the latest technologies for optimal performance, se
* ⬆️ Docker-compose for simple deployments and DX
* 🖍 Linting, tests and pre-commit hooks pre-configured

Note: this is a starter kit - for production deployments, we recommend adding enterprise-grade security functionalities. Especially when using LLMs, be aware of known risks like prompt injection ([read more](https://www.ibm.com/topics/prompt-injection)).

## Quickstart
For a quick setup of AgentKit, use the steps below, where both the backend app and frontend app are run inside a Docker container. More elaborate setup instructions can be found in the [documentation](https://agentkit.infra.x.bcg.com/docs/introduction).

@@ -106,7 +108,7 @@ See [optional feature documentation](docs/docusaurus/docs/advanced/optional_feat
The project spun off from a combination of different templates. One great inspiration is [fastapi-alembic-sqlmodel-async](https://github.com/jonra1993/fastapi-alembic-sqlmodel-async), which provided the foundations for the FastAPI setup. Please check them out!

Great thanks to all the contributors:
[@kaikun213](https://github.com/kaikun213) [@drivian](https://github.com/drivian) [@ielmansouri](https://github.com/ielmansouri) [@mastersplinter](https://github.com/mastersplinter) [@tanmaygupta9](https://github.com/tanmaygupta9) [@sofglide](https://github.com/sofglide) [@harticode](https://github.com/harticode) [@edenbd](https://github.com/edenbd) [@ben-howt](https://github.com/ben-howt) [@carelschw](https://github.com/carelschw) [@gustafvh](https://github.com/gustafvh) [@casper321](https://github.com/casper321) [@modvinden1](https://github.com/modvinden1) [@valerie-jzr](https://github.com/valerie-jzr) [@ispoljari](https://github.com/ispoljari) [@martinthenext](https://github.com/martinthenext)
[@kaikun213](https://github.com/kaikun213) [@drivian](https://github.com/drivian) [@ielmansouri](https://github.com/ielmansouri) [@mastersplinter](https://github.com/mastersplinter) [@tanmaygupta9](https://github.com/tanmaygupta9) [@sofglide](https://github.com/sofglide) [@harticode](https://github.com/harticode) [@edenbd](https://github.com/edenbd) [@ben-howt](https://github.com/ben-howt) [@carelschw](https://github.com/carelschw) [@gustafvh](https://github.com/gustafvh) [@casper321](https://github.com/casper321) [@modvinden1](https://github.com/modvinden1) [@valerie-jzr](https://github.com/valerie-jzr) [@ispoljari](https://github.com/ispoljari) [@martinthenext](https://github.com/martinthenext) [@rkdy](https://github.com/rkdy)

Please read `CONTRIBUTING.md` for more details on how to contribute.
PRs are welcome ❤️
27 changes: 5 additions & 22 deletions backend/app/app/config/agent.yml
@@ -16,54 +16,37 @@ tools: # list of all tools available for the agent
- chain_tool
- image_generation_tool
action_plans: # list of all action plans available for the meta agent
'0':
name: ''
description: Gather new information, summarize and visualize
actions: # each sublist is 1 action step, i.e. add tools as subitems if you want to execute in parallel
- - sql_tool
- entertainer_tool
- - expert_tool
- visualizer_tool
'1':
name: ''
description: Gather new information and summarize
actions:
- - sql_tool
- pdf_tool
- - pdf_tool
- entertainer_tool
- - expert_tool
'2':
name: ''
description: Sufficient information in chat history, visualize
actions:
- - memory
- sql_tool
- - visualizer_tool
'3':
name: ''
description: Sufficient information in chat history, answer question
actions:
- - memory
- expert_tool
'4':
'3':
name: ''
description: Use information from chat history to gather new information, then summarize
actions:
- - memory
- sql_tool
- pdf_tool
- - expert_tool
'5':
'4':
name: ''
description: Clarification for ambiguous questions
actions:
- - clarify_tool
'6':
'5':
name: 'Nested Chain'
description: User requests expert advice and asks to consider multiple options
actions:
- - chain_tool
'7':
'6':
name: ''
description: Generate an image
actions:
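
For reference, a minimal sketch of how the renumbered action plans can be read back from agent.yml, assuming the file keeps the structure shown above (each action plan holds a list of action steps, and each step lists the tools intended to run in parallel):

import yaml

with open("app/config/agent.yml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

for plan_id, plan in config["action_plans"].items():
    print(f"{plan_id}: {plan['description']}")
    for step, tools in enumerate(plan["actions"]):
        # Tools within a single step run in parallel; steps run sequentially.
        print(f"  step {step}: {', '.join(tools)}")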
2 changes: 2 additions & 0 deletions backend/app/app/core/config.py
@@ -212,6 +212,8 @@ def assemble_sql_tool_db_connection(
raise ValueError(v)

PDF_TOOL_ENABLED: bool
PDF_TOOL_LOG_QUERY: bool = False
PDF_TOOL_LOG_QUERY_PATH: str = "app/tool_constants/query_log"
PDF_TOOL_DATA_PATH: str
PDF_TOOL_DATABASE: str

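
The new settings default to logging disabled. A stripped-down sketch of how such fields are typically loaded from the .env file, assuming the surrounding Settings class follows the usual pydantic BaseSettings pattern (only the fields visible in this hunk are reproduced):

from pydantic import BaseSettings


class Settings(BaseSettings):
    # Required fields are read from the environment or the .env file.
    PDF_TOOL_ENABLED: bool
    PDF_TOOL_LOG_QUERY: bool = False
    PDF_TOOL_LOG_QUERY_PATH: str = "app/tool_constants/query_log"
    PDF_TOOL_DATA_PATH: str
    PDF_TOOL_DATABASE: str

    class Config:
        env_file = ".env"


settings = Settings()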
161 changes: 113 additions & 48 deletions backend/app/app/db/vector_db_pdf_ingestion.py
@@ -1,8 +1,10 @@
# -*- coding: utf-8 -*-
import csv
import logging
import os
from typing import Any, List

import psycopg2
from dotenv import load_dotenv
from langchain.document_loaders.base import BaseLoader
from langchain.embeddings import CacheBackedEmbeddings
@@ -12,7 +14,6 @@

from app.core.config import settings
from app.schemas.ingestion_schema import LOADER_DICT, IndexingConfig
from app.schemas.tool_schemas.pdf_tool_schema import MarkdownMetadata
from app.services.chat_agent.helpers.embedding_models import get_embedding_model
from app.utils.config_loader import get_ingestion_configs

@@ -41,6 +42,14 @@ def __init__(self, pipeline_config: IndexingConfig, db_name: str):
user=settings.DATABASE_USER,
password=settings.DATABASE_PASSWORD,
)
self.db_connection = psycopg2.connect(
dbname=db_name,
user=settings.DATABASE_USER,
password=settings.DATABASE_PASSWORD,
host=settings.DATABASE_HOST,
port=settings.DATABASE_PORT,
)
self.db_cursor = self.db_connection.cursor()

def run(
self,
@@ -61,60 +70,116 @@ def run(
return self._load_documents(folder_path=folder_path, collection_name=collection_name)
raise ValueError("folder_path must be provided if load_index is False")

def _file_already_loaded(self, file_path: str, collection_name: str) -> bool:
"""Check if file is already loaded based on its path using direct SQL query."""
try:
query = """
SELECT EXISTS(
SELECT 1
FROM langchain_pg_embedding e
JOIN langchain_pg_collection c on c.uuid = e.collection_id
WHERE c.name = %s AND e.cmetadata->>'source' = %s
);
"""
self.db_cursor.execute(query, (collection_name, file_path))
return self.db_cursor.fetchone()[0]
except Exception as e:
logger.error("Error checking if file is already loaded.")
logger.error(repr(e))
return False

def _load_docs(
self,
dir_path: str,
collection_name: str,
) -> List[Document]:
"""
Using specified PDF miner to convert PDF documents to raw text chunks.
Using specified PDF miner to convert PDF documents into raw text chunks.
Also supports loading .txt (plain text) files and loads files from subfolders.
Only loads files not already in the database.
Fallback: PyPDF
"""
documents = []
for file_name in os.listdir(dir_path):
file_extension = os.path.splitext(file_name)[1].lower()
# Load PDF files
if file_extension == ".pdf":
logger.info(f"Loading {file_name} into vectorstore")
file_path = f"{dir_path}/{file_name}"
try:
loader: Any = self.pdf_loader(file_path) # type: ignore
file_docs = loader.load()
documents.extend(file_docs)
logger.info(f"{file_name} loaded successfully")
except Exception as e:
logger.error(
f"Could not extract text from PDF {file_name} with {self.pipeline_config.pdf_parser}: {repr(e)}"
)
# Load Markdown files
elif file_extension == ".md":
logger.info(f"Loading data from {file_name} as Document...")
file_path = f"{dir_path}/{file_name}"
try:
# Load md files as single document
with open(file_path, "r", encoding="utf-8") as f:
md_file = f.read()

md_doc = Document(
page_content=md_file,
metadata=MarkdownMetadata.parse_obj({"source": file_name, "type": "text"}).dict(),
)

# Further split at token-level, when splits are above chunk_size configuration (rare)
text_splitter = TokenTextSplitter(
chunk_size=self.pipeline_config.tokenizer_chunk_size,
chunk_overlap=self.pipeline_config.tokenizer_chunk_overlap,
)
file_docs = text_splitter.split_documents([md_doc])

documents.extend(file_docs)
if len(file_docs) > 1:
logger.info(
f"Split {file_name} to {len(file_docs)} documents due to "
f"chunk_size: ({self.pipeline_config.tokenizer_chunk_size})"
)
except Exception as e:
logger.error(f"Could not load MD file {file_name}: {repr(e)}")
for root, _, files in os.walk(dir_path):
for file_name in files:
file_extension = os.path.splitext(file_name)[1].lower()
file_path = os.path.join(root, file_name)

if not self._file_already_loaded(file_path, collection_name):
# Load PDF files
if file_extension == ".pdf":
logger.info(f"Loading {file_name} into vectorstore")
try:
loader: Any = self.pdf_loader(file_path) # type: ignore
file_docs = loader.load()
documents.extend(file_docs)
logger.info(f"{file_name} loaded successfully")
except Exception as e:
logger.error(
f"Could not extract text from PDF {file_name} with {self.pipeline_config.pdf_parser}: {repr(e)}" # noqa: E501
)

# Load Markdown or Plain Text files
elif file_extension in (".md", ".txt"):
file_type = "markdown" if file_extension == ".md" else "plain text"
logger.info(f"Loading data from {file_name} as Document ({file_type})...")
try:
with open(file_path, "r", encoding="utf-8") as f:
file_content = f.read()

file_doc = Document(
page_content=file_content,
metadata={"source": file_path, "type": file_type},
)

text_splitter = TokenTextSplitter(
chunk_size=self.pipeline_config.tokenizer_chunk_size,
chunk_overlap=self.pipeline_config.tokenizer_chunk_overlap,
)
file_docs = text_splitter.split_documents([file_doc])

documents.extend(file_docs)
if len(file_docs) > 1:
logger.info(
f"Split {file_name} into {len(file_docs)} documents due to chunk size: ({self.pipeline_config.tokenizer_chunk_size})" # noqa: E501
)
except Exception as e:
logger.error(f"Could not load {file_type} file {file_name}: {repr(e)}")

# Load CSV files
elif file_extension == ".csv":
logger.info(f"Loading data from {file_name} as CSV Document...")
try:
with open(file_path, "r", encoding="utf-8") as f:
csv_reader = csv.DictReader(f)
for row in csv_reader:
text = row["text"]
metadata = {key: value for key, value in row.items() if key != "text"}
metadata["source"] = file_path
metadata["type"] = "csv"

file_doc = Document(
page_content=text,
metadata=metadata,
)

text_splitter = TokenTextSplitter(
chunk_size=self.pipeline_config.tokenizer_chunk_size,
chunk_overlap=self.pipeline_config.tokenizer_chunk_overlap,
)
file_docs = text_splitter.split_documents([file_doc])

documents.extend(file_docs)
if len(file_docs) > 1:
logger.info(
f"Split {file_name} into {len(file_docs)} documents due to chunk size: ({self.pipeline_config.tokenizer_chunk_size})" # noqa: E501
)
except Exception as e:
logger.error(f"Could not load CSV file {file_name}: {repr(e)}")

else:
logger.info(f"File {file_name} already loaded, skipping.")

return documents
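
The CSV branch above turns every row into its own Document: the "text" column becomes the page content and all remaining columns become metadata. A standalone sketch of that conversion, using the same langchain classes as the pipeline (file name and chunk sizes are placeholder values):

import csv

from langchain.schema import Document
from langchain.text_splitter import TokenTextSplitter

documents = []
with open("example.csv", "r", encoding="utf-8") as f:
    for row in csv.DictReader(f):
        # Every column except "text" is carried along as metadata.
        metadata = {key: value for key, value in row.items() if key != "text"}
        metadata["source"] = "example.csv"
        metadata["type"] = "csv"
        documents.append(Document(page_content=row["text"], metadata=metadata))

# Split oversized rows at token level, mirroring the pipeline configuration.
splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=50)
docs = splitter.split_documents(documents)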

@@ -124,7 +189,7 @@ def _load_documents(
collection_name: str,
) -> PGVector:
"""Load documents into vectorstore."""
text_documents = self._load_docs(folder_path)
text_documents = self._load_docs(folder_path, collection_name)
text_splitter = TokenTextSplitter(
chunk_size=self.pipeline_config.tokenizer_chunk_size,
chunk_overlap=self.pipeline_config.tokenizer_chunk_overlap,
@@ -143,7 +208,7 @@ def _load_documents(
documents=docs,
collection_name=collection_name,
connection_string=self.connection_str,
pre_delete_collection=True,
pre_delete_collection=False,
)


3 changes: 3 additions & 0 deletions backend/app/app/services/chat_agent/tools/ExtendedBaseTool.py
@@ -4,6 +4,7 @@

from typing import Any, List, Optional

from box import Box
from langchain.base_language import BaseLanguageModel
from langchain.callbacks.manager import AsyncCallbackManagerForToolRun, CallbackManagerForToolRun
from langchain.schema import BaseMessage
@@ -37,6 +38,8 @@ class ExtendedBaseTool(BaseTool):

image_description_prompt: Optional[str] = None

additional: Optional[Box] = None

@classmethod
def from_config(
cls,
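
The new additional field uses python-box, so extra tool-specific configuration can be carried on the base tool and read with attribute access. A small sketch of the idea (the field names are illustrative and not taken from this diff):

from box import Box

# Arbitrary extra configuration attached to a tool instance.
extra = Box({"max_documents": 5, "rerank": True})

print(extra.max_documents)  # attribute access instead of extra["max_documents"]
print(extra.rerank)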
@@ -10,6 +10,7 @@
from app.schemas.agent_schema import AgentAndToolsConfig
from app.schemas.tool_schema import ToolConfig, ToolInputSchema
from app.services.chat_agent.helpers.llm import get_llm
from app.services.chat_agent.helpers.query_formatting import standard_query_format
from app.services.chat_agent.tools.ExtendedBaseTool import ExtendedBaseTool

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -66,12 +67,12 @@ async def _arun(
**kwargs: Any,
) -> str:
"""Use the tool asynchronously."""
query = kwargs.get(
tool_input_str = kwargs.get(
"query",
args[0],
)
# Only use the latest human message
query = ToolInputSchema.parse_raw(query).latest_human_message

query = standard_query_format(ToolInputSchema.parse_raw(tool_input_str))

try:
messages = [
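
The tool now delegates input formatting to standard_query_format instead of using only the latest human message. Its implementation is not part of this excerpt; the following is a purely illustrative sketch of what such a formatter could look like, assuming ToolInputSchema exposes a latest_human_message field (shown in the old code) and possibly some prior context (the real function may differ):

from app.schemas.tool_schema import ToolInputSchema  # import path shown in this diff


def standard_query_format(tool_input: ToolInputSchema) -> str:
    # Illustrative only: combine the latest user message with any prior context.
    # The actual implementation in the repository may format the input differently.
    query = tool_input.latest_human_message
    history = getattr(tool_input, "chat_history", None)  # assumed optional field
    if history:
        query = f"Chat history:\n{history}\n\nLatest question:\n{query}"
    return query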
