2_Vector_DB[qdrant].py
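"""Build a Qdrant vector database from a folder of PDFs using Gemini embeddings.

Loads PDFs from the knowledge_base directory, splits them into chunks,
embeds each chunk with models/embedding-001, and upserts the vectors into
a Qdrant collection named tech_radar.
"""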
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient, models
import google.generativeai as genai
from dotenv import load_dotenv
# Load configs from .env file
load_dotenv()
PATH_TO_KNOWLEDGE_BASE = "knowledge_base" # Path where the PDFs are stored
COLLECTION_NAME = "tech_radar" # Name of the collection
QDRANT_API_KEY = os.environ['QDRANT_API_KEY']
QDRANT_URL = os.environ['QDRANT_URL']
# Configure the Gemini client with the API key from the environment
genai.configure(api_key=os.environ['GOOGLE_API_KEY'])
# Connect to the Qdrant server. If self-hosting, make sure the Qdrant
# Docker container is running first.
# qdrant = QdrantClient("http://localhost:6333")  # If running locally
qdrant = QdrantClient(QDRANT_URL, api_key=QDRANT_API_KEY)
# Create an embedding for a piece of text with the Gemini embedding model
def create_embedding(text):
    try:
        result = genai.embed_content(
            model="models/embedding-001",
            content=text,
            task_type="retrieval_document",
            title="Embedding of single string",
        )
        return result['embedding']
    except Exception as error:
        print(f"Error: {error}")
        return None  # Signal failure so the caller can skip this chunk
# Create the collection in Qdrant (768 dimensions to match embedding-001)
def create_collection():
    if not qdrant.collection_exists(COLLECTION_NAME):
        qdrant.create_collection(
            collection_name=COLLECTION_NAME,
            vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE),
        )
# Load the PDFs, split them into chunks, and ingest the chunks into the collection
def ingest_document():
    documents = []
    # Load the PDFs from the knowledge base
    for file in os.listdir(PATH_TO_KNOWLEDGE_BASE):
        if file.endswith('.pdf'):
            pdf_path = os.path.join(PATH_TO_KNOWLEDGE_BASE, file)
            loader = PyPDFLoader(pdf_path)
            documents.extend(loader.load())
    # Split the documents into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
    chunked_documents = text_splitter.split_documents(documents)
    print(f"Total number of chunks: {len(chunked_documents)}")
    # Ingest each chunk as a vector embedding, storing the text and metadata as payload
    for i, chunk in enumerate(chunked_documents):
        embedding = create_embedding(chunk.page_content)
        if embedding is None:
            print(f"Chunk skipped (embedding failed): {i+1}")
            continue
        try:
            qdrant.upsert(
                collection_name=COLLECTION_NAME,
                points=[
                    models.PointStruct(
                        id=i+1,
                        vector=embedding,
                        payload={
                            "metadata": chunk.metadata,
                            "page_content": chunk.page_content,
                        },
                    )
                ],
            )
            print(f"Chunk ingested: {i+1}")
        except Exception as error:
            print(f"Error: {error}")
            print(f"Chunk not ingested: {i+1}")
if __name__ == "__main__":
    create_collection()
    ingest_document()