-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvector_store.py
67 lines (49 loc) · 1.67 KB
/
vector_store.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
from pinecone import Pinecone, ServerlessSpec
from langchain.vectorstores import Pinecone
import pinecone
from dotenv import load_dotenv
from src.helper import load_data, text_split, download_hf_embeddings
load_dotenv()

# Use the module-qualified client class here. The bare name `Pinecone` is
# rebound by `from langchain.vectorstores import Pinecone` further up, so a
# plain `Pinecone(api_key=...)` call would reach the LangChain vector-store
# wrapper instead of the Pinecone client.
pc = pinecone.Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# text chunks from the pdf
extracted_data = load_data("data")
text_chunks = text_split(extracted_data)

# load the embeddings model
embeddings = download_hf_embeddings()

# Create embeddings for your text chunks (one embedding per chunk, same order)
embedded_texts = embeddings.embed_documents([t.page_content for t in text_chunks])

index_name = "medicure-chatbot"

# Prepare vectors for upsert: one record per chunk, with a positional id
# ("chunk_0", "chunk_1", ...) and the chunk text stored as metadata.
vectors_to_upsert = [
    {
        "id": f"chunk_{i}",
        "values": embedding,
        "metadata": {
            "text": chunk.page_content,
            # Add any other metadata you want to include
        },
    }
    for i, (chunk, embedding) in enumerate(zip(text_chunks, embedded_texts))
]
# Upsert vectors to Pinecone
# Function to split list into chunks
def chunk_list(lst: list, chunk_size: int) -> list:
    """Split *lst* into consecutive slices of at most *chunk_size* items.

    Args:
        lst: The sequence to split; it is sliced, not modified.
        chunk_size: Maximum number of items per slice. Must be >= 1.

    Returns:
        A list of slices in the original order. The last slice may be
        shorter than *chunk_size*; an empty *lst* yields an empty list.

    Raises:
        ValueError: If *chunk_size* is less than 1.
    """
    # A negative step would make range() empty and silently drop every item,
    # and a zero step fails with an unrelated range() message; reject both.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
# Split vectors into smaller batches so each upsert call carries at most
# `batch_size` vectors
batch_size = 100
batches = chunk_list(vectors_to_upsert, batch_size)

# Handle to the target index. Without this assignment `index` is undefined,
# and the NameError raised on every batch is swallowed by the except below.
# NOTE(review): assumes an index named `index_name` already exists in the
# Pinecone project -- confirm, or create it before running this script.
index = pc.Index(index_name)

# Upsert batches to Pinecone
failed_batches = []
for i, batch in enumerate(batches):
    try:
        index.upsert(
            vectors=batch,
            namespace="ns1",  # Replace with your desired namespace
        )
    except Exception as e:
        # Best-effort: record the failure and continue with remaining batches
        failed_batches.append(i + 1)
        print(f"Error upserting batch {i+1}: {str(e)}")
        # You might want to implement retry logic here
    else:
        print(f"Batch {i+1}/{len(batches)} upserted successfully")

# Do not claim success when some batches never reached the index
if failed_batches:
    print(f"Upsert completed with {len(failed_batches)} failed batch(es): {failed_batches}")
else:
    print("Upsert completed")