# ingest.py
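"""Ingest the Fashion_Shop_Consultant dataset into MongoDB Atlas.

Loads the Quangnguyen711/Fashion_Shop_Consultant dataset from Hugging Face,
combines each Question/Answer pair into a single text field, generates
OpenAI embeddings in batches, and inserts the resulting documents into a
MongoDB collection.

Requires the MONGODB_CONN_STRING environment variable for the Atlas
connection; the OpenAI embeddings client additionally reads OPENAI_API_KEY
from the environment.
"""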
import os

import pandas as pd
import tiktoken
from datasets import load_dataset
from langchain_openai import OpenAIEmbeddings
from pymongo import MongoClient

import params
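# params.py is a local config module (not shown here). A hypothetical sketch
# of the values it is assumed to define, for illustration only:
#   db_name = "fashion_shop"             # MongoDB database name
#   collection_name = "qa_embeddings"    # MongoDB collection name
#   ai_model = "text-embedding-3-small"  # OpenAI embedding model
#   vector_dimension = 1536              # embedding dimensions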
# Connect to MongoDB Atlas
mongodb_conn_string = os.getenv("MONGODB_CONN_STRING")
client = MongoClient(mongodb_conn_string)
db = client[params.db_name]
collection = db[params.collection_name]

ai_model = params.ai_model
vector_dimension = params.vector_dimension
# Load dataset
dataset = load_dataset("Quangnguyen711/Fashion_Shop_Consultant", split="train")

# Convert dataset to a pandas DataFrame
df = pd.DataFrame(dataset)

# Only keep records where the Question and Answer fields are not null
df = df[df["Question"].notna() & df["Answer"].notna()]

# Combine Question and Answer fields into a single text field
# (axis=1 applies the function row-wise)
df["text"] = df.apply(lambda row: f"[Question]{row['Question']}[Answer]{row['Answer']}", axis=1)

# Convert the combined text column to a list of strings
texts = df["text"].tolist()
# Initialize OpenAI embeddings
embeddings = OpenAIEmbeddings(model=ai_model, dimensions=vector_dimension)

# Initialize the tokenizer for the specific model
tokenizer = tiktoken.encoding_for_model(ai_model)

# Keep track of the total tokens used
total_tokens_used = 0

# Define a reasonable batch size
batch_size = 50
for i in range(0, len(texts), batch_size):
batch_texts = texts[i:i + batch_size]
# Calculate total tokens used for the current batch
batch_tokens_used = sum(len(tokenizer.encode(text)) for text in batch_texts)
total_tokens_used += batch_tokens_used
# Generate embeddings for the current batch
embeddings_list = embeddings.embed_documents(batch_texts)
# Prepare documents with embeddings to insert into MongoDB
documents = []
for j, (index, row) in enumerate(df.iloc[i:i + batch_size].iterrows()):
document = {
"text" : row["text"],
"embedding": embeddings_list[j]
}
# print(f"Item i: {i}, j: {j}\n")
# print (document)
documents.append(document)
# Insert the batch of documents into MongoDB
collection.insert_many(documents)
# Print total tokens used in the current batch
print(f"Processed and inserted batch {i // batch_size + 1}, tokens used : {batch_tokens_used}")
print(f"Embeddings generated and stored in MongoDB! Total tokens used: {total_tokens_used}")
# Close the MongoDB connection
client.close()
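# Note: querying these embeddings with Atlas Vector Search requires a vector
# search index on the "embedding" field, created separately in Atlas. A
# hypothetical index definition (similarity metric is an assumption; the
# dimension count must match params.vector_dimension):
#
#   {
#     "fields": [
#       {
#         "type": "vector",
#         "path": "embedding",
#         "numDimensions": 1536,
#         "similarity": "cosine"
#       }
#     ]
#   }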