From c72d340dedb6d68636e3fc240a085634e36667db Mon Sep 17 00:00:00 2001
From: momegas
Date: Mon, 6 Nov 2023 08:23:06 +0200
Subject: [PATCH] pdf

---
Review note: the pdf/route.ts hunk was amended after export (SQL literals
escaped, empty-PDF guard, upload field narrowed), so the commit id above and
the blob id on its index line no longer describe the content below.

 src/app/api/documents/pdf/route.ts | 129 +++++++++++++++++++++++++++++
 src/app/api/documents/route.ts     |  76 +-----------------
 2 files changed, 131 insertions(+), 74 deletions(-)
 create mode 100644 src/app/api/documents/pdf/route.ts

diff --git a/src/app/api/documents/pdf/route.ts b/src/app/api/documents/pdf/route.ts
new file mode 100644
index 0000000..eaa6357
--- /dev/null
+++ b/src/app/api/documents/pdf/route.ts
@@ -0,0 +1,129 @@
+import { headers } from 'next/headers';
+import { NextRequest, NextResponse } from 'next/server';
+import { authApiKey } from '@/lib/public-api/auth';
+import { WebPDFLoader } from 'langchain/document_loaders/web/pdf';
+import { OpenAIEmbeddings } from 'langchain/embeddings/openai';
+import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
+import { DocumentInsert } from '../../../../types/supabase-entities';
+import { supabaseExecute } from '../../../../lib/public-api/database';
+
+/**
+ * Renders a value as a single-quoted SQL string literal, or NULL when absent.
+ *
+ * Single quotes are doubled so text from the request or the uploaded PDF
+ * cannot close the literal early. Assumes the database treats backslashes
+ * literally (PostgreSQL's default `standard_conforming_strings`) - confirm.
+ */
+function toSqlLiteral(value: string | number | null | undefined): string {
+  if (value == null) {
+    return 'NULL';
+  }
+  return `'${String(value).replace(/'/g, "''")}'`;
+}
+
+/**
+ * Ingests an uploaded PDF into the index named by the `index_id` query
+ * parameter: the multipart `file` field is parsed, split into overlapping
+ * chunks, embedded and inserted into `documents`.
+ *
+ * Responds 401 when API-key auth fails; 400 when `index_id` or `file` is
+ * missing, the file is not a PDF, no text chunks were produced, or the insert
+ * fails; otherwise responds with the rows returned by the insert.
+ */
+export async function POST(request: NextRequest) {
+  const { data: project, error: authError } = await authApiKey(headers());
+
+  if (!project || authError) {
+    return NextResponse.json({ error: authError }, { status: 401 });
+  }
+
+  const indexId = request.nextUrl.searchParams.get('index_id');
+  if (!indexId) {
+    return NextResponse.json(
+      { error: 'Missing index_id query parameter' },
+      { status: 400 }
+    );
+  }
+
+  const formData = await request.formData();
+  const file = formData.get('file');
+
+  // A plain text field named `file` arrives as a string, not an upload.
+  if (!file || typeof file === 'string') {
+    return NextResponse.json(
+      { error: 'Missing file in request body' },
+      { status: 400 }
+    );
+  }
+  if (file.type !== 'application/pdf') {
+    return NextResponse.json({ error: 'File must be a pdf' }, { status: 400 });
+  }
+
+  const loader = new WebPDFLoader(file);
+
+  const docs = await loader.load();
+
+  const splitter = new RecursiveCharacterTextSplitter({
+    // TODO: This should be dynamic
+    chunkSize: 1000,
+    chunkOverlap: 200,
+  });
+
+  const documents = await splitter.createDocuments(
+    docs.map((doc) => doc.pageContent)
+  );
+
+  // Without chunks the INSERT below would have an empty VALUES list, which
+  // is not valid SQL, so reject the upload here instead.
+  if (documents.length === 0) {
+    return NextResponse.json(
+      { error: 'No text could be extracted from the pdf' },
+      { status: 400 }
+    );
+  }
+
+  const openAIEmbeddings = new OpenAIEmbeddings({
+    batchSize: 512, // Default value if omitted is 512. Max is 2048
+  });
+
+  const embeddings = await openAIEmbeddings.embedDocuments(
+    documents.map((doc) => doc.pageContent)
+  );
+
+  const documentInsert: DocumentInsert[] = documents.map((doc, index) => ({
+    embedding: embeddings[index] as unknown as string, // This is not right. The type generation from supabase is wrong here.
+    content: doc.pageContent,
+    metadata: doc.metadata.loc,
+    index_id: indexId,
+    source: file.name,
+  }));
+
+  // Every text column goes through toSqlLiteral because content, source and
+  // index_id all originate from the request.
+  // NOTE(review): user_id is never set on these rows, so it is written as
+  // NULL rather than the string 'undefined'. Presumably it should come from
+  // the authenticated project - confirm.
+  const rows = documentInsert.map((doc) =>
+    [
+      `'[${doc.embedding.toString()}]'`,
+      toSqlLiteral(doc.content),
+      toSqlLiteral(JSON.stringify(doc.metadata)),
+      toSqlLiteral(doc.index_id),
+      toSqlLiteral(doc.source),
+      toSqlLiteral(doc.user_id),
+    ].join(', ')
+  );
+
+  const query = `
+    INSERT INTO documents (embedding, content, metadata, index_id, source, user_id)
+    VALUES ${rows.map((row) => `(${row})`).join(',')}
+    RETURNING content, metadata, index_id, source, user_id, created_at, id;`;
+
+  const { data, error } = await supabaseExecute(query);
+
+  if (error) {
+    return NextResponse.json({ data: formData, error }, { status: 400 });
+  }
+
+  return NextResponse.json(data);
+}
diff --git a/src/app/api/documents/route.ts b/src/app/api/documents/route.ts index 493014a..c69f1b0 100644 --- a/src/app/api/documents/route.ts +++ b/src/app/api/documents/route.ts @@ -1,76 +1,10 @@ -/** - * @swagger - * /api/documents: - * get: - * summary: Get all documents from a index - * description: Returns all documents from the specified index - * tags: - * - Documents - * parameters: - * - in: query - * name: knowledge_base_id - * schema: - * type: string - * required: true - * description: The ID of the index to retrieve documents from - * responses: - * 200: - * description: Returns all documents from the specified index - * content: - * 
application/json: - * schema: - * type: array - * 400: - * description: Bad request - * content: - * application/json: - * schema: - * type: object - * properties: - * error: - * type: string - * description: The error message - * - * post: - * summary: Add documents to a index - * description: Adds new documents to the specified index - * tags: - * - Documents - * parameters: - * - in: query - * name: knowledge_base_id - * schema: - * type: string - * required: true - * description: The ID of the index to add documents to - * requestBody: - * description: The document content, source, and metadata - * required: true - * responses: - * 200: - * description: Returns the inserted document - * content: - * application/json: - * schema: - * $ref: '#/components/schemas/Document' - * 400: - * description: Bad request - * content: - * application/json: - * schema: - * type: object - * properties: - * error: - * type: string - * description: The error message - */ - import { headers } from 'next/headers'; import { NextRequest, NextResponse } from 'next/server'; import { authApiKey } from '@/lib/public-api/auth'; import { Document, DocumentInsert } from '@/types/supabase-entities'; import { supabaseExecute } from '@/lib/public-api/database'; import { OpenAIEmbeddings } from 'langchain/embeddings/openai'; +import { CreateDocumentRequestType } from '@/lib/public-api/validation'; // Get all documents from a index export async function GET(request: NextRequest) { @@ -100,12 +34,6 @@ export async function GET(request: NextRequest) { return NextResponse.json(data); } -interface DocumentPostRequest { - content: string; - source: string; - metadata: any; -} - // Add documents to a index export async function POST(request: NextRequest) { const { data: project, error: authError } = await authApiKey(headers()); @@ -122,7 +50,7 @@ export async function POST(request: NextRequest) { ); } - const documents = (await request.json()) as DocumentPostRequest[]; + const documents = (await 
request.json()) as CreateDocumentRequestType; // TODO: Validate documents if (!documents || !documents.length) {