Skip to content

Commit

Permalink
pdf
Browse files Browse the repository at this point in the history
  • Loading branch information
momegas committed Nov 6, 2023
1 parent d4eda83 commit c72d340
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 74 deletions.
87 changes: 87 additions & 0 deletions src/app/api/documents/pdf/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import { headers } from 'next/headers';
import { NextRequest, NextResponse } from 'next/server';
import { authApiKey } from '@/lib/public-api/auth';
import { WebPDFLoader } from 'langchain/document_loaders/web/pdf';
import { OpenAIEmbeddings } from 'langchain/embeddings/openai';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { DocumentInsert } from '../../../../types/supabase-entities';
import type { Document } from '../../../../types/supabase-entities';
import { supabaseExecute } from '../../../../lib/public-api/database';

/**
 * Renders a value as a PostgreSQL string literal for direct inclusion in SQL text.
 *
 * `null`/`undefined` become the SQL keyword `NULL` instead of the strings
 * `'null'`/`'undefined'`. Everything else is stringified, stripped of NUL
 * characters (PostgreSQL text values cannot contain them) and has every single
 * quote doubled, which is the standard SQL escape for string constants.
 *
 * NOTE(review): this assumes the server runs with the PostgreSQL default
 * `standard_conforming_strings = on`, so backslashes are not escape characters.
 * If `supabaseExecute` supports bound parameters, prefer those over this helper.
 */
function sqlLiteral(value: unknown): string {
  if (value === null || value === undefined) {
    return 'NULL';
  }
  const text = String(value).split('\u0000').join('').replace(/'/g, "''");
  return `'${text}'`;
}

/**
 * Uploads a PDF, splits its text into chunks, embeds every chunk with OpenAI
 * and stores the chunks as documents of the index given by `index_id`.
 *
 * Expects a multipart/form-data body with the PDF in the `file` field and the
 * target index in the `index_id` query parameter.
 *
 * Responses:
 * - 401 when API-key authentication fails
 * - 400 when `index_id` or `file` is missing, the body is not form data, the
 *   file is not a readable PDF, no text could be extracted, or the insert fails
 * - 200 with the rows returned by the insert otherwise
 *
 * @param request Incoming Next.js request.
 * @returns JSON response as described above.
 */
export async function POST(request: NextRequest) {
  const { data: project, error: authError } = await authApiKey(headers());

  if (!project || authError) {
    return NextResponse.json({ error: authError }, { status: 401 });
  }

  // NOTE(review): nothing here checks that the index belongs to the
  // authenticated project - confirm whether that is enforced elsewhere.
  const indexId = request.nextUrl.searchParams.get('index_id');
  if (!indexId) {
    return NextResponse.json(
      { error: 'Missing index_id query parameter' },
      { status: 400 }
    );
  }

  // request.formData() rejects when the body is not valid form data; report
  // that as a client error instead of letting it surface as a 500.
  let formData: FormData;
  try {
    formData = await request.formData();
  } catch {
    return NextResponse.json(
      { error: 'Request body must be multipart/form-data' },
      { status: 400 }
    );
  }

  // FormData values are either strings or files; a plain string field named
  // "file" is not an upload and must be rejected as well.
  const file = formData.get('file');
  if (!file || typeof file === 'string') {
    return NextResponse.json(
      { error: 'Missing file in request body' },
      { status: 400 }
    );
  }
  if (file.type !== 'application/pdf') {
    return NextResponse.json({ error: 'File must be a pdf' }, { status: 400 });
  }

  // The MIME type is client supplied, so the content may still not be a PDF.
  let docs: Awaited<ReturnType<WebPDFLoader['load']>>;
  try {
    docs = await new WebPDFLoader(file).load();
  } catch {
    return NextResponse.json(
      { error: 'File could not be parsed as a pdf' },
      { status: 400 }
    );
  }

  const splitter = new RecursiveCharacterTextSplitter({
    // TODO: This should be dynamic
    chunkSize: 1000,
    chunkOverlap: 200,
  });

  const documents = await splitter.createDocuments(
    docs.map((doc) => doc.pageContent)
  );

  // Without chunks the INSERT below would have an empty VALUES list (invalid
  // SQL), and the embeddings request would be wasted.
  if (documents.length === 0) {
    return NextResponse.json(
      { error: 'No text content could be extracted from the pdf' },
      { status: 400 }
    );
  }

  const openAIEmbeddings = new OpenAIEmbeddings({
    batchSize: 512, // Default value if omitted is 512. Max is 2048
  });

  const embeddings = await openAIEmbeddings.embedDocuments(
    documents.map((doc) => doc.pageContent)
  );

  const documentInsert: DocumentInsert[] = documents.map((doc, index) => ({
    // Serialised as a bracketed, comma-separated vector literal, which is a
    // genuine string and so needs no type assertion.
    embedding: `[${(embeddings[index] ?? []).join(',')}]`,
    content: doc.pageContent,
    metadata: doc.metadata.loc,
    index_id: indexId,
    source: file.name,
  }));

  // Every value is passed through sqlLiteral because the chunk text, the file
  // name and index_id are all caller controlled.
  // NOTE(review): user_id is never populated in this handler, so it is written
  // as NULL (previously the literal string 'undefined'). It presumably should
  // come from the authenticated project - confirm and set it explicitly.
  const values = documentInsert
    .map((doc) => {
      const columns = [
        sqlLiteral(doc.embedding),
        sqlLiteral(doc.content),
        sqlLiteral(JSON.stringify(doc.metadata)),
        sqlLiteral(doc.index_id),
        sqlLiteral(doc.source),
        sqlLiteral(doc.user_id),
      ];
      return `(${columns.join(', ')})`;
    })
    .join(',');

  const query = `
INSERT INTO documents (embedding, content, metadata, index_id, source, user_id)
VALUES ${values}
RETURNING content, metadata, index_id, source, user_id, created_at, id;`;

  const { data, error } = await supabaseExecute<Document>(query);

  if (error) {
    // The parsed form data is not echoed back: FormData serialises to "{}".
    return NextResponse.json({ error }, { status: 400 });
  }

  return NextResponse.json(data);
}
76 changes: 2 additions & 74 deletions src/app/api/documents/route.ts
Original file line number Diff line number Diff line change
@@ -1,76 +1,10 @@
/**
* @swagger
* /api/documents:
* get:
* summary: Get all documents from an index
* description: Returns all documents from the specified index
* tags:
* - Documents
* parameters:
* - in: query
* name: knowledge_base_id
* schema:
* type: string
* required: true
* description: The ID of the index to retrieve documents from
* responses:
* 200:
* description: Returns all documents from the specified index
* content:
* application/json:
* schema:
* type: array
* 400:
* description: Bad request
* content:
* application/json:
* schema:
* type: object
* properties:
* error:
* type: string
* description: The error message
*
* post:
* summary: Add documents to an index
* description: Adds new documents to the specified index
* tags:
* - Documents
* parameters:
* - in: query
* name: knowledge_base_id
* schema:
* type: string
* required: true
* description: The ID of the index to add documents to
* requestBody:
* description: The document content, source, and metadata
* required: true
* responses:
* 200:
* description: Returns the inserted document
* content:
* application/json:
* schema:
* $ref: '#/components/schemas/Document'
* 400:
* description: Bad request
* content:
* application/json:
* schema:
* type: object
* properties:
* error:
* type: string
* description: The error message
*/

import { headers } from 'next/headers';
import { NextRequest, NextResponse } from 'next/server';
import { authApiKey } from '@/lib/public-api/auth';
import { Document, DocumentInsert } from '@/types/supabase-entities';
import { supabaseExecute } from '@/lib/public-api/database';
import { OpenAIEmbeddings } from 'langchain/embeddings/openai';
import { CreateDocumentRequestType } from '@/lib/public-api/validation';

// Get all documents from an index
export async function GET(request: NextRequest) {
Expand Down Expand Up @@ -100,12 +34,6 @@ export async function GET(request: NextRequest) {
return NextResponse.json(data);
}

/**
 * Shape of a single document entry as sent by a client in a POST request body.
 */
interface DocumentPostRequest {
// Text content of the document.
content: string;
// Source label for the document (presumably a file name or URL - confirm with callers).
source: string;
// Free-form metadata; typed as `any`, so its shape is not checked here.
metadata: any;
}

// Add documents to an index
export async function POST(request: NextRequest) {
const { data: project, error: authError } = await authApiKey(headers());
Expand All @@ -122,7 +50,7 @@ export async function POST(request: NextRequest) {
);
}

const documents = (await request.json()) as DocumentPostRequest[];
const documents = (await request.json()) as CreateDocumentRequestType;
// TODO: Validate documents

if (!documents || !documents.length) {
Expand Down

0 comments on commit c72d340

Please sign in to comment.