From 76634fe19d47ac45213de4457dd36c363e9f6dfb Mon Sep 17 00:00:00 2001 From: alisalim17 Date: Tue, 30 Apr 2024 12:41:42 +0400 Subject: [PATCH] fix: passing metadata when semantic splitter used as well --- service/embedding.py | 2 +- service/splitter.py | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/service/embedding.py b/service/embedding.py index 50dc6b07..d3d2ac42 100644 --- a/service/embedding.py +++ b/service/embedding.py @@ -188,7 +188,7 @@ async def generate_chunks( min_split_tokens=config.splitter.min_tokens, max_split_tokens=config.splitter.max_tokens, ) - chunks = await splitter_config(elements=elements) + chunks = await splitter_config(elements=elements, file=file) if not chunks: continue diff --git a/service/splitter.py b/service/splitter.py index 2443619f..0fa89f65 100644 --- a/service/splitter.py +++ b/service/splitter.py @@ -8,6 +8,7 @@ from utils.logger import logger from utils.table_parser import TableParser +from models.file import File # TODO: Move to document processing utils, once we have @@ -125,7 +126,11 @@ def _group_elements_by_title(self, elements: list[dict[str, Any]]) -> dict: return grouped_elements async def split_grouped_elements( - self, elements: list[dict[str, Any]], splitter: RollingWindowSplitter + self, + *, + elements: list[dict[str, Any]], + file: File, + splitter: RollingWindowSplitter, ) -> list[dict[str, Any]]: grouped_elements = self._group_elements_by_title(elements) chunks_with_title = [] @@ -138,7 +143,10 @@ def _append_chunks( "title": title, "content": content, "chunk_index": chunk_index, - "metadata": metadata, + "metadata": { + **file.metadata, + **metadata, + }, } ) @@ -207,5 +215,9 @@ def _append_chunks( chunks_with_title.extend(chunks) return chunks_with_title - async def __call__(self, elements: list[dict[str, Any]]) -> list[dict[str, Any]]: - return await self.split_grouped_elements(elements, self.splitter) + async def __call__( + self, elements: list[dict[str, Any]], file: File + ) -> list[dict[str, Any]]: + return await self.split_grouped_elements( + elements=elements, file=file, splitter=self.splitter + )