From 7d42a6f5275f63670945fb0be977887199f19150 Mon Sep 17 00:00:00 2001 From: Simonas <20096648+simjak@users.noreply.github.com> Date: Tue, 5 Mar 2024 22:38:22 +0800 Subject: [PATCH] feat: Table splitter --- dev/walkthrough.ipynb | 150 +++++++++++------------------------------- models/document.py | 2 +- service/embedding.py | 15 ++++- service/ingest.py | 1 - service/splitter.py | 122 ++++++++++++++++++++++++++++------ utils/table_parser.py | 52 +++++++++++++++ 6 files changed, 206 insertions(+), 136 deletions(-) create mode 100644 utils/table_parser.py diff --git a/dev/walkthrough.ipynb b/dev/walkthrough.ipynb index d91d867d..644627f4 100644 --- a/dev/walkthrough.ipynb +++ b/dev/walkthrough.ipynb @@ -72,6 +72,45 @@ "print(response.json())" ] }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'success': True, 'index_name': 'simonas-serverless-384'}\n" + ] + } + ], + "source": [ + "# Ingest a file\n", + "url = f\"{API_URL}/api/v1/ingest\"\n", + "\n", + "payload = {\n", + " \"files\": [\n", + " {\n", + " \"name\": \"csv_chunking\",\n", + " \"url\": \"https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv\"\n", + " }\n", + " ],\n", + " \"vector_database\": {\n", + " \"type\": \"pinecone\",\n", + " \"config\": {\n", + " \"api_key\": PINECONE_API_KEY,\n", + " \"host\": PINECONE_HOST,\n", + " }\n", + " },\n", + " \"index_name\": PINECONE_INDEX,\n", + "}\n", + "\n", + "response = requests.post(url, json=payload)\n", + "\n", + "print(response.json())" + ] + }, { "cell_type": "code", "execution_count": 3, @@ -83,116 +122,7 @@ "text": [ "{\n", " \"success\": true,\n", - " \"data\": [\n", - " {\n", - " \"id\": \"75d3adef-0fec-496e-99a7-0510d9c2ed5d\",\n", - " \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n", - " \"document_id\": \"doc_fdadb486-da0e-4bc3-ada5-d583831cb112\",\n", - " \"content\": \"2 Related work\\nMore speci\\ufb01cally on document chunking methods for RAG, there are stan- dard approaches being considered such as chunking text into spans of a given token length (e.g. 128 and 256) or chunking based on sentences. Open source projects already allow simple processing of documents (e.g. Unstructured4, Lla- maindex5 or Langchain 6), without explicitly considering the table structure on which these chunking strategies are applied. Even though di\\ufb00erent approaches are available, an exhaustive evaluation of chunking applied to RAG and speci\\ufb01cally to \\ufb01nancial reporting, except for some limited chunking analysis [14,36], is non-existent. 
In our work, we compare a broad range of chunking approaches in addition to more simple ones and provide an analysis of the outcomes of di\\ufb00erent methods when asking questions about di\\ufb00erent aspects of the reports.\",\n", - " \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n", - " \"source_type\": \".pdf\",\n", - " \"chunk_index\": null,\n", - " \"title\": \"2 Related work\",\n", - " \"token_count\": null,\n", - " \"page_number\": 3,\n", - " \"metadata\": {\n", - " \"filename\": \"tmpykpa2wwh.pdf\",\n", - " \"filetype\": \"application/pdf\",\n", - " \"languages\": [\n", - " \"eng\"\n", - " ],\n", - " \"parent_id\": \"5cdbed1de9473b8856ab0befd08ff7cb\"\n", - " },\n", - " \"dense_embedding\": null\n", - " },\n", - " {\n", - " \"id\": \"58353d3f-a938-43f7-bde8-0e99125fa2f9\",\n", - " \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n", - " \"document_id\": \"doc_fdadb486-da0e-4bc3-ada5-d583831cb112\",\n", - " \"content\": \"Table 3. Chunks statistics for basic chunking elements and Unstructured elements\\nResults in table 5 show that element-based chunking strategies o\\ufb00er the best question-answering accuracy, which is consistent with page retrieval and para- graph retrieval accuracy. Lastly, our approach stands out for its e\\ufb03ciency. Not only is element-based chunking generalizable without the need to select the chunk size, but when com- pared to the aggregation results that yield the highest retrieval scores. Element- based chunking achieves the highest retrieval scores with only half the number of chunks required compared to methods that do not consider the structure of the documents (62,529 v.s. 112,155). This can reduce the indexing cost and im- prove query latency because there are only half as many vectors to index for the vectordb that stores the chunks. This underscores the e\\ufb00ectiveness of our solu- tion in optimizing the balance between performance and computational resource requirements.\",\n", - " \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n", - " \"source_type\": \".pdf\",\n", - " \"chunk_index\": null,\n", - " \"title\": \"Table 3. Chunks statistics for basic chunking elements and Unstructured elements\",\n", - " \"token_count\": null,\n", - " \"page_number\": 9,\n", - " \"metadata\": {\n", - " \"filename\": \"tmpykpa2wwh.pdf\",\n", - " \"filetype\": \"application/pdf\",\n", - " \"languages\": [\n", - " \"eng\"\n", - " ],\n", - " \"parent_id\": \"53ffedc9520f52ef2c8e4568301c8530\"\n", - " },\n", - " \"dense_embedding\": null\n", - " },\n", - " {\n", - " \"id\": \"e3caf266-27a8-4654-94ec-9b82ead3c9ce\",\n", - " \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n", - " \"document_id\": \"doc_fdadb486-da0e-4bc3-ada5-d583831cb112\",\n", - " \"content\": \"Table 3. Chunks statistics for basic chunking elements and Unstructured elements\\nRetrieval Accuracy Secondly, we evaluate the capabilities of each chunking strategy in terms of retrieval accuracy. We use the page numbers in the ground truth to calculate the page-level retrieval accuracy, and we use ROGUE [24] and BLEU [32] scores to evaluate the accuracy of paragraph-level retrieval compared to the ground truth evidence paragraphs. As shown in Table 4, when compared to Unstructured element-based chunk- ing strategies, basic chunking strategies seem to have higher page-level retrieval accuracy but lower paragraph-level accuracy on average. 
Additionally, basic chunking strategies also lack consistency between page-level and paragraph-level accuracy; higher page-level accuracy doesn\\u2019t ensure higher paragraph-level ac- curacy. For example, Base 128 has the second highest page-level accuracy but\",\n", - " \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n", - " \"source_type\": \".pdf\",\n", - " \"chunk_index\": null,\n", - " \"title\": \"Table 3. Chunks statistics for basic chunking elements and Unstructured elements\",\n", - " \"token_count\": null,\n", - " \"page_number\": 9,\n", - " \"metadata\": {\n", - " \"filename\": \"tmpykpa2wwh.pdf\",\n", - " \"filetype\": \"application/pdf\",\n", - " \"languages\": [\n", - " \"eng\"\n", - " ],\n", - " \"parent_id\": \"53ffedc9520f52ef2c8e4568301c8530\"\n", - " },\n", - " \"dense_embedding\": null\n", - " },\n", - " {\n", - " \"id\": \"14257177-480d-45cf-9759-f6e8b1bd60b5\",\n", - " \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n", - " \"document_id\": \"doc_fdadb486-da0e-4bc3-ada5-d583831cb112\",\n", - " \"content\": \"5 Discussion\\new have observed that using basic 512 chunking strategies produces results most similar to the Unstructured element-based approach, which may be due to the fact that 512 tokens share a similar length with the token size within our element-based chunks and capture a long context, but fail keep a coherent context in some cases, leaving out relevant information required for Q&A. This is further observed when considering the ROGUE and BLEU scores in table 4, where the chunk contexts for the baseline have lower scores. These \\ufb01ndings support existing research stating that the best basic chunk size varies from data to data [3]. These results show, as well, that our method adapts to di\\ufb00erent documents without tuning. Our method relies on the struc-\",\n", - " \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n", - " \"source_type\": \".pdf\",\n", - " \"chunk_index\": null,\n", - " \"title\": \"5 Discussion\",\n", - " \"token_count\": null,\n", - " \"page_number\": 11,\n", - " \"metadata\": {\n", - " \"filename\": \"tmpykpa2wwh.pdf\",\n", - " \"filetype\": \"application/pdf\",\n", - " \"languages\": [\n", - " \"eng\"\n", - " ],\n", - " \"parent_id\": \"2a6506945581218449cc497a03e8cfcd\"\n", - " },\n", - " \"dense_embedding\": null\n", - " },\n", - " {\n", - " \"id\": \"94411542-6ad8-4454-ad42-d0fbf9f5b4f9\",\n", - " \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n", - " \"document_id\": \"doc_fdadb486-da0e-4bc3-ada5-d583831cb112\",\n", - " \"content\": \"3.4 Chunking\\nThe list of elements considered are provided by the Unstructured9 open source library. From the set of processing strategies, 9 https://unstructured-io.github.io/unstructured/introduction.html#\",\n", - " \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n", - " \"source_type\": \".pdf\",\n", - " \"chunk_index\": null,\n", - " \"title\": \"3.4 Chunking\",\n", - " \"token_count\": null,\n", - " \"page_number\": 6,\n", - " \"metadata\": {\n", - " \"filename\": \"tmpykpa2wwh.pdf\",\n", - " \"filetype\": \"application/pdf\",\n", - " \"languages\": [\n", - " \"eng\"\n", - " ],\n", - " \"links\": [\n", - " \"{'text': '9https :// unstructured - io . github . io / unstructured / introduction . 
html', 'url': 'https://unstructured-io.github.io/unstructured/introduction.html#elements', 'start_index': 313}\"\n",
-    "        ],\n",
-    "        \"parent_id\": \"dac017d1d3734f5431cae57dcc72f748\"\n",
-    "      },\n",
-    "      \"dense_embedding\": null\n",
-    "    }\n",
-    "  ]\n",
+    "  \"data\": []\n",
     "}\n"
    ]
   }
diff --git a/models/document.py b/models/document.py
index dfd7b763..35d4165b 100644
--- a/models/document.py
+++ b/models/document.py
@@ -13,9 +13,9 @@ class BaseDocument(BaseModel):
 
 class BaseDocumentChunk(BaseModel):
     id: str
-    doc_url: str | None = None
     document_id: str
     content: str
+    doc_url: str | None = None
     source: str | None = None
     source_type: str | None = None
     chunk_index: int | None = None
diff --git a/service/embedding.py b/service/embedding.py
index f553af27..17b38761 100644
--- a/service/embedding.py
+++ b/service/embedding.py
@@ -81,7 +81,6 @@ async def _partition_file(
             f"Downloading and extracting elements from {file.url}, "
             f"using `{strategy}` strategy"
         )
-        print(file.suffix)
         with NamedTemporaryFile(suffix=file.suffix, delete=True) as temp_file:
             with requests.get(url=file.url) as response:
                 temp_file.write(response.content)
@@ -157,6 +156,7 @@ async def generate_chunks(
     ) -> List[BaseDocumentChunk]:
         doc_chunks = []
         for file in tqdm(self.files, desc="Generating chunks"):
+            logger.info(f"Splitting method: {config.splitter.name}")
             try:
                 chunks = []
                 if config.splitter.name == "by_title":
@@ -247,8 +247,17 @@ async def embed_batch(
     ) -> List[BaseDocumentChunk]:
         async with sem:
             try:
-                texts = [chunk.content for chunk in chunks_batch]
-                embeddings = encoder(texts)
+                chunk_texts = []
+                for chunk in chunks_batch:
+                    if not chunk:
+                        logger.warning("Empty chunk encountered")
+                        continue
+                    chunk_texts.append(chunk.content)
+
+                if not chunk_texts:
+                    logger.warning(f"No content to embed in batch {chunks_batch}")
+                    return []
+                embeddings = encoder(chunk_texts)
                 for chunk, embedding in zip(chunks_batch, embeddings):
                     chunk.dense_embedding = np.array(embedding).tolist()
                     pbar.update(len(chunks_batch))  # Update the progress bar
diff --git a/service/ingest.py b/service/ingest.py
index 9dc4b625..407342a4 100644
--- a/service/ingest.py
+++ b/service/ingest.py
@@ -14,7 +14,6 @@ async def handle_urls(
 ):
     embedding_service.files = files
     chunks = await embedding_service.generate_chunks(config=config)
-    print(chunks)
     summary_documents = await embedding_service.generate_summary_documents(
         documents=chunks
     )
diff --git a/service/splitter.py b/service/splitter.py
index b044144d..b4a9237d 100644
--- a/service/splitter.py
+++ b/service/splitter.py
@@ -1,10 +1,21 @@
 import re
 from typing import Any
 
+import tiktoken
 from colorama import Fore, Style
 from semantic_router.encoders import BaseEncoder
 from semantic_router.splitters import RollingWindowSplitter
 
+from utils.logger import logger
+from utils.table_parser import TableParser
+
+
+# TODO: Move to document processing utils, once we have
+def _tiktoken_length(text: str):
+    tokenizer = tiktoken.get_encoding("cl100k_base")
+    tokens = tokenizer.encode(text, disallowed_special=())
+    return len(tokens)
+
 
 class UnstructuredSemanticSplitter:
     def __init__(
@@ -20,8 +31,9 @@ def __init__(
             min_split_tokens=min_split_tokens,
             max_split_tokens=max_split_tokens,
         )
+        self.max_split_tokens = max_split_tokens
 
-    def is_valid_title(self, title: str) -> bool:
+    def _is_valid_title(self, title: str) -> bool:
         # Rule 1: Title starts with a lowercase letter
         if re.match(r"^[a-z]", title):
             return False
@@ -33,6 +45,67 @@ def is_valid_title(self, title: str) -> bool:
             return False
         return True
 
+    def _split_table(self, table_html: str, max_split_tokens: int) -> list[str]:
+        parser = TableParser()
+        parser.feed(table_html)
+
+        # Create the full table HTML to check if it needs splitting
+        full_table = (
+            '