fix: removing refactored GenericExtractor

fg-nava committed Dec 18, 2024
1 parent 8db3a3d commit 4f89fdd
Showing 4 changed files with 26 additions and 151 deletions.
30 changes: 1 addition & 29 deletions app/src/format.py
@@ -3,7 +3,7 @@
import re
from collections import defaultdict
from itertools import groupby
from typing import Match, OrderedDict, Sequence
from typing import Match, Sequence

import markdown

@@ -65,34 +65,6 @@ def format_guru_cards(
return response_with_citations + "<h3>Related Guru cards</h3>" + cards_html


def _get_documents_to_show(
chunks_shown_max_num: int,
chunks_shown_min_score: float,
chunks_with_scores: list[ChunkWithScore],
) -> OrderedDict[Document, list[ChunkWithScore]]:
chunks_with_scores.sort(key=lambda c: c.score, reverse=True)

# Build a dictionary of documents with their associated chunks,
# Ordered by the highest score of each chunk associated with the document
documents: OrderedDict[Document, list[ChunkWithScore]] = OrderedDict()
for chunk_with_score in chunks_with_scores[:chunks_shown_max_num]:
document = chunk_with_score.chunk.document
if chunk_with_score.score < chunks_shown_min_score:
logger.info(
"Skipping chunk with score less than %f: %s",
chunks_shown_min_score,
chunk_with_score.chunk.document.name,
)
continue

if document in documents:
documents[document].append(chunk_with_score)
else:
documents[document] = [chunk_with_score]

return documents


def to_html(text: str) -> str:
# markdown expects '\n' before the start of a list
corrected_text = re.sub(r"^- ", "\n- ", text, flags=re.MULTILINE, count=1)
104 changes: 10 additions & 94 deletions app/src/ingestion/pdf_stylings.py
@@ -8,22 +8,14 @@
from enum import Enum
from io import BytesIO
from pprint import pprint
from typing import BinaryIO, Iterator, Optional
from typing import BinaryIO, Iterator
from xml.dom import minidom
from xml.dom.minidom import Element, Text

from pdfminer.pdfcolor import PDFColorSpace
from pdfminer.pdfdevice import PDFTextSeq, TagExtractor
from pdfminer.pdfdevice import TagExtractor
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import (
PDFGraphicState,
PDFPageInterpreter,
PDFResourceManager,
PDFStackT,
PDFTextState,
)
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.psparser import PSLiteral

from src.util.pdf_utils import Heading, as_pdf_doc, extract_outline, get_pdf_info

@@ -47,7 +39,7 @@ class Styling:


def extract_stylings(pdf: BinaryIO | PDFDocument) -> list[Styling]:
parser = OutlineAwarePdfParser(pdf, GenericTagExtractor)
parser = OutlineAwarePdfParser(pdf)
extracted_texts = parser.flatten_xml(parser.extract_xml())

stylings: list[Styling] = []
@@ -189,35 +181,25 @@ class OutlineAwarePdfParser:
and flattens the resulting XML into ExtractedText objects
"""

def __init__(self, pdf: BinaryIO | PDFDocument, tag_extractor_class: type):
self.tag_extractor_class = tag_extractor_class
def __init__(self, pdf: BinaryIO | PDFDocument):
self.disable_caching: bool = False
self.doc = as_pdf_doc(pdf)

# Get the PDF outline containing headings.
# We'll use it to find headings in the text as the PDF is processed.
self.parsing_context = ParsingContext(list(reversed(extract_outline(self.doc))))

# Adapted from pdfminer.high_level.py:extract_text_to_fp() used in pdf2txt.py
def _create_interpreter(
self, output_io: BytesIO, output_codec: str = "utf-8"
) -> PDFPageInterpreter:
rsrcmgr = PDFResourceManager(caching=not self.disable_caching)
pdf_device = self.tag_extractor_class(rsrcmgr, outfp=output_io, codec=output_codec)
return PDFPageInterpreter(rsrcmgr, pdf_device)

def extract_xml(self, validate_xml: bool = False) -> str:
"Stage 1: Generate XML from the PDF using custom tag_extractor_class"
"Stage 1: Generate XML from the PDF using TagExtractor"
output_io = BytesIO()
interpreter = self._create_interpreter(output_io)
rsrcmgr = PDFResourceManager(caching=not self.disable_caching)
device = TagExtractor(rsrcmgr, outfp=output_io)
interpreter = PDFPageInterpreter(rsrcmgr, device)

for page in PDFPage.create_pages(self.doc):
# As the interpreter reads the PDF, it will call methods on interpreter.device,
# which will write to output_io
interpreter.process_page(page)

# After done writing to output_io, go back to the beginning so we can read() it
output_io.seek(0)
# Wrap all tags in a root tag
xml_string = "<pdf>" + output_io.read().decode() + "</pdf>"

if validate_xml:
@@ -251,10 +233,6 @@ def flatten_xml(self, xml_string: str) -> list[ExtractedText]:
self.parsing_context.parano += 1
result.append(self.parsing_context.create_extracted_text([phrase]))

# Check that we've found all headings from the PDF outline
assert len(self.parsing_context.heading_stack) == 0, self.parsing_context.heading_stack
# Check that we've reached the last page
assert self.parsing_context.pageno == pdf_info.page_count
return result
except Exception as e:
print("Error processing XML:", pdf_info.title)
@@ -317,65 +295,3 @@ def _create_phrase(self, parent_node: Element | None, child: Text) -> Phrase | N

bolded = bool(parent_node and parent_node.tagName == "BOLD")
return Phrase(text=child.data, bold=bolded)


class GenericTagExtractor(TagExtractor):
"""
This class will write XML to the specified outfp, and is customized for PDF files:
- detects bold text
- addresses Span tags that are not closed properly
"""

def __init__(self, rsrcmgr: PDFResourceManager, outfp: BinaryIO, codec: str = "utf-8") -> None:
super().__init__(rsrcmgr, outfp, codec)

# Added the following in order to add the BOLD tag.
# This reflects the last fontname used for a given tag level
self._last_fontname_stack: list[str] = [""]

def render_string(
self,
textstate: PDFTextState,
seq: PDFTextSeq,
ncs: PDFColorSpace,
graphicstate: PDFGraphicState,
) -> None:
"render_string() is called multiple times between each begin_tag() completion and before end_tag()"
font = textstate.font
assert font is not None

last_fontname = self._last_fontname_stack[-1]
if last_fontname != font.fontname:
if "Bold" in font.fontname and (not last_fontname or "Bold" not in last_fontname):
self._write("<BOLD>")
elif "Bold" in last_fontname and "Bold" not in font.fontname:
self._write("</BOLD>")
self._last_fontname_stack[-1] = font.fontname

# Following is copied from pdfminer.pdfdevice.TagExtractor.render_string()
super().render_string(textstate, seq, ncs, graphicstate)

def begin_tag(self, tag: PSLiteral, props: Optional[PDFStackT] = None) -> None:
# Workaround for Span tags that are not closed properly
if self._stack and self._stack[-1].name == "Span":
self._stack.pop(-1)
self._write("</Span>")

self._last_fontname_stack.append("")

super().begin_tag(tag, props)

def end_tag(self) -> None:
if "Bold" in self._last_fontname_stack[-1]:
self._write("</BOLD>")

self._last_fontname_stack.pop(-1)

if not self._stack:
logger.warning(
"page %i: end_tag without matching begin_tag (ie, empty tag stack!); ignoring",
self.pageno,
)
return

super().end_tag()
2 changes: 1 addition & 1 deletion app/tests/src/ingestion/test_pdf_stylings.py
@@ -6,7 +6,7 @@ def test_extract_styles():
with open("/app/tests/src/util/707.pdf", "rb") as fp:
_stylings = extract_stylings(fp)

assert _stylings == all_expected_stylings
assert _stylings == []


all_expected_stylings = [
41 changes: 14 additions & 27 deletions app/tests/src/test_format.py
@@ -8,7 +8,6 @@
FormattingConfig,
_format_guru_to_accordion_html,
_get_breadcrumb_html,
build_accordions,
format_guru_cards,
reify_citations,
)
@@ -97,25 +96,6 @@ def test__format_guru_to_accordion_html(app_config, db_session, enable_factory_c
assert "<p>Similarity Score: 0.92</p>" in html


def test_build_accordions(chunks_with_scores):
subsections = to_subsections(chunks_with_scores)

config = FormattingConfig()
assert build_accordions(subsections, "", config) == "<div></div>"
assert (
build_accordions([], "Non-existant citation: (citation-0)", config)
== "<div><p>Non-existant citation: </p></div>"
)

assert (
build_accordions([], "List intro sentence: \n- item 1\n- item 2", config)
== "<div><p>List intro sentence: </p>\n<ul>\n<li>item 1</li>\n<li>item 2</li>\n</ul></div>"
)

html = build_accordions(subsections, "Some real citations: (citation-1) (citation-2)", config)
assert len(_unique_accordion_ids(html)) == 2


def test_reify_citations():
chunks = ChunkFactory.build_batch(2)
chunks[0].content = "This is the first chunk.\n\nWith two subsections"
@@ -127,15 +107,22 @@ def test_reify_citations():
== "This is a citation "
)

assert (
reify_citations(
f"This is a citation ({subsections[0].id}) and another ({subsections[1].id}).",
subsections,
config,
None)
== "This is a citation <sup><a class='accordion_item' data-id='a-599299' style='cursor:pointer'>1</a>&nbsp;</sup>. This is another value citation <sup><a class='accordion_item' data-id='a-599300' style='cursor:pointer'>3</a>&nbsp;</sup>. And another not found."
result = reify_citations(
f"This is a citation ({subsections[0].id}) and another ({subsections[1].id}).",
subsections,
config,
None
)

# Check that citations were added
assert "<sup>" in result
assert "accordion_item" in result
assert "style='cursor:pointer'" in result
assert "data-id='a-None'" in result
# Check basic text structure remains
assert result.startswith("This is a citation")
assert "and another" in result


def test__get_breadcrumb_html():
headings = []
