Skip to content

Commit

Permalink
fix: PyPDFToDocument initializes documents with content and meta
Browse files Browse the repository at this point in the history
  • Loading branch information
julian-risch committed Jan 9, 2025
1 parent 167a48e commit 35334f1
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 8 deletions.
14 changes: 8 additions & 6 deletions haystack/components/converters/pypdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def from_dict(cls, data):
data["init_parameters"]["converter"] = deserialize_class_instance(custom_converter_data)
return default_from_dict(cls, data)

def _default_convert(self, reader: "PdfReader") -> Document:
def _default_convert(self, reader: "PdfReader") -> str:
texts = []
for page in reader.pages:
texts.append(
Expand All @@ -211,7 +211,7 @@ def _default_convert(self, reader: "PdfReader") -> Document:
)
)
text = "\f".join(texts)
return Document(content=text)
return text

@component.output_types(documents=List[Document])
def run(
Expand Down Expand Up @@ -246,16 +246,18 @@ def run(
continue
try:
pdf_reader = PdfReader(io.BytesIO(bytestream.data))
document = (
self._default_convert(pdf_reader) if self.converter is None else self.converter.convert(pdf_reader)
text = (
self._default_convert(pdf_reader)
if self.converter is None
else self.converter.convert(pdf_reader).content
)
except Exception as e:
logger.warning(
"Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
)
continue

if document.content is None or document.content.strip() == "":
if text is None or text.strip() == "":
logger.warning(
"PyPDFToDocument could not extract text from the file {source}. Returning an empty document.",
source=source,
Expand All @@ -270,7 +272,7 @@ def run(
)
if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
document.meta = merged_metadata
document = Document(content=text, meta=merged_metadata)
documents.append(document)

return {"documents": documents}
4 changes: 4 additions & 0 deletions releasenotes/notes/pypdf-docid-293dac08ea5f8491.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
fixes:
- |
PyPDFToDocument now creates documents with an id based on both the converted text and the metadata. Previously, the metadata was not taken into account when the id was generated.
8 changes: 6 additions & 2 deletions test/components/converters/test_pypdf_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,8 @@ def test_default_convert(self):
layout_mode_font_height_weight=1.5,
)

doc = converter._default_convert(mock_reader)
assert doc.content == "Page 1 content\fPage 2 content"
text = converter._default_convert(mock_reader)
assert text == "Page 1 content\fPage 2 content"

expected_params = {
"extraction_mode": "layout",
Expand Down Expand Up @@ -292,3 +292,7 @@ def test_run_empty_document(self, caplog, test_files_path):
output = PyPDFToDocument().run(sources=paths)
assert "PyPDFToDocument could not extract text from the file" in caplog.text
assert output["documents"][0].content == ""

# Check that meta is used when the returned document is initialized and thus when doc id is generated
assert "non_text_searchable.pdf" in output["documents"][0].meta["file_path"]
assert output["documents"][0].id != Document(content="").id

0 comments on commit 35334f1

Please sign in to comment.