diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py index de55f68471..ae46ac918e 100644 --- a/haystack/components/converters/pypdf.py +++ b/haystack/components/converters/pypdf.py @@ -196,7 +196,7 @@ def from_dict(cls, data): data["init_parameters"]["converter"] = deserialize_class_instance(custom_converter_data) return default_from_dict(cls, data) - def _default_convert(self, reader: "PdfReader") -> Document: + def _default_convert(self, reader: "PdfReader") -> str: texts = [] for page in reader.pages: texts.append( @@ -211,7 +211,7 @@ def _default_convert(self, reader: "PdfReader") -> Document: ) ) text = "\f".join(texts) - return Document(content=text) + return text @component.output_types(documents=List[Document]) def run( @@ -246,8 +246,10 @@ def run( continue try: pdf_reader = PdfReader(io.BytesIO(bytestream.data)) - document = ( - self._default_convert(pdf_reader) if self.converter is None else self.converter.convert(pdf_reader) + text = ( + self._default_convert(pdf_reader) + if self.converter is None + else self.converter.convert(pdf_reader).content ) except Exception as e: logger.warning( @@ -255,7 +257,7 @@ def run( ) continue - if document.content is None or document.content.strip() == "": + if text is None or text.strip() == "": logger.warning( "PyPDFToDocument could not extract text from the file {source}. Returning an empty document.", source=source, @@ -270,7 +272,7 @@ def run( ) if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): merged_metadata["file_path"] = os.path.basename(file_path) - document.meta = merged_metadata + document = Document(content=text, meta=merged_metadata) documents.append(document) return {"documents": documents} diff --git a/releasenotes/notes/pypdf-docid-293dac08ea5f8491.yaml b/releasenotes/notes/pypdf-docid-293dac08ea5f8491.yaml new file mode 100644 index 0000000000..f077d8b4ee --- /dev/null +++ b/releasenotes/notes/pypdf-docid-293dac08ea5f8491.yaml @@ -0,0 +1,4 @@ +--- +fixes: + - | + PyPDFToDocument now creates documents with id based on converted text and meta data. Before it didn't take the meta data into account. diff --git a/test/components/converters/test_pypdf_to_document.py b/test/components/converters/test_pypdf_to_document.py index e82b8029d4..58a9835ebc 100644 --- a/test/components/converters/test_pypdf_to_document.py +++ b/test/components/converters/test_pypdf_to_document.py @@ -169,8 +169,8 @@ def test_default_convert(self): layout_mode_font_height_weight=1.5, ) - doc = converter._default_convert(mock_reader) - assert doc.content == "Page 1 content\fPage 2 content" + text = converter._default_convert(mock_reader) + assert text == "Page 1 content\fPage 2 content" expected_params = { "extraction_mode": "layout", @@ -292,3 +292,7 @@ def test_run_empty_document(self, caplog, test_files_path): output = PyPDFToDocument().run(sources=paths) assert "PyPDFToDocument could not extract text from the file" in caplog.text assert output["documents"][0].content == "" + + # Check that meta is used when the returned document is initialized and thus when doc id is generated + assert "non_text_searchable.pdf" in output["documents"][0].meta["file_path"] + assert output["documents"][0].id != Document(content="").id