deepset-ai · Night-Quiet · Jan 23, 2025 · anakin87 · Jan 23, 2025 · anakin87
@@ -23,6 +23,8 @@
     from docx.document import Document as DocxDocument
     from docx.table import Table
     from docx.text.paragraph import Paragraph
+with LazyImport("Run 'pip install lxml'") as lxml_import:
+    from lxml.etree import _Comment
-with LazyImport("Run 'pip install lxml'") as lxml_import:
-    from lxml.etree import _Comment
+    from lxml.etree import _Comment
-with LazyImport("Run 'pip install lxml'") as lxml_import:
-    from lxml.etree import _Comment
+    from lxml.etree import _Comment
 
 
 @dataclass
@@ -119,6 +121,7 @@ def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.C
             If False, only the file name is stored.
         """
         docx_import.check()
+        lxml_import.check()
-        lxml_import.check()
-        lxml_import.check()
         self.table_format = DOCXTableFormat.from_str(table_format) if isinstance(table_format, str) else table_format
         self.store_full_path = store_full_path
 
@@ -210,6 +213,8 @@ def _extract_elements(self, document: "DocxDocument") -> List[str]:
         """
         elements = []
         for element in document.element.body:
+            if isinstance(element, _Comment):
+                continue
             if element.tag.endswith("p"):
                 paragraph = Paragraph(element, document)
                 if paragraph.contains_page_break: