Skip to content

Commit

Permalink
fix: preserve hyperlink references in Word document merge
Browse files: browse the repository at this point in the history
  • Loading branch information
Oreoxmt committed Dec 21, 2024
1 parent 8b80a1a commit 8537760
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 15 deletions.
45 changes: 34 additions & 11 deletions src/tidocs/docx_handler.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import io

from docx import Document
from docx.oxml import parse_xml
from docx.oxml.shared import qn


def merge_word_docs_with_tables(
Expand All @@ -9,7 +10,7 @@ def merge_word_docs_with_tables(
marker_text: str = "TIDOCS_REPLACE_TABLE",
) -> bytes:
"""
Merges tables from one Word document into another at specified marker locations.
Merges tables from one Word document into another at specified marker locations, preserving hyperlinks and other document relationships.
Args:
main_doc_data (bytes): The main document binary data
Expand All @@ -23,20 +24,43 @@ def merge_word_docs_with_tables(
main_doc = Document(io.BytesIO(main_doc_data))
table_doc = Document(io.BytesIO(table_doc_data))

# Create a mapping of relationship IDs between documents
rel_map = {}

# Copy hyperlink relationships from table_doc to main_doc
for rel_id, rel in table_doc.part.rels.items():
if (
rel.reltype
== "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"
):
new_rel_id = main_doc.part.relate_to(
rel._target, rel.reltype, rel.is_external
)
rel_map[rel_id] = new_rel_id

# Find all tables in the table document
tables_to_insert = {}
current_heading = None

# Associate tables with their preceding headings
for element in table_doc.element.body:
if element.tag.endswith("p"): # It's a paragraph
if element.tag.endswith("p"):
paragraph_text = element.text.strip()
if paragraph_text:
# print(paragraph_text)
current_heading = paragraph_text
elif element.tag.endswith("tbl"): # It's a table
elif element.tag.endswith("tbl"):
if current_heading:
tables_to_insert[current_heading] = element
# Deep copy the table element
table_copy = parse_xml(element.xml)

# Update relationship IDs in the copied table
# Find all hyperlinks using the proper namespace approach
for hyperlink in table_copy.xpath(".//w:hyperlink"):
old_rid = hyperlink.get(qn("r:id"))
if old_rid in rel_map:
hyperlink.set(qn("r:id"), rel_map[old_rid])

tables_to_insert[current_heading] = table_copy

# Process the main document
for paragraph in main_doc.paragraphs:
Expand All @@ -53,17 +77,16 @@ def merge_word_docs_with_tables(
return output.getvalue()


# Usage with your existing code
def merge_documents(doc_data: bytes, table_data: bytes) -> bytes:
"""
Wrapper function to merge your documents using the existing download objects
Merge two Word documents, inserting table_data into doc_data.
Args:
doc_data (bytes): Main document data from first Pandoc conversion
table_data (bytes): Table document data from second Pandoc conversion
doc_data: Main document binary data
table_data: Table document binary data
Returns:
bytes: Merged document data
Merged document binary data
"""
try:
merged_data = merge_word_docs_with_tables(doc_data, table_data)
Expand Down
8 changes: 4 additions & 4 deletions src/tidocs/merge/main_marimo.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import marimo

__generated_with = "0.9.20"
__generated_with = "0.9.28"
app = marimo.App(app_title="TiDocs - Merge Release Notes")


Expand Down Expand Up @@ -34,7 +34,7 @@ def __(is_valid_filename, md_files, mo):
mo.stop(
not is_valid_filename(md_files.value[i].name),
mo.md(
f'#### {mo.icon("ic:round-error-outline", color="darkorange", inline=True)} Invalid format.\n\nPlease upload release notes in `release-x.y.z.md` format.'
f"#### {mo.icon('ic:round-error-outline', color='darkorange', inline=True)} Invalid format.\n\nPlease upload release notes in `release-x.y.z.md` format."
)
.center()
.callout(kind="danger"),
Expand Down Expand Up @@ -229,7 +229,7 @@ def __(Pandoc, get_reference_doc, md_contents, mo, table_contents):
mo.stop(
md_doc_err.decode("utf-8") != "",
mo.md(
f'#### {mo.icon("ic:round-error-outline", color="darkorange", inline=True)} Failed to convert to Word.\n\n{md_doc_err.decode("utf-8")}'
f"#### {mo.icon('ic:round-error-outline', color='darkorange', inline=True)} Failed to convert to Word.\n\n{md_doc_err.decode('utf-8')}"
)
.center()
.callout(kind="danger"),
Expand All @@ -248,7 +248,7 @@ def __(Pandoc, get_reference_doc, md_contents, mo, table_contents):
mo.stop(
table_doc_err.decode("utf-8") != "",
mo.md(
f'####{mo.icon("ic:round-error-outline", color="darkorange", inline=True)} Failed to convert to Word.\n\n{table_doc_err.decode("utf-8")}'
f"####{mo.icon('ic:round-error-outline', color='darkorange', inline=True)} Failed to convert to Word.\n\n{table_doc_err.decode('utf-8')}"
)
.center()
.callout(kind="danger"),
Expand Down

0 comments on commit 8537760

Please sign in to comment.