Skip to content

Commit

Permalink
fix incorrect HTML table matching (#5)
Browse files Browse the repository at this point in the history
  • Loading branch information
Oreoxmt authored Dec 23, 2024
1 parent fd423e5 commit 1b4757d
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 10 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,10 @@ Use the `tidocs merge` command to access a web interface for combining multiple

## Changelog

### [1.0.7] - 2024-12-23

- Fix the issue that HTML tables are incorrectly extracted when `<table>` tags appear in code blocks or plain text and are not part of actual HTML markup.

### [1.0.6] - 2024-12-21

- Fix the issue that hyperlinks become broken after merging Word documents due to incorrect relationship reference handling. ([#2](https://github.com/Oreoxmt/tidocs/issues/2))
Expand Down
56 changes: 46 additions & 10 deletions src/tidocs/markdown_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def extract_and_mark_html_tables(content: str) -> (str, str):
Examples:
>>> test_content = "Table1\\n<table><thead>Test1</thead></table>\\n\\nTable2\\n<table><thead>Test2</thead></table>\\n\\n"
>>> modified, html_tables = extract_and_mark_html_tables(test_content)
>>> print(html_tables) # doctest: +NORMALIZE_WHITESPACE
>>> print(html_tables)
TIDOCS_REPLACE_TABLE_0
<BLANKLINE>
<table><thead>Test1</thead></table>
Expand All @@ -140,27 +140,63 @@ def extract_and_mark_html_tables(content: str) -> (str, str):
<table><thead>Test2</thead></table>
<BLANKLINE>
<BLANKLINE>
>>> print(modified) # doctest: +NORMALIZE_WHITESPACE
>>> print(modified)
Table1
TIDOCS_REPLACE_TABLE_0
<BLANKLINE>
Table2
TIDOCS_REPLACE_TABLE_1
<BLANKLINE>
<BLANKLINE>
>>> test_content = "Content <table> </table>"
>>> modified, html_tables = extract_and_mark_html_tables(test_content)
>>> modified == test_content
True
>>> len(html_tables) == 0
True
>>> test_content = "Table1\\n <table><thead>Test1</thead> </table>\\n\\nTable2\\n <table><thead>Test2</thead></table>\\n\\n"
>>> modified, html_tables = extract_and_mark_html_tables(test_content)
>>> print(html_tables)
TIDOCS_REPLACE_TABLE_0
<BLANKLINE>
<table><thead>Test1</thead> </table>
<BLANKLINE>
TIDOCS_REPLACE_TABLE_1
<BLANKLINE>
<table><thead>Test2</thead></table>
<BLANKLINE>
<BLANKLINE>
>>> print(modified)
Table1
TIDOCS_REPLACE_TABLE_0
<BLANKLINE>
Table2
TIDOCS_REPLACE_TABLE_1
<BLANKLINE>
<BLANKLINE>
"""
TABLE_MARKER_TEMPLATE = "TIDOCS_REPLACE_TABLE_{}"
table_pattern = re.compile(r"<table>.*?</table>", re.DOTALL)
# Capture whitespace between newline and table
table_pattern = re.compile(r"\n(\s*?)(<table>.*?</table>)", re.DOTALL)

# Find all tables in the content
tables = table_pattern.findall(content)
# Find all tables and their positions along with whitespace
tables = []
for match in table_pattern.finditer(content):
whitespace = match.group(1)
table = match.group(2)
tables.append((whitespace, table, match.start()))

# Initialize result containers
extracted_tables = []
modified_content = content

# Process each table
for i, table in enumerate(tables):
marker = TABLE_MARKER_TEMPLATE.format(i)
extracted_tables.append(f"{marker}\n\n{table}\n\n")
modified_content = modified_content.replace(table + "\n\n", f"{marker}\n\n")
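# Process tables from last to first; markers are still numbered by original order.
# NOTE(review): the replacement below matches by string, not by the recorded match position — confirm this is intended.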
for i, (whitespace, table, _) in enumerate(reversed(tables)):
marker = TABLE_MARKER_TEMPLATE.format(len(tables) - i - 1)
extracted_tables.insert(0, f"{marker}\n\n{table}\n\n")
# Replace the table while preserving whitespace
modified_content = modified_content.replace(
f"\n{whitespace}{table}", f"\n{whitespace}{marker}"
)

return modified_content, "".join(extracted_tables)

0 comments on commit 1b4757d

Please sign in to comment.