From 5d7dc82912a62417ce608d6f3a8d497d58eb70e1 Mon Sep 17 00:00:00 2001 From: Aolin Date: Mon, 23 Dec 2024 22:56:03 +0800 Subject: [PATCH] fix incorrect HTML table matching --- src/tidocs/markdown_handler.py | 56 ++++++++++++++++++++++++++++------ 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/src/tidocs/markdown_handler.py b/src/tidocs/markdown_handler.py index 941de2e..e241889 100644 --- a/src/tidocs/markdown_handler.py +++ b/src/tidocs/markdown_handler.py @@ -130,7 +130,7 @@ def extract_and_mark_html_tables(content: str) -> (str, str): Examples: >>> test_content = "Table1\\nTest1
\\n\\nTable2\\nTest2
\\n\\n" >>> modified, html_tables = extract_and_mark_html_tables(test_content) - >>> print(html_tables) # doctest: +NORMALIZE_WHITESPACE + >>> print(html_tables) TIDOCS_REPLACE_TABLE_0 Test1
@@ -140,27 +140,63 @@ def extract_and_mark_html_tables(content: str) -> (str, str): Test2
- >>> print(modified) # doctest: +NORMALIZE_WHITESPACE + >>> print(modified) Table1 TIDOCS_REPLACE_TABLE_0 Table2 TIDOCS_REPLACE_TABLE_1 + + + >>> test_content = "Content
" + >>> modified, html_tables = extract_and_mark_html_tables(test_content) + >>> modified == test_content + True + >>> len(html_tables) == 0 + True + >>> test_content = "Table1\\n Test1
\\n\\nTable2\\n Test2
\\n\\n" + >>> modified, html_tables = extract_and_mark_html_tables(test_content) + >>> print(html_tables) + TIDOCS_REPLACE_TABLE_0 + + Test1
+ + TIDOCS_REPLACE_TABLE_1 + + Test2
+ + + >>> print(modified) + Table1 + TIDOCS_REPLACE_TABLE_0 + + Table2 + TIDOCS_REPLACE_TABLE_1 + + """ TABLE_MARKER_TEMPLATE = "TIDOCS_REPLACE_TABLE_{}" - table_pattern = re.compile(r".*?
", re.DOTALL) + # Capture whitespace between newline and table + table_pattern = re.compile(r"\n(\s*?)(.*?
)", re.DOTALL) - # Find all tables in the content - tables = table_pattern.findall(content) + # Find all tables and their positions along with whitespace + tables = [] + for match in table_pattern.finditer(content): + whitespace = match.group(1) + table = match.group(2) + tables.append((whitespace, table, match.start())) # Initialize result containers extracted_tables = [] modified_content = content - # Process each table - for i, table in enumerate(tables): - marker = TABLE_MARKER_TEMPLATE.format(i) - extracted_tables.append(f"{marker}\n\n{table}\n\n") - modified_content = modified_content.replace(table + "\n\n", f"{marker}\n\n") + # Process each table in reverse order to maintain correct positions + for i, (whitespace, table, _) in enumerate(reversed(tables)): + marker = TABLE_MARKER_TEMPLATE.format(len(tables) - i - 1) + extracted_tables.insert(0, f"{marker}\n\n{table}\n\n") + # Replace the table while preserving whitespace + modified_content = modified_content.replace( + f"\n{whitespace}{table}", f"\n{whitespace}{marker}" + ) return modified_content, "".join(extracted_tables)