From 854da28b55c15728988197a393d5e7107cd3b0b9 Mon Sep 17 00:00:00 2001
From: Levente Hunyadi <levente.hunyadi@instructure.com>
Date: Fri, 4 Oct 2024 20:39:06 +0200
Subject: [PATCH] Add support for emojis

---
 integration_tests/.gitignore  |  1 +
 integration_tests/test_api.py |  1 +
 integration_tests/test_csf.py | 29 ++++++++++++
 md2conf/api.py                | 21 +--------
 md2conf/converter.py          | 87 ++++++++++++++++++++++++++++++++++-
 md2conf/emoji.py              | 48 +++++++++++++++++++
 md2conf/util.py               | 19 ++++++++
 sample/index.md               | 10 ++++
 tests/source/.gitignore       |  1 +
 tests/target/.gitignore       |  1 +
 tests/test_conversion.py      | 15 +++---
 tests/test_processor.py       |  2 +-
 12 files changed, 206 insertions(+), 29 deletions(-)
 create mode 100644 integration_tests/.gitignore
 create mode 100644 integration_tests/test_csf.py
 create mode 100644 md2conf/emoji.py
 create mode 100644 md2conf/util.py
 create mode 100644 tests/source/.gitignore
 create mode 100644 tests/target/.gitignore

diff --git a/integration_tests/.gitignore b/integration_tests/.gitignore
new file mode 100644
index 0000000..8b0aee8
--- /dev/null
+++ b/integration_tests/.gitignore
@@ -0,0 +1 @@
+/example.csf
diff --git a/integration_tests/test_api.py b/integration_tests/test_api.py
index 79bb56a..991a84e 100644
--- a/integration_tests/test_api.py
+++ b/integration_tests/test_api.py
@@ -26,6 +26,7 @@
 
 class TestAPI(unittest.TestCase):
     out_dir: Path
+    sample_dir: Path
 
     def setUp(self) -> None:
         test_dir = Path(__file__).parent
diff --git a/integration_tests/test_csf.py b/integration_tests/test_csf.py
new file mode 100644
index 0000000..59a2609
--- /dev/null
+++ b/integration_tests/test_csf.py
@@ -0,0 +1,29 @@
+import unittest
+from pathlib import Path
+
+from md2conf.api import ConfluenceAPI
+from md2conf.converter import content_to_string
+
+TEST_SPACE = "DAP"
+TEST_PAGE_ID = "86918529216"
+
+
+class TestConfluenceStorageFormat(unittest.TestCase):
+    test_dir: Path
+    sample_dir: Path
+
+    def setUp(self) -> None:
+        self.test_dir = Path(__file__).parent
+        self.sample_dir = self.test_dir.parent / "sample"
+
+    def test_markdown(self) -> None:
+        # dump the fetched page as readable XML for manual inspection; the output file is ignored by Git
+        with ConfluenceAPI() as api:
+            page = api.get_page(TEST_PAGE_ID, space_key=TEST_SPACE)
+
+        with open(self.test_dir / "example.csf", "w", encoding="utf-8") as f:
+            f.write(content_to_string(page.content))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/md2conf/api.py b/md2conf/api.py
index a6c24b5..a6e6124 100644
--- a/md2conf/api.py
+++ b/md2conf/api.py
@@ -2,7 +2,6 @@
 import json
 import logging
 import mimetypes
-import sys
 import typing
 from contextlib import contextmanager
 from dataclasses import dataclass
@@ -15,6 +14,7 @@
 
 from .converter import ParseError, sanitize_confluence
 from .properties import ConfluenceError, ConfluenceProperties
+from .util import removeprefix
 
 # a JSON type with possible `null` values
 JsonType = Union[
@@ -44,25 +44,6 @@ def build_url(base_url: str, query: Optional[Dict[str, str]] = None) -> str:
     return urlunparse(url_parts)
 
 
-if sys.version_info >= (3, 9):
-
-    def removeprefix(string: str, prefix: str) -> str:
-        "If the string starts with the prefix, return the string without the prefix; otherwise, return the original string."
-
-        return string.removeprefix(prefix)
-
-else:
-
-    def removeprefix(string: str, prefix: str) -> str:
-        "If the string starts with the prefix, return the string without the prefix; otherwise, return the original string."
-
-        if string.startswith(prefix):
-            prefix_len = len(prefix)
-            return string[prefix_len:]
-        else:
-            return string
-
-
 LOGGER = logging.getLogger(__name__)
 
 
diff --git a/md2conf/converter.py b/md2conf/converter.py
index 10b1bc0..22dddb2 100644
--- a/md2conf/converter.py
+++ b/md2conf/converter.py
@@ -7,9 +7,10 @@
 import re
 import sys
 import uuid
+import xml.etree.ElementTree
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Dict, List, Literal, Optional, Tuple
+from typing import Any, Dict, List, Literal, Optional, Tuple
 from urllib.parse import ParseResult, urlparse, urlunparse
 
 import lxml.etree as ET
@@ -55,6 +56,27 @@ def is_relative_url(url: str) -> bool:
     return not bool(urlparts.scheme) and not bool(urlparts.netloc)
 
 
+def emoji_generator(
+    index: str,
+    shortname: str,
+    alias: Optional[str],
+    uc: Optional[str],
+    alt: str,
+    title: Optional[str],
+    category: Optional[str],
+    options: Dict[str, Any],
+    md: markdown.Markdown,
+) -> xml.etree.ElementTree.Element:
+    "Builds a `span` element tagged with `data-emoji` whose text is the emoji characters (or `alt` if there are no code points)."
+
+    name = (alias or shortname).strip(":")
+    element = xml.etree.ElementTree.Element("span", {"data-emoji": name})
+    # `uc` is a hyphen-separated series of Unicode code points written as hexadecimal values
+    codes = [] if uc is None else [int(code, 16) for code in uc.split("-")]
+    element.text = alt if uc is None else "".join(map(chr, codes))
+    return element
+
+
 def markdown_to_html(content: str) -> str:
     return markdown.markdown(
         content,
@@ -62,11 +84,17 @@ def markdown_to_html(content: str) -> str:
             "admonition",
             "markdown.extensions.tables",
             "markdown.extensions.fenced_code",
+            "pymdownx.emoji",
             "pymdownx.magiclink",
             "pymdownx.tilde",
             "sane_lists",
             "md_in_html",
         ],
+        extension_configs={
+            "pymdownx.emoji": {
+                "emoji_generator": emoji_generator,
+            }
+        },
     )
 
 
@@ -81,6 +109,7 @@ def _elements_from_strings(dtd_path: Path, items: List[str]) -> ET._Element:
 
     parser = ET.XMLParser(
         remove_blank_text=True,
+        remove_comments=True,
         strip_cdata=False,
         load_dtd=True,
     )
@@ -678,6 +707,23 @@ def _transform_section(self, elem: ET._Element) -> ET._Element:
             AC("rich-text-body", {}, *list(elem)),
         )
 
+    def _transform_emoji(self, elem: ET._Element) -> ET._Element:
+        "Creates an `ac:emoticon` element from a `span` element that carries a `data-emoji` attribute."
+
+        # examples of the target markup:
+        # <ac:emoticon ac:name="wink" ac:emoji-shortname=":wink:" ac:emoji-id="1f609" ac:emoji-fallback="&#128521;"/>
+        # <ac:emoticon ac:name="blue-star" ac:emoji-shortname=":heavy_plus_sign:" ac:emoji-id="2795" ac:emoji-fallback="&#10133;"/>
+        # <ac:emoticon ac:name="blue-star" ac:emoji-shortname=":heavy_minus_sign:" ac:emoji-id="2796" ac:emoji-fallback="&#10134;"/>
+        ac_ns = namespaces["ac"]
+        name = elem.attrib.get("data-emoji", "")
+        attributes = {
+            # use "blue-star" as a placeholder name to ensure wiki page loads in a timely manner
+            ET.QName(ac_ns, "name"): "blue-star",
+            ET.QName(ac_ns, "emoji-shortname"): f":{name}:",
+            ET.QName(ac_ns, "emoji-fallback"): elem.text or "",
+        }
+        return AC("emoticon", attributes)
+
     def transform(self, child: ET._Element) -> Optional[ET._Element]:
         # normalize line breaks to regular space in element text
         if child.text:
@@ -764,6 +810,9 @@ def transform(self, child: ET._Element) -> Optional[ET._Element]:
         elif child.tag == "pre" and len(child) == 1 and child[0].tag == "code":
             return self._transform_block(child[0])
 
+        elif child.tag == "span" and child.attrib.has_key("data-emoji"):
+            return self._transform_emoji(child)
+
         return None
 
 
@@ -963,3 +1012,39 @@ def elements_to_string(root: ET._Element) -> str:
         return m.group(1)
     else:
         raise ValueError("expected: Confluence content")
+
+
+def _content_to_string(dtd_path: Path, content: str) -> str:
+    "Wraps content in a root element that declares the namespace prefixes, parses it with the DTD, and returns pretty-printed XML."
+
+    parser = ET.XMLParser(
+        remove_blank_text=True,
+        remove_comments=True,
+        strip_cdata=False,
+        load_dtd=True,
+    )
+
+    ns_attr_list = "".join(
+        f' xmlns:{key}="{value}"' for key, value in namespaces.items()
+    )
+    data = [
+        '<?xml version="1.0"?>',
+        f'<!DOCTYPE ac:confluence PUBLIC "-//Atlassian//Confluence 4 Page//EN" "{dtd_path}">',
+        f"<root{ns_attr_list}>",
+        content,
+        "</root>",
+    ]
+    tree = ET.fromstringlist(data, parser=parser)
+    return ET.tostring(tree, pretty_print=True).decode("utf-8")
+
+
+def content_to_string(content: str) -> str:
+    "Pretty-prints a Confluence Storage Format document returned by the API, using the `entities.dtd` file bundled with the package."
+
+    if sys.version_info < (3, 9):
+        with resources.path(__package__, "entities.dtd") as dtd_path:
+            return _content_to_string(dtd_path, content)
+    else:
+        dtd_resource = resources.files(__package__).joinpath("entities.dtd")
+        with resources.as_file(dtd_resource) as dtd_path:
+            return _content_to_string(dtd_path, content)
diff --git a/md2conf/emoji.py b/md2conf/emoji.py
new file mode 100644
index 0000000..08f9f52
--- /dev/null
+++ b/md2conf/emoji.py
@@ -0,0 +1,48 @@
+import pathlib
+
+import pymdownx.emoji1_db as emoji_db
+
+
+def generate_source(path: pathlib.Path) -> None:
+    "Generates a source Markdown document for testing emojis."
+
+    # written as UTF-8 explicitly so that the generated file does not depend on the locale encoding
+    with open(path, "w", encoding="utf-8") as f:
+        print("<!-- confluence-page-id: 86918529216 -->", file=f)
+        print("<!-- This file has been generated by a script. -->", file=f)
+        print(file=f)
+        print("## Emoji", file=f)
+        print(file=f)
+        print("| Icon | Emoji code |", file=f)
+        print("| ---- | ---------- |", file=f)
+        for key in emoji_db.emoji:
+            # one table row per emoji: the icon next to its literal code; any enclosing `:` is normalized
+            name = key.strip(":")
+            print(f"| :{name}: | `:{name}:` |", file=f)
+
+
+def generate_target(path: pathlib.Path) -> None:
+    "Generates a target Confluence Storage Format (XML) document for testing emojis."
+
+    # written as UTF-8 explicitly so that the generated file does not depend on the locale encoding
+    with open(path, "w", encoding="utf-8") as f:
+        print('<ac:structured-macro ac:name="info" ac:schema-version="1">', file=f)
+        print("<ac:rich-text-body>", file=f)
+        print("<p>This page has been generated with a tool.</p>", file=f)
+        print("</ac:rich-text-body>", file=f)
+        print("</ac:structured-macro>", file=f)
+        print("<h2>Emoji</h2>", file=f)
+        print("<table>", file=f)
+        print("<thead><tr><th>Icon</th><th>Emoji code</th></tr></thead>", file=f)
+        print("<tbody>", file=f)
+        for key, data in emoji_db.emoji.items():
+            name = key.strip(":")
+            # express each hyphen-separated hexadecimal code point as a numeric character reference
+            fallback = "".join(f"&#x{item};" for item in data["unicode"].split("-"))
+
+            print(
+                f'<tr><td><ac:emoticon ac:name="blue-star" ac:emoji-shortname=":{name}:" ac:emoji-fallback="{fallback}"/></td><td><code>:{name}:</code></td></tr>',
+                file=f,
+            )
+        print("</tbody>", file=f)
+        print("</table>", file=f)
diff --git a/md2conf/util.py b/md2conf/util.py
new file mode 100644
index 0000000..e2ebb96
--- /dev/null
+++ b/md2conf/util.py
@@ -0,0 +1,19 @@
+import sys
+
+if sys.version_info >= (3, 9):
+
+    def removeprefix(string: str, prefix: str) -> str:
+        "Returns the string with the leading prefix removed if present; otherwise, returns the string unchanged."
+
+        # delegate to the built-in method available as of Python 3.9
+        return string.removeprefix(prefix)
+
+else:
+
+    def removeprefix(string: str, prefix: str) -> str:
+        "Returns the string with the leading prefix removed if present; otherwise, returns the string unchanged."
+
+        # fallback for interpreters that lack `str.removeprefix`
+        if not string.startswith(prefix):
+            return string
+        return string[len(prefix) :]
diff --git a/sample/index.md b/sample/index.md
index c8751d9..8c48371 100644
--- a/sample/index.md
+++ b/sample/index.md
@@ -132,18 +132,28 @@ Markdown has no native support for admonitions. Admonitions that follow the [Pyt
 
 Alerts are a Markdown extension based on the blockquote syntax that you can use to emphasize critical information. [GitHub](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax#alerts) and [GitLab](https://docs.gitlab.com/ee/development/documentation/styleguide/#alert-boxes) display them with distinctive colors and icons to indicate the significance of the content. When converted to Confluence, they are represented as structured macros, which are displayed as info panels.
 
+Note:
+
 > [!NOTE]
 > Useful information that users should know, even when skimming content.
 
+Tip:
+
 > [!TIP]
 > Helpful advice for doing things better or more easily.
 
+Important:
+
 > [!IMPORTANT]
 > Key information users need to know to achieve their goal.
 
+Warning:
+
 > [!WARNING]
 > Urgent info that needs immediate user attention to avoid problems.
 
+Caution:
+
 > [!CAUTION]
 > Advises about risks or negative outcomes of certain actions.
 
diff --git a/tests/source/.gitignore b/tests/source/.gitignore
new file mode 100644
index 0000000..08f575f
--- /dev/null
+++ b/tests/source/.gitignore
@@ -0,0 +1 @@
+/emoji.md
diff --git a/tests/target/.gitignore b/tests/target/.gitignore
new file mode 100644
index 0000000..6f7b873
--- /dev/null
+++ b/tests/target/.gitignore
@@ -0,0 +1 @@
+/emoji.xml
diff --git a/tests/test_conversion.py b/tests/test_conversion.py
index 9ce64c6..c02cbdf 100644
--- a/tests/test_conversion.py
+++ b/tests/test_conversion.py
@@ -2,10 +2,10 @@
 import os
 import os.path
 import re
-import shutil
 import unittest
 from pathlib import Path
 
+import md2conf.emoji as emoji
 from md2conf.converter import (
     ConfluenceDocument,
     ConfluenceDocumentOptions,
@@ -36,21 +36,22 @@ def standardize(content: str) -> str:
 
 
 class TestConversion(unittest.TestCase):
-    out_dir: Path
+    source_dir: Path
+    target_dir: Path
 
     def setUp(self) -> None:
         self.maxDiff = None
 
         test_dir = Path(__file__).parent
-        self.out_dir = test_dir / "output"
         self.source_dir = test_dir / "source"
         self.target_dir = test_dir / "target"
-        os.makedirs(self.out_dir, exist_ok=True)
-
-    def tearDown(self) -> None:
-        shutil.rmtree(self.out_dir)
 
     def test_markdown(self) -> None:
+        if not os.path.exists(self.source_dir / "emoji.md"):
+            emoji.generate_source(self.source_dir / "emoji.md")
+        if not os.path.exists(self.target_dir / "emoji.xml"):
+            emoji.generate_target(self.target_dir / "emoji.xml")
+
         matcher = Matcher(
             MatcherOptions(source=".mdignore", extension="md"), self.source_dir
         )
diff --git a/tests/test_processor.py b/tests/test_processor.py
index 90d371a..b2682db 100644
--- a/tests/test_processor.py
+++ b/tests/test_processor.py
@@ -29,7 +29,7 @@ def setUp(self) -> None:
     def tearDown(self) -> None:
         shutil.rmtree(self.out_dir)
 
-    def atest_process_document(self) -> None:
+    def test_process_document(self) -> None:
         options = ConfluenceDocumentOptions(
             ignore_invalid_url=False,
             generated_by="Test Case",