openzim · benoit74 · Nov 19, 2024 · Nov 19, 2024 · Nov 19, 2024 · Nov 19, 2024
diff --git a/scraper/src/mindtouch2zim/client.py b/scraper/src/mindtouch2zim/client.py
@@ -40,7 +40,6 @@
     with a special call, hence the specific object
     """
 
-    encoded_url: str
     tags: list[str]
 
 
@@ -52,6 +51,7 @@
     path: str
     parent: "LibraryPage | None" = None
     children: list["LibraryPage"] = []
+    encoded_url: str
     definition: LibraryPageDefinition | None = None
 
     def __repr__(self) -> str:
@@ -251,6 +251,7 @@
             id=tree_data["page"]["@id"],
             title=tree_data["page"]["title"],
             path=tree_data["page"]["path"]["#text"],
+            encoded_url=tree_data["page"]["uri.ui"],
         )
         tree_obj = LibraryTree(root=root)
         tree_obj.pages[root.id] = root
@@ -260,6 +261,7 @@
                 id=page_node["@id"],
                 title=page_node["title"],
                 path=page_node["path"]["#text"],
+                encoded_url=page_node["uri.ui"],
                 parent=parent,
             )
             parent.children.append(page)
@@ -316,9 +318,6 @@
             raw_definition = self._get_api_json(
                 f"/pages/{page.id}", timeout=HTTP_TIMEOUT_NORMAL_SECONDS
             )
-            encoded_url = raw_definition.get("uri.ui", None)
-            if encoded_url is None:
-                raise MindtouchParsingError(f"No uri.ui property for page {page.id}")
             raw_tags = raw_definition.get("tags", None)
             if raw_tags is None:
                 raise MindtouchParsingError(f"No tags property for page {page.id}")
@@ -330,7 +329,6 @@
             else:
                 tags = [raw_tag.get("@value")]
             page.definition = LibraryPageDefinition(
-                encoded_url=encoded_url,
                 tags=tags,
             )
         return page.definition
@@ -363,7 +361,7 @@
 
     def get_cover_page_encoded_url(self, page: LibraryPage) -> str:
         """Returns the url for the book page for a given child page"""
-        return self.get_page_definition(self.get_cover_page(page)).encoded_url
+        return self.get_cover_page(page).encoded_url
 
     def get_cover_page_id(self, page: LibraryPage) -> str:
         """Returns the id for the book page for a given child page"""

diff --git a/scraper/src/mindtouch2zim/constants.py b/scraper/src/mindtouch2zim/constants.py
@@ -16,6 +16,8 @@
 HTTP_TIMEOUT_NORMAL_SECONDS = 15
 HTTP_TIMEOUT_LONG_SECONDS = 30
 
+HTML_ISSUES_WARN_ONLY = False
+
 logger = getLogger(NAME, level=logging.DEBUG, log_format=DEFAULT_FORMAT_WITH_THREADS)
 
 web_session = get_session()
diff --git a/scraper/src/mindtouch2zim/entrypoint.py b/scraper/src/mindtouch2zim/entrypoint.py
@@ -199,13 +199,6 @@
         default=os.getenv("MINDTOUCH_ZIMUI_DIST", "../zimui/dist"),
     )
 
-    parser.add_argument(
-        "--keep-cache",
-        help="Keep cache of website responses",
-        action="store_true",
-        default=False,
-    )
-
     parser.add_argument(
         "--stats-filename",
         help="Path to store the progress JSON file to.",
@@ -232,6 +225,16 @@
         dest="assets_workers",
     )
 
+    parser.add_argument(
+        "--html-issues-warn-only",
+        help="[dev] Only log a warning when unexpected HTML is encountered. Use with "
+        "caution because activating this option means that ZIM HTML will probably lead "
+        "to online resources without user noticing it.",
+        action="store_true",
+        default=False,
+        dest="html_issues_warn_only",
+    )
+
     args = parser.parse_args()
 
     logger.setLevel(level=logging.DEBUG if args.debug else logging.INFO)
@@ -269,6 +272,7 @@
             illustration_url=args.illustration_url,
             s3_url_with_credentials=args.s3_url_with_credentials,
             assets_workers=args.assets_workers,
+            html_issues_warn_only=args.html_issues_warn_only,
         ).run()
     except SystemExit:
         logger.error("Generation failed, exiting")

diff --git a/scraper/src/mindtouch2zim/html_rewriting.py b/scraper/src/mindtouch2zim/html_rewriting.py
@@ -13,6 +13,7 @@
     ZimPath,
 )
 
+import mindtouch2zim.constants
 from mindtouch2zim.client import LibraryPage
 from mindtouch2zim.constants import logger
 from mindtouch2zim.errors import UnsupportedHrefSrcError, UnsupportedTagError
@@ -24,6 +25,8 @@
 html_rules.rewrite_data_rules.clear()
 html_rules.rewrite_tag_rules.clear()
 
+rewriting_context = None
+
 
 @html_rules.rewrite_attribute()
 def rewrite_href_src_attributes(
@@ -52,9 +55,15 @@
         )
     if not new_attr_value:
         # we do not (yet) support other tags / attributes so we fail the scraper
-        raise UnsupportedHrefSrcError(
-            f"Unsupported {attr_name} encountered in {tag} tag (value: {attr_value})"
+        msg = (
+            f"Unsupported '{attr_name}' encountered in '{tag}' tag (value: "
+            f"'{attr_value}') while rewriting {rewriting_context}"
         )
+        if not mindtouch2zim.constants.HTML_ISSUES_WARN_ONLY:
+            raise UnsupportedHrefSrcError(msg)
+        else:
+            logger.warning(msg)
+            return
     return (attr_name, new_attr_value)
 
 
@@ -63,7 +72,15 @@
     """Stop scraper if unsupported tag is encountered"""
     if tag not in ["picture"]:
         return
-    raise UnsupportedTagError(f"Tag {tag} is not yet supported in this scraper")
+    msg = (
+        f"Tag {tag} is not yet supported in this scraper, found while rewriting "
+        f"{rewriting_context}"
+    )
+    if not mindtouch2zim.constants.HTML_ISSUES_WARN_ONLY:
+        raise UnsupportedTagError(msg)
+    else:
+        logger.warning(msg)
+        return
 
 
 YOUTUBE_IFRAME_RE = re.compile(r".*youtube(?:-\w+)*\.\w+\/embed\/(?P<id>.*?)(?:\?.*)*$")
@@ -84,7 +101,15 @@
         raise Exception("Expecting HtmlUrlsRewriter")
     src = get_attr_value_from(attrs=attrs, name="src")
     if not src:
-        raise UnsupportedTagError("Unsupported empty src in iframe")
+        msg = (
+            "Unsupported empty src in iframe, found while rewriting "
+            f"{rewriting_context}"
+        )
+        if not mindtouch2zim.constants.HTML_ISSUES_WARN_ONLY:
+            raise UnsupportedTagError(msg)
+        else:
+            logger.warning(msg)
+            return
     image_rewriten_url = None
     try:
         if ytb_match := YOUTUBE_IFRAME_RE.match(src):

diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py
@@ -29,6 +29,8 @@
 from zimscraperlib.zim.filesystem import validate_file_creatable
 from zimscraperlib.zim.indexing import IndexData
 
+import mindtouch2zim.constants
+import mindtouch2zim.html_rewriting
 from mindtouch2zim.asset import AssetDetails, AssetProcessor
 from mindtouch2zim.client import (
     LibraryPage,
@@ -144,6 +146,7 @@
         assets_workers: int,
         *,
         overwrite_existing_zim: bool,
+        html_issues_warn_only: bool,
     ) -> None:
         """Initializes Processor.
 
@@ -171,6 +174,8 @@
             n_jobs=assets_workers, return_as="generator_unordered", backend="threading"
         )
 
+        mindtouch2zim.constants.HTML_ISSUES_WARN_ONLY = html_issues_warn_only
+
         self.stats_items_done = 0
         # we add 1 more items to process so that progress is not 100% at the beginning
         # when we do not yet know how many items we have to process and so that we can
@@ -478,6 +483,9 @@
         Download content, rewrite HTML and add JSON to ZIM
         """
         logger.debug(f"  Fetching {page.id}")
+        mindtouch2zim.html_rewriting.rewriting_context = (
+            f"page {page.id} at {page.encoded_url}"
+        )
         page_content = self.mindtouch_client.get_page_content(page)
         url_rewriter = HtmlUrlsRewriter(
             self.mindtouch_client.library_url,

diff --git a/scraper/tests/test_html_rewriting.py b/scraper/tests/test_html_rewriting.py
@@ -14,7 +14,12 @@
 def url_rewriter() -> HtmlUrlsRewriter:
     return HtmlUrlsRewriter(
         library_url="https://www.acme.com",
-        page=LibraryPage(id="123", title="a page", path="A_Page"),
+        page=LibraryPage(
+            id="123",
+            title="a page",
+            path="A_Page",
+            encoded_url="https://www.acme.com/A_Page",
+        ),
         existing_zim_paths={
             ZimPath("www.acme.com/existing.html"),
         },

diff --git a/scraper/tests/test_processor.py b/scraper/tests/test_processor.py
@@ -5,43 +5,110 @@
 
 
 @pytest.fixture(scope="module")
-def library_tree() -> LibraryTree:
-    root = LibraryPage(id="24", title="Home page", path="")
+def dummy_encoded_url() -> str:
+    return "https://www.acme.com/A_Page"
+
+
+@pytest.fixture(scope="module")
+def library_tree(dummy_encoded_url) -> LibraryTree:
+    root = LibraryPage(
+        id="24", title="Home page", path="", encoded_url=dummy_encoded_url
+    )
     topic1 = LibraryPage(
-        id="25", title="1: First topic", path="1_First_Topic", parent=root
+        id="25",
+        title="1: First topic",
+        path="1_First_Topic",
+        parent=root,
+        encoded_url=dummy_encoded_url,
     )
     root.children.append(topic1)
-    topic1_1 = LibraryPage(id="26", title="1.1: Cloud", path="1.1_Cloud", parent=topic1)
+    topic1_1 = LibraryPage(
+        id="26",
+        title="1.1: Cloud",
+        path="1.1_Cloud",
+        parent=topic1,
+        encoded_url=dummy_encoded_url,
+    )
     topic1.children.append(topic1_1)
-    topic1_2 = LibraryPage(id="27", title="1.2: Tree", path="1.2_Tree", parent=topic1)
+    topic1_2 = LibraryPage(
+        id="27",
+        title="1.2: Tree",
+        path="1.2_Tree",
+        parent=topic1,
+        encoded_url=dummy_encoded_url,
+    )
     topic1.children.append(topic1_2)
-    topic1_3 = LibraryPage(id="28", title="1.3: Bees", path="1.3_Bees", parent=topic1)
+    topic1_3 = LibraryPage(
+        id="28",
+        title="1.3: Bees",
+        path="1.3_Bees",
+        parent=topic1,
+        encoded_url=dummy_encoded_url,
+    )
     topic1.children.append(topic1_3)
     topic2 = LibraryPage(
-        id="29", title="2: Second topic", path="2_Second_Topic", parent=root
+        id="29",
+        title="2: Second topic",
+        path="2_Second_Topic",
+        parent=root,
+        encoded_url=dummy_encoded_url,
     )
     root.children.append(topic2)
     topic2_1 = LibraryPage(
-        id="30", title="2.1: Underground", path="2.1_Underground", parent=topic2
+        id="30",
+        title="2.1: Underground",
+        path="2.1_Underground",
+        parent=topic2,
+        encoded_url=dummy_encoded_url,
     )
     topic2.children.append(topic2_1)
-    topic2_2 = LibraryPage(id="31", title="2.2: Lava", path="2.2_Lava", parent=topic2)
+    topic2_2 = LibraryPage(
+        id="31",
+        title="2.2: Lava",
+        path="2.2_Lava",
+        parent=topic2,
+        encoded_url=dummy_encoded_url,
+    )
     topic2.children.append(topic2_2)
     topic2_3 = LibraryPage(
-        id="32", title="2.3: Volcano", path="2.3_Volcano", parent=topic2
+        id="32",
+        title="2.3: Volcano",
+        path="2.3_Volcano",
+        parent=topic2,
+        encoded_url=dummy_encoded_url,
     )
     topic2.children.append(topic2_3)
     topic3 = LibraryPage(
-        id="33", title="3: Third topic", path="3_Third_Topic", parent=root
+        id="33",
+        title="3: Third topic",
+        path="3_Third_Topic",
+        parent=root,
+        encoded_url=dummy_encoded_url,
     )
     root.children.append(topic3)
     topic3_1 = LibraryPage(
-        id="34", title="3.1: Ground", path="3.1_Ground", parent=topic3
+        id="34",
+        title="3.1: Ground",
+        path="3.1_Ground",
+        parent=topic3,
+        encoded_url=dummy_encoded_url,
     )
     topic3.children.append(topic3_1)
-    topic3_2 = LibraryPage(id="35", title="3.2: Earth", path="3.2_Earth", parent=topic3)
+    topic3_2 = LibraryPage(
+        id="35",
+        title="3.2: Earth",
+        path="3.2_Earth",
+        parent=topic3,
+        encoded_url=dummy_encoded_url,
+    )
     topic3.children.append(topic3_2)
-    topic3_3 = LibraryPage(id="36", title="3.3: Sky", path="3.3_Sky", parent=topic3)
+    topic3_3 = LibraryPage(
+        id="36",
+        title="3.3: Sky",
+        path="3.3_Sky",
+        parent=topic3,
+        encoded_url=dummy_encoded_url,
+    )
     topic3.children.append(topic3_3)
     return LibraryTree(
         root=root,