Skip to content

Commit

Permalink
Merge pull request #96 from openzim/assets_reliability
Browse files Browse the repository at this point in the history
Enhance assets processing reliability and logs
  • Loading branch information
benoit74 authored Dec 6, 2024
2 parents 1399a4a + de9e0ee commit 1ef9614
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 43 deletions.
59 changes: 30 additions & 29 deletions scraper/src/mindtouch2zim/asset.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,17 @@ class HeaderData(NamedTuple):


class AssetDetails(NamedTuple):
    """Details about one asset to fetch and store in the ZIM.

    Attributes:
        asset_urls: candidate online URLs for this asset; they are tried in
            order until one succeeds
        used_by: identifiers of the documents (pages, CSS, ...) referencing
            this asset, kept for log messages
        always_fetch_online: when True, bypass any cache/optimizer and fetch
            the asset straight from its online URL
    """

    asset_urls: set[HttpUrl]
    used_by: set[str]
    always_fetch_online: bool

    @property
    def get_usage_repr(self) -> str:
        """Returns a representation of asset usage, typically for logs"""
        # Empty repr when usage is unknown so log messages stay clean
        if not self.used_by:
            return ""
        return f' used by {", ".join(self.used_by)}'


class AssetProcessor:

Expand All @@ -68,55 +76,47 @@ def process_asset(
asset_details: AssetDetails,
creator: Creator,
):
logger.debug(f"Processing asset for {asset_path}")
context.current_thread_workitem = f"processing asset {asset_path}"
self._process_asset_internal(
asset_path=asset_path, asset_details=asset_details, creator=creator
)

def _process_asset_internal(
self,
asset_path: ZimPath,
asset_details: AssetDetails,
creator: Creator,
):
for asset_url in asset_details.urls:
"""Download and add to the ZIM a given asset (image, ...)"""
for asset_url in asset_details.asset_urls:
try:
context.current_thread_workitem = (
f"asset from {asset_url.value}{asset_details.get_usage_repr}"
)
asset_content = self.get_asset_content(
asset_path=asset_path,
asset_url=asset_url,
always_fetch_online=asset_details.always_fetch_online,
)
logger.debug(
f"Adding {asset_url.value} to {asset_path.value} in the ZIM"
)
logger.debug(f"Adding asset to {asset_path.value} in the ZIM")
creator.add_item_for(
path="content/" + asset_path.value,
content=asset_content.getvalue(),
)
break # file found and added
except KnownBadAssetFailedError as exc:
logger.debug(f"Ignoring known bad asset for {asset_url.value}: {exc}")
except RequestException as exc:
logger.debug(f"Ignoring known bad asset: {exc}")
except Exception as exc:
# all other exceptions (not only RequestsException) lead to an increase
# of bad_assets_count, because we have no idea what could go wrong here
# and limit and bad assets threshold should be correct in production,
# or ignored at own user risk
with self.lock:
self.bad_assets_count += 1
log_message = (
"Exception while processing "
f"{context.current_thread_workitem}: {exc}"
)
if (
context.bad_assets_threshold >= 0
and self.bad_assets_count > context.bad_assets_threshold
):
logger.error(
f"Exception while processing asset for {asset_url.value}: "
f"{exc}"
)
logger.error(log_message)
raise OSError( # noqa: B904
f"Asset failure threshold ({context.bad_assets_threshold}) "
"reached, stopping execution"
)
else:
logger.warning(
f"Exception while processing asset for {asset_url.value}: "
f"{exc}"
)
logger.warning(log_message)

def _get_header_data_for(self, url: HttpUrl) -> HeaderData:
"""Get details from headers for a given url
Expand Down Expand Up @@ -250,7 +250,8 @@ def get_asset_content(
)
else:
logger.debug(
f"Not optimizing, unsupported mime type: {mime_type}"
f"Not optimizing, unsupported mime type: {mime_type} for "
f"{context.current_thread_workitem}"
)

return self._download_from_online(asset_url=asset_url)
Expand All @@ -262,7 +263,7 @@ def get_asset_content(
if context.bad_assets_regex and context.bad_assets_regex.findall(
asset_url.value
):
raise KnownBadAssetFailedError() from exc
raise KnownBadAssetFailedError(exc) from exc
raise

def _setup_s3(self):
Expand Down
2 changes: 2 additions & 0 deletions scraper/src/mindtouch2zim/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
NAME,
STANDARD_KNOWN_BAD_ASSETS_REGEX,
VERSION,
logger,
)

MINDTOUCH_TMP = os.getenv("MINDTOUCH_TMP")
Expand Down Expand Up @@ -128,6 +129,7 @@ def current_thread_workitem(self) -> str:
@current_thread_workitem.setter
def current_thread_workitem(self, value: str):
self._current_thread_workitem.value = value
logger.debug(f"Processing {value}")

@property
def wm_user_agent(self) -> str:
Expand Down
2 changes: 1 addition & 1 deletion scraper/src/mindtouch2zim/html_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def rewrite_href_src_srcset_attributes(
new_attr_value = ""
logger.warning(
f"Unsupported '{attr_name}' encountered in '{tag}' tag (value: "
f"'{attr_value}') while {context.current_thread_workitem}"
f"'{attr_value}') while rewriting {context.current_thread_workitem}"
)
return (attr_name, new_attr_value)

Expand Down
44 changes: 31 additions & 13 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,16 @@ def run(self) -> Path:
Returns the path to the generated ZIM.
"""
try:
return self._run_internal()
except Exception:
logger.error(
f"Problem encountered while processing "
f"{context.current_thread_workitem}"
)
raise

def _run_internal(self) -> Path:
logger.setLevel(level=logging.DEBUG if context.debug else logging.INFO)

self.zim_config = ZimConfig(
Expand Down Expand Up @@ -273,7 +282,7 @@ def run(self) -> Path:

def run_with_creator(self, creator: Creator):

context.current_thread_workitem = "Storing standard files"
context.current_thread_workitem = "standard files"

logger.info(" Storing configuration...")
creator.add_item_for(
Expand Down Expand Up @@ -357,7 +366,7 @@ def run_with_creator(self, creator: Creator):
)

logger.info("Fetching pages tree")
context.current_thread_workitem = "Fetching pages tree"
context.current_thread_workitem = "pages tree"
pages_tree = self.mindtouch_client.get_page_tree()
selected_pages = self.content_filter.filter(pages_tree)
logger.info(
Expand All @@ -377,7 +386,7 @@ def run_with_creator(self, creator: Creator):
)

logger.info("Fetching pages content")
context.current_thread_workitem = "Fetching pages content"
context.current_thread_workitem = "pages content"
# compute the list of existing pages to properly rewrite links leading
# in-ZIM / out-of-ZIM
self.stats_items_total += len(selected_pages)
Expand Down Expand Up @@ -416,7 +425,7 @@ def run_with_creator(self, creator: Creator):
del private_pages

logger.info(f" Retrieving {len(self.items_to_download)} assets...")
context.current_thread_workitem = "Processing assets"
context.current_thread_workitem = "assets"
self.stats_items_total += len(self.items_to_download)

res = self.asset_executor(
Expand Down Expand Up @@ -445,6 +454,7 @@ def _process_css(
"""Process a given CSS stylesheet
Download content if necessary, rewrite CSS and add CSS to ZIM
"""
context.current_thread_workitem = f"CSS at {css_location}"
if not css_location:
raise ValueError(f"Cannot process empty css_location for {target_filename}")
if not css_content:
Expand All @@ -465,10 +475,15 @@ def _process_css(
# to use last URL encountered.
for path, urls in url_rewriter.items_to_download.items():
if path in self.items_to_download:
self.items_to_download[path].urls.update(urls)
self.items_to_download[path].asset_urls.update(urls)
self.items_to_download[path].used_by.add(
context.current_thread_workitem
)
else:
self.items_to_download[path] = AssetDetails(
urls=urls, always_fetch_online=True
asset_urls=urls,
used_by={context.current_thread_workitem},
always_fetch_online=True,
)
creator.add_item_for(f"content/{target_filename}", content=result)

Expand All @@ -484,10 +499,7 @@ def _process_page(
"""Process a given library page
Download content, rewrite HTML and add JSON to ZIM
"""
logger.debug(f" Fetching {page.id}")
context.current_thread_workitem = (
f"processing page {page.id} at {page.encoded_url}"
)
context.current_thread_workitem = f"page ID {page.id} ({page.encoded_url})"
page_content = self.mindtouch_client.get_page_content(page)
url_rewriter = HtmlUrlsRewriter(
context.library_url,
Expand Down Expand Up @@ -528,19 +540,25 @@ def _process_page(
# and since these pages are absolutely not essential, we just display a
# warning and store an empty page
logger.warning(
f"Problem processing special page ID {page.id} ({page.encoded_url})"
f"Problem processing special {context.current_thread_workitem}"
f", page is probably empty, storing empty page: {exc}"
)
return ""
if not rewriten:
# Default rewriting for 'normal' pages
rewriten = rewriter.rewrite(page_content.html_body).content
for path, urls in url_rewriter.items_to_download.items():

if path in self.items_to_download:
self.items_to_download[path].urls.update(urls)
self.items_to_download[path].asset_urls.update(urls)
self.items_to_download[path].used_by.add(
context.current_thread_workitem
)
else:
self.items_to_download[path] = AssetDetails(
urls=urls, always_fetch_online=False
asset_urls=urls,
used_by={context.current_thread_workitem},
always_fetch_online=False,
)
creator.add_item_for(
f"content/page_content_{page.id}.json",
Expand Down

0 comments on commit 1ef9614

Please sign in to comment.