diff --git a/scraper/src/mindtouch2zim/asset.py b/scraper/src/mindtouch2zim/asset.py
index 1bc14b4..9111295 100644
--- a/scraper/src/mindtouch2zim/asset.py
+++ b/scraper/src/mindtouch2zim/asset.py
@@ -49,9 +49,17 @@ class HeaderData(NamedTuple):
 
 
 class AssetDetails(NamedTuple):
-    urls: set[HttpUrl]
+    asset_urls: set[HttpUrl]
+    used_by: set[str]
     always_fetch_online: bool
 
+    @property
+    def get_usage_repr(self) -> str:
+        """Returns a representation of asset usage, typically for logs"""
+        if len(self.used_by) == 0:
+            return ""
+        return f' used by {", ".join(self.used_by)}'
+
 
 class AssetProcessor:
 
@@ -68,55 +76,47 @@ def process_asset(
         asset_details: AssetDetails,
         creator: Creator,
     ):
-        logger.debug(f"Processing asset for {asset_path}")
-        context.current_thread_workitem = f"processing asset {asset_path}"
-        self._process_asset_internal(
-            asset_path=asset_path, asset_details=asset_details, creator=creator
-        )
-
-    def _process_asset_internal(
-        self,
-        asset_path: ZimPath,
-        asset_details: AssetDetails,
-        creator: Creator,
-    ):
-        for asset_url in asset_details.urls:
+        """Download and add to the ZIM a given asset (image, ...)"""
+        for asset_url in asset_details.asset_urls:
             try:
+                context.current_thread_workitem = (
+                    f"asset from {asset_url.value}{asset_details.get_usage_repr}"
+                )
                 asset_content = self.get_asset_content(
                     asset_path=asset_path,
                     asset_url=asset_url,
                     always_fetch_online=asset_details.always_fetch_online,
                 )
-                logger.debug(
-                    f"Adding {asset_url.value} to {asset_path.value} in the ZIM"
-                )
+                logger.debug(f"Adding asset to {asset_path.value} in the ZIM")
                 creator.add_item_for(
                     path="content/" + asset_path.value,
                     content=asset_content.getvalue(),
                 )
                 break  # file found and added
             except KnownBadAssetFailedError as exc:
-                logger.debug(f"Ignoring known bad asset for {asset_url.value}: {exc}")
-            except RequestException as exc:
+                logger.debug(f"Ignoring known bad asset: {exc}")
+            except Exception as exc:
+                # all other exceptions (not only RequestsException) lead to an increase
+                # of bad_assets_count, because we have no idea what could go wrong here
+                # and limit and bad assets threshold should be correct in production,
+                # or ignored at own user risk
                 with self.lock:
                     self.bad_assets_count += 1
+                    log_message = (
+                        "Exception while processing "
+                        f"{context.current_thread_workitem}: {exc}"
+                    )
                     if (
                         context.bad_assets_threshold >= 0
                         and self.bad_assets_count > context.bad_assets_threshold
                     ):
-                        logger.error(
-                            f"Exception while processing asset for {asset_url.value}: "
-                            f"{exc}"
-                        )
+                        logger.error(log_message)
                         raise OSError(  # noqa: B904
                             f"Asset failure threshold ({context.bad_assets_threshold}) "
                             "reached, stopping execution"
                         )
                     else:
-                        logger.warning(
-                            f"Exception while processing asset for {asset_url.value}: "
-                            f"{exc}"
-                        )
+                        logger.warning(log_message)
 
     def _get_header_data_for(self, url: HttpUrl) -> HeaderData:
         """Get details from headers for a given url
@@ -250,7 +250,8 @@ def get_asset_content(
                     )
                 else:
                     logger.debug(
-                        f"Not optimizing, unsupported mime type: {mime_type}"
+                        f"Not optimizing, unsupported mime type: {mime_type} for "
+                        f"{context.current_thread_workitem}"
                     )
 
             return self._download_from_online(asset_url=asset_url)
@@ -262,7 +263,7 @@ def get_asset_content(
             if context.bad_assets_regex and context.bad_assets_regex.findall(
                 asset_url.value
             ):
-                raise KnownBadAssetFailedError() from exc
+                raise KnownBadAssetFailedError(exc) from exc
             raise
 
     def _setup_s3(self):
diff --git a/scraper/src/mindtouch2zim/context.py b/scraper/src/mindtouch2zim/context.py
index f4f5529..ceb4501 100644
--- a/scraper/src/mindtouch2zim/context.py
+++ b/scraper/src/mindtouch2zim/context.py
@@ -12,6 +12,7 @@
     NAME,
     STANDARD_KNOWN_BAD_ASSETS_REGEX,
     VERSION,
+    logger,
 )
 
 MINDTOUCH_TMP = os.getenv("MINDTOUCH_TMP")
@@ -128,6 +129,7 @@ def current_thread_workitem(self) -> str:
     @current_thread_workitem.setter
     def current_thread_workitem(self, value: str):
         self._current_thread_workitem.value = value
+        logger.debug(f"Processing {value}")
 
     @property
     def wm_user_agent(self) -> str:
diff --git a/scraper/src/mindtouch2zim/html_rewriting.py b/scraper/src/mindtouch2zim/html_rewriting.py
index e0be9df..7ae904c 100644
--- a/scraper/src/mindtouch2zim/html_rewriting.py
+++ b/scraper/src/mindtouch2zim/html_rewriting.py
@@ -58,7 +58,7 @@ def rewrite_href_src_srcset_attributes(
             new_attr_value = ""
             logger.warning(
                 f"Unsupported '{attr_name}' encountered in '{tag}' tag (value: "
-                f"'{attr_value}') while {context.current_thread_workitem}"
+                f"'{attr_value}') while rewriting {context.current_thread_workitem}"
             )
         return (attr_name, new_attr_value)
 
diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py
index 8f8128d..bd5a351 100644
--- a/scraper/src/mindtouch2zim/processor.py
+++ b/scraper/src/mindtouch2zim/processor.py
@@ -143,7 +143,16 @@ def run(self) -> Path:
 
         Returns the path to the gernated ZIM.
         """
+        try:
+            return self._run_internal()
+        except:
+            logger.error(
+                f"Problem encountered while processing "
+                f"{context.current_thread_workitem}"
+            )
+            raise
+
+    def _run_internal(self) -> Path:
         logger.setLevel(level=logging.DEBUG if context.debug else logging.INFO)
 
         self.zim_config = ZimConfig(
@@ -273,7 +282,7 @@ def run(self) -> Path:
 
     def run_with_creator(self, creator: Creator):
 
-        context.current_thread_workitem = "Storing standard files"
+        context.current_thread_workitem = "standard files"
 
         logger.info("  Storing configuration...")
         creator.add_item_for(
@@ -357,7 +366,7 @@ def run_with_creator(self, creator: Creator):
         )
 
         logger.info("Fetching pages tree")
-        context.current_thread_workitem = "Fetching pages tree"
+        context.current_thread_workitem = "pages tree"
         pages_tree = self.mindtouch_client.get_page_tree()
         selected_pages = self.content_filter.filter(pages_tree)
         logger.info(
@@ -377,7 +386,7 @@ def run_with_creator(self, creator: Creator):
         )
 
         logger.info("Fetching pages content")
-        context.current_thread_workitem = "Fetching pages content"
+        context.current_thread_workitem = "pages content"
         # compute the list of existing pages to properly rewrite links leading
         # in-ZIM / out-of-ZIM
         self.stats_items_total += len(selected_pages)
@@ -416,7 +425,7 @@ def run_with_creator(self, creator: Creator):
         del private_pages
 
         logger.info(f"  Retrieving {len(self.items_to_download)} assets...")
-        context.current_thread_workitem = "Processing assets"
+        context.current_thread_workitem = "assets"
        self.stats_items_total += len(self.items_to_download)
 
         res = self.asset_executor(
@@ -445,6 +454,7 @@ def _process_css(
         """Process a given CSS stylesheet
         Download content if necessary, rewrite CSS and add CSS to ZIM
         """
+        context.current_thread_workitem = f"CSS at {css_location}"
         if not css_location:
             raise ValueError(f"Cannot process empty css_location for {target_filename}")
         if not css_content:
@@ -465,10 +475,15 @@ def _process_css(
         # to use last URL encountered.
         for path, urls in url_rewriter.items_to_download.items():
             if path in self.items_to_download:
-                self.items_to_download[path].urls.update(urls)
+                self.items_to_download[path].asset_urls.update(urls)
+                self.items_to_download[path].used_by.add(
+                    context.current_thread_workitem
+                )
             else:
                 self.items_to_download[path] = AssetDetails(
-                    urls=urls, always_fetch_online=True
+                    asset_urls=urls,
+                    used_by={context.current_thread_workitem},
+                    always_fetch_online=True,
                 )
 
         creator.add_item_for(f"content/{target_filename}", content=result)
@@ -484,10 +499,7 @@ def _process_page(
         """Process a given library page
         Download content, rewrite HTML and add JSON to ZIM
         """
-        logger.debug(f"  Fetching {page.id}")
-        context.current_thread_workitem = (
-            f"processing page {page.id} at {page.encoded_url}"
-        )
+        context.current_thread_workitem = f"page ID {page.id} ({page.encoded_url})"
         page_content = self.mindtouch_client.get_page_content(page)
         url_rewriter = HtmlUrlsRewriter(
             context.library_url,
@@ -528,7 +540,7 @@ def _process_page(
             # and since these pages are absolutely not essential, we just display a
             # warning and store an empty page
             logger.warning(
-                f"Problem processing special page ID {page.id} ({page.encoded_url})"
+                f"Problem processing special {context.current_thread_workitem}"
                 f", page is probably empty, storing empty page: {exc}"
            )
             return ""
@@ -536,11 +548,17 @@ def _process_page(
         # Default rewriting for 'normal' pages
         rewriten = rewriter.rewrite(page_content.html_body).content
         for path, urls in url_rewriter.items_to_download.items():
+
             if path in self.items_to_download:
-                self.items_to_download[path].urls.update(urls)
+                self.items_to_download[path].asset_urls.update(urls)
+                self.items_to_download[path].used_by.add(
+                    context.current_thread_workitem
+                )
             else:
                 self.items_to_download[path] = AssetDetails(
-                    urls=urls, always_fetch_online=False
+                    asset_urls=urls,
+                    used_by={context.current_thread_workitem},
+                    always_fetch_online=False,
                 )
         creator.add_item_for(
             f"content/page_content_{page.id}.json",