Skip to content

Commit

Permalink
Merge pull request #96 from openzim/assets_reliability
Browse files Browse the repository at this point in the history
Enhance assets processing reliability and logs
  • Loading branch information
benoit74 authored Dec 6, 2024
2 parents 1399a4a + de9e0ee commit 1ef9614
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 43 deletions.
59 changes: 30 additions & 29 deletions scraper/src/mindtouch2zim/asset.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,17 @@ class HeaderData(NamedTuple):


class AssetDetails(NamedTuple):
    """Details about one asset to fetch and store in the ZIM.

    Attributes:
        asset_urls: candidate online URLs for this asset; they are tried in
            order until one succeeds
        used_by: identifiers of the documents (pages, CSS, ...) referencing
            this asset, kept for log messages
        always_fetch_online: when True, bypass any cache/optimizer and fetch
            the asset straight from its online URL
    """

    asset_urls: set[HttpUrl]
    used_by: set[str]
    always_fetch_online: bool

    @property
    def get_usage_repr(self) -> str:
        """Returns a representation of asset usage, typically for logs"""
        # Empty repr when usage is unknown so log messages stay clean
        if not self.used_by:
            return ""
        return f' used by {", ".join(self.used_by)}'


class AssetProcessor:

Expand All @@ -68,55 +76,47 @@ def process_asset(
asset_details: AssetDetails,
creator: Creator,
):
logger.debug(f"Processing asset for {asset_path}")
context.current_thread_workitem = f"processing asset {asset_path}"
self._process_asset_internal(
asset_path=asset_path, asset_details=asset_details, creator=creator
)

def _process_asset_internal(
self,
asset_path: ZimPath,
asset_details: AssetDetails,
creator: Creator,
):
for asset_url in asset_details.urls:
"""Download and add to the ZIM a given asset (image, ...)"""
for asset_url in asset_details.asset_urls:
try:
context.current_thread_workitem = (
f"asset from {asset_url.value}{asset_details.get_usage_repr}"
)
asset_content = self.get_asset_content(
asset_path=asset_path,
asset_url=asset_url,
always_fetch_online=asset_details.always_fetch_online,
)
logger.debug(
f"Adding {asset_url.value} to {asset_path.value} in the ZIM"
)
logger.debug(f"Adding asset to {asset_path.value} in the ZIM")
creator.add_item_for(
path="content/" + asset_path.value,
content=asset_content.getvalue(),
)
break # file found and added
except KnownBadAssetFailedError as exc:
logger.debug(f"Ignoring known bad asset for {asset_url.value}: {exc}")
except RequestException as exc:
logger.debug(f"Ignoring known bad asset: {exc}")
except Exception as exc:
# all other exceptions (not only RequestsException) lead to an increase
# of bad_assets_count, because we have no idea what could go wrong here
# and limit and bad assets threshold should be correct in production,
# or ignored at own user risk
with self.lock:
self.bad_assets_count += 1
log_message = (
"Exception while processing "
f"{context.current_thread_workitem}: {exc}"
)
if (
context.bad_assets_threshold >= 0
and self.bad_assets_count > context.bad_assets_threshold
):
logger.error(
f"Exception while processing asset for {asset_url.value}: "
f"{exc}"
)
logger.error(log_message)
raise OSError( # noqa: B904
f"Asset failure threshold ({context.bad_assets_threshold}) "
"reached, stopping execution"
)
else:
logger.warning(
f"Exception while processing asset for {asset_url.value}: "
f"{exc}"
)
logger.warning(log_message)

def _get_header_data_for(self, url: HttpUrl) -> HeaderData:
"""Get details from headers for a given url
Expand Down Expand Up @@ -250,7 +250,8 @@ def get_asset_content(
)
else:
logger.debug(
f"Not optimizing, unsupported mime type: {mime_type}"
f"Not optimizing, unsupported mime type: {mime_type} for "
f"{context.current_thread_workitem}"
)

return self._download_from_online(asset_url=asset_url)
Expand All @@ -262,7 +263,7 @@ def get_asset_content(
if context.bad_assets_regex and context.bad_assets_regex.findall(
asset_url.value
):
raise KnownBadAssetFailedError() from exc
raise KnownBadAssetFailedError(exc) from exc
raise

def _setup_s3(self):
Expand Down
2 changes: 2 additions & 0 deletions scraper/src/mindtouch2zim/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
NAME,
STANDARD_KNOWN_BAD_ASSETS_REGEX,
VERSION,
logger,
)

MINDTOUCH_TMP = os.getenv("MINDTOUCH_TMP")
Expand Down Expand Up @@ -128,6 +129,7 @@ def current_thread_workitem(self) -> str:
@current_thread_workitem.setter
def current_thread_workitem(self, value: str):
self._current_thread_workitem.value = value
logger.debug(f"Processing {value}")

@property
def wm_user_agent(self) -> str:
Expand Down
2 changes: 1 addition & 1 deletion scraper/src/mindtouch2zim/html_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def rewrite_href_src_srcset_attributes(
new_attr_value = ""
logger.warning(
f"Unsupported '{attr_name}' encountered in '{tag}' tag (value: "
f"'{attr_value}') while {context.current_thread_workitem}"
f"'{attr_value}') while rewriting {context.current_thread_workitem}"
)
return (attr_name, new_attr_value)

Expand Down
44 changes: 31 additions & 13 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,16 @@ def run(self) -> Path:
Returns the path to the generated ZIM.
"""
try:
return self._run_internal()
except Exception:
logger.error(
f"Problem encountered while processing "
f"{context.current_thread_workitem}"
)
raise

def _run_internal(self) -> Path:
logger.setLevel(level=logging.DEBUG if context.debug else logging.INFO)

self.zim_config = ZimConfig(
Expand Down Expand Up @@ -273,7 +282,7 @@ def run(self) -> Path:

def run_with_creator(self, creator: Creator):

context.current_thread_workitem = "Storing standard files"
context.current_thread_workitem = "standard files"

logger.info(" Storing configuration...")
creator.add_item_for(
Expand Down Expand Up @@ -357,7 +366,7 @@ def run_with_creator(self, creator: Creator):
)

logger.info("Fetching pages tree")
context.current_thread_workitem = "Fetching pages tree"
context.current_thread_workitem = "pages tree"
pages_tree = self.mindtouch_client.get_page_tree()
selected_pages = self.content_filter.filter(pages_tree)
logger.info(
Expand All @@ -377,7 +386,7 @@ def run_with_creator(self, creator: Creator):
)

logger.info("Fetching pages content")
context.current_thread_workitem = "Fetching pages content"
context.current_thread_workitem = "pages content"
# compute the list of existing pages to properly rewrite links leading
# in-ZIM / out-of-ZIM
self.stats_items_total += len(selected_pages)
Expand Down Expand Up @@ -416,7 +425,7 @@ def run_with_creator(self, creator: Creator):
del private_pages

logger.info(f" Retrieving {len(self.items_to_download)} assets...")
context.current_thread_workitem = "Processing assets"
context.current_thread_workitem = "assets"
self.stats_items_total += len(self.items_to_download)

res = self.asset_executor(
Expand Down Expand Up @@ -445,6 +454,7 @@ def _process_css(
"""Process a given CSS stylesheet
Download content if necessary, rewrite CSS and add CSS to ZIM
"""
context.current_thread_workitem = f"CSS at {css_location}"
if not css_location:
raise ValueError(f"Cannot process empty css_location for {target_filename}")
if not css_content:
Expand All @@ -465,10 +475,15 @@ def _process_css(
# to use last URL encountered.
for path, urls in url_rewriter.items_to_download.items():
if path in self.items_to_download:
self.items_to_download[path].urls.update(urls)
self.items_to_download[path].asset_urls.update(urls)
self.items_to_download[path].used_by.add(
context.current_thread_workitem
)
else:
self.items_to_download[path] = AssetDetails(
urls=urls, always_fetch_online=True
asset_urls=urls,
used_by={context.current_thread_workitem},
always_fetch_online=True,
)
creator.add_item_for(f"content/{target_filename}", content=result)

Expand All @@ -484,10 +499,7 @@ def _process_page(
"""Process a given library page
Download content, rewrite HTML and add JSON to ZIM
"""
logger.debug(f" Fetching {page.id}")
context.current_thread_workitem = (
f"processing page {page.id} at {page.encoded_url}"
)
context.current_thread_workitem = f"page ID {page.id} ({page.encoded_url})"
page_content = self.mindtouch_client.get_page_content(page)
url_rewriter = HtmlUrlsRewriter(
context.library_url,
Expand Down Expand Up @@ -528,19 +540,25 @@ def _process_page(
# and since these pages are absolutely not essential, we just display a
# warning and store an empty page
logger.warning(
f"Problem processing special page ID {page.id} ({page.encoded_url})"
f"Problem processing special {context.current_thread_workitem}"
f", page is probably empty, storing empty page: {exc}"
)
return ""
if not rewriten:
# Default rewriting for 'normal' pages
rewriten = rewriter.rewrite(page_content.html_body).content
for path, urls in url_rewriter.items_to_download.items():

if path in self.items_to_download:
self.items_to_download[path].urls.update(urls)
self.items_to_download[path].asset_urls.update(urls)
self.items_to_download[path].used_by.add(
context.current_thread_workitem
)
else:
self.items_to_download[path] = AssetDetails(
urls=urls, always_fetch_online=False
asset_urls=urls,
used_by={context.current_thread_workitem},
always_fetch_online=False,
)
creator.add_item_for(
f"content/page_content_{page.id}.json",
Expand Down

0 comments on commit 1ef9614

Please sign in to comment.