Skip to content

Commit

Permalink
Catch all exception on bad asset processing, and log full details abo…
Browse files Browse the repository at this point in the history
…ut asset
  • Loading branch information
benoit74 committed Dec 6, 2024
1 parent 1399a4a commit d935c41
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 43 deletions.
59 changes: 30 additions & 29 deletions scraper/src/mindtouch2zim/asset.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,17 @@ class HeaderData(NamedTuple):


class AssetDetails(NamedTuple):
urls: set[HttpUrl]
asset_urls: set[HttpUrl]
used_by: set[str]
always_fetch_online: bool

@property
def get_usage_repr(self) -> str:
"""Returns a representation of asset usage, typically for logs"""
if len(self.used_by) == 0:
return ""
return f' used by {", ".join(self.used_by)}'

Check warning on line 61 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L60-L61

Added lines #L60 - L61 were not covered by tests


class AssetProcessor:

Expand All @@ -68,55 +76,47 @@ def process_asset(
asset_details: AssetDetails,
creator: Creator,
):
logger.debug(f"Processing asset for {asset_path}")
context.current_thread_workitem = f"processing asset {asset_path}"
self._process_asset_internal(
asset_path=asset_path, asset_details=asset_details, creator=creator
)

def _process_asset_internal(
self,
asset_path: ZimPath,
asset_details: AssetDetails,
creator: Creator,
):
for asset_url in asset_details.urls:
"""Download and add to the ZIM a given asset (image, ...)"""
for asset_url in asset_details.asset_urls:
try:
context.current_thread_workitem = (

Check warning on line 82 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L82

Added line #L82 was not covered by tests
f"asset from {asset_url.value}{asset_details.get_usage_repr}"
)
asset_content = self.get_asset_content(
asset_path=asset_path,
asset_url=asset_url,
always_fetch_online=asset_details.always_fetch_online,
)
logger.debug(
f"Adding {asset_url.value} to {asset_path.value} in the ZIM"
)
logger.debug(f"Adding asset to {asset_path.value} in the ZIM")

Check warning on line 90 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L90

Added line #L90 was not covered by tests
creator.add_item_for(
path="content/" + asset_path.value,
content=asset_content.getvalue(),
)
break # file found and added
except KnownBadAssetFailedError as exc:
logger.debug(f"Ignoring known bad asset for {asset_url.value}: {exc}")
except RequestException as exc:
logger.debug(f"Ignoring known bad asset: {exc}")
except Exception as exc:

Check warning on line 98 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L97-L98

Added lines #L97 - L98 were not covered by tests
# all other exceptions (not only RequestsException) lead to an increase
# of bad_assets_count, because we have no idea what could go wrong here
# and limit and bad assets threshold should be correct in production,
# or ignored at own user risk
with self.lock:
self.bad_assets_count += 1
log_message = (

Check warning on line 105 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L105

Added line #L105 was not covered by tests
"Exception while processing "
f"{context.current_thread_workitem}: {exc}"
)
if (
context.bad_assets_threshold >= 0
and self.bad_assets_count > context.bad_assets_threshold
):
logger.error(
f"Exception while processing asset for {asset_url.value}: "
f"{exc}"
)
logger.error(log_message)

Check warning on line 113 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L113

Added line #L113 was not covered by tests
raise OSError( # noqa: B904
f"Asset failure threshold ({context.bad_assets_threshold}) "
"reached, stopping execution"
)
else:
logger.warning(
f"Exception while processing asset for {asset_url.value}: "
f"{exc}"
)
logger.warning(log_message)

Check warning on line 119 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L119

Added line #L119 was not covered by tests

def _get_header_data_for(self, url: HttpUrl) -> HeaderData:
"""Get details from headers for a given url
Expand Down Expand Up @@ -250,7 +250,8 @@ def get_asset_content(
)
else:
logger.debug(
f"Not optimizing, unsupported mime type: {mime_type}"
f"Not optimizing, unsupported mime type: {mime_type} for "
f"{context.current_thread_workitem}"
)

return self._download_from_online(asset_url=asset_url)
Expand All @@ -262,7 +263,7 @@ def get_asset_content(
if context.bad_assets_regex and context.bad_assets_regex.findall(
asset_url.value
):
raise KnownBadAssetFailedError() from exc
raise KnownBadAssetFailedError(exc) from exc

Check warning on line 266 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L266

Added line #L266 was not covered by tests
raise

def _setup_s3(self):
Expand Down
2 changes: 2 additions & 0 deletions scraper/src/mindtouch2zim/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
NAME,
STANDARD_KNOWN_BAD_ASSETS_REGEX,
VERSION,
logger,
)

MINDTOUCH_TMP = os.getenv("MINDTOUCH_TMP")
Expand Down Expand Up @@ -128,6 +129,7 @@ def current_thread_workitem(self) -> str:
@current_thread_workitem.setter
def current_thread_workitem(self, value: str):
self._current_thread_workitem.value = value
logger.debug(f"Processing {value}")

@property
def wm_user_agent(self) -> str:
Expand Down
2 changes: 1 addition & 1 deletion scraper/src/mindtouch2zim/html_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def rewrite_href_src_srcset_attributes(
new_attr_value = ""
logger.warning(
f"Unsupported '{attr_name}' encountered in '{tag}' tag (value: "
f"'{attr_value}') while {context.current_thread_workitem}"
f"'{attr_value}') while rewriting {context.current_thread_workitem}"
)
return (attr_name, new_attr_value)

Expand Down
44 changes: 31 additions & 13 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,16 @@ def run(self) -> Path:
Returns the path to the gernated ZIM.
"""
try:
return self._run_internal()
except:

Check notice on line 148 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

codefactor.io / CodeFactor

scraper/src/mindtouch2zim/processor.py#L148

Do not use bare 'except'. (E722)
logger.error(

Check warning on line 149 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L146-L149

Added lines #L146 - L149 were not covered by tests
f"Problem encountered while processing "
f"{context.current_thread_workitem}"
)
raise

Check warning on line 153 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L153

Added line #L153 was not covered by tests

def _run_internal(self) -> Path:
logger.setLevel(level=logging.DEBUG if context.debug else logging.INFO)

self.zim_config = ZimConfig(
Expand Down Expand Up @@ -273,7 +282,7 @@ def run(self) -> Path:

def run_with_creator(self, creator: Creator):

context.current_thread_workitem = "Storing standard files"
context.current_thread_workitem = "standard files"

Check warning on line 285 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L285

Added line #L285 was not covered by tests

logger.info(" Storing configuration...")
creator.add_item_for(
Expand Down Expand Up @@ -357,7 +366,7 @@ def run_with_creator(self, creator: Creator):
)

logger.info("Fetching pages tree")
context.current_thread_workitem = "Fetching pages tree"
context.current_thread_workitem = "pages tree"

Check warning on line 369 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L369

Added line #L369 was not covered by tests
pages_tree = self.mindtouch_client.get_page_tree()
selected_pages = self.content_filter.filter(pages_tree)
logger.info(
Expand All @@ -377,7 +386,7 @@ def run_with_creator(self, creator: Creator):
)

logger.info("Fetching pages content")
context.current_thread_workitem = "Fetching pages content"
context.current_thread_workitem = "pages content"

Check warning on line 389 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L389

Added line #L389 was not covered by tests
# compute the list of existing pages to properly rewrite links leading
# in-ZIM / out-of-ZIM
self.stats_items_total += len(selected_pages)
Expand Down Expand Up @@ -416,7 +425,7 @@ def run_with_creator(self, creator: Creator):
del private_pages

logger.info(f" Retrieving {len(self.items_to_download)} assets...")
context.current_thread_workitem = "Processing assets"
context.current_thread_workitem = "assets"

Check warning on line 428 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L428

Added line #L428 was not covered by tests
self.stats_items_total += len(self.items_to_download)

res = self.asset_executor(
Expand Down Expand Up @@ -445,6 +454,7 @@ def _process_css(
"""Process a given CSS stylesheet
Download content if necessary, rewrite CSS and add CSS to ZIM
"""
context.current_thread_workitem = f"CSS at {css_location}"

Check warning on line 457 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L457

Added line #L457 was not covered by tests
if not css_location:
raise ValueError(f"Cannot process empty css_location for {target_filename}")
if not css_content:
Expand All @@ -465,10 +475,15 @@ def _process_css(
# to use last URL encountered.
for path, urls in url_rewriter.items_to_download.items():
if path in self.items_to_download:
self.items_to_download[path].urls.update(urls)
self.items_to_download[path].asset_urls.update(urls)
self.items_to_download[path].used_by.add(

Check warning on line 479 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L478-L479

Added lines #L478 - L479 were not covered by tests
context.current_thread_workitem
)
else:
self.items_to_download[path] = AssetDetails(
urls=urls, always_fetch_online=True
asset_urls=urls,
used_by={context.current_thread_workitem},
always_fetch_online=True,
)
creator.add_item_for(f"content/{target_filename}", content=result)

Expand All @@ -484,10 +499,7 @@ def _process_page(
"""Process a given library page
Download content, rewrite HTML and add JSON to ZIM
"""
logger.debug(f" Fetching {page.id}")
context.current_thread_workitem = (
f"processing page {page.id} at {page.encoded_url}"
)
context.current_thread_workitem = f"page ID {page.id} ({page.encoded_url})"

Check warning on line 502 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L502

Added line #L502 was not covered by tests
page_content = self.mindtouch_client.get_page_content(page)
url_rewriter = HtmlUrlsRewriter(
context.library_url,
Expand Down Expand Up @@ -528,19 +540,25 @@ def _process_page(
# and since these pages are absolutely not essential, we just display a
# warning and store an empty page
logger.warning(
f"Problem processing special page ID {page.id} ({page.encoded_url})"
f"Problem processing special {context.current_thread_workitem}"
f", page is probably empty, storing empty page: {exc}"
)
return ""
if not rewriten:
# Default rewriting for 'normal' pages
rewriten = rewriter.rewrite(page_content.html_body).content
for path, urls in url_rewriter.items_to_download.items():

if path in self.items_to_download:
self.items_to_download[path].urls.update(urls)
self.items_to_download[path].asset_urls.update(urls)
self.items_to_download[path].used_by.add(

Check warning on line 554 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L553-L554

Added lines #L553 - L554 were not covered by tests
context.current_thread_workitem
)
else:
self.items_to_download[path] = AssetDetails(
urls=urls, always_fetch_online=False
asset_urls=urls,
used_by={context.current_thread_workitem},
always_fetch_online=False,
)
creator.add_item_for(
f"content/page_content_{page.id}.json",
Expand Down

0 comments on commit d935c41

Please sign in to comment.