Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance assets processing reliability and logs #96

Merged
merged 1 commit into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 30 additions & 29 deletions scraper/src/mindtouch2zim/asset.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,17 @@


class AssetDetails(NamedTuple):
urls: set[HttpUrl]
asset_urls: set[HttpUrl]
benoit74 marked this conversation as resolved.
Show resolved Hide resolved
used_by: set[str]
always_fetch_online: bool

@property
def get_usage_repr(self) -> str:
"""Returns a representation of asset usage, typically for logs"""
if len(self.used_by) == 0:
return ""
return f' used by {", ".join(self.used_by)}'

Check warning on line 61 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L60-L61

Added lines #L60 - L61 were not covered by tests


class AssetProcessor:

Expand All @@ -68,55 +76,47 @@
asset_details: AssetDetails,
creator: Creator,
):
logger.debug(f"Processing asset for {asset_path}")
context.current_thread_workitem = f"processing asset {asset_path}"
self._process_asset_internal(
asset_path=asset_path, asset_details=asset_details, creator=creator
)

def _process_asset_internal(
self,
asset_path: ZimPath,
asset_details: AssetDetails,
creator: Creator,
):
for asset_url in asset_details.urls:
"""Download and add to the ZIM a given asset (image, ...)"""
for asset_url in asset_details.asset_urls:
try:
context.current_thread_workitem = (

Check warning on line 82 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L82

Added line #L82 was not covered by tests
f"asset from {asset_url.value}{asset_details.get_usage_repr}"
)
asset_content = self.get_asset_content(
asset_path=asset_path,
asset_url=asset_url,
always_fetch_online=asset_details.always_fetch_online,
)
logger.debug(
f"Adding {asset_url.value} to {asset_path.value} in the ZIM"
)
logger.debug(f"Adding asset to {asset_path.value} in the ZIM")

Check warning on line 90 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L90

Added line #L90 was not covered by tests
creator.add_item_for(
path="content/" + asset_path.value,
content=asset_content.getvalue(),
)
break # file found and added
except KnownBadAssetFailedError as exc:
logger.debug(f"Ignoring known bad asset for {asset_url.value}: {exc}")
except RequestException as exc:
logger.debug(f"Ignoring known bad asset: {exc}")
except Exception as exc:

Check warning on line 98 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L97-L98

Added lines #L97 - L98 were not covered by tests
# all other exceptions (not only RequestsException) lead to an increase
# of bad_assets_count, because we have no idea what could go wrong here
# and limit and bad assets threshold should be correct in production,
# or ignored at own user risk
with self.lock:
self.bad_assets_count += 1
log_message = (

Check warning on line 105 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L105

Added line #L105 was not covered by tests
"Exception while processing "
f"{context.current_thread_workitem}: {exc}"
)
if (
context.bad_assets_threshold >= 0
and self.bad_assets_count > context.bad_assets_threshold
):
logger.error(
f"Exception while processing asset for {asset_url.value}: "
f"{exc}"
)
logger.error(log_message)

Check warning on line 113 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L113

Added line #L113 was not covered by tests
raise OSError( # noqa: B904
f"Asset failure threshold ({context.bad_assets_threshold}) "
"reached, stopping execution"
)
else:
logger.warning(
f"Exception while processing asset for {asset_url.value}: "
f"{exc}"
)
logger.warning(log_message)

Check warning on line 119 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L119

Added line #L119 was not covered by tests

def _get_header_data_for(self, url: HttpUrl) -> HeaderData:
"""Get details from headers for a given url
Expand Down Expand Up @@ -250,7 +250,8 @@
)
else:
logger.debug(
f"Not optimizing, unsupported mime type: {mime_type}"
f"Not optimizing, unsupported mime type: {mime_type} for "
f"{context.current_thread_workitem}"
)

return self._download_from_online(asset_url=asset_url)
Expand All @@ -262,7 +263,7 @@
if context.bad_assets_regex and context.bad_assets_regex.findall(
asset_url.value
):
raise KnownBadAssetFailedError() from exc
raise KnownBadAssetFailedError(exc) from exc

Check warning on line 266 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L266

Added line #L266 was not covered by tests
raise

def _setup_s3(self):
Expand Down
2 changes: 2 additions & 0 deletions scraper/src/mindtouch2zim/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
NAME,
STANDARD_KNOWN_BAD_ASSETS_REGEX,
VERSION,
logger,
)

MINDTOUCH_TMP = os.getenv("MINDTOUCH_TMP")
Expand Down Expand Up @@ -128,6 +129,7 @@ def current_thread_workitem(self) -> str:
@current_thread_workitem.setter
def current_thread_workitem(self, value: str):
self._current_thread_workitem.value = value
logger.debug(f"Processing {value}")

@property
def wm_user_agent(self) -> str:
Expand Down
2 changes: 1 addition & 1 deletion scraper/src/mindtouch2zim/html_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def rewrite_href_src_srcset_attributes(
new_attr_value = ""
logger.warning(
f"Unsupported '{attr_name}' encountered in '{tag}' tag (value: "
f"'{attr_value}') while {context.current_thread_workitem}"
f"'{attr_value}') while rewriting {context.current_thread_workitem}"
)
return (attr_name, new_attr_value)

Expand Down
44 changes: 31 additions & 13 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,16 @@

Returns the path to the gernated ZIM.
"""
try:
return self._run_internal()
except Exception:
logger.error(

Check warning on line 149 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L146-L149

Added lines #L146 - L149 were not covered by tests
f"Problem encountered while processing "
f"{context.current_thread_workitem}"
)
raise

Check warning on line 153 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L153

Added line #L153 was not covered by tests

def _run_internal(self) -> Path:
logger.setLevel(level=logging.DEBUG if context.debug else logging.INFO)

self.zim_config = ZimConfig(
Expand Down Expand Up @@ -273,7 +282,7 @@

def run_with_creator(self, creator: Creator):

context.current_thread_workitem = "Storing standard files"
context.current_thread_workitem = "standard files"

Check warning on line 285 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L285

Added line #L285 was not covered by tests

logger.info(" Storing configuration...")
creator.add_item_for(
Expand Down Expand Up @@ -357,7 +366,7 @@
)

logger.info("Fetching pages tree")
context.current_thread_workitem = "Fetching pages tree"
context.current_thread_workitem = "pages tree"

Check warning on line 369 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L369

Added line #L369 was not covered by tests
pages_tree = self.mindtouch_client.get_page_tree()
selected_pages = self.content_filter.filter(pages_tree)
logger.info(
Expand All @@ -377,7 +386,7 @@
)

logger.info("Fetching pages content")
context.current_thread_workitem = "Fetching pages content"
context.current_thread_workitem = "pages content"

Check warning on line 389 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L389

Added line #L389 was not covered by tests
# compute the list of existing pages to properly rewrite links leading
# in-ZIM / out-of-ZIM
self.stats_items_total += len(selected_pages)
Expand Down Expand Up @@ -416,7 +425,7 @@
del private_pages

logger.info(f" Retrieving {len(self.items_to_download)} assets...")
context.current_thread_workitem = "Processing assets"
context.current_thread_workitem = "assets"

Check warning on line 428 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L428

Added line #L428 was not covered by tests
self.stats_items_total += len(self.items_to_download)

res = self.asset_executor(
Expand Down Expand Up @@ -445,6 +454,7 @@
"""Process a given CSS stylesheet
Download content if necessary, rewrite CSS and add CSS to ZIM
"""
context.current_thread_workitem = f"CSS at {css_location}"

Check warning on line 457 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L457

Added line #L457 was not covered by tests
if not css_location:
raise ValueError(f"Cannot process empty css_location for {target_filename}")
if not css_content:
Expand All @@ -465,10 +475,15 @@
# to use last URL encountered.
for path, urls in url_rewriter.items_to_download.items():
if path in self.items_to_download:
self.items_to_download[path].urls.update(urls)
self.items_to_download[path].asset_urls.update(urls)
self.items_to_download[path].used_by.add(

Check warning on line 479 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L478-L479

Added lines #L478 - L479 were not covered by tests
context.current_thread_workitem
)
else:
self.items_to_download[path] = AssetDetails(
urls=urls, always_fetch_online=True
asset_urls=urls,
used_by={context.current_thread_workitem},
always_fetch_online=True,
)
creator.add_item_for(f"content/{target_filename}", content=result)

Expand All @@ -484,10 +499,7 @@
"""Process a given library page
Download content, rewrite HTML and add JSON to ZIM
"""
logger.debug(f" Fetching {page.id}")
context.current_thread_workitem = (
f"processing page {page.id} at {page.encoded_url}"
)
context.current_thread_workitem = f"page ID {page.id} ({page.encoded_url})"

Check warning on line 502 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L502

Added line #L502 was not covered by tests
page_content = self.mindtouch_client.get_page_content(page)
url_rewriter = HtmlUrlsRewriter(
context.library_url,
Expand Down Expand Up @@ -528,19 +540,25 @@
# and since these pages are absolutely not essential, we just display a
# warning and store an empty page
logger.warning(
f"Problem processing special page ID {page.id} ({page.encoded_url})"
f"Problem processing special {context.current_thread_workitem}"
f", page is probably empty, storing empty page: {exc}"
)
return ""
if not rewriten:
# Default rewriting for 'normal' pages
rewriten = rewriter.rewrite(page_content.html_body).content
for path, urls in url_rewriter.items_to_download.items():

if path in self.items_to_download:
self.items_to_download[path].urls.update(urls)
self.items_to_download[path].asset_urls.update(urls)
self.items_to_download[path].used_by.add(

Check warning on line 554 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L553-L554

Added lines #L553 - L554 were not covered by tests
context.current_thread_workitem
)
else:
self.items_to_download[path] = AssetDetails(
urls=urls, always_fetch_online=False
asset_urls=urls,
used_by={context.current_thread_workitem},
always_fetch_online=False,
)
creator.add_item_for(
f"content/page_content_{page.id}.json",
Expand Down