From a23acce5ae0e91387b2da98fa2ae6210208750a9 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 25 Oct 2024 15:20:14 +0000 Subject: [PATCH 1/4] Fix error message and move item to download handling to rewriter --- scraper/src/mindtouch2zim/processor.py | 28 +++++++++++++++----------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py index 1502de6..5103cba 100644 --- a/scraper/src/mindtouch2zim/processor.py +++ b/scraper/src/mindtouch2zim/processor.py @@ -582,7 +582,7 @@ def rewrite_href_src_attributes( if attr_name not in ("href", "src") or not attr_value: return if not isinstance(url_rewriter, HtmlUrlsRewriter): - raise Exception("Expecting MindtouchUrlRewriter") + raise Exception("Expecting HtmlUrlsRewriter") new_attr_value = None if tag == "a": rewrite_result = url_rewriter( @@ -601,17 +601,7 @@ def rewrite_href_src_attributes( ) # add 'content/' to the URL since all assets will be stored in the sub.-path new_attr_value = f"content/{rewrite_result.rewriten_url}" - if rewrite_result.zim_path is not None: - # if item is expected to be inside the ZIM, store asset information so that - # we can download it afterwards - if rewrite_result.zim_path in url_rewriter.items_to_download: - url_rewriter.items_to_download[rewrite_result.zim_path].add( - HttpUrl(rewrite_result.absolute_url) - ) - else: - url_rewriter.items_to_download[rewrite_result.zim_path] = { - HttpUrl(rewrite_result.absolute_url) - } + url_rewriter.add_item_to_download(rewrite_result) if not new_attr_value: # we do not (yet) support other tags / attributes so we fail the scraper raise ValueError( @@ -659,6 +649,20 @@ def __call__( result = super().__call__(item_url, base_href, rewrite_all_url=rewrite_all_url) return result + def add_item_to_download(self, rewrite_result: RewriteResult): + """Add item to download based on rewrite result""" + if rewrite_result.zim_path is not None: + # if item is expected to be 
inside the ZIM, store asset information so that + # we can download it afterwards + if rewrite_result.zim_path in self.items_to_download: + self.items_to_download[rewrite_result.zim_path].add( + HttpUrl(rewrite_result.absolute_url) + ) + else: + self.items_to_download[rewrite_result.zim_path] = { + HttpUrl(rewrite_result.absolute_url) + } + class CssUrlsRewriter(ArticleUrlRewriter): """A rewriter for CSS processing, storing items to download as URL as processed""" From 51ae41ee7c639ef51c1abc75a219c4e6691933ff Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 25 Oct 2024 15:22:13 +0000 Subject: [PATCH 2/4] Add icon indicating external URLs --- zimui/index.html | 1 + zimui/public/custom.css | 15 +++++++++++++++ zimui/public/external-link.svg | 15 +++++++++++++++ 3 files changed, 31 insertions(+) create mode 100644 zimui/public/custom.css create mode 100644 zimui/public/external-link.svg diff --git a/zimui/index.html b/zimui/index.html index 90659e5..c481be2 100644 --- a/zimui/index.html +++ b/zimui/index.html @@ -6,6 +6,7 @@ + Vite App diff --git a/zimui/public/custom.css b/zimui/public/custom.css new file mode 100644 index 0000000..d57f80f --- /dev/null +++ b/zimui/public/custom.css @@ -0,0 +1,15 @@ +a[href^="http://"]:after, +a[href^="https://"]:after +{ + content: ''; + display: inline-block; + width: 10px; + height: 10px; + background-image: url('external-link.svg'); + background-size: contain; + background-repeat: no-repeat; + margin-left: 5px; + position: relative; + bottom: 0px; + right: 0px; +} diff --git a/zimui/public/external-link.svg b/zimui/public/external-link.svg new file mode 100644 index 0000000..f06102f --- /dev/null +++ b/zimui/public/external-link.svg @@ -0,0 +1,15 @@ + + + + + + + + + + From f118f3a29d2f44cdbb249225781a0cbc5cdd83ba Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 25 Oct 2024 15:25:01 +0000 Subject: [PATCH 3/4] Replace Youtube and Vimeo videos with their thumbnails and a link to open them --- scraper/src/mindtouch2zim/client.py 
| 9 ++-- scraper/src/mindtouch2zim/constants.py | 3 ++ scraper/src/mindtouch2zim/processor.py | 63 +++++++++++++++++++++++++- scraper/src/mindtouch2zim/vimeo.py | 30 ++++++++++++ zimui/public/custom.css | 26 +++++++++++ zimui/public/play-button.svg | 14 ++++++ 6 files changed, 140 insertions(+), 5 deletions(-) create mode 100644 scraper/src/mindtouch2zim/vimeo.py create mode 100644 zimui/public/play-button.svg diff --git a/scraper/src/mindtouch2zim/client.py b/scraper/src/mindtouch2zim/client.py index 17d3a53..9c06fc4 100644 --- a/scraper/src/mindtouch2zim/client.py +++ b/scraper/src/mindtouch2zim/client.py @@ -7,10 +7,11 @@ from bs4 import BeautifulSoup, NavigableString from pydantic import BaseModel -from mindtouch2zim.constants import logger - -HTTP_TIMEOUT_NORMAL_SECONDS = 15 -HTTP_TIMEOUT_LONG_SECONDS = 30 +from mindtouch2zim.constants import ( + HTTP_TIMEOUT_LONG_SECONDS, + HTTP_TIMEOUT_NORMAL_SECONDS, + logger, +) class MindtouchParsingError(Exception): diff --git a/scraper/src/mindtouch2zim/constants.py b/scraper/src/mindtouch2zim/constants.py index 28f4a9d..7c7838a 100644 --- a/scraper/src/mindtouch2zim/constants.py +++ b/scraper/src/mindtouch2zim/constants.py @@ -14,4 +14,7 @@ # As of 2024-09-24, all libraries appears to be in English. 
 LANGUAGE_ISO_639_3 = "eng" +HTTP_TIMEOUT_NORMAL_SECONDS = 15 +HTTP_TIMEOUT_LONG_SECONDS = 30 + logger = getLogger(NAME, level=logging.DEBUG) diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py index 5103cba..544811f 100644 --- a/scraper/src/mindtouch2zim/processor.py +++ b/scraper/src/mindtouch2zim/processor.py @@ -15,7 +15,7 @@ from zimscraperlib.image.conversion import convert_svg2png from zimscraperlib.image.probing import format_for from zimscraperlib.rewriting.css import CssRewriter -from zimscraperlib.rewriting.html import HtmlRewriter +from zimscraperlib.rewriting.html import AttrsList, HtmlRewriter, get_attr_value_from from zimscraperlib.rewriting.html import rules as html_rules from zimscraperlib.rewriting.url_rewriting import ( ArticleUrlRewriter, @@ -41,6 +41,7 @@ PageModel, SharedModel, ) +from mindtouch2zim.vimeo import get_vimeo_thumbnail_url from mindtouch2zim.zimconfig import ZimConfig @@ -624,6 +625,66 @@ def refuse_unsupported_tags(tag: str): raise UnsupportedTagError(f"Tag {tag} is not yet supported in this scraper") +YOUTUBE_IFRAME_RE = re.compile(r".*youtube(?:-\w+)*\.\w+\/embed\/(?P<id>.*?)(?:\?.*)*$") +VIMEO_IFRAME_RE = re.compile(r".*vimeo(?:-\w+)*\.\w+\/video\/(?:.*?)(?:\?.*)*$") + + +@html_rules.rewrite_tag() +def rewrite_iframe_tags( + tag: str, + attrs: AttrsList, + base_href: str | None, + url_rewriter: ArticleUrlRewriter, +): + """Rewrite youtube and vimeo iframes to remove player until video is included""" + if tag not in ["iframe"]: + return + if not isinstance(url_rewriter, HtmlUrlsRewriter): + raise Exception("Expecting HtmlUrlsRewriter") + src = get_attr_value_from(attrs=attrs, name="src") + if not src: + raise UnsupportedTagError("Unsupported empty src in iframe") + image_rewriten_url = None + try: + if ytb_match := YOUTUBE_IFRAME_RE.match(src): + rewrite_result = url_rewriter( + f'https://i.ytimg.com/vi/{ytb_match.group("id")}/hqdefault.jpg', + base_href=base_href, + ) + 
url_rewriter.add_item_to_download(rewrite_result) + image_rewriten_url = rewrite_result.rewriten_url + if VIMEO_IFRAME_RE.match(src): + rewrite_result = url_rewriter( + get_vimeo_thumbnail_url(src), + base_href=base_href, + ) + url_rewriter.add_item_to_download(rewrite_result) + image_rewriten_url = rewrite_result.rewriten_url + except Exception as exc: + logger.warning(f"Failed to rewrite iframe with src {src}", exc_info=exc) + return ( + f'' + f"
" + f"{src}" + "
" + "
" + '