From 33791d7a095551b894fc4e714a77a446c12deb8d Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 13 Dec 2024 16:06:17 +0000 Subject: [PATCH] Rewrite embed tags to provide link to original content online --- scraper/src/mindtouch2zim/html_rewriting.py | 24 +++++++++++++++ scraper/tests/test_html_rewriting.py | 33 +++++++++++++++++++-- 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/scraper/src/mindtouch2zim/html_rewriting.py b/scraper/src/mindtouch2zim/html_rewriting.py index ca64b7e..2266bfc 100644 --- a/scraper/src/mindtouch2zim/html_rewriting.py +++ b/scraper/src/mindtouch2zim/html_rewriting.py @@ -251,3 +251,27 @@ def rewrite_img_tags( + [("src", new_attr_value)] ) return f"' if auto_close else '>'}" + + +@html_rules.rewrite_tag() +def rewrite_embed_tags( + tag: str, + attrs: AttrsList, + *, + auto_close: bool, +): + + if tag != "embed": + return + if not (src_value := get_attr_value_from(attrs, "src")): + return # no need to rewrite this embed without src + + # There is 99% chance the embed src is not inside the ZIM, so we assume it is not + # (we can't know anyway with current software architecture) + return ( + "This content is not inside the ZIM. " + f'View content online at ' + f"{src_value}" + "" + f'{ "" if auto_close else ""}' + ) diff --git a/scraper/tests/test_html_rewriting.py b/scraper/tests/test_html_rewriting.py index c21b411..ff4244a 100644 --- a/scraper/tests/test_html_rewriting.py +++ b/scraper/tests/test_html_rewriting.py @@ -248,8 +248,10 @@ def test_html_iframe_rewriting( """, """""", id="video_src", ), @@ -277,6 +279,33 @@ def test_html_unknown_src_href_rewriting( assert html_rewriter.rewrite(source_html).content == expected_html +@pytest.mark.parametrize( + "source_html, expected_html", + [ + pytest.param( + """""", + "This content is not inside the ZIM. View content online at " + 'https://svs.gsfc.nasa.gov/vis/a000000/a003600/a003658/thermohaline_conveyor_30fps.mp4' + "", + id="embed_src", + ), + ], +) +def test_html_embed_rewriting( + html_rewriter: HtmlRewriter, source_html: str, expected_html: str +): + assert html_rewriter.rewrite(source_html).content == expected_html + + @pytest.mark.parametrize( "source_html, expected_html, expected_items_to_download", [