@html_rules.rewrite_tag()
def rewrite_embed_tags(
    tag: str,
    attrs: AttrsList,
    *,
    auto_close: bool,
):
    """Replace an ``<embed>`` tag that has a ``src`` with an "online only" notice.

    Registered as a tag-rewriting rule through ``html_rules.rewrite_tag()``.

    Args:
        tag: Name of the tag being rewritten; only ``"embed"`` is handled.
        attrs: Attributes of the tag; only ``src`` is read.
        auto_close: Whether the tag was self-closing. Kept because it is part of
            the rule signature; it does not affect the text returned here.

    Returns:
        ``None`` when the tag is not ``embed`` or has no (or an empty) ``src``
        -- presumably meaning "not handled by this rule"; confirm against the
        rewriter. Otherwise the replacement text pointing the reader to the
        online location of the content.
    """
    # Local import so this block does not depend on the module's import list.
    from html import escape

    if tag != "embed":
        return None

    src_value = get_attr_value_from(attrs, "src")
    if not src_value:
        return None  # no need to rewrite this embed without src

    # There is 99% chance the embed src is not inside the ZIM, so we assume it is not
    # (we can't know anyway with current software architecture)

    # The src comes from the scraped page and is written back into HTML, so it
    # must be escaped: a raw "&", "<", ">" or quote would otherwise produce
    # malformed (or injectable) markup in the rewritten document.
    safe_src = escape(src_value, quote=True)

    # NOTE(review): in this view the original's trailing pieces (an empty
    # literal and an `auto_close` conditional yielding "" on both branches)
    # contributed nothing and were dropped. If wrapper markup was meant to be
    # emitted there, restore it here -- TODO confirm against upstream.
    return (
        "This content is not inside the ZIM. "
        "View content online at "
        f"{safe_src}"
    )