Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add flag to continue on HTML rewriting issue #73

Merged
merged 5 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions scraper/src/mindtouch2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
with a special call, hence the specific object
"""

encoded_url: str
tags: list[str]


Expand All @@ -52,6 +51,7 @@
path: str
parent: "LibraryPage | None" = None
children: list["LibraryPage"] = []
encoded_url: str
definition: LibraryPageDefinition | None = None

def __repr__(self) -> str:
Expand Down Expand Up @@ -251,6 +251,7 @@
id=tree_data["page"]["@id"],
title=tree_data["page"]["title"],
path=tree_data["page"]["path"]["#text"],
encoded_url=tree_data["page"]["uri.ui"],
)
tree_obj = LibraryTree(root=root)
tree_obj.pages[root.id] = root
Expand All @@ -260,6 +261,7 @@
id=page_node["@id"],
title=page_node["title"],
path=page_node["path"]["#text"],
encoded_url=page_node["uri.ui"],
parent=parent,
)
parent.children.append(page)
Expand Down Expand Up @@ -316,9 +318,6 @@
raw_definition = self._get_api_json(
f"/pages/{page.id}", timeout=HTTP_TIMEOUT_NORMAL_SECONDS
)
encoded_url = raw_definition.get("uri.ui", None)
if encoded_url is None:
raise MindtouchParsingError(f"No uri.ui property for page {page.id}")
raw_tags = raw_definition.get("tags", None)
if raw_tags is None:
raise MindtouchParsingError(f"No tags property for page {page.id}")
Expand All @@ -330,7 +329,6 @@
else:
tags = [raw_tag.get("@value")]
page.definition = LibraryPageDefinition(
encoded_url=encoded_url,
tags=tags,
)
return page.definition
Expand Down Expand Up @@ -363,7 +361,7 @@

def get_cover_page_encoded_url(self, page: LibraryPage) -> str:
"""Returns the url for the book page for a given child page"""
return self.get_page_definition(self.get_cover_page(page)).encoded_url
return self.get_cover_page(page).encoded_url

Check warning on line 364 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L364

Added line #L364 was not covered by tests

def get_cover_page_id(self, page: LibraryPage) -> str:
"""Returns the id for the book page for a given child page"""
Expand Down
2 changes: 2 additions & 0 deletions scraper/src/mindtouch2zim/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
HTTP_TIMEOUT_NORMAL_SECONDS = 15
HTTP_TIMEOUT_LONG_SECONDS = 30

WARN_ONLY_ON_HTML_ISSUES = False
benoit74 marked this conversation as resolved.
Show resolved Hide resolved

logger = getLogger(NAME, level=logging.DEBUG, log_format=DEFAULT_FORMAT_WITH_THREADS)

web_session = get_session()
18 changes: 11 additions & 7 deletions scraper/src/mindtouch2zim/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,13 +199,6 @@
default=os.getenv("MINDTOUCH_ZIMUI_DIST", "../zimui/dist"),
)

parser.add_argument(
"--keep-cache",
help="Keep cache of website responses",
action="store_true",
default=False,
)

parser.add_argument(
"--stats-filename",
help="Path to store the progress JSON file to.",
Expand All @@ -232,6 +225,16 @@
dest="assets_workers",
)

parser.add_argument(

Check warning on line 228 in scraper/src/mindtouch2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/entrypoint.py#L228

Added line #L228 was not covered by tests
"--warn-only-on-html-issues",
benoit74 marked this conversation as resolved.
Show resolved Hide resolved
help="Only log a warning when unexpected HTML is encountered. Use with caution "
" because activating this option means that ZIM HTML will probably lead to "
" online resources without user noticing it.",
action="store_true",
default=False,
dest="warn_only_on_html_issues",
)

args = parser.parse_args()

logger.setLevel(level=logging.DEBUG if args.debug else logging.INFO)
Expand Down Expand Up @@ -269,6 +272,7 @@
illustration_url=args.illustration_url,
s3_url_with_credentials=args.s3_url_with_credentials,
assets_workers=args.assets_workers,
warn_only_on_html_issues=args.warn_only_on_html_issues,
).run()
except SystemExit:
logger.error("Generation failed, exiting")
Expand Down
33 changes: 29 additions & 4 deletions scraper/src/mindtouch2zim/html_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
ZimPath,
)

import mindtouch2zim.constants
from mindtouch2zim.client import LibraryPage
from mindtouch2zim.constants import logger
from mindtouch2zim.errors import UnsupportedHrefSrcError, UnsupportedTagError
Expand All @@ -24,6 +25,8 @@
html_rules.rewrite_data_rules.clear()
html_rules.rewrite_tag_rules.clear()

rewriting_context = None


@html_rules.rewrite_attribute()
def rewrite_href_src_attributes(
Expand Down Expand Up @@ -52,9 +55,15 @@
)
if not new_attr_value:
# we do not (yet) support other tags / attributes so we fail the scraper
raise UnsupportedHrefSrcError(
f"Unsupported {attr_name} encountered in {tag} tag (value: {attr_value})"
msg = (
f"Unsupported '{attr_name}' encountered in '{tag}' tag (value: "
f"'{attr_value}') while rewriting {rewriting_context}"
)
if not mindtouch2zim.constants.WARN_ONLY_ON_HTML_ISSUES:
raise UnsupportedHrefSrcError(msg)
else:
logger.warning(msg)
return

Check warning on line 66 in scraper/src/mindtouch2zim/html_rewriting.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/html_rewriting.py#L65-L66

Added lines #L65 - L66 were not covered by tests
return (attr_name, new_attr_value)


Expand All @@ -63,7 +72,15 @@
"""Stop scraper if unsupported tag is encountered"""
if tag not in ["picture"]:
return
raise UnsupportedTagError(f"Tag {tag} is not yet supported in this scraper")
msg = (
f"Tag {tag} is not yet supported in this scraper, found while rewriting "
f"{rewriting_context}"
)
if not mindtouch2zim.constants.WARN_ONLY_ON_HTML_ISSUES:
raise UnsupportedTagError(msg)
else:
logger.warning(msg)
return

Check warning on line 83 in scraper/src/mindtouch2zim/html_rewriting.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/html_rewriting.py#L82-L83

Added lines #L82 - L83 were not covered by tests


YOUTUBE_IFRAME_RE = re.compile(r".*youtube(?:-\w+)*\.\w+\/embed\/(?P<id>.*?)(?:\?.*)*$")
Expand All @@ -84,7 +101,15 @@
raise Exception("Expecting HtmlUrlsRewriter")
src = get_attr_value_from(attrs=attrs, name="src")
if not src:
raise UnsupportedTagError("Unsupported empty src in iframe")
msg = (

Check warning on line 104 in scraper/src/mindtouch2zim/html_rewriting.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/html_rewriting.py#L104

Added line #L104 was not covered by tests
"Unsupported empty src in iframe, found while rewriting "
f"{rewriting_context}"
)
if not mindtouch2zim.constants.WARN_ONLY_ON_HTML_ISSUES:
raise UnsupportedTagError(msg)

Check warning on line 109 in scraper/src/mindtouch2zim/html_rewriting.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/html_rewriting.py#L109

Added line #L109 was not covered by tests
else:
logger.warning(msg)
return

Check warning on line 112 in scraper/src/mindtouch2zim/html_rewriting.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/html_rewriting.py#L111-L112

Added lines #L111 - L112 were not covered by tests
image_rewriten_url = None
try:
if ytb_match := YOUTUBE_IFRAME_RE.match(src):
Expand Down
8 changes: 8 additions & 0 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
from zimscraperlib.zim.filesystem import validate_file_creatable
from zimscraperlib.zim.indexing import IndexData

import mindtouch2zim.constants
import mindtouch2zim.html_rewriting
from mindtouch2zim.asset import AssetDetails, AssetProcessor
from mindtouch2zim.client import (
LibraryPage,
Expand Down Expand Up @@ -144,6 +146,7 @@
assets_workers: int,
*,
overwrite_existing_zim: bool,
warn_only_on_html_issues: bool,
) -> None:
"""Initializes Processor.

Expand Down Expand Up @@ -171,6 +174,8 @@
n_jobs=assets_workers, return_as="generator_unordered", backend="threading"
)

mindtouch2zim.constants.WARN_ONLY_ON_HTML_ISSUES = warn_only_on_html_issues

Check warning on line 177 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L177

Added line #L177 was not covered by tests

self.stats_items_done = 0
# we add 1 more items to process so that progress is not 100% at the beginning
# when we do not yet know how many items we have to process and so that we can
Expand Down Expand Up @@ -478,6 +483,9 @@
Download content, rewrite HTML and add JSON to ZIM
"""
logger.debug(f" Fetching {page.id}")
mindtouch2zim.html_rewriting.rewriting_context = (

Check warning on line 486 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L486

Added line #L486 was not covered by tests
f"page {page.id} at {page.encoded_url}"
)
page_content = self.mindtouch_client.get_page_content(page)
url_rewriter = HtmlUrlsRewriter(
self.mindtouch_client.library_url,
Expand Down
7 changes: 6 additions & 1 deletion scraper/tests/test_html_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@
def url_rewriter() -> HtmlUrlsRewriter:
return HtmlUrlsRewriter(
library_url="https://www.acme.com",
page=LibraryPage(id="123", title="a page", path="A_Page"),
page=LibraryPage(
id="123",
title="a page",
path="A_Page",
encoded_url="https://www.acme.com/A_Page",
),
existing_zim_paths={
ZimPath("www.acme.com/existing.html"),
},
Expand Down
95 changes: 81 additions & 14 deletions scraper/tests/test_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,43 +5,110 @@


@pytest.fixture(scope="module")
def library_tree() -> LibraryTree:
root = LibraryPage(id="24", title="Home page", path="")
def dummy_encoded_url() -> str:
return "https://www.acme.com/A_Page"


@pytest.fixture(scope="module")
def library_tree(dummy_encoded_url) -> LibraryTree:
root = LibraryPage(
id="24", title="Home page", path="", encoded_url=dummy_encoded_url
)
topic1 = LibraryPage(
id="25", title="1: First topic", path="1_First_Topic", parent=root
id="25",
title="1: First topic",
path="1_First_Topic",
parent=root,
encoded_url=dummy_encoded_url,
)
root.children.append(topic1)
topic1_1 = LibraryPage(id="26", title="1.1: Cloud", path="1.1_Cloud", parent=topic1)
topic1_1 = LibraryPage(
id="26",
title="1.1: Cloud",
path="1.1_Cloud",
parent=topic1,
encoded_url=dummy_encoded_url,
)
topic1.children.append(topic1_1)
topic1_2 = LibraryPage(id="27", title="1.2: Tree", path="1.2_Tree", parent=topic1)
topic1_2 = LibraryPage(
id="27",
title="1.2: Tree",
path="1.2_Tree",
parent=topic1,
encoded_url=dummy_encoded_url,
)
topic1.children.append(topic1_2)
topic1_3 = LibraryPage(id="28", title="1.3: Bees", path="1.3_Bees", parent=topic1)
topic1_3 = LibraryPage(
id="28",
title="1.3: Bees",
path="1.3_Bees",
parent=topic1,
encoded_url=dummy_encoded_url,
)
topic1.children.append(topic1_3)
topic2 = LibraryPage(
id="29", title="2: Second topic", path="2_Second_Topic", parent=root
id="29",
title="2: Second topic",
path="2_Second_Topic",
parent=root,
encoded_url=dummy_encoded_url,
)
root.children.append(topic2)
topic2_1 = LibraryPage(
id="30", title="2.1: Underground", path="2.1_Underground", parent=topic2
id="30",
title="2.1: Underground",
path="2.1_Underground",
parent=topic2,
encoded_url=dummy_encoded_url,
)
topic2.children.append(topic2_1)
topic2_2 = LibraryPage(id="31", title="2.2: Lava", path="2.2_Lava", parent=topic2)
topic2_2 = LibraryPage(
id="31",
title="2.2: Lava",
path="2.2_Lava",
parent=topic2,
encoded_url=dummy_encoded_url,
)
topic2.children.append(topic2_2)
topic2_3 = LibraryPage(
id="32", title="2.3: Volcano", path="2.3_Volcano", parent=topic2
id="32",
title="2.3: Volcano",
path="2.3_Volcano",
parent=topic2,
encoded_url=dummy_encoded_url,
)
topic2.children.append(topic2_3)
topic3 = LibraryPage(
id="33", title="3: Third topic", path="3_Third_Topic", parent=root
id="33",
title="3: Third topic",
path="3_Third_Topic",
parent=root,
encoded_url=dummy_encoded_url,
)
root.children.append(topic3)
topic3_1 = LibraryPage(
id="34", title="3.1: Ground", path="3.1_Ground", parent=topic3
id="34",
title="3.1: Ground",
path="3.1_Ground",
parent=topic3,
encoded_url=dummy_encoded_url,
)
topic3.children.append(topic3_1)
topic3_2 = LibraryPage(id="35", title="3.2: Earth", path="3.2_Earth", parent=topic3)
topic3_2 = LibraryPage(
id="35",
title="3.2: Earth",
path="3.2_Earth",
parent=topic3,
encoded_url=dummy_encoded_url,
)
topic3.children.append(topic3_2)
topic3_3 = LibraryPage(id="36", title="3.3: Sky", path="3.3_Sky", parent=topic3)
topic3_3 = LibraryPage(
id="36",
title="3.3: Sky",
path="3.3_Sky",
parent=topic3,
encoded_url=dummy_encoded_url,
)
topic3.children.append(topic3_3)
return LibraryTree(
root=root,
Expand Down