Skip to content

Commit

Permalink
Grab encoded_url from LibraryPage since definition is not needed
Browse files (browse the repository at this point in the history)
  • Loading branch information
benoit74 committed Nov 19, 2024
1 parent e59ba73 commit e9677a4
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 22 deletions.
10 changes: 4 additions & 6 deletions scraper/src/mindtouch2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ class LibraryPageDefinition(BaseModel):
with a special call, hence the specific object
"""

encoded_url: str
tags: list[str]


Expand All @@ -52,6 +51,7 @@ class LibraryPage(BaseModel):
path: str
parent: "LibraryPage | None" = None
children: list["LibraryPage"] = []
encoded_url: str
definition: LibraryPageDefinition | None = None

def __repr__(self) -> str:
Expand Down Expand Up @@ -251,6 +251,7 @@ def get_page_tree(self) -> LibraryTree:
id=tree_data["page"]["@id"],
title=tree_data["page"]["title"],
path=tree_data["page"]["path"]["#text"],
encoded_url=tree_data["page"]["uri.ui"],
)
tree_obj = LibraryTree(root=root)
tree_obj.pages[root.id] = root
Expand All @@ -260,6 +261,7 @@ def _add_page(page_node: Any, parent: LibraryPage) -> LibraryPage:
id=page_node["@id"],
title=page_node["title"],
path=page_node["path"]["#text"],
encoded_url=page_node["uri.ui"],
parent=parent,
)
parent.children.append(page)
Expand Down Expand Up @@ -316,9 +318,6 @@ def get_page_definition(self, page: LibraryPage) -> LibraryPageDefinition:
raw_definition = self._get_api_json(
f"/pages/{page.id}", timeout=HTTP_TIMEOUT_NORMAL_SECONDS
)
encoded_url = raw_definition.get("uri.ui", None)
if encoded_url is None:
raise MindtouchParsingError(f"No uri.ui property for page {page.id}")
raw_tags = raw_definition.get("tags", None)
if raw_tags is None:
raise MindtouchParsingError(f"No tags property for page {page.id}")
Expand All @@ -330,7 +329,6 @@ def get_page_definition(self, page: LibraryPage) -> LibraryPageDefinition:
else:
tags = [raw_tag.get("@value")]
page.definition = LibraryPageDefinition(
encoded_url=encoded_url,
tags=tags,
)
return page.definition
Expand Down Expand Up @@ -363,7 +361,7 @@ def get_cover_page(self, page: LibraryPage) -> LibraryPage:

def get_cover_page_encoded_url(self, page: LibraryPage) -> str:
"""Returns the url for the book page for a given child page"""
return self.get_page_definition(self.get_cover_page(page)).encoded_url
return self.get_cover_page(page).encoded_url

Check warning on line 364 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L364

Added line #L364 was not covered by tests

def get_cover_page_id(self, page: LibraryPage) -> str:
"""Returns the id for the book page for a given child page"""
Expand Down
2 changes: 1 addition & 1 deletion scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,7 +484,7 @@ def _process_page(
"""
logger.debug(f" Fetching {page.id}")
mindtouch2zim.html_rewriting.rewriting_context = (

Check warning on line 486 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L486

Added line #L486 was not covered by tests
f"page {page.id} at {page.path}"
f"page {page.id} at {page.encoded_url}"
)
page_content = self.mindtouch_client.get_page_content(page)
url_rewriter = HtmlUrlsRewriter(
Expand Down
7 changes: 6 additions & 1 deletion scraper/tests/test_html_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@
def url_rewriter() -> HtmlUrlsRewriter:
return HtmlUrlsRewriter(
library_url="https://www.acme.com",
page=LibraryPage(id="123", title="a page", path="A_Page"),
page=LibraryPage(
id="123",
title="a page",
path="A_Page",
encoded_url="https://www.acme.com/A_Page",
),
existing_zim_paths={
ZimPath("www.acme.com/existing.html"),
},
Expand Down
95 changes: 81 additions & 14 deletions scraper/tests/test_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,43 +5,110 @@


@pytest.fixture(scope="module")
def library_tree() -> LibraryTree:
root = LibraryPage(id="24", title="Home page", path="")
def dummy_encoded_url() -> str:
return "https://www.acme.com/A_Page"


@pytest.fixture(scope="module")
def library_tree(dummy_encoded_url) -> LibraryTree:
root = LibraryPage(
id="24", title="Home page", path="", encoded_url=dummy_encoded_url
)
topic1 = LibraryPage(
id="25", title="1: First topic", path="1_First_Topic", parent=root
id="25",
title="1: First topic",
path="1_First_Topic",
parent=root,
encoded_url=dummy_encoded_url,
)
root.children.append(topic1)
topic1_1 = LibraryPage(id="26", title="1.1: Cloud", path="1.1_Cloud", parent=topic1)
topic1_1 = LibraryPage(
id="26",
title="1.1: Cloud",
path="1.1_Cloud",
parent=topic1,
encoded_url=dummy_encoded_url,
)
topic1.children.append(topic1_1)
topic1_2 = LibraryPage(id="27", title="1.2: Tree", path="1.2_Tree", parent=topic1)
topic1_2 = LibraryPage(
id="27",
title="1.2: Tree",
path="1.2_Tree",
parent=topic1,
encoded_url=dummy_encoded_url,
)
topic1.children.append(topic1_2)
topic1_3 = LibraryPage(id="28", title="1.3: Bees", path="1.3_Bees", parent=topic1)
topic1_3 = LibraryPage(
id="28",
title="1.3: Bees",
path="1.3_Bees",
parent=topic1,
encoded_url=dummy_encoded_url,
)
topic1.children.append(topic1_3)
topic2 = LibraryPage(
id="29", title="2: Second topic", path="2_Second_Topic", parent=root
id="29",
title="2: Second topic",
path="2_Second_Topic",
parent=root,
encoded_url=dummy_encoded_url,
)
root.children.append(topic2)
topic2_1 = LibraryPage(
id="30", title="2.1: Underground", path="2.1_Underground", parent=topic2
id="30",
title="2.1: Underground",
path="2.1_Underground",
parent=topic2,
encoded_url=dummy_encoded_url,
)
topic2.children.append(topic2_1)
topic2_2 = LibraryPage(id="31", title="2.2: Lava", path="2.2_Lava", parent=topic2)
topic2_2 = LibraryPage(
id="31",
title="2.2: Lava",
path="2.2_Lava",
parent=topic2,
encoded_url=dummy_encoded_url,
)
topic2.children.append(topic2_2)
topic2_3 = LibraryPage(
id="32", title="2.3: Volcano", path="2.3_Volcano", parent=topic2
id="32",
title="2.3: Volcano",
path="2.3_Volcano",
parent=topic2,
encoded_url=dummy_encoded_url,
)
topic2.children.append(topic2_3)
topic3 = LibraryPage(
id="33", title="3: Third topic", path="3_Third_Topic", parent=root
id="33",
title="3: Third topic",
path="3_Third_Topic",
parent=root,
encoded_url=dummy_encoded_url,
)
root.children.append(topic3)
topic3_1 = LibraryPage(
id="34", title="3.1: Ground", path="3.1_Ground", parent=topic3
id="34",
title="3.1: Ground",
path="3.1_Ground",
parent=topic3,
encoded_url=dummy_encoded_url,
)
topic3.children.append(topic3_1)
topic3_2 = LibraryPage(id="35", title="3.2: Earth", path="3.2_Earth", parent=topic3)
topic3_2 = LibraryPage(
id="35",
title="3.2: Earth",
path="3.2_Earth",
parent=topic3,
encoded_url=dummy_encoded_url,
)
topic3.children.append(topic3_2)
topic3_3 = LibraryPage(id="36", title="3.3: Sky", path="3.3_Sky", parent=topic3)
topic3_3 = LibraryPage(
id="36",
title="3.3: Sky",
path="3.3_Sky",
parent=topic3,
encoded_url=dummy_encoded_url,
)
topic3.children.append(topic3_3)
return LibraryTree(
root=root,
Expand Down

0 comments on commit e9677a4

Please sign in to comment.