Skip to content

Commit

Permalink
Grab encoded_url from LibraryPage since definition is not needed
Browse files (browse the repository at this point in the history)
  • Loading branch information
benoit74 committed Nov 19, 2024
1 parent e59ba73 commit db7a59e
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 7 deletions.
10 changes: 4 additions & 6 deletions scraper/src/mindtouch2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ class LibraryPageDefinition(BaseModel):
with a special call, hence the specific object
"""

- encoded_url: str
tags: list[str]


Expand All @@ -52,6 +51,7 @@ class LibraryPage(BaseModel):
path: str
parent: "LibraryPage | None" = None
children: list["LibraryPage"] = []
+ encoded_url: str
definition: LibraryPageDefinition | None = None

def __repr__(self) -> str:
Expand Down Expand Up @@ -251,6 +251,7 @@ def get_page_tree(self) -> LibraryTree:
id=tree_data["page"]["@id"],
title=tree_data["page"]["title"],
path=tree_data["page"]["path"]["#text"],
+ encoded_url=tree_data["page"]["uri.ui"],
)
tree_obj = LibraryTree(root=root)
tree_obj.pages[root.id] = root
Expand All @@ -260,6 +261,7 @@ def _add_page(page_node: Any, parent: LibraryPage) -> LibraryPage:
id=page_node["@id"],
title=page_node["title"],
path=page_node["path"]["#text"],
+ encoded_url=page_node["uri.ui"],
parent=parent,
)
parent.children.append(page)
Expand Down Expand Up @@ -316,9 +318,6 @@ def get_page_definition(self, page: LibraryPage) -> LibraryPageDefinition:
raw_definition = self._get_api_json(
f"/pages/{page.id}", timeout=HTTP_TIMEOUT_NORMAL_SECONDS
)
- encoded_url = raw_definition.get("uri.ui", None)
- if encoded_url is None:
- raise MindtouchParsingError(f"No uri.ui property for page {page.id}")
raw_tags = raw_definition.get("tags", None)
if raw_tags is None:
raise MindtouchParsingError(f"No tags property for page {page.id}")
Expand All @@ -330,7 +329,6 @@ def get_page_definition(self, page: LibraryPage) -> LibraryPageDefinition:
else:
tags = [raw_tag.get("@value")]
page.definition = LibraryPageDefinition(
- encoded_url=encoded_url,
tags=tags,
)
return page.definition
Expand Down Expand Up @@ -363,7 +361,7 @@ def get_cover_page(self, page: LibraryPage) -> LibraryPage:

def get_cover_page_encoded_url(self, page: LibraryPage) -> str:
"""Returns the url for the book page for a given child page"""
- return self.get_page_definition(self.get_cover_page(page)).encoded_url
+ return self.get_cover_page(page).encoded_url

def get_cover_page_id(self, page: LibraryPage) -> str:
"""Returns the id for the book page for a given child page"""
Expand Down
2 changes: 1 addition & 1 deletion scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,7 +484,7 @@ def _process_page(
"""
logger.debug(f" Fetching {page.id}")
mindtouch2zim.html_rewriting.rewriting_context = (
- f"page {page.id} at {page.path}"
+ f"page {page.id} at {page.encoded_url}"
)
page_content = self.mindtouch_client.get_page_content(page)
url_rewriter = HtmlUrlsRewriter(
Expand Down

0 comments on commit db7a59e

Please sign in to comment.