Skip to content

Commit

Permalink
Merge pull request #102 from openzim/root_page_tree
Browse files Browse the repository at this point in the history
Fetch only the required subtree of the website instead of the whole site
  • Loading branch information
benoit74 authored Dec 6, 2024
2 parents e153312 + cb1f2f7 commit d961cc2
Show file tree
Hide file tree
Showing 6 changed files with 159 additions and 49 deletions.
101 changes: 74 additions & 27 deletions scraper/src/mindtouch2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class LibraryPageDefinition(BaseModel):
"""

tags: list[str]
parent_id: str | None


class LibraryPage(BaseModel):
Expand Down Expand Up @@ -238,10 +239,10 @@ def get_root_page_id(self) -> LibraryPageId:
)
return tree["page"]["@id"]

def get_page_tree(self) -> LibraryTree:
def get_page_tree(self, page: str = "home") -> LibraryTree:

tree_data = self._get_api_json(
"/pages/home/tree", timeout=context.http_timeout_long_seconds
f"/pages/{page}/tree", timeout=context.http_timeout_long_seconds
)

root = LibraryPage(
Expand Down Expand Up @@ -306,32 +307,43 @@ def get_page_content(self, page: LibraryPage) -> LibraryPageContent:
)
return LibraryPageContent(html_body=tree["body"][0])

def get_page_definition(self, page: LibraryPage) -> LibraryPageDefinition:
def get_page_definition(self, page: LibraryPage | str) -> LibraryPageDefinition:
"""Return the definition of a given page
Definition is kept in memory, and retrieved on-demand when it is not yet there
"""
if page.definition is None:
raw_definition = self._get_api_json(
f"/pages/{page.id}", timeout=context.http_timeout_normal_seconds
)
raw_tags = raw_definition.get("tags", None)
if raw_tags is None:
raise MindtouchParsingError(f"No tags property for page {page.id}")
raw_tag = raw_tags.get("tag", None)
if raw_tag is None:
raise MindtouchParsingError(f"No tag property for page {page.id}")
if isinstance(raw_tag, list):
tags = [item.get("@value") for item in raw_tag]
else:
tags = [raw_tag.get("@value")]
page.definition = LibraryPageDefinition(
tags=tags,
)
return page.definition

def get_cover_page(self, page: LibraryPage) -> LibraryPage:
"""Get the cover page of a given page
if isinstance(page, str):
page_id = page
elif page.definition is not None:
return page.definition
else:
page_id = page.id

raw_definition = self._get_api_json(
f"/pages/{page_id}", timeout=context.http_timeout_normal_seconds
)
raw_tag = raw_definition.get("tags", {}).get("tag", None)
if raw_tag is None:
raise MindtouchParsingError(f"No tag property for page {page_id}")
if isinstance(raw_tag, list):
tags = [item.get("@value") for item in raw_tag]
else:
tags = [raw_tag.get("@value")]

parent = raw_definition.get("page.parent", None)

page_definition = LibraryPageDefinition(
tags=tags, parent_id=None if parent is None else parent["@id"]
)

if isinstance(page, LibraryPage):
page.definition = page_definition

return page_definition

def get_cover_page(self, page: LibraryPage) -> LibraryPage | None:
"""Get the cover page of a given page object
Logic originally defined in `getCoverpage` function of
https://cdn.libretexts.net/github/LibreTextsMain/Miscellaneous/reuse.js
Expand All @@ -350,19 +362,54 @@ def get_cover_page(self, page: LibraryPage) -> LibraryPage:
or "coverpage:nocommons" in current_definition.tags
):
return current_page
if "article:topic-category" in current_definition.tags:
return None
if current_page.parent is None:
raise MindtouchParsingError(
f"No more parent for {page.id}, reached root at {current_page.id}"
)
current_page = current_page.parent

def get_cover_page_encoded_url(self, page: LibraryPage) -> str:
def _get_cover_page_from_str_id(self, page_id: str) -> str | None:
"""Get the cover page ID of a given page identifier as string
Logic originally defined in `getCoverpage` function of
https://cdn.libretexts.net/github/LibreTextsMain/Miscellaneous/reuse.js
Probably originates from getCoverpage function of
https://github.com/LibreTexts/Libretext/blob/master/public/Miscellaneous/reuse.js
See https://github.com/openzim/mindtouch/issues/68 for a copy of original code
"""
current_page = page_id
while True:
current_definition = self.get_page_definition(current_page)
if (
"coverpage:yes" in current_definition.tags
or "coverpage:toc" in current_definition.tags
or "coverpage:nocommons" in current_definition.tags
):
return current_page
if "article:topic-category" in current_definition.tags:
return None
if current_definition.parent_id is None:
raise MindtouchParsingError(
f"No more parent for {page_id}, reached root at {current_page}"
)
current_page = current_definition.parent_id

def get_cover_page_encoded_url(self, page: LibraryPage) -> str | None:
"""Returns the url for the book page for a given child page"""
return self.get_cover_page(page).encoded_url
cover_page = self.get_cover_page(page)
return cover_page.encoded_url if cover_page is not None else None

def get_cover_page_id(self, page: LibraryPage) -> str:
def get_cover_page_id(self, page: LibraryPage | str) -> str | None:
"""Returns the id for the book page for a given child page"""
return self.get_cover_page(page).id
if isinstance(page, LibraryPage):
cover_page = self.get_cover_page(page)
return cover_page.id if cover_page is not None else None
else:
return self._get_cover_page_from_str_id(page)

def get_template_content(self, page_id: str, template: str) -> str:
"""Returns the templated content of a given page"""
Expand Down
8 changes: 5 additions & 3 deletions scraper/src/mindtouch2zim/libretexts/detailed_licensing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from mindtouch2zim.client import LibraryPage, MindtouchClient
from mindtouch2zim.constants import logger
from mindtouch2zim.context import Context
from mindtouch2zim.libretexts.errors import BadBookPageError

context = Context.get()

Expand Down Expand Up @@ -87,11 +88,12 @@ def rewrite_detailed_licensing(
"""

cover_page_url = mindtouch_client.get_cover_page_encoded_url(page)
if cover_page_url is None:
raise BadBookPageError()
return rewriter.rewrite(
_render_html_from_data(
jinja2_template=jinja2_template,
licensing_data=_get_licensing_report_data(
mindtouch_client.get_cover_page_encoded_url(page)
),
licensing_data=_get_licensing_report_data(cover_page_url),
)
).content
4 changes: 4 additions & 0 deletions scraper/src/mindtouch2zim/libretexts/errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
class BadBookPageError(Exception):
    """Raised when we are processing a special book page but we are not inside a book

    The rewriting helpers raise it, without any argument, when no cover page can
    be found for the page being processed.
    """
6 changes: 5 additions & 1 deletion scraper/src/mindtouch2zim/libretexts/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from zimscraperlib.rewriting.html import HtmlRewriter

from mindtouch2zim.client import LibraryPage, MindtouchClient
from mindtouch2zim.libretexts.errors import BadBookPageError


class IndexPage(BaseModel):
Expand All @@ -28,11 +29,14 @@ def rewrite_index(
page: LibraryPage,
) -> str:
"""Get and rewrite index HTML"""
cover_page_id = mindtouch_client.get_cover_page_id(page)
if cover_page_id is None:
raise BadBookPageError()
return get_libretexts_transformed_html(
jinja2_template=jinja2_template,
libretexts_template_content=rewriter.rewrite(
mindtouch_client.get_template_content(
page_id=mindtouch_client.get_cover_page_id(page),
page_id=cover_page_id,
template="=Template%253AMindTouch%252FIDF3%252FViews%252FTag_directory",
)
).content,
Expand Down
7 changes: 6 additions & 1 deletion scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,12 @@ def run_with_creator(self, creator: Creator):

logger.info("Fetching pages tree")
context.current_thread_workitem = "pages tree"
pages_tree = self.mindtouch_client.get_page_tree()
root_page_id = self.content_filter.root_page_id or "home"
cover_page_id = (
self.mindtouch_client.get_cover_page_id(root_page_id)
or root_page_id # if --root-page-id is not inside a book but a category
)
pages_tree = self.mindtouch_client.get_page_tree(cover_page_id)
selected_pages = self.content_filter.filter(pages_tree)
logger.info(
f"{len(selected_pages)} pages (out of {len(pages_tree.pages)}) will be "
Expand Down
82 changes: 65 additions & 17 deletions scraper/tests-integration/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,27 +18,46 @@


@pytest.fixture(scope="module")
def client(libretexts_url: str, cache_folder: Path) -> MindtouchClient:
def raw_client(libretexts_url: str, cache_folder: Path) -> MindtouchClient:
context.library_url = libretexts_url
context.cache_folder = cache_folder
return MindtouchClient()


@pytest.fixture(scope="module")
def client(
    raw_client: MindtouchClient,
    deki_token: str,  # noqa: ARG001
) -> MindtouchClient:
    """Already authenticated client (avoid having to fetch deki_token in tests)

    Returns the very same object as `raw_client`; `deki_token` is requested only
    so that the token fixture has been evaluated before this one is used.
    """
    authenticated_client = raw_client
    return authenticated_client


@pytest.fixture(scope="module")
def home(client: MindtouchClient) -> MindtouchHome:
    """Home data as returned by the client, fetched once per test module"""
    home_data = client.get_home()
    return home_data


@pytest.fixture(scope="module")
def deki_token(client: MindtouchClient) -> str:
return client.get_deki_token()
def deki_token(raw_client: MindtouchClient) -> str:
return raw_client.get_deki_token()


@pytest.fixture(scope="module")
def minimum_number_of_pages() -> int:
    """Number of pages the tests expect the library to exceed"""
    expected_minimum = 8000
    return expected_minimum


@pytest.fixture(scope="module")
def somewhere_page_id() -> LibraryPageId:
    """ID of the page used as tree root when fetching only part of the library"""
    page_id = "15728"
    return page_id


@pytest.fixture(scope="module")
def nb_somewhere_children() -> int:
    """Expected number of direct children of the `somewhere_page_id` page"""
    expected_children = 5
    return expected_children


@pytest.fixture(scope="module")
def root_page_id() -> LibraryPageId:
    """ID the tests expect for the root page of the library"""
    page_id = "34"
    return page_id
Expand All @@ -52,7 +71,6 @@ def nb_root_children() -> int:
@pytest.fixture(scope="module")
def page_tree(
client: MindtouchClient,
deki_token: str, # noqa: ARG001
) -> LibraryTree:
return client.get_page_tree()

Expand All @@ -65,20 +83,11 @@ def test_get_deki_token(deki_token: str):
def test_get_all_pages_ids(
client: MindtouchClient,
minimum_number_of_pages: int,
deki_token: str, # noqa: ARG001
):
pages_ids = client.get_all_pages_ids()
assert len(pages_ids) > minimum_number_of_pages


def test_get_root_page_id(
client: MindtouchClient,
root_page_id: LibraryPageId,
deki_token: str, # noqa: ARG001
):
assert client.get_root_page_id() == root_page_id


def test_get_page_tree_pages(
page_tree: LibraryTree,
minimum_number_of_pages: int,
Expand Down Expand Up @@ -114,6 +123,19 @@ def test_get_page_tree_subtree(
assert len(subtree2.pages.keys()) == 94


def test_get_page_tree_somewhere(
    client: MindtouchClient,
    somewhere_page_id: str,
    nb_somewhere_children: int,
):
    """Ensures a tree can be fetched starting from a given page"""
    subtree = client.get_page_tree(somewhere_page_id)
    # the requested page must be the root of the returned tree
    assert subtree.root.id == somewhere_page_id
    assert len(subtree.root.children) == nb_somewhere_children
    # root and all its direct children must come with a non-empty title
    assert subtree.root.title
    for direct_child in subtree.root.children:
        assert direct_child.title


def test_get_home_image_url(home: MindtouchHome):
"""Ensures proper image url is retrieved"""
assert home.welcome_image_url == "https://cdn.libretexts.net/Logos/geo_full.png"
Expand Down Expand Up @@ -146,8 +168,10 @@ def test_get_index_page_from_template(
):
"""Ensures we can get content of an index page"""
page_15837 = page_tree.sub_tree("15837").root
cover_page_id = client.get_cover_page_id(page_15837)
assert cover_page_id
assert client.get_template_content(
page_id=client.get_cover_page_id(page_15837),
page_id=cover_page_id,
template="=Template%253AMindTouch%252FIDF3%252FViews%252FTag_directory",
)

Expand All @@ -164,12 +188,36 @@ def test_get_cover_page_encoded_url(
)


def test_get_cover_page_id(
@pytest.mark.parametrize(
    "current_id, expected_cover_page_id",
    [
        ("15837", "15718"),
        (":0794f6ff8238481ab880b6484deb65f4", "15718"),
        ("15844", None),
        ("34", None),
        ("home", None),
    ],
)
def test_get_cover_page_id_by_id(
    client: MindtouchClient,
    current_id: str,
    expected_cover_page_id: str | None,
):
    """Ensures the cover page ID can be found from a page identifier string

    None is the expected value when there is no cover page for the identifier.
    """
    found_cover_page_id = client.get_cover_page_id(current_id)
    assert found_cover_page_id == expected_cover_page_id


@pytest.mark.parametrize(
"current_id, expected_cover_page_id",
[("15837", "15718"), ("15844", None), ("34", None)],
)
def test_get_cover_page_id_by_page(
client: MindtouchClient,
page_tree: LibraryTree,
current_id: str,
expected_cover_page_id: str | None,
):
page_15837 = page_tree.sub_tree("15837").root
assert client.get_cover_page_id(page_15837) == "15718"
page_object = page_tree.sub_tree(current_id).root
assert client.get_cover_page_id(page_object) == expected_cover_page_id


def test_get_home_screen_css_url(home: MindtouchHome):
Expand Down

0 comments on commit d961cc2

Please sign in to comment.