diff --git a/scraper/src/libretexts2zim/client.py b/scraper/src/libretexts2zim/client.py index 1689693..99bd7b7 100644 --- a/scraper/src/libretexts2zim/client.py +++ b/scraper/src/libretexts2zim/client.py @@ -24,22 +24,27 @@ class LibreTextsHome(BaseModel): welcome_image_url: str -class DekiPage(BaseModel): - id: str +LibraryPageId = str + + +class LibraryPage(BaseModel): + """Class holding information about a given library page on the library tree""" + + id: LibraryPageId title: str - parent: "DekiPage | None" = None - children: list["DekiPage"] = [] + parent: "LibraryPage | None" = None + children: list["LibraryPage"] = [] def __repr__(self) -> str: return ( - f"DekiPage(id='{self.id}', title='{self.title}', " + f"LibraryPage(id='{self.id}', title='{self.title}', " f"parent='{'None' if not self.parent else self.parent.id}', " f"children='{','.join([child.id for child in self.children])}')" ) @property - def self_and_parents(self) -> list["DekiPage"]: - result: list[DekiPage] = [self] + def self_and_parents(self) -> list["LibraryPage"]: + result: list[LibraryPage] = [self] current = self while current.parent is not None: result.append(current.parent) @@ -47,14 +52,16 @@ def self_and_parents(self) -> list["DekiPage"]: return result -class DekiTree(BaseModel): - root: DekiPage - pages: dict[str, DekiPage] = {} +class LibraryTree(BaseModel): + """Class holding information about the tree of pages on a given library""" - def sub_tree(self, subroot_id: str) -> "DekiTree": + root: LibraryPage + pages: dict[LibraryPageId, LibraryPage] = {} + + def sub_tree(self, subroot_id: LibraryPageId) -> "LibraryTree": """Returns a sub-tree, starting at give page id""" new_root = self.pages[subroot_id] - tree = DekiTree(root=new_root) + tree = LibraryTree(root=new_root) tree.pages[new_root.id] = new_root children_to_explore = [*new_root.children] while len(children_to_explore) > 0: @@ -205,12 +212,12 @@ def get_deki_token(self) -> str: self.deki_token = _get_deki_token_from_home(soup) 
return self.deki_token - def get_all_pages_ids(self): + def get_all_pages_ids(self) -> list[LibraryPageId]: """Returns the IDs of all pages on current website, exploring the whole tree""" tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS) - page_ids: list[str] = [] + page_ids: list[LibraryPageId] = [] def _get_page_ids(page_node: Any) -> None: page_ids.append(page_node["@id"]) @@ -226,31 +233,33 @@ def _get_page_ids(page_node: Any) -> None: return page_ids - def get_root_page_id(self) -> str: + def get_root_page_id(self) -> LibraryPageId: """Returns the ID the root of the tree of pages""" tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS) return tree["page"]["@id"] - def get_page_tree(self) -> DekiTree: + def get_page_tree(self) -> LibraryTree: tree_data = self._get_api_json( "/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS ) - root = DekiPage(id=tree_data["page"]["@id"], title=tree_data["page"]["title"]) - tree_obj = DekiTree(root=root) + root = LibraryPage( + id=tree_data["page"]["@id"], title=tree_data["page"]["title"] + ) + tree_obj = LibraryTree(root=root) tree_obj.pages[root.id] = root - def _add_page(page_node: Any, parent: DekiPage) -> DekiPage: - page = DekiPage( + def _add_page(page_node: Any, parent: LibraryPage) -> LibraryPage: + page = LibraryPage( id=page_node["@id"], title=page_node["title"], parent=parent ) parent.children.append(page) tree_obj.pages[page.id] = page return page - def _process_tree_data(page_node: Any, parent: DekiPage) -> None: + def _process_tree_data(page_node: Any, parent: LibraryPage) -> None: if not page_node["subpages"]: return if "@id" in page_node["subpages"]["page"]: diff --git a/scraper/src/libretexts2zim/entrypoint.py b/scraper/src/libretexts2zim/entrypoint.py index 04b2ac5..80a6ca2 100644 --- a/scraper/src/libretexts2zim/entrypoint.py +++ b/scraper/src/libretexts2zim/entrypoint.py @@ -128,17 +128,18 @@ def add_content_filter_flags(parser: 
argparse.ArgumentParser): parser.add_argument( "--page-title-include", help="Includes only pages with title matching the given regular " - "expression, and their parent pages for proper navigation. Can be combined" - " with --page-id-include (pages with matching title or id will be included" - ")", + "expression, and their parent pages for proper navigation, up to root (or " + "subroot if --root-page-id is set). Can be combined with --page-id-include " + "(pages with matching title or id will be included)", metavar="REGEX", ) parser.add_argument( "--page-id-include", - help="CSV value of page ids to include. Parent pages will be included as " - "well for proper navigation. Can be combined with --page-title-include " - "(pages with matching title or id will be included)", + help="CSV of page ids to include. Parent pages will be included as " + "well for proper navigation, up to root (or subroot if --root-page-id is set). " + "Can be combined with --page-title-include (pages with matching title or id " + "will be included)", ) parser.add_argument( diff --git a/scraper/src/libretexts2zim/processor.py b/scraper/src/libretexts2zim/processor.py index 51bacec..4862afb 100644 --- a/scraper/src/libretexts2zim/processor.py +++ b/scraper/src/libretexts2zim/processor.py @@ -14,8 +14,9 @@ from zimscraperlib.zim.indexing import IndexData from libretexts2zim.client import ( - DekiPage, - DekiTree, + LibraryPage, + LibraryPageId, + LibraryTree, LibreTextsClient, LibreTextsMetadata, ) @@ -53,7 +54,7 @@ def of(namespace: argparse.Namespace) -> "ContentFilter": """Parses a namespace to create a new DocFilter.""" return ContentFilter.model_validate(namespace, from_attributes=True) - def filter(self, page_tree: DekiTree) -> list[DekiPage]: + def filter(self, page_tree: LibraryTree) -> list[LibraryPage]: """Filters pages based on the user's choices.""" if self.root_page_id: @@ -78,8 +79,8 @@ def filter(self, page_tree: DekiTree) -> list[DekiPage]: def is_selected( title_include_re: 
re.Pattern[str] | None, title_exclude_re: re.Pattern[str] | None, - id_include: list[str] | None, - page: DekiPage, + id_include: list[LibraryPageId] | None, + page: LibraryPage, ) -> bool: return ( ( diff --git a/scraper/tests-integration/test_client.py b/scraper/tests-integration/test_client.py index b4f45f9..5217470 100644 --- a/scraper/tests-integration/test_client.py +++ b/scraper/tests-integration/test_client.py @@ -7,7 +7,12 @@ ) from zimscraperlib.image.probing import format_for -from libretexts2zim.client import DekiTree, LibreTextsClient, LibreTextsHome +from libretexts2zim.client import ( + LibraryPageId, + LibraryTree, + LibreTextsClient, + LibreTextsHome, +) @pytest.fixture(scope="module") @@ -31,7 +36,7 @@ def minimum_number_of_pages() -> int: @pytest.fixture(scope="module") -def root_page_id() -> str: +def root_page_id() -> LibraryPageId: return "34" @@ -44,7 +49,7 @@ def nb_root_children() -> int: def page_tree( client: LibreTextsClient, deki_token: str, # noqa: ARG001 -) -> DekiTree: +) -> LibraryTree: return client.get_page_tree() @@ -64,21 +69,21 @@ def test_get_all_pages_ids( def test_get_root_page_id( client: LibreTextsClient, - root_page_id: str, + root_page_id: LibraryPageId, deki_token: str, # noqa: ARG001 ): assert client.get_root_page_id() == root_page_id def test_get_page_tree_pages( - page_tree: DekiTree, + page_tree: LibraryTree, minimum_number_of_pages: int, ): assert len(page_tree.pages.keys()) > minimum_number_of_pages def test_get_page_tree_root( - page_tree: DekiTree, + page_tree: LibraryTree, root_page_id: str, nb_root_children: int, ): @@ -90,7 +95,7 @@ def test_get_page_tree_root( def test_get_page_tree_subtree( - page_tree: DekiTree, + page_tree: LibraryTree, ): # 28207 = https://geo.libretexts.org/Courses/Coastline_College/An_Introduction_To_Geology_-_Coastline_College/01%3A_Understanding_Science diff --git a/scraper/tests/test_processor.py b/scraper/tests/test_processor.py index 9953738..50e6c4f 100644 --- 
a/scraper/tests/test_processor.py +++ b/scraper/tests/test_processor.py @@ -1,37 +1,37 @@ import pytest -from libretexts2zim.client import DekiPage, DekiTree +from libretexts2zim.client import LibraryPage, LibraryTree from libretexts2zim.processor import ContentFilter @pytest.fixture(scope="module") -def deki_tree() -> DekiTree: - root = DekiPage(id="24", title="Home page") - topic1 = DekiPage(id="25", title="1: First topic", parent=root) +def library_tree() -> LibraryTree: + root = LibraryPage(id="24", title="Home page") + topic1 = LibraryPage(id="25", title="1: First topic", parent=root) root.children.append(topic1) - topic1_1 = DekiPage(id="26", title="1.1: Cloud", parent=topic1) + topic1_1 = LibraryPage(id="26", title="1.1: Cloud", parent=topic1) topic1.children.append(topic1_1) - topic1_2 = DekiPage(id="27", title="1.2: Tree", parent=topic1) + topic1_2 = LibraryPage(id="27", title="1.2: Tree", parent=topic1) topic1.children.append(topic1_2) - topic1_3 = DekiPage(id="28", title="1.3: Bees", parent=topic1) + topic1_3 = LibraryPage(id="28", title="1.3: Bees", parent=topic1) topic1.children.append(topic1_3) - topic2 = DekiPage(id="29", title="2: Second topic", parent=root) + topic2 = LibraryPage(id="29", title="2: Second topic", parent=root) root.children.append(topic2) - topic2_1 = DekiPage(id="30", title="2.1: Underground", parent=topic2) + topic2_1 = LibraryPage(id="30", title="2.1: Underground", parent=topic2) topic2.children.append(topic2_1) - topic2_2 = DekiPage(id="31", title="2.2: Lava", parent=topic2) + topic2_2 = LibraryPage(id="31", title="2.2: Lava", parent=topic2) topic2.children.append(topic2_2) - topic2_3 = DekiPage(id="32", title="2.3: Volcano", parent=topic2) + topic2_3 = LibraryPage(id="32", title="2.3: Volcano", parent=topic2) topic2.children.append(topic2_3) - topic3 = DekiPage(id="33", title="3: Third topic", parent=root) + topic3 = LibraryPage(id="33", title="3: Third topic", parent=root) root.children.append(topic3) - topic3_1 = 
DekiPage(id="34", title="3.1: Ground", parent=topic3) + topic3_1 = LibraryPage(id="34", title="3.1: Ground", parent=topic3) topic3.children.append(topic3_1) - topic3_2 = DekiPage(id="35", title="3.2: Earth", parent=topic3) + topic3_2 = LibraryPage(id="35", title="3.2: Earth", parent=topic3) topic3.children.append(topic3_2) - topic3_3 = DekiPage(id="36", title="3.3: Sky", parent=topic3) + topic3_3 = LibraryPage(id="36", title="3.3: Sky", parent=topic3) topic3.children.append(topic3_3) - return DekiTree( + return LibraryTree( root=root, pages={ root.id: root, @@ -177,6 +177,6 @@ def deki_tree() -> DekiTree: ], ) def test_content_filter( - content_filter: ContentFilter, expected_ids: list[str], deki_tree: DekiTree + content_filter: ContentFilter, expected_ids: list[str], library_tree: LibraryTree ): - assert [page.id for page in content_filter.filter(deki_tree)] == expected_ids + assert [page.id for page in content_filter.filter(library_tree)] == expected_ids