Skip to content

Commit

Permalink
Add ability to select only a subtree of the website
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Oct 1, 2024
1 parent 6ef358d commit 94461b9
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 6 deletions.
15 changes: 15 additions & 0 deletions scraper/src/libretexts2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,21 @@ class DekiTree(BaseModel):
root: DekiPage
pages: dict[str, DekiPage] = {}

def sub_tree(self, subroot_id: str) -> "DekiTree":
    """Return the sub-tree rooted at the given page ID.

    The returned tree's ``pages`` mapping contains the new root plus every
    page reachable from it through ``children``, registered in breadth-first
    order (the root first, then its children, then their children, ...).

    Args:
        subroot_id: ID of the page to use as root of the new tree. It must be
            a key of ``self.pages``.

    Returns:
        A new ``DekiTree`` whose ``root`` is the requested page.

    Raises:
        KeyError: If ``subroot_id`` is not a key of ``self.pages``.
    """
    new_root = self.pages[subroot_id]
    tree = DekiTree(root=new_root)
    tree.pages[new_root.id] = new_root

    # Walk the descendants one depth level at a time. This visits pages in the
    # same order as a FIFO queue would, without repeatedly removing the first
    # item of a list (which costs a full shift of the list on every page).
    current_level = list(new_root.children)
    while current_level:
        next_level: list[DekiPage] = []
        for child in current_level:
            if child.id in tree.pages:
                continue  # safe-guard: never register or expand a page twice
            tree.pages[child.id] = child
            next_level.extend(child.children)
        current_level = next_level
    return tree


class LibreTextsMetadata(BaseModel):
"""Metadata about a course."""
Expand Down
11 changes: 11 additions & 0 deletions scraper/src/libretexts2zim/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ class ContentFilter(BaseModel):
page_id_include: str | None
# If specified, page with title matching the regex are excluded.
page_title_exclude: str | None
# If specified, only this page and its subpages will be included.
root_page_id: str | None

@staticmethod
def add_flags(parser: argparse.ArgumentParser):
Expand Down Expand Up @@ -72,6 +74,12 @@ def add_flags(parser: argparse.ArgumentParser):
metavar="REGEX",
)

parser.add_argument(

Check warning on line 77 in scraper/src/libretexts2zim/generator.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/generator.py#L77

Added line #L77 was not covered by tests
"--root-page-id",
help="ID of the root page to include in ZIM. Only this page and its"
" subpages will be included in the ZIM",
)

@staticmethod
def of(namespace: argparse.Namespace) -> "ContentFilter":
"""Parses a namespace to create a new DocFilter."""
Expand All @@ -80,6 +88,9 @@ def of(namespace: argparse.Namespace) -> "ContentFilter":
def filter(self, page_tree: DekiTree) -> list[DekiPage]:
"""Filters pages based on the user's choices."""

if self.root_page_id:
page_tree = page_tree.sub_tree(self.root_page_id)

title_include_re = (
re.compile(self.page_title_include, re.IGNORECASE)
if self.page_title_include
Expand Down
39 changes: 33 additions & 6 deletions scraper/tests-integration/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
)
from zimscraperlib.image.probing import format_for

from libretexts2zim.client import LibreTextsClient, LibreTextsHome
from libretexts2zim.client import DekiTree, LibreTextsClient, LibreTextsHome


@pytest.fixture(scope="module")
Expand Down Expand Up @@ -40,6 +40,14 @@ def nb_root_children() -> int:
return 6


@pytest.fixture(scope="module")
def page_tree(
    client: LibreTextsClient,
    deki_token: str,  # noqa: ARG001
) -> DekiTree:
    """Page tree returned by the client, built once per test module.

    ``deki_token`` is requested but not used in the body; presumably it only
    forces the token fixture to run before the tree is fetched -- confirm
    against the ``deki_token`` fixture.
    """
    fetched_tree = client.get_page_tree()
    return fetched_tree


def test_get_deki_token(deki_token: str):
"""Ensures we achieve to get a deki_token"""
assert deki_token
Expand All @@ -62,22 +70,41 @@ def test_get_root_page_id(
assert client.get_root_page_id() == root_page_id


def test_get_page_tree(
client: LibreTextsClient,
def test_get_page_tree_pages(
page_tree: DekiTree,
minimum_number_of_pages: int,
deki_token: str, # noqa: ARG001
):
assert len(page_tree.pages.keys()) > minimum_number_of_pages


def test_get_page_tree_root(
page_tree: DekiTree,
root_page_id: str,
nb_root_children: int,
):
page_tree = client.get_page_tree()
assert len(page_tree.pages.keys()) > minimum_number_of_pages
assert page_tree.root.id == root_page_id
assert len(page_tree.root.children) == nb_root_children
assert page_tree.root.title
for child in page_tree.root.children:
assert child.title


def test_get_page_tree_subtree(
    page_tree: DekiTree,
):
    """Check how many pages end up in sub-trees extracted from the page tree."""
    # 28207 = https://geo.libretexts.org/Courses/Coastline_College/An_Introduction_To_Geology_-_Coastline_College/01%3A_Understanding_Science
    # Expected pages (4): "1. Understanding Science", "1.1: What is Science?",
    # "1.2: The Scientific Method" and "1.3: The Study of Geology"
    chapter_tree = page_tree.sub_tree("28207")
    assert len(chapter_tree.pages) == 4

    # 28196 = https://geo.libretexts.org/Courses/Coastline_College/An_Introduction_To_Geology_-_Coastline_College
    # 94 is the number retrieved in Oct. 2024, it might change
    course_tree = page_tree.sub_tree("28196")
    assert len(course_tree.pages) == 94


def test_get_home_image_url(home: LibreTextsHome):
"""Ensures proper image url is retrieved"""
assert home.welcome_image_url == "https://cdn.libretexts.net/Logos/geo_full.png"
Expand Down
30 changes: 30 additions & 0 deletions scraper/tests/test_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def deki_tree() -> DekiTree:
page_title_include=r"^1\..*",
page_title_exclude=None,
page_id_include=None,
root_page_id=None,
),
["24", "25", "26", "27", "28"],
id="include_1",
Expand All @@ -68,6 +69,7 @@ def deki_tree() -> DekiTree:
page_title_include=r"^2\..*",
page_title_exclude=None,
page_id_include=None,
root_page_id=None,
),
["24", "29", "30", "31", "32"],
id="include_2",
Expand All @@ -77,6 +79,7 @@ def deki_tree() -> DekiTree:
page_title_include=None,
page_title_exclude=None,
page_id_include="26,27,28",
root_page_id=None,
),
["24", "25", "26", "27", "28"],
id="include_3",
Expand All @@ -86,6 +89,7 @@ def deki_tree() -> DekiTree:
page_title_include="ground",
page_title_exclude=None,
page_id_include=None,
root_page_id=None,
),
["24", "29", "30", "33", "34"],
id="include_4",
Expand All @@ -95,6 +99,7 @@ def deki_tree() -> DekiTree:
page_title_include=r"^1\..*",
page_title_exclude="Tree",
page_id_include=None,
root_page_id=None,
),
["24", "25", "26", "28"],
id="include_exclude_1",
Expand All @@ -104,6 +109,7 @@ def deki_tree() -> DekiTree:
page_title_include=None,
page_title_exclude="Tree",
page_id_include="26,27,28",
root_page_id=None,
),
["24", "25", "26", "28"],
id="include_exclude_2",
Expand All @@ -113,6 +119,7 @@ def deki_tree() -> DekiTree:
page_title_include="ground",
page_title_exclude="^2",
page_id_include=None,
root_page_id=None,
),
["24", "33", "34"],
id="include_exclude_3",
Expand All @@ -122,6 +129,7 @@ def deki_tree() -> DekiTree:
page_title_include=r"^1\..*",
page_title_exclude="tree",
page_id_include=None,
root_page_id=None,
),
["24", "25", "26", "28"],
id="include_exclude_case_insensitive",
Expand All @@ -131,6 +139,7 @@ def deki_tree() -> DekiTree:
page_title_include="tree",
page_title_exclude=None,
page_id_include=None,
root_page_id=None,
),
["24", "25", "27"],
id="include_case_insensitive",
Expand All @@ -140,10 +149,31 @@ def deki_tree() -> DekiTree:
page_title_include="^tree",
page_title_exclude=None,
page_id_include=None,
root_page_id=None,
),
[],
id="include_no_match",
),
pytest.param(
ContentFilter(
page_title_include=None,
page_title_exclude=None,
page_id_include=None,
root_page_id="25",
),
["25", "26", "27", "28"],
id="root_page_id",
),
pytest.param(
ContentFilter(
page_title_include=r"^1\.1.*",
page_title_exclude=None,
page_id_include=None,
root_page_id="25",
),
["25", "26"],
id="root_page_id_and_include",
),
],
)
def test_content_filter(
Expand Down

0 comments on commit 94461b9

Please sign in to comment.