From 90dec6ba0476b89edf1ae4f8ce1402b3df6256b2 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 1 Oct 2024 09:15:47 +0000 Subject: [PATCH] Add ability to select only a subtree of the website --- scraper/src/libretexts2zim/client.py | 15 +++++++++ scraper/src/libretexts2zim/generator.py | 11 +++++++ scraper/tests-integration/test_client.py | 39 ++++++++++++++++++++---- 3 files changed, 59 insertions(+), 6 deletions(-) diff --git a/scraper/src/libretexts2zim/client.py b/scraper/src/libretexts2zim/client.py index a448e94..7a54501 100644 --- a/scraper/src/libretexts2zim/client.py +++ b/scraper/src/libretexts2zim/client.py @@ -57,6 +57,21 @@ class DekiTree(BaseModel): root: DekiPage pages: dict[str, DekiPage] = {} + def sub_tree(self, subroot_id: str) -> "DekiTree": + """Returns a sub-tree, starting at given page id""" + new_root = self.pages[subroot_id] + tree = DekiTree(root=new_root) + tree.pages[new_root.id] = new_root + children_to_explore = list(new_root.children) + while len(children_to_explore) > 0: + child = children_to_explore[0] + children_to_explore.remove(child) + if child.id in tree.pages: + continue # safe-guard + tree.pages[child.id] = child + children_to_explore.extend(child.children) + return tree + class LibreTextsMetadata(BaseModel): """Metadata about a course.""" diff --git a/scraper/src/libretexts2zim/generator.py b/scraper/src/libretexts2zim/generator.py index b2d4000..18a1d4f 100644 --- a/scraper/src/libretexts2zim/generator.py +++ b/scraper/src/libretexts2zim/generator.py @@ -45,6 +45,8 @@ class ContentFilter(BaseModel): page_id_include: str | None # If specified, page with title matching the regex are excluded. page_title_exclude: str | None + # If specified, only this page and its subpages will be included. 
+ root_page_id: str | None @staticmethod def add_flags(parser: argparse.ArgumentParser): @@ -72,6 +74,12 @@ def add_flags(parser: argparse.ArgumentParser): metavar="REGEX", ) + parser.add_argument( + "--root-page-id", + help="ID of the root page to include in ZIM. Only this page and its" + " subpages will be included in the ZIM", + ) + @staticmethod def of(namespace: argparse.Namespace) -> "ContentFilter": """Parses a namespace to create a new DocFilter.""" @@ -80,6 +88,9 @@ def of(namespace: argparse.Namespace) -> "ContentFilter": def filter(self, page_tree: DekiTree) -> list[DekiPage]: """Filters pages based on the user's choices.""" + if self.root_page_id: + page_tree = page_tree.sub_tree(self.root_page_id) + title_include_re = ( re.compile(self.page_title_include, re.IGNORECASE) if self.page_title_include diff --git a/scraper/tests-integration/test_client.py b/scraper/tests-integration/test_client.py index b556c94..b4f45f9 100644 --- a/scraper/tests-integration/test_client.py +++ b/scraper/tests-integration/test_client.py @@ -7,7 +7,7 @@ ) from zimscraperlib.image.probing import format_for -from libretexts2zim.client import LibreTextsClient, LibreTextsHome +from libretexts2zim.client import DekiTree, LibreTextsClient, LibreTextsHome @pytest.fixture(scope="module") @@ -40,6 +40,14 @@ def nb_root_children() -> int: return 6 +@pytest.fixture(scope="module") +def page_tree( + client: LibreTextsClient, + deki_token: str, # noqa: ARG001 +) -> DekiTree: + return client.get_page_tree() + + def test_get_deki_token(deki_token: str): """Ensures we achieve to get a deki_token""" assert deki_token @@ -62,15 +70,18 @@ def test_get_root_page_id( assert client.get_root_page_id() == root_page_id -def test_get_page_tree( - client: LibreTextsClient, +def test_get_page_tree_pages( + page_tree: DekiTree, minimum_number_of_pages: int, - deki_token: str, # noqa: ARG001 +): + assert len(page_tree.pages.keys()) > minimum_number_of_pages + + +def test_get_page_tree_root( + page_tree: 
DekiTree, root_page_id: str, nb_root_children: int, ): - page_tree = client.get_page_tree() - assert len(page_tree.pages.keys()) > minimum_number_of_pages assert page_tree.root.id == root_page_id assert len(page_tree.root.children) == nb_root_children assert page_tree.root.title @@ -78,6 +89,22 @@ def test_get_page_tree( assert child.title +def test_get_page_tree_subtree( + page_tree: DekiTree, +): + + # 28207 = https://geo.libretexts.org/Courses/Coastline_College/An_Introduction_To_Geology_-_Coastline_College/01%3A_Understanding_Science + subtree1 = page_tree.sub_tree("28207") + # 4 = "1. Understanding Science" + "1.1: What is Science?" + # + "1.2: The Scientific Method" + "1.3: The Study of Geology" + assert len(subtree1.pages.keys()) == 4 + + # 28196 = https://geo.libretexts.org/Courses/Coastline_College/An_Introduction_To_Geology_-_Coastline_College + subtree2 = page_tree.sub_tree("28196") + # 94 is number retrieved in Oct. 2024, might change + assert len(subtree2.pages.keys()) == 94 + + def test_get_home_image_url(home: LibreTextsHome): """Ensures proper image url is retrieved""" assert home.welcome_image_url == "https://cdn.libretexts.net/Logos/geo_full.png"