Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fetch only the required subtree of the website instead of the whole site #102

Open
wants to merge 1 commit into
base: flexbooks
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 72 additions & 35 deletions scraper/src/mindtouch2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
"""

tags: list[str]
parent_id: str | None


class LibraryPage(BaseModel):
Expand Down Expand Up @@ -231,18 +232,10 @@

return page_ids

def get_root_page_id(self) -> LibraryPageId:
"""Returns the ID the root of the tree of pages"""

tree = self._get_api_json(
"/pages/home/tree", timeout=CONTEXT.http_timeout_long_seconds
)
return tree["page"]["@id"]

def get_page_tree(self) -> LibraryTree:
def get_page_tree(self, page: str = "home") -> LibraryTree:

tree_data = self._get_api_json(
"/pages/home/tree", timeout=CONTEXT.http_timeout_long_seconds
f"/pages/{page}/tree", timeout=CONTEXT.http_timeout_long_seconds
)

root = LibraryPage(
Expand Down Expand Up @@ -307,32 +300,41 @@
)
return LibraryPageContent(html_body=tree["body"][0])

def get_page_definition(self, page: LibraryPage) -> LibraryPageDefinition:
def get_page_definition(self, page: LibraryPage | str) -> LibraryPageDefinition:
"""Return the definition of a given page

Definition is kept in memory, and retrieved on-demand when it is not yet there
"""
if page.definition is None:
raw_definition = self._get_api_json(
f"/pages/{page.id}", timeout=CONTEXT.http_timeout_normal_seconds
)
raw_tags = raw_definition.get("tags", None)
if raw_tags is None:
raise MindtouchParsingError(f"No tags property for page {page.id}")
raw_tag = raw_tags.get("tag", None)
if raw_tag is None:
raise MindtouchParsingError(f"No tag property for page {page.id}")
if isinstance(raw_tag, list):
tags = [item.get("@value") for item in raw_tag]
else:
tags = [raw_tag.get("@value")]
page.definition = LibraryPageDefinition(
tags=tags,
)
return page.definition
if isinstance(page, str):
page_id = page

Check warning on line 309 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L309

Added line #L309 was not covered by tests
elif page.definition is not None:
return page.definition

Check warning on line 311 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L311

Added line #L311 was not covered by tests
else:
page_id = page.id

Check warning on line 313 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L313

Added line #L313 was not covered by tests

raw_definition = self._get_api_json(

Check warning on line 315 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L315

Added line #L315 was not covered by tests
f"/pages/{page_id}", timeout=CONTEXT.http_timeout_normal_seconds
)
raw_tags = raw_definition.get("tags", None)

Check warning on line 318 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L318

Added line #L318 was not covered by tests
if raw_tags is None:
raise MindtouchParsingError(f"No tags property for page {page_id}")
raw_tag = raw_tags.get("tag", None)

Check warning on line 321 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L320-L321

Added lines #L320 - L321 were not covered by tests
if raw_tag is None:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't we combine those two conditions into one?

raise MindtouchParsingError(f"No tag property for page {page_id}")

Check warning on line 323 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L323

Added line #L323 was not covered by tests
if isinstance(raw_tag, list):
tags = [item.get("@value") for item in raw_tag]

Check warning on line 325 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L325

Added line #L325 was not covered by tests
else:
tags = [raw_tag.get("@value")]
parent = raw_definition.get("page.parent", None)
page_definition = LibraryPageDefinition(

Check warning on line 329 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L327-L329

Added lines #L327 - L329 were not covered by tests
tags=tags, parent_id=None if parent is None else parent["@id"]
)
if isinstance(page, LibraryPage):
page.definition = page_definition
return page_definition

Check warning on line 334 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L333-L334

Added lines #L333 - L334 were not covered by tests

def get_cover_page(self, page: LibraryPage) -> LibraryPage:
"""Get the cover page of a given page
def get_cover_page(self, page: LibraryPage) -> LibraryPage | None:
"""Get the cover page of a given page object

Logic originally defined in `getCoverpage` function of
https://cdn.libretexts.net/github/LibreTextsMain/Miscellaneous/reuse.js
Expand All @@ -351,19 +353,54 @@
or "coverpage:nocommons" in current_definition.tags
):
return current_page
if "article:topic-category" in current_definition.tags:
return None

Check warning on line 357 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L357

Added line #L357 was not covered by tests
if current_page.parent is None:
raise MindtouchParsingError(
f"No more parent for {page.id}, reached root at {current_page.id}"
)
current_page = current_page.parent

def get_cover_page_encoded_url(self, page: LibraryPage) -> str:
def _get_cover_page_from_str_id(self, page_id: str) -> str | None:
"""Get the cover page ID of a given page identifier as string

Logic originally defined in `getCoverpage` function of
https://cdn.libretexts.net/github/LibreTextsMain/Miscellaneous/reuse.js

Probably originates from getCoverpage function of
https://github.com/LibreTexts/Libretext/blob/master/public/Miscellaneous/reuse.js

See https://github.com/openzim/mindtouch/issues/68 for a copy of original code
"""
current_page = page_id
while True:
current_definition = self.get_page_definition(current_page)

Check warning on line 377 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L375-L377

Added lines #L375 - L377 were not covered by tests
if (
"coverpage:yes" in current_definition.tags
or "coverpage:toc" in current_definition.tags
or "coverpage:nocommons" in current_definition.tags
):
return current_page

Check warning on line 383 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L383

Added line #L383 was not covered by tests
if "article:topic-category" in current_definition.tags:
return None

Check warning on line 385 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L385

Added line #L385 was not covered by tests
if current_definition.parent_id is None:
raise MindtouchParsingError(

Check warning on line 387 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L387

Added line #L387 was not covered by tests
f"No more parent for {page_id}, reached root at {current_page}"
)
current_page = current_definition.parent_id

Check warning on line 390 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L390

Added line #L390 was not covered by tests

def get_cover_page_encoded_url(self, page: LibraryPage) -> str | None:
"""Returns the url for the book page for a given child page"""
return self.get_cover_page(page).encoded_url
cover_page = self.get_cover_page(page)
return cover_page.encoded_url if cover_page is not None else None

Check warning on line 395 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L394-L395

Added lines #L394 - L395 were not covered by tests

def get_cover_page_id(self, page: LibraryPage) -> str:
def get_cover_page_id(self, page: LibraryPage | str) -> str | None:
"""Returns the id for the book page for a given child page"""
return self.get_cover_page(page).id
if isinstance(page, LibraryPage):
cover_page = self.get_cover_page(page)
return cover_page.id if cover_page is not None else None

Check warning on line 401 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L400-L401

Added lines #L400 - L401 were not covered by tests
else:
return self._get_cover_page_from_str_id(page)

Check warning on line 403 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L403

Added line #L403 was not covered by tests

def get_template_content(self, page_id: str, template: str) -> str:
"""Returns the templated content of a given page"""
Expand Down
8 changes: 5 additions & 3 deletions scraper/src/mindtouch2zim/libretexts/detailed_licensing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from mindtouch2zim.client import LibraryPage, MindtouchClient
from mindtouch2zim.constants import logger
from mindtouch2zim.context import CONTEXT
from mindtouch2zim.libretexts.errors import BadBookPageError


class LicenseStatistic(BaseModel):
Expand Down Expand Up @@ -88,11 +89,12 @@

"""

cover_page_url = mindtouch_client.get_cover_page_encoded_url(page)

Check warning on line 92 in scraper/src/mindtouch2zim/libretexts/detailed_licensing.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/detailed_licensing.py#L92

Added line #L92 was not covered by tests
if cover_page_url is None:
raise BadBookPageError()

Check warning on line 94 in scraper/src/mindtouch2zim/libretexts/detailed_licensing.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/detailed_licensing.py#L94

Added line #L94 was not covered by tests
return rewriter.rewrite(
_render_html_from_data(
jinja2_template=jinja2_template,
licensing_data=_get_licensing_report_data(
mindtouch_client.get_cover_page_encoded_url(page)
),
licensing_data=_get_licensing_report_data(cover_page_url),
)
).content
4 changes: 4 additions & 0 deletions scraper/src/mindtouch2zim/libretexts/errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
class BadBookPageError(Exception):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`BadBookPage` implies that the book page has an issue. In this case, if I understand correctly, the problem is more with the request. If so, `IncorrectBookPage` seems more appropriate.

"""Raised when we are processing a special book page but we are not inside a book"""

pass
6 changes: 5 additions & 1 deletion scraper/src/mindtouch2zim/libretexts/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from zimscraperlib.rewriting.html import HtmlRewriter

from mindtouch2zim.client import LibraryPage, MindtouchClient
from mindtouch2zim.libretexts.errors import BadBookPageError


class IndexPage(BaseModel):
Expand All @@ -28,11 +29,14 @@
page: LibraryPage,
) -> str:
"""Get and rewrite index HTML"""
cover_page_id = mindtouch_client.get_cover_page_id(page)

Check warning on line 32 in scraper/src/mindtouch2zim/libretexts/index.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/index.py#L32

Added line #L32 was not covered by tests
if cover_page_id is None:
raise BadBookPageError()

Check warning on line 34 in scraper/src/mindtouch2zim/libretexts/index.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/index.py#L34

Added line #L34 was not covered by tests
return get_libretexts_transformed_html(
jinja2_template=jinja2_template,
libretexts_template_content=rewriter.rewrite(
mindtouch_client.get_template_content(
page_id=mindtouch_client.get_cover_page_id(page),
page_id=cover_page_id,
template="=Template%253AMindTouch%252FIDF3%252FViews%252FTag_directory",
)
).content,
Expand Down
7 changes: 6 additions & 1 deletion scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,12 @@

logger.info("Fetching pages tree")
CONTEXT.processing_step = "pages tree"
pages_tree = self.mindtouch_client.get_page_tree()
root_page_id = self.content_filter.root_page_id or "home"
cover_page_id = (

Check warning on line 348 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L347-L348

Added lines #L347 - L348 were not covered by tests
self.mindtouch_client.get_cover_page_id(root_page_id)
or root_page_id # if --root-page-id is not inside a book but a category
)
pages_tree = self.mindtouch_client.get_page_tree(cover_page_id)

Check warning on line 352 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L352

Added line #L352 was not covered by tests
selected_pages = self.content_filter.filter(pages_tree)
logger.info(
f"{len(selected_pages)} pages (out of {len(pages_tree.pages)}) will be "
Expand Down
82 changes: 65 additions & 17 deletions scraper/tests-integration/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,27 +16,46 @@


@pytest.fixture(scope="module")
def client(libretexts_url: str, cache_folder: Path) -> MindtouchClient:
def raw_client(libretexts_url: str, cache_folder: Path) -> MindtouchClient:
CONTEXT.library_url = libretexts_url
CONTEXT.cache_folder = cache_folder
return MindtouchClient()


@pytest.fixture(scope="module")
def client(
raw_client: MindtouchClient,
deki_token: str, # noqa: ARG001
) -> MindtouchClient:
"""already authenticated client (avoid having to fetch deki_token in tests)"""
return raw_client


@pytest.fixture(scope="module")
def home(client: MindtouchClient) -> MindtouchHome:
return client.get_home()


@pytest.fixture(scope="module")
def deki_token(client: MindtouchClient) -> str:
return client.get_deki_token()
def deki_token(raw_client: MindtouchClient) -> str:
return raw_client.get_deki_token()


@pytest.fixture(scope="module")
def minimum_number_of_pages() -> int:
return 8000


@pytest.fixture(scope="module")
def somewhere_page_id() -> LibraryPageId:
return "15728"


@pytest.fixture(scope="module")
def nb_somewhere_children() -> int:
return 5


@pytest.fixture(scope="module")
def root_page_id() -> LibraryPageId:
return "34"
Expand All @@ -50,7 +69,6 @@ def nb_root_children() -> int:
@pytest.fixture(scope="module")
def page_tree(
client: MindtouchClient,
deki_token: str, # noqa: ARG001
) -> LibraryTree:
return client.get_page_tree()

Expand All @@ -63,20 +81,11 @@ def test_get_deki_token(deki_token: str):
def test_get_all_pages_ids(
client: MindtouchClient,
minimum_number_of_pages: int,
deki_token: str, # noqa: ARG001
):
pages_ids = client.get_all_pages_ids()
assert len(pages_ids) > minimum_number_of_pages


def test_get_root_page_id(
client: MindtouchClient,
root_page_id: LibraryPageId,
deki_token: str, # noqa: ARG001
):
assert client.get_root_page_id() == root_page_id


def test_get_page_tree_pages(
page_tree: LibraryTree,
minimum_number_of_pages: int,
Expand Down Expand Up @@ -112,6 +121,19 @@ def test_get_page_tree_subtree(
assert len(subtree2.pages.keys()) == 94


def test_get_page_tree_somewhere(
client: MindtouchClient,
somewhere_page_id: str,
nb_somewhere_children: int,
):
page_tree = client.get_page_tree(somewhere_page_id)
assert page_tree.root.id == somewhere_page_id
assert len(page_tree.root.children) == nb_somewhere_children
assert page_tree.root.title
for child in page_tree.root.children:
assert child.title


def test_get_home_image_url(home: MindtouchHome):
"""Ensures proper image url is retrieved"""
assert home.welcome_image_url == "https://cdn.libretexts.net/Logos/geo_full.png"
Expand Down Expand Up @@ -144,8 +166,10 @@ def test_get_index_page_from_template(
):
"""Ensures we can get content of an index page"""
page_15837 = page_tree.sub_tree("15837").root
cover_page_id = client.get_cover_page_id(page_15837)
assert cover_page_id
assert client.get_template_content(
page_id=client.get_cover_page_id(page_15837),
page_id=cover_page_id,
template="=Template%253AMindTouch%252FIDF3%252FViews%252FTag_directory",
)

Expand All @@ -162,12 +186,36 @@ def test_get_cover_page_encoded_url(
)


def test_get_cover_page_id(
@pytest.mark.parametrize(
"current_id, expected_cover_page_id",
[
("15837", "15718"),
(":0794f6ff8238481ab880b6484deb65f4", "15718"),
("15844", None),
("34", None),
("home", None),
],
)
def test_get_cover_page_id_by_id(
client: MindtouchClient,
current_id: str,
expected_cover_page_id: str | None,
):
assert client.get_cover_page_id(current_id) == expected_cover_page_id


@pytest.mark.parametrize(
"current_id, expected_cover_page_id",
[("15837", "15718"), ("15844", None), ("34", None)],
)
def test_get_cover_page_id_by_page(
client: MindtouchClient,
page_tree: LibraryTree,
current_id: str,
expected_cover_page_id: str | None,
):
page_15837 = page_tree.sub_tree("15837").root
assert client.get_cover_page_id(page_15837) == "15718"
page_object = page_tree.sub_tree(current_id).root
assert client.get_cover_page_id(page_object) == expected_cover_page_id


def test_get_home_screen_css_url(home: MindtouchHome):
Expand Down