From bd1b95d0636817973a38ebe97afa5ff899d25b56 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 19 Nov 2024 07:38:15 +0000 Subject: [PATCH] Retrieve and rewrite Detailed licensing pages content --- .github/workflows/PublishDockerDevImage.yaml | 1 + .../libretexts/detailed_licensing.py | 98 +++++++++++++++++++ scraper/src/mindtouch2zim/processor.py | 19 +++- .../libretexts.detailed-licensing.html.jinja2 | 41 ++++++++ .../libretexts/test_detailed_licensing.py | 65 ++++++++++++ 5 files changed, 223 insertions(+), 1 deletion(-) create mode 100644 scraper/src/mindtouch2zim/libretexts/detailed_licensing.py create mode 100644 scraper/src/mindtouch2zim/templates/libretexts.detailed-licensing.html.jinja2 create mode 100644 scraper/tests-integration/libretexts/test_detailed_licensing.py diff --git a/.github/workflows/PublishDockerDevImage.yaml b/.github/workflows/PublishDockerDevImage.yaml index f801f4f..c97bfb8 100644 --- a/.github/workflows/PublishDockerDevImage.yaml +++ b/.github/workflows/PublishDockerDevImage.yaml @@ -4,6 +4,7 @@ on: push: branches: - main + workflow_dispatch: jobs: publish: diff --git a/scraper/src/mindtouch2zim/libretexts/detailed_licensing.py b/scraper/src/mindtouch2zim/libretexts/detailed_licensing.py new file mode 100644 index 0000000..fc8fb7d --- /dev/null +++ b/scraper/src/mindtouch2zim/libretexts/detailed_licensing.py @@ -0,0 +1,98 @@ +from typing import Any + +from jinja2 import Template +from pydantic import BaseModel +from zimscraperlib.rewriting.html import HtmlRewriter + +from mindtouch2zim.client import LibraryPage, MindtouchClient +from mindtouch2zim.constants import logger +from mindtouch2zim.context import CONTEXT + + +class LicenseStatistic(BaseModel): + label: str + version: str | None + percent: float + count: int + link: str + + +class LicenseInfo(BaseModel): + statistics: list[LicenseStatistic] + details: list + + +class PageInfo(BaseModel): + license_label: str + license_version: str + url: str + title: str + children: list["PageInfo"] + + +def _get_licensing_report_data(cover_url: str) -> Any: + """ + Get licensing report from libretexts.org + + Logic to get the data has been adapted from `buildLicensingReport` function + at https://cdn.libretexts.net/github/LibreTextsMain/DynamicLicensing/dist/dynamicLicensing.min.js + + Probably coming from + https://github.com/LibreTexts/Libretext/blob/master/public/DynamicLicensing/dynamicLicensing.js + """ + api_url = f"https://api.libretexts.org/endpoint/licensereport/{cover_url}" + logger.debug(f"Calling API at {api_url}") + resp = CONTEXT.web_session.get( + url=api_url, + headers={"Origin": "https://www.libretexts.org"}, # kinda authorization header + timeout=CONTEXT.http_timeout_long_seconds, + ) + resp.raise_for_status() + return resp.json() + + +def _render_html_from_data(jinja2_template: Template, licensing_data: Any) -> str: + if not licensing_data.get("meta", {}).get("specialRestrictions", None): + special_restrictions = None + else: + + def get_restriction_label(restriction_key: str): + if restriction_key == "noncommercial": + return "Noncommercial" + elif restriction_key == "noderivatives": + return "No Derivatives" + elif restriction_key == "fairuse": + return "Fair Use" + else: + return restriction_key + + special_restrictions = ", ".join( + [ + get_restriction_label(restriction) + for restriction in licensing_data["meta"]["specialRestrictions"] + ] + ) + return jinja2_template.render( + data=licensing_data, special_restrictions=special_restrictions + ) + + +def rewrite_detailed_licensing( + rewriter: HtmlRewriter, + jinja2_template: Template, + mindtouch_client: MindtouchClient, + page: LibraryPage, +) -> str: + """ + Get and statically rewrite the detailed licensing info of libretexts.org + + """ + + return rewriter.rewrite( + _render_html_from_data( + jinja2_template=jinja2_template, + licensing_data=_get_licensing_report_data( + mindtouch_client.get_cover_page_encoded_url(page) + ), + ) + ).content diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py index 2168ff1..00dcb61 100644 --- a/scraper/src/mindtouch2zim/processor.py +++ b/scraper/src/mindtouch2zim/processor.py @@ -47,6 +47,7 @@ from mindtouch2zim.errors import NoIllustrationFoundError from mindtouch2zim.html import get_text from mindtouch2zim.html_rewriting import HtmlUrlsRewriter +from mindtouch2zim.libretexts.detailed_licensing import rewrite_detailed_licensing from mindtouch2zim.libretexts.glossary import rewrite_glossary from mindtouch2zim.libretexts.index import rewrite_index from mindtouch2zim.ui import ( @@ -224,6 +225,9 @@ def run(self) -> Path: self.libretexts_index_template = self.jinja2_env.get_template( "libretexts.index.html" ) + self.libretexts_detailed_licensing_template = self.jinja2_env.get_template( + "libretexts.detailed-licensing.html.jinja2" + ) # Start creator early to detect problems early. with creator as creator: @@ -514,6 +518,20 @@ def _process_page( jinja2_template=self.libretexts_glossary_template, original_content=page_content.html_body, ) + elif ( + "https://cdn.libretexts.net/github/LibreTextsMain/DynamicLicensing/dist/dynamicLicensing.min.js" + in page_content.html_body + ): + logger.debug( + f"Rewriting {CONTEXT.processing_step} as libretexts.org " + "detailed licensing" + ) + rewriten = rewrite_detailed_licensing( + rewriter=rewriter, + jinja2_template=self.libretexts_detailed_licensing_template, + mindtouch_client=self.mindtouch_client, + page=page, + ) except Exception as exc: # code has been tested to work "in-general", but many edge-case occurs # and since these pages are absolutely not essential, we just display a @@ -522,7 +540,6 @@ def _process_page( f"Problem processing special {CONTEXT.processing_step}" f", page is probably empty, storing empty page: {exc}" ) - return "" if not rewriten: # Default rewriting for 'normal' pages rewriten = rewriter.rewrite(page_content.html_body).content diff --git a/scraper/src/mindtouch2zim/templates/libretexts.detailed-licensing.html.jinja2 b/scraper/src/mindtouch2zim/templates/libretexts.detailed-licensing.html.jinja2 new file mode 100644 index 0000000..1433e80 --- /dev/null +++ b/scraper/src/mindtouch2zim/templates/libretexts.detailed-licensing.html.jinja2 @@ -0,0 +1,41 @@ +

Overview

+

+ Title: + {{ data.text.title }} +

+

Webpages:{{ data.text.totalPages}}

+{% if special_restrictions %} +

+ Applicable Restrictions: + {{ special_restrictions }} +

+{% endif %} +

All licenses found:

+ +

By Page

+{% macro render_detail(detail) -%} +
  • {{ detail.title }} +{% if detail.license %} +- {{ detail.license.label }} {{ detail.license.version or "" }} +{% endif %} +{% if detail.children %} + +{% endif %} +
  • +{% endmacro %} +
    + +
    diff --git a/scraper/tests-integration/libretexts/test_detailed_licensing.py b/scraper/tests-integration/libretexts/test_detailed_licensing.py new file mode 100644 index 0000000..4d7b546 --- /dev/null +++ b/scraper/tests-integration/libretexts/test_detailed_licensing.py @@ -0,0 +1,65 @@ +from typing import Any + +import pytest +from jinja2 import Environment, FileSystemLoader, select_autoescape + +from mindtouch2zim.constants import ROOT_DIR +from mindtouch2zim.libretexts.detailed_licensing import ( + _get_licensing_report_data, + _render_html_from_data, +) + + +@pytest.fixture(scope="module") +def licensing_report_data() -> Any: + return _get_licensing_report_data( + "https://geo.libretexts.org/Courses/California_State_University_Los_Angeles/" + "Book%3A_An_Introduction_to_Geology_(Johnson_Affolter_Inkenbrandt_and_Mosher)" + ) + + +def test_get_licensing_report_data(licensing_report_data: Any): + """Check we can still get licensing report data""" + + assert licensing_report_data + + # statistics properties + assert "meta" in licensing_report_data + assert "specialRestrictions" in licensing_report_data["meta"] + assert "licenses" in licensing_report_data["meta"] + assert isinstance(licensing_report_data["meta"]["licenses"], list) + assert "label" in licensing_report_data["meta"]["licenses"][0] + assert "link" in licensing_report_data["meta"]["licenses"][0] + assert "version" in licensing_report_data["meta"]["licenses"][0] + assert "count" in licensing_report_data["meta"]["licenses"][0] + assert int(licensing_report_data["meta"]["licenses"][0]["count"]) + assert "percent" in licensing_report_data["meta"]["licenses"][0] + assert float(licensing_report_data["meta"]["licenses"][0]["percent"]) + assert "text" in licensing_report_data + assert "totalPages" in licensing_report_data["text"] + + # details properties + def check_item(data: Any): + assert "license" in data + assert "label" in data["license"] + assert "link" in data["license"] + # optional property, not set at least for "Undeclared" license + if data["license"]["label"] != "Undeclared": + assert "version" in data["license"] + assert "url" in data + assert "title" in data + assert "children" in data + assert isinstance(data["children"], list) + for child in data["children"]: + check_item(child) + + check_item(licensing_report_data["text"]) + + +def test_render_licensing_template(licensing_report_data: Any): + jinja2_env = Environment( + loader=FileSystemLoader(ROOT_DIR.joinpath("templates")), + autoescape=select_autoescape(), + ) + template = jinja2_env.get_template("libretexts.detailed-licensing.html.jinja2") + assert _render_html_from_data(template, licensing_report_data)