Skip to content

Commit

Permalink
Retrieve and rewrite Detailed licensing pages content
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Nov 26, 2024
1 parent a86df2c commit bd1b95d
Show file tree
Hide file tree
Showing 5 changed files with 223 additions and 1 deletion.
1 change: 1 addition & 0 deletions .github/workflows/PublishDockerDevImage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ on:
push:
branches:
- main
workflow_dispatch:

jobs:
publish:
Expand Down
98 changes: 98 additions & 0 deletions scraper/src/mindtouch2zim/libretexts/detailed_licensing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from typing import Any

from jinja2 import Template
from pydantic import BaseModel
from zimscraperlib.rewriting.html import HtmlRewriter

from mindtouch2zim.client import LibraryPage, MindtouchClient
from mindtouch2zim.constants import logger
from mindtouch2zim.context import CONTEXT


class LicenseStatistic(BaseModel):
    """Usage statistics of one license.

    NOTE(review): field names match the entries of `meta.licenses` consumed by
    the detailed licensing template; presumably this models one such entry --
    confirm, since nothing in this module instantiates it.
    """

    # Display name of the license
    label: str
    # License version; the field is required but may be None
    version: str | None
    # Presumably the share of pages using this license, in percent -- TODO confirm
    percent: float
    # Presumably the number of pages using this license -- TODO confirm
    count: int
    # URL associated with the license
    link: str


class LicenseInfo(BaseModel):
    """Licensing information: per-license statistics plus detailed entries."""

    # One entry per license
    statistics: list[LicenseStatistic]
    # Untyped list of detail entries
    # NOTE(review): presumably meant to hold `PageInfo` items -- confirm
    details: list


class PageInfo(BaseModel):
    """Licensing details of one page, with its child pages nested recursively."""

    # Display name of the license applied to the page
    license_label: str
    # Version of the license applied to the page
    license_version: str
    # URL of the page
    url: str
    # Title of the page
    title: str
    # Child pages, each described by the same structure
    children: list["PageInfo"]


def _get_licensing_report_data(cover_url: str) -> Any:
    """
    Get licensing report from libretexts.org

    Logic to get the data has been adapted from `buildLicensingReport` function
    at https://cdn.libretexts.net/github/LibreTextsMain/DynamicLicensing/dist/dynamicLicensing.min.js
    Probably coming from
    https://github.com/LibreTexts/Libretext/blob/master/public/DynamicLicensing/dynamicLicensing.js

    Args:
        cover_url: URL of the cover page; appended as-is to the API endpoint
            (the caller in this module passes an already encoded URL).

    Returns:
        The decoded JSON body of the API response.

    Raises:
        Whatever `raise_for_status()` raises when the API answers with an HTTP
        error status.
    """
    api_url = f"https://api.libretexts.org/endpoint/licensereport/{cover_url}"
    logger.debug(f"Calling API at {api_url}")
    resp = CONTEXT.web_session.get(
        url=api_url,
        headers={"Origin": "https://www.libretexts.org"},  # kinda authorization header
        timeout=CONTEXT.http_timeout_long_seconds,
    )
    resp.raise_for_status()
    return resp.json()

Check warning on line 51 in scraper/src/mindtouch2zim/libretexts/detailed_licensing.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/libretexts/detailed_licensing.py#L50-L51

Added lines #L50 - L51 were not covered by tests


# Display labels of the known special restriction keys; any other key is
# displayed unchanged.
_RESTRICTION_LABELS = {
    "noncommercial": "Noncommercial",
    "noderivatives": "No Derivatives",
    "fairuse": "Fair Use",
}


def _render_html_from_data(jinja2_template: Template, licensing_data: Any) -> str:
    """Render the detailed licensing HTML from the licensing report data.

    Args:
        jinja2_template: template to render; it receives the variables `data`
            and `special_restrictions`.
        licensing_data: licensing report (mapping). Its optional
            `meta.specialRestrictions` entry lists restriction keys.

    Returns:
        The rendered template.
    """
    restrictions = licensing_data.get("meta", {}).get("specialRestrictions")
    # Missing or empty restrictions are passed as None to the template
    special_restrictions = (
        ", ".join(_RESTRICTION_LABELS.get(key, key) for key in restrictions)
        if restrictions
        else None
    )
    return jinja2_template.render(
        data=licensing_data, special_restrictions=special_restrictions
    )


def rewrite_detailed_licensing(
    rewriter: HtmlRewriter,
    jinja2_template: Template,
    mindtouch_client: MindtouchClient,
    page: LibraryPage,
) -> str:
    """
    Get and statically rewrite the detailed licensing info of libretexts.org

    The licensing report of the cover page of `page` is fetched from the
    libretexts.org API, rendered with `jinja2_template`, and the resulting HTML
    is passed through `rewriter`.

    Returns:
        The `content` of the rewriter result.
    """
    cover_url = mindtouch_client.get_cover_page_encoded_url(page)
    licensing_data = _get_licensing_report_data(cover_url)
    rendered_html = _render_html_from_data(
        jinja2_template=jinja2_template,
        licensing_data=licensing_data,
    )
    return rewriter.rewrite(rendered_html).content
19 changes: 18 additions & 1 deletion scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
from mindtouch2zim.errors import NoIllustrationFoundError
from mindtouch2zim.html import get_text
from mindtouch2zim.html_rewriting import HtmlUrlsRewriter
from mindtouch2zim.libretexts.detailed_licensing import rewrite_detailed_licensing
from mindtouch2zim.libretexts.glossary import rewrite_glossary
from mindtouch2zim.libretexts.index import rewrite_index
from mindtouch2zim.ui import (
Expand Down Expand Up @@ -224,6 +225,9 @@ def run(self) -> Path:
self.libretexts_index_template = self.jinja2_env.get_template(
"libretexts.index.html"
)
self.libretexts_detailed_licensing_template = self.jinja2_env.get_template(

Check warning on line 228 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L228

Added line #L228 was not covered by tests
"libretexts.detailed-licensing.html.jinja2"
)

# Start creator early to detect problems early.
with creator as creator:
Expand Down Expand Up @@ -514,6 +518,20 @@ def _process_page(
jinja2_template=self.libretexts_glossary_template,
original_content=page_content.html_body,
)
elif (
"https://cdn.libretexts.net/github/LibreTextsMain/DynamicLicensing/dist/dynamicLicensing.min.js"
in page_content.html_body
):
logger.debug(

Check warning on line 525 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L525

Added line #L525 was not covered by tests
f"Rewriting {CONTEXT.processing_step} as libretexts.org "
"detailed licensing"
)
rewriten = rewrite_detailed_licensing(

Check warning on line 529 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L529

Added line #L529 was not covered by tests
rewriter=rewriter,
jinja2_template=self.libretexts_detailed_licensing_template,
mindtouch_client=self.mindtouch_client,
page=page,
)
except Exception as exc:
# code has been tested to work "in general", but many edge cases occur
# and since these pages are absolutely not essential, we just display a
Expand All @@ -522,7 +540,6 @@ def _process_page(
f"Problem processing special {CONTEXT.processing_step}"
f", page is probably empty, storing empty page: {exc}"
)
return ""
if not rewriten:
# Default rewriting for 'normal' pages
rewriten = rewriter.rewrite(page_content.html_body).content
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
{#
  Static rendering of the libretexts.org detailed licensing page.
  Variables:
    data                 - licensing report (uses data.text and data.meta.licenses)
    special_restrictions - restriction labels already joined as text, or None
#}
<h2>Overview</h2>
<p>
  <strong>Title:</strong>
  <a href="{{ data.text.url }}" target="_blank" rel="noreferrer">{{ data.text.title }}</a>
</p>
<p><strong>Webpages:</strong> {{ data.text.totalPages }}</p>
{% if special_restrictions %}
<p>
  <strong>Applicable Restrictions:</strong>
  {{ special_restrictions }}
</p>
{% endif %}
<p><strong>All licenses found:</strong></p>
<ul>
  {% for license in data.meta.licenses %}
  <li>
    <a href="{{ license.link }}" target="_blank" rel="noreferrer">{{ license.label }}{% if license.version %}&nbsp;{{ license.version }}{% endif %}</a>:
    {{ license.percent }}% ({{ license.count }} {% if license.count > 1 %}pages{% else %}page{% endif %})
  </li>
  {% endfor %}
</ul>
<h2>By Page</h2>
{# Renders one page entry and, recursively, all of its children #}
{% macro render_detail(detail) -%}
<li><a href="{{ detail.url }}" target="_blank" rel="noreferrer">{{ detail.title }}</a>
  {% if detail.license %}
  - <a href="{{ detail.license.link }}" target="_blank" rel="noreferrer"> <em>{{ detail.license.label }} {{ detail.license.version or "" }}</em></a>
  {% endif %}
  {% if detail.children %}
  <ul>
    {% for child in detail.children %}
    {{ render_detail(child) }}
    {% endfor %}
  </ul>
  {% endif %}
</li>
{% endmacro %}
<div style="column-count: 2; margin-top: 1em;">
  <ul style="margin: 0;">
    {{ render_detail(data.text) }}
  </ul>
</div>
65 changes: 65 additions & 0 deletions scraper/tests-integration/libretexts/test_detailed_licensing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from typing import Any

import pytest
from jinja2 import Environment, FileSystemLoader, select_autoescape

from mindtouch2zim.constants import ROOT_DIR
from mindtouch2zim.libretexts.detailed_licensing import (
_get_licensing_report_data,
_render_html_from_data,
)


@pytest.fixture(scope="module")
def licensing_report_data() -> Any:
    """Licensing report of a sample book, fetched once per test module.

    This calls the live libretexts.org API, hence requires network access.
    """
    return _get_licensing_report_data(
        "https://geo.libretexts.org/Courses/California_State_University_Los_Angeles/"
        "Book%3A_An_Introduction_to_Geology_(Johnson_Affolter_Inkenbrandt_and_Mosher)"
    )


def test_get_licensing_report_data(licensing_report_data: Any):
    """Check we can still get licensing report data"""

    assert licensing_report_data

    # statistics properties
    assert "meta" in licensing_report_data
    meta = licensing_report_data["meta"]
    assert "specialRestrictions" in meta
    assert "licenses" in meta
    licenses = meta["licenses"]
    assert isinstance(licenses, list)
    # fail with a clear assertion rather than an IndexError on an empty list
    assert licenses
    first_license = licenses[0]
    assert "label" in first_license
    assert "link" in first_license
    assert "version" in first_license
    assert "count" in first_license
    # values must be convertible to numbers; a legitimate 0 must not fail
    assert int(first_license["count"]) >= 0
    assert "percent" in first_license
    assert float(first_license["percent"]) >= 0
    assert "text" in licensing_report_data
    assert "totalPages" in licensing_report_data["text"]

    # details properties
    def check_item(data: Any):
        """Check one page entry, then recurse into its children."""
        assert "license" in data
        assert "label" in data["license"]
        assert "link" in data["license"]
        # optional property, not set at least for "Undeclared" license
        if data["license"]["label"] != "Undeclared":
            assert "version" in data["license"]
        assert "url" in data
        assert "title" in data
        assert "children" in data
        assert isinstance(data["children"], list)
        for child in data["children"]:
            check_item(child)

    check_item(licensing_report_data["text"])


def test_render_licensing_template(licensing_report_data: Any):
    """Check the detailed licensing template renders non-empty output."""
    jinja2_env = Environment(
        loader=FileSystemLoader(ROOT_DIR.joinpath("templates")),
        autoescape=select_autoescape(),
    )
    template = jinja2_env.get_template("libretexts.detailed-licensing.html.jinja2")
    assert _render_html_from_data(template, licensing_report_data)

0 comments on commit bd1b95d

Please sign in to comment.