diff --git a/CHANGES/706.feature b/CHANGES/706.feature new file mode 100644 index 00000000..6c01594c --- /dev/null +++ b/CHANGES/706.feature @@ -0,0 +1 @@ +Pull-through caching now respects the include/exclude filters on the upstream remote. diff --git a/docs/user/guides/publish.md b/docs/user/guides/publish.md index 351b3ef3..9c28817b 100644 --- a/docs/user/guides/publish.md +++ b/docs/user/guides/publish.md @@ -88,6 +88,9 @@ from the remote source and have Pulp store that package as orphaned content. pulp python distribution update --name foo --remote bar ``` +!!! note + Pull-through caching will respect the includes/excludes filters on the supplied remote. + !!! warning Support for pull-through caching is provided as a tech preview in Pulp 3. Functionality may not work or may be incomplete. Also, backwards compatibility when upgrading diff --git a/pulp_python/app/pypi/views.py b/pulp_python/app/pypi/views.py index 60b3c04d..3aba1bc9 100644 --- a/pulp_python/app/pypi/views.py +++ b/pulp_python/app/pypi/views.py @@ -1,5 +1,6 @@ +import json import logging -import requests +import os from rest_framework.viewsets import ViewSet from rest_framework.response import Response @@ -15,7 +16,8 @@ Http404, HttpResponseForbidden, HttpResponseBadRequest, - StreamingHttpResponse + StreamingHttpResponse, + HttpResponse, ) from drf_spectacular.utils import extend_schema from dynaconf import settings @@ -23,7 +25,7 @@ from packaging.utils import canonicalize_name from urllib.parse import urljoin, urlparse, urlunsplit from pathlib import PurePath -from pypi_simple import parse_links_stream_response +from pypi_simple import ACCEPT_JSON_PREFERRED, ProjectPage from pulpcore.plugin.viewsets import OperationPostponedResponse from pulpcore.plugin.tasking import dispatch @@ -45,6 +47,7 @@ python_content_to_json, PYPI_LAST_SERIAL, PYPI_SERIAL_CONSTANT, + get_remote_package_filter, ) from pulp_python.app import tasks @@ -232,27 +235,34 @@ def list(self, request, path): def 
def pull_through_package_simple(self, package, path, remote):
    """Build the simple page of ``package`` from the upstream ``remote``.

    The project is first checked against the remote's include/exclude filter.
    Its simple page is then downloaded from the remote (JSON preferred, HTML
    otherwise), and every listed file whose version passes the filter is
    rendered with a link that points back at this distribution.

    Args:
        package: Name of the requested project.
        path: Path prefix used when building each file's link.
        remote: Remote the project page is pulled from.

    Returns:
        HttpResponse: The rendered simple detail page.

    Raises:
        Http404: If the project is rejected by the remote's filter, or the
            upstream page could not be downloaded.
    """
    def parse_package(dis_package):
        """Return ``(filename, link, sha256)`` for one upstream file."""
        parsed = urlparse(dis_package.url)
        # Keep only scheme, netloc and path: params, query and fragment are dropped.
        stripped_url = urlunsplit(chain(parsed[:3], ("", "")))
        redirect_path = f'{path}/{dis_package.filename}?redirect={stripped_url}'
        d_url = urljoin(self.base_content_url, redirect_path)
        return dis_package.filename, d_url, dis_package.digests.get("sha256", "")

    rfilter = get_remote_package_filter(remote)
    if not rfilter.filter_project(package):
        raise Http404(f"{package} does not exist.")

    url = remote.get_remote_artifact_url(f'simple/{package}/')
    # Ask upstream for the JSON flavour of the simple API; the remote is not saved here.
    remote.headers = remote.headers or []
    remote.headers.append({"Accept": ACCEPT_JSON_PREFERRED})
    downloader = remote.get_downloader(url=url, max_retries=1)
    try:
        result = downloader.fetch()
    except Exception as exc:
        # Any download failure is reported to the client as a missing project.
        raise Http404(f"Could not find {package}.") from exc

    # Compare only the media type: the header may carry parameters
    # (e.g. "; charset=UTF-8") or be missing, in which case HTML is assumed.
    content_type = (result.headers or {}).get("content-type", "")
    media_type = content_type.partition(";")[0].strip().lower()

    with open(result.path, "rb") as page_file:
        if media_type == "application/vnd.pypi.simple.v1+json":
            page = ProjectPage.from_json_data(json.load(page_file), base_url=remote.url)
        else:
            page = ProjectPage.from_html(package, page_file.read(), base_url=remote.url)

    packages = [
        parse_package(p) for p in page.packages if rfilter.filter_release(package, p.version)
    ]
    return HttpResponse(write_simple_detail(package, packages))
class PackageIncludeFilter:
    """Decide which projects and releases pass a remote's include/exclude lists."""

    def __init__(self, remote):
        """Parse the ``includes`` and ``excludes`` of ``remote.cast()`` into lookup tables."""
        self.remote = remote.cast()
        self._filter_includes = self._parse_packages(self.remote.includes)
        self._filter_excludes = self._parse_packages(self.remote.excludes)

    def _parse_packages(self, packages):
        """Group requirement strings by canonical project name.

        Returns a mapping with two possible keys: ``"range"`` for requirements
        that carry a version specifier and ``"full"`` for bare project names.
        Each maps canonical project name -> list of ``Requirement`` objects.
        """
        config = defaultdict(lambda: defaultdict(list))
        for value in packages:
            requirement = Requirement(value)
            requirement.name = canonicalize_name(requirement.name)
            if requirement.specifier:
                # Let pre-release versions match the specifier as well.
                requirement.specifier.prereleases = True
                config["range"][requirement.name].append(requirement)
            else:
                config["full"][requirement.name].append(requirement)
        return config

    def filter_project(self, project_name):
        """Return True if ``project_name`` is allowed through the remote's filters.

        A project is rejected when includes are configured and it is not among
        them, or when it is excluded without a version specifier.
        """
        project_name = canonicalize_name(project_name)
        # ``.get`` is used so lookups never create keys in the defaultdicts.
        include_full = self._filter_includes.get("full", {})
        include_range = self._filter_includes.get("range", {})
        included = include_range.keys() | include_full.keys()
        if included and project_name not in included:
            return False

        exclude_full = self._filter_excludes.get("full", {})
        return project_name not in exclude_full

    def filter_release(self, project_name, version):
        """Return True if ``version`` of ``project_name`` is allowed through the filters.

        A missing (``None``) or unparsable version is never allowed.
        """
        if version is None:
            # No version could be determined for this file; treat it like an
            # invalid version instead of letting ``parse`` raise TypeError.
            return False

        project_name = canonicalize_name(project_name)
        if not self.filter_project(project_name):
            return False

        try:
            version = parse(version)
        except InvalidVersion:
            return False

        # When version-ranged includes exist for the project, one of them must match.
        include_range = self._filter_includes.get("range", {})
        if project_name in include_range:
            if not any(version in req.specifier for req in include_range[project_name]):
                return False

        # Any matching version-ranged exclude rejects the release.
        exclude_range = self._filter_excludes.get("range", {})
        if project_name in exclude_range:
            if any(version in req.specifier for req in exclude_range[project_name]):
                return False

        return True


# Cache: remote.pulp_id -> (pulp_last_updated the filter was built from, filter).
_remote_filters = {}


def get_remote_package_filter(remote):
    """Return the ``PackageIncludeFilter`` for ``remote``, reusing a cached one if current.

    A cached filter is reused only while the remote's ``pulp_last_updated``
    equals the value recorded when the filter was built; otherwise a new
    filter is built and replaces the cached entry.
    """
    cached = _remote_filters.get(remote.pulp_id)
    if cached is not None:
        last_update, rfilter = cached
        if last_update == remote.pulp_last_updated:
            return rfilter

    rfilter = PackageIncludeFilter(remote)
    _remote_filters[remote.pulp_id] = (remote.pulp_last_updated, rfilter)
    return rfilter
@pytest.mark.parallel
def test_pull_through_filter(python_remote_factory, python_distribution_factory):
    """Check that pull-through pages honour the remote's includes and excludes."""
    # Plain include: a project outside the include list is rejected by the filter.
    remote = python_remote_factory(url=PYPI_URL, includes=["shelf-reader"])
    distro = python_distribution_factory(remote=remote.pulp_href)

    response = requests.get(f"{distro.base_url}simple/pulpcore/")
    assert response.status_code == 404
    assert response.json() == {'detail': 'pulpcore does not exist.'}

    response = requests.get(f"{distro.base_url}simple/shelf-reader/")
    assert response.status_code == 200

    # Includes with version specifiers: the non-prerelease files served for each
    # project must be exactly the expected fixture releases.
    remote = python_remote_factory(includes=PYTHON_SM_PROJECT_SPECIFIER)
    distro = python_distribution_factory(remote=remote.pulp_href)
    for package, releases in PYTHON_SM_FIXTURE_RELEASES.items():
        page_url = f"{distro.base_url}simple/{package}/"
        page = ProjectPage.from_response(requests.get(page_url), package)
        stable_files = set()
        for dist in page.packages:
            if parse(dist.version).is_prerelease:
                continue
            stable_files.add(dist.filename)
        assert stable_files == set(releases)

    # Excludes only: the excluded project is rejected by the filter.
    remote = python_remote_factory(includes=[], excludes=["django"])
    distro = python_distribution_factory(remote=remote.pulp_href)

    response = requests.get(f"{distro.base_url}simple/django/")
    assert response.status_code == 404
    assert response.json() == {'detail': 'django does not exist.'}

    # Not excluded, but the upstream fetch fails (presumably the project is
    # absent from the factory's default remote; verify against the fixtures).
    response = requests.get(f"{distro.base_url}simple/pulpcore/")
    assert response.status_code == 404
    assert response.json() == {'detail': 'Could not find pulpcore.'}

    response = requests.get(f"{distro.base_url}simple/shelf-reader/")
    assert response.status_code == 200