Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add dependency syncing #626

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES/340.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added ability (tech-preview) to recursively sync dependencies from a remote with `sync_dependencies=True`.
29 changes: 29 additions & 0 deletions docs/workflows/sync.rst
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@ Remote Create Response::
],
"excludes": [],
"prereleases": true,
"package_types": [],
"keep_latest_packages": 0,
"exclude_platforms": [],
"sync_dependencies": false

}


Expand Down Expand Up @@ -119,6 +124,30 @@ Reference: `Python Remote Usage <../restapi.html#tag/Remotes:-Python>`_

.. _mirror-workflow:

Syncing Dependencies
--------------------

When specifying included packages to sync, Pulp can also sync the dependencies of those packages::

 $ pulp python remote create \
     --name 'packages-with-dependencies' \
     --url 'https://pypi.org/' \
     --sync-dependencies \
     --includes '["django>=4.0", "pulpcore[s3]"]'

Here ``--sync-dependencies`` enables dependency syncing for the included packages: the
dependencies of each django version >=4.0 are synced, as are the dependencies of all
pulpcore versions together with the extra dependencies pulpcore needs for s3 (selected
via the ``[s3]`` extras notation).

Turning on dependency syncing will only sync the necessary dependencies to install the package for the
given versions declared in the includes list. You can sync the extra dependencies of a package by
using the `extras notation <https://peps.python.org/pep-0508/#extras>`_. Synced dependencies are also
filtered through the other filters defined on the remote. Note that many packages have unrestricted
dependencies which can cause syncs to become significantly larger than normal. It is recommended
to use extra filters to trim any unwanted packages.

.. warning:: This feature is provided as a tech preview and could change in backwards incompatible
ways in the future.

Creating a remote to sync all of PyPI
_____________________________________

Expand Down
17 changes: 17 additions & 0 deletions pulp_python/app/migrations/0012_pythonremote_sync_dependencies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Generated by Django 4.2.5 on 2023-10-19 03:05

from django.db import migrations, models


class Migration(migrations.Migration):
    # Adds the boolean ``sync_dependencies`` flag to PythonRemote.
    # Must be applied after the 0011 distribution_ptr migration.
    dependencies = [
        ("python", "0011_alter_pythondistribution_distribution_ptr_and_more"),
    ]

    operations = [
        migrations.AddField(
            model_name="pythonremote",
            name="sync_dependencies",
            # Defaults to False so existing remotes keep their current sync behavior.
            field=models.BooleanField(default=False),
        ),
    ]
1 change: 1 addition & 0 deletions pulp_python/app/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ class PythonRemote(Remote):
exclude_platforms = ArrayField(
models.CharField(max_length=10, blank=True), choices=PLATFORMS, default=list
)
sync_dependencies = models.BooleanField(default=False)

def get_remote_artifact_url(self, relative_path=None, request=None):
"""Get url for remote_artifact"""
Expand Down
7 changes: 6 additions & 1 deletion pulp_python/app/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,11 @@ class PythonRemoteSerializer(core_serializers.RemoteSerializer):
choices=python_models.PLATFORMS,
default=list
)
sync_dependencies = serializers.BooleanField(
required=False,
help_text=_("Whether to sync dependencies specified by package metadata. (Tech Preview)"),
default=False,
)

def validate_includes(self, value):
"""Validates the includes"""
Expand All @@ -351,7 +356,7 @@ def validate_excludes(self, value):
class Meta:
    # Expose the base remote fields plus the Python-specific sync filters,
    # including the new tech-preview ``sync_dependencies`` flag.
    fields = core_serializers.RemoteSerializer.Meta.fields + (
        "includes", "excludes", "prereleases", "package_types", "keep_latest_packages",
        "exclude_platforms", "sync_dependencies"
    )
    model = python_models.PythonRemote

Expand Down
73 changes: 70 additions & 3 deletions pulp_python/app/tasks/sync.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
import logging
import tempfile
from typing import Dict, Set

from aiohttp import ClientResponseError, ClientError
from collections import defaultdict
from itertools import chain
from lxml.etree import LxmlError
from gettext import gettext as _
from os import environ, path
from os import environ

from rest_framework import serializers

Expand All @@ -19,13 +23,14 @@
PythonPackageContent,
PythonRemote,
)
from pulp_python.app.utils import parse_metadata, PYPI_LAST_SERIAL
from pulp_python.app.utils import parse_json, parse_metadata, PYPI_LAST_SERIAL
from pypi_simple import parse_repo_index_page

from bandersnatch.mirror import Mirror
from bandersnatch.master import Master
from bandersnatch.configuration import BandersnatchConfig
from packaging.requirements import Requirement
from packaging.utils import canonicalize_name
from urllib.parse import urljoin, urlsplit, urlunsplit

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -113,7 +118,8 @@ async def run(self):
"""
# Prevent bandersnatch from reading actual .netrc file, set to nonexistent file
# See discussion on https://github.com/pulp/pulp_python/issues/581
environ["NETRC"] = f"{path.curdir}/.fake-netrc"
fake_netrc = tempfile.NamedTemporaryFile(dir=".", delete=False)
environ["NETRC"] = fake_netrc.name
# TODO Change Bandersnatch internal API to take proxy settings in from config parameters
if proxy_url := self.remote.proxy_url:
if self.remote.proxy_username or self.remote.proxy_password:
Expand Down Expand Up @@ -146,6 +152,23 @@ async def run(self):
Requirement(pkg).name for pkg in self.remote.includes
]
await pmirror.synchronize(packages_to_sync)
if pmirror.sync_dependencies:
depth = 1
while pmirror.dependencies_to_sync and depth <= 25: # ensure no circular loops
logger.info(_("Syncing dependencies: depth {}").format(depth))
depth += 1
packages_to_sync = list(pmirror.dependencies_to_sync.keys())
pmirror.allow_filter.allowlist_release_requirements = list(
chain(*pmirror.dependencies_to_sync.values())
)
logger.info(
f"Re-initialized release plugin {pmirror.allow_filter.name}, filtering "
+ f"{pmirror.allow_filter.allowlist_release_requirements}"
)
pmirror.dependencies_to_sync.clear()
await pmirror.synchronize(packages_to_sync)
if pmirror.dependencies_to_sync:
logger.warning(_("Reached dependency sync depth limit! Breaking out"))


class PulpMirror(Mirror):
Expand All @@ -160,8 +183,18 @@ def __init__(
super().__init__(master=master, workers=workers)
self.synced_serial = serial
self.python_stage = python_stage
self.remote = self.python_stage.remote
self.progress_report = progress_report
self.deferred_download = deferred_download
self.sync_dependencies = self.remote.includes and self.remote.sync_dependencies
if self.sync_dependencies:
# Find the allowlist_filter, so we can update it when syncing dependencies
for fil in self.filters.filter_release_plugins():
if fil.name == "allowlist_release":
self.allow_filter = fil
break
self.already_synced: Dict[str, Set[str]] = defaultdict(set)
self.dependencies_to_sync: Dict[str, Set[Requirement]] = defaultdict(set)

async def determine_packages_to_sync(self):
"""
Expand Down Expand Up @@ -230,6 +263,28 @@ async def create_content(self, pkg):
create a Content Unit to put into the pipeline
"""
for version, dists in pkg.releases.items():
if self.sync_dependencies:
if version in self.already_synced[pkg.name]:
continue
self.already_synced[pkg.name].add(version)

for req_spec in await self.get_required_dists(pkg, version):
req = Requirement(req_spec)
req.name = canonicalize_name(req.name)
req.specifier.prereleases = True
if req.marker:
if "extra == " in str(req.marker):
# Only sync the required dependency if we specified the correct 'extra'
extras = set()
for cur_allow_pkg in self.allow_filter.allowlist_release_requirements:
if cur_allow_pkg.name == pkg.name:
extras |= cur_allow_pkg.extras
extra = str(req.marker).rpartition("extra == ")[2].strip("'\"")
if extra not in extras:
continue

self.dependencies_to_sync[req.name].add(req)

for package in dists:
entry = parse_metadata(pkg.info, version, package)
url = entry.pop("url")
Expand Down Expand Up @@ -258,3 +313,15 @@ def on_error(self, exception, **kwargs):
This should have some error checking
"""
pass

async def get_required_dists(self, pkg, version):
    """Return the ``requires_dist`` entries for *pkg* at *version*.

    Queries the remote's per-version JSON metadata endpoint. A failed HTTP
    response (e.g. the version is missing upstream) yields an empty list so
    the sync is not aborted.
    """
    # TODO: Can this logic live in Bandersnatch?
    metadata_url = urljoin(self.remote.url, f"pypi/{pkg.name}/{version}/json")
    try:
        download = await self.remote.get_downloader(url=metadata_url).run()
    except ClientResponseError:
        # Upstream has no JSON metadata for this release; treat as "no deps".
        return []
    info = parse_json(download).get("info", {})
    return info.get("requires_dist") or []
6 changes: 6 additions & 0 deletions pulp_python/app/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,3 +312,9 @@ def write_simple_detail(project_name, project_packages, streamed=False):
detail = Template(simple_detail_template)
context = {"project_name": project_name, "project_packages": project_packages}
return detail.stream(**context) if streamed else detail.render(**context)


def parse_json(download_result):
    """Parse the downloaded file referenced by *download_result* as JSON.

    Args:
        download_result: A download result whose ``path`` attribute points at
            a JSON file on disk.

    Returns:
        The deserialized JSON value (typically a dict of PyPI metadata).
    """
    # JSON is UTF-8 by specification (RFC 8259); be explicit instead of
    # depending on the process locale's default encoding.
    with open(download_result.path, encoding="utf-8") as fd:
        return json.load(fd)
64 changes: 64 additions & 0 deletions pulp_python/tests/functional/api/test_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
PYTHON_LG_FIXTURE_SUMMARY,
PYTHON_LG_FIXTURE_COUNTS,
DJANGO_LATEST_3,
DJANGO_PLUS_PYTZ,
DJANGO_PLUS_PYTZ_BCRYPT,
SCIPY_COUNTS,
)
from pulp_python.tests.functional.utils import gen_python_client, gen_python_remote
Expand Down Expand Up @@ -687,6 +689,68 @@ def test_proxy_auth_sync(
assert content_resp.count == 2


@pytest.mark.parallel
def test_sync_dependency(
    python_repo, python_repo_api_client, python_content_api_client, python_remote_factory
):
    """Test syncing dependencies."""
    # The only required dependency for Django in our fixtures is pytz
    remote = python_remote_factory(
        **gen_python_remote(includes=["Django"], sync_dependencies=True, prereleases=True)
    )
    sync_task = python_repo_api_client.sync(
        python_repo.pulp_href, {"remote": remote.pulp_href}
    ).task
    monitor_task(sync_task)

    repo = python_repo_api_client.read(python_repo.pulp_href)
    assert repo.latest_version_href[-2] == "1"

    all_content = python_content_api_client.list(repository_version=repo.latest_version_href)
    assert all_content.count == DJANGO_PLUS_PYTZ

    pytz_content = python_content_api_client.list(
        repository_version=repo.latest_version_href, name="pytz"
    )
    assert pytz_content.count > 0


@pytest.mark.parallel
def test_sync_dependency_extras(
    python_repo, python_repo_api_client, python_content_api_client, python_remote_factory
):
    """Test syncing dependencies w/ extras"""
    remote = python_remote_factory(
        **gen_python_remote(
            includes=["Django[bcrypt]"], sync_dependencies=True, prereleases=True
        )
    )
    sync_task = python_repo_api_client.sync(
        python_repo.pulp_href, {"remote": remote.pulp_href}
    ).task
    monitor_task(sync_task)

    repo = python_repo_api_client.read(python_repo.pulp_href)
    assert repo.latest_version_href[-2] == "1"

    all_content = python_content_api_client.list(repository_version=repo.latest_version_href)
    assert all_content.count == DJANGO_PLUS_PYTZ_BCRYPT

    bcrypt_content = python_content_api_client.list(
        repository_version=repo.latest_version_href, name="bcrypt"
    )
    assert bcrypt_content.count > 0


@pytest.mark.parallel
def test_sync_dependency_not_present(
    python_repo, python_repo_api_client, python_content_api_client, python_remote_factory
):
    """Test syncing dependencies that are not present in the upstream doesn't fail the sync."""
    remote = python_remote_factory(
        **gen_python_remote(includes=["scipy"], sync_dependencies=True)
    )
    sync_task = python_repo_api_client.sync(
        python_repo.pulp_href, {"remote": remote.pulp_href}
    ).task
    monitor_task(sync_task)

    repo = python_repo_api_client.read(python_repo.pulp_href)
    assert repo.latest_version_href[-2] == "1"

    all_content = python_content_api_client.list(repository_version=repo.latest_version_href)
    assert all_content.count == SCIPY_COUNTS["total"]


def sync_to_remote(self, body, create=False, mirror=False):
"""Takes a body and creates/updates a remote object, then it performs a sync"""
if create:
Expand Down
2 changes: 2 additions & 0 deletions pulp_python/tests/functional/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@
}

DJANGO_LATEST_3 = 4 # latest version has 2 dists, each other has 1
DJANGO_PLUS_PYTZ = 37
DJANGO_PLUS_PYTZ_BCRYPT = 45
SCIPY_COUNTS = {
"total": 23, # scipy has 23 different release files for the same release
"windows": 8,
Expand Down
Loading