Merge pull request #132 from ecmwf-projects/COPDS-1801-custom-types-part2

different uids for each content's site/application
alex75 authored Sep 23, 2024
2 parents 3b6aa61 + 22075fd commit 18248f5
Showing 5 changed files with 175 additions and 111 deletions.
36 changes: 36 additions & 0 deletions alembic/versions/694fde86c48c_id_manage_of_contents.py
@@ -0,0 +1,36 @@
"""id manage of contents.
Revision ID: 694fde86c48c
Revises: 59fa8a6b0a81
Create Date: 2024-09-20 09:01:06.015849
"""

import sqlalchemy as sa

from alembic import op

# revision identifiers, used by Alembic.
revision = "694fde86c48c"
down_revision = "59fa8a6b0a81"
branch_labels = None
depends_on = None


def upgrade() -> None:
op.create_index("ix_contents_type", "contents", ["type"])
op.drop_column("contents", "content_uid")
op.add_column("contents", sa.Column("slug", sa.String, index=True, nullable=False))
op.create_unique_constraint(
"contents_site_slug_type_key", "contents", ["site", "slug", "type"]
)


def downgrade() -> None:
op.drop_constraint("contents_site_slug_type_key", "contents")
op.drop_column("contents", "slug")
op.add_column(
"contents",
sa.Column("content_uid", sa.String, index=True, unique=True, nullable=False),
)
op.drop_index("ix_contents_type", "contents")
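For context, the migrated schema can be sanity-checked with SQLAlchemy's inspector. A minimal sketch, assuming a PostgreSQL database already upgraded to revision 694fde86c48c (the connection URL is hypothetical):

import sqlalchemy as sa

engine = sa.create_engine("postgresql://localhost/catalogue")  # hypothetical URL
insp = sa.inspect(engine)

# After the upgrade the globally unique content_uid column is gone...
assert "content_uid" not in [col["name"] for col in insp.get_columns("contents")]
# ...and the new slug only has to be unique per (site, slug, type) triple.
uniques = insp.get_unique_constraints("contents")
assert any(set(c["column_names"]) == {"site", "slug", "type"} for c in uniques)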
123 changes: 65 additions & 58 deletions cads_catalogue/contents.py
@@ -48,10 +48,9 @@ def content_sync(
The created/updated db content
"""
content = content.copy()
content_uid = content["content_uid"]
keywords = content.pop("keywords", [])

subpath = os.path.join("contents", content["content_uid"])
site, ctype, slug = content["site"], content["type"], content["slug"]
subpath = os.path.join("contents", site, ctype, slug)
for field in OBJECT_STORAGE_UPLOAD_FIELDS:
file_path = content.get(field)
if not file_path:
@@ -66,19 +65,25 @@

# upsert of the content
db_content = session.scalars(
sa.select(database.Content).filter_by(content_uid=content_uid).limit(1)
sa.select(database.Content)
.filter_by(
site=site,
type=ctype,
slug=slug,
)
.limit(1)
).first()
if not db_content:
db_content = database.Content(**content)
session.add(db_content)
logger.debug("added db content %r" % content_uid)
logger.debug(f"added content {ctype} '{slug}' for site {site}")
else:
session.execute(
sa.update(database.Content)
.filter_by(content_id=db_content.content_id)
.values(**content)
)
logger.debug("updated db content %r" % content_uid)
logger.debug("updated content {ctype} '{slug}' for site {site}")

# build related keywords
db_content.keywords = [] # type: ignore
@@ -98,50 +103,55 @@ def content_sync(
return db_content


def load_content_folder(content_folder: str | pathlib.Path) -> dict[str, Any]:
def load_content_folder(content_folder: str | pathlib.Path) -> List[dict[str, Any]]:
"""
Parse a content folder and returns its metadata dictionary.
Parse a content folder and return a list of metadata dictionaries, one for each content.
Parameters
----------
content_folder: folder path containing content files
Returns
-------
dictionary of information parsed.
list of parsed metadata dictionaries, one per content.
"""
metadata_file_path = os.path.join(content_folder, "metadata.json")
with open(metadata_file_path) as fp:
data = json.load(fp)
metadata = {
"site": ",".join(data["site"]),
"type": data["resource_type"],
"content_uid": data["id"],
"title": data["title"],
"description": data["abstract"],
"publication_date": data["publication_date"],
"content_update": data["update_date"],
"link": data.get("link"),
"keywords": data.get("keywords", []),
"data": data.get("data"),
# managed below:
# "image": None,
# "layout": None,
}
for ancillar_file_field in OBJECT_STORAGE_UPLOAD_FIELDS: # image, layout
metadata[ancillar_file_field] = None
rel_path = data.get(ancillar_file_field)
if rel_path:
ancillar_file_path = os.path.abspath(os.path.join(content_folder, rel_path))
if os.path.isfile(ancillar_file_path):
metadata[ancillar_file_field] = os.path.abspath(
ret_value = []
for site in data["site"]:
metadata = {
"site": site,
"type": data["resource_type"],
"slug": data["id"],
"title": data["title"],
"description": data["abstract"],
"publication_date": data["publication_date"],
"content_update": data["update_date"],
"link": data.get("link"),
"keywords": data.get("keywords", []),
"data": data.get("data"),
# managed below:
# "image": None,
# "layout": None,
}
for ancillar_file_field in OBJECT_STORAGE_UPLOAD_FIELDS: # image, layout
metadata[ancillar_file_field] = None
rel_path = data.get(ancillar_file_field)
if rel_path:
ancillar_file_path = os.path.abspath(
os.path.join(content_folder, rel_path)
)
else:
raise ValueError(
f"{metadata_file_path} contains reference to {ancillar_file_field} file not found!"
)
return metadata
if os.path.isfile(ancillar_file_path):
metadata[ancillar_file_field] = os.path.abspath(
os.path.join(content_folder, rel_path)
)
else:
raise ValueError(
f"{metadata_file_path} contains reference to {ancillar_file_field} file not found!"
)
ret_value.append(metadata)
return ret_value
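The net effect: a metadata.json that declares several sites now fans out into one metadata record per site, all sharing the same slug. A minimal sketch of the new behaviour, assuming a hypothetical content folder whose metadata.json declares "site": ["cds", "ads"] (values mirror the tests below):

from cads_catalogue import contents

records = contents.load_content_folder("contents/how-to-api")  # hypothetical path
# One record per declared site, in declaration order, all with the same slug:
assert [(r["site"], r["type"], r["slug"]) for r in records] == [
    ("cds", "page", "how-to-api"),
    ("ads", "page", "how-to-api"),
]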


def load_contents(contents_root_folder: str | pathlib.Path) -> List[dict[str, Any]]:
@@ -169,13 +179,13 @@ def load_contents(contents_root_folder: str | pathlib.Path) -> List[dict[str, An
logger.warning("unknown file %r found" % content_folder)
continue
try:
content_md = load_content_folder(content_folder)
contents_md = load_content_folder(content_folder)
except: # noqa
logger.exception(
"failed parsing content in %s, error follows" % content_folder
)
continue
loaded_contents.append(content_md)
loaded_contents += contents_md
return loaded_contents


@@ -197,41 +207,38 @@ def update_catalogue_contents(
Returns
-------
list: list of content uids involved
list: list of (site, type, slug) tuples of the contents involved
"""
contents = load_contents(contents_package_path)
logger.info(
"loaded %s contents from folder %s" % (len(contents), contents_package_path)
)
involved_content_ids = []
involved_content_props = []
for content in contents:
content_uid = content["content_uid"]
involved_content_ids.append(content_uid)
site, ctype, slug = content["site"], content["type"], content["slug"]
involved_content_props.append((site, ctype, slug))
try:
with session.begin_nested():
content_sync(session, content, storage_settings)
logger.info("content '%s' db sync successful" % content_uid)
logger.info(f"content {ctype} '{slug}' for site {site}: db sync successful")
except Exception: # noqa
logger.exception(
"db sync for content '%s' failed, error follows" % content_uid
f"db sync for content {ctype} '{slug}' for site {site} failed, error follows"
)

if not remove_orphans:
return involved_content_ids
return involved_content_props

# remove contents that are no longer loaded from the db
contents_to_delete = (
session.scalars(
sa.select(database.Content).filter(
database.Content.content_uid.notin_(involved_content_ids)
all_db_contents = session.scalars(sa.select(database.Content))
for db_content in all_db_contents:
content_props = (db_content.site, db_content.type, db_content.slug)
if content_props not in involved_content_props:
db_content.keywords = []
session.delete(db_content)
logger.info(
f"removed old content {content_props[1]} '{content_props[2]}' "
f"for site {content_props[0]}"
)
)
.unique()
.all()
)
for content_to_delete in contents_to_delete:
content_to_delete.keywords = []
session.delete(content_to_delete)
logger.info("removed old content '%s'" % content_to_delete.content_uid)

return involved_content_ids
return involved_content_props
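Callers now get back (site, type, slug) triples instead of content uids. A hedged usage sketch (session and storage_settings are assumed to be set up as elsewhere in cads_catalogue; the contents path is hypothetical):

from cads_catalogue import contents

involved = contents.update_catalogue_contents(
    session=session,  # an open SQLAlchemy ORM session
    contents_package_path="contents/",  # hypothetical folder of content folders
    storage_settings=storage_settings,  # object storage settings, as passed to content_sync
)
# e.g. [("cds", "application", "copernicus-interactive-climates-atlas"),
#       ("cds", "page", "how-to-api"), ("ads", "page", "how-to-api")]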
5 changes: 3 additions & 2 deletions cads_catalogue/database.py
@@ -73,9 +73,10 @@ class Content(BaseModel):
"""Content ORM model."""

__tablename__ = "contents"
__table_args__ = (sa.UniqueConstraint("site", "slug", "type"),)

content_id = sa.Column(sa.Integer, primary_key=True)
content_uid = sa.Column(sa.String, index=True, unique=True, nullable=False)
slug = sa.Column(sa.String, index=True, nullable=False)
content_update = sa.Column(sa.TIMESTAMP, nullable=False)
data = sa.Column(dialect_postgresql.JSONB)
description = sa.Column(sa.String, nullable=False)
@@ -85,7 +86,7 @@ class Content(BaseModel):
publication_date = sa.Column(sa.TIMESTAMP, nullable=False)
site = sa.Column(sa.String, index=True, nullable=False)
title = sa.Column(sa.String, nullable=False)
type = sa.Column(sa.String, nullable=False)
type = sa.Column(sa.String, index=True, nullable=False)

keywords: sa.orm.Mapped[List["ContentKeyword"]] = sa.orm.relationship(
"ContentKeyword", secondary="contents_keywords_m2m", back_populates="contents"
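With content_uid dropped from the ORM model, lookups go through the composite natural key, mirroring the upsert in contents.py above. A minimal helper sketch (find_content is illustrative, not part of the package; session is assumed to be an open session bound to the migrated schema):

import sqlalchemy as sa

from cads_catalogue import database

def find_content(session, site: str, ctype: str, slug: str):
    """Return the single content identified by (site, type, slug), or None."""
    return session.scalars(
        sa.select(database.Content)
        .filter_by(site=site, type=ctype, slug=slug)
        .limit(1)
    ).first()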
89 changes: 54 additions & 35 deletions tests/test_15_contents.py
@@ -1,5 +1,6 @@
import datetime
import os.path
from operator import itemgetter

import pytest_mock
import sqlalchemy as sa
@@ -15,43 +16,45 @@ def test_load_content_folder() -> None:
content_folder = os.path.join(
TEST_CONTENT_ROOT_PATH, "copernicus-interactive-climates-atlas"
)
expected_content = {
"content_uid": "copernicus-interactive-climates-atlas",
"publication_date": "2024-09-13T00:00:00Z",
"description": "The Copernicus Interactive Climate Atlas provides graphical "
"information about recent past trends and future changes "
"(for different scenarios and global warming levels)",
"image": os.path.join(content_folder, "cica-overview.png"),
"keywords": [
"Product type: Application",
"Spatial coverage: Global",
"Temporal coverage: Past",
"Variable domain: Land (hydrology)",
"Variable domain: Land (physics)",
"Variable domain: Land (biosphere)",
"Provider: Copernicus C3S",
],
"layout": None,
"link": "https://atlas.climate.copernicus.eu/atlas",
"content_update": "2024-09-16T00:00:00Z",
"site": "cds",
"title": "Copernicus Interactive Climate Atlas",
"type": "application",
"data": {
"file-format": "GRIB (optional conversion to netCDF)",
"data-type": "Gridded",
"horizontal-coverage": "Global",
},
}
expected_contents = [
{
"slug": "copernicus-interactive-climates-atlas",
"publication_date": "2024-09-13T00:00:00Z",
"description": "The Copernicus Interactive Climate Atlas provides graphical "
"information about recent past trends and future changes "
"(for different scenarios and global warming levels)",
"image": os.path.join(content_folder, "cica-overview.png"),
"keywords": [
"Product type: Application",
"Spatial coverage: Global",
"Temporal coverage: Past",
"Variable domain: Land (hydrology)",
"Variable domain: Land (physics)",
"Variable domain: Land (biosphere)",
"Provider: Copernicus C3S",
],
"layout": None,
"link": "https://atlas.climate.copernicus.eu/atlas",
"content_update": "2024-09-16T00:00:00Z",
"site": "cds",
"title": "Copernicus Interactive Climate Atlas",
"type": "application",
"data": {
"file-format": "GRIB (optional conversion to netCDF)",
"data-type": "Gridded",
"horizontal-coverage": "Global",
},
}
]

effective_content = contents.load_content_folder(content_folder)
assert effective_content == expected_content
effective_contents = contents.load_content_folder(content_folder)
assert effective_contents == expected_contents


def test_load_contents() -> None:
expected_contents = [
{
"content_uid": "copernicus-interactive-climates-atlas",
"slug": "copernicus-interactive-climates-atlas",
"publication_date": "2024-09-13T00:00:00Z",
"description": "The Copernicus Interactive Climate Atlas provides graphical "
"information about recent past trends and future changes "
@@ -83,21 +86,37 @@ def test_load_contents() -> None:
},
},
{
"content_uid": "how-to-api",
"slug": "how-to-api",
"publication_date": "2024-09-13T10:01:50Z",
"description": "Access the full data store catalogue, with search and availability features",
"image": None,
"keywords": [],
"layout": os.path.join(TEST_CONTENT_ROOT_PATH, "how-to-api", "layout.json"),
"content_update": "2024-09-16T02:10:22Z",
"link": None,
"site": "cds,ads",
"site": "ads",
"title": "CDSAPI setup",
"type": "page",
"data": None,
},
{
"slug": "how-to-api",
"publication_date": "2024-09-13T10:01:50Z",
"description": "Access the full data store catalogue, with search and availability features",
"image": None,
"keywords": [],
"layout": os.path.join(TEST_CONTENT_ROOT_PATH, "how-to-api", "layout.json"),
"content_update": "2024-09-16T02:10:22Z",
"link": None,
"site": "cds",
"title": "CDSAPI setup",
"type": "page",
"data": None,
},
]
effective_contents = contents.load_contents(TEST_CONTENT_ROOT_PATH)
effective_contents = sorted(
contents.load_contents(TEST_CONTENT_ROOT_PATH), key=itemgetter("slug", "site")
)
assert effective_contents == expected_contents


@@ -119,7 +138,7 @@ def test_content_sync(
mocker.patch.object(object_storage, "store_file", return_value="an url")
# load testing content
content1 = {
"content_uid": "copernicus-interactive-climates-atlas",
"slug": "copernicus-interactive-climates-atlas",
"publication_date": "2024-09-13T00:00:00Z",
"description": "The Copernicus Interactive Climate Atlas provides graphical "
"information about recent past trends and future changes "