Merge pull request #132 from ecmwf-projects/COPDS-1801-custom-types-part2

different uids for each content's site/application
alex75 authored Sep 23, 2024
2 parents 3b6aa61 + 22075fd commit 18248f5
Showing 5 changed files with 175 additions and 111 deletions.
36 changes: 36 additions & 0 deletions alembic/versions/694fde86c48c_id_manage_of_contents.py
@@ -0,0 +1,36 @@
"""id manage of contents.
Revision ID: 694fde86c48c
Revises: 59fa8a6b0a81
Create Date: 2024-09-20 09:01:06.015849
"""

import sqlalchemy as sa

from alembic import op

# revision identifiers, used by Alembic.
revision = "694fde86c48c"
down_revision = "59fa8a6b0a81"
branch_labels = None
depends_on = None


def upgrade() -> None:
op.create_index("ix_contents_type", "contents", ["type"])
op.drop_column("contents", "content_uid")
op.add_column("contents", sa.Column("slug", sa.String, index=True, nullable=False))
op.create_unique_constraint(
"contents_site_slug_type_key", "contents", ["site", "slug", "type"]
)


def downgrade() -> None:
op.drop_constraint("contents_site_slug_type_key", "contents")
op.drop_column("contents", "slug")
op.add_column(
"contents",
sa.Column("content_uid", sa.String, index=True, unique=True, nullable=False),
)
op.drop_index("ix_contents_type", "contents")
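For context, the migrated schema can be sanity-checked with SQLAlchemy's inspector. A minimal sketch, assuming a PostgreSQL database already upgraded to revision 694fde86c48c (the connection URL is hypothetical):

import sqlalchemy as sa

engine = sa.create_engine("postgresql://localhost/catalogue")  # hypothetical URL
insp = sa.inspect(engine)

# After the upgrade the globally unique content_uid column is gone...
assert "content_uid" not in [col["name"] for col in insp.get_columns("contents")]
# ...and the new slug only has to be unique per (site, slug, type) triple.
uniques = insp.get_unique_constraints("contents")
assert any(set(c["column_names"]) == {"site", "slug", "type"} for c in uniques)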
123 changes: 65 additions & 58 deletions cads_catalogue/contents.py
@@ -48,10 +48,9 @@ def content_sync(
The created/updated db content
"""
content = content.copy()
content_uid = content["content_uid"]
keywords = content.pop("keywords", [])

subpath = os.path.join("contents", content["content_uid"])
site, ctype, slug = content["site"], content["type"], content["slug"]
subpath = os.path.join("contents", site, ctype, slug)
for field in OBJECT_STORAGE_UPLOAD_FIELDS:
file_path = content.get(field)
if not file_path:
@@ -66,19 +65,25 @@

# upsert of the content
db_content = session.scalars(
sa.select(database.Content).filter_by(content_uid=content_uid).limit(1)
sa.select(database.Content)
.filter_by(
site=site,
type=ctype,
slug=slug,
)
.limit(1)
).first()
if not db_content:
db_content = database.Content(**content)
session.add(db_content)
logger.debug("added db content %r" % content_uid)
logger.debug(f"added content {ctype} '{slug}' for site {site}")
else:
session.execute(
sa.update(database.Content)
.filter_by(content_id=db_content.content_id)
.values(**content)
)
logger.debug("updated db content %r" % content_uid)
logger.debug("updated content {ctype} '{slug}' for site {site}")

# build related keywords
db_content.keywords = [] # type: ignore
@@ -98,50 +103,55 @@ def content_sync(
return db_content


def load_content_folder(content_folder: str | pathlib.Path) -> dict[str, Any]:
def load_content_folder(content_folder: str | pathlib.Path) -> List[dict[str, Any]]:
"""
Parse a content folder and returns its metadata dictionary.
Parse a content folder and return a list of metadata dictionaries, one for each content.
Parameters
----------
content_folder: folder path containing content files
Returns
-------
dictionary of information parsed.
list of parsed metadata dictionaries, one per content.
"""
metadata_file_path = os.path.join(content_folder, "metadata.json")
with open(metadata_file_path) as fp:
data = json.load(fp)
metadata = {
"site": ",".join(data["site"]),
"type": data["resource_type"],
"content_uid": data["id"],
"title": data["title"],
"description": data["abstract"],
"publication_date": data["publication_date"],
"content_update": data["update_date"],
"link": data.get("link"),
"keywords": data.get("keywords", []),
"data": data.get("data"),
# managed below:
# "image": None,
# "layout": None,
}
for ancillar_file_field in OBJECT_STORAGE_UPLOAD_FIELDS: # image, layout
metadata[ancillar_file_field] = None
rel_path = data.get(ancillar_file_field)
if rel_path:
ancillar_file_path = os.path.abspath(os.path.join(content_folder, rel_path))
if os.path.isfile(ancillar_file_path):
metadata[ancillar_file_field] = os.path.abspath(
ret_value = []
for site in data["site"]:
metadata = {
"site": site,
"type": data["resource_type"],
"slug": data["id"],
"title": data["title"],
"description": data["abstract"],
"publication_date": data["publication_date"],
"content_update": data["update_date"],
"link": data.get("link"),
"keywords": data.get("keywords", []),
"data": data.get("data"),
# managed below:
# "image": None,
# "layout": None,
}
for ancillar_file_field in OBJECT_STORAGE_UPLOAD_FIELDS: # image, layout
metadata[ancillar_file_field] = None
rel_path = data.get(ancillar_file_field)
if rel_path:
ancillar_file_path = os.path.abspath(
os.path.join(content_folder, rel_path)
)
else:
raise ValueError(
f"{metadata_file_path} contains reference to {ancillar_file_field} file not found!"
)
return metadata
if os.path.isfile(ancillar_file_path):
metadata[ancillar_file_field] = os.path.abspath(
os.path.join(content_folder, rel_path)
)
else:
raise ValueError(
f"{metadata_file_path} contains reference to {ancillar_file_field} file not found!"
)
ret_value.append(metadata)
return ret_value
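The net effect: a metadata.json that declares several sites now fans out into one metadata record per site, all sharing the same slug. A minimal sketch of the new behaviour, assuming a hypothetical content folder whose metadata.json declares "site": ["cds", "ads"] (values mirror the tests below):

from cads_catalogue import contents

records = contents.load_content_folder("contents/how-to-api")  # hypothetical path
# One record per declared site, in declaration order, all with the same slug:
assert [(r["site"], r["type"], r["slug"]) for r in records] == [
    ("cds", "page", "how-to-api"),
    ("ads", "page", "how-to-api"),
]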


def load_contents(contents_root_folder: str | pathlib.Path) -> List[dict[str, Any]]:
@@ -169,13 +179,13 @@ def load_contents(contents_root_folder: str | pathlib.Path) -> List[dict[str, An
logger.warning("unknown file %r found" % content_folder)
continue
try:
content_md = load_content_folder(content_folder)
contents_md = load_content_folder(content_folder)
except: # noqa
logger.exception(
"failed parsing content in %s, error follows" % content_folder
)
continue
loaded_contents.append(content_md)
loaded_contents += contents_md
return loaded_contents


@@ -197,41 +207,38 @@ def update_catalogue_contents(
Returns
-------
list: list of content uids involved
list: list of (site, type, slug) tuples of the contents involved
"""
contents = load_contents(contents_package_path)
logger.info(
"loaded %s contents from folder %s" % (len(contents), contents_package_path)
)
involved_content_ids = []
involved_content_props = []
for content in contents:
content_uid = content["content_uid"]
involved_content_ids.append(content_uid)
site, ctype, slug = content["site"], content["type"], content["slug"]
involved_content_props.append((site, ctype, slug))
try:
with session.begin_nested():
content_sync(session, content, storage_settings)
logger.info("content '%s' db sync successful" % content_uid)
logger.info(f"content {ctype} '{slug}' for site {site}: db sync successful")
except Exception: # noqa
logger.exception(
"db sync for content '%s' failed, error follows" % content_uid
f"db sync for content {ctype} '{slug}' for site {site} failed, error follows"
)

if not remove_orphans:
return involved_content_ids
return involved_content_props

# remove contents that are no longer loaded from the db
contents_to_delete = (
session.scalars(
sa.select(database.Content).filter(
database.Content.content_uid.notin_(involved_content_ids)
all_db_contents = session.scalars(sa.select(database.Content))
for db_content in all_db_contents:
content_props = (db_content.site, db_content.type, db_content.slug)
if content_props not in involved_content_props:
db_content.keywords = []
session.delete(db_content)
logger.info(
f"removed old content {content_props[1]} '{content_props[2]}' "
f"for site {content_props[0]}"
)
)
.unique()
.all()
)
for content_to_delete in contents_to_delete:
content_to_delete.keywords = []
session.delete(content_to_delete)
logger.info("removed old content '%s'" % content_to_delete.content_uid)

return involved_content_ids
return involved_content_props
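Callers now get back (site, type, slug) triples instead of content uids. A hedged usage sketch (session and storage_settings are assumed to be set up as elsewhere in cads_catalogue; the contents path is hypothetical):

from cads_catalogue import contents

involved = contents.update_catalogue_contents(
    session=session,  # an open SQLAlchemy ORM session
    contents_package_path="contents/",  # hypothetical folder of content folders
    storage_settings=storage_settings,  # object storage settings, as passed to content_sync
)
# e.g. [("cds", "application", "copernicus-interactive-climates-atlas"),
#       ("cds", "page", "how-to-api"), ("ads", "page", "how-to-api")]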
5 changes: 3 additions & 2 deletions cads_catalogue/database.py
@@ -73,9 +73,10 @@ class Content(BaseModel):
"""Content ORM model."""

__tablename__ = "contents"
__table_args__ = (sa.UniqueConstraint("site", "slug", "type"),)

content_id = sa.Column(sa.Integer, primary_key=True)
content_uid = sa.Column(sa.String, index=True, unique=True, nullable=False)
slug = sa.Column(sa.String, index=True, nullable=False)
content_update = sa.Column(sa.TIMESTAMP, nullable=False)
data = sa.Column(dialect_postgresql.JSONB)
description = sa.Column(sa.String, nullable=False)
@@ -85,7 +86,7 @@ class Content(BaseModel):
publication_date = sa.Column(sa.TIMESTAMP, nullable=False)
site = sa.Column(sa.String, index=True, nullable=False)
title = sa.Column(sa.String, nullable=False)
type = sa.Column(sa.String, nullable=False)
type = sa.Column(sa.String, index=True, nullable=False)

keywords: sa.orm.Mapped[List["ContentKeyword"]] = sa.orm.relationship(
"ContentKeyword", secondary="contents_keywords_m2m", back_populates="contents"
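With content_uid dropped from the ORM model, lookups go through the composite natural key, mirroring the upsert in contents.py above. A minimal helper sketch (find_content is illustrative, not part of the package; session is assumed to be an open session bound to the migrated schema):

import sqlalchemy as sa

from cads_catalogue import database

def find_content(session, site: str, ctype: str, slug: str):
    """Return the single content identified by (site, type, slug), or None."""
    return session.scalars(
        sa.select(database.Content)
        .filter_by(site=site, type=ctype, slug=slug)
        .limit(1)
    ).first()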
89 changes: 54 additions & 35 deletions tests/test_15_contents.py
@@ -1,5 +1,6 @@
import datetime
import os.path
from operator import itemgetter

import pytest_mock
import sqlalchemy as sa
@@ -15,43 +16,45 @@ def test_load_content_folder() -> None:
content_folder = os.path.join(
TEST_CONTENT_ROOT_PATH, "copernicus-interactive-climates-atlas"
)
expected_content = {
"content_uid": "copernicus-interactive-climates-atlas",
"publication_date": "2024-09-13T00:00:00Z",
"description": "The Copernicus Interactive Climate Atlas provides graphical "
"information about recent past trends and future changes "
"(for different scenarios and global warming levels)",
"image": os.path.join(content_folder, "cica-overview.png"),
"keywords": [
"Product type: Application",
"Spatial coverage: Global",
"Temporal coverage: Past",
"Variable domain: Land (hydrology)",
"Variable domain: Land (physics)",
"Variable domain: Land (biosphere)",
"Provider: Copernicus C3S",
],
"layout": None,
"link": "https://atlas.climate.copernicus.eu/atlas",
"content_update": "2024-09-16T00:00:00Z",
"site": "cds",
"title": "Copernicus Interactive Climate Atlas",
"type": "application",
"data": {
"file-format": "GRIB (optional conversion to netCDF)",
"data-type": "Gridded",
"horizontal-coverage": "Global",
},
}
expected_contents = [
{
"slug": "copernicus-interactive-climates-atlas",
"publication_date": "2024-09-13T00:00:00Z",
"description": "The Copernicus Interactive Climate Atlas provides graphical "
"information about recent past trends and future changes "
"(for different scenarios and global warming levels)",
"image": os.path.join(content_folder, "cica-overview.png"),
"keywords": [
"Product type: Application",
"Spatial coverage: Global",
"Temporal coverage: Past",
"Variable domain: Land (hydrology)",
"Variable domain: Land (physics)",
"Variable domain: Land (biosphere)",
"Provider: Copernicus C3S",
],
"layout": None,
"link": "https://atlas.climate.copernicus.eu/atlas",
"content_update": "2024-09-16T00:00:00Z",
"site": "cds",
"title": "Copernicus Interactive Climate Atlas",
"type": "application",
"data": {
"file-format": "GRIB (optional conversion to netCDF)",
"data-type": "Gridded",
"horizontal-coverage": "Global",
},
}
]

effective_content = contents.load_content_folder(content_folder)
assert effective_content == expected_content
effective_contents = contents.load_content_folder(content_folder)
assert effective_contents == expected_contents


def test_load_contents() -> None:
expected_contents = [
{
"content_uid": "copernicus-interactive-climates-atlas",
"slug": "copernicus-interactive-climates-atlas",
"publication_date": "2024-09-13T00:00:00Z",
"description": "The Copernicus Interactive Climate Atlas provides graphical "
"information about recent past trends and future changes "
@@ -83,21 +86,37 @@ def test_load_contents() -> None:
},
},
{
"content_uid": "how-to-api",
"slug": "how-to-api",
"publication_date": "2024-09-13T10:01:50Z",
"description": "Access the full data store catalogue, with search and availability features",
"image": None,
"keywords": [],
"layout": os.path.join(TEST_CONTENT_ROOT_PATH, "how-to-api", "layout.json"),
"content_update": "2024-09-16T02:10:22Z",
"link": None,
"site": "cds,ads",
"site": "ads",
"title": "CDSAPI setup",
"type": "page",
"data": None,
},
{
"slug": "how-to-api",
"publication_date": "2024-09-13T10:01:50Z",
"description": "Access the full data store catalogue, with search and availability features",
"image": None,
"keywords": [],
"layout": os.path.join(TEST_CONTENT_ROOT_PATH, "how-to-api", "layout.json"),
"content_update": "2024-09-16T02:10:22Z",
"link": None,
"site": "cds",
"title": "CDSAPI setup",
"type": "page",
"data": None,
},
]
effective_contents = contents.load_contents(TEST_CONTENT_ROOT_PATH)
effective_contents = sorted(
contents.load_contents(TEST_CONTENT_ROOT_PATH), key=itemgetter("slug", "site")
)
assert effective_contents == expected_contents


@@ -119,7 +138,7 @@ def test_content_sync(
mocker.patch.object(object_storage, "store_file", return_value="an url")
# load testing content
content1 = {
"content_uid": "copernicus-interactive-climates-atlas",
"slug": "copernicus-interactive-climates-atlas",
"publication_date": "2024-09-13T00:00:00Z",
"description": "The Copernicus Interactive Climate Atlas provides graphical "
"information about recent past trends and future changes "