From 442a5a4466a7ac2eab77aac49391ed93b38e7bb0 Mon Sep 17 00:00:00 2001 From: Ramakrishna Sakhamuru Date: Tue, 5 Nov 2024 12:01:09 +0530 Subject: [PATCH] Updated files to accept multiple sitemaps request --- doajtest/mocks/models_Cache.py | 10 ++++ doajtest/unit/test_bll_site_sitemap.py | 66 +++++++++++++++++++++----- portality/bll/services/site.py | 26 ++++++++-- portality/models/cache.py | 15 ++++++ portality/view/doaj.py | 17 ++++++- 5 files changed, 115 insertions(+), 19 deletions(-) diff --git a/doajtest/mocks/models_Cache.py b/doajtest/mocks/models_Cache.py index 580a8e08cf..d0a91ddadd 100644 --- a/doajtest/mocks/models_Cache.py +++ b/doajtest/mocks/models_Cache.py @@ -26,6 +26,16 @@ def cache_sitemap(cls, filename): "filename" : filename } + @classmethod + def cache_nth_sitemap(cls, n, url): + cls.__memory__["sitemap" + str(n)] = { + "filename": url + } + + @classmethod + def get_sitemap(cls, n): + return cls.__memory__["sitemap" + str(n)] + @classmethod def get_latest_sitemap(cls): return cls.__memory__["sitemap"] diff --git a/doajtest/unit/test_bll_site_sitemap.py b/doajtest/unit/test_bll_site_sitemap.py index d7f8ec4b0c..b94eb382f7 100644 --- a/doajtest/unit/test_bll_site_sitemap.py +++ b/doajtest/unit/test_bll_site_sitemap.py @@ -1,3 +1,4 @@ +import os.path from io import StringIO from combinatrix.testintegration import load_parameter_sets @@ -5,7 +6,7 @@ from parameterized import parameterized from doajtest import helpers -from doajtest.fixtures import JournalFixtureFactory +from doajtest.fixtures import JournalFixtureFactory, ArticleFixtureFactory from doajtest.helpers import DoajTestCase, patch_config from doajtest.mocks.models_Cache import ModelCacheMockFactory from doajtest.mocks.store import StoreMockFactory @@ -67,6 +68,10 @@ def setUp(self): } ] + self.base_url = app.config.get("BASE_URL") + if not self.base_url.endswith("/"): + self.base_url += "/" + def tearDown(self): self.localStore.delete_container(self.container_id) 
self.tmpStore.delete_container(self.container_id) @@ -110,12 +115,21 @@ def test_sitemap(self, name, kwargs): expectations = [(j.bibjson().get_preferred_issn(), j.last_updated) for j in journals] + articles = [] + for s in ArticleFixtureFactory.make_many_article_sources(count=10, in_doaj=True): + a = models.Article(**s) + a.save() + articles.append(a) + models.Article.blockall([(a.id, a.last_updated) for a in articles]) + + articles_expectations = [(a.id, a.last_updated) for a in articles] + if prune: - self.localStore.store(self.container_id, "sitemap__doaj_20180101_0000_utf8.xml", + self.localStore.store(self.container_id, "sitemap_doaj_20180101_0000_utf8.xml", source_stream=StringIO("test1")) - self.localStore.store(self.container_id, "sitemap__doaj_20180601_0000_utf8.xml", + self.localStore.store(self.container_id, "sitemap_doaj_20180601_0000_utf8.xml", source_stream=StringIO("test2")) - self.localStore.store(self.container_id, "sitemap__doaj_20190101_0000_utf8.xml", + self.localStore.store(self.container_id, "sitemap_doaj_20190101_0000_utf8.xml", source_stream=StringIO("test3")) ########################################################### @@ -139,41 +153,63 @@ def test_sitemap(self, name, kwargs): filenames = self.localStore.list(self.container_id) if prune: assert len(filenames) == 2, "expected 0, received {}".format(len(filenames)) - assert "sitemap__doaj_20180101_0000_utf8.xml" not in filenames - assert "sitemap__doaj_20180601_0000_utf8.xml" not in filenames - assert "sitemap__doaj_20190101_0000_utf8.xml" in filenames + assert "sitemap_doaj_20180101_0000_utf8.xml" not in filenames + assert "sitemap_doaj_20180601_0000_utf8.xml" not in filenames + assert "sitemap_doaj_20190101_0000_utf8.xml" in filenames else: assert len(filenames) == 1, "expected 0, received {}".format(len(filenames)) latest = None for fn in filenames: - if fn != "sitemap__doaj_20190101_0000_utf8.xml": + if fn != "sitemap_doaj_20190101_0000_utf8.xml": latest = fn break - handle = 
self.localStore.get(self.container_id, latest, encoding="utf-8") + NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}" + + file_date = '_'.join(latest.split('_')[2:]) + index_file = os.path.join(latest, 'sitemap_index_doaj_'+file_date+'_utf8.xml') - # check the contents + handle = self.localStore.get(self.container_id, index_file, encoding="utf-8") + + # check sitemap index file + tree = etree.parse(handle) + urlElements = tree.getroot().getchildren() + for urlElement in urlElements: + loc = urlElement.find(NS + "loc").text + assert loc.startswith(self.base_url + "sitemap") tocs = [] statics = [] - NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}" + article_ids = [] + + # check sitemap file + sitemap_file = os.path.join(latest, 'sitemap_doaj_' + file_date + '_0_utf8.xml') + handle = self.localStore.get(self.container_id, sitemap_file, encoding="utf-8") tree = etree.parse(handle) urlElements = tree.getroot().getchildren() + for urlElement in urlElements: loc = urlElement.find(NS + "loc").text lm = urlElement.find(NS + "lastmod") if lm is not None: lm = lm.text cf = urlElement.find(NS + "changefreq").text - + assert cf == "daily" + # check journals if "/toc" in loc: for exp in expectations: if loc.endswith(exp[0]): tocs.append(exp[0]) assert lm == exp[1] - assert cf == "daily" + # check articles + elif "/article/" in loc: + for exp in articles_expectations: + if loc.endswith(exp[0]): + article_ids.append(exp[0]) + assert lm == exp[1] + # check static pages else: statics.append(loc) assert lm is None @@ -183,6 +219,10 @@ def test_sitemap(self, name, kwargs): list(set(tocs)) assert len(tocs) == len(expectations) + # deduplicate the list of articles, to check that we saw all articles + list(set(article_ids)) + assert len(article_ids) == len(articles_expectations) + # deduplicate the statics, to check we saw all of them too _urls = (get_full_url_safe(r) for r in nav.yield_all_route(self.static_entries)) diff --git a/portality/bll/services/site.py 
b/portality/bll/services/site.py index 8e7edb0e03..87a7b22da6 100644 --- a/portality/bll/services/site.py +++ b/portality/bll/services/site.py @@ -9,9 +9,10 @@ from portality.core import app from portality.lib import nav, dates from portality.lib.argvalidate import argvalidate -from portality.lib.dates import FMT_DATETIME_SHORT +from portality.lib.dates import FMT_DATETIME_SHORT, FMT_DATETIME_STD from portality.store import StoreFactory, prune_container from portality.util import get_full_url_safe +from portality.view.doaj import sitemap NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}" IN_DOAJ = { @@ -103,6 +104,7 @@ def sitemap(self, prune: bool = True): base_url += "/" run_start_time = dates.now_str(FMT_DATETIME_SHORT) + lastmod_date = dates.now_str(FMT_DATETIME_STD) filename_prefix = 'sitemap_doaj_' + run_start_time cache_container_id = app.config.get("STORE_CACHE_CONTAINER") @@ -150,6 +152,7 @@ def sitemap(self, prune: bool = True): sitemap_generator.add_url(article_loc, lastmod=a.last_updated) total_articles_count += 1 + # check last sitemap if sitemap_generator.get_url_count() > 0: sitemap_generator.finalize_sitemap_file() @@ -159,13 +162,28 @@ def sitemap(self, prune: bool = True): with open(sitemap_index_path, "w") as f: f.write('<?xml version="1.0" encoding="UTF-8"?>\n') f.write('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n') + sitemap_count = 0 for sitemap_url in sitemap_generator.get_sitemap_files(): f.write(f" <sitemap>\n") - f.write(f" <loc>{sitemap_url}</loc>\n") - f.write(f" <lastmod>{run_start_time}</lastmod>\n") + f.write(f" <loc>{base_url}sitemap{sitemap_count}.xml</loc>\n") + f.write(f" <lastmod>{lastmod_date}</lastmod>\n") f.write(f" </sitemap>\n") + # Cache the sitemap + models.Cache.cache_nth_sitemap(sitemap_count, sitemap_url) + sitemap_count += 1 f.write('</sitemapindex>\n') + # Delete any previous cache.
Usually this may not be the situation but check + # if there are any previous sitemap available and delete + while True: + cache = models.Cache.pull("sitemap"+str(sitemap_count)) + if cache: + cache.delete() + else: + break + sitemap_count += 1 + + mainStore.store(container_id, sitemap_index_filename, source_path=sitemap_index_path) index_url = mainStore.url(container_id, sitemap_index_filename) @@ -174,7 +192,7 @@ def sitemap(self, prune: bool = True): # Prune old sitemaps if required if prune: def sort(filelist): - rx = "sitemap_doaj_(\d{8})_(\d{4})" + rx = r"^sitemap_doaj_(\d{8})_(\d{4})" matched_dates = [ (filename, datetime.strptime(match.groups()[0]+"_"+match.groups()[1], FMT_DATETIME_SHORT)) diff --git a/portality/models/cache.py b/portality/models/cache.py index 25c2843e9c..9b1c56b445 100644 --- a/portality/models/cache.py +++ b/portality/models/cache.py @@ -62,6 +62,14 @@ def cache_sitemap(cls, url): cobj.set_id("sitemap") cobj.save() + @classmethod + def cache_nth_sitemap(cls, n, url): + cobj = cls(**{ + "filename": url + }) + cobj.set_id("sitemap"+str(n)) + cobj.save() + @classmethod def get_latest_sitemap(cls): rec = cls.pull("sitemap") @@ -69,6 +77,13 @@ def get_latest_sitemap(cls): return None return rec.get("filename") + @classmethod + def get_sitemap(cls, n): + rec = cls.pull("sitemap"+str(n)) + if rec is None: + return None + return rec.get("filename") + @classmethod def cache_public_data_dump(cls, article_container, article_filename, article_url, article_size, journal_container, journal_filename, journal_url, journal_size): diff --git a/portality/view/doaj.py b/portality/view/doaj.py index d4d775e846..198c101bd3 100644 --- a/portality/view/doaj.py +++ b/portality/view/doaj.py @@ -1,4 +1,5 @@ import json +import os.path import re import urllib.error import urllib.parse @@ -168,7 +169,7 @@ def csv_data(): store_url = "/store" + store_url return redirect(store_url, code=307) - +@blueprint.route("/sitemap_index.xml") @blueprint.route("/sitemap.xml") 
def sitemap(): sitemap_url = models.Cache.get_latest_sitemap() @@ -178,6 +179,13 @@ def sitemap(): sitemap_url = "/store" + sitemap_url return redirect(sitemap_url, code=307) +@blueprint.route("/sitemap<int:n>.xml") +def nth_sitemap(n): + sitemap_url = models.Cache.get_sitemap(n) + if sitemap_url.startswith("/"): + sitemap_url = "/store" + sitemap_url + return redirect(sitemap_url, code=307) + @blueprint.route("/public-data-dump/<record_type>") @api_key_required @@ -206,6 +214,10 @@ def public_data_dump_redirect(record_type): return redirect(store_url, code=307) +@blueprint.route("/store/<container>/<dir>/<filename>") +def get_from_local_store_dir(container, dir, filename): + file = os.path.join(dir, filename) + return get_from_local_store(container, file) @blueprint.route("/store/<container>/<filename>") def get_from_local_store(container, filename): @@ -215,7 +227,8 @@ def get_from_local_store(container, filename): from portality import store localStore = store.StoreFactory.get(None) file_handle = localStore.get(container, filename) - return send_file(file_handle, mimetype="application/octet-stream", as_attachment=True, attachment_filename=filename) + return send_file(file_handle, mimetype="application/octet-stream", as_attachment=True, + attachment_filename=os.path.basename(filename)) @blueprint.route('/autocomplete/<doc_type>/<field_name>', methods=["GET", "POST"])