Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 19 Use preferred CMR-Search-After method for iterating through results #22

Merged
merged 9 commits into from
Aug 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .DS_Store
Binary file not shown.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ venv/
tags
.venv
*.egg-info
dist
dist
.vscode/*
26 changes: 17 additions & 9 deletions cmr/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,15 @@ def get(self, limit=2000):
url = self._build_url()

results = []
page = 1
while len(results) < limit:
more_results = True
while more_results == True:

response = get(url, headers=self.headers, params={'page_size': page_size, 'page_num': page})
# Only get what we need
page_size = min(limit - len(results), page_size)
response = get(url, headers=self.headers, params={'page_size': page_size})
if self.headers == None:
self.headers = {}
self.headers['cmr-search-after'] = response.headers['cmr-search-after']

try:
response.raise_for_status()
Expand All @@ -65,13 +70,16 @@ def get(self, limit=2000):
latest = response.json()['feed']['entry']
else:
latest = [response.text]

if len(latest) == 0:
break


results.extend(latest)
page += 1


if page_size > len(response.json()['feed']['entry']) or len(results) >= limit:
more_results = False

# This header is transient. We need to get rid of it before we do another different query
if self.headers['cmr-search-after']:
del self.headers['cmr-search-after']

return results

def hits(self):
Expand Down
367 changes: 358 additions & 9 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ requests = "^2.26.0"
[tool.poetry.dev-dependencies]
flake8 = "^4.0.1"
pytest = "^6.2.5"
vcrpy = "^5.1.0"

[build-system]
requires = ["poetry-core>=1.0.0"]
Expand Down
Binary file added tests/fixtures/.DS_Store
Binary file not shown.
20,399 changes: 20,399 additions & 0 deletions tests/fixtures/vcr_cassettes/CYGNSS.yaml

Large diffs are not rendered by default.

18,136 changes: 18,136 additions & 0 deletions tests/fixtures/vcr_cassettes/MOD02QKM.yaml

Large diffs are not rendered by default.

12,068 changes: 12,068 additions & 0 deletions tests/fixtures/vcr_cassettes/MOD02QKM_2000.yaml

Large diffs are not rendered by default.

2,088 changes: 2,088 additions & 0 deletions tests/fixtures/vcr_cassettes/TELLUS_GRAC.yaml

Large diffs are not rendered by default.

95 changes: 95 additions & 0 deletions tests/test_multiple_queries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import unittest
import json

import vcr
import urllib.request

from cmr.queries import GranuleQuery

# Shared VCR instance for every test in this module. Request headers are
# included in the match criteria (VCR does not match on them by default) so
# that replayed requests are distinguished by headers such as
# cmr-search-after, which is what the search-after tests rely on.
my_vcr = vcr.VCR(
    match_on=['method', 'scheme', 'host', 'port', 'path', 'query', 'headers'],
    decode_compressed_response=True,
    record_mode='once',
)

def assert_unique_granules_from_results(granules):
    """
    Assert that no granule title occurs more than once in ``granules``.

    When we invoke a search request multiple times we want to ensure that we
    don't get the same results back. This is a one shot test as the results
    are preserved by VCR but still useful.

    :param granules: iterable of granule mappings, each with a ``'title'`` key
    :returns: True when every title is unique (kept so callers that inspect
        the return value keep working)
    :raises AssertionError: if any title appears more than once
    """
    seen = set()
    duplicates = set()
    for granule in granules:
        title = granule['title']
        if title in seen:
            duplicates.add(title)
        seen.add(title)

    # Raise rather than return False: callers invoke this helper as a bare
    # statement, so a returned False would be silently dropped.
    if duplicates:
        raise AssertionError(
            'Duplicate granule titles in results: {}'.format(sorted(duplicates))
        )
    return True

class TestMultipleQueries(unittest.TestCase):
    """
    Replay recorded CMR granule searches through VCR cassettes and check the
    number of results, the number of recorded requests, and that the
    transient cmr-search-after header is not left on the query afterwards.
    """

    def test_get_more_than_2000(self):
        """
        If we execute a get with a limit of more than 2000
        then we expect multiple invocations of a cmr granule search and
        to not fetch back more results than we ask for
        """
        with my_vcr.use_cassette('tests/fixtures/vcr_cassettes/MOD02QKM.yaml') as cass:
            api = GranuleQuery()

            granules = api.short_name("MOD02QKM").get(3000)
            self.assertEqual(len(granules), 3000)
            # Assert all 3000 granule results have unique granule ids.
            # The helper's result must be asserted, otherwise it is discarded.
            self.assertTrue(assert_unique_granules_from_results(granules))
            # Assert that we performed two search results queries
            self.assertEqual(len(cass), 2)
            # The search-after header must not leak into later queries
            self.assertIsNone(api.headers.get('cmr-search-after'))

    def test_get(self):
        """
        If we execute a get with no arguments then we expect
        to get the maximum no. of granules from a single CMR call (2000)
        in a single request
        """
        with my_vcr.use_cassette('tests/fixtures/vcr_cassettes/MOD02QKM_2000.yaml') as cass:
            api = GranuleQuery()
            granules = api.short_name("MOD02QKM").get()
            self.assertEqual(len(granules), 2000)
            # Assert all 2000 granule results have unique granule ids
            self.assertTrue(assert_unique_granules_from_results(granules))
            # Assert that we performed one search results query
            self.assertEqual(len(cass), 1)
            self.assertIsNone(api.headers.get('cmr-search-after'))

    def test_get_all_less_than_2k(self):
        """
        If we execute a get_all on a collection with fewer than 2000
        granules then we expect one hits query followed by a single
        granule search, and to fetch back exactly the matching granules
        """
        with my_vcr.use_cassette('tests/fixtures/vcr_cassettes/TELLUS_GRAC.yaml') as cass:
            api = GranuleQuery()
            granules = api.short_name("TELLUS_GRAC_L3_JPL_RL06_LND_v04").get_all()
            self.assertEqual(len(granules), 163)
            # Assert all 163 granule results have unique granule ids
            self.assertTrue(assert_unique_granules_from_results(granules))
            # Assert that we performed a hits query and one search results query
            self.assertEqual(len(cass), 2)
            self.assertIsNone(api.headers.get('cmr-search-after'))

    def test_get_all_more_than_2k(self):
        """
        If we execute a get_all on a collection with more than 2000
        granules then we expect one hits query followed by multiple
        granule searches, and to fetch back exactly the matching granules
        """
        with my_vcr.use_cassette('tests/fixtures/vcr_cassettes/CYGNSS.yaml') as cass:
            api = GranuleQuery()
            granules = api.short_name("CYGNSS_NOAA_L2_SWSP_25KM_V1.2").get_all()
            self.assertEqual(len(granules), 2285)
            # Assert all 2285 granule results have unique granule ids
            self.assertTrue(assert_unique_granules_from_results(granules))
            # Assert that we performed a hits query and two search results queries
            self.assertEqual(len(cass), 3)
            self.assertIsNone(api.headers.get('cmr-search-after'))