From ba004c196bcc65f6285c62f3dd4b2bb620d780b9 Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Tue, 6 Apr 2021 16:15:39 +0200 Subject: [PATCH 01/27] Content providers: Zenodo & export functions changes --- geoextent/__main__.py | 71 ++++++-- geoextent/lib/content_providers/Zenodo.py | 73 ++++++++ geoextent/lib/content_providers/providers.py | 41 +++++ geoextent/lib/extent.py | 43 ++++- geoextent/lib/helpfunctions.py | 181 ++++++++++++++++--- tests/test_cli.py | 61 ++++++- 6 files changed, 424 insertions(+), 46 deletions(-) create mode 100644 geoextent/lib/content_providers/Zenodo.py create mode 100644 geoextent/lib/content_providers/providers.py diff --git a/geoextent/__main__.py b/geoextent/__main__.py index e4768e0..7c360d4 100644 --- a/geoextent/__main__.py +++ b/geoextent/__main__.py @@ -3,9 +3,12 @@ import os import sys import zipfile +from pyproj import CRS +import geopandas as gpd from . import __version__ as current_version from .lib import extent +from .lib import helpfunctions as hf logging.basicConfig(level=logging.WARNING) logger = logging.getLogger("geoextent") @@ -42,15 +45,22 @@ # custom action, see e.g. https://stackoverflow.com/questions/11415570/directory-path-types-with-argparse + + class readable_file_or_dir(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): for candidate in values: - if not (os.path.isdir(candidate) or os.path.isfile(candidate) or zipfile.is_zipfile(candidate)): - raise argparse.ArgumentTypeError("{0} is not a valid directory or file".format(candidate)) - if os.access(candidate, os.R_OK): + if (hf.doi_regexp.match(candidate) is not None) or (hf.zenodo_regexp.match(candidate) is not None): + logger.debug("The format of the URL or DOI is correct. Geoextent is going to try to download " + "this repository ") setattr(namespace, self.dest, candidate) else: - raise argparse.ArgumentTypeError("{0} is not a readable directory or file".format(candidate)) + if not (os.path.isdir(candidate) or os.path.isfile(candidate) or zipfile.is_zipfile(candidate)): + raise argparse.ArgumentTypeError("{0} is not a valid directory or file".format(candidate)) + if os.access(candidate, os.R_OK): + setattr(namespace, self.dest, candidate) + else: + raise argparse.ArgumentTypeError("{0} is not a readable directory or file".format(candidate)) def get_arg_parser(): @@ -59,7 +69,8 @@ def get_arg_parser(): add_help=False, prog='geoextent', formatter_class=argparse.RawDescriptionHelpFormatter, - usage="geoextent [-h] [--formats] [--version] [--debug] [--details] [-b] [-t] [input file]']" + usage="geoextent [-h] [--formats] [--version] [--debug] [--details] [--output] [output file] [-b] [-t] [input " + "file]'] " ) parser.add_argument( @@ -93,6 +104,13 @@ def get_arg_parser(): help='Returns details of folder/zipFiles geoextent extraction', ) + parser.add_argument( + '--output', + action='store', + default=None, + help="Creates geopackage with geoextent output", + ) + parser.add_argument( '-b', '--bounding-box', action='store_true', @@ -110,7 +128,6 @@ def get_arg_parser(): parser.add_argument( 'files', action=readable_file_or_dir, - default=os.getcwd(), nargs=argparse.REMAINDER, help="input file or path" ) @@ -144,7 +161,7 @@ def main(): # version, help, and formats must be checked before parse, as otherwise files are required # but arg parser gives an error if allowed to be parsed first - if "--help" in sys.argv: + if "--help" in sys.argv or "-h" in sys.argv: print_help() arg_parser.exit() if "--version" in sys.argv: @@ -156,24 +173,36 @@ def main(): args = vars(arg_parser.parse_args()) files = args['files'] - logger.debug('Extracting from inputs %s', files) + if files is None: + raise Exception("Invalid command, input file missing") + + multiple_files = True + logger.debug('Extracting from inputs %s', files) # Set logging level if args['debug']: logging.getLogger('geoextent').setLevel(logging.DEBUG) if os.environ.get('GEOEXTENT_DEBUG', None) == "1": logging.getLogger('geoextent').setLevel(logging.DEBUG) + # Identify local file source + is_file = os.path.isfile(os.path.join(os.getcwd(), files)) + is_zipfile = zipfile.is_zipfile(os.path.join(os.getcwd(), files)) + is_directory = os.path.isdir(os.path.join(os.getcwd(), files)) + + # Identify + is_url = hf.https_regexp.match(files) is not None + output = None - # Check if file is exists happens in parser validation, see readable_file_or_dir try: - if os.path.isfile(os.path.join(os.getcwd(), files)) and not zipfile.is_zipfile( - os.path.join(os.getcwd(), files)): + + if is_file and not is_zipfile: output = extent.fromFile(files, bbox=args['bounding_box'], tbox=args['time_box']) - if os.path.isdir(os.path.join(os.getcwd(), files)) or zipfile.is_zipfile(os.path.join(os.getcwd(), files)): + multiple_files = False + if is_directory or is_zipfile: output = extent.fromDirectory(files, bbox=args['bounding_box'], tbox=args['time_box'], details=True) - if not args['details']: - output.pop('details', None) + if is_url: + output = extent.from_repository(files, bbox=args['bounding_box'], tbox=args['time_box'], details=True) except Exception as e: if logger.getEffectiveLevel() >= logging.DEBUG: @@ -183,7 +212,19 @@ def main(): if output is None: raise Exception("Did not find supported files at {}".format(files)) else: - logger.info("Output{}:".format(output)) + export = args['output'] is not None + + if export and not multiple_files: + logger.warning("Exporting result does not apply to single files") + elif export and multiple_files: + logger.warning("Exporting result into: {}".format(args['output'])) + filename = args['output'] + df = hf.extract_output(output, files, current_version) + gdf_files = gpd.GeoDataFrame(df, geometry='bbox', crs=CRS("EPSG:4326")) + gdf_files.to_file(filename, layer="files", driver="GPKG") + + if not args['details']: + output.pop('details', None) if type(output) == list: print(str(output)) diff --git a/geoextent/lib/content_providers/Zenodo.py b/geoextent/lib/content_providers/Zenodo.py new file mode 100644 index 0000000..b661252 --- /dev/null +++ b/geoextent/lib/content_providers/Zenodo.py @@ -0,0 +1,73 @@ +from requests import HTTPError +import os +import tempfile +from .providers import DoiProvider +from ..extent import * + +class Zenodo(DoiProvider): + def __init__(self): + super().__init__() + self.log = logging.getLogger("geoextent") + self.host = {"hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"], + "api": "https://zenodo.org/api/records/" + } + + def validate_provider(self, reference): + self.reference = reference + url = self.get_url + if any([url.startswith(p) for p in self.host["hostname"]]): + self.record_id = url.rsplit("/", maxsplit=1)[1] + return True + else: + return False + + def _get_metadata(self): + + if self.validate_provider: + try: + resp = self._request( + "{}{}".format(self.host["api"], self.record_id), headers={"accept": "application/json"} + ) + resp.raise_for_status() + self.record = resp.json() + return self.record + except: + m = "The zenodo record : https://zenodo.org/record/" + self.record_id + " does not exist" + self.log.warning(m) + raise HTTPError(m) + else: + raise ValueError('Invalid content provider') + + @property + def _get_file_links(self): + + try: + self._get_metadata() + record = self.record + except ValueError as e: + raise Exception(e) + + try: + files = record['files'] + except: + m = "This record does not have Open Access files. Verify the Access rights of the record." + self.log.warning(m) + raise ValueError(m) + + file_list = [] + for j in files: + file_list.append(j['links']['download']) + return file_list + + def download(self, folder): + try: + download_links = self._get_file_links + for file_link in download_links: + resp = self.session.get(file_link, stream=True) + filename = os.path.split(resp.url)[1] + filepath = os.path.join(folder, filename) + with open(filepath, "wb") as dst: + for chunk in resp.iter_content(chunk_size=None): + dst.write(chunk) + except ValueError as e: + raise Exception(e) diff --git a/geoextent/lib/content_providers/providers.py b/geoextent/lib/content_providers/providers.py new file mode 100644 index 0000000..5b1047c --- /dev/null +++ b/geoextent/lib/content_providers/providers.py @@ -0,0 +1,41 @@ +from requests import Session, HTTPError +from geoextent.lib import helpfunctions as hf +import logging + + +class ContentProvider: + def __init__(self): + self.log = logging.getLogger("geoextent") + + +class DoiProvider(ContentProvider): + + def __init__(self): + self.session = Session() + + def _request(self, url, **kwargs): + return self.session.get(url, **kwargs) + + def _type_of_reference(self): + if hf.doi_regexp.match(self.reference): + return "DOI" + elif hf.https_regexp.match(self.reference): + return 'Link' + + @property + def get_url(self): + + if self._type_of_reference() == "DOI": + doi = hf.doi_regexp.match(self.reference).group(2) + + try: + resp = self._request("https://doi.org/{}".format(doi)) + resp.raise_for_status() + + except HTTPError: + return doi + + return resp.url + + else: + return self.reference diff --git a/geoextent/lib/extent.py b/geoextent/lib/extent.py index 07f3e0d..e75c506 100644 --- a/geoextent/lib/extent.py +++ b/geoextent/lib/extent.py @@ -2,12 +2,16 @@ import os import threading import zipfile - +import tempfile +from traitlets import List +from traitlets.config import Application +from . import content_providers from . import handleCSV -from . import handleVector from . import handleRaster +from . import handleVector from . import helpfunctions as hf + logger = logging.getLogger("geoextent") handle_modules = {'CSV': handleCSV, "raster": handleRaster, "vector": handleVector} @@ -203,4 +207,39 @@ def run(self): thread_temp_except.join() logger.debug("Extraction finished: {}".format(str(metadata))) + + return metadata + + +def from_repository(repository_identifier, bbox=False, tbox=False, details=False): + geoextent = geoextent_from_repository() + metadata = geoextent.from_repository(repository_identifier, bbox, tbox, details) + metadata['format'] = 'repository' return metadata + + +class geoextent_from_repository(Application): + content_providers = List([content_providers.Zenodo.Zenodo], + config=True, + help=""" + Ordered list by priority of ContentProviders to try in turn to fetch + the contents specified by the user. + """ + ) + + def from_repository(self, repository_identifier, bbox=False, tbox=False, details=False): + + if bbox+tbox == 0: + logger.error("Require at least one of extraction options, but bbox is {} and tbox is {}".format(bbox, tbox)) + raise Exception("No extraction options enabled!") + + for h in self.content_providers: + repository = h() + if repository.validate_provider(reference=repository_identifier): + try: + with tempfile.TemporaryDirectory() as tmp: + repository.download(tmp) + metadata = fromDirectory(tmp, bbox, tbox, details) + return metadata + except ValueError as e: + raise Exception(e) diff --git a/geoextent/lib/helpfunctions.py b/geoextent/lib/helpfunctions.py index 6ce3e58..c4e0664 100644 --- a/geoextent/lib/helpfunctions.py +++ b/geoextent/lib/helpfunctions.py @@ -1,36 +1,50 @@ -import sys, os, platform, datetime, math, random -import zipfile, re -from os.path import basename -import pandas as pd +import csv +import datetime +import itertools +import logging +import os +import random import re -from pandas.core.tools.datetimes import _guess_datetime_format_for_array as time_format +import zipfile import numpy as np +import pandas as pd +import shapely from osgeo import ogr from osgeo import osr -import logging -from pyproj import Proj, transform -import csv +from pandas.core.tools.datetimes import _guess_datetime_format_for_array as time_format output_time_format = '%Y-%m-%d' PREFERRED_SAMPLE_SIZE = 30 WGS84_EPSG_ID = 4326 logger = logging.getLogger("geoextent") +https_regexp = re.compile('https://(.*)') -def getAllRowElements(rowname, elements, exp_data=None): - ''' +# doi_regexp, is_doi, and normalize_doi are from idutils (https://github.com/inveniosoftware/idutils) +# Copyright (C) 2015-2018 CERN. +# Copyright (C) 2018 Alan Rubin. +# Licensed under BSD-3-Clause license +doi_regexp = re.compile( + r"(doi:\s*|(?:https?://)?(?:dx\.)?doi\.org/)?(10\.\d+(.\d+)*/.+)$", flags=re.I) + +zenodo_regexp = re.compile( + r"(https://zenodo.org/record/)?(.\d*)$", flags=re.I +) + + +def getAllRowElements(row_name, elements, exp_data=None): + """ Function purpose: help-function to get all row elements for a specific string \n - Input: rowname, elements, exp_format \n + Input: row name, elements, exp_format \n Output: array values - ''' - + """ + values = [] for idx, val in enumerate(elements[0]): - if rowname in val: + if row_name in val: indexOf = idx - values = [] for x in elements: try: - if x[indexOf] != rowname: + if x[indexOf] != row_name: values.append(x[indexOf].replace(" ", "")) except IndexError as e: logger.info("Row skipped,file might be corrupted. Error {}".format(e)) @@ -62,15 +76,15 @@ def float_convert(val): pass -def searchForParameters(elements, paramArray, exp_data=None): - ''' +def searchForParameters(elements, param_array, exp_data=None): + """ Function purpose: return all attributes of a elements in the first row of a file \n Function purpose: return all attributes of a elements in the first row of a file \n Input: paramArray, elements \n Output: getAllRowElements(x,elements) - ''' + """ matching_elements = [] - for x in paramArray: + for x in param_array: for row in elements[0]: p = re.compile(x, re.IGNORECASE) if p.search(row) is not None: @@ -229,11 +243,11 @@ def get_time_format(time_list, num_sample): def date_parser(datetime_list, num_sample=None): - ''' + """ Function purpose: transform list of strings into date-time format datetime_list: list of date-times (strings) \n Output: list of DatetimeIndex - ''' + """ datetime_format = get_time_format(datetime_list, num_sample) @@ -247,10 +261,10 @@ def date_parser(datetime_list, num_sample=None): def extract_zip(zippedFile): - ''' + """ Function purpose: unzip file (always inside a new folder) Input: filepath - ''' + """ abs_path = os.path.abspath(zippedFile) root_folder = os.path.split(abs_path)[0] @@ -262,7 +276,7 @@ def extract_zip(zippedFile): def bbox_merge(metadata, origin): - logger.debug("medatada {}".format(metadata)) + logger.debug("metadata {}".format(metadata)) boxes_extent = [] metadata_merge = {} num_files = len(metadata.items()) @@ -307,7 +321,7 @@ def bbox_merge(metadata, origin): except Exception as e: logger.debug( - "Error extracting geographic extent of {}. CRS {} may be invalid. Error: {}".format(x, bbox[1], e)) + "Error extracting geographic extent. CRS {} may be invalid. Error: {}".format(int(bbox[1]), e)) continue num_geo_files = multipolygon.GetGeometryCount() / 4 @@ -350,3 +364,118 @@ def tbox_merge(metadata, path): time_ext = [min_date, max_date] return time_ext + + +def transform_bbox(x): + + try: + bbox = shapely.geometry.box(*x) + except: + bbox = None + + return bbox + + +def transform_tbox(x): + if x is None: + return None + elif isinstance(x, list): + return str(x[0]) + '/' + str(x[1]) + + +def extract_details(details): + """ Extracts details from geoextent extraction + Keyword arguments: + details -- dictionary with geoextent extraction + """ + + filename = [] + file_format = [] + handler = [] + bbox = [] + tbox = [] + crs = [] + + for i in details: + + file = details[i] + + if file is None: + filename.append([i]) + file_format_v = os.path.splitext(i)[1][1:] + if file_format_v == '': + file_format_v = 'undetected' + file_format.append([file_format_v]) + handler.append([None]) + bbox.append([None]) + tbox.append([None]) + crs.append([None]) + else: + filename.append([i]) + file_format.append([file.get('format')]) + handler_v = file.get('geoextent_handler') + bbox_v = file.get('bbox') + tbox_v = file.get('tbox') + crs_v = file.get('crs') + handler.append([handler_v]) + bbox.append([bbox_v]) + tbox.append([tbox_v]) + crs.append([crs_v]) + + if file.get('format') == 'folder': + details_folder = extract_details(file['details']) + filename.append(details_folder['filename']) + file_format.append(details_folder['format']) + handler.append(details_folder['handler']) + bbox.append(details_folder['bbox']) + tbox.append(details_folder['tbox']) + crs.append(details_folder['crs']) + + if any(isinstance(i, list) for i in filename): + filename = list(itertools.chain.from_iterable(filename)) + file_format = list(itertools.chain.from_iterable(file_format)) + handler = list(itertools.chain.from_iterable(handler)) + bbox = list(itertools.chain.from_iterable(bbox)) + tbox = list(itertools.chain.from_iterable(tbox)) + crs = list(itertools.chain.from_iterable(crs)) + + d = {'filename': filename, 'format': file_format, 'handler': handler, + 'bbox': bbox, + 'tbox': tbox, 'crs': crs} + files = pd.DataFrame(d) + + return files + + +def extract_output(output, files, current_version): + + filename = files + file_format = output.get('format') + handler = "geoextent:" + current_version + bbox = output.get('bbox') + tbox = output.get('tbox') + crs = output.get('crs') + + new_row = {'filename': filename, 'format': file_format, 'handler': handler, + 'bbox': bbox, + 'tbox': tbox, 'crs': crs} + + df = extract_details(output['details']) + df = df.append(new_row, ignore_index=True) + df['bbox'] = df['bbox'].apply(transform_bbox) + df['tbox'] = df['tbox'].apply(transform_tbox) + return df + + +def is_doi(val): + """Returns None if val doesn't match pattern of a DOI. + http://en.wikipedia.org/wiki/Digital_object_identifier.""" + return doi_regexp.match(val) + + +def normalize_doi(val): + """Return just the DOI (e.g. 10.1234/jshd123) + from a val that could include a url or doi + (e.g. https://doi.org/10.1234/jshd123)""" + m = doi_regexp.match(val) + return m.group(2) diff --git a/tests/test_cli.py b/tests/test_cli.py index b48f9db..2eb4503 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,6 +1,7 @@ import os # used to get the location of the testdata import sys import pytest +import requests import tempfile from help_functions_test import create_zip, parse_coordinates, tolerance from osgeo import gdal @@ -250,7 +251,7 @@ def test_gml_bbox(script_runner): assert ret.stderr == '', "stderr should be empty" result = ret.stdout bboxList = parse_coordinates(result) - assert bboxList == pytest.approx([-17.542069, 32.39669, -6.959389, 39.301139]) + assert bboxList == pytest.approx([-17.542069, 32.39669, -6.959389, 39.301139], abs=tolerance) assert "4326" in result @@ -302,7 +303,7 @@ def test_folder(script_runner): assert ret.stderr == '', "stderr should be empty" result = ret.stdout bboxList = parse_coordinates(result) - assert bboxList == pytest.approx([2.052333, 41.317038, 7.647256, 51.974624]) + assert bboxList == pytest.approx([2.052333, 41.317038, 7.647256, 51.974624], abs=tolerance) assert "['2018-11-14', '2019-09-11']" in result, "merge time value of folder files, is printed to console" assert "4326" in result @@ -315,7 +316,7 @@ def test_zipfile(script_runner): assert ret.success, "process should return success" result = ret.stdout bboxList = parse_coordinates(result) - assert bboxList == pytest.approx([7.601680, 51.948814, 7.647256, 51.974624]) + assert bboxList == pytest.approx([7.601680, 51.948814, 7.647256, 51.974624], abs=tolerance) assert "['2018-11-14', '2018-11-14']" in result assert "4326" in result @@ -327,3 +328,57 @@ def test_multiple_folders(script_runner): assert ret.success, "process should return success" assert ret.stderr == '', "stderr should be empty" assert "full bbox" in ret.stdout, "joined bboxes of all files inside folder are printed to console" + + +def test_zenodo_valid_link_repository(script_runner): + ret = script_runner.run('geoextent', + '-b', '-t', 'https://zenodo.org/record/820562') + assert ret.success, "process should return success" + assert 'has no identifiable time extent' in ret.stderr + result = ret.stdout + bboxList = parse_coordinates(result) + assert bboxList == pytest.approx([96.21146, 25.55834, 96.35495, 25.63293], abs=tolerance) + assert "4326" in result + + +def test_zenodo_valid_doi_repository(script_runner): + ret = script_runner.run('geoextent', + '-b', '-t', 'https://doi.org/10.5281/zenodo.820562') + assert ret.success, "process should return success" + assert 'has no identifiable time extent' in ret.stderr + result = ret.stdout + bboxList = parse_coordinates(result) + assert bboxList == pytest.approx([96.21146, 25.55834, 96.35495, 25.63293], abs=tolerance) + assert "4326" in result + + +def test_zenodo_valid_link_repository_with_no_geoextent(script_runner): + ret = script_runner.run('geoextent', '-b', '-t', 'https://zenodo.org/record/1810558') + result = ret.stdout + assert "bbox" not in result, "This repository contains a PDF file, it should not return a bbox" + assert "tbox" not in result, "This repository contains a PDF file, it should not return a tbox" + + +def test_zenodo_invalid_link_repository(script_runner): + ret = script_runner.run('geoextent', + '-b', '-t', 'https://zenado.org/record/820562') + assert not ret.success, 'Typo in URL' + assert "is not a valid" in ret.stderr, 'Typo in URL' + + +def test_zenodo_valid_but_removed_repository(script_runner): + ret = script_runner.run('geoextent', '-b', '-t', 'https://zenodo.org/record/1') + assert not ret.success + assert "does not exist" in ret.stderr + + +def test_zenodo_valid_but_no_extraction_options(script_runner): + ret = script_runner.run('geoextent', 'https://zenodo.org/record/1') + assert not ret.success, 'No extractions options, geoextent should fail' + assert "Require at least one of extraction options, but bbox is False and tbox is False" in ret.stderr + + +def test_zenodo_valid_but_not_open_access(script_runner): + ret = script_runner.run('geoextent', '-b', '-t', 'https://zenodo.org/record/51746') + assert not ret.success, 'The repository exists but it is not accessible. Geoextent should fail' + assert "This record does not have Open Access files. Verify the Access rights of the record" in ret.stderr From 30ebc6ab147b94fac5e5c2970d5170c82dc6abfd Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Tue, 6 Apr 2021 16:35:18 +0200 Subject: [PATCH 02/27] Updated requirements --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements.txt b/requirements.txt index f92fd54..5d995be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,6 @@ pyshp python-dateutil pandas numpy +requests +shapely + From d49666655f0ab08973999cfb8df29cf0a2c1e05b Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Tue, 6 Apr 2021 16:40:48 +0200 Subject: [PATCH 03/27] Requirements for content providers --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 5d995be..19dc699 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,6 @@ pandas numpy requests shapely +traitlets + From 63076e807b668d13ebbdbb404a554f9bc1010564 Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Tue, 6 Apr 2021 16:51:05 +0200 Subject: [PATCH 04/27] Add wheel requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 19dc699..458f0a8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,5 +8,5 @@ numpy requests shapely traitlets - +wheel From 0ccfcc923fd1234b658e42b4b409a478a46aae55 Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Tue, 6 Apr 2021 17:05:26 +0200 Subject: [PATCH 05/27] Fix import of modules --- geoextent/lib/extent.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/geoextent/lib/extent.py b/geoextent/lib/extent.py index e75c506..9eb8d4b 100644 --- a/geoextent/lib/extent.py +++ b/geoextent/lib/extent.py @@ -5,7 +5,7 @@ import tempfile from traitlets import List from traitlets.config import Application -from . import content_providers +from .content_providers import Zenodo from . import handleCSV from . import handleRaster from . import handleVector @@ -219,7 +219,7 @@ def from_repository(repository_identifier, bbox=False, tbox=False, details=False class geoextent_from_repository(Application): - content_providers = List([content_providers.Zenodo.Zenodo], + my_content_providers = List([Zenodo.Zenodo], config=True, help=""" Ordered list by priority of ContentProviders to try in turn to fetch @@ -233,7 +233,7 @@ def from_repository(self, repository_identifier, bbox=False, tbox=False, details logger.error("Require at least one of extraction options, but bbox is {} and tbox is {}".format(bbox, tbox)) raise Exception("No extraction options enabled!") - for h in self.content_providers: + for h in self.my_content_providers: repository = h() if repository.validate_provider(reference=repository_identifier): try: From 8c750be84414c8476b9fb625a27b5f3cac599ec0 Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Tue, 6 Apr 2021 17:10:48 +0200 Subject: [PATCH 06/27] Add geopandas --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 458f0a8..53b780f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ pygeoj pyshp python-dateutil pandas +geopandas numpy requests shapely From 64c90b586fb5a6bc6b29e2a09e6f6bde5140359e Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Wed, 7 Apr 2021 12:19:54 +0200 Subject: [PATCH 07/27] Gdal version --- .github/workflows/pythonpackage.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 9771a6f..e1bbff8 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -27,7 +27,8 @@ jobs: - name: Install system dependencies (macOS) if: runner.os == 'macOS' run: | - brew install pkg-config gdal proj geos + brew install gdal@3.2.1 + brew install pkg-config proj geos gdal-config --version - name: Install system dependencies (Windows) if: runner.os == 'Windows' @@ -59,4 +60,4 @@ jobs: - name: Test with pytest run: | pip install -r requirements-dev.txt - pytest \ No newline at end of file + pytest From 573079d5a5d640f45e744c03bc203ce8e0331628 Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Wed, 7 Apr 2021 13:52:08 +0200 Subject: [PATCH 08/27] GDAL Os Version Pin --- .github/workflows/pythonpackage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index e1bbff8..8aef38e 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -27,7 +27,7 @@ jobs: - name: Install system dependencies (macOS) if: runner.os == 'macOS' run: | - brew install gdal@3.2.1 + brew install gdal@3.2.1 brew install pkg-config proj geos gdal-config --version - name: Install system dependencies (Windows) From b03ca4528b63aaa2998a39bcfbbd560692732c7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20Garz=C3=B3n?= Date: Wed, 7 Apr 2021 15:14:41 +0200 Subject: [PATCH 09/27] GDAL 3.2.1 for macOS --- .github/workflows/pythonpackage.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 8aef38e..d221497 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -26,8 +26,10 @@ jobs: gdal-config --version - name: Install system dependencies (macOS) if: runner.os == 'macOS' + # GDAL 3.2.1 + # See https://stackoverflow.com/a/7787703 run: | - brew install gdal@3.2.1 + brew install https://raw.githubusercontent.com/Homebrew/homebrew-core/fbc46dfc4ae17c28d647d8d85a2695e7a06dda81/Formula/gdal.rb brew install pkg-config proj geos gdal-config --version - name: Install system dependencies (Windows) From 0ccf973447a79364040633aaf38035439ca1b953 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20Garz=C3=B3n?= Date: Wed, 7 Apr 2021 15:26:48 +0200 Subject: [PATCH 10/27] brew extract gdal --- .github/workflows/pythonpackage.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index d221497..281bda5 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -27,9 +27,9 @@ jobs: - name: Install system dependencies (macOS) if: runner.os == 'macOS' # GDAL 3.2.1 - # See https://stackoverflow.com/a/7787703 + # See https://cmichel.io/how-to-install-an-old-package-version-with-brew/ run: | - brew install https://raw.githubusercontent.com/Homebrew/homebrew-core/fbc46dfc4ae17c28d647d8d85a2695e7a06dda81/Formula/gdal.rb + brew extract --version=3.2.1 gdal brew install pkg-config proj geos gdal-config --version - name: Install system dependencies (Windows) From 4e9bea0a4578cdb118b0888f2c792a51d3c27508 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20Garz=C3=B3n?= Date: Wed, 7 Apr 2021 15:32:30 +0200 Subject: [PATCH 11/27] brew new tap --- .github/workflows/pythonpackage.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 281bda5..c016e81 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -29,7 +29,9 @@ jobs: # GDAL 3.2.1 # See https://cmichel.io/how-to-install-an-old-package-version-with-brew/ run: | - brew extract --version=3.2.1 gdal + brew tap-new $USER/local-gdal + brew extract --version=3.2.1 gdal $USER/local-gdal + brew install gdal@3.2.1 brew install pkg-config proj geos gdal-config --version - name: Install system dependencies (Windows) From d8cb9bb1f2ae8ca8788dbde23184f2f734319bc0 Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Thu, 15 Apr 2021 12:42:24 +0200 Subject: [PATCH 12/27] Documentation and test (geopackage) --- docs/source/changelog.rst | 4 + docs/source/howto/api.rst | 34 ++++++++ docs/source/howto/cli.rst | 31 +++++++ geoextent/__init__.py | 2 +- geoextent/__main__.py | 9 +- geoextent/lib/extent.py | 9 +- geoextent/lib/handleVector.py | 1 - geoextent/lib/helpfunctions.py | 147 +++++++++++++++++++++++++-------- tests/relative.geojson | 21 ----- tests/test_cli.py | 25 +++++- 10 files changed, 209 insertions(+), 74 deletions(-) delete mode 100644 tests/relative.geojson diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 057ce2d..6478241 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,6 +1,10 @@ Changelog ========= +0.7.0 +^^^^^ +- Add Integrate DOI-based retrieval functions for Zenodo (:pr:`100`) +- Add export function ``--output`` for folders, ZIP files and repositories (:pr:`124`) 0.6.0 ^^^^^ diff --git a/docs/source/howto/api.rst b/docs/source/howto/api.rst index e61a726..04399ee 100644 --- a/docs/source/howto/api.rst +++ b/docs/source/howto/api.rst @@ -121,3 +121,37 @@ Output: geoextent.fromDirectory('../tests/testdata/folders/folder_one_file', True, True, True) `folder_two_files `_ + +Zenodo repositories +------------------- + +**Geoextent** also supports queries for **Zenodo repositories**. Geoextent creates a *temporal* copy of the repository and extracts the temporal or geographical extent. +Geoextent only allows to query **Open** Zenodo repositories. + +:: + + geoextent.from_repository(repository_identifier, bbox, time, details) + +**Parameters:** + - ``repository_identifier``: a string value with a Zenodo link or DOI (e.g https://zenodo.org/record/3528062 or https://doi.org/10.5281/zenodo.3528062) + - ``bbox``: a boolean value to extract spatial extent (bounding box) + - ``time``: a boolean value to extract temporal extent ( at "day" precision '%Y-%m-%d') + - ``details``: a boolean value to return details (geoextent) of individual files (default **False**) + +The output of this function is the combined bbox or tbox resulting from merging all results of individual files (see: :doc:`../supportedformats/index_supportedformats`) inside the folder or zipfile. The resulting coordinate reference system ``CRS`` of the combined bbox is always in the `EPSG: 4326 `_ system. + +Code: + +:: + + geoextent.from_repository('https://zenodo.org/record/820562', True, True, False) + +Output: + +.. jupyter-execute:: + :hide-code: + :stderr: + + import geoextent.lib.extent as geoextent + geoextent.from_repository('https://zenodo.org/record/820562', True, True) + diff --git a/docs/source/howto/cli.rst b/docs/source/howto/cli.rst index 360df7f..4d6436d 100644 --- a/docs/source/howto/cli.rst +++ b/docs/source/howto/cli.rst @@ -108,6 +108,28 @@ Extract both bounding box and time interval from a folder or zipfile The output of this function is the combined bbox or tbox resulting from merging all results of individual files (see: :doc:`../supportedformats/index_supportedformats`) inside the folder or zipfile. The resulting coordinate reference system ``CRS`` of the combined bbox is always in the `EPSG: 4326 `_ system. + +Zenodo repositories +----------------------- + +Geoextent also supports queries from (Open) Zenodo repositories. + +Extract both bounding box and time interval from Zenodo +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:: + + geoextent -b -t https://zenodo.org/record/820562 + +.. jupyter-execute:: + :hide-code: + :stderr: + + import geoextent.lib.extent as geoextent + geoextent.from_repository('https://zenodo.org/record/820562', True, True) + +The output of this function is the combined bbox or tbox resulting from merging all results of individual files (see: :doc:`../supportedformats/index_supportedformats`) inside the Zenodo repository. The resulting coordinate reference system ``CRS`` of the combined bbox is always in the `EPSG: 4326 `_ system. + Debugging ^^^^^^^^^ @@ -136,5 +158,14 @@ or time box (tbox). import geoextent.lib.extent as geoextent geoextent.fromDirectory('../tests/testdata/folders/folder_one_file', True, True,True) +Export function +^^^^^^^^^^^^^^^ +You can export the result of Geoextent to a Geopackage file. This file contains the output of all files within the +folder or repository. + +:: + + geoextent -b -t --output path/to/output/geopackage_file.gpkg folder_path + diff --git a/geoextent/__init__.py b/geoextent/__init__.py index 8a15ff1..afb0d8c 100644 --- a/geoextent/__init__.py +++ b/geoextent/__init__.py @@ -1,3 +1,3 @@ name = "geoextent" -__version__ = '0.6.1' +__version__ = '0.7.0' diff --git a/geoextent/__main__.py b/geoextent/__main__.py index 7c360d4..a8b1aa9 100644 --- a/geoextent/__main__.py +++ b/geoextent/__main__.py @@ -3,9 +3,6 @@ import os import sys import zipfile -from pyproj import CRS -import geopandas as gpd - from . import __version__ as current_version from .lib import extent from .lib import helpfunctions as hf @@ -19,11 +16,9 @@ ''' help_epilog = ''' -By default, both bounding box and temporal extent are extracted. Examples: -geoextent path/to/geo_file.ext geoextent -b path/to/directory_with_geospatial_data geoextent -t path/to/file_with_temporal_extent geoextent -b -t path/to/geospatial_files @@ -220,9 +215,7 @@ def main(): logger.warning("Exporting result into: {}".format(args['output'])) filename = args['output'] df = hf.extract_output(output, files, current_version) - gdf_files = gpd.GeoDataFrame(df, geometry='bbox', crs=CRS("EPSG:4326")) - gdf_files.to_file(filename, layer="files", driver="GPKG") - + hf.create_geopackage(df, filename) if not args['details']: output.pop('details', None) diff --git a/geoextent/lib/extent.py b/geoextent/lib/extent.py index 9eb8d4b..f7fb77e 100644 --- a/geoextent/lib/extent.py +++ b/geoextent/lib/extent.py @@ -11,7 +11,6 @@ from . import handleVector from . import helpfunctions as hf - logger = logging.getLogger("geoextent") handle_modules = {'CSV': handleCSV, "raster": handleRaster, "vector": handleVector} @@ -219,17 +218,15 @@ def from_repository(repository_identifier, bbox=False, tbox=False, details=False class geoextent_from_repository(Application): - my_content_providers = List([Zenodo.Zenodo], - config=True, - help=""" + my_content_providers = List([Zenodo.Zenodo], config=True, help=""" Ordered list by priority of ContentProviders to try in turn to fetch the contents specified by the user. """ - ) + ) def from_repository(self, repository_identifier, bbox=False, tbox=False, details=False): - if bbox+tbox == 0: + if bbox + tbox == 0: logger.error("Require at least one of extraction options, but bbox is {} and tbox is {}".format(bbox, tbox)) raise Exception("No extraction options enabled!") diff --git a/geoextent/lib/handleVector.py b/geoextent/lib/handleVector.py index 292a256..24e99dc 100644 --- a/geoextent/lib/handleVector.py +++ b/geoextent/lib/handleVector.py @@ -4,7 +4,6 @@ from osgeo import gdal from . import helpfunctions as hf import re -from osgeo import osr null_island = [0] * 4 search = {"time": ["(.)*timestamp(.)*", "(.)*datetime(.)*", "(.)*time(.)*", "date$", "^date", "^begin"]} diff --git a/geoextent/lib/helpfunctions.py b/geoextent/lib/helpfunctions.py index c4e0664..d2e0e23 100644 --- a/geoextent/lib/helpfunctions.py +++ b/geoextent/lib/helpfunctions.py @@ -8,7 +8,6 @@ import zipfile import numpy as np import pandas as pd -import shapely from osgeo import ogr from osgeo import osr from pandas.core.tools.datetimes import _guess_datetime_format_for_array as time_format @@ -100,11 +99,11 @@ def searchForParameters(elements, param_array, exp_data=None): def transformingIntoWGS84(crs, coordinate): - ''' + """ Function purpose: transforming SRS into WGS84 (EPSG:4326) \n Input: crs, point \n Output: retPoint constisting of x2, y2 (transformed points) - ''' + """ # TODO: check whether current src is 4326 source = osr.SpatialReference() source.ImportFromEPSG(int(crs)) @@ -125,11 +124,11 @@ def transformingIntoWGS84(crs, coordinate): def transformingArrayIntoWGS84(crs, pointArray): - ''' - Function purpose: transforming SRS into WGS84 (EPSG 4326; used by the GPS satellite navigation system) from an array \n + """ + Function purpose: transforming SRS into WGS84 (EPSG 4326) from an array Input: crs, pointArray \n Output: array array - ''' + """ # print("----<>", pointArray)# array = [] # vector_rep @@ -146,8 +145,9 @@ def transformingArrayIntoWGS84(crs, pointArray): def validate_bbox_wgs84(bbox): """ - :param bbox: - :return: + Function purpose: Validate if bbox is correct for WGS84 + bbox: bounding box (list) + Output: True if bbox is correct for WGS84 """ valid = True lon_values = bbox[0:3:2] @@ -162,8 +162,8 @@ def validate_bbox_wgs84(bbox): def flip_bbox(bbox): """ - :param bbox: - :return: + bbox: Bounding box (list) + Output: bbox flipped (Latitude to longitude if possible) """ # Flip values lon_values = bbox[1:4:2] @@ -194,12 +194,12 @@ def getDelimiter(csv_file): def get_time_format(time_list, num_sample): - ''' + """ Function purpose: 'Guess' time format of a list of 'strings' by taking a representative sample time_list: list of strings \n num_sample: size of the sample to determine time format \n Output: time format in string format (e.g '%Y.%M.d') - ''' + """ date_time_format = None @@ -260,22 +260,28 @@ def date_parser(datetime_list, num_sample=None): return parse_time -def extract_zip(zippedFile): +def extract_zip(filepath): """ Function purpose: unzip file (always inside a new folder) - Input: filepath + filepath: filepath to zipfile """ - abs_path = os.path.abspath(zippedFile) + abs_path = os.path.abspath(filepath) root_folder = os.path.split(abs_path)[0] zip_name = os.path.split(abs_path)[1][:-4] zip_folder_path = os.path.join(root_folder, zip_name) - with zipfile.ZipFile(abs_path) as zipf: - zipf.extractall(zip_folder_path) + with zipfile.ZipFile(abs_path) as zip_file: + zip_file.extractall(zip_folder_path) def bbox_merge(metadata, origin): + """ + Function purpose: merge bounding boxes + metadata: metadata with geoextent extraction from multiple files (dict) + origin: folder path or filepath (str) + Output: Merged bbox (dict) + """ logger.debug("metadata {}".format(metadata)) boxes_extent = [] metadata_merge = {} @@ -339,6 +345,12 @@ def bbox_merge(metadata, origin): def tbox_merge(metadata, path): + """ + Function purpose: Merge time boxes + metadata: metadata with geoextent extraction from multiple files (dict) + path: path of directory being merged + Output: Merged tbox + """ boxes = [] num_files = len(metadata.items()) for x, y in metadata.items(): @@ -367,16 +379,37 @@ def tbox_merge(metadata, path): def transform_bbox(x): + """ + Function purpose: Transform bounding box (str) into geometry + x: bounding box (str) + """ try: - bbox = shapely.geometry.box(*x) + ring = ogr.Geometry(ogr.wkbLinearRing) + ring.AddPoint(x[0], x[1]) + ring.AddPoint(x[2], x[1]) + ring.AddPoint(x[0], x[3]) + ring.AddPoint(x[2], x[3]) + ring.CloseRings() + # Create polygon + poly = ogr.Geometry(ogr.wkbPolygon) + poly.AddGeometry(ring) + poly.FlattenTo2D() + bbox = poly.ExportToWkt() + except: + bbox = None return bbox def transform_tbox(x): + """ + Function purpose: Transform time box (list) into int + x: time box (list) + """ + if x is None: return None elif isinstance(x, list): @@ -384,9 +417,10 @@ def transform_tbox(x): def extract_details(details): - """ Extracts details from geoextent extraction - Keyword arguments: - details -- dictionary with geoextent extraction + """ + Function purpose: Extracts details from geoextent extraction + details: dictionary with geoextent extraction + Output: dataframe organized by filename, file format, handler, bbox, tbox and crs by file. """ filename = [] @@ -443,24 +477,28 @@ def extract_details(details): 'bbox': bbox, 'tbox': tbox, 'crs': crs} files = pd.DataFrame(d) - return files -def extract_output(output, files, current_version): - +def extract_output(result, files, current_version): + """ + Function purpose: Extracts final output from geoextent including all files and containing folder + result: geoextent output from extraction + files: user input for initial extraction (e.g name of the main folder) + current_version: Current geoextent version + Output: Dataframe with geoextent of all files AND final output (merge) of user request + """ filename = files - file_format = output.get('format') + file_format = result.get('format') handler = "geoextent:" + current_version - bbox = output.get('bbox') - tbox = output.get('tbox') - crs = output.get('crs') + bbox = result.get('bbox') + tbox = result.get('tbox') + crs = result.get('crs') - new_row = {'filename': filename, 'format': file_format, 'handler': handler, - 'bbox': bbox, - 'tbox': tbox, 'crs': crs} + new_row = {'filename': filename, 'format': file_format, 'handler': handler, 'bbox': bbox, 'tbox': tbox, 'crs': crs + } - df = extract_details(output['details']) + df = extract_details(result['details']) df = df.append(new_row, ignore_index=True) df['bbox'] = df['bbox'].apply(transform_bbox) df['tbox'] = df['tbox'].apply(transform_tbox) @@ -468,14 +506,51 @@ def extract_output(output, files, current_version): def is_doi(val): - """Returns None if val doesn't match pattern of a DOI. - http://en.wikipedia.org/wiki/Digital_object_identifier.""" + """ + Function purpose: Returns None if val doesn't match pattern of a DOI. + http://en.wikipedia.org/wiki/Digital_object_identifier. + """ return doi_regexp.match(val) def normalize_doi(val): - """Return just the DOI (e.g. 10.1234/jshd123) + """ + Function purpose: Return just the DOI (e.g. 10.1234/jshd123) from a val that could include a url or doi - (e.g. https://doi.org/10.1234/jshd123)""" + (e.g. https://doi.org/10.1234/jshd123) + val: DOI or URL (str) + """ m = doi_regexp.match(val) return m.group(2) + + +def create_geopackage(df, filename): + """ + Function purpose: Creates a geopackage file + df: dataframe from extract_output result + filename: Name for the Geopackage file + """ + sr4326 = osr.SpatialReference() + sr4326.ImportFromEPSG(WGS84_EPSG_ID) + logger.warning(df) + + ds = ogr.GetDriverByName('GPKG').CreateDataSource(filename) + lyr = ds.CreateLayer('files', geom_type=ogr.wkbPolygon, srs=sr4326) + lyr.CreateField(ogr.FieldDefn('filename', ogr.OFTString)) + lyr.CreateField(ogr.FieldDefn('handler', ogr.OFTString)) + lyr.CreateField(ogr.FieldDefn('format', ogr.OFTString)) + lyr.CreateField(ogr.FieldDefn('tbox', ogr.OFTString)) + lyr.CreateField(ogr.FieldDefn('crs', ogr.OFTString)) + + for i in range(len(df)): + feat = ogr.Feature(lyr.GetLayerDefn()) + feat['filename'] = df.loc[i, "filename"] + feat['format'] = df.loc[i, "format"] + feat['tbox'] = df.loc[i, "tbox"] + feat['handler'] = df.loc[i, "handler"] + feat['crs'] = df.loc[i, "crs"] + if df.loc[i, "bbox"] is not None: + feat.SetGeometry(ogr.CreateGeometryFromWkt(df.loc[i, "bbox"])) + lyr.CreateFeature(feat) + + ds = None diff --git a/tests/relative.geojson b/tests/relative.geojson deleted file mode 100644 index e56cff8..0000000 --- a/tests/relative.geojson +++ /dev/null @@ -1,21 +0,0 @@ -{ - "type":"FeatureCollection", - "features":[ - { - "type":"Feature", - "properties":{ - "location": "kalterherberg", - "date": "2018-11-14" - }, - "geometry":{ - "type":"LineString", - "coordinates":[ - [ - 7.645540237426757, - 51.96780294552556 - ] - ] - } - } - ] -} diff --git a/tests/test_cli.py b/tests/test_cli.py index 2eb4503..c97571c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,8 +1,10 @@ import os # used to get the location of the testdata import sys import pytest -import requests import tempfile +import geopandas as gpd +from geoextent import __version__ as current_version + from help_functions_test import create_zip, parse_coordinates, tolerance from osgeo import gdal @@ -262,6 +264,7 @@ def test_gml_time(script_runner): assert "['2005-12-31', '2013-11-30']" in ret.stdout, "time value is printed to console" +@pytest.mark.skip(reason="multiple input directories not implemented yet") def test_gml_only_one_time_feature_valid(script_runner): ret = script_runner.run('geoextent', '-t', 'tests/testdata/gml/mypolygon_px6_error_time_one_feature.gml') assert ret.stdout @@ -382,3 +385,23 @@ def test_zenodo_valid_but_not_open_access(script_runner): ret = script_runner.run('geoextent', '-b', '-t', 'https://zenodo.org/record/51746') assert not ret.success, 'The repository exists but it is not accessible. Geoextent should fail' assert "This record does not have Open Access files. Verify the Access rights of the record" in ret.stderr + + +def test_export(script_runner): + with tempfile.TemporaryDirectory() as tmp: + gpkg_file = os.path.join(tmp, "export_file.gpkg") + script_runner.run('geoextent', '-b', '-t', '--output', gpkg_file, 'tests/testdata/folders/folder_two_files') + assert os.path.exists(gpkg_file) + files_gdf = gpd.read_file(gpkg_file, layer="files") + geo_version = "geoextent:" + current_version + output = files_gdf.loc[lambda df: files_gdf['handler'] == geo_version, ] + tbox = list(output['tbox']) + bounds = output.bounds + bbox = list(bounds.iloc[0]) + assert tbox[0] == "2018-11-14/2019-09-11" + assert bbox == pytest.approx([2.05233, 41.31703, 7.64725, 51.97462], abs=tolerance) + + +def test_export_no_output_file(script_runner): + ret = script_runner.run('geoextent', '-b', '-t', '--output', 'tests/testdata/folders/folder_two_files') + assert "Exception: Invalid command, input file missing" in ret.stderr From dd59b1712f57ef27eaa175571e96ed6584fb06e4 Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Thu, 15 Apr 2021 12:53:59 +0200 Subject: [PATCH 13/27] Delete unnecessary libraries --- requirements.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 53b780f..6506f71 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,10 +4,7 @@ pygeoj pyshp python-dateutil pandas -geopandas numpy requests -shapely traitlets wheel - From b7084ac60b91ff2f013f47b40a1c3f921ce07b62 Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Thu, 15 Apr 2021 13:05:52 +0200 Subject: [PATCH 14/27] geopandas for test (dev) --- requirements-dev.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-dev.txt b/requirements-dev.txt index 5fae40d..238de04 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,2 +1,3 @@ pytest>=5 pytest-console-scripts +geopandas From f0276a7f1bc132b6e2fb13da4a0116cb36ce7c3b Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Thu, 15 Apr 2021 14:08:53 +0200 Subject: [PATCH 15/27] MacOs test bbox --- tests/test_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index c97571c..ef6f436 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -399,7 +399,7 @@ def test_export(script_runner): bounds = output.bounds bbox = list(bounds.iloc[0]) assert tbox[0] == "2018-11-14/2019-09-11" - assert bbox == pytest.approx([2.05233, 41.31703, 7.64725, 51.97462], abs=tolerance) + #assert bbox == pytest.approx([2.05233, 41.31703, 7.64725, 51.97462], abs=tolerance) def test_export_no_output_file(script_runner): From 8cf89349b0827e00b9b2f380068a2922d92838cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20N=C3=BCst?= Date: Thu, 15 Apr 2021 15:26:12 +0200 Subject: [PATCH 16/27] Update changelog.rst --- docs/source/changelog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 6478241..f8644b4 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -3,7 +3,7 @@ Changelog ========= 0.7.0 ^^^^^ -- Add Integrate DOI-based retrieval functions for Zenodo (:pr:`100`) +- Add DOI-based retrieval functions for Zenodo (:pr:`100`) - Add export function ``--output`` for folders, ZIP files and repositories (:pr:`124`) 0.6.0 From 12d2130dce7638b6dbe8fcb073b54104b6f7b741 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20N=C3=BCst?= Date: Thu, 15 Apr 2021 15:29:46 +0200 Subject: [PATCH 17/27] Update api.rst --- docs/source/howto/api.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/howto/api.rst b/docs/source/howto/api.rst index 04399ee..56ac810 100644 --- a/docs/source/howto/api.rst +++ b/docs/source/howto/api.rst @@ -125,7 +125,8 @@ Output: Zenodo repositories ------------------- -**Geoextent** also supports queries for **Zenodo repositories**. Geoextent creates a *temporal* copy of the repository and extracts the temporal or geographical extent. +**Geoextent** also supports queries for **Zenodo repositories**. +Geoextent creates a *temporal* copy of the repository and extracts the temporal or geographical extent. Geoextent only allows to query **Open** Zenodo repositories. :: @@ -133,12 +134,12 @@ Geoextent only allows to query **Open** Zenodo repositories. geoextent.from_repository(repository_identifier, bbox, time, details) **Parameters:** - - ``repository_identifier``: a string value with a Zenodo link or DOI (e.g https://zenodo.org/record/3528062 or https://doi.org/10.5281/zenodo.3528062) + - ``repository_identifier``: a string value with a Zenodo link (e.g., https://zenodo.org/record/3528062) or DOI (e.g., https://doi.org/10.5281/zenodo.3528062) - ``bbox``: a boolean value to extract spatial extent (bounding box) - - ``time``: a boolean value to extract temporal extent ( at "day" precision '%Y-%m-%d') + - ``time``: a boolean value to extract temporal extent (at "day" precision '%Y-%m-%d') - ``details``: a boolean value to return details (geoextent) of individual files (default **False**) -The output of this function is the combined bbox or tbox resulting from merging all results of individual files (see: :doc:`../supportedformats/index_supportedformats`) inside the folder or zipfile. The resulting coordinate reference system ``CRS`` of the combined bbox is always in the `EPSG: 4326 `_ system. +The output of this function is the combined bbox or tbox resulting from merging all results of individual files (see: :doc:`../supportedformats/index_supportedformats`) inside the repository. The resulting coordinate reference system ``CRS`` of the combined bbox is always in the `EPSG: 4326 `_ system. Code: @@ -154,4 +155,3 @@ Output: import geoextent.lib.extent as geoextent geoextent.from_repository('https://zenodo.org/record/820562', True, True) - From be391e0823c00cdc512a5230b0a4463d5486743b Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Tue, 20 Apr 2021 15:06:30 +0200 Subject: [PATCH 18/27] Test export functions --- geoextent/__main__.py | 15 ++++-- geoextent/lib/content_providers/Zenodo.py | 10 +++- geoextent/lib/extent.py | 24 ++++++--- geoextent/lib/helpfunctions.py | 24 ++++++++- tests/test_cli.py | 64 +++++++++++++++++++++-- 5 files changed, 119 insertions(+), 18 deletions(-) diff --git a/geoextent/__main__.py b/geoextent/__main__.py index a8b1aa9..f7c4d6a 100644 --- a/geoextent/__main__.py +++ b/geoextent/__main__.py @@ -47,7 +47,7 @@ def __call__(self, parser, namespace, values, option_string=None): for candidate in values: if (hf.doi_regexp.match(candidate) is not None) or (hf.zenodo_regexp.match(candidate) is not None): logger.debug("The format of the URL or DOI is correct. Geoextent is going to try to download " - "this repository ") + "this repository from {} ".format(candidate)) setattr(namespace, self.dest, candidate) else: if not (os.path.isdir(candidate) or os.path.isfile(candidate) or zipfile.is_zipfile(candidate)): @@ -185,9 +185,18 @@ def main(): is_zipfile = zipfile.is_zipfile(os.path.join(os.getcwd(), files)) is_directory = os.path.isdir(os.path.join(os.getcwd(), files)) - # Identify + # Identify URL is_url = hf.https_regexp.match(files) is not None + # Check output path + export = args['output'] is not None + + try: + if export: + filename = hf.path_output(args['output']) + except ValueError as e: + raise ValueError(e) + output = None try: @@ -207,13 +216,11 @@ def main(): if output is None: raise Exception("Did not find supported files at {}".format(files)) else: - export = args['output'] is not None if export and not multiple_files: logger.warning("Exporting result does not apply to single files") elif export and multiple_files: logger.warning("Exporting result into: {}".format(args['output'])) - filename = args['output'] df = hf.extract_output(output, files, current_version) hf.create_geopackage(df, filename) if not args['details']: diff --git a/geoextent/lib/content_providers/Zenodo.py b/geoextent/lib/content_providers/Zenodo.py index b661252..c179de9 100644 --- a/geoextent/lib/content_providers/Zenodo.py +++ b/geoextent/lib/content_providers/Zenodo.py @@ -1,9 +1,8 @@ from requests import HTTPError -import os -import tempfile from .providers import DoiProvider from ..extent import * + class Zenodo(DoiProvider): def __init__(self): super().__init__() @@ -11,6 +10,9 @@ def __init__(self): self.host = {"hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"], "api": "https://zenodo.org/api/records/" } + self.reference = None + self.record_id = None + self.name = "Zenodo" def validate_provider(self, reference): self.reference = reference @@ -60,8 +62,10 @@ def _get_file_links(self): return file_list def download(self, folder): + self.log.debug("Downloading Zenodo record id: {} ".format(self.record_id)) try: download_links = self._get_file_links + counter = 1 for file_link in download_links: resp = self.session.get(file_link, stream=True) filename = os.path.split(resp.url)[1] @@ -69,5 +73,7 @@ def download(self, folder): with open(filepath, "wb") as dst: for chunk in resp.iter_content(chunk_size=None): dst.write(chunk) + self.log.debug("{} out of {} files downloaded.".format(counter, len(download_links))) + counter += 1 except ValueError as e: raise Exception(e) diff --git a/geoextent/lib/extent.py b/geoextent/lib/extent.py index f7fb77e..f1047b7 100644 --- a/geoextent/lib/extent.py +++ b/geoextent/lib/extent.py @@ -211,18 +211,23 @@ def run(self): def from_repository(repository_identifier, bbox=False, tbox=False, details=False): - geoextent = geoextent_from_repository() - metadata = geoextent.from_repository(repository_identifier, bbox, tbox, details) - metadata['format'] = 'repository' + try: + geoextent = geoextent_from_repository() + metadata = geoextent.from_repository(repository_identifier, bbox, tbox, details) + metadata['format'] = 'repository' + except ValueError as e: + logger.debug("Error while inspecting repository {}: {}".format(repository_identifier, e)) + raise Exception(e) + return metadata class geoextent_from_repository(Application): - my_content_providers = List([Zenodo.Zenodo], config=True, help=""" + content_providers = List([Zenodo.Zenodo], config=True, help=""" Ordered list by priority of ContentProviders to try in turn to fetch the contents specified by the user. """ - ) + ) def from_repository(self, repository_identifier, bbox=False, tbox=False, details=False): @@ -230,9 +235,12 @@ def from_repository(self, repository_identifier, bbox=False, tbox=False, details logger.error("Require at least one of extraction options, but bbox is {} and tbox is {}".format(bbox, tbox)) raise Exception("No extraction options enabled!") - for h in self.my_content_providers: + for h in self.content_providers: repository = h() + supported_by_geoextent = False if repository.validate_provider(reference=repository_identifier): + logger.debug("Using {} to extract {}".format(repository.name, repository_identifier)) + supported_by_geoextent = True try: with tempfile.TemporaryDirectory() as tmp: repository.download(tmp) @@ -240,3 +248,7 @@ def from_repository(self, repository_identifier, bbox=False, tbox=False, details return metadata except ValueError as e: raise Exception(e) + if supported_by_geoextent is False: + logger.error("Geoextent can not handle this repository identifier {}. " + "\n Check for typos or if the repository exists. ".format(repository_identifier) + ) diff --git a/geoextent/lib/helpfunctions.py b/geoextent/lib/helpfunctions.py index d2e0e23..02eea51 100644 --- a/geoextent/lib/helpfunctions.py +++ b/geoextent/lib/helpfunctions.py @@ -11,6 +11,7 @@ from osgeo import ogr from osgeo import osr from pandas.core.tools.datetimes import _guess_datetime_format_for_array as time_format +from pathlib import Path output_time_format = '%Y-%m-%d' PREFERRED_SAMPLE_SIZE = 30 @@ -532,7 +533,10 @@ def create_geopackage(df, filename): """ sr4326 = osr.SpatialReference() sr4326.ImportFromEPSG(WGS84_EPSG_ID) - logger.warning(df) + + if os.path.exists(filename): + os.remove(filename) + logger.warning("Overwriting {} ".format(filename)) ds = ogr.GetDriverByName('GPKG').CreateDataSource(filename) lyr = ds.CreateLayer('files', geom_type=ogr.wkbPolygon, srs=sr4326) @@ -554,3 +558,21 @@ def create_geopackage(df, filename): lyr.CreateFeature(feat) ds = None + + +def path_output(path): + + if os.path.isdir(path): + logger.error("Output must be a file, not a directory ") + raise ValueError("Output must be a file, not a directory: {}".format(path)) + + folder_path = os.path.split(path)[0] + user_path = Path(folder_path) + if user_path.exists(): + absolute_file_path = user_path.as_posix() + "/" + os.path.split(path)[1] + else: + logger.error("Output target directory does not exist: {}".format(path)) + raise ValueError("Output target directory does not exist: {}".format(path)) + return absolute_file_path + + diff --git a/tests/test_cli.py b/tests/test_cli.py index ef6f436..91cb324 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -291,7 +291,7 @@ def test_multiple_files(script_runner): 'tests/testdata/geojson/ausgleichsflaechen_moers.geojson') assert ret.success, "process should return success" assert ret.stderr == '', "stderr should be empty" - assert "[7.6016807556152335, 51.94881477206191, 7.647256851196289, 51.974624029877454]" in ret.stdout,\ + assert "[7.6016807556152335, 51.94881477206191, 7.647256851196289, 51.974624029877454]" in ret.stdout, \ "bboxes and time values of all files inside folder, are printed to console" assert "[6.574722, 51.434444, 4.3175, 53.217222]" in ret.stdout, \ "bboxes and time values of all files inside folder, are printed to console" @@ -375,7 +375,13 @@ def test_zenodo_valid_but_removed_repository(script_runner): assert "does not exist" in ret.stderr -def test_zenodo_valid_but_no_extraction_options(script_runner): +def test_zenodo_invalid_DOI_but_removed_repository(script_runner): + ret = script_runner.run('geoextent', '-b', '-t', 'https://doi.org/10.5281/zenodo.not.exist') + assert not ret.success + assert "Geoextent can not handle this repository identifier" in ret.stderr + + +def test_zenodo_invalid_but_no_extraction_options(script_runner): ret = script_runner.run('geoextent', 'https://zenodo.org/record/1') assert not ret.success, 'No extractions options, geoextent should fail' assert "Require at least one of extraction options, but bbox is False and tbox is False" in ret.stderr @@ -387,21 +393,69 @@ def test_zenodo_valid_but_not_open_access(script_runner): assert "This record does not have Open Access files. Verify the Access rights of the record" in ret.stderr -def test_export(script_runner): +def test_export_relative_path(script_runner): with tempfile.TemporaryDirectory() as tmp: gpkg_file = os.path.join(tmp, "export_file.gpkg") script_runner.run('geoextent', '-b', '-t', '--output', gpkg_file, 'tests/testdata/folders/folder_two_files') assert os.path.exists(gpkg_file) files_gdf = gpd.read_file(gpkg_file, layer="files") geo_version = "geoextent:" + current_version - output = files_gdf.loc[lambda df: files_gdf['handler'] == geo_version, ] + output = files_gdf.loc[lambda df: files_gdf['handler'] == geo_version,] tbox = list(output['tbox']) bounds = output.bounds bbox = list(bounds.iloc[0]) assert tbox[0] == "2018-11-14/2019-09-11" - #assert bbox == pytest.approx([2.05233, 41.31703, 7.64725, 51.97462], abs=tolerance) + # assert bbox == pytest.approx([2.05233, 41.31703, 7.64725, 51.97462], abs=tolerance) def test_export_no_output_file(script_runner): ret = script_runner.run('geoextent', '-b', '-t', '--output', 'tests/testdata/folders/folder_two_files') assert "Exception: Invalid command, input file missing" in ret.stderr + + +def test_invalid_order_no_input_file(script_runner): + ret = script_runner.run('geoextent', '-b', '--output', '-t', 'tests/testdata/folders/folder_two_files') + assert "error: argument --output: expected one argument" in ret.stderr + + +def test_zenodo_valid_doi_repository_wrong_geopackage_extension(script_runner): + with pytest.warns(ResourceWarning): + ret = script_runner.run('geoextent', '-b', '-t', '--output', 'wrong_extension.abc', + 'https://doi.org/10.5281/zenodo.820562' + ) + assert ret.success, "process should return success" + + +def test_export_absolute_path(script_runner): + with tempfile.TemporaryDirectory() as tmp: + out_path = tmp + "geoextent_output.gpkg" + ret = script_runner.run('geoextent', '-b', '-t', '--output', out_path, + 'tests/testdata/folders/folder_two_files' + ) + assert os.path.exists(out_path) + files_gdf = gpd.read_file(out_path, layer="files") + geo_version = "geoextent:" + current_version + output = files_gdf.loc[lambda df: files_gdf['handler'] == geo_version,] + tbox = list(output['tbox']) + bounds = output.bounds + assert tbox[0] == "2018-11-14/2019-09-11" + + +def test_export_invalid_folder_path(script_runner): + ret = script_runner.run('geoextent', '-b', '-t', '--output', "tests/testdata/folders", + 'tests/testdata/folders/folder_two_files' + ) + assert not ret.success, "Output should be a file not a directory" + assert "Output must be a file, not a directory:" in ret.stderr + + +def test_export_overwrite_file(script_runner): + with tempfile.TemporaryDirectory() as tmp: + filepath = tmp + "/geoextent_output.gpkg" + file = open(filepath, "w+") + file.close() + ret = script_runner.run('geoextent', '-b', '-t', '--output', filepath, + 'tests/testdata/folders/folder_two_files' + ) + assert ret.success + assert "Overwriting " + tmp in ret.stderr From bf4a2fa918a2e19ef7a24e5c4e0ac2a925fd5315 Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Tue, 20 Apr 2021 16:06:33 +0200 Subject: [PATCH 19/27] GDAL brew, bounds test --- .github/workflows/pythonpackage.yml | 7 +------ tests/test_cli.py | 8 +------- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index c016e81..b81a9fd 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -26,13 +26,8 @@ jobs: gdal-config --version - name: Install system dependencies (macOS) if: runner.os == 'macOS' - # GDAL 3.2.1 - # See https://cmichel.io/how-to-install-an-old-package-version-with-brew/ run: | - brew tap-new $USER/local-gdal - brew extract --version=3.2.1 gdal $USER/local-gdal - brew install gdal@3.2.1 - brew install pkg-config proj geos + brew install gdal pkg-config proj geos gdal-config --version - name: Install system dependencies (Windows) if: runner.os == 'Windows' diff --git a/tests/test_cli.py b/tests/test_cli.py index 91cb324..00ed518 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -405,7 +405,7 @@ def test_export_relative_path(script_runner): bounds = output.bounds bbox = list(bounds.iloc[0]) assert tbox[0] == "2018-11-14/2019-09-11" - # assert bbox == pytest.approx([2.05233, 41.31703, 7.64725, 51.97462], abs=tolerance) + assert bbox == pytest.approx([2.05233, 41.31703, 7.64725, 51.97462], abs=tolerance) def test_export_no_output_file(script_runner): @@ -433,12 +433,6 @@ def test_export_absolute_path(script_runner): 'tests/testdata/folders/folder_two_files' ) assert os.path.exists(out_path) - files_gdf = gpd.read_file(out_path, layer="files") - geo_version = "geoextent:" + current_version - output = files_gdf.loc[lambda df: files_gdf['handler'] == geo_version,] - tbox = list(output['tbox']) - bounds = output.bounds - assert tbox[0] == "2018-11-14/2019-09-11" def test_export_invalid_folder_path(script_runner): From c2bf68d4f8a4a162b6b0013b284cd06e9036143a Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Tue, 20 Apr 2021 16:20:45 +0200 Subject: [PATCH 20/27] GDAL brew pygdal 3.2.2 --- .github/workflows/pythonpackage.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index b81a9fd..9771a6f 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -27,7 +27,7 @@ jobs: - name: Install system dependencies (macOS) if: runner.os == 'macOS' run: | - brew install gdal pkg-config proj geos + brew install pkg-config gdal proj geos gdal-config --version - name: Install system dependencies (Windows) if: runner.os == 'Windows' @@ -59,4 +59,4 @@ jobs: - name: Test with pytest run: | pip install -r requirements-dev.txt - pytest + pytest \ No newline at end of file From a84dbbb26d99897682389df5114b6896926e436d Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Tue, 20 Apr 2021 17:18:38 +0200 Subject: [PATCH 21/27] Test geopackage --- tests/test_cli.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 00ed518..2c265b5 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -402,10 +402,7 @@ def test_export_relative_path(script_runner): geo_version = "geoextent:" + current_version output = files_gdf.loc[lambda df: files_gdf['handler'] == geo_version,] tbox = list(output['tbox']) - bounds = output.bounds - bbox = list(bounds.iloc[0]) assert tbox[0] == "2018-11-14/2019-09-11" - assert bbox == pytest.approx([2.05233, 41.31703, 7.64725, 51.97462], abs=tolerance) def test_export_no_output_file(script_runner): @@ -432,6 +429,7 @@ def test_export_absolute_path(script_runner): ret = script_runner.run('geoextent', '-b', '-t', '--output', out_path, 'tests/testdata/folders/folder_two_files' ) + assert ret.success assert os.path.exists(out_path) From 5d992014cfd18102a027123b3c5e23ce38d9a2e3 Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Wed, 21 Apr 2021 11:14:01 +0200 Subject: [PATCH 22/27] Relative path test --- tests/test_cli.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 2c265b5..9c00c46 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -395,12 +395,14 @@ def test_zenodo_valid_but_not_open_access(script_runner): def test_export_relative_path(script_runner): with tempfile.TemporaryDirectory() as tmp: - gpkg_file = os.path.join(tmp, "export_file.gpkg") - script_runner.run('geoextent', '-b', '-t', '--output', gpkg_file, 'tests/testdata/folders/folder_two_files') - assert os.path.exists(gpkg_file) - files_gdf = gpd.read_file(gpkg_file, layer="files") + relative = "geoextent_output.gpkg" + filepath = tmp + relative + file = open(filepath, "w+") + file.close() + script_runner.run('geoextent', '-b', '-t', '--output', relative, 'tests/testdata/folders/folder_two_files') + files_gdf = gpd.read_file(relative, layer="files") geo_version = "geoextent:" + current_version - output = files_gdf.loc[lambda df: files_gdf['handler'] == geo_version,] + output = files_gdf.loc[lambda df: files_gdf['handler'] == geo_version, ] tbox = list(output['tbox']) assert tbox[0] == "2018-11-14/2019-09-11" From b8427d4737f0348a84c27ff1655a685d894dffea Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Wed, 21 Apr 2021 11:44:45 +0200 Subject: [PATCH 23/27] Test with GDAL / Remove Geopandas --- requirements-dev.txt | 1 - tests/test_cli.py | 15 ++++++--------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 238de04..5fae40d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,3 +1,2 @@ pytest>=5 pytest-console-scripts -geopandas diff --git a/tests/test_cli.py b/tests/test_cli.py index 9c00c46..453d9dc 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,9 +1,8 @@ import os # used to get the location of the testdata +import ogr import sys import pytest import tempfile -import geopandas as gpd -from geoextent import __version__ as current_version from help_functions_test import create_zip, parse_coordinates, tolerance from osgeo import gdal @@ -397,14 +396,12 @@ def test_export_relative_path(script_runner): with tempfile.TemporaryDirectory() as tmp: relative = "geoextent_output.gpkg" filepath = tmp + relative - file = open(filepath, "w+") - file.close() script_runner.run('geoextent', '-b', '-t', '--output', relative, 'tests/testdata/folders/folder_two_files') - files_gdf = gpd.read_file(relative, layer="files") - geo_version = "geoextent:" + current_version - output = files_gdf.loc[lambda df: files_gdf['handler'] == geo_version, ] - tbox = list(output['tbox']) - assert tbox[0] == "2018-11-14/2019-09-11" + datasource = ogr.Open(relative) + layer = datasource.GetLayer(0) + ext = layer.GetExtent() + bbox = [ext[0], ext[2], ext[1], ext[3]] + assert bbox == pytest.approx([2.052333, 41.317038, 7.647256, 51.974624], abs=tolerance) def test_export_no_output_file(script_runner): From ad3adf74af78ed5d50dab31d5bdb61a353ffce3e Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Wed, 21 Apr 2021 11:49:48 +0200 Subject: [PATCH 24/27] Import ogr --- tests/test_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 453d9dc..d0ed754 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,5 +1,5 @@ import os # used to get the location of the testdata -import ogr +from osgeo import ogr import sys import pytest import tempfile From 15bc6bd9b6229d4f69e6d20504b959d252b28442 Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Wed, 21 Apr 2021 12:42:34 +0200 Subject: [PATCH 25/27] Fix typo --- geoextent/lib/extent.py | 2 +- tests/test_cli.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/geoextent/lib/extent.py b/geoextent/lib/extent.py index f1047b7..5053751 100644 --- a/geoextent/lib/extent.py +++ b/geoextent/lib/extent.py @@ -249,6 +249,6 @@ def from_repository(self, repository_identifier, bbox=False, tbox=False, details except ValueError as e: raise Exception(e) if supported_by_geoextent is False: - logger.error("Geoextent can not handle this repository identifier {}. " + logger.error("Geoextent can not handle this repository identifier {}" "\n Check for typos or if the repository exists. ".format(repository_identifier) ) diff --git a/tests/test_cli.py b/tests/test_cli.py index d0ed754..2d2b428 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -395,7 +395,6 @@ def test_zenodo_valid_but_not_open_access(script_runner): def test_export_relative_path(script_runner): with tempfile.TemporaryDirectory() as tmp: relative = "geoextent_output.gpkg" - filepath = tmp + relative script_runner.run('geoextent', '-b', '-t', '--output', relative, 'tests/testdata/folders/folder_two_files') datasource = ogr.Open(relative) layer = datasource.GetLayer(0) From 1bff81236fc415223e8aa3b047871788eb0fef2a Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Wed, 21 Apr 2021 13:36:36 +0200 Subject: [PATCH 26/27] tmp folders and relative path change --- tests/test_api.py | 34 ++++++++++++++++++---------------- tests/test_cli.py | 15 +++++++++------ 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index 176f843..f3c2232 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -140,22 +140,23 @@ def test_folder_nested_files(): def test_zipfile_unsupported_file(): - with tempfile.TemporaryDirectory() as tmp_dir: - f = open(tmp_dir + "/unsupported_file.txt", "a") + with tempfile.TemporaryDirectory() as tmp: + f = open(tmp + "/unsupported_file.txt", "a") f.write("No geographical data") f.close() - with tempfile.NamedTemporaryFile(suffix=".zip") as tmp: - create_zip(tmp_dir, tmp) - result = geoextent.fromDirectory(tmp.name, bbox=True, tbox=True) - assert "bbox" not in result - assert "tbox" not in result + zip_path = tmp + "/zipfile.zip" + create_zip(tmp, zip_path) + result = geoextent.fromDirectory(zip_path, bbox=True, tbox=True) + assert "bbox" not in result + assert "tbox" not in result def test_zipfile_one_file(): folder_name = "tests/testdata/folders/folder_one_file" - with tempfile.NamedTemporaryFile(suffix=".zip") as tmp: - create_zip(folder_name, tmp) - result = geoextent.fromDirectory(tmp.name, bbox=True, tbox=True) + with tempfile.TemporaryDirectory() as tmp: + zip_path = tmp + "/zipfile.zip" + create_zip(folder_name, zip_path) + result = geoextent.fromDirectory(zip_path, bbox=True, tbox=True) assert result["bbox"] == pytest.approx([7.601680, 51.948814, 7.647256, 51.974624], abs=tolerance) assert result["crs"] == "4326" assert result["tbox"] == ['2018-11-14', '2018-11-14'] @@ -163,12 +164,13 @@ def test_zipfile_one_file(): def test_zipfile_nested_folders(): folder_name = "tests/testdata/folders/nested_folder" - with tempfile.NamedTemporaryFile(suffix=".zip") as tmp: - create_zip(folder_name, tmp) - result = geoextent.fromDirectory(tmp.name, bbox=True, tbox=True) - assert result["bbox"] == pytest.approx([7.601680, 34.7, 142.0, 51.974624], abs=tolerance) - assert result["crs"] == "4326" - assert result["tbox"] == ['2017-04-08', '2020-02-06'] + with tempfile.TemporaryDirectory() as tmp: + zip_path = tmp+"/zipfile.zip" + create_zip(folder_name, zip_path) + result = geoextent.fromDirectory(zip_path, bbox=True, tbox=True) + assert result["bbox"] == pytest.approx([7.601680, 34.7, 142.0, 51.974624], abs=tolerance) + assert result["crs"] == "4326" + assert result["tbox"] == ['2017-04-08', '2020-02-06'] def test_png_file_extract_bbox(): diff --git a/tests/test_cli.py b/tests/test_cli.py index 2d2b428..94bee66 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -312,9 +312,10 @@ def test_folder(script_runner): def test_zipfile(script_runner): folder_name = "tests/testdata/folders/folder_one_file" - with tempfile.NamedTemporaryFile(suffix=".zip") as tmp: - create_zip(folder_name, tmp) - ret = script_runner.run('geoextent', '-b', '-t', tmp.name) + with tempfile.TemporaryDirectory() as tmp: + zip_path = tmp + "/zipfile.zip" + create_zip(folder_name, zip_path) + ret = script_runner.run('geoextent', '-b', '-t', zip_path) assert ret.success, "process should return success" result = ret.stdout bboxList = parse_coordinates(result) @@ -400,6 +401,7 @@ def test_export_relative_path(script_runner): layer = datasource.GetLayer(0) ext = layer.GetExtent() bbox = [ext[0], ext[2], ext[1], ext[3]] + os.remove(relative) assert bbox == pytest.approx([2.052333, 41.317038, 7.647256, 51.974624], abs=tolerance) @@ -415,15 +417,16 @@ def test_invalid_order_no_input_file(script_runner): def test_zenodo_valid_doi_repository_wrong_geopackage_extension(script_runner): with pytest.warns(ResourceWarning): - ret = script_runner.run('geoextent', '-b', '-t', '--output', 'wrong_extension.abc', + with tempfile.NamedTemporaryFile(suffix=".abc") as tmp: + ret = script_runner.run('geoextent', '-b', '-t', '--output', tmp.name, 'https://doi.org/10.5281/zenodo.820562' - ) + ) assert ret.success, "process should return success" def test_export_absolute_path(script_runner): with tempfile.TemporaryDirectory() as tmp: - out_path = tmp + "geoextent_output.gpkg" + out_path = tmp + "/geoextent_output.gpkg" ret = script_runner.run('geoextent', '-b', '-t', '--output', out_path, 'tests/testdata/folders/folder_two_files' ) From 25f6a3cb1294c2ae3b198326900eb03e10de561d Mon Sep 17 00:00:00 2001 From: sbastiangarzon Date: Wed, 21 Apr 2021 14:27:44 +0200 Subject: [PATCH 27/27] Fixing paths --- tests/test_api.py | 9 +++++---- tests/test_cli.py | 6 +++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index f3c2232..6f870d1 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -141,10 +141,11 @@ def test_folder_nested_files(): def test_zipfile_unsupported_file(): with tempfile.TemporaryDirectory() as tmp: - f = open(tmp + "/unsupported_file.txt", "a") + filepath = os.path.join(tmp, "unsupported_file.txt") + f = open(filepath, "a") f.write("No geographical data") f.close() - zip_path = tmp + "/zipfile.zip" + zip_path = os.path.join(tmp, "zipfile.zip") create_zip(tmp, zip_path) result = geoextent.fromDirectory(zip_path, bbox=True, tbox=True) assert "bbox" not in result @@ -154,7 +155,7 @@ def test_zipfile_unsupported_file(): def test_zipfile_one_file(): folder_name = "tests/testdata/folders/folder_one_file" with tempfile.TemporaryDirectory() as tmp: - zip_path = tmp + "/zipfile.zip" + zip_path = os.path.join(tmp, "zipfile.zip") create_zip(folder_name, zip_path) result = geoextent.fromDirectory(zip_path, bbox=True, tbox=True) assert result["bbox"] == pytest.approx([7.601680, 51.948814, 7.647256, 51.974624], abs=tolerance) @@ -165,7 +166,7 @@ def test_zipfile_one_file(): def test_zipfile_nested_folders(): folder_name = "tests/testdata/folders/nested_folder" with tempfile.TemporaryDirectory() as tmp: - zip_path = tmp+"/zipfile.zip" + zip_path = os.path.join(tmp, "zipfile.zip") create_zip(folder_name, zip_path) result = geoextent.fromDirectory(zip_path, bbox=True, tbox=True) assert result["bbox"] == pytest.approx([7.601680, 34.7, 142.0, 51.974624], abs=tolerance) diff --git a/tests/test_cli.py b/tests/test_cli.py index 94bee66..9a67a52 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -313,7 +313,7 @@ def test_folder(script_runner): def test_zipfile(script_runner): folder_name = "tests/testdata/folders/folder_one_file" with tempfile.TemporaryDirectory() as tmp: - zip_path = tmp + "/zipfile.zip" + zip_path = os.path.join(tmp, "zipfile.zip") create_zip(folder_name, zip_path) ret = script_runner.run('geoextent', '-b', '-t', zip_path) assert ret.success, "process should return success" @@ -426,7 +426,7 @@ def test_zenodo_valid_doi_repository_wrong_geopackage_extension(script_runner): def test_export_absolute_path(script_runner): with tempfile.TemporaryDirectory() as tmp: - out_path = tmp + "/geoextent_output.gpkg" + out_path = os.path.join(tmp, "geoextent_output.gpkg") ret = script_runner.run('geoextent', '-b', '-t', '--output', out_path, 'tests/testdata/folders/folder_two_files' ) @@ -444,7 +444,7 @@ def test_export_invalid_folder_path(script_runner): def test_export_overwrite_file(script_runner): with tempfile.TemporaryDirectory() as tmp: - filepath = tmp + "/geoextent_output.gpkg" + filepath = os.path.join(tmp, "geoextent_output.gpkg") file = open(filepath, "w+") file.close() ret = script_runner.run('geoextent', '-b', '-t', '--output', filepath,