diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 057ce2de..f8644b4c 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,6 +1,10 @@ Changelog ========= +0.7.0 +^^^^^ +- Add DOI-based retrieval functions for Zenodo (:pr:`100`) +- Add export function ``--output`` for folders, ZIP files and repositories (:pr:`124`) 0.6.0 ^^^^^ diff --git a/docs/source/howto/api.rst b/docs/source/howto/api.rst index e61a726e..56ac810c 100644 --- a/docs/source/howto/api.rst +++ b/docs/source/howto/api.rst @@ -121,3 +121,37 @@ Output: geoextent.fromDirectory('../tests/testdata/folders/folder_one_file', True, True, True) `folder_two_files `_ + +Zenodo repositories +------------------- + +**Geoextent** also supports queries for **Zenodo repositories**. +Geoextent creates a *temporary* copy of the repository and extracts the temporal or geographical extent. +Geoextent can only query **Open** Zenodo repositories. + +:: + + geoextent.from_repository(repository_identifier, bbox, time, details) + +**Parameters:** + - ``repository_identifier``: a string value with a Zenodo link (e.g., https://zenodo.org/record/3528062) or DOI (e.g., https://doi.org/10.5281/zenodo.3528062) + - ``bbox``: a boolean value to extract spatial extent (bounding box) + - ``time``: a boolean value to extract temporal extent (at "day" precision '%Y-%m-%d') + - ``details``: a boolean value to return details (geoextent) of individual files (default **False**) + +The output of this function is the combined bbox or tbox resulting from merging all results of individual files (see: :doc:`../supportedformats/index_supportedformats`) inside the repository. The resulting coordinate reference system ``CRS`` of the combined bbox is always in the `EPSG: 4326 `_ system. + +Code: + +:: + + geoextent.from_repository('https://zenodo.org/record/820562', True, True, False) + +Output: + +.. 
jupyter-execute:: + :hide-code: + :stderr: + + import geoextent.lib.extent as geoextent + geoextent.from_repository('https://zenodo.org/record/820562', True, True) diff --git a/docs/source/howto/cli.rst b/docs/source/howto/cli.rst index 360df7f5..4d6436df 100644 --- a/docs/source/howto/cli.rst +++ b/docs/source/howto/cli.rst @@ -108,6 +108,28 @@ Extract both bounding box and time interval from a folder or zipfile The output of this function is the combined bbox or tbox resulting from merging all results of individual files (see: :doc:`../supportedformats/index_supportedformats`) inside the folder or zipfile. The resulting coordinate reference system ``CRS`` of the combined bbox is always in the `EPSG: 4326 `_ system. + +Zenodo repositories +----------------------- + +Geoextent also supports queries from (Open) Zenodo repositories. + +Extract both bounding box and time interval from Zenodo +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:: + + geoextent -b -t https://zenodo.org/record/820562 + +.. jupyter-execute:: + :hide-code: + :stderr: + + import geoextent.lib.extent as geoextent + geoextent.from_repository('https://zenodo.org/record/820562', True, True) + +The output of this function is the combined bbox or tbox resulting from merging all results of individual files (see: :doc:`../supportedformats/index_supportedformats`) inside the Zenodo repository. The resulting coordinate reference system ``CRS`` of the combined bbox is always in the `EPSG: 4326 `_ system. + Debugging ^^^^^^^^^ @@ -136,5 +158,14 @@ or time box (tbox). import geoextent.lib.extent as geoextent geoextent.fromDirectory('../tests/testdata/folders/folder_one_file', True, True,True) +Export function +^^^^^^^^^^^^^^^ +You can export the result of Geoextent to a Geopackage file. This file contains the output of all files within the +folder or repository. 
+ +:: + + geoextent -b -t --output path/to/output/geopackage_file.gpkg folder_path + diff --git a/geoextent/__init__.py b/geoextent/__init__.py index 8a15ff13..afb0d8cf 100644 --- a/geoextent/__init__.py +++ b/geoextent/__init__.py @@ -1,3 +1,3 @@ name = "geoextent" -__version__ = '0.6.1' +__version__ = '0.7.0' diff --git a/geoextent/__main__.py b/geoextent/__main__.py index e4768e06..f7c4d6a5 100644 --- a/geoextent/__main__.py +++ b/geoextent/__main__.py @@ -3,9 +3,9 @@ import os import sys import zipfile - from . import __version__ as current_version from .lib import extent +from .lib import helpfunctions as hf logging.basicConfig(level=logging.WARNING) logger = logging.getLogger("geoextent") @@ -16,11 +16,9 @@ ''' help_epilog = ''' -By default, both bounding box and temporal extent are extracted. Examples: -geoextent path/to/geo_file.ext geoextent -b path/to/directory_with_geospatial_data geoextent -t path/to/file_with_temporal_extent geoextent -b -t path/to/geospatial_files @@ -42,15 +40,22 @@ # custom action, see e.g. https://stackoverflow.com/questions/11415570/directory-path-types-with-argparse + + class readable_file_or_dir(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): for candidate in values: - if not (os.path.isdir(candidate) or os.path.isfile(candidate) or zipfile.is_zipfile(candidate)): - raise argparse.ArgumentTypeError("{0} is not a valid directory or file".format(candidate)) - if os.access(candidate, os.R_OK): + if (hf.doi_regexp.match(candidate) is not None) or (hf.zenodo_regexp.match(candidate) is not None): + logger.debug("The format of the URL or DOI is correct. 
Geoextent is going to try to download " + "this repository from {} ".format(candidate)) setattr(namespace, self.dest, candidate) else: - raise argparse.ArgumentTypeError("{0} is not a readable directory or file".format(candidate)) + if not (os.path.isdir(candidate) or os.path.isfile(candidate) or zipfile.is_zipfile(candidate)): + raise argparse.ArgumentTypeError("{0} is not a valid directory or file".format(candidate)) + if os.access(candidate, os.R_OK): + setattr(namespace, self.dest, candidate) + else: + raise argparse.ArgumentTypeError("{0} is not a readable directory or file".format(candidate)) def get_arg_parser(): @@ -59,7 +64,8 @@ def get_arg_parser(): add_help=False, prog='geoextent', formatter_class=argparse.RawDescriptionHelpFormatter, - usage="geoextent [-h] [--formats] [--version] [--debug] [--details] [-b] [-t] [input file]']" + usage="geoextent [-h] [--formats] [--version] [--debug] [--details] [--output] [output file] [-b] [-t] [input " + "file]'] " ) parser.add_argument( @@ -93,6 +99,13 @@ def get_arg_parser(): help='Returns details of folder/zipFiles geoextent extraction', ) + parser.add_argument( + '--output', + action='store', + default=None, + help="Creates geopackage with geoextent output", + ) + parser.add_argument( '-b', '--bounding-box', action='store_true', @@ -110,7 +123,6 @@ def get_arg_parser(): parser.add_argument( 'files', action=readable_file_or_dir, - default=os.getcwd(), nargs=argparse.REMAINDER, help="input file or path" ) @@ -144,7 +156,7 @@ def main(): # version, help, and formats must be checked before parse, as otherwise files are required # but arg parser gives an error if allowed to be parsed first - if "--help" in sys.argv: + if "--help" in sys.argv or "-h" in sys.argv: print_help() arg_parser.exit() if "--version" in sys.argv: @@ -156,24 +168,45 @@ def main(): args = vars(arg_parser.parse_args()) files = args['files'] - logger.debug('Extracting from inputs %s', files) + if files is None: + raise Exception("Invalid command, 
input file missing") + + multiple_files = True + logger.debug('Extracting from inputs %s', files) # Set logging level if args['debug']: logging.getLogger('geoextent').setLevel(logging.DEBUG) if os.environ.get('GEOEXTENT_DEBUG', None) == "1": logging.getLogger('geoextent').setLevel(logging.DEBUG) + # Identify local file source + is_file = os.path.isfile(os.path.join(os.getcwd(), files)) + is_zipfile = zipfile.is_zipfile(os.path.join(os.getcwd(), files)) + is_directory = os.path.isdir(os.path.join(os.getcwd(), files)) + + # Identify URL + is_url = hf.https_regexp.match(files) is not None + + # Check output path + export = args['output'] is not None + + try: + if export: + filename = hf.path_output(args['output']) + except ValueError as e: + raise ValueError(e) + output = None - # Check if file is exists happens in parser validation, see readable_file_or_dir try: - if os.path.isfile(os.path.join(os.getcwd(), files)) and not zipfile.is_zipfile( - os.path.join(os.getcwd(), files)): + + if is_file and not is_zipfile: output = extent.fromFile(files, bbox=args['bounding_box'], tbox=args['time_box']) - if os.path.isdir(os.path.join(os.getcwd(), files)) or zipfile.is_zipfile(os.path.join(os.getcwd(), files)): + multiple_files = False + if is_directory or is_zipfile: output = extent.fromDirectory(files, bbox=args['bounding_box'], tbox=args['time_box'], details=True) - if not args['details']: - output.pop('details', None) + if is_url: + output = extent.from_repository(files, bbox=args['bounding_box'], tbox=args['time_box'], details=True) except Exception as e: if logger.getEffectiveLevel() >= logging.DEBUG: @@ -183,7 +216,15 @@ def main(): if output is None: raise Exception("Did not find supported files at {}".format(files)) else: - logger.info("Output{}:".format(output)) + + if export and not multiple_files: + logger.warning("Exporting result does not apply to single files") + elif export and multiple_files: + logger.warning("Exporting result into: {}".format(args['output'])) 
+ df = hf.extract_output(output, files, current_version) + hf.create_geopackage(df, filename) + if not args['details']: + output.pop('details', None) if type(output) == list: print(str(output)) diff --git a/geoextent/lib/content_providers/Zenodo.py b/geoextent/lib/content_providers/Zenodo.py new file mode 100644 index 00000000..c179de9b --- /dev/null +++ b/geoextent/lib/content_providers/Zenodo.py @@ -0,0 +1,79 @@ +from requests import HTTPError +from .providers import DoiProvider +from ..extent import * + + +class Zenodo(DoiProvider): + def __init__(self): + super().__init__() + self.log = logging.getLogger("geoextent") + self.host = {"hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"], + "api": "https://zenodo.org/api/records/" + } + self.reference = None + self.record_id = None + self.name = "Zenodo" + + def validate_provider(self, reference): + self.reference = reference + url = self.get_url + if any([url.startswith(p) for p in self.host["hostname"]]): + self.record_id = url.rsplit("/", maxsplit=1)[1] + return True + else: + return False + + def _get_metadata(self): + + if self.validate_provider: + try: + resp = self._request( + "{}{}".format(self.host["api"], self.record_id), headers={"accept": "application/json"} + ) + resp.raise_for_status() + self.record = resp.json() + return self.record + except: + m = "The zenodo record : https://zenodo.org/record/" + self.record_id + " does not exist" + self.log.warning(m) + raise HTTPError(m) + else: + raise ValueError('Invalid content provider') + + @property + def _get_file_links(self): + + try: + self._get_metadata() + record = self.record + except ValueError as e: + raise Exception(e) + + try: + files = record['files'] + except: + m = "This record does not have Open Access files. Verify the Access rights of the record." 
+ self.log.warning(m) + raise ValueError(m) + + file_list = [] + for j in files: + file_list.append(j['links']['download']) + return file_list + + def download(self, folder): + self.log.debug("Downloading Zenodo record id: {} ".format(self.record_id)) + try: + download_links = self._get_file_links + counter = 1 + for file_link in download_links: + resp = self.session.get(file_link, stream=True) + filename = os.path.split(resp.url)[1] + filepath = os.path.join(folder, filename) + with open(filepath, "wb") as dst: + for chunk in resp.iter_content(chunk_size=None): + dst.write(chunk) + self.log.debug("{} out of {} files downloaded.".format(counter, len(download_links))) + counter += 1 + except ValueError as e: + raise Exception(e) diff --git a/geoextent/lib/content_providers/providers.py b/geoextent/lib/content_providers/providers.py new file mode 100644 index 00000000..5b1047c7 --- /dev/null +++ b/geoextent/lib/content_providers/providers.py @@ -0,0 +1,41 @@ +from requests import Session, HTTPError +from geoextent.lib import helpfunctions as hf +import logging + + +class ContentProvider: + def __init__(self): + self.log = logging.getLogger("geoextent") + + +class DoiProvider(ContentProvider): + + def __init__(self): + self.session = Session() + + def _request(self, url, **kwargs): + return self.session.get(url, **kwargs) + + def _type_of_reference(self): + if hf.doi_regexp.match(self.reference): + return "DOI" + elif hf.https_regexp.match(self.reference): + return 'Link' + + @property + def get_url(self): + + if self._type_of_reference() == "DOI": + doi = hf.doi_regexp.match(self.reference).group(2) + + try: + resp = self._request("https://doi.org/{}".format(doi)) + resp.raise_for_status() + + except HTTPError: + return doi + + return resp.url + + else: + return self.reference diff --git a/geoextent/lib/extent.py b/geoextent/lib/extent.py index 07f3e0de..50537519 100644 --- a/geoextent/lib/extent.py +++ b/geoextent/lib/extent.py @@ -2,10 +2,13 @@ import os import 
threading import zipfile - +import tempfile +from traitlets import List +from traitlets.config import Application +from .content_providers import Zenodo from . import handleCSV -from . import handleVector from . import handleRaster +from . import handleVector from . import helpfunctions as hf logger = logging.getLogger("geoextent") @@ -203,4 +206,49 @@ def run(self): thread_temp_except.join() logger.debug("Extraction finished: {}".format(str(metadata))) + return metadata + + +def from_repository(repository_identifier, bbox=False, tbox=False, details=False): + try: + geoextent = geoextent_from_repository() + metadata = geoextent.from_repository(repository_identifier, bbox, tbox, details) + metadata['format'] = 'repository' + except ValueError as e: + logger.debug("Error while inspecting repository {}: {}".format(repository_identifier, e)) + raise Exception(e) + + return metadata + + +class geoextent_from_repository(Application): + content_providers = List([Zenodo.Zenodo], config=True, help=""" + Ordered list by priority of ContentProviders to try in turn to fetch + the contents specified by the user. 
+ """ + ) + + def from_repository(self, repository_identifier, bbox=False, tbox=False, details=False): + + if bbox + tbox == 0: + logger.error("Require at least one of extraction options, but bbox is {} and tbox is {}".format(bbox, tbox)) + raise Exception("No extraction options enabled!") + + for h in self.content_providers: + repository = h() + supported_by_geoextent = False + if repository.validate_provider(reference=repository_identifier): + logger.debug("Using {} to extract {}".format(repository.name, repository_identifier)) + supported_by_geoextent = True + try: + with tempfile.TemporaryDirectory() as tmp: + repository.download(tmp) + metadata = fromDirectory(tmp, bbox, tbox, details) + return metadata + except ValueError as e: + raise Exception(e) + if supported_by_geoextent is False: + logger.error("Geoextent can not handle this repository identifier {}" + "\n Check for typos or if the repository exists. ".format(repository_identifier) + ) diff --git a/geoextent/lib/handleVector.py b/geoextent/lib/handleVector.py index 292a2562..24e99dc1 100644 --- a/geoextent/lib/handleVector.py +++ b/geoextent/lib/handleVector.py @@ -4,7 +4,6 @@ from osgeo import gdal from . 
import helpfunctions as hf import re -from osgeo import osr null_island = [0] * 4 search = {"time": ["(.)*timestamp(.)*", "(.)*datetime(.)*", "(.)*time(.)*", "date$", "^date", "^begin"]} diff --git a/geoextent/lib/helpfunctions.py b/geoextent/lib/helpfunctions.py index 6ce3e58c..02eea51f 100644 --- a/geoextent/lib/helpfunctions.py +++ b/geoextent/lib/helpfunctions.py @@ -1,36 +1,50 @@ -import sys, os, platform, datetime, math, random -import zipfile, re -from os.path import basename -import pandas as pd +import csv +import datetime +import itertools +import logging +import os +import random import re -from pandas.core.tools.datetimes import _guess_datetime_format_for_array as time_format +import zipfile import numpy as np +import pandas as pd from osgeo import ogr from osgeo import osr -import logging -from pyproj import Proj, transform -import csv +from pandas.core.tools.datetimes import _guess_datetime_format_for_array as time_format +from pathlib import Path output_time_format = '%Y-%m-%d' PREFERRED_SAMPLE_SIZE = 30 WGS84_EPSG_ID = 4326 logger = logging.getLogger("geoextent") +https_regexp = re.compile('https://(.*)') + +# doi_regexp, is_doi, and normalize_doi are from idutils (https://github.com/inveniosoftware/idutils) +# Copyright (C) 2015-2018 CERN. +# Copyright (C) 2018 Alan Rubin. 
+# Licensed under BSD-3-Clause license +doi_regexp = re.compile( + r"(doi:\s*|(?:https?://)?(?:dx\.)?doi\.org/)?(10\.\d+(.\d+)*/.+)$", flags=re.I) -def getAllRowElements(rowname, elements, exp_data=None): - ''' +zenodo_regexp = re.compile( + r"(https://zenodo.org/record/)?(.\d*)$", flags=re.I +) + + +def getAllRowElements(row_name, elements, exp_data=None): + """ Function purpose: help-function to get all row elements for a specific string \n - Input: rowname, elements, exp_format \n + Input: row name, elements, exp_format \n Output: array values - ''' - + """ + values = [] for idx, val in enumerate(elements[0]): - if rowname in val: + if row_name in val: indexOf = idx - values = [] for x in elements: try: - if x[indexOf] != rowname: + if x[indexOf] != row_name: values.append(x[indexOf].replace(" ", "")) except IndexError as e: logger.info("Row skipped,file might be corrupted. Error {}".format(e)) @@ -62,15 +76,15 @@ def float_convert(val): pass -def searchForParameters(elements, paramArray, exp_data=None): - ''' +def searchForParameters(elements, param_array, exp_data=None): + """ Function purpose: return all attributes of a elements in the first row of a file \n Function purpose: return all attributes of a elements in the first row of a file \n Input: paramArray, elements \n Output: getAllRowElements(x,elements) - ''' + """ matching_elements = [] - for x in paramArray: + for x in param_array: for row in elements[0]: p = re.compile(x, re.IGNORECASE) if p.search(row) is not None: @@ -86,11 +100,11 @@ def searchForParameters(elements, paramArray, exp_data=None): def transformingIntoWGS84(crs, coordinate): - ''' + """ Function purpose: transforming SRS into WGS84 (EPSG:4326) \n Input: crs, point \n Output: retPoint constisting of x2, y2 (transformed points) - ''' + """ # TODO: check whether current src is 4326 source = osr.SpatialReference() source.ImportFromEPSG(int(crs)) @@ -111,11 +125,11 @@ def transformingIntoWGS84(crs, coordinate): def 
transformingArrayIntoWGS84(crs, pointArray): - ''' - Function purpose: transforming SRS into WGS84 (EPSG 4326; used by the GPS satellite navigation system) from an array \n + """ + Function purpose: transforming SRS into WGS84 (EPSG 4326) from an array Input: crs, pointArray \n Output: array array - ''' + """ # print("----<>", pointArray)# array = [] # vector_rep @@ -132,8 +146,9 @@ def transformingArrayIntoWGS84(crs, pointArray): def validate_bbox_wgs84(bbox): """ - :param bbox: - :return: + Function purpose: Validate if bbox is correct for WGS84 + bbox: bounding box (list) + Output: True if bbox is correct for WGS84 """ valid = True lon_values = bbox[0:3:2] @@ -148,8 +163,8 @@ def validate_bbox_wgs84(bbox): def flip_bbox(bbox): """ - :param bbox: - :return: + bbox: Bounding box (list) + Output: bbox flipped (Latitude to longitude if possible) """ # Flip values lon_values = bbox[1:4:2] @@ -180,12 +195,12 @@ def getDelimiter(csv_file): def get_time_format(time_list, num_sample): - ''' + """ Function purpose: 'Guess' time format of a list of 'strings' by taking a representative sample time_list: list of strings \n num_sample: size of the sample to determine time format \n Output: time format in string format (e.g '%Y.%M.d') - ''' + """ date_time_format = None @@ -229,11 +244,11 @@ def get_time_format(time_list, num_sample): def date_parser(datetime_list, num_sample=None): - ''' + """ Function purpose: transform list of strings into date-time format datetime_list: list of date-times (strings) \n Output: list of DatetimeIndex - ''' + """ datetime_format = get_time_format(datetime_list, num_sample) @@ -246,23 +261,29 @@ def date_parser(datetime_list, num_sample=None): return parse_time -def extract_zip(zippedFile): - ''' +def extract_zip(filepath): + """ Function purpose: unzip file (always inside a new folder) - Input: filepath - ''' + filepath: filepath to zipfile + """ - abs_path = os.path.abspath(zippedFile) + abs_path = os.path.abspath(filepath) root_folder = 
os.path.split(abs_path)[0] zip_name = os.path.split(abs_path)[1][:-4] zip_folder_path = os.path.join(root_folder, zip_name) - with zipfile.ZipFile(abs_path) as zipf: - zipf.extractall(zip_folder_path) + with zipfile.ZipFile(abs_path) as zip_file: + zip_file.extractall(zip_folder_path) def bbox_merge(metadata, origin): - logger.debug("medatada {}".format(metadata)) + """ + Function purpose: merge bounding boxes + metadata: metadata with geoextent extraction from multiple files (dict) + origin: folder path or filepath (str) + Output: Merged bbox (dict) + """ + logger.debug("metadata {}".format(metadata)) boxes_extent = [] metadata_merge = {} num_files = len(metadata.items()) @@ -307,7 +328,7 @@ def bbox_merge(metadata, origin): except Exception as e: logger.debug( - "Error extracting geographic extent of {}. CRS {} may be invalid. Error: {}".format(x, bbox[1], e)) + "Error extracting geographic extent. CRS {} may be invalid. Error: {}".format(int(bbox[1]), e)) continue num_geo_files = multipolygon.GetGeometryCount() / 4 @@ -325,6 +346,12 @@ def bbox_merge(metadata, origin): def tbox_merge(metadata, path): + """ + Function purpose: Merge time boxes + metadata: metadata with geoextent extraction from multiple files (dict) + path: path of directory being merged + Output: Merged tbox + """ boxes = [] num_files = len(metadata.items()) for x, y in metadata.items(): @@ -350,3 +377,202 @@ def tbox_merge(metadata, path): time_ext = [min_date, max_date] return time_ext + + +def transform_bbox(x): + """ + Function purpose: Transform bounding box (str) into geometry + x: bounding box (str) + """ + + try: + ring = ogr.Geometry(ogr.wkbLinearRing) + ring.AddPoint(x[0], x[1]) + ring.AddPoint(x[2], x[1]) + ring.AddPoint(x[0], x[3]) + ring.AddPoint(x[2], x[3]) + ring.CloseRings() + # Create polygon + poly = ogr.Geometry(ogr.wkbPolygon) + poly.AddGeometry(ring) + poly.FlattenTo2D() + bbox = poly.ExportToWkt() + + except: + + bbox = None + + return bbox + + +def transform_tbox(x): + """ 
+ Function purpose: Transform time box (list) into int + x: time box (list) + """ + + if x is None: + return None + elif isinstance(x, list): + return str(x[0]) + '/' + str(x[1]) + + +def extract_details(details): + """ + Function purpose: Extracts details from geoextent extraction + details: dictionary with geoextent extraction + Output: dataframe organized by filename, file format, handler, bbox, tbox and crs by file. + """ + + filename = [] + file_format = [] + handler = [] + bbox = [] + tbox = [] + crs = [] + + for i in details: + + file = details[i] + + if file is None: + filename.append([i]) + file_format_v = os.path.splitext(i)[1][1:] + if file_format_v == '': + file_format_v = 'undetected' + file_format.append([file_format_v]) + handler.append([None]) + bbox.append([None]) + tbox.append([None]) + crs.append([None]) + else: + filename.append([i]) + file_format.append([file.get('format')]) + handler_v = file.get('geoextent_handler') + bbox_v = file.get('bbox') + tbox_v = file.get('tbox') + crs_v = file.get('crs') + handler.append([handler_v]) + bbox.append([bbox_v]) + tbox.append([tbox_v]) + crs.append([crs_v]) + + if file.get('format') == 'folder': + details_folder = extract_details(file['details']) + filename.append(details_folder['filename']) + file_format.append(details_folder['format']) + handler.append(details_folder['handler']) + bbox.append(details_folder['bbox']) + tbox.append(details_folder['tbox']) + crs.append(details_folder['crs']) + + if any(isinstance(i, list) for i in filename): + filename = list(itertools.chain.from_iterable(filename)) + file_format = list(itertools.chain.from_iterable(file_format)) + handler = list(itertools.chain.from_iterable(handler)) + bbox = list(itertools.chain.from_iterable(bbox)) + tbox = list(itertools.chain.from_iterable(tbox)) + crs = list(itertools.chain.from_iterable(crs)) + + d = {'filename': filename, 'format': file_format, 'handler': handler, + 'bbox': bbox, + 'tbox': tbox, 'crs': crs} + files = 
pd.DataFrame(d) + return files + + +def extract_output(result, files, current_version): + """ + Function purpose: Extracts final output from geoextent including all files and containing folder + result: geoextent output from extraction + files: user input for initial extraction (e.g name of the main folder) + current_version: Current geoextent version + Output: Dataframe with geoextent of all files AND final output (merge) of user request + """ + filename = files + file_format = result.get('format') + handler = "geoextent:" + current_version + bbox = result.get('bbox') + tbox = result.get('tbox') + crs = result.get('crs') + + new_row = {'filename': filename, 'format': file_format, 'handler': handler, 'bbox': bbox, 'tbox': tbox, 'crs': crs + } + + df = extract_details(result['details']) + df = df.append(new_row, ignore_index=True) + df['bbox'] = df['bbox'].apply(transform_bbox) + df['tbox'] = df['tbox'].apply(transform_tbox) + return df + + +def is_doi(val): + """ + Function purpose: Returns None if val doesn't match pattern of a DOI. + http://en.wikipedia.org/wiki/Digital_object_identifier. + """ + return doi_regexp.match(val) + + +def normalize_doi(val): + """ + Function purpose: Return just the DOI (e.g. 10.1234/jshd123) + from a val that could include a url or doi + (e.g. 
https://doi.org/10.1234/jshd123) + val: DOI or URL (str) + """ + m = doi_regexp.match(val) + return m.group(2) + + +def create_geopackage(df, filename): + """ + Function purpose: Creates a geopackage file + df: dataframe from extract_output result + filename: Name for the Geopackage file + """ + sr4326 = osr.SpatialReference() + sr4326.ImportFromEPSG(WGS84_EPSG_ID) + + if os.path.exists(filename): + os.remove(filename) + logger.warning("Overwriting {} ".format(filename)) + + ds = ogr.GetDriverByName('GPKG').CreateDataSource(filename) + lyr = ds.CreateLayer('files', geom_type=ogr.wkbPolygon, srs=sr4326) + lyr.CreateField(ogr.FieldDefn('filename', ogr.OFTString)) + lyr.CreateField(ogr.FieldDefn('handler', ogr.OFTString)) + lyr.CreateField(ogr.FieldDefn('format', ogr.OFTString)) + lyr.CreateField(ogr.FieldDefn('tbox', ogr.OFTString)) + lyr.CreateField(ogr.FieldDefn('crs', ogr.OFTString)) + + for i in range(len(df)): + feat = ogr.Feature(lyr.GetLayerDefn()) + feat['filename'] = df.loc[i, "filename"] + feat['format'] = df.loc[i, "format"] + feat['tbox'] = df.loc[i, "tbox"] + feat['handler'] = df.loc[i, "handler"] + feat['crs'] = df.loc[i, "crs"] + if df.loc[i, "bbox"] is not None: + feat.SetGeometry(ogr.CreateGeometryFromWkt(df.loc[i, "bbox"])) + lyr.CreateFeature(feat) + + ds = None + + +def path_output(path): + + if os.path.isdir(path): + logger.error("Output must be a file, not a directory ") + raise ValueError("Output must be a file, not a directory: {}".format(path)) + + folder_path = os.path.split(path)[0] + user_path = Path(folder_path) + if user_path.exists(): + absolute_file_path = user_path.as_posix() + "/" + os.path.split(path)[1] + else: + logger.error("Output target directory does not exist: {}".format(path)) + raise ValueError("Output target directory does not exist: {}".format(path)) + return absolute_file_path + + diff --git a/requirements.txt b/requirements.txt index f92fd54a..6506f71e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,6 @@ 
pyshp python-dateutil pandas numpy +requests +traitlets +wheel diff --git a/tests/relative.geojson b/tests/relative.geojson deleted file mode 100644 index e56cff86..00000000 --- a/tests/relative.geojson +++ /dev/null @@ -1,21 +0,0 @@ -{ - "type":"FeatureCollection", - "features":[ - { - "type":"Feature", - "properties":{ - "location": "kalterherberg", - "date": "2018-11-14" - }, - "geometry":{ - "type":"LineString", - "coordinates":[ - [ - 7.645540237426757, - 51.96780294552556 - ] - ] - } - } - ] -} diff --git a/tests/test_api.py b/tests/test_api.py index 176f8435..6f870d17 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -140,22 +140,24 @@ def test_folder_nested_files(): def test_zipfile_unsupported_file(): - with tempfile.TemporaryDirectory() as tmp_dir: - f = open(tmp_dir + "/unsupported_file.txt", "a") + with tempfile.TemporaryDirectory() as tmp: + filepath = os.path.join(tmp, "unsupported_file.txt") + f = open(filepath, "a") f.write("No geographical data") f.close() - with tempfile.NamedTemporaryFile(suffix=".zip") as tmp: - create_zip(tmp_dir, tmp) - result = geoextent.fromDirectory(tmp.name, bbox=True, tbox=True) - assert "bbox" not in result - assert "tbox" not in result + zip_path = os.path.join(tmp, "zipfile.zip") + create_zip(tmp, zip_path) + result = geoextent.fromDirectory(zip_path, bbox=True, tbox=True) + assert "bbox" not in result + assert "tbox" not in result def test_zipfile_one_file(): folder_name = "tests/testdata/folders/folder_one_file" - with tempfile.NamedTemporaryFile(suffix=".zip") as tmp: - create_zip(folder_name, tmp) - result = geoextent.fromDirectory(tmp.name, bbox=True, tbox=True) + with tempfile.TemporaryDirectory() as tmp: + zip_path = os.path.join(tmp, "zipfile.zip") + create_zip(folder_name, zip_path) + result = geoextent.fromDirectory(zip_path, bbox=True, tbox=True) assert result["bbox"] == pytest.approx([7.601680, 51.948814, 7.647256, 51.974624], abs=tolerance) assert result["crs"] == "4326" assert result["tbox"] == 
['2018-11-14', '2018-11-14'] @@ -163,12 +165,13 @@ def test_zipfile_one_file(): def test_zipfile_nested_folders(): folder_name = "tests/testdata/folders/nested_folder" - with tempfile.NamedTemporaryFile(suffix=".zip") as tmp: - create_zip(folder_name, tmp) - result = geoextent.fromDirectory(tmp.name, bbox=True, tbox=True) - assert result["bbox"] == pytest.approx([7.601680, 34.7, 142.0, 51.974624], abs=tolerance) - assert result["crs"] == "4326" - assert result["tbox"] == ['2017-04-08', '2020-02-06'] + with tempfile.TemporaryDirectory() as tmp: + zip_path = os.path.join(tmp, "zipfile.zip") + create_zip(folder_name, zip_path) + result = geoextent.fromDirectory(zip_path, bbox=True, tbox=True) + assert result["bbox"] == pytest.approx([7.601680, 34.7, 142.0, 51.974624], abs=tolerance) + assert result["crs"] == "4326" + assert result["tbox"] == ['2017-04-08', '2020-02-06'] def test_png_file_extract_bbox(): diff --git a/tests/test_cli.py b/tests/test_cli.py index b48f9dba..9a67a529 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,7 +1,9 @@ import os # used to get the location of the testdata +from osgeo import ogr import sys import pytest import tempfile + from help_functions_test import create_zip, parse_coordinates, tolerance from osgeo import gdal @@ -250,7 +252,7 @@ def test_gml_bbox(script_runner): assert ret.stderr == '', "stderr should be empty" result = ret.stdout bboxList = parse_coordinates(result) - assert bboxList == pytest.approx([-17.542069, 32.39669, -6.959389, 39.301139]) + assert bboxList == pytest.approx([-17.542069, 32.39669, -6.959389, 39.301139], abs=tolerance) assert "4326" in result @@ -261,6 +263,7 @@ def test_gml_time(script_runner): assert "['2005-12-31', '2013-11-30']" in ret.stdout, "time value is printed to console" +@pytest.mark.skip(reason="multiple input directories not implemented yet") def test_gml_only_one_time_feature_valid(script_runner): ret = script_runner.run('geoextent', '-t', 
'tests/testdata/gml/mypolygon_px6_error_time_one_feature.gml') assert ret.stdout @@ -287,7 +290,7 @@ def test_multiple_files(script_runner): 'tests/testdata/geojson/ausgleichsflaechen_moers.geojson') assert ret.success, "process should return success" assert ret.stderr == '', "stderr should be empty" - assert "[7.6016807556152335, 51.94881477206191, 7.647256851196289, 51.974624029877454]" in ret.stdout,\ + assert "[7.6016807556152335, 51.94881477206191, 7.647256851196289, 51.974624029877454]" in ret.stdout, \ "bboxes and time values of all files inside folder, are printed to console" assert "[6.574722, 51.434444, 4.3175, 53.217222]" in ret.stdout, \ "bboxes and time values of all files inside folder, are printed to console" @@ -302,20 +305,21 @@ def test_folder(script_runner): assert ret.stderr == '', "stderr should be empty" result = ret.stdout bboxList = parse_coordinates(result) - assert bboxList == pytest.approx([2.052333, 41.317038, 7.647256, 51.974624]) + assert bboxList == pytest.approx([2.052333, 41.317038, 7.647256, 51.974624], abs=tolerance) assert "['2018-11-14', '2019-09-11']" in result, "merge time value of folder files, is printed to console" assert "4326" in result def test_zipfile(script_runner): folder_name = "tests/testdata/folders/folder_one_file" - with tempfile.NamedTemporaryFile(suffix=".zip") as tmp: - create_zip(folder_name, tmp) - ret = script_runner.run('geoextent', '-b', '-t', tmp.name) + with tempfile.TemporaryDirectory() as tmp: + zip_path = os.path.join(tmp, "zipfile.zip") + create_zip(folder_name, zip_path) + ret = script_runner.run('geoextent', '-b', '-t', zip_path) assert ret.success, "process should return success" result = ret.stdout bboxList = parse_coordinates(result) - assert bboxList == pytest.approx([7.601680, 51.948814, 7.647256, 51.974624]) + assert bboxList == pytest.approx([7.601680, 51.948814, 7.647256, 51.974624], abs=tolerance) assert "['2018-11-14', '2018-11-14']" in result assert "4326" in result @@ -327,3 +331,124 
@@ def test_multiple_folders(script_runner): assert ret.success, "process should return success" assert ret.stderr == '', "stderr should be empty" assert "full bbox" in ret.stdout, "joined bboxes of all files inside folder are printed to console" + + +def test_zenodo_valid_link_repository(script_runner): + ret = script_runner.run('geoextent', + '-b', '-t', 'https://zenodo.org/record/820562') + assert ret.success, "process should return success" + assert 'has no identifiable time extent' in ret.stderr + result = ret.stdout + bboxList = parse_coordinates(result) + assert bboxList == pytest.approx([96.21146, 25.55834, 96.35495, 25.63293], abs=tolerance) + assert "4326" in result + + +def test_zenodo_valid_doi_repository(script_runner): + ret = script_runner.run('geoextent', + '-b', '-t', 'https://doi.org/10.5281/zenodo.820562') + assert ret.success, "process should return success" + assert 'has no identifiable time extent' in ret.stderr + result = ret.stdout + bboxList = parse_coordinates(result) + assert bboxList == pytest.approx([96.21146, 25.55834, 96.35495, 25.63293], abs=tolerance) + assert "4326" in result + + +def test_zenodo_valid_link_repository_with_no_geoextent(script_runner): + ret = script_runner.run('geoextent', '-b', '-t', 'https://zenodo.org/record/1810558') + result = ret.stdout + assert "bbox" not in result, "This repository contains a PDF file, it should not return a bbox" + assert "tbox" not in result, "This repository contains a PDF file, it should not return a tbox" + + +def test_zenodo_invalid_link_repository(script_runner): + ret = script_runner.run('geoextent', + '-b', '-t', 'https://zenado.org/record/820562') + assert not ret.success, 'Typo in URL' + assert "is not a valid" in ret.stderr, 'Typo in URL' + + +def test_zenodo_valid_but_removed_repository(script_runner): + ret = script_runner.run('geoextent', '-b', '-t', 'https://zenodo.org/record/1') + assert not ret.success + assert "does not exist" in ret.stderr + + +def 
test_zenodo_invalid_DOI_but_removed_repository(script_runner): + ret = script_runner.run('geoextent', '-b', '-t', 'https://doi.org/10.5281/zenodo.not.exist') + assert not ret.success + assert "Geoextent can not handle this repository identifier" in ret.stderr + + +def test_zenodo_invalid_but_no_extraction_options(script_runner): + ret = script_runner.run('geoextent', 'https://zenodo.org/record/1') + assert not ret.success, 'No extractions options, geoextent should fail' + assert "Require at least one of extraction options, but bbox is False and tbox is False" in ret.stderr + + +def test_zenodo_valid_but_not_open_access(script_runner): + ret = script_runner.run('geoextent', '-b', '-t', 'https://zenodo.org/record/51746') + assert not ret.success, 'The repository exists but it is not accessible. Geoextent should fail' + assert "This record does not have Open Access files. Verify the Access rights of the record" in ret.stderr + + +def test_export_relative_path(script_runner): + with tempfile.TemporaryDirectory() as tmp: + relative = "geoextent_output.gpkg" + script_runner.run('geoextent', '-b', '-t', '--output', relative, 'tests/testdata/folders/folder_two_files') + datasource = ogr.Open(relative) + layer = datasource.GetLayer(0) + ext = layer.GetExtent() + bbox = [ext[0], ext[2], ext[1], ext[3]] + os.remove(relative) + assert bbox == pytest.approx([2.052333, 41.317038, 7.647256, 51.974624], abs=tolerance) + + +def test_export_no_output_file(script_runner): + ret = script_runner.run('geoextent', '-b', '-t', '--output', 'tests/testdata/folders/folder_two_files') + assert "Exception: Invalid command, input file missing" in ret.stderr + + +def test_invalid_order_no_input_file(script_runner): + ret = script_runner.run('geoextent', '-b', '--output', '-t', 'tests/testdata/folders/folder_two_files') + assert "error: argument --output: expected one argument" in ret.stderr + + +def test_zenodo_valid_doi_repository_wrong_geopackage_extension(script_runner): + with 
pytest.warns(ResourceWarning): + with tempfile.NamedTemporaryFile(suffix=".abc") as tmp: + ret = script_runner.run('geoextent', '-b', '-t', '--output', tmp.name, + 'https://doi.org/10.5281/zenodo.820562' + ) + assert ret.success, "process should return success" + + +def test_export_absolute_path(script_runner): + with tempfile.TemporaryDirectory() as tmp: + out_path = os.path.join(tmp, "geoextent_output.gpkg") + ret = script_runner.run('geoextent', '-b', '-t', '--output', out_path, + 'tests/testdata/folders/folder_two_files' + ) + assert ret.success + assert os.path.exists(out_path) + + +def test_export_invalid_folder_path(script_runner): + ret = script_runner.run('geoextent', '-b', '-t', '--output', "tests/testdata/folders", + 'tests/testdata/folders/folder_two_files' + ) + assert not ret.success, "Output should be a file not a directory" + assert "Output must be a file, not a directory:" in ret.stderr + + +def test_export_overwrite_file(script_runner): + with tempfile.TemporaryDirectory() as tmp: + filepath = os.path.join(tmp, "geoextent_output.gpkg") + file = open(filepath, "w+") + file.close() + ret = script_runner.run('geoextent', '-b', '-t', '--output', filepath, + 'tests/testdata/folders/folder_two_files' + ) + assert ret.success + assert "Overwriting " + tmp in ret.stderr