Skip to content

Commit

Permalink
Merge pull request #125 from SbastianGarzon/export
Browse files Browse the repository at this point in the history
  • Loading branch information
nuest authored Apr 21, 2021
2 parents e1b9e04 + 25f6a3c commit 5433223
Show file tree
Hide file tree
Showing 14 changed files with 721 additions and 108 deletions.
4 changes: 4 additions & 0 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@

Changelog
=========
0.7.0
^^^^^
- Add DOI-based retrieval functions for Zenodo (:pr:`100`)
- Add export function ``--output`` for folders, ZIP files and repositories (:pr:`124`)

0.6.0
^^^^^
Expand Down
34 changes: 34 additions & 0 deletions docs/source/howto/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -121,3 +121,37 @@ Output:
geoextent.fromDirectory('../tests/testdata/folders/folder_one_file', True, True, True)

`folder_two_files <https://github.com/o2r-project/geoextent/blob/master/tests/testdata/folders/folder_two_files>`_

Zenodo repositories
-------------------

**Geoextent** also supports queries for **Zenodo repositories**.
Geoextent creates a *temporary* copy of the repository and extracts the temporal or geographical extent.
Geoextent can only query **Open** Zenodo repositories.

::

geoextent.from_repository(repository_identifier, bbox, time, details)

**Parameters:**
- ``repository_identifier``: a string value with a Zenodo link (e.g., https://zenodo.org/record/3528062) or DOI (e.g., https://doi.org/10.5281/zenodo.3528062)
- ``bbox``: a boolean value to extract spatial extent (bounding box)
- ``time``: a boolean value to extract temporal extent (at "day" precision '%Y-%m-%d')
- ``details``: a boolean value to return details (geoextent) of individual files (default **False**)

The output of this function is the combined bbox or tbox resulting from merging all results of individual files (see: :doc:`../supportedformats/index_supportedformats`) inside the repository. The resulting coordinate reference system ``CRS`` of the combined bbox is always in the `EPSG: 4326 <https://epsg.io/4326>`_ system.

Code:

::

geoextent.from_repository('https://zenodo.org/record/820562', True, True, False)

Output:

.. jupyter-execute::
:hide-code:
:stderr:

import geoextent.lib.extent as geoextent
geoextent.from_repository('https://zenodo.org/record/820562', True, True)
31 changes: 31 additions & 0 deletions docs/source/howto/cli.rst
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,28 @@ Extract both bounding box and time interval from a folder or zipfile

The output of this function is the combined bbox or tbox resulting from merging all results of individual files (see: :doc:`../supportedformats/index_supportedformats`) inside the folder or zipfile. The resulting coordinate reference system ``CRS`` of the combined bbox is always in the `EPSG: 4326 <https://epsg.io/4326>`_ system.


Zenodo repositories
-----------------------

Geoextent also supports queries from (Open) Zenodo repositories.

Extract both bounding box and time interval from Zenodo
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

::

geoextent -b -t https://zenodo.org/record/820562

.. jupyter-execute::
:hide-code:
:stderr:

import geoextent.lib.extent as geoextent
geoextent.from_repository('https://zenodo.org/record/820562', True, True)

The output of this function is the combined bbox or tbox resulting from merging all results of individual files (see: :doc:`../supportedformats/index_supportedformats`) inside the Zenodo repository. The resulting coordinate reference system ``CRS`` of the combined bbox is always in the `EPSG: 4326 <https://epsg.io/4326>`_ system.

Debugging
^^^^^^^^^

Expand Down Expand Up @@ -136,5 +158,14 @@ or time box (tbox).
import geoextent.lib.extent as geoextent
geoextent.fromDirectory('../tests/testdata/folders/folder_one_file', True, True,True)

Export function
^^^^^^^^^^^^^^^
You can export the result of Geoextent to a GeoPackage file. This file contains the output for all files within the
folder or repository.

::

geoextent -b -t --output path/to/output/geopackage_file.gpkg folder_path



2 changes: 1 addition & 1 deletion geoextent/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
name = "geoextent"

__version__ = '0.6.1'
__version__ = '0.7.0'
77 changes: 59 additions & 18 deletions geoextent/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
import os
import sys
import zipfile

from . import __version__ as current_version
from .lib import extent
from .lib import helpfunctions as hf

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("geoextent")
Expand All @@ -16,11 +16,9 @@
'''

help_epilog = '''
By default, both bounding box and temporal extent are extracted.
Examples:
geoextent path/to/geo_file.ext
geoextent -b path/to/directory_with_geospatial_data
geoextent -t path/to/file_with_temporal_extent
geoextent -b -t path/to/geospatial_files
Expand All @@ -42,15 +40,22 @@


# custom action, see e.g. https://stackoverflow.com/questions/11415570/directory-path-types-with-argparse


class readable_file_or_dir(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
for candidate in values:
if not (os.path.isdir(candidate) or os.path.isfile(candidate) or zipfile.is_zipfile(candidate)):
raise argparse.ArgumentTypeError("{0} is not a valid directory or file".format(candidate))
if os.access(candidate, os.R_OK):
if (hf.doi_regexp.match(candidate) is not None) or (hf.zenodo_regexp.match(candidate) is not None):
logger.debug("The format of the URL or DOI is correct. Geoextent is going to try to download "
"this repository from {} ".format(candidate))
setattr(namespace, self.dest, candidate)
else:
raise argparse.ArgumentTypeError("{0} is not a readable directory or file".format(candidate))
if not (os.path.isdir(candidate) or os.path.isfile(candidate) or zipfile.is_zipfile(candidate)):
raise argparse.ArgumentTypeError("{0} is not a valid directory or file".format(candidate))
if os.access(candidate, os.R_OK):
setattr(namespace, self.dest, candidate)
else:
raise argparse.ArgumentTypeError("{0} is not a readable directory or file".format(candidate))


def get_arg_parser():
Expand All @@ -59,7 +64,8 @@ def get_arg_parser():
add_help=False,
prog='geoextent',
formatter_class=argparse.RawDescriptionHelpFormatter,
usage="geoextent [-h] [--formats] [--version] [--debug] [--details] [-b] [-t] [input file]']"
usage="geoextent [-h] [--formats] [--version] [--debug] [--details] [--output] [output file] [-b] [-t] [input "
"file]'] "
)

parser.add_argument(
Expand Down Expand Up @@ -93,6 +99,13 @@ def get_arg_parser():
help='Returns details of folder/zipFiles geoextent extraction',
)

parser.add_argument(
'--output',
action='store',
default=None,
help="Creates geopackage with geoextent output",
)

parser.add_argument(
'-b', '--bounding-box',
action='store_true',
Expand All @@ -110,7 +123,6 @@ def get_arg_parser():
parser.add_argument(
'files',
action=readable_file_or_dir,
default=os.getcwd(),
nargs=argparse.REMAINDER,
help="input file or path"
)
Expand Down Expand Up @@ -144,7 +156,7 @@ def main():

# version, help, and formats must be checked before parse, as otherwise files are required
# but arg parser gives an error if allowed to be parsed first
if "--help" in sys.argv:
if "--help" in sys.argv or "-h" in sys.argv:
print_help()
arg_parser.exit()
if "--version" in sys.argv:
Expand All @@ -156,24 +168,45 @@ def main():

args = vars(arg_parser.parse_args())
files = args['files']
logger.debug('Extracting from inputs %s', files)

if files is None:
raise Exception("Invalid command, input file missing")

multiple_files = True
logger.debug('Extracting from inputs %s', files)
# Set logging level
if args['debug']:
logging.getLogger('geoextent').setLevel(logging.DEBUG)
if os.environ.get('GEOEXTENT_DEBUG', None) == "1":
logging.getLogger('geoextent').setLevel(logging.DEBUG)

# Identify local file source
is_file = os.path.isfile(os.path.join(os.getcwd(), files))
is_zipfile = zipfile.is_zipfile(os.path.join(os.getcwd(), files))
is_directory = os.path.isdir(os.path.join(os.getcwd(), files))

# Identify URL
is_url = hf.https_regexp.match(files) is not None

# Check output path
export = args['output'] is not None

try:
if export:
filename = hf.path_output(args['output'])
except ValueError as e:
raise ValueError(e)

output = None
# Checking that the file exists happens in parser validation, see readable_file_or_dir
try:
if os.path.isfile(os.path.join(os.getcwd(), files)) and not zipfile.is_zipfile(
os.path.join(os.getcwd(), files)):

if is_file and not is_zipfile:
output = extent.fromFile(files, bbox=args['bounding_box'], tbox=args['time_box'])
if os.path.isdir(os.path.join(os.getcwd(), files)) or zipfile.is_zipfile(os.path.join(os.getcwd(), files)):
multiple_files = False
if is_directory or is_zipfile:
output = extent.fromDirectory(files, bbox=args['bounding_box'], tbox=args['time_box'], details=True)
if not args['details']:
output.pop('details', None)
if is_url:
output = extent.from_repository(files, bbox=args['bounding_box'], tbox=args['time_box'], details=True)

except Exception as e:
if logger.getEffectiveLevel() >= logging.DEBUG:
Expand All @@ -183,7 +216,15 @@ def main():
if output is None:
raise Exception("Did not find supported files at {}".format(files))
else:
logger.info("Output{}:".format(output))

if export and not multiple_files:
logger.warning("Exporting result does not apply to single files")
elif export and multiple_files:
logger.warning("Exporting result into: {}".format(args['output']))
df = hf.extract_output(output, files, current_version)
hf.create_geopackage(df, filename)
if not args['details']:
output.pop('details', None)

if type(output) == list:
print(str(output))
Expand Down
79 changes: 79 additions & 0 deletions geoextent/lib/content_providers/Zenodo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
from requests import HTTPError
from .providers import DoiProvider
from ..extent import *


class Zenodo(DoiProvider):
    """Content provider that downloads the files of a Zenodo record.

    Typical use: call ``validate_provider(reference)`` first; when it returns
    ``True`` the record id is known and ``download(folder)`` can fetch the
    record's files into ``folder``.
    """

    def __init__(self):
        super().__init__()
        self.log = logging.getLogger("geoextent")
        self.host = {"hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"],
                     "api": "https://zenodo.org/api/records/"
                     }
        self.reference = None
        self.record_id = None
        self.name = "Zenodo"

    def validate_provider(self, reference):
        """Return True if ``reference`` resolves to a Zenodo record URL.

        Stores ``reference`` on the instance and, on success, the record id
        (the last path segment of the resolved URL). On failure the record id
        is reset to ``None`` so a stale id from an earlier call is never reused.
        """
        self.reference = reference
        url = self.get_url
        # str.startswith accepts a tuple of prefixes: one call instead of any([...]).
        if url.startswith(tuple(self.host["hostname"])):
            self.record_id = url.rsplit("/", maxsplit=1)[1]
            return True
        self.record_id = None
        return False

    def _get_metadata(self):
        """Fetch the record's JSON metadata from the Zenodo API and cache it in ``self.record``.

        Raises:
            ValueError: if no record id is known (``validate_provider`` was not
                called or did not succeed).
            HTTPError: if the request or the JSON decoding fails.
        """
        # The original tested the bound method ``self.validate_provider``, which is
        # always truthy, so this guard never fired. Test the state that a
        # successful validation sets instead.
        if self.record_id is None:
            raise ValueError('Invalid content provider')

        try:
            resp = self._request(
                "{}{}".format(self.host["api"], self.record_id), headers={"accept": "application/json"}
            )
            resp.raise_for_status()
            self.record = resp.json()
        except Exception as err:  # not a bare except: let KeyboardInterrupt/SystemExit through
            m = "The zenodo record : https://zenodo.org/record/" + self.record_id + " does not exist"
            self.log.warning(m)
            raise HTTPError(m) from err
        return self.record

    @property
    def _get_file_links(self):
        """List of download URLs for every file of the record.

        Note: this is a property that performs a network request on each access.

        Raises:
            Exception: if the provider has not been validated.
            ValueError: if the record metadata has no ``files`` entry.
        """
        try:
            record = self._get_metadata()
        except ValueError as e:
            raise Exception(e) from e

        try:
            files = record['files']
        except (KeyError, TypeError) as err:
            m = "This record does not have Open Access files. Verify the Access rights of the record."
            self.log.warning(m)
            raise ValueError(m) from err

        return [entry['links']['download'] for entry in files]

    def download(self, folder):
        """Download every file of the record into the existing directory ``folder``.

        Raises:
            Exception: wrapping any ValueError raised while resolving or
                downloading the files (e.g. a record without open files).
        """
        self.log.debug("Downloading Zenodo record id: {} ".format(self.record_id))
        try:
            download_links = self._get_file_links
            total = len(download_links)
            for counter, file_link in enumerate(download_links, start=1):
                # Context manager releases the streamed connection even if writing fails.
                # NOTE(review): the HTTP status is not checked here, so an error
                # response body would be written to disk as if it were the file.
                with self.session.get(file_link, stream=True) as resp:
                    filename = os.path.split(resp.url)[1]
                    filepath = os.path.join(folder, filename)
                    with open(filepath, "wb") as dst:
                        for chunk in resp.iter_content(chunk_size=None):
                            dst.write(chunk)
                self.log.debug("{} out of {} files downloaded.".format(counter, total))
        except ValueError as e:
            raise Exception(e) from e
41 changes: 41 additions & 0 deletions geoextent/lib/content_providers/providers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from requests import Session, HTTPError
from geoextent.lib import helpfunctions as hf
import logging


class ContentProvider:
    """Base class for content providers; gives each instance the shared logger."""

    def __init__(self):
        # All providers log through the package-wide "geoextent" logger.
        shared_logger = logging.getLogger("geoextent")
        self.log = shared_logger


class DoiProvider(ContentProvider):
    """Content provider whose records are referenced by a DOI or an https link.

    Subclasses are expected to set ``self.reference`` before ``get_url`` or
    ``_type_of_reference`` is used.
    """

    def __init__(self):
        # Run the base initializer so ``self.log`` is available on every provider.
        super().__init__()
        self.session = Session()

    def _request(self, url, **kwargs):
        """Issue a GET request for ``url`` through the shared session."""
        return self.session.get(url, **kwargs)

    def _type_of_reference(self):
        """Classify ``self.reference`` as ``"DOI"``, ``'Link'`` or ``None`` (neither)."""
        if hf.doi_regexp.match(self.reference):
            return "DOI"
        elif hf.https_regexp.match(self.reference):
            return 'Link'
        return None

    @property
    def get_url(self):
        """URL the reference points to.

        For a DOI, the reference is resolved through https://doi.org and the
        final (redirected) URL is returned; if the resolver answers with an
        HTTP error, the DOI string itself is returned. Any other reference is
        returned unchanged.
        """
        doi_match = hf.doi_regexp.match(self.reference)
        if not doi_match:
            return self.reference

        # presumably group 2 of the pattern is the bare DOI — verify against helpfunctions
        doi = doi_match.group(2)
        try:
            resp = self._request("https://doi.org/{}".format(doi))
            resp.raise_for_status()
        except HTTPError:
            return doi
        return resp.url
Loading

0 comments on commit 5433223

Please sign in to comment.