Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use jsonpath to collect values when downloading a file #425

Merged
merged 3 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions kgforge/core/archetypes/read_only_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
)
from kgforge.core.commons.execution import not_supported
from kgforge.core.commons.sparql_query_builder import SPARQLQueryBuilder
from kgforge.core.reshaping import collect_values
from kgforge.core.reshaping import collect_values, collect_values_jp
from kgforge.core.wrappings import Filter
from kgforge.core.wrappings.dict import DictWrapper

Expand Down Expand Up @@ -107,9 +107,13 @@ def download(
# path: DirPath.
urls = []
store_metadata = []
constraint_dict = None
if content_type:
constraint_dict = {'encodingFormat': content_type}
to_download = [data] if isinstance(data, Resource) else data
for d in to_download:
collected_values = collect_values(d, follow, DownloadingError)
# collected_values = collect_values(d, follow, DownloadingError)
collected_values = collect_values_jp(d, follow, DownloadingError, constraint_dict)
urls.extend(collected_values)
store_metadata.extend(
[d._store_metadata for _ in range(len(collected_values))]
Expand Down
28 changes: 27 additions & 1 deletion kgforge/core/reshaping.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
# You should have received a copy of the GNU Lesser General Public License
# along with Blue Brain Nexus Forge. If not, see <https://choosealicense.com/licenses/lgpl-3.0/>.

from typing import Dict, Iterator, List, Union, Type
from typing import Dict, Iterator, List, Union, Type, Optional
import jsonpath_ng as jp

from kgforge.core.resource import Resource
from kgforge.core.commons.attributes import repr_class
Expand Down Expand Up @@ -106,3 +107,28 @@ def _collect(things: List) -> Iterator[str]:
raise exception(
f"An error occur when collecting values for path to follow '{follow}': {str(e)}"
) from e


def collect_values_jp(data: Resource, follow: str,
                      exception: Type[Exception] = Exception,
                      constraint_dict: Optional[Dict] = None) -> List[str]:
    """Collect the values found at a dotted property path using a JSONPath query.

    The dotted path ``a.b.c`` is turned into the JSONPath expression
    ``$.a[*].b[*].c`` and evaluated against the output of ``as_json(data, ...)``.

    :param data: the resource to collect values from.
    :param follow: the dot-separated property path to follow (e.g. ``hasPart.url``).
    :param exception: the exception type raised when anything goes wrong.
    :param constraint_dict: optional mapping holding exactly one ``{key: value}``
        pair. When given, a matched value is kept only if the object it was found
        in also holds ``key`` with a value equal to ``value``. Objects that do not
        hold ``key`` at all are treated as non-matching.
    :return: the list of matched values; it can be empty when a constraint
        filters out every match.
    :raises exception: when the path matches nothing, when more than one
        constraint is given, or when any other error occurs while collecting.
    """
    try:
        pattern = "$." + "[*].".join(follow.split('.'))
        jp_query = jp.parse(pattern)
        payload = as_json(data, False, False, None, None, None)
        matches = jp_query.find(payload)
        if not matches:
            raise exception("Path not found")
        if not constraint_dict:
            return [match.value for match in matches]
        if len(constraint_dict) != 1:
            raise NotImplementedError("Only one constraint can be imposed at the moment")
        [(key, expected)] = constraint_dict.items()
        selected = []
        for match in matches:
            # match.context is the datum the value was found in (its parent object).
            parent = match.context.value
            # A parent lacking the constraint key is a non-match, not an error,
            # so one incomplete item does not abort the whole collection.
            if key in parent and parent[key] == expected:
                selected.append(match.value)
        return selected

    except Exception as e:
        # Every failure (including the ones raised above) is reported with the
        # caller-provided exception type and the path that was being followed.
        raise exception(
            f"An error occur when collecting values for path to follow '{follow}': {str(e)}"
        ) from e
2 changes: 0 additions & 2 deletions kgforge/specializations/stores/bluebrain_nexus.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,8 +601,6 @@ def _download_one(
params_download = copy.deepcopy(self.service.params.get("download", {}))
headers = (
self.service.headers_download
if not content_type
else update_dict(self.service.headers_download, {"Accept": content_type})
)

response = requests.get(
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@
"owlrl>=5.2.3",
"elasticsearch_dsl==7.4.0",
"requests==2.32.0",
"typing-extensions"
"typing-extensions",
"jsonpath-ng"
],
extras_require={
"dev": [
Expand Down
28 changes: 27 additions & 1 deletion tests/core/test_reshaping.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import pytest
from kgforge.core.resource import Resource
from kgforge.core.forge import KnowledgeGraphForge
from kgforge.core.reshaping import collect_values, Reshaper
from kgforge.core.reshaping import collect_values, collect_values_jp, Reshaper


def test_collect_values():
Expand All @@ -43,6 +43,32 @@ def test_collect_values():
collect_values(None, "hasPart.url",ValueError)


def test_collect_values_jasonpath():
    """Exercise collect_values_jp on flat, nested, list-valued and constrained paths."""
    # A property sitting directly on the resource.
    flat = Resource(type="Experiment", url="file.gz")
    found = collect_values_jp(flat, "url")
    assert flat.url in found, "url should be in the list"

    # A property reached through two levels of nested resources.
    nested = Resource(type="Experiment", level1=Resource(level2=Resource(url="file.gz")))
    found = collect_values_jp(nested, "level1.level2.url")
    assert nested.level1.level2.url in found, "url should be in the list"

    # A list of parts where only some of them hold the requested property.
    parts = [Resource(type="Experiment", url=f"file{i}") for i in range(3)]
    parts.append(Resource(type="Experiment", contentUrl="file3"))
    dataset = Resource(type="Dataset", hasPart=parts)
    found = collect_values_jp(dataset, "hasPart.contentUrl")
    assert found == ["file3"], "one element should be in the list"
    found = collect_values_jp(dataset, "hasPart.url")
    assert found == ["file0", "file1", "file2"], "three elements should be in the list"

    # A single constraint keeps only the parts with the matching encodingFormat.
    parts = [
        Resource(type="Experiment", url=f"file{i}", encodingFormat=f"application/{ext}")
        for i, ext in enumerate(['csv', 'swc'])
    ]
    dataset = Resource(type="Dataset", hasPart=parts)
    swc_only = {'encodingFormat': 'application/swc'}
    found = collect_values_jp(dataset, "hasPart.url", constraint_dict=swc_only)
    assert found == ["file1"], "only the file with encodingFormat `application/swc` must be returned"

    # More than one constraint is rejected.
    two_constraints = {'encodingFormat': 'application/swc', 'contentUrl': 'something'}
    with pytest.raises(Exception):
        collect_values_jp(dataset, "hasPart.url", constraint_dict=two_constraints)

    # A path matching nothing is an error.
    with pytest.raises(Exception):
        collect_values_jp(dataset, "fake.path")

    # The caller-provided exception type is the one raised.
    with pytest.raises(ValueError):
        collect_values_jp(None, "hasPart.url", ValueError)


def test_reshape(config):
forge = KnowledgeGraphForge(config)
reshaper = Reshaper(versioned_id_template="{x.id}?_version={x._store_metadata.version}")
Expand Down
Loading