Skip to content

Commit

Permalink
Merge remote-tracking branch 'refs/remotes/origin/ga-model-store' int…
Browse files Browse the repository at this point in the history
…o ga-model-store
  • Loading branch information
pauldg committed Apr 6, 2022
2 parents 39b1d7e + 81b1e1e commit f56f974
Show file tree
Hide file tree
Showing 24 changed files with 15,538 additions and 4,762 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ jinja2
python-dateutil
click
prov
schema_salad
typing-extensions
83 changes: 35 additions & 48 deletions rocrate/provenance_profile.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,12 @@
import copy
import pdb
import datetime
import logging
import urllib
import uuid
import json
from io import BytesIO
from pathlib import PurePath, PurePosixPath
from socket import getfqdn
from typing import (
Any,
Dict,
Iterable,
List,
MutableMapping,
MutableSequence,
Optional,
Tuple,
Expand All @@ -23,9 +16,7 @@

from prov.identifier import Identifier
from prov.model import PROV, PROV_LABEL, PROV_TYPE, PROV_VALUE, ProvDocument, ProvEntity
from schema_salad.sourceline import SourceLine
from typing_extensions import TYPE_CHECKING
from tools.load_ga_export import load_ga_history_export, GalaxyJob, GalaxyDataset
from tools.load_ga_export import load_ga_history_export, GalaxyJob
from ast import literal_eval
import os

Expand All @@ -36,16 +27,12 @@
from rocrate.provenance_constants import (
ACCOUNT_UUID,
CWLPROV,
ENCODING,
FOAF,
METADATA,
ORE,
PROVENANCE,
RO,
SCHEMA,
SHA1,
SHA256,
TEXT_PLAIN,
UUID,
WF4EVER,
WFDESC,
Expand All @@ -59,15 +46,17 @@
# from rocrate.provenance import ResearchObject

from pathlib import Path
import rocrate.rocrate as roc


def posix_path(local_path: str) -> str:
return str(PurePosixPath(Path(local_path)))


def remove_escapes(s):
escapes = ''.join([chr(char) for char in range(1, 32)])
translator = str.maketrans('', '', escapes)
t = s.translate(translator)
s.translate(translator)


def reassign(d):
for k, v in d.items():
Expand All @@ -78,16 +67,17 @@ def reassign(d):
except ValueError:
pass


class ProvenanceProfile:
"""
"""\
Provenance profile.
Populated from a galaxy workflow export.
"""

def __init__(
self,
ga_export: Dict,
ga_export: Dict,
full_name: str = None,
orcid: str = None,
# prov_name: str = None,
Expand All @@ -112,12 +102,11 @@ def __init__(
self.base_uri = "arcp://uuid,%s/" % self.ro_uuid
self.document = ProvDocument()
# TODO extract engine_uuid from galaxy, type: str
self.engine_uuid = "urn:uuid:%s" % uuid.uuid4() #type: str
self.engine_uuid = "urn:uuid:%s" % uuid.uuid4() # type: str
self.full_name = full_name
self.workflow_run_uuid = run_uuid or uuid.uuid4()
self.workflow_run_uri = self.workflow_run_uuid.urn # type: str

# move to separate function
# move to separate function
metadata_export = load_ga_history_export(ga_export)
self.generate_prov_doc()
self.jobs = []
Expand Down Expand Up @@ -153,7 +142,7 @@ def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
# PROV_TYPE: FOAF["OnlineAccount"],
# TODO: change how we register galaxy version, probably a declare_version func
# self.galaxy_version = self.ga_export["jobs_attrs"][0]["galaxy_version"]
# TODO: change notation to already imported namespaces?
# TODO: change notation to already imported namespaces?
self.document.add_namespace("wfprov", "http://purl.org/wf4ever/wfprov#")
# document.add_namespace('prov', 'http://www.w3.org/ns/prov#')
self.document.add_namespace("wfdesc", "http://purl.org/wf4ever/wfdesc#")
Expand All @@ -176,7 +165,7 @@ def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
"provenance", self.base_uri + posix_path(PROVENANCE) + "/"
)
# TODO: use appropriate refs for ga_export and related inputs
ro_identifier_workflow = self.base_uri + "ga_export" + "/"
ro_identifier_workflow = self.base_uri + "ga_export" + "/"
self.wf_ns = self.document.add_namespace("wf", ro_identifier_workflow)
ro_identifier_input = (
self.base_uri + "ga_export/datasets#"
Expand Down Expand Up @@ -240,15 +229,15 @@ def declare_process(
"""Record the start of each Process."""
if process_run_id is None:
process_run_id = uuid.uuid4().urn
cmd = ga_export_jobs_attrs["command_line"]

# cmd = ga_export_jobs_attrs["command_line"]
process_name = ga_export_jobs_attrs["tool_id"]
tool_version = ga_export_jobs_attrs["tool_version"]
prov_label = "Run of " + process_name
start_time = ga_export_jobs_attrs["create_time"]
end_time = ga_export_jobs_attrs["update_time"]

#TODO: Find out how to include commandline as a string
# TODO: Find out how to include commandline as a string
# cmd = self.document.entity(
# uuid.uuid4().urn,
# {PROV_TYPE: WFPROV["Artifact"], PROV_LABEL: ga_export_jobs_attrs["command_line"]}
Expand All @@ -259,9 +248,9 @@ def declare_process(
start_time,
end_time,
{
PROV_TYPE: WFPROV["ProcessRun"],
PROV_LABEL: prov_label,
#TODO: Find out how to include commandline as a string
PROV_TYPE: WFPROV["ProcessRun"],
PROV_LABEL: prov_label,
# TODO: Find out how to include commandline as a string
# PROV_LABEL: cmd
},
)
Expand Down Expand Up @@ -289,7 +278,7 @@ def used_artefacts(
base += "/" + process_name
tool_id = process_metadata["tool_id"]
base += "/" + tool_id
items = ["inputs","outputs","parameters"]
items = ["inputs", "outputs", "parameters"]
# print(process_metadata["params"])
for item in items:
# print(item)
Expand Down Expand Up @@ -317,7 +306,6 @@ def used_artefacts(

# for artefact in value:
try:
# pdb.set_trace()
entity = self.declare_artefact(value)
self.document.used(
process_run_id,
Expand Down Expand Up @@ -356,7 +344,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
# byte_s = BytesIO(value)
# data_file = self.research_object.add_data_file(byte_s)
# FIXME: Don't naively assume add_data_file uses hash in filename!
data_id = "data:%s" % str(value) #PurePosixPath(data_file).stem
data_id = "data:%s" % str(value) # PurePosixPath(data_file).stem
return self.document.entity(
data_id,
{PROV_TYPE: WFPROV["Artifact"], PROV_VALUE: str(value)},
Expand Down Expand Up @@ -394,7 +382,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
)

if value.get("class"):
#_logger.warning("Unknown data class %s.", value["class"])
# _logger.warning("Unknown data class %s.", value["class"])
# FIXME: The class might be "http://example.com/somethingelse"
coll.add_asserted_type(CWLPROV[value["class"]])

Expand All @@ -404,7 +392,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
# clean up unwanted characters
if isinstance(key, str):
key = key.replace("|", "_")
if isinstance(val, str):
if isinstance(val, str):
val = val.replace("|", "_")

v_ent = self.declare_artefact(val)
Expand Down Expand Up @@ -451,7 +439,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
# FIXME: list value does not support adding "@id"
return coll
except TypeError:
#_logger.warning("Unrecognized type %s of %r", type(value), value)
# _logger.warning("Unrecognized type %s of %r", type(value), value)
# Let's just fall back to Python repr()
entity = self.document.entity(uuid.uuid4().urn, {PROV_LABEL: repr(value)})
# self.research_object.add_uri(entity.identifier.uri)
Expand All @@ -466,7 +454,7 @@ def declare_file(self, value: Dict) -> Tuple[ProvEntity, ProvEntity, str]:
if "checksum" in value:
csum = cast(str, value["checksum"])
(method, checksum) = csum.split("$", 1)
if method == SHA1: # and self.research_object.has_data_file(checksum):
if method == SHA1: # and self.research_object.has_data_file(checksum):
entity = self.document.entity("data:" + checksum)

if not entity and "location" in value:
Expand Down Expand Up @@ -513,8 +501,8 @@ def declare_file(self, value: Dict) -> Tuple[ProvEntity, ProvEntity, str]:

# Check for secondaries
for sec in cast(
# MutableSequence[CWLObjectType],
value.get("secondaryFiles", [])
# MutableSequence[CWLObjectType],
value.get("secondaryFiles", []) # noqa
):
# TODO: Record these in a specializationOf entity with UUID?
if sec["class"] == "File":
Expand All @@ -535,8 +523,10 @@ def declare_file(self, value: Dict) -> Tuple[ProvEntity, ProvEntity, str]:

return file_entity, entity, checksum

def declare_directory(self
# , value: CWLObjectType
def declare_directory(
self,
# value: CWLObjectType
value
) -> ProvEntity:
"""Register any nested files/directories."""
# FIXME: Calculate a hash-like identifier for directory
Expand Down Expand Up @@ -647,12 +637,11 @@ def declare_string(self, value: str) -> Tuple[ProvEntity, str]:
# checksum = PurePosixPath(data_file).name
# FIXME: Don't naively assume add_data_file uses hash in filename!
value = str(value).replace("|", "_")
data_id = "data:%s" % str(value) #PurePosixPath(data_file).stem
data_id = "data:%s" % str(value) # PurePosixPath(data_file).stem
entity = self.document.entity(
data_id, {PROV_TYPE: WFPROV["Artifact"], PROV_VALUE: str(value)}
) # type: ProvEntity
return entity #, checksum

return entity # , checksum

def generate_output_prov(
self,
Expand Down Expand Up @@ -735,7 +724,7 @@ def activity_has_provenance(self, activity, prov_ids):
self.document.activity(activity, other_attributes=attribs)
# Tip: we can't use https://www.w3.org/TR/prov-links/#term-mention
# as prov:mentionOf() is only for entities, not activities
uris = [i.uri for i in prov_ids]
# uris = [i.uri for i in prov_ids]
# self.research_object.add_annotation(activity, uris, PROV["has_provenance"].uri)

def finalize_prov_profile(self, name=None, out_path=None):
Expand Down Expand Up @@ -770,7 +759,7 @@ def finalize_prov_profile(self, name=None, out_path=None):

# https://www.w3.org/TR/prov-xml/
# serialized_prov_docs["xml"] = self.document.serialize(format="xml", indent=4)
prov_ids.append(self.provenance_ns[filename + ".xml"])
prov_ids.append(self.provenance_ns[filename + ".xml"])
with open(basename + ".xml", "w") as provenance_file:
self.document.serialize(provenance_file, format="xml", indent=4)

Expand All @@ -779,7 +768,6 @@ def finalize_prov_profile(self, name=None, out_path=None):
prov_ids.append(self.provenance_ns[filename + ".provn"])
with open(basename + ".provn", "w") as provenance_file:
self.document.serialize(provenance_file, format="provn", indent=2)


# https://www.w3.org/Submission/prov-json/
# serialized_prov_docs["json"] = self.document.serialize(format="json", indent=2)
Expand Down Expand Up @@ -810,7 +798,6 @@ def finalize_prov_profile(self, name=None, out_path=None):
prov_ids.append(self.provenance_ns[filename + ".jsonld"])
with open(basename + ".jsonld", "w") as provenance_file:
self.document.serialize(provenance_file, format="rdf", rdf_format="json-ld")


#_logger.debug("[provenance] added provenance: %s", prov_ids)
# _logger.debug("[provenance] added provenance: %s", prov_ids)
return (serialized_prov_docs, prov_ids)
4 changes: 2 additions & 2 deletions rocrate/rocrate_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
# limitations under the License.

from pathlib import Path
import os

import rocrate.rocrate as roc
from rocrate.provenance_profile import ProvenanceProfile
Expand Down Expand Up @@ -79,6 +78,7 @@ def make_workflow_rocrate(workflow_path, wf_type, include_files=[],

return wf_crate


# WIP
def make_workflow_run_rocrate(workflow_path, wf_type, wfr_metadata_path, author=None, orcid=None,
include_files=[], fetch_remote=False, prov_name=None, prov_path=None, cwl=None, diagram=None):
Expand Down Expand Up @@ -110,4 +110,4 @@ def make_workflow_run_rocrate(workflow_path, wf_type, wfr_metadata_path, author=
for file_entry in include_files:
wfr_crate.add_file(file_entry)

return wfr_crate
return wfr_crate
Loading

0 comments on commit f56f974

Please sign in to comment.