diff --git a/assemble.cwl b/assemble.cwl index 12a852f..c7a47d3 100755 --- a/assemble.cwl +++ b/assemble.cwl @@ -2,7 +2,7 @@ class: Workflow cwlVersion: v1.2 doc: | - Assemble a set of reads using SKESA + Assemble a set of reads using SKESA requirements: - class: MultipleInputFeatureRequirement inputs: diff --git a/scripts/pgap.py b/scripts/pgap.py index c9b6b2c..4c17fa8 100755 --- a/scripts/pgap.py +++ b/scripts/pgap.py @@ -12,6 +12,7 @@ import argparse import atexit +import contextlib import glob import json import os @@ -24,7 +25,9 @@ import threading import time import tempfile -import contextlib +import xml +import xml.dom.minidom + from io import open from urllib.parse import urlparse, urlencode @@ -165,7 +168,7 @@ def __init__(self, params, local_input, pipeline): self.input_file = '/pgap/user_input/pgap_input.yaml' submol = self.get_submol(local_input) if ( submol != None ): - self.submol = self.create_submolfile(submol) + self.submol = self.create_submolfile(submol, params.ani_output, params.ani_hr_output, params.args.auto_correct_tax) else: self.submol = None self.yaml = self.create_inputfile(local_input) @@ -222,14 +225,17 @@ def make_docker_cmd(self): def make_podman_cmd(self): - self.cmd = [self.params.docker_cmd, 'run', '-i', '--rm' ] + self.cmd = [self.params.docker_cmd] + if self.params.args.debug: + self.cmd.extend(['--log-level', 'debug']) + self.cmd.extend(['run', '-i', '--rm' ]) self.cmd.extend([ - '--volume', '{}:/pgap/input:ro'.format(self.data_dir), - '--volume', '{}:/pgap/user_input'.format(self.input_dir), - '--volume', '{}:{}:ro'.format(self.yaml, self.input_file ), + '--volume', '{}:/pgap/input:ro,Z'.format(self.data_dir), + '--volume', '{}:/pgap/user_input:Z'.format(self.input_dir), + '--volume', '{}:{}:ro,Z'.format(self.yaml, self.input_file ), '--volume', '{}:/tmp:rw'.format(os.getenv("TMPDIR", "/tmp")), - '--volume', '{}:/pgap/output:rw'.format(self.params.outputdir)]) + '--volume', '{}:/pgap/output:rw,Z'.format(self.params.outputdir)]) if (self.params.args.memory): self.cmd.extend(['--memory', self.params.args.memory]) @@ -239,6 +245,8 @@ def make_podman_cmd(self): log_dir = self.params.outputdir + '/debug/log' os.makedirs(log_dir, exist_ok=True) self.cmd.extend(['--volume', '{}:/log/srv'.format(log_dir)]) + if self.params.args.container_name: + self.cmd.extend(['--name', self.params.args.container_name]) self.cmd.append(self.params.docker_image) def make_singularity_cmd(self): @@ -256,7 +264,11 @@ def make_singularity_cmd(self): log_dir = self.params.outputdir + '/debug/log' os.makedirs(log_dir, exist_ok=True) self.cmd.extend(['--bind', '{}:/log/srv'.format(log_dir)]) - self.cmd.extend(["--pwd", "/pgap", "docker://" + self.params.docker_image]) + + if self.params.args.container_path: + self.cmd.extend(["--pwd", "/pgap", self.params.docker_image]) + else: + self.cmd.extend(["--pwd", "/pgap", "docker://" + self.params.docker_image]) def get_submol(self, local_input): with open(local_input, 'r') as fIn: @@ -278,11 +290,35 @@ def regexp_file(self, filename, field): for line in fIn: if(re.search(field,line)): return True - return False; - - def create_submolfile(self, local_submol): + return False + def get_genus_species(self, xml_file): + genus_species = None + try: + doc = xml.dom.minidom.parse(xml_file) + predicted_taxid = doc.getElementsByTagName('predicted-taxid')[0] + if predicted_taxid.getAttribute('confidence')=='HIGH': + genus_species = predicted_taxid.getAttribute('org-name') + except: + genus_species = None + return genus_species + + def create_submolfile(self, local_submol, ani_output, ani_hr_output, auto_correct_tax): has_authors = self.regexp_file(local_submol, '^authors:') has_contact_info = self.regexp_file(local_submol, '^contact_info:') + genus_species = None + if auto_correct_tax: + if ani_output != None: + genus_species = self.get_genus_species(ani_output) + if genus_species != None: + print('ANI analysis detected species "{}", and we will use it for PGAP'.format(genus_species)) + else: + if ani_hr_output != None: + chosen_ani_output_type = ani_hr_output + else: + chosen_ani_output_type = ani_output + print('ERROR: taxcheck failed to assign a species with high confidence, thus PGAP will not execute. See {}'.format(chosen_ani_output_type)) + + sys.exit(1) with tempfile.NamedTemporaryFile(mode='w', suffix=".yaml", prefix="pgap_submol_", @@ -292,6 +328,10 @@ def create_submolfile(self, local_submol): with open(local_submol, 'r') as fIn: for line in fIn: if line: # skip empty lines + if auto_correct_tax and genus_species != None and re.match(r'\s+genus_species:', line): + print('replacing organism in line: {}'.format(line.rstrip())) + line = re.sub(r'genus_species:.*', "genus_species: '{}'".format(genus_species), line) + print('with organism: {}'.format(genus_species)) fOut.write(line.rstrip()) fOut.write(u'\n') if has_authors == False: @@ -354,7 +394,8 @@ def check_runtime_setting(settings, value, min): print('WARNING: {} is less than the recommended value of {}'.format(value, min)) if (self.params.docker_type == 'singularity'): - cmd = [self.params.docker_cmd, 'exec', '--bind', '{}:/cwd:ro'.format(os.getcwd()), "docker://"+self.params.docker_image, + singularity_docker_image = self.params.docker_image if self.params.args.container_path else "docker://"+self.params.docker_image + cmd = [self.params.docker_cmd, 'exec', '--bind', '{}:/cwd:ro'.format(os.getcwd()), singularity_docker_image, 'bash', '-c', 'df -k /cwd /tmp ; ulimit -a ; cat /proc/{meminfo,cpuinfo}'] else: cmd = [self.params.docker_cmd, 'run', '-i', '-v', '{}:/cwd'.format(os.getcwd()), self.params.docker_image, @@ -438,6 +479,9 @@ class Setup: def __init__(self, args): self.args = args + self.ani_output = None + # human readable ANI output file + self.ani_hr_output = None self.branch = self.get_branch() self.repo = self.get_repo() self.rundir = self.get_dir() @@ -455,7 +499,10 @@ def __init__(self, args): self.list_remote_versions() return self.use_version = self.get_use_version() - self.docker_image = "ncbi/{}:{}".format(self.repo, self.use_version) + if self.args.container_path: + self.docker_image = self.args.container_path + else: + self.docker_image = "ncbi/{}:{}".format(self.repo, self.use_version) self.data_path = '{}/input-{}'.format(self.rundir, self.use_version) self.test_genomes_path = '{}/test_genomes-{}'.format(self.rundir, self.use_version) self.outputdir = self.get_output_dir() @@ -647,6 +694,7 @@ def check_install_data(self): packages = ['pgap'] for package in packages: guard_file = f"{self.rundir}/input-{self.use_version}/.{package}_complete" + return True # DEBUG only do not commit if not os.path.isfile(guard_file): return False return True @@ -776,6 +824,9 @@ def main(): parser.add_argument("--container-name", dest='container_name', help='Specify a container name that will be used instead of automatically generated.') + parser.add_argument("--container-path", + dest='container_path', + help='Override path to image.') parser.add_argument("--ignore-all-errors", dest='ignore_all_errors', action='store_true', @@ -784,6 +835,12 @@ def main(): dest='no_internet', action='store_true', help='Disable internet access for all programs in pipeline.') + parser.add_argument("--auto-correct-tax", + dest='auto_correct_tax', + action='store_true', + help=''' + If flag is set, run ANI first, then PGAP, overriding the organism name provided by the user in the input YAML with the value returned by ANI Predicted organism. Obviously both actions need to be requested for this flag to take effect + ''') parser.add_argument('-D', '--docker', metavar='path', help='Docker-compatible executable (e.g. docker, podman, singularity), which may include a full path like /usr/bin/docker') parser.add_argument('-o', '--output', metavar='path', default='output', @@ -814,12 +871,36 @@ def main(): retcode = p.launch() p.cleaunup() if args.ignore_all_errors == False: + # args.output for some reason not always available + time.sleep(1) # analyze ani output here if not os.path.exists(args.output): - raise + print("INTERNAL(SYSTEM)PROBLEM: abort: output directory does not exist: {}".format(args.output)) + sys.exit(1) + params.ani_output = os.path.join(args.output, "ani-tax-report.xml") + params.ani_hr_output = os.path.join(args.output, "ani-tax-report.txt") + if os.path.exists(params.ani_output) and os.path.getsize(params.ani_output) > 0: + True + else: + params.ani_output = None + if os.path.exists(params.ani_hr_output) and os.path.getsize(params.ani_hr_output) > 0: + True + else: + params.ani_hr_output = None + errors_xml_fn = os.path.join(args.output, "errors.xml") - if os.path.exists(errors_xml_fn) and os.path.getsize(errors_xml_fn) > 0: - raise + # if there are errors + # and we do not want to recover them when it is recoverable + # then bail + if os.path.exists(errors_xml_fn) and os.path.getsize(errors_xml_fn) > 0 and not ( args.auto_correct_tax and params.ani_output != None ) : + error_file = None + if params.ani_output != None: + error_file = params.ani_output + else: + error_file = errors_xml_fn + print("ERROR: taxcheck calls the genome misassigned or contaminated, thus PGAP will not execute. See {}".format(error_file)) + sys.exit(1) + if not args.ani_only: p = Pipeline(params, args.input, "pgap") retcode = p.launch()