Skip to content

Commit

Permalink
Merge pull request #203 from rhysnewell/dev
Browse files Browse the repository at this point in the history
merging v0.9.1
  • Loading branch information
rhysnewell authored May 14, 2024
2 parents 7832126 + 86656d0 commit 387ac6d
Show file tree
Hide file tree
Showing 13 changed files with 247 additions and 104 deletions.
26 changes: 25 additions & 1 deletion aviary/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,25 @@
__version__ = "0.9.0"
__version__ = "0.9.1"


# CONSTANTS
LONG_READ_TYPES = ["ont", "ont_hq", "rs", "sq", "ccs", "hifi"]
MEDAKA_MODELS = [
"r103_fast_g507", "r103_fast_snp_g507", "r103_fast_variant_g507", "r103_hac_g507", "r103_hac_snp_g507",
"r103_hac_variant_g507", "r103_min_high_g345", "r103_min_high_g360", "r103_prom_high_g360", "r103_prom_snp_g3210",
"r103_prom_variant_g3210", "r103_sup_g507", "r103_sup_snp_g507", "r103_sup_variant_g507", "r1041_e82_260bps_fast_g632",
"r1041_e82_260bps_fast_variant_g632", "r1041_e82_260bps_hac_g632", "r1041_e82_260bps_hac_variant_g632", "r1041_e82_260bps_sup_g632",
"r1041_e82_260bps_sup_variant_g632", "r1041_e82_400bps_fast_g615", "r1041_e82_400bps_fast_g632",
"r1041_e82_400bps_fast_variant_g615", "r1041_e82_400bps_fast_variant_g632", "r1041_e82_400bps_hac_g615",
"r1041_e82_400bps_hac_g632", "r1041_e82_400bps_hac_variant_g615", "r1041_e82_400bps_hac_variant_g632", "r1041_e82_400bps_sup_g615",
"r1041_e82_400bps_sup_variant_g615", "r104_e81_fast_g5015", "r104_e81_fast_variant_g5015", "r104_e81_hac_g5015",
"r104_e81_hac_variant_g5015", "r104_e81_sup_g5015", "r104_e81_sup_g610", "r104_e81_sup_variant_g610", "r10_min_high_g303",
"r10_min_high_g340", "r941_e81_fast_g514", "r941_e81_fast_variant_g514", "r941_e81_hac_g514", "r941_e81_hac_variant_g514",
"r941_e81_sup_g514", "r941_e81_sup_variant_g514", "r941_min_fast_g303", "r941_min_fast_g507", "r941_min_fast_snp_g507",
"r941_min_fast_variant_g507", "r941_min_hac_g507", "r941_min_hac_snp_g507", "r941_min_hac_variant_g507", "r941_min_high_g303",
"r941_min_high_g330", "r941_min_high_g340_rle", "r941_min_high_g344", "r941_min_high_g351", "r941_min_high_g360", "r941_min_sup_g507",
"r941_min_sup_snp_g507", "r941_min_sup_variant_g507", "r941_prom_fast_g303", "r941_prom_fast_g507", "r941_prom_fast_snp_g507",
"r941_prom_fast_variant_g507", "r941_prom_hac_g507", "r941_prom_hac_snp_g507", "r941_prom_hac_variant_g507", "r941_prom_high_g303",
"r941_prom_high_g330", "r941_prom_high_g344", "r941_prom_high_g360", "r941_prom_high_g4011", "r941_prom_snp_g303", "r941_prom_snp_g322",
"r941_prom_snp_g360", "r941_prom_sup_g507", "r941_prom_sup_snp_g507", "r941_prom_sup_variant_g507", "r941_prom_variant_g303",
"r941_prom_variant_g322", "r941_prom_variant_g360", "r941_sup_plant_g610", "r941_sup_plant_variant_g610"
]
45 changes: 15 additions & 30 deletions aviary/aviary.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
###############################################################################
import aviary.config.config as Config
from aviary.modules.processor import Processor, process_batch
from .__init__ import __version__
from .__init__ import __version__, MEDAKA_MODELS, LONG_READ_TYPES
__author__ = "Rhys Newell"
__copyright__ = "Copyright 2022"
__credits__ = ["Rhys Newell"]
Expand All @@ -35,7 +35,6 @@
import logging
import os
from datetime import datetime
import tempfile

# Debug
debug={1:logging.CRITICAL,
Expand Down Expand Up @@ -198,7 +197,7 @@ def main():
help='Path to the location that will be treated used for temporary files. If none is specified, the TMPDIR \n'
'environment variable will be used. Can be configured within the `configure` subcommand',
dest='tmpdir',
default=tempfile.gettempdir(),
default=None,
)

base_group.add_argument(
Expand Down Expand Up @@ -478,34 +477,15 @@ def main():
'reads, "ont" for Oxford Nanopore and "ont_hq" for Oxford Nanopore high quality reads (Guppy5+ or Q20) \n',
dest='longread_type',
default="ont",
choices=["ont","ont_hq", "rs", "sq", "ccs", "hifi"],
choices=LONG_READ_TYPES,
)

long_read_group.add_argument(
'--medaka-model', '--medaka_model',
help='Medaka model to use for polishing long reads. \n',
dest='medaka_model',
default="r941_min_hac_g507",
choices=[
"r103_fast_g507", "r103_fast_snp_g507", "r103_fast_variant_g507", "r103_hac_g507", "r103_hac_snp_g507",
"r103_hac_variant_g507", "r103_min_high_g345", "r103_min_high_g360", "r103_prom_high_g360", "r103_prom_snp_g3210",
"r103_prom_variant_g3210", "r103_sup_g507", "r103_sup_snp_g507", "r103_sup_variant_g507", "r1041_e82_260bps_fast_g632",
"r1041_e82_260bps_fast_variant_g632", "r1041_e82_260bps_hac_g632", "r1041_e82_260bps_hac_variant_g632", "r1041_e82_260bps_sup_g632",
"r1041_e82_260bps_sup_variant_g632", "r1041_e82_400bps_fast_g615", "r1041_e82_400bps_fast_g632",
"r1041_e82_400bps_fast_variant_g615", "r1041_e82_400bps_fast_variant_g632", "r1041_e82_400bps_hac_g615",
"r1041_e82_400bps_hac_g632", "r1041_e82_400bps_hac_variant_g615", "r1041_e82_400bps_hac_variant_g632", "r1041_e82_400bps_sup_g615",
"r1041_e82_400bps_sup_variant_g615", "r104_e81_fast_g5015", "r104_e81_fast_variant_g5015", "r104_e81_hac_g5015",
"r104_e81_hac_variant_g5015", "r104_e81_sup_g5015", "r104_e81_sup_g610", "r104_e81_sup_variant_g610", "r10_min_high_g303",
"r10_min_high_g340", "r941_e81_fast_g514", "r941_e81_fast_variant_g514", "r941_e81_hac_g514", "r941_e81_hac_variant_g514",
"r941_e81_sup_g514", "r941_e81_sup_variant_g514", "r941_min_fast_g303", "r941_min_fast_g507", "r941_min_fast_snp_g507",
"r941_min_fast_variant_g507", "r941_min_hac_g507", "r941_min_hac_snp_g507", "r941_min_hac_variant_g507", "r941_min_high_g303",
"r941_min_high_g330", "r941_min_high_g340_rle", "r941_min_high_g344", "r941_min_high_g351", "r941_min_high_g360", "r941_min_sup_g507",
"r941_min_sup_snp_g507", "r941_min_sup_variant_g507", "r941_prom_fast_g303", "r941_prom_fast_g507", "r941_prom_fast_snp_g507",
"r941_prom_fast_variant_g507", "r941_prom_hac_g507", "r941_prom_hac_snp_g507", "r941_prom_hac_variant_g507", "r941_prom_high_g303",
"r941_prom_high_g330", "r941_prom_high_g344", "r941_prom_high_g360", "r941_prom_high_g4011", "r941_prom_snp_g303", "r941_prom_snp_g322",
"r941_prom_snp_g360", "r941_prom_sup_g507", "r941_prom_sup_snp_g507", "r941_prom_sup_variant_g507", "r941_prom_variant_g303",
"r941_prom_variant_g322", "r941_prom_variant_g360", "r941_sup_plant_g610", "r941_sup_plant_variant_g610"
]
choices=MEDAKA_MODELS
)

long_read_group.add_argument(
Expand Down Expand Up @@ -1078,14 +1058,14 @@ def main():
aviary batch -f batch_file.tsv -t 32 -o batch_test
An example batch file can be found at:
An example batch file can be found at: https://rhysnewell.github.io/aviary/examples
''')

batch_options.add_argument(
'-f', '--batch_file', '--batch-file',
help='The tab or comma separated batch file containing the input samples to assemble and/or recover MAGs from. \n'
'An example batch file can be found at XXX. The heading line is required. \n'
'An example batch file can be found at https://rhysnewell.github.io/aviary/examples. The heading line is required. \n'
'The number of reads provided to each sample is flexible as is the type of assembly being performed (if any). \n'
'Multiple reads can be supplied by providing a comma-separated list (surrounded by double quotes \"\" if using a \n'
'comma separated batch file) within the specific read column.',
Expand All @@ -1109,7 +1089,7 @@ def main():
type=str2bool,
nargs='?',
const=True,
default=True
default=False
)

batch_options.add_argument(
Expand All @@ -1136,6 +1116,14 @@ def main():
default='95'
)

batch_options.add_argument(
'--medaka-model', '--medaka_model',
help='Medaka model to use for polishing long reads. \n',
dest='medaka_model',
default="r941_min_hac_g507",
choices=MEDAKA_MODELS
)

add_workflow_arg(
batch_options,
['get_bam_indices', 'recover_mags', 'annotate', 'lorikeet'],
Expand Down Expand Up @@ -1290,9 +1278,6 @@ def manage_env_vars(args):
if args.conda_prefix is None:
args.conda_prefix = Config.get_software_db_path('CONDA_ENV_PATH', '--conda-prefix')

if args.tmpdir is None:
args.tmpdir = Config.get_software_db_path('TMPDIR', '--tmpdir')

try:
if args.gtdb_path is None:
args.gtdb_path = Config.get_software_db_path('GTDBTK_DATA_PATH', '--gtdb-path')
Expand Down
2 changes: 1 addition & 1 deletion aviary/modules/annotation/annotation.smk
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ rule eggnog:
params:
mag_extension = config['mag_extension'],
eggnog_db = config['eggnog_folder'],
tmpdir = config["tmpdir"]
tmpdir = config["tmpdir"] if config["tmpdir"] else "$TMPDIR",
output:
done = 'data/eggnog/done'
threads:
Expand Down
31 changes: 4 additions & 27 deletions aviary/modules/assembly/assembly.smk
Original file line number Diff line number Diff line change
Expand Up @@ -367,31 +367,8 @@ rule spades_assembly:
"envs/spades.yaml"
benchmark:
"benchmarks/spades_assembly.benchmark.txt"
shell:
"""
rm -rf data/spades_assembly/tmp;
minimumsize=500000;
actualsize=$(stat -c%s data/short_reads.filt.fastq.gz);
if [ -d "data/spades_assembly/" ]
then
spades.py --restart-from last --memory {params.max_memory} -t {threads} -o data/spades_assembly -k {params.kmer_sizes} --tmp-dir {params.tmpdir} > {log} 2>&1 && \
cp data/spades_assembly/scaffolds.fasta data/spades_assembly.fasta
elif [ $actualsize -ge $minimumsize ]
then
if [ {params.long_read_type} = "ont" ] || [ {params.long_read_type} = "ont_hq" ]
then
spades.py --checkpoints all --memory {params.max_memory} --meta --nanopore {input.long_reads} --12 {input.fastq} \
-o data/spades_assembly -t {threads} -k {params.kmer_sizes} --tmp-dir {params.tmpdir} >> {log} 2>&1 && \
cp data/spades_assembly/scaffolds.fasta data/spades_assembly.fasta
else
spades.py --checkpoints all --memory {params.max_memory} --meta --pacbio {input.long_reads} --12 {input.fastq} \
-o data/spades_assembly -t {threads} -k {params.kmer_sizes} --tmp-dir {params.tmpdir} >> {log} 2>&1 && \
cp data/spades_assembly/scaffolds.fasta data/spades_assembly.fasta
fi
else
mkdir -p {output.spades_folder} && touch {output.fasta}
fi
"""
script:
"scripts/spades_assembly.py"


# Perform short read assembly only with no other steps
Expand Down Expand Up @@ -458,14 +435,14 @@ rule spades_assembly_coverage:
log:
"logs/spades_assembly_coverage.log"
params:
tmpdir = config["tmpdir"]
tmpdir = f"TMPDIR={config['tmpdir']}" if config["tmpdir"] else ""
conda:
"../../envs/coverm.yaml"
benchmark:
"benchmarks/spades_assembly_coverage.benchmark.txt"
shell:
"""
TMPDIR={params.tmpdir} coverm contig -m metabat -t {threads} -r {input.fasta} --interleaved {input.fastq} --bam-file-cache-directory data/cached_bams/ > {output.assembly_cov} 2> {log};
{params.tmpdir} coverm contig -m metabat -t {threads} -r {input.fasta} --interleaved {input.fastq} --bam-file-cache-directory data/cached_bams/ > {output.assembly_cov} 2> {log};
mv data/cached_bams/*.bam {output.bam} && samtools index -@ {threads} {output.bam} 2>> {log}
"""

Expand Down
13 changes: 11 additions & 2 deletions aviary/modules/assembly/scripts/assemble_short_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,15 @@ def assemble_short_reads(
:return:
'''

if not tmp_dir:
try:
tmp_dir = os.environ["TMPDIR"]
except KeyError:
tmp_dir_arg = ""

if tmp_dir:
tmp_dir_arg = f"--tmp-dir {tmp_dir}"

# deal with read sets i.e. are we coassembling? Which assembler are we using?
# Non co-assembled reads are handled the same for each assembler
if read_set1 != 'none':
Expand Down Expand Up @@ -86,7 +95,7 @@ def assemble_short_reads(
# Run chosen assembler
if use_megahit:
max_memory_in_bytes = max_memory * 1024*1024*1024
command = f"megahit {read_string} -t {threads} -m {max_memory_in_bytes} -o data/megahit_assembly --tmp-dir {tmp_dir}"
command = f"megahit {read_string} -t {threads} -m {max_memory_in_bytes} -o data/megahit_assembly {tmp_dir_arg}"

with open(log, 'a') as logf:
logf.write(f"Queueing command {command}\n")
Expand All @@ -97,7 +106,7 @@ def assemble_short_reads(
else:
kmers = " ".join(kmer_sizes)
command = f"spades.py --memory {max_memory} --meta -t {threads} " \
f"-o data/short_read_assembly {read_string} -k {kmers} --tmp-dir {tmp_dir}"
f"-o data/short_read_assembly {read_string} -k {kmers} {tmp_dir_arg}"
with open(log, 'a') as logf:
logf.write(f"Queueing command {command}\n")
subprocess.run(command.split(), stdout=logf, stderr=subprocess.STDOUT)
Expand Down
89 changes: 89 additions & 0 deletions aviary/modules/assembly/scripts/spades_assembly.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import subprocess
import os
import sys
from typing import List

def spades_asssembly(
input_fastq: str,
input_long_reads: str,
output_fasta: str,
output_spades_folder: str,
max_memory: int,
threads: int,
kmer_sizes: List[int],
tmp_dir: str,
long_read_type: str,
log: str):

'''
Assemble short reads and long reads (if any) using spades
:param input_fastq: short reads fastq file
:param input_long_reads: long reads fastq file
:param output_fasta: output fasta file
:param output_spades_folder: output spades folder
:param max_memory: maximum memory to use
:param threads: number of threads
:param tmpdir: temporary directory
:param kmer_sizes: list of kmer sizes
:param long_read_type: type of long reads
:param log: log file
:return:
'''
if tmp_dir:
tmp_dir_arg = f"--tmp-dir {tmp_dir}"
else:
try:
tmp_dir = os.environ["TMPDIR"]
tmp_dir_arg = f"--tmp-dir {tmp_dir}"
except KeyError:
tmp_dir_arg = ""

if os.path.exists("data/spades_assembly/tmp"):
with open(log, 'a') as logf:
subprocess.run("rm -rf data/spades_assembly/tmp".split(), stdout=logf, stderr=subprocess.STDOUT)
# remove existing temporary directory
minimumsize=500000
actualsize = int(subprocess.check_output('stat -c%s data/short_reads.filt.fastq.gz', shell=True))
# check if directory exists
if os.path.exists("data/spades_assembly"):
# resume previous assembly
command = f"spades.py --restart-from last --memory {max_memory} -t {threads} " \
f"-o data/spades_assembly -k {kmer_sizes} {tmp_dir_arg}"
# run cmd
with open(log, 'a') as logf:
logf.write(f"Queueing command {command}\n")
subprocess.run(command.split(), stdout=logf, stderr=subprocess.STDOUT)
subprocess.run("cp data/spades_assembly/scaffolds.fasta data/spades_assembly.fasta".split(), stdout=logf, stderr=subprocess.STDOUT)
elif actualsize >= minimumsize:
if long_read_type in ["ont","ont_hq"]:
command = f"spades.py --checkpoints all --memory {max_memory} --meta --nanopore {input_long_reads} --12 {input_fastq} "\
f"-o data/spades_assembly -t {threads} -k {kmer_sizes} {tmp_dir_arg} "
else:
command = f"spades.py --checkpoints all --memory {max_memory} --meta --pacbio {input_long_reads} --12 {input_fastq} "\
f"-o data/spades_assembly -t {threads} -k {kmer_sizes} {tmp_dir_arg} "
# run cmd
with open(log, 'a') as logf:
logf.write(f"Queueing command {command}\n")
subprocess.run(command.split(), stdout=logf, stderr=subprocess.STDOUT)
subprocess.run("cp data/spades_assembly/scaffolds.fasta data/spades_assembly.fasta".split(), stdout=logf, stderr=subprocess.STDOUT)
else:
with open(log, 'a') as logf:
subprocess.run(f"mkdir -p {output.spades_folder} && touch {output.fasta}".split(), stdout=logf, stderr=subprocess.STDOUT)


if __name__ == '__main__':
log = snakemake.log[0]
with open(log, 'w') as logf: pass

spades_asssembly(
snakemake.input.fastq,
snakemake.input.long_reads,
snakemake.output.fasta,
snakemake.output.spades_folder,
snakemake.params.max_memory,
snakemake.threads,
snakemake.params.kmer_sizes,
snakemake.params.tmpdir,
snakemake.params.long_read_type,
log
)
3 changes: 2 additions & 1 deletion aviary/modules/binning/scripts/get_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ def get_coverage(
threads: int,
log: str,
):
os.environ["TMPDIR"] = tmpdir
if tmpdir: os.environ["TMPDIR"] = tmpdir

if long_reads != "none" and not os.path.exists("data/long_cov.tsv"):
if long_read_type in ["ont", "ont_hq"]:
coverm_cmd = f"coverm contig -t {threads} -r {input_fasta} --single {' '.join(long_reads)} -p minimap2-ont -m length trimmed_mean variance --bam-file-cache-directory data/binning_bams/ --discard-unmapped --min-read-percent-identity 0.85 --output-file data/long_cov.tsv".split()
Expand Down
Loading

0 comments on commit 387ac6d

Please sign in to comment.