Skip to content

Commit

Permalink
add support for Pyrodigal v3 (#244)
Browse files Browse the repository at this point in the history
* add support for Pyrodigal v3
* refactor code
* actually use ProcessPoolExecutor
* bump pyrodigal dependency to v3.0.1
* remove orphan path parameter in cds predict
* discard multithreaded CDS prediction
  • Loading branch information
oschwengers authored Oct 17, 2023
1 parent ff80f62 commit 358ac16
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 30 deletions.
50 changes: 23 additions & 27 deletions bakta/features/cds.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import concurrent.futures as cf
import copy
import logging
import subprocess as sp
Expand All @@ -7,7 +6,6 @@

from collections import OrderedDict
from typing import Dict, Sequence, Set, Tuple, Union
from pathlib import Path

import pyrodigal
import pyhmmer
Expand All @@ -27,7 +25,7 @@
log = logging.getLogger('CDS')


def predict(genome: dict, sequences_path: Path):
def predict(genome: dict):
"""Predict open reading frames with Pyrodigal."""
# create Pyrodigal trainining file if not provided by the user
prodigal_tf_path = cfg.prodigal_tf
Expand All @@ -38,9 +36,9 @@ def predict(genome: dict, sequences_path: Path):
closed = not genome['complete']
if(not prodigal_metamode):
log.info('create prodigal training info object: meta=%s, closed=%s', prodigal_metamode, closed)
orffinder = pyrodigal.OrfFinder(meta=prodigal_metamode, closed=closed)
gene_finder = pyrodigal.GeneFinder(meta=prodigal_metamode, closed=closed)
seqs = [c['sequence'] for c in genome['contigs']]
trainings_info = orffinder.train(*seqs, translation_table=cfg.translation_table)
trainings_info = gene_finder.train(*seqs, translation_table=cfg.translation_table)
else:
log.info('skip creation of prodigal training info object: meta=%s, closed=%s', prodigal_metamode, closed)
else:
Expand All @@ -55,30 +53,28 @@ def predict(genome: dict, sequences_path: Path):

cdss = []
# predict genes on linear sequences
linear_sequences = {c['id'] : c for c in genome['contigs'] if c['topology'] == bc.TOPOLOGY_LINEAR}
if(len(linear_sequences) > 0):
orffinder = pyrodigal.OrfFinder(trainings_info, meta=prodigal_metamode, closed=True, mask=True)
future_per_contig = {}
with cf.ProcessPoolExecutor(max_workers=cfg.threads) as tpe:
for id, sequence in linear_sequences.items():
future_per_contig[sequence['id']] = tpe.submit(orffinder.find_genes, sequence['sequence'])
for contig_id, future in future_per_contig.items():
sequence = linear_sequences[contig_id]
cdss_per_sequence = create_cdss(future.result(), sequence)
cdss.extend(cdss_per_sequence)
linear_contigs = [c for c in genome['contigs'] if c['topology'] == bc.TOPOLOGY_LINEAR]
if(len(linear_contigs) > 0):
if prodigal_metamode:
gene_finder = pyrodigal.GeneFinder(meta=True, metagenomic_bins=None, closed=True, mask=True)
else:
gene_finder = pyrodigal.GeneFinder(trainings_info, meta=False, closed=True, mask=True)
sequences = [contig['sequence'] for contig in linear_contigs]
for contig, genes in zip(linear_contigs, map(gene_finder.find_genes, sequences)):
cdss_per_sequence = create_cdss(genes, contig)
cdss.extend(cdss_per_sequence)

# predict genes on circular replicons (chromosomes/plasmids)
circular_sequences = {c['id'] : c for c in genome['contigs'] if c['topology'] == bc.TOPOLOGY_CIRCULAR}
if(len(circular_sequences) > 0):
orffinder = pyrodigal.OrfFinder(trainings_info, meta=prodigal_metamode, closed=False, mask=True)
future_per_contig = {}
with cf.ProcessPoolExecutor(max_workers=cfg.threads) as tpe:
for id, sequence in circular_sequences.items():
future_per_contig[sequence['id']] = tpe.submit(orffinder.find_genes, sequence['sequence'])
for contig_id, future in future_per_contig.items():
sequence = circular_sequences[contig_id]
cdss_per_sequence = create_cdss(future.result(), sequence)
cdss.extend(cdss_per_sequence)
circular_contigs = [c for c in genome['contigs'] if c['topology'] == bc.TOPOLOGY_CIRCULAR]
if(len(circular_contigs) > 0):
if prodigal_metamode:
gene_finder = pyrodigal.GeneFinder(meta=True, metagenomic_bins=None, closed=False, mask=True)
else:
gene_finder = pyrodigal.GeneFinder(trainings_info, meta=False, closed=False, mask=True)
sequences = [contig['sequence'] for contig in circular_contigs]
for contig, genes in zip(circular_contigs, map(gene_finder.find_genes, sequences)):
cdss_per_sequence = create_cdss(genes, contig)
cdss.extend(cdss_per_sequence)

log.info('predicted=%i', len(cdss))
return cdss
Expand Down
2 changes: 1 addition & 1 deletion bakta/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ def main():
else:
print('predict & annotate CDSs...')
log.debug('predict CDS')
cdss = feat_cds.predict(genome, contigs_path)
cdss = feat_cds.predict(genome)
print(f"\tpredicted: {len(cdss)} ")

if(len(cdss) > 0):
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ dependencies:
- requests>=2.25.1
- alive-progress>=3.0.1
- pyyaml>=6.0
- pyrodigal>=2.1.0,<3.0
- pyrodigal>=3.0.1
- trnascan-se>=2.0.11
- aragorn>=1.2.41
- infernal>=1.1.4
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
'requests >= 2.25.1',
'alive-progress >= 3.0.1',
'PyYAML >= 6.0',
'pyrodigal >= 2.1.0, <3.0',
'pyrodigal >= 3.0.1',
'pyhmmer >= 0.10.0'
],
entry_points={
Expand Down

0 comments on commit 358ac16

Please sign in to comment.