Skip to content

Commit

Permalink
Merge branch 'main' into feature-user-hmms
Browse files Browse the repository at this point in the history
  • Loading branch information
oschwengers authored Sep 25, 2024
2 parents 0eb2bbe + a7b3cfd commit 6bcbcfd
Show file tree
Hide file tree
Showing 5 changed files with 85 additions and 27 deletions.
14 changes: 9 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -363,10 +363,11 @@ Exemplary annotation result files for several genomes (mostly ESKAPE species) ar
## Usage

```bash
usage: bakta [--db DB] [--min-contig-length MIN_CONTIG_LENGTH] [--prefix PREFIX] [--output OUTPUT]
usage: bakta [--db DB] [--min-contig-length MIN_CONTIG_LENGTH] [--prefix PREFIX] [--output OUTPUT] [--force]
[--genus GENUS] [--species SPECIES] [--strain STRAIN] [--plasmid PLASMID]
[--complete] [--prodigal-tf PRODIGAL_TF] [--translation-table {11,4}] [--gram {+,-,?}] [--locus LOCUS]
[--locus-tag LOCUS_TAG] [--keep-contig-headers] [--replicons REPLICONS] [--compliant] [--replicons REPLICONS] [--regions REGIONS] [--proteins PROTEINS] [--meta]
[--complete] [--prodigal-tf PRODIGAL_TF] [--translation-table {11,4,25}] [--gram {+,-,?}]
[--locus LOCUS] [--locus-tag LOCUS_TAG] [--locus-tag-increment {1,5,10}] [--keep-contig-headers] [--compliant]
[--replicons REPLICONS] [--regions REGIONS] [--proteins PROTEINS] [--meta]
[--skip-trna] [--skip-tmrna] [--skip-rrna] [--skip-ncrna] [--skip-ncrna-region]
[--skip-crispr] [--skip-cds] [--skip-pseudo] [--skip-sorf] [--skip-gap] [--skip-ori] [--skip-plot]
[--help] [--verbose] [--debug] [--threads THREADS] [--tmp-dir TMP_DIR] [--version]
Expand Down Expand Up @@ -397,12 +398,15 @@ Annotation:
--complete All sequences are complete replicons (chromosome/plasmid[s])
--prodigal-tf PRODIGAL_TF
Path to existing Prodigal training file to use for CDS prediction
--translation-table {11,4}
Translation table: 11/4 (default = 11)
--translation-table {11,4,25}
Translation table: 11/4/25 (default = 11)
--gram {+,-,?} Gram type for signal peptide predictions: +/-/? (default = ?)
--locus LOCUS Locus prefix (default = 'contig')
--locus-tag LOCUS_TAG
Locus tag prefix (default = autogenerated)
--locus-tag-increment {1,5,10}
Locus tag increment: 1/5/10 (default = 1)

--keep-contig-headers
Keep original contig headers
--compliant Force Genbank/ENA/DDBJ compliance
Expand Down
5 changes: 4 additions & 1 deletion bakta/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
keep_contig_headers = None
locus = None
locus_tag = None
locus_tag_increment = None
gram = None
replicons = None
compliant = None
Expand Down Expand Up @@ -162,7 +163,7 @@ def setup(args):
taxon = None

# annotation configurations
global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, gram, replicons, compliant, user_proteins, user_hmms, meta, regions
global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, locus_tag_increment, gram, replicons, compliant, user_proteins, user_hmms, meta, regions
complete = args.complete
log.info('complete=%s', complete)
prodigal_tf = args.prodigal_tf
Expand Down Expand Up @@ -218,6 +219,8 @@ def setup(args):
log.error("Invalid 'locus-tag' parameter! locus-tag=%s", locus_tag)
sys.exit(f"ERROR: invalid 'locus-tag' parameter ({locus_tag})!\nLocus tag prefixes must contain between 1 and 24 alphanumeric characters or '_.-' signs.")
log.info('locus-tag=%s', locus_tag)
locus_tag_increment = args.locus_tag_increment
log.info('locus-tag-increment=%s', locus_tag_increment)
keep_contig_headers = args.keep_contig_headers
log.info('keep_contig_headers=%s', keep_contig_headers)
replicons = args.replicons
Expand Down
48 changes: 28 additions & 20 deletions bakta/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,9 @@ def main():
print('Options and arguments:')
print(f'\tinput: {cfg.genome_path}')
print(f"\tdb: {cfg.db_path}, version {cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}")
if(cfg.prodigal_tf): print(f'\tprodigal training file: {cfg.prodigal_tf}')
if(cfg.replicons): print(f'\treplicon table: {cfg.replicons}')
if(cfg.prodigal_tf): print(f'\tprodigal training file: {cfg.prodigal_tf}')
if(cfg.regions): print(f'\tregion table: {cfg.regions}')
if(cfg.user_proteins): print(f'\tuser proteins: {cfg.user_proteins}')
if(cfg.user_hmms): print(f'\tuser hmms: {cfg.user_hmms}')
print(f'\toutput: {cfg.output_path}')
Expand Down Expand Up @@ -249,11 +250,15 @@ def main():
cdss.extend(imported_cdss)

if(len(cdss) > 0):
log.debug('lookup CDS UPS/IPS')
cdss_ups, cdss_not_found = ups.lookup(cdss)
cdss_ips, cdss_not_found_tmp = ips.lookup(cdss_ups)
cdss_not_found.extend(cdss_not_found_tmp)
print(f'\tdetected IPSs: {len(cdss_ips)}')
if(cfg.db_info['type'] == 'full'):
log.debug('lookup CDS UPS/IPS')
cdss_ups, cdss_not_found_ups = ups.lookup(cdss)
cdss_ips, cdss_not_found_ips = ips.lookup(cdss_ups)
cdss_not_found = cdss_not_found_ups + cdss_not_found_ips
print(f'\tdetected IPSs: {len(cdss_ips)}')
else:
cdss_not_found = [*cdss]
print(f'\tskip UPS/IPS detection with light db version')

if(len(cdss_not_found) > 0):
if(cfg.db_info['type'] == 'full'):
Expand Down Expand Up @@ -303,17 +308,20 @@ def main():
anno.combine_annotation(cds) # combine IPS & PSC annotations and mark hypotheticals

hypotheticals = [cds for cds in cdss if 'hypothetical' in cds and 'edge' not in cds and cds.get('start_type', 'Edge') != 'Edge']
if(len(hypotheticals) > 0 and not cfg.skip_pseudo and cfg.db_info['type'] == 'full'):
print('\tdetect pseudogenes...')
log.debug('search pseudogene candidates')
pseudo_candidates = feat_cds.predict_pseudo_candidates(hypotheticals)
print(f'\t\tpseudogene candidates: {len(pseudo_candidates)}')
pseudogenes = feat_cds.detect_pseudogenes(pseudo_candidates, cdss, genome) if len(pseudo_candidates) > 0 else []
psc.lookup(pseudogenes, pseudo=True)
pscc.lookup(pseudogenes, pseudo=True)
for pseudogene in pseudogenes:
anno.combine_annotation(pseudogene)
print(f'\t\tfound pseudogenes: {len(pseudogenes)}')
if(len(hypotheticals) > 0 and not cfg.skip_pseudo):
if(cfg.db_info['type'] == 'full'):
print('\tdetect pseudogenes...')
log.debug('search pseudogene candidates')
pseudo_candidates = feat_cds.predict_pseudo_candidates(hypotheticals)
print(f'\t\tpseudogene candidates: {len(pseudo_candidates)}')
pseudogenes = feat_cds.detect_pseudogenes(pseudo_candidates, cdss, genome) if len(pseudo_candidates) > 0 else []
psc.lookup(pseudogenes, pseudo=True)
pscc.lookup(pseudogenes, pseudo=True)
for pseudogene in pseudogenes:
anno.combine_annotation(pseudogene)
print(f'\t\tfound pseudogenes: {len(pseudogenes)}')
else:
print(f'\tskip pseudogene detection with light db version')
hypotheticals = [cds for cds in cdss if 'hypothetical' in cds]
if(len(hypotheticals) > 0):
log.debug('analyze hypotheticals')
Expand Down Expand Up @@ -475,15 +483,15 @@ def main():
log.info('selected features=%i', len(features))
print(f'selected: {len(features)}')

locus_tag_nr = 5
# use user provided locus tag if not None/non-empty or generate a sequence based locus prefix
locus_tag_prefix = cfg.locus_tag if cfg.locus_tag else bu.create_locus_tag_prefix(contigs)
log.info('locus tag prefix=%s', locus_tag_prefix)
locus_tag_nr = cfg.locus_tag_increment
for feature in features:
locus_tag = f'{locus_tag_prefix}_{locus_tag_nr:05}'
locus_tag = f'{locus_tag_prefix}_{locus_tag_nr:0{len(str(cfg.locus_tag_increment*len(features)))+1}}'
if(feature['type'] in [bc.FEATURE_T_RNA, bc.FEATURE_TM_RNA, bc.FEATURE_R_RNA, bc.FEATURE_NC_RNA, bc.FEATURE_CDS, bc.FEATURE_SORF]):
feature['locus'] = locus_tag
locus_tag_nr += 5
locus_tag_nr += cfg.locus_tag_increment

############################################################################
# Improve annotations
Expand Down
3 changes: 2 additions & 1 deletion bakta/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,11 @@ def parse_arguments():
arg_group_annotation = parser.add_argument_group('Annotation')
arg_group_annotation.add_argument('--complete', action='store_true', help='All sequences are complete replicons (chromosome/plasmid[s])')
arg_group_annotation.add_argument('--prodigal-tf', action='store', default=None, dest='prodigal_tf', help='Path to existing Prodigal training file to use for CDS prediction')
arg_group_annotation.add_argument('--translation-table', action='store', type=int, default=11, choices=[11, 4], dest='translation_table', help='Translation table: 11/4 (default = 11)')
arg_group_annotation.add_argument('--translation-table', action='store', type=int, default=11, choices=[11, 4, 25], dest='translation_table', help='Translation table: 11/4/25 (default = 11)')
arg_group_annotation.add_argument('--gram', action='store', default=bc.GRAM_UNKNOWN, choices=[bc.GRAM_POSITIVE, bc.GRAM_NEGATIVE, bc.GRAM_UNKNOWN], help=f'Gram type for signal peptide predictions: {bc.GRAM_POSITIVE}/{bc.GRAM_NEGATIVE}/{bc.GRAM_UNKNOWN} (default = {bc.GRAM_UNKNOWN})')
arg_group_annotation.add_argument('--locus', action='store', default=None, help="Locus prefix (default = 'contig')")
arg_group_annotation.add_argument('--locus-tag', action='store', default=None, dest='locus_tag', help='Locus tag prefix (default = autogenerated)')
arg_group_annotation.add_argument('--locus-tag-increment', action='store', type=int, default=1, choices=[1, 5, 10], dest='locus_tag_increment', help='Locus tag increment: 1/5/10 (default = 1)')
arg_group_annotation.add_argument('--keep-contig-headers', action='store_true', dest='keep_contig_headers', help='Keep original contig headers')
arg_group_annotation.add_argument('--compliant', action='store_true', help='Force Genbank/ENA/DDJB compliance')
arg_group_annotation.add_argument('--replicons', '-r', action='store', default=None, dest='replicons', help='Replicon information table (tsv/csv)')
Expand Down
42 changes: 42 additions & 0 deletions test/test_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,48 @@ def test_locustag_compliant_ok(parameters, tmpdir):
assert proc.returncode == 0


@pytest.mark.parametrize(
    'parameters',
    [
        ['--locus-tag-increment'],  # flag given without a value
        ['--locus-tag-increment', ''],  # empty
        ['--locus-tag-increment', ' '],  # whitespace only
        ['--locus-tag-increment', 'A'],  # wrong characters
        ['--locus-tag-increment', 'a'],  # wrong characters
        ['--locus-tag-increment', '0'],  # wrong number
        ['--locus-tag-increment', '11'],  # wrong number
    ]
)
def test_locustag_increment_failiing(parameters, tmpdir):
    # Every invalid locus-tag increment argument must make bakta exit with a non-zero code.
    command = [
        'bin/bakta',
        '--db', 'test/db',
        '--output', tmpdir,
        '--force',
        '--skip-plot',
        *parameters,  # locus-tag increment arguments under test
        *SKIP_PARAMETERS,
        'test/data/NC_002127.1.fna',
    ]
    proc = run(command)
    assert proc.returncode != 0


@pytest.mark.parametrize(
    'parameters',
    [
        ['--locus-tag-increment', '1'],
        ['--locus-tag-increment', '5'],
        ['--locus-tag-increment', '10'],
    ]
)
def test_locustag_increment_ok(parameters, tmpdir):
    # Every valid locus-tag increment argument must let bakta finish with exit code 0.
    command = [
        'bin/bakta',
        '--db', 'test/db',
        '--output', tmpdir,
        '--force',
        '--skip-plot',
        *parameters,  # locus-tag increment arguments under test
        *SKIP_PARAMETERS,
        'test/data/NC_002127.1.fna',
    ]
    proc = run(command)
    assert proc.returncode == 0


@pytest.mark.parametrize(
'parameters',
[
Expand Down

0 comments on commit 6bcbcfd

Please sign in to comment.