From 3a3b6600b0cbcfd4bc60851caea940f379b29e10 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Tue, 24 Sep 2024 17:18:11 +0200 Subject: [PATCH 1/7] add regions parameter to stdout --- bakta/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bakta/main.py b/bakta/main.py index 3cfb3a4d..0336ea5f 100755 --- a/bakta/main.py +++ b/bakta/main.py @@ -61,9 +61,10 @@ def main(): print('Options and arguments:') print(f'\tinput: {cfg.genome_path}') print(f"\tdb: {cfg.db_path}, version {cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}") - if(cfg.user_proteins): print(f'\tuser proteins: {cfg.user_proteins}') if(cfg.replicons): print(f'\treplicon table: {cfg.replicons}') + if(cfg.regions): print(f'\tregion table: {cfg.regions}') if(cfg.prodigal_tf): print(f'\tprodigal training file: {cfg.prodigal_tf}') + if(cfg.user_proteins): print(f'\tuser proteins: {cfg.user_proteins}') print(f'\toutput: {cfg.output_path}') if(cfg.force): print(f'\tforce: {cfg.force}') print(f'\ttmp directory: {cfg.tmp_path}') From 36445261c4659f2ffa251ffe7b0d6f09edf423f4 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Tue, 24 Sep 2024 17:24:52 +0200 Subject: [PATCH 2/7] polish ups/ips not found --- bakta/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bakta/main.py b/bakta/main.py index 0336ea5f..8c3859a4 100755 --- a/bakta/main.py +++ b/bakta/main.py @@ -249,9 +249,9 @@ def main(): if(len(cdss) > 0): log.debug('lookup CDS UPS/IPS') - cdss_ups, cdss_not_found = ups.lookup(cdss) - cdss_ips, sorf_pscs = ips.lookup(cdss_ups) - cdss_not_found.extend(sorf_pscs) + cdss_ups, cdss_not_found_ups = ups.lookup(cdss) + cdss_ips, cdss_not_found_ips = ips.lookup(cdss_ups) + cdss_not_found = cdss_not_found_ups + cdss_not_found_ips print(f'\tdetected IPSs: {len(cdss_ips)}') if(len(cdss_not_found) > 0): From 2932ebd6086f5cc0e22d9ff8fbe20c8734fbcde1 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Tue, 24 Sep 2024 17:47:11 +0200 Subject: [PATCH 3/7] skip UPS/IPS detection with light db version --- bakta/main.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/bakta/main.py b/bakta/main.py index 8c3859a4..51dde239 100755 --- a/bakta/main.py +++ b/bakta/main.py @@ -248,11 +248,15 @@ def main(): cdss.extend(imported_cdss) if(len(cdss) > 0): - log.debug('lookup CDS UPS/IPS') - cdss_ups, cdss_not_found_ups = ups.lookup(cdss) - cdss_ips, cdss_not_found_ips = ips.lookup(cdss_ups) - cdss_not_found = cdss_not_found_ups + cdss_not_found_ips - print(f'\tdetected IPSs: {len(cdss_ips)}') + if(cfg.db_info['type'] == 'full'): + log.debug('lookup CDS UPS/IPS') + cdss_ups, cdss_not_found_ups = ups.lookup(cdss) + cdss_ips, cdss_not_found_ips = ips.lookup(cdss_ups) + cdss_not_found = cdss_not_found_ups + cdss_not_found_ips + print(f'\tdetected IPSs: {len(cdss_ips)}') + else: + cdss_not_found = [*cdss] + print(f'\tskip UPS/IPS detection with light db version') if(len(cdss_not_found) > 0): if(cfg.db_info['type'] == 'full'): From c8a93e11cae31db58eab3711ca344e3ceaf35eba Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Tue, 24 Sep 2024 17:49:28 +0200 Subject: [PATCH 4/7] print stdout note on skipped pseudogene detection b/c light db #320 --- bakta/main.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/bakta/main.py b/bakta/main.py index 51dde239..4da9e96b 100755 --- a/bakta/main.py +++ b/bakta/main.py @@ -301,17 +301,20 @@ def main(): anno.combine_annotation(cds) # combine IPS & PSC annotations and mark hypotheticals hypotheticals = [cds for cds in cdss if 'hypothetical' in cds and 'edge' not in cds and cds.get('start_type', 'Edge') != 'Edge'] - if(len(hypotheticals) > 0 and not cfg.skip_pseudo and cfg.db_info['type'] == 'full'): - print('\tdetect pseudogenes...') - log.debug('search pseudogene candidates') - pseudo_candidates = feat_cds.predict_pseudo_candidates(hypotheticals) - print(f'\t\tpseudogene candidates: {len(pseudo_candidates)}') - pseudogenes = feat_cds.detect_pseudogenes(pseudo_candidates, cdss, genome) if len(pseudo_candidates) > 0 else [] - psc.lookup(pseudogenes, pseudo=True) - pscc.lookup(pseudogenes, pseudo=True) - for pseudogene in pseudogenes: - anno.combine_annotation(pseudogene) - print(f'\t\tfound pseudogenes: {len(pseudogenes)}') + if(len(hypotheticals) > 0 and not cfg.skip_pseudo): + if(cfg.db_info['type'] == 'full'): + print('\tdetect pseudogenes...') + log.debug('search pseudogene candidates') + pseudo_candidates = feat_cds.predict_pseudo_candidates(hypotheticals) + print(f'\t\tpseudogene candidates: {len(pseudo_candidates)}') + pseudogenes = feat_cds.detect_pseudogenes(pseudo_candidates, cdss, genome) if len(pseudo_candidates) > 0 else [] + psc.lookup(pseudogenes, pseudo=True) + pscc.lookup(pseudogenes, pseudo=True) + for pseudogene in pseudogenes: + anno.combine_annotation(pseudogene) + print(f'\t\tfound pseudogenes: {len(pseudogenes)}') + else: + print(f'\tskip pseudogene detection with light db version') hypotheticals = [cds for cds in cdss if 'hypothetical' in cds] if(len(hypotheticals) > 0): log.debug('analyze hypotheticals') From b8c0410b88ca152469b75a9e37ca984d59b59a63 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Tue, 24 Sep 2024 18:15:50 +0200 Subject: [PATCH 5/7] add support for translation table 25 #323 --- README.md | 6 +++--- bakta/utils.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ea38870d..e5762b33 100644 --- a/README.md +++ b/README.md @@ -344,7 +344,7 @@ Exemplary annotation result files for several genomes (mostly ESKAPE species) ar ```bash usage: bakta [--db DB] [--min-contig-length MIN_CONTIG_LENGTH] [--prefix PREFIX] [--output OUTPUT] [--genus GENUS] [--species SPECIES] [--strain STRAIN] [--plasmid PLASMID] - [--complete] [--prodigal-tf PRODIGAL_TF] [--translation-table {11,4}] [--gram {+,-,?}] [--locus LOCUS] + [--complete] [--prodigal-tf PRODIGAL_TF] [--translation-table {11,4,25}] [--gram {+,-,?}] [--locus LOCUS] [--locus-tag LOCUS_TAG] [--keep-contig-headers] [--replicons REPLICONS] [--compliant] [--replicons REPLICONS] [--regions REGIONS] [--proteins PROTEINS] [--meta] [--skip-trna] [--skip-tmrna] [--skip-rrna] [--skip-ncrna] [--skip-ncrna-region] [--skip-crispr] [--skip-cds] [--skip-pseudo] [--skip-sorf] [--skip-gap] [--skip-ori] [--skip-plot] @@ -376,8 +376,8 @@ Annotation: --complete All sequences are complete replicons (chromosome/plasmid[s]) --prodigal-tf PRODIGAL_TF Path to existing Prodigal training file to use for CDS prediction - --translation-table {11,4} - Translation table: 11/4 (default = 11) + --translation-table {11,4,25} + Translation table: 11/4/25 (default = 11) --gram {+,-,?} Gram type for signal peptide predictions: +/-/? (default = ?) --locus LOCUS Locus prefix (default = 'contig') --locus-tag LOCUS_TAG diff --git a/bakta/utils.py b/bakta/utils.py index 7be14693..8760d0f3 100644 --- a/bakta/utils.py +++ b/bakta/utils.py @@ -82,7 +82,7 @@ def parse_arguments(): arg_group_annotation = parser.add_argument_group('Annotation') arg_group_annotation.add_argument('--complete', action='store_true', help='All sequences are complete replicons (chromosome/plasmid[s])') arg_group_annotation.add_argument('--prodigal-tf', action='store', default=None, dest='prodigal_tf', help='Path to existing Prodigal training file to use for CDS prediction') - arg_group_annotation.add_argument('--translation-table', action='store', type=int, default=11, choices=[11, 4], dest='translation_table', help='Translation table: 11/4 (default = 11)') + arg_group_annotation.add_argument('--translation-table', action='store', type=int, default=11, choices=[11, 4, 25], dest='translation_table', help='Translation table: 11/4/25 (default = 11)') arg_group_annotation.add_argument('--gram', action='store', default=bc.GRAM_UNKNOWN, choices=[bc.GRAM_POSITIVE, bc.GRAM_NEGATIVE, bc.GRAM_UNKNOWN], help=f'Gram type for signal peptide predictions: {bc.GRAM_POSITIVE}/{bc.GRAM_NEGATIVE}/{bc.GRAM_UNKNOWN} (default = {bc.GRAM_UNKNOWN})') arg_group_annotation.add_argument('--locus', action='store', default=None, help="Locus prefix (default = 'contig')") arg_group_annotation.add_argument('--locus-tag', action='store', default=None, dest='locus_tag', help='Locus tag prefix (default = autogenerated)') From 650eedc17e4814c15dad604487e8c88aab72fad4 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Wed, 25 Sep 2024 11:07:21 +0200 Subject: [PATCH 6/7] add locus tag increment parameter #279 --- README.md | 10 +++++++--- bakta/config.py | 5 ++++- bakta/main.py | 6 +++--- bakta/utils.py | 1 + 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index e5762b33..0f4bc82e 100644 --- a/README.md +++ b/README.md @@ -342,10 +342,11 @@ Exemplary annotation result files for several genomes (mostly ESKAPE species) ar ## Usage ```bash -usage: bakta [--db DB] [--min-contig-length MIN_CONTIG_LENGTH] [--prefix PREFIX] [--output OUTPUT] +usage: bakta [--db DB] [--min-contig-length MIN_CONTIG_LENGTH] [--prefix PREFIX] [--output OUTPUT] [--force] [--genus GENUS] [--species SPECIES] [--strain STRAIN] [--plasmid PLASMID] - [--complete] [--prodigal-tf PRODIGAL_TF] [--translation-table {11,4,25}] [--gram {+,-,?}] [--locus LOCUS] - [--locus-tag LOCUS_TAG] [--keep-contig-headers] [--replicons REPLICONS] [--compliant] [--replicons REPLICONS] [--regions REGIONS] [--proteins PROTEINS] [--meta] + [--complete] [--prodigal-tf PRODIGAL_TF] [--translation-table {11,4,25}] [--gram {+,-,?}] + [--locus LOCUS] [--locus-tag LOCUS_TAG] [--locus-tag-increment {1,5,10}] [--keep-contig-headers] [--compliant] + [--replicons REPLICONS] [--regions REGIONS] [--proteins PROTEINS] [--meta] [--skip-trna] [--skip-tmrna] [--skip-rrna] [--skip-ncrna] [--skip-ncrna-region] [--skip-crispr] [--skip-cds] [--skip-pseudo] [--skip-sorf] [--skip-gap] [--skip-ori] [--skip-plot] [--help] [--verbose] [--debug] [--threads THREADS] [--tmp-dir TMP_DIR] [--version] @@ -382,6 +383,9 @@ Annotation: --locus LOCUS Locus prefix (default = 'contig') --locus-tag LOCUS_TAG Locus tag prefix (default = autogenerated) + --locus-tag-increment {1,5,10} + Locus tag increment: 1/5/10 (default = 1) + --keep-contig-headers Keep original contig headers --compliant Force Genbank/ENA/DDJB compliance diff --git a/bakta/config.py b/bakta/config.py index 0bf01990..6cb419e0 100644 --- a/bakta/config.py +++ b/bakta/config.py @@ -50,6 +50,7 @@ keep_contig_headers = None locus = None locus_tag = None +locus_tag_increment = None gram = None replicons = None compliant = None @@ -161,7 +162,7 @@ def setup(args): taxon = None # annotation configurations - global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, gram, replicons, compliant, user_proteins, meta, regions + global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, locus_tag_increment, gram, replicons, compliant, user_proteins, meta, regions complete = args.complete log.info('complete=%s', complete) prodigal_tf = args.prodigal_tf @@ -217,6 +218,8 @@ def setup(args): log.error("Invalid 'locus-tag' parameter! locus-tag=%s", locus_tag) sys.exit(f"ERROR: invalid 'locus-tag' parameter ({locus_tag})!\nLocus tag prefixes must contain between 1 and 24 alphanumeric characters or '_.-' signs.") log.info('locus-tag=%s', locus_tag) + locus_tag_increment = args.locus_tag_increment + log.info('locus-tag-increment=%s', locus_tag_increment) keep_contig_headers = args.keep_contig_headers log.info('keep_contig_headers=%s', keep_contig_headers) replicons = args.replicons diff --git a/bakta/main.py b/bakta/main.py index 4da9e96b..1765cb20 100755 --- a/bakta/main.py +++ b/bakta/main.py @@ -476,15 +476,15 @@ def main(): log.info('selected features=%i', len(features)) print(f'selected: {len(features)}') - locus_tag_nr = 5 # use user provided locus tag if not None/non-empty or generate a sequence based locus prefix locus_tag_prefix = cfg.locus_tag if cfg.locus_tag else bu.create_locus_tag_prefix(contigs) log.info('locus tag prefix=%s', locus_tag_prefix) + locus_tag_nr = cfg.locus_tag_increment for feature in features: - locus_tag = f'{locus_tag_prefix}_{locus_tag_nr:05}' + locus_tag = f'{locus_tag_prefix}_{locus_tag_nr:0{len(str(cfg.locus_tag_increment*len(features)))+1}}' if(feature['type'] in [bc.FEATURE_T_RNA, bc.FEATURE_TM_RNA, bc.FEATURE_R_RNA, bc.FEATURE_NC_RNA, bc.FEATURE_CDS, bc.FEATURE_SORF]): feature['locus'] = locus_tag - locus_tag_nr += 5 + locus_tag_nr += cfg.locus_tag_increment ############################################################################ # Improve annotations diff --git a/bakta/utils.py b/bakta/utils.py index 8760d0f3..46bae8fe 100644 --- a/bakta/utils.py +++ b/bakta/utils.py @@ -86,6 +86,7 @@ def parse_arguments(): arg_group_annotation.add_argument('--gram', action='store', default=bc.GRAM_UNKNOWN, choices=[bc.GRAM_POSITIVE, bc.GRAM_NEGATIVE, bc.GRAM_UNKNOWN], help=f'Gram type for signal peptide predictions: {bc.GRAM_POSITIVE}/{bc.GRAM_NEGATIVE}/{bc.GRAM_UNKNOWN} (default = {bc.GRAM_UNKNOWN})') arg_group_annotation.add_argument('--locus', action='store', default=None, help="Locus prefix (default = 'contig')") arg_group_annotation.add_argument('--locus-tag', action='store', default=None, dest='locus_tag', help='Locus tag prefix (default = autogenerated)') + arg_group_annotation.add_argument('--locus-tag-increment', action='store', type=int, default=1, choices=[1, 5, 10], dest='locus_tag_increment', help='Locus tag increment: 1/5/10 (default = 1)') arg_group_annotation.add_argument('--keep-contig-headers', action='store_true', dest='keep_contig_headers', help='Keep original contig headers') arg_group_annotation.add_argument('--compliant', action='store_true', help='Force Genbank/ENA/DDJB compliance') arg_group_annotation.add_argument('--replicons', '-r', action='store', default=None, dest='replicons', help='Replicon information table (tsv/csv)') From a7b3cfde8adfdca559b9aaf67726f253376cb3b1 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Wed, 25 Sep 2024 11:07:32 +0200 Subject: [PATCH 7/7] add locus tag increment parameter tests #279 --- test/test_args.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/test/test_args.py b/test/test_args.py index e2845f67..0aa14116 100644 --- a/test/test_args.py +++ b/test/test_args.py @@ -446,6 +446,48 @@ def test_locustag_compliant_ok(parameters, tmpdir): assert proc.returncode == 0 +@pytest.mark.parametrize( + 'parameters', + [ + (['--locus-tag-increment']), # not provided + (['--locus-tag-increment', '']), # empty + (['--locus-tag-increment', ' ']), # whitespace only + (['--locus-tag-increment', 'A']), # wrong characters + (['--locus-tag-increment', 'a']), # wrong characters + (['--locus-tag-increment', '0']), # wrong number + (['--locus-tag-increment', '11']), # wrong number + ] +) +def test_locustag_increment_failiing(parameters, tmpdir): + # test locus-tag increment arguments + proc = run( + ['bin/bakta', '--db', 'test/db', '--output', tmpdir, '--force', '--skip-plot'] + + parameters + + SKIP_PARAMETERS + + ['test/data/NC_002127.1.fna'] + ) + assert proc.returncode != 0 + + +@pytest.mark.parametrize( + 'parameters', + [ + (['--locus-tag-increment', '1']), + (['--locus-tag-increment', '5']), + (['--locus-tag-increment', '10']) + ] +) +def test_locustag_increment_ok(parameters, tmpdir): + # test locus-tag increment arguments + proc = run( + ['bin/bakta', '--db', 'test/db', '--output', tmpdir, '--force', '--skip-plot'] + + parameters + + SKIP_PARAMETERS + + ['test/data/NC_002127.1.fna'] + ) + assert proc.returncode == 0 + + @pytest.mark.parametrize( 'parameters', [