From 3a3b6600b0cbcfd4bc60851caea940f379b29e10 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Tue, 24 Sep 2024 17:18:11 +0200
Subject: [PATCH 1/7] add regions parameter to stdout

---
 bakta/main.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bakta/main.py b/bakta/main.py
index 3cfb3a4d..0336ea5f 100755
--- a/bakta/main.py
+++ b/bakta/main.py
@@ -61,9 +61,10 @@ def main():
         print('Options and arguments:')
         print(f'\tinput: {cfg.genome_path}')
         print(f"\tdb: {cfg.db_path}, version {cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}")
-        if(cfg.user_proteins): print(f'\tuser proteins: {cfg.user_proteins}')
         if(cfg.replicons): print(f'\treplicon table: {cfg.replicons}')
+        if(cfg.regions): print(f'\tregion table: {cfg.regions}')
         if(cfg.prodigal_tf): print(f'\tprodigal training file: {cfg.prodigal_tf}')
+        if(cfg.user_proteins): print(f'\tuser proteins: {cfg.user_proteins}')
         print(f'\toutput: {cfg.output_path}')
         if(cfg.force): print(f'\tforce: {cfg.force}')
         print(f'\ttmp directory: {cfg.tmp_path}')

From 36445261c4659f2ffa251ffe7b0d6f09edf423f4 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Tue, 24 Sep 2024 17:24:52 +0200
Subject: [PATCH 2/7] polish ups/ips not found

---
 bakta/main.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/bakta/main.py b/bakta/main.py
index 0336ea5f..8c3859a4 100755
--- a/bakta/main.py
+++ b/bakta/main.py
@@ -249,9 +249,9 @@ def main():
 
         if(len(cdss) > 0):
             log.debug('lookup CDS UPS/IPS')
-            cdss_ups, cdss_not_found = ups.lookup(cdss)
-            cdss_ips, sorf_pscs = ips.lookup(cdss_ups)
-            cdss_not_found.extend(sorf_pscs)
+            cdss_ups, cdss_not_found_ups = ups.lookup(cdss)
+            cdss_ips, cdss_not_found_ips = ips.lookup(cdss_ups)
+            cdss_not_found = cdss_not_found_ups + cdss_not_found_ips
             print(f'\tdetected IPSs: {len(cdss_ips)}')
 
             if(len(cdss_not_found) > 0):

From 2932ebd6086f5cc0e22d9ff8fbe20c8734fbcde1 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Tue, 24 Sep 2024 17:47:11 +0200
Subject: [PATCH 3/7] skip UPS/IPS detection with light db version

---
 bakta/main.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/bakta/main.py b/bakta/main.py
index 8c3859a4..51dde239 100755
--- a/bakta/main.py
+++ b/bakta/main.py
@@ -248,11 +248,15 @@ def main():
             cdss.extend(imported_cdss)
 
         if(len(cdss) > 0):
-            log.debug('lookup CDS UPS/IPS')
-            cdss_ups, cdss_not_found_ups = ups.lookup(cdss)
-            cdss_ips, cdss_not_found_ips = ips.lookup(cdss_ups)
-            cdss_not_found = cdss_not_found_ups + cdss_not_found_ips
-            print(f'\tdetected IPSs: {len(cdss_ips)}')
+            if(cfg.db_info['type'] == 'full'):  # UPS/IPS lookups are only run when the db type is 'full'
+                log.debug('lookup CDS UPS/IPS')
+                cdss_ups, cdss_not_found_ups = ups.lookup(cdss)
+                cdss_ips, cdss_not_found_ips = ips.lookup(cdss_ups)
+                cdss_not_found = cdss_not_found_ups + cdss_not_found_ips  # pool the misses of both lookups
+                print(f'\tdetected IPSs: {len(cdss_ips)}')
+            else:  # any other db type: no lookup is done at all
+                cdss_not_found = [*cdss]  # shallow copy: every CDS counts as not found
+                print('\tskip UPS/IPS detection with light db version')  # plain string, nothing to interpolate
 
             if(len(cdss_not_found) > 0):
                 if(cfg.db_info['type'] == 'full'):

From c8a93e11cae31db58eab3711ca344e3ceaf35eba Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Tue, 24 Sep 2024 17:49:28 +0200
Subject: [PATCH 4/7] print stdout note on skipped pseudogene detection b/c
 light db #320

---
 bakta/main.py | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/bakta/main.py b/bakta/main.py
index 51dde239..4da9e96b 100755
--- a/bakta/main.py
+++ b/bakta/main.py
@@ -301,17 +301,20 @@ def main():
                 anno.combine_annotation(cds)  # combine IPS & PSC annotations and mark hypotheticals
 
             hypotheticals = [cds for cds in cdss if 'hypothetical' in cds and 'edge' not in cds and cds.get('start_type', 'Edge') != 'Edge']
-            if(len(hypotheticals) > 0  and  not cfg.skip_pseudo  and  cfg.db_info['type'] == 'full'):
-                print('\tdetect pseudogenes...')
-                log.debug('search pseudogene candidates')
-                pseudo_candidates = feat_cds.predict_pseudo_candidates(hypotheticals)
-                print(f'\t\tpseudogene candidates: {len(pseudo_candidates)}')
-                pseudogenes = feat_cds.detect_pseudogenes(pseudo_candidates, cdss, genome) if len(pseudo_candidates) > 0 else []
-                psc.lookup(pseudogenes, pseudo=True)
-                pscc.lookup(pseudogenes, pseudo=True)
-                for pseudogene in pseudogenes:
-                    anno.combine_annotation(pseudogene)
-                print(f'\t\tfound pseudogenes: {len(pseudogenes)}')
+            if(len(hypotheticals) > 0  and  not cfg.skip_pseudo):  # there are candidates and the user did not opt out
+                if(cfg.db_info['type'] == 'full'):  # pseudogene detection is only run when the db type is 'full'
+                    print('\tdetect pseudogenes...')
+                    log.debug('search pseudogene candidates')
+                    pseudo_candidates = feat_cds.predict_pseudo_candidates(hypotheticals)
+                    print(f'\t\tpseudogene candidates: {len(pseudo_candidates)}')
+                    pseudogenes = feat_cds.detect_pseudogenes(pseudo_candidates, cdss, genome) if len(pseudo_candidates) > 0 else []  # no candidates -> empty list
+                    psc.lookup(pseudogenes, pseudo=True)
+                    pscc.lookup(pseudogenes, pseudo=True)
+                    for pseudogene in pseudogenes:
+                        anno.combine_annotation(pseudogene)
+                    print(f'\t\tfound pseudogenes: {len(pseudogenes)}')
+                else:  # requested but not run: say so on stdout instead of skipping silently
+                    print('\tskip pseudogene detection with light db version')  # plain string, nothing to interpolate
             hypotheticals = [cds for cds in cdss if 'hypothetical' in cds]
             if(len(hypotheticals) > 0):
                 log.debug('analyze hypotheticals')

From b8c0410b88ca152469b75a9e37ca984d59b59a63 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Tue, 24 Sep 2024 18:15:50 +0200
Subject: [PATCH 5/7] add support for translation table 25 #323

---
 README.md      | 6 +++---
 bakta/utils.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index ea38870d..e5762b33 100644
--- a/README.md
+++ b/README.md
@@ -344,7 +344,7 @@ Exemplary annotation result files for several genomes (mostly ESKAPE species) ar
 ```bash
 usage: bakta [--db DB] [--min-contig-length MIN_CONTIG_LENGTH] [--prefix PREFIX] [--output OUTPUT]
              [--genus GENUS] [--species SPECIES] [--strain STRAIN] [--plasmid PLASMID]
-             [--complete] [--prodigal-tf PRODIGAL_TF] [--translation-table {11,4}] [--gram {+,-,?}] [--locus LOCUS]
+             [--complete] [--prodigal-tf PRODIGAL_TF] [--translation-table {11,4,25}] [--gram {+,-,?}] [--locus LOCUS]
              [--locus-tag LOCUS_TAG] [--keep-contig-headers] [--replicons REPLICONS] [--compliant] [--replicons REPLICONS] [--regions REGIONS] [--proteins PROTEINS] [--meta]
              [--skip-trna] [--skip-tmrna] [--skip-rrna] [--skip-ncrna] [--skip-ncrna-region]
              [--skip-crispr] [--skip-cds] [--skip-pseudo] [--skip-sorf] [--skip-gap] [--skip-ori] [--skip-plot]
@@ -376,8 +376,8 @@ Annotation:
   --complete            All sequences are complete replicons (chromosome/plasmid[s])
   --prodigal-tf PRODIGAL_TF
                         Path to existing Prodigal training file to use for CDS prediction
-  --translation-table {11,4}
-                        Translation table: 11/4 (default = 11)
+  --translation-table {11,4,25}
+                        Translation table: 11/4/25 (default = 11)
   --gram {+,-,?}        Gram type for signal peptide predictions: +/-/? (default = ?)
   --locus LOCUS         Locus prefix (default = 'contig')
   --locus-tag LOCUS_TAG
diff --git a/bakta/utils.py b/bakta/utils.py
index 7be14693..8760d0f3 100644
--- a/bakta/utils.py
+++ b/bakta/utils.py
@@ -82,7 +82,7 @@ def parse_arguments():
     arg_group_annotation = parser.add_argument_group('Annotation')
     arg_group_annotation.add_argument('--complete', action='store_true', help='All sequences are complete replicons (chromosome/plasmid[s])')
     arg_group_annotation.add_argument('--prodigal-tf', action='store', default=None, dest='prodigal_tf', help='Path to existing Prodigal training file to use for CDS prediction')
-    arg_group_annotation.add_argument('--translation-table', action='store', type=int, default=11, choices=[11, 4], dest='translation_table', help='Translation table: 11/4 (default = 11)')
+    arg_group_annotation.add_argument('--translation-table', action='store', type=int, default=11, choices=[11, 4, 25], dest='translation_table', help='Translation table: 11/4/25 (default = 11)')
     arg_group_annotation.add_argument('--gram', action='store', default=bc.GRAM_UNKNOWN, choices=[bc.GRAM_POSITIVE, bc.GRAM_NEGATIVE, bc.GRAM_UNKNOWN], help=f'Gram type for signal peptide predictions: {bc.GRAM_POSITIVE}/{bc.GRAM_NEGATIVE}/{bc.GRAM_UNKNOWN} (default = {bc.GRAM_UNKNOWN})')
     arg_group_annotation.add_argument('--locus', action='store', default=None, help="Locus prefix (default = 'contig')")
     arg_group_annotation.add_argument('--locus-tag', action='store', default=None, dest='locus_tag', help='Locus tag prefix (default = autogenerated)')

From 650eedc17e4814c15dad604487e8c88aab72fad4 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Wed, 25 Sep 2024 11:07:21 +0200
Subject: [PATCH 6/7] add locus tag increment parameter #279

---
 README.md       | 10 +++++++---
 bakta/config.py |  5 ++++-
 bakta/main.py   |  6 +++---
 bakta/utils.py  |  1 +
 4 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index e5762b33..0f4bc82e 100644
--- a/README.md
+++ b/README.md
@@ -342,10 +342,11 @@ Exemplary annotation result files for several genomes (mostly ESKAPE species) ar
 ## Usage
 
 ```bash
-usage: bakta [--db DB] [--min-contig-length MIN_CONTIG_LENGTH] [--prefix PREFIX] [--output OUTPUT]
+usage: bakta [--db DB] [--min-contig-length MIN_CONTIG_LENGTH] [--prefix PREFIX] [--output OUTPUT] [--force]
              [--genus GENUS] [--species SPECIES] [--strain STRAIN] [--plasmid PLASMID]
-             [--complete] [--prodigal-tf PRODIGAL_TF] [--translation-table {11,4,25}] [--gram {+,-,?}] [--locus LOCUS]
-             [--locus-tag LOCUS_TAG] [--keep-contig-headers] [--replicons REPLICONS] [--compliant] [--replicons REPLICONS] [--regions REGIONS] [--proteins PROTEINS] [--meta]
+             [--complete] [--prodigal-tf PRODIGAL_TF] [--translation-table {11,4,25}] [--gram {+,-,?}]
+             [--locus LOCUS] [--locus-tag LOCUS_TAG] [--locus-tag-increment {1,5,10}] [--keep-contig-headers] [--compliant]
+             [--replicons REPLICONS] [--regions REGIONS] [--proteins PROTEINS] [--meta]
              [--skip-trna] [--skip-tmrna] [--skip-rrna] [--skip-ncrna] [--skip-ncrna-region]
              [--skip-crispr] [--skip-cds] [--skip-pseudo] [--skip-sorf] [--skip-gap] [--skip-ori] [--skip-plot]
              [--help] [--verbose] [--debug] [--threads THREADS] [--tmp-dir TMP_DIR] [--version]
@@ -382,6 +383,9 @@ Annotation:
   --locus LOCUS         Locus prefix (default = 'contig')
   --locus-tag LOCUS_TAG
                         Locus tag prefix (default = autogenerated)
+  --locus-tag-increment {1,5,10}
+                        Locus tag increment: 1/5/10 (default = 1)
+
   --keep-contig-headers
                         Keep original contig headers
   --compliant           Force Genbank/ENA/DDJB compliance
diff --git a/bakta/config.py b/bakta/config.py
index 0bf01990..6cb419e0 100644
--- a/bakta/config.py
+++ b/bakta/config.py
@@ -50,6 +50,7 @@
 keep_contig_headers = None
 locus = None
 locus_tag = None
+locus_tag_increment = None
 gram = None
 replicons = None
 compliant = None
@@ -161,7 +162,7 @@ def setup(args):
         taxon = None
 
     # annotation configurations
-    global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, gram, replicons, compliant, user_proteins, meta, regions
+    global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, locus_tag_increment, gram, replicons, compliant, user_proteins, meta, regions
     complete = args.complete
     log.info('complete=%s', complete)
     prodigal_tf = args.prodigal_tf
@@ -217,6 +218,8 @@ def setup(args):
                 log.error("Invalid 'locus-tag' parameter! locus-tag=%s", locus_tag)
                 sys.exit(f"ERROR: invalid 'locus-tag' parameter ({locus_tag})!\nLocus tag prefixes must contain between 1 and 24 alphanumeric characters or '_.-' signs.")
     log.info('locus-tag=%s', locus_tag)
+    locus_tag_increment = args.locus_tag_increment
+    log.info('locus-tag-increment=%s', locus_tag_increment)
     keep_contig_headers = args.keep_contig_headers
     log.info('keep_contig_headers=%s', keep_contig_headers)
     replicons = args.replicons
diff --git a/bakta/main.py b/bakta/main.py
index 4da9e96b..1765cb20 100755
--- a/bakta/main.py
+++ b/bakta/main.py
@@ -476,15 +476,15 @@ def main():
     log.info('selected features=%i', len(features))
     print(f'selected: {len(features)}')
 
-    locus_tag_nr = 5
     # use user provided locus tag if not None/non-empty or generate a sequence based locus prefix
     locus_tag_prefix = cfg.locus_tag if cfg.locus_tag else bu.create_locus_tag_prefix(contigs)
     log.info('locus tag prefix=%s', locus_tag_prefix)
+    locus_tag_nr = cfg.locus_tag_increment
     for feature in features:
-        locus_tag = f'{locus_tag_prefix}_{locus_tag_nr:05}'
+        locus_tag = f'{locus_tag_prefix}_{locus_tag_nr:0{len(str(cfg.locus_tag_increment*len(features)))+1}}'  # zero-pad width: digits of (increment * number of features) plus one
         if(feature['type'] in [bc.FEATURE_T_RNA, bc.FEATURE_TM_RNA, bc.FEATURE_R_RNA, bc.FEATURE_NC_RNA, bc.FEATURE_CDS, bc.FEATURE_SORF]):
             feature['locus'] = locus_tag
-            locus_tag_nr += 5
+            locus_tag_nr += cfg.locus_tag_increment
 
     ############################################################################
     # Improve annotations
diff --git a/bakta/utils.py b/bakta/utils.py
index 8760d0f3..46bae8fe 100644
--- a/bakta/utils.py
+++ b/bakta/utils.py
@@ -86,6 +86,7 @@ def parse_arguments():
     arg_group_annotation.add_argument('--gram', action='store', default=bc.GRAM_UNKNOWN, choices=[bc.GRAM_POSITIVE, bc.GRAM_NEGATIVE, bc.GRAM_UNKNOWN], help=f'Gram type for signal peptide predictions: {bc.GRAM_POSITIVE}/{bc.GRAM_NEGATIVE}/{bc.GRAM_UNKNOWN} (default = {bc.GRAM_UNKNOWN})')
     arg_group_annotation.add_argument('--locus', action='store', default=None, help="Locus prefix (default = 'contig')")
     arg_group_annotation.add_argument('--locus-tag', action='store', default=None, dest='locus_tag', help='Locus tag prefix (default = autogenerated)')
+    arg_group_annotation.add_argument('--locus-tag-increment', action='store', type=int, default=1, choices=[1, 5, 10], dest='locus_tag_increment', help='Locus tag increment: 1/5/10 (default = 1)')
     arg_group_annotation.add_argument('--keep-contig-headers', action='store_true', dest='keep_contig_headers', help='Keep original contig headers')
     arg_group_annotation.add_argument('--compliant', action='store_true', help='Force Genbank/ENA/DDJB compliance')
     arg_group_annotation.add_argument('--replicons', '-r', action='store', default=None, dest='replicons', help='Replicon information table (tsv/csv)')

From a7b3cfde8adfdca559b9aaf67726f253376cb3b1 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Wed, 25 Sep 2024 11:07:32 +0200
Subject: [PATCH 7/7] add locus tag increment parameter tests #279

---
 test/test_args.py | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/test/test_args.py b/test/test_args.py
index e2845f67..0aa14116 100644
--- a/test/test_args.py
+++ b/test/test_args.py
@@ -446,6 +446,48 @@ def test_locustag_compliant_ok(parameters, tmpdir):
     assert proc.returncode == 0
 
 
+@pytest.mark.parametrize(
+    'parameters',
+    [
+        (['--locus-tag-increment']),  # flag given without a value
+        (['--locus-tag-increment', '']),  # empty string
+        (['--locus-tag-increment', ' ']),  # whitespace only
+        (['--locus-tag-increment', 'A']),  # non-numeric value
+        (['--locus-tag-increment', 'a']),  # non-numeric value
+        (['--locus-tag-increment', '0']),  # number outside the accepted choices
+        (['--locus-tag-increment', '11']),  # number outside the accepted choices
+    ]
+)
+def test_locustag_increment_failiing(parameters, tmpdir):
+    # each invalid '--locus-tag-increment' argument above must make bakta exit with a non-zero return code
+    proc = run(
+        ['bin/bakta', '--db', 'test/db', '--output', tmpdir, '--force', '--skip-plot'] +
+        parameters +
+        SKIP_PARAMETERS +
+        ['test/data/NC_002127.1.fna']
+    )
+    assert proc.returncode != 0
+
+
+@pytest.mark.parametrize(
+    'parameters',
+    [
+        (['--locus-tag-increment', '1']),
+        (['--locus-tag-increment', '5']),
+        (['--locus-tag-increment', '10'])
+    ]
+)
+def test_locustag_increment_ok(parameters, tmpdir):
+    # each '--locus-tag-increment' value listed above must let bakta finish with return code 0
+    proc = run(
+        ['bin/bakta', '--db', 'test/db', '--output', tmpdir, '--force', '--skip-plot'] +
+        parameters +
+        SKIP_PARAMETERS +
+        ['test/data/NC_002127.1.fna']
+    )
+    assert proc.returncode == 0
+
+
 @pytest.mark.parametrize(
     'parameters',
     [