Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce feature inference TSV output file #331

Merged
merged 10 commits into from
Oct 9, 2024
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,7 @@ Annotation results are provided in standard bioinformatics file formats:
- `<prefix>.fna`: replicon/contig DNA sequences as FASTA
- `<prefix>.ffn`: feature nucleotide sequences as FASTA
- `<prefix>.faa`: CDS/sORF amino acid sequences as FASTA
- `<prefix>.inference.tsv`: inference metrics (score, evalue, coverage, identity) for annotated accessions as TSV
- `<prefix>.hypotheticals.tsv`: further information on hypothetical protein CDS as simple human readable tab separated values
- `<prefix>.hypotheticals.faa`: hypothetical protein CDS amino acid sequences as FASTA
- `<prefix>.json`: all (internal) annotation & sequence information as JSON
Expand Down
14 changes: 7 additions & 7 deletions bakta/expert/protein_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path, expert_system: str, db_pa
'--db', str(db_path),
'--out', str(diamond_output_path),
'--max-target-seqs', '1', # single best output
'--outfmt', '6', 'qseqid', 'sseqid', 'slen', 'length', 'pident', 'evalue', 'bitscore', 'stitle',
'--outfmt', '6', 'qseqid', 'sseqid', 'qlen', 'slen', 'length', 'pident', 'evalue', 'bitscore', 'stitle',
'--threads', str(cfg.threads),
'--tmpdir', str(cfg.tmp_path), # use tmp folder
'--block-size', '4', # increase block size for faster executions
Expand All @@ -51,13 +51,13 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path, expert_system: str, db_pa
cds_by_hexdigest = orf.get_orf_dictionary(cdss)
with diamond_output_path.open() as fh:
for line in fh:
(aa_identifier, model_id, model_length, alignment_length, identity, evalue, bitscore, model_title) = line.strip().split('\t')
(aa_identifier, model_id, query_length, model_length, alignment_length, identity, evalue, bitscore, model_title) = line.strip().split('\t')
cds = cds_by_hexdigest[aa_identifier]
query_cov = int(alignment_length) / len(cds['aa'])
model_cov = int(alignment_length) / int(model_length)
identity = float(identity) / 100
evalue = float(evalue)
bitscore = float(bitscore)
evalue = float(evalue)
(source, rank, min_identity, min_query_cov, min_model_cov, gene, product, dbxrefs) = model_title.split(' ', 1)[1].split('~~~')
rank = int(rank)
min_identity = float(min_identity) / 100
Expand All @@ -72,19 +72,19 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path, expert_system: str, db_pa
'gene': gene if gene != '' else None,
'product': product,
'query_cov': query_cov,
'model_cov': model_cov,
'subject_cov': model_cov,
'identity': identity,
'score': bitscore,
'evalue': evalue,
'bitscore': bitscore,
'db_xrefs': [] if dbxrefs == '' else dbxrefs.split(',')
}
if(expert_system == 'user_proteins'):
hit['db_xrefs'].append(f'UserProtein:{model_id}')
cds.setdefault('expert', [])
cds['expert'].append(hit)
log.debug(
'hit: source=%s, rank=%i, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, model-cov=%0.3f, identity=%0.3f, gene=%s, product=%s, evalue=%1.1e, bitscore=%f',
source, rank, cds['contig'], cds['start'], cds['stop'], cds['strand'], query_cov, model_cov, identity, gene, product, evalue, bitscore
'hit: source=%s, rank=%i, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, gene=%s, product=%s',
source, rank, cds['contig'], cds['start'], cds['stop'], cds['strand'], query_cov, model_cov, identity, bitscore, evalue, gene, product
)
cds_found.add(aa_identifier)

Expand Down
29 changes: 19 additions & 10 deletions bakta/features/cds.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,7 +528,7 @@ def predict_pseudo_candidates(hypotheticals: Sequence[dict]) -> Sequence[dict]:
'--query-cover', str(int(bc.MIN_PSEUDOGENE_QUERY_COVERAGE * 100)), # '80'
'--subject-cover', str(int(bc.MIN_PSEUDOGENE_SUBJECT_COVERAGE * 100)), # '40'
'--max-target-seqs', '1', # single best output
'--outfmt', '6', 'qseqid', 'sseqid', 'pident', 'length', 'qstart', 'qend', 'sstart', 'send', 'full_sseq',
'--outfmt', '6', 'qseqid', 'sseqid', 'qlen', 'slen', 'length', 'pident', 'evalue', 'bitscore', 'qstart', 'qend', 'sstart', 'send', 'full_sseq',
'--threads', str(cfg.threads),
'--tmpdir', str(cfg.tmp_path),
'--block-size', '3', # slightly increase block size for faster executions
Expand All @@ -552,18 +552,21 @@ def predict_pseudo_candidates(hypotheticals: Sequence[dict]) -> Sequence[dict]:
cds_by_hexdigest = orf.get_orf_dictionary(hypotheticals)
with diamond_output_path.open() as fh:
for line in fh:
(aa_identifier, cluster_id, identity, alignment_length, query_start, query_end, subject_start, subject_end, subject_sequence) = line.rstrip('\n').split('\t')
(aa_identifier, cluster_id, query_length, subject_length, alignment_length, identity, evalue, bitscore, query_start, query_end, subject_start, subject_end, subject_sequence) = line.rstrip('\n').split('\t')
cds = cds_by_hexdigest[aa_identifier]
query_cov = int(alignment_length) / len(cds['aa'])
subject_cov = int(alignment_length) / len(subject_sequence)
subject_cov = int(alignment_length) / int(subject_length)
identity = float(identity) / 100
if query_cov >= bc.MIN_PSEUDOGENE_QUERY_COVERAGE and identity >= bc.MIN_PSEUDOGENE_IDENTITY \
and bc.MIN_PSEUDOGENE_SUBJECT_COVERAGE <= subject_cov < bc.MIN_PSC_COVERAGE:
bitscore = float(bitscore)
evalue = float(evalue)
if(query_cov >= bc.MIN_PSEUDOGENE_QUERY_COVERAGE and bc.MIN_PSEUDOGENE_SUBJECT_COVERAGE <= subject_cov < bc.MIN_PSC_COVERAGE and identity >= bc.MIN_PSEUDOGENE_IDENTITY):
cds['pseudo-inference'] = {
DB_PSC_COL_UNIREF90: cluster_id,
'query-cov': query_cov,
'subject-cov': subject_cov,
'query_cov': query_cov,
'subject_cov': subject_cov,
'identity': identity,
'score': bitscore,
'evalue': evalue,
'gene_start': int(query_start),
'gene_end': int(query_end),
'reference_start': int(subject_start),
Expand All @@ -572,8 +575,8 @@ def predict_pseudo_candidates(hypotheticals: Sequence[dict]) -> Sequence[dict]:
}
pseudo_candidates.append(cds)
log.debug(
'pseudogene-candidate: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, UniRef90=%s',
cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, cluster_id
'pseudogene-candidate: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef90=%s',
cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id
)
log.info('found: pseudogene-candidates=%i', len(pseudo_candidates))
return pseudo_candidates
Expand Down Expand Up @@ -662,6 +665,9 @@ def detect_pseudogenes(candidates: Sequence[dict], cdss: Sequence[dict], genome:
query_alignment_start = int(hit.find('Hit_hsps/Hsp/Hsp_query-from').text)
query_alignment_stop = int(hit.find('Hit_hsps/Hsp/Hsp_query-to').text)
alignment_length = int(hit.find('Hit_hsps/Hsp/Hsp_align-len').text)
identity = float(hit.find('Hit_hsps/Hsp/Hsp_identity').text) / alignment_length
bitscore = float(hit.find('Hit_hsps/Hsp/Hsp_bit-score').text)
evalue = float(hit.find('Hit_hsps/Hsp/Hsp_evalue').text)

if alignment_length == len(cds['aa']): # skip non-extended genes (full match)
log.debug(
Expand All @@ -686,7 +692,10 @@ def detect_pseudogenes(candidates: Sequence[dict], cdss: Sequence[dict], genome:
'stop': positions['stop'],
'observations': clean_observations(observations),
'inference': cds['pseudo-inference'],
'paralog': is_paralog(uniref90_by_hexdigest, aa_identifier, cluster_id)
'paralog': is_paralog(uniref90_by_hexdigest, aa_identifier, cluster_id),
'identity': identity,
'score': bitscore,
'evalue': evalue
}

effects = []
Expand Down
2 changes: 1 addition & 1 deletion bakta/features/ori.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def predict_oris(genome: dict, contigs_path: Path, ori_type: str) -> Sequence[di
if(len(contig_hits) == 1):
hits[hit['contig']] = contig_hits
log.debug(
'raw hit: type=%s, contig=%s, start=%i, stop=%i, strand=%s, coverage=%f, identity=%f',
'raw hit: type=%s, contig=%s, start=%i, stop=%i, strand=%s, coverage=%0.3f, identity=%0.3f',
ori_type, hit['contig'], hit['contig_start'], hit['contig_stop'], hit['strand'], hit['coverage'], hit['identity']
)

Expand Down
20 changes: 12 additions & 8 deletions bakta/features/s_orf.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ def search(sorfs: Sequence[dict], cluster_type: str):
'--query-cover', str(int(bc.MIN_SORF_COVERAGE * 100)), # '90'
'--subject-cover', str(int(bc.MIN_SORF_COVERAGE * 100)), # '90'
'--max-target-seqs', '1', # single best output
'--outfmt', '6',
'--outfmt', '6', 'qseqid', 'sseqid', 'qlen', 'slen', 'length', 'pident', 'evalue', 'bitscore',
'--threads', str(cfg.threads),
'--tmpdir', str(cfg.tmp_path), # use tmp folder
'--block-size', '3', # slightly increase block size for faster executions
Expand All @@ -347,22 +347,26 @@ def search(sorfs: Sequence[dict], cluster_type: str):
sorf_by_aa_digest = orf.get_orf_dictionary(sorfs)
with diamond_output_path.open() as fh:
for line in fh:
(sorf_hash, cluster_id, identity, alignment_length, align_mismatches,
align_gaps, query_start, query_end, subject_start, subject_end,
evalue, bitscore) = line.split('\t')
(sorf_hash, cluster_id, query_length, subject_length, alignment_length, identity, evalue, bitscore) = line.split('\t')
sorf = sorf_by_aa_digest[sorf_hash]
query_cov = int(alignment_length) / len(sorf['aa'])
subject_cov = int(alignment_length) / int(subject_length)
identity = float(identity) / 100
if(query_cov >= bc.MIN_SORF_COVERAGE and identity >= bc.MIN_SORF_IDENTITY):
bitscore = float(bitscore)
evalue = float(evalue)
if(query_cov >= bc.MIN_SORF_COVERAGE and subject_cov >= bc.MIN_SORF_COVERAGE and identity >= bc.MIN_SORF_IDENTITY):
result = {
'query_cov': query_cov,
'identity': identity
'subject_cov': subject_cov,
'identity': identity,
'score': bitscore,
'evalue': evalue
}
result[psc.DB_PSC_COL_UNIREF90 if cluster_type == 'full' else pscc.DB_PSCC_COL_UNIREF50] = cluster_id
sorf['psc' if cluster_type == 'full' else 'pscc'] = result
log.info(
'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, identity=%0.3f, UniRef90=%s',
sorf['contig'], sorf['start'], sorf['stop'], sorf['strand'], len(sorf['aa']), query_cov, identity, cluster_id
'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef=%s',
sorf['contig'], sorf['start'], sorf['stop'], sorf['strand'], len(sorf['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id
)

sorfs_found = []
Expand Down
4 changes: 2 additions & 2 deletions bakta/io/gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
log = logging.getLogger('GFF')


def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Path):
def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Path):
"""Export features in GFF3 format."""
log.info('write GFF3: path=%s', gff3_path)
log.info('write features: path=%s', gff3_path)

with gff3_path.open('wt') as fh:
fh.write('##gff-version 3\n') # GFF version
Expand Down
2 changes: 1 addition & 1 deletion bakta/io/insdc.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
log = logging.getLogger('INSDC')


def write_insdc(genome: dict, features: Sequence[dict], genbank_output_path: Path, embl_output_path: Path):
def write_features(genome: dict, features: Sequence[dict], genbank_output_path: Path, embl_output_path: Path):
log.debug('prepare: genbank=%s, embl=%s', genbank_output_path, embl_output_path)

contig_list = []
Expand Down
83 changes: 78 additions & 5 deletions bakta/io/tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,18 @@
import bakta
import bakta.config as cfg
import bakta.constants as bc
import bakta.ips as bips
import bakta.ups as bups
import bakta.psc as bpsc
import bakta.pscc as bpscc


log = logging.getLogger('TSV')


def write_tsv(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_path: Path):
def write_features(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_path: Path):
"""Export features in TSV format."""
log.info('write tsv: path=%s', tsv_path)
log.info('write feature tsv: path=%s', tsv_path)

with tsv_path.open('wt') as fh:
fh.write('# Annotated with Bakta\n')
Expand Down Expand Up @@ -63,10 +67,79 @@ def write_tsv(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_
return


def write_features(features: Sequence[dict], header_columns: Sequence[str], mapping: LambdaType, tsv_path: Path):
"""Export features in TSV format."""
def write_feature_inferences(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_path: Path):
    """Export feature inference statistics in TSV format.

    Writes one row per CDS/sORF or RNA feature containing the alignment
    metrics (score, evalue, query/subject coverage, identity) and the
    accession of the annotation source. Metrics that are unavailable for a
    feature are written as '-'.

    :param contigs: sequence of contig dicts (iterated in order via 'id')
    :param features_by_contig: features grouped by contig id
    :param tsv_path: output path of the inference TSV file
    """
    log.info('write tsv: path=%s', tsv_path)

    def _fmt_score(value):
        # bitscore column: one decimal place, '-' when unavailable
        return f'{value:0.1f}' if value is not None else '-'

    def _fmt_evalue(value):
        # evalue column: scientific notation, exact zero printed as '0.0'
        if value is None:
            return '-'
        return '0.0' if value == 0 else f'{value:1.1e}'

    def _fmt_fraction(value):
        # coverage/identity columns: three decimal places, exact 1 printed as '1.0'
        if value is None:
            return '-'
        return '1.0' if value == 1 else f'{value:0.3f}'

    with tsv_path.open('wt') as fh:
        fh.write('# Annotated with Bakta\n')
        fh.write(f'# Software: v{bakta.__version__}\n')
        fh.write(f"# Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
        fh.write(f'# DOI: {bc.BAKTA_DOI}\n')
        fh.write(f'# URL: {bc.BAKTA_URL}\n')
        fh.write('#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tScore\tEvalue\tQuery Cov\tSubject Cov\tId\tAccession\n')

        for contig in contigs:
            for feat in features_by_contig[contig['id']]:
                if(feat['type'] in [bc.FEATURE_CDS, bc.FEATURE_SORF]):
                    score, evalue, query_cov, subject_cov, identity, accession = None, None, None, None, None, '-'
                    if('ups' in feat or 'ips' in feat):
                        # exact protein sequence matches (UPS/IPS): full coverage/identity by definition
                        query_cov = 1
                        subject_cov = 1
                        identity = 1
                        evalue = 0
                        accession = f"{bc.DB_XREF_UNIREF}:{feat['ips'][bips.DB_IPS_COL_UNIREF100]}" if 'ips' in feat else f"{bc.DB_XREF_UNIPARC}:{feat['ups'][bups.DB_UPS_COL_UNIPARC]}"
                    elif('psc' in feat or 'pscc' in feat):
                        # cluster hits (PSC=UniRef90, PSCC=UniRef50): take metrics from the alignment
                        psc_type = 'psc' if 'psc' in feat else 'pscc'
                        query_cov = feat[psc_type]['query_cov']
                        subject_cov = feat[psc_type]['subject_cov']
                        identity = feat[psc_type]['identity']
                        score = feat[psc_type]['score']
                        evalue = feat[psc_type]['evalue']
                        accession = f"{bc.DB_XREF_UNIREF}:{feat['psc'][bpsc.DB_PSC_COL_UNIREF90]}" if 'psc' in feat else f"{bc.DB_XREF_UNIREF}:{feat['pscc'][bpscc.DB_PSCC_COL_UNIREF50]}"
                    fh.write('\t'.join(
                        [
                            feat['contig'],
                            feat['type'],
                            str(feat['start']),
                            str(feat['stop']),
                            feat['strand'],
                            feat['locus'],  # NOTE(review): assumes CDS/sORF always carry a locus tag — the RNA branch below guards this; confirm
                            _fmt_score(score),
                            _fmt_evalue(evalue),
                            _fmt_fraction(query_cov),
                            _fmt_fraction(subject_cov),
                            _fmt_fraction(identity),
                            accession
                        ])
                    )
                    fh.write('\n')
                elif(feat['type'] in [bc.FEATURE_T_RNA, bc.FEATURE_R_RNA, bc.FEATURE_NC_RNA, bc.FEATURE_NC_RNA_REGION]):
                    # tRNAs have no Rfam accession; other RNA types are expected to carry one
                    # NOTE(review): [0] raises IndexError if no RFAM xref is present — confirm upstream guarantee
                    accession = '-' if feat['type'] == bc.FEATURE_T_RNA else [xref for xref in feat['db_xrefs'] if bc.DB_XREF_RFAM in xref][0]
                    fh.write('\t'.join(
                        [
                            feat['contig'],
                            feat['type'],
                            str(feat['start']),
                            str(feat['stop']),
                            feat['strand'],
                            feat['locus'] if 'locus' in feat else '-',
                            f"{feat['score']:0.1f}",
                            _fmt_evalue(feat.get('evalue')),
                            _fmt_fraction(feat.get('query_cov')),
                            _fmt_fraction(feat.get('subject_cov')),
                            _fmt_fraction(feat.get('identity')),
                            accession
                        ])
                    )
                    fh.write('\n')
    return


def write_protein_features(features: Sequence[dict], header_columns: Sequence[str], mapping: LambdaType, tsv_path: Path):
"""Export protein features in TSV format."""
log.info('write protein feature tsv: path=%s', tsv_path)

with tsv_path.open('wt') as fh:
fh.write(f'#Annotated with Bakta (v{bakta.__version__}): https://github.com/oschwengers/bakta\n')
fh.write(f"#Database (v{cfg.db_info['major']}.{cfg.db_info['minor']}): https://doi.org/10.5281/zenodo.4247252\n")
Expand All @@ -79,7 +152,7 @@ def write_features(features: Sequence[dict], header_columns: Sequence[str], mapp
return


def write_hypotheticals_tsv(hypotheticals: Sequence[dict], tsv_path: Path):
def write_hypotheticals(hypotheticals: Sequence[dict], tsv_path: Path):
"""Export hypothetical information in TSV format."""
log.info('write hypothetical tsv: path=%s', tsv_path)

Expand Down
Loading
Loading