Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce feature inference TSV output file #331

Merged
merged 10 commits into from
Oct 9, 2024
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,7 @@ Annotation results are provided in standard bioinformatics file formats:
- `<prefix>.fna`: replicon/contig DNA sequences as FASTA
- `<prefix>.ffn`: feature nucleotide sequences as FASTA
- `<prefix>.faa`: CDS/sORF amino acid sequences as FASTA
- `<prefix>.inference.tsv`: inference metrics (score, evalue, coverage, identity) for annotated accessions as TSV
- `<prefix>.hypotheticals.tsv`: further information on hypothetical protein CDS as simple human readable tab separated values
- `<prefix>.hypotheticals.faa`: hypothetical protein CDS amino acid sequences as FASTA
- `<prefix>.json`: all (internal) annotation & sequence information as JSON
Expand Down
14 changes: 7 additions & 7 deletions bakta/expert/protein_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path, expert_system: str, db_pa
'--db', str(db_path),
'--out', str(diamond_output_path),
'--max-target-seqs', '1', # single best output
'--outfmt', '6', 'qseqid', 'sseqid', 'slen', 'length', 'pident', 'evalue', 'bitscore', 'stitle',
'--outfmt', '6', 'qseqid', 'sseqid', 'qlen', 'slen', 'length', 'pident', 'evalue', 'bitscore', 'stitle',
'--threads', str(cfg.threads),
'--tmpdir', str(cfg.tmp_path), # use tmp folder
'--block-size', '4', # increase block size for faster executions
Expand All @@ -51,13 +51,13 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path, expert_system: str, db_pa
cds_by_hexdigest = orf.get_orf_dictionary(cdss)
with diamond_output_path.open() as fh:
for line in fh:
(aa_identifier, model_id, model_length, alignment_length, identity, evalue, bitscore, model_title) = line.strip().split('\t')
(aa_identifier, model_id, query_length, model_length, alignment_length, identity, evalue, bitscore, model_title) = line.strip().split('\t')
cds = cds_by_hexdigest[aa_identifier]
query_cov = int(alignment_length) / len(cds['aa'])
model_cov = int(alignment_length) / int(model_length)
identity = float(identity) / 100
evalue = float(evalue)
bitscore = float(bitscore)
evalue = float(evalue)
(source, rank, min_identity, min_query_cov, min_model_cov, gene, product, dbxrefs) = model_title.split(' ', 1)[1].split('~~~')
rank = int(rank)
min_identity = float(min_identity) / 100
Expand All @@ -72,19 +72,19 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path, expert_system: str, db_pa
'gene': gene if gene != '' else None,
'product': product,
'query_cov': query_cov,
'model_cov': model_cov,
'subject_cov': model_cov,
'identity': identity,
'score': bitscore,
'evalue': evalue,
'bitscore': bitscore,
'db_xrefs': [] if dbxrefs == '' else dbxrefs.split(',')
}
if(expert_system == 'user_proteins'):
hit['db_xrefs'].append(f'UserProtein:{model_id}')
cds.setdefault('expert', [])
cds['expert'].append(hit)
log.debug(
'hit: source=%s, rank=%i, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, model-cov=%0.3f, identity=%0.3f, gene=%s, product=%s, evalue=%1.1e, bitscore=%f',
source, rank, cds['contig'], cds['start'], cds['stop'], cds['strand'], query_cov, model_cov, identity, gene, product, evalue, bitscore
'hit: source=%s, rank=%i, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, gene=%s, product=%s',
source, rank, cds['contig'], cds['start'], cds['stop'], cds['strand'], query_cov, model_cov, identity, bitscore, evalue, gene, product
)
cds_found.add(aa_identifier)

Expand Down
29 changes: 19 additions & 10 deletions bakta/features/cds.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,7 +528,7 @@ def predict_pseudo_candidates(hypotheticals: Sequence[dict]) -> Sequence[dict]:
'--query-cover', str(int(bc.MIN_PSEUDOGENE_QUERY_COVERAGE * 100)), # '80'
'--subject-cover', str(int(bc.MIN_PSEUDOGENE_SUBJECT_COVERAGE * 100)), # '40'
'--max-target-seqs', '1', # single best output
'--outfmt', '6', 'qseqid', 'sseqid', 'pident', 'length', 'qstart', 'qend', 'sstart', 'send', 'full_sseq',
'--outfmt', '6', 'qseqid', 'sseqid', 'qlen', 'slen', 'length', 'pident', 'evalue', 'bitscore', 'qstart', 'qend', 'sstart', 'send', 'full_sseq',
'--threads', str(cfg.threads),
'--tmpdir', str(cfg.tmp_path),
'--block-size', '3', # slightly increase block size for faster executions
Expand All @@ -552,18 +552,21 @@ def predict_pseudo_candidates(hypotheticals: Sequence[dict]) -> Sequence[dict]:
cds_by_hexdigest = orf.get_orf_dictionary(hypotheticals)
with diamond_output_path.open() as fh:
for line in fh:
(aa_identifier, cluster_id, identity, alignment_length, query_start, query_end, subject_start, subject_end, subject_sequence) = line.rstrip('\n').split('\t')
(aa_identifier, cluster_id, query_length, subject_length, alignment_length, identity, evalue, bitscore, query_start, query_end, subject_start, subject_end, subject_sequence) = line.rstrip('\n').split('\t')
cds = cds_by_hexdigest[aa_identifier]
query_cov = int(alignment_length) / len(cds['aa'])
subject_cov = int(alignment_length) / len(subject_sequence)
subject_cov = int(alignment_length) / int(subject_length)
identity = float(identity) / 100
if query_cov >= bc.MIN_PSEUDOGENE_QUERY_COVERAGE and identity >= bc.MIN_PSEUDOGENE_IDENTITY \
and bc.MIN_PSEUDOGENE_SUBJECT_COVERAGE <= subject_cov < bc.MIN_PSC_COVERAGE:
bitscore = float(bitscore)
evalue = float(evalue)
if(query_cov >= bc.MIN_PSEUDOGENE_QUERY_COVERAGE and bc.MIN_PSEUDOGENE_SUBJECT_COVERAGE <= subject_cov < bc.MIN_PSC_COVERAGE and identity >= bc.MIN_PSEUDOGENE_IDENTITY):
cds['pseudo-inference'] = {
DB_PSC_COL_UNIREF90: cluster_id,
'query-cov': query_cov,
'subject-cov': subject_cov,
'query_cov': query_cov,
'subject_cov': subject_cov,
'identity': identity,
'score': bitscore,
'evalue': evalue,
'gene_start': int(query_start),
'gene_end': int(query_end),
'reference_start': int(subject_start),
Expand All @@ -572,8 +575,8 @@ def predict_pseudo_candidates(hypotheticals: Sequence[dict]) -> Sequence[dict]:
}
pseudo_candidates.append(cds)
log.debug(
'pseudogene-candidate: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, UniRef90=%s',
cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, cluster_id
'pseudogene-candidate: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef90=%s',
cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id
)
log.info('found: pseudogene-candidates=%i', len(pseudo_candidates))
return pseudo_candidates
Expand Down Expand Up @@ -662,6 +665,9 @@ def detect_pseudogenes(candidates: Sequence[dict], cdss: Sequence[dict], genome:
query_alignment_start = int(hit.find('Hit_hsps/Hsp/Hsp_query-from').text)
query_alignment_stop = int(hit.find('Hit_hsps/Hsp/Hsp_query-to').text)
alignment_length = int(hit.find('Hit_hsps/Hsp/Hsp_align-len').text)
identity = float(hit.find('Hit_hsps/Hsp/Hsp_identity').text) / alignment_length
bitscore = float(hit.find('Hit_hsps/Hsp/Hsp_bit-score').text)
evalue = float(hit.find('Hit_hsps/Hsp/Hsp_evalue').text)

if alignment_length == len(cds['aa']): # skip non-extended genes (full match)
log.debug(
Expand All @@ -686,7 +692,10 @@ def detect_pseudogenes(candidates: Sequence[dict], cdss: Sequence[dict], genome:
'stop': positions['stop'],
'observations': clean_observations(observations),
'inference': cds['pseudo-inference'],
'paralog': is_paralog(uniref90_by_hexdigest, aa_identifier, cluster_id)
'paralog': is_paralog(uniref90_by_hexdigest, aa_identifier, cluster_id),
'identity': identity,
'score': bitscore,
'evalue': evalue
}

effects = []
Expand Down
2 changes: 1 addition & 1 deletion bakta/features/ori.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def predict_oris(genome: dict, contigs_path: Path, ori_type: str) -> Sequence[di
if(len(contig_hits) == 1):
hits[hit['contig']] = contig_hits
log.debug(
'raw hit: type=%s, contig=%s, start=%i, stop=%i, strand=%s, coverage=%f, identity=%f',
'raw hit: type=%s, contig=%s, start=%i, stop=%i, strand=%s, coverage=%0.3f, identity=%0.3f',
ori_type, hit['contig'], hit['contig_start'], hit['contig_stop'], hit['strand'], hit['coverage'], hit['identity']
)

Expand Down
20 changes: 12 additions & 8 deletions bakta/features/s_orf.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ def search(sorfs: Sequence[dict], cluster_type: str):
'--query-cover', str(int(bc.MIN_SORF_COVERAGE * 100)), # '90'
'--subject-cover', str(int(bc.MIN_SORF_COVERAGE * 100)), # '90'
'--max-target-seqs', '1', # single best output
'--outfmt', '6',
'--outfmt', '6', 'qseqid', 'sseqid', 'qlen', 'slen', 'length', 'pident', 'evalue', 'bitscore',
'--threads', str(cfg.threads),
'--tmpdir', str(cfg.tmp_path), # use tmp folder
'--block-size', '3', # slightly increase block size for faster executions
Expand All @@ -347,22 +347,26 @@ def search(sorfs: Sequence[dict], cluster_type: str):
sorf_by_aa_digest = orf.get_orf_dictionary(sorfs)
with diamond_output_path.open() as fh:
for line in fh:
(sorf_hash, cluster_id, identity, alignment_length, align_mismatches,
align_gaps, query_start, query_end, subject_start, subject_end,
evalue, bitscore) = line.split('\t')
(sorf_hash, cluster_id, query_length, subject_length, alignment_length, identity, evalue, bitscore) = line.split('\t')
sorf = sorf_by_aa_digest[sorf_hash]
query_cov = int(alignment_length) / len(sorf['aa'])
subject_cov = int(alignment_length) / int(subject_length)
identity = float(identity) / 100
if(query_cov >= bc.MIN_SORF_COVERAGE and identity >= bc.MIN_SORF_IDENTITY):
bitscore = float(bitscore)
evalue = float(evalue)
if(query_cov >= bc.MIN_SORF_COVERAGE and subject_cov >= bc.MIN_SORF_COVERAGE and identity >= bc.MIN_SORF_IDENTITY):
result = {
'query_cov': query_cov,
'identity': identity
'subject_cov': subject_cov,
'identity': identity,
'score': bitscore,
'evalue': evalue
}
result[psc.DB_PSC_COL_UNIREF90 if cluster_type == 'full' else pscc.DB_PSCC_COL_UNIREF50] = cluster_id
sorf['psc' if cluster_type == 'full' else 'pscc'] = result
log.info(
'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, identity=%0.3f, UniRef90=%s',
sorf['contig'], sorf['start'], sorf['stop'], sorf['strand'], len(sorf['aa']), query_cov, identity, cluster_id
'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef=%s',
sorf['contig'], sorf['start'], sorf['stop'], sorf['strand'], len(sorf['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id
)

sorfs_found = []
Expand Down
4 changes: 2 additions & 2 deletions bakta/io/gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
log = logging.getLogger('GFF')


def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Path):
def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Path):
"""Export features in GFF3 format."""
log.info('write GFF3: path=%s', gff3_path)
log.info('write features: path=%s', gff3_path)

with gff3_path.open('wt') as fh:
fh.write('##gff-version 3\n') # GFF version
Expand Down
2 changes: 1 addition & 1 deletion bakta/io/insdc.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
log = logging.getLogger('INSDC')


def write_insdc(genome: dict, features: Sequence[dict], genbank_output_path: Path, embl_output_path: Path):
def write_features(genome: dict, features: Sequence[dict], genbank_output_path: Path, embl_output_path: Path):
log.debug('prepare: genbank=%s, embl=%s', genbank_output_path, embl_output_path)

contig_list = []
Expand Down
83 changes: 78 additions & 5 deletions bakta/io/tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,18 @@
import bakta
import bakta.config as cfg
import bakta.constants as bc
import bakta.ips as bips
import bakta.ups as bups
import bakta.psc as bpsc
import bakta.pscc as bpscc


log = logging.getLogger('TSV')


def write_tsv(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_path: Path):
def write_features(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_path: Path):
"""Export features in TSV format."""
log.info('write tsv: path=%s', tsv_path)
log.info('write feature tsv: path=%s', tsv_path)

with tsv_path.open('wt') as fh:
fh.write('# Annotated with Bakta\n')
Expand Down Expand Up @@ -63,10 +67,79 @@ def write_tsv(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_
return


def write_features(features: Sequence[dict], header_columns: Sequence[str], mapping: LambdaType, tsv_path: Path):
"""Export features in TSV format."""
def write_feature_inferences(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_path: Path):
    """Export feature inference statistics in TSV format.

    Writes one row per CDS/sORF or RNA feature containing the alignment
    metrics (score, evalue, query/subject coverage, identity) and the
    accession of the annotation source. Metrics that are unavailable for a
    feature are written as '-'.

    :param contigs: sequence of contig dicts (iterated in order via 'id')
    :param features_by_contig: features grouped by contig id
    :param tsv_path: output path of the inference TSV file
    """
    log.info('write tsv: path=%s', tsv_path)

    def _fmt_score(value):
        # bitscore column: one decimal place, '-' when unavailable
        return f'{value:0.1f}' if value is not None else '-'

    def _fmt_evalue(value):
        # evalue column: scientific notation, exact zero printed as '0.0'
        if value is None:
            return '-'
        return '0.0' if value == 0 else f'{value:1.1e}'

    def _fmt_fraction(value):
        # coverage/identity columns: three decimal places, exact 1 printed as '1.0'
        if value is None:
            return '-'
        return '1.0' if value == 1 else f'{value:0.3f}'

    with tsv_path.open('wt') as fh:
        fh.write('# Annotated with Bakta\n')
        fh.write(f'# Software: v{bakta.__version__}\n')
        fh.write(f"# Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
        fh.write(f'# DOI: {bc.BAKTA_DOI}\n')
        fh.write(f'# URL: {bc.BAKTA_URL}\n')
        fh.write('#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tScore\tEvalue\tQuery Cov\tSubject Cov\tId\tAccession\n')

        for contig in contigs:
            for feat in features_by_contig[contig['id']]:
                if(feat['type'] in [bc.FEATURE_CDS, bc.FEATURE_SORF]):
                    score, evalue, query_cov, subject_cov, identity, accession = None, None, None, None, None, '-'
                    if('ups' in feat or 'ips' in feat):
                        # exact protein sequence matches (UPS/IPS): full coverage/identity by definition
                        query_cov = 1
                        subject_cov = 1
                        identity = 1
                        evalue = 0
                        accession = f"{bc.DB_XREF_UNIREF}:{feat['ips'][bips.DB_IPS_COL_UNIREF100]}" if 'ips' in feat else f"{bc.DB_XREF_UNIPARC}:{feat['ups'][bups.DB_UPS_COL_UNIPARC]}"
                    elif('psc' in feat or 'pscc' in feat):
                        # cluster hits (PSC=UniRef90, PSCC=UniRef50): take metrics from the alignment
                        psc_type = 'psc' if 'psc' in feat else 'pscc'
                        query_cov = feat[psc_type]['query_cov']
                        subject_cov = feat[psc_type]['subject_cov']
                        identity = feat[psc_type]['identity']
                        score = feat[psc_type]['score']
                        evalue = feat[psc_type]['evalue']
                        accession = f"{bc.DB_XREF_UNIREF}:{feat['psc'][bpsc.DB_PSC_COL_UNIREF90]}" if 'psc' in feat else f"{bc.DB_XREF_UNIREF}:{feat['pscc'][bpscc.DB_PSCC_COL_UNIREF50]}"
                    fh.write('\t'.join(
                        [
                            feat['contig'],
                            feat['type'],
                            str(feat['start']),
                            str(feat['stop']),
                            feat['strand'],
                            feat['locus'],  # NOTE(review): assumes CDS/sORF always carry a locus tag — the RNA branch below guards this; confirm
                            _fmt_score(score),
                            _fmt_evalue(evalue),
                            _fmt_fraction(query_cov),
                            _fmt_fraction(subject_cov),
                            _fmt_fraction(identity),
                            accession
                        ])
                    )
                    fh.write('\n')
                elif(feat['type'] in [bc.FEATURE_T_RNA, bc.FEATURE_R_RNA, bc.FEATURE_NC_RNA, bc.FEATURE_NC_RNA_REGION]):
                    # tRNAs have no Rfam accession; other RNA types are expected to carry one
                    # NOTE(review): [0] raises IndexError if no RFAM xref is present — confirm upstream guarantee
                    accession = '-' if feat['type'] == bc.FEATURE_T_RNA else [xref for xref in feat['db_xrefs'] if bc.DB_XREF_RFAM in xref][0]
                    fh.write('\t'.join(
                        [
                            feat['contig'],
                            feat['type'],
                            str(feat['start']),
                            str(feat['stop']),
                            feat['strand'],
                            feat['locus'] if 'locus' in feat else '-',
                            f"{feat['score']:0.1f}",
                            _fmt_evalue(feat.get('evalue')),
                            _fmt_fraction(feat.get('query_cov')),
                            _fmt_fraction(feat.get('subject_cov')),
                            _fmt_fraction(feat.get('identity')),
                            accession
                        ])
                    )
                    fh.write('\n')
    return


def write_protein_features(features: Sequence[dict], header_columns: Sequence[str], mapping: LambdaType, tsv_path: Path):
"""Export protein features in TSV format."""
log.info('write protein feature tsv: path=%s', tsv_path)

with tsv_path.open('wt') as fh:
fh.write(f'#Annotated with Bakta (v{bakta.__version__}): https://github.com/oschwengers/bakta\n')
fh.write(f"#Database (v{cfg.db_info['major']}.{cfg.db_info['minor']}): https://doi.org/10.5281/zenodo.4247252\n")
Expand All @@ -79,7 +152,7 @@ def write_features(features: Sequence[dict], header_columns: Sequence[str], mapp
return


def write_hypotheticals_tsv(hypotheticals: Sequence[dict], tsv_path: Path):
def write_hypotheticals(hypotheticals: Sequence[dict], tsv_path: Path):
"""Export hypothetical information in TSV format."""
log.info('write hypothetical tsv: path=%s', tsv_path)

Expand Down
Loading
Loading