diff --git a/README.md b/README.md
index 0f4bc82e..b591631f 100644
--- a/README.md
+++ b/README.md
@@ -285,6 +285,7 @@ Annotation results are provided in standard bioinformatics file formats:
 - `.fna`: replicon/contig DNA sequences as FASTA
 - `.ffn`: feature nucleotide sequences as FASTA
 - `.faa`: CDS/sORF amino acid sequences as FASTA
+- `.inference.tsv`: inference metrics (score, evalue, coverage, identity) for annotated accessions as TSV
 - `.hypotheticals.tsv`: further information on hypothetical protein CDS as simple human readble tab separated values
 - `.hypotheticals.faa`: hypothetical protein CDS amino acid sequences as FASTA
 - `.json`: all (internal) annotation & sequence information as JSON
diff --git a/bakta/expert/protein_sequences.py b/bakta/expert/protein_sequences.py
index fee7fa2b..982c7011 100644
--- a/bakta/expert/protein_sequences.py
+++ b/bakta/expert/protein_sequences.py
@@ -26,7 +26,7 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path, expert_system: str, db_pa
         '--db', str(db_path),
         '--out', str(diamond_output_path),
         '--max-target-seqs', '1',  # single best output
-        '--outfmt', '6', 'qseqid', 'sseqid', 'slen', 'length', 'pident', 'evalue', 'bitscore', 'stitle',
+        '--outfmt', '6', 'qseqid', 'sseqid', 'qlen', 'slen', 'length', 'pident', 'evalue', 'bitscore', 'stitle',
         '--threads', str(cfg.threads),
         '--tmpdir', str(cfg.tmp_path),  # use tmp folder
         '--block-size', '4',  # increase block size for faster executions
@@ -51,13 +51,13 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path, expert_system: str, db_pa
     cds_by_hexdigest = orf.get_orf_dictionary(cdss)
     with diamond_output_path.open() as fh:
         for line in fh:
-            (aa_identifier, model_id, model_length, alignment_length, identity, evalue, bitscore, model_title) = line.strip().split('\t')
+            (aa_identifier, model_id, query_length, model_length, alignment_length, identity, evalue, bitscore, model_title) = line.strip().split('\t')
             cds = cds_by_hexdigest[aa_identifier]
             query_cov = int(alignment_length) / len(cds['aa'])
             model_cov = int(alignment_length) / int(model_length)
             identity = float(identity) / 100
-            evalue = float(evalue)
             bitscore = float(bitscore)
+            evalue = float(evalue)
             (source, rank, min_identity, min_query_cov, min_model_cov, gene, product, dbxrefs) = model_title.split(' ', 1)[1].split('~~~')
             rank = int(rank)
             min_identity = float(min_identity) / 100
@@ -72,10 +72,10 @@
                 'gene': gene if gene != '' else None,
                 'product': product,
                 'query_cov': query_cov,
-                'model_cov': model_cov,
+                'subject_cov': model_cov,
                 'identity': identity,
+                'score': bitscore,
                 'evalue': evalue,
-                'bitscore': bitscore,
                 'db_xrefs': [] if dbxrefs == '' else dbxrefs.split(',')
             }
             if(expert_system == 'user_proteins'):
@@ -83,8 +83,8 @@
             cds.setdefault('expert', [])
             cds['expert'].append(hit)
             log.debug(
-                'hit: source=%s, rank=%i, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, model-cov=%0.3f, identity=%0.3f, gene=%s, product=%s, evalue=%1.1e, bitscore=%f',
-                source, rank, cds['contig'], cds['start'], cds['stop'], cds['strand'], query_cov, model_cov, identity, gene, product, evalue, bitscore
+                'hit: source=%s, rank=%i, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, gene=%s, product=%s',
+                source, rank, cds['contig'], cds['start'], cds['stop'], cds['strand'], query_cov, model_cov, identity, bitscore, evalue, gene, product
             )
             cds_found.add(aa_identifier)
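Editor's note: every DIAMOND call in this diff converges on the same explicit `--outfmt 6` column list (qseqid, sseqid, qlen, slen, length, pident, evalue, bitscore). A minimal standalone sketch of how one such tab-separated hit line yields both coverages; the hit line and variable names here are fabricated for illustration, not Bakta's actual parser:

# Illustrative only: parse one DIAMOND hit using the new column order.
line = 'cds_1\tUniRef90_EXAMPLE\t284\t290\t280\t97.5\t1.2e-150\t412.0'
(qseqid, sseqid, qlen, slen, length, pident, evalue, bitscore) = line.split('\t')
query_cov = int(length) / int(qlen)    # alignment length relative to the query
subject_cov = int(length) / int(slen)  # alignment length relative to the subject
identity = float(pident) / 100         # DIAMOND reports percent identity
print(f'{qseqid}: query-cov={query_cov:0.3f}, subject-cov={subject_cov:0.3f}, identity={identity:0.3f}')
# cds_1: query-cov=0.986, subject-cov=0.966, identity=0.975

Note that the hunk above keeps computing query coverage from the stored protein length (len(cds['aa'])) rather than from qlen, so qlen is parsed but unused there.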
diff --git a/bakta/features/cds.py b/bakta/features/cds.py
index 19e555b1..fb4f91cf 100644
--- a/bakta/features/cds.py
+++ b/bakta/features/cds.py
@@ -528,7 +528,7 @@ def predict_pseudo_candidates(hypotheticals: Sequence[dict]) -> Sequence[dict]:
         '--query-cover', str(int(bc.MIN_PSEUDOGENE_QUERY_COVERAGE * 100)),  # '80'
         '--subject-cover', str(int(bc.MIN_PSEUDOGENE_SUBJECT_COVERAGE * 100)),  # '40'
         '--max-target-seqs', '1',  # single best output
-        '--outfmt', '6', 'qseqid', 'sseqid', 'pident', 'length', 'qstart', 'qend', 'sstart', 'send', 'full_sseq',
+        '--outfmt', '6', 'qseqid', 'sseqid', 'qlen', 'slen', 'length', 'pident', 'evalue', 'bitscore', 'qstart', 'qend', 'sstart', 'send', 'full_sseq',
         '--threads', str(cfg.threads),
         '--tmpdir', str(cfg.tmp_path),
         '--block-size', '3',  # slightly increase block size for faster executions
@@ -552,18 +552,21 @@
     cds_by_hexdigest = orf.get_orf_dictionary(hypotheticals)
     with diamond_output_path.open() as fh:
         for line in fh:
-            (aa_identifier, cluster_id, identity, alignment_length, query_start, query_end, subject_start, subject_end, subject_sequence) = line.rstrip('\n').split('\t')
+            (aa_identifier, cluster_id, query_length, subject_length, alignment_length, identity, evalue, bitscore, query_start, query_end, subject_start, subject_end, subject_sequence) = line.rstrip('\n').split('\t')
             cds = cds_by_hexdigest[aa_identifier]
             query_cov = int(alignment_length) / len(cds['aa'])
-            subject_cov = int(alignment_length) / len(subject_sequence)
+            subject_cov = int(alignment_length) / int(subject_length)
             identity = float(identity) / 100
-            if query_cov >= bc.MIN_PSEUDOGENE_QUERY_COVERAGE and identity >= bc.MIN_PSEUDOGENE_IDENTITY \
-                and bc.MIN_PSEUDOGENE_SUBJECT_COVERAGE <= subject_cov < bc.MIN_PSC_COVERAGE:
+            bitscore = float(bitscore)
+            evalue = float(evalue)
+            if(query_cov >= bc.MIN_PSEUDOGENE_QUERY_COVERAGE and bc.MIN_PSEUDOGENE_SUBJECT_COVERAGE <= subject_cov < bc.MIN_PSC_COVERAGE and identity >= bc.MIN_PSEUDOGENE_IDENTITY):
                 cds['pseudo-inference'] = {
                     DB_PSC_COL_UNIREF90: cluster_id,
-                    'query-cov': query_cov,
-                    'subject-cov': subject_cov,
+                    'query_cov': query_cov,
+                    'subject_cov': subject_cov,
                     'identity': identity,
+                    'score': bitscore,
+                    'evalue': evalue,
                     'gene_start': int(query_start),
                     'gene_end': int(query_end),
                     'reference_start': int(subject_start),
@@ -572,8 +575,8 @@
                 }
                 pseudo_candidates.append(cds)
                 log.debug(
-                    'pseudogene-candidate: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, UniRef90=%s',
-                    cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, cluster_id
+                    'pseudogene-candidate: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef90=%s',
+                    cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id
                 )
     log.info('found: pseudogene-candidates=%i', len(pseudo_candidates))
     return pseudo_candidates
@@ -662,6 +665,9 @@ def detect_pseudogenes(candidates: Sequence[dict], cdss: Sequence[dict], genome:
             query_alignment_start = int(hit.find('Hit_hsps/Hsp/Hsp_query-from').text)
             query_alignment_stop = int(hit.find('Hit_hsps/Hsp/Hsp_query-to').text)
             alignment_length = int(hit.find('Hit_hsps/Hsp/Hsp_align-len').text)
+            identity = float(hit.find('Hit_hsps/Hsp/Hsp_identity').text) / alignment_length
+            bitscore = float(hit.find('Hit_hsps/Hsp/Hsp_bit-score').text)
+            evalue = float(hit.find('Hit_hsps/Hsp/Hsp_evalue').text)
             if alignment_length == len(cds['aa']):  # skip non-extended genes (full match)
                 log.debug(
@@ -686,7 +692,10 @@
                 'stop': positions['stop'],
                 'observations': clean_observations(observations),
                 'inference': cds['pseudo-inference'],
-                'paralog': is_paralog(uniref90_by_hexdigest, aa_identifier, cluster_id)
+                'paralog': is_paralog(uniref90_by_hexdigest, aa_identifier, cluster_id),
+                'identity': identity,
+                'score': bitscore,
+                'evalue': evalue
             }
 
             effects = []
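Editor's note: the rewritten pseudogene-candidate condition keeps the subject coverage in a half-open window, so a worked example may help. Toy numbers only; the threshold values come from the hunk comments above ('80', '40') and the 80% MIN_PSC_COVERAGE noted elsewhere in this diff:

# Toy check of the candidate window (assumed threshold values, see note above).
MIN_PSEUDOGENE_QUERY_COVERAGE = 0.8
MIN_PSEUDOGENE_SUBJECT_COVERAGE = 0.4
MIN_PSC_COVERAGE = 0.8

alignment_length, query_aa_length, subject_length = 150, 160, 300
query_cov = alignment_length / query_aa_length    # 0.9375: the query aligns almost fully
subject_cov = alignment_length / subject_length   # 0.5: the reference is only half covered
is_candidate = (query_cov >= MIN_PSEUDOGENE_QUERY_COVERAGE
                and MIN_PSEUDOGENE_SUBJECT_COVERAGE <= subject_cov < MIN_PSC_COVERAGE)
print(is_candidate)  # True: looks truncated relative to its reference, worth a pseudogene check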
diff --git a/bakta/features/ori.py b/bakta/features/ori.py
index 735a68b2..82281612 100644
--- a/bakta/features/ori.py
+++ b/bakta/features/ori.py
@@ -72,7 +72,7 @@ def predict_oris(genome: dict, contigs_path: Path, ori_type: str) -> Sequence[di
             if(len(contig_hits) == 1):
                 hits[hit['contig']] = contig_hits
         log.debug(
-            'raw hit: type=%s, contig=%s, start=%i, stop=%i, strand=%s, coverage=%f, identity=%f',
+            'raw hit: type=%s, contig=%s, start=%i, stop=%i, strand=%s, coverage=%0.3f, identity=%0.3f',
             ori_type, hit['contig'], hit['contig_start'], hit['contig_stop'], hit['strand'], hit['coverage'], hit['identity']
         )
diff --git a/bakta/features/s_orf.py b/bakta/features/s_orf.py
index 04e52fdd..74124c32 100644
--- a/bakta/features/s_orf.py
+++ b/bakta/features/s_orf.py
@@ -324,7 +324,7 @@ def search(sorfs: Sequence[dict], cluster_type: str):
         '--query-cover', str(int(bc.MIN_SORF_COVERAGE * 100)),  # '90'
         '--subject-cover', str(int(bc.MIN_SORF_COVERAGE * 100)),  # '90'
         '--max-target-seqs', '1',  # single best output
-        '--outfmt', '6',
+        '--outfmt', '6', 'qseqid', 'sseqid', 'qlen', 'slen', 'length', 'pident', 'evalue', 'bitscore',
         '--threads', str(cfg.threads),
         '--tmpdir', str(cfg.tmp_path),  # use tmp folder
         '--block-size', '3',  # slightly increase block size for faster executions
@@ -347,22 +347,26 @@
     sorf_by_aa_digest = orf.get_orf_dictionary(sorfs)
     with diamond_output_path.open() as fh:
         for line in fh:
-            (sorf_hash, cluster_id, identity, alignment_length, align_mismatches,
-             align_gaps, query_start, query_end, subject_start, subject_end,
-             evalue, bitscore) = line.split('\t')
+            (sorf_hash, cluster_id, query_length, subject_length, alignment_length, identity, evalue, bitscore) = line.split('\t')
             sorf = sorf_by_aa_digest[sorf_hash]
             query_cov = int(alignment_length) / len(sorf['aa'])
+            subject_cov = int(alignment_length) / int(subject_length)
             identity = float(identity) / 100
-            if(query_cov >= bc.MIN_SORF_COVERAGE and identity >= bc.MIN_SORF_IDENTITY):
+            bitscore = float(bitscore)
+            evalue = float(evalue)
+            if(query_cov >= bc.MIN_SORF_COVERAGE and subject_cov >= bc.MIN_SORF_COVERAGE and identity >= bc.MIN_SORF_IDENTITY):
                 result = {
                     'query_cov': query_cov,
-                    'identity': identity
+                    'subject_cov': subject_cov,
+                    'identity': identity,
+                    'score': bitscore,
+                    'evalue': evalue
                 }
                 result[psc.DB_PSC_COL_UNIREF90 if cluster_type == 'full' else pscc.DB_PSCC_COL_UNIREF50] = cluster_id
                 sorf['psc' if cluster_type == 'full' else 'pscc'] = result
                 log.info(
-                    'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, identity=%0.3f, UniRef90=%s',
-                    sorf['contig'], sorf['start'], sorf['stop'], sorf['strand'], len(sorf['aa']), query_cov, identity, cluster_id
+                    'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef=%s',
+                    sorf['contig'], sorf['start'], sorf['stop'], sorf['strand'], len(sorf['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id
                 )
 
     sorfs_found = []
diff --git a/bakta/io/gff.py b/bakta/io/gff.py
index 35fcef08..086ebfde 100644
--- a/bakta/io/gff.py
+++ b/bakta/io/gff.py
@@ -14,9 +14,9 @@
 
 log = logging.getLogger('GFF')
 
 
-def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Path):
+def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Path):
     """Export features in GFF3 format."""
-    log.info('write GFF3: path=%s', gff3_path)
+    log.info('write features: path=%s', gff3_path)
     with gff3_path.open('wt') as fh:
         fh.write('##gff-version 3\n')  # GFF version
diff --git a/bakta/io/insdc.py b/bakta/io/insdc.py
index ba8d630e..5a1de13a 100644
--- a/bakta/io/insdc.py
+++ b/bakta/io/insdc.py
@@ -18,7 +18,7 @@
 log = logging.getLogger('INSDC')
 
 
-def write_insdc(genome: dict, features: Sequence[dict], genbank_output_path: Path, embl_output_path: Path):
+def write_features(genome: dict, features: Sequence[dict], genbank_output_path: Path, embl_output_path: Path):
     log.debug('prepare: genbank=%s, embl=%s', genbank_output_path, embl_output_path)
 
     contig_list = []
diff --git a/bakta/io/tsv.py b/bakta/io/tsv.py
index 8a3eb25a..cb16ac9a 100644
--- a/bakta/io/tsv.py
+++ b/bakta/io/tsv.py
@@ -7,14 +7,18 @@
 import bakta
 import bakta.config as cfg
 import bakta.constants as bc
+import bakta.ips as bips
+import bakta.ups as bups
+import bakta.psc as bpsc
+import bakta.pscc as bpscc
 
 
 log = logging.getLogger('TSV')
 
 
-def write_tsv(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_path: Path):
+def write_features(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_path: Path):
     """Export features in TSV format."""
-    log.info('write tsv: path=%s', tsv_path)
+    log.info('write feature tsv: path=%s', tsv_path)
 
     with tsv_path.open('wt') as fh:
         fh.write('# Annotated with Bakta\n')
@@ -63,10 +67,79 @@
     return
 
 
-def write_features(features: Sequence[dict], header_columns: Sequence[str], mapping: LambdaType, tsv_path: Path):
-    """Export features in TSV format."""
+def write_feature_inferences(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_path: Path):
+    """Export feature inference statistics in TSV format."""
     log.info('write tsv: path=%s', tsv_path)
+    with tsv_path.open('wt') as fh:
+        fh.write('# Annotated with Bakta\n')
+        fh.write(f'# Software: v{bakta.__version__}\n')
+        fh.write(f"# Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
+        fh.write(f'# DOI: {bc.BAKTA_DOI}\n')
+        fh.write(f'# URL: {bc.BAKTA_URL}\n')
+        fh.write('#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tScore\tEvalue\tQuery Cov\tSubject Cov\tId\tAccession\n')
+
+        for contig in contigs:
+            for feat in features_by_contig[contig['id']]:
+                if(feat['type'] in [bc.FEATURE_CDS, bc.FEATURE_SORF]):
+                    score, evalue, query_cov, subject_cov, identity, accession = None, None, None, None, None, '-'
+                    if('ups' in feat or 'ips' in feat):
+                        query_cov = 1
+                        subject_cov = 1
+                        identity = 1
+                        evalue = 0
+                        accession = f"{bc.DB_XREF_UNIREF}:{feat['ips'][bips.DB_IPS_COL_UNIREF100]}" if 'ips' in feat else f"{bc.DB_XREF_UNIPARC}:{feat['ups'][bups.DB_UPS_COL_UNIPARC]}"
+                    elif('psc' in feat or 'pscc' in feat):
+                        psc_type = 'psc' if 'psc' in feat else 'pscc'
+                        query_cov = feat[psc_type]['query_cov']
+                        subject_cov = feat[psc_type]['subject_cov']
+                        identity = feat[psc_type]['identity']
+                        score = feat[psc_type]['score']
+                        evalue = feat[psc_type]['evalue']
+                        accession = f"{bc.DB_XREF_UNIREF}:{feat['psc'][bpsc.DB_PSC_COL_UNIREF90]}" if 'psc' in feat else f"{bc.DB_XREF_UNIREF}:{feat['pscc'][bpscc.DB_PSCC_COL_UNIREF50]}"
+                    fh.write('\t'.join(
+                        [
+                            feat['contig'],
+                            feat['type'],
+                            str(feat['start']),
+                            str(feat['stop']),
+                            feat['strand'],
+                            feat['locus'],
+                            f"{score:0.1f}" if score != None else '-',
+                            ('0.0' if evalue == 0 else f"{evalue:1.1e}") if evalue != None else '-',
+                            ('1.0' if query_cov == 1 else f"{query_cov:0.3f}") if query_cov != None else '-',
+                            ('1.0' if subject_cov == 1 else f"{subject_cov:0.3f}") if subject_cov != None else '-',
+                            ('1.0' if identity == 1 else f"{identity:0.3f}") if identity != None else '-',
+                            accession
+                        ])
+                    )
+                    fh.write('\n')
+                elif(feat['type'] in [bc.FEATURE_T_RNA, bc.FEATURE_R_RNA, bc.FEATURE_NC_RNA, bc.FEATURE_NC_RNA_REGION]):
+                    accession = '-' if feat['type'] == bc.FEATURE_T_RNA else [xref for xref in feat['db_xrefs'] if bc.DB_XREF_RFAM in xref][0]
+                    fh.write('\t'.join(
+                        [
+                            feat['contig'],
+                            feat['type'],
+                            str(feat['start']),
+                            str(feat['stop']),
+                            feat['strand'],
+                            feat['locus'] if 'locus' in feat else '-',
+                            f"{feat['score']:0.1f}",
+                            ('0.0' if feat['evalue'] == 0 else f"{feat['evalue']:1.1e}") if 'evalue' in feat else '-',
+                            ('1.0' if feat['query_cov'] == 1 else f"{feat['query_cov']:0.3f}") if 'query_cov' in feat else '-',
+                            ('1.0' if feat['subject_cov'] == 1 else f"{feat['subject_cov']:0.3f}") if 'subject_cov' in feat else '-',
+                            ('1.0' if feat['identity'] == 1 else f"{feat['identity']:0.3f}") if 'identity' in feat else '-',
+                            accession
+                        ])
+                    )
+                    fh.write('\n')
+    return
+
+
+def write_protein_features(features: Sequence[dict], header_columns: Sequence[str], mapping: LambdaType, tsv_path: Path):
+    """Export protein features in TSV format."""
+    log.info('write protein feature tsv: path=%s', tsv_path)
+
     with tsv_path.open('wt') as fh:
         fh.write(f'#Annotated with Bakta (v{bakta.__version__}): https://github.com/oschwengers/bakta\n')
         fh.write(f"#Database (v{cfg.db_info['major']}.{cfg.db_info['minor']}): https://doi.org/10.5281/zenodo.4247252\n")
@@ -79,7 +152,7 @@
     return
 
 
-def write_hypotheticals_tsv(hypotheticals: Sequence[dict], tsv_path: Path):
+def write_hypotheticals(hypotheticals: Sequence[dict], tsv_path: Path):
     """Export hypothetical information in TSV format."""
     log.info('write hypothetical tsv: path=%s', tsv_path)
 
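Editor's note: write_feature_inferences() backs the `.inference.tsv` entry added to the README above. Under the header it writes, a hypothetical output could look like the following (locus tags and accessions are fabricated; columns are tab-separated):

#Sequence Id	Type	Start	Stop	Strand	Locus Tag	Score	Evalue	Query Cov	Subject Cov	Id	Accession
contig_1	cds	337	2799	+	DEMO_00005	-	0.0	1.0	1.0	1.0	UniRef:UniRef100_EXAMPLE1
contig_1	cds	2801	3733	+	DEMO_00010	512.3	3.4e-120	0.984	0.991	0.887	UniRef:UniRef90_EXAMPLE2
contig_1	tRNA	3810	3885	-	DEMO_00015	85.6	1.1e-20	-	-	-	-

Per the code, exact IPS/UPS matches are reported with coverage and identity 1.0, evalue 0.0, and no bit score ('-'), while PSC/PSCC hits carry the DIAMOND alignment metrics stored during search.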
diff --git a/bakta/main.py b/bakta/main.py
index 1765cb20..01f299b3 100755
--- a/bakta/main.py
+++ b/bakta/main.py
@@ -92,9 +92,10 @@ def main():
     if(cfg.skip_gap): print(f'\tskip gap: {cfg.skip_gap}')
     if(cfg.skip_ori): print(f'\tskip oriC/V/T: {cfg.skip_ori}')
     if(cfg.skip_plot): print(f'\tskip plot: {cfg.skip_plot}')
+    print()
     if(cfg.debug):
-        print(f"\nBakta runs in DEBUG mode! Temporary data will not be destroyed at: {cfg.tmp_path}")
+        print(f"Bakta runs in DEBUG mode! Temporary data will not be destroyed at: {cfg.tmp_path}\n")
     else:
         atexit.register(bu.cleanup, log, cfg.tmp_path)  # register cleanup exit hook
@@ -104,7 +105,7 @@ def main():
     # - apply contig length filter
     # - rename contigs
     ############################################################################
-    print('\nparse genome sequences...')
+    print('parse genome sequences...')
     try:
         contigs = fasta.import_contigs(cfg.genome_path)
         log.info('imported sequences=%i', len(contigs))
@@ -535,16 +536,16 @@ def main():
     print(f'\nexport annotation results to: {cfg.output_path}')
     print('\thuman readable TSV...')
     tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.tsv')
-    tsv.write_tsv(genome['contigs'], features_by_contig, tsv_path)
+    tsv.write_features(genome['contigs'], features_by_contig, tsv_path)
 
     print('\tGFF3...')
     gff3_path = cfg.output_path.joinpath(f'{cfg.prefix}.gff3')
-    gff.write_gff3(genome, features_by_contig, gff3_path)
+    gff.write_features(genome, features_by_contig, gff3_path)
 
     print('\tINSDC GenBank & EMBL...')
     genbank_path = cfg.output_path.joinpath(f'{cfg.prefix}.gbff')
     embl_path = cfg.output_path.joinpath(f'{cfg.prefix}.embl')
-    insdc.write_insdc(genome, features, genbank_path, embl_path)
+    insdc.write_features(genome, features, genbank_path, embl_path)
 
     print('\tgenome sequences...')
     fna_path = cfg.output_path.joinpath(f'{cfg.prefix}.fna')
@@ -558,17 +559,21 @@
     faa_path = cfg.output_path.joinpath(f'{cfg.prefix}.faa')
     fasta.write_faa(features, faa_path)
 
+    print('\tfeature inferences...')
+    tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.inference.tsv')
+    tsv.write_feature_inferences(genome['contigs'], features_by_contig, tsv_path)
+
     if(cfg.skip_plot or cfg.meta):
         print('\tskip generation of circular genome plot...')
     else:
         print('\tcircular genome plot...')
-        plot.write_plot(features, contigs, cfg.output_path)
+        plot.write(features, contigs, cfg.output_path)
 
     if(cfg.skip_cds is False):
         hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS and 'hypothetical' in feat]
         print('\thypothetical TSV...')
         tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.hypotheticals.tsv')
-        tsv.write_hypotheticals_tsv(hypotheticals, tsv_path)
+        tsv.write_hypotheticals(hypotheticals, tsv_path)
 
         print('\ttranslated hypothetical CDS sequences...')
         faa_path = cfg.output_path.joinpath(f'{cfg.prefix}.hypotheticals.faa')
diff --git a/bakta/plot.py b/bakta/plot.py
index 7ec5def9..675b52aa 100644
--- a/bakta/plot.py
+++ b/bakta/plot.py
@@ -180,7 +180,7 @@ def main():
     if args.sequences == 'all':  # write whole genome plot
         print(f'draw circular genome plot (type={plot_type}) containing all sequences...')
-        write_plot(features, contigs, output_path, colors, plot_type=plot_type)
+        write(features, contigs, output_path, colors, plot_type=plot_type)
     else:  # write genome plot containing provided sequences only
         plot_contigs = []
         sequence_identifiers = []
@@ -198,10 +198,10 @@
         plot_name_suffix = '_'.join(sequence_identifiers)
         plot_contig_ids = [c['id'] for c in plot_contigs]
         features = [feat for feat in features if feat['contig'] in plot_contig_ids]
-        write_plot(features, plot_contigs, output_path, colors, plot_name_suffix=plot_name_suffix, plot_type=plot_type)
+        write(features, plot_contigs, output_path, colors, plot_name_suffix=plot_name_suffix, plot_type=plot_type)
 
 
-def write_plot(features, contigs, output_path, colors=COLORS, plot_name_suffix=None, plot_type=bc.PLOT_FEATURES):
+def write(features, contigs, output_path, colors=COLORS, plot_name_suffix=None, plot_type=bc.PLOT_FEATURES):
     # config paths
     circos_path = cfg.tmp_path.joinpath(f'circos')
     circos_path.mkdir(parents=True, exist_ok=True)
diff --git a/bakta/proteins.py b/bakta/proteins.py
index eb827129..a3b51158 100644
--- a/bakta/proteins.py
+++ b/bakta/proteins.py
@@ -154,7 +154,7 @@ def main():
     annotations_path = output_path.joinpath(f'{cfg.prefix}.tsv')
     header_columns = ['ID', 'Length', 'Gene', 'Product', 'EC', 'GO', 'COG', 'RefSeq', 'UniParc', 'UniRef']
     print(f'\tfull annotations (TSV): {annotations_path}')
-    tsv.write_features(aas, header_columns, map_aa_columns, annotations_path)
+    tsv.write_protein_features(aas, header_columns, map_aa_columns, annotations_path)
     full_annotations_path = output_path.joinpath(f'{cfg.prefix}.json')
     print(f'\tfull annotations (JSON): {full_annotations_path}')
     json.write_json(None, aas, full_annotations_path)
@@ -162,7 +162,7 @@
     header_columns = ['ID', 'Length', 'Mol Weight [kDa]', 'Iso El. Point', 'Pfam hits']
     hypotheticals = hypotheticals = [aa for aa in aas if 'hypothetical' in aa]
     print(f'\tinformation on hypotheticals (TSV): {hypotheticals_path}')
-    tsv.write_features(hypotheticals, header_columns, map_hypothetical_columns, hypotheticals_path)
+    tsv.write_protein_features(hypotheticals, header_columns, map_hypothetical_columns, hypotheticals_path)
     aa_output_path = output_path.joinpath(f'{cfg.prefix}.faa')
     print(f'\tannotated sequences (Fasta): {aa_output_path}')
     fasta.write_faa(aas, aa_output_path)
diff --git a/bakta/psc.py b/bakta/psc.py
index 27d80b80..24cb774b 100644
--- a/bakta/psc.py
+++ b/bakta/psc.py
@@ -43,7 +43,7 @@ def search(cdss: Sequence[dict]) -> Tuple[Sequence[dict], Sequence[dict], Sequen
         '--query-cover', str(int(bc.MIN_PSC_COVERAGE * 100)),  # '80'
         '--subject-cover', str(int(bc.MIN_PSC_COVERAGE * 100)),  # '80'
         '--max-target-seqs', '1',  # single best output
-        '--outfmt', '6',
+        '--outfmt', '6', 'qseqid', 'sseqid', 'qlen', 'slen', 'length', 'pident', 'evalue', 'bitscore',
         '--threads', str(cfg.threads),
         '--tmpdir', str(cfg.tmp_path),  # use tmp folder
         '--block-size', '3',  # slightly increase block size for faster executions
@@ -66,22 +66,26 @@
     cds_by_hexdigest = orf.get_orf_dictionary(cdss)
     with diamond_output_path.open() as fh:
         for line in fh:
-            (aa_identifier, cluster_id, identity, alignment_length, align_mismatches,
-             align_gaps, query_start, query_end, subject_start, subject_end,
-             evalue, bitscore) = line.split('\t')
+            (aa_identifier, cluster_id, query_length, subject_length, alignment_length, identity, evalue, bitscore) = line.split('\t')
             cds = cds_by_hexdigest[aa_identifier]
             query_cov = int(alignment_length) / len(cds['aa'])
+            subject_cov = int(alignment_length) / int(subject_length)
             identity = float(identity) / 100
-            if(query_cov >= bc.MIN_PSC_COVERAGE and identity >= bc.MIN_PSCC_IDENTITY):
+            bitscore = float(bitscore)
+            evalue = float(evalue)
+            if(query_cov >= bc.MIN_PSC_COVERAGE and subject_cov >= bc.MIN_PSC_COVERAGE and identity >= bc.MIN_PSCC_IDENTITY):  # use PSCC min id (50%) to allow for pseudogene seeds
                 cds['psc'] = {
                     DB_PSC_COL_UNIREF90: cluster_id,
                     'query_cov': query_cov,
+                    'subject_cov': subject_cov,
                     'identity': identity,
-                    'valid': identity >= bc.MIN_PSC_IDENTITY
+                    'score': bitscore,
+                    'evalue': evalue,
+                    'valid': identity >= bc.MIN_PSC_IDENTITY  # whether a valid PSC hit (id > 90%)
                 }
                 log.debug(
-                    'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, identity=%0.3f, UniRef90=%s',
-                    cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, identity, cluster_id
+                    'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef90=%s',
+                    cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id
                 )
 
     pscs_found = []
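Editor's note: the inline comments on the psc.py hunk explain the two-tier identity check, filtering with MIN_PSCC_IDENTITY while gating the 'valid' flag on MIN_PSC_IDENTITY. A toy illustration, with the 50%/90% values taken from those comments:

# Toy check of the 'valid' flag semantics (assumed threshold values, see note above).
MIN_PSCC_IDENTITY = 0.5   # keep the hit at all (usable as a pseudogene seed)
MIN_PSC_IDENTITY = 0.9    # mark it as a valid PSC annotation

identity = 0.62
kept_as_hit = identity >= MIN_PSCC_IDENTITY
valid_psc = identity >= MIN_PSC_IDENTITY
print(kept_as_hit, valid_psc)  # True False -> stored with 'valid': False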
diff --git a/bakta/pscc.py b/bakta/pscc.py
index ee1be42d..b4b34906 100644
--- a/bakta/pscc.py
+++ b/bakta/pscc.py
@@ -36,7 +36,7 @@ def search(cdss: Sequence[dict]) -> Tuple[Sequence[dict], Sequence[dict], Sequen
         '--query-cover', str(int(bc.MIN_PSC_COVERAGE * 100)),  # '80'
         '--subject-cover', str(int(bc.MIN_PSC_COVERAGE * 100)),  # '80'
         '--max-target-seqs', '1',  # single best output
-        '--outfmt', '6',
+        '--outfmt', '6', 'qseqid', 'sseqid', 'qlen', 'slen', 'length', 'pident', 'evalue', 'bitscore',
         '--threads', str(cfg.threads),
         '--tmpdir', str(cfg.tmp_path),  # use tmp folder
         '--block-size', '3',  # slightly increase block size for faster executions
@@ -59,22 +59,25 @@
     cds_by_hexdigest = orf.get_orf_dictionary(cdss)
     with diamond_output_path.open() as fh:
         for line in fh:
-            (aa_identifier, cluster_id, identity, alignment_length, align_mismatches,
-             align_gaps, query_start, query_end, subject_start, subject_end,
-             evalue, bitscore) = line.split('\t')
+            (aa_identifier, cluster_id, query_length, subject_length, alignment_length, identity, evalue, bitscore) = line.split('\t')
             cds = cds_by_hexdigest[aa_identifier]
             query_cov = int(alignment_length) / len(cds['aa'])
+            subject_cov = int(alignment_length) / int(subject_length)
             identity = float(identity) / 100
-            if(query_cov >= bc.MIN_PSC_COVERAGE and identity >= bc.MIN_PSCC_IDENTITY):
+            bitscore = float(bitscore)
+            evalue = float(evalue)
+            if(query_cov >= bc.MIN_PSC_COVERAGE and subject_cov >= bc.MIN_PSC_COVERAGE and identity >= bc.MIN_PSCC_IDENTITY):
                 cds['pscc'] = {
                     DB_PSCC_COL_UNIREF50: cluster_id,
                     'query_cov': query_cov,
+                    'subject_cov': subject_cov,
                     'identity': identity,
-                    'valid': identity >= bc.MIN_PSC_IDENTITY
+                    'score': bitscore,
+                    'evalue': evalue
                 }
                 log.debug(
-                    'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, identity=%0.3f, UniRef90=%s',
-                    cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, identity, cluster_id
+                    'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef50=%s',
+                    cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id
                 )
 
     psccs_found = []
@@ -120,7 +123,10 @@ def lookup(features: Sequence[dict], pseudo: bool = False):
             if(pseudo):
                 feature[bc.PSEUDOGENE]['pscc'] = pscc
             else:
-                feature['pscc'] = pscc
+                if('pscc' in feature):
+                    feature['pscc'] = {**feature['pscc'], **pscc}  # merge dicts, add PSCC annotation info to PSCC alignment info
+                else:
+                    feature['pscc'] = pscc  # add PSCC annotation info
             no_pscc_lookups += 1
             log.debug(
                 'lookup: contig=%s, start=%i, stop=%i, strand=%s, UniRef50=%s, product=%s',
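Editor's note: the new merge branch in lookup() keeps the DIAMOND alignment metrics that search() stored under feature['pscc'] and layers the database annotation on top. A minimal sketch with hypothetical keys and values:

# Hypothetical dicts; in {**a, **b} the right-hand operand wins on duplicate keys,
# so annotation fields are layered over the previously stored alignment metrics.
alignment_info = {'uniref50_id': 'UniRef50_EXAMPLE', 'query_cov': 0.91, 'identity': 0.62, 'score': 210.0, 'evalue': 3.0e-55}
annotation_info = {'uniref50_id': 'UniRef50_EXAMPLE', 'product': 'hypothetical protein'}
merged = {**alignment_info, **annotation_info}
print(merged['identity'], merged['product'])  # 0.62 hypothetical protein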
diff --git a/scripts/extract-region.py b/scripts/extract-region.py
index 303b07f5..b357b137 100755
--- a/scripts/extract-region.py
+++ b/scripts/extract-region.py
@@ -76,11 +76,11 @@
 print('Write selected features...')
 output_path = Path(args.output).resolve()
 gff3_path = output_path.joinpath(f'{prefix}.gff3')
-gff.write_gff3(genome, features_by_contig, gff3_path)
+gff.write_features(genome, features_by_contig, gff3_path)
 print('\t...INSDC GenBank & EMBL')
 genbank_path = output_path.joinpath(f'{prefix}.gbff')
 embl_path = output_path.joinpath(f'{prefix}.embl')
-insdc.write_insdc(genome, features_selected, genbank_path, embl_path)
+insdc.write_features(genome, features_selected, genbank_path, embl_path)
 print('\t...feature nucleotide sequences')
 ffn_path = output_path.joinpath(f'{prefix}.ffn')
 fasta.write_ffn(features_selected, ffn_path)