diff --git a/.gitignore b/.gitignore index 05d5357..68b54a0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,10 @@ data/ *.pyc +*.o +amr_report +amrfinder +fasta_check +point_mut +gff_check +amrfinder_update +fasta2parts diff --git a/Makefile b/Makefile index e4260e7..6c1fda9 100644 --- a/Makefile +++ b/Makefile @@ -1,47 +1,50 @@ +############################################################################## +# PUBLIC DOMAIN NOTICE This software/database is "United States Government +# Work" under the terms of the United States Copyright Act. It was written as +# part of the authors' official duties for the United States Government and +# thus cannot be copyrighted. This software/database is freely available to the +# public for use without a copyright notice. Restrictions cannot be placed on +# its present or future use. +# +# Although all reasonable efforts have been taken to ensure the accuracy and +# reliability of the software and data, the National Center for Biotechnology +# Information (NCBI) and the U.S. Government do not and cannot warrant the +# performance or results that may be obtained by using this software or data. +# NCBI, NLM, and the U.S. Government disclaim all warranties as to performance, +# merchantability or fitness for any particular purpose. +# +# In any work or product derived from this material, proper attribution of the +# authors as the source of the software or data should be made, using: +# https://ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder/ as the +# citation. +############################################################################### + # the SVNREV is set automatically here for convenience, # but when actually building we should override it like: # make all SVNREV=-D\'SVN_REV=\"$VERSION\"\' or use # a version.txt file ifeq ($(wildcard version.txt),) - VERSION_STRING := $(shell svnversion -n .) + VERSION_STRING := $(shell git describe --tags) else VERSION_STRING := $(shell cat version.txt) endif SVNREV := -D'SVN_REV="$(VERSION_STRING)"' -SHELL=/bin/sh -INSTALL = install +# Define default paths +PREFIX ?= /usr/local +INSTALL=install -CPPFLAGS = -std=gnu++11 \ - -malign-double -fno-math-errno \ - -Wall -Wextra \ - -Wcast-align -Wconversion -Wdeprecated-declarations -Wformat -Winit-self -Wlogical-op \ - -Wmissing-braces -Wmissing-field-initializers -Wmissing-format-attribute -Wmissing-include-dirs \ - -Woverloaded-virtual -pedantic -Wparentheses -Wpointer-arith -Wsequence-point -Wshadow -Wunused \ - -Wsuggest-attribute=format -Wswitch -Wuninitialized -Wsign-conversion -Wuseless-cast \ - -O3 \ - $(SVNREV) \ - -Wno-error=misleading-indentation -Wno-nonnull-compare \ +CPPFLAGS = -std=gnu++11 -pthread -malign-double -fno-math-errno -O3 \ CXX=g++ -COMPILE.cpp= $(CXX) $(CPPFLAGS) -c - -#prefix = /panfs/pan1.be-md.ncbi.nlm.nih.gov/bacterial_pathogens/backup -prefix=/usr/local - -# soft links are created here to INSTALL_DIR -bindir="$(prefix)/bin" +COMPILE.cpp= $(CXX) $(CPPFLAGS) $(SVNREV) -c -datadir=$(prefix)/share -# This is where AMRFinder and the default database will be installed -INSTALL_DIR=$(datadir)/amrfinder +.PHONY: all clean install release +BINARIES= amr_report amrfinder amrfinder_update fasta_check fasta2parts gff_check point_mut -.PHONY: all clean install dist -DISTFILES = Makefile *.cpp *.hpp *.inc test_* amrfinder.pl AMRFinder-dna.sh AMRFinder-prot.sh fasta_check gff_check amr_report - -all: amr_report fasta_check gff_check +all: $(BINARIES) release: clean svnversion . > version.txt @@ -55,35 +58,66 @@ amr_reportOBJS=amr_report.o common.o gff.o amr_report: $(amr_reportOBJS) $(CXX) -o $@ $(amr_reportOBJS) -fasta_check.o: common.hpp common.inc -fasta_checkOBJS=fasta_check.o common.o +amrfinder.o: common.hpp common.inc amrfinder.inc +amrfinderOBJS=amrfinder.o common.o +amrfinder: $(amrfinderOBJS) + $(CXX) -o $@ $(amrfinderOBJS) -pthread + +amrfinder_update.o: common.hpp common.inc amrfinder.inc +amrfinder_updateOBJS=amrfinder_update.o common.o +amrfinder_update: $(amrfinder_updateOBJS) + $(CXX) -o $@ $(amrfinder_updateOBJS) -lcurl + +fasta_check.o: common.hpp common.inc +fasta_checkOBJS=fasta_check.o common.o fasta_check: $(fasta_checkOBJS) $(CXX) -o $@ $(fasta_checkOBJS) +fasta2parts.o: common.hpp common.inc +fasta2partsOBJS=fasta2parts.o common.o +fasta2parts: $(fasta2partsOBJS) + $(CXX) -o $@ $(fasta2partsOBJS) + gff_check.o: common.hpp common.inc gff.hpp gff_checkOBJS=gff_check.o common.o gff.o gff_check: $(gff_checkOBJS) $(CXX) -o $@ $(gff_checkOBJS) +point_mut.o: common.hpp common.inc +point_mutOBJS=point_mut.o common.o +point_mut: $(point_mutOBJS) + $(CXX) -o $@ $(point_mutOBJS) + + clean: rm -f *.o - rm -f amr_report fasta_check gff_check - #rm version.txt + rm -f $(BINARIES) install: - $(INSTALL) -D -t $(INSTALL_DIR) amr_report fasta_check gff_check amrfinder.pl -# @dest=$(INSTALL_DIR); \ -# if [ ! -e $(bindir)/amrfinder.pl ]; \ -# then \ -# ln -s "$$dest/amrfinder.pl" $(bindir); \ -# ln -s "$$dest/amr_report" $(bindir); \ -# ln -s "$$dest/fasta_check" $(bindir); \ -# ln -s "$$dest/gff_check" $(bindir); \ -# else \ -# echo "$$dest/amrfinder.pl already exists, so skipping link creation"; \ -# fi - -DISTDIR=amrfinder.$(VERSION_STRING) + $(INSTALL) -D --target-directory=$(PREFIX)/bin $(BINARIES) + +# amrfinder binaries for github binary release +GITHUB_FILE=amrfinder_binaries_v$(VERSION_STRING) +GITHUB_FILES = test_* $(BINARIES) +github_binaries: + @if [ ! -e version.txt ]; \ + then \ + echo >&2 "version.txt required to make a distribution file"; \ + false; \ + fi +# first recompile amrfinder.o to pick up the new version info + rm amrfinder.o amrfinder + make + mkdir $(GITHUB_FILE) + echo $(VERSION_STRING) > $(GITHUB_FILE)/version.txt + cp $(GITHUB_FILES) $(GITHUB_FILE) + if [ -e $(GITHUB_FILE).tar.gz ]; then rm $(GITHUB_FILE).tar.gz; fi + cd $(GITHUB_FILE); tar cvfz ../$(GITHUB_FILE).tar.gz * +# tar cvfz $(GITHUB_FILE).tar.gz $(GITHUB_FILE)/* + rm -r $(GITHUB_FILE)/* + rmdir $(GITHUB_FILE) + +DISTFILES=$(GITHUB_FILES) Makefile *.cpp *.hpp *.inc dist: @if [ ! -e version.txt ]; \ then \ @@ -104,22 +138,20 @@ dist: rm -r $(DISTDIR)/* rmdir $(DISTDIR) -# amrfinder binaries for github binary release -GITHUB_FILE=amrfinder_binaries_v$(VERSION_STRING) -GITHUB_FILES = test_* amrfinder.pl fasta_check gff_check amr_report -github_binaries: all - @if [ ! -e version.txt ]; \ - then \ - echo >&2 "version.txt required to make a distribution file"; \ - false; \ - fi - mkdir $(GITHUB_FILE) - echo $(VERSION_STRING) > $(GITHUB_FILE)/version.txt - cp $(GITHUB_FILES) $(GITHUB_FILE) - sed "s/curr_version = .*/curr_version = '$(VERSION_STRING)';/" amrfinder.pl > $(GITHUB_FILE)/amrfinder.pl - if [ -e $(GITHUB_FILE).tar.gz ]; then rm $(GITHUB_FILE).tar.gz; fi - cd $(GITHUB_FILE); tar cvfz ../$(GITHUB_FILE).tar.gz * -# tar cvfz $(GITHUB_FILE).tar.gz $(GITHUB_FILE)/* - rm -r $(GITHUB_FILE)/* - rmdir $(GITHUB_FILE) +test: + curl -O https://raw.githubusercontent.com/ncbi/amr/v3b/test_dna.fa \ + -O https://raw.githubusercontent.com/ncbi/amr/v3b/test_prot.fa \ + -O https://raw.githubusercontent.com/ncbi/amr/v3b/test_prot.gff \ + -O https://raw.githubusercontent.com/ncbi/amr/v3b/test_both.expected \ + -O https://raw.githubusercontent.com/ncbi/amr/v3b/test_dna.expected \ + -O https://raw.githubusercontent.com/ncbi/amr/v3b/test_prot.expected + amrfinder --plus -p test_prot.fa -g test_prot.gff -O Campylobacter > test_prot.got + diff test_prot.expected test_prot.got + amrfinder --plus -n test_dna.fa -O Campylobacter > test_dna.got + diff test_dna.expected test_dna.got + amrfinder --plus -n test_dna.fa -p test_prot.fa -g test_prot.gff -O Campylobacter > test_both.got + diff test_both.got test_both.expected + + + -O https://raw.githubusercontent.com/ncbi/amr/v3b/test_prot.expected diff --git a/README.md b/README.md index 7448ccd..f60b438 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# NCBI Antimicrobial Resistance Gene Finder (AMRFinder) +# NCBI Antimicrobial Resistance Gene Finder (AMRFinderPlus) -This software and the accompanying database are designed to find acquired antimicrobial resistance genes in protein or nucleotide sequences. +This software and the accompanying database are designed to find acquired antimicrobial resistance genes and some point mutations in protein or assembled nucleotide sequences. We have also added "plus" stress, head, and biocide resistance as well as some virulence factors and E. coli antigens. ## See [the wiki for documentation](https://github.com/ncbi/amr/wiki) [Subscribe to our announce list](https://www.ncbi.nlm.nih.gov/mailman/listinfo/amrfinder-announce) for announcements of database or software updates. diff --git a/amr_report.cpp b/amr_report.cpp index b9b3c6f..7a1ed62 100644 --- a/amr_report.cpp +++ b/amr_report.cpp @@ -27,11 +27,22 @@ * Author: Vyacheslav Brover * * File Description: -* Identification of AMR genes using the AMR database and BLAST and HMM search results +* Identification of AMR genes using BLAST and HMM search results vs. the NCBI virulence database * */ +// PAR ?? +//#define VER 1 +//#define VER 2 // PD-2074 +#define VER 3 // PD-2109 +// +#if VER <= 0 + #error "Negative version " VER +#endif + + + #undef NDEBUG #include "common.inc" @@ -43,6 +54,7 @@ using namespace GFF_sp; static constexpr bool useCrossOrigin = false; // GPipe: true +static constexpr char pm_delimiter = '_'; @@ -51,9 +63,49 @@ namespace constexpr static double frac_delta = 1e-5; // PAR +const double NaN = numeric_limits::quiet_NaN (); + +// PAR +constexpr double ident_min_def = 0.9; +constexpr double complete_coverage_min_def = 0.9; +constexpr double partial_coverage_min_def = 0.5; +bool ident_min_user = false; +struct BlastRule +// PD-2310 +{ + // 0 <=> undefined + // 0 .. 1 + double ident {0.0}; + double target_coverage {0.0}; + double ref_coverage {0.0}; + + BlastRule (double ident_arg, + //double target_coverage_arg, + double ref_coverage_arg) + : ident (ident_arg) + //, target_coverage (target_coverage_arg) + , ref_coverage (ref_coverage_arg) + { + ASSERT (ident >= 0.0); + ASSERT (ident <= 1.0); + ASSERT (target_coverage >= 0.0); + ASSERT (target_coverage <= 1.0); + ASSERT (ref_coverage >= 0.0); + ASSERT (ref_coverage <= 1.0); + } + BlastRule () = default; + + bool empty () const + { return ! ident; } +}; + +BlastRule defaultCompleteBR; +BlastRule defaultPartialBR; + + struct Batch; // forward @@ -67,31 +119,51 @@ struct Fam string id; string genesymbol; string familyName; - bool reportable {false}; + uchar reportable {0}; + // 0,1,2 // HMM string hmm; // May be empty() - double tc1 {NAN}; - double tc2 {NAN}; + double tc1 {NaN}; + double tc2 {NaN}; + + // BlastRule's + BlastRule completeBR; + BlastRule partialBR; + + string type; + string subtype; + string classS; + string subclass; - Fam (Fam* parent_arg, - const string &id_arg, + Fam (const string &id_arg, const string &genesymbol_arg, const string &hmm_arg, double tc1_arg, double tc2_arg, + const BlastRule &completeBR_arg, + const BlastRule &partialBR_arg, + const string &type_arg, + const string &subtype_arg, + const string &class_arg, + const string &subclass_arg, const string &familyName_arg, - bool reportable_arg) - : parent (parent_arg) - , id (id_arg) + uchar reportable_arg) + : id (id_arg) , genesymbol (genesymbol_arg) , familyName (familyName_arg) , reportable (reportable_arg) , hmm (hmm_arg) , tc1 (tc1_arg) , tc2 (tc2_arg) + , completeBR (completeBR_arg) + , partialBR (partialBR_arg) + , type (type_arg) + , subtype (subtype_arg) + , classS (class_arg) + , subclass (subclass_arg) { if (genesymbol == "-") genesymbol. clear (); if (hmm == "-") @@ -107,10 +179,9 @@ struct Fam ASSERT (tc2 >= 0); ASSERT (tc2 <= tc1); } - Fam () - {} + Fam () = default; void saveText (ostream &os) const - { os << hmm << " " << tc1 << " " << tc2 << " " << familyName << " " << reportable; } + { os << hmm << " " << tc1 << " " << tc2 << " " << familyName << " " << (int) reportable; } bool descendantOf (const Fam* ancestor) const @@ -133,6 +204,12 @@ struct Fam map famId2fam; + // Value: !nullptr + // Not delete'd + + + +struct BlastAlignment; @@ -140,25 +217,31 @@ struct HmmAlignment // Query: AMR HMM { string sseqid; - double score1 {NAN}; - double score2 {NAN}; + double score1 {NaN}; + double score2 {NaN}; // May be different from max(Domain::score) const Fam* fam {nullptr}; // Query //ali_from, ali_to ?? + unique_ptr blastAl; + // (bool)get() HmmAlignment (const string &line, const Batch &batch); // Update: batch.domains - HmmAlignment () - {} + HmmAlignment (const HmmAlignment &other); + HmmAlignment operator= (const HmmAlignment &other); + HmmAlignment () = default; void saveText (ostream &os) const { os << sseqid << ' ' << score1 << ' ' << score2 << ' ' << (fam ? fam->hmm : string ()); } bool good () const - { return fam + { /*cout << fam << endl; + if (fam) + cout << fam->id << ' ' << fam->hmm << ' ' << fam->tc1 << ' ' << fam->tc2 << endl; */ + return fam && ! fam->hmm. empty () && score1 >= fam->tc1 && score2 >= fam->tc2 @@ -193,6 +276,7 @@ struct HmmAlignment bool better (const HmmAlignment &other, unsigned char criterion) const { return betterEq (other, criterion) && ! other. betterEq (*this, criterion); } + bool better (const BlastAlignment &other) const; typedef pair Pair; @@ -212,16 +296,110 @@ struct HmmAlignment Domain (const string &line, Batch &batch); // Input: line: hmmsearch -domtable line - Domain () - {} + Domain () = default; }; }; -double ident_min = NAN; -double complete_cover_min = NAN; -double partial_cover_min = NAN; +struct PointMut +{ + size_t pos {0}; + // In reference sequence + // >= 0 + char alleleChar {' '}; + // Upper-case + // !empty() + string geneMutation; + // Depends on the above + string geneMutationGen; + // geneMutation generalized, may have a different pos + string classS; + string subclass; + string name; + // Species binomial + resistance + bool additional {false}; + + + PointMut (size_t pos_arg, + char alleleChar_arg, + const string &geneMutation_arg, + const string &geneMutationGen_arg, + const string &class_arg, + const string &subclass_arg, + const string &name_arg) + : pos (pos_arg) + , alleleChar (toUpper (alleleChar_arg)) + , geneMutation (geneMutation_arg) + , geneMutationGen (geneMutationGen_arg) + , classS (class_arg) + , subclass (subclass_arg) + , name (name_arg) + { ASSERT (pos > 0); + pos--; + ASSERT (alleleChar != ' '); + ASSERT (! geneMutation. empty ()); + ASSERT (! geneMutationGen. empty ()); + if (isRight (geneMutation, "STOP")) + ASSERT (alleleChar == '*') + else + ASSERT (geneMutation. back () == alleleChar); + ASSERT (geneMutation. front () != alleleChar); + if (isRight (geneMutationGen, "STOP")) + ASSERT (alleleChar == '*') + else + ASSERT (geneMutationGen. back () == alleleChar); + ASSERT (geneMutationGen. front () != alleleChar); + ASSERT (! name. empty ()); + ASSERT (! contains (name, '\t')); + replace (name, '_', ' '); + ASSERT (! contains (name, " ")); + } + PointMut (const string &gene, + size_t pos_arg, + char refChar, + char alleleChar_arg) + : pos (pos_arg) + , alleleChar (alleleChar_arg) + , geneMutation (gene + pm_delimiter + refChar + toString (pos + 1) + alleleChar) + , geneMutationGen (geneMutation) + , name (string (refChar == alleleChar ? "wildtype" : "mutation") + " " + strUpper1 (gene)) + , additional (true) + { ASSERT (! gene. empty ()); + ASSERT (isUpper (refChar)); + ASSERT (isUpper (alleleChar)); + ASSERT (refChar != ' '); + ASSERT (alleleChar != ' '); + } + PointMut () = default; + + + bool empty () const + { return name. empty (); } + string getResistance () const + { size_t p = name. find (' '); + ASSERT (p != string::npos); + p = name. find (' ', p + 1); + ASSERT (p != string::npos); + return name. substr (p + 1); + } + void print (ostream &os) const + { os << pos << ' ' << alleleChar << ' ' << geneMutation << ' ' << geneMutationGen << ' ' << name << endl; } + bool operator< (const PointMut &other) const + { LESS_PART (*this, other, pos); + LESS_PART (*this, other, alleleChar); + return false; + } + bool operator== (const PointMut &other) const + { return pos == other. pos + && alleleChar == other. alleleChar; + } +}; + + +map > accession2pointMuts; + + bool cdsExist = false; bool print_fam = false; @@ -229,16 +407,18 @@ bool reportPseudo = false; const string stopCodonS ("[stop]"); const string frameShiftS ("[frameshift]"); +unique_ptr point_mut_all; + struct BlastAlignment { // BLAST alignment size_t length {0}, nident {0} // aa - , refStart {0}, refEnd {0}, refLen {0} - , targetStart {0}, targetEnd {0}, targetLen {0}; + , refStart {0}, refStop {0}, refLen {0} + , targetStart {0}, targetStop {0}, targetLen {0}; // Positions are 0-based - // targetStart < targetEnd + // start < stop // target string targetName; @@ -253,11 +433,9 @@ struct BlastAlignment bool frameShift {false}; // Reference (AMR) protein - bool refStrand {true}; long gi {0}; // 0 <=> HMM method string accessionProt; - string accessionDna; size_t part {1}; // >= 1 // <= parts @@ -267,12 +445,15 @@ struct BlastAlignment string famId; string gene; // FAM.class - string mechanism; - // FAM.resistance - const bool mutant {false}; + string resistance; + + BlastRule completeBR; + BlastRule partialBR; + string product; - vector cdss; + Vector cdss; + Vector pointMuts; static constexpr size_t mismatchTail_aa = 10; // PAR size_t mismatchTailTarget {0}; @@ -282,77 +463,189 @@ struct BlastAlignment bool targetProt_arg) : targetProt (targetProt_arg) { - istringstream iss (line); - string refName, targetSeq; - iss >> targetName >> refName >> length >> nident >> targetStart >> targetEnd >> targetLen >> refStart >> refEnd >> refLen >> targetSeq; - // format: qseqid sseqid length nident qstart qend qlen sstart send slen sseq - // blastp: ... ... 663 169 2 600 639 9 665 693 ... - // blastx: ... ... 381 381 13407 14549 57298 1 381 381 ... - // blastn: ... ... 733 733 62285 63017 88215 105 837 837 ... - ASSERT (! targetSeq. empty ()); - - // refName - product = rfindSplit (refName, '|'); - //mutant = // str2 (rfindSplit (refName, '|')); // ?? - mechanism = rfindSplit (refName, '|'); - gene = rfindSplit (refName, '|'); // Reportable_vw.class - famId = rfindSplit (refName, '|'); // Reportable_vw.fam - parts = (size_t) str2 (rfindSplit (refName, '|')); - part = (size_t) str2 (rfindSplit (refName, '|')); - accessionDna = rfindSplit (refName, '|'); - accessionProt = rfindSplit (refName, '|'); - gi = str2 (refName); - ASSERT (gi > 0); - - replace (product, '_', ' '); - - ASSERT (refStart != refEnd); - refStrand = refStart < refEnd; - ASSERT (refStrand); // ?? - if (! refStrand) - swap (refStart, refEnd); - - ASSERT (targetStart != targetEnd); - targetStrand = targetStart < targetEnd; - IMPLY (targetProt, targetStrand); - if (! targetStrand) - swap (targetStart, targetEnd); - - ASSERT (refStart >= 1); - ASSERT (targetStart >= 1); - ASSERT (refStart < refEnd); - ASSERT (targetStart < targetEnd); - refStart--; - targetStart--; - - partialDna = false; - constexpr size_t mismatchTailDna = 10; // PAR - if (! targetProt && targetEnd - targetStart >= 30) // PAR, PD-671 - { - if (refStart > 0 && targetTail (true) <= mismatchTailDna) partialDna = true; - else if (refEnd < refLen && targetTail (false) <= mismatchTailDna) partialDna = true; - } - - setTargetAlign (); - - if (contains (targetSeq, "*")) - stopCodon = true; - //frameShift = contains (targetSeq, "/"); // Needs "blastall -p blastx ... " - if (! targetProt && (targetEnd - targetStart) % 3 != 0) - frameShift = true; - - // For BLASTX - // PD-1280 - if ( ! targetProt - && refStart == 0 - && charInSet (targetSeq [0], "LIV") - && nident < targetAlign_aa - ) - nident++; - - mismatchTailTarget = mismatchTail_aa; - if (! targetProt) - mismatchTailTarget *= 3; + try + { + string refName, targetSeq, refSeq; + static Istringstream iss; + iss. reset (line); + iss >> targetName >> refName >> length >> nident >> targetStart >> targetStop >> targetLen >> refStart >> refStop >> refLen >> targetSeq >> refSeq; + // format: qseqid sseqid length nident qstart qend qlen sstart send slen sseq qseq + // blastp: ... ... 663 169 2 600 639 9 665 693 ... + // blastx: ... ... 381 381 13407 14549 57298 1 381 381 ... + ASSERT (! targetSeq. empty ()); + ASSERT (targetSeq. size () == refSeq. size ()); + + try + { + // refName + product = rfindSplit (refName, '|'); + resistance = rfindSplit (refName, '|'); + gene = rfindSplit (refName, '|'); // Reportable_vw.class + famId = rfindSplit (refName, '|'); // Reportable_vw.fam + parts = (size_t) str2 (rfindSplit (refName, '|')); + part = (size_t) str2 (rfindSplit (refName, '|')); + #if VER == 1 + rfindSplit (refName, '|'); // nucl_accession + #endif + accessionProt = rfindSplit (refName, '|'); + gi = str2 (refName); + } + catch (const exception &e) + { + throw runtime_error (string ("Bad AMRFinder database\n") + e. what ()); + } + ASSERT (gi > 0); + + replace (product, '_', ' '); + + + if (! isPointMut ()) + { + // PD-2310 + completeBR. ident = getFam () -> completeBR. ident; + partialBR. ident = getFam () -> partialBR. ident; + } + + completeBR. ref_coverage = complete_coverage_min_def; + partialBR. ref_coverage = defaultPartialBR. ref_coverage; + + if (completeBR. empty ()) + completeBR = defaultCompleteBR; + if (partialBR. empty ()) + partialBR = defaultPartialBR; + + if (ident_min_user) + { + completeBR. ident = defaultCompleteBR. ident; + partialBR. ident = defaultPartialBR. ident; + } + + + ASSERT (refStart < refStop); + + ASSERT (targetStart != targetStop); + targetStrand = targetStart < targetStop; + IMPLY (targetProt, targetStrand); + if (! targetStrand) + swap (targetStart, targetStop); + + ASSERT (refStart >= 1); + ASSERT (targetStart >= 1); + ASSERT (refStart < refStop); + ASSERT (targetStart < targetStop); + refStart--; + targetStart--; + + partialDna = false; + constexpr size_t mismatchTailDna = 10; // PAR + if (! targetProt && targetStop - targetStart >= 30) // PAR, PD-671 + { + if (refStart > 0 && targetTail (true) <= mismatchTailDna) partialDna = true; + else if (refStop < refLen && targetTail (false) <= mismatchTailDna) partialDna = true; + } + + setTargetAlign (); + + if (contains (targetSeq, "*")) + stopCodon = true; + //frameShift = contains (targetSeq, "/"); // Needs "blastall -p blastx ... " + if (! targetProt && (targetStop - targetStart) % 3 != 0) + frameShift = true; + + // For BLASTX + // PD-1280 + if ( ! targetProt + && refStart == 0 + && charInSet (targetSeq [0], "LIV") + && nident < targetAlign_aa + ) + nident++; + + mismatchTailTarget = mismatchTail_aa; + if (! targetProt) + mismatchTailTarget *= 3; + + if (! targetProt) + cdss << move (Locus (0, targetName, targetStart, targetStop, targetStrand, partialDna, 0)); + + strUpper (targetSeq); + strUpper (refSeq); + if (const Vector* pointMuts_all = findPtr (accession2pointMuts, accessionProt)) + { + if (verbose ()) + cout << "PointMut protein found: " << refName << endl; + ASSERT (isPointMut ()); + ASSERT (! pointMuts_all->empty ()); + string s (pointMuts_all->front (). geneMutation); + rfindSplit (s, pm_delimiter); + const string pmGene (s); + size_t refPos = refStart; + size_t j = 0; + while (j < pointMuts_all->size () && pointMuts_all->at (j). pos < refPos) + { + if (point_mut_all. get ()) + { + PointMut pm (pointMuts_all->at (j)); + pm. name = "Non-called " + pmGene; + pm. additional = true; + pointMuts << move (pm); + } + j++; + } + VectorPtr pos_pm; // PointMut's of the same refPos + FFOR (size_t, i, refSeq. size ()) + { + if (refSeq [i] == '-') + continue; + pos_pm. clear (); + while (j < pointMuts_all->size () && pointMuts_all->at (j). pos == refPos) + { + pos_pm << & pointMuts_all->at (j); + j++; + } + if (targetSeq [i] == refSeq [i]) + if (pos_pm. empty ()) + ; + else + { + for (const PointMut* pm : pos_pm) + if (targetSeq [i] == pm->alleleChar) + throw logic_error ("Wildtype allele " + pm->geneMutation); + if (point_mut_all. get ()) + pointMuts << move (PointMut (pmGene, refPos, refSeq [i], targetSeq [i])); + } + else + { + //bool found = false; + for (const PointMut* pm : pos_pm) + if (targetSeq [i] == pm->alleleChar) + { + pointMuts << *pm; + //found = true; + } + #if 0 + if (! found && point_mut_all. get ()) + pointMuts << move (PointMut (pmGene, refPos, refSeq [i], targetSeq [i])); + #endif + } + refPos++; + } + if (point_mut_all. get ()) + while (j < pointMuts_all->size ()) + { + PointMut pm (pointMuts_all->at (j)); + pm. name = "Non-called " + pmGene; + pm. additional = true; + pointMuts << move (pm); + j++; + } + } + } + catch (...) + { + cout << line << endl; + throw; + } } explicit BlastAlignment (const HmmAlignment& other) : targetName (other. sseqid) @@ -366,241 +659,431 @@ struct BlastAlignment { if (! qc_on) return; - ASSERT (! famId. empty ()); - ASSERT (! gene. empty ()); - ASSERT (part >= 1); - ASSERT (part <= parts); - ASSERT (! product. empty ()); - ASSERT (refStrand); - IMPLY (targetProt, targetStrand); - ASSERT (targetStart < targetEnd); - ASSERT (targetEnd <= targetLen); - ASSERT ((bool) gi == (bool) length); - ASSERT ((bool) gi == (bool) refLen); - ASSERT ((bool) gi == (bool) nident); - ASSERT ((bool) gi == ! accessionProt. empty ()); - IMPLY (! gi, getFam () -> getHmmFam ()); - IMPLY (accessionProt. empty (), accessionDna. empty ()); - IMPLY (targetProt, ! partialDna); - //IMPLY (! targetProt, (targetEnd - targetStart) % 3 == 0); - ASSERT (targetAlign); - IMPLY (targetProt, targetAlign == targetAlign_aa); - IMPLY (! targetProt, targetAlign == 3 * targetAlign_aa); - ASSERT (nident <= targetAlign_aa); - IMPLY (! targetProt, cdss. empty ()); + QC_ASSERT (! famId. empty ()); + QC_ASSERT (! gene. empty ()); + QC_ASSERT (part >= 1); + QC_ASSERT (part <= parts); + QC_ASSERT (! product. empty ()); + QC_IMPLY (targetProt, targetStrand); + QC_ASSERT (targetStart < targetStop); + QC_ASSERT (targetStop <= targetLen); + QC_ASSERT ((bool) gi == (bool) length); + QC_ASSERT ((bool) gi == (bool) refLen); + QC_ASSERT ((bool) gi == (bool) nident); + QC_ASSERT ((bool) gi == ! accessionProt. empty ()); + QC_IMPLY (! gi && ! isPointMut (), getFam () -> getHmmFam ()); + QC_IMPLY (targetProt, ! partialDna); + //QC_IMPLY (! targetProt, (targetStop - targetStart) % 3 == 0); + QC_ASSERT (targetAlign); + QC_IMPLY (targetProt, targetAlign == targetAlign_aa); + QC_IMPLY (! targetProt, targetAlign == 3 * targetAlign_aa); + QC_ASSERT (nident <= targetAlign_aa); + //QC_IMPLY (! targetProt, cdss. empty ()); if (gi) { - ASSERT (refStart < refEnd); - ASSERT (nident <= refEnd - refStart); - ASSERT (refEnd <= refLen); - ASSERT (refEnd - refStart <= length); - ASSERT (targetAlign_aa <= length); + QC_ASSERT (refStart < refStop); + QC_ASSERT (nident <= refStop - refStart); + QC_ASSERT (refStop <= refLen); + QC_ASSERT (refStop - refStart <= length); + QC_ASSERT (targetAlign_aa <= length); } + #if 0 + if (targetProt) + for (const Locus& cds : cdss) + QC_ASSERT ( cds. size () == 3 * targetLen + 3 + || cds. size () == 3 * targetLen + ); + #endif + QC_IMPLY (! pointMuts. empty (), isPointMut ()); } void saveText (ostream& os) const { // PD-736, PD-774, PD-780, PD-799 - const string method (getMethod ()); const string na ("NA"); - const string proteinName (refExactlyMatched () || ! gi ? product : nvl (getFam () -> familyName, na)); + const string proteinName (isPointMut () + ? string () + : refExactlyMatched () || ! gi + ? product + : nvl (getFam () -> familyName, na) + ); ASSERT (! contains (proteinName, '\t')); - if (! verbose ()) // Otherwise data may be not completely prepared - if (targetProt && ! (! cdsExist == cdss. empty ())) - { - cout << endl << targetName << '!' << endl; - ERROR; - } - vector cdss_ (cdss); + Vector cdss_ (cdss); if (cdss_. empty ()) - cdss_. push_back (Cds ()); - for (const Cds& cds : cdss_) - { - TabDel td (2, false); - //if (targetProt) - td << targetName; - if (cdsExist) - td << (cds. contig. empty () ? targetName : cds. contig) - << (cds. contig. empty () ? targetStart : cds. start) + 1 - << (cds. contig. empty () ? targetEnd : cds. stop) - << (cds. contig. empty () ? (/*refStrand*/ targetStrand ? '+' : '-') : (cds. strand ? '+' : '-')); - td << (print_fam - ? famId - : (method == "ALLELE" ? famId : nvl (getFam () -> genesymbol, na)) - ) - << proteinName + ifS (reportPseudo, ifS (stopCodon, " " + stopCodonS) + ifS (frameShift, " " + frameShiftS)) - << method - << (targetProt ? targetLen : targetAlign_aa); - if (gi) - td << refLen - << refCoverage () * 100 - << (/*targetProt ? pRefEffectiveLen () :*/ pIdentity ()) * 100 // refIdentity - << length - << accessionProt - << product - //<< accessionDna - ; - else - td << na - << na - << na - << na - << na - << na; - // PD-775 - if (const Fam* f = getFam () -> getHmmFam ()) - td << f->hmm - << f->familyName; - else - { - td << na - << na; - ASSERT (method != "HMM"); - } - if (cdsExist) - { - IMPLY (cds. crossOriginSeqLen, useCrossOrigin); - if (useCrossOrigin) - { - if (cds. crossOriginSeqLen) - td << cds. crossOriginSeqLen; - else - td << na; + cdss_ << Locus (); + Vector pointMuts_ (pointMuts); + if (pointMuts_. empty ()) + pointMuts_ << PointMut (); + for (const Locus& cds : cdss_) + for (const PointMut& pm : pointMuts_) + { + const string method (getMethod (cds)); + TabDel td (2, false); + td << (targetProt ? targetName : na); // PD-2534 + if (cdsExist) + td << (cds. contig. empty () ? targetName : cds. contig) + << (cds. contig. empty () ? targetStart : cds. start) + 1 + << (cds. contig. empty () ? targetStop : cds. stop) + << (cds. contig. empty () ? (targetStrand ? '+' : '-') : (cds. strand ? '+' : '-')); + td << (isPointMut () /*! pm. empty ()*/ + ? pm. geneMutation + : print_fam + ? famId + : (isLeft (method, "ALLELE") ? famId : nvl (getFam () -> genesymbol, na)) + ) + << (pm. empty () ? proteinName : pm. name) + + ifS (reportPseudo, ifS (frameShift, " " + frameShiftS)) + << (isPointMut () || getFam () -> reportable >= 2 ? "core" : "plus"); // PD-2825 + // PD-1856 + if (isPointMut ()) + td << "AMR" + << "POINT" + << nvl (pm. classS, na) + << nvl (pm. subclass, na); + else + td << nvl (getFam () -> type, na) + << nvl (getFam () -> subtype, na) + << nvl (getFam () -> classS, na) + << nvl (getFam () -> subclass, na); + td << method + << (targetProt ? targetLen : targetAlign_aa); + if (gi) + td << refLen + << refCoverage () * 100.0 + << pIdentity () * 100.0 // refIdentity + << length + << accessionProt + << product + ; + else + td << na + << na + << na + << na + << na + << na; + // PD-775 + if (isPointMut ()) + td << na + << na; + else if (const Fam* f = getFam () -> getHmmFam ()) + td << f->hmm + << f->familyName; + else + { + td << na + << na; + ASSERT (method != "HMM"); } - } - os << td. str () << endl; - } + if (cdsExist) + { + IMPLY (cds. crossOrigin, useCrossOrigin); + if (useCrossOrigin) + { + if (cds. crossOrigin) + td << cds. contigLen; + else + td << na; + } + } + if (pm. empty () || ! pm. additional) + { + if (verbose ()) + os << refExactlyMatched () + << '\t' << allele () + << '\t' << alleleReported () + << '\t' << targetProt + << '\t' << nident + << '\t'; + os << td. str () << endl; + } + if (point_mut_all. get () && ! pm. empty ()) + *point_mut_all << td. str () << endl; + } } + bool isPointMut () const + { return resistance == "point_mutation"; } + Set getMutations () const + { Set mutations; + for (const PointMut& pointMut : pointMuts) + mutations << pointMut. geneMutation; + return mutations; + } bool allele () const { return famId != gene && parts == 1; } size_t targetTail (bool upstream) const - { return targetStrand == upstream ? targetStart : (targetLen - targetEnd); } + { return targetStrand == upstream ? targetStart : (targetLen - targetStop); } size_t refEffectiveLen () const - { return partialDna ? refEnd - refStart : refLen; } -#if 0 - double pRefEffectiveLen () const - { ASSERT (nident); - return (double) nident / (double) refEffectiveLen (); - } -#endif + { return partialDna ? refStop - refStart : refLen; } double pIdentity () const { return (double) nident / (double) length; } double refCoverage () const - { return (double) (refEnd - refStart) / (double) refLen; } + { return (double) (refStop - refStart) / (double) refLen; } + double targetCoverage () const + { return targetProt ? (double) (targetStop - targetStart) / (double) targetLen : NaN; } bool refExactlyMatched () const { return refLen && nident == refLen && refLen == length; } + bool targetExactlyMatched () const + { return targetLen + && nident == targetLen + && targetLen == length; + } bool partial () const // Requires: good() - { return refCoverage () < complete_cover_min - frac_delta; } - string getMethod () const - { return refExactlyMatched () - ? allele () && (! targetProt || refLen == targetLen) - ? "ALLELE" - : "EXACT" // PD-776 - : gi - ? partial () - ? "PARTIAL" - : "BLAST" - : "HMM"; + { return refCoverage () < complete_coverage_min_def - frac_delta; } + bool getTargetStrand (const Locus &cds) const + { return targetProt + ? cds. empty () + ? true + : cds. strand + : targetStrand; + } + size_t missedDnaStart (const Locus &cds) const + { return 3 * (getTargetStrand (cds) + ? refStart + : refLen - refStop + ); + } + size_t missedDnaStop (const Locus &cds) const + { return 3 * (getTargetStrand (cds) + ? refLen - refStop + : refStart + ); + } + bool truncated (const Locus &cds) const + { return (missedDnaStart (cds) > 0 && (targetProt ? (cds. empty () ? false : cds. atContigStart ()) : targetStart <= Locus::end_delta)) + || (missedDnaStop (cds) > 0 && (targetProt ? (cds. empty () ? false : cds. atContigStop ()) : targetLen - targetStop <= Locus::end_delta)); + } + bool alleleReported () const + { return refExactlyMatched () && allele () && (! targetProt || refLen == targetLen); } + string getMethod (const Locus &cds) const + { string method (refExactlyMatched () + ? alleleReported () + ? "ALLELE" + : "EXACT" // PD-776 + : gi + ? partial () + ? truncated (cds) + ? "PARTIAL_CONTIG_END" // PD-2267 + : "PARTIAL" + : isPointMut () + ? "POINT" + : "BLAST" + : "HMM" + ); + // PD-2088, PD-2320 + if ( ( method == "BLAST" + || method == "PARTIAL" + || method == "PARTIAL_CONTIG_END" + ) + && stopCodon + && ! targetProt + ) + method = "INTERNAL_STOP"; + else if (method != "HMM") + method += (targetProt ? "P" : "X"); + return method; } // PD-736 + bool passBlastRule (const BlastRule &br) const + { return pIdentity () >= br. ident - frac_delta + && refCoverage () >= br. ref_coverage - frac_delta + ; + } bool good () const - { if (! reportPseudo) + { if (! gi) + return true; + if (! reportPseudo) { if (stopCodon) return false; if (frameShift) - return false; + return false; + if (partial () && ! cdss. empty ()) + { + bool found = false; + for (const Locus& cds : cdss) + if (truncated (cds)) + found = true; + if (! found) + return false; + } } - #if 0 - if (targetProt) - { if (pRefEffectiveLen () < ident_min - frac_delta) - return false; - } - else - #endif - { // PD-1032 - if ( pIdentity () < ident_min - frac_delta - || refCoverage () < partial_cover_min - frac_delta - ) - return false; - if (parts > 1 && refCoverage () < complete_cover_min - frac_delta) + // PD-1032 + if (partial ()) + if (parts > 1) return false; - } - return true; + else + return passBlastRule (partialBR); + else + return passBlastRule (completeBR); } private: +#if 0 + bool sameTarget (const BlastAlignment &other) const + // Symmetric + { ASSERT (targetProt == other. targetProt); + return targetStrand == other. targetStrand + && difference (targetStart, other. targetStart) <= mismatchTailTarget + && difference (targetStop, other. targetStop) <= mismatchTailTarget; + } + // Requires: same targetName +#endif bool insideEq (const BlastAlignment &other) const - { return targetStrand == other. targetStrand + { ASSERT (targetProt == other. targetProt); + return targetStrand == other. targetStrand && targetStart + mismatchTailTarget >= other. targetStart - && targetEnd <= other. targetEnd + mismatchTailTarget; + && targetStop <= other. targetStop + mismatchTailTarget; } // Requires: same targetName - bool descendantOf (const BlastAlignment &other) const - { return ! other. allele () - && getFam () -> descendantOf (other. getFam ()); + bool matchesCds (const BlastAlignment &other) const + { ASSERT (targetProt); + ASSERT (! other. targetProt); + ASSERT (! cdss. empty ()); + for (const Locus& cds : cdss) + if ( cds. contig == other. targetName + && cds. strand == other. targetStrand + && ! cds. crossOrigin + ) + { size_t protStart = cds. start; + size_t protStop = cds. stop; + if (cds. strand) + { + ASSERT (protStop > 3); + protStop -= 3; + } + else + protStart += 3; + ASSERT (protStart < protStop); + const size_t dnaStart = other. targetStart; + const size_t dnaStop = other. targetStop; + ASSERT (dnaStart < dnaStop); + #if 0 + // PD-2271: partial + // use truncated(cds) ?? + if (cds. strand) // Checking frames + { if (difference (protStart, dnaStart) > 3 && protStart % 3 != dnaStart % 3) + continue; + } + else + if (difference (protStop, dnaStop) > 3 && protStop % 3 != dnaStop % 3) + continue; + #endif + #if 0 // PD-2099 + if (cds. strand) // PD-1902 + { if (protStop != dnaStop) + continue; + } + else + if (protStart != dnaStart) + continue; + #endif + const size_t intersectionStart = max (protStart, dnaStart); + const size_t intersectionStop = min (protStop, dnaStop); + if ( intersectionStart < intersectionStop + && double (intersectionStop - intersectionStart) / double (protStop - protStart) > 0.75 // PAR + ) + return true; + } + return false; } bool betterEq (const BlastAlignment &other) const // Reflexive - { if (targetName != other. targetName) - return false; - if (targetProt != other. targetProt) // ?? - return false; - // PD-727 - /*if (getFam () != other. getFam ()) - { if (descendantOf (other)) - return true; - if (other. descendantOf (*this)) - return false; - }*/ - // PD-807 - //if (! targetProt && ! other. insideEq (*this)) - if ( ! other. insideEq (*this) - && ! insideEq (other) - ) - return false; - LESS_PART (other, *this, refExactlyMatched ()); // PD-1261, PD-1678 - LESS_PART (other, *this, nident); - LESS_PART (*this, other, refEffectiveLen ()); + { if (targetProt == other. targetProt) + { + if (targetName != other. targetName) + return false; + // PD-807 + if ( ! (targetProt && famId == other. famId) // PD-2441 + //&& ! sameTarget (other) + && ! other. insideEq (*this) + && ! insideEq (other) + ) + return false; + //if (targetProt) + //{ LESS_PART (other, *this, isPointMut ()); } + LESS_PART (other, *this, refExactlyMatched ()); // PD-1261, PD-1678 + LESS_PART (other, *this, nident); + LESS_PART (*this, other, refEffectiveLen ()); + } + else + { // PD-1902, PD-2139, PD-2313, PD-2320 + if (targetProt && ! matchesCds (other)) + return false; + if (! targetProt && ! other. matchesCds (*this)) + return false; + if (isPointMut ()) + { + if (! other. isPointMut ()) + return true; + const Set mutations ( getMutations ()); + const Set otherMutations (other. getMutations ()); + if (mutations == otherMutations) + return targetProt; + if (mutations. containsAll (otherMutations)) + return true; + if (otherMutations. containsAll (mutations)) + return false; + } + else + { + LESS_PART (other, *this, refExactlyMatched ()); + //LESS_PART (other, *this, allele ()); // PD-2352 + LESS_PART (other, *this, alleleReported ()); + LESS_PART (other, *this, targetProt); + } + } return true; } public: const Fam* getFam () const - { const Fam* fam = famId2fam [famId]; + { ASSERT (! isPointMut ()); + const Fam* fam = famId2fam [famId]; if (! fam) fam = famId2fam [gene]; - ASSERT (fam); + if (! fam) + throw runtime_error ("Cannot find hierarchy for: " + famId + " / " + gene); return fam; } bool better (const BlastAlignment &other) const + // Requires: all SCCs of betterEq() are complete subgraphs ?? { return betterEq (other) && ( ! other. betterEq (*this) - || accessionProt < other. accessionProt // Tie resolution; PD-1245 + || accessionProt < other. accessionProt // Tie resolution: PD-1245 ); } bool better (const HmmAlignment& other) const { ASSERT (other. good ()); - if (targetName != other. sseqid) - return false; + ASSERT (other. blastAl. get ()); + if (isPointMut ()) + return false; + if (targetProt) + { if (targetName != other. sseqid) + return false; + } + else + if (! other. blastAl->matchesCds (*this)) + return false; return refExactlyMatched () //|| getFam () -> getHmmFam () == other. fam || getFam () -> descendantOf (other. fam) ; } bool operator< (const BlastAlignment &other) const - { LESS_PART (*this, other, targetName); + { + LESS_PART (*this, other, targetName); LESS_PART (*this, other, targetStart); LESS_PART (*this, other, famId); LESS_PART (*this, other, accessionProt); return false; } //size_t lengthScore () const - //{ return refLen - (refEnd - refStart); } + //{ return refLen - (refStop - refStart); } void setTargetAlign () - { targetAlign = targetEnd - targetStart; + { targetAlign = targetStop - targetStart; targetAlign_aa = targetAlign; if (! targetProt) { @@ -608,23 +1091,73 @@ struct BlastAlignment targetAlign_aa = targetAlign / 3; } } + void setCdss (const map &seqId2locusTag, + const Annot &annot) + { ASSERT (targetProt); + ASSERT (cdss. empty ()); + string locusTag = targetName; + if (! seqId2locusTag. empty ()) + { + string s; + if (! find (seqId2locusTag, locusTag, s)) + throw runtime_error ("Target " + strQuote (locusTag) + " is not found in GFF-match file"); + locusTag = s; + } + if (const Set* cdss_ = findPtr (annot. prot2cdss, locusTag)) + { + insertAll (cdss, *cdss_); + #if 0 + // PD-2269 + for (const Locus& cds : *cdss_) + // PD-2138 + if ( ! cds. partial + && cds. size () != 3 * targetLen + 3 + && cds. size () != 3 * targetLen // PD-2159 + ) + throw runtime_error ("Mismatch in protein length between the protein " + targetName + + " and the length of the protein on line " + toString (cds. lineNum) + + " of the GFF file. Please check that the GFF and protein files match."); + #endif + } + else + throw runtime_error ("Locus tag " + strQuote (locusTag) + " is misssing in .gff-file. Protein name: " + targetName); + qc (); + } }; +bool HmmAlignment::better (const BlastAlignment& other) const +{ + ASSERT (good ()); + ASSERT (other. good ()); + if (other. isPointMut ()) + return false; + if (! other. targetProt) + return false; + if (sseqid != other. targetName) + return false; + return fam != other. getFam () + && fam->descendantOf (other. getFam ()); +} + + + +// Batch + struct Batch { // Reference input map hmm2fam; - Fam* root {nullptr}; - // Not delete'd in ~Batch() + uchar reportable_min {0}; // Target input typedef List BlastAls; typedef List HmmAls; BlastAls blastAls; + bool hmmExist {false}; map domains; // Best domain HmmAls hmmAls; @@ -632,75 +1165,153 @@ struct Batch BlastAls goodBlastAls; - explicit Batch (const string &famFName) - : root (new Fam ()) + Batch (const string &famFName, + const string &organism, + const string &point_mut, + bool non_reportable, + bool report_core_only) + : reportable_min (non_reportable + ? 0 + : report_core_only + ? 2 + : 1 + ) { if (famFName. empty ()) throw runtime_error ("fam (protein family hierarchy) file is missing"); - + // Tree of Fam // Pass 1 + const string pointMutParent ("POINT_MUTATION"); { + if (verbose ()) + cout << "Reading " << famFName << " Pass 1 ..." << endl; LineInput f (famFName); while (f. nextLine ()) - { - trim (f. line); - //cout << f. line << endl; - const string famId (findSplit (f. line, '\t')); - /*const string parentFamName =*/ findSplit (f. line, '\t'); - const string genesymbol (findSplit (f. line, '\t')); - const string hmm (findSplit (f. line, '\t')); - const double tc1 = str2 (findSplit (f. line, '\t')); - const double tc2 = str2 (findSplit (f. line, '\t')); - const int reportable = str2 (findSplit (f. line, '\t')); - ASSERT ( reportable == 0 - || reportable == 1 - ); - const auto fam = new Fam (root, famId, genesymbol, hmm, tc1, tc2, f. line, reportable); - famId2fam [famId] = fam; - if (! fam->hmm. empty ()) - hmm2fam [fam->hmm] = fam; - } + try + { + trim (f. line); + //cout << f. line << endl; + const string famId (findSplit (f. line, '\t')); + const string parentFamId (findSplit (f. line, '\t')); + const string genesymbol (findSplit (f. line, '\t')); + const string hmm (findSplit (f. line, '\t')); + const double tc1 = str2 (findSplit (f. line, '\t')); + const double tc2 = str2 (findSplit (f. line, '\t')); + BlastRule completeBR; + BlastRule partialBR; + #if VER >= 3 + completeBR. ident = str2 (findSplit (f. line, '\t')); + completeBR. target_coverage = str2 (findSplit (f. line, '\t')); + completeBR. ref_coverage = str2 (findSplit (f. line, '\t')); + partialBR. ident = str2 (findSplit (f. line, '\t')); + partialBR. target_coverage = str2 (findSplit (f. line, '\t')); + partialBR. ref_coverage = str2 (findSplit (f. line, '\t')); + toProb (completeBR. ident); + toProb (completeBR. target_coverage); + toProb (completeBR. ref_coverage); + toProb (partialBR. ident); + toProb (partialBR. target_coverage); + toProb (partialBR. ref_coverage); + #endif + const uchar reportable = (uchar) str2 (findSplit (f. line, '\t')); + QC_ASSERT (reportable <= 2); + const string type (findSplit (f. line, '\t')); + const string subtype (findSplit (f. line, '\t')); + const string classS (findSplit (f. line, '\t')); + const string subclass (findSplit (f. line, '\t')); + if (parentFamId == pointMutParent) + continue; + const auto fam = new Fam (famId, genesymbol, hmm, tc1, tc2, completeBR, partialBR, type, subtype, classS, subclass, f. line, reportable); + if (famId2fam [famId]) + throw runtime_error ("Family " + famId + " is duplicated"); + famId2fam [famId] = fam; + if (! fam->hmm. empty ()) + hmm2fam [fam->hmm] = fam; + } + catch (const exception &e) + { + throw runtime_error ("Cannot read " + famFName +", line " + toString (f. lineNum) + "\n" + e. what ()); + } } - // Pass 2 { + if (verbose ()) + cout << "Reading " << famFName << " Pass 2 ..." << endl; LineInput f (famFName); while (f. nextLine ()) { trim (f. line); //cout << f. line << endl; - Fam* child = const_cast (famId2fam [findSplit (f. line, '\t')]); - ASSERT (child); + const Fam* child = famId2fam [findSplit (f. line, '\t')]; const string parentFamId (findSplit (f. line, '\t')); - Fam* parent = nullptr; + if (parentFamId == pointMutParent) + continue; + ASSERT (child); + const Fam* parent = nullptr; if (! parentFamId. empty ()) - { EXEC_ASSERT (parent = const_cast (famId2fam [parentFamId])); } - child->parent = parent; + { + parent = famId2fam [parentFamId]; + if (! parent) + throw runtime_error ("parentFamId " + strQuote (parentFamId) + " is not found in famId2fam for child " + strQuote (child->id)); + } + var_cast (child) -> parent = parent; } } - } - - - void process (bool skip_hmm_check) - // Input: root, blastAls, domains, hmmAls - // Output: goodBlastAls - { - ASSERT (root); - - if (! (ident_min >= 0 && ident_min <= 1)) - throw runtime_error ("ident_min must be between 0 and 1"); - if (! (complete_cover_min >= 0 && complete_cover_min <= 1)) - throw runtime_error ("complete_cover_min must be between 0 and 1"); - if (! (partial_cover_min >= 0 && partial_cover_min <= 1)) - throw runtime_error ("partial_cover_min must be between 0 and 1"); - if (partial_cover_min > complete_cover_min) - throw runtime_error ("partial_cover_min msut be less than or equal to complete_cover_min"); - - // Filtering by ::good() has been done above - - // Group by targetName and process each targetName separately for speed ?? + + if (qc_on) + { + size_t roots = 0; + for (const auto& it : famId2fam) + if (! it. second->parent) + roots++; + QC_ASSERT (roots == 1); + } + - // Pareto-better() + if (! organism. empty ()) + { + if (verbose ()) + cout << "Reading " << point_mut << endl; + LineInput f (point_mut); + Istringstream iss; + while (f. nextLine ()) + { + string organism_, accession, geneMutation, geneMutationGen, classS, subclass, name; + int pos; + char alleleChar; + iss. reset (f. line); + iss >> organism_ >> accession >> pos >> alleleChar >> geneMutation >> geneMutationGen >> classS >> subclass >> name; + ASSERT (pos > 0); + replace (organism_, '_', ' '); + if (organism_ == organism) + accession2pointMuts [accession] << move (PointMut ((size_t) pos, alleleChar, geneMutation, geneMutationGen, classS, subclass, name)); + } + // PD-2008 + if (accession2pointMuts. empty ()) + throw runtime_error ("No protein point mutations for organism " + strQuote (organism) + " found in the AMRFinder database. Please check the " + strQuote ("organism") + " option."); + for (auto& it : accession2pointMuts) + { + it. second. sort (); + //if (! it. second. isUniq ()) + //throw runtime_error ("Duplicate mutations for " + it. first); + } + } + } +private: + static void toProb (double &x) + { + ASSERT (x >= 0.0); + ASSERT (x <= 100.0); + x /= 100.0; + } + + + void blastParetoBetter () + // BLAST: Pareto-better() + // Input: blastAls + // Output: goodBlastAls + { + goodBlastAls. clear (); for (const auto& blastAl : blastAls) { ASSERT (blastAl. good ()); @@ -715,14 +1326,23 @@ struct Batch continue; for (Iter goodIter (goodBlastAls); goodIter. next ();) if (blastAl. better (*goodIter)) + { + if (verbose ()) + { + cout << "Bad: "; + goodIter->saveText (cout); + cout << "Good: "; + blastAl. saveText (cout); + } goodIter. erase (); - goodBlastAls << blastAl; + } + goodBlastAls << blastAl; } if (verbose ()) { cout << "# Best Blasts: " << goodBlastAls. size () << endl; - if (! goodBlastAls. empty ()) - goodBlastAls. front (). saveText (cout); + for (const auto& blastAl : goodBlastAls) + blastAl. saveText (cout); } #if 0 for (const auto& blastAl1 : goodBlastAls) @@ -736,8 +1356,64 @@ struct Batch << endl; } #endif + } +public: + + + void process (bool skip_hmm_check) + // Input: blastAls, domains, hmmAls + // Output: goodBlastAls + { + // Use BlastAlignment.cdss + for (Iter iter (blastAls); iter. next ();) + if (! iter->good ()) + { + if (verbose ()) + { + cout << "Erased:" << endl; + iter->saveText (cout); + } + iter. erase (); + } + + // Group by targetName and process each targetName separately for speed ?? + + blastParetoBetter (); + + #if 0 + ?? + // Cf. point_mut.cpp + FFOR (size_t, i, batch. blastAls. size ()) + { + const BlastAlignment& blastAl1 = batch. blastAls [i]; + for (const auto& it1 : blastAl1. targetPos2pointMut) + { + const PointMut& pm1 = it1. second; + FFOR_START (size_t, j, i + 1, batch. blastAls. size ()) + { + BlastAlignment& blastAl2 = batch. blastAls [j]; + if (blastAl2. targetName == blastAl1. targetName) + for (auto& it2 : blastAl2. targetPos2pointMut) + { + PointMut& pm2 = it2. second; + if (verbose ()) + cout << blastAl2. targetName + << ' ' << it1. first + << ' ' << it2. first + << ' ' << pm1. geneMutationGen + << ' ' << pm2. geneMutationGen + << endl; + if ( it1. first == it2. first + && pm1. geneMutationGen == pm2. geneMutationGen + ) + pm2 = PointMut (); + } + } + } + } + #endif - // Pareto-better() + // HMM: Pareto-better() HmmAls goodHmmAls; FOR (unsigned char, criterion, 2) { @@ -774,10 +1450,12 @@ struct Batch } // PD-741 - if (! skip_hmm_check && ! goodHmmAls. empty ()) + if (hmmExist && ! skip_hmm_check) for (Iter iter (goodBlastAls); iter. next ();) if ( /*! iter->refExactlyMatched () */ - iter->pIdentity () < 0.98 - frac_delta // PAR // PD-1673 + ! iter->isPointMut () + && iter->targetProt + && iter->pIdentity () < 0.98 - frac_delta // PAR // PD-1673 && ! iter->partial () ) if (const Fam* fam = iter->getFam () -> getHmmFam ()) @@ -792,7 +1470,14 @@ struct Batch break; } if (! found) // BLAST is wrong + { + if (verbose ()) + { + cout << "HMM-wrong: "; + iter->saveText (cout); + } iter. erase (); + } } if (verbose ()) cout << "# Best Blasts left: " << goodBlastAls. size () << endl; @@ -805,61 +1490,73 @@ struct Batch break; } + // PD-2783 + for (Iter blastIt (goodBlastAls); blastIt. next ();) + for (const auto& hmmAl : goodHmmAls) + if (hmmAl. better (*blastIt)) + { + blastIt. erase (); + break; + } + // Output // goodHmmAls --> goodBlastAls for (const auto& hmmAl : goodHmmAls) { - BlastAlignment al (hmmAl); - if (verbose ()) - cout << al. targetName << " " << al. gene << endl; - const HmmAlignment::Domain domain = domains [HmmAlignment::Pair (al. targetName, al. gene)]; - if (! domain. hmmLen) - continue; // domain does not exist - /*al. refLen = domain. hmmLen; - al. refStart = domain. hmmStart; - al. refEnd = domain. hmmStop; */ - al. targetLen = domain. seqLen; - al. targetStart = domain. seqStart; - al. targetEnd = domain. seqStop; - al. setTargetAlign (); - ASSERT (! al. refExactlyMatched ()); - al. qc (); - goodBlastAls << al; + ASSERT (hmmAl. blastAl. get ()); + goodBlastAls << * hmmAl. blastAl. get (); } + //blastAls = move (goodBlastAls); + //blastParetoBetter (); + goodBlastAls. sort (); + + if (verbose ()) + { + cout << endl << "After process():" << endl; + for (const auto& blastAl : goodBlastAls) + { + blastAl. saveText (cout); + cout << "# Point mutations: " << blastAl. pointMuts. size () << endl; + } + } } - - + void report (ostream &os) const // Input: goodBlastAls { // PD-283, PD-780 - // Cf. BlastAlignment::saveText() - { // Cf. BlastAlignment::saveText() TabDel td; - td << "Target identifier"; // targetName + td << "Protein identifier"; // targetName // PD-2534 if (cdsExist) // Contig td << "Contig id" - << "Start" // targetStart - << "Stop" // targetEnd + << "Start" // targetStart + << "Stop" // targetStop << "Strand"; // targetStrand td << (print_fam ? "FAM.id" : "Gene symbol") - << "Protein name" + << "Sequence name" + << "Scope" // PD-2825 + // PD-1856 + << "Element type" + << "Element subtype" + << "Class" + << "Subclass" + // << "Method" << "Target length" // - << "Reference protein length" // refLen - << "% Coverage of reference protein" // queryCoverage - << "% Identity to reference protein" - << "Alignment length" // length - << "Accession of closest protein" // accessionProt - << "Name of closest protein" + << "Reference sequence length" // refLen + << "% Coverage of reference sequence" // queryCoverage + << "% Identity to reference sequence" + << "Alignment length" // length + << "Accession of closest sequence" // accessionProt + << "Name of closest sequence" // << "HMM id" << "HMM description" @@ -868,14 +1565,21 @@ struct Batch if (useCrossOrigin) td << "Cross-origin length"; os << td. str () << endl; + if (point_mut_all. get ()) + *point_mut_all << td. str () << endl; } for (const auto& blastAl : goodBlastAls) - if (blastAl. getFam () -> reportable) - { - blastAl. qc (); + { + blastAl. qc (); + if (blastAl. isPointMut ()) + if (blastAl. pointMuts. empty ()) + ; + else + blastAl. saveText (os); + else if (blastAl. getFam () -> reportable >= reportable_min) blastAl. saveText (os); - } + } } @@ -884,7 +1588,10 @@ struct Batch { ASSERT (os. good ()); for (const auto& blastAl : goodBlastAls) - if (blastAl. getFam () -> reportable) + if ( blastAl. targetProt + && ! blastAl. isPointMut () + && blastAl. getFam () -> reportable >= reportable_min + ) os << blastAl. targetName << endl; } }; @@ -897,7 +1604,8 @@ struct Batch HmmAlignment::HmmAlignment (const string &line, const Batch &batch) { - istringstream iss (line); + static Istringstream iss; + iss. reset (line); string hmm, dummy; // --- full sequence --- --- best 1 domain -- // target name accession query name accession E-value score bias E-value score bias exp reg clu ov env dom rep inc description of target @@ -909,13 +1617,36 @@ HmmAlignment::HmmAlignment (const string &line, +HmmAlignment::HmmAlignment (const HmmAlignment &other) +: sseqid (other. sseqid) +, score1 (other. score1) +, score2 (other. score2) +, fam (other. fam) +, blastAl (new BlastAlignment (* other. blastAl. get ())) +{} + + + +HmmAlignment HmmAlignment::operator= (const HmmAlignment &other) +{ + sseqid = other. sseqid; + score1 = other. score1; + score2 = other. score2; + fam = other. fam; + blastAl. reset (new BlastAlignment (* other. blastAl. get ())); + return *this; +} + + + // Domain HmmAlignment::Domain::Domain (const string &line, Batch &batch) { - istringstream iss (line); + static Istringstream iss; + iss. reset (line); string target_name, accession, query_name, query_accession; size_t n, of, env_from, env_to; double eValue, full_score, full_bias, cValue, i_eValue, domain_bias, accuracy; @@ -943,7 +1674,7 @@ HmmAlignment::Domain::Domain (const string &line, ASSERT (n <= of); ASSERT (score > 0); - const Fam* fam = batch. hmm2fam [query_accession/*query_name*/]; + const Fam* fam = batch. hmm2fam [query_accession]; if (! fam) return; const HmmAlignment::Pair p (target_name, fam->id); @@ -965,26 +1696,33 @@ struct ThisApplication : Application { // Input addKey ("fam", "Table FAM"); - const string blastFormat ("qseqid sseqid length nident qstart qend qlen sstart send slen sseq. qseqid format: gi|Protein accession|DNA accession|fusion part|# fusions|FAM.id|FAM.class|resistance mechanism|Product name"); + const string blastFormat ("qseqid sseqid length nident qstart qend qlen sstart send slen sseq qseq. qseqid format: gi|Protein accession|fusion part|# fusions|FAM.id|FAM.class|Product name"); addKey ("blastp", "blastp output in the format: " + blastFormat); addKey ("blastx", "blastx output in the format: " + blastFormat); addKey ("gff", ".gff assembly file"); addKey ("gff_match", ".gff-FASTA matching file for \"locus_tag\": \" \""); + addFlag ("bed", "Browser Extensible Data format of the file"); + addKey ("dna_len", "File with lines: "); addKey ("hmmdom", "HMM domain alignments"); addKey ("hmmsearch", "Output of hmmsearch"); - addKey ("ident_min", "Min. identity to the reference protein (0..1)", "0.9"); - addKey ("complete_cover_min", "Min. coverage of the reference protein (0..1) for a complete hit", "0.9"); - addKey ("partial_cover_min", "Min. coverage of the reference protein (0..1) for a partial hit", "0.5"); + addKey ("organism", "Taxonomy group for point mutations"); + addKey ("point_mut", "Point mutation table"); + addKey ("point_mut_all", "File to report all target positions of reference point mutations"); + addKey ("ident_min", "Min. identity to the reference protein (0..1). -1 means use a curated threshold if it exists and " + toString (ident_min_def) + " otherwise", "-1"); + addKey ("coverage_min", "Min. coverage of the reference protein (0..1) for partial hits", toString (partial_coverage_min_def)); addFlag ("skip_hmm_check", "Skip checking HMM for a BLAST hit"); // Output addKey ("out", "Identifiers of the reported input proteins"); addFlag ("print_fam", "Print the FAM.id instead of gene symbol"); - addFlag ("pseudo", "Indicate pseudo-genes in the protein name as \"" + stopCodonS + "\" or \"" + frameShiftS + "\""); + addFlag ("pseudo", "Indicate pseudo-genes in the protein name as " + strQuote (stopCodonS) + " or " + strQuote (frameShiftS)); + addFlag ("force_cds_report", "Report contig/start/stop/strand even if this information does not exist"); + addFlag ("non_reportable", "Report non-reportable families"); + addFlag ("core", "Report only core reportale families"); // Testing addFlag ("nosame", "Exclude the same reference ptotein from the BLAST output (for testing)"); addFlag ("noblast", "Exclude the BLAST output (for testing)"); #ifdef SVN_REV - addFlag ("version", "Print the version and exit"); + version = SVN_REV; #endif } @@ -992,47 +1730,69 @@ struct ThisApplication : Application void body () const final { - const string famFName = getArg ("fam"); - const string blastpFName = getArg ("blastp"); - const string blastxFName = getArg ("blastx"); - const string gffFName = getArg ("gff"); - const string gffMatchFName = getArg ("gff_match"); - const string hmmDom = getArg ("hmmdom"); - const string hmmsearch = getArg ("hmmsearch"); - ident_min = str2 (getArg ("ident_min")); - complete_cover_min = str2 (getArg ("complete_cover_min")); - partial_cover_min = str2 (getArg ("partial_cover_min")); - const bool skip_hmm_check = getFlag ("skip_hmm_check"); - const string outFName = getArg ("out"); - print_fam = getFlag ("print_fam"); - reportPseudo = getFlag ("pseudo"); - const bool nosame = getFlag ("nosame"); - const bool noblast = getFlag ("noblast"); - #ifdef SVN_REV - const bool version = getFlag ("version"); - #endif + static_assert (complete_coverage_min_def > partial_coverage_min_def, "complete_coverage_min_def > partial_coverage_min_def"); + + const string famFName = getArg ("fam"); + const string blastpFName = getArg ("blastp"); + const string blastxFName = getArg ("blastx"); + const string gffFName = getArg ("gff"); + const string gffMatchFName = getArg ("gff_match"); + const bool bedP = getFlag ("bed"); + const string dnaLenFName = getArg ("dna_len"); + const string hmmDom = getArg ("hmmdom"); + const string hmmsearch = getArg ("hmmsearch"); + const string organism = getArg ("organism"); + const string point_mut = getArg ("point_mut"); + const string point_mut_all_FName = getArg ("point_mut_all"); + double ident_min = str2 (getArg ("ident_min")); + const double partial_coverage_min = str2 (getArg ("coverage_min")); + const bool skip_hmm_check = getFlag ("skip_hmm_check"); + const string outFName = getArg ("out"); + print_fam = getFlag ("print_fam"); + reportPseudo = getFlag ("pseudo"); + const bool force_cds_report = getFlag ("force_cds_report"); + const bool non_reportable = getFlag ("non_reportable"); + const bool report_core_only = getFlag ("core"); + const bool nosame = getFlag ("nosame"); + const bool noblast = getFlag ("noblast"); ASSERT (hmmsearch. empty () == hmmDom. empty ()); + IMPLY (! outFName. empty (), ! blastpFName. empty ()); + IMPLY (! gffFName. empty (), ! blastpFName. empty ()); + if (! blastpFName. empty () && ! blastxFName. empty () && gffFName. empty ()) + throw runtime_error ("If BLASTP and BLASTX files are present then a GFF file must be present"); + + if (ident_min == -1.0) + ident_min = ident_min_def; + else + ident_min_user = true; + + if (! (ident_min >= 0.0 && ident_min <= 1.0)) + throw runtime_error ("-ident_min must be -1 or between 0 and 1"); + if (! (partial_coverage_min >= 0.0 && partial_coverage_min <= 1.0)) + throw runtime_error ("-coverage_min must be -1 or between 0 and 1"); + + if (partial_coverage_min > complete_coverage_min_def) + throw runtime_error ("-coverage_min must be less than " + toString (complete_coverage_min_def) + " - threshod for complete matches"); + + + defaultCompleteBR = BlastRule (ident_min, complete_coverage_min_def); + defaultPartialBR = BlastRule (ident_min, partial_coverage_min); - #ifdef SVN_REV - if (version) - { - cout << "version: " << SVN_REV << endl; - exit(1); - } - #endif - cdsExist = ! blastxFName. empty () + cdsExist = force_cds_report + || ! blastxFName. empty () || ! gffFName. empty (); - Batch batch (famFName); + if (! point_mut_all_FName. empty ()) + point_mut_all. reset (new OFStream (point_mut_all_FName)); + + + Batch batch (famFName, organism, point_mut, non_reportable, report_core_only); - // Fusion proteins, see PD-283 ?? - - // Input // batch.blastAls @@ -1049,14 +1809,22 @@ struct ThisApplication : Application if (verbose ()) cout << f. line << endl; } - const BlastAlignment al (f. line, true); + BlastAlignment al (f. line, true); al. qc (); if (nosame && toString (al. gi) == al. targetName) continue; if (al. good ()) - batch. blastAls << al; + batch. blastAls << move (al); else - { ASSERT (! al. refExactlyMatched ()); } + { ASSERT (! al. refExactlyMatched ()); + if (false) + { + al. saveText (cout); + cout << al. targetName << endl; + cout << al. pIdentity () << ' ' << al. refCoverage () << ' ' << al. targetCoverage () << endl; + ERROR; + } + } } } @@ -1070,12 +1838,12 @@ struct ThisApplication : Application if (verbose ()) cout << f. line << endl; } - const BlastAlignment al (f. line, false); + BlastAlignment al (f. line, false); al. qc (); if (nosame && toString (al. gi) == al. targetName) continue; if (al. good ()) - batch. blastAls << al; + batch. blastAls << move (al); else { ASSERT (! al. refExactlyMatched ()); } } @@ -1088,6 +1856,7 @@ struct ThisApplication : Application // batch.domains if (! hmmDom. empty ()) { + batch. hmmExist = true; LineInput f (hmmDom); while (f. nextLine ()) { @@ -1112,61 +1881,104 @@ struct ThisApplication : Application cout << f. line << endl; if (f. line. empty () || f. line [0] == '#') continue; - const HmmAlignment hmmAl (f. line, batch); - if (hmmAl. good ()) - batch. hmmAls << hmmAl; + HmmAlignment hmmAl (f. line, batch); + if (! hmmAl. good ()) + { + if (verbose ()) + { + cout << " Bad HMM: " << endl; + hmmAl. saveText (cout); + } + continue; + } + auto al = new BlastAlignment (hmmAl); + hmmAl. blastAl. reset (al); + if (verbose ()) + cout << al->targetName << " " << al->gene << endl; + const HmmAlignment::Domain domain = batch. domains [HmmAlignment::Pair (al->targetName, al->gene)]; + if (! domain. hmmLen) + continue; // domain does not exist + /*al->refLen = domain. hmmLen; + al->refStart = domain. hmmStart; + al->refStop = domain. hmmStop; */ + al->targetLen = domain. seqLen; + al->targetStart = domain. seqStart; + al->targetStop = domain. seqStop; + al->setTargetAlign (); + ASSERT (! al->refExactlyMatched ()); + al->qc (); + batch. hmmAls << move (hmmAl); } } if (verbose ()) cout << "# Good HMMs: " << batch. hmmAls. size () << endl; - // Output - batch. process (skip_hmm_check); - - // For Batch::report() if (! gffFName. empty ()) { - const Gff gff (gffFName, ! gffMatchFName. empty ()); + unique_ptr annot; + if (bedP) + { + Annot::Bed bed; + annot. reset (new Annot (bed, gffFName)); + } + else + { + Annot::Gff gff; + annot. reset (new Annot (gff, gffFName, ! gffMatchFName. empty ())); + } + ASSERT (annot. get ()); map seqId2locusTag; if (! gffMatchFName. empty ()) { LineInput f (gffMatchFName); + Istringstream iss; + string seqId, locusTag; while (f. nextLine ()) { - istringstream iss (f. line); - string seqId, locusTag; + iss. reset (f. line); iss >> seqId >> locusTag; ASSERT (! locusTag. empty ()); seqId2locusTag [seqId] = locusTag; } + if (seqId2locusTag. empty ()) + throw runtime_error ("File " + gffMatchFName + " is empty"); } - for (auto& al : batch. goodBlastAls) + { + // Locus::contigLen + map contig2len; + if (! dnaLenFName. empty ()) + { + LineInput f (dnaLenFName); + string contig; + size_t len; + Istringstream iss; + while (f. nextLine ()) + { + iss. reset (f. line); + len = 0; + iss >> contig >> len; + ASSERT (len); + contig2len [contig] = len; + } + for (const auto& it : annot->prot2cdss) + for (const Locus& locus : it. second) + if (! locus. contigLen) + var_cast (locus). contigLen = contig2len [locus.contig]; + } + } + for (auto& al : batch. blastAls) if (al. targetProt) - { - ASSERT (al. cdss. empty ()); - #if 0 - string s (al. targetName); - trimSuffix (s, "|"); - string locusTag (rfindSplit (s, '|')); - #else - string locusTag = al. targetName; - #endif - if (! gffMatchFName. empty ()) - locusTag = seqId2locusTag [locusTag]; - if (const Set* cdss = findPtr (gff. seqid2cdss, locusTag)) - insertAll (al. cdss, *cdss); - else - throw runtime_error ("Locus tag \"" + locusTag + "\" is misssing in .gff file. Protein name: " + al. targetName); - al. qc (); - } + al. setCdss (seqId2locusTag, * annot. get ()); + for (auto& hmmAl : batch. hmmAls) + var_cast (hmmAl. blastAl. get ()) -> setCdss (seqId2locusTag, * annot. get ());; } + // Output + batch. process (skip_hmm_check); batch. report (cout); - - if (! outFName. empty ()) { OFStream ofs (outFName); diff --git a/amrfinder.cpp b/amrfinder.cpp new file mode 100644 index 0000000..e03e111 --- /dev/null +++ b/amrfinder.cpp @@ -0,0 +1,498 @@ +// amrfinder.cpp + +/*=========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* =========================================================================== +* +* Author: Vyacheslav Brover +* +* File Description: +* AMRFinder +* +*/ + + +#ifdef _MSC_VER + #error "UNIX is required" +#endif + +#undef NDEBUG +#include "common.inc" + +#include "common.hpp" +using namespace Common_sp; + +#include "amrfinder.inc" + + + +namespace +{ + + +// PAR +constexpr size_t threads_max_min = 1; // was: 4 +constexpr size_t threads_def = 4; +// Cf. amr_report.cpp +constexpr double ident_min_def = 0.9; +constexpr double partial_coverage_min_def = 0.5; + + + + +// ThisApplication + +struct ThisApplication : ShellApplication +{ + ThisApplication () + : ShellApplication ("Identify AMR genes in proteins and/or contigs and print a report", true, true, true) + { + addKey ("protein", "Protein FASTA file to search", "", 'p', "PROT_FASTA"); + addKey ("nucleotide", "Nucleotide FASTA file to search", "", 'n', "NUC_FASTA"); + addKey ("database", "Alternative directory with AMRFinder database. Default: $AMRFINDER_DB", "", 'd', "DATABASE_DIR"); + addFlag ("update", "Update the AMRFinder database", 'u'); // PD-2379 + addKey ("gff", "GFF file for protein locations. Protein id should be in the attribute 'Name=' (9th field) of the rows with type 'CDS' or 'gene' (3rd field).", "", 'g', "GFF_FILE"); + addKey ("ident_min", "Minimum identity for nucleotide hit (0..1). -1 means use a curated threshold if it exists and " + toString (ident_min_def) + " otherwise", "-1", 'i', "MIN_IDENT"); + addKey ("coverage_min", "Minimum coverage of the reference protein (0..1)", toString (partial_coverage_min_def), 'c', "MIN_COV"); + addKey ("organism", "Taxonomy group for point mutation assessment\n " ORGANISMS, "", 'O', "ORGANISM"); + addKey ("translation_table", "NCBI genetic code for translated blast", "11", 't', "TRANSLATION_TABLE"); + addFlag ("plus", "Add the plus genes to the report"); // PD-2789 + addKey ("point_mut_all", "File to report all target positions of reference point mutations", "", '\0', "POINT_MUT_ALL_FILE"); + addKey ("blast_bin", "Directory for BLAST. Deafult: $BLAST_BIN", "", '\0', "BLAST_DIR"); + addKey ("parm", "amr_report parameters for testing: -nosame -noblast -skip_hmm_check -bed", "", '\0', "PARM"); + addKey ("output", "Write output to OUTPUT_FILE instead of STDOUT", "", 'o', "OUTPUT_FILE"); + addFlag ("quiet", "Suppress messages to STDERR", 'q'); + addFlag ("gpipe", "Protein identifiers in the protein FASTA file have format 'gnl||'"); + #ifdef SVN_REV + version = SVN_REV; + #endif + #if 0 + setRequiredGroup ("protein", "Input"); + setRequiredGroup ("nucleotide", "Input"); + #endif + // threads_max: do not include blast/hmmsearch's threads ?? + } + + + + void initEnvironment () final + { + ShellApplication::initEnvironment (); + var_cast (name2arg ["threads"] -> asKey ()) -> defaultValue = to_string (threads_def); + } + + + + bool blastThreadable (const string &blast, + const string &logFName) const + { + try { exec (fullProg (blast) + " -help | grep '^ *\\-num_threads' > " + logFName + " 2> " + logFName, logFName); } + catch (const runtime_error &) + { return false; } + return true; + } + + + + size_t get_threads_max_max (const string &logFName) const + { +#if __APPLE__ + int count; + size_t count_len = sizeof(count); + sysctlbyname("hw.logicalcpu", &count, &count_len, NULL, 0); + // fprintf(stderr,"you have %i cpu cores", count); + return count; +#else + exec ("nproc --all > " + tmp + ".nproc", logFName); + LineInput f (tmp + ".nproc"); + return str2 (f. getString ()); +#endif + } + + + + void shellBody () const final + { + const string prot = shellQuote (getArg ("protein")); + const string dna = shellQuote (getArg ("nucleotide")); + string db = getArg ("database"); + const bool update = getFlag ("update"); + const string gff = shellQuote (getArg ("gff")); + const double ident = arg2double ("ident_min"); + const double cov = arg2double ("coverage_min"); + const string organism = shellQuote (getArg ("organism")); + const uint gencode = arg2uint ("translation_table"); + const bool add_plus = getFlag ("plus"); + const string point_mut_all = getArg ("point_mut_all"); + string blast_bin = getArg ("blast_bin"); + const string parm = getArg ("parm"); + const string output = shellQuote (getArg ("output")); + const bool quiet = getFlag ("quiet"); + const bool gpipe = getFlag ("gpipe"); + + + const string logFName (tmp + ".log"); + + + Stderr stderr (quiet); + stderr << "Running "<< getCommandLine () << '\n'; + const Verbose vrb (qc_on); + + if (threads_max < threads_max_min) + throw runtime_error ("Number of threads cannot be less than " + to_string (threads_max_min)); + + if (ident != -1.0 && (ident < 0.0 || ident > 1.0)) + throw runtime_error ("ident_min must be between 0 and 1"); + + if (cov < 0.0 || cov > 1.0) + throw runtime_error ("coverage_min must be between 0 and 1"); + + + if (! emptyArg (output)) + try { OFStream f (unQuote (output)); } + catch (...) { throw runtime_error ("Cannot open output file " + output); } + + + time_t start, end; // For timing... + start = time (NULL); + + + const size_t threads_max_max = get_threads_max_max (logFName); + if (threads_max > threads_max_max) + { + stderr << "The number of threads cannot be greater than " << threads_max_max << " on this computer" << '\n' + << "The current number of threads is " << threads_max << ", reducing to " << threads_max_max << '\n'; + threads_max = threads_max_max; + } + + + const string defaultDb (execDir + "/data/latest"); + + // db + if (db. empty ()) + { + if (const char* s = getenv ("AMRFINDER_DB")) + db = string (s); + else + db = defaultDb; + } + ASSERT (! db. empty ()); + + + if (update) + { + // PD-2447 + if (! emptyArg (prot) || ! emptyArg (dna)) + throw runtime_error ("AMRFinder -u/--update option cannot be run with -n/--nucleotide or -p/--protein options"); + if (! getArg ("database"). empty ()) + throw runtime_error ("AMRFinder update option (-u/--update) only operates on the default database directory. The -d/--database option is not permitted"); + if (getenv ("AMRFINDER_DB")) + { + cout << "WARNING: AMRFINDER_DB is set, but AMRFinder auto-update only downloads to the default database directory" << endl; + db = defaultDb; + } + const Dir dbDir (db); + if (! dbDir. items. empty () && dbDir. items. back () == "latest") + { + findProg ("amrfinder_update"); + exec (fullProg ("amrfinder_update") + " -d " + dbDir. getParent () + ifS (quiet, " -q") + ifS (qc_on, " --debug") + " > " + logFName, logFName); + } + else + cout << "WARNING: Updating database directory works only for databases with the default data directory format." << endl + << "Please see https://github.com/ncbi/amr/wiki for details." << endl + << "Current database directory is: " << strQuote (dbDir. getParent ()) << endl + << "New database directories will be created as subdirectories of " << strQuote (dbDir. getParent ()) << endl; + } + + + if (! directoryExists (db)) // PD-2447 + throw runtime_error ("No valid AMRFinder database found. To download the latest version to the default directory run amrfinder -u"); + + + string searchMode; + StringVector includes; + if (emptyArg (prot)) + if (emptyArg (dna)) + { + if (! update) + throw runtime_error ("Parameter --prot or --nucleotide must be present"); + } + else + { + if (! emptyArg (gff)) + throw runtime_error ("Parameter --gff is redundant"); + searchMode = "translated nucleotide"; + } + else + { + searchMode = "protein"; + if (emptyArg (dna)) + { + searchMode += "-only"; + includes << key2shortHelp ("nucleotide") + " and " + key2shortHelp ("gff") + " options to add translated searches"; + } + else + { + if (emptyArg (gff)) + throw runtime_error ("If parameters --prot and --nucleotide are present then parameter --gff must be present"); + searchMode = "combined translated and protein"; + } + } + if (emptyArg (organism)) + includes << key2shortHelp ("organism") + " option to add point-mutation searches"; + else + searchMode += " and point-mutation"; + + + if (searchMode. empty ()) + return; + + + stderr << "AMRFinder " << searchMode << " search with database " << db << "\n"; + for (const string& include : includes) + stderr << " - include " << include << '\n'; + + + // blast_bin + if (blast_bin. empty ()) + if (const char* s = getenv ("BLAST_BIN")) + blast_bin = string (s); + if (! blast_bin. empty ()) + { + if (! isRight (blast_bin, "/")) + blast_bin += "/"; + prog2dir ["blastp"] = blast_bin; + prog2dir ["blastx"] = blast_bin; + prog2dir ["blastn"] = blast_bin; + } + + string organism1; + if (! emptyArg (organism)) + { + organism1 = unQuote (organism); + replace (organism1, ' ', '_'); + } + + + if (! emptyArg (organism)) + { + string errMsg; + try { exec ("grep -w ^" + organism1 + " " + db + "/AMRProt-point_mut.tab > /dev/null 2> /dev/null"); } + catch (const runtime_error &) + { + errMsg = "No protein point mutations for organism " + organism; + } + if (! emptyArg (dna)) + if (! fileExists (db + "/AMR_DNA-" + organism1)) + errMsg = "No DNA point mutations for organism " + organism; + if (! errMsg. empty ()) + throw runtime_error (errMsg + "\nPossible organisms: " ORGANISMS); + } + + + const string qcS (qc_on ? "-qc -verbose 1" : ""); + const string force_cds_report (! emptyArg (dna) && ! emptyArg (organism) ? "-force_cds_report" : ""); // Needed for point_mut + + + findProg ("fasta_check"); + findProg ("fasta2parts"); + findProg ("amr_report"); + + + string blastp_par; + string blastx_par; + bool blastxChunks = false; + { + Threads th (threads_max - 1, true); + + double prot_share = 0.0; + double dna_share = 0.0; + if ( ! emptyArg (prot)) + prot_share = 1.0; // PAR + if (! emptyArg (dna)) + dna_share = 1.0; // PAR + const double total_share = prot_share + dna_share; + + string dna_ = dna; + if (! emptyArg (dna) && gpipe) + { + dna_ = tmp + ".dna"; + if (system (("sed 's/^>gnl|[^|]*|/>/1' " + dna + " > " + dna_). c_str ())) + throw runtime_error ("Cannot remove 'gnl' from " + dna); + } + + if (! emptyArg (prot)) + { + findProg ("blastp"); + findProg ("hmmsearch"); + + exec (fullProg ("fasta_check") + prot + " -aa -hyphen -log " + logFName, logFName); + + string gff_match; + if (! emptyArg (gff) && ! contains (parm, "-bed")) + { + string locus_tag; + const int status = system (("grep '^>.*\\[locus_tag=' " + prot + " > /dev/null"). c_str ()); + const bool locus_tagP = (status == 0); + if (locus_tagP || gpipe) + { + locus_tag = " -locus_tag " + tmp + ".match"; + gff_match = " -gff_match " + tmp + ".match"; + } + findProg ("gff_check"); + string dnaPar; + if (! emptyArg (dna)) + dnaPar = " -dna " + dna_; + const string gpipeS (ifS (gpipe, " -gpipe")); + exec (fullProg ("gff_check") + gff + " -prot " + prot + dnaPar + gpipeS + locus_tag + " -log " + logFName, logFName); + } + + if (! fileExists (db + "/AMRProt.phr")) + throw runtime_error ("BLAST database " + shellQuote (db + "/AMRProt") + " does not exist"); + + const size_t prot_threads = (size_t) floor ((double) th. getAvailable () * (prot_share / total_share) / 2.0); + + stderr << "Running blastp...\n"; + // " -task blastp-fast -word_size 6 -threshold 21 " // PD-2303 + string num_threads; + if (blastThreadable ("blastp", logFName) && prot_threads > 1) + num_threads = "-num_threads " + to_string (prot_threads); + th. exec (fullProg ("blastp") + " -query " + prot + " -db " + db + "/AMRProt -show_gis -evalue 1e-20 -comp_based_stats 0 " + + num_threads + " " + "-outfmt '6 qseqid sseqid length nident qstart qend qlen sstart send slen qseq sseq' " + "-out " + tmp + ".blastp > /dev/null 2> /dev/null", prot_threads); + + stderr << "Running hmmsearch...\n"; + string cpu; + if (prot_threads > 1) + cpu = "--cpu " + to_string (prot_threads); + th. exec (fullProg ("hmmsearch") + " --tblout " + tmp + ".hmmsearch --noali --domtblout " + tmp + ".dom --cut_tc -Z 10000 " + cpu + " " + db + "/AMR.LIB " + prot + " > /dev/null 2> /dev/null", prot_threads); + + blastp_par = "-blastp " + tmp + ".blastp -hmmsearch " + tmp + ".hmmsearch -hmmdom " + tmp + ".dom"; + if (! emptyArg (gff)) + blastp_par += " -gff " + gff + gff_match; + } + + if (! emptyArg (dna)) + { + stderr << "Running blastx...\n"; + findProg ("blastx"); + + exec (fullProg ("fasta_check") + dna_ + " -hyphen -len "+ tmp + ".len -log " + logFName, logFName); + const size_t threadsAvailable = th. getAvailable (); + //ASSERT (threadsAvailable); + if (threadsAvailable >= 2) + { + exec ("mkdir " + tmp + ".chunk"); + exec (fullProg ("fasta2parts") + dna_ + " " + to_string (threadsAvailable) + " " + tmp + ".chunk -log " + logFName, logFName); // PAR + exec ("mkdir " + tmp + ".blastx_dir"); + FileItemGenerator fig (false, true, tmp + ".chunk"); + string item; + while (fig. next (item)) + th << thread (exec, fullProg ("blastx") + " -query " + tmp + ".chunk/" + item + " -db " + db + "/AMRProt " + "-show_gis -word_size 3 -evalue 1e-20 -query_gencode " + to_string (gencode) + " " + "-seg no -comp_based_stats 0 -max_target_seqs 10000 " + "-outfmt '6 qseqid sseqid length nident qstart qend qlen sstart send slen qseq sseq' " + "-out " + tmp + ".blastx_dir/" + item + " > /dev/null 2> /dev/null", string ()); + blastxChunks = true; + } + else + th. exec (fullProg ("blastx") + " -query " + dna_ + " -db " + db + "/AMRProt " + "-show_gis -word_size 3 -evalue 1e-20 -query_gencode " + to_string (gencode) + " " + "-seg no -comp_based_stats 0 -max_target_seqs 10000 " + "-outfmt '6 qseqid sseqid length nident qstart qend qlen sstart send slen qseq sseq' " + "-out " + tmp + ".blastx > /dev/null 2> /dev/null", threadsAvailable); + blastx_par = "-blastx " + tmp + ".blastx -dna_len " + tmp + ".len"; + } + + if (! emptyArg (dna) && ! emptyArg (organism)) + { + QC_ASSERT (fileExists (db + "/AMR_DNA-" + organism1)); + findProg ("blastn"); + findProg ("point_mut"); + stderr << "Running blastn...\n"; + exec (fullProg ("blastn") + " -query " + dna_ + " -db " + db + "/AMR_DNA-" + organism1 + " -evalue 1e-20 -dust no " + "-outfmt '6 qseqid sseqid length nident qstart qend qlen sstart send slen qseq sseq' -out " + tmp + ".blastn > " + logFName + " 2> " + logFName, logFName); + } + } + + + if (blastxChunks) + exec ("cat " + tmp + ".blastx_dir/* > " + tmp + ".blastx"); + + + const string point_mut_allS (point_mut_all. empty () ? "" : ("-point_mut_all " + point_mut_all)); + const string coreS (add_plus ? "" : " -core"); + exec (fullProg ("amr_report") + " -fam " + db + "/fam.tab " + blastp_par + " " + blastx_par + + " -organism " + organism + " -point_mut " + db + "/AMRProt-point_mut.tab " + point_mut_allS + " " + + force_cds_report + " -pseudo" + coreS + + (ident == -1 ? string () : " -ident_min " + toString (ident)) + + " -coverage_min " + toString (cov) + + " " + qcS + " " + parm + " -log " + logFName + " > " + tmp + ".amr-raw", logFName); + + if (! emptyArg (dna) && ! emptyArg (organism)) + { + QC_ASSERT (fileExists (db + "/AMR_DNA-" + organism1)); + exec (fullProg ("point_mut") + tmp + ".blastn " + db + "/AMR_DNA-" + organism1 + ".tab " + qcS + " -log " + logFName + " > " + tmp + ".amr-snp", logFName); + exec ("tail -n +2 " + tmp + ".amr-snp >> " + tmp + ".amr-raw"); + } + + // $tmp.amr-raw --> $tmp.amr + + // timing the run + end = time(NULL); + stderr << "AMRFinder took " << end - start << " seconds to complete\n"; + + string sort_cols; + if ( ! force_cds_report. empty () + || ! blastx_par. empty () + || ! emptyArg (gff) + ) + sort_cols = " -k2 -k3n -k4n -k5"; + exec ("head -1 " + tmp + ".amr-raw > " + tmp + ".amr"); + exec ("tail -n +2 " + tmp + ".amr-raw | sort" + sort_cols + " -k1 >> " + tmp + ".amr"); + + + if (emptyArg (output)) + exec ("cat " + tmp + ".amr"); + else + exec ("cp " + tmp + ".amr " + output); + } +}; + + + +} // namespace + + + + +int main (int argc, + const char* argv[]) +{ + ThisApplication app; + return app. run (argc, argv); +} + + + diff --git a/amrfinder.inc b/amrfinder.inc new file mode 100644 index 0000000..7f665b6 --- /dev/null +++ b/amrfinder.inc @@ -0,0 +1,9 @@ +#define ORGANISMS "Campylobacter|Escherichia|Salmonella" +#if __APPLE__ + + #include + #include + +#endif + + diff --git a/amrfinder_update.cpp b/amrfinder_update.cpp new file mode 100644 index 0000000..7e2b3c1 --- /dev/null +++ b/amrfinder_update.cpp @@ -0,0 +1,287 @@ +// amrfinder_update.cpp + +/*=========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* =========================================================================== +* +* Author: Vyacheslav Brover +* +* File Description: +* Updating of AMRFinder data +* +*/ + + +#ifdef _MSC_VER + #error "UNIX is required" +#endif + +#undef NDEBUG +#include "common.inc" + +#include + +#include "common.hpp" +using namespace Common_sp; + +#include "amrfinder.inc" + + + +namespace +{ + + + +struct Curl +{ + CURL* eh {nullptr}; + + + Curl () + { eh = curl_easy_init (); + ASSERT (eh); + } + ~Curl () + { curl_easy_cleanup (eh); } + + + void download (const string &url, + const string &fName); + string read (const string &url); +}; + + + + +size_t write_stream_cb (char* ptr, + size_t size, + size_t nMemb, + void* userData) +{ + ASSERT (ptr); + ASSERT (size == 1); + ASSERT (userData); + + OFStream& f = * static_cast (userData); + FOR (size_t, i, nMemb) + f << ptr [i];; + + return nMemb; +} + + + +void Curl::download (const string &url, + const string &fName) +{ + ASSERT (! url. empty ()); + ASSERT (! fName. empty ()); + + OFStream f (fName); + curl_easy_setopt (eh, CURLOPT_URL, url. c_str ()); + curl_easy_setopt (eh, CURLOPT_WRITEFUNCTION, write_stream_cb); + curl_easy_setopt (eh, CURLOPT_WRITEDATA, & f); + EXEC_ASSERT (curl_easy_perform (eh) == 0); +} + + + +size_t write_string_cb (char* ptr, + size_t size, + size_t nMemb, + void* userData) +{ + ASSERT (ptr); + ASSERT (size == 1); + ASSERT (userData); + + string& s = * static_cast (userData); + FOR (size_t, i, nMemb) + s += ptr [i];; + + return nMemb; +} + + + +string Curl::read (const string &url) +{ + ASSERT (! url. empty ()); + + string s; s. reserve (1024); // PAR + curl_easy_setopt (eh, CURLOPT_URL, url. c_str ()); + curl_easy_setopt (eh, CURLOPT_WRITEFUNCTION, write_string_cb); + curl_easy_setopt (eh, CURLOPT_WRITEDATA, & s); + EXEC_ASSERT (curl_easy_perform (eh) == 0); + + return s; +} + +// + + + +// #define URL "https://ftp.ncbi.nlm.nih.gov/pathogen/Technical/AMRFinder_technical/v2.data/" +#define URL "https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/data/" + +string getLatestVersion (Curl &curl) +// Return: empty() <=> failure +{ + string prevLine; + const StringVector dir (curl. read (URL), '\n'); + if (verbose ()) + cout << dir << endl; + for (const string& line : dir) + { + if (contains (line, "latest/")) + break; + prevLine = line; + } + if (prevLine. empty ()) + return string (); + + const size_t pos1 = prevLine. find ('>'); + if (pos1 == string::npos) + return string (); + prevLine. erase (0, pos1 + 1); + + const size_t pos2 = prevLine. find ("/<"); + if (pos2 == string::npos) + return string (); + prevLine. erase (pos2); + + return prevLine; +} + + + +void fetchAMRFile (Curl &curl, + const string &dir, + const string &fName) +{ + ASSERT (isDirName (dir)); + ASSERT (! fName. empty ()); + curl. download (string (URL "latest/") + fName, dir + fName); +} + + + + +// ThisApplication + +struct ThisApplication : ShellApplication +{ + ThisApplication () + : ShellApplication ("Identify AMR genes in proteins and/or contigs and print a report", false, true, true) + { + addKey ("database", "Directory for all versions of AMRFinder databases", "$BASE/data", 'd', "DATABASE_DIR"); + // Symbolic link ?? + addFlag ("quiet", "Suppress messages to STDERR", 'q'); + #ifdef SVN_REV + version = SVN_REV; + #endif + } + + + + void shellBody () const final + { + string mainDirOrig = getArg ("database"); + const bool quiet = getFlag ("quiet"); + + + Stderr stderr (quiet); + stderr << "Running "<< getCommandLine () << '\n'; + const Verbose vrb (qc_on); + + // mainDir + const Dir mainDir (mainDirOrig); + string mainDirS (mainDir. get ()); + if (! isRight (mainDirS, "/")) + mainDirS += "/"; + + findProg ("makeblastdb"); + findProg ("hmmpress"); + + Curl curl; + + + const string latest_version (getLatestVersion (curl)); + if (latest_version. empty ()) + throw runtime_error ("Cannot get the latest version"); + + const string latestDir (mainDirS + latest_version + "/"); + const string latestLink (mainDirS + "latest"); + + if (! directoryExists (mainDirS)) + exec ("mkdir " + mainDirS); + + if (directoryExists (latestDir)) + stderr << latestDir << " already exists, overwriting what was there\n"; + else + exec ("mkdir " + latestDir); + + if (directoryExists (latestLink)) + exec ("rm " + latestLink); + exec ("ln -s " + latest_version + " " + latestLink); + + StringVector dnaPointMuts (ORGANISMS, '|'); + + stderr << "Dowloading AMRFinder database version " << latest_version << " into " << latestDir << "\n"; + fetchAMRFile (curl, latestDir, "AMR.LIB"); + fetchAMRFile (curl, latestDir, "AMRProt"); + fetchAMRFile (curl, latestDir, "AMRProt-point_mut.tab"); + fetchAMRFile (curl, latestDir, "AMR_CDS"); + for (const string& dnaPointMut : dnaPointMuts) + { + fetchAMRFile (curl, latestDir, "AMR_DNA-" + dnaPointMut); + fetchAMRFile (curl, latestDir, "AMR_DNA-" + dnaPointMut + ".tab"); + } + fetchAMRFile (curl, latestDir, "fam.tab"); + fetchAMRFile (curl, latestDir, "changes.txt"); + + stderr << "Indexing" << "\n"; + exec (fullProg ("hmmpress") + " -f " + latestDir + "AMR.LIB > /dev/null 2> /dev/null"); + exec (fullProg ("makeblastdb") + " -in " + latestDir + "AMRProt -dbtype prot -logfile /dev/null"); + exec (fullProg ("makeblastdb") + " -in " + latestDir + "AMR_CDS -dbtype nucl -logfile /dev/null"); + for (const string& dnaPointMut : dnaPointMuts) + exec (fullProg ("makeblastdb") + " -in " + latestDir + "AMR_DNA-" + dnaPointMut + " -dbtype nucl -logfile /dev/null"); + } +}; + + + +} // namespace + + + +int main (int argc, + const char* argv[]) +{ + ThisApplication app; + return app. run (argc, argv); +} + + + diff --git a/common.cpp b/common.cpp index 193a251..833a90f 100644 --- a/common.cpp +++ b/common.cpp @@ -41,8 +41,18 @@ #include #ifndef _MSC_VER #include +//#include #endif -#include +#include + + + + +[[noreturn]] void errorThrow (const string &msg) +{ + throw std::logic_error (msg); +} + @@ -53,11 +63,60 @@ namespace Common_sp vector programArgs; string programName; + + + +string getCommandLine () +{ + string commandLine; + for (const string& s : programArgs) + { + const bool bad = s. empty () + || contains (s, ' ') + || contains (s, '|') + || contains (s, ';') + || contains (s, '#') + || contains (s, '*') + || contains (s, '?') + || contains (s, '$') + || contains (s, '(') + || contains (s, ')') + || contains (s, '<') + || contains (s, '>') + || contains (s, '~') + || contains (s, '\'') + || contains (s, '"') + || contains (s, '\\'); + if (! commandLine. empty ()) + commandLine += ' '; + if (bad) + commandLine += strQuote (s, '\''); + else + commandLine += s; + } + return commandLine; +} + + + ostream* logPtr = nullptr; bool qc_on = false; -size_t threads_max = 1; ulong seed_global = 1; +bool sigpipe = false; + + +// thread +size_t threads_max = 1; + +inline thread::id get_thread_id () + { return this_thread::get_id (); } + +thread::id main_thread_id = get_thread_id (); + +bool isMainThread () + { return get_thread_id () == main_thread_id; } + bool Chronometer::enabled = false; @@ -72,6 +131,15 @@ void segmFaultHandler (int /*sig_num*/) errorExit ("Segmentation fault", true); } + +[[noreturn]] void sigpipe_handler (int /*sig_num*/) +{ + signal (SIGPIPE, SIG_DFL); + if (sigpipe) + exit (0); + exit (1); +} + } @@ -81,17 +149,18 @@ bool initCommon () MODULE_INIT signal (SIGSEGV, segmFaultHandler); + signal (SIGPIPE, sigpipe_handler); #ifdef _MSC_VER #pragma warning (disable : 4127) #endif - ASSERT (numeric_limits::is_signed); - ASSERT (sizeof (long) >= 4); + static_assert (numeric_limits::is_signed, "char is signed"); + static_assert (sizeof (long) >= 4, "long size >= 4"); #ifdef _MSC_VER #pragma warning (default : 4127) #endif - ASSERT (SIZE_MAX == std::numeric_limits::max ()); + static_assert (SIZE_MAX == std::numeric_limits::max (), "SIZE_MAX is correct"); return true; } @@ -104,32 +173,33 @@ namespace -void errorExit (const char* msg, - bool segmFault) +[[noreturn]] void errorExit (const char* msg, + bool segmFault) // alloc() may not work { - ostream* os = logPtr ? logPtr : & cout; + ostream* os = logPtr ? logPtr : & cerr; // time ?? #ifndef _MSC_VER const char* hostname = getenv ("HOSTNAME"); const char* pwd = getenv ("PWD"); #endif - *os << endl - << "*** ERROR ***" << endl - << msg << endl - #ifndef _MSC_VER - << "HOSTNAME: " << (hostname ? hostname : "?") << endl - << "PWD: " << (pwd ? pwd : "?") << endl - #endif - << "Progam name: " << programName << endl - << "Command line:"; - for (const string& s : programArgs) - *os << " " << s; - *os << endl; -//system (("env >> " + logFName). c_str ()); - - os->flush (); + const string errorS ("*** ERROR ***"); + if (isLeft (msg, errorS)) // msg already is the result of errorExit() + *os << endl << msg << endl; + else + *os << endl + << errorS << endl + << msg << endl << endl + #ifndef _MSC_VER + << "HOSTNAME: " << (hostname ? hostname : "?") << endl + << "PWD: " << (pwd ? pwd : "?") << endl + #endif + << "Progam name: " << programName << endl + << "Command line: " << getCommandLine () << endl; + //system (("env >> " + logFName). c_str ()); + + os->flush (); if (segmFault) abort (); @@ -138,6 +208,29 @@ void errorExit (const char* msg, +#if 0 +#ifndef _MSC_VER +string getStack () +{ + string s; + constexpr size_t size = 100; // PAR + void* buffer [size]; + const int nptrs = backtrace (buffer, size); + char** strings = backtrace_symbols (buffer, nptrs); + if (strings) + FOR (int, j, nptrs) + s += string (strings [j]) + "\n"; // No function names ?? + else + s = "Cannot get stack trace"; +//free (strings); + + return s; +} +#endif +#endif + + + namespace { @@ -261,6 +354,20 @@ bool goodName (const string &name) +bool isIdentifier (const string& name) +{ + if (name. empty ()) + return false; + if (isDigit (name [0])) + return false; + for (const char c : name) + if (! isLetter (c)) + return false; + return true; +} + + + bool strBlank (const string &s) { for (const char c : s) @@ -364,6 +471,31 @@ void trimTrailing (string &s) +void trimLeading (string &s, + char c) +{ + size_t i = 0; + for (; i < s. size () && s [i] == c; i++) + ; + s. erase (0, i); +} + + + +void trimTrailing (string &s, + char c) +{ + size_t i = s. size (); + while (i) + if (s [i - 1] == c) + i--; + else + break; + s. erase (i); +} + + + size_t containsWord (const string& hay, const string& needle) { @@ -410,24 +542,38 @@ void replaceStr (string &s, const string &from, const string &to) { + ASSERT (! from. empty ()); + if (from == to) return; - + + const bool inside = contains (to, from); + + size_t start = 0; for (;;) { - const size_t pos = s. find (from); + const size_t pos = s. find (from, start); if (pos == string::npos) break; - #if 0 - s = s. substr (0, pos) + to + s. substr (pos + from. size ()); - #else s. replace (pos, from. size (), to); - #endif + start = pos + (inside ? to. size () : 0); } } +string strQuote (const string &s, + char quote) +{ + string s1 (s); + replaceStr (s1, "\\", "\\\\"); + const char slashQuote [] = {'\\', quote, '\0'}; + replaceStr (s1, string (1, quote), slashQuote); + return quote + s1 + quote; +} + + + string to_c (const string &s) { string r; @@ -539,13 +685,21 @@ string rfindSplit (string &s, +void reverse (string &s) +{ + FFOR (size_t, i, s. size () / 2) + swap (s [i], s [s. size () - 1 - i]); +} + + + List str2list (const string &s, char c) { List res; string s1 (s); while (! s1. empty ()) - res << findSplit (s1, c); + res << move (findSplit (s1, c)); return res; } @@ -555,11 +709,13 @@ string list2str (const List &strList, const string &sep) { string s; + bool first = true; for (const string& e : strList) { - if (! s. empty ()) + if (! first) s += sep; s += e; + first = false; } return s; } @@ -596,6 +752,71 @@ bool directoryExists (const string &dirName) + +Dir::Dir (const string &name) +{ + ASSERT (! name. empty ()); + + items = str2list (name, fileSlash); + + auto it = items. begin (); + while (it != items. end ()) + if (it->empty () && it != items. begin ()) + { + auto it1 = items. erase (it); + it = it1; + } + else + it++; + + it = items. begin (); + while (it != items. end ()) + if (*it == ".") + { + auto it1 = items. erase (it); + it = it1; + } + else + it++; + + it = items. begin (); + while (it != items. end ()) + if (*it == ".." && it != items. begin ()) + { + auto it1 = items. erase (it); + it1--; + if (it1->empty ()) + { + ASSERT (it1 == items. begin ()); + *it1 = ".."; + it = it1; + } + else + { + auto it2 = items. erase (it1); + it = it2; + } + } + else + it++; +} + + + + +streampos getFileSize (const string &fName) +{ + try + { + ifstream f (fName, ios::ate | ios::binary); + return f. tellg (); + } + catch (const exception &e) + { throw runtime_error ("Cannot open file " + strQuote (fName) + "\n" + e. what ()); } +} + + + // size_t strMonth2num (const string& month) @@ -635,7 +856,7 @@ size_t strMonth2num (const string& month) bool getChar (istream &is, char &c) { - ASSERT (is. good ()); + QC_ASSERT (is. good ()); const int i = is. get (); c = EOF; @@ -662,7 +883,7 @@ void skipLine (istream &is) char c = '\0'; while (! is. eof () && c != '\n') // UNIX { - ASSERT (is. good ()); + QC_ASSERT (is. good ()); c = (char) is. get (); } #endif @@ -754,8 +975,8 @@ void Rand::qc () const { if (! qc_on) return; - ASSERT (seed > 0); - ASSERT (seed < max_); + QC_ASSERT (seed > 0); + QC_ASSERT (seed < max_); } @@ -792,6 +1013,12 @@ namespace } +int getVerbosity () +{ + return verbose_; +} + + bool verbose (int inc) { return Verbose::enabled () ? (verbose_ + inc > 0) : false; @@ -821,7 +1048,6 @@ Verbose::~Verbose () } - Unverbose::Unverbose () { if (Verbose::enabled ()) @@ -838,6 +1064,75 @@ Unverbose::~Unverbose () +// + +void exec (const string &cmd, + const string &logFName) +{ + ASSERT (! cmd. empty ()); + +//Chronometer_OnePass cop (cmd); + if (verbose ()) + cout << cmd << endl; + + const int status = system (cmd. c_str ()); + if (status) + { + if (! logFName. empty ()) + { + LineInput f (logFName); + throw runtime_error (f. getString ()); + } + throw runtime_error ("Command failed:\n" + cmd + "\nstatus = " + toString (status)); + } +} + + + + +// Threads + +Threads::Threads (size_t threadsToStart_arg, + bool quiet_arg) +: quiet (quiet_arg) +{ + if (! isMainThread ()) + throw logic_error ("Threads are started not from the main thread"); + if (! empty ()) + throw logic_error ("Previous threads have not finished"); + + threadsToStart = threadsToStart_arg; + if (threadsToStart >= threads_max) + throw logic_error ("Too many threads to start"); + + threads. reserve (threadsToStart); + + if (! quiet && verbose (1)) + cerr << "# Threads started: " << threadsToStart + 1 << endl; +} + + + +Threads::~Threads () +{ + { + Progress prog (threads. size () + 1, ! quiet); + const string step ("consecutive threads finished"); + prog (step); // Main thread + for (auto& t : threads) + { + t. join (); + prog (step); + } + } + + threads. clear (); + threadsToStart = 0; +} + + + + // Root void Root::saveFile (const string &fName) const @@ -854,24 +1149,13 @@ void Root::saveFile (const string &fName) const // Named -Named::Named (const string& name_arg) -: name (name_arg) -{ -#ifndef NDEBUG - if (! goodName (name)) - ERROR_MSG ("Bad name: \"" + name_arg + "\""); -#endif -} - - - void Named::qc () const { if (! qc_on) return; Root::qc (); - ASSERT (goodName (name)); + QC_ASSERT (goodName (name)); } @@ -898,6 +1182,18 @@ StringVector::StringVector (const string &fName, +StringVector::StringVector (const string &s, + char c) +{ + StringVector res; + string s1 (s); + while (! s1. empty ()) + *this << move (findSplit (s1, c)); +} + + + + // DisjointCluster @@ -938,6 +1234,24 @@ size_t Progress::beingUsed = 0; +void Progress::report () const +{ + cerr << '\r'; +#ifndef _MSC_VER + cerr << "\033[2K"; +#endif + cerr << n; + if (n_max) + cerr << " / " << n_max; + if (! step. empty ()) + cerr << ' ' << step; + cerr << ' '; + if (! Threads::empty () && ! contains (step, "thread")) + cerr << "(main thread) "; +} + + + // Input @@ -946,21 +1260,47 @@ Input::Input (const string &fName, uint displayPeriod) : buf (new char [bufSize]) , ifs (fName) +, is (& ifs) , prog (0, displayPeriod) { if (! ifs. good ()) - throw runtime_error ("Bad file: '" + fName + "'"); + throw runtime_error ("Cannot open file " + strQuote (fName)); if (! ifs. rdbuf () -> pubsetbuf (buf. get (), (long) bufSize)) - throw runtime_error ("Cannot allocate buffer to '" + fName + "'"); + throw runtime_error ("Cannot allocate buffer to file " + strQuote (fName)); +} + + + +Input::Input (istream &is_arg, + uint displayPeriod) +: is (& is_arg) +, prog (0, displayPeriod) +{ + QC_ASSERT (is); + if (! is->good ()) + throw runtime_error ("Bad input stream"); } +void Input::reset () +{ + ASSERT (is == & ifs); + + ifs. seekg (0); + QC_ASSERT (ifs. good ()); + prog. reset (); +} + + + // LineInput bool LineInput::nextLine () { + ASSERT (is); + if (eof) { line. clear (); @@ -969,12 +1309,20 @@ bool LineInput::nextLine () try { - readLine (ifs, line); + readLine (*is, line); + const bool end = line. empty () && is->eof (); + + if (! commentStart. empty ()) + { + const size_t pos = line. find (commentStart); + if (pos != string::npos) + line. erase (pos); + } trimTrailing (line); - eof = ifs. eof (); + + eof = is->eof (); lineNum++; - const bool end = line. empty () && eof; if (! end) prog (); @@ -993,15 +1341,17 @@ bool LineInput::nextLine () bool ObjectInput::next (Root &row) { + ASSERT (is); + row. clear (); if (eof) return false; - row. read (ifs); + row. read (*is); lineNum++; - eof = ifs. eof (); + eof = is->eof (); if (eof) { ASSERT (row. empty ()); @@ -1010,10 +1360,10 @@ bool ObjectInput::next (Root &row) prog (); - ASSERT (ifs. peek () == '\n'); + ASSERT (is->peek () == '\n'); row. qc (); - skipLine (ifs); + skipLine (*is); return true; } @@ -1024,11 +1374,13 @@ bool ObjectInput::next (Root &row) char CharInput::get () { + ASSERT (is); + ungot = false; - const char c = (char) ifs. get (); + const char c = (char) is->get (); - eof = ifs. eof (); + eof = is->eof (); ASSERT (eof == (c == (char) EOF)); if (eol) @@ -1049,9 +1401,10 @@ char CharInput::get () void CharInput::unget () { + ASSERT (is); ASSERT (! ungot); - ifs. unget (); + is->unget (); charNum--; ungot = true; } @@ -1074,48 +1427,114 @@ string CharInput::getLine () +// PairFile + +bool PairFile::next () +{ + if (! f. nextLine ()) + return false; + + iss. reset (f. line); + name2. clear (); + iss >> name1 >> name2; + + if (name2. empty ()) + throw runtime_error ("No pair: " + strQuote (name1) + " - " + strQuote (name2)); + if (! sameAllowed && name1 == name2) + throw runtime_error ("Same name: " + name1); + + if (orderNames && name1 > name2) + swap (name1, name2); + + return true; +} + + + + // Token void Token::readInput (CharInput &in) { - clear (); + ASSERT (empty ()); + // Skip spaces char c = '\0'; do { c = in. get (); } while (! in. eof && isSpace (c)); if (in. eof) - { - ASSERT (empty ()); return; - } - charNum = in. charNum; - if (c == quote) + charNum = in. charNum; + + if ( c == '\'' + || c == '\"' + ) { type = eText; + quote = c; for (;;) { c = in. get (); + if (in. eof) + throw CharInput::Error (in, "Text is not finished: end of file", false); if (in. eol) - throw CharInput::Error (in, "ending quote"); + //throw CharInput::Error (in, "ending quote", false); + continue; if (c == quote) break; name += c; } } - else if (isDigit (c)) + else if (isDigit (c) || c == '-') { - type = eNumber; - while (! in. eof && isDigit (c)) + while ( ! in. eof + && ( isDigit (c) + || c == '.' + || c == 'e' + || c == 'E' + || c == '-' + ) + ) { name += c; c = in. get (); } if (! in. eof) in. unget (); - num = str2 (name); - ASSERT (num != numeric_limits::max ()); + if (name == "-") + type = eDelimiter; + else + { + strLower (name); + if ( contains (name, '.') + || contains (name, 'e') + ) + { + type = eDouble; + d = str2 (name); + decimals = 0; + size_t pos = name. find ('.'); + if (pos != string::npos) + { + pos++; + while ( pos < name. size () + && isDigit (name [pos]) + ) + { + pos++; + decimals++; + } + } + scientific = contains (name, 'e'); + } + else + { + type = eInteger; + n = str2 (name); + } + } } else if (isLetter (c)) { @@ -1128,12 +1547,22 @@ void Token::readInput (CharInput &in) if (! in. eof) in. unget (); } - else + else if (Common_sp::isDelimiter (c)) { - ASSERT (type == eDelimiter); + type = eDelimiter; name = c; } + else + throw CharInput::Error (in, "Unknown token starting with ASCII " + toString ((int) c), false); + ASSERT (! empty ()); + qc (); + + if (verbose ()) + { + cout << type2str (type) << ' '; + cout << *this << endl; + } } @@ -1144,28 +1573,50 @@ void Token::qc () const return; if (! empty ()) { - ASSERT (! name. empty ()); - ASSERT (! contains (name, quote)); - IMPLY (type != eText, ! contains (name, ' ')); - IMPLY (type != eNumber, num == noNum); - const char c = name [0]; + QC_IMPLY (type != eText, ! name. empty ()); + QC_IMPLY (type != eText, ! contains (name, ' ')); + QC_IMPLY (type != eText, quote == '\0'); + QC_ASSERT (! contains (name, quote)); + QC_IMPLY (type != eDouble, decimals == 0); switch (type) { - case eText: break; - case eNumber: ASSERT (isDigit (c)); - ASSERT (num != noNum); + case eName: QC_ASSERT (isIdentifier (name)); break; - case eName: ASSERT (isLetter (c) && ! isDigit (c)); + case eText: break; + case eInteger: + case eDouble: QC_ASSERT (name [0] == '-' || isDigit (name [0])); break; - case eDelimiter: ASSERT (name. size () == 1); + case eDelimiter: QC_ASSERT (name. size () == 1); + QC_ASSERT (Common_sp::isDelimiter (name [0])); break; - default: throw runtime_error ("Unknown type"); + default: throw runtime_error ("Token: Unknown type"); } } } +void Token::saveText (ostream &os) const +{ + if (empty ()) + return; + + switch (type) + { + case eName: os << name; break; + case eText: os << quote << name << quote; break; + case eInteger: os << n; break; + case eDouble: { + const ONumber on (os, decimals, scientific); + os << d; + } + break; + case eDelimiter: os << name; break; + default: throw runtime_error ("Token: Unknown type"); + } +} + + // OFStream @@ -1173,8 +1624,8 @@ void OFStream::open (const string &dirName, const string &fileName, const string &extension) { - ASSERT (! fileName. empty ()); - ASSERT (! is_open ()); + QC_ASSERT (! fileName. empty ()); + QC_ASSERT (! is_open ()); string pathName; if (! dirName. empty () && ! isDirName (dirName)) @@ -1186,7 +1637,7 @@ void OFStream::open (const string &dirName, ofstream::open (pathName); if (! good ()) - throw runtime_error ("Cannot create file \"" + pathName + "\""); + throw runtime_error ("Cannot create file " + strQuote (pathName)); } @@ -1196,7 +1647,7 @@ void OFStream::open (const string &dirName, string Csv::getWord () { - ASSERT (goodPos ()); + QC_ASSERT (goodPos ()); size_t start = pos; size_t stop = NO_INDEX; @@ -1220,18 +1671,18 @@ string Csv::getWord () -void csvLine2vec (const string &line, - StringVector &words) +StringVector csvLine2vec (const string &line) { - words. clear (); + StringVector words; words. reserve (256); // PAR Csv csv (line); string s; while (csv. goodPos ()) { s = csv. getWord (); trim (s); - words << s; + words << move (s); } + return words; } @@ -1247,123 +1698,75 @@ Json::Json (JsonContainer* parent, if (const JsonArray* jArray = parent->asJsonArray ()) { ASSERT (name. empty ()); - const_cast (jArray) -> data << this; + var_cast (jArray) -> data << this; } else if (const JsonMap* jMap = parent->asJsonMap ()) { ASSERT (! name. empty ()); ASSERT (! contains (jMap->data, name)); - const_cast (jMap) -> data [name] = this; + var_cast (jMap) -> data [name] = this; } + // throw() in a child constructor will invoke terminate() if an ancestor is JsonArray or JsonMap } -Token Json::readToken (istream &is) +void Json::parse (CharInput& in, + const Token& firstToken, + JsonContainer* parent, + const string& name) { - const string delim ("[]{},:"); - - string s; - bool spaces = true; - bool isC = false; - while (is) + switch (firstToken. type) { - char c; - is. get (c); - const bool isDelim = charInSet (c, delim); - if (spaces) - if (isSpace (c)) - continue; - else + case Token::eName: { - if (isDelim) - return Token (string (1, c), Token::eDelimiter); - spaces = false; - isC = c == '\''; + string tokenName (firstToken. name); + strLower (tokenName); + if ( tokenName == "null" + || tokenName == "nan" + ) + new JsonNull (parent, name); + else if (tokenName == "true") + new JsonBoolean (true, parent, name); + else if (tokenName == "false") + new JsonBoolean (false, parent, name); + else + new JsonString (firstToken. name, parent, name); } - else - { - // Opposite to toStr() - if (isC) + break; + case Token::eText: new JsonString (firstToken. name, parent, name); break; + case Token::eInteger: new JsonInt (firstToken. n, parent, name); break; + case Token::eDouble: new JsonDouble (firstToken. d, firstToken. decimals, parent, name); break; + case Token::eDelimiter: + switch (firstToken. name [0]) { - if (c == '\'') - break; - else - if (c == '\\') - { - is. get (c); - if (c == 'n') - c = '\n'; - } + case '{': new JsonMap (in, parent, name); break; + case '[': new JsonArray (in, parent, name); break; + default: throw CharInput::Error (in, "Bad delimiter", false); } - else - if (isSpace (c) || isDelim) - { - is. unget (); - break; - } - } - - s += c; + break; + default: throw CharInput::Error (in, "Bad token", false); } - - if (isC) - s. erase (0, 1); - else - ASSERT (! s. empty ()); - - return Token (s, isC ? Token::eText : Token::eNumber); } -void Json::parse (istream& is, - const Token& firstToken, - JsonContainer* parent, - const string& name) -{ - string s (firstToken. name); - strLower (s); - - if (firstToken. type == Token::eDelimiter && s == "{") - new JsonMap (is, parent, name); - else if (firstToken. type == Token::eDelimiter && s == "[") - new JsonArray (is, parent, name); - else if (firstToken. type == Token::eText && s == "null") - new JsonNull (parent, name); - else if (firstToken. type == Token::eNumber) - { - if ( contains (s, '.') - || contains (s, 'e') - ) - { - uint decimals = 0; - size_t pos = s. find ('.'); - if (pos != string::npos) - { - pos++; - while ( pos < s. size () - && isDigit (s [pos]) - ) - { - pos++; - decimals++; - } - } - new JsonDouble (str2 (s), decimals, parent, name); - } - else - new JsonInt (str2 (s), parent, name); - } - else - new JsonString (firstToken. name, parent, name); +string Json::getString () const +{ + const auto* this_ = this; + if (! this_ || asJsonNull ()) + throw runtime_error ("undefined"); + if (const JsonString* j = asJsonString ()) + return j->s; + throw runtime_error ("Not a JsonString"); } int Json::getInt () const { - if (! this || asJsonNull ()) + const auto* this_ = this; + if (! this_ || asJsonNull ()) throw runtime_error ("undefined"); if (const JsonInt* j = asJsonInt ()) return j->n; @@ -1374,10 +1777,11 @@ int Json::getInt () const double Json::getDouble () const { - if (! this) + const auto* this_ = this; + if (! this_) throw runtime_error ("undefined"); if (asJsonNull ()) - return NAN; + return numeric_limits::quiet_NaN (); if (const JsonDouble* j = asJsonDouble ()) return j->n; throw runtime_error ("Not a JsonDouble"); @@ -1385,20 +1789,22 @@ double Json::getDouble () const -string Json::getString () const +bool Json::getBoolean () const { - if (! this || asJsonNull ()) + const auto* this_ = this; + if (! this_ || asJsonNull ()) throw runtime_error ("undefined"); - if (const JsonString* j = asJsonString ()) - return j->s; - throw runtime_error ("Not a JsonString"); + if (const JsonBoolean* j = asJsonBoolean ()) + return j->b; + throw runtime_error ("Not a JsonBoolean"); } const Json* Json::at (const string& name_arg) const { - if (! this) + const auto* this_ = this; + if (! this_) throw runtime_error ("undefined"); if (asJsonNull ()) return nullptr; @@ -1411,7 +1817,8 @@ const Json* Json::at (const string& name_arg) const const Json* Json::at (size_t index) const { - if (! this) + const auto* this_ = this; + if (! this_) throw runtime_error ("undefined"); if (asJsonNull ()) return nullptr; @@ -1429,7 +1836,8 @@ const Json* Json::at (size_t index) const size_t Json::getSize () const { - if (! this) + const auto* this_ = this; + if (! this_) throw runtime_error ("undefined"); if (asJsonNull ()) return 0; @@ -1443,7 +1851,7 @@ size_t Json::getSize () const // JsonArray -JsonArray::JsonArray (istream& is, +JsonArray::JsonArray (CharInput& in, JsonContainer* parent, const string& name) : JsonContainer (parent, name) @@ -1451,15 +1859,16 @@ JsonArray::JsonArray (istream& is, bool first = true; for (;;) { - Token token (readToken (is)); + Token token (in); if (token. isDelimiter (']')) break; if (! first) { - ASSERT (token. isDelimiter (',')); - token = readToken (is); + if (! token. isDelimiter (',')) + throw CharInput::Error (in, "\',\'"); + token = Token (in); } - parse (is, token, this, string()); + parse (in, token, this, string()); first = false; } } @@ -1495,36 +1904,41 @@ JsonMap::JsonMap () JsonMap::JsonMap (const string &fName) { - ifstream ifs (fName. c_str ()); - if (! ifs. good ()) - throw runtime_error ("Bad file: '" + fName + "'"); - const Token token (readToken (ifs)); + CharInput in (fName); + const Token token (in); if (! token. isDelimiter ('{')) - throw runtime_error ("Json file: '" + fName + "': should start with '{'"); - parse (ifs); + throw CharInput::Error (in, "Json file " + strQuote (fName) + ": text should start with '{'", false); + parse (in); } -void JsonMap::parse (istream& is) +void JsonMap::parse (CharInput& in) { ASSERT (data. empty ()); bool first = true; for (;;) { - Token token (readToken (is)); + Token token (in); if (token. isDelimiter ('}')) break; if (! first) { - ASSERT (token. isDelimiter (',')); - token = readToken (is); + if (! token. isDelimiter (',')) + throw CharInput::Error (in, "\',\'"); + token = Token (in); } - ASSERT (! token. name. empty ()); - const Token colon (readToken (is)); - ASSERT (colon. isDelimiter (':')); - Json::parse (is, readToken (is), this, token. name); + if ( token. type != Token::eName + && token. type != Token::eText + ) + throw CharInput::Error (in, "name or text"); + string name (token. name); + const Token colon (in); + if (! colon. isDelimiter (':')) + throw CharInput::Error (in, "\':\'"); + token = Token (in); + Json::parse (in, token, this, name); first = false; } } @@ -1533,7 +1947,7 @@ void JsonMap::parse (istream& is) JsonMap::~JsonMap () { - for (auto it : data) + for (auto& it : data) delete it. second; } @@ -1543,7 +1957,7 @@ void JsonMap::print (ostream& os) const { os << "{"; bool first = true; - for (const auto it : data) + for (const auto& it : data) { if (! first) os << ","; @@ -1571,17 +1985,6 @@ size_t Offset::size = 0; -// - -void exec (const string &cmd) -{ - ASSERT (! cmd. empty ()); - EXEC_ASSERT (system (cmd. c_str ()) == 0); -} - - - - // FileItemGenerator FileItemGenerator::FileItemGenerator (size_t progress_displayPeriod, @@ -1602,14 +2005,22 @@ FileItemGenerator::FileItemGenerator (size_t progress_displayPeriod, strcat (lsfName, "/XXXXXX"); EXEC_ASSERT (mkstemp (lsfName) != -1); ASSERT (lsfName [0]); - const int res = system (("ls " + fName + " > " + lsfName). c_str ()); - //printf ("res = %d\n", res); - ASSERT (! res); + const int res = system (("ls -a " + fName + " > " + lsfName). c_str ()); + if (res) + throw runtime_error ("Command ls failed: status = " + to_string (res)); fName = lsfName; #endif } f. open (fName); - ASSERT (f. good ()); + QC_ASSERT (f. good ()); + if (isDir) + { + string s; + readLine (f, s); + ASSERT (s == "."); + readLine (f, s); + ASSERT (s == ".."); + } } @@ -1646,48 +2057,114 @@ Application::Arg::Arg (const string &name_arg, { trim (name); ASSERT (! name. empty ()); - ASSERT (! contains (name, ' ')); - + ASSERT (! contains (name, ' ')); + ASSERT (! contains (name, '=')); + ASSERT (! isLeft (name, "-")); ASSERT (! description. empty ()); + ASSERT (name != Application::helpS); + ASSERT (name != Application::versionS); +} + + + +void Application::Arg::qc () const +{ + if (! qc_on) + return; + + QC_ASSERT (! name. empty ()); + QC_ASSERT (! description. empty ()); +} + + + +void Application::Key::qc () const +{ + if (! qc_on) + return; + Arg::qc (); + + QC_IMPLY (app. gnu, name. size () > 1); + QC_IMPLY (acronym, app. gnu); + QC_ASSERT (isUpper (var)); + QC_ASSERT (! var. empty () == (app. gnu && ! flag)); + QC_IMPLY (! requiredGroup. empty (), defaultValue. empty ()); } void Application::Key::saveText (ostream &os) const { - os << "[-" << name; - if (! flag) - { - os << " "; - const bool quoted = value. empty () || contains (value, ' '); - if (quoted) - os << "\""; - os << value; - if (quoted) - os << "\""; - } - os << "]"; + os << '-'; + if (app. gnu) + { + os << "-" << name; + if (! flag) + os << ' ' << var; + } + else + { + os << name; + if (! flag) + { + os << ' '; + const bool quoted = defaultValue. empty () || contains (defaultValue, ' '); + if (quoted) + os << '\"'; + os << defaultValue; + if (quoted) + os << '\"'; + } + } +} + + + +string Application::Key::getShortHelp () const +{ + string acronymS; + if (acronym) + { + acronymS = string ("-") + acronym; + if (! flag) + acronymS += " " + var; + acronymS += ", "; + } + return acronymS + str (); } void Application::addKey (const string &name, const string &argDescription, - const string &defaultValue) + const string &defaultValue, + char acronym, + const string &var) { - ASSERT (! contains (args, name)); - keys << Key (name, argDescription, defaultValue); - args [name] = & keys. back (); + ASSERT (! name. empty ()); + ASSERT (! contains (name2arg, name)); + if (acronym && contains (char2arg, acronym)) + throw logic_error ("Duplicate option " + strQuote (string (1, acronym))); + keys << Key (*this, name, argDescription, defaultValue, acronym, var); + name2arg [name] = & keys. back (); + if (acronym) + char2arg [acronym] = & keys. back (); } void Application::addFlag (const string &name, - const string &argDescription) + const string &argDescription, + char acronym) { - ASSERT (! contains (args, name)); - keys << Key (name, argDescription); - args [name] = & keys. back (); + ASSERT (! name. empty ()); + ASSERT (! contains (name2arg, name)); + if (acronym && contains (char2arg, acronym)) + throw logic_error ("Duplicate option " + strQuote (string (1, acronym))); + keys << Key (*this, name, argDescription, acronym); + name2arg [name] = & keys. back (); + if (acronym) + char2arg [acronym] = & keys. back (); } @@ -1695,18 +2172,90 @@ void Application::addFlag (const string &name, void Application::addPositional (const string &name, const string &argDescription) { - ASSERT (! contains (args, name)); + ASSERT (! contains (name2arg, name)); + IMPLY (gnu, isUpper (name)); positionals << Positional (name, argDescription); - args [name] = & positionals. back (); + name2arg [name] = & positionals. back (); +} + + + +Application::Key* Application::getKey (const string &keyName) const +{ + if (! contains (name2arg, keyName)) + errorExitStr ("Unknown key: " + strQuote (keyName) + "\n\n" + getInstruction ()); + + Key* key = nullptr; + if (const Arg* arg = findPtr (name2arg, keyName)) + key = var_cast (arg->asKey ()); + if (! key) + errorExitStr (strQuote (keyName) + " is not a key\n\n" + getInstruction ()); + + return key; +} + + + +void Application::setPositional (List::iterator &posIt, + const string &value) +{ + if (posIt == positionals. end ()) + { + if (isLeft (value, "-")) + errorExitStr (strQuote (value) + " is not a valid option\n\n" + getInstruction ()); + else + errorExitStr (strQuote (value) + " cannot be a positional parameter\n\n" + getInstruction ()); + } + (*posIt). value = value; + posIt++; } +void Application::setRequiredGroup (const string &keyName, + const string &requiredGroup) +{ + ASSERT (! requiredGroup. empty ()); + getKey (keyName) -> requiredGroup = requiredGroup; +} + + + +void Application::qc () const +{ + if (! qc_on) + return; + + QC_ASSERT (! description. empty ()); + QC_ASSERT (! version. empty ()); + QC_IMPLY (! needsArg, positionals. empty ()); + + for (const Positional& p : positionals) + p. qc (); + + map requiredGroups; + string requiredGroup_prev; + for (const Key& key : keys) + { + key. qc (); + if (! key. requiredGroup. empty () && key. requiredGroup != requiredGroup_prev) + { + QC_ASSERT (! contains (requiredGroups, key. requiredGroup)); + requiredGroups [key. requiredGroup] ++; + } + requiredGroup_prev = key. requiredGroup; + } + for (const auto& it : requiredGroups) + QC_ASSERT (it. second > 1); +} + + + string Application::getArg (const string &name) const { - if (contains (args, name)) - return args. at (name) -> value; - throw runtime_error ("Parameter \"" + name + "\" is not found"); + if (contains (name2arg, name)) + return name2arg. at (name) -> value; + throw logic_error ("Parameter " + strQuote (name) + " is not found"); } @@ -1714,24 +2263,66 @@ string Application::getArg (const string &name) const bool Application::getFlag (const string &name) const { const string value (getArg (name)); - const Key* key = args. at (name) -> asKey (); + const Key* key = name2arg. at (name) -> asKey (); if (! key || ! key->flag) - throw runtime_error ("Parameter \"" + name + "\" is not a flag"); + throw logic_error ("Parameter " + strQuote (name) + " is not a flag"); return value == "true"; } +string Application::key2shortHelp (const string &name) const +{ + if (! contains (name2arg, name)) + throw logic_error ("Parameter " + strQuote (name) + " is not found"); + const Arg* arg = name2arg. at (name); + ASSERT (arg); + if (const Key* key = arg->asKey ()) + return key->getShortHelp (); + throw logic_error ("Parameter " + strQuote (name) + " is not a key"); +} + + + string Application::getInstruction () const { string instr (description); - instr += "\nUsage: " + programName; + instr += "\n\nUSAGE: " + programName; + for (const Positional& p : positionals) instr += " " + p. str (); + + string requiredGroup_prev; for (const Key& key : keys) - instr += " " + key. str (); + { + if (key. requiredGroup == requiredGroup_prev) + if (key. requiredGroup. empty ()) + instr += " "; + else + instr += " | "; + else + if (requiredGroup_prev. empty ()) + instr += " ("; + else + { + instr += ") "; + if (! key. requiredGroup. empty ()) + instr += "("; + } + if (key. requiredGroup. empty ()) + instr += "["; + instr += key. str (); + if (key. requiredGroup. empty ()) + instr += "]"; + requiredGroup_prev = key. requiredGroup; + } + if (! requiredGroup_prev. empty ()) + instr += ")"; - instr += string ("\n") + "Help: " + programName + " -help|-h"; +//if (! contains (name2arg, "help")) + instr += "\nHELP: " + programName + " " + ifS (gnu, "-") + "-" + helpS; +//if (! contains (name2arg, "version")) + instr += "\nVERSION: " + programName + " " + ifS (gnu, "-") + "-" + versionS; return instr; } @@ -1741,12 +2332,26 @@ string Application::getInstruction () const string Application::getHelp () const { string instr (getInstruction ()); - instr += "\nParameters:"; - const string par ("\n "); - for (const Positional& p : positionals) - instr += par + p. str () + ": " + p. description; - for (const Key& key : keys) - instr += par + key. str () + ": " + key. description;; + + const string par ("\n "); + + if (! positionals. empty ()) + { + instr += "\n\nOBLIGATORY PARAMETERS:"; + for (const Positional& p : positionals) + instr += "\n" + p. str () + par + p. description; + } + + if (! keys. empty ()) + { + instr += "\n\nOPTIONAL PARAMETERS:"; + for (const Key& key : keys) + { + instr += "\n" + key. getShortHelp () + par + key. description; + if (gnu && ! key. defaultValue. empty ()) + instr += par + "Default: " + key. defaultValue; + } + } return instr; } @@ -1757,118 +2362,219 @@ int Application::run (int argc, const char* argv []) { ASSERT (programArgs. empty ()); + + + addDefaultArgs (); + + try { + // programArgs for (int i = 0; i < argc; i++) - programArgs. push_back (argv [i]); + { + // gnu: -abc --> -a -b -c ?? + string key (argv [i]); + string value; + bool valueP = false; + if ( i + && ! key. empty () + && key [0] == '-' + ) + { + const size_t eqPos = key. find ('='); + if (eqPos != string::npos) + { + value = key. substr (eqPos + 1); + key = key. substr (0, eqPos); + valueP = true; + } + } + programArgs. push_back (key); + if (valueP) + programArgs. push_back (value); + } ASSERT (! programArgs. empty ()); - + + initEnvironment (); + + // positionals, keys - bool first = true; - posIt = positionals. begin (); - Key* key = nullptr; - Set keysRead; - for (string s : programArgs) + Set keysRead; { - if (first) - { - programName = rfindSplit (s, fileSlash); - ASSERT (! programName. empty ()); - } - else - { - if (! s. empty () && s [0] == '-') + bool first = true; + List::iterator posIt = positionals. begin (); + Key* key = nullptr; + for (string s : programArgs) + { + if (first) + { + programName = rfindSplit (s, fileSlash); + ASSERT (! programName. empty ()); + } + else if (key) + { + ASSERT (! key->flag); + key->value = s; + key = nullptr; + } + else if (! s. empty () && s [0] == '-') { - if (key) - errorExitStr ("Key with no value: " + key->name + "\n" + getInstruction ()); - const string name (s. substr (1)); - if ( name == "help" - || name == "h" - ) + + const string s1 (s. substr (1)); + if (s1. empty ()) + errorExitStr ("Dash with no key\n\n" + getInstruction ()); + + string name; + const char c = s1 [0]; // Valid if name.empty() + if (gnu) + if (c == '-') + { + name = s1. substr (1); + if (name. empty ()) + errorExitStr ("Dashes with no key\n\n" + getInstruction ()); + } + else + { + if (s1. size () != 1) + errorExitStr ("Single character expected: " + strQuote (s1) + "\n\n" + getInstruction ()); + } + else + name = s1; + + if (name == helpS /*&& ! contains (name2arg, helpS)*/) { cout << getHelp () << endl; return 0; } - if (! contains (args, name)) - errorExitStr ("Unknown key: " + name + "\n" + getInstruction ()); - key = const_cast (args [name] -> asKey ()); - if (! key) - errorExitStr (name + " is not a key\n" + getInstruction ()); - if (keysRead. contains (name)) - errorExitStr ("Parameter \"" + name + "\" is used more than once"); - else - keysRead << name; - if (key->flag) + if (name == versionS /*&& ! contains (name2arg, versionS)*/) { - key->value = "true"; - key = nullptr; + cout << version << endl; + return 0; } - } - else - if (key) + + // key + if (name. empty ()) { - ASSERT (! key->flag); - key->value = s; - key = nullptr; - } + ASSERT (gnu); + key = var_cast (findPtr (char2arg, c)); + } else + if (const Arg* arg = findPtr (name2arg, name)) + key = var_cast (arg->asKey ()); + + if (key) { - if (posIt == positionals. end ()) - errorExitStr ("Too many positional arguments\n" + getInstruction ()); - (*posIt). value = s; - posIt++; - } - } - first = false; - } - - - const string logFName = getArg ("log"); - ASSERT (! logPtr); - if (! logFName. empty ()) - logPtr = new ofstream (logFName, ios_base::app); - - if (getFlag ("qc")) - qc_on = true; + if (keysRead. contains (key)) + errorExitStr ("Parameter " + strQuote (key->name) + " is used more than once"); + else + keysRead << key; + + if (key->flag) + { + key->value = "true"; + key = nullptr; + } + } + else + setPositional (posIt, s); + } + else + setPositional (posIt, s); - Verbose vrb (str2 (getArg ("verbose"))); - - if (getFlag ("noprogress")) - Progress::disable (); - if (getFlag ("profile")) - Chronometer::enabled = true; - - seed_global = str2 (getArg ("seed")); - if (! seed_global) - throw runtime_error ("Seed cannot be 0"); + first = false; + } + if (key) + errorExitStr ("Key with no value: " + key->name + "\n\n" + getInstruction ()); - threads_max = str2 (getArg ("threads")); - if (! threads_max) - throw runtime_error ("Number of threads cannot be 0"); - if (threads_max > 1 && Chronometer::enabled) - throw runtime_error ("Cannot profile with threads"); - - const string jsonFName = getArg ("json"); - ASSERT (! jRoot); - if (! jsonFName. empty ()) - { - new JsonMap (); - ASSERT (jRoot); - } + if (programArgs. size () == 1 && (! positionals. empty () || needsArg)) + { + cout << getInstruction () << endl; + return 1; + } + + + if (posIt != positionals. end ()) + errorExitStr ("Too few positional parameters\n\n" + getInstruction ()); + } + + + for (Key& key : keys) + if (! keysRead. contains (& key)) + key. value = key. defaultValue; + + + { + map requiredGroup2names; + map requiredGroup2given; + for (const Key& key : keys) + if (! key. requiredGroup. empty ()) + { + if (! key. value. empty ()) + { + const StringVector& given = requiredGroup2given [key. requiredGroup]; + if (! given. empty ()) + throw runtime_error ("Parameter " + strQuote (key. name) + " is conflicting with " + given. toString (", ")); + requiredGroup2given [key. requiredGroup] << strQuote (key. name); + } + requiredGroup2names [key. requiredGroup] << strQuote (key. name); + } + for (const auto& it : requiredGroup2names) + if (requiredGroup2given [it. first]. empty ()) + throw runtime_error ("One of required parameters is missing: " + it. second. toString (", ")); + } - if (programArgs. size () == 1 && (! positionals. empty () || needsArg)) + string logFName; + string jsonFName; + if (gnu) { - cout << getInstruction () << endl; - return 1; + if (getFlag ("debug")) + qc_on = true; } - - if (posIt != positionals. end ()) - errorExitStr ("Too few positional arguments\n" + getInstruction ()); + else + { + logFName = getArg ("log"); + ASSERT (! logPtr); + if (! logFName. empty ()) + logPtr = new ofstream (logFName, ios_base::app); + + if (getFlag ("qc")) + qc_on = true; + + if (getFlag ("noprogress")) + Progress::disable (); + if (getFlag ("profile")) + Chronometer::enabled = true; + + seed_global = str2 (getArg ("seed")); + if (! seed_global) + throw runtime_error ("Seed cannot be 0"); + + jsonFName = getArg ("json"); + ASSERT (! jRoot); + if (! jsonFName. empty ()) + { + new JsonMap (); + ASSERT (jRoot); + } + + sigpipe = getFlag ("sigpipe"); + } + + + threads_max = str2 (getArg ("threads")); + if (! threads_max) + throw runtime_error ("Number of threads cannot be 0"); + if (threads_max > 1 && Chronometer::enabled) + throw runtime_error ("Cannot profile with threads"); + const Verbose vrb (gnu ? 0 : str2 (getArg ("verbose"))); + + + qc (); body (); @@ -1899,6 +2605,111 @@ int Application::run (int argc, +// ShellApplication + +ShellApplication::~ShellApplication () +{ + if (! qc_on && ! tmp. empty ()) + exec ("rm -fr " + tmp + "*"); +} + + + +void ShellApplication::initEnvironment () +{ + ASSERT (tmp. empty ()); + ASSERT (! programArgs. empty ()); + + // tmp + if (useTmp) + { + char templateS [] = {'/', 't', 'm', 'p', '/', 'X', 'X', 'X', 'X', 'X', 'X', '\0'}; + #if 0 + tmp = tmpnam (NULL); + #else + EXEC_ASSERT (mkstemp (templateS) != -1); + tmp = templateS; + #endif + if (tmp. empty ()) + throw runtime_error ("Cannot create a temporary file"); + } + + // execDir + execDir = getProgramDirName (); + if (execDir. empty ()) + execDir = which (programArgs. front ()); + ASSERT (isRight (execDir, "/")); + + + string execDir_ (execDir); + trimSuffix (execDir_, "/"); + for (Key& key : keys) + if (! key. flag) + replaceStr (key. defaultValue, "$BASE", execDir_); +} + + + +void ShellApplication::body () const +{ + if (useTmp && qc_on) + cout << tmp << endl; + shellBody (); +} + + + +string ShellApplication::which (const string &progName) const +{ + if (tmp. empty ()) + throw runtime_error ("Temporary file is needed"); + + try { exec ("which " + progName + " 1> " + tmp + ".src 2> /dev/null"); } + catch (const runtime_error &) + { return string (); } + LineInput li (tmp + ".src"); + const string s (li. getString ()); + return getDirName (s); +} + + + +void ShellApplication::findProg (const string &progName) const +{ + ASSERT (! progName. empty ()); + ASSERT (! contains (progName, '/')); + ASSERT (isRight (execDir, "/")); + + string dir; + if (! find (prog2dir, progName, dir)) + { + dir = fileExists (execDir + progName) + ? execDir + : which (progName); + if (dir. empty ()) + throw runtime_error ("Binary " + shellQuote (progName) + " is not found.\nPlease make sure that " + + shellQuote (progName) + " is in the same directory as " + shellQuote (Common_sp::programName) + " or is in your $PATH.");; + prog2dir [progName] = dir; + } + + ASSERT (isRight (dir, "/")); +} + + + +string ShellApplication::fullProg (const string &progName) const +{ + string dir; + if (! find (prog2dir, progName, dir)) + throw runtime_error ("Program " + strQuote (progName) + " is not found"); + ASSERT (isRight (dir, "/")); + return dir + progName + " "; +} + + + + + } diff --git a/common.hpp b/common.hpp index 9eca864..4bd3f2e 100644 --- a/common.hpp +++ b/common.hpp @@ -55,8 +55,9 @@ #pragma warning (default : 4005) #endif -#include +#include #include +#include #include #include #include @@ -66,21 +67,34 @@ #include #include #include +#include +#include #include #include +#include #include #include #include + #include -#include +#ifdef _MSC_VER + #pragma warning(push) + #pragma warning(disable:4265) +#endif +#include +#ifdef _MSC_VER + #pragma warning(pop) +#endif + +using namespace std; + namespace Common_sp { -using namespace std; - +typedef unsigned char uchar; typedef unsigned int uint; typedef unsigned long ulong; @@ -92,33 +106,39 @@ bool initCommon (); extern vector programArgs; extern string programName; + +string getCommandLine (); + extern ostream* logPtr; extern bool qc_on; -extern size_t threads_max; - // >= 1 + extern ulong seed_global; // >= 1 +// thread +extern size_t threads_max; + // >= 1 +extern thread::id main_thread_id; +bool isMainThread (); + -void errorExit (const char* msg, - bool segmFault = false); +[[noreturn]] void errorExit (const char* msg, + bool segmFault = false); // Input: programArgs, programName, logptr // Update: *logPtr // Invokes: if segmFault then abort() else exit(1) -inline void errorExitStr (const string &msg) +[[noreturn]] inline void errorExitStr (const string &msg) { errorExit (msg. c_str ()); } - struct Nocopy { protected: - Nocopy () - {} + Nocopy () = default; Nocopy (const Nocopy &) = delete; - Nocopy (Nocopy&&) = delete; + Nocopy (Nocopy &&) = delete; Nocopy& operator= (const Nocopy &) = delete; }; @@ -153,7 +173,7 @@ class ONumber struct Chronometer : Nocopy // Requires: no thread is used { - static constexpr clock_t noclock {-1}; + static constexpr clock_t noclock {(clock_t) -1}; static bool enabled; const string name; clock_t time {0}; @@ -218,6 +238,13 @@ struct Chronometer_OnePass : Nocopy +template + inline T* var_cast (const T* t) + { return const_cast (t); } +template + inline T& var_cast (const T &t) + { return const_cast (t); } + template inline T const_static_cast (const S* s) { return static_cast (const_cast (s)); } @@ -365,6 +392,12 @@ inline bool isLower (char c) inline bool printable (char c) { return between (c, ' ', (char) 127); } + +inline bool isDelimiter (char c) + { return c != ' ' + && printable (c) + && ! isLetter (c); + } @@ -382,7 +415,7 @@ struct Iter : Nocopy typename T::iterator it; - Iter (T &t_arg) + explicit Iter (T &t_arg) : t (t_arg) , itNext (t_arg. begin ()) {} @@ -414,7 +447,6 @@ struct Iter : Nocopy - template inline ostream& operator<< (ostream &os, const pair &p) @@ -435,9 +467,16 @@ struct Pair : pair const T &b) : P (a, b) {} - Pair () - : P () + Pair (T &&a, + T &&b) + : P (move (a), move (b)) + {} + Pair () = default; + Pair (Pair &&other) + : P (move (other. first), move (other. second)) {} + Pair (const Pair &other) = default; + Pair& operator= (const Pair &other) = default; bool same () const { return P::first == P::second; } @@ -467,7 +506,7 @@ template inline void insertIter (To &to, const From &from) { for (const auto x : from) - to << x; + to << move (x); } template @@ -482,7 +521,9 @@ template template bool intersects (const T &t, const U &u) - { for (const auto x : t) + { if (u. empty ()) + return false; + for (const auto x : t) if (u. find (static_cast (x)) != u. end ()) return true; return false; @@ -493,13 +534,44 @@ template const KeyParent& key) { return m. find (key) != m. end (); } +template + inline bool contains (const unordered_map &m, + const KeyParent& key) + { return m. find (key) != m. end (); } + template bool find (const map &m, const KeyParent& key, Value &value) // Return: success // Output: value, if Return - { const auto it = m. find (key); + { const auto& it = m. find (key); + if (it == m. end ()) + return false; + value = it->second; + return true; + } + +template + bool find (const unordered_map &m, + const KeyParent& key, + Value &value) + // Return: success + // Output: value, if Return + { const auto& it = m. find (key); + if (it == m. end ()) + return false; + value = it->second; + return true; + } + +template + bool find (const unordered_map &m, + const KeyParent& key, + Value &value) + // Return: success + // Output: value, if Return + { const auto& it = m. find (key); if (it == m. end ()) return false; value = it->second; @@ -509,7 +581,16 @@ template template const Value* findPtr (const map &m, const KeyParent& key) - { const auto it = m. find (key); + { const auto& it = m. find (key); + if (it == m. end ()) + return nullptr; + return & it->second; + } + +template + const Value* findPtr (const unordered_map &m, + const KeyParent& key) + { const auto& it = m. find (key); if (it == m. end ()) return nullptr; return & it->second; @@ -524,6 +605,15 @@ template return nullptr; } +template + const Value* findPtr (const unordered_map &m, + const KeyParent& key) + { const Value* value; + if (find (m, key, value)) + return value; + return nullptr; + } + template const Value& findMake (map &m, const Key& key) @@ -532,6 +622,14 @@ template return * m [key]; } +template + const Value& findMake (unordered_map &m, + const Key& key) + { if (! contains (m, key)) + m [key] = new Value (); + return * m [key]; + } + template inline long count_if (T &t, UnaryPredicate pred) { return std::count_if (t. begin (), t. end (), pred); } @@ -549,7 +647,16 @@ template -// +struct Istringstream : istringstream +{ + Istringstream () = default; + void reset (const string &s) + { clear (); + str (s); + } +}; + + template struct List : list @@ -559,10 +666,9 @@ struct List : list public: - List () - {} + List () = default; template */> - explicit List (const list &other) + List (const list &other) { *this << other; } template */> explicit List (const vector &other) @@ -641,6 +747,16 @@ inline string nvl (const string& s, const string& nullS = "-") { return s. empty () ? nullS : s; } +inline bool isQuoted (const string &s, + char quote = '\"') + { return ! s. empty () && s [0] == quote && s [s. size () - 1] == quote; } + +string strQuote (const string &s, + char quote = '\"'); + +inline string unQuote (const string &s) + { return s. substr (1, s. size () - 2); } + inline string prepend (const string &prefix, const string &s) { if (s. empty ()) @@ -648,15 +764,6 @@ inline string prepend (const string &prefix, return prefix + s; } -inline bool isQuoted (const string &s) - { return ! s. empty () && s [0] == '\"' && s [s. size () - 1] == '\"'; } - -inline string strQuote (const string &s) - { return "\"" + s + "\""; } - -inline string unQuote (const string &s) - { return s. substr (1, s. size () - 2); } - inline bool isSpace (char c) { return c > '\0' && c <= ' ' && isspace (c); } @@ -673,12 +780,13 @@ template T str2 (const string &s) { static_assert (numeric_limits::max() > 256, "str2 does not work on chars"); T i; - istringstream iss (s); + istringstream iss (s); iss >> i; if ( Common_sp::strBlank (s) || ! iss. eof () + || iss. fail () ) - throw runtime_error ("Converting \"" + s + "\""); + throw runtime_error ("Converting " + strQuote (s)); return i; } @@ -713,6 +821,8 @@ bool trimTailAt (string &s, bool goodName (const string &name); +bool isIdentifier (const string& name); + void strUpper (string &s); void strLower (string &s); @@ -721,6 +831,12 @@ bool isUpper (const string &s); bool isLower (const string &s); +inline string strUpper1 (const string &s) + { if (s. empty ()) + return s; + return toUpper (s [0]) + s. substr (1); + } + inline bool charInSet (char c, const string &charSet) { return charSet. find (c) != string::npos; } @@ -739,6 +855,23 @@ void trimLeading (string &s); void trimTrailing (string &s); +inline void trim (string &s) + { trimTrailing (s); + trimLeading (s); + } + +void trimLeading (string &s, + char c); + +void trimTrailing (string &s, + char c); + +inline void trim (string &s, + char c) + { trimTrailing (s, c); + trimLeading (s, c); + } + inline bool contains (const string &hay, const string &needle) { return hay. find (needle) != string::npos; } @@ -751,11 +884,6 @@ size_t containsWord (const string& hay, const string& needle); // Return: position of needle in hay; string::npos if needle does not exist -inline void trim (string &s) - { trimTrailing (s); - trimLeading (s); - } - void replace (string &s, char from, char to); @@ -767,12 +895,14 @@ void replace (string &s, void replaceStr (string &s, const string &from, const string &to); + // Replaces "from" by "to" in s from left to right + // The replacing "to" is skipped if it contains "from" + // Requires: !from.empty() string to_c (const string &s); // " --> \", etc. void collapseSpace (string &s); - string str2streamWord (const string &s, size_t wordNum); @@ -795,6 +925,8 @@ string rfindSplit (string &s, // Return: suffix of c+s after c // Update: s +void reverse (string &s); + List str2list (const string &s, char c = ' '); // Invokes: findSplit(s,c) @@ -819,17 +951,49 @@ inline string getFileName (const string &path) return path. substr (pos + 1); } +inline string getDirName (const string &path) + { const size_t pos = path. rfind ('/'); + if (pos == string::npos) + return string (); + return path. substr (0, pos + 1); + } + inline bool isDirName (const string &path) { return isRight (path, "/"); } bool fileExists (const string &fName); - #ifndef _MSC_VER -bool directoryExists (const string &dirName); + bool directoryExists (const string &dirName); #endif +struct Dir +{ + List items; + // Simplified: contains no redundant "", ".", ".." + // items.front().empty (): root + + explicit Dir (const string &name); + + string get () const + { return items. empty () + ? string (1, '.') + : nvl (list2str (items, string (1, fileSlash)), string (1, fileSlash)); + } + string getParent () const + { if (items. empty ()) + return ".."; + if (items. size () == 1 && items. front (). empty ()) + throw runtime_error ("Cannot get the parent directory of the root"); + const Dir parent (get () + "/.."); + return parent. get (); + } +}; + + +streampos getFileSize (const string &fName); + size_t strMonth2num (const string& month); // Input: month: "Jan", "Feb", ... (3 characters) @@ -857,7 +1021,7 @@ inline void pressAnyKey () inline streamsize double2decimals (double r) - { return r ? (streamsize) max (0, (long) (ceil (- log10 (abs (r)) + 1))) : 0; } + { return r ? (streamsize) max (0, (long) (ceil (- log10 (fabs (r)) + 1))) : 0; } @@ -876,6 +1040,7 @@ struct Rand { seed = (long) (seed_arg % (ulong) max_); qc (); } + // Input: seed_arg > 0 ulong get (ulong max); // Return: in 0 .. max - 1 @@ -928,6 +1093,13 @@ template bool verbose (int inc = 0); +int getVerbosity (); + + + +void exec (const string &cmd, + const string &logFName = string()); + // Threads @@ -938,35 +1110,35 @@ struct Threads : Singleton private: static size_t threadsToStart; vector threads; + const bool quiet; public: - explicit Threads (size_t threadsToStart_arg) - { if (! empty ()) - throw logic_error ("Previous threads have not finished"); - threadsToStart = threadsToStart_arg; - if (threadsToStart >= threads_max) - throw logic_error ("Too many threads to start"); - threads. reserve (threadsToStart); - if (verbose (1)) - cout << "# Threads started: " << threadsToStart + 1 << endl; - } - ~Threads () - { for (auto& t : threads) - t. join (); - threads. clear (); - threadsToStart = 0; - if (verbose (1)) - cout << "Threads finished" << endl; - } + explicit Threads (size_t threadsToStart_arg, + bool quiet_arg = false); + ~Threads (); static bool empty () { return ! threadsToStart; } + size_t getAvailable () const + { return threadsToStart < threads. size () ? 0 : (threadsToStart - threads. size ()); } Threads& operator<< (thread &&t) - { if (threads. size () >= threadsToStart) + { if (! getAvailable ()) throw logic_error ("Too many threads created"); - threads. push_back (move (t)); + try { threads. push_back (move (t)); } + catch (const exception &e) + { throw runtime_error (string ("Cannot start thread\n") + e. what ()); } return *this; } + bool exec (const string cmd, + size_t cmdThreads = 1) + { if (cmdThreads && getAvailable () >= cmdThreads) + { *this << thread (Common_sp::exec, cmd, string ()); + threadsToStart -= cmdThreads - 1; + return true; + } + Common_sp::exec (cmd); + return false; + } }; @@ -976,7 +1148,7 @@ template size_t i_max, vector &results, Args&&... args) - // Input: func (size_t from, size_t to, Res& res, Args...) + // Input: void func (size_t from, size_t to, Res& res, Args...) // Optimial thread_num minimizes (Time_SingleCPU/thread_num + Time_openCloseThread * (thread_num - 1)), which is sqrt(Time_SingleCPU/Time_openCloseThread) { ASSERT (threads_max >= 1); @@ -1025,7 +1197,7 @@ class Verbose ~Verbose (); static bool enabled () - { return Threads::empty (); } + { return isMainThread () /*Threads::empty ()*/; } }; struct Unverbose @@ -1049,20 +1221,19 @@ class Notype {}; struct Root { protected: - Root () - {} + Root () = default; public: - virtual ~Root () throw (logic_error) + virtual ~Root () {} // A desrtructor should be virtual to be automatically invoked by a descendant class destructor virtual Root* copy () const - { NOT_IMPLEMENTED; return nullptr; } + { throw logic_error ("Root::copy() is not implemented"); } // Return: the same type virtual void qc () const {} // Input: qc_on virtual void saveText (ostream& /*os*/) const - { NOT_IMPLEMENTED; } + { throw logic_error ("Root::saveText() is not implemented"); } // Parsable output void saveFile (const string &fName) const; // if fName.empty() then do nothing @@ -1077,14 +1248,14 @@ struct Root // Human-friendly virtual Json* toJson (JsonContainer* /*parent_arg*/, const string& /*name_arg*/) const - { NOT_IMPLEMENTED; return nullptr; } + { throw logic_error ("Root::toJson() is not implemented"); } virtual bool empty () const - { NOT_IMPLEMENTED; return false; } + { return true; } virtual void clear () - { NOT_IMPLEMENTED; } + {} // Postcondition: empty() virtual void read (istream &/*is*/) - { NOT_IMPLEMENTED; } + { throw logic_error ("Root::read() is not implemented"); } // Input: a line of is }; @@ -1099,19 +1270,19 @@ inline ostream& operator<< (ostream &os, template - struct AutoPtr : unique_ptr + struct AutoPtr : unique_ptr { private: typedef unique_ptr P; public: - explicit AutoPtr (T* t = nullptr) throw () + explicit AutoPtr (T* t = nullptr) : P (t) {} AutoPtr (const AutoPtr &t) : P (t. copy ()) {} - AutoPtr (AutoPtr &t) + AutoPtr (AutoPtr &&t) : P (t. release ()) {} AutoPtr& operator= (T* t) @@ -1122,7 +1293,7 @@ template { P::reset (t. copy ()); return *this; } - AutoPtr& operator= (AutoPtr &t) + AutoPtr& operator= (AutoPtr &&t) { P::reset (t. release ()); return *this; } @@ -1137,11 +1308,12 @@ struct Named : Root string name; // !empty(), no spaces at the ends, printable ASCII characeters - Named () - {} - explicit Named (const string &name_arg); + Named () = default; + explicit Named (const string &name_arg) + : name (name_arg) + {} Named* copy () const override - { return new Named (*this); } + { return new Named (*this); } void qc () const override; void saveText (ostream& os) const override { os << name; } @@ -1182,8 +1354,7 @@ struct Vector : vector bool searchSorted {false}; - Vector () - {} + Vector () = default; explicit Vector (size_t n, const T &value = T ()) : P (n, value) @@ -1191,28 +1362,34 @@ struct Vector : vector explicit Vector (initializer_list init) : P (init) {} - Vector (const vector &other) + explicit Vector (const vector &other) : P (other) {} - Vector (const Vector &other) - : P (other) - , searchSorted (other. searchSorted) - {} - Vector& operator= (const Vector &other) - { searchSorted = other. searchSorted; - P::reserve (other. size ()); - P::operator= (other); - return *this; - } + typename P::reference operator[] (size_t index) + { + #ifndef NDEBUG + if (index >= P::size ()) + throw range_error ("Vector assignment to index = " + toString (index) + ", but size = " + toString (P::size ())); + #endif + return P::operator[] (index); + } + typename P::const_reference operator[] (size_t index) const + { + #ifndef NDEBUG + if (index >= P::size ()) + throw range_error ("Vector reading of index = " + toString (index) + ", but size = " + toString (P::size ())); + #endif + return P::operator[] (index); + } void reserveInc (size_t inc) { P::reserve (P::size () + inc); } bool find (const T &value, size_t &index) const // Output: index: valid if (bool)Return { for (index = 0; index < P::size (); index++) - if (P::operator[] (index) == value) + if ((*this) [index] == value) return true; return false; } @@ -1236,6 +1413,15 @@ struct Vector : vector } bool contains (const T &value) const { return constFind (value) != P::end (); } + size_t indexOf (const T &value) const + { size_t n = 0; + for (const T& t : *this) + if (value == t) + return n; + else + n++; + return NO_INDEX; + } size_t countValue (const T &value) const { size_t n = 0; for (const T& t : *this) @@ -1289,20 +1475,22 @@ struct Vector : vector { const size_t j = P::size () - 1 - i; if (i >= j) break; - swap (P::operator[] (i), P::operator[] (j)); + swap ((*this) [i], (*this) [j]); } searchSorted = false; } - void randomOrder (ulong seed) - { Rand rand (seed); + void randomOrder () + { Rand rand (seed_global); for (T &t : *this) - swap (t, P::operator[] ((size_t) rand. get ((ulong) P::size ()))); + swap (t, (*this) [(size_t) rand. get ((ulong) P::size ())]); searchSorted = false; } T pop (size_t n = 1) { T t = T (); while (n) - { t = P::operator[] (P::size () - 1); + { if (P::empty ()) + throw runtime_error ("Cannot pop an empty vector"); + t = (*this) [P::size () - 1]; P::pop_back (); n--; } @@ -1364,8 +1552,8 @@ struct Vector : vector return; FOR_START (size_t, i, 1, P::size ()) FOR_REV (size_t, j, i) - if (P::operator[] (j + 1) > P::operator[] (j)) - swap (P::operator[] (j), P::operator[] (j + 1)); + if ((*this) [j + 1] > (*this) [j]) + swap ((*this) [j], (*this) [j + 1]); else break; searchSorted = true; @@ -1384,16 +1572,16 @@ struct Vector : vector size_t lo = 0; // vec.at(lo) <= value size_t hi = P::size () - 1; // lo <= hi - if (value < P::operator[] (lo)) + if (value < (*this) [lo]) return exact ? NO_INDEX : lo; - if (P::operator[] (hi) < value) + if ((*this) [hi] < value) return NO_INDEX; // at(lo) <= value <= at(hi) for (;;) { const size_t m = (lo + hi) / 2; - if ( P::operator[] (m) == value - || P::operator[] (m) < value + if ( (*this) [m] == value + || (*this) [m] < value ) if (lo == m) // hi in {lo, lo + 1} break; @@ -1402,9 +1590,9 @@ struct Vector : vector else hi = m; } - if (P::operator[] (lo) == value) + if ((*this) [lo] == value) return lo; - if (! exact || P::operator[] (hi) == value) + if (! exact || (*this) [hi] == value) return hi; return NO_INDEX; } @@ -1429,15 +1617,35 @@ struct Vector : vector return false; return true; } + template + bool containsFastAll (const unordered_set &other) const + { if (other. size () > P::size ()) + return false; + for (const U& u : other) + if (! containsFast (u)) + return false; + return true; + } + template + bool containsFastAll (const unordered_map &other) const + { if (other. size () > P::size ()) + return false; + for (const auto& it : other) + if (! containsFast (it. first)) + return false; + return true; + } template bool intersectsFast (const Vector &other) const - { for (const U& u : other) + { if (P::empty ()) + return false; + for (const U& u : other) if (containsFast (u)) return true; return false; } template - bool intersectsFast2 (const Vector &other) const + bool intersectsFast_merge (const Vector &other) const { checkSorted (); other. checkSorted (); size_t i = 0; @@ -1452,6 +1660,15 @@ struct Vector : vector } return false; } + template + bool intersects (const set &other) const + { if (other. empty ()) + return false; + for (const T& t : *this) + if (other. find (t) != other. end ()) + return true; + return false; + } template void setMinus (const Vector &other) { filterIndex ([&] (size_t i) { return other. containsFast ((*this) [i]); }); } @@ -1504,8 +1721,8 @@ struct Vector : vector bool operator< (const Vector &other) const // Lexicographic comparison { FFOR (size_t, i, std::min (P::size (), other. size ())) - { if (P::operator[] (i) < other [i]) return true; - if (other [i] < P::operator[] (i)) return false; + { if ((*this) [i] < other [i]) return true; + if (other [i] < (*this) [i]) return false; } return P::size () < other. size (); } @@ -1521,46 +1738,51 @@ struct VectorPtr : Vector public: - VectorPtr () - {} + VectorPtr () = default; explicit VectorPtr (size_t n, const T* value = nullptr) : P (n, value) {} - VectorPtr (const VectorPtr &other) - : P (other) - {} explicit VectorPtr (initializer_list init) : P (init) {} template - VectorPtr (const vector &other) + explicit VectorPtr (const vector &other) : P () { P::reserve (other. size ()); insertAll (*this, other); } template - VectorPtr (const list &other) + explicit VectorPtr (const list &other) : P () { P::reserve (other. size ()); insertAll (*this, other); } + VectorPtr& operator<< (const T* value) + { P::operator<< (value); + return *this; + } + template */> + VectorPtr& operator<< (const VectorPtr &other) + { P::operator<< (other); + return *this; + } void deleteData () { for (const T* t : *this) delete t; P::clear (); } void erasePtr (size_t index) - { delete P::operator[] (index); + { delete (*this) [index]; P::eraseAt (index); } void sortBubblePtr () { FOR_START (size_t, i, 1, P::size ()) FOR_REV (size_t, j, i) - if (* P::operator[] (j + 1) > * P::operator[] (j)) - std::swap (P::operator[] (j), P::operator[] (j + 1)); + if (* (*this) [j + 1] > * (*this) [j]) + std::swap ((*this) [j], (*this) [j + 1]); else break; } @@ -1575,24 +1797,35 @@ struct VectorOwn : VectorPtr typedef VectorPtr P; public: - VectorOwn () - {} - VectorOwn (const VectorOwn &x) + VectorOwn () = default; + VectorOwn (const VectorOwn &other) : P () - { *this = x; } - VectorOwn& operator= (const VectorOwn &x) + { *this = other; } + VectorOwn& operator= (const VectorOwn &other) { P::deleteData (); - P::reserve (x. size ()); - for (const T* t : x) + P::reserve (other. size ()); + for (const T* t : other) P::push_back (static_cast (t->copy ())); - P::searchSorted = x. searchSorted; + P::searchSorted = other. searchSorted; return *this; } - VectorOwn (const VectorPtr &x) + VectorOwn (VectorOwn&&) = default; + VectorOwn& operator= (VectorOwn&&) = default; + explicit VectorOwn (const VectorPtr &other) : P () - { P::operator= (x); } + { P::operator= (other); } ~VectorOwn () { P::deleteData (); } + + VectorOwn& operator<< (const T* value) + { P::operator<< (value); + return *this; + } + template */> + VectorOwn& operator<< (const VectorPtr &other) + { P::operator<< (other); + return *this; + } }; @@ -1604,13 +1837,14 @@ struct StringVector : Vector public: - StringVector () - {} - StringVector (initializer_list init) + StringVector () = default; + explicit StringVector (initializer_list init) : P (init) {} StringVector (const string &fName, size_t reserve_size); + explicit StringVector (const string &s, + char c = ' '); string toString (const string& sep) const @@ -1793,7 +2027,7 @@ struct Heap : Root, Nocopy const string& s2_ = * static_cast (s2); if (s1_ < s2_) return -1; if (s1_ > s2_) return 1; - return 0; + return 0; } }; @@ -1809,30 +2043,40 @@ struct Set : set // true => empty() - Set () - {} + Set () = default; explicit Set (bool universal_arg) : universal (universal_arg) {} - Set (const Set &other) - : P () - { *this = other; } - Set& operator= (const Set &other) - { universal = other. universal; - return static_cast &> (P::operator= (other)); - } template - Set (const map &other) - : universal (false) - { for (const auto it : other) + explicit Set (const map &other) + { operator= (other); } + template + explicit Set (const unordered_map &other) + { operator= (other); } + template + explicit Set (const vector &other) + { operator= (other); } + template + Set& operator= (const map &other) + { universal = false; + for (const auto& it : other) P::insert (it. first); - } + return *this; + } + template + Set& operator= (const unordered_map &other) + { universal = false; + for (const auto& it : other) + P::insert (it. first); + return *this; + } template - Set (const vector &other) - : universal (false) - { for (const U u : other) + Set& operator= (const vector &other) + { universal = false; + for (const U& u : other) P::insert (u); - } + return *this; + } bool operator== (const Set &other) const { return universal ? other. universal ? true : false @@ -1875,7 +2119,9 @@ struct Set : set } private: bool intersects_ (const Set &other) const - { for (const T& t : *this) + { if (other. empty ()) + return false; + for (const T& t : *this) if (other. contains (t)) return true; return false; @@ -1901,7 +2147,7 @@ struct Set : set template void insertAll (const From &from) { P::insert (from. begin (), from. end ()); } - bool checkUnique (const T& el) + bool addUnique (const T& el) { if (contains (el)) return false; operator<< (el); @@ -1914,10 +2160,39 @@ struct Set : set { operator= (other); return; } - for (Iter > iter (*this); iter. next (); ) + for (Iter> iter (*this); iter. next (); ) if (! other. contains (*iter)) iter. erase (); } + void intersect (const Vector &other) + { if (universal) + { operator= (other); + return; + } + for (Iter> iter (*this); iter. next (); ) + if (! other. containsFast (*iter)) + iter. erase (); + } + template + void intersect (const map &other) + { if (universal) + { operator= (other); + return; + } + for (Iter> iter (*this); iter. next (); ) + if (! Common_sp::contains (other, *iter)) + iter. erase (); + } + template + void intersect (const unordered_map &other) + { if (universal) + { operator= (other); + return; + } + for (Iter> iter (*this); iter. next (); ) + if (! Common_sp::contains (other, *iter)) + iter. erase (); + } size_t intersectSize (const Set &other) const // Return: universal <=> SIZE_MAX { if (other. universal) @@ -1925,9 +2200,10 @@ struct Set : set if (universal) return other. size (); size_t n = 0; - for (const T& t : *this) - if (other. contains (t)) - n++; + if (! other. empty ()) + for (const T& t : *this) + if (other. contains (t)) + n++; return n; } size_t setMinus (const Set &other) @@ -1982,18 +2258,18 @@ struct Progress : Nocopy static size_t beingUsed; public: - size_t n_max; + size_t n_max {0}; // 0 <=> unknown bool active; size_t n {0}; string step; - size_t displayPeriod; + size_t displayPeriod {0}; explicit Progress (size_t n_max_arg = 0, size_t displayPeriod_arg = 1) : n_max (n_max_arg) - , active (enabled () && displayPeriod_arg) + , active (enabled () && displayPeriod_arg && (! n_max_arg || displayPeriod_arg <= n_max_arg)) , displayPeriod (displayPeriod_arg) { if (active) beingUsed++; @@ -2020,21 +2296,16 @@ struct Progress : Nocopy report (); } private: - void report () const - { cerr << '\r'; - #ifndef _MSC_VER - cerr << "\33[2K"; - #endif - cerr << n; - if (n_max) - cerr << " / " << n_max; - if (! step. empty ()) - cerr << ' ' << step; - cerr << ' '; - } + void report () const; public: + void reset () + { n = 0; + step. clear (); + } static void disable () { beingUsed++; } + static bool isUsed () + { return beingUsed; } static bool enabled () { return ! beingUsed && verbose (1); } }; @@ -2047,8 +2318,10 @@ struct Progress : Nocopy struct Input : Root, Nocopy { protected: - AutoPtr buf; + unique_ptr buf; ifstream ifs; + istream* is {nullptr}; + // !nullptr public: bool eof {false}; uint lineNum {0}; @@ -2062,6 +2335,13 @@ struct Input : Root, Nocopy Input (const string &fName, size_t bufSize, uint displayPeriod); + Input (istream &is_arg, + uint displayPeriod); +public: + + + void reset (); + // Update: ifs }; @@ -2070,25 +2350,31 @@ struct LineInput : Input { string line; // Current line + string commentStart; explicit LineInput (const string &fName, size_t bufSize = 100 * 1024, uint displayPeriod = 0) : Input (fName, bufSize, displayPeriod) + {} + explicit LineInput (istream &is_arg, + uint displayPeriod = 0) + : Input (is_arg, displayPeriod) {} bool nextLine (); // Output: eof, line // Update: lineNum + // Invokes: trimTrailing() bool expectPrefix (const string &prefix, bool eofAllowed) { if (nextLine () && trimPrefix (line, prefix)) return true; if (eof && eofAllowed) return false; - throw runtime_error ("No \"" + prefix + "\""); + throw runtime_error ("No " + strQuote (prefix)); } string getString () { string s; @@ -2116,6 +2402,10 @@ struct ObjectInput : Input size_t bufSize = 100 * 1024, uint displayPeriod = 0) : Input (fName, bufSize, displayPeriod) + {} + explicit ObjectInput (istream &is_arg, + uint displayPeriod = 0) + : Input (is_arg, displayPeriod) {} @@ -2128,12 +2418,12 @@ struct ObjectInput : Input struct CharInput : Input { - uint charNum; + uint charNum {(uint) -1}; // In the current line - bool eol; + bool eol {false}; // eof => eol private: - bool ungot; + bool ungot {false}; public: @@ -2141,11 +2431,12 @@ struct CharInput : Input size_t bufSize = 100 * 1024, uint displayPeriod = 0) : Input (fName, bufSize, displayPeriod) - , charNum ((uint) -1) - , eol (false) - , ungot (false) {} - + explicit CharInput (istream &is_arg, + uint displayPeriod = 0) + : Input (is_arg, displayPeriod) + {} + char get (); // Output: eof @@ -2157,14 +2448,15 @@ struct CharInput : Input struct Error : runtime_error - { explicit Error (const CharInput &in, - const string &expected = string ()) - : runtime_error ("Error at line " + toString (in. lineNum + 1) - + ", pos. " + toString (in. charNum + 1) - + (expected. empty () ? string () : (": " + expected + " is expected")) - ) - {} - }; + { Error (const CharInput &in, + const string &what_arg, + bool expected = true) + : runtime_error ("Error at line " + toString (in. lineNum + 1) + + ", pos. " + toString (in. charNum + 1) + + (what_arg. empty () ? string () : (": " + what_arg + ifS (expected, " is expected"))) + ) + {} + }; }; @@ -2173,72 +2465,73 @@ struct PairFile : Root { private: LineInput f; + Istringstream iss; public: + bool sameAllowed {false}; + bool orderNames {false}; + // true => name1 < name2 string name1; string name2; - // name1 < name2 - explicit PairFile (const string &fName) + PairFile (const string &fName, + bool sameAllowed_arg, + bool orderNames_arg) : f (fName, 100 * 1024, 1000) // PAR + , sameAllowed (sameAllowed_arg) + , orderNames (orderNames_arg) {} - bool next () - { if (! f. nextLine ()) - return false; - istringstream iss (f. line); - iss >> name1 >> name2; - if (name2. empty ()) - throw runtime_error ("Bad request: '" + name1 + "' - '" + name2 + "'"); - if (name1 == name2) - throw runtime_error ("Same name: " + name1); - if (name1 > name2) - swap (name1, name2); - return true; - } + bool next (); }; struct Token : Root { - static constexpr char quote = '\"'; - static constexpr uint noNum = (uint) -1; - enum Type { eText // In quote's within one line - , eNumber // All characters: isDigit() - , eName // All characters: isLetter() - // Start: !isDigit() - , eDelimiter // Does not include ' ' + enum Type { eName + , eText + , eInteger + , eDouble + , eDelimiter }; - Type type; + Type type {eDelimiter}; string name; - // All characters: printable() + // ename => !empty(), printable() // eText => embracing quote's are removed - // May be: !goodName(name) - uint num; - // Valid if type = eNumber - uint charNum; - // First CharInput::charNum of name + char quote {'\0'}; + // eText => '\'' or '\"' + int n {0}; + double d {0.0}; + // Valid if eDouble + streamsize decimals {0}; + bool scientific {false}; + + uint charNum {(uint) -1}; + // = CharInput::charNum - Token () - { clear (); } + Token () = default; + explicit Token (const string& name_arg) + : type (eName) + , name (name_arg) + {} Token (const string& name_arg, - Type type_arg = eName) - : type (type_arg) + char quote_arg) + : type (eText) , name (name_arg) - , num (noNum) - , charNum (0) + , quote (quote_arg) + {} + explicit Token (const int n_arg) + : type (eInteger) + , n (n_arg) {} - Token (const uint num_arg) - : type (eNumber) - , num (num_arg) - , charNum (0) + explicit Token (const double d_arg) + : type (eDouble) + , d (d_arg) {} - Token (char delimiter_arg) + explicit Token (char delimiter_arg) : type (eDelimiter) , name (1, delimiter_arg) - , num (noNum) - , charNum (0) {} explicit Token (CharInput &in) { readInput (in); } @@ -2246,21 +2539,27 @@ struct Token : Root Type expected) { readInput (in); if (empty ()) - throw CharInput::Error (in, "No token"); + throw CharInput::Error (in, "No token", false); if (type != expected) throw CharInput::Error (in, type2str (type)); } Token (CharInput &in, - uint expected) + const string &expected) { readInput (in); - if (! isNumber (expected)) - throw CharInput::Error (in, type2str (eNumber) + " " + toString (expected)); + if (! isNameText (expected)) + throw CharInput::Error (in, type2str (eName) + " or " + type2str (eName) + " " + strQuote (expected)); } Token (CharInput &in, - const string &expected) + int expected) { readInput (in); - if (! isName (expected)) - throw CharInput::Error (in, type2str (eName) + " " + expected); + if (! isInteger (expected)) + throw CharInput::Error (in, type2str (eInteger) + " " + toString (expected)); + } + Token (CharInput &in, + double expected) + { readInput (in); + if (! isDouble (expected)) + throw CharInput::Error (in, type2str (eDouble) + " " + toString (expected)); } Token (CharInput &in, char expected) @@ -2272,38 +2571,29 @@ struct Token : Root void readInput (CharInput &in); public: void qc () const override; - void saveText (ostream &os) const override - { if (! empty ()) - switch (type) - { case eText: os << ' ' << quote << name << quote; break; - case eNumber: os << ' ' << num; break; - case eName: os << ' ' << name; break; - case eDelimiter: os << name; break; - } - } + void saveText (ostream &os) const override; bool empty () const override { return type == eDelimiter && name. empty (); } - void clear () override - { type = eDelimiter; - name. clear (); - num = noNum; - charNum = 0; - } static string type2str (Type type) { switch (type) - { case eText: return "Text"; - case eNumber: return "Number"; - case eName: return "Name"; - case eDelimiter: return "Delimiter"; + { case eName: return "name"; + case eText: return "text"; + case eInteger: return "integer"; + case eDouble: return "double"; + case eDelimiter: return "delimiter"; } return "?"; } - bool isNumber (uint n) const - { return ! empty () && type == eNumber && num == n; } bool isName (const string &s) const { return ! empty () && type == eName && name == s; } + bool isNameText (const string &s) const + { return ! empty () && (type == eName || type == eText) && name == s; } + bool isInteger (int n_arg) const + { return ! empty () && type == eInteger && n == n_arg; } + bool isDouble (double d_arg) const + { return ! empty () && type == eDouble && d == d_arg; } bool isDelimiter (char c) const { return ! empty () && type == eDelimiter && name [0] == c; } }; @@ -2311,12 +2601,9 @@ struct Token : Root -// OFStream - struct OFStream : ofstream { - OFStream () - {} + OFStream () = default; OFStream (const string &dirName, const string &fileName, const string &extension) @@ -2333,6 +2620,24 @@ struct OFStream : ofstream +struct Stderr : Singleton +{ + bool quiet {false}; + + explicit Stderr (bool quiet_arg) + : quiet (quiet_arg) + {} + + template + Stderr& operator<< (const T& t) + { if (quiet) + return *this; + cerr << t; + return *this; + } +}; + + struct Csv : Root // Line of Excel .csv-file @@ -2362,9 +2667,7 @@ struct Csv : Root -void csvLine2vec (const string &line, - StringVector &words); - // Output: words +StringVector csvLine2vec (const string &line); // Invokes: Csv @@ -2378,8 +2681,8 @@ struct TabDel ONumber on; public: - TabDel (streamsize precision = 6, - bool scientific = false) + explicit TabDel (streamsize precision = 6, + bool scientific = false) : on (tabDel, precision, scientific) {} @@ -2413,19 +2716,18 @@ struct Json : Root, Nocopy // Heaponly protected: Json (JsonContainer* parent, const string& name); - Json () - {}; + Json () = default; public: void print (ostream& os) const override = 0; virtual const JsonNull* asJsonNull () const { return nullptr; } + virtual const JsonString* asJsonString () const + { return nullptr; } virtual const JsonInt* asJsonInt () const { return nullptr; } virtual const JsonDouble* asJsonDouble () const { return nullptr; } - virtual const JsonString* asJsonString () const - { return nullptr; } virtual const JsonBoolean* asJsonBoolean () const { return nullptr; } virtual const JsonArray* asJsonArray () const @@ -2436,19 +2738,19 @@ struct Json : Root, Nocopy // Heaponly protected: static string toStr (const string& s) { return "'" + to_c (s) + "'"; } - static Token readToken (istream &is); - static void parse (istream& is, + static void parse (CharInput &in, const Token& firstToken, JsonContainer* parent, const string& name); -public: - +public: + string getString () const; + // Requires: JsonString int getInt () const; // Requires: JsonInt double getDouble () const; // Requires: JsonDouble - string getString () const; - // Requires: JsonString + bool getBoolean () const; + // Requires: JsonBoolean const Json* at (const string& name_arg) const; // Requires: JsonMap const Json* at (size_t index) const; @@ -2472,9 +2774,27 @@ struct JsonNull : Json }; +struct JsonString : Json +{ + string s; + + JsonString (const string& s_arg, + JsonContainer* parent, + const string& name = noString) + : Json (parent, name) + , s (s_arg) + {} + void print (ostream& os) const final + { os << toStr (s); } + + const JsonString* asJsonString () const final + { return this; } +}; + + struct JsonInt : Json { - int n; + int n {0}; JsonInt (int n_arg, JsonContainer* parent, @@ -2492,8 +2812,9 @@ struct JsonInt : Json struct JsonDouble : Json { - double n; - streamsize decimals; + double n {0.0}; + streamsize decimals {0}; + bool scientfiic {false}; JsonDouble (double n_arg, streamsize decimals_arg, @@ -2505,11 +2826,11 @@ struct JsonDouble : Json {} // decimals_arg = -1: default void print (ostream& os) const final - { const ONumber on (os, (streamsize) decimals, false); + { const ONumber on (os, (streamsize) decimals, scientific); if (n == n) os << n; else - os << "null"; // NAN + os << "null"; // NaN } const JsonDouble* asJsonDouble () const final @@ -2517,27 +2838,9 @@ struct JsonDouble : Json }; -struct JsonString : Json -{ - string s; - - JsonString (const string& s_arg, - JsonContainer* parent, - const string& name = noString) - : Json (parent, name) - , s (s_arg) - {} - void print (ostream& os) const final - { os << toStr (s); } - - const JsonString* asJsonString () const final - { return this; } -}; - - struct JsonBoolean : Json { - bool b; + bool b {false}; JsonBoolean (bool b_arg, JsonContainer* parent, @@ -2560,8 +2863,7 @@ struct JsonContainer : Json const string& name) : Json (parent, name) {} - JsonContainer () - {} + JsonContainer () = default; public: }; @@ -2578,7 +2880,7 @@ struct JsonArray : JsonContainer : JsonContainer (parent, name) {} private: - JsonArray (istream& is, + JsonArray (CharInput& in, JsonContainer* parent, const string& name); public: @@ -2605,12 +2907,12 @@ struct JsonMap : JsonContainer // Output: jRoot = this explicit JsonMap (const string &fName); private: - JsonMap (istream& is, + JsonMap (CharInput& in, JsonContainer* parent, const string& name) : JsonContainer (parent, name) - { parse (is); } - void parse (istream& is); + { parse (in); } + void parse (CharInput& in); public: ~JsonMap (); void print (ostream& os) const final; @@ -2647,9 +2949,6 @@ struct Offset -void exec (const string &cmd); - - /////////////////////////////////////////////////////////////////////////// @@ -2723,12 +3022,16 @@ struct NumberItemGenerator : ItemGenerator /////////////////////////////////////////////////////////////////////////// struct Application : Singleton, Root -// Usage: int main (argc, argv) { Application app; return app. run (argc, argv); } +// Usage: int main (argc, argv) { ThisApplication /*:Application*/ app; return app. run (argc, argv); } { const string description; const bool needsArg; + const bool gnu; + string version {"1"}; + static constexpr const char* helpS {"help"}; + static constexpr const char* versionS {"version"}; -private: +protected: struct Positional; // forward struct Key; // forward struct Arg : Named @@ -2736,11 +3039,11 @@ struct Application : Singleton, Root const string description; // !empty() string value; - // Init: default protected: Arg (const string &name_arg, const string &description_arg); public: + void qc () const override; virtual const Positional* asPositional () const { return nullptr; } virtual const Key* asKey () const @@ -2759,73 +3062,184 @@ struct Application : Singleton, Root }; struct Key : Arg { + const Application& app; const bool flag; - Key (const string &name_arg, + string requiredGroup; + const char acronym; + // '\0' <=> no acronym + const string var; + // For help + string defaultValue; + Key (const Application &app_arg, + const string &name_arg, const string &description_arg, - const string &defaultValue) + const string &defaultValue_arg, + char acronym_arg, + const string &var_arg) : Arg (name_arg, description_arg) + , app (app_arg) , flag (false) - { value = defaultValue; } - Key (const string &name_arg, - const string &description_arg) + , acronym (acronym_arg) + , var (var_arg) + , defaultValue (defaultValue_arg) + {} + // "$BASE" in defaultValue means `dirname $0` + Key (const Application &app_arg, + const string &name_arg, + const string &description_arg, + char acronym_arg) : Arg (name_arg, description_arg) + , app (app_arg) , flag (true) + , acronym (acronym_arg) {} + void qc () const override; void saveText (ostream &os) const override; + string getShortHelp () const; const Key* asKey () const final { return this; } }; List positionals; List keys; - map args; - List::iterator posIt; + map name2arg; + map char2arg; + // Valid if gnu public: protected: explicit Application (const string &description_arg, - bool needsArg_arg = true) + bool needsArg_arg = true, + bool gnu_arg = false) : description (description_arg) , needsArg (needsArg_arg) - , posIt (positionals. begin ()) - { addFlag ("qc", "Integrity checks (quality control)"); - addKey ("verbose", "Level of verbosity", "0"); - addFlag ("noprogress", "Turn off progress printout"); - addFlag ("profile", "Use chronometers to profile"); - addKey ("seed", "Integer positive seed for random number generator", "1"); - addKey ("threads", "Max. number of threads", "1"); - addKey ("json", "Output file in Json format"); - addKey ("log", "Error log file, appended"); - } - // To invoke: addKey(), addFlag(), addPositional() - // Command-line parameters + , gnu (gnu_arg) + {} + // To invoke: addKey(), addFlag(), addPositional(), setRequiredGroup() + // ::= * + // ::= | | + // ::= + // ::= - | -= + // ::= - + // acronym = '\0' <=> no acronym void addKey (const string &name, const string &argDescription, - const string &defaultValue = string ()); + const string &defaultValue = string (), + char acronym = '\0', + const string &var = string ()); // [- ] + // gnu: [-- ] void addFlag (const string &name, - const string &argDescription); + const string &argDescription, + char acronym = '\0'); // [-] void addPositional (const string &name, const string &argDescription); - // What if it starts with '-' ?? + void setRequiredGroup (const string &keyName, + const string &requiredGroup); +private: + void addDefaultArgs () + { if (gnu) + { addKey ("threads", "Max. number of threads", "1", '\0', "THREADS"); + addFlag ("debug", "Integrity checks"); + } + else + { addFlag ("qc", "Integrity checks (quality control)"); + addKey ("verbose", "Level of verbosity", "0"); + addFlag ("noprogress", "Turn off progress printout"); + addFlag ("profile", "Use chronometers to profile"); + addKey ("seed", "Positive integer seed for random number generator", "1"); + addKey ("threads", "Max. number of threads", "1"); + addKey ("json", "Output file in Json format"); + addKey ("log", "Error log file, appended"); + addFlag ("sigpipe", "Exit normally on SIGPIPE"); + } + } + void qc () const final; + Key* getKey (const string &keyName) const; + // Return: !nullptr + void setPositional (List::iterator &posIt, + const string &value); public: + virtual ~Application () + {} protected: string getArg (const string &name) const; // Input: keys, where Key::flag = false, and positionals + uint arg2uint (const string &name) const + { uint n = 0; + try { n = str2 (getArg (name)); } + catch (...) { throw runtime_error ("Cannot convert " + strQuote (name) + " to non-negative number"); } + return n; + } + double arg2double (const string &name) const + { double d = numeric_limits::quiet_NaN (); + try { d = str2 (getArg (name)); } + catch (...) { throw runtime_error ("Cannot convert " + strQuote (name) + " to number"); } + return d; + } bool getFlag (const string &name) const; // Input: keys, where Key::flag = true -public: + string key2shortHelp (const string &name) const; + string getProgramDirName () const + { return getDirName (programArgs. front ()); } +protected: + virtual void initEnvironment () + {} string getInstruction () const; string getHelp () const; +public: int run (int argc, const char* argv []); // Invokes: body() private: virtual void body () const = 0; - // Requires: to be invoked once + // Invokes: initEnvironment() +}; + + + +struct ShellApplication : Application +// Requires: SHELL=bash +{ + // Environment + const bool useTmp; + string tmp; + string execDir; + mutable map prog2dir; + + + ShellApplication (const string &description_arg, + bool needsArg_arg, + bool gnu_arg, + bool useTmp_arg) + : Application (description_arg, needsArg_arg, gnu_arg) + , useTmp (useTmp_arg) + {} + ~ShellApplication (); + + +protected: + void initEnvironment () override; +private: + void body () const final; + virtual void shellBody () const = 0; +protected: + static string shellQuote (string s) + { replaceStr (s, "\'", "\'\"\'\"\'"); + return "\'" + s + "\'"; + } + static bool emptyArg (const string &s) + { return s. empty () || s == "\'\'"; } + string which (const string &progName) const; + // Return: isRight(,"/") or empty() + void findProg (const string &progName) const; + // Output: prog2dir + string fullProg (const string &progName) const; + // Return: directory + progName + ' ' + // Requires: After findProg(progName) }; diff --git a/common.inc b/common.inc index 3e8dcbc..b27e810 100644 --- a/common.inc +++ b/common.inc @@ -67,30 +67,37 @@ // Exceptions +#include #include #include #define WHERE "\"" __FILE__ "\", line " << __LINE__ +#define FUNC std::string (__PRETTY_FUNCTION__) + ": " -#define ERROR_MSG(msg) \ - { if (! uncaught_exception ()) \ - { std::ostringstream oss_; \ - oss_ << WHERE << ": " << (msg); \ - throw std::logic_error (oss_. str ()); \ - } else {} \ +[[noreturn]] void errorThrow (const std::string &msg); + +#define ERROR_MSG(msg) \ + { if (! std::uncaught_exception ()) \ + { std::ostringstream oss_; \ + oss_ << WHERE << ": " << (msg); \ + errorThrow (oss_. str ()); \ + } else {} \ } -#define ERROR ERROR_MSG("") -#define NOT_IMPLEMENTED ERROR_MSG("Not implemented: ") -#define NEVER_CALL ERROR_MSG("Never call: ") +#define ERROR ERROR_MSG ("") +#define NOT_IMPLEMENTED ERROR_MSG ("Not implemented: ") +#define NEVER_CALL ERROR_MSG ("Never call: ") + + +#define QC_ASSERT(cond) if (! (cond)) ERROR_MSG (#cond) else {} // Logic errors #ifdef NDEBUG #define ASSERT(cond) #else - #define ASSERT(cond) if (! (cond)) ERROR_MSG (#cond) else {} + #define ASSERT(cond) QC_ASSERT (cond) #endif @@ -101,8 +108,11 @@ #endif -#define IMPLY(a,b) { if (a) { ASSERT (b) }} -#define ASSERT_EQ(x,y,delta) ASSERT (abs((x) - (y)) <= (delta)) +#define IMPLY(a,b) { if (a) { ASSERT (b) }} +#define QC_IMPLY(a,b) { if (a) { QC_ASSERT (b) }} + +#define ASSERT_EQ(x,y,delta) ASSERT (std::fabs((x) - (y)) <= (delta)) +#define QC_ASSERT_EQ(x,y,delta) QC_ASSERT (std::fabs((x) - (y)) <= (delta)) @@ -117,5 +127,8 @@ +#define PRINT(x) cout << #x << " = " << (x) << endl; + + #endif diff --git a/fasta2parts.cpp b/fasta2parts.cpp new file mode 100644 index 0000000..f76bd18 --- /dev/null +++ b/fasta2parts.cpp @@ -0,0 +1,115 @@ +// fasta2parts.cpp + +/*=========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* =========================================================================== +* +* Author: Vyacheslav Brover +* +* File Description: +* Split the sequences a FASTA file into chunks without breaking sequences +* +*/ + + +#undef NDEBUG +#include "common.inc" + +#include "common.hpp" +using namespace Common_sp; + + + +namespace +{ + + + +struct ThisApplication : Application +{ + ThisApplication () + : Application ("Split the sequences a FASTA file into parts without breaking sequences") + { + addPositional ("in", "FASTA file"); + addPositional ("parts_max", "Max. number of parts (>= 2)"); + addPositional ("dir", "Output directory where chunks are saved named by integers starting with 1"); + } + + + + void body () const final + { + const string fName = getArg ("in"); + const size_t parts_max = str2 (getArg ("parts_max")); + const string dirName = getArg ("dir"); + + if (parts_max <= 1) + throw runtime_error ("Number of parts must be >= 2"); + + + const size_t chunk_min = (size_t) getFileSize (fName) / parts_max + 1; + + size_t part = 0; + OFStream* out = nullptr; + size_t seqSize = 0; + LineInput f (fName); + while (f. nextLine ()) + { + if (f. line. empty ()) + continue; + if ( f. line [0] == '>' + && seqSize >= chunk_min + ) + { + delete out; + out = nullptr; + seqSize = 0; + } + if (! out) + { + part++; + ASSERT (part <= parts_max); + out = new OFStream (dirName, toString (part), ""); + } + *out << f. line << endl; + seqSize += f. line. size (); + } + delete out; + } +}; + + + +} // namespace + + + +int main (int argc, + const char* argv[]) +{ + ThisApplication app; + return app. run (argc, argv); +} + + + diff --git a/fasta_check.cpp b/fasta_check.cpp index 82edf5d..e5bf196 100644 --- a/fasta_check.cpp +++ b/fasta_check.cpp @@ -54,46 +54,57 @@ struct ThisApplication : Application addPositional ("in", "FASTA file"); addFlag ("aa", "Amino acid sequenes, otherwise nucleotide"); addFlag ("hyphen", "Hyphens are allowed"); + addKey ("len", "Output file with lines: "); } void body () const final { - const string fName = getArg ("in"); - const bool aa = getFlag ("aa"); - const bool hyphen = getFlag ("hyphen"); + const string fName = getArg ("in"); + const bool aa = getFlag ("aa"); + const bool hyphen = getFlag ("hyphen"); + const string lenFName = getArg ("len"); + unique_ptr lenF; + if (! lenFName. empty ()) + lenF. reset (new OFStream (lenFName)); size_t lines = 0; bool first = true; StringVector ids; ids. reserve (100000); // PAR size_t seqSize = 0; - LineInput f (fName); + size_t allSize = 0; + LineInput f (fName); + size_t nuc = 0; + string id; + string errorS; while (f. nextLine ()) { if (f. line. empty ()) continue; - const string errorS ("File " + fName + ", line " + toString (f. lineNum) + ": "); + errorS = "File " + fName + ", line " + toString (f. lineNum) + ": "; lines++; if (f. line [0] == '>') { size_t pos = 1; while (pos < f. line. size () && ! isspace (f. line [pos])) pos++; - const string s (f. line. substr (1, pos - 1)); - if (s. empty ()) + id = f. line. substr (1, pos - 1); + if (id. empty ()) throw runtime_error (errorS + "Empty sequence identifier"); #if 0 - if (s. size () > 1000) // PAR + if (id. size () > 1000) // PAR throw runtime_error (errorS + "Too long sequence identifier"); #endif - for (const char c : s) + for (const char c : id) if (! printable (c)) throw runtime_error (errorS + "Non-printable character in the sequence identifier: " + c); if (! first && seqSize == 0) throw runtime_error (errorS + "Empty sequence"); - ids << s; + if (lenF. get () && ! ids. empty ()) + *lenF << ids. back () << '\t' << seqSize << endl; + ids << id; seqSize = 0; } else @@ -101,6 +112,7 @@ struct ThisApplication : Application if (first) throw runtime_error (errorS + "FASTA should start with '>'"); seqSize += f. line. size (); + allSize += f. line. size (); for (const char c : f. line) if (c == '-') if (hyphen) @@ -108,22 +120,31 @@ struct ThisApplication : Application else throw runtime_error (errorS + "hyphen in the sequence"); else + { + const char c1 = toLower (c); if (aa) { - if (! charInSet (c, "acdefghiklmnpqrstvwyxbzjuoACDEFGHIKLMNPQRSTVWYXBZJUO*")) + if (! charInSet (c1, "acdefghiklmnpqrstvwyxbzjuoacdefghiklmnpqrstvwyxbzjuo*")) throw runtime_error (errorS + "Wrong amino acid character: '" + c + "'"); + if (charInSet (c1, "acgt")) + nuc++; } else - if (! charInSet (c, "acgtbdhkmnrsvwyACGTBDHKMNRSVWY")) + if (! charInSet (c1, "acgtbdhkmnrsvwyacgtbdhkmnrsvwy")) throw runtime_error (errorS + "Wrong nucleotide character: '" + c + "'"); + } } first = false; } + if (lenF. get () && ! ids. empty ()) + *lenF << ids. back () << '\t' << seqSize << endl; if (! lines) throw runtime_error ("Empty file"); if (! first && seqSize == 0) throw runtime_error ("Empty sequence"); + if (aa && (double) nuc / (double) allSize > 0.9) // PAR + throw runtime_error ("Protein sequence looks like a nucleotide sequence"); ids. sort (); const size_t index = ids. findDuplicate (); diff --git a/gff.cpp b/gff.cpp index a6f0fe0..5266bb6 100644 --- a/gff.cpp +++ b/gff.cpp @@ -45,38 +45,49 @@ namespace GFF_sp -// Cds - -Cds::Cds (const string &contig_arg, - size_t start_arg, - size_t stop_arg, - bool strand_arg, - size_t crossOriginSeqLen_arg) -: contig (contig_arg) +// Locus + +Locus::Locus (size_t lineNum_arg, + const string &contig_arg, + size_t start_arg, + size_t stop_arg, + bool strand_arg, + bool partial_arg, + size_t crossOriginSeqLen_arg) +: lineNum (lineNum_arg) +, contig (contig_arg) , start (start_arg) , stop (stop_arg) , strand (strand_arg) -, crossOriginSeqLen (crossOriginSeqLen_arg) +, partial (partial_arg) +, contigLen (crossOriginSeqLen_arg) +, crossOrigin (bool (crossOriginSeqLen_arg)) { - ASSERT (! contig. empty ()); - if (crossOriginSeqLen) +//ASSERT (lineNum >= 1); + trim (contig, '_'); + if (contig. empty ()) + throw runtime_error ("Empty contig name"); + if (crossOrigin) { swap (start, stop); start--; stop++; + ASSERT (contigLen); + ASSERT (stop <= contigLen); } ASSERT (start < stop); } -bool Cds::operator< (const Cds& other) const +bool Locus::operator< (const Locus& other) const { LESS_PART (*this, other, contig) LESS_PART (*this, other, start) LESS_PART (*this, other, stop) LESS_PART (*this, other, strand) - LESS_PART (*this, other, crossOriginSeqLen); + LESS_PART (*this, other, contigLen); + LESS_PART (*this, other, crossOrigin); return false; } @@ -85,8 +96,9 @@ bool Cds::operator< (const Cds& other) const // Gff -Gff::Gff (const string &fName, - bool locus_tag) +Annot::Annot (Gff, + const string &fName, + bool locus_tag) { if (fName. empty ()) throw runtime_error ("Empty GFF file name"); @@ -100,24 +112,33 @@ Gff::Gff (const string &fName, ) continue; - replace (f. line, ' ', '_'); // to use '\t' as delimiter + constexpr char tmpSpace {'_'}; + replace (f. line, ' ', tmpSpace); // to use '\t' as delimiter const string errorS ("File " + fName + ", line " + toString (f. lineNum) + ": "); - string seqid, source, type, startS, stopS, score /*real number*/, strand, phase /*frame*/, attributes; - { - istringstream iss (f. line); - iss >> seqid >> source >> type >> startS >> stopS >> score >> strand >> phase >> attributes; - } + string contig, source, type, startS, stopS, score /*real number*/, strand, phase /*frame*/, attributes; + static Istringstream iss; + iss. reset (f. line); + iss >> contig >> source >> type >> startS >> stopS >> score >> strand >> phase >> attributes; + trim (contig, tmpSpace); + trim (source, tmpSpace); + trim (type, tmpSpace); + trim (startS, tmpSpace); + trim (stopS, tmpSpace); + trim (score, tmpSpace); + trim (strand, tmpSpace); + trim (phase, tmpSpace); + trim (attributes, tmpSpace); if (attributes. empty ()) throw runtime_error (errorS + "9 fields are expected in each line"); - if (contains (seqid, ":")) - findSplit (seqid, ':'); // = project_id - if (seqid. empty ()) + if (contains (contig, ":")) + findSplit (contig, ':'); // = project_id + if (contig. empty ()) throw runtime_error (errorS + "empty sequence indentifier"); - for (const char c : seqid) + for (const char c : contig) if (! printable (c)) throw runtime_error (errorS + "Non-printable character in the sequence identifier: " + c); @@ -160,6 +181,8 @@ Gff::Gff (const string &fName, || type == "pseudogene"; //if (pseudo && type == "CDS") //continue; + + const bool partial = contains (attributes, "partial=true"); string locusTag; const string locusTagName (locus_tag || pseudo ? "locus_tag=" : "Name="); @@ -179,8 +202,71 @@ Gff::Gff (const string &fName, findSplit (locusTag, '='); trimPrefix (locusTag, "\""); trimSuffix (locusTag, "\""); + trim (locusTag, tmpSpace); + ASSERT (! locusTag. empty ()); - seqid2cdss [locusTag] << Cds (seqid, (size_t) start, (size_t) stop, strand == "+", 0); + const Locus locus (f. lineNum, contig, (size_t) start, (size_t) stop, strand == "+", partial, 0); + #if 0 + // DNA may be truncated + if (type == "CDS" && ! pseudo && locus. size () % 3 != 0) + { + cout << "Locus tag: " << locusTag << endl; + locus. print (cout); + ERROR; + } + #endif + + prot2cdss [locusTag] << locus; + } +} + + + +Annot::Annot (Bed, + const string &fName) +{ + if (fName. empty ()) + throw runtime_error ("Empty BED file name"); + + LineInput f (fName /*, 100 * 1024, 1*/); + while (f. nextLine ()) + { + trim (f. line); + if ( f. line. empty () + || f. line [0] == '#' + ) + continue; + + replace (f. line, ' ', '_'); // to use '\t' as delimiter + + const string errorS ("File " + fName + ", line " + toString (f. lineNum) + ": "); + + string contig, locusTag; + size_t start, stop; + double score; + char strand = ' '; + static Istringstream iss; + iss. reset (f. line); + iss >> contig >> start >> stop >> locusTag >> score >> strand; + + if (strand == ' ') + throw runtime_error (errorS + "at least 5 fields are expected in each line"); + + for (const char c : contig) + if (! printable (c)) + throw runtime_error (errorS + "Non-printable character in the sequence identifier: " + c); + + if (start >= stop) + throw runtime_error (errorS + "start should be less than stop"); + + if ( strand != '+' + && strand != '-' + ) + throw runtime_error (errorS + "strand should be '+' or '-'"); + + trim (locusTag, '_'); + ASSERT (! locusTag. empty ()); + prot2cdss [locusTag] << Locus (f. lineNum, contig, start, stop, strand == '+', false/*partial*/, 0); } } diff --git a/gff.hpp b/gff.hpp index 2f59c14..e2cfe68 100644 --- a/gff.hpp +++ b/gff.hpp @@ -46,44 +46,70 @@ namespace GFF_sp -struct Cds +struct Locus { + static constexpr size_t end_delta = 3; // PAR + size_t lineNum {0}; + // >= 1 + // 0 - unknown string contig; size_t start {0}; size_t stop {0}; + // start <= stop bool strand {false}; - size_t crossOriginSeqLen {0}; - // !0 => cross origin - // To be set externally + bool partial {false}; + size_t contigLen {0}; + // 0 <=> unknown + bool crossOrigin {false}; - Cds (const string &contig_arg, - size_t start_arg, - size_t stop_arg, - bool strand_arg, - size_t crossOriginSeqLen); - Cds () - {} + Locus (size_t lineNum_arg, + const string &contig_arg, + size_t start_arg, + size_t stop_arg, + bool strand_arg, + bool partial_arg, + size_t crossOriginSeqLen); + Locus () = default; + bool empty () const + { return contig. empty (); } void print (ostream &os) const - { os << contig << ' ' << start << ' ' << stop << ' ' << strand << ' ' << crossOriginSeqLen << endl; } - bool operator< (const Cds& other) const; + { os << contig << ' ' << start << ' ' << stop << ' ' << strand << ' ' << contigLen << ' ' << crossOrigin << endl; } + bool operator< (const Locus& other) const; + size_t size () const + { return crossOrigin + ? contigLen - stop + start + : stop - start; + } + bool atContigStart () const + { return start <= end_delta; } + bool atContigStop () const + { return contigLen && contigLen - stop <= end_delta;} }; -struct Gff : Root -// https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md -// Requirement: the protein id should be in the attribute "Name=" (9th field) of the rows with type "CDS" or "gene" (3rd field) +struct Annot : Root { - map > seqid2cdss; + map> prot2cdss; - Gff (const string &fName, - bool locus_tag); + + class Gff {}; + Annot (Gff, + const string &fName, + bool locus_tag); + // https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md + // Requirement: the protein id should be in the attribute "Name=" (9th field) of the rows with type "CDS" or "gene" (3rd field) // Input: fName may be empty + class Bed {}; + Annot (Bed, + const string &fName); + // https://genome.ucsc.edu/FAQ/FAQformat.html#format1 }; + } diff --git a/gff_check.cpp b/gff_check.cpp index b211556..9c528dd 100644 --- a/gff_check.cpp +++ b/gff_check.cpp @@ -59,27 +59,35 @@ struct ThisApplication : Application ThisApplication () : Application ("Check the correctness of a .gff-file. Exit with an error if it is incorrect.") { - addPositional ("gff", ".gff-file, if \"" + noFile + "\" then exit 0"); - addKey ("fasta", "Protein FASTA file"); - addKey ("locus_tag", "File with matches: \" \", where is from \"" + locus_tagS + "]\" in the FASTA comment and from the .gff-file"); + // Input + addPositional ("gff", ".gff-file, if " + strQuote (noFile) + " then exit 0"); + addKey ("prot", "Protein FASTA file"); + addKey ("dna", "DNA FASTA file"); + addFlag ("gpipe", "Protein identifiers in the protein FASTA file have format 'gnl||'"); + // Output + addKey ("locus_tag", "Output file with matches: \" \", where is from " + strQuote (locus_tagS + "") + " in the FASTA comment and from the .gff-file"); } void body () const final { - const string gffName = getArg ("gff"); - const string fastaName = getArg ("fasta"); - const string locus_tagFName = getArg ("locus_tag"); + const string gffName = getArg ("gff"); + const string protFName = getArg ("prot"); + const string dnaFName = getArg ("dna"); + const string locus_tagFName = getArg ("locus_tag"); + const bool gpipe = getFlag ("gpipe"); if (isRight (gffName, noFile)) return; - const Gff gff (gffName, ! locus_tagFName. empty ()); + Annot::Gff gff; + const Annot annot (gff, gffName, ! locus_tagFName. empty ()); - if (! fastaName. empty ()) + + if (! protFName. empty ()) { StringVector gffIds; gffIds. reserve (10000); // PAR { @@ -87,32 +95,47 @@ struct ThisApplication : Application if (! locus_tagFName. empty ()) outF. open ("", locus_tagFName, ""); StringVector fastaIds; fastaIds. reserve (gffIds. capacity ()); - LineInput f (fastaName /*, 100 * 1024, 1*/); + LineInput f (protFName /*, 100 * 1024, 1*/); + Istringstream iss; + string line_orig; while (f. nextLine ()) if (! f. line. empty ()) if (f. line [0] == '>') { + line_orig = f. line; string fastaId, gffId; - istringstream iss (f. line. substr (1)); + iss. reset (f. line. substr (1)); iss >> fastaId; - if (locus_tagFName. empty ()) - gffId = fastaId; - else + // gffId + gffId = fastaId; + if (! locus_tagFName. empty ()) { - const size_t pos = f. line. find (locus_tagS); - if (pos == string::npos) - throw runtime_error ("\"" + locus_tagS + "\" is not found in: " + f. line); - gffId = f. line. substr (pos + locus_tagS. size ()); - const size_t end = gffId. find (']'); - if (end == string::npos) - throw runtime_error ("']' is not found after \"" + locus_tagS + "\" in: " + f. line); - gffId. erase (end); + if (gpipe) + { + if (! trimPrefix (gffId, "gnl|")) + throw runtime_error (__FILE__ ": Protein name should start with \"gnl|\" in: " + line_orig); + const size_t pos = gffId. find ("|"); + if (pos == string::npos) + throw runtime_error (__FILE__ ": " + strQuote ("|") + " is not found in: " + line_orig); + gffId. erase (0, pos + 1); + } + else + { + const size_t pos = f. line. find (locus_tagS); + if (pos == string::npos) + throw runtime_error (__FILE__ ": " + strQuote (locus_tagS) + " is not found in: " + line_orig); + gffId = f. line. substr (pos + locus_tagS. size ()); + const size_t end = gffId. find (']'); + if (end == string::npos) + throw runtime_error (__FILE__ ": ']' is not found after " + strQuote (locus_tagS) + " in: " + line_orig); + gffId. erase (end); + } } ASSERT (! contains (fastaId, ' ')); if (contains (gffId, ' ')) - throw runtime_error ("\"" + gffId + "\" contains space"); + throw runtime_error (__FILE__ ": " + strQuote (gffId) + " contains space"); if (gffId. empty ()) - throw runtime_error ("No identifier in: " + f. line); + throw runtime_error (__FILE__ ": No protein identifier in: " + line_orig); gffIds << gffId; fastaIds << fastaId; if (outF. is_open ()) @@ -122,15 +145,62 @@ struct ThisApplication : Application fastaIds. sort (); fastaIds. uniq (); if (fastaIds. size () != n) - throw runtime_error ("Duplicate FASTA ids"); + throw runtime_error (__FILE__ ": Duplicate FASTA ids"); gffIds. sort (); - gffIds. uniq (); - if (gffIds. size () != fastaIds. size ()) - throw runtime_error ("GFF identifiers are not unique"); + { + const string* s_prev = nullptr; + for (const string& s : gffIds) + { + if (s_prev && *s_prev == s) + throw runtime_error (__FILE__ ": GFF identifier " + strQuote (s) + " is not unique"); + s_prev = & s; + } + } + //gffIds. uniq (); + ASSERT (gffIds. size () == fastaIds. size ()); } + if (verbose ()) + cout << "# Proteins in GFF: " << annot. prot2cdss. size () << endl; for (const string& seqid : gffIds) - if (! contains (gff. seqid2cdss, seqid)) - throw runtime_error ("Protein id '" + seqid + "' is not in the .gff-file"); + if (! contains (annot. prot2cdss, seqid)) + throw runtime_error (__FILE__ ": Protein id " + strQuote (seqid) + " is not in the .gff-file"); + } + + + if (! dnaFName. empty ()) + { + StringVector contigIds; contigIds. reserve (10000); // PAR + { + LineInput f (dnaFName /*, 100 * 1024, 1*/); + Istringstream iss; + while (f. nextLine ()) + if (! f. line. empty ()) + if (f. line [0] == '>') + { + string contigId; + iss. reset (f. line. substr (1)); + iss >> contigId; + ASSERT (! contains (contigId, ' ')); + if (contigId. empty ()) + throw runtime_error (__FILE__ ": No contig identifier in: " + f. line); + contigIds << move (contigId); + } + contigIds. sort (); + { + const string* s_prev = nullptr; + for (const string& s : contigIds) + { + if (s_prev && *s_prev == s) + throw runtime_error (__FILE__ ": Contig identifier " + strQuote (s) + " is not unique"); + s_prev = & s; + } + } + //contigIds. uniq (); + } + for (const auto& it : annot. prot2cdss) + for (const Locus& cds : it. second) + if (! contigIds. contains (cds. contig)) + throw runtime_error (__FILE__ ": Contig id " + strQuote (cds. contig) + " is not in the file " + dnaFName); } } }; diff --git a/point_mut.cpp b/point_mut.cpp new file mode 100644 index 0000000..a3e5353 --- /dev/null +++ b/point_mut.cpp @@ -0,0 +1,612 @@ +// point_mut.cpp + +/*=========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* =========================================================================== +* +* Author: Vyacheslav Brover +* +* File Description: +* Identification of point mutations at DNA level +* +*/ + + +#undef NDEBUG +#include "common.inc" + +#include "common.hpp" +using namespace Common_sp; + + + +namespace +{ + + +char complementaryNucleotide (char wildNucleotide) +{ + char r = ' '; + switch (toLower (wildNucleotide)) + { + case 'a': r = 't'; break; + case 'c': r = 'g'; break; + case 'g': r = 'c'; break; + case 't': r = 'a'; break; + case 'm': r = 'k'; break; + case 'r': r = 'y'; break; + case 'w': r = 'w'; break; + case 's': r = 's'; break; + case 'y': r = 'r'; break; + case 'k': r = 'm'; break; + case 'v': r = 'b'; break; + case 'h': r = 'd'; break; + case 'd': r = 'h'; break; + case 'b': r = 'v'; break; + case 'n': r = 'n'; break; + default: + throw runtime_error ("Bad wild nucleotide " + toString (wildNucleotide)); + } + if (isupper (wildNucleotide)) + r = toUpper (r); + + return r; +} + + + +void reverseDna (string &seq) +{ + const size_t len = seq. size (); + if (! len) + return; + + FFOR (size_t, i, len / 2) + { + const size_t j = len - 1 - i; + if (seq [i] != '-') + seq [i] = complementaryNucleotide (seq [i]); + if (seq [j] != '-') + seq [j] = complementaryNucleotide (seq [j]); + swap (seq [i], seq [j]); + } + + if (len % 2) + { + const size_t i = len / 2; + if (seq [i] != '-') + seq [i] = complementaryNucleotide (seq [i]); + } +} + + + +struct PointMut +{ + string gene; + size_t pos {0}; + // In reference sequence + // >= 0 + char alleleChar {' '}; + // Upper-case + // !empty() + string geneMutation; + // Depends on the above + string geneMutationGen; + // geneMutation generalized, may have a different pos + string classS; + string subclass; + string name; + // Species binomial + resistance + double neighborhoodMismatch {0.0}; + + + PointMut (const string &gene_arg, + size_t pos_arg, + char alleleChar_arg, + const string &geneMutation_arg, + const string &geneMutationGen_arg, + const string &class_arg, + const string &subclass_arg, + const string &name_arg) + : gene (gene_arg) + , pos (pos_arg) + , alleleChar (alleleChar_arg) + , geneMutation (geneMutation_arg) + , geneMutationGen (geneMutationGen_arg) + , classS (class_arg) + , subclass (subclass_arg) + , name (name_arg) + { + ASSERT (! gene. empty ()); + ASSERT (pos > 0); + pos--; + ASSERT (alleleChar != ' '); + alleleChar = toUpper (alleleChar); + ASSERT (! geneMutation. empty ()); + ASSERT (! geneMutationGen. empty ()); + ASSERT (! name. empty ()); + ASSERT (! contains (name, '\t')); + replace (name, '_', ' '); + ASSERT (geneMutation. back () == alleleChar); + ASSERT (geneMutationGen. back () == alleleChar); + } + PointMut () = default; + + + bool empty () const + { return gene. empty (); } + string getResistance () const + { size_t p = name. find (' '); + ASSERT (p != string::npos); + p = name. find (' ', p + 1); + ASSERT (p != string::npos); + return name. substr (p + 1); + } + bool better (const PointMut &other) const + { if (geneMutationGen != other. geneMutationGen) + return false; + /*if (pos != other. pos) + return false; */ + LESS_PART (*this, other, neighborhoodMismatch); + // Tie + LESS_PART (*this, other, name); + LESS_PART (*this, other, geneMutation); + return false; + } +}; + + +map > accession2pointMuts; + + + +struct BlastAlignment +{ + // PD-2001 + static constexpr const size_t flankingLen = 200; // PAR + + // BLASTN alignment + size_t length {0}, nident {0} // aa + , refStart {0}, refEnd {0}, refLen {0} + , targetStart {0}, targetEnd {0}, targetLen {0}; + // Positions are 0-based + // targetStart < targetEnd + + // target + string targetName; + bool targetStrand {true}; + // false <=> negative + + // Reference + string refName; + + map targetPos2pointMut; + + + explicit BlastAlignment (const string &line) + { + static Istringstream iss; + iss. reset (line); + string targetSeq, refSeq; + iss >> targetName >> refName >> length >> nident >> targetStart >> targetEnd >> targetLen >> refStart >> refEnd >> refLen >> targetSeq >> refSeq; + // format: qseqid sseqid length nident qstart qend qlen sstart send slen sseq + // blastn: ... ... 733 733 62285 63017 88215 105 837 837 ... + ASSERT (! targetSeq. empty ()); + ASSERT (targetSeq. size () == refSeq. size ()); + + #if 0 + string refAccession; + size_t refSegStart = 0; + size_t refSegEnd = 0; + { + string s (refName); + replace (s, ':', ' '); + replace (s, '-', ' '); + static Istringstream refNameIss; + refNameIss. reset (s); + refNameIss >> refAccession >> refSegStart >> refSegEnd; + ASSERT (refSegStart); + ASSERT (refSegStart < refSegEnd); + refSegStart--; + } + #endif + + ASSERT (refStart != refEnd); + bool refStrand = refStart < refEnd; + if (! refStrand) + swap (refStart, refEnd); + + ASSERT (targetStart < targetEnd); + + ASSERT (refStart >= 1); + ASSERT (targetStart >= 1); + ASSERT (refStart < refEnd); + ASSERT (targetStart < targetEnd); + refStart--; + targetStart--; + + if (! refStrand) + { + reverseDna (targetSeq); + reverseDna (refSeq); + toggle (refStrand); + toggle (targetStrand); + } + ASSERT (refStrand); + + if (const vector* pointMuts_ = findPtr (accession2pointMuts, refName)) + { + if (verbose ()) + cout << "PointMut DNA found: " << refName << endl + << targetStart << ' ' << targetEnd << ' ' << targetStrand << ' ' << targetSeq << endl + << refStart << ' ' << refEnd << ' ' << refStrand << ' ' << refSeq << endl; + ASSERT (! pointMuts_->empty ()); + for (const PointMut& pm : *pointMuts_) + { + size_t pos = refStart; + // One pass for all *pointMuts_ ?? + size_t targetPos = (targetStrand ? targetStart : (targetEnd - 1)); + FFOR (size_t, i, refSeq. size ()) + { + if (targetSeq [i] != '-') + { + if (targetStrand) + targetPos++; + else + targetPos--; + } + if (refSeq [i] != '-') + { + if (targetSeq [i] != '-') + { + const double neighborhoodMismatch = getNeighborhoodMismatch (targetSeq, refSeq, i); + if (verbose ()) + if (targetSeq [i] != refSeq [i]) + cout << i + 1 + << ' ' << refSeq [i] + << ' ' << targetSeq [i] + << ' ' << pos + 1 + << ' ' << pm. pos + 1 + << ' ' << pm. alleleChar + << ' ' << neighborhoodMismatch + << endl; + if (pos == pm. pos) + { + if (toUpper (targetSeq [i]) == pm. alleleChar) + { + ASSERT (targetSeq [i] != refSeq [i]); + if (neighborhoodMismatch <= 0.03) // PAR + { + ASSERT (targetPos2pointMut [targetPos]. empty ()); + targetPos2pointMut [targetPos] = pm; + targetPos2pointMut [targetPos]. neighborhoodMismatch = neighborhoodMismatch; + } + } + break; + } + } + pos++; + } + } + } + } + } + void qc () const + { + if (! qc_on) + return; + ASSERT (targetStart < targetEnd); + ASSERT (targetEnd <= targetLen); + ASSERT (refStart < refEnd); + ASSERT (nident <= refEnd - refStart); + ASSERT (refEnd <= refLen); + ASSERT (refEnd - refStart <= length); + } + void saveText (ostream& os) const + { const string na ("NA"); + for (const auto& it : targetPos2pointMut) + { + const PointMut& pm = it. second; + if (pm. empty ()) + continue; + TabDel td (2, false); + td << na // PD-2534 + << targetName + << targetStart + 1 + << targetEnd + << (targetStrand ? '+' : '-'); + td << pm. geneMutationGen // was: pm.geneMutation + << pm. name + << "core" // PD-2825 + // PD-1856 + << "AMR" + << "POINT" + << nvl (pm. classS, na) + << nvl (pm. subclass, na) + // + << "POINTN" // PD-2088 + << targetLen; + td << refLen + << refCoverage () * 100 + << pIdentity () * 100 + << length + << refName + << pm. gene + ; + // HMM + td << na + << na; + os << td. str () << endl; + } + } + + + double pIdentity () const + { return (double) nident / (double) length; } + double refCoverage () const + { return (double) (refEnd - refStart) / (double) refLen; } + double getNeighborhoodMismatch (const string &targetSeq, + const string &refSeq, + size_t i) const + { ASSERT (targetSeq. size () == refSeq. size ()); + ASSERT (i < targetSeq. size ()) + ASSERT (targetSeq [i] != '-'); + ASSERT (refSeq [i] != '-'); + ASSERT (targetEnd - targetStart <= targetSeq. size ()); + ASSERT (refEnd - refStart <= refSeq. size ()); + // PD-2001 + size_t len = 0; + size_t mismatches = 0; + size_t j = 0; + // Left flank + if (i) + { + j = i - 1; + while (i - j <= flankingLen) + { len++; + if (targetSeq [j] != refSeq [j]) + mismatches++; + if (j == 0) + break; + j--; + } + } + ASSERT (i >= j); + const size_t left = min (min (targetStart, refStart), flankingLen + 1 - (i - j)); + len += left; + mismatches += left; + //cout << i << ' ' << j << ' ' << left; + // Right flank + for (j = i + 1; j < targetSeq. size () && j < refSeq. size () && j - i <= flankingLen; j++) + { len++; + if (targetSeq [j] != refSeq [j]) + mismatches++; + } + ASSERT (j >= i); + const size_t right = min (min (targetLen - targetEnd, refLen - refEnd), flankingLen + 1 - (j - i)); + len += right; + mismatches += right; + // + //cout << ' ' << j << right << ' ' << mismatches << ' ' << len << endl; + return (double) mismatches / (double) len; + } + bool good () const + { return length >= 2 * flankingLen + 1; } + bool operator< (const BlastAlignment &other) const + { LESS_PART (*this, other, targetName); + LESS_PART (other, *this, pIdentity ()); + LESS_PART (*this, other, targetStart); + LESS_PART (*this, other, refName); + return false; + } +}; + + + + +struct Batch +{ + vector blastAls; + + + explicit Batch (const string &point_mut) + { + LineInput f (point_mut); + string accession, gene, geneMutation, geneMutationGen, classS, subclass, name; + int pos; + char alleleChar; + Istringstream iss; + while (f. nextLine ()) + { + iss. reset (f. line); + iss >> accession >> gene >> pos >> alleleChar >> geneMutation >> geneMutationGen >> classS >> subclass >> name; + ASSERT (pos > 0); + accession2pointMuts [accession]. push_back (move (PointMut (gene, (size_t) pos, alleleChar, geneMutation, geneMutationGen, classS, subclass, name))); + } + } + + + void report (ostream &os) const + { + { + // Cf. BlastAlignment::saveText() + TabDel td; + td << "Protein identifier" // targetName // PD-2534 + // Contig + << "Contig id" + << "Start" // targetStart + << "Stop" // targetEnd + << "Strand" // targetStrand + << "Gene symbol" + << "Mutation name" + << "Scope" // PD-2825 + // PD-1856 + << "Element type" + << "Element subtype" + << "class" + << "Subclass" + // + << "Method" + << "Target length" + // + << "Reference gene length" // refLen + << "% Coverage of reference gene" // queryCoverage + << "% Identity to reference gene" + << "Alignment length" // length + << "Accession of reference gene" + << "Name of reference gene" + // + << "HMM id" + << "HMM description" + ; + os << td. str () << endl; + } + + for (const auto& blastAl : blastAls) + { + blastAl. qc (); + blastAl. saveText (os); + } + } +}; + + + + +// ThisApplication + +struct ThisApplication : Application +{ + ThisApplication () + : Application ("Find point mutations at DNA level and report in the format of amr_report.cpp") + { + // Input + addPositional ("blastn", "blastn output in the format: qseqid sseqid length nident qstart qend qlen sstart send slen sseq. sseqid is the 1st column of table"); + addPositional ("point_mut", "Point mutation table"); + // Output + #ifdef SVN_REV + version = SVN_REV; + #endif + } + + + + void body () const final + { + const string blastnFName = getArg ("blastn"); + const string point_mut = getArg ("point_mut"); + + + + Batch batch (point_mut); + + + // Input + { + LineInput f (blastnFName); + while (f. nextLine ()) + { + { + Unverbose unv; + if (verbose ()) + cout << f. line << endl; + } + BlastAlignment al (f. line); + al. qc (); + if (al. good ()) + batch. blastAls. push_back (move (al)); + } + } + if (verbose ()) + cout << "# Good Blasts: " << batch. blastAls. size () << endl; + + + // Output + // Group by targetName and process each targetName separately for speed ?? + //Common_sp::sort (batch. blastAls); + if (verbose ()) + { + cout << "After process():" << endl; + for (const auto& blastAl : batch. blastAls) + { + blastAl. saveText (cout); + cout << ' ' << blastAl. targetPos2pointMut. size () << endl; + } + } + + + for (const BlastAlignment& blastAl1 : batch. blastAls) + for (const auto& it1 : blastAl1. targetPos2pointMut) + { + const PointMut& pm1 = it1. second; + if (pm1. empty ()) + continue; + for (BlastAlignment& blastAl2 : batch. blastAls) + if ( blastAl2. targetName == blastAl1. targetName + && & blastAl2 != & blastAl1 + ) + for (auto& it2 : blastAl2. targetPos2pointMut) + { + PointMut& pm2 = it2. second; + if (pm2. empty ()) + continue; + if (verbose ()) + cout << blastAl2. targetName + << ' ' << it1. first + << ' ' << it2. first + << ' ' << pm1. geneMutation + << ' ' << pm2. geneMutation + << ' ' << pm1. geneMutationGen + << ' ' << pm2. geneMutationGen + << ' ' << pm1. neighborhoodMismatch + << ' ' << pm2. neighborhoodMismatch + << ' ' << pm1. better (pm2) + << endl; + if ( it2. first == it1. first + && pm1. better (pm2) + ) + pm2 = PointMut (); + } + } + + + batch. report (cout); + } +}; + + + +} // namespace + + + +int main (int argc, + const char* argv[]) +{ + ThisApplication app; + return app. run (argc, argv); +} + + + diff --git a/test.expected b/test.expected new file mode 100644 index 0000000..3854391 --- /dev/null +++ b/test.expected @@ -0,0 +1,10 @@ +Protein identifier Contig id Start Stop Strand Gene symbol Sequence name Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description +blaTEM-156 contig1 101 961 + blaTEM-156 class A beta-lactamase TEM-156 AMR AMR BETA-LACTAM BETA-LACTAM ALLELEP 286 286 100.00 100.00 286 WP_061158039.1 class A beta-lactamase TEM-156 NF000531.2 TEM family class A beta-lactamase +NA contig10 486 1307 + blaOXA class D beta-lactamase AMR AMR BETA-LACTAM BETA-LACTAM INTERNAL_STOP 274 274 100.00 99.64 274 WP_000722315.1 oxacillin-hydrolyzing class D beta-lactamase OXA-9 NF000270.1 class D beta-lactamase +blaPDC-114_blast contig2 1 1191 + blaPDC PDC family class C beta-lactamase AMR AMR BETA-LACTAM CEPHALOSPORIN BLASTP 397 397 100.00 99.75 397 WP_061189306.1 class C beta-lactamase PDC-114 NF000422.6 PDC family class C beta-lactamase +blaOXA-436_partial contig3 101 802 + blaOXA OXA-48 family class D beta-lactamase AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 233 265 87.92 100.00 233 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF000387.2 OXA-48 family class D beta-lactamase +vanG contig4 101 1147 + vanG D-alanine--D-serine ligase VanG AMR AMR GLYCOPEPTIDE VANCOMYCIN EXACTP 349 349 100.00 100.00 349 WP_063856695.1 D-alanine--D-serine ligase VanG NF000091.3 D-alanine--D-serine ligase VanG +NA contig8 101 700 + blaTEM TEM family class A beta-lactamase AMR AMR BETA-LACTAM BETA-LACTAM PARTIAL_CONTIG_ENDX 200 286 69.93 100.00 200 WP_061158039.1 class A beta-lactamase TEM-156 NF000531.2 TEM family class A beta-lactamase +NA contig9 1 675 - aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIAL_CONTIG_ENDX 225 275 81.82 100.00 225 WP_109545041.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032895.1 aminoglycoside O-phosphotransferase APH(3'')-Ib +NA contig9 715 1377 - sul2 sulfonamide-resistant dihydropteroate synthase Sul2 AMR AMR SULFONAMIDE SULFONAMIDE PARTIAL_CONTIG_ENDX 221 271 81.55 100.00 221 WP_001043265.1 sulfonamide-resistant dihydropteroate synthase Sul2 NF000295.1 sulfonamide-resistant dihydropteroate synthase Sul2 +nimIJ_hmm contigX 1 501 + nimIJ NimIJ family nitroimidazole resistance protein AMR AMR NITROIMIDAZOLE NITROIMIDAZOLE HMM 166 NA NA NA NA NA NA NF000262.1 NimIJ family nitroimidazole resistance protein diff --git a/test_both.expected b/test_both.expected new file mode 100644 index 0000000..42fd2da --- /dev/null +++ b/test_both.expected @@ -0,0 +1,16 @@ +Protein identifier Contig id Start Stop Strand Gene symbol Sequence name Scope Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description +blaTEM-156 contig01 101 961 + blaTEM-156 class A beta-lactamase TEM-156 core AMR AMR BETA-LACTAM BETA-LACTAM ALLELEP 286 286 100.00 100.00 286 WP_061158039.1 class A beta-lactamase TEM-156 NF000531.2 TEM family class A beta-lactamase +blaPDC-114_blast contig02 1 1191 + blaPDC PDC family class C beta-lactamase core AMR AMR BETA-LACTAM CEPHALOSPORIN BLASTP 397 397 100.00 99.75 397 WP_061189306.1 class C beta-lactamase PDC-114 NF000422.6 PDC family class C beta-lactamase +blaOXA-436_partial contig03 101 802 + blaOXA OXA-48 family class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 233 265 87.92 100.00 233 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF000387.2 OXA-48 family class D beta-lactamase +vanG contig04 101 1147 + vanG D-alanine--D-serine ligase VanG core AMR AMR GLYCOPEPTIDE VANCOMYCIN EXACTP 349 349 100.00 100.00 349 WP_063856695.1 D-alanine--D-serine ligase VanG NF000091.3 D-alanine--D-serine ligase VanG +NA contig05 260 2021 - 23S_A2075G Campylobacter macrolide resistant 23S core AMR POINT MACROLIDE MACROLIDE POINTN 2021 2912 60.51 99.83 1762 NC_022347.1:1040292-1037381 23S NA NA +NA contig06 2680 3102 + 50S_L22_A103V Campylobacter macrolide resistant 50S L22 core AMR POINT MACROLIDE MACROLIDE POINTX 141 141 100.00 97.16 141 WP_002851214.1 50S L22 NA NA +gyrA contig06 31 2616 + gyrA_T86I Campylobacter quinolone resistant GyrA core AMR POINT QUINOLONE QUINOLONE POINTP 862 863 99.88 99.54 862 WP_002857904.1 gyrA NA NA +50S_L22 contig07 101 526 + 50S_L22_A103V Campylobacter macrolide resistant 50S L22 core AMR POINT MACROLIDE MACROLIDE POINTP 141 141 100.00 97.16 141 WP_002851214.1 50S L22 NA NA +NA contig08 101 700 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIAL_CONTIG_ENDX 200 286 69.93 100.00 200 WP_061158039.1 class A beta-lactamase TEM-156 NF000531.2 TEM family class A beta-lactamase +aph3pp-Ib_partial_5p_neg contig09 1 675 - aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIAL_CONTIG_ENDP 225 275 81.82 99.56 225 WP_109545041.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032895.1 aminoglycoside O-phosphotransferase APH(3'')-Ib +sul2_partial_3p_neg contig09 715 1377 - sul2 sulfonamide-resistant dihydropteroate synthase Sul2 core AMR AMR SULFONAMIDE SULFONAMIDE PARTIAL_CONTIG_ENDP 221 271 81.55 100.00 221 WP_001043265.1 sulfonamide-resistant dihydropteroate synthase Sul2 NF000295.1 sulfonamide-resistant dihydropteroate synthase Sul2 +NA contig10 486 1307 + blaOXA class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM INTERNAL_STOP 274 274 100.00 99.64 274 WP_000722315.1 oxacillin-hydrolyzing class D beta-lactamase OXA-9 NF000270.1 class D beta-lactamase +blaTEM-internal_stop contig11 113 547 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 144 286 50.35 97.22 144 WP_000027057.1 class A broad-spectrum beta-lactamase TEM-1 NF000531.2 TEM family class A beta-lactamase +qacR-curated_blast contig12 71 637 + qacR multidrug-binding transcriptional regulator QacR plus STRESS BIOCIDE QUATERNARY AMMONIUM QUATERNARY AMMONIUM BLASTP 188 188 100.00 99.47 188 ADK23698.1 multidrug-binding transcriptional regulator QacR NA NA +nimIJ_hmm contigX 1 501 + nimIJ NimIJ family nitroimidazole resistance protein core AMR AMR NITROIMIDAZOLE NITROIMIDAZOLE HMM 166 NA NA NA NA NA NA NF000262.1 NimIJ family nitroimidazole resistance protein diff --git a/test_dna.expected b/test_dna.expected index eb679c3..f63d638 100644 --- a/test_dna.expected +++ b/test_dna.expected @@ -1,5 +1,15 @@ -Target identifier Contig id Start Stop Strand Gene symbol Protein name Method Target length Reference protein length % Coverage of reference protein % Identity to reference protein Alignment length Accession of closest protein Name of closest protein HMM id HMM description -contig1 contig1 101 958 + blaTEM-156 class A beta-lactamase TEM-156 ALLELE 286 286 100.00 100.00 286 WP_061158039.1 class A beta-lactamase TEM-156 NF000531.2 TEM family class A beta-lactamase -contig2 contig2 1 1191 + blaPDC PDC family class C beta-lactamase BLAST 397 397 100.00 99.75 397 WP_061189306.1 class C beta-lactamase PDC-114 NF000422.6 PDC family class C beta-lactamase -contig3 contig3 101 802 + blaOXA OXA-48 family class D beta-lactamase PARTIAL 234 265 88.30 100.00 234 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF000387.2 OXA-48 family class D beta-lactamase -contig4 contig4 101 1147 + vanG D-alanine--D-serine ligase VanG EXACT 349 349 100.00 100.00 349 WP_063856695.1 D-alanine--D-serine ligase VanG NF000091.3 D-alanine--D-serine ligase VanG +Protein identifier Contig id Start Stop Strand Gene symbol Sequence name Scope Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description +NA contig01 101 958 + blaTEM-156 class A beta-lactamase TEM-156 core AMR AMR BETA-LACTAM BETA-LACTAM ALLELEX 286 286 100.00 100.00 286 WP_061158039.1 class A beta-lactamase TEM-156 NF000531.2 TEM family class A beta-lactamase +NA contig02 1 1191 + blaPDC PDC family class C beta-lactamase core AMR AMR BETA-LACTAM CEPHALOSPORIN BLASTX 397 397 100.00 99.75 397 WP_061189306.1 class C beta-lactamase PDC-114 NF000422.6 PDC family class C beta-lactamase +NA contig03 101 802 + blaOXA OXA-48 family class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALX 234 265 88.30 100.00 234 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF000387.2 OXA-48 family class D beta-lactamase +NA contig04 101 1147 + vanG D-alanine--D-serine ligase VanG core AMR AMR GLYCOPEPTIDE VANCOMYCIN EXACTX 349 349 100.00 100.00 349 WP_063856695.1 D-alanine--D-serine ligase VanG NF000091.3 D-alanine--D-serine ligase VanG +NA contig05 260 2021 - 23S_A2075G Campylobacter macrolide resistant 23S core AMR POINT MACROLIDE MACROLIDE POINTN 2021 2912 60.51 99.83 1762 NC_022347.1:1040292-1037381 23S NA NA +NA contig06 2680 3102 + 50S_L22_A103V Campylobacter macrolide resistant 50S L22 core AMR POINT MACROLIDE MACROLIDE POINTX 141 141 100.00 97.16 141 WP_002851214.1 50S L22 NA NA +NA contig06 31 2616 + gyrA_T86I Campylobacter quinolone resistant GyrA core AMR POINT QUINOLONE QUINOLONE POINTX 862 863 99.88 99.54 862 WP_002857904.1 gyrA NA NA +NA contig07 101 523 + 50S_L22_A103V Campylobacter macrolide resistant 50S L22 core AMR POINT MACROLIDE MACROLIDE POINTX 141 141 100.00 97.16 141 WP_002851214.1 50S L22 NA NA +NA contig08 101 700 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIAL_CONTIG_ENDX 200 286 69.93 100.00 200 WP_061158039.1 class A beta-lactamase TEM-156 NF000531.2 TEM family class A beta-lactamase +NA contig09 1 675 - aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIAL_CONTIG_ENDX 225 275 81.82 100.00 225 WP_109545041.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032895.1 aminoglycoside O-phosphotransferase APH(3'')-Ib +NA contig09 715 1377 - sul2 sulfonamide-resistant dihydropteroate synthase Sul2 core AMR AMR SULFONAMIDE SULFONAMIDE PARTIAL_CONTIG_ENDX 221 271 81.55 100.00 221 WP_001043265.1 sulfonamide-resistant dihydropteroate synthase Sul2 NF000295.1 sulfonamide-resistant dihydropteroate synthase Sul2 +NA contig10 486 1307 + blaOXA class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM INTERNAL_STOP 274 274 100.00 99.64 274 WP_000722315.1 oxacillin-hydrolyzing class D beta-lactamase OXA-9 NF000270.1 class D beta-lactamase +NA contig11 101 958 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM INTERNAL_STOP 286 286 100.00 93.01 286 WP_061158039.1 class A beta-lactamase TEM-156 NF000531.2 TEM family class A beta-lactamase +NA contig12 71 634 + qacR multidrug-binding transcriptional regulator QacR plus STRESS BIOCIDE QUATERNARY AMMONIUM QUATERNARY AMMONIUM BLASTX 188 188 100.00 99.47 188 ADK23698.1 multidrug-binding transcriptional regulator QacR NA NA diff --git a/test_dna.fa b/test_dna.fa index defcc63..2c0c322 100644 --- a/test_dna.fa +++ b/test_dna.fa @@ -1,4 +1,4 @@ ->contig1 blaTEM-156_cds +>contig01 blaTEM-156_cds AACCCCTATTTGTTTATTTTTCTAAATACATTCAAATATGTATCCGCTCATGATACAATAACCCTGATAA ATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTCAACATTTCCGTGTCGCCCTTATTCCCTTTT TTGCGGCATTTTGCCTTCCTGTTTTTGCTCACCCAGAAACGCTGGTGAAAGTAAAAGATGCTGAAGATCA @@ -15,7 +15,7 @@ GGGGCCAGATGGTAAGCCCTCCCGTATCGTAGTTATCTACACGACGGGGAGTCAGGCAACTATGGATGAA CGAAATAGACAGATCGCTGAGATAGGTGCCTCACTGATTAAGCATTGGTAACTGTCAGACCAAGTTTACT CATA ->contig2 blaPDC-114_blast_cds +>contig02 blaPDC-114_blast_cds ATGCGCGATACCAGATTCCCCTGCCTGTGCGGCATCGCCGCTTCCACACTGCTGTTCGCCACCACCCCGG CCATTGCCGGCGAGGCCCCGGCGGATCGCCTGAAGGCACTGGTCGACGCCGCCGTACAACCGGTGATGAA GGCCAATGACATTCCGGGCCTGGCCGTAGCCATCAGCCTGAAAGGAGAACCGCATTACTTCAGCTATGGG @@ -35,11 +35,11 @@ GCCTACGTGGCGTTCGTCCCGGGCCGCGACCTGGGACTGGTGATCCTGGCCAACCGCAACTATCCCAATG CCGAGCGGGTGAAGATCGCCTACGCCATCCTCAGCGGCCTGGAGCAGCAGGGCAAGGTGCCGCTGAAGCG CTGA ->contig3 blaOXA-436_partial_cds +>contig03 blaOXA-436_partial_cds GCCACAATTGGTCAGCAGTTTGCTATTGAAAGCCCGCCAAGACGGTGCTACTCTTAGCGCCTCATTTTTATGGATTTATTGCATTAGGCAAGGGGACATTATGCGTGCGTTAGCCTTATCGGCTGTGTTGATGGTGACAACGATGATTGGCATGCCTGCGGTGGCAAAGGAGTGGCAAGAGAACAAGAGTTGGAATGCTCACTTTAGCGAACATAAAACCCAAGGCGTGGTTGTGCTCTGGAACGAGAATACACAGCAGGGTTTTACCAACGATCTTAAACGGGCAAACCAAGCATTTTTACCTGCATCGACCTTTAAGATCCCAAACAGTTTAATTGCCTTGGACTTAGGTGTGGTTAAGGATGAGCATCAAGTCTTTAAATGGGATGGACAGACGCGAGATATCGCCGCGTGGAATCGCGACCATGACTTAATCACCGCGATGAAGTATTCGGTTGTGCCTGTTTATCAAGAATTTGCCCGCCAAATTGGCGAGGCCCGTATGAGTAAAATGTTGCACGCCTTCGATTATGGTAATGAGGATATCTCGGGCAATTTGGACAGTTTTTGGCTCGATGGTGGTATTCGCATTTCGGCTACCCAGCAAATCGCTTTTTTACGCAAGCTGTACCACAACAAGTTGCACGTTTCTGAGCGTAGTCAGCGCATCGTTAAACAAGCCATGCTGACCGAGGCAAATGCCGACTATATCATCCGGGCGAAAACTGGCTATTCGGTCAGAATTGAACCGAAAATCGGTTGGTGGGTTGGCTGGATCGAACTGGATGACAATGTGTGGTTCAAGTGGTTAGCGGCGCATTTGTGTAAAATAGCCGTCATATAAGCTGTAAAGTTATATGGACAAAATACTTATAGTCGATGCGCTTATCGCTAATGGCTCA ->contig4 vanG_cds exact +>contig04 vanG_cds exact ATGACTGGTAGTCAGACGGAAAAAGAATTGTATGTCAACCAATGTAAAATAGCCTATAAGCTACCCGATG GTGTAAAAATTGAAGAAAGAGGTGTGTAAAATGCAAAATAAAAAAATAGCAGTTATTTTTGGAGGCAATT CAACAGAGTACGAGGTGTCATTGCAATCGGCATCCGCTGTTTTTGAAAATATCAATACCAATAAATTTGA @@ -59,7 +59,7 @@ AGGCTTTACCTCGCACAGTCGCTATCCAAATATGATGAAAGGCATTGGTCTATCGTTCTCCCAAATGTTG GATAAGCTGATAGGTCTGTATGTGGAATGATGAAAACGATTGAGCTTGAAAAGGAAGAAATTTATTGTGG AAATTTGCTGCTCGTCAACAAAAATTATCCGCTACGAGATAACAATGTAAAGGGTTTAGT ->contig6 gyrA:T86I +>contig06 gyrA:T86I TCAAAACTTTGAACTAATAGAAGGATTTTTATGGAGAATATTTTTAGCAAAGATTCTGAT ATTGAACTTGTAGATATAGAAAATTCTATAAAAGGTAGTTACTTAGACTATTCTATGAGT GTTATTATAGGTCGTGCTTTGCCTGACGCAAGAGATGGTTTAAAGCCTGTTCATAGAAGA @@ -113,7 +113,7 @@ CTAGAGGAAGTGCAAGTCGTATTAGAAAACCAACTTCACATATTTTAGTAGAAGTGGTAA AAGCGGAAGTGAAAGCTGAAGAGAAAAAAACAGTAGCTAAAAAAACTACTACAACAAAAG CTCCAGCTAAAAAAACAACAAGCACTAAAAAAGCAACAGCGAAAAAGGAAAGCTAATGGG ACAAAAAGTTAATCCAATTGGTCTAAGACTAGGAATCAATAGAAAC ->Contig5 Campylobacter 23S:A2075G +>contig05 Campylobacter 23S:A2075G TAAGAGCAAGTTCTAATACAAACTCTACTTGAAAGATTTCTATTAAACTCTAAATATTAT CTATATTCAAAGTCTTCTTAAAAACCTTAACAAGGAAGTGATGCTTTATTAAAGATAAGC CAAACGCTCTATTAGTACTGGTCAGCTAAAGGACTTTCATCCATTACACACCCAGCCTAT @@ -149,7 +149,7 @@ TTTTACTGATTTTACTAAGTTTTATTTCTATCTTTACTCTATAAAAATCAGTAAAAATAT AACTAAACATATAACTTAGTTTCTCATATTCTAAACTTAAATAATCTAAGTTGTGCGCAA GTCTACAGCTTCGGTACTTACTTTAGCCCCGTTATATTTTC ->contig7 Campylobacter 50S_L22:A103V NZ_CUUA01000002.1 Campylobacter jejuni strain OXC6585, whole genome shotgun sequence 59884-60509 +>contig07 Campylobacter 50S_L22:A103V NZ_CUUA01000002.1 Campylobacter jejuni strain OXC6585, whole genome shotgun sequence 59884-60509 AATCACATCGGTTATAAGCTTGGTGAATTTGCTCCAACACGCACATTTAAAGGGCATAAA GGCTCTGTGCAAAAGAAAATTGGTAAGTAAGGGATAAGATATGAGTAAAGCATTAATTAA ATTCATAAGATTATCTTCAACTAAAGCAAGATTGATTGCTAGAGAAGTTCAGGGGATGAA @@ -171,3 +171,95 @@ GCTTTGTTGCATCTGCGGATGGCATCCACTTTATGGCCAGCCATAGCACTGTGGAAGTATAGTTTGCCGT TGTCATAGACATAGCTGATGGGCACGGCATAGGGATAGTCATTATCGCCCAACAAGGCTAATGTGCCAGA AGTAGCCTTCTGCAGAACGGCGATGCTGTCTGCGTCAGTCAGTTGCTGACGCTTTCGTCTCATTTCTCGG AACTCACTCAT + +>contig08 Partial DNA only blaTEM-156 +AACCCCTATTTGTTTATTTTTCTAAATACATTCAAATATGTATCCGCTCATGATACAATAACCCTGATAA +ATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTCAACATTTCCGTGTCGCCCTTATTCCCTTTT +TTGCGGCATTTTGCCTTCCTGTTTTTGCTCACCCAGAAACGCTGGTGAAAGTAAAAGATGCTGAAGATCA +GTTGGGTGCACGAGTGGGTTACATCGAACTGGATCTCAACAGCGGTAAGATCCTTGAGAGTTTTCGCCCC +GAAGAACGTTTTCCAATGATGAGCACTTTTAAAGTTCTGCTATGTGGCGCGGTATTATCCCGTGTTGACG +CCGGGCAAGAGCAACTCGGTCGCCGCATACACTATTCTCAGAATGACTTGGTTGAGTACTCACCAGTCAC +AGAAAAGCATCTTACGGATGGCATGACAGTAAGAGAATTATGCAGTGCTGCCATAACCATGAGTGATAAC +ACTGCGGCCAACTTACTTCTGACAACGATCGGAGGACCGAAGGAGCTAACCGCTTTTTTGCACAACATAG +GGGATCATGTAACTCGCCTTGATCGTTGGGAACCGGAGCTGAATGAAGCCATACCAAACGACGAGCGTGA +CACCACGATGCCTGCAGCAATGGCAACAACGTTGCGCAAACTATTAACTGGCGAACTACTTACTCTAGCT + +>contig09 >case4_case6_sul2_aph3pp-Ib Providencia rettgeri strain Pret_2032, whole genome shotgun sequence 2160922-2162737 150-1527 (reverse comp'd) +CAAATCGGCATAGCGATCTGCTGTTCCGAGCCGCCCAAGGTCGATCAGACCCGTGCATTGAAGAGTTTTA +GGGTCCACCATGAAGTTCGGCATGCAGGGATCACCATGGCAAACAACCATATCGGTGCGCTCTTGGTCGA +GCCGCACCGGTAGCTCTCGTTCGACACGAGCCAAAAGATCGAGCTGCGGCGTACTCTTGTCCTCGTCCGG +TAAGAAGTCGGGATTGACGGCATTGCGGGACACCACATCAACGGCGCGTCCGAACATTCGCGACAGCCTG +CGCTCAAACGGACATTGATCAACCGATAGGCTGTGAACAGCGCCAAGTTGCTGCCCCATTGACGGCCACG +CTTTGAGCAAATCCGCTCCAGACAGATCAGCCGCCGGTACTCCCGGAATTGCCGTTATCACCAAGCATGC +ACCCTCCTGTTCCTCCTGCCAGTTGATGACCTCGGGGCAAGCCACACCTCGACCTTTGAGCCAAATGAGG +CGGTCACGCTCTCCAGCGAGCTCACCGCGGCGGGAAGCAGGTGCGATTTTCGCGAAGGCATGCCCGTCAC +CACGTCGAAAAACAAAATCACCAGATTCTCCGCCTCTGACAGGCAACCAGTCAGAATGCGATTCACCAAA +AAAAATATTAGTTCGATTCAATGGAGGTTCCTTCAGTTTTCTGATGAAGCGCGAATATAGAGAAATATCC +CGAATGTGCAGTTAACGAATTCTTGCGGTTTCTTTTAGCGCCGCCAATACCGCCAGCCCGTCGCGCAAGG +GGCGCGGCTCGTGTGTGCGGATGAAGTCAGCTCCACCTGCGGCGGCGGCAAGCTCTGCAGCGAGTGTCGC +GGCCCCGACATCCCCCGGACCACGGCCTGTGAGCGCGCGCAGAAAGGATTTGCGCGAAACAGACAGAAGC +ACCGGCAAATCGAAGCGCAGCCGCAATTCATCGAACCGCGCCAGCACCGAGAGCGAGGTTTCGGGAGCAG +CCCCCAGAAAAAACCCCATGCCGGGATCAAGGACAAGGCGGTTGCGTTTGATACCGGCACCCGTCAGCGC +CGCGATGCGCGCGTCAAAGAACGCCGCAATGTGATCCATGATGTCGCCAGCGGGTGCCTCGCGCCGATCT +GCCTGCCCGTCTTGCACCGAATGCATAACGACGAGTTTGGCAGATGATTTCGCCAATTGCGGATAGAACG +CAGCGTCTGGAAAACCGCGAATATCATTGAGATAGGCCACACCACGCGACAAGGCATAGGCTTGCGTCGC +GGGTTGATAACTGTCGAGCGAGACGGGAATGCCATCTGCCTTGAGCGCGTCCAGCACCGGCGCGATACGC +TCGATTTCTGTGTCGGACGAAACAGGCGCGGCGTCGGGGTTGCTGGAT + +>contig10 blaOXA_internal_stop >NZ_CP008798.1 Klebsiella pneumoniae subsp. pneumoniae KPNIH24 plasmid pKPC-484, complete sequence 44200-45800 +TTGCTCTGCTCAAACGCGAAGGCCGGTGCCCGTCTGACGTTGAACACCGACAGATTAAGT +ACCGGAACAACGTGATTGAATGCGATCATGGCAAACTGAAACGGATAATCGGCGCCACGC +TGGGATTTAAATCCATGAAGACGGCTTACGCCACCATCAAAGGTATTGAGGTGATGCGTG +CACTACGCAAAGGCCAGGCCTCAGCATTTTATTATGGTGATCCCCTGGGCGAAATGCGCC +TGGTAAGCAGAGTTTTTGAAATGTAAGGCCTTTGAATAAGACAAAAGGCTGCCTCATCGC +TAACTTTGCAACAGTGCCAGGCAGGCTTATCTTGGACAAGAAGATCGCTTGGCCTCGCGC +GCAGATCAGTTGGAAGAATTTGTTCACTACGTGAAAGGCGAGATCACCAAGGTAGTCGGC +AAATAATGTCTAAAACAAAGTTATGCACCTATTAAGCGCACAGCGGAGCAATGAAGGATA +CCTTGATGAAAAAAATTTTGCTGCTGCATATGTTGGTGTTCGTTTCCGCCACTCTCCCAA +TCAGTTCCGTGGCTTCTGATGAGGTTGAAACGCTTAAATGCACCATCATCGCAGACGCCA +TTACCGGAAATACCTTATATGAGACCGGAGAATGTGCCCGTCGTGTGTCTCCGTGCTCGT +CTTTTAAACTTCCATTGGCAATCATGGGGTTTGATAGTGGAATCTTGCAGTCGCCAAAAT +CACCTACGTGGGAATTGAAGCCGGAATACAACCCGTCTCCGAGAGATCGCACATACAAAC +AAGTCTATCCGGCGCTATGGCAAAGCGACTCTGTTGTCTAGTTCTCGCAGCAATTAACAA +GCCGTCTGGGAGTTGATCGGTTCACGGAATACGTAAAGAAATTTGAGTACGGTAATCAAG +ATGTTTCCGGTGACTCGGGGAAGCATAACGGCTTGACCCAGTCATGGCTGATGTCGTCGC +TCACCATATCTCCCAAGGAGCAAATTCAGTTTCTTCTACGCTTTGTCGCGCATAAGCTGC +CTGTATCCGAAGCGGCTTATGACATGGCGTATGCCACAATCCCGCAGTACCAGGCAGCCG +AAGGATGGGCTGTACATGGAAAAAGCGGCAGCGGCTGGCTTCGGGACAATAACGGCAAGA +TAAATGAAAGTCGTCCGCAGGGCTGGTTCGTGGGCTGGGCTGAAAAAAACGGACGGCAAG +TTGTTTTCGCCCGATTGGAAATAGGAAAGGAAAAGTCCGATATTCCCGGCGGGTCTAAAG +CACGAGAGGATATTCTCGTGGAATTACCCGTGTTGATGGGTAACAAATGATATGTGGCGT +CATCGAGAGCAGATGCATAACCCTGCGCTCGAGCGGACCTCGCGCATAAAGCCGCGCGAG +TCCGCTCACCTTGAACGTTAGATGCACTAAGCACATAATTGCTCACAGCCAAACTATCAG +GTCAAGTCTGCTTTTATTATTTTTAAGCGTGCATAATAAGCCCTACACAAATTGGGAGAT +ATATCATGAAAAGAGTTTGATGCTCAGGGTGTAGCGGTTCGGTTTATTGACGACGGGATC +AGTACCGACGGTGATATGGGGCAAATGGTGGTCACCATCCT + +>contig11 blaTEM divergent, with internal stop codon +AACCCCTATTTGTTTATTTTTCTAAATACATTCAAATATGTATCCGCTTAAGATACAATAACCCTAATAA +ATGCTTCAATAATATTGAAAAAGGAAGAGTATAAGTATTTAACATTTCCGTGTCGCCCTTATTCCCTTTT +TTGCGGCATTTTGCCTTCCTGTTTTTGCTCACCCAGAAACGCTGGTGAAAGTAAAAGATGCTGAAGATCA +GTTGGGTGCACGAGTGAATTACATCGAACTGGATCTCAACAGTGGTAAGATCCTTGAGAGTTTTCGCCCC +GAAGAACGTTTTCCAATGATGAGCACTTTTAAAGTTCTGCTATGTGGCGCGGTATTATCTTGTGTTGACG +CCGGGCAAGAGCAACTCGGTCGCAGTATACACTATTCTCAGAATGACTTGGTTGAGTACTCACCAGTCAC +AGAAAAGCATCTTACGGATGGCATGACACTTAGAGAATTATGCAGTGCTGCCATAACCATGAGTGATAAC +ACTGCGGCCAACTTACTTCTGACAACGATCGGAGGACCGAAGGAGCTAACCGCTTAATTGCACAACATAG +GGGATCATGTAACTCGCCTTGATCGTCCGGAACCGGAGCTGTATGAAGTTATACCAAACGACGAGCGTGA +CACCACGATGCCTGCAGCAATAACAACAACGAAGCGCAAACTATTAACTGGCGAACTACTTACTCTAGCT +TCCCGGCAACAATTAATAGACTGGATGGAGGCGGATAAAGTTGTAGGATTATTTCTGCGCTCGGCCCTTC +CGGCTGGCTGGTTTATTGCTGATAAATCTGGAGGGGGTGAGCGTGGGTCTCACGGTATCATTGCAGCACT +GGGGCCAGATGGTAAGCCCTCCCGTATCGTAGTTATCTATACGACGAGGAGTCAGGCAATTATGGATGAA +CGAAATAGACAGATCGCTGAGATAGGTGCCTCATTGATTAGGCATTGGTAACTGTCAGACCAAGTTTACT +CATA + +>contig12 qacR test for blast rule NBR007712 300492196 21742-21176 plus 70 upstream and downstream +AACCCCTATTTGTTTATTTTTCTAAATACATTCAAATATGTATCCGCTTAAGATACAATAACCCTAATAA +atgaaAttgaaagataaaatactaggtgtcgcaaaggaattatttataaaaaatggatataatgccactactactggaga +aattgttaaattatcagaaagtagtaaagggaatctttattatcactttaaaacaaaagaaaatctatttttagaaattt +taaatatagaagaatctaaatggcaagaacagtggaaaaaggaacaaatcaaatgtaaaactaatagagaaaaattttat +ttatataatgaactatctttaacaaccgaatactattatccacttcaaaacgcaataattgaattttacactgaatatta +taaaactaatagcattaatgaaaaaatgaataaattagaaaacaaatatatagatgcttatcatgtaatttttaaggagg +gtaatttaaatggcgaatggtgtattaatgatgttaatgctgttagtaaaatagcagcaaacgctgttaatggaattgtt +acctttacacatgaacaaaatattaatgaaagaattaaacttatgaacaagttctctcaaatttttttaaatggacttag +taaataa +AACCCCTATTTGTTTATTTTTCTAAATACATTCAAATATGTATCCGCTTAAGATACAATAACCCTAATAA diff --git a/test_prot.expected b/test_prot.expected index c983360..9601305 100644 --- a/test_prot.expected +++ b/test_prot.expected @@ -1,6 +1,12 @@ -Target identifier Gene symbol Protein name Method Target length Reference protein length % Coverage of reference protein % Identity to reference protein Alignment length Accession of closest protein Name of closest protein HMM id HMM description -blaOXA-436_partial blaOXA OXA-48 family class D beta-lactamase PARTIAL 233 265 87.92 100.00 233 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF000387.2 OXA-48 family class D beta-lactamase -blaPDC-114_blast blaPDC PDC family class C beta-lactamase BLAST 397 397 100.00 99.75 397 WP_061189306.1 class C beta-lactamase PDC-114 NF000422.2 PDC family class C beta-lactamase -blaTEM-156 blaTEM-156 class A beta-lactamase TEM-156 ALLELE 286 286 100.00 100.00 286 WP_061158039.1 class A beta-lactamase TEM-156 NF000531.2 TEM family class A beta-lactamase -nimIJ_hmm nimIJ NimIJ family nitroimidazole resistance protein HMM 166 NA NA NA NA NA NA NF000262.1 NimIJ family nitroimidazole resistance protein -vanG vanG D-alanine--D-serine ligase VanG EXACT 349 349 100.00 100.00 349 WP_063856695.1 D-alanine--D-serine ligase VanG NF000091.3 D-alanine--D-serine ligase VanG +Protein identifier Contig id Start Stop Strand Gene symbol Sequence name Scope Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description +blaTEM-156 contig01 101 961 + blaTEM-156 class A beta-lactamase TEM-156 core AMR AMR BETA-LACTAM BETA-LACTAM ALLELEP 286 286 100.00 100.00 286 WP_061158039.1 class A beta-lactamase TEM-156 NF000531.2 TEM family class A beta-lactamase +blaPDC-114_blast contig02 1 1191 + blaPDC PDC family class C beta-lactamase core AMR AMR BETA-LACTAM CEPHALOSPORIN BLASTP 397 397 100.00 99.75 397 WP_061189306.1 class C beta-lactamase PDC-114 NF000422.6 PDC family class C beta-lactamase +blaOXA-436_partial contig03 101 802 + blaOXA OXA-48 family class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 233 265 87.92 100.00 233 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF000387.2 OXA-48 family class D beta-lactamase +vanG contig04 101 1147 + vanG D-alanine--D-serine ligase VanG core AMR AMR GLYCOPEPTIDE VANCOMYCIN EXACTP 349 349 100.00 100.00 349 WP_063856695.1 D-alanine--D-serine ligase VanG NF000091.3 D-alanine--D-serine ligase VanG +gyrA contig06 31 2616 + gyrA_T86I Campylobacter quinolone resistant GyrA core AMR POINT QUINOLONE QUINOLONE POINTP 862 863 99.88 99.54 862 WP_002857904.1 gyrA NA NA +50S_L22 contig07 101 526 + 50S_L22_A103V Campylobacter macrolide resistant 50S L22 core AMR POINT MACROLIDE MACROLIDE POINTP 141 141 100.00 97.16 141 WP_002851214.1 50S L22 NA NA +aph3pp-Ib_partial_5p_neg contig09 1 675 - aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIAL_CONTIG_ENDP 225 275 81.82 99.56 225 WP_109545041.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032895.1 aminoglycoside O-phosphotransferase APH(3'')-Ib +sul2_partial_3p_neg contig09 715 1377 - sul2 sulfonamide-resistant dihydropteroate synthase Sul2 core AMR AMR SULFONAMIDE SULFONAMIDE PARTIALP 221 271 81.55 100.00 221 WP_001043265.1 sulfonamide-resistant dihydropteroate synthase Sul2 NF000295.1 sulfonamide-resistant dihydropteroate synthase Sul2 +blaTEM-internal_stop contig11 113 547 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 144 286 50.35 97.22 144 WP_000027057.1 class A broad-spectrum beta-lactamase TEM-1 NF000531.2 TEM family class A beta-lactamase +qacR-curated_blast contig12 71 637 + qacR multidrug-binding transcriptional regulator QacR plus STRESS BIOCIDE QUATERNARY AMMONIUM QUATERNARY AMMONIUM BLASTP 188 188 100.00 99.47 188 ADK23698.1 multidrug-binding transcriptional regulator QacR NA NA +nimIJ_hmm contigX 1 501 + nimIJ NimIJ family nitroimidazole resistance protein core AMR AMR NITROIMIDAZOLE NITROIMIDAZOLE HMM 166 NA NA NA NA NA NA NF000262.1 NimIJ family nitroimidazole resistance protein diff --git a/test_prot.fa b/test_prot.fa index 9a6ce32..85de8f6 100644 --- a/test_prot.fa +++ b/test_prot.fa @@ -41,3 +41,19 @@ RPRARGSASRIRKPTSHILVEVVKAEVKAEEKKTVAKKTTTTKAPAKKTTSTKKATAKKES MSEFREMRRKRQQLTDADSIAVLQKATSGTLALLGDNDYPYAVPISYVYDNGKLYFHSAMAGHKVDAIRR CNKASFCVIEKDDVRPEKYTTYFRSVIAFGRIEIVEDEAEKRTIMHMMGNRFNPNHDDALQKELESGLAH MLAIRMDIEHLTGKEAIELVRQRGGN + +>aph3pp-Ib_partial_5p_neg NZ_QKNQ01000001.1 Providencia rettgeri strain Pret_2032, whole genome shotgun sequence 2160922-2162737 150-1527 704-137 +IRKLKEPPLNRTNIFFGESHSDWLPVRGGESGDFVFRRGDGHAFAKIAPASRRGELAGERDRLIWLKGRGVACPEVINWQEEQEGACLVITAIPGVPAADLSGADLLKAWPSMGQQLGAVHSLSVDQCPFERRLSRMFGRAVDVVSRNAVNPDFLPDEDKSTPQLDLLARVERELPVRLDQERTDMVVCHGDPCMPNFMVDPKTLQCTGLIDLGRLGTADRYADL + +>sul2_partial_3p_neg NZ_QKNQ01000001.1 Providencia rettgeri strain Pret_2032, whole genome shotgun sequence 2160922-2162737 150-1377 2-667 +SSNPDAAPVSSDTEIERIAPVLDALKADGIPVSLDSYQPATQAYALSRGVAYLNDIRGFPDAAFYPQLAKSSAKLVVMHSVQDGQADRREAPAGDIMDHIAAFFDARIAALTGAGIKRNRLVLDPGMGFFLGAAPETSLSVLARFDELRLRFDLPVLLSVSRKSFLRALTGRGPGDVGAATLAAELAAAAGGADFIRTHEPRPLRDGLAVLAALKETARIR + +>blaTEM-internal_stop +HFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVNYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSCVDAG +QEQLGRSIHYSQNDLVEYSPVTEKHLTDGMTLRELCSAAITMSDNTAANLLLTTIGGPKELTA + +>qacR-curated_blast (ADK23698.1) +MKLKDKILGVAKELFIKNGYNATTTGEIVKLSESSKGNLYYHFKTKENLFLEILNIEESKWQEQWKKEQI +KCKTNREKFYLYNELSLTTEYYYPLQNAIIEFYTEYYKTNSINEKMNKLENKYIDAYHVIFKEGNLNGEW +CINDVNAVSKIAANAVNGIVTFTHEQNINERIKLMNKFSQIFLNGLSK + diff --git a/test_prot.gff b/test_prot.gff index b5057db..11f81dd 100644 --- a/test_prot.gff +++ b/test_prot.gff @@ -1,9 +1,13 @@ ##gff-version 3 ##sequence-region contig1 1-50000 -contig1 . gene 101 961 . + . ID=gene1;Name=blaTEM-156 +contig01 . gene 101 961 . + . ID=gene1;Name=blaTEM-156 contigX . gene 1 501 . + . ID=gene2;Name=nimIJ_hmm -contig2 . gene 1 1191 . + . ID=gene3;Name=blaPDC-114_blast -contig3 . gene 101 802 . + . ID=gene4;Name=blaOXA-436_partial -contig4 . gene 101 1147 . + . ID=gene5;Name=vanG -contig6 . gene 31 2616 . + . ID=gene6;Name=gyrA -contig7 . gene 100 525 . + . ID=gene7;Name=50S_L22 +contig02 . gene 1 1191 . + . ID=gene3;Name=blaPDC-114_blast +contig03 . gene 101 802 . + . ID=gene4;Name=blaOXA-436_partial +contig04 . gene 101 1147 . + . ID=gene5;Name=vanG +contig06 . gene 31 2616 . + . ID=gene6;Name=gyrA +contig07 . gene 101 526 . + . ID=gene7;Name=50S_L22 +contig09 . gene 1 675 . - . Name=aph3pp-Ib_partial_5p_neg +contig09 . gene 715 1377 . - . Name=sul2_partial_3p_neg +contig11 . gene 113 547 . + . Name=blaTEM-internal_stop +contig12 . gene 71 637 . + . Name=qacR-curated_blast