Skip to content

Commit

Permalink
Version 3.1.1b with blacklisting
Browse files Browse the repository at this point in the history
  • Loading branch information
evolarjun committed Aug 22, 2019
2 parents b11e5a9 + 83c2dfb commit 338dfaa
Show file tree
Hide file tree
Showing 7 changed files with 114 additions and 61 deletions.
11 changes: 8 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,19 @@ else
endif
SVNREV := -D'SVN_REV="$(VERSION_STRING)"'

# make it possible to hard define a database directory

# Define default paths
PREFIX ?= /usr/local
INSTALL=install
ifneq '$(DEFAULT_DB_DIR)' ''
DBDIR := -D'DEFAULT_DB_DIR="$(DEFAULT_DB_DIR)"'
endif

CPPFLAGS = -std=gnu++11 -pthread -malign-double -fno-math-errno -O3 \
CPPFLAGS = -std=gnu++11 -pthread -malign-double -fno-math-errno -O3

CXX=g++
COMPILE.cpp= $(CXX) $(CPPFLAGS) $(SVNREV) -c
COMPILE.cpp= $(CXX) $(CPPFLAGS) $(SVNREV) $(DBDIR) -c


.PHONY: all clean install release
Expand All @@ -61,7 +66,7 @@ amr_report: $(amr_reportOBJS)
amrfinder.o: common.hpp common.inc amrfinder.inc
amrfinderOBJS=amrfinder.o common.o
amrfinder: $(amrfinderOBJS)
$(CXX) -o $@ $(amrfinderOBJS) -pthread
$(CXX) -o $@ $(amrfinderOBJS) -pthread $(DBDIR)

amrfinder_update.o: common.hpp common.inc amrfinder.inc
amrfinder_updateOBJS=amrfinder_update.o common.o
Expand Down
47 changes: 33 additions & 14 deletions amr_report.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -238,12 +238,9 @@ struct HmmAlignment


bool good () const
{ /*cout << fam << endl;
if (fam)
cout << fam->id << ' ' << fam->hmm << ' ' << fam->tc1 << ' ' << fam->tc2 << endl; */
return fam
&& ! fam->hmm. empty ()
&& score1 >= fam->tc1
{ QC_ASSERT (fam);
QC_ASSERT (! fam->hmm. empty ());
return score1 >= fam->tc1
&& score2 >= fam->tc2
//&& fam->reportable
;
Expand Down Expand Up @@ -1151,6 +1148,7 @@ struct Batch
// Reference input
map<string/*hmm*/,const Fam*> hmm2fam;
uchar reportable_min {0};
Vector<long> suppress_prots;

// Target input
typedef List<BlastAlignment> BlastAls;
Expand All @@ -1168,6 +1166,7 @@ struct Batch
Batch (const string &famFName,
const string &organism,
const string &point_mut,
const string &suppress_prot_FName,
bool non_reportable,
bool report_core_only)
: reportable_min (non_reportable
Expand Down Expand Up @@ -1267,7 +1266,6 @@ struct Batch
QC_ASSERT (roots == 1);
}


if (! organism. empty ())
{
if (verbose ())
Expand All @@ -1281,21 +1279,38 @@ struct Batch
char alleleChar;
iss. reset (f. line);
iss >> organism_ >> accession >> pos >> alleleChar >> geneMutation >> geneMutationGen >> classS >> subclass >> name;
ASSERT (pos > 0);
QC_ASSERT (pos > 0);
replace (organism_, '_', ' ');
if (organism_ == organism)
accession2pointMuts [accession] << move (PointMut ((size_t) pos, alleleChar, geneMutation, geneMutationGen, classS, subclass, name));
}
#if 0
// PD-2008
if (accession2pointMuts. empty ())
throw runtime_error ("No protein point mutations for organism " + strQuote (organism) + " found in the AMRFinder database. Please check the " + strQuote ("organism") + " option.");
#endif
for (auto& it : accession2pointMuts)
{
it. second. sort ();
//if (! it. second. isUniq ())
//throw runtime_error ("Duplicate mutations for " + it. first);
}
}

if (! suppress_prot_FName. empty ())
{
ifstream f (suppress_prot_FName);
while (! f. eof ())
{
long gi = 0;
f >> gi;
ASSERT (gi >= 0);
if (gi == 0)
break;
suppress_prots << gi;
}
}
suppress_prots. sort ();
}
private:
static void toProb (double &x)
Expand Down Expand Up @@ -1577,7 +1592,9 @@ struct Batch
;
else
blastAl. saveText (os);
else if (blastAl. getFam () -> reportable >= reportable_min)
else if ( blastAl. getFam () -> reportable >= reportable_min
&& ! suppress_prots. containsFast (blastAl. gi)
)
blastAl. saveText (os);
}
}
Expand Down Expand Up @@ -1708,6 +1725,7 @@ struct ThisApplication : Application
addKey ("organism", "Taxonomy group for point mutations");
addKey ("point_mut", "Point mutation table");
addKey ("point_mut_all", "File to report all target positions of reference point mutations");
addKey ("suppress_prot", "File with protein GIs to suppress");
addKey ("ident_min", "Min. identity to the reference protein (0..1). -1 means use a curated threshold if it exists and " + toString (ident_min_def) + " otherwise", "-1");
addKey ("coverage_min", "Min. coverage of the reference protein (0..1) for partial hits", toString (partial_coverage_min_def));
addFlag ("skip_hmm_check", "Skip checking HMM for a BLAST hit");
Expand Down Expand Up @@ -1744,6 +1762,7 @@ struct ThisApplication : Application
const string organism = getArg ("organism");
const string point_mut = getArg ("point_mut");
const string point_mut_all_FName = getArg ("point_mut_all");
const string suppress_prot_FName = getArg ("suppress_prot");
double ident_min = str2<double> (getArg ("ident_min"));
const double partial_coverage_min = str2<double> (getArg ("coverage_min"));
const bool skip_hmm_check = getFlag ("skip_hmm_check");
Expand All @@ -1756,12 +1775,12 @@ struct ThisApplication : Application
const bool nosame = getFlag ("nosame");
const bool noblast = getFlag ("noblast");

ASSERT (hmmsearch. empty () == hmmDom. empty ());
IMPLY (! outFName. empty (), ! blastpFName. empty ());
IMPLY (! gffFName. empty (), ! blastpFName. empty ());
QC_ASSERT (hmmsearch. empty () == hmmDom. empty ());
QC_IMPLY (! outFName. empty (), ! blastpFName. empty ());
QC_IMPLY (! gffFName. empty (), ! blastpFName. empty ());
if (! blastpFName. empty () && ! blastxFName. empty () && gffFName. empty ())
throw runtime_error ("If BLASTP and BLASTX files are present then a GFF file must be present");


if (ident_min == -1.0)
ident_min = ident_min_def;
Expand Down Expand Up @@ -1790,7 +1809,7 @@ struct ThisApplication : Application
point_mut_all. reset (new OFStream (point_mut_all_FName));


Batch batch (famFName, organism, point_mut, non_reportable, report_core_only);
Batch batch (famFName, organism, point_mut, suppress_prot_FName, non_reportable, report_core_only);


// Input
Expand Down
102 changes: 65 additions & 37 deletions amrfinder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ constexpr double ident_min_def = 0.9;
constexpr double partial_coverage_min_def = 0.5;


#define ORGANISMS "Campylobacter|Escherichia|Klebsiella|Salmonella|Staphylococcus|Vibrio" // from table Taxgroup



// ThisApplication
Expand All @@ -69,14 +71,15 @@ struct ThisApplication : ShellApplication
{
addKey ("protein", "Protein FASTA file to search", "", 'p', "PROT_FASTA");
addKey ("nucleotide", "Nucleotide FASTA file to search", "", 'n', "NUC_FASTA");
addKey ("gff", "GFF file for protein locations. Protein id should be in the attribute 'Name=<id>' (9th field) of the rows with type 'CDS' or 'gene' (3rd field).", "", 'g', "GFF_FILE");
addKey ("database", "Alternative directory with AMRFinder database. Default: $AMRFINDER_DB", "", 'd', "DATABASE_DIR");
addFlag ("update", "Update the AMRFinder database", 'u'); // PD-2379
addKey ("gff", "GFF file for protein locations. Protein id should be in the attribute 'Name=<id>' (9th field) of the rows with type 'CDS' or 'gene' (3rd field).", "", 'g', "GFF_FILE");
addKey ("ident_min", "Minimum identity for nucleotide hit (0..1). -1 means use a curated threshold if it exists and " + toString (ident_min_def) + " otherwise", "-1", 'i', "MIN_IDENT");
addKey ("coverage_min", "Minimum coverage of the reference protein (0..1)", toString (partial_coverage_min_def), 'c', "MIN_COV");
addKey ("organism", "Taxonomy group for point mutation assessment\n " ORGANISMS, "", 'O', "ORGANISM");
addKey ("translation_table", "NCBI genetic code for translated blast", "11", 't', "TRANSLATION_TABLE");
addKey ("organism", "Taxonomy group\n " ORGANISMS, "", 'O', "ORGANISM");
addKey ("translation_table", "NCBI genetic code for translated BLAST", "11", 't', "TRANSLATION_TABLE");
addFlag ("plus", "Add the plus genes to the report"); // PD-2789
addFlag ("report_common", "Suppress proteins common to a taxonomy group"); // PD-2756
addKey ("point_mut_all", "File to report all target positions of reference point mutations", "", '\0', "POINT_MUT_ALL_FILE");
addKey ("blast_bin", "Directory for BLAST. Deafult: $BLAST_BIN", "", '\0', "BLAST_DIR");
addKey ("parm", "amr_report parameters for testing: -nosame -noblast -skip_hmm_check -bed", "", '\0', "PARM");
Expand Down Expand Up @@ -133,22 +136,23 @@ struct ThisApplication : ShellApplication

void shellBody () const final
{
const string prot = shellQuote (getArg ("protein"));
const string dna = shellQuote (getArg ("nucleotide"));
string db = getArg ("database");
const bool update = getFlag ("update");
const string gff = shellQuote (getArg ("gff"));
const double ident = arg2double ("ident_min");
const double cov = arg2double ("coverage_min");
const string organism = shellQuote (getArg ("organism"));
const uint gencode = arg2uint ("translation_table");
const bool add_plus = getFlag ("plus");
const string point_mut_all = getArg ("point_mut_all");
string blast_bin = getArg ("blast_bin");
const string parm = getArg ("parm");
const string output = shellQuote (getArg ("output"));
const bool quiet = getFlag ("quiet");
const bool gpipe = getFlag ("gpipe");
const string prot = shellQuote (getArg ("protein"));
const string dna = shellQuote (getArg ("nucleotide"));
string db = getArg ("database");
const bool update = getFlag ("update");
const string gff = shellQuote (getArg ("gff"));
const double ident = arg2double ("ident_min");
const double cov = arg2double ("coverage_min");
const string organism = shellQuote (getArg ("organism"));
const uint gencode = arg2uint ("translation_table");
const bool add_plus = getFlag ("plus");
const bool report_common = getFlag ("report_common");
const string point_mut_all = getArg ("point_mut_all");
string blast_bin = getArg ("blast_bin");
const string parm = getArg ("parm");
const string output = shellQuote (getArg ("output"));
const bool quiet = getFlag ("quiet");
const bool gpipe = getFlag ("gpipe");


const string logFName (tmp + ".log");
Expand All @@ -167,6 +171,9 @@ struct ThisApplication : ShellApplication
if (cov < 0.0 || cov > 1.0)
throw runtime_error ("coverage_min must be between 0 and 1");

if (report_common && emptyArg (organism))
throw runtime_error ("--report_common requires --organism");


if (! emptyArg (output))
try { OFStream f (unQuote (output)); }
Expand All @@ -185,8 +192,11 @@ struct ThisApplication : ShellApplication
threads_max = threads_max_max;
}


const string defaultDb (execDir + "/data/latest");
#ifdef DEFAULT_DB_DIR
const string defaultDb ((string) DEFAULT_DB_DIR + "/latest");
#else
const string defaultDb (execDir + "/data/latest");
#endif

// db
if (db. empty ())
Expand Down Expand Up @@ -259,7 +269,7 @@ struct ThisApplication : ShellApplication
}
}
if (emptyArg (organism))
includes << key2shortHelp ("organism") + " option to add point-mutation searches";
includes << key2shortHelp ("organism") + " option to add point-mutation searches and suppress common proteins";
else
searchMode += " and point-mutation";

Expand Down Expand Up @@ -287,13 +297,21 @@ struct ThisApplication : ShellApplication
}

string organism1;
bool suppress_common = false;
if (! emptyArg (organism))
{
organism1 = unQuote (organism);
const StringVector organisms (ORGANISMS, '|');
if (! organisms. contains (organism1))
throw runtime_error ("Possible organisms: " + organisms. toString (", "));
replace (organism1, ' ', '_');
ASSERT (! organism1. empty ());
if (! report_common)
suppress_common = true;
}


#if 0
if (! emptyArg (organism))
{
string errMsg;
Expand All @@ -308,9 +326,10 @@ struct ThisApplication : ShellApplication
if (! errMsg. empty ())
throw runtime_error (errMsg + "\nPossible organisms: " ORGANISMS);
}
#endif


const string qcS (qc_on ? "-qc -verbose 1" : "");
const string qcS (qc_on ? " -qc -verbose 1" : "");
const string force_cds_report (! emptyArg (dna) && ! emptyArg (organism) ? "-force_cds_report" : ""); // Needed for point_mut


Expand Down Expand Up @@ -403,9 +422,9 @@ struct ThisApplication : ShellApplication
//ASSERT (threadsAvailable);
if (threadsAvailable >= 2)
{
exec ("mkdir " + tmp + ".chunk");
exec ("mkdir " + tmp + ".chunk", logFName);
exec (fullProg ("fasta2parts") + dna_ + " " + to_string (threadsAvailable) + " " + tmp + ".chunk -log " + logFName, logFName); // PAR
exec ("mkdir " + tmp + ".blastx_dir");
exec ("mkdir " + tmp + ".blastx_dir", logFName);
FileItemGenerator fig (false, true, tmp + ".chunk");
string item;
while (fig. next (item))
Expand All @@ -425,9 +444,11 @@ struct ThisApplication : ShellApplication
blastx_par = "-blastx " + tmp + ".blastx -dna_len " + tmp + ".len";
}

if (! emptyArg (dna) && ! emptyArg (organism))
if ( ! emptyArg (dna)
&& ! emptyArg (organism)
&& fileExists (db + "/AMR_DNA-" + organism1)
)
{
QC_ASSERT (fileExists (db + "/AMR_DNA-" + organism1));
findProg ("blastn");
findProg ("point_mut");
stderr << "Running blastn...\n";
Expand All @@ -438,7 +459,11 @@ struct ThisApplication : ShellApplication


if (blastxChunks)
exec ("cat " + tmp + ".blastx_dir/* > " + tmp + ".blastx");
exec ("cat " + tmp + ".blastx_dir/* > " + tmp + ".blastx", logFName);


if (suppress_common)
exec ("grep -w ^" + organism1 + " " + db + "/AMRProt-suppress | cut -f 2 > " + tmp + ".suppress_prot");


const string point_mut_allS (point_mut_all. empty () ? "" : ("-point_mut_all " + point_mut_all));
Expand All @@ -448,13 +473,16 @@ struct ThisApplication : ShellApplication
+ force_cds_report + " -pseudo" + coreS
+ (ident == -1 ? string () : " -ident_min " + toString (ident))
+ " -coverage_min " + toString (cov)
+ " " + qcS + " " + parm + " -log " + logFName + " > " + tmp + ".amr-raw", logFName);
+ ifS (suppress_common, " -suppress_prot " + tmp + ".suppress_prot")
+ qcS + " " + parm + " -log " + logFName + " > " + tmp + ".amr-raw", logFName);

if (! emptyArg (dna) && ! emptyArg (organism))
if ( ! emptyArg (dna)
&& ! emptyArg (organism)
&& fileExists (db + "/AMR_DNA-" + organism1)
)
{
QC_ASSERT (fileExists (db + "/AMR_DNA-" + organism1));
exec (fullProg ("point_mut") + tmp + ".blastn " + db + "/AMR_DNA-" + organism1 + ".tab " + qcS + " -log " + logFName + " > " + tmp + ".amr-snp", logFName);
exec ("tail -n +2 " + tmp + ".amr-snp >> " + tmp + ".amr-raw");
exec (fullProg ("point_mut") + tmp + ".blastn " + db + "/AMR_DNA-" + organism1 + ".tab" + qcS + " -log " + logFName + " > " + tmp + ".amr-snp", logFName);
exec ("tail -n +2 " + tmp + ".amr-snp >> " + tmp + ".amr-raw", logFName);
}

// $tmp.amr-raw --> $tmp.amr
Expand All @@ -469,14 +497,14 @@ struct ThisApplication : ShellApplication
|| ! emptyArg (gff)
)
sort_cols = " -k2 -k3n -k4n -k5";
exec ("head -1 " + tmp + ".amr-raw > " + tmp + ".amr");
exec ("tail -n +2 " + tmp + ".amr-raw | sort" + sort_cols + " -k1 >> " + tmp + ".amr");
exec ("head -1 " + tmp + ".amr-raw > " + tmp + ".amr", logFName);
exec ("tail -n +2 " + tmp + ".amr-raw | sort" + sort_cols + " -k1 >> " + tmp + ".amr", logFName);


if (emptyArg (output))
exec ("cat " + tmp + ".amr");
exec ("cat " + tmp + ".amr", logFName);
else
exec ("cp " + tmp + ".amr " + output);
exec ("cp " + tmp + ".amr " + output, logFName);
}
};

Expand Down
3 changes: 0 additions & 3 deletions amrfinder.inc
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
#define ORGANISMS "Campylobacter|Escherichia|Salmonella"
#if __APPLE__

#include <sys/types.h>
#include <sys/sysctl.h>

#endif


Loading

0 comments on commit 338dfaa

Please sign in to comment.