Peptides 2.0

dosorio · Mar 12, 2017 · d69e966 · d69e966
2 parents 70a50c4 + ceb646c
commit d69e966
Show file tree

Hide file tree

Showing 100 changed files with 2,030 additions and 1,013 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,14 +1,12 @@
 Package: Peptides
-Version: 1.2.1
-Date: 2017-02-20
-Title: Calculate Indices and Theoretical Properties of Protein Sequences
-Author: Daniel Osorio, Paola Rondon-Villarreal and Rodrigo Torres.
+Version: 2.0.0
+Date: 2017-03-12
+Title: Calculate Indices and Theoretical Physicochemical Properties of Protein Sequences
+Authors@R: c(person("Daniel","Osorio",email="[email protected]",role=c("aut","cre")),person("Paola","Rondon-Villarreal",role=c("aut","ths")),person("Rodrigo","Torres",role=c("aut","ths")),person("J. Sebastian","Paez",email="[email protected]",role=c("ctb")))
 Maintainer: Daniel Osorio <[email protected]>
 URL: https://github.com/dosorio/Peptides/
 Suggests:
-    RUnit
-Description: Calculate physicochemical properties and indices from amino-acid
-    sequences of peptides and proteins. Include also the option to read and plot
-    output files from the 'GROMACS' molecular dynamics package.
+    testthat
+Description: Includes functions to calculate several physicochemical properties and indices for amino-acid sequences as well as to read and plot 'XVG' output files from the 'GROMACS' molecular dynamics package.
 License: GPL-2
 RoxygenNote: 6.0.1
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,23 +1,33 @@
 # Generated by roxygen2: do not edit by hand
 
-S3method(plot,xvg)
-export(aacomp)
-export(aindex)
+export(aIndex)
+export(aaComp)
+export(aaDescriptors)
 export(autoCorrelation)
 export(autoCovariance)
+export(blosumIndices)
 export(boman)
 export(charge)
 export(crossCovariance)
 export(crucianiProperties)
+export(fasgaiVectors)
 export(hmoment)
 export(hydrophobicity)
-export(instaindex)
+export(instaIndex)
 export(kideraFactors)
 export(lengthpep)
 export(membpos)
+export(mswhimScores)
 export(mw)
 export(pI)
-export(plot.xvg)
-export(read.xvg)
+export(plotXVG)
+export(protFP)
+export(readXVG)
+export(stScales)
+export(tScales)
+export(vhseScales)
+export(zScales)
+importFrom(graphics,par)
+importFrom(graphics,title)
 importFrom(stats,embed)
 importFrom(utils,data)
diff --git a/NEWS → NEWS.md b/NEWS → NEWS.md
@@ -1,30 +1,60 @@
 NEWS
 ====
+**Peptides v.2.0.0**
+
+* All datasets were unified into AAdata
+
+* All tests were migrated to testthat
+
+* readXVG and plotXVG functions were improved by J. Sebastian Paez
+
+* kideraFactors output vector was renamed as KF#
+
+* All sequences are now checked before property calculation
+
+* aaDescriptors, fasgaiVectors, blosumIndices, mswhimScores, zScales, vhseScales, protFP, tScales and stScales functions were added
+
+**Peptides v.1.2.2**
+
+* crucianiProperties function was added.
+
 **Peptides v.1.2.1**
+
 * Four new functions were added: autoCorrelation, autoCovariance, crossCovariance and crucianiProperties
+
 * Functions related with XVG files were updated.
+
 * Documentation was changed to roxygen2
 
 **Peptides v.1.1.2**
+
 * All functions were re-vectorized to support set of peptides as input
+
 * Kidera function now returns all factors in a unique output
 
 **Peptides v.1.1.1**
+
 * The mw function now computes the molecular weight using monoisotopic values
+
 * A problem with blank spaces was solved
 
 **Peptides v.1.1.0**
+
 * The kidera function and Kfactors dataset was included.
 
 **Peptides v.1.0.4**
+
 * A instaindex function bug has been fixed.
+
 * A problem with line breaks in sequences was solved.
 
 **Peptides v.1.0.3**
 * A membpos function bug has been fixed.
+
 * The results now are not rounded.
 
 **Peptides v.1.0.2**
+
 * Hydrophobicity function now can compute the GRAVY index with one of the 38 scales includes in Peptides (*new):
 
   1. **Aboderin:** Aboderin, A. A. (1971). An empirical hydrophobicity scale for α-amino-acids and some of its applications. International Journal of Biochemistry, 2(11), 537-544.

diff --git a/R/aaCheck.R b/R/aaCheck.R
@@ -0,0 +1,12 @@
+# Internal helper: upper-cases the input, strips whitespace, splits each sequence into single residues and warns once per sequence holding symbols outside the 20 standard amino acids or the gap "-".
+aaCheck <- function(seq){
+  # Result: a list with one character vector of residues per input sequence
+  seq <- strsplit(x = gsub(pattern = "[[:space:]]+",replacement = "",x = toupper(seq)),split = "")
+  valid <- c("A" ,"C" ,"D" ,"E" ,"F" ,"G" ,"H" ,"I" ,"K" ,"L" ,"M" ,"N" ,"P" ,"Q" ,"R" ,"S" ,"T" ,"V" ,"W" ,"Y", "-")
+  # Test every sequence on its own; inspecting seq[[1]] here would flag all sequences or none based on the first one only
+  check <- vapply(seq,function(sequence){ !all(sequence%in%valid) },logical(1))
+  for(sequence in which(check)){
+    warning(paste0("Sequence ",sequence," has unrecognized amino acid types. Output value might be wrong calculated"),call. = FALSE)
+  }
+  return(seq)
+}
diff --git a/R/aaDescriptors.R b/R/aaDescriptors.R
@@ -0,0 +1,51 @@
+#' @export aaDescriptors
+#' @title Compute 66 descriptors for each amino acid of a protein sequence.
+#' @description Returns the values of 66 amino acid descriptors for every residue of the given sequence(s). The descriptors are taken from the following sets: \itemize{
+#' \item{crucianiProperties:} Cruciani, G., Baroni, M., Carosati, E., Clementi, M., Valigi, R., and Clementi, S. (2004) Peptide studies by means of principal properties of amino acids derived from MIF descriptors. J. Chemom. 18, 146-155.,
+#' \item{kideraFactors:} Kidera, A., Konishi, Y., Oka, M., Ooi, T., & Scheraga, H. A. (1985). Statistical analysis of the physical properties of the 20 naturally occurring amino acids. Journal of Protein Chemistry, 4(1), 23-55.,
+#' \item{zScales:} Sandberg M, Eriksson L, Jonsson J, Sjostrom M, Wold S: New chemical descriptors relevant for the design of biologically active peptides. A multivariate characterization of 87 amino acids. J Med Chem 1998, 41:2481-2491.,
+#' \item{FASGAI:} Liang, G., & Li, Z. (2007). Factor analysis scale of generalized amino acid information as the source of a new set of descriptors for elucidating the structure and activity relationships of cationic antimicrobial peptides. Molecular Informatics, 26(6), 754-763.,
+#' \item{tScales:} Tian F, Zhou P, Li Z: T-scale as a novel vector of topological descriptors for amino acids and its application in QSARs of peptides. J Mol Struct. 2007, 830: 106-115. 10.1016/j.molstruc.2006.07.004.,
+#' \item{VHSE:} VHSE-scales (principal components score Vectors of Hydrophobic, Steric, and Electronic properties), is derived from principal components analysis (PCA) on independent families of 18 hydrophobic properties, 17 steric properties, and 15 electronic properties, respectively, which are included in total 50 physicochemical variables of 20 coded amino acids.,
+#' \item{protFP:} van Westen, G. J., Swier, R. F., Wegner, J. K., IJzerman, A. P., van Vlijmen, H. W., & Bender, A. (2013). Benchmarking of protein descriptor sets in proteochemometric modeling (part 1): comparative study of 13 amino acid descriptor sets. Journal of cheminformatics, 5(1), 41.,
+#' \item{stScales:} Yang, L., Shu, M., Ma, K., Mei, H., Jiang, Y., & Li, Z. (2010). ST-scale as a novel amino acid descriptor and its application in QSAM of peptides and analogues. Amino acids, 38(3), 805-816.,
+#' \item{BLOSUM:} Georgiev, A. G. (2009). Interpretable numerical descriptors of amino acid space. Journal of Computational Biology, 16(5), 703-723.,
+#' \item{MSWHIM:} Zaliani, A., & Gancia, E. (1999). MS-WHIM scores for amino acids: a new 3D-description for peptide QSAR and QSPR studies. Journal of chemical information and computer sciences, 39(3), 525-533.
+#' }
+#' @param seq An amino-acid sequence, or a vector of sequences that all have the same length (gap symbols are allowed).
+#' @return A matrix with one row per sequence and one column per descriptor and residue position; columns are named \code{<descriptor>.<position>}.
+#' @examples aaDescriptors(seq = "KLKLLLLLKLK")
+aaDescriptors <- function(seq){
+  # Check the input and split every sequence into its residues
+  seq <- aaCheck(seq)
+  sequences <- length(seq)
+  # Length validation: every sequence must be as long as the first one
+  if(!all(lengths(seq)==length(seq[[1]]))){
+    stop("All sequences must have the same length.")
+  }
+  # Look up one residue in every descriptor table and concatenate the rows found
+  residueDescriptors <- function(aa){
+    c(data.frame(AAdata$crucianiProperties)[aa,],
+      data.frame(AAdata$kideraFactors)[aa,],
+      data.frame(AAdata$zScales)[aa,],
+      data.frame(AAdata$FASGAI)[aa,],
+      data.frame(AAdata$tScales)[aa,],
+      data.frame(AAdata$VHSE)[aa,],
+      data.frame(AAdata$ProtFP)[aa,],
+      data.frame(AAdata$stScales)[aa,],
+      data.frame(AAdata$BLOSUM)[aa,],
+      data.frame(AAdata$MSWHIM)[aa,]
+    )
+  }
+  # One table per sequence: descriptors in rows, residues in columns
+  desc <- lapply(seq,function(residues){ sapply(residues,residueDescriptors) })
+  # Column names pair each descriptor name with the residue position
+  first <- desc[[1]]
+  col_names <- as.vector(outer(rownames(first),seq_len(dim(first)[2]),paste,sep="."))
+  # Flatten every table into one row of the output matrix
+  descriptors <- matrix(data = NA,nrow = sequences,ncol = length(col_names),dimnames = list(list(),col_names))
+  for(index in seq_along(desc)){
+    descriptors[index,] <- as.numeric(desc[[index]])
+  }
+  return(descriptors)
+}
diff --git a/R/aacomp.R b/R/aacomp.R
@@ -1,10 +1,10 @@
-#' @export aacomp
+#' @export aaComp
 #' @title Compute the amino acid composition of a protein sequence
 #' @description This function calculates the amount of amino acids of a particular class and classified as: Tiny, Small, Aliphatic, Aromatic, Non-polar, Polar, Charged, Basic and Acidic based on their size and R-groups using same function implemented in EMBOSS 'pepstat'. 
 #' The output is a matrix with the number and percentage of amino acids of a particular class 
 #' @details Amino acids are zwitterionic molecules with an amine and a carboxyl group present in their structure. 
 #' Some amino acids possess side chains with specific properties that allow grouping them in different ways. 
-#' The \code{aacomp} function classifies amino acids based on their size, side chains, hydrophobicity, charge and their response to pH 7.
+#' The \code{aaComp} function classifies amino acids based on their size, side chains, hydrophobicity, charge and their response to pH 7.
 #' @param seq An amino-acid sequence
 #' @return  The output is a matrix with the number and percentage of amino acids of a particular class
 #' \itemize{
@@ -35,7 +35,7 @@
 #' # Acidic        (B+D+E+Z)               0   00.000
 #' 
 #' ## AA composition of PDB: 1D9J Cecropin Peptide
-#' aacomp("KWKLFKKIGIGKFLHSAKKFX")
+#' aaComp(seq= "KWKLFKKIGIGKFLHSAKKFX")
 #' 
 #' ## Output
 #' #           Number  Mole %
@@ -49,11 +49,11 @@
 #' # Basic          8 38.095
 #' # Acidic         0  0.000
 
-aacomp<-function(seq){
+aaComp<-function(seq){
   # Remove space characters: tab, newline, vertical tab, form feed, carriage return, space and possibly other locale-dependent characters.
-  seq <- gsub("[[:space:]]+","",as.vector(seq))
-  # Divide the amino acid sequence and makes a frequencies table
-  seq <- lapply(seq, function(seq){table(unlist(strsplit(seq,"")))})
+  seq <- aaCheck(seq)
+  # Make a frequencies table
+  seq <- lapply(seq, function(seq){table(unlist(seq))})
   # Applying composition function
   aacomp <- lapply(seq, function(seq){
     # Create data matrix output

diff --git a/R/aindex.R b/R/aindex.R
@@ -1,7 +1,9 @@
-#' @export aindex
+#' @export aIndex
 #' @title Compute the aliphatic index of a protein sequence
 #' @description This function calculates the Ikai (1980) aliphatic index of a protein. The \code{aindex} is defined as the relative volume occupied by aliphatic side chains (Alanine, Valine, Isoleucine, and Leucine). It may be regarded as a positive factor for the increase of thermostability of globular proteins.
+#'
 #' @param seq An amino-acids sequence
+#'
 #' @return The computed aliphatic index for a given amino-acids sequence
 #' @references Ikai (1980). Thermostability and aliphatic index of globular proteins. Journal of Biochemistry, 88(6), 1895-1898.
 #' @details Aliphatic amino acids (A, I, L and V) are responsible for the thermal stability of proteins. The aliphatic index was proposed by Ikai (1980) and evaluates the thermostability of proteins based on the percentage of each of the aliphatic amino acids that build up proteins.
@@ -10,16 +12,13 @@
 #' # SEQUENCE: SDKEVDEVDAALSDLEITLE
 #' # Aliphatic index: 117.00
 #' 
-#' aindex("SDKEVDEVDAALSDLEITLE")
+#' aIndex(seq = "SDKEVDEVDAALSDLEITLE")
 #' # [1] 117
 
-aindex <- function(seq) {
-  seq <- gsub("[[:space:]]+", "", seq)
+aIndex <- function(seq) {
+  seq <- aaCheck(seq)
   # Divide the amino acid sequence and extracts the relative frequency of Alanine, Valine, Leucine and Isoleucine
-  seq <-
-    lapply(seq, function(seq) {
-      table(unlist(strsplit(seq, ""))) / nchar(seq)
-    })
+  seq <- lapply(seq, function(seq) { table(unlist(seq)) / length(seq) })
   # Aliphatic index = X(Ala) + a * X(Val) + b * ( X(Ile) + X(Leu) )
   # where X(Ala), X(Val), X(Ile), and X(Leu) are mole percent (100 X mole fraction)
   # of alanine, valine, isoleucine, and leucine.

diff --git a/R/autocorrelation.R b/R/autocorrelation.R
@@ -11,13 +11,13 @@
 #' @references Cruciani, G., Baroni, M., Carosati, E., Clementi, M., Valigi, R., and Clementi, S. (2004) Peptide studies by means of principal properties of amino acids derived from MIF descriptors. J. Chemom. 18, 146-155.
 #' @examples
 #' # Loading a property to evaluate its autocorrelation
-#' data(H)
+#' data(AAdata)
 #'
 #' # Calculate the auto-correlation index for a lag=1
 #' autoCorrelation(
 #'   sequence = "SDKEVDEVDAALSDLEITLE",
 #'   lag = 1,
-#'   property = H$KyteDoolittle,
+#'   property = AAdata$Hydrophobicity$KyteDoolittle,
 #'   center = TRUE
 #' )
 #' # [1] -0.3519908
@@ -26,18 +26,19 @@
 #' autoCorrelation(
 #'   sequence = "SDKEVDEVDAALSDLEITLE",
 #'   lag = 5,
-#'   property = H$KyteDoolittle,
+#'   property = AAdata$Hydrophobicity$KyteDoolittle,
 #'   center = TRUE
 #' )
 #' # [1] 0.001133553
 autoCorrelation <-
   function(sequence, lag, property, center = TRUE) {
     if (center == TRUE) {
-      property <- scale(property)[, ]
+      property <- scale(property)[,]
     }
-    sequence <- gsub("[[:space:]]+", "", sequence)
-    if (lag < (min(nchar(sequence)) - 1)) {
-      sequence <- strsplit(sequence, "")
+    # Split sequence by amino acids
+    sequence <- aaCheck(sequence)
+    if (lag < (min(lengths(sequence)) - 1)) {
+      # Apply the Cruciani formula
       unlist(lapply(sequence, function(sequence) {
         sum(sapply(seq_len(length(sequence) - lag), function(position) {
           property[sequence[[position]]] * property[sequence[[position + lag]]]

diff --git a/R/autocovariance.R b/R/autocovariance.R
@@ -11,13 +11,13 @@
 #' @references Cruciani, G., Baroni, M., Carosati, E., Clementi, M., Valigi, R., and Clementi, S. (2004) Peptide studies by means of principal properties of amino acids derived from MIF descriptors. J. Chemom. 18, 146-155.
 #' @examples
 #' # Loading a property to evaluate its autocorrelation
-#' data(H)
+#' data(AAdata)
 #'
 #' # Calculate the auto-covariance index for a lag=1
 #' autoCovariance(
 #'   sequence = "SDKEVDEVDAALSDLEITLE",
 #'   lag = 1,
-#'   property = H$KyteDoolittle,
+#'   property = AAdata$Hydrophobicity$KyteDoolittle,
 #'   center = TRUE
 #' )
 #' # [1] -0.4140053
@@ -26,17 +26,18 @@
 #' autoCovariance(
 #'   sequence = "SDKEVDEVDAALSDLEITLE",
 #'   lag = 5,
-#'   property = H$KyteDoolittle,
+#'   property = AAdata$Hydrophobicity$KyteDoolittle,
 #'   center = TRUE
 #' )
 #' # [1] 0.001000336
 autoCovariance <- function(sequence, lag, property, center = TRUE) {
   if (center == TRUE) {
     property <- scale(property)[,]
   }
-  sequence <- gsub("[[:space:]]+", "", sequence)
-  if (lag < (min(nchar(sequence)) - 1)) {
-    sequence <- strsplit(sequence, "")
+  # Split sequence by amino acids
+  sequence <- aaCheck(sequence)
+  if (lag < (min(lengths(sequence)) - 1)) {
+    # Apply the Cruciani formula
     unlist(lapply(sequence, function(sequence) {
       sum(sapply(seq_len(length(sequence) - lag), function(position) {
         property[sequence[[position]]] * property[sequence[[position + lag]]]

diff --git a/R/blosumIndices.R b/R/blosumIndices.R
@@ -0,0 +1,30 @@
+#' @export blosumIndices
+#' @title Compute the BLOSUM62 derived indices of a protein sequence
+#' @description BLOSUM indices were derived from physicochemical properties that have been subjected to a VARIMAX analysis and an alignment matrix of the 20 natural AAs using the BLOSUM62 matrix.
+#' @references Georgiev, A. G. (2009). Interpretable numerical descriptors of amino acid space. Journal of Computational Biology, 16(5), 703-723.
+#' @param seq An amino-acid sequence (or a vector of sequences)
+#' @return A list with one element per sequence, each holding the average of every BLOSUM index over the amino acids of that sequence.
+#' @examples blosumIndices(seq = "KLKLLLLLKLK")
+#' # [[1]]
+#' #   BLOSUM1    BLOSUM2    BLOSUM3    BLOSUM4    BLOSUM5    
+#' # -0.4827273 -0.5618182 -0.8509091 -0.4172727  0.3172727  
+#' 
+#' #  BLOSUM6   BLOSUM7     BLOSUM8    BLOSUM9   BLOSUM10 
+#' # 0.2527273  0.1463636  0.1427273 -0.2145455 -0.3218182 
+#' 
+blosumIndices <- function(seq) {
+  # Check the input and split every sequence into its residues
+  seq <- aaCheck(seq)
+  # BLOSUM-derived scales shipped in the package data
+  scales <- AAdata$BLOSUM
+  # Average of one scale over a sequence: NA values are left out of the sum,
+  # but the divisor is still the full number of residues
+  averageScale <- function(residues, scale) {
+    (sum(scales[[scale]][residues], na.rm = TRUE) / length(residues))
+  }
+  # Evaluate every scale for every sequence
+  lapply(seq, function(residues) {
+    sapply(names(scales), function(scale) averageScale(residues, scale))
+  })
+}
+