Merge pull request #201 from aappling-usgs/master

document output columns in summarizeInputs and summarizeModel
DOI-USGS · Apr 24, 2017 · d844494 · d844494
2 parents bfff27e + bdd3ff8
commit d844494
Show file tree

Hide file tree

Showing 14 changed files with 446 additions and 105 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: loadflex
 Type: Package
 Title: Models and Tools for Watershed Flux Estimates
-Version: 1.1.10
-Date: 2017-03-31
+Version: 1.1.11
+Date: 2017-04-24
 Authors@R: c(
     person("Alison P.", "Appling", , "[email protected]", c("aut", "cre")),
     person("Miguel C.", "Leon", , , "aut"),

diff --git a/R/loadComp.R b/R/loadComp.R
@@ -551,6 +551,7 @@ estimateMSE.loadComp <- function(load.model, n.iter=100, method="parametric", rh
 #' the regression residuals, one for the 'residuals' used to do the composite 
 #' correction).
 #' 
+#' @md
 #' @inheritParams summarizeModel
 #' @param newdata data.frame of data that was/will be used to predict 
 #'   concentration or load; should be the same as the \code{newdata} argument to
@@ -564,7 +565,34 @@ estimateMSE.loadComp <- function(load.model, n.iter=100, method="parametric", rh
 #'   timesteps are found to be irregular. Tests and estimates of autocorrelation
 #'   are weak to wrong when timesteps are irregular, but timesteps are often at 
 #'   least a bit irregular in the real world.
-#' @return A 1-row data.frame of model metrics
+#' @return Returns a 1-row data frame with the following columns:
+#'   
+#'   * `site.id` - the unique identifier of the site, as in [metadata()]
+#'   
+#'   * `constituent` - the unique identifier of the constituent, as in 
+#'   [metadata()]
+#'   
+#'   * `RMSE.lin` or `RMSE.log` - the square root of the mean squared error, in
+#'   log space (`RMSE.log`) if the `use.log` equalled `TRUE` in the call to 
+#'   [loadComp()] that created this model.
+#'   
+#'   * `reg.durbin.watson` - the Durbin Watson test statistic as applied to the 
+#'   residuals from the regression model fitting process
+#'   
+#'   * `reg.rho` - the autocorrelation coefficient of the residuals from the 
+#'   regression model fitting process
+#'   
+#'   * `int.durbin.watson` - the Durbin Watson test statistic as applied to the 
+#'   residuals from the interpolation model fitting process. See 
+#'   [residDurbinWatson()]
+#'   
+#'   * `int.rho` - the autocorrelation coefficient of the residuals from the 
+#'   interpolation model fitting process. See [estimateRho()]
+#'   
+#'   * `correction.frac` - the correction fraction, i.e., the fraction of total 
+#'   concentration or flux prediction that is attributable to a correction such 
+#'   as the residuals correction of composite method models. See 
+#'   [getCorrectionFraction()]
 #' @importFrom dplyr select everything
 #' @export
 #' @family summarizeModel

diff --git a/R/loadInterp.R b/R/loadInterp.R
@@ -427,6 +427,7 @@ estimateMSE.loadInterp <- function(load.model, n.out, n.iter=floor(nrow(getFitti
 #' Produce a 1-row data.frame of model metrics. The relevant metrics for 
 #' loadInterp models include RMSE, p-values, and others TBD.
 #' 
+#' @md
 #' @inheritParams summarizeModel
 #' @param irregular.timesteps.ok logical. If FALSE, this function requires that 
 #'   the timesteps between observations are identical to one another, and a plot
@@ -436,7 +437,20 @@ estimateMSE.loadInterp <- function(load.model, n.out, n.iter=floor(nrow(getFitti
 #'   timesteps are found to be irregular. Tests and estimates of autocorrelation
 #'   are weak to wrong when timesteps are irregular, but timesteps are often at 
 #'   least a bit irregular in the real world.
-#' @return A 1-row data.frame of model metrics
+#' @return Returns a 1-row data frame with the following columns:
+#'   
+#'   * `site.id` - the unique identifier of the site, as in [metadata()]
+#'   
+#'   * `constituent` - the unique identifier of the constituent, as in 
+#'   [metadata()]
+#'   
+#'   * `RMSE.lin`- the square root of the mean squared error
+#'   
+#'   * `durbin.watson` - the Durbin Watson test statistic as applied to the 
+#'   observations used to fit the interpolation model
+#'   
+#'   * `rho`, `acf1`, `acf1demean`, `corlag` - measures of the autocorrelation
+#'   of the observations used to fit the model
 #' @importFrom dplyr select everything
 #' @importFrom car durbinWatsonTest
 #' @importFrom stats arima

diff --git a/R/loadReg.R b/R/loadReg.R
@@ -181,32 +181,53 @@ resampleCoefficients.loadReg <- function(fit, flux.or.conc) {
 
 }
 
-#' @section Metrics:
+#' Extract model summary statistics from an [rloadest::loadReg()] model
+#' 
+#' Produce a 1-row data.frame of model metrics. The relevant metrics for loadReg
+#' models are largely the same as those reported by the `rloadest` package, 
+#' though reported in this streamlined data.frame format for bulk reporting. 
+#' `summarizeModel.loadReg` should rarely be accessed directly; instead, call 
+#' `summarizeModel()` on a [loadReg] object.
+#' 
+#' @md
+#' @inheritParams summarizeModel
+#' @param flux.or.conc character. Which internal model (the flux model or the 
+#'   concentration model) should be summarized? An [rloadest::loadReg] model is 
+#'   actually two different models for (1) flux and (2) concentration, each 
+#'   fitted to the same data and with the same model structure except for 
+#'   whether the left-hand side of the model formula is flux or concentration. 
+#'   Some of the model metrics differ between these two internal models.
+#' @return Returns a 1-row data frame with the following columns:
 #'   
-#'   \describe{
+#'   * `eqn` - the regression equation, possibly in the form `const ~ model(x)`
+#'   where `x` is the `Number` of a pre-defined equation in [rloadest::Models]
 #'   
-#'   \item{\code{Intercept}, \code{lnQ}, \code{DECTIME}, \code{sin.DECTIME}, 
-#'   \code{cos.DECTIME}, etc.}{Coefficient estimates for the named terms. The 
-#'   model formula determines which terms are included. Recall that log(Q) was 
-#'   centered before the coefficients were fit.}
+#'   * `RMSE` - the square root of the mean squared error. Errors will be 
+#'   computed from either fluxes or concentrations, as determined by the value 
+#'   of `pred.format` that was passed to [loadReg2()] when this model was 
+#'   created
 #'   
-#'   \item{\code{RMSE}}{The root mean squared error of the difference between 
-#'   observed and predicted values of concentration.}
+#'   * `r.squared` - the r-squared value, generalized for censored data, 
+#'   describing the amount of observed variation explained by the model
 #'   
-#'   \item{\code{r.squared}}{The proportion of variation explained by the 
-#'   model.}
+#'   * `p.value` - the p-value for the overall model fit
 #'   
-#'   \item{\code{p.value}}{}
+#'   * `cor.resid` - the correlation of the model residuals
 #'   
-#'   }
-#' @rdname summarizeModel.loadReg2
-#' @param flux.or.conc character. Which internal model (the flux model or the 
-#'   concentration model) should be summarized? A \pkg{rloadest} model, and 
-#'   therefore also a \code{loadReg2} model, is actually two different models 
-#'   for (1) flux and (2) concentration, each fitted to the same data and with 
-#'   the same model structure except for whether the left-hand side of the model
-#'   formula is flux or concentration. Some of the model metrics differ between 
-#'   these two internal models.
+#'   * `PPCC` - the probability plot correlation coefficient measuring the 
+#'   normality of the residuals
+#'   
+#'   * `Intercept`, `lnQ`, `lnQ2`, `DECTIME`, `DECTIME2`, `sin.DECTIME`, 
+#'   `cos.DECTIME`, etc. - the fitted value of the intercept and other terms 
+#'   included in this model (list differs by model equation)
+#'   
+#'   * `Intercept.SE`, `lnQ.SE`, `lnQ2.SE`, `DECTIME.SE`, `DECTIME2.SE`, 
+#'   `sin.DECTIME.SE`, `cos.DECTIME.SE`, etc. - the standard error of the fitted
+#'   estimates of these terms
+#'   
+#'   * `Intercept.p.value`, `lnQ.p.value`, `lnQ2.p.value`, `DECTIME.p.value`, 
+#'   `DECTIME2.p.value`, `sin.DECTIME.p.value`, `cos.DECTIME.p.value` - the 
+#'   p-values for each of these model terms
 #' @importFrom smwrStats rmse
 #' @importFrom utils capture.output
 #' @importFrom stats coef

diff --git a/R/loadReg2.R b/R/loadReg2.R
@@ -532,13 +532,50 @@ simulateSolute.loadReg2 <- function(load.model, flux.or.conc=c("flux","conc"), n
 #' Extract model summary statistics from a loadReg2 model
 #' 
 #' Produce a 1-row data.frame of model metrics. The relevant metrics for 
-#' loadReg2 models are largely the same as those reported by the \code{rloadest}
+#' loadReg2 models are largely the same as those reported by the `rloadest` 
 #' package, though reported in this streamlined data.frame format for bulk 
-#' reporting. \code{summarizeModel.loadReg} should rarely be accessed directly;
-#' instead, call \code{summarizeModel()} on a \code{loadReg2} object.
+#' reporting. `summarizeModel.loadReg2` should rarely be accessed directly; 
+#' instead, call `summarizeModel()` on a [loadReg2] object.
 #' 
+#' @md
 #' @inheritParams summarizeModel
-#' @return A 1-row data.frame of model metrics
+#' @return Returns a 1-row data frame with the following columns:
+#'   
+#'   * `site.id` - the unique identifier of the site, as in [metadata()]
+#'   
+#'   * `constituent` - the unique identifier of the constituent, as in 
+#'   [metadata()]
+#'   
+#'   * `eqn` - the regression equation, possibly in the form `const ~ model(x)` 
+#'   where `x` is the `Number` of a pre-defined equation in [rloadest::Models]
+#'   
+#'   * `RMSE` - the square root of the mean squared error. Errors will be 
+#'   computed from either fluxes or concentrations, as determined by the value 
+#'   of `pred.format` that was passed to [loadReg2()] when this model was 
+#'   created
+#'   
+#'   * `r.squared` - the r-squared value, generalized for censored data, 
+#'   describing the amount of observed variation explained by the model
+#'   
+#'   * `p.value` - the p-value for the overall model fit
+#'   
+#'   * `cor.resid` - the correlation of the model residuals
+#'   
+#'   * `PPCC` - the probability plot correlation coefficient measuring the 
+#'   normality of the residuals
+#'   
+#'   * `Intercept`, `lnQ`, `lnQ2`, `DECTIME`, `DECTIME2`, `sin.DECTIME`, 
+#'   `cos.DECTIME`, etc. - the fitted value of the intercept and other terms 
+#'   included in this model (list differs by model equation)
+#'   
+#'   * `Intercept.SE`, `lnQ.SE`, `lnQ2.SE`, `DECTIME.SE`, `DECTIME2.SE`, 
+#'   `sin.DECTIME.SE`, `cos.DECTIME.SE`, etc. - the standard error of the fitted
+#'   estimates of these terms
+#'   
+#'   * `Intercept.p.value`, `lnQ.p.value`, `lnQ2.p.value`, `DECTIME.p.value`, 
+#'   `DECTIME2.p.value`, `sin.DECTIME.p.value`, `cos.DECTIME.p.value` - the 
+#'   p-values for each of these model terms
+#'   
 #' @importFrom dplyr select everything
 #' @export
 #' @family summarizeModel

diff --git a/R/summarizeData.R b/R/summarizeData.R
@@ -1,15 +1,112 @@
 #' Summarize the site and input data
 #' 
-#' @return data frame of the following information:
-#' @export
+#' @md
 #' @param metadata metadata, used to access the appropriate columns of data. At 
-#'   a minimum, \code{metadata} should correctly specify the date column and the
-#'   column indicated by \code{interp.format}.
-#' @param fitdat data frame record of constituent+discharge measurements, for
-#'   fitting a model
-#' @param estdat data frame record of discharge measurements, for making
-#'   predictions from a model
+#'   a minimum, `metadata` should correctly specify the date column and the 
+#'   column indicated by `interp.format`.
+#' @param fitdat data frame of constituent+discharge measurements, for fitting a
+#'   model
+#' @param estdat data frame of discharge measurements, for making predictions 
+#'   (estimations) from a model
+#' @return Returns a 1-row data frame with the following columns:
+#'   
+#'   * `site.name` - the long name of the site, as in [metadata()]
+#'   
+#'   * `site.id` - the unique identifier of the site, as in [metadata()]
+#'   
+#'   * `constituent` - the unique identifier of the constituent, as in 
+#'   [metadata()]
+#'   
+#'   * `consti.name` - the long name of the constituent, as in [metadata()]
+#'   
+#'   * `conc.units` - the units of concentration data, as in [metadata()]
+#'   
+#'   * `flow.units` - the units of discharge data, as in [metadata()]
+#'   
+#'   * `load.units` - the units of load estimates, as in [metadata()]
+#'   
+#'   * `load.rate.units` - the units of load rate units, as in [metadata()]
+#'   
+#'   * `lat` - the decimal latitude of the station where concentration (and 
+#'   possibly also discharge) was measured, as in [metadata()]
+#'   
+#'   * `lon` - the decimal longitude of the station where concentration (and 
+#'   possibly also discharge) was measured, as in [metadata()]
+#'   
+#'   * `basin.area` - the area of the drainage basin contributing water to the 
+#'   site where concentrations were measured, as in [metadata()]
+#'   
+#'   * `flow.site.name` - the long name of the station where flow was monitored,
+#'   as in [metadata()]
+#'   
+#'   * `flow.site.id` - the unique identifier of the station where flow was 
+#'   monitored, as in [metadata()]
+#'   
+#'   * `flow.lat` - the decimal latitude of the station where discharge was 
+#'   measured, as in [metadata()]
+#'   
+#'   * `flow.lon` - the decimal longitude of the station where discharge was 
+#'   measured, as in [metadata()]
+#'   
+#'   * `flow.basin.area` - the area of the drainage basin contributing water to 
+#'   the site where discharge was measured, as in [metadata()]
+#'   
+#'   * `basin.area.units` - the units of the values in `basin.area` and 
+#'   `flow.basin.area` (same for both), as in [metadata()]
+#'   
+#'   * `[custom]` - if the `custom` slot of `metadata` is a 1-row data.frame or 
+#'   a list of length-1 elements, the columns of that data.frame are included in
+#'   this summary
+#'   
+#'   * `basin.area.ratio.QC` - the ratio of `flow.basin.area` to `basin.area`
+#'   
+#'   * `fitdat.start` - the date of the first observation in the model fitting 
+#'   data
+#'   
+#'   * `fitdat.end` - the date/time of the last observation in the model fitting
+#'   data
+#'   
+#'   * `fitdat.num.total` - the total number of observations in the model 
+#'   fitting data
+#'   
+#'   * `fitdat.num.incomplete` - the number of NA observations in the model 
+#'   fitting data
+#'   
+#'   * `fitdat.num.censored` - this field is currently a placeholder (will 
+#'   always be NA) for the number of censored observations in the model fitting 
+#'   data
+#'   
+#'   * `fitdat.min.gap.days` - the length in days of the smallest gap between 
+#'   any two successive observations in the model fitting data
+#'   
+#'   * `fitdat.max.gap.days` - the length in days of the largest gap between any
+#'   two successive observations in the model fitting data
+#'   
+#'   * `fitdat.median.gap.days` - the length in days of the median gap between 
+#'   successive observations in the model fitting data
+#'   
+#'   * `estdat.start` - the date of the first observation in the estimation data
+#'   
+#'   * `estdat.end` - the date/time of the last observation in the estimation 
+#'   data
+#'   
+#'   * `estdat.num.total` - the total number of observations in the estimation 
+#'   data
+#'   
+#'   * `estdat.num.incomplete` - the number of incomplete (NA) observations in 
+#'   the estimation data
+#'   
+#'   * `estdat.min.gap.days` - the length in days of the smallest gap between 
+#'   any two successive observations in the estimation data
+#'   
+#'   * `estdat.max.gap.days` - the length in days of the largest gap between any
+#'   two successive observations in the estimation data
+#'   
+#'   * `estdat.median.gap.days` - the length in days of the median gap between 
+#'   successive observations in the estimation data
+#'   
 #' @importFrom dplyr mutate bind_cols select %>% everything
+#' @export
 #' @examples
 #' data(lamprey_nitrate)
 #' data(lamprey_discharge)
@@ -23,7 +120,8 @@
 summarizeInputs <- function(metadata, fitdat, estdat) {
 
   constituent <- flow <- load.rate <- dates <- station <- site.name <- 
-    site.id <- consti.name <- flow.basin.area <- basin.area <- '.dplyr.var'
+    site.id <- consti.name <- flow.basin.area <- basin.area <- num.censored <- 
+    '.dplyr.var'
 
   # convert metadata into data.frame and add a statistic or two
   site.info <- 
@@ -34,7 +132,8 @@ summarizeInputs <- function(metadata, fitdat, estdat) {
 
   # compute date statistcs for both input datasets
   fitdat.stats <- summarizeTimeseries(metadata, fitdat)
-  estdat.stats <- summarizeTimeseries(metadata, estdat) # should remove num.censored for Q
+  estdat.stats <- summarizeTimeseries(metadata, estdat) %>%
+    select(-num.censored)
 
   # combine all info into a single data.frame row
   all.info <- data.frame(