diff --git a/NEWS.md b/NEWS.md index 6acbb75..41795f0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,7 +8,8 @@ ### New Features -- Added `lmtp_survival()` function for estimating the entire survival curve. Enforces monotonicity using isotonic regression (see issue \#140. +- Added `lmtp_survival()` function for estimating the entire survival curve. Enforces monotonicity using isotonic regression (see issue \#140). +- Bootstrap for TMLE with the `boot` argument using a modified TMLE algorithm (https://arxiv.org/abs/1810.03030). ### Bug Fixes diff --git a/R/estimators.R b/R/estimators.R index e3f956a..5d8c8d3 100644 --- a/R/estimators.R +++ b/R/estimators.R @@ -41,6 +41,9 @@ #' @param mtp \[\code{logical(1)}\]\cr #' Is the intervention of interest a modified treatment policy? #' Default is \code{FALSE}. If treatment variables are continuous this should be \code{TRUE}. +#' @param boot \[\code{logical(1)}\]\cr +#' Compute standard errors using the bootstrap? Default is \code{FALSE}. If \code{FALSE}, standard +#' errors will be calculated using the empirical variance of the efficient influence function. #' @param outcome_type \[\code{character(1)}\]\cr #' Outcome variable type (i.e., continuous, binomial, survival). #' @param id \[\code{character(1)}\]\cr @@ -77,7 +80,8 @@ #' \item{standard_error}{The estimated standard error of the LMTP effect.} #' \item{low}{Lower bound of the 95% confidence interval of the LMTP effect.} #' \item{high}{Upper bound of the 95% confidence interval of the LMTP effect.} -#' \item{eif}{The estimated, un-centered, influence function of the estimate.} +#' \item{eif}{The estimated, un-centered, influence function of the estimate, +#' \code{NULL} if \code{boot = TRUE}.} #' \item{shift}{The shift function specifying the treatment policy of interest.} #' \item{outcome_reg}{An n x Tau + 1 matrix of outcome regression predictions. #' The mean of the first column is used for calculating theta.} @@ -92,7 +96,8 @@ #' @export lmtp_tmle <- function(data, trt, outcome, baseline = NULL, time_vary = NULL, cens = NULL, shift = NULL, shifted = NULL, k = Inf, - mtp = FALSE, outcome_type = c("binomial", "continuous", "survival"), + mtp = FALSE, boot = FALSE, + outcome_type = c("binomial", "continuous", "survival"), id = NULL, bounds = NULL, learners_outcome = "SL.glm", learners_trt = "SL.glm", @@ -125,6 +130,7 @@ lmtp_tmle <- function(data, trt, outcome, baseline = NULL, time_vary = NULL, checkmate::assertNumber(control$.bound) checkmate::assertNumber(control$.trim, upper = 1) checkmate::assertLogical(control$.return_full_fits, len = 1) + checkmate::assertLogical(boot, len = 1) check_trt_type(data, unlist(trt), mtp) task <- lmtp_task$new( @@ -147,6 +153,33 @@ lmtp_tmle <- function(data, trt, outcome, baseline = NULL, time_vary = NULL, pb <- progressr::progressor(task$tau*folds*2) + if (isTRUE(boot)) { + ratios <- cf_r(task, learners_trt, mtp, control, pb) + Qn <- cf_sub(task, "tmp_lmtp_scaled_outcome", learners_outcome, control, pb) + Qnb_eps <- cf_tmle2(task, ratios$ratios, Qn, control) + + ans <- theta_boot( + list( + estimator = "TMLE", + m = Qnb_eps$psi, + r = ratios$ratios, + boots = Qnb_eps$booted, + tau = task$tau, + folds = task$folds, + id = task$id, + outcome_type = task$outcome_type, + bounds = task$bounds, + weights = task$weights, + shift = if (is.null(shifted)) deparse(substitute((shift))) else NULL, + fits_m = Qn$fits, + fits_r = ratios$fits, + outcome_type = task$outcome_type, + seed = Qnb_eps$seed + ) + ) + return(ans) + } + ratios <- cf_r(task, learners_trt, mtp, control, pb) estims <- cf_tmle(task, "tmp_lmtp_scaled_outcome", @@ -486,7 +519,7 @@ lmtp_sub <- function(data, trt, outcome, baseline = NULL, time_vary = NULL, cens theta_sub( eta = list( - m = estims$m, + m = estims$ms, outcome_type = task$outcome_type, bounds = task$bounds, folds = task$folds, diff --git a/R/gcomp.R b/R/gcomp.R index ad0b718..a5feeb9 100644 --- a/R/gcomp.R +++ b/R/gcomp.R @@ -23,15 +23,15 @@ cf_sub <- function(task, outcome, learners, control, pb) { out <- future::value(out) list( - m = recombine_outcome(out, "m", task$folds), + ms = recombine_outcome(out, "ms", task$folds), + mn = recombine_outcome(out, "mn", task$folds), fits = lapply(out, function(x) x[["fits"]]) ) } estimate_sub <- function(natural, shifted, trt, outcome, node_list, cens, risk, tau, outcome_type, learners, control, pb) { - - m <- matrix(nrow = nrow(natural$valid), ncol = tau) + ms <- mn <- matrix(nrow = nrow(natural$valid), ncol = tau) fits <- vector("list", length = tau) for (t in tau:1) { @@ -79,13 +79,15 @@ estimate_sub <- function(natural, shifted, trt, outcome, node_list, cens, risk, under_shift_valid[, trt_t] <- shifted$valid[jv & rv, trt_t] natural$train[jt & rt, pseudo] <- bound(SL_predict(fit, under_shift_train), 1e-05) - m[jv & rv, t] <- bound(SL_predict(fit, under_shift_valid), 1e-05) + ms[jv & rv, t] <- bound(SL_predict(fit, under_shift_valid), 1e-05) + mn[jv & rv, t] <- bound(SL_predict(fit, natural$valid[jv & rv, vars]), 1e-05) natural$train[!rt, pseudo] <- 0 - m[!rv, t] <- 0 + ms[!rv, t] <- 0 + mn[!rv, t] <- 0 pb() } - list(m = m, fits = fits) + list(ms = ms, mn = mn, fits = fits) } diff --git a/R/lmtp_control.R b/R/lmtp_control.R index 4c8ad6b..86a3237 100644 --- a/R/lmtp_control.R +++ b/R/lmtp_control.R @@ -13,6 +13,8 @@ #' The number of cross-validation folds for \code{learners_trt}. #' @param .return_full_fits \[\code{logical(1)}\]\cr #' Return full SuperLearner fits? Default is \code{FALSE}, return only SuperLearner weights. +#' @param .B description +#' @param .boot_seed description #' #' @return A list of parameters controlling the estimation procedure. #' @export @@ -23,10 +25,14 @@ lmtp_control <- function(.bound = 1e5, .trim = 0.999, .learners_outcome_folds = 10, .learners_trt_folds = 10, - .return_full_fits = FALSE) { + .return_full_fits = FALSE, + .B = 1000, + .boot_seed = NULL) { list(.bound = .bound, .trim = .trim, .learners_outcome_folds = .learners_outcome_folds, .learners_trt_folds = .learners_trt_folds, - .return_full_fits = .return_full_fits) + .return_full_fits = .return_full_fits, + .B = .B, + .boot_seed = .boot_seed) } diff --git a/R/lmtp_survival.R b/R/lmtp_survival.R index 6ef1808..5f12759 100644 --- a/R/lmtp_survival.R +++ b/R/lmtp_survival.R @@ -41,6 +41,10 @@ #' @param mtp \[\code{logical(1)}\]\cr #' Is the intervention of interest a modified treatment policy? #' Default is \code{FALSE}. If treatment variables are continuous this should be \code{TRUE}. +#' @param boot \[\code{logical(1)}\]\cr +#' Compute standard errors using the bootstrap? Default is \code{FALSE}. If \code{FALSE}, standard +#' errors will be calculated using the empirical variance of the efficient influence function. +#' Ignored if \code{estimator = "lmtp_sdr"}. #' @param id \[\code{character(1)}\]\cr #' An optional column name containing cluster level identifiers. #' @param learners_outcome \[\code{character}\]\cr A vector of \code{SuperLearner} algorithms for estimation @@ -64,6 +68,7 @@ lmtp_survival <- function(data, trt, outcomes, baseline = NULL, time_vary = NULL estimator = c("lmtp_tmle", "lmtp_sdr"), k = Inf, mtp = FALSE, + boot = FALSE, id = NULL, learners_outcome = "SL.glm", learners_trt = "SL.glm", diff --git a/R/theta.R b/R/theta.R index 7df2f8f..6c38fbc 100644 --- a/R/theta.R +++ b/R/theta.R @@ -112,3 +112,41 @@ theta_dr <- function(eta, augmented = FALSE) { class(out) <- "lmtp" out } + +# TODO: NEED TO SAVE THE SEED FOR THE REPLICATES AND THE BOOTED ESTIMATES FOR ESTIMATNG CONTRASTS +theta_boot <- function(eta) { + theta <- weighted.mean(eta$m[, 1], eta$weights) + + if (eta$outcome_type == "continuous") { + theta <- rescale_y_continuous(theta, eta$bounds) + } + + # TODO: NEED TO FIGURE OUT HOW THIS WOULD WORK WITH CLUSTERING + se <- sqrt(var(eta$boots)) + ci_low <- theta - (qnorm(0.975) * se) + ci_high <- theta + (qnorm(0.975) * se) + + out <- list( + estimator = eta$estimator, + theta = theta, + standard_error = se, + low = ci_low, + high = ci_high, + boots = eta$boots, + id = eta$id, + shift = eta$shift, + outcome_reg = switch( + eta$outcome_type, + continuous = rescale_y_continuous(eta$m, eta$bounds), + binomial = eta$m + ), + density_ratios = eta$r, + fits_m = eta$fits_m, + fits_r = eta$fits_r, + outcome_type = eta$outcome_type, + seed = eta$seed + ) + + class(out) <- "lmtp" + out +} diff --git a/R/tmle2.R b/R/tmle2.R new file mode 100644 index 0000000..d429489 --- /dev/null +++ b/R/tmle2.R @@ -0,0 +1,67 @@ +cf_tmle2 <- function(task, ratios, m_init, control) { + psi <- estimate_tmle2(task$natural, + task$cens, + task$risk, + task$tau, + m_init$mn, + m_init$ms, + ratios, + task$weights) + + if (!is.null(control$.boot_seed)) { + seed <- control$.boot_seed + } else { + seed <- .Random.seed[1] + } + + set.seed(seed) + boots <- replicate(control$.B, + sample(1:nrow(task$natural), nrow(task$natural), replace = TRUE), + simplify = FALSE) + + Qnb <- vector("list", control$.B) + for (i in seq_along(boots)) { + ib <- boots[[i]] + Qnb[[i]] <- future::future({ + psis <- estimate_tmle2(task$natural[ib, , drop = FALSE], + task$cens, + task$risk, + task$tau, + m_init$mn[ib, , drop = FALSE], + m_init$ms[ib, , drop = FALSE], + ratios[ib, , drop = FALSE], + task$weights[ib]) + weighted.mean(psis[, 1], task$weights[ib]) + }, + seed = TRUE) + } + + list(psi = psi, booted = unlist(future::value(Qnb)), seed = seed) +} + +estimate_tmle2 <- function(data, cens, risk, tau, mn, ms, ratios, weights) { + m_eps <- matrix(nrow = nrow(data), ncol = tau + 1) + m_eps[, tau + 1] <- data$tmp_lmtp_scaled_outcome + + fits <- vector("list", length = tau) + for (t in tau:1) { + i <- censored(data, cens, t)$i + j <- censored(data, cens, t)$j + r <- at_risk(data, risk, t) + + wts <- ratios[i & r, t] * weights[i & r] + + fit <- sw( + glm( + m_eps[i & r, t + 1] ~ offset(qlogis(mn[i & r, t])), + weights = wts, + family = "binomial" + ) + ) + + m_eps[j & r, t] <- bound(plogis(qlogis(ms[j & r, t]) + coef(fit))) + m_eps[!r, t] <- 0 + } + + m_eps +} diff --git a/man/lmtp_control.Rd b/man/lmtp_control.Rd index 2e1d10a..8172476 100644 --- a/man/lmtp_control.Rd +++ b/man/lmtp_control.Rd @@ -9,7 +9,9 @@ lmtp_control( .trim = 0.999, .learners_outcome_folds = 10, .learners_trt_folds = 10, - .return_full_fits = FALSE + .return_full_fits = FALSE, + .B = 1000, + .boot_seed = NULL ) } \arguments{ @@ -30,6 +32,10 @@ The number of cross-validation folds for \code{learners_trt}.} \item{.return_full_fits}{[\code{logical(1)}]\cr Return full SuperLearner fits? Default is \code{FALSE}, return only SuperLearner weights.} + +\item{.B}{description} + +\item{.boot_seed}{description} } \value{ A list of parameters controlling the estimation procedure. diff --git a/man/lmtp_survival.Rd b/man/lmtp_survival.Rd index 9f693bd..5baf4e0 100644 --- a/man/lmtp_survival.Rd +++ b/man/lmtp_survival.Rd @@ -16,6 +16,7 @@ lmtp_survival( estimator = c("lmtp_tmle", "lmtp_sdr"), k = Inf, mtp = FALSE, + boot = FALSE, id = NULL, learners_outcome = "SL.glm", learners_trt = "SL.glm", @@ -73,6 +74,11 @@ all time points.} Is the intervention of interest a modified treatment policy? Default is \code{FALSE}. If treatment variables are continuous this should be \code{TRUE}.} +\item{boot}{[\code{logical(1)}]\cr +Compute standard errors using the bootstrap? Default is \code{FALSE}. If \code{FALSE}, standard +errors will be calculated using the empirical variance of the efficient influence function. +Ignored if \code{estimator = "lmtp_sdr"}.} + \item{id}{[\code{character(1)}]\cr An optional column name containing cluster level identifiers.} diff --git a/man/lmtp_tmle.Rd b/man/lmtp_tmle.Rd index f790bab..e8dcc50 100644 --- a/man/lmtp_tmle.Rd +++ b/man/lmtp_tmle.Rd @@ -15,6 +15,7 @@ lmtp_tmle( shifted = NULL, k = Inf, mtp = FALSE, + boot = FALSE, outcome_type = c("binomial", "continuous", "survival"), id = NULL, bounds = NULL, @@ -73,6 +74,10 @@ all time points.} Is the intervention of interest a modified treatment policy? Default is \code{FALSE}. If treatment variables are continuous this should be \code{TRUE}.} +\item{boot}{[\code{logical(1)}]\cr +Compute standard errors using the bootstrap? Default is \code{FALSE}. If \code{FALSE}, standard +errors will be calculated using the empirical variance of the efficient influence function.} + \item{outcome_type}{[\code{character(1)}]\cr Outcome variable type (i.e., continuous, binomial, survival).} @@ -108,7 +113,8 @@ A list of class \code{lmtp} containing the following components: \item{standard_error}{The estimated standard error of the LMTP effect.} \item{low}{Lower bound of the 95\% confidence interval of the LMTP effect.} \item{high}{Upper bound of the 95\% confidence interval of the LMTP effect.} -\item{eif}{The estimated, un-centered, influence function of the estimate.} +\item{eif}{The estimated, un-centered, influence function of the estimate, +\code{NULL} if \code{boot = TRUE}.} \item{shift}{The shift function specifying the treatment policy of interest.} \item{outcome_reg}{An n x Tau + 1 matrix of outcome regression predictions. The mean of the first column is used for calculating theta.} diff --git a/tests/testthat/test-censoring.R b/tests/testthat/test-censoring.R index c7b2ae1..3b1547d 100644 --- a/tests/testthat/test-censoring.R +++ b/tests/testthat/test-censoring.R @@ -7,7 +7,7 @@ truth <- 0.8 sub <- sw(lmtp_sub(sim_cens, A, "Y", time_vary = L, cens = C, k = 0, shift = NULL, folds = 1)) ipw <- sw(lmtp_ipw(sim_cens, A, "Y", time_vary = L, cens = C, k = 0, shift = NULL, folds = 1)) -tmle <- sw(lmtp_tmle(sim_cens, A, "Y", time_vary = L, cens = C, k = 0, shift = NULL, folds = 1)) +tmle <- sw(lmtp_tmle(sim_cens, A, "Y", time_vary = L, cens = C, k = 0, shift = NULL, boot = F, folds = 1)) sdr <- sw(lmtp_sdr(sim_cens, A, "Y", time_vary = L, cens = C, k = 0, shift = NULL, folds = 1)) # tests diff --git a/tests/testthat/test-dynamic.R b/tests/testthat/test-dynamic.R index 94bf0c2..27e7e51 100644 --- a/tests/testthat/test-dynamic.R +++ b/tests/testthat/test-dynamic.R @@ -32,18 +32,18 @@ cens <- c("C1", "C2") nodes <- list(c(NULL), c("L")) # truth = 0.308 -tml.stc <- sw(lmtp_tmle(sim, a, "Y", baseline, nodes, cens, shift = static_binary_on, folds = 1)) +tml.stc <- sw(lmtp_tmle(sim, a, "Y", baseline, nodes, cens, shift = static_binary_on, folds = 1, boot = F)) # truth = 0.528 sdr.stc <- sw(lmtp_sdr(sim, a, "Y", baseline, nodes, cens, shift = static_binary_off, folds = 1)) # truth = 0.433 -tml.tv <- sw(lmtp_tmle(sim, a, "Y", baseline, nodes, cens, shift = time_vary_on, folds = 1)) +tml.tv <- sw(lmtp_tmle(sim, a, "Y", baseline, nodes, cens, shift = time_vary_on, folds = 1, boot = F)) sdr.tv <- sw(lmtp_sdr(sim, a, "Y", baseline, nodes, cens, shift = time_vary_on, folds = 1)) # time varying and covariate dynamic # truth = 0.345 -tml.dyn <- sw(lmtp_tmle(sim, a, "Y", baseline, nodes, cens, shift = dynamic_vec, folds = 1)) +tml.dyn <- sw(lmtp_tmle(sim, a, "Y", baseline, nodes, cens, shift = dynamic_vec, folds = 1, boot = F)) sdr.dyn <- sw(lmtp_sdr(sim, a, "Y", baseline, nodes, cens, shift = dynamic_vec, folds = 1)) test_that("Dynamic intervention fidelity", { diff --git a/tests/testthat/test-point_treatment.R b/tests/testthat/test-point_treatment.R index 638e103..5f2b8d3 100644 --- a/tests/testthat/test-point_treatment.R +++ b/tests/testthat/test-point_treatment.R @@ -15,7 +15,7 @@ truth <- mean(tmp$Y.1) sub <- lmtp_sub(tmp, "A", "Y", baseline = c("W1", "W2"), shift = static_binary_on, folds = 1) ipw <- lmtp_ipw(tmp, "A", "Y", baseline = c("W1", "W2"), shift = static_binary_on, folds = 1) -tmle <- lmtp_tmle(tmp, "A", "Y", baseline = c("W1", "W2"), shift = static_binary_on, folds = 1) +tmle <- lmtp_tmle(tmp, "A", "Y", baseline = c("W1", "W2"), shift = static_binary_on, folds = 1, boot = F) sdr <- lmtp_sdr(tmp, "A", "Y", baseline = c("W1", "W2"), shift = static_binary_on, folds = 1) # tests diff --git a/tests/testthat/test-shifted.R b/tests/testthat/test-shifted.R index b85f90b..161af27 100644 --- a/tests/testthat/test-shifted.R +++ b/tests/testthat/test-shifted.R @@ -20,7 +20,7 @@ ipw <- tmle <- sw(lmtp_tmle(sim_cens, a, "Y", nodes, baseline = NULL, cens, k = 0, shifted = sc, - outcome_type = "binomial", folds = 2, mtp = TRUE)) + outcome_type = "binomial", folds = 2, mtp = TRUE, boot = F)) sdr <- sw(lmtp_sdr(sim_cens, a, "Y", nodes, baseline = NULL, diff --git a/tests/testthat/test-survey.R b/tests/testthat/test-survey.R index 2cdd6d5..b8021bf 100644 --- a/tests/testthat/test-survey.R +++ b/tests/testthat/test-survey.R @@ -25,7 +25,7 @@ ipw <- lmtp_ipw(tmp, "A", "Y", baseline = c("W1", "W2"), shift = static_binary_o weights = wts, folds = 2) tmle <- lmtp_tmle(tmp, "A", "Y", baseline = c("W1", "W2"), shift = static_binary_on, - weights = wts, folds = 2) + weights = wts, folds = 2, boot = F) sdr <- lmtp_sdr(tmp, "A", "Y", baseline = c("W1", "W2"), shift = static_binary_on, weights = wts, folds = 2) diff --git a/tests/testthat/test-time_varying_treatment.R b/tests/testthat/test-time_varying_treatment.R index 7e7cd1e..4b289e0 100644 --- a/tests/testthat/test-time_varying_treatment.R +++ b/tests/testthat/test-time_varying_treatment.R @@ -25,7 +25,7 @@ truth <- 0.305 sub <- sw(lmtp_sub(tmp, a, "Y", time_vary = time_varying, shift = d, folds = 1)) ipw <- sw(lmtp_ipw(tmp, a, "Y", time_vary = time_varying, shift = d, mtp = T, folds = 1)) -tmle <- sw(lmtp_tmle(tmp, a, "Y", time_vary = time_varying, shift = d, mtp = T, folds = 1)) +tmle <- sw(lmtp_tmle(tmp, a, "Y", time_vary = time_varying, shift = d, mtp = T, folds = 1, boot = F)) sdr <- sw(lmtp_sdr(tmp, a, "Y", time_vary = time_varying, shift = d, mtp = T, folds = 1)) test_that("time varying treatment fidelity, t = 4", {