From ded6a9b2f820f3650cccfdd05c7943045e0e83fa Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sun, 3 Mar 2024 13:17:32 +0100
Subject: [PATCH 01/19] Draft new `data_summary()` function

---
 NAMESPACE           |   5 ++
 R/data_summary.R    | 207 ++++++++++++++++++++++++++++++++++++++++++++
 man/data_summary.Rd |  51 +++++++++++
 3 files changed, 263 insertions(+)
 create mode 100644 R/data_summary.R
 create mode 100644 man/data_summary.Rd

diff --git a/NAMESPACE b/NAMESPACE
index e89863807..4e924d76b 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -52,6 +52,10 @@ S3method(data_modify,data.frame)
 S3method(data_modify,default)
 S3method(data_modify,grouped_df)
 S3method(data_peek,data.frame)
+S3method(data_summary,data.frame)
+S3method(data_summary,default)
+S3method(data_summary,grouped_df)
+S3method(data_summary,matrix)
 S3method(data_tabulate,data.frame)
 S3method(data_tabulate,default)
 S3method(data_tabulate,grouped_df)
@@ -249,6 +253,7 @@ export(data_rotate)
 export(data_seek)
 export(data_select)
 export(data_separate)
+export(data_summary)
 export(data_tabulate)
 export(data_to_long)
 export(data_to_wide)
diff --git a/R/data_summary.R b/R/data_summary.R
new file mode 100644
index 000000000..ae37c2707
--- /dev/null
+++ b/R/data_summary.R
@@ -0,0 +1,207 @@
+#' @title Summarize data
+#' @name data_summary
+#'
+#' @description This function can be used to compute summary statistics for a
+#' data frame or a matrix.
+#'
+#' @param x A (grouped) data frame.
+#' @param by Optional character string, indicating the name of a variable in `x`.
+#' If supplied, the data will be split by this variable and summary statistics
+#' will be computed for each group.
+#' @param ... One or more named expressions that define the new variable name
+#' and the function to compute the summary statistic. Example:
+#' `mean_sepal_width = mean(Sepal.Width)`. The expression can also be provided
+#' as a character string, e.g. `"mean_sepal_width = mean(Sepal.Width)"`.
+#'
+#' @return A data frame with the requested summary statistics.
+#'
+#' @examples
+#' data(iris)
+#' data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width))
+#' data_summary(
+#'   iris,
+#'   MW = mean(Sepal.Width),
+#'   SD = sd(Sepal.Width),
+#'   by = "Species"
+#' )
+#'
+#' # same as
+#' d <- data_group(iris, "Species")
+#' data_summary(d, MW = mean(Sepal.Width), SD = sd(Sepal.Width))
+#'
+#' # multiple groups
+#' data(mtcars)
+#' data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("am", "gear"))
+#'
+#' # expressions can also be supplied as character strings
+#' data_summary(mtcars, "MW = mean(mpg)", "SD = sd(mpg)", by = c("am", "gear"))
+#' @export
+data_summary <- function(x, ...) {
+  UseMethod("data_summary")
+}
+
+
+#' @export
+data_summary.matrix <- function(x, ..., by = NULL) {
+  data_summary(as.data.frame(x), ..., by = by)
+}
+
+
+#' @export
+data_summary.default <- function(x, ...) {
+  insight::format_error("`data_summary()` only works for (grouped) data frames and matrices.")
+}
+
+
+#' @rdname data_summary
+#' @export
+data_summary.data.frame <- function(x, ..., by = NULL) {
+  dots <- eval(substitute(alist(...)))
+
+  if (is.null(by)) {
+    # when we have no grouping, just compute a one-row summary
+    summarise <- .process_datasummary_dots(dots, x)
+    out <- data.frame(summarise)
+    colnames(out) <- vapply(summarise, names, character(1))
+  } else {
+    # split data
+    splitted_data <- split(x, x[by])
+    out <- lapply(splitted_data, function(s) {
+      # no data for combination? Return NULL
+      if (nrow(s) == 0) {
+        return(NULL)
+      }
+      # summarize data
+      summarise <- .process_datasummary_dots(dots, s)
+      # coerce to data frame
+      summarised_data <- data.frame(summarise)
+      # bind grouping-variables and values
+      summarised_data <- cbind(s[1, by], summarised_data)
+      # make sure we have proper column names
+      colnames(summarised_data) <- c(by, vapply(summarise, names, character(1)))
+      summarised_data
+    })
+    out <- do.call(rbind, out)
+  }
+  class(out) <- "data.frame"
+  rownames(out) <- NULL
+  out
+}
+
+
+#' @export
+data_summary.grouped_df <- function(x, ..., by = NULL) {
+  # extract group variables
+  grps <- attr(x, "groups", exact = TRUE)
+  group_variables <- data_remove(grps, ".rows")
+  # if "by" is not supplied, use group variables
+  if (is.null(by)) {
+    by <- colnames_to_row(group_variables)
+  }
+  attr(x, "groups") <- NULL
+  data_summary(x, ..., by = by)
+}
+
+
+# helper -----------------------------------------------------------------------
+
+.process_datasummary_dots <- function(dots, data) {
+  out <- NULL
+  if (length(dots)) {
+    # we check for character vector of expressions, in which case
+    # "dots" should be unnamed
+    if (is.null(names(dots))) {
+      # if we have multiple strings, concatenate them to a character vector
+      # and put it into a list...
+      if (length(dots) > 1) {
+        if (all(vapply(dots, is.character, logical(1)))) {
+          dots <- list(unlist(dots))
+        } else {
+          insight::format_error("You cannot mix string and literal representation of expressions.")
+        }
+      }
+      # expression is given as character string, e.g.
+      # a <- "double_SepWidth = 2 * Sepal.Width"
+      # data_modify(iris, a)
+      # or as character vector, e.g.
+      # data_modify(iris, c("var_a = Sepal.Width / 10", "var_b = Sepal.Width * 10"))
+      character_symbol <- tryCatch(.dynEval(dots[[1]]), error = function(e) NULL)
+      # do we have a character vector? Then we can proceed
+      if (is.character(character_symbol)) {
+        dots <- lapply(character_symbol, function(s) {
+          # turn value from character vector into expression
+          str2lang(.dynEval(s))
+        })
+        names(dots) <- vapply(dots, function(n) insight::safe_deparse(n[[2]]), character(1))
+      }
+    }
+
+    out <- lapply(seq_along(dots), function(i) {
+      new_variable <- .get_new_dots_variable(dots, i, data)
+      stats::setNames(new_variable, names(dots)[i])
+    })
+  }
+
+  out
+}
+
+
+.get_new_dots_variable <- function(dots, i, data) {
+  # iterate expressions for new variables
+  symbol <- dots[[i]]
+
+  # expression is given as character string in a variable, but named, e.g.
+  # a <- "2 * Sepal.Width"
+  # data_modify(iris, double_SepWidth = a)
+  # we reconstruct the symbol as if it were provided as literal expression.
+  # However, we need to check that we don't have a character vector,
+  # like: data_modify(iris, new_var = "a")
+  # this one should be recycled instead.
+  if (!is.character(symbol)) {
+    eval_symbol <- .dynEval(symbol, ifnotfound = NULL)
+    if (is.character(eval_symbol)) {
+      symbol <- try(str2lang(paste0(names(dots)[i], " = ", eval_symbol)), silent = TRUE)
+      # we may have the edge-case of having a function that returns a character
+      # vector, like "new_var = sample(letters[1:3])". In this case, "eval_symbol"
+      # is of type character, but no symbol, thus str2lang() above creates a
+      # wrong pattern. We then take "eval_symbol" as character input.
+      if (inherits(symbol, "try-error")) {
+        symbol <- str2lang(paste0(
+          names(dots)[i],
+          " = c(", paste0("\"", eval_symbol, "\"", collapse = ","), ")"
+        ))
+      }
+    }
+  }
+
+  # finally, we can evaluate expression and get values for new variables
+  new_variable <- try(with(data, eval(symbol)), silent = TRUE)
+
+  # successful, or any errors, like misspelled variable name?
+  if (inherits(new_variable, "try-error")) {
+    # in which step did error happen?
+    step_number <- switch(as.character(i),
+      "1" = "the first expression",
+      "2" = "the second expression",
+      "3" = "the third expression",
+      paste("expression", i)
+    )
+    step_msg <- paste0("There was an error in ", step_number, ".")
+    # try to find out which variable was the cause for the error
+    error_msg <- attributes(new_variable)$condition$message
+    if (grepl("object '(.*)' not found", error_msg)) {
+      error_var <- gsub("object '(.*)' not found", "\\1", error_msg)
+      insight::format_error(
+        paste0(step_msg, " Variable \"", error_var, "\" was not found in the dataset or in the environment."),
+        .misspelled_string(colnames(data), error_var, "Possibly misspelled or not yet defined?")
+      )
+    } else {
+      insight::format_error(paste0(
+        step_msg, " ", insight::format_capitalize(error_msg),
+        ". Possibly misspelled or not yet defined?"
+      ))
+    }
+  }
+
+  new_variable
+}
diff --git a/man/data_summary.Rd b/man/data_summary.Rd
new file mode 100644
index 000000000..6c0e89744
--- /dev/null
+++ b/man/data_summary.Rd
@@ -0,0 +1,51 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_summary.R
+\name{data_summary}
+\alias{data_summary}
+\alias{data_summary.data.frame}
+\title{Summarize data}
+\usage{
+data_summary(x, ...)
+
+\method{data_summary}{data.frame}(x, ..., by = NULL)
+}
+\arguments{
+\item{x}{A (grouped) data frame.}
+
+\item{...}{One or more named expressions that define the new variable name
+and the function to compute the summary statistic. Example:
+\code{mean_sepal_width = mean(Sepal.Width)}. The expression can also be provided
+as a character string, e.g. \code{"mean_sepal_width = mean(Sepal.Width)"}.}
+
+\item{by}{Optional character string, indicating the name of a variable in \code{x}.
+If supplied, the data will be split by this variable and summary statistics
+will be computed for each group.}
+}
+\value{
+A data frame with the requested summary statistics.
+}
+\description{
+This function can be used to compute summary statistics for a
+data frame or a matrix.
+}
+\examples{
+data(iris)
+data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width))
+data_summary(
+  iris,
+  MW = mean(Sepal.Width),
+  SD = sd(Sepal.Width),
+  by = "Species"
+)
+
+# same as
+d <- data_group(iris, "Species")
+data_summary(d, MW = mean(Sepal.Width), SD = sd(Sepal.Width))
+
+# multiple groups
+data(mtcars)
+data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("am", "gear"))
+
+# expressions can also be supplied as character strings
+data_summary(mtcars, "MW = mean(mpg)", "SD = sd(mpg)", by = c("am", "gear"))
+}

From 872c6d000bc0945d08bad7309ea6ecbda39bb5bc Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sun, 3 Mar 2024 13:19:48 +0100
Subject: [PATCH 02/19] check if we can avoid duplicated code

---
 R/data_modify.R | 58 ++-----------------------------------------------
 1 file changed, 2 insertions(+), 56 deletions(-)

diff --git a/R/data_modify.R b/R/data_modify.R
index 40a186736..c3cb0f521 100644
--- a/R/data_modify.R
+++ b/R/data_modify.R
@@ -177,62 +177,8 @@ data_modify.data.frame <- function(data, ..., .if = NULL, .at = NULL, .modify =
     }
 
     for (i in seq_along(dots)) {
-      # iterate expressions for new variables
-      symbol <- dots[[i]]
-
-      # expression is given as character string in a variable, but named, e.g.
-      # a <- "2 * Sepal.Width"
-      # data_modify(iris, double_SepWidth = a)
-      # we reconstruct the symbol as if it were provided as literal expression.
-      # However, we need to check that we don't have a character vector,
-      # like: data_modify(iris, new_var = "a")
-      # this one should be recycled instead.
-      if (!is.character(symbol)) {
-        eval_symbol <- .dynEval(symbol, ifnotfound = NULL)
-        if (is.character(eval_symbol)) {
-          symbol <- try(str2lang(paste0(names(dots)[i], " = ", eval_symbol)), silent = TRUE)
-          # we may have the edge-case of having a function that returns a character
-          # vector, like "new_var = sample(letters[1:3])". In this case, "eval_symbol"
-          # is of type character, but no symbol, thus str2lang() above creates a
-          # wrong pattern. We then take "eval_symbol" as character input.
-          if (inherits(symbol, "try-error")) {
-            symbol <- str2lang(paste0(
-              names(dots)[i],
-              " = c(", paste0("\"", eval_symbol, "\"", collapse = ","), ")"
-            ))
-          }
-        }
-      }
-
-      # finally, we can evaluate expression and get values for new variables
-      new_variable <- try(with(data, eval(symbol)), silent = TRUE)
-
-      # successful, or any errors, like misspelled variable name?
-      if (inherits(new_variable, "try-error")) {
-        # in which step did error happen?
-        step_number <- switch(as.character(i),
-          "1" = "the first expression",
-          "2" = "the second expression",
-          "3" = "the third expression",
-          paste("expression", i)
-        )
-        step_msg <- paste0("There was an error in ", step_number, ".")
-        # try to find out which variable was the cause for the error
-        error_msg <- attributes(new_variable)$condition$message
-        if (grepl("object '(.*)' not found", error_msg)) {
-          error_var <- gsub("object '(.*)' not found", "\\1", error_msg)
-          insight::format_error(
-            paste0(step_msg, " Variable \"", error_var, "\" was not found in the dataset or in the environment."),
-            .misspelled_string(colnames(data), error_var, "Possibly misspelled or not yet defined?")
-          )
-        } else {
-          insight::format_error(paste0(
-            step_msg, " ", insight::format_capitalize(error_msg),
-            ". Possibly misspelled or not yet defined?"
-          ))
-        }
-      }
-
+      # create new variable
+      new_variable <- .get_new_dots_variable(dots, i, data)
       # give informative error when new variable doesn't match number of rows
       if (!is.null(new_variable) && length(new_variable) != nrow(data) && (nrow(data) %% length(new_variable)) != 0) {
         insight::format_error(

From af2ce1830c6d7ac5b09989229a1d327638cc755d Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sun, 3 Mar 2024 13:20:17 +0100
Subject: [PATCH 03/19] pkgdown

---
 _pkgdown.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/_pkgdown.yaml b/_pkgdown.yaml
index 2adc0768c..0062b8a5a 100644
--- a/_pkgdown.yaml
+++ b/_pkgdown.yaml
@@ -61,6 +61,7 @@ reference:
       Functions to compute statistical summaries of data properties and distributions
     contents:
       - data_codebook
+      - data_summary
       - data_tabulate
       - data_peek
       - data_seek

From 123bb562a56002a63e338d0aa61ec6e232989842 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sun, 3 Mar 2024 13:22:32 +0100
Subject: [PATCH 04/19] fix

---
 R/data_summary.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/data_summary.R b/R/data_summary.R
index ae37c2707..4f80bd2cd 100644
--- a/R/data_summary.R
+++ b/R/data_summary.R
@@ -96,7 +96,7 @@ data_summary.grouped_df <- function(x, ..., by = NULL) {
   group_variables <- data_remove(grps, ".rows")
   # if "by" is not supplied, use group variables
   if (is.null(by)) {
-    by <- colnames_to_row(group_variables)
+    by <- colnames(group_variables)
   }
   attr(x, "groups") <- NULL
   data_summary(x, ..., by = by)

From 2876044cb6d54fa96ec51d798104c406d69c2af2 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sun, 3 Mar 2024 13:25:16 +0100
Subject: [PATCH 05/19] fixes

---
 R/data_summary.R | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/R/data_summary.R b/R/data_summary.R
index 4f80bd2cd..af1ec65a4 100644
--- a/R/data_summary.R
+++ b/R/data_summary.R
@@ -98,7 +98,9 @@ data_summary.grouped_df <- function(x, ..., by = NULL) {
   if (is.null(by)) {
     by <- colnames(group_variables)
   }
+  # remove information specific to grouped df's
   attr(x, "groups") <- NULL
+  class(x) <- "data.frame"
   data_summary(x, ..., by = by)
 }
 

From f672e679e6efddf9726ec6344562cbd37dbddef4 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sun, 3 Mar 2024 18:00:20 +0100
Subject: [PATCH 06/19] lintr

---
 tests/testthat/test-data_reorder.R | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tests/testthat/test-data_reorder.R b/tests/testthat/test-data_reorder.R
index 34a8f5420..463f36040 100644
--- a/tests/testthat/test-data_reorder.R
+++ b/tests/testthat/test-data_reorder.R
@@ -1,11 +1,11 @@
 test_that("data_reorder works as expected", {
-  expect_equal(
-    names(data_reorder(iris, c("Species", "Sepal.Length"))),
+  expect_named(
+    data_reorder(iris, c("Species", "Sepal.Length")),
     c("Species", "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
   )
 
-  expect_warning(expect_equal(
-    names(data_reorder(iris, c("Species", "dupa"))),
+  expect_warning(expect_named(
+    data_reorder(iris, c("Species", "dupa")),
     c("Species", "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
   ))
 })
@@ -24,5 +24,6 @@ test_that("data_reorder preserves attributes", {
   a2 <- attributes(out2)
 
   # attributes may not be in the same order
-  expect_true(all(names(a1) %in% names(a2)) && length(a1) == length(a2))
+  expect_true(all(names(a1) %in% names(a2)))
+  expect_length(a1, length(a2))
 })

From 3ebab822af97d5da64cdc10ced4e870fec2bb546 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sun, 3 Mar 2024 18:22:48 +0100
Subject: [PATCH 07/19] add tests

---
 R/data_summary.R                   |  12 +++
 tests/testthat/test-data_summary.R | 134 +++++++++++++++++++++++++++++
 2 files changed, 146 insertions(+)
 create mode 100644 tests/testthat/test-data_summary.R

diff --git a/R/data_summary.R b/R/data_summary.R
index af1ec65a4..5e3627c54 100644
--- a/R/data_summary.R
+++ b/R/data_summary.R
@@ -64,6 +64,18 @@ data_summary.data.frame <- function(x, ..., by = NULL) {
     out <- data.frame(summarise)
     colnames(out) <- vapply(summarise, names, character(1))
   } else {
+    # sanity check - is "by" a character string?
+    if (!is.character(by)) {
+      insight::format_error("Argument `by` must be a character string, indicating the name of a variable in the data.")
+    }
+    # is "by" in the data?
+    if (!all(by %in% colnames(x))) {
+      by_not_found <- by[!by %in% colnames(x)]
+      insight::format_error(
+        paste0("Variable \"", by_not_found, "\" not found in the data."),
+        .misspelled_string(colnames(x), by_not_found, "Possibly misspelled?")
+      )
+    }
     # split data
     splitted_data <- split(x, x[by])
     out <- lapply(splitted_data, function(s) {
diff --git a/tests/testthat/test-data_summary.R b/tests/testthat/test-data_summary.R
new file mode 100644
index 000000000..5d48ee84d
--- /dev/null
+++ b/tests/testthat/test-data_summary.R
@@ -0,0 +1,134 @@
+test_that("data_summary, single row summary", {
+  data(iris)
+  out <- data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width))
+  expect_equal(out$MW, mean(iris$Sepal.Width), tolerance = 1e-4)
+  expect_equal(out$SD, sd(iris$Sepal.Width), tolerance = 1e-4)
+})
+
+
+test_that("data_summary, single row summary, string expression", {
+  data(iris)
+  out <- data_summary(iris, "MW = mean(Sepal.Width)", "SD = sd(Sepal.Width)")
+  expect_equal(out$MW, mean(iris$Sepal.Width), tolerance = 1e-4)
+  expect_equal(out$SD, sd(iris$Sepal.Width), tolerance = 1e-4)
+})
+
+
+test_that("data_summary, summary for groups", {
+  data(iris)
+  out <- data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width), by = "Species")
+  expect_equal(
+    out$MW,
+    aggregate(iris["Sepal.Width"], list(iris$Species), mean)$Sepal.Width,
+    tolerance = 1e-4
+  )
+  expect_equal(
+    out$SD,
+    aggregate(iris["Sepal.Width"], list(iris$Species), sd)$Sepal.Width,
+    tolerance = 1e-4
+  )
+})
+
+
+test_that("data_summary, summary for groups, string expression", {
+  data(iris)
+  out <- data_summary(
+    iris,
+    "MW = mean(Sepal.Width)",
+    "SD = sd(Sepal.Width)",
+    by = "Species"
+  )
+  expect_equal(
+    out$MW,
+    aggregate(iris["Sepal.Width"], list(iris$Species), mean)$Sepal.Width,
+    tolerance = 1e-4
+  )
+  expect_equal(
+    out$SD,
+    aggregate(iris["Sepal.Width"], list(iris$Species), sd)$Sepal.Width,
+    tolerance = 1e-4
+  )
+})
+
+
+test_that("data_summary, grouped data frames", {
+  data(iris)
+  d <- data_group(iris, "Species")
+  out <- data_summary(d, MW = mean(Sepal.Width), SD = sd(Sepal.Width))
+  expect_equal(
+    out$MW,
+    aggregate(iris["Sepal.Width"], list(iris$Species), mean)$Sepal.Width,
+    tolerance = 1e-4
+  )
+  expect_equal(
+    out$SD,
+    aggregate(iris["Sepal.Width"], list(iris$Species), sd)$Sepal.Width,
+    tolerance = 1e-4
+  )
+  # "by" overrides groups
+  data(mtcars)
+  d <- data_group(mtcars, "gear")
+  out <- data_summary(d, MW = mean(mpg), SD = sd(mpg), by = "am")
+  expect_identical(
+    out$MW,
+    aggregate(mtcars["mpg"], list(mtcars$am), mean)$mpg
+  )
+})
+
+
+test_that("data_summary, summary for multiple groups", {
+  data(mtcars)
+  out <- data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("am", "gear"))
+  expect_equal(
+    out$MW,
+    aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), mean)$mpg,
+    tolerance = 1e-4
+  )
+  expect_equal(
+    out$SD,
+    aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), sd)$mpg,
+    tolerance = 1e-4
+  )
+  x <- data_group(mtcars, c("am", "gear"))
+  out <- data_summary(x, MW = mean(mpg), SD = sd(mpg))
+  expect_equal(
+    out$MW,
+    aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), mean)$mpg,
+    tolerance = 1e-4
+  )
+  expect_equal(
+    out$SD,
+    aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), sd)$mpg,
+    tolerance = 1e-4
+  )
+})
+
+
+test_that("data_summary, errors", {
+  data(iris)
+  data(mtcars)
+  # "by" must be character
+  expect_error(
+    data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width), by = 5),
+    regex = "Argument `by` must be a character string"
+  )
+  # "by" must be in data
+  expect_error(
+    data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width), by = "Speceis"),
+    regex = "Variable \"Speceis\" not"
+  )
+  # by for multiple variables
+  expect_error(
+    data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("bam", "gear")),
+    regex = "Variable \"bam\" not"
+  )
+  expect_error(
+    data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("bam", "geas")),
+    regex = "Did you mean one of \"am\" or \"gear\"?"
+  )
+  # not a data frame
+  expect_error(
+    data_summary(iris$Sepal.Width, MW = mean(Sepal.Width), SD = sd(Sepal.Width)),
+    regex = "only works for (grouped) data frames"
+  )
+})

From f0f832d088a50ec971842a6fabd26d5899302da9 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sun, 3 Mar 2024 18:23:54 +0100
Subject: [PATCH 08/19] code style

---
 R/data_modify.R | 1 -
 1 file changed, 1 deletion(-)

diff --git a/R/data_modify.R b/R/data_modify.R
index c3cb0f521..4621590c8 100644
--- a/R/data_modify.R
+++ b/R/data_modify.R
@@ -185,7 +185,6 @@ data_modify.data.frame <- function(data, ..., .if = NULL, .at = NULL, .modify =
           "New variable has not the same length as the other variables in the data frame and cannot be recycled."
         )
       }
-
       data[[names(dots)[i]]] <- new_variable
     }
   }

From 9faba9f5220780d98d2f5e46f7b7deb38eeedafb Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sun, 3 Mar 2024 18:24:41 +0100
Subject: [PATCH 09/19] desc, news

---
 DESCRIPTION | 2 +-
 NEWS.md     | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 048d15065..eeee80702 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.9.1.4
+Version: 0.9.1.5
 Authors@R: c(
     person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
diff --git a/NEWS.md b/NEWS.md
index 968c9d417..8b6ba88da 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,9 @@
 # datawizard 0.9.2
 
+NEW FUNCTIONS
+
+* `data_summary()`, to compute summary statistics of (grouped) data frames.
+
 CHANGES
 
 * `data_modify()` gets three new arguments, `.at`, `.if` and `.modify`, to modify

From 6abbf8ae321b80bc89f0b42ece1f944395949d91 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sun, 3 Mar 2024 18:45:27 +0100
Subject: [PATCH 10/19] fix

---
 tests/testthat/test-data_summary.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/testthat/test-data_summary.R b/tests/testthat/test-data_summary.R
index 5d48ee84d..f94811a85 100644
--- a/tests/testthat/test-data_summary.R
+++ b/tests/testthat/test-data_summary.R
@@ -129,6 +129,6 @@ test_that("data_summary, errors", {
   # not a data frame
   expect_error(
     data_summary(iris$Sepal.Width, MW = mean(Sepal.Width), SD = sd(Sepal.Width)),
-    regex = "only works for (grouped) data frames"
+    regex = "only works for"
   )
 })

From 30cb27069625c4cb3682a8425fc1fa29f64ffe60 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sun, 3 Mar 2024 19:52:17 +0100
Subject: [PATCH 11/19] add print method and snapshot test

---
 NAMESPACE                             |  1 +
 R/data_modify.R                       | 60 +++++++++++++++++++++++++
 R/data_summary.R                      | 65 ++++-----------------------
 tests/testthat/_snaps/data_summary.md | 12 +++++
 tests/testthat/test-data_summary.R    |  7 +++
 5 files changed, 88 insertions(+), 57 deletions(-)
 create mode 100644 tests/testthat/_snaps/data_summary.md

diff --git a/NAMESPACE b/NAMESPACE
index 4e924d76b..d10d1884b 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -94,6 +94,7 @@ S3method(plot,visualisation_recipe)
 S3method(print,data_codebook)
 S3method(print,data_seek)
 S3method(print,dw_data_peek)
+S3method(print,dw_data_summary)
 S3method(print,dw_data_tabulate)
 S3method(print,dw_data_tabulates)
 S3method(print,dw_data_xtabulate)
diff --git a/R/data_modify.R b/R/data_modify.R
index 4621590c8..6234255fd 100644
--- a/R/data_modify.R
+++ b/R/data_modify.R
@@ -320,3 +320,63 @@ data_modify.grouped_df <- function(data, ..., .if = NULL, .at = NULL, .modify =
 
   data
 }
+
+.get_new_dots_variable <- function(dots, i, data) {
+  # iterate expressions for new variables
+  symbol <- dots[[i]]
+
+  # expression is given as character string in a variable, but named, e.g.
+  # a <- "2 * Sepal.Width"
+  # data_modify(iris, double_SepWidth = a)
+  # we reconstruct the symbol as if it were provided as literal expression.
+  # However, we need to check that we don't have a character vector,
+  # like: data_modify(iris, new_var = "a")
+  # this one should be recycled instead.
+  if (!is.character(symbol)) {
+    eval_symbol <- .dynEval(symbol, ifnotfound = NULL)
+    if (is.character(eval_symbol)) {
+      symbol <- try(str2lang(paste0(names(dots)[i], " = ", eval_symbol)), silent = TRUE)
+      # we may have the edge-case of having a function that returns a character
+      # vector, like "new_var = sample(letters[1:3])". In this case, "eval_symbol"
+      # is of type character, but no symbol, thus str2lang() above creates a
+      # wrong pattern. We then take "eval_symbol" as character input.
+      if (inherits(symbol, "try-error")) {
+        symbol <- str2lang(paste0(
+          names(dots)[i],
+          " = c(", paste0("\"", eval_symbol, "\"", collapse = ","), ")"
+        ))
+      }
+    }
+  }
+
+  # finally, we can evaluate expression and get values for new variables
+  new_variable <- try(with(data, eval(symbol)), silent = TRUE)
+
+  # successful, or any errors, like misspelled variable name?
+  if (inherits(new_variable, "try-error")) {
+    # in which step did error happen?
+    step_number <- switch(as.character(i),
+      "1" = "the first expression",
+      "2" = "the second expression",
+      "3" = "the third expression",
+      paste("expression", i)
+    )
+    step_msg <- paste0("There was an error in ", step_number, ".")
+    # try to find out which variable was the cause for the error
+    error_msg <- attributes(new_variable)$condition$message
+    if (grepl("object '(.*)' not found", error_msg)) {
+      error_var <- gsub("object '(.*)' not found", "\\1", error_msg)
+      insight::format_error(
+        paste0(step_msg, " Variable \"", error_var, "\" was not found in the dataset or in the environment."),
+        .misspelled_string(colnames(data), error_var, "Possibly misspelled or not yet defined?")
+      )
+    } else {
+      insight::format_error(paste0(
+        step_msg, " ", insight::format_capitalize(error_msg),
+        ". Possibly misspelled or not yet defined?"
+      ))
+    }
+  }
+
+  new_variable
+}
diff --git a/R/data_summary.R b/R/data_summary.R
index 5e3627c54..d4474eead 100644
--- a/R/data_summary.R
+++ b/R/data_summary.R
@@ -95,7 +95,7 @@ data_summary.data.frame <- function(x, ..., by = NULL) {
     })
     out <- do.call(rbind, out)
   }
-  class(out) <- "data.frame"
+  class(out) <- c("dw_data_summary", "data.frame")
   rownames(out) <- NULL
   out
 }
@@ -160,62 +160,13 @@ data_summary.grouped_df <- function(x, ..., by = NULL) {
 }
 
 
-.get_new_dots_variable <- function(dots, i, data) {
-  # iterate expressions for new variables
-  symbol <- dots[[i]]
-
-  # expression is given as character string in a variable, but named, e.g.
-  # a <- "2 * Sepal.Width"
-  # data_modify(iris, double_SepWidth = a)
-  # we reconstruct the symbol as if it were provided as literal expression.
-  # However, we need to check that we don't have a character vector,
-  # like: data_modify(iris, new_var = "a")
-  # this one should be recycled instead.
-  if (!is.character(symbol)) {
-    eval_symbol <- .dynEval(symbol, ifnotfound = NULL)
-    if (is.character(eval_symbol)) {
-      symbol <- try(str2lang(paste0(names(dots)[i], " = ", eval_symbol)), silent = TRUE)
-      # we may have the edge-case of having a function that returns a character
-      # vector, like "new_var = sample(letters[1:3])". In this case, "eval_symbol"
-      # is of type character, but no symbol, thus str2lang() above creates a
-      # wrong pattern. We then take "eval_symbol" as character input.
-      if (inherits(symbol, "try-error")) {
-        symbol <- str2lang(paste0(
-          names(dots)[i],
-          " = c(", paste0("\"", eval_symbol, "\"", collapse = ","), ")"
-        ))
-      }
-    }
-  }
+# methods ----------------------------------------------------------------------
 
-  # finally, we can evaluate expression and get values for new variables
-  new_variable <- try(with(data, eval(symbol)), silent = TRUE)
-
-  # successful, or any errors, like misspelled variable name?
-  if (inherits(new_variable, "try-error")) {
-    # in which step did error happen?
-    step_number <- switch(as.character(i),
-      "1" = "the first expression",
-      "2" = "the second expression",
-      "3" = "the third expression",
-      paste("expression", i)
-    )
-    step_msg <- paste0("There was an error in ", step_number, ".")
-    # try to find out which variable was the cause for the error
-    error_msg <- attributes(new_variable)$condition$message
-    if (grepl("object '(.*)' not found", error_msg)) {
-      error_var <- gsub("object '(.*)' not found", "\\1", error_msg)
-      insight::format_error(
-        paste0(step_msg, " Variable \"", error_var, "\" was not found in the dataset or in the environment."),
-        .misspelled_string(colnames(data), error_var, "Possibly misspelled or not yet defined?")
-      )
-    } else {
-      insight::format_error(paste0(
-        step_msg, " ", insight::format_capitalize(error_msg),
-        ". Possibly misspelled or not yet defined?"
-      ))
-    }
+#' @export
+print.dw_data_summary <- function(x, ...) {
+  if (nrow(x) == 0) {
+    cat("No matches found.\n")
+  } else {
+    cat(insight::export_table(x, ...))
   }
-
-  new_variable
 }
diff --git a/tests/testthat/_snaps/data_summary.md b/tests/testthat/_snaps/data_summary.md
new file mode 100644
index 000000000..65d9e82f2
--- /dev/null
+++ b/tests/testthat/_snaps/data_summary.md
@@ -0,0 +1,12 @@
+# data_summary, print
+
+    Code
+      print(out)
+    Output
+      am | gear |    MW |   SD
+      ------------------------
+       0 |    3 | 16.11 | 3.37
+       0 |    4 | 21.05 | 3.07
+       1 |    4 | 26.27 | 5.41
+       1 |    5 | 21.38 | 6.66
+
diff --git a/tests/testthat/test-data_summary.R b/tests/testthat/test-data_summary.R
index f94811a85..16cecb809 100644
--- a/tests/testthat/test-data_summary.R
+++ b/tests/testthat/test-data_summary.R
@@ -132,3 +132,10 @@ test_that("data_summary, errors", {
     regex = "only works for"
   )
 })
+
+
+test_that("data_summary, print", {
+  data(mtcars)
+  out <- data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("am", "gear"))
+  expect_snapshot(print(out))
+})

From 465a9ffe0cd4503cd44e7f01f685cf161e877d6a Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sun, 3 Mar 2024 20:10:55 +0100
Subject: [PATCH 12/19] add test

---
 tests/testthat/test-data_summary.R | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/tests/testthat/test-data_summary.R b/tests/testthat/test-data_summary.R
index 16cecb809..e057cc35a 100644
--- a/tests/testthat/test-data_summary.R
+++ b/tests/testthat/test-data_summary.R
@@ -139,3 +139,25 @@ test_that("data_summary, print", {
   out <- data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("am", "gear"))
   expect_snapshot(print(out))
 })
+
+
+test_that("data_summary, inside functions", {
+  foo1 <- function(x, ...) {
+    datawizard::data_summary(x, ..., by = "Species")
+  }
+
+  foo2 <- function(x, by, ...) {
+    datawizard::data_summary(x, ..., by = by)
+  }
+
+  foo3 <- function(x, by) {
+    datawizard::data_summary(x, MW = mean(Sepal.Width), by = by)
+  }
+
+  data(iris)
+  out1 <- foo1(iris, MW = mean(Sepal.Width))
+  out2 <- foo2(iris, by = "Species", MW = mean(Sepal.Width))
+  out3 <- foo3(iris, "Species")
+  expect_equal(out1$MW, out2$MW, tolerance = 1e-4)
+  expect_equal(out1$MW, out3$MW, tolerance = 1e-4)
+})

From 973417b90cd9252b62129ce3c6556992cb857c7e Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sun, 3 Mar 2024 20:13:30 +0100
Subject: [PATCH 13/19] correct english form

---
 R/data_summary.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/data_summary.R b/R/data_summary.R
index d4474eead..8bb4f3e09 100644
--- a/R/data_summary.R
+++ b/R/data_summary.R
@@ -77,8 +77,8 @@ data_summary.data.frame <- function(x, ..., by = NULL) {
       )
     }
     # split data
-    splitted_data <- split(x, x[by])
-    out <- lapply(splitted_data, function(s) {
+    split_data <- split(x, x[by])
+    out <- lapply(split_data, function(s) {
       # no data for combination? Return NULL
       if (nrow(s) == 0) {
         return(NULL)

From b3c7628fb00fdf70b098ffa9a4e73433e775a7f8 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sun, 3 Mar 2024 20:15:53 +0100
Subject: [PATCH 14/19] test

---
 R/data_summary.R                   | 5 +++++
 tests/testthat/test-data_summary.R | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/R/data_summary.R b/R/data_summary.R
index 8bb4f3e09..3b02762ec 100644
--- a/R/data_summary.R
+++ b/R/data_summary.R
@@ -58,6 +58,11 @@ data_summary.default <- function(x, ...) {
 data_summary.data.frame <- function(x, ..., by = NULL) {
   dots <- eval(substitute(alist(...)))
 
+  # do we have any expression at all?
+  if (length(dots) == 0) {
+    insight::format_error("No expressions for calculating summary statistics provided.")
+  }
+
   if (is.null(by)) {
     # when we have no grouping, just compute a one-row summary
     summarise <- .process_datasummary_dots(dots, x)
diff --git a/tests/testthat/test-data_summary.R b/tests/testthat/test-data_summary.R
index e057cc35a..b82d78095 100644
--- a/tests/testthat/test-data_summary.R
+++ b/tests/testthat/test-data_summary.R
@@ -131,6 +131,11 @@ test_that("data_summary, errors", {
     data_summary(iris$Sepal.Width, MW = mean(Sepal.Width), SD = sd(Sepal.Width)),
     regex = "only works for"
   )
+  # no expressions
+  expect_error(
+    data_summary(iris, by = "Species"),
+    regex = "No expressions for calculating"
+  )
 })
 
 

From ce578d54712cde4b9bfc904d08621b9b23163238 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sun, 3 Mar 2024 20:37:08 +0100
Subject: [PATCH 15/19] include NA, sort output

---
 R/data_summary.R                      | 29 +++++++++++++-----
 man/data_summary.Rd                   |  6 +++-
 tests/testthat/_snaps/data_summary.md | 43 +++++++++++++++++++++++++++
 tests/testthat/test-data_summary.R    | 12 ++++++++
 4 files changed, 81 insertions(+), 9 deletions(-)

diff --git a/R/data_summary.R b/R/data_summary.R
index 3b02762ec..8d89b93ae 100644
--- a/R/data_summary.R
+++ b/R/data_summary.R
@@ -8,6 +8,9 @@
 #' @param by Optional character string, indicating the name of a variable in `x`.
 #' If supplied, the data will be split by this variable and summary statistics
 #' will be computed for each group.
+#' @param include_na Logical, if `TRUE`, missing values are included as a level
+#' in the grouping variable. If `FALSE`, missing values are omitted from the
+#' grouping variable.
 #' @param ... One or more named expressions that define the new variable name
 #' and the function to compute the summary statistic. Example:
 #' `mean_sepal_width = mean(Sepal.Width)`. The expression can also be provided
@@ -42,8 +45,8 @@ data_summary <- function(x, ...) {
 
 
 #' @export
-data_summary.matrix <- function(x, ..., by = NULL) {
-  data_summary(as.data.frame(x), ..., by = by)
+data_summary.matrix <- function(x, ..., by = NULL, include_na = TRUE) {
+  data_summary(as.data.frame(x), ..., by = by, include_na = include_na)
 }
 
 
@@ -55,7 +58,7 @@ data_summary.default <- function(x, ...) {
 
 #' @rdname data_summary
 #' @export
-data_summary.data.frame <- function(x, ..., by = NULL) {
+data_summary.data.frame <- function(x, ..., by = NULL, include_na = TRUE) {
   dots <- eval(substitute(alist(...)))
 
   # do we have any expression at all?
@@ -81,8 +84,15 @@ data_summary.data.frame <- function(x, ..., by = NULL) {
         .misspelled_string(colnames(x), by_not_found, "Possibly misspelled?")
       )
     }
-    # split data
-    split_data <- split(x, x[by])
+    # split data, add NA levels, if requested
+    l <- lapply(x[by], function(i) {
+      if (include_na) {
+        addNA(i)
+      } else {
+        i
+      }
+    })
+    split_data <- split(x, l, drop = TRUE)
     out <- lapply(split_data, function(s) {
       # no data for combination? Return NULL
       if (nrow(s) == 0) {
@@ -100,6 +110,9 @@ data_summary.data.frame <- function(x, ..., by = NULL) {
     })
     out <- do.call(rbind, out)
   }
+  # sort data
+  out <- data_arrange(out, select = by)
+  # data attributes
   class(out) <- c("dw_data_summary", "data.frame")
   rownames(out) <- NULL
   out
@@ -107,7 +120,7 @@ data_summary.data.frame <- function(x, ..., by = NULL) {
 
 
 #' @export
-data_summary.grouped_df <- function(x, ..., by = NULL) {
+data_summary.grouped_df <- function(x, ..., by = NULL, include_na = TRUE) {
   # extract group variables
   grps <- attr(x, "groups", exact = TRUE)
   group_variables <- data_remove(grps, ".rows")
@@ -118,7 +131,7 @@ data_summary.grouped_df <- function(x, ..., by = NULL) {
   # remove information specific to grouped df's
   attr(x, "groups") <- NULL
   class(x) <- "data.frame"
-  data_summary(x, ..., by = by)
+  data_summary(x, ..., by = by, include_na = include_na)
 }
 
 
@@ -172,6 +185,6 @@ print.dw_data_summary <- function(x, ...) {
   if (nrow(x) == 0) {
     cat("No matches found.\n")
   } else {
-    cat(insight::export_table(x, ...))
+    cat(insight::export_table(x, missing = "<NA>", ...))
   }
 }
diff --git a/man/data_summary.Rd b/man/data_summary.Rd
index 6c0e89744..0602dc7da 100644
--- a/man/data_summary.Rd
+++ b/man/data_summary.Rd
@@ -7,7 +7,7 @@
 \usage{
 data_summary(x, ...)
 
-\method{data_summary}{data.frame}(x, ..., by = NULL)
+\method{data_summary}{data.frame}(x, ..., by = NULL, include_na = TRUE)
 }
 \arguments{
 \item{x}{A (grouped) data frame.}
@@ -20,6 +20,10 @@ as a character string, e.g. \code{"mean_sepal_width = mean(Sepal.Width)"}.}
 \item{by}{Optional character string, indicating the name of a variable in \code{x}.
 If supplied, the data will be split by this variable and summary statistics
 will be computed for each group.}
+
+\item{include_na}{Logical, if \code{TRUE}, missing values are included as a level
+in the grouping variable. If \code{FALSE}, missing values are omitted from the
+grouping variable.}
 }
 \value{
 A data frame with the requested summary statistics.
diff --git a/tests/testthat/_snaps/data_summary.md b/tests/testthat/_snaps/data_summary.md
index 65d9e82f2..44b7f3f4c 100644
--- a/tests/testthat/_snaps/data_summary.md
+++ b/tests/testthat/_snaps/data_summary.md
@@ -10,3 +10,46 @@
        1 |    4 | 26.27 | 5.41
        1 |    5 | 21.38 | 6.66
 
+# data_summary, with NA
+
+    Code
+      print(out)
+    Output
+      c172code |    MW
+      ----------------
+             1 | 87.12
+             2 | 94.05
+             3 | 75.00
+          <NA> | 47.80
+
+---
+
+    Code
+      print(out)
+    Output
+      c172code |    MW
+      ----------------
+             1 | 87.12
+             2 | 94.05
+             3 | 75.00
+
+---
+
+    Code
+      print(out)
+    Output
+      e42dep | c172code |     MW
+      --------------------------
+      1      |        2 |  17.00
+      2      |        2 |  34.25
+      3      |        1 |  39.50
+      3      |        2 |  52.44
+      3      |        3 |  52.00
+      3      |     <NA> |  84.00
+      4      |        1 | 134.75
+      4      |        2 | 119.26
+      4      |        3 |  88.80
+      4      |     <NA> |  43.29
+      <NA>   |        2 |   <NA>
+      <NA>   |     <NA> |   7.00
+
diff --git a/tests/testthat/test-data_summary.R b/tests/testthat/test-data_summary.R
index b82d78095..23eb02a7c 100644
--- a/tests/testthat/test-data_summary.R
+++ b/tests/testthat/test-data_summary.R
@@ -146,6 +146,18 @@ test_that("data_summary, print", {
 })
 
 
+test_that("data_summary, with NA", {
+  data(efc, package = "datawizard")
+  out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = "c172code")
+  expect_snapshot(print(out))
+  out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = "c172code", include_na = FALSE)
+  expect_snapshot(print(out))
+  # sorting for multiple groups
+  out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = c("e42dep", "c172code"))
+  expect_snapshot(print(out))
+})
+
+
 test_that("data_summary, inside functions", {
   foo1 <- function(x, ...) {
     datawizard::data_summary(x, ..., by = "Species")

From 2bd8cd042148401ca3181aaed170f2e43c5d5b71 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sun, 3 Mar 2024 21:39:27 +0100
Subject: [PATCH 16/19] add test

---
 tests/testthat/test-data_summary.R | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tests/testthat/test-data_summary.R b/tests/testthat/test-data_summary.R
index 23eb02a7c..16adf9ddb 100644
--- a/tests/testthat/test-data_summary.R
+++ b/tests/testthat/test-data_summary.R
@@ -178,3 +178,20 @@ test_that("data_summary, inside functions", {
   expect_equal(out1$MW, out2$MW, tolerance = 1e-4)
   expect_equal(out1$MW, out3$MW, tolerance = 1e-4)
 })
+
+
+test_that("data_summary, expression as variable", {
+  data(mtcars)
+  a <- "MW = mean(mpg)"
+  b <- "SD = sd(mpg)"
+  out <- data_summary(mtcars, a, by = c("am", "gear"))
+  expect_named(out, c("am", "gear", "MW"))
+  expect_equal(out$MW, aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), mean)$mpg, tolerance = 1e-4)
+  expect_error(
+    data_summary(mtcars, a, b, by = c("am", "gear")),
+    regex = "You cannot mix"
+  )
+  out <- data_summary(mtcars, c(a, b), by = c("am", "gear"))
+  expect_named(out, c("am", "gear", "MW", "SD"))
+  expect_equal(out$SD, aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), sd)$mpg, tolerance = 1e-4)
+})

From 502df98aeff78a7ee08b9efa57d818ff1355382b Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sun, 3 Mar 2024 21:41:32 +0100
Subject: [PATCH 17/19] meaningful code comments

---
 R/data_summary.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/data_summary.R b/R/data_summary.R
index 8d89b93ae..90f3e8ae3 100644
--- a/R/data_summary.R
+++ b/R/data_summary.R
@@ -153,10 +153,10 @@ data_summary.grouped_df <- function(x, ..., by = NULL, include_na = TRUE) {
         }
       }
       # expression is given as character string, e.g.
-      # a <- "double_SepWidth = 2 * Sepal.Width"
-      # data_modify(iris, a)
+      # a <- "mean_sepwid = mean(Sepal.Width)"
+      # data_summary(iris, a, by = "Species")
       # or as character vector, e.g.
-      # data_modify(iris, c("var_a = Sepal.Width / 10", "var_b = Sepal.Width * 10"))
+      # data_summary(iris, c("var_a = mean(Sepal.Width)", "var_b = sd(Sepal.Width)"))
       character_symbol <- tryCatch(.dynEval(dots[[1]]), error = function(e) NULL)
       # do we have a character vector? Then we can proceed
       if (is.character(character_symbol)) {

From 410090e38f43df0005a826633cbac21f69fdf4bb Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sun, 3 Mar 2024 23:44:18 +0100
Subject: [PATCH 18/19] add test

---
 tests/testthat/test-data_summary.R | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/testthat/test-data_summary.R b/tests/testthat/test-data_summary.R
index 16adf9ddb..aede5c042 100644
--- a/tests/testthat/test-data_summary.R
+++ b/tests/testthat/test-data_summary.R
@@ -136,6 +136,11 @@ test_that("data_summary, errors", {
     data_summary(iris, by = "Species"),
     regex = "No expressions for calculating"
   )
+  # wrong expression
+  expect_error(
+    data_summary(mtcars, mw = mesn(mpg), by = "am"),
+    regex = "There was an error"
+  )
 })
 
 

From 26f70f208e97fbd4e038badfc4ee2bc6c24199d0 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sun, 3 Mar 2024 23:57:41 +0100
Subject: [PATCH 19/19] Update data_summary.R

---
 R/data_summary.R | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/R/data_summary.R b/R/data_summary.R
index 90f3e8ae3..5f8b0ee08 100644
--- a/R/data_summary.R
+++ b/R/data_summary.R
@@ -105,7 +105,7 @@ data_summary.data.frame <- function(x, ..., by = NULL, include_na = TRUE) {
       # bind grouping-variables and values
       summarised_data <- cbind(s[1, by], summarised_data)
       # make sure we have proper column names
-      colnames(summarised_data) <- c(by, vapply(summarise, names, character(1)))
+      colnames(summarised_data) <- c(by, unlist(lapply(summarise, names)))
       summarised_data
     })
     out <- do.call(rbind, out)
@@ -170,7 +170,11 @@ data_summary.grouped_df <- function(x, ..., by = NULL, include_na = TRUE) {
 
     out <- lapply(seq_along(dots), function(i) {
       new_variable <- .get_new_dots_variable(dots, i, data)
-      stats::setNames(new_variable, names(dots)[i])
+      if (inherits(new_variable, c("bayestestR_ci", "bayestestR_eti"))) {
+        stats::setNames(new_variable, c("CI", "CI_low", "CI_high"))
+      } else {
+        stats::setNames(new_variable, names(dots)[i])
+      }
     })
   }
 
@@ -185,6 +189,11 @@ print.dw_data_summary <- function(x, ...) {
   if (nrow(x) == 0) {
     cat("No matches found.\n")
   } else {
+    if (all(c("CI", "CI_low", "CI_high") %in% colnames(x))) {
+      ci <- insight::format_table(x[c("CI", "CI_low", "CI_high")], ...)
+      x$CI <- x$CI_low <- x$CI_high <- NULL
+      x <- cbind(x, ci)
+    }
     cat(insight::export_table(x, missing = "<NA>", ...))
   }
 }