Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement rowmean_n() #445

Merged
merged 13 commits into from
Aug 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.8.0.4
Version: 0.8.0.5
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@ export(reverse)
export(reverse_scale)
export(row_to_colnames)
export(rowid_as_column)
export(rowmean_n)
export(rownames_as_column)
export(skewness)
export(slide)
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# datawizard (devel)

NEW FUNCTIONS

* `rowmean_n()`, to compute row means for those rows that contain at least `n`
  non-missing values.

CHANGES

* `recode_into()` gains an `overwrite` argument to skip overwriting already
Expand Down
101 changes: 101 additions & 0 deletions R/rowmean_n.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#' @title Row means with minimum amount of valid values
#' @name rowmean_n
#' @description This function is similar to the SPSS `MEAN.n` function and computes
#' row means from a data frame or matrix if at least `n` values of a row are
#' valid (and not `NA`).
#'
#' @param data A data frame with at least two columns, where row means are applied.
#' @param n A finite, non-negative numeric value of length 1. May either be
#' - a numeric value that indicates the amount of valid values per row to
#'   calculate the row mean;
#' - or a value between 0 and 1 (exclusive), indicating a proportion of valid
#'   values per row to calculate the row mean (see 'Details').
#'
#' If a row's number of valid values is less than `n`, `NA` will be returned.
#' @param digits Numeric value indicating the number of decimal places to be
#' used for rounding mean values. Negative values are allowed (see 'Details').
#' By default, `digits = NULL` and no rounding is used.
#' @param verbose Toggle messages, e.g. the note that non-numeric columns were
#' dropped before calculating the row means.
#'
#' @return A vector with row means for those rows with at least `n` valid values.
#' All other rows, and rows without any valid value, are `NA`.
#'
#' @details Rounding to a negative number of `digits` means rounding to a power of
#' ten, for example `rowmean_n(df, 3, digits = -2)` rounds to the nearest hundred.
#'
#' `n` must be a numeric value from `0` to `ncol(data)`, where `ncol(data)`
#' refers to the number of numeric columns (non-numeric columns are dropped).
#' If a row in the data frame has at least `n` non-missing values, the row mean
#' is returned. If `n` is larger than 0 and smaller than 1, `n` is considered
#' to indicate the proportion of required non-missing values per row. E.g., if
#' `n = 0.75`, a row must have at least `round(ncol(data) * n)` non-missing
#' values for the row mean to be calculated. Non-integer values of 1 or larger
#' are *not* treated as proportions, but as a number of values, i.e. `n = 2.5`
#' requires at least three non-missing values. Rows without any non-missing
#' value always return `NA`, even if `n` is (or is rounded to) 0.
#' See 'Examples'.
#'
#' @examples
#' dat <- data.frame(
#'   c1 = c(1, 2, NA, 4),
#'   c2 = c(NA, 2, NA, 5),
#'   c3 = c(NA, 4, NA, NA),
#'   c4 = c(2, 3, 7, 8)
#' )
#'
#' # needs at least 4 non-missing values per row
#' rowmean_n(dat, 4) # 1 valid return value
#'
#' # needs at least 3 non-missing values per row
#' rowmean_n(dat, 3) # 2 valid return values
#'
#' # needs at least 2 non-missing values per row
#' rowmean_n(dat, 2)
#'
#' # needs at least 1 non-missing value per row
#' rowmean_n(dat, 1) # all means are shown
#'
#' # needs at least 50% of non-missing values per row
#' rowmean_n(dat, 0.5) # 3 valid return values
#'
#' # needs at least 75% of non-missing values per row
#' rowmean_n(dat, 0.75) # 2 valid return values
#'
#' @export
rowmean_n <- function(data, n, digits = NULL, verbose = TRUE) {
  data <- .coerce_to_dataframe(data)

  # n must be a single, non-missing numeric value. `||` short-circuits, so
  # `is.na(n)` is only evaluated once `n` is known to have length 1.
  if (is.null(n) || !is.numeric(n) || length(n) != 1L || is.na(n)) {
    insight::format_error("`n` must be a numeric value of length 1.")
  }

  # reject `Inf` (for which `n %% 1` is `NaN`) and negative values (for which
  # `n %% 1` is positive), neither of which is a meaningful threshold
  if (!is.finite(n) || n < 0) {
    insight::format_error("`n` must be a finite, non-negative value.")
  }

  # make sure we only have numeric values
  numeric_columns <- vapply(data, is.numeric, TRUE)
  if (!all(numeric_columns)) {
    if (verbose) {
      insight::format_alert("Only numeric columns are considered for calculation.")
    }
    data <- data[numeric_columns]
  }

  # check if we have a data frame with at least two columns
  if (ncol(data) < 2) {
    insight::format_error("`data` must be a data frame with at least two numeric columns.")
  }

  # is 'n' indicating a proportion? Only values strictly between 0 and 1 are
  # proportions; the fractional part of larger values (e.g. 2.5) must not be
  # re-interpreted as a proportion.
  if (n > 0 && n < 1) {
    n <- round(ncol(data) * n)
  }

  # n may not be larger than df's amount of columns
  if (ncol(data) < n) {
    insight::format_error("`n` must be smaller or equal to number of columns in data frame.")
  }

  # row means; rows with fewer than `n` valid values are set to NA. Rows
  # without any valid value are set to NA as well, because `rowMeans()` would
  # return `NaN` for them when `n` is 0.
  n_valid <- rowSums(!is.na(data))
  to_na <- n_valid < n | n_valid == 0
  out <- rowMeans(data, na.rm = TRUE)
  out[to_na] <- NA

  # round, if requested
  if (!is.null(digits) && !all(is.na(digits))) {
    out <- round(out, digits = digits)
  }
  out
}
1 change: 1 addition & 0 deletions _pkgdown.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ reference:
- smoothness
- skewness
- weighted_mean
- rowmean_n
- mean_sd

- title: Convert and Replace Data
Expand Down
13 changes: 10 additions & 3 deletions man/describe_distribution.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

72 changes: 72 additions & 0 deletions man/rowmean_n.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 26 additions & 0 deletions tests/testthat/test-rowmean_n.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Row means are only returned for rows with at least `n` (count or proportion)
# non-missing values; all other rows are expected to be NA.
test_that("rowmean_n", {
  d_mn <- data.frame(
    c1 = c(1, 2, NA, 4),
    c2 = c(NA, 2, NA, 5),
    c3 = c(NA, 4, NA, NA),
    c4 = c(2, 3, 7, 8)
  )
  # `n` given as number of required non-missing values
  expect_equal(rowmean_n(d_mn, 4), c(NA, 2.75, NA, NA), tolerance = 1e-3)
  expect_equal(rowmean_n(d_mn, 3), c(NA, 2.75, NA, 5.66667), tolerance = 1e-3)
  expect_equal(rowmean_n(d_mn, 2), c(1.5, 2.75, NA, 5.66667), tolerance = 1e-3)
  expect_equal(rowmean_n(d_mn, 1), c(1.5, 2.75, 7, 5.66667), tolerance = 1e-3)
  # `n` given as proportion of required non-missing values
  expect_equal(rowmean_n(d_mn, 0.5), c(1.5, 2.75, NA, 5.66667), tolerance = 1e-3)
  expect_equal(rowmean_n(d_mn, 0.75), c(NA, 2.75, NA, 5.66667), tolerance = 1e-3)
  # rounding: compared with the default tolerance, because a tolerance of 0.1
  # would also accept the unrounded means (2.75, 5.66667)
  expect_equal(rowmean_n(d_mn, 2, digits = 1), c(1.5, 2.8, NA, 5.7))
})

# Invalid `data` or `n` must raise an informative error; dropping non-numeric
# columns emits a message that can be silenced with `verbose = FALSE`.
test_that("rowmean_n, errors or messages", {
  data(iris)
  # `regexp` is spelled out in full instead of relying on partial argument
  # matching of `regex`
  expect_error(rowmean_n(5, n = 1), regexp = "`data` must be")
  expect_error(rowmean_n(iris[1], n = 1), regexp = "two numeric")
  expect_error(rowmean_n(iris, n = NULL), regexp = "numeric value")
  expect_error(rowmean_n(iris, n = 1:4), regexp = "numeric value")
  expect_error(rowmean_n(iris, n = "a"), regexp = "numeric value")
  expect_message(rowmean_n(iris[1:3, ], n = 3), regexp = "Only numeric")
  expect_silent(rowmean_n(iris[1:3, ], n = 3, verbose = FALSE))
})
Loading