diff --git a/DESCRIPTION b/DESCRIPTION
index 3afc557..cc5a67c 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -49,7 +49,6 @@ Imports:
stringr,
base,
readr,
- lifecycle,
huxtable,
crayon,
data.table,
@@ -61,7 +60,8 @@ Imports:
sp,
withr,
cli,
- purrr
+ purrr,
+ lifecycle
RoxygenNote: 7.3.1
Suggests:
knitr,
diff --git a/NAMESPACE b/NAMESPACE
index f63564d..07bc09a 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -7,6 +7,7 @@ export(convert_datetime_format)
export(convert_long_to_utm)
export(convert_utm_to_ll)
export(create_datastore_script)
+export(document_missing_values)
export(fix_utc_offset)
export(fuzz_location)
export(generate_ll_from_utm)
diff --git a/NEWS.md b/NEWS.md
index c631921..cedb49b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,8 @@
# QCkit v0.1.8 (not yet released)
+2024-07-16
+* Added experimental function `document_missing_values()`, which searches a file for multiple missing value codes, replaces them all with NA, and generates a new column with the missing value codes so that they can be properly documented in EML. This is a work-around for the fact that there is currently not a good way to get multiple missing value codes in a single column via EMLassemblyline. This function is still under development; expect substantial changes and improvements up to and including removing the function entirely.
+
2024-07-09
* Added function `get_user_email()`, which accesses NPS active directory via a powershell function to return the user's email address. Probably won't work for non-NPS users and probably won't work for non-windows users.
* Updated rest API from legacy v6 to current v7.
diff --git a/R/replace_blanks.R b/R/replace_blanks.R
index 6b9a835..43f9bc6 100644
--- a/R/replace_blanks.R
+++ b/R/replace_blanks.R
@@ -93,3 +93,86 @@ replace_blanks <- function(directory = here::here(), missing_val_code = NA) {
}
return(invisible())
}
+
+
+#' Handles multiple missing values
+#'
+#' @description
+#' `r lifecycle::badge("experimental")`
+#' `r lifecycle::badge("questioning")`
+#' Given a file name (.csv only) and path, the function will search the
+#' columns for any that contain multiple user-specified missing value codes.
+#' For any column with multiple missing value codes, all the missing values
+#' (including blanks) will be replaced with NA. A new column will be generated
+#' and populated with the given missing value code from the original column.
+#' Values that were not missing will be populated with "not_missing". The
+#' newly generated column of categorical variables can be used to describe
+#' the various/multiple reasons for why data is absent in the original column.
+#'
+#' The function will then write the new dataframe to a file, overwriting the
+#' original file. If it is important to keep a copy of the original file, make
+#' a copy prior to running the function.
+#'
+#' WARNING: this function will replace any blank cells in your data with NA!
+#'
+#' @details Blank cells will be treated as NA.
+#'
+#' @param file_name String. The name of the file to inspect
+#' @param directory String. Location of file to read/write. Defaults to the current working directory.
+#' @param colname `r lifecycle::badge("experimental")` String. The columns to inspect. CURRENTLY ONLY WORKS AS SET TO DEFAULT "NA".
+#' @param missing_val_codes List. A list of strings containing the missing value code or codes to search for.
+#' @param replace_value String. The value (singular) to replace multiple missing values with. Defaults to NA.
+#'
+#' @return writes a new dataframe to file. Return invisible.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' document_missing_values(file_name = "mydata.csv",
+#' directory = here::here(),
+#' colname = NA, #do not change during function development
+#' missing_val_codes = c("missing", "blank", "no data"),
+#' replace_value = NA)
+#' }
+document_missing_values <- function(file_name,
+                                    directory = here::here(),
+                                    colname = NA,
+                                    missing_val_codes = NA,
+                                    replace_value = NA) {
+
+  # Build the path once; it is used for both the read and the write.
+  file_path <- file.path(directory, file_name)
+  # Read in a dataframe (readr reads blank cells as NA by default):
+  df <- readr::read_csv(file_path, show_col_types = FALSE)
+  # NA always counts as a missing value code; drop duplicate codes.
+  missing_val_codes <- unique(append(missing_val_codes, NA))
+
+  # Only the default colname = NA (inspect every column) is implemented;
+  # any other value skips the column scan below.
+  if (is.na(colname)) {
+    # Iterating over the names (not 1:ncol) is safe for zero columns.
+    for (col in colnames(df)) {
+      values <- df[[col]]
+      is_missing <- values %in% missing_val_codes
+      # Act only if the column holds codes besides NA, i.e. more cells
+      # match the code list than are NA.
+      if (sum(is_missing) > sum(is.na(values))) {
+        # Create the documenting column directly under its final name.
+        # Going through a temporary column called "x" would overwrite
+        # (and then rename) a real data column named "x".
+        new_col <- paste0("custom_", col, "_MissingValues")
+        # Keep the original code where missing, "not_missing" elsewhere.
+        df[[new_col]] <- ifelse(is_missing, values, "not_missing")
+        # Replace every missing value code with the replacement value.
+        df[[col]] <- ifelse(is_missing, replace_value, values)
+      }
+    }
+  }
+
+  # Write the file back out, overwriting the original:
+  readr::write_csv(df, file_path)
+
+  # Call invisible() rather than returning the bare function object
+  # `invisible`, which would be returned (and printed) visibly.
+  return(invisible())
+}
diff --git a/docs/index.html b/docs/index.html
index 5fb3f58..2573281 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -123,7 +123,7 @@
2024-07-16 * Added experimental function document_missing_values(), which searches a file for multiple missing value codes, replaces them all with NA, and generates a new column with the missing value codes so that they can be properly documented in EML. This is a work-around for the fact that there is currently not a good way to get multiple missing value codes in a single column via EMLassemblyline. This function is still under development; expect substantial changes an improvements up to and including removing the function entirely.
2024-07-09 * Added function get_user_email(), which accesses NPS active directory via a powershell function to return the user’s email address. Probably won’t work for non-NPS users and probably won’t work for non-windows users. * Updated rest API from legacy v6 to current v7.
2024-06-28 * Updated get_park_polygon() to use the new API (had been using a legacy API). Added documentation to specify that the function is getting the convexhull for the park, which may not work particularly well for some parks. 2024-06-27 * bug fixes for generate_ll_from_utm() * add function remove_empty_tables() (and associated unit tests) * update documentation for replace blanks() to indicate it can replace blanks with more than just NA
+
+Given a file name (.csv only) and path, the function will search the
+columns for any that contain multiple user-specified missing value codes.
+For any column with multiple missing value codes, all the missing values
+(including blanks) will be replaced with NA. A new column will be generated
+and, populated with the given missing value code from the origin column.
+Values that were not missing will be populated with "not_missing". The
+newly generate column of categorical variables can be used do describe
+the various/multiple reasons for why data is absent in the original column.
+
The function will then write the new dataframe to a file, overwriting the
+original file. If it is important to keep a copy of the original file, make
+a copy prior to running the function.
+
WARNING: this function will replace any blank cells in your data with NA!
diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index 31e79b0..ec11d5b 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -51,6 +51,9 @@
/reference/DC_col_check.html
+
+ /reference/document_missing_values.html
+ /reference/dot-get_unit_boundary.html
diff --git a/man/document_missing_values.Rd b/man/document_missing_values.Rd
new file mode 100644
index 0000000..62aa6c2
--- /dev/null
+++ b/man/document_missing_values.Rd
@@ -0,0 +1,58 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/replace_blanks.R
+\name{document_missing_values}
+\alias{document_missing_values}
+\title{Handles multiple missing values}
+\usage{
+document_missing_values(
+ file_name,
+ directory = here::here(),
+ colname = NA,
+ missing_val_codes = NA,
+ replace_value = NA
+)
+}
+\arguments{
+\item{file_name}{String. The name of the file to inspect}
+
+\item{directory}{String. Location of file to read/write. Defaults to the current working directory.}
+
+\item{colname}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} String. The columns to inspect. CURRENTLY ONLY WORKS AS SET TO DEFAULT "NA".}
+
+\item{missing_val_codes}{List. A list of strings containing the missing value code or codes to search for.}
+
+\item{replace_value}{String. The value (singular) to replace multiple missing values with. Defaults to NA.}
+}
+\value{
+writes a new dataframe to file. Return invisible.
+}
+\description{
+\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}
+\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#questioning}{\figure{lifecycle-questioning.svg}{options: alt='[Questioning]'}}}{\strong{[Questioning]}}
+Given a file name (.csv only) and path, the function will search the
+columns for any that contain multiple user-specified missing value codes.
+For any column with multiple missing value codes, all the missing values
+(including blanks) will be replaced with NA. A new column will be generated
+and populated with the given missing value code from the original column.
+Values that were not missing will be populated with "not_missing". The
+newly generated column of categorical variables can be used to describe
+the various/multiple reasons for why data is absent in the original column.
+
+The function will then write the new dataframe to a file, overwriting the
+original file. If it is important to keep a copy of the original file, make
+a copy prior to running the function.
+
+WARNING: this function will replace any blank cells in your data with NA!
+}
+\details{
+Blank cells will be treated as NA.
+}
+\examples{
+\dontrun{
+document_missing_values(file_name = "mydata.csv",
+ directory = here::here(),
+ colname = NA, #do not change during function development
+ missing_val_codes = c("missing", "blank", "no data"),
+ replace_value = NA)
+ }
+}
diff --git a/man/figures/lifecycle-archived.svg b/man/figures/lifecycle-archived.svg
index 48f72a6..745ab0c 100644
--- a/man/figures/lifecycle-archived.svg
+++ b/man/figures/lifecycle-archived.svg
@@ -1 +1,21 @@
-
\ No newline at end of file
+
diff --git a/man/figures/lifecycle-defunct.svg b/man/figures/lifecycle-defunct.svg
index 01452e5..d5c9559 100644
--- a/man/figures/lifecycle-defunct.svg
+++ b/man/figures/lifecycle-defunct.svg
@@ -1 +1,21 @@
-
\ No newline at end of file
+
diff --git a/man/figures/lifecycle-deprecated.svg b/man/figures/lifecycle-deprecated.svg
index 4baaee0..b61c57c 100644
--- a/man/figures/lifecycle-deprecated.svg
+++ b/man/figures/lifecycle-deprecated.svg
@@ -1 +1,21 @@
-
\ No newline at end of file
+
diff --git a/man/figures/lifecycle-experimental.svg b/man/figures/lifecycle-experimental.svg
index d1d060e..5d88fc2 100644
--- a/man/figures/lifecycle-experimental.svg
+++ b/man/figures/lifecycle-experimental.svg
@@ -1 +1,21 @@
-
\ No newline at end of file
+
diff --git a/man/figures/lifecycle-maturing.svg b/man/figures/lifecycle-maturing.svg
index df71310..897370e 100644
--- a/man/figures/lifecycle-maturing.svg
+++ b/man/figures/lifecycle-maturing.svg
@@ -1 +1,21 @@
-
\ No newline at end of file
+
diff --git a/man/figures/lifecycle-questioning.svg b/man/figures/lifecycle-questioning.svg
index 08ee0c9..7c1721d 100644
--- a/man/figures/lifecycle-questioning.svg
+++ b/man/figures/lifecycle-questioning.svg
@@ -1 +1,21 @@
-
\ No newline at end of file
+
diff --git a/man/figures/lifecycle-soft-deprecated.svg b/man/figures/lifecycle-soft-deprecated.svg
new file mode 100644
index 0000000..9c166ff
--- /dev/null
+++ b/man/figures/lifecycle-soft-deprecated.svg
@@ -0,0 +1,21 @@
+
diff --git a/man/figures/lifecycle-stable.svg b/man/figures/lifecycle-stable.svg
index e015dc8..9bf21e7 100644
--- a/man/figures/lifecycle-stable.svg
+++ b/man/figures/lifecycle-stable.svg
@@ -1 +1,29 @@
-
\ No newline at end of file
+
diff --git a/man/figures/lifecycle-superseded.svg b/man/figures/lifecycle-superseded.svg
index 75f24f5..db8d757 100644
--- a/man/figures/lifecycle-superseded.svg
+++ b/man/figures/lifecycle-superseded.svg
@@ -1 +1,21 @@
-
\ No newline at end of file
+