From 82c4abd2c650eb78ea0496363cf6b1e827effe3a Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Fri, 15 Dec 2023 17:05:18 -0600 Subject: [PATCH] Re-level 'factor' and 'ordered' variables with new levels (#639) * When writing factors ensure levels are consistent with schema levels * Generate factor test column as factor * Also evolve array * New tests * Update NEWS and roll micro version [ci skip] --- DESCRIPTION | 2 +- NEWS.md | 2 + R/TileDBArray.R | 34 +++++++++++++++ inst/tinytest/test_arrayschemaevolution.R | 50 +++++++++++++++++++++-- 4 files changed, 84 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 37fc7fdd8e..84c2ca624e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: tiledb Type: Package -Version: 0.22.0.6 +Version: 0.22.0.7 Title: Modern Database Engine for Multi-Modal Data via Sparse and Dense Multidimensional Arrays Authors@R: c(person("TileDB, Inc.", role = c("aut", "cph")), person("Dirk", "Eddelbuettel", email = "dirk@tiledb.com", role = "cre")) diff --git a/NEWS.md b/NEWS.md index 5cf15d1eb3..29107d823e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ * A TileDB Array can now be opened in 'keep open' mode for subsequent use without re-opening (#630) +* Arrays with factor (or ordered) variables now grow their factor levels in appending writes (#639) + ## Bug Fixes * The read buffer is now correctly sized when implementing VFS serialization (#631) diff --git a/R/TileDBArray.R b/R/TileDBArray.R index 72b1f1b5e2..ac561b7ae8 100644 --- a/R/TileDBArray.R +++ b/R/TileDBArray.R @@ -1261,23 +1261,27 @@ setMethod("[<-", "tiledb_array", dimtypes <- sapply(dims, function(d) libtiledb_dim_get_datatype(d@ptr)) dimvarnum <- sapply(dims, function(d) libtiledb_dim_get_cell_val_num(d@ptr)) dimnullable <- sapply(dims, function(d) FALSE) + dimdictionary <- sapply(dims, function(d) FALSE) attrs <- tiledb::attrs(schema(x)) attrnames <- unname(sapply(attrs, function(a) libtiledb_attribute_get_name(a@ptr))) attrtypes <- unname(sapply(attrs, function(a) libtiledb_attribute_get_type(a@ptr))) attrvarnum <- unname(sapply(attrs, function(a) libtiledb_attribute_get_cell_val_num(a@ptr))) attrnullable <- unname(sapply(attrs, function(a) libtiledb_attribute_get_nullable(a@ptr))) + attrdictionary <- unname(sapply(attrs, function(a) libtiledb_attribute_has_enumeration(ctx@ptr, a@ptr))) if (length(attrnames) > 0) { allnames <- c(dimnames, attrnames) alltypes <- c(dimtypes, attrtypes) allvarnum <- c(dimvarnum, attrvarnum) allnullable <- c(dimnullable, attrnullable) + alldictionary <- c(dimdictionary, attrdictionary) } else { allnames <- dimnames alltypes <- dimtypes allvarnum <- dimvarnum allnullable <- dimnullable + alldictionary <- dimdictionary } ## check we have complete columns (as we cannot write subset of attributes) @@ -1336,6 +1340,7 @@ setMethod("[<-", "tiledb_array", allnames <- attrnames alltypes <- attrtypes allnullable <- attrnullable + alldictionary <- attrdictionary } ## Case 4: dense, list on RHS e.g. the ex_1.R example @@ -1353,6 +1358,7 @@ setMethod("[<-", "tiledb_array", allnames <- attrnames alltypes <- attrtypes allnullable <- attrnullable + alldictionary <- attrdictionary } nc <- if (is.list(value)) length(value) else ncol(value) @@ -1392,6 +1398,34 @@ setMethod("[<-", "tiledb_array", for (colnam in allnames) { ## when an index column is use this may be unordered to remap to position in 'nm' names k <- match(colnam, nm) + + if (alldictionary[k]) { + spdl::debug("[tiledb_array] '[<-' column {} ({}) is factor", colnam, k) + new_levels <- levels(value[[k]]) + + attr <- attrs[[allnames[k]]] + tpstr <- tiledb_attribute_get_enumeration_type_ptr(attr, arrptr) + if (tpstr %in% c("ASCII", "UTF8")) { + dictionary <- tiledb_attribute_get_enumeration_ptr(attr, arrptr) + } else if (tpstr %in% c("FLOAT32", "FLOAT64", "BOOL", "UINT8", "UINT16", "UINT32", "UINT64", + "INT8", "INT16", "INT32", "INT64")) { + dictionary <- tiledb_attribute_get_enumeration_vector_ptr(attr, arrptr) + } else { + stop("Unsupported enumeration vector payload of type '%s'", tpstr, call. = FALSE) + } + added_enums <- setdiff(new_levels, dictionary) + if (length(added_enums) > 0) { + levels <- unique(c(dictionary, new_levels)) + is_ordered <- tiledb_attribute_is_ordered_enumeration_ptr(attr, arrptr) + value[[k]] <- factor(value[[k]], levels = levels, ordered = is_ordered) + spdl::debug("[tiledb_array] '[<-' releveled column {}", k) + ase <- tiledb_array_schema_evolution() + arr <- tiledb_array_open(x) + ase <- tiledb_array_schema_evolution_extend_enumeration(ase, arr, allnames[[k]], added_enums) + tiledb::tiledb_array_schema_evolution_array_evolve(ase, uri) + } + } + if (alltypes[k] %in% c("CHAR", "ASCII", "UTF8")) { # variable length txtvec <- as.character(value[[k]]) spdl::debug("[tiledb_array] '[<-' alloc char buffer {} '{}': {}", k, colnam, alltypes[k]) diff --git a/inst/tinytest/test_arrayschemaevolution.R b/inst/tinytest/test_arrayschemaevolution.R index 6e64f901d3..99adf3d6ca 100644 --- a/inst/tinytest/test_arrayschemaevolution.R +++ b/inst/tinytest/test_arrayschemaevolution.R @@ -64,7 +64,6 @@ expect_equal(levels(res$val), enums) expect_equal(as.integer(res$val), c(1:5,5:1)) - ## -- testing 'create empty following by extending' if (tiledb_version(TRUE) < "2.17.3") exit_file("Needs TileDB 2.17.3 or later") uri <- tempfile() @@ -112,11 +111,10 @@ run_int_col_test <- function(coltype) { tiledb_array_create(uri, schema) set.seed(42) - df <- data.frame(dim = 1:10, fct = sample(1:length(enums), 10, replace=TRUE) - 1, dbl = rnorm(10)) + df <- data.frame(dim = 1:10, fct = sample(length(enums), 10, replace=TRUE) - 1, dbl = rnorm(10)) arr <- tiledb_array(uri) arr[] <- df - qc <- res <- tiledb_array(uri, return_as="data.frame", query_condition = parse_query_condition(fct == blue, arr))[] expect_equal(nrow(res), 5) @@ -134,3 +132,49 @@ run_int_col_test <- function(coltype) { unlink(uri) } sapply(c("INT8", "INT16", "INT32", "UINT8", "UINT16", "UINT32"), run_int_col_test) + + +## test that factor levels can grow without overlap +uri <- tempfile() +df1 <- data.frame(id = 1:3, obs = factor(c("A", "B", "A"))) +fromDataFrame(df1, uri, col_index=1, tile_domain=c(1L, 6L)) + +## write with a factor with two elements but without one of the initial ones +## while factor(c("B", "C", "B")) gets encoded as c(1,2,1) it should really +## encoded as c(2,3,2) under levels that are c("A", "B", "C") -- and the +## write method now does that +df2 <- data.frame(id = 4:6, obs = factor(c("B", "C", "B"))) +fromDataFrame(df2, uri, col_index=1, mode="append") + +res <- tiledb_array(uri, return_as="data.frame")[] + +expect_equal(nrow(res), 6) +expect_equal(nlevels(res[["obs"]]), 3) +expect_equal(levels(res[["obs"]]), c("A", "B", "C")) +expect_equal(as.integer(res[["obs"]]), c(1L, 2L, 1L, 2L, 3L, 2L)) + +ref <- rbind(df1, df2) +expect_equivalent(res, ref) # equivalent because of query status attribute + + +## test that ordered factor levels can grow without overlap +uri <- tempfile() +df1 <- data.frame(id = 1:3, obs = ordered(c("A", "B", "A"))) +fromDataFrame(df1, uri, col_index=1, tile_domain=c(1L, 6L)) + +## write with a factor with two elements but without one of the initial ones +## while factor(c("B", "C", "B")) gets encoded as c(1,2,1) it should really +## encoded as c(2,3,2) under levels that are c("A", "B", "C") -- and the +## write method now does that +df2 <- data.frame(id = 4:6, obs = ordered(c("B", "C", "B"))) +fromDataFrame(df2, uri, col_index=1, mode="append") + +res <- tiledb_array(uri, return_as="data.frame")[] + +expect_equal(nrow(res), 6) +expect_equal(nlevels(res[["obs"]]), 3) +expect_equal(levels(res[["obs"]]), c("A", "B", "C")) +expect_equal(as.integer(res[["obs"]]), c(1L, 2L, 1L, 2L, 3L, 2L)) + +ref <- rbind(df1, df2) +expect_equivalent(res, ref) # equivalent because of query status attribute