Skip to content

Commit

Permalink
Re-level 'factor' and 'ordered' variables with new levels (#639)
Browse files Browse the repository at this point in the history
* When writing factors ensure levels are consistent with schema levels

* Generate factor test column as factor

* Also evolve array

* New tests

* Update NEWS and roll micro version [ci skip]
  • Loading branch information
eddelbuettel authored Dec 15, 2023
1 parent e65a067 commit 82c4abd
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 4 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: tiledb
Type: Package
Version: 0.22.0.6
Version: 0.22.0.7
Title: Modern Database Engine for Multi-Modal Data via Sparse and Dense Multidimensional Arrays
Authors@R: c(person("TileDB, Inc.", role = c("aut", "cph")),
person("Dirk", "Eddelbuettel", email = "[email protected]", role = "cre"))
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

* A TileDB Array can now be opened in 'keep open' mode for subsequent use without re-opening (#630)

* Arrays with factor (or ordered) variables now grow their factor levels in appending writes (#639)

## Bug Fixes

* The read buffer is now correctly sized when implementing VFS serialization (#631)
Expand Down
34 changes: 34 additions & 0 deletions R/TileDBArray.R
Original file line number Diff line number Diff line change
Expand Up @@ -1261,23 +1261,27 @@ setMethod("[<-", "tiledb_array",
dimtypes <- sapply(dims, function(d) libtiledb_dim_get_datatype(d@ptr))
dimvarnum <- sapply(dims, function(d) libtiledb_dim_get_cell_val_num(d@ptr))
dimnullable <- sapply(dims, function(d) FALSE)
dimdictionary <- sapply(dims, function(d) FALSE)

attrs <- tiledb::attrs(schema(x))
attrnames <- unname(sapply(attrs, function(a) libtiledb_attribute_get_name(a@ptr)))
attrtypes <- unname(sapply(attrs, function(a) libtiledb_attribute_get_type(a@ptr)))
attrvarnum <- unname(sapply(attrs, function(a) libtiledb_attribute_get_cell_val_num(a@ptr)))
attrnullable <- unname(sapply(attrs, function(a) libtiledb_attribute_get_nullable(a@ptr)))
attrdictionary <- unname(sapply(attrs, function(a) libtiledb_attribute_has_enumeration(ctx@ptr, a@ptr)))

if (length(attrnames) > 0) {
allnames <- c(dimnames, attrnames)
alltypes <- c(dimtypes, attrtypes)
allvarnum <- c(dimvarnum, attrvarnum)
allnullable <- c(dimnullable, attrnullable)
alldictionary <- c(dimdictionary, attrdictionary)
} else {
allnames <- dimnames
alltypes <- dimtypes
allvarnum <- dimvarnum
allnullable <- dimnullable
alldictionary <- dimdictionary
}

## check we have complete columns (as we cannot write subset of attributes)
Expand Down Expand Up @@ -1336,6 +1340,7 @@ setMethod("[<-", "tiledb_array",
allnames <- attrnames
alltypes <- attrtypes
allnullable <- attrnullable
alldictionary <- attrdictionary
}

## Case 4: dense, list on RHS e.g. the ex_1.R example
Expand All @@ -1353,6 +1358,7 @@ setMethod("[<-", "tiledb_array",
allnames <- attrnames
alltypes <- attrtypes
allnullable <- attrnullable
alldictionary <- attrdictionary
}

nc <- if (is.list(value)) length(value) else ncol(value)
Expand Down Expand Up @@ -1392,6 +1398,34 @@ setMethod("[<-", "tiledb_array",
for (colnam in allnames) {
## when an index column is use this may be unordered to remap to position in 'nm' names
k <- match(colnam, nm)

if (alldictionary[k]) {
spdl::debug("[tiledb_array] '[<-' column {} ({}) is factor", colnam, k)
new_levels <- levels(value[[k]])

attr <- attrs[[allnames[k]]]
tpstr <- tiledb_attribute_get_enumeration_type_ptr(attr, arrptr)
if (tpstr %in% c("ASCII", "UTF8")) {
dictionary <- tiledb_attribute_get_enumeration_ptr(attr, arrptr)
} else if (tpstr %in% c("FLOAT32", "FLOAT64", "BOOL", "UINT8", "UINT16", "UINT32", "UINT64",
"INT8", "INT16", "INT32", "INT64")) {
dictionary <- tiledb_attribute_get_enumeration_vector_ptr(attr, arrptr)
} else {
stop("Unsupported enumeration vector payload of type '%s'", tpstr, call. = FALSE)
}
added_enums <- setdiff(new_levels, dictionary)
if (length(added_enums) > 0) {
levels <- unique(c(dictionary, new_levels))
is_ordered <- tiledb_attribute_is_ordered_enumeration_ptr(attr, arrptr)
value[[k]] <- factor(value[[k]], levels = levels, ordered = is_ordered)
spdl::debug("[tiledb_array] '[<-' releveled column {}", k)
ase <- tiledb_array_schema_evolution()
arr <- tiledb_array_open(x)
ase <- tiledb_array_schema_evolution_extend_enumeration(ase, arr, allnames[[k]], added_enums)
tiledb::tiledb_array_schema_evolution_array_evolve(ase, uri)
}
}

if (alltypes[k] %in% c("CHAR", "ASCII", "UTF8")) { # variable length
txtvec <- as.character(value[[k]])
spdl::debug("[tiledb_array] '[<-' alloc char buffer {} '{}': {}", k, colnam, alltypes[k])
Expand Down
50 changes: 47 additions & 3 deletions inst/tinytest/test_arrayschemaevolution.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ expect_equal(levels(res$val), enums)
expect_equal(as.integer(res$val), c(1:5,5:1))



## -- testing 'create empty following by extending'
if (tiledb_version(TRUE) < "2.17.3") exit_file("Needs TileDB 2.17.3 or later")
uri <- tempfile()
Expand Down Expand Up @@ -112,11 +111,10 @@ run_int_col_test <- function(coltype) {
tiledb_array_create(uri, schema)

set.seed(42)
df <- data.frame(dim = 1:10, fct = sample(1:length(enums), 10, replace=TRUE) - 1, dbl = rnorm(10))
df <- data.frame(dim = 1:10, fct = sample(length(enums), 10, replace=TRUE) - 1, dbl = rnorm(10))
arr <- tiledb_array(uri)
arr[] <- df

qc <-
res <- tiledb_array(uri, return_as="data.frame", query_condition = parse_query_condition(fct == blue, arr))[]
expect_equal(nrow(res), 5)

Expand All @@ -134,3 +132,49 @@ run_int_col_test <- function(coltype) {
unlink(uri)
}
sapply(c("INT8", "INT16", "INT32", "UINT8", "UINT16", "UINT32"), run_int_col_test)


## test that factor levels can grow without overlap
uri <- tempfile()
df1 <- data.frame(id = 1:3, obs = factor(c("A", "B", "A")))
fromDataFrame(df1, uri, col_index=1, tile_domain=c(1L, 6L))

## write with a factor with two elements but without one of the initial ones
## while factor(c("B", "C", "B")) gets encoded as c(1,2,1) it should really
## encoded as c(2,3,2) under levels that are c("A", "B", "C") -- and the
## write method now does that
df2 <- data.frame(id = 4:6, obs = factor(c("B", "C", "B")))
fromDataFrame(df2, uri, col_index=1, mode="append")

res <- tiledb_array(uri, return_as="data.frame")[]

expect_equal(nrow(res), 6)
expect_equal(nlevels(res[["obs"]]), 3)
expect_equal(levels(res[["obs"]]), c("A", "B", "C"))
expect_equal(as.integer(res[["obs"]]), c(1L, 2L, 1L, 2L, 3L, 2L))

ref <- rbind(df1, df2)
expect_equivalent(res, ref) # equivalent because of query status attribute


## test that ordered factor levels can grow without overlap
uri <- tempfile()
df1 <- data.frame(id = 1:3, obs = ordered(c("A", "B", "A")))
fromDataFrame(df1, uri, col_index=1, tile_domain=c(1L, 6L))

## write with a factor with two elements but without one of the initial ones
## while factor(c("B", "C", "B")) gets encoded as c(1,2,1) it should really
## encoded as c(2,3,2) under levels that are c("A", "B", "C") -- and the
## write method now does that
df2 <- data.frame(id = 4:6, obs = ordered(c("B", "C", "B")))
fromDataFrame(df2, uri, col_index=1, mode="append")

res <- tiledb_array(uri, return_as="data.frame")[]

expect_equal(nrow(res), 6)
expect_equal(nlevels(res[["obs"]]), 3)
expect_equal(levels(res[["obs"]]), c("A", "B", "C"))
expect_equal(as.integer(res[["obs"]]), c(1L, 2L, 1L, 2L, 3L, 2L))

ref <- rbind(df1, df2)
expect_equivalent(res, ref) # equivalent because of query status attribute

0 comments on commit 82c4abd

Please sign in to comment.