From 82c4abd2c650eb78ea0496363cf6b1e827effe3a Mon Sep 17 00:00:00 2001
From: Dirk Eddelbuettel <edd@debian.org>
Date: Fri, 15 Dec 2023 17:05:18 -0600
Subject: [PATCH] Re-level 'factor' and 'ordered' variables with new levels
 (#639)

* When writing factors ensure levels are consistent with schema levels

* Generate factor test column as factor

* Also evolve array

* New tests

* Update NEWS and roll micro version [ci skip]
---
 DESCRIPTION                               |  2 +-
 NEWS.md                                   |  2 +
 R/TileDBArray.R                           | 34 +++++++++++++++
 inst/tinytest/test_arrayschemaevolution.R | 50 +++++++++++++++++++++--
 4 files changed, 84 insertions(+), 4 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 37fc7fdd8e..84c2ca624e 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: tiledb
 Type: Package
-Version: 0.22.0.6
+Version: 0.22.0.7
 Title: Modern Database Engine for Multi-Modal Data via Sparse and Dense Multidimensional Arrays
 Authors@R: c(person("TileDB, Inc.", role = c("aut", "cph")),
  person("Dirk", "Eddelbuettel", email = "dirk@tiledb.com", role = "cre"))
diff --git a/NEWS.md b/NEWS.md
index 5cf15d1eb3..29107d823e 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -6,6 +6,8 @@
 
 * A TileDB Array can now be opened in 'keep open' mode for subsequent use without re-opening (#630)
 
+* Arrays with factor (or ordered) variables now grow their factor levels in appending writes (#639)
+
 ## Bug Fixes
 
 * The read buffer is now correctly sized when implementing VFS serialization (#631)
diff --git a/R/TileDBArray.R b/R/TileDBArray.R
index 72b1f1b5e2..ac561b7ae8 100644
--- a/R/TileDBArray.R
+++ b/R/TileDBArray.R
@@ -1261,23 +1261,27 @@ setMethod("[<-", "tiledb_array",
   dimtypes <- sapply(dims, function(d) libtiledb_dim_get_datatype(d@ptr))
   dimvarnum <- sapply(dims, function(d) libtiledb_dim_get_cell_val_num(d@ptr))
   dimnullable <- sapply(dims, function(d) FALSE)
+  dimdictionary <- sapply(dims, function(d) FALSE)
 
   attrs <- tiledb::attrs(schema(x))
   attrnames <- unname(sapply(attrs, function(a) libtiledb_attribute_get_name(a@ptr)))
   attrtypes <- unname(sapply(attrs, function(a) libtiledb_attribute_get_type(a@ptr)))
   attrvarnum <- unname(sapply(attrs, function(a) libtiledb_attribute_get_cell_val_num(a@ptr)))
   attrnullable <- unname(sapply(attrs, function(a) libtiledb_attribute_get_nullable(a@ptr)))
+  attrdictionary <- unname(sapply(attrs, function(a) libtiledb_attribute_has_enumeration(ctx@ptr, a@ptr)))
 
   if (length(attrnames) > 0) {
       allnames <- c(dimnames, attrnames)
       alltypes <- c(dimtypes, attrtypes)
       allvarnum <- c(dimvarnum, attrvarnum)
       allnullable <- c(dimnullable, attrnullable)
+      alldictionary <- c(dimdictionary, attrdictionary)
   } else {
       allnames <- dimnames
       alltypes <- dimtypes
       allvarnum <- dimvarnum
       allnullable <- dimnullable
+      alldictionary <- dimdictionary
   }
 
   ## check we have complete columns (as we cannot write subset of attributes)
@@ -1336,6 +1340,7 @@ setMethod("[<-", "tiledb_array",
       allnames <- attrnames
       alltypes <- attrtypes
       allnullable <- attrnullable
+      alldictionary <- attrdictionary
     }
 
   ## Case 4: dense, list on RHS e.g. the ex_1.R example
@@ -1353,6 +1358,7 @@ setMethod("[<-", "tiledb_array",
     allnames <- attrnames
     alltypes <- attrtypes
     allnullable <- attrnullable
+    alldictionary <- attrdictionary
   }
 
   nc <- if (is.list(value)) length(value) else ncol(value)
@@ -1392,6 +1398,34 @@ setMethod("[<-", "tiledb_array",
     for (colnam in allnames) {
       ## when an index column is use this may be unordered to remap to position in 'nm' names
       k <- match(colnam, nm)
+
+      if (alldictionary[k]) {
+          spdl::debug("[tiledb_array] '[<-' column {} ({}) is factor", colnam, k)
+          new_levels <- levels(value[[k]])
+
+          attr <- attrs[[allnames[k]]]
+          tpstr <- tiledb_attribute_get_enumeration_type_ptr(attr, arrptr)
+          if (tpstr %in% c("ASCII", "UTF8")) {
+              dictionary <- tiledb_attribute_get_enumeration_ptr(attr, arrptr)
+          } else if (tpstr %in% c("FLOAT32", "FLOAT64", "BOOL", "UINT8", "UINT16", "UINT32", "UINT64",
+                                  "INT8", "INT16", "INT32", "INT64")) {
+              dictionary <- tiledb_attribute_get_enumeration_vector_ptr(attr, arrptr)
+          } else {
+              stop("Unsupported enumeration vector payload of type '%s'", tpstr, call. = FALSE)
+          }
+          added_enums <- setdiff(new_levels, dictionary)
+          if (length(added_enums) > 0) {
+              levels <- unique(c(dictionary, new_levels))
+              is_ordered <- tiledb_attribute_is_ordered_enumeration_ptr(attr, arrptr)
+              value[[k]] <- factor(value[[k]], levels = levels, ordered = is_ordered)
+              spdl::debug("[tiledb_array] '[<-' releveled column {}", k)
+              ase <- tiledb_array_schema_evolution()
+              arr <- tiledb_array_open(x)
+              ase <- tiledb_array_schema_evolution_extend_enumeration(ase, arr, allnames[[k]], added_enums)
+              tiledb::tiledb_array_schema_evolution_array_evolve(ase, uri)
+          }
+      }
+
       if (alltypes[k] %in% c("CHAR", "ASCII", "UTF8")) { # variable length
         txtvec <- as.character(value[[k]])
         spdl::debug("[tiledb_array] '[<-' alloc char buffer {} '{}': {}", k, colnam, alltypes[k])
diff --git a/inst/tinytest/test_arrayschemaevolution.R b/inst/tinytest/test_arrayschemaevolution.R
index 6e64f901d3..99adf3d6ca 100644
--- a/inst/tinytest/test_arrayschemaevolution.R
+++ b/inst/tinytest/test_arrayschemaevolution.R
@@ -64,7 +64,6 @@ expect_equal(levels(res$val), enums)
 expect_equal(as.integer(res$val), c(1:5,5:1))
 
 
-
 ## -- testing 'create empty following by extending'
 if (tiledb_version(TRUE) < "2.17.3") exit_file("Needs TileDB 2.17.3 or later")
 uri <- tempfile()
@@ -112,11 +111,10 @@ run_int_col_test <- function(coltype) {
     tiledb_array_create(uri, schema)
 
     set.seed(42)
-    df <- data.frame(dim = 1:10, fct = sample(1:length(enums), 10, replace=TRUE) - 1, dbl = rnorm(10))
+    df <- data.frame(dim = 1:10, fct = sample(length(enums), 10, replace=TRUE) - 1, dbl = rnorm(10))
     arr <- tiledb_array(uri)
     arr[] <- df
 
-    qc <-
     res <- tiledb_array(uri, return_as="data.frame", query_condition = parse_query_condition(fct == blue, arr))[]
     expect_equal(nrow(res), 5)
 
@@ -134,3 +132,49 @@ run_int_col_test <- function(coltype) {
     unlink(uri)
 }
 sapply(c("INT8", "INT16", "INT32", "UINT8", "UINT16", "UINT32"), run_int_col_test)
+
+
+## test that factor levels can grow without overlap
+uri <- tempfile()
+df1 <- data.frame(id = 1:3, obs = factor(c("A", "B", "A")))
+fromDataFrame(df1, uri, col_index=1, tile_domain=c(1L, 6L))
+
+## write with a factor with two elements but without one of the initial ones
+## while factor(c("B", "C", "B")) gets encoded as c(1,2,1) it should really
+## encoded as c(2,3,2) under levels that are c("A", "B", "C") -- and the
+## write method now does that
+df2 <- data.frame(id = 4:6, obs = factor(c("B", "C", "B")))
+fromDataFrame(df2, uri, col_index=1, mode="append")
+
+res <- tiledb_array(uri, return_as="data.frame")[]
+
+expect_equal(nrow(res), 6)
+expect_equal(nlevels(res[["obs"]]), 3)
+expect_equal(levels(res[["obs"]]), c("A", "B", "C"))
+expect_equal(as.integer(res[["obs"]]), c(1L, 2L, 1L, 2L, 3L, 2L))
+
+ref <- rbind(df1, df2)
+expect_equivalent(res, ref) # equivalent because of query status attribute
+
+
+## test that ordered factor levels can grow without overlap
+uri <- tempfile()
+df1 <- data.frame(id = 1:3, obs = ordered(c("A", "B", "A")))
+fromDataFrame(df1, uri, col_index=1, tile_domain=c(1L, 6L))
+
+## write with a factor with two elements but without one of the initial ones
+## while factor(c("B", "C", "B")) gets encoded as c(1,2,1) it should really
+## encoded as c(2,3,2) under levels that are c("A", "B", "C") -- and the
+## write method now does that
+df2 <- data.frame(id = 4:6, obs = ordered(c("B", "C", "B")))
+fromDataFrame(df2, uri, col_index=1, mode="append")
+
+res <- tiledb_array(uri, return_as="data.frame")[]
+
+expect_equal(nrow(res), 6)
+expect_equal(nlevels(res[["obs"]]), 3)
+expect_equal(levels(res[["obs"]]), c("A", "B", "C"))
+expect_equal(as.integer(res[["obs"]]), c(1L, 2L, 1L, 2L, 3L, 2L))
+
+ref <- rbind(df1, df2)
+expect_equivalent(res, ref) # equivalent because of query status attribute