Skip to content

Commit

Permalink
Support for enumerated types (#562)
Browse files Browse the repository at this point in the history
* Support for factor columns via (experimental) Dictionary support

* Disable CI as we 'cannot do this' without pending Dictionary support

* More efficient factor creation

* Snapshot with updated enumeration class

* Updated snapshot with working read of enum'ed array

* Updated snapshot passing all tests

* Snapshot writing with factor

* Snapshot displaying schema with enumeration

* On index columns, factors need as character

* Recover factor from shmem buffer too

* With unit test support

* With updated documentation to pass R CMD check

* Snapshot enumeration to column buffer

* Further snapshot enumeration to column buffer

* Snapshot with Dictionary re-creation at R level

* Snapshot with working factor creation on the c++ side

* Dictionaries are small strings using int32 offsets

* Cleanup snapshot commenting out some unused code

* Snapshot and refactor, focus on arrowio not column_buffer

* Snapshot with small tweaks and one new helper

* Schema from uri now accesses array to allow attribute enumerations

* Condition enumeration code on TileDB 2.17.0 or later

* Adjust tests for enumerations

* Correction for array open given key

* Re-enable continuous integration

(plus minor cleanup following rebase)

* Add support for query conditions on enumerations

* Roll micro version and update NEWS [ci skip]
  • Loading branch information
eddelbuettel authored Sep 6, 2023
1 parent c6b6315 commit 6e27096
Show file tree
Hide file tree
Showing 31 changed files with 979 additions and 115 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: tiledb
Type: Package
Version: 0.20.3.2
Version: 0.20.3.3
Title: Universal Storage Engine for Sparse and Dense Multidimensional Arrays
Authors@R: c(person("TileDB, Inc.", role = c("aut", "cph")),
person("Dirk", "Eddelbuettel", email = "[email protected]", role = "cre"))
Expand Down
6 changes: 6 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ export(tiledb_array_create)
export(tiledb_array_delete_fragments)
export(tiledb_array_get_non_empty_domain_from_index)
export(tiledb_array_get_non_empty_domain_from_name)
export(tiledb_array_has_enumeration)
export(tiledb_array_is_heterogeneous)
export(tiledb_array_is_homogeneous)
export(tiledb_array_is_open)
Expand All @@ -114,10 +115,14 @@ export(tiledb_arrow_schema_ptr)
export(tiledb_attr)
export(tiledb_attribute_get_cell_size)
export(tiledb_attribute_get_cell_val_num)
export(tiledb_attribute_get_enumeration)
export(tiledb_attribute_get_enumeration_ptr)
export(tiledb_attribute_get_fill_value)
export(tiledb_attribute_get_nullable)
export(tiledb_attribute_has_enumeration)
export(tiledb_attribute_is_variable_sized)
export(tiledb_attribute_set_cell_val_num)
export(tiledb_attribute_set_enumeration_name)
export(tiledb_attribute_set_fill_value)
export(tiledb_attribute_set_nullable)
export(tiledb_config)
Expand Down Expand Up @@ -215,6 +220,7 @@ export(tiledb_query_buffer_alloc_ptr)
export(tiledb_query_condition)
export(tiledb_query_condition_combine)
export(tiledb_query_condition_init)
export(tiledb_query_condition_set_use_enumeration)
export(tiledb_query_create_buffer_ptr)
export(tiledb_query_create_buffer_ptr_char)
export(tiledb_query_ctx)
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

* Built-time configuration of TileDB Embedded can now be accessed as a JSON string (#584)

* Enumeration types (i.e. what R calls factors) are now supported (#562)


# tiledb 0.20.3

Expand Down
17 changes: 16 additions & 1 deletion R/Array.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# MIT License
#
# Copyright (c) 2017-2021 TileDB Inc.
# Copyright (c) 2017-2023 TileDB Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -161,3 +161,18 @@ tiledb_array_delete_fragments <- function(arr, ts_start, ts_end) {
libtiledb_array_delete_fragments(arr@ptr, ts_start, ts_end)
invisible(TRUE)
}

##' Check for Enumeration (aka Factor aka Dictionary)
##'
##' @param arr A TileDB Array object
##' @return A boolean indicating if the array has homogeneous domains
##' @export
tiledb_array_has_enumeration <- function(arr) {
stopifnot("The 'arr' argument must be a tiledb_array object" = .isArray(arr))
ctx <- tiledb_get_context()
if (!tiledb_array_is_open(arr)) {
arr <- tiledb_array_open(arr, "READ")
on.exit(tiledb_array_close(arr))
}
return(libtiledb_array_has_enumeration_vector(ctx@ptr, arr@ptr))
}
42 changes: 30 additions & 12 deletions R/ArraySchema.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# MIT License
#
# Copyright (c) 2017-2022 TileDB Inc.
# Copyright (c) 2017-2023 TileDB Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand All @@ -23,13 +23,16 @@
#' An S4 class for the TileDB array schema
#'
#' @slot ptr An external pointer to the underlying implementation
#' @slot arrptr An optional external pointer to the underlying array, or NULL if missing
#' @exportClass tiledb_array_schema
setClass("tiledb_array_schema",
slots = list(ptr = "externalptr"))
slots = list(ptr = "externalptr",
arrptr = "ANY"))

tiledb_array_schema.from_ptr <- function(ptr) {
stopifnot(`The 'ptr' argument must be a non NULL externalptr to a tiledb_array_schema instance` = !missing(ptr) && is(ptr, "externalptr") && !is.null(ptr) )
new("tiledb_array_schema", ptr = ptr)
tiledb_array_schema.from_ptr <- function(ptr, arrptr=NULL) {
stopifnot("The 'ptr' argument must be an external pointer to a tiledb_array_schema instance"
= !missing(ptr) && is(ptr, "externalptr") && !is.null(ptr))
new("tiledb_array_schema", ptr = ptr, arrptr = arrptr)
}

#' Constructs a `tiledb_array_schema` object
Expand All @@ -43,7 +46,8 @@ tiledb_array_schema.from_ptr <- function(ptr) {
#' @param offsets_filter_list (optional)
#' @param validity_filter_list (optional)
#' @param capacity (optional)
#' @param allows_dups (optional, requires \sQuote{spars} to be TRUE)
#' @param allows_dups (optional, requires \sQuote{sparse} to be TRUE)
#' @param enumerations (optional) named list of enumerations
#' @param ctx tiledb_ctx object (optional)
#' @examples
#' \dontshow{ctx <- tiledb_ctx(limitTileDBCores())}
Expand All @@ -68,6 +72,7 @@ tiledb_array_schema <- function(domain,
validity_filter_list = NULL,
capacity = 10000L,
allows_dups = FALSE,
enumerations = NULL,
ctx = tiledb_get_context()) {
if (!missing(attrs) && length(attrs) != 0) {
is_attr <- function(obj) is(obj, "tiledb_attr")
Expand Down Expand Up @@ -97,7 +102,7 @@ tiledb_array_schema <- function(domain,

ptr <- libtiledb_array_schema(ctx@ptr, domain@ptr, attr_ptr_list, cell_order, tile_order,
coords_filter_list_ptr, offsets_filter_list_ptr,
validity_filter_list_ptr, sparse)
validity_filter_list_ptr, sparse, enumerations)
libtiledb_array_schema_set_capacity(ptr, capacity)
if (allows_dups) libtiledb_array_schema_set_allows_dups(ptr, TRUE)
invisible(new("tiledb_array_schema", ptr = ptr))
Expand Down Expand Up @@ -145,7 +150,7 @@ setMethod("show", signature(object = "tiledb_array_schema"),
nfo <- nfilters(fl$offsets)
nfv <- nfilters(fl$validity)
cat("tiledb_array_schema(\n domain=", .as_text_domain(domain(object)), ",\n",
" attrs=c(\n ", paste(sapply(attrs(object), .as_text_attribute), collapse=",\n "), "\n ),\n",
" attrs=c(\n ", paste(sapply(attrs(object), .as_text_attribute, arrptr=object@arrptr), collapse=",\n "), "\n ),\n",
" cell_order=\"", cell_order(object), "\", ",
"tile_order=\"", tile_order(object), "\", ",
"capacity=", capacity(object), ", ",
Expand All @@ -155,11 +160,8 @@ setMethod("show", signature(object = "tiledb_array_schema"),
sep="")
if (nfc > 0) cat(" coords_filter_list=", .as_text_filter_list(fl$coords), if (nfo + nfv > 0) "," else "", "\n", sep="")
if (nfo > 0) cat(" offsets_filter_list=", .as_text_filter_list(fl$offsets), if (nfv > 0) ",\n" else "", sep="")
if (nfv > 0)
cat(" validity_filter_list=", .as_text_filter_list(fl$validity), "\n", sep="")
if (nfv > 0) cat(" validity_filter_list=", .as_text_filter_list(fl$validity), "\n", sep="")
cat(")\n", sep="")
#cat("tiledb_array_create(uri=tempfile(), schema=sch)) # or assign your URI here\n")

})

#' @rdname generics
Expand Down Expand Up @@ -541,6 +543,22 @@ tiledb_schema_get_dim_attr_status <- function(sch) {
return(c(rep(1L, length(dims)), rep(2L, length(attrs))))
}

##' Get Dimension or Attribute Status
##'
##' Note that this function is an unexported internal function.
##'
##' @param sch A TileDB Schema object
##' @return An integer vector where each element corresponds to a schema entry,
##' and a value of one signals dimension and a value of two an attribute.
tiledb_schema_get_enumeration_status <- function(sch) {
stopifnot("The 'sch' argument must be a schema" = is(sch, "tiledb_array_schema"))
dom <- tiledb::domain(sch)
dims <- tiledb::dimensions(dom)
attrs <- tiledb::attrs(sch)
return(c(rep(FALSE, length(dims)),
sapply(attrs, tiledb_attribute_has_enumeration)))
}


# -- get and set tile capacity

Expand Down
68 changes: 65 additions & 3 deletions R/Attribute.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ tiledb_attr.from_ptr <- function(ptr) {
#' @param ncells (default 1) The number of cells, use \code{NA} to signal variable length
#' @param nullable (default FALSE) A logical switch whether the attribute can have missing
#' values
#' @param enumeration (default NULL) A character vector of dictionary values
#' @param ctx tiledb_ctx object (optional)
#' @return `tiledb_dim` object
#' @examples
Expand All @@ -58,6 +59,7 @@ tiledb_attr <- function(name,
filter_list = tiledb_filter_list(),
ncells = 1,
nullable = FALSE,
enumeration = NULL,
ctx = tiledb_get_context()
) {
if (missing(name)) name <- ""
Expand All @@ -66,7 +68,10 @@ tiledb_attr <- function(name,
`The 'name' argument must be a scalar string` = is.scalar(name, "character"),
`The 'filter_list' argument must be a tiledb_filter_list instance` = is(filter_list, "tiledb_filter_list"))
ptr <- libtiledb_attribute(ctx@ptr, name, type, filter_list@ptr, ncells, nullable)
new("tiledb_attr", ptr = ptr)
attr <- new("tiledb_attr", ptr = ptr)
if (!is.null(enumeration))
attr <- tiledb_attribute_set_enumeration_name(attr, name, ctx)
invisible(attr)
}

#' Raw display of an attribute object
Expand All @@ -81,13 +86,23 @@ setMethod("raw_dump",
definition = function(object) libtiledb_attribute_dump(object@ptr))

# internal function returning text use here and in other higher-level show() methods
.as_text_attribute <- function(object) {
.as_text_attribute <- function(object, arrptr=NULL) {
fl <- filter_list(object)
ndct <- 0 # default
dct <- character() # default
if (!is.null(arrptr)) {
if (!libtiledb_array_is_open_for_reading(arrptr)) arrptr <- libtiledb_array_open_with_ptr(arrptr, "READ")
if (tiledb_attribute_has_enumeration(object)) {
dct <- tiledb_attribute_get_enumeration_ptr(object, arrptr)
ndct <- length(dct)
}
}
txt <- paste0("tiledb_attr(name=\"", name(object), "\", ",
"type=\"", datatype(object), "\", ",
"ncells=", cell_val_num(object), ", ",
"nullable=", tiledb_attribute_get_nullable(object),
if (nfilters(fl) > 0) paste0(", filter_list=", .as_text_filter_list(fl)))
if (nfilters(fl) > 0) paste0(", filter_list=", .as_text_filter_list(fl)),
if (ndct > 0) paste0(", dictionary=c(\"", paste(dct[seq(1, min(5, ndct))], collapse="\",\""), if (ndct > 5) "\",...", "\")"))
txt <- paste0(txt, ")")
txt
}
Expand Down Expand Up @@ -313,3 +328,50 @@ tiledb_attribute_get_nullable <- function(attr) {
stopifnot(`The argument must be an attribute` = is(attr, "tiledb_attr"))
libtiledb_attribute_get_nullable(attr@ptr)
}

#' Test if TileDB Attribute has an Enumeration
#'
#' @param attr A TileDB Attribute object
#' @param ctx A Tiledb Context object (optional)
#' @return A logical value indicating if the attribute has an enumeration
#' @export
tiledb_attribute_has_enumeration <- function(attr, ctx = tiledb_get_context()) {
stopifnot("The 'attr' argument must be an attribute" = is(attr, "tiledb_attr"))
libtiledb_attribute_has_enumeration(ctx@ptr, attr@ptr)
}

#' Get the TileDB Attribute Enumeration
#'
#' @param attr A TileDB Attribute object
#' @param arr A Tiledb Array object
#' @param ctx A Tiledb Context object (optional)
#' @return A character vector with the enumeration (of length zero if none)
#' @export
tiledb_attribute_get_enumeration <- function(attr, arr, ctx = tiledb_get_context()) {
stopifnot("The 'attr' argument must be an attribute" = is(attr, "tiledb_attr"),
"The 'arr' argument must be an array" = is(arr, "tiledb_array"))
libtiledb_attribute_get_enumeration(ctx@ptr, attr@ptr, arr@ptr)
}

#' @rdname tiledb_attribute_get_enumeration
#' @param arrptr A Tiledb Array object pointer
#' @export
tiledb_attribute_get_enumeration_ptr <- function(attr, arrptr, ctx = tiledb_get_context()) {
stopifnot("The 'attr' argument must be an attribute" = is(attr, "tiledb_attr"),
"The 'arr' argument must be an external pointer" = is(arrptr, "externalptr"))
libtiledb_attribute_get_enumeration(ctx@ptr, attr@ptr, arrptr)
}

#' Set a TileDB Attribute Enumeration Name
#'
#' @param attr A TileDB Attribute object
#' @param enum_name A character value with the enumeration value
#' @param ctx A Tiledb Context object (optional)
#' @return The modified TileDB Attribute object
#' @export
tiledb_attribute_set_enumeration_name <- function(attr, enum_name, ctx = tiledb_get_context()) {
stopifnot("The 'attr' argument must be an attribute" = is(attr, "tiledb_attr"),
"The 'enum_name' argument must be character" = is.character(enum_name))
attr@ptr <- libtiledb_attribute_set_enumeration(ctx@ptr, attr@ptr, enum_name)
attr
}
49 changes: 38 additions & 11 deletions R/DataFrame.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# MIT License
#
# Copyright (c) 2017-2022 TileDB Inc.
# Copyright (c) 2017-2023 TileDB Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -97,9 +97,12 @@ fromDataFrame <- function(obj, uri, col_index=NULL, sparse=TRUE, allows_dups=spa
if (class(obj)[1] != "data.frame") obj <- as.data.frame(obj)

## turn factor columns in char columns
factcols <- grep("factor", sapply(obj, class))
if (length(factcols) > 0) {
for (i in factcols) obj[,i] <- as.character(obj[,i])
## TODO: add an option
if (tiledb_version(TRUE) < "2.17.0") {
factcols <- grep("factor", sapply(obj, class))
if (length(factcols) > 0) {
for (i in factcols) obj[,i] <- as.character(obj[,i])
}
}

## Create default filter_list from filter vector, 'NONE' and 'ZSTD' is default
Expand Down Expand Up @@ -127,6 +130,7 @@ fromDataFrame <- function(obj, uri, col_index=NULL, sparse=TRUE, allows_dups=spa
makeDim <- function(ind) {
idxcol <- dimobj[,ind]
idxnam <- colnames(dimobj)[ind]
if (inherits(idxcol, "factor")) idxcol <- as.character(idxcol)
col_domain <- if (is.null(tile_domain)) { # default case
c(min(idxcol), max(idxcol)) # use range
} else if (is.list(tile_domain)) { # but if list
Expand Down Expand Up @@ -190,9 +194,14 @@ fromDataFrame <- function(obj, uri, col_index=NULL, sparse=TRUE, allows_dups=spa
dom <- tiledb_domain(dims = dimensions)
}

## the simple helper function used create attribute_i given index i
## we now make it a little bit more powerful yet clumsy but returning a
## three element list at each element where the list contains the attribute
## along with the optional factor levels vector (and the corresponding column name)
makeAttr <- function(ind) {
col <- obj[,ind]
colname <- colnames(obj)[ind]
lvls <- NULL # by default no factor levels
if (inherits(col, "AsIs")) {
## we just look at the first list column, others have to have same type and length
cl <- class(obj[,ind][[1]])
Expand All @@ -217,8 +226,15 @@ fromDataFrame <- function(obj, uri, col_index=NULL, sparse=TRUE, allows_dups=spa
tp <- "INT64"
else if (cl == "logical")
tp <- if (tiledb_version(TRUE) >= "2.10.0") "BOOL" else "INT32"
else if (cl == "factor") {
lvls <- levels(col) # extract factor levels
if (length(lvls) > .Machine$integer.max)
stop("Cannot represent this many levels for ", colname, call. = FALSE)
tp <- "INT32"
}
else
stop("Currently unsupported type: ", cl)

filters <- if (colname %in% names(filter_list)) {
tiledb_filter_list(sapply(filter_list[[colname]], tiledb_filter))
} else {
Expand All @@ -227,15 +243,24 @@ fromDataFrame <- function(obj, uri, col_index=NULL, sparse=TRUE, allows_dups=spa
if (debug) {
cat(sprintf("Setting attribute name %s type %s\n", colname, tp))
}
tiledb_attr(colname,
type = tp,
ncells = if (tp %in% c("CHAR","ASCII")) NA_integer_ else nc,
filter_list = filters,
nullable = any(is.na(col)))
attr <- tiledb_attr(colname,
type = tp,
ncells = if (tp %in% c("CHAR","ASCII")) NA_integer_ else nc,
filter_list = filters,
nullable = any(is.na(col)),
enumeration = lvls)
list(attr=attr, lvls=lvls, name=colname)
}
cols <- seq_len(dims[2])
if (!is.null(col_index)) cols <- cols[-col_index]
attributes <- if (length(cols) > 0) sapply(cols, makeAttr) else list()
attributes <- enumerations <- list() # fallback
if (length(cols) > 0) {
a_e <- lapply(cols, makeAttr)
attributes <- lapply(a_e, "[[", 1)
enumerations <- lapply(a_e, "[[", 2)
colnames <- lapply(a_e, "[[", 3)
names(enumerations) <- colnames
}
schema <- tiledb_array_schema(dom,
attrs = attributes,
cell_order = cell_order,
Expand All @@ -244,8 +269,10 @@ fromDataFrame <- function(obj, uri, col_index=NULL, sparse=TRUE, allows_dups=spa
coords_filter_list = tiledb_filter_list(sapply(coords_filters, tiledb_filter)),
offsets_filter_list = tiledb_filter_list(sapply(offsets_filters, tiledb_filter)),
validity_filter_list = tiledb_filter_list(sapply(validity_filters, tiledb_filter)),
capacity=capacity)
capacity = capacity,
enumerations = if (length(enumerations) > 0) enumerations else NULL)
allows_dups(schema) <- allows_dups

if (mode != "append")
tiledb_array_create(uri, schema)

Expand Down
Loading

0 comments on commit 6e27096

Please sign in to comment.