From 351d032ead9e70d5585e21231daf16aecdfe7a36 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Thu, 3 Oct 2024 18:18:03 -0400 Subject: [PATCH] fix: [sc-56713] [core] query returns incorrect offsets for dense array fill value of variable-length int32 attribute (#5331) Story details: https://app.shortcut.com/tiledb-inc/story/56713 `tiledb-rs` uses the setting `sm.var_offsets.mode = elements` and the dense reader code which copies the fill value offsets does not check this setting. This pull request fixes the issue by checking whether that value was previously set; and adds unit tests demonstrating correct fill value offsets in three different scenarios. --- TYPE: BUG DESC: Fix dense reader fill value with `sm.var_offsets.mode = elements` --------- Co-authored-by: Isaiah Norton --- test/src/unit-cppapi-fill_values.cc | 246 ++++++++++++++++++++++++ tiledb/sm/query/readers/dense_reader.cc | 7 +- 2 files changed, 252 insertions(+), 1 deletion(-) diff --git a/test/src/unit-cppapi-fill_values.cc b/test/src/unit-cppapi-fill_values.cc index bfb3b050d4d..9a4b8f2ac1c 100644 --- a/test/src/unit-cppapi-fill_values.cc +++ b/test/src/unit-cppapi-fill_values.cc @@ -32,7 +32,9 @@ */ #include + #include "tiledb/sm/cpp_api/tiledb" +#include "tiledb/sm/cpp_api/tiledb_experimental" #include "tiledb/sm/misc/constants.h" #include @@ -768,3 +770,247 @@ TEST_CASE( CHECK_NOTHROW(vfs.remove_dir(array_name)); } + +TEST_CASE( + "C++ API: Test variable-size fill values in different offset modes for " + "dense reader", + "[cppapi][fill-values]") { + const char* uri = "dense-attribute-int32-var-size-fill-value"; + + Context ctx; + + // create array if needed + if (Object::object(ctx, uri).type() != Object::Type::Array) { + Domain domain(ctx); + domain.add_dimension(Dimension::create(ctx, "id", {{1, 4}}, 2)); + + ArraySchema schema(ctx, TILEDB_DENSE); + schema.set_domain(domain); + + // Add a single attribute "a" of var-size INT32 type + int a_fill[1] = {100}; + Attribute a = Attribute::create(ctx, "a") + .set_cell_val_num(sm::constants::var_num) + .set_fill_value(&a_fill[0], sizeof(a_fill)); + schema.add_attribute(a); + + // Create the (empty) array on disk. + Array::create(uri, schema); + + // Prepare some data for the array + std::vector a_data = {9}; + std::vector a_offsets = {0}; + + Array array(ctx, uri, TILEDB_WRITE); + + Subarray subarray(ctx, array); + subarray.add_range(0, 1, 1); + + Query query(ctx, array, TILEDB_WRITE); + query.set_subarray(subarray); + query.set_layout(TILEDB_ROW_MAJOR) + .set_data_buffer("a", a_data) + .set_offsets_buffer("a", a_offsets); + + // Perform the write and close the array. + query.submit(); + array.close(); + } + + Array array(ctx, uri, TILEDB_READ); + + // Prepare the vector that will hold the result (of size 6 elements) + std::vector a_data(6); + std::vector a_offsets(6); + + // Prepare the query + const auto offsets_elements = GENERATE(true, false); + const auto reader = GENERATE("legacy", "refactored"); + + const std::vector expect_offsets_elements = {0, 1, 2}; + const std::vector expect_offsets_bytes = { + 0, sizeof(int32_t), 2 * sizeof(int32_t)}; + + Config cfg; + cfg.set("sm.query.dense.reader", reader); + cfg.set("sm.var_offsets.extra_element", "true"); + if (offsets_elements) { + cfg.set("sm.var_offsets.mode", "elements"); + } + + Subarray subarray(ctx, array); + Query query(ctx, array, TILEDB_READ); + query.set_config(cfg) + .set_layout(TILEDB_ROW_MAJOR) + .set_data_buffer("a", a_data) + .set_offsets_buffer("a", a_offsets); + + bool subarrayStartsAt1 = false; + + SECTION("Non-materialized tile") { + // Slice only rows 3, 4 + subarray.add_range(0, 3, 4); + } + + SECTION("Partially-materialized tile") { + // Slice only rows 1, 2 + subarray.add_range(0, 1, 2); + + subarrayStartsAt1 = true; + } + + // Legacy reader gets SEGV when applying query condition. + // This is not specific to this example - tweak the + // "query_condition_dense.cc" example to force the legacy + // reader and it also gets a SEGV. + if (reader == std::string("refactored")) { + SECTION("Query condition false") { + // Slice only rows 1, 2 + subarray.add_range(0, 1, 2); + + QueryCondition qc(ctx); + { + const int32_t one = 1; + qc.init("id", &one, sizeof(int32_t), TILEDB_NE); + } + + query.set_condition(qc); + } + } + + query.set_subarray(subarray); + + // Submit the query and close the array. + query.submit(); + array.close(); + + CHECK(query.result_buffer_elements()["a"].first == 3); + if (offsets_elements) { + for (size_t i = 0; i < expect_offsets_elements.size(); i++) { + CHECK(a_offsets[i] == expect_offsets_elements[i]); + } + CHECK( + query.result_buffer_elements()["a"].second == + expect_offsets_elements.back()); + } else { + for (size_t i = 0; i < expect_offsets_bytes.size(); i++) { + CHECK(a_offsets[i] == expect_offsets_bytes[i]); + } + + // "elements" is intentional here, `.second` is not adjusted for bytes + CHECK( + query.result_buffer_elements()["a"].second == + expect_offsets_elements.back()); + } + if (subarrayStartsAt1) { + CHECK(a_data[0] == 9); + } else { + CHECK(a_data[0] == 100); + } + CHECK(a_data[1] == 100); +} + +TEST_CASE( + "C++ API: Test variable-size fill values in different offset modes for " + "sparse reader", + "[cppapi][fill-values]") { + const auto allow_duplicates = GENERATE(true, false); + const auto uri = std::string("sparse-") + + std::string(allow_duplicates ? "allow-dups" : "no-dups") + + std::string("-attribute-int32-var-size-fill-value"); + + Context ctx; + + // create array if needed + if (Object::object(ctx, uri.c_str()).type() != Object::Type::Array) { + Domain domain(ctx); + domain.add_dimension(Dimension::create(ctx, "id", {{1, 4}}, 2)); + + ArraySchema schema(ctx, TILEDB_SPARSE); + schema.set_allows_dups(allow_duplicates); + schema.set_domain(domain); + + // Add a single attribute "a" which will be unused + // (we must have at least one attribute) + schema.add_attribute(Attribute::create(ctx, "b")); + + // Create the (empty) array on disk. + Array::create(uri.c_str(), schema); + + // Prepare some data for the array + std::vector id_data = {1, 2}; + std::vector b_data = {10, 2}; + + Array array(ctx, uri.c_str(), TILEDB_WRITE); + + Query query(ctx, array, TILEDB_WRITE); + query.set_data_buffer("id", id_data).set_data_buffer("b", b_data); + + // Perform the write and close the array. + query.submit(); + array.close(); + + // Now evolve the schema to include the INT32 var attribute. + // When we read existing coordinates the fill value will be used. + int a_fill[1] = {100}; + Attribute a = Attribute::create(ctx, "a") + .set_cell_val_num(sm::constants::var_num) + .set_fill_value(&a_fill[0], sizeof(a_fill)); + + ArraySchemaEvolution evolution(ctx); + evolution.add_attribute(a); + evolution.array_evolve(uri.c_str()); + } + + Array array(ctx, uri.c_str(), TILEDB_READ); + + // Prepare the vector that will hold the result (of size 6 elements) + std::vector a_data(6); + std::vector a_offsets(6); + + // Prepare the query + const auto offsets_elements = GENERATE(false, true); + const auto layout = + GENERATE(TILEDB_ROW_MAJOR, TILEDB_UNORDERED, TILEDB_GLOBAL_ORDER); + + const std::vector expect_offsets_elements = {0, 1, 2}; + const std::vector expect_offsets_bytes = { + 0, sizeof(int32_t), 2 * sizeof(int32_t)}; + + Config cfg; + cfg.set("sm.var_offsets.extra_element", "true"); + if (offsets_elements) { + cfg.set("sm.var_offsets.mode", "elements"); + } + + Query query(ctx, array, TILEDB_READ); + query.set_config(cfg) + .set_layout(layout) + .set_data_buffer("a", a_data) + .set_offsets_buffer("a", a_offsets); + + // Submit the query and close the array. + query.submit(); + array.close(); + + CHECK(query.result_buffer_elements()["a"].first == 3); + if (offsets_elements) { + for (size_t i = 0; i < expect_offsets_elements.size(); i++) { + CHECK(a_offsets[i] == expect_offsets_elements[i]); + } + CHECK( + query.result_buffer_elements()["a"].second == + expect_offsets_elements.back()); + } else { + for (size_t i = 0; i < expect_offsets_bytes.size(); i++) { + CHECK(a_offsets[i] == expect_offsets_bytes[i]); + } + + // "elements" is intentional here, `.second` is not adjusted for bytes + CHECK( + query.result_buffer_elements()["a"].second == + expect_offsets_elements.back()); + } + CHECK(a_data[0] == 100); + CHECK(a_data[1] == 100); +} diff --git a/tiledb/sm/query/readers/dense_reader.cc b/tiledb/sm/query/readers/dense_reader.cc index fc0d678d012..88389ae4899 100644 --- a/tiledb/sm/query/readers/dense_reader.cc +++ b/tiledb/sm/query/readers/dense_reader.cc @@ -1215,7 +1215,12 @@ void DenseReader::fix_offsets_buffer( std::vector& var_data) { // For easy reference. const auto& fill_value = array_schema_.attribute(name)->fill_value(); - const auto fill_value_size = (OffType)fill_value.size(); + const auto fill_value_size = + (elements_mode_ ? + (OffType)fill_value.size() / + datatype_size(array_schema_.attribute(name)->type()) : + (OffType)fill_value.size()); + ; auto offsets_buffer = (OffType*)buffers_[name].buffer_; // Switch offsets from sizes to real offsets.