Merge remote-tracking branch 'apache/main' into impl-byte-view-column

alamb committed Oct 15, 2024
2 parents c4d45c7 + 5a0ea0b commit a1f8d0c
Showing 293 changed files with 14,647 additions and 5,837 deletions.
12 changes: 7 additions & 5 deletions .github/actions/setup-builder/action.yaml
@@ -28,16 +28,18 @@ runs:
     - name: Install Build Dependencies
       shell: bash
       run: |
-        apt-get update
-        apt-get install -y protobuf-compiler
+        RETRY="ci/scripts/retry"
+        "${RETRY}" apt-get update
+        "${RETRY}" apt-get install -y protobuf-compiler
     - name: Setup Rust toolchain
       shell: bash
       # rustfmt is needed for the substrait build script
       run: |
+        RETRY="ci/scripts/retry"
         echo "Installing ${{ inputs.rust-version }}"
-        rustup toolchain install ${{ inputs.rust-version }}
-        rustup default ${{ inputs.rust-version }}
-        rustup component add rustfmt
+        "${RETRY}" rustup toolchain install ${{ inputs.rust-version }}
+        "${RETRY}" rustup default ${{ inputs.rust-version }}
+        "${RETRY}" rustup component add rustfmt
     - name: Configure rust runtime env
       uses: ./.github/actions/setup-rust-runtime
     - name: Fixup git permissions
16 changes: 7 additions & 9 deletions README.md
@@ -44,8 +44,9 @@
 DataFusion is an extensible query engine written in [Rust] that
 uses [Apache Arrow] as its in-memory format.
 
-The DataFusion libraries in this repository are used to build data-centric system software. DataFusion also provides the
-following subprojects, which are packaged versions of DataFusion intended for end users.
+This crate provides libraries and binaries for developers building fast and
+feature rich database and analytic systems, customized to particular workloads.
+See [use cases] for examples. The following related subprojects target end users:
 
 - [DataFusion Python](https://github.com/apache/datafusion-python/) offers a Python interface for SQL and DataFrame
   queries.
@@ -54,13 +55,10 @@ following subprojects, which are packaged versions of DataFusion intended for en
 - [DataFusion Comet](https://github.com/apache/datafusion-comet/) is an accelerator for Apache Spark based on
   DataFusion.
 
-The target audience for the DataFusion crates in this repository are
-developers building fast and feature rich database and analytic systems,
-customized to particular workloads. See [use cases] for examples.
-
-DataFusion offers [SQL] and [`Dataframe`] APIs,
-excellent [performance], built-in support for CSV, Parquet, JSON, and Avro,
-extensive customization, and a great community.
+"Out of the box,"
+DataFusion offers [SQL] and [`Dataframe`] APIs, excellent [performance],
+built-in support for CSV, Parquet, JSON, and Avro, extensive customization, and
+a great community.
 
 DataFusion features a full query planner, a columnar, streaming, multi-threaded,
 vectorized execution engine, and partitioned data sources. You can
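(Aside, not part of this commit: the rewritten paragraph advertises the SQL and DataFrame APIs side by side. A minimal sketch of the two, assuming the datafusion and tokio crates; the file path and column names are placeholders.)

use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    // Register a CSV file so both APIs below can query it.
    ctx.register_csv("example", "data/example.csv", CsvReadOptions::new())
        .await?;

    // SQL API
    ctx.sql("SELECT a, b FROM example WHERE a <= b")
        .await?
        .show()
        .await?;

    // Equivalent DataFrame API
    ctx.table("example")
        .await?
        .filter(col("a").lt_eq(col("b")))?
        .select(vec![col("a"), col("b")])?
        .show()
        .await?;

    Ok(())
}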
21 changes: 21 additions & 0 deletions ci/scripts/retry
@@ -0,0 +1,21 @@
#!/usr/bin/env bash

set -euo pipefail

x() {
  echo "+ $*" >&2
  "$@"
}

max_retry_time_seconds=$(( 3 * 60 ))
retry_delay_seconds=10

END=$(( $(date +%s) + ${max_retry_time_seconds} ))

while (( $(date +%s) < $END )); do
  x "$@" && exit 0
  sleep "${retry_delay_seconds}"
done

echo "$0: retrying [$*] timed out" >&2
exit 1
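(As the setup-builder diff above shows, this script is used as a command prefix: RETRY="ci/scripts/retry"; "${RETRY}" apt-get update. The x() helper echoes each attempt to stderr before running it, and the loop reruns a failing command every 10 seconds until it succeeds or roughly 3 minutes elapse, after which the script exits 1.)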
12 changes: 5 additions & 7 deletions datafusion-cli/Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions datafusion-examples/Cargo.toml
@@ -62,6 +62,7 @@ dashmap = { workspace = true }
 datafusion = { workspace = true, default-features = true, features = ["avro"] }
 datafusion-common = { workspace = true, default-features = true }
 datafusion-expr = { workspace = true }
+datafusion-functions-window-common = { workspace = true }
 datafusion-optimizer = { workspace = true, default-features = true }
 datafusion-physical-expr = { workspace = true, default-features = true }
 datafusion-proto = { workspace = true }
6 changes: 5 additions & 1 deletion datafusion-examples/examples/advanced_udwf.rs
@@ -30,6 +30,7 @@ use datafusion_expr::function::WindowUDFFieldArgs;
 use datafusion_expr::{
     PartitionEvaluator, Signature, WindowFrame, WindowUDF, WindowUDFImpl,
 };
+use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;
 
 /// This example shows how to use the full WindowUDFImpl API to implement a user
 /// defined window function. As in the `simple_udwf.rs` example, this struct implements
@@ -74,7 +75,10 @@ impl WindowUDFImpl for SmoothItUdf {
 
     /// Create a `PartitionEvaluator` to evaluate this function on a new
     /// partition.
-    fn partition_evaluator(&self) -> Result<Box<dyn PartitionEvaluator>> {
+    fn partition_evaluator(
+        &self,
+        _partition_evaluator_args: PartitionEvaluatorArgs,
+    ) -> Result<Box<dyn PartitionEvaluator>> {
         Ok(Box::new(MyPartitionEvaluator::new()))
     }
 
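(Aside, not from this commit: the same migration is applied to advanced_udwf.rs here and to simplify_udwf_expression.rs below. For orientation, a self-contained sketch of a UDWF against the post-change API; PassThroughUdwf and PassThrough are hypothetical names, and the `field`/`Signature` pieces are assumed from the surrounding datafusion-expr API rather than taken from this diff.)

use std::any::Any;
use std::sync::Arc;

use arrow::array::ArrayRef;
use arrow::datatypes::{DataType, Field};
use datafusion_common::Result;
use datafusion_expr::function::WindowUDFFieldArgs;
use datafusion_expr::{PartitionEvaluator, Signature, Volatility, WindowUDFImpl};
use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;

/// A window function that returns its input column unchanged.
#[derive(Debug)]
struct PassThroughUdwf {
    signature: Signature,
}

impl PassThroughUdwf {
    fn new() -> Self {
        Self {
            signature: Signature::exact(vec![DataType::Float64], Volatility::Immutable),
        }
    }
}

impl WindowUDFImpl for PassThroughUdwf {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn name(&self) -> &str {
        "pass_through"
    }

    fn signature(&self) -> &Signature {
        &self.signature
    }

    // The migrated hook: it now receives `PartitionEvaluatorArgs` (carrying
    // the function's input expressions and types), so evaluator construction
    // can depend on them; implementations that do not need it, like this one,
    // simply accept and ignore the parameter.
    fn partition_evaluator(
        &self,
        _partition_evaluator_args: PartitionEvaluatorArgs,
    ) -> Result<Box<dyn PartitionEvaluator>> {
        Ok(Box::new(PassThrough))
    }

    fn field(&self, field_args: WindowUDFFieldArgs) -> Result<Field> {
        Ok(Field::new(field_args.name(), DataType::Float64, true))
    }
}

#[derive(Debug)]
struct PassThrough;

impl PartitionEvaluator for PassThrough {
    // Evaluate over an entire partition at once, echoing the first argument.
    fn evaluate_all(&mut self, values: &[ArrayRef], _num_rows: usize) -> Result<ArrayRef> {
        Ok(Arc::clone(&values[0]))
    }
}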
4 changes: 2 additions & 2 deletions datafusion-examples/examples/custom_file_format.rs
@@ -154,7 +154,7 @@ impl FileFormatFactory for TSVFileFactory {
         &self,
         state: &SessionState,
         format_options: &std::collections::HashMap<String, String>,
-    ) -> Result<std::sync::Arc<dyn FileFormat>> {
+    ) -> Result<Arc<dyn FileFormat>> {
         let mut new_options = format_options.clone();
         new_options.insert("format.delimiter".to_string(), "\t".to_string());
 
@@ -164,7 +164,7 @@ impl FileFormatFactory for TSVFileFactory {
         Ok(tsv_file_format)
     }
 
-    fn default(&self) -> std::sync::Arc<dyn FileFormat> {
+    fn default(&self) -> Arc<dyn FileFormat> {
         todo!()
     }
 
6 changes: 5 additions & 1 deletion datafusion-examples/examples/simplify_udwf_expression.rs
@@ -27,6 +27,7 @@ use datafusion_expr::{
     expr::WindowFunction, simplify::SimplifyInfo, Expr, PartitionEvaluator, Signature,
     Volatility, WindowUDF, WindowUDFImpl,
 };
+use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;
 
 /// This UDWF will show how to use the WindowUDFImpl::simplify() API
 #[derive(Debug, Clone)]
@@ -60,7 +61,10 @@ impl WindowUDFImpl for SimplifySmoothItUdf {
         &self.signature
     }
 
-    fn partition_evaluator(&self) -> Result<Box<dyn PartitionEvaluator>> {
+    fn partition_evaluator(
+        &self,
+        _partition_evaluator_args: PartitionEvaluatorArgs,
+    ) -> Result<Box<dyn PartitionEvaluator>> {
         todo!()
     }
 
26 changes: 26 additions & 0 deletions datafusion/catalog/README.md
@@ -0,0 +1,26 @@
<!---
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

# DataFusion Catalog

[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.

This crate is a submodule of DataFusion that provides catalog management functionality, including catalogs, schemas, and tables.

[df]: https://crates.io/crates/datafusion
27 changes: 0 additions & 27 deletions datafusion/common/src/dfschema.rs
@@ -406,33 +406,6 @@ impl DFSchema {
         }
     }
 
-    /// Check whether the column reference is ambiguous
-    pub fn check_ambiguous_name(
-        &self,
-        qualifier: Option<&TableReference>,
-        name: &str,
-    ) -> Result<()> {
-        let count = self
-            .iter()
-            .filter(|(field_q, f)| match (field_q, qualifier) {
-                (Some(q1), Some(q2)) => q1.resolved_eq(q2) && f.name() == name,
-                (None, None) => f.name() == name,
-                _ => false,
-            })
-            .take(2)
-            .count();
-        if count > 1 {
-            _schema_err!(SchemaError::AmbiguousReference {
-                field: Column {
-                    relation: None,
-                    name: name.to_string(),
-                },
-            })
-        } else {
-            Ok(())
-        }
-    }
-
     /// Find the qualified field with the given name
     pub fn qualified_field_with_name(
         &self,
44 changes: 40 additions & 4 deletions datafusion/common/src/scalar/mod.rs
@@ -3577,9 +3577,8 @@ impl fmt::Display for ScalarValue {
                         columns
                             .iter()
                             .zip(fields.iter())
-                            .enumerate()
-                            .map(|(index, (column, field))| {
-                                if nulls.is_some_and(|b| b.is_null(index)) {
+                            .map(|(column, field)| {
+                                if nulls.is_some_and(|b| b.is_null(0)) {
                                     format!("{}:NULL", field.name())
                                 } else if let DataType::Struct(_) = field.data_type() {
                                     let sv = ScalarValue::Struct(Arc::new(
@@ -3875,7 +3874,7 @@ mod tests {
     use arrow::compute::{is_null, kernels};
     use arrow::error::ArrowError;
     use arrow::util::pretty::pretty_format_columns;
-    use arrow_buffer::Buffer;
+    use arrow_buffer::{Buffer, NullBuffer};
     use arrow_schema::Fields;
     use chrono::NaiveDate;
     use rand::Rng;
@@ -6589,6 +6588,43 @@ mod tests {
         assert_batches_eq!(&expected, &[batch]);
     }
 
+    #[test]
+    fn test_null_bug() {
+        let field_a = Field::new("a", DataType::Int32, true);
+        let field_b = Field::new("b", DataType::Int32, true);
+        let fields = Fields::from(vec![field_a, field_b]);
+
+        let array_a = Arc::new(Int32Array::from_iter_values([1]));
+        let array_b = Arc::new(Int32Array::from_iter_values([2]));
+        let arrays: Vec<ArrayRef> = vec![array_a, array_b];
+
+        let mut not_nulls = BooleanBufferBuilder::new(1);
+        not_nulls.append(true);
+        let not_nulls = not_nulls.finish();
+        let not_nulls = Some(NullBuffer::new(not_nulls));
+
+        let ar = StructArray::new(fields, arrays, not_nulls);
+        let s = ScalarValue::Struct(Arc::new(ar));
+
+        assert_eq!(s.to_string(), "{a:1,b:2}");
+        assert_eq!(format!("{s:?}"), r#"Struct({a:1,b:2})"#);
+
+        let ScalarValue::Struct(arr) = s else {
+            panic!("Expected struct");
+        };
+
+        //verify compared to arrow display
+        let batch = RecordBatch::try_from_iter(vec![("s", arr as _)]).unwrap();
+        let expected = [
+            "+--------------+",
+            "| s            |",
+            "+--------------+",
+            "| {a: 1, b: 2} |",
+            "+--------------+",
+        ];
+        assert_batches_eq!(&expected, &[batch]);
+    }
+
     #[test]
     fn test_struct_display_null() {
         let fields = vec![Field::new("a", DataType::Int32, false)];
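(The display change in the first hunk fixes a single-row invariant: a ScalarValue::Struct wraps a one-row StructArray, so the struct-level null buffer must be probed at row 0 for every field. The old code enumerated the fields and passed the field index to is_null, so any field past the first consulted a row that does not exist; the new test_null_bug test above pins the corrected output for a non-null two-field struct.)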