Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Coerce Array inner types #13452

Merged
merged 19 commits into from
Nov 21, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 57 additions & 16 deletions datafusion/expr-common/src/type_coercion/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -480,11 +480,6 @@ fn type_union_resolution_coercion(
let new_value_type = type_union_resolution_coercion(value_type, other_type);
new_value_type.map(|t| DataType::Dictionary(index_type.clone(), Box::new(t)))
}
(DataType::List(lhs), DataType::List(rhs)) => {
let new_item_type =
type_union_resolution_coercion(lhs.data_type(), rhs.data_type());
new_item_type.map(|t| DataType::List(Arc::new(Field::new("item", t, true))))
}
(DataType::Struct(lhs), DataType::Struct(rhs)) => {
if lhs.len() != rhs.len() {
return None;
Expand Down Expand Up @@ -529,6 +524,7 @@ fn type_union_resolution_coercion(
// Numeric coercion is the same as comparison coercion, both find the narrowest type
// that can accommodate both types
binary_numeric_coercion(lhs_type, rhs_type)
.or_else(|| list_coercion(lhs_type, rhs_type))
.or_else(|| temporal_coercion_nonstrict_timezone(lhs_type, rhs_type))
.or_else(|| string_coercion(lhs_type, rhs_type))
.or_else(|| numeric_string_coercion(lhs_type, rhs_type))
Expand Down Expand Up @@ -1138,27 +1134,46 @@ fn numeric_string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<D
}
}

/// Coerces two fields together, ensuring the field data (name and nullability) is correctly set.
fn coerce_list_children(lhs_field: &FieldRef, rhs_field: &FieldRef) -> Option<FieldRef> {
let data_types = vec![lhs_field.data_type().clone(), rhs_field.data_type().clone()];
Some(Arc::new(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add a comment here explaining why this function is needed -- specifically that it is setting the DataType / field name correctly?

(**lhs_field)
.clone()
.with_data_type(type_union_resolution(&data_types)?)
.with_nullable(lhs_field.is_nullable() || rhs_field.is_nullable()),
))
}

/// Coercion rules for list types.
fn list_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
use arrow::datatypes::DataType::*;
match (lhs_type, rhs_type) {
(List(_), List(_)) => Some(lhs_type.clone()),
(LargeList(_), List(_)) => Some(lhs_type.clone()),
blaginin marked this conversation as resolved.
Show resolved Hide resolved
(List(_), LargeList(_)) => Some(rhs_type.clone()),
(LargeList(_), LargeList(_)) => Some(lhs_type.clone()),
(List(_), FixedSizeList(_, _)) => Some(lhs_type.clone()),
(FixedSizeList(_, _), List(_)) => Some(rhs_type.clone()),
(
LargeList(lhs_field),
List(rhs_field) | LargeList(rhs_field) | FixedSizeList(rhs_field, _),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • Let's maybe add listview to the mix. it's hard to test, but why not be future-proof here?
  • let's maybe reorder clauses so that it's clear that any list types are handled
 match (lhs_type, rhs_type) {
        // Coerce to the left side FixedSizeList type if the list lengths are the same,
        // otherwise coerce to list with the left type for dynamic length
        (FixedSizeList(lhs_field, ls), FixedSizeList(rhs_field, rs)) if ls == rs => Some(
            FixedSizeList(coerce_list_children(lhs_field, rhs_field)?, *rs),
        ),

        // Left is a LargeList[View] or right is a LargeList[View]
        (
            LargeList(lhs_field) | LargeListView(lhs_field),
            List(rhs_field)
            | ListView(rhs_field)
            | LargeList(rhs_field)
            | LargeListView(rhs_field)
            | FixedSizeList(rhs_field, _),
        )
        | (
            List(lhs_field)
            | ListView(lhs_field)
            | FixedSizeList(lhs_field, _)
            | LargeList(lhs_field)
            | LargeListView(lhs_field),
            LargeList(rhs_field) | LargeListView(rhs_field),
        ) => Some(LargeList(coerce_list_children(lhs_field, rhs_field)?)),

        // Left and right are lists
        (
            List(lhs_field) | ListView(lhs_field) | FixedSizeList(lhs_field, _),
            List(rhs_field) | ListView(rhs_field) | FixedSizeList(rhs_field, _),
        ) => Some(List(coerce_list_children(lhs_field, rhs_field)?)),

        _ => None,
    }

Copy link
Contributor

@jayzhan211 jayzhan211 Nov 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is ListView already supported in arrow? I would prefer to handle list view if there is corresponding test as well to ensure the test coverage

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TBH I'd as well would handle this separately with a proper test. I'll put a ticket

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but will reorder the arms

)
| (List(lhs_field) | FixedSizeList(lhs_field, _), LargeList(rhs_field)) => {
Some(LargeList(coerce_list_children(lhs_field, rhs_field)?))
}

(List(lhs_field), List(rhs_field) | FixedSizeList(rhs_field, _))
| (FixedSizeList(lhs_field, _), List(rhs_field)) => {
Some(List(coerce_list_children(lhs_field, rhs_field)?))
}

// Coerce to the left side FixedSizeList type if the list lengths are the same,
// otherwise coerce to list with the left type for dynamic length
(FixedSizeList(lf, ls), FixedSizeList(_, rs)) => {
(FixedSizeList(lhs_field, ls), FixedSizeList(rhs_field, rs)) => {
if ls == rs {
Some(lhs_type.clone())
Some(FixedSizeList(
coerce_list_children(lhs_field, rhs_field)?,
*rs,
))
} else {
Some(List(Arc::clone(lf)))
Some(List(coerce_list_children(lhs_field, rhs_field)?))
}
}
(LargeList(_), FixedSizeList(_, _)) => Some(lhs_type.clone()),
(FixedSizeList(_, _), LargeList(_)) => Some(rhs_type.clone()),
_ => None,
}
}
Expand Down Expand Up @@ -2105,10 +2120,36 @@ mod tests {
DataType::List(Arc::clone(&inner_field))
);

// Negative test: inner_timestamp_field and inner_field are not compatible because their inner types are not compatible
let inner_timestamp_field = Arc::new(Field::new(
"item",
DataType::Timestamp(TimeUnit::Microsecond, None),
true,
));
let result_type = get_input_types(
&DataType::List(Arc::clone(&inner_field)),
&Operator::Eq,
&DataType::List(Arc::clone(&inner_timestamp_field)),
);
assert!(result_type.is_err());

// TODO add other data type
Ok(())
}

#[test]
fn test_list_coercion() {
let lhs_type = DataType::List(Arc::new(Field::new("lhs", DataType::Int8, false)));

let rhs_type = DataType::List(Arc::new(Field::new("rhs", DataType::Int64, true)));

let coerced_type = list_coercion(&lhs_type, &rhs_type).unwrap();
assert_eq!(
coerced_type,
DataType::List(Arc::new(Field::new("lhs", DataType::Int64, true)))
); // nullable because the RHS is nullable
}

#[test]
fn test_type_coercion_logical_op() -> Result<()> {
test_coercion_binary_rule!(
Expand Down
3 changes: 1 addition & 2 deletions datafusion/functions-nested/src/make_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ use arrow_array::{
new_null_array, Array, ArrayRef, GenericListArray, NullArray, OffsetSizeTrait,
};
use arrow_buffer::OffsetBuffer;
use arrow_schema::DataType::{LargeList, List, Null};
use arrow_schema::DataType::{List, Null};
use arrow_schema::{DataType, Field};
use datafusion_common::{plan_err, utils::array_into_list_array_nullable, Result};
use datafusion_expr::binary::{
Expand Down Expand Up @@ -198,7 +198,6 @@ pub(crate) fn make_array_inner(arrays: &[ArrayRef]) -> Result<ArrayRef> {
let array = new_null_array(&DataType::Int64, length);
Ok(Arc::new(array_into_list_array_nullable(array)))
}
LargeList(..) => array_array::<i64>(arrays, data_type),
Copy link
Contributor Author

@blaginin blaginin Nov 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this line was correct. According to the signature, the function output is always a List. However, this one sometimes returns a LargeList.

I added a test to illustrate this:

select make_array(make_array(1), arrow_cast(make_array(-1), 'LargeList(Int8)'))

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

query ?
select make_array(make_array(1), arrow_cast(make_array(-1), 'LargeList(Int8)'));
----
[[1], [-1]]

query T
select arrow_typeof(make_array(make_array(1), arrow_cast(make_array(-1), 'LargeList(Int8)')));
----
List(Field { name: "item", data_type: LargeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })

It seems correct 🤔 ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. You fix it :) Thanks

_ => array_array::<i32>(arrays, data_type),
}
}
Expand Down
6 changes: 6 additions & 0 deletions datafusion/sqllogictest/test_files/array.slt
Original file line number Diff line number Diff line change
Expand Up @@ -1155,6 +1155,12 @@ select column1, column5 from arrays_values_without_nulls;
[21, 22, 23, 24, 25, 26, 27, 28, 29, 30] [6, 7]
[31, 32, 33, 34, 35, 26, 37, 38, 39, 40] [8, 9]

# make array with arrays of different types
query ?
select make_array(make_array(1), arrow_cast(make_array(-1), 'LargeList(Int8)'))
----
[[1], [-1]]
blaginin marked this conversation as resolved.
Show resolved Hide resolved

query ???
select make_array(column1),
make_array(column1, column5),
Expand Down
16 changes: 16 additions & 0 deletions datafusion/sqllogictest/test_files/union.slt
Original file line number Diff line number Diff line change
Expand Up @@ -761,3 +761,19 @@ SELECT NULL WHERE FALSE;
----
0.5
1

# Test Union of List Types. Issue: https://github.com/apache/datafusion/issues/12291
query error DataFusion error: type_coercion\ncaused by\nError during planning: Incompatible inputs for Union: Previous inputs were of type List(.*), but got incompatible type List(.*) on column 'x'
SELECT make_array(2) x UNION ALL SELECT make_array(now()) x;

query ?
select make_array(arrow_cast(2, 'UInt8')) x UNION ALL SELECT make_array(arrow_cast(-2, 'Int8')) x;
----
[-2]
[2]

query ?
select make_array(make_array(1)) x UNION ALL SELECT make_array(arrow_cast(make_array(-1), 'LargeList(Int8)')) x;
----
[[-1]]
[[1]]