-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Coerce Array inner types #13452
Coerce Array inner types #13452
Changes from 16 commits
5d109d1
9ddeda1
f353e3f
22414e3
3c9a768
6e0c56b
d169abc
baf99cb
639f236
2691f80
1181abc
306e7fa
18ee213
cb72340
4005c0c
91b468d
a3052be
1692229
5d39c20
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -480,11 +480,6 @@ fn type_union_resolution_coercion( | |
let new_value_type = type_union_resolution_coercion(value_type, other_type); | ||
new_value_type.map(|t| DataType::Dictionary(index_type.clone(), Box::new(t))) | ||
} | ||
(DataType::List(lhs), DataType::List(rhs)) => { | ||
let new_item_type = | ||
type_union_resolution_coercion(lhs.data_type(), rhs.data_type()); | ||
new_item_type.map(|t| DataType::List(Arc::new(Field::new("item", t, true)))) | ||
} | ||
(DataType::Struct(lhs), DataType::Struct(rhs)) => { | ||
if lhs.len() != rhs.len() { | ||
return None; | ||
|
@@ -529,6 +524,7 @@ fn type_union_resolution_coercion( | |
// Numeric coercion is the same as comparison coercion, both find the narrowest type | ||
// that can accommodate both types | ||
binary_numeric_coercion(lhs_type, rhs_type) | ||
.or_else(|| list_coercion(lhs_type, rhs_type)) | ||
.or_else(|| temporal_coercion_nonstrict_timezone(lhs_type, rhs_type)) | ||
.or_else(|| string_coercion(lhs_type, rhs_type)) | ||
.or_else(|| numeric_string_coercion(lhs_type, rhs_type)) | ||
|
@@ -1138,27 +1134,46 @@ fn numeric_string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<D | |
} | ||
} | ||
|
||
/// Coerces two fields together, ensuring the field data (name and nullability) is correctly set. | ||
fn coerce_list_children(lhs_field: &FieldRef, rhs_field: &FieldRef) -> Option<FieldRef> { | ||
let data_types = vec![lhs_field.data_type().clone(), rhs_field.data_type().clone()]; | ||
Some(Arc::new( | ||
(**lhs_field) | ||
.clone() | ||
.with_data_type(type_union_resolution(&data_types)?) | ||
.with_nullable(lhs_field.is_nullable() || rhs_field.is_nullable()), | ||
)) | ||
} | ||
|
||
/// Coercion rules for list types. | ||
fn list_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> { | ||
use arrow::datatypes::DataType::*; | ||
match (lhs_type, rhs_type) { | ||
(List(_), List(_)) => Some(lhs_type.clone()), | ||
(LargeList(_), List(_)) => Some(lhs_type.clone()), | ||
blaginin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
(List(_), LargeList(_)) => Some(rhs_type.clone()), | ||
(LargeList(_), LargeList(_)) => Some(lhs_type.clone()), | ||
(List(_), FixedSizeList(_, _)) => Some(lhs_type.clone()), | ||
(FixedSizeList(_, _), List(_)) => Some(rhs_type.clone()), | ||
( | ||
LargeList(lhs_field), | ||
List(rhs_field) | LargeList(rhs_field) | FixedSizeList(rhs_field, _), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
match (lhs_type, rhs_type) {
// Coerce to the left side FixedSizeList type if the list lengths are the same,
// otherwise coerce to list with the left type for dynamic length
(FixedSizeList(lhs_field, ls), FixedSizeList(rhs_field, rs)) if ls == rs => Some(
FixedSizeList(coerce_list_children(lhs_field, rhs_field)?, *rs),
),
// Left is a LargeList[View] or right is a LargeList[View]
(
LargeList(lhs_field) | LargeListView(lhs_field),
List(rhs_field)
| ListView(rhs_field)
| LargeList(rhs_field)
| LargeListView(rhs_field)
| FixedSizeList(rhs_field, _),
)
| (
List(lhs_field)
| ListView(lhs_field)
| FixedSizeList(lhs_field, _)
| LargeList(lhs_field)
| LargeListView(lhs_field),
LargeList(rhs_field) | LargeListView(rhs_field),
) => Some(LargeList(coerce_list_children(lhs_field, rhs_field)?)),
// Left and right are lists
(
List(lhs_field) | ListView(lhs_field) | FixedSizeList(lhs_field, _),
List(rhs_field) | ListView(rhs_field) | FixedSizeList(rhs_field, _),
) => Some(List(coerce_list_children(lhs_field, rhs_field)?)),
_ => None,
} There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is ListView already supported in arrow? I would prefer to handle list view if there is corresponding test as well to ensure the test coverage There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. TBH I'd as well would handle this separately with a proper test. I'll put a ticket There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. but will reorder the arms |
||
) | ||
| (List(lhs_field) | FixedSizeList(lhs_field, _), LargeList(rhs_field)) => { | ||
Some(LargeList(coerce_list_children(lhs_field, rhs_field)?)) | ||
} | ||
|
||
(List(lhs_field), List(rhs_field) | FixedSizeList(rhs_field, _)) | ||
| (FixedSizeList(lhs_field, _), List(rhs_field)) => { | ||
Some(List(coerce_list_children(lhs_field, rhs_field)?)) | ||
} | ||
|
||
// Coerce to the left side FixedSizeList type if the list lengths are the same, | ||
// otherwise coerce to list with the left type for dynamic length | ||
(FixedSizeList(lf, ls), FixedSizeList(_, rs)) => { | ||
(FixedSizeList(lhs_field, ls), FixedSizeList(rhs_field, rs)) => { | ||
if ls == rs { | ||
Some(lhs_type.clone()) | ||
Some(FixedSizeList( | ||
coerce_list_children(lhs_field, rhs_field)?, | ||
*rs, | ||
)) | ||
} else { | ||
Some(List(Arc::clone(lf))) | ||
Some(List(coerce_list_children(lhs_field, rhs_field)?)) | ||
} | ||
} | ||
(LargeList(_), FixedSizeList(_, _)) => Some(lhs_type.clone()), | ||
(FixedSizeList(_, _), LargeList(_)) => Some(rhs_type.clone()), | ||
_ => None, | ||
} | ||
} | ||
|
@@ -2105,10 +2120,36 @@ mod tests { | |
DataType::List(Arc::clone(&inner_field)) | ||
); | ||
|
||
// Negative test: inner_timestamp_field and inner_field are not compatible because their inner types are not compatible | ||
let inner_timestamp_field = Arc::new(Field::new( | ||
"item", | ||
DataType::Timestamp(TimeUnit::Microsecond, None), | ||
true, | ||
)); | ||
let result_type = get_input_types( | ||
&DataType::List(Arc::clone(&inner_field)), | ||
&Operator::Eq, | ||
&DataType::List(Arc::clone(&inner_timestamp_field)), | ||
); | ||
assert!(result_type.is_err()); | ||
|
||
// TODO add other data type | ||
Ok(()) | ||
} | ||
|
||
#[test] | ||
fn test_list_coercion() { | ||
let lhs_type = DataType::List(Arc::new(Field::new("lhs", DataType::Int8, false))); | ||
|
||
let rhs_type = DataType::List(Arc::new(Field::new("rhs", DataType::Int64, true))); | ||
|
||
let coerced_type = list_coercion(&lhs_type, &rhs_type).unwrap(); | ||
assert_eq!( | ||
coerced_type, | ||
DataType::List(Arc::new(Field::new("lhs", DataType::Int64, true))) | ||
); // nullable because the RHS is nullable | ||
} | ||
|
||
#[test] | ||
fn test_type_coercion_logical_op() -> Result<()> { | ||
test_coercion_binary_rule!( | ||
|
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -26,7 +26,7 @@ use arrow_array::{ | |||
new_null_array, Array, ArrayRef, GenericListArray, NullArray, OffsetSizeTrait, | ||||
}; | ||||
use arrow_buffer::OffsetBuffer; | ||||
use arrow_schema::DataType::{LargeList, List, Null}; | ||||
use arrow_schema::DataType::{List, Null}; | ||||
use arrow_schema::{DataType, Field}; | ||||
use datafusion_common::{plan_err, utils::array_into_list_array_nullable, Result}; | ||||
use datafusion_expr::binary::{ | ||||
|
@@ -198,7 +198,6 @@ pub(crate) fn make_array_inner(arrays: &[ArrayRef]) -> Result<ArrayRef> { | |||
let array = new_null_array(&DataType::Int64, length); | ||||
Ok(Arc::new(array_into_list_array_nullable(array))) | ||||
} | ||||
LargeList(..) => array_array::<i64>(arrays, data_type), | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think this line was correct. According to the signature, the function output is always a I added a test to illustrate this:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
It seems correct 🤔 ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see. You fix it :) Thanks |
||||
_ => array_array::<i32>(arrays, data_type), | ||||
} | ||||
} | ||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you add a comment here explaining why this function is needed -- specifically that it is setting the DataType / field name correctly?