Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update INITCAP scalar function to support Utf8View #11888

Merged
merged 4 commits into from
Aug 12, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 71 additions & 20 deletions datafusion/functions/src/string/initcap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@
use std::any::Any;
use std::sync::Arc;

use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringArray};
use arrow::datatypes::DataType;

use datafusion_common::cast::as_generic_string_array;
use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
use datafusion_common::{exec_err, Result};
use datafusion_expr::{ColumnarValue, Volatility};
use datafusion_expr::{ScalarUDFImpl, Signature};
Expand All @@ -45,7 +45,7 @@ impl InitcapFunc {
Self {
signature: Signature::uniform(
1,
vec![Utf8, LargeUtf8],
vec![Utf8, LargeUtf8, Utf8View],
Volatility::Immutable,
),
}
Expand Down Expand Up @@ -73,6 +73,7 @@ impl ScalarUDFImpl for InitcapFunc {
match args[0].data_type() {
DataType::Utf8 => make_scalar_function(initcap::<i32>, vec![])(args),
DataType::LargeUtf8 => make_scalar_function(initcap::<i64>, vec![])(args),
DataType::Utf8View => make_scalar_function(initcap_utf8view, vec![])(args),
other => {
exec_err!("Unsupported data type {other:?} for function initcap")
}
Expand All @@ -88,28 +89,40 @@ fn initcap<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
// first map is the iterator, second is for the `Option<_>`
let result = string_array
.iter()
.map(|string| {
string.map(|string: &str| {
let mut char_vector = Vec::<char>::new();
let mut previous_character_letter_or_number = false;
for c in string.chars() {
if previous_character_letter_or_number {
char_vector.push(c.to_ascii_lowercase());
} else {
char_vector.push(c.to_ascii_uppercase());
}
previous_character_letter_or_number = c.is_ascii_uppercase()
|| c.is_ascii_lowercase()
|| c.is_ascii_digit();
}
char_vector.iter().collect::<String>()
})
})
.map(initcap_string)
.collect::<GenericStringArray<T>>();

Ok(Arc::new(result) as ArrayRef)
}

fn initcap_utf8view(args: &[ArrayRef]) -> Result<ArrayRef> {
let string_view_array = as_string_view_array(&args[0])?;

let result = string_view_array
.iter()
.map(initcap_string)
.collect::<StringArray>();

Ok(Arc::new(result) as ArrayRef)
}

fn initcap_string(string: Option<&str>) -> Option<String> {
string.map(|string: &str| {
let mut char_vector = Vec::<char>::new();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suspect you could make this faster by creating the vector once and then resetting on each loop -- like

        let mut char_vector = Vec::<char>::new();
    string.map(|string: &str| {
      char_vector.clear();
...
}

let mut previous_character_letter_or_number = false;
for c in string.chars() {
if previous_character_letter_or_number {
char_vector.push(c.to_ascii_lowercase());
} else {
char_vector.push(c.to_ascii_uppercase());
}
previous_character_letter_or_number =
c.is_ascii_uppercase() || c.is_ascii_lowercase() || c.is_ascii_digit();
}
char_vector.iter().collect::<String>()
})
}

#[cfg(test)]
mod tests {
use crate::string::initcap::InitcapFunc;
Expand Down Expand Up @@ -153,6 +166,44 @@ mod tests {
Utf8,
StringArray
);
test_function!(
InitcapFunc::new(),
&[ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
"hi THOMAS".to_string()
)))],
Ok(Some("Hi Thomas")),
&str,
Utf8,
StringArray
);
test_function!(
InitcapFunc::new(),
&[ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
"hi THOMAS wIth M0re ThAN 12 ChaRs".to_string()
)))],
Ok(Some("Hi Thomas With M0re Than 12 Chars")),
&str,
Utf8,
StringArray
);
test_function!(
InitcapFunc::new(),
&[ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
"".to_string()
)))],
Ok(Some("")),
&str,
Utf8,
StringArray
);
test_function!(
InitcapFunc::new(),
&[ColumnarValue::Scalar(ScalarValue::Utf8View(None))],
Ok(None),
&str,
Utf8,
StringArray
);
xinlifoobar marked this conversation as resolved.
Show resolved Hide resolved

Ok(())
}
Expand Down
40 changes: 36 additions & 4 deletions datafusion/sqllogictest/test_files/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,41 @@ logical_plan
01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4
02)--TableScan: test projection=[column1_utf8view]

### Initcap
xinlifoobar marked this conversation as resolved.
Show resolved Hide resolved

# Create a table with lowercase strings
statement ok
CREATE TABLE test_lowercase AS SELECT
lower(column1_utf8) as column1_utf8_lower,
lower(column1_large_utf8) as column1_large_utf8_lower,
lower(column1_utf8view) as column1_utf8view_lower
FROM test;

# Test INITCAP with utf8view, utf8, and largeutf8
# Should not cast anything
query TT
EXPLAIN SELECT
INITCAP(column1_utf8view_lower) as c1,
INITCAP(column1_utf8_lower) as c2,
INITCAP(column1_large_utf8_lower) as c3
FROM test_lowercase;
----
logical_plan
01)Projection: initcap(test_lowercase.column1_utf8view_lower) AS c1, initcap(test_lowercase.column1_utf8_lower) AS c2, initcap(test_lowercase.column1_large_utf8_lower) AS c3
02)--TableScan: test_lowercase projection=[column1_utf8_lower, column1_large_utf8_lower, column1_utf8view_lower]

query TTT
SELECT
INITCAP(column1_utf8view_lower) as c1,
INITCAP(column1_utf8_lower) as c2,
INITCAP(column1_large_utf8_lower) as c3
FROM test_lowercase;
----
Andrew Andrew Andrew
Xiangpeng Xiangpeng Xiangpeng
Raphael Raphael Raphael
NULL NULL NULL

# Ensure string functions use native StringView implementation
# and do not fall back to Utf8 or LargeUtf8
# Should see no casts to Utf8 in the plans below
Expand Down Expand Up @@ -586,16 +621,13 @@ logical_plan
02)--Projection: CAST(test.column2_utf8view AS Utf8) AS __common_expr_1, test.column1_utf8view
03)----TableScan: test projection=[column1_utf8view, column2_utf8view]


## Ensure no casts for INITCAP
## TODO https://github.com/apache/datafusion/issues/11853
query TT
EXPLAIN SELECT
INITCAP(column1_utf8view) as c
FROM test;
----
logical_plan
01)Projection: initcap(CAST(test.column1_utf8view AS Utf8)) AS c
01)Projection: initcap(test.column1_utf8view) AS c
xinlifoobar marked this conversation as resolved.
Show resolved Hide resolved
02)--TableScan: test projection=[column1_utf8view]

## Ensure no casts for LEVENSHTEIN
Expand Down