From 45117f65f3d90579a77bd525a75ac568a888ef0a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 31 Jul 2024 15:41:30 -0400 Subject: [PATCH 1/6] Minor: Add tests for StringView / character functions --- .../sqllogictest/test_files/string_view.slt | 364 ++++++++++++++++++ 1 file changed, 364 insertions(+) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 3f9a4793f655d..c3b8916014e48 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -322,6 +322,370 @@ logical_plan 03)----TableScan: test projection=[column1_utf8, column2_utf8, column1_utf8view] +# Ensure string functions use native StringView implementation +# and do not fall back to Utf8 or LargeUtf8 +# Should see no casts to Utf8 in the plans below + +## Ensure no casts for LIKE/ILIKE +query TT +EXPLAIN SELECT + column1_utf8view like 'foo' as "like", + column1_utf8view ilike 'foo' as "ilike" +FROM test; +---- +logical_plan +01)Projection: test.column1_utf8view LIKE Utf8View("foo") AS like, test.column1_utf8view ILIKE Utf8View("foo") AS ilike +02)--TableScan: test projection=[column1_utf8view] + + + +## Ensure no casts for ASCII +## TODO file ticket +query TT +EXPLAIN SELECT + ASCII(column1_utf8view) AS l +FROM test; +---- +logical_plan +01)Projection: ascii(CAST(test.column1_utf8view AS Utf8)) AS l +02)--TableScan: test projection=[column1_utf8view] + + +## Ensure no casts for BTRIM +## TODO file ticket +query TT +EXPLAIN SELECT + BTRIM(column1_utf8view, 'foo') AS l +FROM test; +---- +logical_plan +01)Projection: btrim(CAST(test.column1_utf8view AS Utf8), Utf8("foo")) AS l +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for CHARACTER_LENGTH +## TODO file ticket +query TT +EXPLAIN SELECT + CHARACTER_LENGTH(column1_utf8view) AS l +FROM test; +---- +logical_plan +01)Projection: character_length(CAST(test.column1_utf8view AS Utf8)) AS l 
+02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for CONCAT +## TODO file ticket +query TT +EXPLAIN SELECT + concat(column1_utf8view, column2_utf8view) as c +FROM test; +---- +logical_plan +01)Projection: concat(CAST(test.column1_utf8view AS Utf8), CAST(test.column2_utf8view AS Utf8)) AS c +02)--TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for CONCAT_WS +## TODO file ticket +query TT +EXPLAIN SELECT + concat_ws(', ', column1_utf8view, column2_utf8view) as c +FROM test; +---- +logical_plan +01)Projection: concat_ws(Utf8(", "), CAST(test.column1_utf8view AS Utf8), CAST(test.column2_utf8view AS Utf8)) AS c +02)--TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for CONTAINS +## TODO file ticket +query TT +EXPLAIN SELECT + CONTAINS(column1_utf8view, 'foo') as c1, + CONTAINS(column2_utf8view, column2_utf8view) as c2 +FROM test; +---- +logical_plan +01)Projection: contains(CAST(test.column1_utf8view AS Utf8), Utf8("foo")) AS c1, contains(__common_expr_1, __common_expr_1) AS c2 +02)--Projection: CAST(test.column2_utf8view AS Utf8) AS __common_expr_1, test.column1_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for ENDS_WITH +## TODO file ticket +query TT +EXPLAIN SELECT + ENDS_WITH(column1_utf8view, 'foo') as c1, + ENDS_WITH(column2_utf8view, column2_utf8view) as c2 +FROM test; +---- +logical_plan +01)Projection: ends_with(CAST(test.column1_utf8view AS Utf8), Utf8("foo")) AS c1, ends_with(__common_expr_1, __common_expr_1) AS c2 +02)--Projection: CAST(test.column2_utf8view AS Utf8) AS __common_expr_1, test.column1_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + + +## Ensure no casts for INITCAP +## TODO file ticket +query TT +EXPLAIN SELECT + INITCAP(column1_utf8view) as c +FROM test; +---- +logical_plan +01)Projection: initcap(CAST(test.column1_utf8view AS Utf8)) AS c 
+02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for LEVENSHTEIN +## TODO file ticket +query TT +EXPLAIN SELECT + levenshtein(column1_utf8view, 'foo') as c1, + levenshtein(column1_utf8view, column2_utf8view) as c2 +FROM test; +---- +logical_plan +01)Projection: levenshtein(__common_expr_1, Utf8("foo")) AS c1, levenshtein(__common_expr_1, CAST(test.column2_utf8view AS Utf8)) AS c2 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for LOWER +## TODO file ticket +query TT +EXPLAIN SELECT + LOWER(column1_utf8view) as c1 +FROM test; +---- +logical_plan +01)Projection: lower(CAST(test.column1_utf8view AS Utf8)) AS c1 +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for LTRIM +## TODO file ticket +query TT +EXPLAIN SELECT + LTRIM(column1_utf8view) as c1 +FROM test; +---- +logical_plan +01)Projection: ltrim(CAST(test.column1_utf8view AS Utf8)) AS c1 +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for LPAD +## TODO file ticket +query TT +EXPLAIN SELECT + LPAD(column1_utf8view, 12, ' ') as c1 +FROM test; +---- +logical_plan +01)Projection: lpad(CAST(test.column1_utf8view AS Utf8), Int64(12), Utf8(" ")) AS c1 +02)--TableScan: test projection=[column1_utf8view] + + +## Ensure no casts for OCTET_LENGTH +## TODO file ticket +query TT +EXPLAIN SELECT + OCTET_LENGTH(column1_utf8view) as c1 +FROM test; +---- +logical_plan +01)Projection: octet_length(CAST(test.column1_utf8view AS Utf8)) AS c1 +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for OVERLAY +## TODO file ticket +query TT +EXPLAIN SELECT + OVERLAY(column1_utf8view PLACING 'foo' FROM 2 ) as c1 +FROM test; +---- +logical_plan +01)Projection: overlay(CAST(test.column1_utf8view AS Utf8), Utf8("foo"), Int64(2)) AS c1 +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts 
for REGEXP_LIKE +query error DataFusion error: Error during planning: The regexp_like function can only accept strings\. Got Utf8View +EXPLAIN SELECT + REGEXP_LIKE(column1_utf8view, '^https?://(?:www\.)?([^/]+)/.*$') AS k +FROM test; + +## Ensure no casts for REGEXP_MATCH +query error DataFusion error: Error during planning: The regexp_match function can only accept strings\. Got Utf8View +EXPLAIN SELECT + REGEXP_MATCH(column1_utf8view, '^https?://(?:www\.)?([^/]+)/.*$') AS k +FROM test; + +## Ensure no casts for REGEXP_REPLACE +query TT +EXPLAIN SELECT + REGEXP_REPLACE(column1_utf8view, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k +FROM test; +---- +logical_plan +01)Projection: regexp_replace(test.column1_utf8view, Utf8("^https?://(?:www\.)?([^/]+)/.*$"), Utf8("\1")) AS k +02)--TableScan: test projection=[column1_utf8view] + + +## Ensure no casts for REPEAT +## TODO file ticket +query TT +EXPLAIN SELECT + REPEAT(column1_utf8view, 2) as c1 +FROM test; +---- +logical_plan +01)Projection: repeat(CAST(test.column1_utf8view AS Utf8), Int64(2)) AS c1 +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for REPLACE +## TODO file ticket +query TT +EXPLAIN SELECT + REPLACE(column1_utf8view, 'foo', 'bar') as c1, + REPLACE(column1_utf8view, column2_utf8view, 'bar') as c2 +FROM test; +---- +logical_plan +01)Projection: replace(__common_expr_1, Utf8("foo"), Utf8("bar")) AS c1, replace(__common_expr_1, CAST(test.column2_utf8view AS Utf8), Utf8("bar")) AS c2 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for REVERSE +## TODO file ticket +query TT +EXPLAIN SELECT + REVERSE(column1_utf8view) as c1 +FROM test; +---- +logical_plan +01)Projection: reverse(CAST(test.column1_utf8view AS Utf8)) AS c1 +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for RTRIM +## TODO file ticket +query TT +EXPLAIN 
SELECT + RTRIM(column1_utf8view) as c1, + RTRIM(column1_utf8view, 'foo') as c2 +FROM test; +---- +logical_plan +01)Projection: rtrim(__common_expr_1) AS c1, rtrim(__common_expr_1, Utf8("foo")) AS c2 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1 +03)----TableScan: test projection=[column1_utf8view] + +## Ensure no casts for RIGHT +## TODO file ticket +query TT +EXPLAIN SELECT + RIGHT(column1_utf8view, 3) as c2 +FROM test; +---- +logical_plan +01)Projection: right(CAST(test.column1_utf8view AS Utf8), Int64(3)) AS c2 +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for RPAD +## TODO file ticket +query TT +EXPLAIN SELECT + RPAD(column1_utf8view, 1) as c1, + RPAD(column1_utf8view, 2, column2_utf8view) as c2 +FROM test; +---- +logical_plan +01)Projection: rpad(__common_expr_1, Int64(1)) AS c1, rpad(__common_expr_1, Int64(2), CAST(test.column2_utf8view AS Utf8)) AS c2 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + + +## Ensure no casts for RTRIM +## TODO file ticket +query TT +EXPLAIN SELECT + RTRIM(column1_utf8view) as c, + RTRIM(column1_utf8view, column2_utf8view) as c1 +FROM test; +---- +logical_plan +01)Projection: rtrim(__common_expr_1) AS c, rtrim(__common_expr_1, CAST(test.column2_utf8view AS Utf8)) AS c1 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for SPLIT_PART +## TODO file ticket +query TT +EXPLAIN SELECT + SPLIT_PART(column1_utf8view, 'f', 1) as c +FROM test; +---- +logical_plan +01)Projection: split_part(CAST(test.column1_utf8view AS Utf8), Utf8("f"), Int64(1)) AS c +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for STRPOS +## TODO file ticket +query TT +EXPLAIN SELECT + STRPOS(column1_utf8view, 'f') as c, + 
STRPOS(column1_utf8view, column2_utf8view) as c2 +FROM test; +---- +logical_plan +01)Projection: strpos(__common_expr_1, Utf8("f")) AS c, strpos(__common_expr_1, CAST(test.column2_utf8view AS Utf8)) AS c2 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for SUBSTR +## TODO file ticket +query TT +EXPLAIN SELECT + SUBSTR(column1_utf8view, 1) as c, + SUBSTR(column1_utf8view, 1 ,2) as c2 +FROM test; +---- +logical_plan +01)Projection: substr(__common_expr_1, Int64(1)) AS c, substr(__common_expr_1, Int64(1), Int64(2)) AS c2 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1 +03)----TableScan: test projection=[column1_utf8view] + +## Ensure no casts for STARTS_WITH +## TODO file ticket +query TT +EXPLAIN SELECT + STARTS_WITH(column1_utf8view, 'foo') as c, + STARTS_WITH(column1_utf8view, column2_utf8view) as c2 +FROM test; +---- +logical_plan +01)Projection: starts_with(__common_expr_1, Utf8("foo")) AS c, starts_with(__common_expr_1, CAST(test.column2_utf8view AS Utf8)) AS c2 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for TRANSLATE +## TODO file ticket +query TT +EXPLAIN SELECT + TRANSLATE(column1_utf8view, 'foo', 'bar') as c +FROM test; +---- +logical_plan +01)Projection: translate(CAST(test.column1_utf8view AS Utf8), Utf8("foo"), Utf8("bar")) AS c +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for FIND_IN_SET +## TODO file ticket +query TT +EXPLAIN SELECT + FIND_IN_SET(column1_utf8view, 'a,b,c,d') as c +FROM test; +---- +logical_plan +01)Projection: find_in_set(CAST(test.column1_utf8view AS Utf8), Utf8("a,b,c,d")) AS c +02)--TableScan: test projection=[column1_utf8view] + + + + statement ok drop table test; From 
9f3c6515ccbdb4f400e2a51f27f76398585cdbfd Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 31 Jul 2024 16:17:44 -0400 Subject: [PATCH 2/6] Fix regexp_like and regexp_match to work with StringView --- datafusion/functions/src/regex/regexplike.rs | 7 +------ datafusion/functions/src/regex/regexpmatch.rs | 12 ++---------- datafusion/sqllogictest/test_files/string_view.slt | 12 ++++++++++-- 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs index 09b96a28c1074..455d009db7785 100644 --- a/datafusion/functions/src/regex/regexplike.rs +++ b/datafusion/functions/src/regex/regexplike.rs @@ -75,13 +75,8 @@ impl ScalarUDFImpl for RegexpLikeFunc { use DataType::*; Ok(match &arg_types[0] { - LargeUtf8 | Utf8 => Boolean, Null => Null, - other => { - return plan_err!( - "The regexp_like function can only accept strings. Got {other}" - ); - } + _ => Boolean, }) } fn invoke(&self, args: &[ColumnarValue]) -> Result { diff --git a/datafusion/functions/src/regex/regexpmatch.rs b/datafusion/functions/src/regex/regexpmatch.rs index f57d3c17bd72b..764acd7de757d 100644 --- a/datafusion/functions/src/regex/regexpmatch.rs +++ b/datafusion/functions/src/regex/regexpmatch.rs @@ -74,17 +74,9 @@ impl ScalarUDFImpl for RegexpMatchFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { - use DataType::*; - Ok(match &arg_types[0] { - LargeUtf8 => List(Arc::new(Field::new("item", LargeUtf8, true))), - Utf8 => List(Arc::new(Field::new("item", Utf8, true))), - Null => Null, - other => { - return plan_err!( - "The regexp_match function can only accept strings. 
Got {other}" - ); - } + DataType::Null => DataType::Null, + other => DataType::List(Arc::new(Field::new("item", other.clone(), true))), }) } fn invoke(&self, args: &[ColumnarValue]) -> Result { diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index c3b8916014e48..3c5f13ce2ac8d 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -503,16 +503,24 @@ logical_plan 02)--TableScan: test projection=[column1_utf8view] ## Ensure no casts for REGEXP_LIKE -query error DataFusion error: Error during planning: The regexp_like function can only accept strings\. Got Utf8View +query TT EXPLAIN SELECT REGEXP_LIKE(column1_utf8view, '^https?://(?:www\.)?([^/]+)/.*$') AS k FROM test; +---- +logical_plan +01)Projection: regexp_like(CAST(test.column1_utf8view AS Utf8), Utf8("^https?://(?:www\.)?([^/]+)/.*$")) AS k +02)--TableScan: test projection=[column1_utf8view] ## Ensure no casts for REGEXP_MATCH -query error DataFusion error: Error during planning: The regexp_match function can only accept strings\. 
Got Utf8View +query TT EXPLAIN SELECT REGEXP_MATCH(column1_utf8view, '^https?://(?:www\.)?([^/]+)/.*$') AS k FROM test; +---- +logical_plan +01)Projection: regexp_match(CAST(test.column1_utf8view AS Utf8), Utf8("^https?://(?:www\.)?([^/]+)/.*$")) AS k +02)--TableScan: test projection=[column1_utf8view] ## Ensure no casts for REGEXP_REPLACE query TT From 268efcd892e1391a952ef8b815d59bfe8bc78db9 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 8 Aug 2024 12:18:15 -0400 Subject: [PATCH 3/6] Update for ASCII and BTRIM --- .../sqllogictest/test_files/string_view.slt | 175 +++++++----------- 1 file changed, 70 insertions(+), 105 deletions(-) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index a80dc6892ef93..94797a9e7152c 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -443,19 +443,82 @@ logical_plan ## Ensure no casts for ASCII -## TODO file ticket + +### ASCII + +# Test ASCII with utf8view against utf8view, utf8, and largeutf8 +# (should be no casts) query TT EXPLAIN SELECT - ASCII(column1_utf8view) AS l + ASCII(column1_utf8view) as c1, + ASCII(column2_utf8) as c2, + ASCII(column2_large_utf8) as c3 FROM test; ---- logical_plan -01)Projection: ascii(CAST(test.column1_utf8view AS Utf8)) AS l -02)--TableScan: test projection=[column1_utf8view] +01)Projection: ascii(test.column1_utf8view) AS c1, ascii(test.column2_utf8) AS c2, ascii(test.column2_large_utf8) AS c3 +02)--TableScan: test projection=[column2_utf8, column2_large_utf8, column1_utf8view] + +query III +SELECT + ASCII(column1_utf8view) as c1, + ASCII(column2_utf8) as c2, + ASCII(column2_large_utf8) as c3 +FROM test; +---- +65 88 88 +88 88 88 +82 82 82 +NULL 82 82 + +query TT +EXPLAIN SELECT + ASCII(column1_utf8) as c1, + ASCII(column1_large_utf8) as c2, + ASCII(column2_utf8view) as c3, + ASCII('hello') as c4, + ASCII(arrow_cast('world', 'Utf8View')) as c5 +FROM test; 
+---- +logical_plan +01)Projection: ascii(test.column1_utf8) AS c1, ascii(test.column1_large_utf8) AS c2, ascii(test.column2_utf8view) AS c3, Int32(104) AS c4, Int32(119) AS c5 +02)--TableScan: test projection=[column1_utf8, column1_large_utf8, column2_utf8view] + +query IIIII +SELECT + ASCII(column1_utf8) as c1, + ASCII(column1_large_utf8) as c2, + ASCII(column2_utf8view) as c3, + ASCII('hello') as c4, + ASCII(arrow_cast('world', 'Utf8View')) as c5 +FROM test; +---- +65 65 88 104 119 +88 88 88 104 119 +82 82 82 104 119 +NULL NULL 82 104 119 + +# Test ASCII with literals cast to Utf8View +query TT +EXPLAIN SELECT + ASCII(arrow_cast('äöüß', 'Utf8View')) as c1, + ASCII(arrow_cast('', 'Utf8View')) as c2, + ASCII(arrow_cast(NULL, 'Utf8View')) as c3 +FROM test; +---- +logical_plan +01)Projection: Int32(228) AS c1, Int32(0) AS c2, Int32(NULL) AS c3 +02)--TableScan: test projection=[] +query III +SELECT + ASCII(arrow_cast('äöüß', 'Utf8View')) as c1, + ASCII(arrow_cast('', 'Utf8View')) as c2, + ASCII(arrow_cast(NULL, 'Utf8View')) as c3 +---- +228 0 NULL ## Ensure no casts for BTRIM -## TODO file ticket query TT EXPLAIN SELECT BTRIM(column1_utf8view, 'foo') AS l @@ -768,9 +831,8 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: starts_with(__common_expr_1, Utf8("foo")) AS c, starts_with(__common_expr_1, CAST(test.column2_utf8view AS Utf8)) AS c2 -02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view -03)----TableScan: test projection=[column1_utf8view, column2_utf8view] +01)Projection: starts_with(test.column1_utf8view, Utf8View("foo")) AS c, starts_with(test.column1_utf8view, test.column2_utf8view) AS c2 +02)--TableScan: test projection=[column1_utf8view, column2_utf8view] ## Ensure no casts for TRANSLATE ## TODO file ticket @@ -873,101 +935,4 @@ select column2|| ' ' ||column3 from temp; rust fast datafusion cool -### ASCII -# Setup the initial test data -statement ok -create table test_source as values - ('Andrew', 
'X'), - ('Xiangpeng', 'Xiangpeng'), - ('Raphael', 'R'), - (NULL, 'R'); -# Table with the different combination of column types -statement ok -create table test as -SELECT - arrow_cast(column1, 'Utf8') as column1_utf8, - arrow_cast(column2, 'Utf8') as column2_utf8, - arrow_cast(column1, 'LargeUtf8') as column1_large_utf8, - arrow_cast(column2, 'LargeUtf8') as column2_large_utf8, - arrow_cast(column1, 'Utf8View') as column1_utf8view, - arrow_cast(column2, 'Utf8View') as column2_utf8view -FROM test_source; - -# Test ASCII with utf8view against utf8view, utf8, and largeutf8 -# (should be no casts) -query TT -EXPLAIN SELECT - ASCII(column1_utf8view) as c1, - ASCII(column2_utf8) as c2, - ASCII(column2_large_utf8) as c3 -FROM test; ----- -logical_plan -01)Projection: ascii(test.column1_utf8view) AS c1, ascii(test.column2_utf8) AS c2, ascii(test.column2_large_utf8) AS c3 -02)--TableScan: test projection=[column2_utf8, column2_large_utf8, column1_utf8view] - -query III -SELECT - ASCII(column1_utf8view) as c1, - ASCII(column2_utf8) as c2, - ASCII(column2_large_utf8) as c3 -FROM test; ----- -65 88 88 -88 88 88 -82 82 82 -NULL 82 82 - -query TT -EXPLAIN SELECT - ASCII(column1_utf8) as c1, - ASCII(column1_large_utf8) as c2, - ASCII(column2_utf8view) as c3, - ASCII('hello') as c4, - ASCII(arrow_cast('world', 'Utf8View')) as c5 -FROM test; ----- -logical_plan -01)Projection: ascii(test.column1_utf8) AS c1, ascii(test.column1_large_utf8) AS c2, ascii(test.column2_utf8view) AS c3, Int32(104) AS c4, Int32(119) AS c5 -02)--TableScan: test projection=[column1_utf8, column1_large_utf8, column2_utf8view] - -query IIIII -SELECT - ASCII(column1_utf8) as c1, - ASCII(column1_large_utf8) as c2, - ASCII(column2_utf8view) as c3, - ASCII('hello') as c4, - ASCII(arrow_cast('world', 'Utf8View')) as c5 -FROM test; ----- -65 65 88 104 119 -88 88 88 104 119 -82 82 82 104 119 -NULL NULL 82 104 119 - -# Test ASCII with literals cast to Utf8View -query TT -EXPLAIN SELECT - ASCII(arrow_cast('äöüß', 
'Utf8View')) as c1, - ASCII(arrow_cast('', 'Utf8View')) as c2, - ASCII(arrow_cast(NULL, 'Utf8View')) as c3 -FROM test; ----- -logical_plan -01)Projection: Int32(228) AS c1, Int32(0) AS c2, Int32(NULL) AS c3 -02)--TableScan: test projection=[] - -query III -SELECT - ASCII(arrow_cast('äöüß', 'Utf8View')) as c1, - ASCII(arrow_cast('', 'Utf8View')) as c2, - ASCII(arrow_cast(NULL, 'Utf8View')) as c3 ----- -228 0 NULL - -statement ok -drop table test; - -statement ok -drop table test_source; From ead4a7c48c4789825f9d529e676e4fc169f3efa8 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 8 Aug 2024 12:20:30 -0400 Subject: [PATCH 4/6] Add comment about why it is ok to return boolean with catchall match --- datafusion/functions/src/regex/regexplike.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs index 455d009db7785..20029ba005c49 100644 --- a/datafusion/functions/src/regex/regexplike.rs +++ b/datafusion/functions/src/regex/regexplike.rs @@ -76,6 +76,8 @@ impl ScalarUDFImpl for RegexpLikeFunc { Ok(match &arg_types[0] { Null => Null, + // Type coercion is done by DataFusion based on signature, so if we + // get here, the first argument is always a string _ => Boolean, }) } From f42aa84e64536b08d87cba82f46c13f6b6b17c3f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 8 Aug 2024 12:28:24 -0400 Subject: [PATCH 5/6] Fix character_length --- datafusion/functions/src/unicode/character_length.rs | 2 +- datafusion/sqllogictest/test_files/string_view.slt | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/datafusion/functions/src/unicode/character_length.rs b/datafusion/functions/src/unicode/character_length.rs index cee1a57bc6d9d..e46ee162ff12e 100644 --- a/datafusion/functions/src/unicode/character_length.rs +++ b/datafusion/functions/src/unicode/character_length.rs @@ -44,7 +44,7 @@ impl CharacterLengthFunc { Self { signature: Signature::uniform( 1, - 
vec![Utf8, LargeUtf8], + vec![Utf8, LargeUtf8, Utf8View], Volatility::Immutable, ), aliases: vec![String::from("length"), String::from("char_length")], diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 94797a9e7152c..c88c7aa4d84be 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -536,7 +536,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: character_length(CAST(test.column1_utf8view AS Utf8)) AS l +01)Projection: character_length(test.column1_utf8view) AS l 02)--TableScan: test projection=[column1_utf8view] ## Ensure no casts for CONCAT @@ -934,5 +934,3 @@ select column2|| ' ' ||column3 from temp; ---- rust fast datafusion cool - - From 4cb25d00aa779cdf8d9fda7097ae7e79b91c5f0a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 8 Aug 2024 12:42:48 -0400 Subject: [PATCH 6/6] Add ticket references --- .../sqllogictest/test_files/string_view.slt | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index c88c7aa4d84be..e7166690580f9 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -529,7 +529,6 @@ logical_plan 02)--TableScan: test projection=[column1_utf8view] ## Ensure no casts for CHARACTER_LENGTH -## TODO file ticket query TT EXPLAIN SELECT CHARACTER_LENGTH(column1_utf8view) AS l @@ -540,7 +539,7 @@ logical_plan 02)--TableScan: test projection=[column1_utf8view] ## Ensure no casts for CONCAT -## TODO file ticket +## TODO https://github.com/apache/datafusion/issues/11836 query TT EXPLAIN SELECT concat(column1_utf8view, column2_utf8view) as c @@ -551,7 +550,7 @@ logical_plan 02)--TableScan: test projection=[column1_utf8view, column2_utf8view] ## Ensure no casts for CONCAT_WS -## TODO file ticket 
+## TODO https://github.com/apache/datafusion/issues/11837 query TT EXPLAIN SELECT concat_ws(', ', column1_utf8view, column2_utf8view) as c @@ -562,7 +561,7 @@ logical_plan 02)--TableScan: test projection=[column1_utf8view, column2_utf8view] ## Ensure no casts for CONTAINS -## TODO file ticket +## TODO https://github.com/apache/datafusion/issues/11838 query TT EXPLAIN SELECT CONTAINS(column1_utf8view, 'foo') as c1, @@ -575,7 +574,7 @@ logical_plan 03)----TableScan: test projection=[column1_utf8view, column2_utf8view] ## Ensure no casts for ENDS_WITH -## TODO file ticket +## TODO https://github.com/apache/datafusion/issues/11852 query TT EXPLAIN SELECT ENDS_WITH(column1_utf8view, 'foo') as c1, @@ -589,7 +588,7 @@ logical_plan ## Ensure no casts for INITCAP -## TODO file ticket +## TODO https://github.com/apache/datafusion/issues/11853 query TT EXPLAIN SELECT INITCAP(column1_utf8view) as c @@ -600,7 +599,7 @@ logical_plan 02)--TableScan: test projection=[column1_utf8view] ## Ensure no casts for LEVENSHTEIN -## TODO file ticket +## TODO https://github.com/apache/datafusion/issues/11854 query TT EXPLAIN SELECT levenshtein(column1_utf8view, 'foo') as c1, @@ -613,7 +612,7 @@ logical_plan 03)----TableScan: test projection=[column1_utf8view, column2_utf8view] ## Ensure no casts for LOWER -## TODO file ticket +## TODO https://github.com/apache/datafusion/issues/11855 query TT EXPLAIN SELECT LOWER(column1_utf8view) as c1 @@ -624,7 +623,7 @@ logical_plan 02)--TableScan: test projection=[column1_utf8view] ## Ensure no casts for LTRIM -## TODO file ticket +## TODO https://github.com/apache/datafusion/issues/11856 query TT EXPLAIN SELECT LTRIM(column1_utf8view) as c1 @@ -635,7 +634,7 @@ logical_plan 02)--TableScan: test projection=[column1_utf8view] ## Ensure no casts for LPAD -## TODO file ticket +## TODO https://github.com/apache/datafusion/issues/11857 query TT EXPLAIN SELECT LPAD(column1_utf8view, 12, ' ') as c1 @@ -647,7 +646,7 @@ logical_plan ## Ensure no casts for 
OCTET_LENGTH -## TODO file ticket +## TODO https://github.com/apache/datafusion/issues/11858 query TT EXPLAIN SELECT OCTET_LENGTH(column1_utf8view) as c1 @@ -822,8 +821,7 @@ logical_plan 02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1 03)----TableScan: test projection=[column1_utf8view] -## Ensure no casts for STARTS_WITH -## TODO file ticket +## Ensure no casts on columns for STARTS_WITH query TT EXPLAIN SELECT STARTS_WITH(column1_utf8view, 'foo') as c,