From 364f49c94e7558f68ed81e799bf75cffffcb3d13 Mon Sep 17 00:00:00 2001 From: Dejan Simic <10134699+simicd@users.noreply.github.com> Date: Mon, 22 Jan 2024 20:55:05 +0100 Subject: [PATCH 1/4] Port select queries - part II --- datafusion/core/tests/sql/select.rs | 49 --------------- datafusion/sqllogictest/test_files/select.slt | 59 ++++++++++++++++++- 2 files changed, 56 insertions(+), 52 deletions(-) diff --git a/datafusion/core/tests/sql/select.rs b/datafusion/core/tests/sql/select.rs index 4a782e54b070c..ed065e2f69ebe 100644 --- a/datafusion/core/tests/sql/select.rs +++ b/datafusion/core/tests/sql/select.rs @@ -612,52 +612,3 @@ async fn parallel_query_with_filter() -> Result<()> { Ok(()) } - -#[tokio::test] -async fn boolean_literal() -> Result<()> { - let results = - execute_with_partition("SELECT c1, c3 FROM test WHERE c1 > 2 AND c3 = true", 4) - .await?; - - let expected = [ - "+----+------+", - "| c1 | c3 |", - "+----+------+", - "| 3 | true |", - "| 3 | true |", - "| 3 | true |", - "| 3 | true |", - "| 3 | true |", - "+----+------+", - ]; - assert_batches_sorted_eq!(expected, &results); - - Ok(()) -} - -#[tokio::test] -async fn unprojected_filter() { - let config = SessionConfig::new(); - let ctx = SessionContext::new_with_config(config); - let df = ctx.read_table(table_with_sequence(1, 3).unwrap()).unwrap(); - - let df = df - .filter(col("i").gt(lit(2))) - .unwrap() - .select(vec![col("i") + col("i")]) - .unwrap(); - - let plan = df.clone().into_optimized_plan().unwrap(); - println!("{}", plan.display_indent()); - - let results = df.collect().await.unwrap(); - - let expected = [ - "+-----------------------+", - "| ?table?.i + ?table?.i |", - "+-----------------------+", - "| 6 |", - "+-----------------------+", - ]; - assert_batches_sorted_eq!(expected, &results); -} diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index 9ffddc6e2d465..180d49a872a4b 100644 --- 
a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -506,6 +506,10 @@ ProjectionExec: expr=[c1@0 >= 2 AND c1@0 <= 3 as select_between_data.c1 BETWEEN # TODO: sort_on_window_null_string +# Drop table +statement ok +DROP TABLE test; + # # Create time tables with different precisions but the same logical values @@ -669,7 +673,11 @@ c 3 1 b 0 29 c 3 1 a 0 -85 c 3 1 b 4 -82 -# TODO: test_prepare_statement +# TODO: Test prepare statement +# Dependency on https://github.com/apache/arrow-datafusion/issues/4539#issuecomment-1755430857 + +# TODO: Test named query parameters +# Dependency on https://github.com/apache/arrow-datafusion/issues/4539#issuecomment-1755430857 # TODO: parallel_query_with_filter @@ -705,10 +713,55 @@ CREATE TABLE empty_table; statement ok SELECT * FROM empty_table +###### +# Boolean literal +###### -# TODO: boolean_literal +statement ok +CREATE TABLE test AS +SELECT + arrow_cast(column1, 'UInt32') as c1, + arrow_cast(column2, 'UInt64') as c2, + arrow_cast(column3, 'Boolean') as c3 +FROM ( + VALUES + (1, 1, FALSE), + (1, 2, TRUE), + (1, 3, FALSE), + (1, 4, TRUE), + (2, 1, FALSE), + (2, 2, TRUE), + (2, 3, FALSE), + (2, 4, TRUE), + (3, 1, FALSE), + (3, 2, TRUE), + (3, 3, FALSE), + (3, 4, TRUE) +); + +query IB +SELECT c1, c3 FROM test WHERE c1 > 2 AND c3 = true +---- +3 true +3 true + +# Drop table +statement ok +DROP TABLE test; -# TODO: unprojected_filter + +###### +# Unprojected filter +###### + +statement ok +CREATE TABLE test(i INT) AS +VALUES (1), (2), (3); + +query I +SELECT i + i FROM test WHERE i > 2; +---- +6 # case sensitive in default dialect From 668d80210a636778bc15b97736ddb6a1fd0ef03d Mon Sep 17 00:00:00 2001 From: Dejan Simic <10134699+simicd@users.noreply.github.com> Date: Mon, 22 Jan 2024 21:05:46 +0100 Subject: [PATCH 2/4] Remove unused code --- datafusion/core/tests/sql/mod.rs | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git 
a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index 40ae75cd7f802..cd06f8eda5797 100644 --- a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -22,7 +22,6 @@ use arrow::{ util::display::array_value_to_string, }; -use datafusion::datasource::TableProvider; use datafusion::error::Result; use datafusion::logical_expr::{Aggregate, LogicalPlan, TableScan}; use datafusion::physical_plan::metrics::MetricValue; @@ -31,7 +30,7 @@ use datafusion::physical_plan::ExecutionPlanVisitor; use datafusion::prelude::*; use datafusion::test_util; use datafusion::{assert_batches_eq, assert_batches_sorted_eq}; -use datafusion::{datasource::MemTable, physical_plan::collect}; +use datafusion::physical_plan::collect; use datafusion::{execution::context::SessionContext, physical_plan::displayable}; use datafusion_common::{assert_contains, assert_not_contains}; use object_store::path::Path; @@ -322,21 +321,6 @@ async fn register_alltypes_parquet(ctx: &SessionContext) { .unwrap(); } -/// Return a new table provider that has a single Int32 column with -/// values between `seq_start` and `seq_end` -pub fn table_with_sequence( - seq_start: i32, - seq_end: i32, -) -> Result<Arc<dyn TableProvider>> { - let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); - let arr = Arc::new(Int32Array::from((seq_start..=seq_end).collect::<Vec<_>>())); - let partitions = vec![vec![RecordBatch::try_new( - schema.clone(), - vec![arr as ArrayRef], - )?]]; - Ok(Arc::new(MemTable::try_new(schema, partitions)?)) -} - pub struct ExplainNormalizer { replacements: Vec<(String, String)>, } From fddd27f73f6978c8daf24f2f75097636e03ce734 Mon Sep 17 00:00:00 2001 From: Dejan Simic <10134699+simicd@users.noreply.github.com> Date: Mon, 22 Jan 2024 23:20:12 +0100 Subject: [PATCH 3/4] Fix cargo fmt error and refactor tests --- datafusion/core/tests/sql/explain_analyze.rs | 1 + datafusion/core/tests/sql/mod.rs | 1 - datafusion/core/tests/sql/select.rs | 41 ------ 
datafusion/sqllogictest/test_files/select.slt | 136 ++++++++++++++---- 4 files changed, 112 insertions(+), 67 deletions(-) diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index a1d9a02cf6b14..6283d77e16eb8 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -18,6 +18,7 @@ use super::*; use datafusion::config::ConfigOptions; +use datafusion::physical_plan::collect; use datafusion::physical_plan::display::DisplayableExecutionPlan; use datafusion::physical_plan::metrics::Timestamp; diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index cd06f8eda5797..3a72e0d561c56 100644 --- a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -30,7 +30,6 @@ use datafusion::physical_plan::ExecutionPlanVisitor; use datafusion::prelude::*; use datafusion::test_util; use datafusion::{assert_batches_eq, assert_batches_sorted_eq}; -use datafusion::physical_plan::collect; use datafusion::{execution::context::SessionContext, physical_plan::displayable}; use datafusion_common::{assert_contains, assert_not_contains}; use object_store::path::Path; diff --git a/datafusion/core/tests/sql/select.rs b/datafusion/core/tests/sql/select.rs index ed065e2f69ebe..d796a20b03b5c 100644 --- a/datafusion/core/tests/sql/select.rs +++ b/datafusion/core/tests/sql/select.rs @@ -571,44 +571,3 @@ async fn test_named_query_parameters() -> Result<()> { assert_batches_sorted_eq!(expected, &results); Ok(()) } - -#[tokio::test] -async fn parallel_query_with_filter() -> Result<()> { - let tmp_dir = TempDir::new()?; - let partition_count = 4; - let ctx = create_ctx_with_partition(&tmp_dir, partition_count).await?; - - let dataframe = ctx - .sql("SELECT c1, c2 FROM test WHERE c1 > 0 AND c1 < 3") - .await?; - let results = dataframe.collect().await.unwrap(); - let expected = vec![ - "+----+----+", - "| c1 | c2 |", - "+----+----+", - "| 1 | 1 |", - "| 1 
| 10 |", - "| 1 | 2 |", - "| 1 | 3 |", - "| 1 | 4 |", - "| 1 | 5 |", - "| 1 | 6 |", - "| 1 | 7 |", - "| 1 | 8 |", - "| 1 | 9 |", - "| 2 | 1 |", - "| 2 | 10 |", - "| 2 | 2 |", - "| 2 | 3 |", - "| 2 | 4 |", - "| 2 | 5 |", - "| 2 | 6 |", - "| 2 | 7 |", - "| 2 | 8 |", - "| 2 | 9 |", - "+----+----+", - ]; - assert_batches_sorted_eq!(expected, &results); - - Ok(()) -} diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index 180d49a872a4b..b2fde41932b11 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -116,7 +116,7 @@ VALUES (1,2,3,4,5,6,7,8,9,10,11,12,13,NULL,'F',3.5) # Test non-literal expressions in VALUES query II -VALUES (1, CASE WHEN RANDOM() > 0.5 THEN 1 ELSE 1 END), +VALUES (1, CASE WHEN RANDOM() > 0.5 THEN 1 ELSE 1 END), (2, CASE WHEN RANDOM() > 0.5 THEN 2 ELSE 2 END); ---- 1 1 @@ -506,10 +506,6 @@ ProjectionExec: expr=[c1@0 >= 2 AND c1@0 <= 3 as select_between_data.c1 BETWEEN # TODO: sort_on_window_null_string -# Drop table -statement ok -DROP TABLE test; - # # Create time tables with different precisions but the same logical values @@ -679,7 +675,98 @@ c 3 1 b 4 -82 # TODO: Test named query parameters # Dependency on https://github.com/apache/arrow-datafusion/issues/4539#issuecomment-1755430857 -# TODO: parallel_query_with_filter + +###### +# Parallel query with filter +###### + +# Set up csv files +statement ok +CREATE TABLE src_table_base ( + c2 INT, + c3 BOOLEAN +) AS VALUES +(1, FALSE), +(2, TRUE), +(3, FALSE), +(4, TRUE), +(5, FALSE), +(6, TRUE), +(7, FALSE), +(8, TRUE), +(9, FALSE), +(10, TRUE); + +query IIB +COPY ( + SELECT 0 as c1, c2, c3 FROM src_table_base +) TO 'test_files/scratch/select/csv_partitions/partition-1.csv' +(FORMAT CSV, SINGLE_FILE_OUTPUT true); +---- +10 + +query IIB +COPY ( + SELECT 1 as c1, c2, c3 FROM src_table_base +) TO 'test_files/scratch/select/csv_partitions/partition-2.csv' +(FORMAT CSV, SINGLE_FILE_OUTPUT 
true); +---- +10 + +query IIB +COPY ( + SELECT 2 as c1, c2, c3 FROM src_table_base +) TO 'test_files/scratch/select/csv_partitions/partition-3.csv' +(FORMAT CSV, SINGLE_FILE_OUTPUT true); +---- +10 + +query IIB +COPY ( + SELECT 3 as c1, c2, c3 FROM src_table_base +) TO 'test_files/scratch/select/csv_partitions/partition-4.csv' +(FORMAT CSV, SINGLE_FILE_OUTPUT true); +---- +10 + +# Set up table +statement ok +CREATE EXTERNAL TABLE test ( + c1 INT, + c2 INT, + c3 BOOLEAN, +) +STORED AS CSV +WITH HEADER ROW +LOCATION 'test_files/scratch/select/csv_partitions' + +query II +SELECT c1, c2 FROM test WHERE c1 > 0 AND c1 < 3; +---- +2 1 +2 2 +2 3 +2 4 +2 5 +2 6 +2 7 +2 8 +2 9 +2 10 +1 1 +1 2 +1 3 +1 4 +1 5 +1 6 +1 7 +1 8 +1 9 +1 10 + +# Drop table +statement ok +DROP TABLE test; # query with filter string type coercion @@ -717,33 +804,32 @@ SELECT * FROM empty_table # Boolean literal ###### +statement ok +CREATE EXTERNAL TABLE test_base ( + c1 INT, + c2 INT, + c3 BOOLEAN, +) +STORED AS CSV +WITH HEADER ROW +LOCATION 'test_files/scratch/select/csv_partitions' + statement ok CREATE TABLE test AS SELECT - arrow_cast(column1, 'UInt32') as c1, - arrow_cast(column2, 'UInt64') as c2, - arrow_cast(column3, 'Boolean') as c3 -FROM ( - VALUES - (1, 1, FALSE), - (1, 2, TRUE), - (1, 3, FALSE), - (1, 4, TRUE), - (2, 1, FALSE), - (2, 2, TRUE), - (2, 3, FALSE), - (2, 4, TRUE), - (3, 1, FALSE), - (3, 2, TRUE), - (3, 3, FALSE), - (3, 4, TRUE) -); + arrow_cast(c1, 'UInt32') as c1, + arrow_cast(c2, 'UInt64') as c2, + arrow_cast(c3, 'Boolean') as c3 +FROM test_base; query IB SELECT c1, c3 FROM test WHERE c1 > 2 AND c3 = true ---- 3 true 3 true +3 true +3 true +3 true # Drop table statement ok @@ -1174,7 +1260,7 @@ query II SELECT CASE WHEN B.x > 0 THEN A.x / B.x ELSE 0 END AS value1, CASE WHEN B.x > 0 AND B.y > 0 THEN A.x / B.x ELSE 0 END AS value3 -FROM t AS A, (SELECT * FROM t WHERE x = 0) AS B; +FROM t AS A, (SELECT * FROM t WHERE x = 0) AS B; ---- 0 0 0 0 From 
e05eb94f4d1d7a0e8860350d3f5ec431d11b8606 Mon Sep 17 00:00:00 2001 From: Dejan Simic <10134699+simicd@users.noreply.github.com> Date: Tue, 23 Jan 2024 17:30:44 +0100 Subject: [PATCH 4/4] Make order deterministic --- datafusion/sqllogictest/test_files/select.slt | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index b2fde41932b11..1b698e1e30d34 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -741,18 +741,8 @@ WITH HEADER ROW LOCATION 'test_files/scratch/select/csv_partitions' query II -SELECT c1, c2 FROM test WHERE c1 > 0 AND c1 < 3; +SELECT c1, c2 FROM test WHERE c1 > 0 AND c1 < 3 ORDER BY c1, c2; ---- -2 1 -2 2 -2 3 -2 4 -2 5 -2 6 -2 7 -2 8 -2 9 -2 10 1 1 1 2 1 3 @@ -763,6 +753,16 @@ SELECT c1, c2 FROM test WHERE c1 > 0 AND c1 < 3; 1 8 1 9 1 10 +2 1 +2 2 +2 3 +2 4 +2 5 +2 6 +2 7 +2 8 +2 9 +2 10 # Drop table statement ok