diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md
index 52702361e623a..f922cb766c222 100644
--- a/datafusion-examples/README.md
+++ b/datafusion-examples/README.md
@@ -36,9 +36,9 @@ cd datafusion
 # Download test data
 git submodule update --init
 
-# Run the `csv_sql` example:
+# Run the `dataframe` example:
 # ... use the equivalent for other examples
-cargo run --example csv_sql
+cargo run --example dataframe
 ```
 
 ## Single Process
 
@@ -47,10 +47,8 @@ cargo run --example csv_sql
 - [`advanced_udf.rs`](examples/advanced_udf.rs): Define and invoke a more complicated User Defined Scalar Function (UDF)
 - [`advanced_udwf.rs`](examples/advanced_udwf.rs): Define and invoke a more complicated User Defined Window Function (UDWF)
 - [`advanced_parquet_index.rs`](examples/advanced_parquet_index.rs): Creates a detailed secondary index that covers the contents of several parquet files
-- [`avro_sql.rs`](examples/avro_sql.rs): Build and run a query plan from a SQL statement against a local AVRO file
 - [`catalog.rs`](examples/catalog.rs): Register the table into a custom catalog
 - [`composed_extension_codec`](examples/composed_extension_codec.rs): Example of using multiple extension codecs for serialization / deserialization
-- [`csv_sql.rs`](examples/csv_sql.rs): Build and run a query plan from a SQL statement against a local CSV file
 - [`csv_sql_streaming.rs`](examples/csv_sql_streaming.rs): Build and run a streaming query plan from a SQL statement against a local CSV file
 - [`custom_datasource.rs`](examples/custom_datasource.rs): Run queries against a custom datasource (TableProvider)
 - [`dataframe-to-s3.rs`](examples/external_dependency/dataframe-to-s3.rs): Run a query using a DataFrame against a parquet file from s3 and write back to s3
@@ -66,12 +64,11 @@ cargo run --example csv_sql
 - [`memtable.rs`](examples/memtable.rs): Create and query data in memory using SQL and `RecordBatch`es
 - [`optimizer_rule.rs`](examples/optimizer_rule.rs): Use a custom OptimizerRule to replace certain predicates
 - [`parquet_index.rs`](examples/parquet_index.rs): Create a secondary index over several parquet files and use it to speed up queries
-- [`parquet_sql.rs`](examples/parquet_sql.rs): Build and run a query plan from a SQL statement against a local Parquet file
 - [`parquet_sql_multiple_files.rs`](examples/parquet_sql_multiple_files.rs): Build and run a query plan from a SQL statement against multiple local Parquet files
 - [`parquet_exec_visitor.rs`](examples/parquet_exec_visitor.rs): Extract statistics by visiting an ExecutionPlan after execution
 - [`parse_sql_expr.rs`](examples/parse_sql_expr.rs): Parse SQL text into DataFusion `Expr`.
 - [`plan_to_sql.rs`](examples/plan_to_sql.rs): Generate SQL from DataFusion `Expr` and `LogicalPlan`
-- [`pruning.rs`](examples/parquet_sql.rs): Use pruning to rule out files based on statistics
+- [`pruning.rs`](examples/pruning.rs): Use pruning to rule out files based on statistics
 - [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3
 - [`query-http-csv.rs`](examples/query-http-csv.rs): Configure `object_store` and run a query against files via HTTP
 - [`regexp.rs`](examples/regexp.rs): Examples of using regular expression functions
diff --git a/datafusion-examples/examples/avro_sql.rs b/datafusion-examples/examples/avro_sql.rs
deleted file mode 100644
index ac1053aa1881e..0000000000000
--- a/datafusion-examples/examples/avro_sql.rs
+++ /dev/null
@@ -1,51 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use datafusion::arrow::util::pretty;
-
-use datafusion::error::Result;
-use datafusion::prelude::*;
-
-/// This example demonstrates executing a simple query against an Arrow data source (Avro) and
-/// fetching results
-#[tokio::main]
-async fn main() -> Result<()> {
-    // create local execution context
-    let ctx = SessionContext::new();
-
-    let testdata = datafusion::test_util::arrow_test_data();
-
-    // register avro file with the execution context
-    let avro_file = &format!("{testdata}/avro/alltypes_plain.avro");
-    ctx.register_avro("alltypes_plain", avro_file, AvroReadOptions::default())
-        .await?;
-
-    // execute the query
-    let df = ctx
-        .sql(
-            "SELECT int_col, double_col, CAST(date_string_col as VARCHAR) \
-            FROM alltypes_plain \
-            WHERE id > 1 AND tinyint_col < double_col",
-        )
-        .await?;
-    let results = df.collect().await?;
-
-    // print the results
-    pretty::print_batches(&results)?;
-
-    Ok(())
-}
diff --git a/datafusion-examples/examples/csv_sql.rs b/datafusion-examples/examples/csv_sql.rs
deleted file mode 100644
index 851fdcb626d2f..0000000000000
--- a/datafusion-examples/examples/csv_sql.rs
+++ /dev/null
@@ -1,70 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use datafusion::datasource::file_format::file_compression_type::FileCompressionType;
-use datafusion::error::Result;
-use datafusion::prelude::*;
-
-/// This example demonstrates executing a simple query against an Arrow data source (CSV) and
-/// fetching results
-#[tokio::main]
-async fn main() -> Result<()> {
-    // create local execution context
-    let ctx = SessionContext::new();
-
-    let testdata = datafusion::test_util::arrow_test_data();
-
-    // register csv file with the execution context
-    ctx.register_csv(
-        "aggregate_test_100",
-        &format!("{testdata}/csv/aggregate_test_100.csv"),
-        CsvReadOptions::new(),
-    )
-    .await?;
-
-    // execute the query
-    let df = ctx
-        .sql(
-            "SELECT c1, MIN(c12), MAX(c12) \
-            FROM aggregate_test_100 \
-            WHERE c11 > 0.1 AND c11 < 0.9 \
-            GROUP BY c1",
-        )
-        .await?;
-
-    // print the results
-    df.show().await?;
-
-    // query compressed CSV with specific options
-    let csv_options = CsvReadOptions::default()
-        .has_header(true)
-        .file_compression_type(FileCompressionType::GZIP)
-        .file_extension("csv.gz");
-    let df = ctx
-        .read_csv(
-            &format!("{testdata}/csv/aggregate_test_100.csv.gz"),
-            csv_options,
-        )
-        .await?;
-    let df = df
-        .filter(col("c1").eq(lit("a")))?
-        .select_columns(&["c2", "c3"])?;
-
-    df.show().await?;
-
-    Ok(())
-}
diff --git a/datafusion-examples/examples/parquet_sql.rs b/datafusion-examples/examples/parquet_sql.rs
deleted file mode 100644
index fb438a7832cbd..0000000000000
--- a/datafusion-examples/examples/parquet_sql.rs
+++ /dev/null
@@ -1,51 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use datafusion::error::Result;
-use datafusion::prelude::*;
-
-/// This example demonstrates executing a simple query against an Arrow data source (Parquet) and
-/// fetching results
-#[tokio::main]
-async fn main() -> Result<()> {
-    // create local session context
-    let ctx = SessionContext::new();
-
-    let testdata = datafusion::test_util::parquet_test_data();
-
-    // register parquet file with the execution context
-    ctx.register_parquet(
-        "alltypes_plain",
-        &format!("{testdata}/alltypes_plain.parquet"),
-        ParquetReadOptions::default(),
-    )
-    .await?;
-
-    // execute the query
-    let df = ctx
-        .sql(
-            "SELECT int_col, double_col, CAST(date_string_col as VARCHAR) \
-            FROM alltypes_plain \
-            WHERE id > 1 AND tinyint_col < double_col",
-        )
-        .await?;
-
-    // print the results
-    df.show().await?;
-
-    Ok(())
-}
diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs
index d81efaf68ca36..9e38a810a549e 100644
--- a/datafusion/core/src/lib.rs
+++ b/datafusion/core/src/lib.rs
@@ -592,3 +592,9 @@ doc_comment::doctest!(
     "../../../docs/source/user-guide/example-usage.md",
     user_guid_example_tests
 );
+
+#[cfg(doctest)]
+doc_comment::doctest!(
+    "../../../docs/source/library-user-guide/using-the-sql-api.md",
+    library_user_guide_example_usage
+);
diff --git a/docs/source/library-user-guide/using-the-sql-api.md b/docs/source/library-user-guide/using-the-sql-api.md
index f4e85ee4e3a92..1a25f078cc2e2 100644
--- a/docs/source/library-user-guide/using-the-sql-api.md
+++ b/docs/source/library-user-guide/using-the-sql-api.md
@@ -19,4 +19,199 @@
 
 # Using the SQL API
 
+DataFusion has a full SQL API that allows you to interact with DataFusion using
+SQL query strings. The simplest way to use the SQL API is via the
+[`SessionContext`] struct, which provides the highest-level API for executing
+SQL queries.
+
+To use SQL, you first register your data as a table and then run queries using
+the [`SessionContext::sql`] method. For lower-level control, such as preventing
+DDL, you can use [`SessionContext::sql_with_options`] or the [`SessionState`]
+APIs.
+
+[`sessioncontext`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html
+[`sessioncontext::sql`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.sql
+[`sessioncontext::sql_with_options`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.sql_with_options
+[`sessionstate`]: https://docs.rs/datafusion/latest/datafusion/execution/session_state/struct.SessionState.html
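+
+For example, you can reject DDL statements by passing `SQLOptions` to
+[`SessionContext::sql_with_options`]. The following is a minimal sketch of this
+pattern (the table name `forbidden` is arbitrary):
+
+```rust
+use datafusion::error::Result;
+use datafusion::execution::context::SQLOptions;
+use datafusion::prelude::*;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let ctx = SessionContext::new();
+    // allow queries, but not DDL such as CREATE TABLE
+    let options = SQLOptions::new().with_allow_ddl(false);
+    let result = ctx
+        .sql_with_options("CREATE TABLE forbidden (a INT)", options)
+        .await;
+    // the statement is rejected before it is executed
+    assert!(result.is_err());
+    Ok(())
+}
+```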
+
+## Registering Data Sources using `SessionContext::register*`
+
+The `SessionContext::register*` methods tell DataFusion the name of
+the source and how to read data. Once registered, you can execute SQL queries
+using the `SessionContext::sql` method, referring to your data source as a table.
+
+### Read a CSV File
+
+```rust
+use datafusion::error::Result;
+use datafusion::prelude::*;
+use arrow::record_batch::RecordBatch;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let ctx = SessionContext::new();
+    // register the "example" table
+    ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()).await?;
+    // create a plan to run a SQL query
+    let df = ctx.sql("SELECT a, MIN(b) FROM example WHERE a <= b GROUP BY a LIMIT 100").await?;
+    // execute the plan and collect the results as a Vec<RecordBatch>
+    let results: Vec<RecordBatch> = df.collect().await?;
+    // use the assert_batches_eq macro to compare the results with expected output
+    datafusion::assert_batches_eq!(vec![
+            "+---+----------------+",
+            "| a | MIN(example.b) |",
+            "+---+----------------+",
+            "| 1 | 2              |",
+            "+---+----------------+",
+        ],
+        &results
+    );
+    Ok(())
+}
+```
+
+### Read an Apache Parquet file
+
+As with CSV, you can register a Parquet file as a table using the
+`register_parquet` method.
+
+```rust
+use datafusion::error::Result;
+use datafusion::prelude::*;
+#[tokio::main]
+async fn main() -> Result<()> {
+    // create local session context
+    let ctx = SessionContext::new();
+    let testdata = datafusion::test_util::parquet_test_data();
+
+    // register parquet file with the execution context
+    ctx.register_parquet(
+        "alltypes_plain",
+        &format!("{testdata}/alltypes_plain.parquet"),
+        ParquetReadOptions::default(),
+    )
+    .await?;
+
+    // execute the query
+    let df = ctx.sql(
+        "SELECT int_col, double_col, CAST(date_string_col as VARCHAR) \
+        FROM alltypes_plain \
+        WHERE id > 1 AND tinyint_col < double_col",
+    ).await?;
+
+    // execute the plan, and compare to the expected results
+    let results = df.collect().await?;
+    datafusion::assert_batches_eq!(vec![
+            "+---------+------------+--------------------------------+",
+            "| int_col | double_col | alltypes_plain.date_string_col |",
+            "+---------+------------+--------------------------------+",
+            "| 1       | 10.1       | 03/01/09                       |",
+            "| 1       | 10.1       | 04/01/09                       |",
+            "| 1       | 10.1       | 02/01/09                       |",
+            "+---------+------------+--------------------------------+",
+        ],
+        &results
+    );
+    Ok(())
+}
+```
+
+### Read an Apache Avro file
+
+DataFusion can also read Avro files using the `register_avro` method.
+
+```rust
+use datafusion::error::Result;
+use datafusion::prelude::*;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let ctx = SessionContext::new();
+    // find the path to the avro test files
+    let testdata = datafusion::test_util::arrow_test_data();
+    // register avro file with the execution context
+    let avro_file = &format!("{testdata}/avro/alltypes_plain.avro");
+    ctx.register_avro("alltypes_plain", avro_file, AvroReadOptions::default()).await?;
+
+    // execute the query
+    let df = ctx.sql(
+        "SELECT int_col, double_col, CAST(date_string_col as VARCHAR) \
+        FROM alltypes_plain \
+        WHERE id > 1 AND tinyint_col < double_col"
+    ).await?;
+
+    // execute the plan, and compare to the expected results
+    let results = df.collect().await?;
+    datafusion::assert_batches_eq!(vec![
+            "+---------+------------+--------------------------------+",
+            "| int_col | double_col | alltypes_plain.date_string_col |",
+            "+---------+------------+--------------------------------+",
+            "| 1       | 10.1       | 03/01/09                       |",
+            "| 1       | 10.1       | 04/01/09                       |",
+            "| 1       | 10.1       | 02/01/09                       |",
+            "+---------+------------+--------------------------------+",
+        ],
+        &results
+    );
+    Ok(())
+}
+```
+
+## Reading Multiple Files as a table
+
+It is also possible to read multiple files as a single table. This is done
+with the `ListingTable` provider, which takes a list of file paths or
+directories and reads them as a single table, matching schemas as appropriate.
+A full example is coming soon; until then, the sketch below shows one way to
+do this via `SessionContext::register_listing_table` (the table name and the
+directory path are hypothetical, so the example is marked `no_run`):
+
+```rust,no_run
+use datafusion::datasource::file_format::parquet::ParquetFormat;
+use datafusion::datasource::listing::ListingOptions;
+use datafusion::error::Result;
+use datafusion::prelude::*;
+use std::sync::Arc;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let ctx = SessionContext::new();
+
+    // read all files with the `.parquet` extension as Parquet
+    let listing_options = ListingOptions::new(Arc::new(ParquetFormat::default()))
+        .with_file_extension(".parquet");
+
+    // register every matching file under the (hypothetical) directory as one table
+    ctx.register_listing_table(
+        "my_table",
+        "/path/to/parquet_files/",
+        listing_options,
+        None, // infer the schema from the files
+        None, // no SQL definition for this table
+    )
+    .await?;
+
+    let df = ctx.sql("SELECT count(*) FROM my_table").await?;
+    df.show().await?;
+    Ok(())
+}
+```
+
+## Using `CREATE EXTERNAL TABLE` to register data sources via SQL
+
+You can also register files using SQL with the [`CREATE EXTERNAL TABLE`]
+statement.
+
+[`create external table`]: ../user-guide/sql/ddl.md#create-external-table
+
+```rust
+use datafusion::error::Result;
+use datafusion::prelude::*;
+#[tokio::main]
+async fn main() -> Result<()> {
+    // create local session context
+    let ctx = SessionContext::new();
+    let testdata = datafusion::test_util::parquet_test_data();
+
+    // register parquet file using SQL
+    let ddl = format!(
+        "CREATE EXTERNAL TABLE alltypes_plain \
+        STORED AS PARQUET LOCATION '{testdata}/alltypes_plain.parquet'"
+    );
+    ctx.sql(&ddl).await?;
+
+    // execute the query referring to the alltypes_plain table we just registered
+    let df = ctx.sql(
+        "SELECT int_col, double_col, CAST(date_string_col as VARCHAR) \
+        FROM alltypes_plain \
+        WHERE id > 1 AND tinyint_col < double_col",
+    ).await?;
+
+    // execute the plan, and compare to the expected results
+    let results = df.collect().await?;
+    datafusion::assert_batches_eq!(vec![
+            "+---------+------------+--------------------------------+",
+            "| int_col | double_col | alltypes_plain.date_string_col |",
+            "+---------+------------+--------------------------------+",
+            "| 1       | 10.1       | 03/01/09                       |",
+            "| 1       | 10.1       | 04/01/09                       |",
+            "| 1       | 10.1       | 02/01/09                       |",
+            "+---------+------------+--------------------------------+",
+        ],
+        &results
+    );
+    Ok(())
+}
+```
diff --git a/docs/source/user-guide/example-usage.md b/docs/source/user-guide/example-usage.md
index 71a614313e8ae..62f09a6a9937c 100644
--- a/docs/source/user-guide/example-usage.md
+++ b/docs/source/user-guide/example-usage.md
@@ -56,7 +56,7 @@ datafusion = { git = "https://github.com/apache/datafusion", branch = "main", de
 More on [Cargo dependencies](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#specifying-dependencies)
 
-## Run a SQL query against data stored in a CSV:
+## Run a SQL query against data stored in a CSV
 
 ```rust
 use datafusion::prelude::*;
 
 #[tokio::main]
@@ -76,7 +76,10 @@ async fn main() -> datafusion::error::Result<()> {
 }
 ```
 
-## Use the DataFrame API to process data stored in a CSV:
+See [the SQL API](../library-user-guide/using-the-sql-api.md) section of the
+library guide for more information on the SQL API.
+
+## Use the DataFrame API to process data stored in a CSV
 
 ```rust
 use datafusion::prelude::*;