From f517c69ca240b40b520773a2e0e573e88f9304a9 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 21 Jan 2023 13:45:07 -0700 Subject: [PATCH 1/2] DataFusion 16.0.0 documentation --- .../capitalized_example.csv | 5 + .../example.csv | 2 + .../contributor-guide/communication.md.txt | 74 ++++ .../_sources/contributor-guide/index.md.txt | 319 ++++++++++++++++ .../quarterly_roadmap.md.txt | 90 +++++ .../_sources/contributor-guide/roadmap.md.txt | 118 ++++++ .../specification/index.rst.txt | 25 ++ .../specification/invariants.md.txt | 327 ++++++++++++++++ .../output-field-name-semantic.md.txt | 212 +++++++++++ datafusion/_sources/index.rst.txt | 54 +++ datafusion/_sources/user-guide/cli.md.txt | 351 ++++++++++++++++++ datafusion/_sources/user-guide/configs.md.txt | 69 ++++ .../_sources/user-guide/dataframe.md.txt | 105 ++++++ .../_sources/user-guide/example-usage.md.txt | 140 +++++++ .../_sources/user-guide/expressions.md.txt | 211 +++++++++++ datafusion/_sources/user-guide/faq.md.txt | 31 ++ .../_sources/user-guide/introduction.md.txt | 43 +++ datafusion/_sources/user-guide/library.md.txt | 127 +++++++ .../user-guide/sql/aggregate_functions.md.txt | 68 ++++ .../_sources/user-guide/sql/data_types.md.txt | 90 +++++ datafusion/_sources/user-guide/sql/ddl.md.txt | 154 ++++++++ .../_sources/user-guide/sql/explain.md.txt | 71 ++++ .../_sources/user-guide/sql/index.rst.txt | 32 ++ .../user-guide/sql/information_schema.md.txt | 72 ++++ .../user-guide/sql/scalar_functions.md.txt | 297 +++++++++++++++ .../_sources/user-guide/sql/select.md.txt | 226 +++++++++++ .../_sources/user-guide/sql/sql_status.md.txt | 135 +++++++ .../_sources/user-guide/sql/subqueries.md.txt | 98 +++++ datafusion/contributor-guide/roadmap.html | 6 +- datafusion/objects.inv | Bin 5023 -> 5049 bytes datafusion/searchindex.js | 2 +- datafusion/user-guide/cli.html | 60 ++- datafusion/user-guide/configs.html | 152 ++++---- datafusion/user-guide/dataframe.html | 2 +- datafusion/user-guide/example-usage.html | 12 
+- 35 files changed, 3684 insertions(+), 96 deletions(-) create mode 100644 datafusion/_downloads/3cce4d737d8c5814f5b50d859d21ba53/capitalized_example.csv create mode 100644 datafusion/_downloads/9f6fbc67bd5c63cb1fd7ba4efdf82d7a/example.csv create mode 100644 datafusion/_sources/contributor-guide/communication.md.txt create mode 100644 datafusion/_sources/contributor-guide/index.md.txt create mode 100644 datafusion/_sources/contributor-guide/quarterly_roadmap.md.txt create mode 100644 datafusion/_sources/contributor-guide/roadmap.md.txt create mode 100644 datafusion/_sources/contributor-guide/specification/index.rst.txt create mode 100644 datafusion/_sources/contributor-guide/specification/invariants.md.txt create mode 100644 datafusion/_sources/contributor-guide/specification/output-field-name-semantic.md.txt create mode 100644 datafusion/_sources/index.rst.txt create mode 100644 datafusion/_sources/user-guide/cli.md.txt create mode 100644 datafusion/_sources/user-guide/configs.md.txt create mode 100644 datafusion/_sources/user-guide/dataframe.md.txt create mode 100644 datafusion/_sources/user-guide/example-usage.md.txt create mode 100644 datafusion/_sources/user-guide/expressions.md.txt create mode 100644 datafusion/_sources/user-guide/faq.md.txt create mode 100644 datafusion/_sources/user-guide/introduction.md.txt create mode 100644 datafusion/_sources/user-guide/library.md.txt create mode 100644 datafusion/_sources/user-guide/sql/aggregate_functions.md.txt create mode 100644 datafusion/_sources/user-guide/sql/data_types.md.txt create mode 100644 datafusion/_sources/user-guide/sql/ddl.md.txt create mode 100644 datafusion/_sources/user-guide/sql/explain.md.txt create mode 100644 datafusion/_sources/user-guide/sql/index.rst.txt create mode 100644 datafusion/_sources/user-guide/sql/information_schema.md.txt create mode 100644 datafusion/_sources/user-guide/sql/scalar_functions.md.txt create mode 100644 datafusion/_sources/user-guide/sql/select.md.txt create mode 
100644 datafusion/_sources/user-guide/sql/sql_status.md.txt create mode 100644 datafusion/_sources/user-guide/sql/subqueries.md.txt diff --git a/datafusion/_downloads/3cce4d737d8c5814f5b50d859d21ba53/capitalized_example.csv b/datafusion/_downloads/3cce4d737d8c5814f5b50d859d21ba53/capitalized_example.csv new file mode 100644 index 000000000000..dbc8f5c5a0a6 --- /dev/null +++ b/datafusion/_downloads/3cce4d737d8c5814f5b50d859d21ba53/capitalized_example.csv @@ -0,0 +1,5 @@ +A,b,c +1,2,3 +1,10,5 +2,5,6 +2,1,4 \ No newline at end of file diff --git a/datafusion/_downloads/9f6fbc67bd5c63cb1fd7ba4efdf82d7a/example.csv b/datafusion/_downloads/9f6fbc67bd5c63cb1fd7ba4efdf82d7a/example.csv new file mode 100644 index 000000000000..0eadb69396b3 --- /dev/null +++ b/datafusion/_downloads/9f6fbc67bd5c63cb1fd7ba4efdf82d7a/example.csv @@ -0,0 +1,2 @@ +a,b,c +1,2,3 \ No newline at end of file diff --git a/datafusion/_sources/contributor-guide/communication.md.txt b/datafusion/_sources/contributor-guide/communication.md.txt new file mode 100644 index 000000000000..11e0e4e0f0ea --- /dev/null +++ b/datafusion/_sources/contributor-guide/communication.md.txt @@ -0,0 +1,74 @@ + + +# Communication + +We welcome participation from everyone and encourage you to join us, ask +questions, and get involved. + +All participation in the Apache Arrow DataFusion project is governed by the +Apache Software Foundation's [code of +conduct](https://www.apache.org/foundation/policies/conduct.html). + +The vast majority of communication occurs in the open on our +[github repository](https://github.com/apache/arrow-datafusion). + +## Questions? + +### Mailing list + +We use arrow.apache.org's `dev@` mailing list for project management, release +coordination and design discussions +([subscribe](mailto:dev-subscribe@arrow.apache.org), +[unsubscribe](mailto:dev-unsubscribe@arrow.apache.org), +[archives](https://lists.apache.org/list.html?dev@arrow.apache.org)). 
+ +When emailing the dev list, please make sure to prefix the subject line with a +`[DataFusion]` tag, e.g. `"[DataFusion] New API for remote data sources"`, so +that the appropriate people in the Apache Arrow community notice the message. + +### Slack and Discord + +We use the official [ASF](https://s.apache.org/slack-invite) Slack workspace +for informal discussions and coordination. This is a great place to meet other +contributors and get guidance on where to contribute. Join us in the +`#arrow-rust` channel. + +We also have a backup Arrow Rust Discord +server ([invite link](https://discord.gg/Qw5gKqHxUM)) in case you are not able +to join the Slack workspace. If you need an invite to the Slack workspace, you +can also ask for one in our Discord server. + +### Sync up video calls + +We have biweekly sync calls every other Thursday at both 04:00 UTC +and 16:00 UTC (starting September 30, 2021), depending on whether there are +items on the agenda to discuss and someone is willing to host. + +Please see the [agenda](https://docs.google.com/document/d/1atCVnoff5SR4eM4Lwf2M1BBJTY6g3_HUNR6qswYJW_U/edit) +for the video call link, to add topics, and to see what others plan to discuss. + +The goals of these calls are: + +1. Help "put a face to the name" of some of the other contributors we are working with +2. Discuss / synchronize on the goals and major initiatives from different stakeholders to identify areas where more alignment is needed + +No decisions are made on the call and anything of substance will be discussed on the mailing list or in github issues / google docs. + +We will send a summary of all sync ups to the dev@arrow.apache.org mailing list.
diff --git a/datafusion/_sources/contributor-guide/index.md.txt b/datafusion/_sources/contributor-guide/index.md.txt new file mode 100644 index 000000000000..43021e1815cd --- /dev/null +++ b/datafusion/_sources/contributor-guide/index.md.txt @@ -0,0 +1,319 @@ + + +# Introduction + +We welcome and encourage contributions of all kinds, such as: + +1. Tickets with issue reports or feature requests +2. Documentation improvements +3. Code (PR or PR Review) + +In addition to submitting new PRs, we have a healthy tradition of community members helping review each other's PRs. Doing so is a great way to help the community as well as get more familiar with Rust and the relevant codebases. + +You can find a curated +[good-first-issue](https://github.com/apache/arrow-datafusion/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) +list to help you get started. + +# Pull Requests + +We welcome pull requests (PRs) from anyone from the community. + +DataFusion is a very active fast-moving project and we try to review and merge PRs quickly to keep the review backlog down and the pace up. After review and approval, one of the [many people with commit access](https://arrow.apache.org/committers/) will merge your PR. + +Review bandwidth is currently our most limited resource, and we highly encourage reviews by the broader community. If you are waiting for your PR to be reviewed, consider helping review other PRs that are waiting. Such reviews help the reviewer learn the codebase and become more expert, and also help identify issues in the PR (such as lack of test coverage) that can be addressed to make future reviews faster and more efficient. + +## Merging PRs + +Since we are a worldwide community, we have contributors in many timezones who review and comment. To ensure anyone who wishes has an opportunity to review a PR, our committers try to ensure that at least 24 hours pass between when a "major" PR is approved and when it is merged.
+ +A "major" PR means there is a substantial change in design or a change in the API. Committers apply their best judgment to determine what constitutes a substantial change. A "minor" PR might be merged without a 24 hour delay, again subject to the judgment of the committer. Examples of potential "minor" PRs are: + +1. Documentation improvements/additions +2. Small bug fixes +3. Non-controversial build-related changes (clippy, version upgrades etc.) +4. Smaller non-controversial feature additions + +# Developer's guide + +This section describes how you can get started at developing DataFusion. + +## Windows setup + +```shell +wget https://az792536.vo.msecnd.net/vms/VMBuild_20190311/VirtualBox/MSEdge/MSEdge.Win10.VirtualBox.zip +choco install -y git rustup.install visualcpp-build-tools +git-bash.exe +cargo build +``` + +## Protoc Installation + +Compiling DataFusion from sources requires an installed version of the protobuf compiler, `protoc`. + +On most platforms this can be installed from your system's package manager + +``` +$ apt install -y protobuf-compiler +$ dnf install -y protobuf-compiler +$ pacman -S protobuf +$ brew install protobuf +``` + +You will want to verify the version installed is `3.12` or greater, which introduced support for explicit [field presence](https://github.com/protocolbuffers/protobuf/blob/v3.12.0/docs/field_presence.md). Older versions may fail to compile. + +```shell +$ protoc --version +libprotoc 3.12.4 +``` + +Alternatively a binary release can be downloaded from the [Release Page](https://github.com/protocolbuffers/protobuf/releases) or [built from source](https://github.com/protocolbuffers/protobuf/blob/main/src/README.md). + +## Bootstrap environment + +DataFusion is written in Rust and it uses a standard rust toolkit: + +- `cargo build` +- `cargo fmt` to format the code +- `cargo test` to test +- etc. 
+ +Testing setup: + +- `rustup update stable` DataFusion uses the latest stable release of rust +- `git submodule init` +- `git submodule update` + +Formatting instructions: + +- [ci/scripts/rust_fmt.sh](https://github.com/apache/arrow-datafusion/blob/master/ci/scripts/rust_fmt.sh) +- [ci/scripts/rust_clippy.sh](https://github.com/apache/arrow-datafusion/blob/master/ci/scripts/rust_clippy.sh) +- [ci/scripts/rust_toml_fmt.sh](https://github.com/apache/arrow-datafusion/blob/master/ci/scripts/rust_toml_fmt.sh) + +or run them all at once: + +- [dev/rust_lint.sh](https://github.com/apache/arrow-datafusion/blob/master/dev/rust_lint.sh) + +## Test Organization + +DataFusion has several levels of tests in its [Test +Pyramid](https://martinfowler.com/articles/practical-test-pyramid.html) +and tries to follow [Testing Organization](https://doc.rust-lang.org/book/ch11-03-test-organization.html) in The Rust Book. + +This section highlights the most important test modules that exist. + +### Unit tests + +Tests for the code in an individual module are defined in the same source file with a `test` module, following Rust convention. + +### Rust Integration Tests + +There are several tests of the public interface of the DataFusion library in the [tests](https://github.com/apache/arrow-datafusion/blob/master/datafusion/core/tests) directory. + +You can run these tests individually using a command such as + +```shell +cargo test -p datafusion --tests sql_integration +``` + +One very important test is the [sql_integration](https://github.com/apache/arrow-datafusion/blob/master/datafusion/core/tests/sql_integration.rs) test which validates DataFusion's ability to run a large assortment of SQL queries against an assortment of data setups. + +### sqllogictests Tests + +The [sqllogictests](https://github.com/apache/arrow-datafusion/blob/master/datafusion/core/tests/sqllogictests) also validate DataFusion SQL against an assortment of data setups.
+ +Data Driven tests have many benefits including being easier to write and maintain. We are in the process of [migrating sql_integration tests](https://github.com/apache/arrow-datafusion/issues/4460) and encourage +you to add new tests using sqllogictests if possible. + +### SQL / Postgres Integration Tests + +The [integration-tests](https://github.com/apache/arrow-datafusion/blob/master/integration-tests) directory contains a harness that runs certain queries against both postgres and datafusion and compares results + +#### setup environment + +```shell +export POSTGRES_DB=postgres +export POSTGRES_USER=postgres +export POSTGRES_HOST=localhost +export POSTGRES_PORT=5432 +``` + +#### Install dependencies + +```shell +# Install dependencies +python -m pip install --upgrade pip setuptools wheel +python -m pip install -r integration-tests/requirements.txt + +# setup environment +POSTGRES_DB=postgres POSTGRES_USER=postgres POSTGRES_HOST=localhost POSTGRES_PORT=5432 python -m pytest -v integration-tests/test_psql_parity.py + +# Create +psql -d "$POSTGRES_DB" -h "$POSTGRES_HOST" -p "$POSTGRES_PORT" -U "$POSTGRES_USER" -c 'CREATE TABLE IF NOT EXISTS test ( + c1 character varying NOT NULL, + c2 integer NOT NULL, + c3 smallint NOT NULL, + c4 smallint NOT NULL, + c5 integer NOT NULL, + c6 bigint NOT NULL, + c7 smallint NOT NULL, + c8 integer NOT NULL, + c9 bigint NOT NULL, + c10 character varying NOT NULL, + c11 double precision NOT NULL, + c12 double precision NOT NULL, + c13 character varying NOT NULL +);' + +psql -d "$POSTGRES_DB" -h "$POSTGRES_HOST" -p "$POSTGRES_PORT" -U "$POSTGRES_USER" -c "\copy test FROM '$(pwd)/testing/data/csv/aggregate_test_100.csv' WITH (FORMAT csv, HEADER true);" +``` + +#### Invoke the test runner + +```shell +python -m pytest -v integration-tests/test_psql_parity.py +``` + +## Benchmarks + +### Criterion Benchmarks + +[Criterion](https://docs.rs/criterion/latest/criterion/index.html) is a statistics-driven micro-benchmarking framework used by 
DataFusion for evaluating the performance of specific code-paths. In particular, the criterion benchmarks help to both guide optimisation efforts, and prevent performance regressions within DataFusion. + +Criterion integrates with Cargo's built-in [benchmark support](https://doc.rust-lang.org/cargo/commands/cargo-bench.html) and a given benchmark can be run with + +``` +cargo bench --bench BENCHMARK_NAME +``` + +A full list of benchmarks can be found [here](https://github.com/apache/arrow-datafusion/blob/master/datafusion/core/benches). + +_[cargo-criterion](https://github.com/bheisler/cargo-criterion) may also be used for more advanced reporting._ + +#### Parquet SQL Benchmarks + +The parquet SQL benchmarks can be run with + +``` + cargo bench --bench parquet_query_sql +``` + +These randomly generate a parquet file, and then benchmark queries sourced from [parquet_query_sql.sql](https://github.com/apache/arrow-datafusion/blob/master/datafusion/core/benches/parquet_query_sql.sql) against it. This can therefore be a quick way to add coverage of particular query and/or data paths. + +If the environment variable `PARQUET_FILE` is set, the benchmark will run queries against this file instead of a randomly generated one. This can be useful for performing multiple runs, potentially with different code, against the same source data, or for testing against a custom dataset. + +The benchmark will automatically remove any generated parquet file on exit, however, if interrupted (e.g. by CTRL+C) it will not. This can be useful for analysing the particular file after the fact, or preserving it to use with `PARQUET_FILE` in subsequent runs. + +### Upstream Benchmark Suites + +Instructions and tooling for running upstream benchmark suites against DataFusion can be found in [benchmarks](https://github.com/apache/arrow-datafusion/blob/master/benchmarks). + +These are valuable for comparative evaluation against alternative Arrow implementations and query engines. 
+ +## How to add a new scalar function + +Below is a checklist of what you need to do to add a new scalar function to DataFusion: + +- Add the actual implementation of the function: + - [here](https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src/string_expressions.rs) for string functions + - [here](https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src/math_expressions.rs) for math functions + - [here](https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src/datetime_expressions.rs) for datetime functions + - create a new module [here](https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src) for other functions +- In [physical-expr/src](https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src/functions.rs), add: + - a new variant to `BuiltinScalarFunction` + - a new entry to `FromStr` with the name of the function as called by SQL + - a new line in `return_type` with the expected return type of the function, given an incoming type + - a new line in `signature` with the signature of the function (number and types of its arguments) + - a new line in `create_physical_expr`/`create_physical_fun` mapping the built-in to the implementation + - tests to the function. +- In [core/tests/sql](https://github.com/apache/arrow-datafusion/blob/master/datafusion/core/tests/sql), add a new test where the function is called through SQL against well known data and returns the expected result. +- In [expr/src/expr_fn.rs](https://github.com/apache/arrow-datafusion/blob/master/datafusion/expr/src/expr_fn.rs), add: + - a new entry of the `unary_scalar_expr!` macro for the new function. 
+ +## How to add a new aggregate function + +Below is a checklist of what you need to do to add a new aggregate function to DataFusion: + +- Add the actual implementation of an `Accumulator` and `AggregateExpr`: + - [here](https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src/string_expressions.rs) for string functions + - [here](https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src/math_expressions.rs) for math functions + - [here](https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src/datetime_expressions.rs) for datetime functions + - create a new module [here](https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src) for other functions +- In [datafusion/expr/src](https://github.com/apache/arrow-datafusion/blob/master/datafusion/expr/src/aggregate_function.rs), add: + - a new variant to `AggregateFunction` + - a new entry to `FromStr` with the name of the function as called by SQL + - a new line in `return_type` with the expected return type of the function, given an incoming type + - a new line in `signature` with the signature of the function (number and types of its arguments) + - a new line in `create_aggregate_expr` mapping the built-in to the implementation + - tests to the function. +- In [tests/sql](https://github.com/apache/arrow-datafusion/blob/master/datafusion/core/tests/sql), add a new test where the function is called through SQL against well known data and returns the expected result. + +## How to display plans graphically + +The query plans represented by `LogicalPlan` nodes can be graphically +rendered using [Graphviz](https://www.graphviz.org/). + +To do so, save the output of the `display_graphviz` function to a file.: + +```rust +// Create plan somehow... 
+let mut output = File::create("/tmp/plan.dot")?; +write!(output, "{}", plan.display_graphviz()); +``` + +Then, use the `dot` command line tool to render it into a file that +can be displayed. For example, the following command creates a +`/tmp/plan.pdf` file: + +```bash +dot -Tpdf < /tmp/plan.dot > /tmp/plan.pdf +``` + +## Specifications + +We formalize DataFusion semantics and behaviors through specification +documents. These specifications are useful as references to help +resolve ambiguities during development or code reviews. + +You are also welcome to propose changes to existing specifications or create +new specifications as you see fit. + +Here is the list of currently active specifications: + +- [Output field name semantic](https://arrow.apache.org/datafusion/contributor-guide/specification/output-field-name-semantic.html) +- [Invariants](https://arrow.apache.org/datafusion/contributor-guide/specification/invariants.html) + +All specifications are stored in the `docs/source/specification` folder. + +## How to format `.md` document + +We are using `prettier` to format `.md` files. + +You can either use `npm i -g prettier` to install it globally or use `npx` to run it as a standalone binary. Using `npx` requires a working node environment. Upgrading to the latest prettier is recommended (by adding `--upgrade` to the `npm` command).
+ +```bash +$ prettier --version +2.3.0 +``` + +After you've confirmed your prettier version, you can format all the `.md` files: + +```bash +prettier -w {datafusion,datafusion-cli,datafusion-examples,dev,docs}/**/*.md +``` diff --git a/datafusion/_sources/contributor-guide/quarterly_roadmap.md.txt b/datafusion/_sources/contributor-guide/quarterly_roadmap.md.txt new file mode 100644 index 000000000000..c593e859d731 --- /dev/null +++ b/datafusion/_sources/contributor-guide/quarterly_roadmap.md.txt @@ -0,0 +1,90 @@ + + +# Quarterly Roadmap + +A quarterly roadmap will be published to give the DataFusion community visibility into the priorities of the project's contributors. This roadmap is not binding. + +## 2022 Q2 + +### DataFusion Core + +- IO Improvements + - Reading, registering, and writing more file formats from both DataFrame API and SQL + - Additional options for IO including partitioning and metadata support +- Work Scheduling + - Improve predictability, observability and performance of IO and CPU-bound work + - Develop a more explicit story for managing parallelism during plan execution +- Memory Management + - Add more operators for memory limited execution +- Performance + - Incorporate row-format into operators such as aggregate + - Add row-format benchmarks + - Explore JIT-compiling complex expressions + - Explore LLVM for JIT, with inline Rust functions as the primary goal + - Improve performance of Sort and Merge using Row Format / JIT expressions +- Documentation + - General improvements to DataFusion website + - Publish design documents +- Streaming + - Create `StreamProvider` trait + +### Ballista + +- Make production ready + - Shuffle file cleanup + - Fill functional gaps between DataFusion and Ballista + - Improve task scheduling and data exchange efficiency + - Better error handling + - Task failure + - Executor lost + - Schedule restart + - Improve monitoring and logging + - Auto scaling support +- Support for multi-scheduler deployments.
Initially for resiliency and fault tolerance but ultimately to support sharding for scalability and more efficient caching. +- Executor deployment grouping based on resource allocation + +### Extensions ([datafusion-contrib](https://github.com/datafusion-contrib)) + +#### [DataFusion-Python](https://github.com/datafusion-contrib/datafusion-python) + +- Add missing functionality to DataFrame and SessionContext +- Improve documentation + +#### [DataFusion-S3](https://github.com/datafusion-contrib/datafusion-objectstore-s3) + +- Create Python bindings to use with datafusion-python + +#### [DataFusion-Tui](https://github.com/datafusion-contrib/datafusion-tui) + +- Create multiple SQL editors +- Expose more Context and query metadata +- Support new data sources + - BigTable, HDFS, HTTP APIs + +#### [DataFusion-BigTable](https://github.com/datafusion-contrib/datafusion-bigtable) + +- Python binding to use with datafusion-python +- Timestamp range predicate pushdown +- Multi-threaded partition aware execution +- Production ready Rust SDK + +#### [DataFusion-Streams](https://github.com/datafusion-contrib/datafusion-streams) + +- Create experimental implementation of `StreamProvider` trait diff --git a/datafusion/_sources/contributor-guide/roadmap.md.txt b/datafusion/_sources/contributor-guide/roadmap.md.txt new file mode 100644 index 000000000000..736eef681e48 --- /dev/null +++ b/datafusion/_sources/contributor-guide/roadmap.md.txt @@ -0,0 +1,118 @@ + + +# Roadmap + +This document describes high level goals of the DataFusion and +Ballista development community. It is not meant to restrict +possibilities, but rather help newcomers understand the broader +context of where the community is headed, and inspire +additional contributions. + +DataFusion and Ballista are part of the [Apache +Arrow](https://arrow.apache.org/) project and governed by the Apache +Software Foundation governance model.
These projects are entirely +driven by volunteers, and we welcome contributions for items not on +this roadmap. However, before submitting a large PR, we strongly +suggest you start a conversation using a github issue or the +dev@arrow.apache.org mailing list to make review efficient and avoid +surprises. + +## DataFusion + +DataFusion's goal is to become the embedded query engine of choice +for new analytic applications, by leveraging the unique features of +[Rust](https://www.rust-lang.org/) and [Apache Arrow](https://arrow.apache.org/) +to provide: + +1. Best-in-class single node query performance +2. A Declarative SQL query interface compatible with PostgreSQL +3. A Dataframe API, similar to those offered by Pandas and Spark +4. A Procedural API for programmatically creating and running execution plans +5. High performance, data race free, ergonomic extensibility points at every layer + +### Additional SQL Language Features + +- Decimal Support [#122](https://github.com/apache/arrow-datafusion/issues/122) +- Complete support list on [status](https://github.com/apache/arrow-datafusion/blob/master/README.md#status) +- Timestamp Arithmetic [#194](https://github.com/apache/arrow-datafusion/issues/194) +- SQL Parser extension point [#533](https://github.com/apache/arrow-datafusion/issues/533) +- Support for nested structures (fields, lists, structs) [#119](https://github.com/apache/arrow-datafusion/issues/119) +- Run all queries from the TPCH benchmark (see [milestone](https://github.com/apache/arrow-datafusion/milestone/2) for more details) + +### Query Optimizer + +- More sophisticated cost based optimizer for join ordering +- Implement advanced query optimization framework (Tokomak) [#440](https://github.com/apache/arrow-datafusion/issues/440) +- Finer optimizations for group by and aggregate functions + +### Datasources + +- Better support for reading data from remote filesystems (e.g.
S3) without caching it locally [#907](https://github.com/apache/arrow-datafusion/issues/907) [#1060](https://github.com/apache/arrow-datafusion/issues/1060) +- Improve performances of file format datasources (parallelize file listings, async Arrow readers, file chunk prefetching capability...) + +### Runtime / Infrastructure + +- Migrate to some sort of arrow2 based implementation (see [milestone](https://github.com/apache/arrow-datafusion/milestone/3) for more details) +- Add DataFusion to h2oai/db-benchmark [#147](https://github.com/apache/arrow-datafusion/issues/147) +- Improve build time [#348](https://github.com/apache/arrow-datafusion/issues/348) + +### Resource Management + +- Finer grain control and limit of runtime memory [#587](https://github.com/apache/arrow-datafusion/issues/587) and CPU usage [#54](https://github.com/apache/arrow-datafusion/issues/64) + +### Python Interface + +TBD + +### DataFusion CLI (`datafusion-cli`) + +Note: There are some additional thoughts on a datafusion-cli vision on [#1096](https://github.com/apache/arrow-datafusion/issues/1096#issuecomment-939418770). + +- Better abstraction between REPL parsing and queries so that commands are separated and handled correctly +- Connect to the `Statistics` subsystem and have the cli print out more stats for query debugging, etc. +- Improved error handling for interactive use and shell scripting usage +- publishing to apt, brew, and possible NuGet registry so that people can use it more easily +- adopt a shorter name, like dfcli? + +## Ballista + +Ballista is a distributed compute platform based on Apache Arrow and DataFusion. It provides a query scheduler that +breaks a physical plan into stages and tasks and then schedules tasks for execution across the available executors +in the cluster. + +Having Ballista as part of the DataFusion codebase helps ensure that DataFusion remains suitable for distributed +compute. 
For example, it helps ensure that physical query plans can be serialized to protobuf format and that they +remain language-agnostic so that executors can be built in languages other than Rust. + +### Ballista Roadmap + +### Move query scheduler into DataFusion + +The Ballista scheduler has some advantages over DataFusion query execution because it doesn't try to eagerly execute +the entire query at once but breaks it down into a directed acyclic graph (DAG) of stages and executes a +configurable number of stages and tasks concurrently. It should be possible to push some of this logic down to +DataFusion so that the same scheduler can be used to scale across cores in-process and across nodes in a cluster. + +### Implement execution-time cost-based optimizations based on statistics + +After the execution of a query stage, accurate statistics are available for the resulting data. These statistics +could be leveraged by the scheduler to optimize the query during execution. For example, when performing a hash join +it is desirable to load the smaller side of the join into memory and in some cases we cannot predict which side will +be smaller until execution time. diff --git a/datafusion/_sources/contributor-guide/specification/index.rst.txt b/datafusion/_sources/contributor-guide/specification/index.rst.txt new file mode 100644 index 000000000000..bcd5a895c4d2 --- /dev/null +++ b/datafusion/_sources/contributor-guide/specification/index.rst.txt @@ -0,0 +1,25 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +..
Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Specifications +============== + +.. toctree:: + :maxdepth: 1 + + invariants + output-field-name-semantic diff --git a/datafusion/_sources/contributor-guide/specification/invariants.md.txt b/datafusion/_sources/contributor-guide/specification/invariants.md.txt new file mode 100644 index 000000000000..c8de4e1d4e21 --- /dev/null +++ b/datafusion/_sources/contributor-guide/specification/invariants.md.txt @@ -0,0 +1,327 @@ + + +# Invariants + +This document enumerates invariants of DataFusion's logical and physical planes +(functions, and nodes). Some of these invariants are currently not enforced. +This document assumes that the reader is familiar with some of the codebase, +including rust arrow's RecordBatch and Array. + +## Rationale + +DataFusion's computational model is built on top of a dynamically typed arrow +object, Array, that offers the interface `Array::as_any` to downcast itself to +its statically typed versions (e.g. `Int32Array`). DataFusion uses +`Array::data_type` to perform the respective downcasting on its physical +operations. DataFusion uses a dynamic type system because the queries being +executed are not always known at compile time: they are only known during the +runtime (or query time) of programs built with DataFusion. This document is +built on top of this principle. + +In dynamically typed interfaces, it is up to developers to enforce type +invariants. This document declares some of these invariants, so that users +know what they can expect from a query in DataFusion, and DataFusion developers +know what they need to enforce at the coding level. 
+ +## Notation + +- Field or physical field: the tuple name, `arrow::DataType` and nullability flag (a bool whether values can be null), represented in this document by `PF(name, type, nullable)` +- Logical field: Field with a relation name. Represented in this document by `LF(relation, name, type, nullable)` +- Projected plan: plan with projection as the root node. +- Logical schema: a vector of logical fields, used by logical plan. +- Physical schema: a vector of physical fields, used by both physical plan and Arrow record batch. + +### Logical + +#### Function + +An object that knows its valid incoming logical fields and how to derive its +output logical field from its arguments' logical fields. A function's output +field is itself a function of its input fields: + +``` +logical_field(lf1: LF, lf2: LF, ...) -> LF +``` + +Examples: + +- `plus(a,b) -> LF(None, "{a} Plus {b}", d(a.type,b.type), a.nullable | b.nullable)` where d is the function mapping input types to output type (`get_supertype` in our current implementation). +- `length(a) -> LF(None, "length({a})", u32, a.nullable)` + +#### Plan + +A tree composed of other plans and functions (e.g. `Projection c1 + c2, c1 - c2 AS sum12; Scan c1 as u32, c2 as u64`) +that knows how to derive its schema. + +Certain plans have a frozen schema (e.g. Scan), while others derive their +schema from their child nodes. + +#### Column + +An identifier in a logical plan consists of field name and relation name. + +### Physical + +#### Function + +An object that knows how to derive its physical field from its arguments' +physical fields, and also how to actually perform the computation on data. A +function's output physical field is a function of its input physical fields: + +``` +physical_field(PF1, PF2, ...) 
-> PF +``` + +Examples: + +- `plus(a,b) -> PF("{a} Plus {b}", d(a.type,b.type), a.nullable | b.nullable)` where d is a complex function (`get_supertype` in our current implementation) whose computation is for each element in the columns, sum the two entries together and return it in the same type as the smallest type of both columns. +- `length(&str) -> PF("length({a})", u32, a.nullable)` whose computation is "count number of bytes in the string". + +#### Plan + +A tree (e.g. `Projection c1 + c2, c1 - c2 AS sum12; Scan c1 as u32, c2 as u64`) +that knows how to derive its metadata and compute itself. + +Note how the physical plane does not know how to derive field names: field +names are solely a property of the logical plane, as they are not needed in the +physical plane. + +#### Column + +A type of physical node in a physical plan consists of a field name and unique index. + +### Data Sources' registry + +A map of source name/relation -> Schema plus associated properties necessary to read data from it (e.g. file path). + +### Functions' registry + +A map of function name -> logical + physical function. + +### Physical Planner + +A function that knows how to derive a physical plan from a logical plan: + +``` +plan(LogicalPlan) -> PhysicalPlan +``` + +### Logical Optimizer + +A function that accepts a logical plan and returns an (optimized) logical plan +which computes the same results, but in a more efficient manner: + +``` +optimize(LogicalPlan) -> LogicalPlan +``` + +### Physical Optimizer + +A function that accepts a physical plan and returns an (optimized) physical +plan which computes the same results, but may differ based on the actual +hardware or execution environment being run: + +``` +optimize(PhysicalPlan) -> PhysicalPlan +``` + +### Builder + +A function that knows how to build a new logical plan from an existing logical +plan and some extra parameters. + +``` +build(logical_plan, params...) 
-> logical_plan +``` + +## Invariants + +The following subsections describe invariants. Since functions' output schema +depends on its arguments' schema (e.g. min, plus), the resulting schema can +only be derived based on a known set of input schemas (TableProvider). +Likewise, schemas of functions depend on the specific registry of functions +registered (e.g. does `my_op` return u32 or u64?). Thus, in this section, the +wording "same schema" is understood to mean "same schema under a given registry +of data sources and functions". + +### (relation, name) tuples in logical fields and logical columns are unique + +Every logical field's (relation, name) tuple in a logical schema MUST be unique. +Every logical column's (relation, name) tuple in a logical plan MUST be unique. + +This invariant guarantees that `SELECT t1.id, t2.id FROM t1 JOIN t2...` +unambiguously selects the field `t1.id` and `t2.id` in a logical schema in the +logical plane. + +#### Responsibility + +It is the logical builder and optimizer's responsibility to guarantee this +invariant. + +#### Validation + +Builder and optimizer MUST error if this invariant is violated on any logical +node that creates a new schema (e.g. scan, projection, aggregation, join, etc.). + +### Physical schema is consistent with data + +The contents of every Array in every RecordBatch in every partition returned by +a physical plan MUST be consistent with RecordBatch's schema, in that every +Array in the RecordBatch must be downcastable to its corresponding type +declared in the RecordBatch. + +#### Responsibility + +Physical functions MUST guarantee this invariant. This is particularly +important in aggregate functions, whose aggregating type may be different from +the intermediary types during calculations (e.g. sum(i32) -> i64). + +#### Validation + +Since the validation of this invariant is computationally expensive, execution +contexts CAN validate this invariant. 
It is acceptable for physical nodes to +`panic!` if their input does not satisfy this invariant. + +### Physical schema is consistent in physical functions + +The schema of every Array returned by a physical function MUST match the +DataType reported by the physical function itself. + +This ensures that when a physical function claims that it returns a type +(e.g. Int32), users can safely downcast its resulting Array to the +corresponding type (e.g. Int32Array), as well as to write data to formats that +have a schema with nullability flag (e.g. parquet). + +#### Responsibility + +It is the responsibility of the developer that writes a physical function to +guarantee this invariant. + +In particular: + +- The derived DataType matches the code it uses to build the array for every branch of valid input type combinations. +- The nullability flag matches how the values are built. + +#### Validation + +Since the validation of this invariant is computationally expensive, execution +contexts CAN validate this invariant. + +### The physical schema is invariant under planning + +The physical schema derived by a physical plan returned by the planner MUST be +equivalent to the physical schema derived by the logical plan passed to the +planner. Specifically: + +``` +plan(logical_plan).schema === logical_plan.physical_schema +``` + +Logical plan's physical schema is defined as logical schema with relation +qualifiers stripped for all logical fields: + +``` +logical_plan.physical_schema = vector[ strip_relation(f) for f in logical_plan.logical_fields ] +``` + +This is used to ensure that the physical schema of its (logical) plan is what +it gets in record batches, so that users can rely on the optimized logical plan +to know the resulting physical schema. 
+ +Note that since a logical plan can be as simple as a single projection with a +single function, `Projection f(c1,c2)`, a corollary of this is that the +physical schema of every `logical function -> physical function` must be +invariant under planning. + +#### Responsibility + +Developers of physical and logical plans and planners MUST guarantee this +invariant for every triplet (logical plan, physical plan, conversion rule). + +#### Validation + +Planners MUST validate this invariant. In particular they MUST return an error +when, during planning, a physical function's derived schema does not match the +logical function's derived schema. + +### The output schema equals the physical plan schema + +The schema of every RecordBatch in every partition outputted by a physical plan +MUST be equal to the schema of the physical plan. Specifically: + +``` +physical_plan.evaluate(batch).schema = physical_plan.schema +``` + +Together with other invariants, this ensures that the consumers of record +batches do not need to know the output schema of the physical plan; they can +safely rely on the record batch's schema to perform downcasting and naming. + +#### Responsibility + +Physical nodes MUST guarantee this invariant. + +#### Validation + +Execution Contexts CAN validate this invariant. + +### Logical schema is invariant under logical optimization + +The logical schema derived by a projected logical plan returned by the logical +optimizer MUST be equivalent to the logical schema derived by the logical plan +passed to the optimizer: + +``` +optimize(logical_plan).schema === logical_plan.schema +``` + +This is used to ensure that plans can be optimized without jeopardizing future +referencing logical columns (name and index) or assumptions about their +schemas. + +#### Responsibility + +Logical optimizers MUST guarantee this invariant. + +#### Validation + +Users of logical optimizers SHOULD validate this invariant. 
+ +### Physical schema is invariant under physical optimization + +The physical schema derived by a projected physical plan returned by the +physical optimizer MUST match the physical schema derived by the physical plan +passed to the optimizer: + +``` +optimize(physical_plan).schema === physical_plan.schema +``` + +This is used to ensure that plans can be optimized without jeopardizing future +references of logical columns (name and index) or assumptions about their +schemas. + +#### Responsibility + +Optimizers MUST guarantee this invariant. + +#### Validation + +Users of optimizers SHOULD validate this invariant. diff --git a/datafusion/_sources/contributor-guide/specification/output-field-name-semantic.md.txt b/datafusion/_sources/contributor-guide/specification/output-field-name-semantic.md.txt new file mode 100644 index 000000000000..fe378a52cda1 --- /dev/null +++ b/datafusion/_sources/contributor-guide/specification/output-field-name-semantic.md.txt @@ -0,0 +1,212 @@ + + +# Output field name semantics + +This specification documents how field names in output record batches should be +generated based on given user queries. The field name rules apply to +DataFusion queries planned from both SQL queries and Dataframe APIs. + +## Field name rules + +- All bare column field names MUST not contain relation/table qualifier. + - Both `SELECT t1.id`, `SELECT id` and `df.select_columns(&["id"])` SHOULD result in field name: `id` +- All compound column field names MUST contain relation/table qualifier. + - `SELECT foo + bar` SHOULD result in field name: `table.foo PLUS table.bar` +- Function names MUST be converted to lowercase. + - `SELECT AVG(c1)` SHOULD result in field name: `avg(table.c1)` +- Literal string MUST not be wrapped with quotes or double quotes. + - `SELECT 'foo'` SHOULD result in field name: `foo` +- Operator expressions MUST be wrapped with parentheses. 
+ - `SELECT -2` SHOULD result in field name: `(- 2)` +- Operator and operand MUST be separated by spaces. + - `SELECT 1+2` SHOULD result in field name: `(1 + 2)` +- Function arguments MUST be separated by a comma `,` and a space. + - `SELECT f(c1,c2)` and `df.select(vec![f.udf("f")?.call(vec![col("c1"), col("c2")])])` SHOULD result in field name: `f(table.c1, table.c2)` + +## Appendices + +### Examples and comparison with other systems + +Data schema for test sample queries: + +``` +CREATE TABLE t1 (id INT, a VARCHAR(5)); +INSERT INTO t1 (id, a) VALUES (1, 'foo'); +INSERT INTO t1 (id, a) VALUES (2, 'bar'); + +CREATE TABLE t2 (id INT, b VARCHAR(5)); +INSERT INTO t2 (id, b) VALUES (1, 'hello'); +INSERT INTO t2 (id, b) VALUES (2, 'world'); +``` + +#### Projected columns + +Query: + +``` +SELECT t1.id, a, t2.id, b +FROM t1 +JOIN t2 ON t1.id = t2.id +``` + +DataFusion Arrow record batches output: + +| id | a | id | b | +| --- | --- | --- | ----- | +| 1 | foo | 1 | hello | +| 2 | bar | 2 | world | + +Spark, MySQL 8 and PostgreSQL 13 output: + +| id | a | id | b | +| --- | --- | --- | ----- | +| 1 | foo | 1 | hello | +| 2 | bar | 2 | world | + +SQLite 3 output: + +| id | a | b | +| --- | --- | ----- | +| 1 | foo | hello | +| 2 | bar | world | + +#### Function transformed columns + +Query: + +``` +SELECT ABS(t1.id), abs(-id) FROM t1; +``` + +DataFusion Arrow record batches output: + +| abs(t1.id) | abs((- t1.id)) | +| ---------- | -------------- | +| 1 | 1 | +| 2 | 2 | + +Spark output: + +| abs(id) | abs((- id)) | +| ------- | ----------- | +| 1 | 1 | +| 2 | 2 | + +MySQL 8 output: + +| ABS(t1.id) | abs(-id) | +| ---------- | -------- | +| 1 | 1 | +| 2 | 2 | + +PostgreSQL 13 output: + +| abs | abs | +| --- | --- | +| 1 | 1 | +| 2 | 2 | + +SQLite 3 output: + +| ABS(t1.id) | abs(-id) | +| ---------- | -------- | +| 1 | 1 | +| 2 | 2 | + +#### Function with operators + +Query: + +``` +SELECT t1.id + ABS(id), ABS(id * t1.id) FROM t1; +``` + +DataFusion Arrow record batches 
output: + +| t1.id + abs(t1.id) | abs(t1.id \* t1.id) | +| ------------------ | ------------------- | +| 2 | 1 | +| 4 | 4 | + +Spark output: + +| id + abs(id) | abs(id \* id) | +| ------------ | ------------- | +| 2 | 1 | +| 4 | 4 | + +MySQL 8 output: + +| t1.id + ABS(id) | ABS(id \* t1.id) | +| --------------- | ---------------- | +| 2 | 1 | +| 4 | 4 | + +PostgreSQL output: + +| ?column? | abs | +| -------- | --- | +| 2 | 1 | +| 4 | 4 | + +SQLite output: + +| t1.id + ABS(id) | ABS(id \* t1.id) | +| --------------- | ---------------- | +| 2 | 1 | +| 4 | 4 | + +#### Project literals + +Query: + +``` +SELECT 1, 2+5, 'foo_bar'; +``` + +DataFusion Arrow record batches output: + +| 1 | (2 + 5) | foo_bar | +| --- | ------- | ------- | +| 1 | 7 | foo_bar | + +Spark output: + +| 1 | (2 + 5) | foo_bar | +| --- | ------- | ------- | +| 1 | 7 | foo_bar | + +MySQL output: + +| 1 | 2+5 | foo_bar | +| --- | --- | ------- | +| 1 | 7 | foo_bar | + +PostgreSQL output: + +| ?column? | ?column? | ?column? | +| -------- | -------- | -------- | +| 1 | 7 | foo_bar | + +SQLite 3 output: + +| 1 | 2+5 | 'foo_bar' | +| --- | --- | --------- | +| 1 | 7 | foo_bar | diff --git a/datafusion/_sources/index.rst.txt b/datafusion/_sources/index.rst.txt new file mode 100644 index 000000000000..86b3b7e2c8ff --- /dev/null +++ b/datafusion/_sources/index.rst.txt @@ -0,0 +1,54 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. 
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +======================= +Apache Arrow DataFusion +======================= + +Table of Contents +================= + +.. _toc.guide: + +.. toctree:: + :maxdepth: 1 + :caption: User Guide + + user-guide/introduction + user-guide/example-usage + user-guide/library + user-guide/cli + user-guide/dataframe + user-guide/expressions + user-guide/sql/index + user-guide/configs + user-guide/faq + Rust Crate Documentation + +.. _toc.contributor-guide: + +.. toctree:: + :maxdepth: 2 + :caption: Contributor Guide + + contributor-guide/index + contributor-guide/communication + contributor-guide/roadmap + contributor-guide/quarterly_roadmap + contributor-guide/specification/index + Issue tracker + Code of conduct diff --git a/datafusion/_sources/user-guide/cli.md.txt b/datafusion/_sources/user-guide/cli.md.txt new file mode 100644 index 000000000000..d3512a6dca52 --- /dev/null +++ b/datafusion/_sources/user-guide/cli.md.txt @@ -0,0 +1,351 @@ + + +# DataFusion Command-line SQL Utility + +The DataFusion CLI is a command-line interactive SQL utility for executing +queries against any supported data files. It is a convenient way to +try DataFusion out with your own data sources, and test out its SQL support. + +## Example + +Create a CSV file to query. + +```shell +$ echo "a,b" > data.csv +$ echo "1,2" >> data.csv +``` + +Query that single file (the CLI also supports parquet, compressed csv, avro, json and more) + +```shell +$ datafusion-cli +DataFusion CLI v17.0.0 +❯ select * from 'data.csv'; ++---+---+ +| a | b | ++---+---+ +| 1 | 2 | ++---+---+ +1 row in set. Query took 0.007 seconds. 
+``` + +You can also query directories of files with compatible schemas: + +```shell +$ ls data_dir/ +data.csv data2.csv +``` + +```shell +$ datafusion-cli +DataFusion CLI v16.0.0 +❯ select * from 'data_dir'; ++---+---+ +| a | b | ++---+---+ +| 3 | 4 | +| 1 | 2 | ++---+---+ +2 rows in set. Query took 0.007 seconds. +``` + +## Installation + +### Install and run using Cargo + +The easiest way to give DataFusion CLI a spin is via `cargo install datafusion-cli`. + +### Install and run using Homebrew (on MacOS) + +DataFusion CLI can also be installed via Homebrew (on MacOS). Install it as any other pre-built software like this: + +```bash +brew install datafusion +# ==> Downloading https://ghcr.io/v2/homebrew/core/datafusion/manifests/12.0.0 +# ######################################################################## 100.0% +# ==> Downloading https://ghcr.io/v2/homebrew/core/datafusion/blobs/sha256:9ecc8a01be47ceb9a53b39976696afa87c0a8 +# ==> Downloading from https://pkg-containers.githubusercontent.com/ghcr1/blobs/sha256:9ecc8a01be47ceb9a53b39976 +# ######################################################################## 100.0% +# ==> Pouring datafusion--12.0.0.big_sur.bottle.tar.gz +# 🍺 /usr/local/Cellar/datafusion/12.0.0: 9 files, 17.4MB + +datafusion-cli +``` + +### Run using Docker + +There is no officially published Docker image for the DataFusion CLI, so it is necessary to build from source +instead. + +Use the following commands to clone this repository and build a Docker image containing the CLI tool. Note +that there is a `.dockerignore` file in the root of the repository that may need to be deleted in order for +this to work. + +```bash +git clone https://github.com/apache/arrow-datafusion +cd arrow-datafusion +git checkout 12.0.0 +docker build -f datafusion-cli/Dockerfile . 
--tag datafusion-cli +docker run -it -v $(your_data_location):/data datafusion-cli +``` + +## Usage + +See the current usage using `datafusion-cli --help`: + +```bash +Apache Arrow +Command Line Client for DataFusion query engine. + +USAGE: + datafusion-cli [OPTIONS] + +OPTIONS: + -c, --batch-size The batch size of each query, or use DataFusion default + -f, --file ... Execute commands from file(s), then exit + --format [default: table] [possible values: csv, tsv, table, json, + nd-json] + -h, --help Print help information + -p, --data-path Path to your data, default to current directory + -q, --quiet Reduce printing other than the results and work quietly + -r, --rc ... Run the provided files on startup instead of ~/.datafusionrc + -V, --version Print version information +``` + +## Selecting files directly + +Files can be queried directly by enclosing the file or +directory name in single `'` quotes as shown in the example. + +It is also possible to create a table backed by files explicitly +via `CREATE EXTERNAL TABLE` as shown below. + +## Registering Parquet Data Sources + +Parquet data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement. It is not necessary to provide schema information for Parquet files. + +```sql +CREATE EXTERNAL TABLE taxi +STORED AS PARQUET +LOCATION '/mnt/nyctaxi/tripdata.parquet'; +``` + +## Registering CSV Data Sources + +CSV data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement. + +```sql +CREATE EXTERNAL TABLE test +STORED AS CSV +WITH HEADER ROW +LOCATION '/path/to/aggregate_test_100.csv'; +``` + +It is also possible to provide schema information. 
+ +```sql +CREATE EXTERNAL TABLE test ( + c1 VARCHAR NOT NULL, + c2 INT NOT NULL, + c3 SMALLINT NOT NULL, + c4 SMALLINT NOT NULL, + c5 INT NOT NULL, + c6 BIGINT NOT NULL, + c7 SMALLINT NOT NULL, + c8 INT NOT NULL, + c9 BIGINT NOT NULL, + c10 VARCHAR NOT NULL, + c11 FLOAT NOT NULL, + c12 DOUBLE NOT NULL, + c13 VARCHAR NOT NULL +) +STORED AS CSV +LOCATION '/path/to/aggregate_test_100.csv'; +``` + +## Querying S3 Data Sources + +The CLI can query data in S3 if the following environment variables are defined: + +- `AWS_DEFAULT_REGION` +- `AWS_ACCESS_KEY_ID` +- `AWS_SECRET_ACCESS_KEY` + +Details of the environment variables that can be used are + +- AWS_ACCESS_KEY_ID -> access_key_id +- AWS_SECRET_ACCESS_KEY -> secret_access_key +- AWS_DEFAULT_REGION -> region +- AWS_ENDPOINT -> endpoint +- AWS_SESSION_TOKEN -> token +- AWS_CONTAINER_CREDENTIALS_RELATIVE_URI -> +- AWS_ALLOW_HTTP -> set to "true" to permit HTTP connections without TLS + +Example: + +```bash +$ aws s3 cp test.csv s3://my-bucket/ +upload: ./test.csv to s3://my-bucket/test.csv + +$ export AWS_DEFAULT_REGION=us-east-2 +$ export AWS_SECRET_ACCESS_KEY=*************************** +$ export AWS_ACCESS_KEY_ID=************** + +$ datafusion-cli +DataFusion CLI v14.0.0 +❯ create external table test stored as csv location 's3://my-bucket/test.csv'; +0 rows in set. Query took 0.374 seconds. +❯ select * from test; ++----------+----------+ +| column_1 | column_2 | ++----------+----------+ +| 1 | 2 | ++----------+----------+ +1 row in set. Query took 0.171 seconds. +``` + +## Commands + +Available commands inside DataFusion CLI are: + +- Quit + +```bash +> \q +``` + +- Help + +```bash +> \? 
+``` + +- ListTables + +```bash +> \d +``` + +- DescribeTable + +```bash +> \d table_name +``` + +- QuietMode + +```bash +> \quiet [true|false] +``` + +- list function + +```bash +> \h +``` + +- Search and describe function + +```bash +> \h function +``` + +- Show configuration options + +```SQL +> show all; + ++-------------------------------------------------+---------+ +| name | setting | ++-------------------------------------------------+---------+ +| datafusion.execution.batch_size | 8192 | +| datafusion.execution.coalesce_batches | true | +| datafusion.execution.coalesce_target_batch_size | 4096 | +| datafusion.execution.time_zone | UTC | +| datafusion.explain.logical_plan_only | false | +| datafusion.explain.physical_plan_only | false | +| datafusion.optimizer.filter_null_join_keys | false | +| datafusion.optimizer.skip_failed_rules | true | ++-------------------------------------------------+---------+ + +``` + +- Set configuration options + +```SQL +> SET datafusion.execution.batch_size to 1024; +``` + +## Changing Configuration Options + +All available configuration options can be seen using `SHOW ALL` as described above. + +You can change the configuration options using environment +variables. `datafusion-cli` looks in the corresponding environment +variable with an upper case name and all `.` converted to `_`. 
+ +For example, to set `datafusion.execution.batch_size` to `1024` you +would set the `DATAFUSION_EXECUTION_BATCH_SIZE` environment variable +appropriately: + +```shell +$ DATAFUSION_EXECUTION_BATCH_SIZE=1024 datafusion-cli +DataFusion CLI v12.0.0 +❯ show all; ++-------------------------------------------------+---------+ +| name | setting | ++-------------------------------------------------+---------+ +| datafusion.execution.batch_size | 1024 | +| datafusion.execution.coalesce_batches | true | +| datafusion.execution.coalesce_target_batch_size | 4096 | +| datafusion.execution.time_zone | UTC | +| datafusion.explain.logical_plan_only | false | +| datafusion.explain.physical_plan_only | false | +| datafusion.optimizer.filter_null_join_keys | false | +| datafusion.optimizer.skip_failed_rules | true | ++-------------------------------------------------+---------+ +8 rows in set. Query took 0.002 seconds. +``` + +You can change the configuration options using `SET` statement as well + +```shell +$ datafusion-cli +DataFusion CLI v13.0.0 + +❯ show datafusion.execution.batch_size; ++---------------------------------+---------+ +| name | setting | ++---------------------------------+---------+ +| datafusion.execution.batch_size | 8192 | ++---------------------------------+---------+ +1 row in set. Query took 0.011 seconds. + +❯ set datafusion.execution.batch_size to 1024; +0 rows in set. Query took 0.000 seconds. + +❯ show datafusion.execution.batch_size; ++---------------------------------+---------+ +| name | setting | ++---------------------------------+---------+ +| datafusion.execution.batch_size | 1024 | ++---------------------------------+---------+ +1 row in set. Query took 0.005 seconds. 
+``` diff --git a/datafusion/_sources/user-guide/configs.md.txt b/datafusion/_sources/user-guide/configs.md.txt new file mode 100644 index 000000000000..57d23ce69060 --- /dev/null +++ b/datafusion/_sources/user-guide/configs.md.txt @@ -0,0 +1,69 @@ + + + + +# Configuration Settings + +The following configuration options can be passed to `SessionConfig` to control various aspects of query execution. + +For applications which do not expose `SessionConfig`, like `datafusion-cli`, these options may also be set via environment variables. +To construct a session with options from the environment, use `SessionConfig::from_env`. +The name of the environment variable is the option's key, transformed to uppercase and with periods replaced with underscores. +For example, to configure `datafusion.execution.batch_size` you would set the `DATAFUSION_EXECUTION_BATCH_SIZE` environment variable. +Values are parsed according to the [same rules used in casts from Utf8](https://docs.rs/arrow/latest/arrow/compute/kernels/cast/fn.cast.html). +If the value in the environment variable cannot be cast to the type of the configuration option, the default value will be used instead and a warning emitted. +Environment variables are read during `SessionConfig` initialisation so they must be set beforehand and will not affect running sessions. + +| key | default | description | +| --------------------------------------------------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| datafusion.catalog.create_default_catalog_and_schema | true | Whether the default catalog and schema should be created automatically. 
| +| datafusion.catalog.default_catalog | datafusion | The default catalog name - this impacts what SQL queries use if not specified | +| datafusion.catalog.default_schema | public | The default schema name - this impacts what SQL queries use if not specified | +| datafusion.catalog.information_schema | false | Should DataFusion provide access to `information_schema` virtual tables for displaying schema information | +| datafusion.catalog.location | NULL | Location scanned to load tables for `default` schema | +| datafusion.catalog.format | NULL | Type of `TableProvider` to use when loading `default` schema | +| datafusion.catalog.has_header | false | If the file has a header | +| datafusion.execution.batch_size | 8192 | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption | +| datafusion.execution.coalesce_batches | true | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting | +| datafusion.execution.collect_statistics | false | Should DataFusion collect statistics after listing files | +| datafusion.execution.target_partitions | 0 | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of cpu cores on the system | +| datafusion.execution.time_zone | +00:00 | The default time zone. Some functions, e.g. EXTRACT(HOUR from SOME_TIME), shift the underlying datetime according to this time zone, and then extract the hour | +| datafusion.execution.parquet.enable_page_index | false | If true, uses parquet data page level metadata (Page Index) statistics to reduce the number of rows decoded. 
| +| datafusion.execution.parquet.pruning | true | If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file | +| datafusion.execution.parquet.skip_metadata | true | If true, the parquet reader skips the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata | +| datafusion.execution.parquet.metadata_size_hint | NULL | If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: one read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer | +| datafusion.execution.parquet.pushdown_filters | false | If true, filter expressions are applied during the parquet decoding operation to reduce the number of rows decoded | +| datafusion.execution.parquet.reorder_filters | false | If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query | +| datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartition to increase parallelism to leverage more CPU cores | +| datafusion.optimizer.filter_null_join_keys | false | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down. 
| +| datafusion.optimizer.repartition_aggregations | true | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level | +| datafusion.optimizer.repartition_joins | true | Should DataFusion repartition data using the join keys to execute joins in parallel using the provided `target_partitions` level | +| datafusion.optimizer.repartition_windows | true | Should DataFusion repartition data using the partition keys to execute window functions in parallel using the provided `target_partitions` level | +| datafusion.optimizer.skip_failed_rules | true | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail | +| datafusion.optimizer.max_passes | 3 | Number of times that the optimizer will attempt to optimize the plan | +| datafusion.optimizer.top_down_join_key_reordering | true | When set to true, the physical plan optimizer will run a top down process to reorder the join keys | +| datafusion.optimizer.prefer_hash_join | true | When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin. 
HashJoin can work more efficiently than SortMergeJoin but consumes more memory | +| datafusion.optimizer.hash_join_single_partition_threshold | 1048576 | The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition | +| datafusion.explain.logical_plan_only | false | When set to true, the explain statement will only print logical plans | +| datafusion.explain.physical_plan_only | false | When set to true, the explain statement will only print physical plans | diff --git a/datafusion/_sources/user-guide/dataframe.md.txt b/datafusion/_sources/user-guide/dataframe.md.txt new file mode 100644 index 000000000000..23766cd07bdb --- /dev/null +++ b/datafusion/_sources/user-guide/dataframe.md.txt @@ -0,0 +1,105 @@ + + +# DataFrame API + +A DataFrame represents a logical set of rows with the same named columns, similar to a [Pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html) or +[Spark DataFrame](https://spark.apache.org/docs/latest/sql-programming-guide.html). + +DataFrames are typically created by calling a method on +`SessionContext`, such as `read_csv`, and can then be modified +by calling the transformation methods, such as `filter`, `select`, `aggregate`, and `limit` +to build up a query definition. + +The query can be executed by calling the `collect` method. + +The DataFrame struct is part of DataFusion's prelude and can be imported with the following statement. + +```rust +use datafusion::prelude::*; +``` + +Here is a minimal example showing the execution of a query using the DataFrame API. + +```rust +let ctx = SessionContext::new(); +let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; +let df = df.filter(col("a").lt_eq(col("b")))? + .aggregate(vec![col("a")], vec![min(col("b"))])? 
+ .limit(0, Some(100))?; +// Print results +df.show(); +``` + +The DataFrame API is well documented in the [API reference on docs.rs](https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html). + +Refer to the [Expressions Reference](expressions) for available functions for building logical expressions for use with the +DataFrame API. + +## DataFrame Transformations + +These methods create a new DataFrame after applying a transformation to the logical plan that the DataFrame represents. + +DataFusion DataFrames use lazy evaluation, meaning that each transformation is just creating a new query plan and +not actually performing any transformations. This approach allows for the overall plan to be optimized before +execution. The plan is evaluated (executed) when an action method is invoked, such as `collect`. + +| Function | Notes | +| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| aggregate | Perform an aggregate query with optional grouping expressions. | +| distinct | Filter out duplicate rows. | +| except | Calculate the exception of two DataFrames. The two DataFrames must have exactly the same schema | +| filter | Filter a DataFrame to only include rows that match the specified filter expression. | +| intersect | Calculate the intersection of two DataFrames. The two DataFrames must have exactly the same schema | +| join | Join this DataFrame with another DataFrame using the specified columns as join keys. | +| limit | Limit the number of rows returned from this DataFrame. | +| repartition | Repartition a DataFrame based on a logical partitioning scheme. | +| sort | Sort the DataFrame by the specified sorting expressions. Any expression can be turned into a sort expression by calling its `sort` method. | +| select | Create a projection based on arbitrary expressions. 
Example: `df.select(vec![col("c1"), abs(col("c2"))])?` | +| select_columns | Create a projection based on column names. Example: `df.select_columns(&["id", "name"])?`. | +| union | Calculate the union of two DataFrames, preserving duplicate rows. The two DataFrames must have exactly the same schema. | +| union_distinct | Calculate the distinct union of two DataFrames. The two DataFrames must have exactly the same schema. | +| with_column | Add an additional column to the DataFrame. | +| with_column_renamed | Rename one column by applying a new projection. | + +## DataFrame Actions + +These methods execute the logical plan represented by the DataFrame and either collect the results into memory, print them to stdout, or write them to disk. + +| Function | Notes | +| -------------------------- | --------------------------------------------------------------------------------------------------------------------------- | +| collect | Executes this DataFrame and collects all results into a vector of RecordBatch. | +| collect_partitioned | Executes this DataFrame and collects all results into a vector of vector of RecordBatch maintaining the input partitioning. | +| execute_stream | Executes this DataFrame and returns a stream over a single partition. | +| execute_stream_partitioned | Executes this DataFrame and returns one stream per partition. | +| show | Execute this DataFrame and print the results to stdout. | +| show_limit | Execute this DataFrame and print a subset of results to stdout. | +| write_csv | Execute this DataFrame and write the results to disk in CSV format. | +| write_json | Execute this DataFrame and write the results to disk in JSON format. | +| write_parquet | Execute this DataFrame and write the results to disk in Parquet format. 
| + +## Other DataFrame Methods + +| Function | Notes | +| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| explain | Return a DataFrame with the explanation of its plan so far. | +| registry | Return a `FunctionRegistry` used to plan udf's calls. | +| schema | Returns the schema describing the output of this DataFrame in terms of columns returned, where each column has a name, data type, and nullability attribute. | +| to_logical_plan | Return the optimized logical plan represented by this DataFrame. | +| to_unoptimized_plan | Return the unoptimized logical plan represented by this DataFrame. | diff --git a/datafusion/_sources/user-guide/example-usage.md.txt b/datafusion/_sources/user-guide/example-usage.md.txt new file mode 100644 index 000000000000..03283a408c03 --- /dev/null +++ b/datafusion/_sources/user-guide/example-usage.md.txt @@ -0,0 +1,140 @@ + + +# Example Usage + +In this example some simple processing is performed on the [`example.csv`](../../../datafusion/core/tests/data/example.csv) file. 
+ +## Update `Cargo.toml` + +Add the following to your `Cargo.toml` file: + +```toml +datafusion = "16.0" +tokio = "1.0" +``` + +## Run a SQL query against data stored in a CSV: + +```rust +use datafusion::prelude::*; + +#[tokio::main] +async fn main() -> datafusion::error::Result<()> { + // register the table + let ctx = SessionContext::new(); + ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()).await?; + + // create a plan to run a SQL query + let df = ctx.sql("SELECT a, MIN(b) FROM example GROUP BY a LIMIT 100").await?; + + // execute and print results + df.show().await?; + Ok(()) +} +``` + +## Use the DataFrame API to process data stored in a CSV: + +```rust +use datafusion::prelude::*; + +#[tokio::main] +async fn main() -> datafusion::error::Result<()> { + // create the dataframe + let ctx = SessionContext::new(); + let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + + let df = df.filter(col("a").lt_eq(col("b")))? + .aggregate(vec![col("a")], vec![min(col("b"))])? + .limit(0, Some(100))?; + + // execute and print results + df.show().await?; + Ok(()) +} +``` + +## Output from both examples + +```text ++---+--------+ +| a | MIN(b) | ++---+--------+ +| 1 | 2 | ++---+--------+ +``` + +# Identifiers and Capitalization + +Please be aware that all identifiers are effectively made lower-case in SQL, so if your csv file has capital letters (ex: `Name`) you must put your column name in double quotes or the examples won't work. 
+ +To illustrate this behavior, consider the [`capitalized_example.csv`](../../../datafusion/core/tests/data/capitalized_example.csv) file: + +## Run a SQL query against data stored in a CSV: + +```rust +use datafusion::prelude::*; + +#[tokio::main] +async fn main() -> datafusion::error::Result<()> { + // register the table + let ctx = SessionContext::new(); + ctx.register_csv("example", "tests/data/capitalized_example.csv", CsvReadOptions::new()).await?; + + // create a plan to run a SQL query + let df = ctx.sql("SELECT \"A\", MIN(b) FROM example GROUP BY \"A\" LIMIT 100").await?; + + // execute and print results + df.show().await?; + Ok(()) +} +``` + +## Use the DataFrame API to process data stored in a CSV: + +```rust +use datafusion::prelude::*; + +#[tokio::main] +async fn main() -> datafusion::error::Result<()> { + // create the dataframe + let ctx = SessionContext::new(); + let df = ctx.read_csv("tests/data/capitalized_example.csv", CsvReadOptions::new()).await?; + + let df = df.filter(col("A").lt_eq(col("c")))? + .aggregate(vec![col("A")], vec![min(col("b"))])? + .limit(0, Some(100))?; + + // execute and print results + df.show().await?; + Ok(()) +} +``` + +## Output from both examples + +```text ++---+--------+ +| A | MIN(b) | ++---+--------+ +| 2 | 1 | +| 1 | 2 | ++---+--------+ +``` diff --git a/datafusion/_sources/user-guide/expressions.md.txt b/datafusion/_sources/user-guide/expressions.md.txt new file mode 100644 index 000000000000..5dc3520d10ce --- /dev/null +++ b/datafusion/_sources/user-guide/expressions.md.txt @@ -0,0 +1,211 @@ + + +# Expressions + +DataFrame methods such as `select` and `filter` accept one or more logical expressions and there are many functions +available for creating logical expressions. These are documented below. 
+ +Expressions can be chained together using a fluent-style API: + +```rust +// create the expression `(a > 5) AND (b < 7)` +col("a").gt(lit(5)).and(col("b").lt(lit(7))) +``` + +## Identifiers + +| Function | Notes | +| -------- | -------------------------------------------- | +| col | Reference a column in a dataframe `col("a")` | + +## Literal Values + +| Function | Notes | +| -------- | -------------------------------------------------- | +| lit | Literal value such as `lit(123)` or `lit("hello")` | + +## Boolean Expressions + +| Function | Notes | +| -------- | ----------------------------------------- | +| and | `and(expr1, expr2)` or `expr1.and(expr2)` | +| or | `or(expr1, expr2)` or `expr1.or(expr2)` | +| not | `not(expr)` or `expr.not()` | + +## Comparison Expressions + +| Function | Notes | +| -------- | --------------------- | +| eq | `expr1.eq(expr2)` | +| gt | `expr1.gt(expr2)` | +| gt_eq | `expr1.gt_eq(expr2)` | +| lt | `expr1.lt(expr2)` | +| lt_eq | `expr1.lt_eq(expr2)` | +| not_eq | `expr1.not_eq(expr2)` | + +## Math Functions + +In addition to the math functions listed here, some Rust operators are implemented for expressions, allowing +expressions such as `col("a") + col("b")` to be used. 
+ +| Function | Notes | +| --------------------- | ------------------------------------------------- | +| abs(x) | absolute value | +| acos(x) | inverse cosine | +| asin(x) | inverse sine | +| atan(x) | inverse tangent | +| atan2(y, x) | inverse tangent of y / x | +| ceil(x) | nearest integer greater than or equal to argument | +| cos(x) | cosine | +| exp(x) | exponential | +| floor(x) | nearest integer less than or equal to argument | +| ln(x) | natural logarithm | +| log10(x) | base 10 logarithm | +| log2(x) | base 2 logarithm | +| power(base, exponent) | base raised to the power of exponent | +| round(x) | round to nearest integer | +| signum(x) | sign of the argument (-1, 0, +1) | +| sin(x) | sine | +| sqrt(x) | square root | +| tan(x) | tangent | +| trunc(x) | truncate toward zero | + +## Bitwise Operators + +| Operator | Notes | +| -------- | ----------------------------------------------- | +| & | Bitwise AND => `(expr1 & expr2)` | +| &#124; | Bitwise OR => <code>(expr1 &#124; expr2)</code> | +| # | Bitwise XOR => `(expr1 # expr2)` | +| << | Bitwise left shift => `(expr1 << expr2)` | +| >> | Bitwise right shift => `(expr1 >> expr2)` | + +## Conditional Expressions + +| Function | Notes | +| -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| coalesce | Returns the first of its arguments that is not null. Null is returned only if all arguments are null. It is often used to substitute a default value for null values when data is retrieved for display. | +| case | CASE expression. Example: `case(expr).when(expr, expr).when(expr, expr).otherwise(expr).end()`. | +| nullif | Returns a null value if `value1` equals `value2`; otherwise it returns `value1`. This can be used to perform the inverse operation of the `coalesce` expression. 
| + +## String Expressions + +| Function | Notes | +| ---------------- | ----- | +| ascii | | +| bit_length | | +| btrim | | +| char_length | | +| character_length | | +| concat | | +| concat_ws | | +| chr | | +| initcap | | +| left | | +| length | | +| lower | | +| lpad | | +| ltrim | | +| md5 | | +| octet_length | | +| repeat | | +| replace | | +| reverse | | +| right | | +| rpad | | +| rtrim | | +| digest | | +| split_part | | +| starts_with | | +| strpos | | +| substr | | +| translate | | +| trim | | +| upper | | + +## Regular Expressions + +| Function | Notes | +| -------------- | ----- | +| regexp_match | | +| regexp_replace | | + +## Temporal Expressions + +| Function | Notes | +| -------------------- | ------------ | +| date_part | | +| date_trunc | | +| from_unixtime | | +| to_timestamp | | +| to_timestamp_millis | | +| to_timestamp_micros | | +| to_timestamp_seconds | | +| now() | current time | + +## Other Expressions + +| Function | Notes | +| -------- | ----- | +| array | | +| in_list | | +| random | | +| sha224 | | +| sha256 | | +| sha384 | | +| sha512 | | +| struct | | +| to_hex | | + +## Aggregate Functions + +| Function | Notes | +| ---------------------------------- | ----- | +| avg | | +| approx_distinct | | +| approx_median | | +| approx_percentile_cont | | +| approx_percentile_cont_with_weight | | +| count | | +| count_distinct | | +| cube | | +| grouping_set | | +| max | | +| median | | +| min | | +| rollup | | +| sum | | + +## Subquery Expressions + +| Function | Notes | +| --------------- | --------------------------------------------------------------------------------------------- | +| exists | | +| in_subquery | `df1.filter(in_subquery(col("foo"), df2))?` is the equivalent of the SQL `WHERE foo IN ` | +| not_exists | | +| not_in_subquery | | +| scalar_subquery | | + +## User-Defined Function Expressions + +| Function | Notes | +| ----------- | ----- | +| create_udf | | +| create_udaf | | diff --git 
a/datafusion/_sources/user-guide/faq.md.txt b/datafusion/_sources/user-guide/faq.md.txt new file mode 100644 index 000000000000..16a8873fff38 --- /dev/null +++ b/datafusion/_sources/user-guide/faq.md.txt @@ -0,0 +1,31 @@ + + +# Frequently Asked Questions + +## What is the relationship between Apache Arrow, DataFusion, and Ballista? + +Apache Arrow is a library which provides a standardized memory representation for columnar data. It also provides +"kernels" for performing common operations on this data. + +DataFusion is a library for executing queries in-process using the Apache Arrow memory +model and computational kernels. It is designed to run within a single process, using threads +for parallel query execution. + +[Ballista](https://github.com/apache/arrow-ballista) is a distributed compute platform built on DataFusion. diff --git a/datafusion/_sources/user-guide/introduction.md.txt b/datafusion/_sources/user-guide/introduction.md.txt new file mode 100644 index 000000000000..e16504091571 --- /dev/null +++ b/datafusion/_sources/user-guide/introduction.md.txt @@ -0,0 +1,43 @@ + + +# Introduction + +DataFusion is an extensible query execution framework, written in +Rust, that uses [Apache Arrow](https://arrow.apache.org) as its +in-memory format. + +DataFusion supports both an SQL and a DataFrame API for building +logical query plans as well as a query optimizer and execution engine +capable of parallel execution against partitioned data sources (CSV +and Parquet) using threads. + +## Use Cases + +DataFusion is used to create modern, fast and efficient data +pipelines, ETL processes, and database systems, which need the +performance of Rust and Apache Arrow and want to provide their users +the convenience of an SQL interface or a DataFrame API. + +## Why DataFusion? 
+ +- _High Performance_: Leveraging Rust and Arrow's memory model, DataFusion achieves very high performance +- _Easy to Connect_: Being part of the Apache Arrow ecosystem (Arrow, Parquet and Flight), DataFusion works well with the rest of the big data ecosystem +- _Easy to Embed_: Allowing extension at almost any point in its design, DataFusion can be tailored for your specific use case +- _High Quality_: Extensively tested, both by itself and with the rest of the Arrow ecosystem, DataFusion can be used as the foundation for production systems. diff --git a/datafusion/_sources/user-guide/library.md.txt b/datafusion/_sources/user-guide/library.md.txt new file mode 100644 index 000000000000..c7cc1ec425ef --- /dev/null +++ b/datafusion/_sources/user-guide/library.md.txt @@ -0,0 +1,127 @@ + + +# Using DataFusion as a library + +## Create a new project + +```shell +cargo new hello_datafusion +``` + +```shell +$ cd hello_datafusion +$ tree . +. +├── Cargo.toml +└── src + └── main.rs + +1 directory, 2 files +``` + +## Default Configuration + +DataFusion is [published on crates.io](https://crates.io/crates/datafusion), and is [well documented on docs.rs](https://docs.rs/datafusion/). + +To get started, add the following to your `Cargo.toml` file: + +```toml +[dependencies] +datafusion = "16.0" +``` + +## Create a main function + +Update the main.rs file with your first datafusion application based on [Example usage](https://arrow.apache.org/datafusion/user-guide/example-usage.html) + +```rust +use datafusion::prelude::*; + +#[tokio::main] +async fn main() -> datafusion::error::Result<()> { + // register the table + let ctx = SessionContext::new(); + ctx.register_csv("test", "", CsvReadOptions::new()).await?; + + // create a plan to run a SQL query + let df = ctx.sql("SELECT * FROM test").await?; + + // execute and print results + df.show().await?; + Ok(()) +} +``` + +## Extensibility + +DataFusion is designed to be extensible at all points. 
To that end, you can provide your own custom: + +- [x] User Defined Functions (UDFs) +- [x] User Defined Aggregate Functions (UDAFs) +- [x] User Defined Table Source (`TableProvider`) for tables +- [x] User Defined `Optimizer` passes (plan rewrites) +- [x] User Defined `LogicalPlan` nodes +- [x] User Defined `ExecutionPlan` nodes + +## Rust Version Compatibility + +This crate is tested with the latest stable version of Rust. We do not currently test against other, older versions of the Rust compiler. + +## Optimized Configuration + +For an optimized build several steps are required. First, use the below in your `Cargo.toml`. It is +worth noting that using the settings in the `[profile.release]` section will significantly increase the build time. + +```toml +[dependencies] +datafusion = { version = "16.0" , features = ["simd"]} +tokio = { version = "^1.0", features = ["macros", "rt-multi-thread"] } +snmalloc-rs = "0.2" + +[profile.release] +lto = true +codegen-units = 1 +``` + +Then, in `main.rs`, update the memory allocator with the below after your imports: + +```rust +use datafusion::prelude::*; + +#[global_allocator] +static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; + +#[tokio::main] +async fn main() -> datafusion::error::Result<()> { + Ok(()) +} +``` + +Finally, in order to build with the `simd` optimization `cargo nightly` is required. + +```shell +rustup toolchain install nightly +``` + +Based on the instruction set architecture you are building on you will want to configure the `target-cpu` as well, ideally +with `native` or at least `avx2`. 
+ +``` +RUSTFLAGS='-C target-cpu=native' cargo +nightly run --release +``` diff --git a/datafusion/_sources/user-guide/sql/aggregate_functions.md.txt b/datafusion/_sources/user-guide/sql/aggregate_functions.md.txt new file mode 100644 index 000000000000..e8299b6193c2 --- /dev/null +++ b/datafusion/_sources/user-guide/sql/aggregate_functions.md.txt @@ -0,0 +1,68 @@ + + +# Aggregate Functions + +Aggregate functions operate on a set of values to compute a single result. Please refer to [PostgreSQL](https://www.postgresql.org/docs/current/functions-aggregate.html) for usage of standard SQL functions. + +## General + +- min +- max +- count +- avg +- sum +- array_agg + +## Statistical + +- var / var_samp / var_pop +- stddev / stddev_samp / stddev_pop +- covar / covar_samp / covar_pop +- corr + +## Approximate + +### approx_distinct + +`approx_distinct(x) -> uint64` returns the approximate number (HyperLogLog) of distinct input values. + +### approx_median + +`approx_median(x) -> x` returns the approximate median of input values. It is an alias of `approx_percentile_cont(x, 0.5)`. + +### approx_percentile_cont + +`approx_percentile_cont(x, p) -> x` returns the approximate percentile (TDigest) of input values, where `p` is a float64 between 0 and 1 (inclusive). + +It supports raw data as input and builds TDigest sketches during query time, and is approximately equal to `approx_percentile_cont_with_weight(x, 1, p)`. + +`approx_percentile_cont(x, p, n) -> x` returns the approximate percentile (TDigest) of input values, where `p` is a float64 between 0 and 1 (inclusive), + +and `n` (default 100) is the number of centroids in TDigest which means that if there are `n` or fewer unique values in `x`, you can expect an exact result. + +A higher value of `n` results in a more accurate approximation at the cost of higher memory usage. 
+ +### approx_percentile_cont_with_weight + +`approx_percentile_cont_with_weight(x, w, p) -> x` returns the approximate percentile (TDigest) of input values with weight, where `w` is weight column expression and `p` is a float64 between 0 and 1 (inclusive). + +It supports raw data as input or pre-aggregated TDigest sketches, then builds or merges Tdigest sketches during query time. TDigest sketches are a list of centroid `(x, w)`, where `x` stands for mean and `w` stands for weight. + +It is suitable for low latency OLAP system where a streaming compute engine (e.g. Spark Streaming/Flink) pre-aggregates data to a data store, then queries using Datafusion. diff --git a/datafusion/_sources/user-guide/sql/data_types.md.txt b/datafusion/_sources/user-guide/sql/data_types.md.txt new file mode 100644 index 000000000000..1d5c0f9fc078 --- /dev/null +++ b/datafusion/_sources/user-guide/sql/data_types.md.txt @@ -0,0 +1,90 @@ + + +# Data Types + +DataFusion uses Arrow, and thus the Arrow type system, for query +execution. The SQL types from +[sqlparser-rs](https://github.com/sqlparser-rs/sqlparser-rs/blob/main/src/ast/data_type.rs#L27) +are mapped to [Arrow data types](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) according to the following table. +This mapping occurs when defining the schema in a `CREATE EXTERNAL TABLE` command or when performing a SQL `CAST` operation. 
+ +## Character Types + +| SQL DataType | Arrow DataType | +| ------------ | -------------- | +| `CHAR` | `Utf8` | +| `VARCHAR` | `Utf8` | +| `TEXT` | `Utf8` | + +## Numeric Types + +| SQL DataType | Arrow DataType | Notes | +| ------------------------------------ | :---------------------------- | ----------------------------------------------------------------------------------------------------------- | +| `TINYINT` | `Int8` | | +| `SMALLINT` | `Int16` | | +| `INT` or `INTEGER` | `Int32` | | +| `BIGINT` | `Int64` | | +| `TINYINT UNSIGNED` | `UInt8` | | +| `SMALLINT UNSIGNED` | `UInt16` | | +| `INT UNSIGNED` or `INTEGER UNSIGNED` | `UInt32` | | +| `BIGINT UNSIGNED` | `UInt64` | | +| `FLOAT` | `Float32` | | +| `REAL` | `Float32` | | +| `DOUBLE` | `Float64` | | +| `DECIMAL(precision,scale)` | `Decimal128(precision,scale)` | Decimal support is currently experimental ([#3523](https://github.com/apache/arrow-datafusion/issues/3523)) | + +## Date/Time Types + +| SQL DataType | Arrow DataType | +| ------------ | :-------------------------------------- | +| `DATE` | `Date32` | +| `TIME` | `Time64(TimeUnit::Nanosecond)` | +| `TIMESTAMP` | `Timestamp(TimeUnit::Nanosecond, None)` | + +## Boolean Types + +| SQL DataType | Arrow DataType | +| ------------ | :------------- | +| `BOOLEAN` | `Boolean` | + +## Binary Types + +| SQL DataType | Arrow DataType | +| ------------ | :------------- | +| `BYTEA` | `Binary` | + +## Unsupported Types + +| SQL Data Type | Arrow DataType | +| ------------- | :------------------ | +| `UUID` | _Not yet supported_ | +| `BLOB` | _Not yet supported_ | +| `CLOB` | _Not yet supported_ | +| `BINARY` | _Not yet supported_ | +| `VARBINARY` | _Not yet supported_ | +| `REGCLASS` | _Not yet supported_ | +| `NVARCHAR` | _Not yet supported_ | +| `STRING` | _Not yet supported_ | +| `CUSTOM` | _Not yet supported_ | +| `ARRAY` | _Not yet supported_ | +| `ENUM` | _Not yet supported_ | +| `SET` | _Not yet supported_ | +| `INTERVAL` | _Not yet supported_ | +| 
`DATETIME` | _Not yet supported_ | diff --git a/datafusion/_sources/user-guide/sql/ddl.md.txt b/datafusion/_sources/user-guide/sql/ddl.md.txt new file mode 100644 index 000000000000..c531312b1e58 --- /dev/null +++ b/datafusion/_sources/user-guide/sql/ddl.md.txt @@ -0,0 +1,154 @@ + + +# DDL + +## CREATE EXTERNAL TABLE + +Parquet data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement. It is not necessary +to provide schema information for Parquet files. + +```sql +CREATE EXTERNAL TABLE taxi +STORED AS PARQUET +LOCATION '/mnt/nyctaxi/tripdata.parquet'; +``` + +CSV data sources can also be registered by executing a `CREATE EXTERNAL TABLE` SQL statement. The schema will be +inferred based on scanning a subset of the file. + +```sql +CREATE EXTERNAL TABLE test +STORED AS CSV +WITH HEADER ROW +LOCATION '/path/to/aggregate_simple.csv'; +``` + +It is also possible to specify the schema manually. + +```sql +CREATE EXTERNAL TABLE test ( + c1 VARCHAR NOT NULL, + c2 INT NOT NULL, + c3 SMALLINT NOT NULL, + c4 SMALLINT NOT NULL, + c5 INT NOT NULL, + c6 BIGINT NOT NULL, + c7 SMALLINT NOT NULL, + c8 INT NOT NULL, + c9 BIGINT NOT NULL, + c10 VARCHAR NOT NULL, + c11 FLOAT NOT NULL, + c12 DOUBLE NOT NULL, + c13 VARCHAR NOT NULL +) +STORED AS CSV +WITH HEADER ROW +LOCATION '/path/to/aggregate_test_100.csv'; +``` + +If data sources are already partitioned in Hive style, `PARTITIONED BY` can be used for partition pruning. + +``` +/mnt/nyctaxi/year=2022/month=01/tripdata.parquet +/mnt/nyctaxi/year=2021/month=12/tripdata.parquet +/mnt/nyctaxi/year=2021/month=11/tripdata.parquet +``` + +```sql +CREATE EXTERNAL TABLE taxi +STORED AS PARQUET +PARTITIONED BY (year, month) +LOCATION '/mnt/nyctaxi'; +``` + +## CREATE TABLE + +An in-memory table can be created with a query or values list. + +
+CREATE [OR REPLACE] TABLE [IF NOT EXISTS] table_name AS [SELECT | VALUES LIST];
+
+ +```sql +CREATE TABLE IF NOT EXISTS valuetable AS VALUES(1,'HELLO'),(12,'DATAFUSION'); + +CREATE TABLE memtable as select * from valuetable; +``` + +## DROP TABLE + +Removes the table from DataFusion's catalog. + +
+DROP TABLE [ IF EXISTS ] table_name;
+
+ +```sql +CREATE TABLE users AS VALUES(1,2),(2,3); +DROP TABLE users; +-- or use 'if exists' to silently ignore if the table doesn't exist +DROP TABLE IF EXISTS nonexistent_table; +``` + +## CREATE VIEW + +View is a virtual table based on the result of a SQL query. It can be created from an existing table or values list. + +
+CREATE VIEW view_name AS statement;
+
+ +```sql +CREATE TABLE users AS VALUES(1,2),(2,3),(3,4),(4,5); +CREATE VIEW test AS SELECT column1 FROM users; +SELECT * FROM test; ++---------+ +| column1 | ++---------+ +| 1 | +| 2 | +| 3 | +| 4 | ++---------+ +``` + +```sql +CREATE VIEW test AS VALUES(1,2),(5,6); +SELECT * FROM test; ++---------+---------+ +| column1 | column2 | ++---------+---------+ +| 1 | 2 | +| 5 | 6 | ++---------+---------+ +``` + +## DROP VIEW + +Removes the view from DataFusion's catalog. + +
+DROP VIEW [ IF EXISTS ] view_name;
+
+ +```sql +-- drop users_v view from the customer_a schema +DROP VIEW IF EXISTS customer_a.users_v; +``` diff --git a/datafusion/_sources/user-guide/sql/explain.md.txt b/datafusion/_sources/user-guide/sql/explain.md.txt new file mode 100644 index 000000000000..ae0795f9ab4b --- /dev/null +++ b/datafusion/_sources/user-guide/sql/explain.md.txt @@ -0,0 +1,71 @@ + + +# EXPLAIN + +The `EXPLAIN` command shows the logical and physical execution plan for the specified SQL statement. + +
+EXPLAIN [ANALYZE] [VERBOSE] statement
+
+ +## EXPLAIN + +Shows the execution plan of a statement. +If you need more details output, try to use `EXPLAIN VERBOSE`. + +```sql +EXPLAIN SELECT SUM(x) FROM table GROUP BY b; ++---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #SUM(table.x) | +| | Aggregate: groupBy=[[#table.b]], aggr=[[SUM(#table.x)]] | +| | TableScan: table projection=[x, b] | +| physical_plan | ProjectionExec: expr=[SUM(table.x)@1 as SUM(table.x)] | +| | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[SUM(table.x)] | +| | CoalesceBatchesExec: target_batch_size=4096 | +| | RepartitionExec: partitioning=Hash([Column { name: "b", index: 0 }], 16) | +| | AggregateExec: mode=Partial, gby=[b@1 as b], aggr=[SUM(table.x)] | +| | RepartitionExec: partitioning=RoundRobinBatch(16) | +| | CsvExec: source=Path(/tmp/table.csv: [/tmp/table.csv]), has_header=false, limit=None, projection=[x, b] | +| | | ++---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+ +``` + +## EXPLAIN ANALYZE + +Shows the execution plan and metrics of a statement. +If you need more information output, try to use `EXPLAIN ANALYZE VERBOSE`. 
+ +```sql +EXPLAIN ANALYZE SELECT SUM(x) FROM table GROUP BY b; ++-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Plan with Metrics | CoalescePartitionsExec, metrics=[] | +| | ProjectionExec: expr=[SUM(table.x)@1 as SUM(x)], metrics=[] | +| | HashAggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[SUM(x)], metrics=[outputRows=2] | +| | CoalesceBatchesExec: target_batch_size=4096, metrics=[] | +| | RepartitionExec: partitioning=Hash([Column { name: "b", index: 0 }], 16), metrics=[sendTime=839560, fetchTime=122528525, repartitionTime=5327877] | +| | HashAggregateExec: mode=Partial, gby=[b@1 as b], aggr=[SUM(x)], metrics=[outputRows=2] | +| | RepartitionExec: partitioning=RoundRobinBatch(16), metrics=[fetchTime=5660489, repartitionTime=0, sendTime=8012] | +| | CsvExec: source=Path(/tmp/table.csv: [/tmp/table.csv]), has_header=false, metrics=[] | ++-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------+ +``` diff --git a/datafusion/_sources/user-guide/sql/index.rst.txt b/datafusion/_sources/user-guide/sql/index.rst.txt new file mode 100644 index 000000000000..373d60eb1e10 --- /dev/null +++ b/datafusion/_sources/user-guide/sql/index.rst.txt @@ -0,0 +1,32 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. 
"License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +SQL Reference +============= + +.. toctree:: + :maxdepth: 2 + + data_types + select + subqueries + ddl + explain + information_schema + aggregate_functions + scalar_functions + sql_status diff --git a/datafusion/_sources/user-guide/sql/information_schema.md.txt b/datafusion/_sources/user-guide/sql/information_schema.md.txt new file mode 100644 index 000000000000..b3fcc843bd9f --- /dev/null +++ b/datafusion/_sources/user-guide/sql/information_schema.md.txt @@ -0,0 +1,72 @@ + + +# Information Schema + +DataFusion supports showing metadata about the tables and views available. This information can be accessed using the +views of the ISO SQL `information_schema` schema or the DataFusion specific `SHOW TABLES` and `SHOW COLUMNS` commands. 
+ +To show tables in the DataFusion catalog, use the `SHOW TABLES` command or the `information_schema.tables` view: + +```sql +> show tables; +or +> select * from information_schema.tables; ++---------------+--------------------+------------+------------+ +| table_catalog | table_schema | table_name | table_type | ++---------------+--------------------+------------+------------+ +| datafusion | public | t | BASE TABLE | +| datafusion | information_schema | tables | VIEW | +| datafusion | information_schema | views | VIEW | +| datafusion | information_schema | columns | VIEW | ++---------------+--------------------+------------+------------+ + +``` + +To show the schema of a table in DataFusion, use the `SHOW COLUMNS` command or the `information_schema.columns` view: + +```sql +> show columns from t; +or +> select table_catalog, table_schema, table_name, column_name, data_type, is_nullable from information_schema.columns; ++---------------+--------------+------------+-------------+-----------+-------------+ +| table_catalog | table_schema | table_name | column_name | data_type | is_nullable | ++---------------+--------------+------------+-------------+-----------+-------------+ +| datafusion | public | t | Int64(1) | Int64 | NO | ++---------------+--------------+------------+-------------+-----------+-------------+ +``` + +To show the current session configuration options, use the `SHOW ALL` command or the `information_schema.df_settings` view: + +```sql +❯ select * from information_schema.df_settings; + ++-------------------------------------------------+---------+ +| name | setting | ++-------------------------------------------------+---------+ +| datafusion.execution.batch_size | 8192 | +| datafusion.execution.coalesce_batches | true | +| datafusion.execution.coalesce_target_batch_size | 4096 | +| datafusion.execution.time_zone | UTC | +| datafusion.explain.logical_plan_only | false | +| datafusion.explain.physical_plan_only | false | +| 
datafusion.optimizer.filter_null_join_keys | false | +| datafusion.optimizer.skip_failed_rules | true | ++-------------------------------------------------+---------+ +``` diff --git a/datafusion/_sources/user-guide/sql/scalar_functions.md.txt b/datafusion/_sources/user-guide/sql/scalar_functions.md.txt new file mode 100644 index 000000000000..11725f90d93d --- /dev/null +++ b/datafusion/_sources/user-guide/sql/scalar_functions.md.txt @@ -0,0 +1,297 @@ + + +# Scalar Functions + +## Math Functions + +### `abs(x)` + +absolute value + +### `acos(x)` + +inverse cosine + +### `asin(x)` + +inverse sine + +### `atan(x)` + +inverse tangent + +### `atan2(y, x)` + +inverse tangent of y / x + +### `ceil(x)` + +nearest integer greater than or equal to argument + +### `cos(x)` + +cosine + +### `exp(x)` + +exponential + +### `floor(x)` + +nearest integer less than or equal to argument + +### `ln(x)` + +natural logarithm + +### `log10(x)` + +base 10 logarithm + +### `log2(x)` + +base 2 logarithm + +### `power(base, exponent)` + +base raised to the power of exponent + +### `round(x)` + +round to nearest integer + +### `signum(x)` + +sign of the argument (-1, 0, +1) + +### `sin(x)` + +sine + +### `sqrt(x)` + +square root + +### `tan(x)` + +tangent + +### `trunc(x)` + +truncate toward zero + +## Conditional Functions + +### `coalesce` + +Returns the first of its arguments that is not null. Null is returned only if all arguments are null. It is often used to substitute a default value for null values when data is retrieved for display. + +### `nullif` + +Returns a null value if value1 equals value2; otherwise it returns value1. This can be used to perform the inverse operation of the `coalesce` expression. 
| + +## String Functions + +### `ascii` + +### `bit_length` + +### `btrim` + +### `char_length` + +### `character_length` + +### `concat` + +### `concat_ws` + +### `chr` + +### `initcap` + +### `left` + +### `length` + +### `lower` + +### `lpad` + +### `ltrim` + +### `md5` + +### `octet_length` + +### `repeat` + +### `replace` + +### `reverse` + +### `right` + +### `rpad` + +### `rtrim` + +### `digest` + +### `split_part` + +### `starts_with` + +### `strpos` + +### `substr` + +### `translate` + +### `trim` + +### `upper` + +## Regular Expression Functions + +### regexp_match + +### regexp_replace + +## Temporal Functions + +### `to_timestamp` + +`to_timestamp()` is similar to the standard SQL function. It performs conversions to type `Timestamp(Nanoseconds, None)`, from: + +- Timestamp strings + - `1997-01-31T09:26:56.123Z` # RFC3339 + - `1997-01-31T09:26:56.123-05:00` # RFC3339 + - `1997-01-31 09:26:56.123-05:00` # close to RFC3339 but with a space rather than T + - `1997-01-31T09:26:56.123` # close to RFC3339 but no timezone offset specified + - `1997-01-31 09:26:56.123` # close to RFC3339 but uses a space and no timezone offset + - `1997-01-31 09:26:56` # close to RFC3339, no fractional seconds +- An Int64 array/column, values are nanoseconds since Epoch UTC +- Other Timestamp() columns or values + +Note that conversions from other Timestamp and Int64 types can also be performed using `CAST(.. AS Timestamp)`. However, the conversion functionality here is present for consistency with the other `to_timestamp_xx()` functions. 
+ +### `to_timestamp_millis` + +`to_timestamp_millis()` does conversions to type `Timestamp(Milliseconds, None)`, from: + +- Timestamp strings, the same as supported by the regular `to_timestamp()` function (except the output is a timestamp of millisecond resolution) + - `1997-01-31T09:26:56.123Z` # RFC3339 + - `1997-01-31T09:26:56.123-05:00` # RFC3339 + - `1997-01-31 09:26:56.123-05:00` # close to RFC3339 but with a space rather than T + - `1997-01-31T09:26:56.123` # close to RFC3339 but no timezone offset specified + - `1997-01-31 09:26:56.123` # close to RFC3339 but uses a space and no timezone offset + - `1997-01-31 09:26:56` # close to RFC3339, no fractional seconds +- An Int64 array/column, values are milliseconds since Epoch UTC +- Other Timestamp() columns or values + +Note that `CAST(.. AS Timestamp)` converts to Timestamps with Nanosecond resolution; this function is the only way to convert/cast to millisecond resolution. + +### `to_timestamp_micros` + +`to_timestamp_micros()` does conversions to type `Timestamp(Microseconds, None)`, from: + +- Timestamp strings, the same as supported by the regular `to_timestamp()` function (except the output is a timestamp of microsecond resolution) + - `1997-01-31T09:26:56.123Z` # RFC3339 + - `1997-01-31T09:26:56.123-05:00` # RFC3339 + - `1997-01-31 09:26:56.123-05:00` # close to RFC3339 but with a space rather than T + - `1997-01-31T09:26:56.123` # close to RFC3339 but no timezone offset specified + - `1997-01-31 09:26:56.123` # close to RFC3339 but uses a space and no timezone offset + - `1997-01-31 09:26:56` # close to RFC3339, no fractional seconds +- An Int64 array/column, values are microseconds since Epoch UTC +- Other Timestamp() columns or values + +Note that `CAST(.. AS Timestamp)` converts to Timestamps with Nanosecond resolution; this function is the only way to convert/cast to microsecond resolution. 
+ +### `to_timestamp_seconds` + +`to_timestamp_seconds()` does conversions to type `Timestamp(Seconds, None)`, from: + +- Timestamp strings, the same as supported by the regular `to_timestamp()` function (except the output is a timestamp of seconds resolution) + - `1997-01-31T09:26:56.123Z` # RFC3339 + - `1997-01-31T09:26:56.123-05:00` # RFC3339 + - `1997-01-31 09:26:56.123-05:00` # close to RFC3339 but with a space rather than T + - `1997-01-31T09:26:56.123` # close to RFC3339 but no timezone offset specified + - `1997-01-31 09:26:56.123` # close to RFC3339 but uses a space and no timezone offset + - `1997-01-31 09:26:56` # close to RFC3339, no fractional seconds +- An Int64 array/column, values are seconds since Epoch UTC +- Other Timestamp() columns or values + +Note that `CAST(.. AS Timestamp)` converts to Timestamps with Nanosecond resolution; this function is the only way to convert/cast to seconds resolution. + +### `extract` + +`extract(field FROM source)` + +- The `extract` function retrieves subfields such as year or hour from date/time values. + `source` must be a value expression of type timestamp, Date32, or Date64. `field` is an identifier that selects what field to extract from the source value. + The `extract` function returns values of type u32. 
+ - `year` :`extract(year FROM to_timestamp('2020-09-08T12:00:00+00:00')) -> 2020` + - `month`:`extract(month FROM to_timestamp('2020-09-08T12:00:00+00:00')) -> 9` + - `week` :`extract(week FROM to_timestamp('2020-09-08T12:00:00+00:00')) -> 37` + - `day`: `extract(day FROM to_timestamp('2020-09-08T12:00:00+00:00')) -> 8` + - `hour`: `extract(hour FROM to_timestamp('2020-09-08T12:00:00+00:00')) -> 12` + - `minute`: `extract(minute FROM to_timestamp('2020-09-08T12:01:00+00:00')) -> 1` + - `second`: `extract(second FROM to_timestamp('2020-09-08T12:00:03+00:00')) -> 3` + +### `date_part` + +`date_part('field', source)` + +- The `date_part` function is modeled on the postgres equivalent to the SQL-standard function `extract`. + Note that here the field parameter needs to be a string value, not a name. + The valid field names for `date_part` are the same as for `extract`. + - `date_part('second', to_timestamp('2020-09-08T12:00:12+00:00')) -> 12` + +### `date_trunc` + +### `date_bin` + +### `from_unixtime` + +### `now` + +Returns current time as `Timestamp(Nanoseconds, UTC)`. Returns same value for the function +wherever it appears in the statement, using a value chosen at planning time. + +## Other Functions + +### `array` + +### `in_list` + +### `random` + +### `sha224` + +### `sha256` + +### `sha384` + +### `sha512` + +### `struct` + +### `to_hex` diff --git a/datafusion/_sources/user-guide/sql/select.md.txt b/datafusion/_sources/user-guide/sql/select.md.txt new file mode 100644 index 000000000000..3eea252d7080 --- /dev/null +++ b/datafusion/_sources/user-guide/sql/select.md.txt @@ -0,0 +1,226 @@ + + +# SELECT syntax + +The queries in DataFusion scan data from tables and return 0 or more rows. +Please be aware that column names in queries are made lower-case, but not on the inferred schema. Accordingly, if you +want to query against a capitalized field, make sure to use double quotes. 
Please see this +[example](https://arrow.apache.org/datafusion/user-guide/example-usage.html) for clarification. +In this documentation we describe the SQL syntax in DataFusion. + +DataFusion supports the following syntax for queries: + + +[ [WITH](#with-clause) with_query [, ...] ]
+[SELECT](#select-clause) [ ALL | DISTINCT ] select_expr [, ...]
+[ [FROM](#from-clause) from_item [, ...] ]
+[ [JOIN](#join-clause) join_item [, ...] ]
+[ [WHERE](#where-clause) condition ]
+[ [GROUP BY](#group-by-clause) grouping_element [, ...] ]
+[ [HAVING](#having-clause) condition]
+[ [UNION](#union-clause) [ ALL ] select ]
+[ [ORDER BY](#order-by-clause) expression [ ASC | DESC ][, ...] ]
+[ [LIMIT](#limit-clause) count ]
+ +
+ +## WITH clause + +A with clause allows to give names for queries and reference them by name. + +```sql +WITH x AS (SELECT a, MAX(b) AS b FROM t GROUP BY a) +SELECT a, b FROM x; +``` + +## SELECT clause + +Example: + +```sql +SELECT a, b, a + b FROM table +``` + +The `DISTINCT` quantifier can be added to make the query return all distinct rows. +By default `ALL` will be used, which returns all the rows. + +```sql +SELECT DISTINCT person, age FROM employees +``` + +## FROM clause + +Example: + +```sql +SELECT t.a FROM table AS t +``` + +## WHERE clause + +Example: + +```sql +SELECT a FROM table WHERE a > 10 +``` + +## JOIN clause + +DataFusion supports `INNER JOIN`, `LEFT OUTER JOIN`, `RIGHT OUTER JOIN`, `FULL OUTER JOIN`, and `CROSS JOIN`. + +The following examples are based on this table: + +```sql +select * from x; ++----------+----------+ +| column_1 | column_2 | ++----------+----------+ +| 1 | 2 | ++----------+----------+ +``` + +### INNER JOIN + +The keywords `JOIN` or `INNER JOIN` define a join that only shows rows where there is a match in both tables. + +```sql +❯ select * from x inner join x y ON x.column_1 = y.column_1; ++----------+----------+----------+----------+ +| column_1 | column_2 | column_1 | column_2 | ++----------+----------+----------+----------+ +| 1 | 2 | 1 | 2 | ++----------+----------+----------+----------+ +``` + +### LEFT OUTER JOIN + +The keywords `LEFT JOIN` or `LEFT OUTER JOIN` define a join that includes all rows from the left table even if there +is not a match in the right table. When there is no match, null values are produced for the right side of the join. 
+ +```sql +❯ select * from x left join x y ON x.column_1 = y.column_2; ++----------+----------+----------+----------+ +| column_1 | column_2 | column_1 | column_2 | ++----------+----------+----------+----------+ +| 1 | 2 | | | ++----------+----------+----------+----------+ +``` + +### RIGHT OUTER JOIN + +The keywords `RIGHT JOIN` or `RIGHT OUTER JOIN` define a join that includes all rows from the right table even if there +is not a match in the left table. When there is no match, null values are produced for the left side of the join. + +```sql +❯ select * from x right join x y ON x.column_1 = y.column_2; ++----------+----------+----------+----------+ +| column_1 | column_2 | column_1 | column_2 | ++----------+----------+----------+----------+ +| | | 1 | 2 | ++----------+----------+----------+----------+ +``` + +### FULL OUTER JOIN + +The keywords `FULL JOIN` or `FULL OUTER JOIN` define a join that is effectively a union of a `LEFT OUTER JOIN` and +`RIGHT OUTER JOIN`. It will show all rows from the left and right side of the join and will produce null values on +either side of the join where there is not a match. + +```sql +❯ select * from x full outer join x y ON x.column_1 = y.column_2; ++----------+----------+----------+----------+ +| column_1 | column_2 | column_1 | column_2 | ++----------+----------+----------+----------+ +| 1 | 2 | | | +| | | 1 | 2 | ++----------+----------+----------+----------+ +``` + +### CROSS JOIN + +A cross join produces a cartesian product that matches every row in the left side of the join with every row in the +right side of the join. 
+ +```sql +❯ select * from x cross join x y; ++----------+----------+----------+----------+ +| column_1 | column_2 | column_1 | column_2 | ++----------+----------+----------+----------+ +| 1 | 2 | 1 | 2 | ++----------+----------+----------+----------+ +``` + +## GROUP BY clause + +Example: + +```sql +SELECT a, b, MAX(c) FROM table GROUP BY a, b +``` + +## HAVING clause + +Example: + +```sql +SELECT a, b, MAX(c) FROM table GROUP BY a, b HAVING MAX(c) > 10 +``` + +## UNION clause + +Example: + +```sql +SELECT + a, + b, + c +FROM table1 +UNION ALL +SELECT + a, + b, + c +FROM table2 +``` + +## ORDER BY clause + +Orders the results by the referenced expression. By default it uses ascending order (`ASC`). +This order can be changed to descending by adding `DESC` after the order-by expressions. + +Examples: + +```sql +SELECT age, person FROM table ORDER BY age; +SELECT age, person FROM table ORDER BY age DESC; +SELECT age, person FROM table ORDER BY age, person DESC; +``` + +## LIMIT clause + +Limits the number of rows to be a maximum of `count` rows. `count` should be a non-negative integer. 
+ +Example: + +```sql +SELECT age, person FROM table +LIMIT 10 +``` diff --git a/datafusion/_sources/user-guide/sql/sql_status.md.txt b/datafusion/_sources/user-guide/sql/sql_status.md.txt new file mode 100644 index 000000000000..b260ecb4bae9 --- /dev/null +++ b/datafusion/_sources/user-guide/sql/sql_status.md.txt @@ -0,0 +1,135 @@ + + +# Status + +## General + +- [x] SQL Parser +- [x] SQL Query Planner +- [x] Query Optimizer +- [x] Constant folding +- [x] Join Reordering +- [x] Limit Pushdown +- [x] Projection push down +- [x] Predicate push down +- [x] Type coercion +- [x] Parallel query execution + +## SQL Support + +- [x] Projection +- [x] Filter (WHERE) +- [x] Filter post-aggregate (HAVING) +- [x] Limit +- [x] Aggregate +- [x] Common math functions +- [x] cast +- [x] try_cast +- [x] [`VALUES` lists](https://www.postgresql.org/docs/current/queries-values.html) +- Postgres compatible String functions + - [x] ascii + - [x] bit_length + - [x] btrim + - [x] char_length + - [x] character_length + - [x] chr + - [x] concat + - [x] concat_ws + - [x] initcap + - [x] left + - [x] length + - [x] lpad + - [x] ltrim + - [x] octet_length + - [x] regexp_replace + - [x] repeat + - [x] replace + - [x] reverse + - [x] right + - [x] rpad + - [x] rtrim + - [x] split_part + - [x] starts_with + - [x] strpos + - [x] substr + - [x] to_hex + - [x] translate + - [x] trim +- Conditional functions + - [x] nullif + - [x] case + - [x] coalesce +- Approximation functions + - [x] approx_distinct + - [x] approx_median + - [x] approx_percentile_cont + - [x] approx_percentile_cont_with_weight +- Common date/time functions + - [ ] Basic date functions + - [ ] Basic time functions + - [x] Basic timestamp functions + - [x] [to_timestamp](./scalar_functions.md#to_timestamp) + - [x] [to_timestamp_millis](./scalar_functions.md#to_timestamp_millis) + - [x] [to_timestamp_micros](./scalar_functions.md#to_timestamp_micros) + - [x] [to_timestamp_seconds](./scalar_functions.md#to_timestamp_seconds) + - [x] 
[extract](./scalar_functions.md#extract) + - [x] [date_part](./scalar_functions.md#date_part) +- nested functions + - [x] Array of columns +- [x] Schema Queries + - [x] SHOW TABLES + - [x] SHOW COLUMNS FROM + - [x] SHOW CREATE TABLE + - [x] information_schema.{tables, columns, views} + - [ ] information_schema other views +- [x] Sorting +- [ ] Nested types +- [ ] Lists +- [x] Subqueries +- [x] Common table expressions +- [x] Set Operations + - [x] UNION ALL + - [x] UNION + - [x] INTERSECT + - [x] INTERSECT ALL + - [x] EXCEPT + - [x] EXCEPT ALL +- [x] Joins + - [x] INNER JOIN + - [x] LEFT JOIN + - [x] RIGHT JOIN + - [x] FULL JOIN + - [x] CROSS JOIN +- [ ] Window + - [x] Empty window + - [x] Common window functions + - [x] Window with PARTITION BY clause + - [x] Window with ORDER BY clause + - [ ] Window with FILTER clause + - [ ] [Window with custom WINDOW FRAME](https://github.com/apache/arrow-datafusion/issues/361) + - [ ] UDF and UDAF for window functions + +## Data Sources + +- [x] CSV +- [x] Parquet primitive types +- [ ] Parquet nested types +- [x] JSON +- [x] Avro diff --git a/datafusion/_sources/user-guide/sql/subqueries.md.txt b/datafusion/_sources/user-guide/sql/subqueries.md.txt new file mode 100644 index 000000000000..478fab7e7c2d --- /dev/null +++ b/datafusion/_sources/user-guide/sql/subqueries.md.txt @@ -0,0 +1,98 @@ + + +# Subqueries + +DataFusion supports `EXISTS`, `NOT EXISTS`, `IN`, `NOT IN` and Scalar Subqueries. + +The examples below are based on the following table. + +```sql +❯ select * from x; ++----------+----------+ +| column_1 | column_2 | ++----------+----------+ +| 1 | 2 | ++----------+----------+ +``` + +## EXISTS + +The `EXISTS` syntax can be used to find all rows in a relation where a correlated subquery produces one or more matches +for that row. Only correlated subqueries are supported. 
+ +```sql +❯ select * from x y where exists (select * from x where x.column_1 = y.column_1); ++----------+----------+ +| column_1 | column_2 | ++----------+----------+ +| 1 | 2 | ++----------+----------+ +1 row in set. +``` + +## NOT EXISTS + +The `NOT EXISTS` syntax can be used to find all rows in a relation where a correlated subquery produces zero matches +for that row. Only correlated subqueries are supported. + +```sql +❯ select * from x y where not exists (select * from x where x.column_1 = y.column_1); +0 rows in set. +``` + +## IN + +The `IN` syntax can be used to find all rows in a relation where a given expression's value can be found in the +results of a correlated subquery. + +```sql +❯ select * from x where column_1 in (select column_1 from x); ++----------+----------+ +| column_1 | column_2 | ++----------+----------+ +| 1 | 2 | ++----------+----------+ +1 row in set. +``` + +## NOT IN + +The `NOT IN` syntax can be used to find all rows in a relation where a given expression's value can not be found in the +results of a correlated subquery. + +```sql +❯ select * from x where column_1 not in (select column_1 from x); +0 rows in set. +``` + +## Scalar Subquery + +A scalar subquery can be used to produce a single value that can be used in many different contexts in a query. Here +is an example of a filter using a scalar subquery. Only correlated subqueries are supported. + +```sql +❯ select * from x y where column_1 < (select sum(column_2) from x where x.column_1 = y.column_1); ++----------+----------+ +| column_1 | column_2 | ++----------+----------+ +| 1 | 2 | ++----------+----------+ +1 row in set. +``` diff --git a/datafusion/contributor-guide/roadmap.html b/datafusion/contributor-guide/roadmap.html index 5516bc76e6cf..0e85413f8296 100644 --- a/datafusion/contributor-guide/roadmap.html +++ b/datafusion/contributor-guide/roadmap.html @@ -421,7 +421,7 @@

Additional SQL Language Features

  • More sophisticated cost based optimizer for join ordering

  • -
  • Implement advanced query optimization framework (Tokomak) #440

  • +
  • Implement advanced query optimization framework (Tokomak) #440

  • Finer optimizations for group by and aggregate functions

@@ -436,8 +436,8 @@

Datasources

  • Migrate to some sort of arrow2 based implementation (see milestone for more details)

  • -
  • Add DataFusion to h2oai/db-benchmark 147

  • -
  • Improve build time 348

  • +
  • Add DataFusion to h2oai/db-benchmark #147

  • +
  • Improve build time #348

diff --git a/datafusion/objects.inv b/datafusion/objects.inv index ce6713b92ca1f6409a8e2c0a0671d38f8b9a4479..0b79f2c92d91b2a85d95f8330d5abff3293868af 100644 GIT binary patch delta 2884 zcmV-K3%m56C%GrElL&uu8OfkP%u|RRs7zigDh81*Xf!Ter<;9q(Bit8t4;WBv$lk5^+*7xSh`s-X|~ zF!W)SkDpQfb<2OHRQt`-0BnMU?sJ-LBVsGCM`9a87;DH4P7?}jvt2B5;>Bi9=uWNK zuww%=J138Uaf(r3-{2o_79H0l{`C#*EVB(BR?>apB8?*#OWIhj!S~w5^qzl4mwD}6 z>Y2<=4UXq(GL=HlCO`V!`w74ZRZo|8ni4$JV^Kb->|=k3YqmIf6U#FN0qdr4tUahh zRINenFu|nfA_wQ<=Bm4{#3=Z*K15gV7@3iO42yjTF9A;p3W&dh_z?QhxWFjiy37fu zX(UC$`w2^v9sz$eZp377&1iaAFo8d7W zL9HIfj7)!o=WZcQUPM$%2c;hF$~>~~`f)}NI+ch-?; zf%e(Rr!avQxyrYG<}6d;oH#!>A?i{wW$2T5o@jrX!N)YopY-9?1gBI^^FM*J{@g^5 zNUt{4fw{oIeBuiiBSLdnCmZ%VQQ*=p_K~WU{d^Z@{81KrDWOB8S6jHi)t2_z3>=w_ zpQafz_ibc;01K`wV^6Ei!YoT?Y3M_1>imNqDON{m{@G54!2$5$+ieFK`MSka$l257 zbM${Rr^{!wu%{b{e5NTH3N;w)z@FjVoWy>2m^`pCtJ4Gi*tOR+IF;WS_1kl2|OJceQ0#D^Rv^ro<9+8LD5x%pnhf%da zmO4dmFT=_V^F4ht4LN3}*6C%}) z&W%qTr@+;o&+SszR$O%^~-+%h#eg z^AzOrmp@0p)C0+fZWne~#wSACwg36C=M>g)7(8c!&yaY|tPG0n>%#_5(Iqo=!MuM- znuOORJyJJQVz~)La+D#KqC@Cr1dQ(k9E3&k0k=UMWlh7Sw^XokiKFa$z@11Bdl_zb zZ)&UA3}tzjrJdps#+YA^FxC*9HupsNtxWWY*Gswre>PX5WPMK6SGG;CZtnZ?SXDk0 zhfK~@ZcU(OS&-e~^E92ANlTz&~wU><@UWHoO_kPLQ0iJ*WG z?xKC{S@ld2*0aFwnk^XhE#fr2J~Zm3CV2}BlJb{3JZVbMP0CoRFz}G=BEOm^ie#DO zWd=k40(w>MvP1NJ<1!(?iRQ}PCb9+Y{AvW*>YiiYQ{jAPAs(aoI7<* z*u@`r7pt$QSM2WeN#g2(*r-rH|iY$czhl1U(kJoWC7 zht_eCDbjx6Y7(e+XKQn$V;QL@g=*K)vnn+CF`8wI%vU8zW3YZuo2=%ul#|1p0Sfz@ zkNN9t6i1~H#$OryzI%O7k_l2j9l)UwT&PM)iGz=l#_3;@Odf=xkBooD>0g>mn0Tcd z;4t0bLQS_qBqs;%C5BLrSbg{I@PW}aeFd!=F?Nq(@es*mTkJgK0L(48;UTFODvW&u zHBSFhWC9sRKFNqUNJhL6lM(yz{^^M5Dj{1;RAD_}I8NP+L0TrXIcat%FKGp8iX@PV zNalg`QK_KS4NWfzMB#s>rh(={S9;jz-eP>8vzmn{S#OV^%`8sJkMOiNSva-1o^7FN z3h6B&4oM##7kaDXQhGFt(29esmkCw)3TkZ4LY6EPIQeQCaKl4V4YxA)ilqUX3q=XK z;66D8V31SbmzYyvV2m}Pmy$u+D@E)nN>k+%D-DEr=_@g2xZQvDp-jS8{n0NEP2Blg z<2!5=`7n7H{x=SiFrNKKq-CIqg+ycMC zG{R13MD2n`>Gs#ZdC4FUmGDXzJMq*yF;XXFrM*;MN_+i9roANfi=s?yd{hvIKO_y$ 
z?z+K6)b2WQv*3TU9(mW`qy?MbePjUze-wcv-FpZi3jRm}WfX-^J{ksSE)=B{MhHiY z9W}ffakRNe+Ip|%FePER%smf6z z*`z)N)dJ0hq$Ias*#i`~z~|)~I5+S%mxH#FStPv1gI0gNBIxoFE$HpnJt{BG(hc_k zyX8`dmh>9OTA;ZQmAqd_^(Gu%0q!{R6EjGAC5bI=k(iLuI5oD*VN@n9lZ{X3L<6CB zqOZj6iSp|=XqY1Q8Z8RAE*s=nEBj7KMfH zJd*~SiKd2!r24E1uh;<&qX!pi{0cBB*IUPY^fa1Vbj?H))(2#cs$cpP_JoaF{Zgnb z^;9=NybzU2uW7Fuh!>Jl`3Ql1lq^tRC`z77YG!{$jlQ5(pC&pcyNr<32a=q!Op!k9 zq*t_4jzEJaD&gcG6suE?4b`I1H3mDQTO)Ez1bAhe3FP=XirO%LucxLAk_r zo1-nNXvVlnHsJ?PqNV-36Nc2-`i>^))gRR`YHw7pmUtC81Y%u-S7L2bW|`L{qrvIl zfsB9ts?yvZ8y&#zthuz$fw8W)yr?~WSpYE@&*vgN8Ief-yA40qy1f9T9ju8aC|`u%_M>*qzO5I+m=mX0mR<=3x^sfPhwqteD8 z`(iY3&BiBB^l?`gpYBL`A_`;ZsXIoi1G>7rzPwwESgP=A$?kctkGol&L!1bCPtTM1 zB;^3Oy8QfwmYC+wHHux7tpk^$XF;Z_bd|CwVs%?4=0D9jMXCe3|9W||ocH%Tsic3a zP6zbk%f;$qSqBdkxyrOKKQ8aSOo=>44|qGk!X7-cZX4$rlNA5=qO;8{*w>polI;j_wBJHdVJRMV i(zg17h?7hoYv+5b505MecXJb2vw^j;*8c(U{l1P%=CJPo delta 2858 zcmV+_3)S?wC!Z&D&gMHnSx(gm~X7Q^drEQU@&NyrbM&Q zLof@SV^=7V=+}mc@hL1Oe zX_$LDVN4FP?c}gnb;CAIFrP22Lx~=cE(ng$&4!M*vSTX96VQ~5PV&uLu)uWLcNP6Y0Mwt?eS_9=wjY9Nj3NYABH}x^6@jOzjC;gYQK3JfQ^9Ay*{&XLu>{1 z32bu&y?wdCi8_CQt)`1bPQ2Lg2;HqTTV`xvW@p7QFi!FbY@Pq{X3=p?;$Pp;&NAEJ zaUtCsEz&r0v80XV8ho!^O#k&~beY%ArJhUd)ZlosB~vN%E%Kw^y`KP#5%q*+rzyc> zH5TQQ%07m;W{Ztk&7o{bs# z$FSIk@DlKxo`Co}h!3G3jSGykt;?KnnnqG2yq~Z%=@IZp<3>#O)|{f31rzwQX8w_C zM^f);_B3Ri;H+^cn&Ktj2GL|%{Rg^C8`SDy%*aG|@)gqLMMR}^Q0n2X%p?1*A7}KS zGrRLC!aaZ7H9V3EuIb%+y|W?Ibct#24=R@uPp5iGkAYXljVPZNdQk4AK7w6qC#rV& zXgV!^>SO4&btGC)$T&-tZ(T8pzO#-*3$)KhK7|Ri$W^}eGiR9!XQlbM2~n4dDMO#c z^F-4OKBh_jq>qm#MRZ`N`Jcd9fAFD4q}O}uz+8V|U_SAUi4mbWtdkA}KY#^ScCn|`W?`14vpDmiHF^C( zj})uJEdOk$!{7+^@a?vPjC@~WD&*|>?m7CI^W8IA*wYO}KGPHpg&GWYV9)SwPGY}1 zOdfxknAPb4f9%>T3!T>yg^x0i!T@7~f6Jh7RBt0|ZTx0~b(~O!YEYC>(Ou7?NsmL% zjca$BbVA%7Z+x6znd*pkAs6kZ36W|~<;H&}j#J=jSLTKkPkoaA1^SJ4Q7wbOE|H$W z>#qTPH(q8GP|PQXo!4(wr8(rDcKKQ~XP$yw{_^MOmwF)i(Cxww%lJfSyZSyq_ME~x z4uj`R@EH=%nUz7YeSO&AIkse`E|@n-lkl3PN9txuEH|M@jxxkjbO_yyfbo5RgRp-{ 
zKHx@#qpWGT^ri+jE^(B754acTVK2k&?oDkq+mS5qvb0ki!Wi@G5yl#V)8?Kizm01EDN$be4eH=GieD_th}M0ibYSL z3;d`bNP2b-%tNq)tme%KlEF?W5fpzA!dlwbG6{d=kf+`q^3Xc4F-6)BTulPi?rd!iRV*X*q)_cTdRl}g zKSr}`k@>D8X$;m6YLnG`7;j!uTtL-*>O?Nisp|rvo??f(uni zDRJ;o(m4G~lF5TG^pVjx{Y#Sx6R&gw9HtvwsOeUSF+rmvt? zBgXDAEFL16Y>S zR+KP=nZ-%@5uShcCJU!F*Rw4&O(DG{#3AX!<3ewBTuP5-5n6GO^)jIfUqOwn zS;&%w0w-Th18#Uos^M1VUa>SlbD=0f7u+YO01R>p{1S5t42-cR^incNd!>jyMQN&h zVx@r)FMTD(47b}plu7uiKfC3ji93I5e20x9A0`jO|A#>m#?ycIh_nnev9JjGA4VgJ z8Kk{Z#3K3R6DT(jgJb#@Q0+~*MWjG}+=$w$Kg&4r?r!U*Arv7?4pBaSu~Nn7vLJSMC38O&Z!r_7RJ5ykJi z;^w?Gk%>>euSs~9MZ3L2w-2mxBvm;|B%9Qypjx20kd)*WEPH?g7x=t<1Lp?b=5o+h zGK+-Qc+jd>1YJI&1-<>cN9Dy?y5T-xw_FO*l3wFj3p9ThqLTLusosRcE5IE`eqsh` zuOzX>EfNz_8mGo~IgHArWwP<D{gz7Umu`)|)UX`sFkm45&A#7{{B^@XGqZj&{ey!uo*BQT0oo!k(~kt6vJ0rJm{rh!>(#={4oc}vrT61fd+mN%>&3euL=VrK@t5MMF5kVx%r7>YkqK)~qy18cNentfU*JfJc zQn^!H`nXT`S6A%j{_bM65UVYW=raCzaE)0X_jz@5f6G4pYcYm9@uEL9<|fxgeL4Mp z`StUnREQsDcT2|>-vSU#3K!qX&OH zCsWSuDy;+AR!`qgQ}zE>(!9}K@-A@dt^j_V${uBK5^3XbfAiYh zFNig-gsfvA6Ys`+P$HZ?{S+|Yz zj7f_B{m|Lw7VPWI9m#ftINEO^sIVdw4?AgFeL=)YrjNDrz14?D7KFRGiLBYcT3PG= I0PrM_&@% +
  • + + Selecting files directly + +
  • Registering Parquet Data Sources @@ -367,26 +372,43 @@ -->

    DataFusion Command-line SQL Utility

    -

    The DataFusion CLI is a command-line interactive SQL utility that allows -queries to be executed against any supported data files. It is a convenient way to +

    The DataFusion CLI is a command-line interactive SQL utility for executing +queries against any supported data files. It is a convenient way to try DataFusion out with your own data sources, and test out its SQL support.

    Example

    Create a CSV file to query.

    -
    $ echo "1,2" > data.csv
    +
    $ echo "a,b" > data.csv
    +$ echo "1,2" >> data.csv
     
    -
    $ datafusion-cli
    -DataFusion CLI v12.0.0
    -❯ CREATE EXTERNAL TABLE foo STORED AS CSV LOCATION 'data.csv';
    -0 rows in set. Query took 0.017 seconds.
    -❯ select * from foo;
    -+----------+----------+
    -| column_1 | column_2 |
    -+----------+----------+
    -| 1        | 2        |
    -+----------+----------+
    -1 row in set. Query took 0.012 seconds.
    +

    Query that single file (the CLI also supports parquet, compressed csv, avro, json and more)

    +
    $ datafusion-cli
+DataFusion CLI v16.0.0
    +❯ select * from 'data.csv';
    ++---+---+
    +| a | b |
    ++---+---+
    +| 1 | 2 |
    ++---+---+
    +1 row in set. Query took 0.007 seconds.
    +
    +
    +

    You can also query directories of files with compatible schemas:

    +
    $ ls data_dir/
    +data.csv   data2.csv
    +
    +
    +
    $ datafusion-cli
    +DataFusion CLI v16.0.0
    +❯ select * from 'data_dir';
    ++---+---+
    +| a | b |
    ++---+---+
    +| 3 | 4 |
    +| 1 | 2 |
    ++---+---+
    +2 rows in set. Query took 0.007 seconds.
     
    @@ -430,6 +452,7 @@

    Run using Docker

    Usage

    +

    See the current usage using datafusion-cli --help:

    +
    +

    Selecting files directly

    +

    Files can be queried directly by enclosing the file or +directory name in single ' quotes as shown in the example.

    +

It is also possible to create a table backed by files +explicitly via CREATE EXTERNAL TABLE as shown below.

    +

    Registering Parquet Data Sources

    Parquet data sources can be registered by executing a CREATE EXTERNAL TABLE SQL statement. It is not necessary to provide schema information for Parquet files.

    diff --git a/datafusion/user-guide/configs.html b/datafusion/user-guide/configs.html index 3ae271b6fb0d..b340d1022082 100644 --- a/datafusion/user-guide/configs.html +++ b/datafusion/user-guide/configs.html @@ -321,116 +321,130 @@

    Configuration Settings

  • - + + + + + + + + + + + + + + + + - - + - - + - + - - + + + + + - + - - + - - - - - - + - - + - + - - - - + + + - - - - + + + - - + - + - - - - + + + - - - - - - - + - - - - + + + - - + - + - - + - + + + + + - - - - - + + + - - - - + + + - - + - + - - + + + + + + + + + - - + - + + + + + + + + + + + + +

    key

    type

    default

    description

    datafusion.catalog.create_default_catalog_and_schema

    true

    Whether the default catalog and schema should be created automatically.

    datafusion.catalog.default_catalog

    datafusion

    The default catalog name - this impacts what SQL queries use if not specified

    datafusion.catalog.default_schema

    public

    The default schema name - this impacts what SQL queries use if not specified

    datafusion.catalog.information_schema

    false

    Should DataFusion provide access to information_schema virtual tables for displaying schema information

    datafusion.catalog.location

    Utf8

    NULL

    Location scanned to load tables for default schema, defaults to None

    Location scanned to load tables for default schema

    datafusion.catalog.type

    Utf8

    datafusion.catalog.format

    NULL

    Type of TableProvider to use when loading default schema. Defaults to None

    Type of TableProvider to use when loading default schema

    datafusion.execution.batch_size

    UInt64

    datafusion.catalog.has_header

    false

    If the file has a header

    datafusion.execution.batch_size

    8192

    Default batch size while creating new batches, it’s especially useful for buffer-in-memory batches since creating tiny batches would results in too much metadata memory consumption.

    Default batch size while creating new batches, it’s especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption

    datafusion.execution.coalesce_batches

    Boolean

    datafusion.execution.coalesce_batches

    true

    When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting ‘datafusion.execution.coalesce_target_batch_size’.

    datafusion.execution.coalesce_target_batch_size

    UInt64

    4096

    Target batch size when coalescing batches. Uses in conjunction with the configuration setting ‘datafusion.execution.coalesce_batches’.

    When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting

    datafusion.execution.parquet.enable_page_index

    Boolean

    datafusion.execution.collect_statistics

    false

    If true, uses parquet data page level metadata (Page Index) statistics to reduce the number of rows decoded.

    Should DataFusion collect statistics after listing files

    datafusion.execution.parquet.metadata_size_hint

    UInt64

    NULL

    If specified, the parquet reader will try and fetch the last size_hint bytes of the parquet file optimistically. If not specified, two read are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer.

    datafusion.execution.target_partitions

    0

    Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of cpu cores on the system

    datafusion.execution.parquet.pruning

    Boolean

    true

    If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file.

    datafusion.execution.time_zone

    +00:00

    The default time zone. Some functions, e.g. EXTRACT(HOUR from SOME_TIME), shift the underlying datetime according to this time zone, and then extract the hour

    datafusion.execution.parquet.pushdown_filters

    Boolean

    datafusion.execution.parquet.enable_page_index

    false

    If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded.

    If true, uses parquet data page level metadata (Page Index) statistics to reduce the number of rows decoded.

    datafusion.execution.parquet.reorder_filters

    Boolean

    false

    If true, filter expressions evaluated during the parquet decoding opearation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query.

    datafusion.execution.parquet.pruning

    true

    If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file

    datafusion.execution.parquet.skip_metadata

    Boolean

    true

    If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata.

    datafusion.execution.time_zone

    Utf8

    +00:00

    The session time zone which some function require e.g. EXTRACT(HOUR from SOME_TIME) shift the underline datetime according to the time zone,

    If true, the parquet reader skips the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata

    then extract the hour.

    datafusion.execution.parquet.metadata_size_hint

    NULL

    If specified, the parquet reader will try and fetch the last size_hint bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer

    datafusion.explain.logical_plan_only

    Boolean

    datafusion.execution.parquet.pushdown_filters

    false

    When set to true, the explain statement will only print logical plans.

    If true, filter expressions are applied during the parquet decoding operation to reduce the number of rows decoded

    datafusion.explain.physical_plan_only

    Boolean

    datafusion.execution.parquet.reorder_filters

    false

    When set to true, the explain statement will only print physical plans.

    If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query

    datafusion.optimizer.enable_round_robin_repartition

    true

    When set to true, the physical plan optimizer will try to add round robin repartition to increase parallelism to leverage more CPU cores

    datafusion.optimizer.filter_null_join_keys

    Boolean

    false

    When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down.

    datafusion.optimizer.hash_join_single_partition_threshold

    UInt64

    1048576

    The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition

    datafusion.optimizer.repartition_aggregations

    true

    Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided target_partitions level

    datafusion.optimizer.max_passes

    UInt64

    3

    Number of times that the optimizer will attempt to optimize the plan

    datafusion.optimizer.repartition_joins

    true

    Should DataFusion repartition data using the join keys to execute joins in parallel using the provided target_partitions level

    datafusion.optimizer.prefer_hash_join

    Boolean

    datafusion.optimizer.repartition_windows

    true

    When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin. HashJoin can work more efficientlythan SortMergeJoin but consumes more memory. Defaults to true

    Should DataFusion repartition data using the partition keys to execute window functions in parallel using the provided target_partitions level

    datafusion.optimizer.skip_failed_rules

    Boolean

    true

    When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail.

    When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail

    datafusion.optimizer.max_passes

    3

    Number of times that the optimizer will attempt to optimize the plan

    datafusion.optimizer.top_down_join_key_reordering

    true

    When set to true, the physical plan optimizer will run a top down process to reorder the join keys

    datafusion.optimizer.top_down_join_key_reordering

    Boolean

    datafusion.optimizer.prefer_hash_join

    true

    When set to true, the physical plan optimizer will run a top down process to reorder the join keys. Defaults to true

    When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin. HashJoin can work more efficiently than SortMergeJoin but consumes more memory

    datafusion.optimizer.hash_join_single_partition_threshold

    1048576

    The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition

    datafusion.explain.logical_plan_only

    false

    When set to true, the explain statement will only print logical plans

    datafusion.explain.physical_plan_only

    false

    When set to true, the explain statement will only print physical plans

    diff --git a/datafusion/user-guide/dataframe.html b/datafusion/user-guide/dataframe.html index 69afc2a941dc..8ac113dc5408 100644 --- a/datafusion/user-guide/dataframe.html +++ b/datafusion/user-guide/dataframe.html @@ -338,7 +338,7 @@

    DataFrame API
    let ctx = SessionContext::new();
    -let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
    +let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
     let df = df.filter(col("a").lt_eq(col("b")))?
                .aggregate(vec![col("a")], vec![min(col("b"))])?
                .limit(0, Some(100))?;
    diff --git a/datafusion/user-guide/example-usage.html b/datafusion/user-guide/example-usage.html
    index e3b40604abc6..a997783dfbfd 100644
    --- a/datafusion/user-guide/example-usage.html
    +++ b/datafusion/user-guide/example-usage.html
    @@ -364,7 +364,7 @@
     -->
     

    Example Usage

    -

    In this example some simple processing is performed on the example.csv file.

    +

    In this example some simple processing is performed on the example.csv file.

    Update Cargo.toml

    Add the following to your Cargo.toml file:

    @@ -381,7 +381,7 @@

    Run a SQL query against data stored in a CSV:async fn main() -> datafusion::error::Result<()> { // register the table let ctx = SessionContext::new(); - ctx.register_csv("example", "tests/example.csv", CsvReadOptions::new()).await?; + ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()).await?; // create a plan to run a SQL query let df = ctx.sql("SELECT a, MIN(b) FROM example GROUP BY a LIMIT 100").await?; @@ -401,7 +401,7 @@

    Use the DataFrame API to process data stored in a CSV:async fn main() -> datafusion::error::Result<()> { // create the dataframe let ctx = SessionContext::new(); - let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?; + let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; let df = df.filter(col("a").lt_eq(col("b")))? .aggregate(vec![col("a")], vec![min(col("b"))])? @@ -428,7 +428,7 @@

    Output from both examples

    Identifiers and Capitalization

    Please be aware that all identifiers are effectively made lower-case in SQL, so if your csv file has capital letters (ex: Name) you must put your column name in double quotes or the examples won’t work.

    -

    To illustrate this behavior, consider the capitalized_example.csv file:

    +

    To illustrate this behavior, consider the capitalized_example.csv file:

    Run a SQL query against data stored in a CSV:

    use datafusion::prelude::*;
    @@ -437,7 +437,7 @@ 

    Run a SQL query against data stored in a CSV:async fn main() -> datafusion::error::Result<()> { // register the table let ctx = SessionContext::new(); - ctx.register_csv("example", "tests/capitalized_example.csv", CsvReadOptions::new()).await?; + ctx.register_csv("example", "tests/data/capitalized_example.csv", CsvReadOptions::new()).await?; // create a plan to run a SQL query let df = ctx.sql("SELECT \"A\", MIN(b) FROM example GROUP BY \"A\" LIMIT 100").await?; @@ -457,7 +457,7 @@

    Use the DataFrame API to process data stored in a CSV:async fn main() -> datafusion::error::Result<()> { // create the dataframe let ctx = SessionContext::new(); - let df = ctx.read_csv("tests/capitalized_example.csv", CsvReadOptions::new()).await?; + let df = ctx.read_csv("tests/data/capitalized_example.csv", CsvReadOptions::new()).await?; let df = df.filter(col("A").lt_eq(col("c")))? .aggregate(vec![col("A")], vec![min(col("b"))])? From afb5523f9b1c27622f0149aea81595eae6940d58 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 21 Jan 2023 13:46:07 -0700 Subject: [PATCH 2/2] remove _sources --- datafusion/.gitignore | 1 + .../contributor-guide/communication.md.txt | 74 ---- .../_sources/contributor-guide/index.md.txt | 319 ---------------- .../quarterly_roadmap.md.txt | 90 ----- .../_sources/contributor-guide/roadmap.md.txt | 118 ------ .../specification/index.rst.txt | 25 -- .../specification/invariants.md.txt | 327 ---------------- .../output-field-name-semantic.md.txt | 212 ----------- datafusion/_sources/index.rst.txt | 54 --- datafusion/_sources/user-guide/cli.md.txt | 351 ------------------ datafusion/_sources/user-guide/configs.md.txt | 69 ---- .../_sources/user-guide/dataframe.md.txt | 105 ------ .../_sources/user-guide/example-usage.md.txt | 140 ------- .../_sources/user-guide/expressions.md.txt | 211 ----------- datafusion/_sources/user-guide/faq.md.txt | 31 -- .../_sources/user-guide/introduction.md.txt | 43 --- datafusion/_sources/user-guide/library.md.txt | 127 ------- .../user-guide/sql/aggregate_functions.md.txt | 68 ---- .../_sources/user-guide/sql/data_types.md.txt | 90 ----- datafusion/_sources/user-guide/sql/ddl.md.txt | 154 -------- .../_sources/user-guide/sql/explain.md.txt | 71 ---- .../_sources/user-guide/sql/index.rst.txt | 32 -- .../user-guide/sql/information_schema.md.txt | 72 ---- .../user-guide/sql/scalar_functions.md.txt | 297 --------------- .../_sources/user-guide/sql/select.md.txt | 226 ----------- 
.../_sources/user-guide/sql/sql_status.md.txt | 135 ------- .../_sources/user-guide/sql/subqueries.md.txt | 98 ----- 27 files changed, 1 insertion(+), 3539 deletions(-) create mode 100644 datafusion/.gitignore delete mode 100644 datafusion/_sources/contributor-guide/communication.md.txt delete mode 100644 datafusion/_sources/contributor-guide/index.md.txt delete mode 100644 datafusion/_sources/contributor-guide/quarterly_roadmap.md.txt delete mode 100644 datafusion/_sources/contributor-guide/roadmap.md.txt delete mode 100644 datafusion/_sources/contributor-guide/specification/index.rst.txt delete mode 100644 datafusion/_sources/contributor-guide/specification/invariants.md.txt delete mode 100644 datafusion/_sources/contributor-guide/specification/output-field-name-semantic.md.txt delete mode 100644 datafusion/_sources/index.rst.txt delete mode 100644 datafusion/_sources/user-guide/cli.md.txt delete mode 100644 datafusion/_sources/user-guide/configs.md.txt delete mode 100644 datafusion/_sources/user-guide/dataframe.md.txt delete mode 100644 datafusion/_sources/user-guide/example-usage.md.txt delete mode 100644 datafusion/_sources/user-guide/expressions.md.txt delete mode 100644 datafusion/_sources/user-guide/faq.md.txt delete mode 100644 datafusion/_sources/user-guide/introduction.md.txt delete mode 100644 datafusion/_sources/user-guide/library.md.txt delete mode 100644 datafusion/_sources/user-guide/sql/aggregate_functions.md.txt delete mode 100644 datafusion/_sources/user-guide/sql/data_types.md.txt delete mode 100644 datafusion/_sources/user-guide/sql/ddl.md.txt delete mode 100644 datafusion/_sources/user-guide/sql/explain.md.txt delete mode 100644 datafusion/_sources/user-guide/sql/index.rst.txt delete mode 100644 datafusion/_sources/user-guide/sql/information_schema.md.txt delete mode 100644 datafusion/_sources/user-guide/sql/scalar_functions.md.txt delete mode 100644 datafusion/_sources/user-guide/sql/select.md.txt delete mode 100644 
datafusion/_sources/user-guide/sql/sql_status.md.txt delete mode 100644 datafusion/_sources/user-guide/sql/subqueries.md.txt diff --git a/datafusion/.gitignore b/datafusion/.gitignore new file mode 100644 index 000000000000..97d0badbd313 --- /dev/null +++ b/datafusion/.gitignore @@ -0,0 +1 @@ +_sources \ No newline at end of file diff --git a/datafusion/_sources/contributor-guide/communication.md.txt b/datafusion/_sources/contributor-guide/communication.md.txt deleted file mode 100644 index 11e0e4e0f0ea..000000000000 --- a/datafusion/_sources/contributor-guide/communication.md.txt +++ /dev/null @@ -1,74 +0,0 @@ - - -# Communication - -We welcome participation from everyone and encourage you to join us, ask -questions, and get involved. - -All participation in the Apache Arrow DataFusion project is governed by the -Apache Software Foundation's [code of -conduct](https://www.apache.org/foundation/policies/conduct.html). - -The vast majority of communication occurs in the open on our -[github repository](https://github.com/apache/arrow-datafusion). - -## Questions? - -### Mailing list - -We use arrow.apache.org's `dev@` mailing list for project management, release -coordination and design discussions -([subscribe](mailto:dev-subscribe@arrow.apache.org), -[unsubscribe](mailto:dev-unsubscribe@arrow.apache.org), -[archives](https://lists.apache.org/list.html?dev@arrow.apache.org)). - -When emailing the dev list, please make sure to prefix the subject line with a -`[DataFusion]` tag, e.g. `"[DataFusion] New API for remote data sources"`, so -that the appropriate people in the Apache Arrow community notice the message. - -### Slack and Discord - -We use the official [ASF](https://s.apache.org/slack-invite) Slack workspace -for informal discussions and coordination. This is a great place to meet other -contributors and get guidance on where to contribute. Join us in the -`#arrow-rust` channel. 
- -We also have a backup Arrow Rust Discord -server ([invite link](https://discord.gg/Qw5gKqHxUM)) in case you are not able -to join the Slack workspace. If you need an invite to the Slack workspace, you -can also ask for one in our Discord server. - -### Sync up video calls - -We have biweekly sync calls every other Thursdays at both 04:00 UTC -and 16:00 UTC (starting September 30, 2021) depending on if there are -items on the agenda to discuss and someone being willing to host. - -Please see the [agenda](https://docs.google.com/document/d/1atCVnoff5SR4eM4Lwf2M1BBJTY6g3_HUNR6qswYJW_U/edit) -for the video call link, add topics and to see what others plan to discuss. - -The goals of these calls are: - -1. Help "put a face to the name" of some of other contributors we are working with -2. Discuss / synchronize on the goals and major initiatives from different stakeholders to identify areas where more alignment is needed - -No decisions are made on the call and anything of substance will be discussed on the mailing list or in github issues / google docs. - -We will send a summary of all sync ups to the dev@arrow.apache.org mailing list. diff --git a/datafusion/_sources/contributor-guide/index.md.txt b/datafusion/_sources/contributor-guide/index.md.txt deleted file mode 100644 index 43021e1815cd..000000000000 --- a/datafusion/_sources/contributor-guide/index.md.txt +++ /dev/null @@ -1,319 +0,0 @@ - - -# Introduction - -We welcome and encourage contributions of all kinds, such as: - -1. Tickets with issue reports of feature requests -2. Documentation improvements -3. Code (PR or PR Review) - -In addition to submitting new PRs, we have a healthy tradition of community members helping review each other's PRs. Doing so is a great way to help the community as well as get more familiar with Rust and the relevant codebases. 
- -You can find a curated -[good-first-issue](https://github.com/apache/arrow-datafusion/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) -list to help you get started. - -# Pull Requests - -We welcome pull requests (PRs) from anyone from the community. - -DataFusion is a very active fast-moving project and we try to review and merge PRs quickly to keep the review backlog down and the pace up. After review and approval, one of the [many people with commit access](https://arrow.apache.org/committers/) will merge your PR. - -Review bandwidth is currently our most limited resource, and we highly encourage reviews by the broader community. If you are waiting for your PR to be reviewed, consider helping review other PRs that are waiting. Such review both helps the reviewer to learn the codebase and become more expert, as well as helps identify issues in the PR (such as lack of test coverage), that can be addressed and make future reviews faster and more efficient. - -## Merging PRs - -Since we are a worldwide community, we have contributors in many timezones who review and comment. To ensure anyone who wishes has an opportunity to review a PR, our committers try to ensure that at least 24 hours passes between when a "major" PR is approved and when it is merged. - -A "major" PR means there is a substantial change in design or a change in the API. Committers apply their best judgment to determine what constitutes a substantial change. A "minor" PR might be merged without a 24 hour delay, again subject to the judgment of the committer. Examples of potential "minor" PRs are: - -1. Documentation improvements/additions -2. Small bug fixes -3. Non-controversial build-related changes (clippy, version upgrades etc.) -4. Smaller non-controversial feature additions - -# Developer's guide - -This section describes how you can get started at developing DataFusion. 
- -## Windows setup - -```shell -wget https://az792536.vo.msecnd.net/vms/VMBuild_20190311/VirtualBox/MSEdge/MSEdge.Win10.VirtualBox.zip -choco install -y git rustup.install visualcpp-build-tools -git-bash.exe -cargo build -``` - -## Protoc Installation - -Compiling DataFusion from sources requires an installed version of the protobuf compiler, `protoc`. - -On most platforms this can be installed from your system's package manager - -``` -$ apt install -y protobuf-compiler -$ dnf install -y protobuf-compiler -$ pacman -S protobuf -$ brew install protobuf -``` - -You will want to verify the version installed is `3.12` or greater, which introduced support for explicit [field presence](https://github.com/protocolbuffers/protobuf/blob/v3.12.0/docs/field_presence.md). Older versions may fail to compile. - -```shell -$ protoc --version -libprotoc 3.12.4 -``` - -Alternatively a binary release can be downloaded from the [Release Page](https://github.com/protocolbuffers/protobuf/releases) or [built from source](https://github.com/protocolbuffers/protobuf/blob/main/src/README.md). - -## Bootstrap environment - -DataFusion is written in Rust and it uses a standard rust toolkit: - -- `cargo build` -- `cargo fmt` to format the code -- `cargo test` to test -- etc. 
- -Testing setup: - -- `rustup update stable` DataFusion uses the latest stable release of rust -- `git submodule init` -- `git submodule update` - -Formatting instructions: - -- [ci/scripts/rust_fmt.sh](https://github.com/apache/arrow-datafusion/blob/master/ci/scripts/rust_fmt.sh) -- [ci/scripts/rust_clippy.sh](https://github.com/apache/arrow-datafusion/blob/master/ci/scripts/rust_clippy.sh) -- [ci/scripts/rust_toml_fmt.sh](https://github.com/apache/arrow-datafusion/blob/master/ci/scripts/rust_toml_fmt.sh) - -or run them all at once: - -- [dev/rust_lint.sh](https://github.com/apache/arrow-datafusion/blob/master/dev/rust_lint.sh) - -## Test Organization - -DataFusion has several levels of tests in its [Test -Pyramid](https://martinfowler.com/articles/practical-test-pyramid.html) -and tries to follow [Testing Organization](https://doc.rust-lang.org/book/ch11-03-test-organization.html) in the The Book. - -This section highlights the most important test modules that exist - -### Unit tests - -Tests for the code in an individual module are defined in the same source file with a `test` module, following Rust convention - -### Rust Integration Tests - -There are several tests of the public interface of the DataFusion library in the [tests](https://github.com/apache/arrow-datafusion/blob/master/datafusion/core/tests) directory. - -You can run these tests individually using a command such as - -```shell -cargo test -p datafusion --tests sql_integration -``` - -One very important test is the [sql_integration](https://github.com/apache/arrow-datafusion/blob/master/datafusion/core/tests/sql_integration.rs) test which validates DataFusion's ability to run a large assortment of SQL queries against an assortment of data setups. - -### sqllogictests Tests - -The [sqllogictests](https://github.com/apache/arrow-datafusion/blob/master/datafusion/core/tests/sqllogictests) also validate DataFusion SQL against an assortment of data setups. 
- -Data Driven tests have many benefits including being easier to write and maintain. We are in the process of [migrating sql_integration tests](https://github.com/apache/arrow-datafusion/issues/4460) and encourage -you to add new tests using sqllogictests if possible. - -### SQL / Postgres Integration Tests - -The [integration-tests](https://github.com/apache/arrow-datafusion/blob/master/integration-tests) directory contains a harness that runs certain queries against both postgres and datafusion and compares results - -#### setup environment - -```shell -export POSTGRES_DB=postgres -export POSTGRES_USER=postgres -export POSTGRES_HOST=localhost -export POSTGRES_PORT=5432 -``` - -#### Install dependencies - -```shell -# Install dependencies -python -m pip install --upgrade pip setuptools wheel -python -m pip install -r integration-tests/requirements.txt - -# setup environment -POSTGRES_DB=postgres POSTGRES_USER=postgres POSTGRES_HOST=localhost POSTGRES_PORT=5432 python -m pytest -v integration-tests/test_psql_parity.py - -# Create -psql -d "$POSTGRES_DB" -h "$POSTGRES_HOST" -p "$POSTGRES_PORT" -U "$POSTGRES_USER" -c 'CREATE TABLE IF NOT EXISTS test ( - c1 character varying NOT NULL, - c2 integer NOT NULL, - c3 smallint NOT NULL, - c4 smallint NOT NULL, - c5 integer NOT NULL, - c6 bigint NOT NULL, - c7 smallint NOT NULL, - c8 integer NOT NULL, - c9 bigint NOT NULL, - c10 character varying NOT NULL, - c11 double precision NOT NULL, - c12 double precision NOT NULL, - c13 character varying NOT NULL -);' - -psql -d "$POSTGRES_DB" -h "$POSTGRES_HOST" -p "$POSTGRES_PORT" -U "$POSTGRES_USER" -c "\copy test FROM '$(pwd)/testing/data/csv/aggregate_test_100.csv' WITH (FORMAT csv, HEADER true);" -``` - -#### Invoke the test runner - -```shell -python -m pytest -v integration-tests/test_psql_parity.py -``` - -## Benchmarks - -### Criterion Benchmarks - -[Criterion](https://docs.rs/criterion/latest/criterion/index.html) is a statistics-driven micro-benchmarking framework used by 
DataFusion for evaluating the performance of specific code-paths. In particular, the criterion benchmarks help to both guide optimisation efforts, and prevent performance regressions within DataFusion. - -Criterion integrates with Cargo's built-in [benchmark support](https://doc.rust-lang.org/cargo/commands/cargo-bench.html) and a given benchmark can be run with - -``` -cargo bench --bench BENCHMARK_NAME -``` - -A full list of benchmarks can be found [here](https://github.com/apache/arrow-datafusion/blob/master/datafusion/core/benches). - -_[cargo-criterion](https://github.com/bheisler/cargo-criterion) may also be used for more advanced reporting._ - -#### Parquet SQL Benchmarks - -The parquet SQL benchmarks can be run with - -``` - cargo bench --bench parquet_query_sql -``` - -These randomly generate a parquet file, and then benchmark queries sourced from [parquet_query_sql.sql](https://github.com/apache/arrow-datafusion/blob/master/datafusion/core/benches/parquet_query_sql.sql) against it. This can therefore be a quick way to add coverage of particular query and/or data paths. - -If the environment variable `PARQUET_FILE` is set, the benchmark will run queries against this file instead of a randomly generated one. This can be useful for performing multiple runs, potentially with different code, against the same source data, or for testing against a custom dataset. - -The benchmark will automatically remove any generated parquet file on exit, however, if interrupted (e.g. by CTRL+C) it will not. This can be useful for analysing the particular file after the fact, or preserving it to use with `PARQUET_FILE` in subsequent runs. - -### Upstream Benchmark Suites - -Instructions and tooling for running upstream benchmark suites against DataFusion can be found in [benchmarks](https://github.com/apache/arrow-datafusion/blob/master/benchmarks). - -These are valuable for comparative evaluation against alternative Arrow implementations and query engines. 
- -## How to add a new scalar function - -Below is a checklist of what you need to do to add a new scalar function to DataFusion: - -- Add the actual implementation of the function: - - [here](https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src/string_expressions.rs) for string functions - - [here](https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src/math_expressions.rs) for math functions - - [here](https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src/datetime_expressions.rs) for datetime functions - - create a new module [here](https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src) for other functions -- In [physical-expr/src](https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src/functions.rs), add: - - a new variant to `BuiltinScalarFunction` - - a new entry to `FromStr` with the name of the function as called by SQL - - a new line in `return_type` with the expected return type of the function, given an incoming type - - a new line in `signature` with the signature of the function (number and types of its arguments) - - a new line in `create_physical_expr`/`create_physical_fun` mapping the built-in to the implementation - - tests to the function. -- In [core/tests/sql](https://github.com/apache/arrow-datafusion/blob/master/datafusion/core/tests/sql), add a new test where the function is called through SQL against well known data and returns the expected result. -- In [expr/src/expr_fn.rs](https://github.com/apache/arrow-datafusion/blob/master/datafusion/expr/src/expr_fn.rs), add: - - a new entry of the `unary_scalar_expr!` macro for the new function. 
- -## How to add a new aggregate function - -Below is a checklist of what you need to do to add a new aggregate function to DataFusion: - -- Add the actual implementation of an `Accumulator` and `AggregateExpr`: - - [here](https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src/string_expressions.rs) for string functions - - [here](https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src/math_expressions.rs) for math functions - - [here](https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src/datetime_expressions.rs) for datetime functions - - create a new module [here](https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src) for other functions -- In [datafusion/expr/src](https://github.com/apache/arrow-datafusion/blob/master/datafusion/expr/src/aggregate_function.rs), add: - - a new variant to `AggregateFunction` - - a new entry to `FromStr` with the name of the function as called by SQL - - a new line in `return_type` with the expected return type of the function, given an incoming type - - a new line in `signature` with the signature of the function (number and types of its arguments) - - a new line in `create_aggregate_expr` mapping the built-in to the implementation - - tests to the function. -- In [tests/sql](https://github.com/apache/arrow-datafusion/blob/master/datafusion/core/tests/sql), add a new test where the function is called through SQL against well known data and returns the expected result. - -## How to display plans graphically - -The query plans represented by `LogicalPlan` nodes can be graphically -rendered using [Graphviz](https://www.graphviz.org/). - -To do so, save the output of the `display_graphviz` function to a file.: - -```rust -// Create plan somehow... 
-let mut output = File::create("/tmp/plan.dot")?; -write!(output, "{}", plan.display_graphviz()); -``` - -Then, use the `dot` command line tool to render it into a file that -can be displayed. For example, the following command creates a -`/tmp/plan.pdf` file: - -```bash -dot -Tpdf < /tmp/plan.dot > /tmp/plan.pdf -``` - -## Specifications - -We formalize DataFusion semantics and behaviors through specification -documents. These specifications serve as references to help -resolve ambiguities during development or code reviews. - -You are also welcome to propose changes to existing specifications or create -new specifications as you see fit. - -Here is the list of currently active specifications: - -- [Output field name semantic](https://arrow.apache.org/datafusion/contributor-guide/specification/output-field-name-semantic.html) -- [Invariants](https://arrow.apache.org/datafusion/contributor-guide/specification/invariants.html) - -All specifications are stored in the `docs/source/specification` folder. - -## How to format `.md` document - -We are using `prettier` to format `.md` files. - -You can either use `npm i -g prettier` to install it globally or use `npx` to run it as a standalone binary. Using `npx` requires a working node environment. Upgrading to the latest prettier is recommended (by adding `--upgrade` to the `npm` command).
- -```bash -$ prettier --version -2.3.0 -``` - -After you've confirmed your prettier version, you can format all the `.md` files: - -```bash -prettier -w {datafusion,datafusion-cli,datafusion-examples,dev,docs}/**/*.md -``` diff --git a/datafusion/_sources/contributor-guide/quarterly_roadmap.md.txt b/datafusion/_sources/contributor-guide/quarterly_roadmap.md.txt deleted file mode 100644 index c593e859d731..000000000000 --- a/datafusion/_sources/contributor-guide/quarterly_roadmap.md.txt +++ /dev/null @@ -1,90 +0,0 @@ - - -# Quarterly Roadmap - -A quarterly roadmap will be published to give the DataFusion community visibility into the priorities of the projects contributors. This roadmap is not binding. - -## 2022 Q2 - -### DataFusion Core - -- IO Improvements - - Reading, registering, and writing more file formats from both DataFrame API and SQL - - Additional options for IO including partitioning and metadata support -- Work Scheduling - - Improve predictability, observability and performance of IO and CPU-bound work - - Develop a more explicit story for managing parallelism during plan execution -- Memory Management - - Add more operators for memory limited execution -- Performance - - Incorporate row-format into operators such as aggregate - - Add row-format benchmarks - - Explore JIT-compiling complex expressions - - Explore LLVM for JIT, with inline Rust functions as the primary goal - - Improve performance of Sort and Merge using Row Format / JIT expressions -- Documentation - - General improvements to DataFusion website - - Publish design documents -- Streaming - - Create `StreamProvider` trait - -### Ballista - -- Make production ready - - Shuffle file cleanup - - Fill functional gaps between DataFusion and Ballista - - Improve task scheduling and data exchange efficiency - - Better error handling - - Task failure - - Executor lost - - Schedule restart - - Improve monitoring and logging - - Auto scaling support -- Support for multi-scheduler deployments. 
Initially for resiliency and fault tolerance but ultimately to support sharding for scalability and more efficient caching. -- Executor deployment grouping based on resource allocation - -### Extensions ([datafusion-contrib](https://github.com/datafusion-contrib)) - -#### [DataFusion-Python](https://github.com/datafusion-contrib/datafusion-python) - -- Add missing functionality to DataFrame and SessionContext -- Improve documentation - -#### [DataFusion-S3](https://github.com/datafusion-contrib/datafusion-objectstore-s3) - -- Create Python bindings to use with datafusion-python - -#### [DataFusion-Tui](https://github.com/datafusion-contrib/datafusion-tui) - -- Create multiple SQL editors -- Expose more Context and query metadata -- Support new data sources - - BigTable, HDFS, HTTP APIs - -#### [DataFusion-BigTable](https://github.com/datafusion-contrib/datafusion-bigtable) - -- Python binding to use with datafusion-python -- Timestamp range predicate pushdown -- Multi-threaded partition aware execution -- Production ready Rust SDK - -#### [DataFusion-Streams](https://github.com/datafusion-contrib/datafusion-streams) - -- Create experimental implementation of `StreamProvider` trait diff --git a/datafusion/_sources/contributor-guide/roadmap.md.txt b/datafusion/_sources/contributor-guide/roadmap.md.txt deleted file mode 100644 index 736eef681e48..000000000000 --- a/datafusion/_sources/contributor-guide/roadmap.md.txt +++ /dev/null @@ -1,118 +0,0 @@ - - -# Roadmap - -This document describes high level goals of the DataFusion and -Ballista development community. It is not meant to restrict -possibilities, but rather help newcomers understand the broader -context of where the community is headed, and inspire -additional contributions. - -DataFusion and Ballista are part of the [Apache -Arrow](https://arrow.apache.org/) project and governed by the Apache -Software Foundation governance model.
These projects are entirely -driven by volunteers, and we welcome contributions for items not on -this roadmap. However, before submitting a large PR, we strongly -suggest you start a conversation using a github issue or the -dev@arrow.apache.org mailing list to make review efficient and avoid -surprises. - -## DataFusion - -DataFusion's goal is to become the embedded query engine of choice -for new analytic applications, by leveraging the unique features of -[Rust](https://www.rust-lang.org/) and [Apache Arrow](https://arrow.apache.org/) -to provide: - -1. Best-in-class single node query performance -2. A Declarative SQL query interface compatible with PostgreSQL -3. A Dataframe API, similar to those offered by Pandas and Spark -4. A Procedural API for programmatically creating and running execution plans -5. High performance, data race free, ergonomic extensibility points at every layer - -### Additional SQL Language Features - -- Decimal Support [#122](https://github.com/apache/arrow-datafusion/issues/122) -- Complete support list on [status](https://github.com/apache/arrow-datafusion/blob/master/README.md#status) -- Timestamp Arithmetic [#194](https://github.com/apache/arrow-datafusion/issues/194) -- SQL Parser extension point [#533](https://github.com/apache/arrow-datafusion/issues/533) -- Support for nested structures (fields, lists, structs) [#119](https://github.com/apache/arrow-datafusion/issues/119) -- Run all queries from the TPCH benchmark (see [milestone](https://github.com/apache/arrow-datafusion/milestone/2) for more details) - -### Query Optimizer - -- More sophisticated cost based optimizer for join ordering -- Implement advanced query optimization framework (Tokomak) [#440](https://github.com/apache/arrow-datafusion/issues/440) -- Finer optimizations for group by and aggregate functions - -### Datasources - -- Better support for reading data from remote filesystems (e.g.
S3) without caching it locally [#907](https://github.com/apache/arrow-datafusion/issues/907) [#1060](https://github.com/apache/arrow-datafusion/issues/1060) -- Improve performance of file format datasources (parallelize file listings, async Arrow readers, file chunk prefetching capability...) - -### Runtime / Infrastructure - -- Migrate to some sort of arrow2 based implementation (see [milestone](https://github.com/apache/arrow-datafusion/milestone/3) for more details) -- Add DataFusion to h2oai/db-benchmark [#147](https://github.com/apache/arrow-datafusion/issues/147) -- Improve build time [#348](https://github.com/apache/arrow-datafusion/issues/348) - -### Resource Management - -- Finer grain control and limit of runtime memory [#587](https://github.com/apache/arrow-datafusion/issues/587) and CPU usage [#54](https://github.com/apache/arrow-datafusion/issues/64) - -### Python Interface - -TBD - -### DataFusion CLI (`datafusion-cli`) - -Note: There are some additional thoughts on a datafusion-cli vision on [#1096](https://github.com/apache/arrow-datafusion/issues/1096#issuecomment-939418770). - -- Better abstraction between REPL parsing and queries so that commands are separated and handled correctly -- Connect to the `Statistics` subsystem and have the cli print out more stats for query debugging, etc. -- Improved error handling for interactive use and shell scripting usage -- publishing to apt, brew, and possibly the NuGet registry so that people can use it more easily -- adopt a shorter name, like dfcli? - -## Ballista - -Ballista is a distributed compute platform based on Apache Arrow and DataFusion. It provides a query scheduler that -breaks a physical plan into stages and tasks and then schedules tasks for execution across the available executors -in the cluster. - -Having Ballista as part of the DataFusion codebase helps ensure that DataFusion remains suitable for distributed -compute.
For example, it helps ensure that physical query plans can be serialized to protobuf format and that they -remain language-agnostic so that executors can be built in languages other than Rust. - -### Ballista Roadmap - -### Move query scheduler into DataFusion - -The Ballista scheduler has some advantages over DataFusion query execution because it doesn't try to eagerly execute -the entire query at once but breaks it down into a directed acyclic graph (DAG) of stages and executes a -configurable number of stages and tasks concurrently. It should be possible to push some of this logic down to -DataFusion so that the same scheduler can be used to scale across cores in-process and across nodes in a cluster. - -### Implement execution-time cost-based optimizations based on statistics - -After the execution of a query stage, accurate statistics are available for the resulting data. These statistics -could be leveraged by the scheduler to optimize the query during execution. For example, when performing a hash join -it is desirable to load the smaller side of the join into memory and in some cases we cannot predict which side will -be smaller until execution time. diff --git a/datafusion/_sources/contributor-guide/specification/index.rst.txt b/datafusion/_sources/contributor-guide/specification/index.rst.txt deleted file mode 100644 index bcd5a895c4d2..000000000000 --- a/datafusion/_sources/contributor-guide/specification/index.rst.txt +++ /dev/null @@ -1,25 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -..
Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -Specifications -============== - -.. toctree:: - :maxdepth: 1 - - invariants - output-field-name-semantic diff --git a/datafusion/_sources/contributor-guide/specification/invariants.md.txt b/datafusion/_sources/contributor-guide/specification/invariants.md.txt deleted file mode 100644 index c8de4e1d4e21..000000000000 --- a/datafusion/_sources/contributor-guide/specification/invariants.md.txt +++ /dev/null @@ -1,327 +0,0 @@ - - -# Invariants - -This document enumerates invariants of DataFusion's logical and physical planes -(functions, and nodes). Some of these invariants are currently not enforced. -This document assumes that the reader is familiar with some of the codebase, -including rust arrow's RecordBatch and Array. - -## Rationale - -DataFusion's computational model is built on top of a dynamically typed arrow -object, Array, that offers the interface `Array::as_any` to downcast itself to -its statically typed versions (e.g. `Int32Array`). DataFusion uses -`Array::data_type` to perform the respective downcasting on its physical -operations. DataFusion uses a dynamic type system because the queries being -executed are not always known at compile time: they are only known during the -runtime (or query time) of programs built with DataFusion. This document is -built on top of this principle. - -In dynamically typed interfaces, it is up to developers to enforce type -invariants. This document declares some of these invariants, so that users -know what they can expect from a query in DataFusion, and DataFusion developers -know what they need to enforce at the coding level.
- -## Notation - -- Field or physical field: the tuple name, `arrow::DataType` and nullability flag (a bool whether values can be null), represented in this document by `PF(name, type, nullable)` -- Logical field: Field with a relation name. Represented in this document by `LF(relation, name, type, nullable)` -- Projected plan: plan with projection as the root node. -- Logical schema: a vector of logical fields, used by logical plan. -- Physical schema: a vector of physical fields, used by both physical plan and Arrow record batch. - -### Logical - -#### Function - -An object that knows its valid incoming logical fields and how to derive its -output logical field from its arguments' logical fields. A functions' output -field is itself a function of its input fields: - -``` -logical_field(lf1: LF, lf2: LF, ...) -> LF -``` - -Examples: - -- `plus(a,b) -> LF(None, "{a} Plus {b}", d(a.type,b.type), a.nullable | b.nullable)` where d is the function mapping input types to output type (`get_supertype` in our current implementation). -- `length(a) -> LF(None, "length({a})", u32, a.nullable)` - -#### Plan - -A tree composed of other plans and functions (e.g. `Projection c1 + c2, c1 - c2 AS sum12; Scan c1 as u32, c2 as u64`) -that knows how to derive its schema. - -Certain plans have a frozen schema (e.g. Scan), while others derive their -schema from their child nodes. - -#### Column - -An identifier in a logical plan consists of field name and relation name. - -### Physical - -#### Function - -An object that knows how to derive its physical field from its arguments' -physical fields, and also how to actually perform the computation on data. A -functions' output physical field is a function of its input physical fields: - -``` -physical_field(PF1, PF2, ...) 
-> PF -``` - -Examples: - -- `plus(a,b) -> PF("{a} Plus {b}", d(a.type,b.type), a.nullable | b.nullable)` where d is a complex function (`get_supertype` in our current implementation) whose computation is for each element in the columns, sum the two entries together and return it in the same type as the smallest type of both columns. -- `length(&str) -> PF("length({a})", u32, a.nullable)` whose computation is "count number of bytes in the string". - -#### Plan - -A tree (e.g. `Projection c1 + c2, c1 - c2 AS sum12; Scan c1 as u32, c2 as u64`) -that knows how to derive its metadata and compute itself. - -Note how the physical plane does not know how to derive field names: field -names are solely a property of the logical plane, as they are not needed in the -physical plane. - -#### Column - -A type of physical node in a physical plan consists of a field name and unique index. - -### Data Sources' registry - -A map of source name/relation -> Schema plus associated properties necessary to read data from it (e.g. file path). - -### Functions' registry - -A map of function name -> logical + physical function. - -### Physical Planner - -A function that knows how to derive a physical plan from a logical plan: - -``` -plan(LogicalPlan) -> PhysicalPlan -``` - -### Logical Optimizer - -A function that accepts a logical plan and returns an (optimized) logical plan -which computes the same results, but in a more efficient manner: - -``` -optimize(LogicalPlan) -> LogicalPlan -``` - -### Physical Optimizer - -A function that accepts a physical plan and returns an (optimized) physical -plan which computes the same results, but may differ based on the actual -hardware or execution environment being run: - -``` -optimize(PhysicalPlan) -> PhysicalPlan -``` - -### Builder - -A function that knows how to build a new logical plan from an existing logical -plan and some extra parameters. - -``` -build(logical_plan, params...) 
-> logical_plan -``` - -## Invariants - -The following subsections describe invariants. Since functions' output schema -depends on its arguments' schema (e.g. min, plus), the resulting schema can -only be derived based on a known set of input schemas (TableProvider). -Likewise, schemas of functions depend on the specific registry of functions -registered (e.g. does `my_op` return u32 or u64?). Thus, in this section, the -wording "same schema" is understood to mean "same schema under a given registry -of data sources and functions". - -### (relation, name) tuples in logical fields and logical columns are unique - -Every logical field's (relation, name) tuple in a logical schema MUST be unique. -Every logical column's (relation, name) tuple in a logical plan MUST be unique. - -This invariant guarantees that `SELECT t1.id, t2.id FROM t1 JOIN t2...` -unambiguously selects the field `t1.id` and `t2.id` in a logical schema in the -logical plane. - -#### Responsibility - -It is the logical builder and optimizer's responsibility to guarantee this -invariant. - -#### Validation - -Builder and optimizer MUST error if this invariant is violated on any logical -node that creates a new schema (e.g. scan, projection, aggregation, join, etc.). - -### Physical schema is consistent with data - -The contents of every Array in every RecordBatch in every partition returned by -a physical plan MUST be consistent with RecordBatch's schema, in that every -Array in the RecordBatch must be downcastable to its corresponding type -declared in the RecordBatch. - -#### Responsibility - -Physical functions MUST guarantee this invariant. This is particularly -important in aggregate functions, whose aggregating type may be different from -the intermediary types during calculations (e.g. sum(i32) -> i64). - -#### Validation - -Since the validation of this invariant is computationally expensive, execution -contexts CAN validate this invariant. 
It is acceptable for physical nodes to -`panic!` if their input does not satisfy this invariant. - -### Physical schema is consistent in physical functions - -The schema of every Array returned by a physical function MUST match the -DataType reported by the physical function itself. - -This ensures that when a physical function claims that it returns a type -(e.g. Int32), users can safely downcast its resulting Array to the -corresponding type (e.g. Int32Array), as well as to write data to formats that -have a schema with nullability flag (e.g. parquet). - -#### Responsibility - -It is the responsibility of the developer that writes a physical function to -guarantee this invariant. - -In particular: - -- The derived DataType matches the code it uses to build the array for every branch of valid input type combinations. -- The nullability flag matches how the values are built. - -#### Validation - -Since the validation of this invariant is computationally expensive, execution -contexts CAN validate this invariant. - -### The physical schema is invariant under planning - -The physical schema derived by a physical plan returned by the planner MUST be -equivalent to the physical schema derived by the logical plan passed to the -planner. Specifically: - -``` -plan(logical_plan).schema === logical_plan.physical_schema -``` - -Logical plan's physical schema is defined as logical schema with relation -qualifiers stripped for all logical fields: - -``` -logical_plan.physical_schema = vector[ strip_relation(f) for f in logical_plan.logical_fields ] -``` - -This is used to ensure that the physical schema of its (logical) plan is what -it gets in record batches, so that users can rely on the optimized logical plan -to know the resulting physical schema. 
- -Note that since a logical plan can be as simple as a single projection with a -single function, `Projection f(c1,c2)`, a corollary of this is that the -physical schema of every `logical function -> physical function` must be -invariant under planning. - -#### Responsibility - -Developers of physical and logical plans and planners MUST guarantee this -invariant for every triplet (logical plan, physical plan, conversion rule). - -#### Validation - -Planners MUST validate this invariant. In particular they MUST return an error -when, during planning, a physical function's derived schema does not match the -logical function's derived schema. - -### The output schema equals the physical plan schema - -The schema of every RecordBatch in every partition outputted by a physical plan -MUST be equal to the schema of the physical plan. Specifically: - -``` -physical_plan.evaluate(batch).schema = physical_plan.schema -``` - -Together with other invariants, this ensures that the consumers of record -batches do not need to know the output schema of the physical plan; they can -safely rely on the record batch's schema to perform downcasting and naming. - -#### Responsibility - -Physical nodes MUST guarantee this invariant. - -#### Validation - -Execution Contexts CAN validate this invariant. - -### Logical schema is invariant under logical optimization - -The logical schema derived by a projected logical plan returned by the logical -optimizer MUST be equivalent to the logical schema derived by the logical plan -passed to the planner: - -``` -optimize(logical_plan).schema === logical_plan.schema -``` - -This is used to ensure that plans can be optimized without jeopardizing future -references of logical columns (name and index) or assumptions about their -schemas. - -#### Responsibility - -Logical optimizers MUST guarantee this invariant. - -#### Validation - -Users of logical optimizers SHOULD validate this invariant.
- -### Physical schema is invariant under physical optimization - -The physical schema derived by a projected physical plan returned by the -physical optimizer MUST match the physical schema derived by the physical plan -passed to the planner: - -``` -optimize(physical_plan).schema === physical_plan.schema -``` - -This is used to ensure that plans can be optimized without jeopardizing future -references of logical columns (name and index) or assumptions about their -schemas. - -#### Responsibility - -Optimizers MUST guarantee this invariant. - -#### Validation - -Users of optimizers SHOULD validate this invariant. diff --git a/datafusion/_sources/contributor-guide/specification/output-field-name-semantic.md.txt b/datafusion/_sources/contributor-guide/specification/output-field-name-semantic.md.txt deleted file mode 100644 index fe378a52cda1..000000000000 --- a/datafusion/_sources/contributor-guide/specification/output-field-name-semantic.md.txt +++ /dev/null @@ -1,212 +0,0 @@ - - -# Output field name semantics - -This specification documents how field names in output record batches should be -generated based on given user queries. The field name rules apply to -DataFusion queries planned from both SQL queries and Dataframe APIs. - -## Field name rules - -- All bare column field names MUST not contain relation/table qualifier. - - Both `SELECT t1.id`, `SELECT id` and `df.select_columns(&["id"])` SHOULD result in field name: `id` -- All compound column field names MUST contain relation/table qualifier. - - `SELECT foo + bar` SHOULD result in field name: `table.foo PLUS table.bar` -- Function names MUST be converted to lowercase. - - `SELECT AVG(c1)` SHOULD result in field name: `avg(table.c1)` -- Literal string MUST not be wrapped with quotes or double quotes. - - `SELECT 'foo'` SHOULD result in field name: `foo` -- Operator expressions MUST be wrapped with parentheses.
- - `SELECT -2` SHOULD result in field name: `(- 2)` -- Operator and operand MUST be separated by spaces. - - `SELECT 1+2` SHOULD result in field name: `(1 + 2)` -- Function arguments MUST be separated by a comma `,` and a space. - - `SELECT f(c1,c2)` and `df.select(vec![f.udf("f")?.call(vec![col("c1"), col("c2")])])` SHOULD result in field name: `f(table.c1, table.c2)` - -## Appendices - -### Examples and comparison with other systems - -Data schema for test sample queries: - -``` -CREATE TABLE t1 (id INT, a VARCHAR(5)); -INSERT INTO t1 (id, a) VALUES (1, 'foo'); -INSERT INTO t1 (id, a) VALUES (2, 'bar'); - -CREATE TABLE t2 (id INT, b VARCHAR(5)); -INSERT INTO t2 (id, b) VALUES (1, 'hello'); -INSERT INTO t2 (id, b) VALUES (2, 'world'); -``` - -#### Projected columns - -Query: - -``` -SELECT t1.id, a, t2.id, b -FROM t1 -JOIN t2 ON t1.id = t2.id -``` - -DataFusion Arrow record batches output: - -| id | a | id | b | -| --- | --- | --- | ----- | -| 1 | foo | 1 | hello | -| 2 | bar | 2 | world | - -Spark, MySQL 8 and PostgreSQL 13 output: - -| id | a | id | b | -| --- | --- | --- | ----- | -| 1 | foo | 1 | hello | -| 2 | bar | 2 | world | - -SQLite 3 output: - -| id | a | b | -| --- | --- | ----- | -| 1 | foo | hello | -| 2 | bar | world | - -#### Function transformed columns - -Query: - -``` -SELECT ABS(t1.id), abs(-id) FROM t1; -``` - -DataFusion Arrow record batches output: - -| abs(t1.id) | abs((- t1.id)) | -| ---------- | -------------- | -| 1 | 1 | -| 2 | 2 | - -Spark output: - -| abs(id) | abs((- id)) | -| ------- | ----------- | -| 1 | 1 | -| 2 | 2 | - -MySQL 8 output: - -| ABS(t1.id) | abs(-id) | -| ---------- | -------- | -| 1 | 1 | -| 2 | 2 | - -PostgreSQL 13 output: - -| abs | abs | -| --- | --- | -| 1 | 1 | -| 2 | 2 | - -SQlite 3 output: - -| ABS(t1.id) | abs(-id) | -| ---------- | -------- | -| 1 | 1 | -| 2 | 2 | - -#### Function with operators - -Query: - -``` -SELECT t1.id + ABS(id), ABS(id * t1.id) FROM t1; -``` - -DataFusion Arrow record batches 
output: - -| t1.id + abs(t1.id) | abs(t1.id \* t1.id) | -| ------------------ | ------------------- | -| 2 | 1 | -| 4 | 4 | - -Spark output: - -| id + abs(id) | abs(id \* id) | -| ------------ | ------------- | -| 2 | 1 | -| 4 | 4 | - -MySQL 8 output: - -| t1.id + ABS(id) | ABS(id \* t1.id) | -| --------------- | ---------------- | -| 2 | 1 | -| 4 | 4 | - -PostgreSQL output: - -| ?column? | abs | -| -------- | --- | -| 2 | 1 | -| 4 | 4 | - -SQLite output: - -| t1.id + ABS(id) | ABS(id \* t1.id) | -| --------------- | ---------------- | -| 2 | 1 | -| 4 | 4 | - -#### Project literals - -Query: - -``` -SELECT 1, 2+5, 'foo_bar'; -``` - -DataFusion Arrow record batches output: - -| 1 | (2 + 5) | foo_bar | -| --- | ------- | ------- | -| 1 | 7 | foo_bar | - -Spark output: - -| 1 | (2 + 5) | foo_bar | -| --- | ------- | ------- | -| 1 | 7 | foo_bar | - -MySQL output: - -| 1 | 2+5 | foo_bar | -| --- | --- | ------- | -| 1 | 7 | foo_bar | - -PostgreSQL output: - -| ?column? | ?column? | ?column? | -| -------- | -------- | -------- | -| 1 | 7 | foo_bar | - -SQLite 3 output: - -| 1 | 2+5 | 'foo_bar' | -| --- | --- | --------- | -| 1 | 7 | foo_bar | diff --git a/datafusion/_sources/index.rst.txt b/datafusion/_sources/index.rst.txt deleted file mode 100644 index 86b3b7e2c8ff..000000000000 --- a/datafusion/_sources/index.rst.txt +++ /dev/null @@ -1,54 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. 
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -======================= -Apache Arrow DataFusion -======================= - -Table of Contents -================= - -.. _toc.guide: - -.. toctree:: - :maxdepth: 1 - :caption: User Guide - - user-guide/introduction - user-guide/example-usage - user-guide/library - user-guide/cli - user-guide/dataframe - user-guide/expressions - user-guide/sql/index - user-guide/configs - user-guide/faq - Rust Crate Documentation - -.. _toc.contributor-guide: - -.. toctree:: - :maxdepth: 2 - :caption: Contributor Guide - - contributor-guide/index - contributor-guide/communication - contributor-guide/roadmap - contributor-guide/quarterly_roadmap - contributor-guide/specification/index - Issue tracker - Code of conduct diff --git a/datafusion/_sources/user-guide/cli.md.txt b/datafusion/_sources/user-guide/cli.md.txt deleted file mode 100644 index d3512a6dca52..000000000000 --- a/datafusion/_sources/user-guide/cli.md.txt +++ /dev/null @@ -1,351 +0,0 @@ - - -# DataFusion Command-line SQL Utility - -The DataFusion CLI is a command-line interactive SQL utility for executing -queries against any supported data files. It is a convenient way to -try DataFusion out with your own data sources, and test out its SQL support. - -## Example - -Create a CSV file to query. - -```shell -$ echo "a,b" > data.csv -$ echo "1,2" >> data.csv -``` - -Query that single file (the CLI also supports parquet, compressed csv, avro, json and more) - -```shell -$ datafusion-cli -DataFusion CLI v17.0.0 -❯ select * from 'data.csv'; -+---+---+ -| a | b | -+---+---+ -| 1 | 2 | -+---+---+ -1 row in set. Query took 0.007 seconds. 
-``` - -You can also query directories of files with compatible schemas: - -```shell -$ ls data_dir/ -data.csv data2.csv -``` - -```shell -$ datafusion-cli -DataFusion CLI v16.0.0 -❯ select * from 'data_dir'; -+---+---+ -| a | b | -+---+---+ -| 3 | 4 | -| 1 | 2 | -+---+---+ -2 rows in set. Query took 0.007 seconds. -``` - -## Installation - -### Install and run using Cargo - -The easiest way to give DataFusion CLI a spin is to install it via `cargo install datafusion-cli`. - -### Install and run using Homebrew (on MacOS) - -DataFusion CLI can also be installed via Homebrew (on MacOS). Install it as any other pre-built software like this: - -```bash -brew install datafusion -# ==> Downloading https://ghcr.io/v2/homebrew/core/datafusion/manifests/12.0.0 -# ######################################################################## 100.0% -# ==> Downloading https://ghcr.io/v2/homebrew/core/datafusion/blobs/sha256:9ecc8a01be47ceb9a53b39976696afa87c0a8 -# ==> Downloading from https://pkg-containers.githubusercontent.com/ghcr1/blobs/sha256:9ecc8a01be47ceb9a53b39976 -# ######################################################################## 100.0% -# ==> Pouring datafusion--12.0.0.big_sur.bottle.tar.gz -# 🍺 /usr/local/Cellar/datafusion/12.0.0: 9 files, 17.4MB - -datafusion-cli -``` - -### Run using Docker - -There is no officially published Docker image for the DataFusion CLI, so it is necessary to build from source -instead. - -Use the following commands to clone this repository and build a Docker image containing the CLI tool. Note -that there is a `.dockerignore` file in the root of the repository that may need to be deleted in order for -this to work. - -```bash -git clone https://github.com/apache/arrow-datafusion -git checkout 12.0.0 -cd arrow-datafusion -docker build -f datafusion-cli/Dockerfile .
--tag datafusion-cli -docker run -it -v $(your_data_location):/data datafusion-cli -``` - -## Usage - -See the current usage using `datafusion-cli --help`: - -```bash -Apache Arrow -Command Line Client for DataFusion query engine. - -USAGE: - datafusion-cli [OPTIONS] - -OPTIONS: - -c, --batch-size The batch size of each query, or use DataFusion default - -f, --file ... Execute commands from file(s), then exit - --format [default: table] [possible values: csv, tsv, table, json, - nd-json] - -h, --help Print help information - -p, --data-path Path to your data, default to current directory - -q, --quiet Reduce printing other than the results and work quietly - -r, --rc ... Run the provided files on startup instead of ~/.datafusionrc - -V, --version Print version information -``` - -## Selecting files directly - -Files can be queried directly by enclosing the file or -directory name in single `'` quotes as shown in the example. - -It is also possible to create a table backed by files explicitly -via `CREATE EXTERNAL TABLE` as shown below. - -## Registering Parquet Data Sources - -Parquet data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement. It is not necessary to provide schema information for Parquet files. - -```sql -CREATE EXTERNAL TABLE taxi -STORED AS PARQUET -LOCATION '/mnt/nyctaxi/tripdata.parquet'; -``` - -## Registering CSV Data Sources - -CSV data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement. - -```sql -CREATE EXTERNAL TABLE test -STORED AS CSV -WITH HEADER ROW -LOCATION '/path/to/aggregate_test_100.csv'; -``` - -It is also possible to provide schema information.
- -```sql -CREATE EXTERNAL TABLE test ( - c1 VARCHAR NOT NULL, - c2 INT NOT NULL, - c3 SMALLINT NOT NULL, - c4 SMALLINT NOT NULL, - c5 INT NOT NULL, - c6 BIGINT NOT NULL, - c7 SMALLINT NOT NULL, - c8 INT NOT NULL, - c9 BIGINT NOT NULL, - c10 VARCHAR NOT NULL, - c11 FLOAT NOT NULL, - c12 DOUBLE NOT NULL, - c13 VARCHAR NOT NULL -) -STORED AS CSV -LOCATION '/path/to/aggregate_test_100.csv'; -``` - -## Querying S3 Data Sources - -The CLI can query data in S3 if the following environment variables are defined: - -- `AWS_DEFAULT_REGION` -- `AWS_ACCESS_KEY_ID` -- `AWS_SECRET_ACCESS_KEY` - -Details of the environment variables that can be used are - -- AWS_ACCESS_KEY_ID -> access_key_id -- AWS_SECRET_ACCESS_KEY -> secret_access_key -- AWS_DEFAULT_REGION -> region -- AWS_ENDPOINT -> endpoint -- AWS_SESSION_TOKEN -> token -- AWS_CONTAINER_CREDENTIALS_RELATIVE_URI -> -- AWS_ALLOW_HTTP -> set to "true" to permit HTTP connections without TLS - -Example: - -```bash -$ aws s3 cp test.csv s3://my-bucket/ -upload: ./test.csv to s3://my-bucket/test.csv - -$ export AWS_DEFAULT_REGION=us-east-2 -$ export AWS_SECRET_ACCESS_KEY=*************************** -$ export AWS_ACCESS_KEY_ID=************** - -$ datafusion-cli -DataFusion CLI v14.0.0 -❯ create external table test stored as csv location 's3://my-bucket/test.csv'; -0 rows in set. Query took 0.374 seconds. -❯ select * from test; -+----------+----------+ -| column_1 | column_2 | -+----------+----------+ -| 1 | 2 | -+----------+----------+ -1 row in set. Query took 0.171 seconds. -``` - -## Commands - -Available commands inside DataFusion CLI are: - -- Quit - -```bash -> \q -``` - -- Help - -```bash -> \? 
-``` - -- ListTables - -```bash -> \d -``` - -- DescribeTable - -```bash -> \d table_name -``` - -- QuietMode - -```bash -> \quiet [true|false] -``` - -- list function - -```bash -> \h -``` - -- Search and describe function - -```bash -> \h function -``` - -- Show configuration options - -```SQL -> show all; - -+-------------------------------------------------+---------+ -| name | setting | -+-------------------------------------------------+---------+ -| datafusion.execution.batch_size | 8192 | -| datafusion.execution.coalesce_batches | true | -| datafusion.execution.coalesce_target_batch_size | 4096 | -| datafusion.execution.time_zone | UTC | -| datafusion.explain.logical_plan_only | false | -| datafusion.explain.physical_plan_only | false | -| datafusion.optimizer.filter_null_join_keys | false | -| datafusion.optimizer.skip_failed_rules | true | -+-------------------------------------------------+---------+ - -``` - -- Set configuration options - -```SQL -> SET datafusion.execution.batch_size to 1024; -``` - -## Changing Configuration Options - -All available configuration options can be seen using `SHOW ALL` as described above. - -You can change the configuration options using environment -variables. `datafusion-cli` looks in the corresponding environment -variable with an upper case name and all `.` converted to `_`. 
- -For example, to set `datafusion.execution.batch_size` to `1024` you -would set the `DATAFUSION_EXECUTION_BATCH_SIZE` environment variable -appropriately: - -```shell -$ DATAFUSION_EXECUTION_BATCH_SIZE=1024 datafusion-cli -DataFusion CLI v12.0.0 -❯ show all; -+-------------------------------------------------+---------+ -| name | setting | -+-------------------------------------------------+---------+ -| datafusion.execution.batch_size | 1024 | -| datafusion.execution.coalesce_batches | true | -| datafusion.execution.coalesce_target_batch_size | 4096 | -| datafusion.execution.time_zone | UTC | -| datafusion.explain.logical_plan_only | false | -| datafusion.explain.physical_plan_only | false | -| datafusion.optimizer.filter_null_join_keys | false | -| datafusion.optimizer.skip_failed_rules | true | -+-------------------------------------------------+---------+ -8 rows in set. Query took 0.002 seconds. -``` - -You can change the configuration options using `SET` statement as well - -```shell -$ datafusion-cli -DataFusion CLI v13.0.0 - -❯ show datafusion.execution.batch_size; -+---------------------------------+---------+ -| name | setting | -+---------------------------------+---------+ -| datafusion.execution.batch_size | 8192 | -+---------------------------------+---------+ -1 row in set. Query took 0.011 seconds. - -❯ set datafusion.execution.batch_size to 1024; -0 rows in set. Query took 0.000 seconds. - -❯ show datafusion.execution.batch_size; -+---------------------------------+---------+ -| name | setting | -+---------------------------------+---------+ -| datafusion.execution.batch_size | 1024 | -+---------------------------------+---------+ -1 row in set. Query took 0.005 seconds. 
-``` diff --git a/datafusion/_sources/user-guide/configs.md.txt b/datafusion/_sources/user-guide/configs.md.txt deleted file mode 100644 index 57d23ce69060..000000000000 --- a/datafusion/_sources/user-guide/configs.md.txt +++ /dev/null @@ -1,69 +0,0 @@ - - - - -# Configuration Settings - -The following configuration options can be passed to `SessionConfig` to control various aspects of query execution. - -For applications which do not expose `SessionConfig`, like `datafusion-cli`, these options may also be set via environment variables. -To construct a session with options from the environment, use `SessionConfig::from_env`. -The name of the environment variable is the option's key, transformed to uppercase and with periods replaced with underscores. -For example, to configure `datafusion.execution.batch_size` you would set the `DATAFUSION_EXECUTION_BATCH_SIZE` environment variable. -Values are parsed according to the [same rules used in casts from Utf8](https://docs.rs/arrow/latest/arrow/compute/kernels/cast/fn.cast.html). -If the value in the environment variable cannot be cast to the type of the configuration option, the default value will be used instead and a warning emitted. -Environment variables are read during `SessionConfig` initialisation so they must be set beforehand and will not affect running sessions. - -| key | default | description | -| --------------------------------------------------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| datafusion.catalog.create_default_catalog_and_schema | true | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of cpu cores on the system. 
| -| datafusion.catalog.default_catalog | datafusion | The default catalog name - this impacts what SQL queries use if not specified | -| datafusion.catalog.default_schema | public | The default schema name - this impacts what SQL queries use if not specified | -| datafusion.catalog.information_schema | false | Should DataFusion provide access to `information_schema` virtual tables for displaying schema information | -| datafusion.catalog.location | NULL | Location scanned to load tables for `default` schema | -| datafusion.catalog.format | NULL | Type of `TableProvider` to use when loading `default` schema | -| datafusion.catalog.has_header | false | If the file has a header | -| datafusion.execution.batch_size | 8192 | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption | -| datafusion.execution.coalesce_batches | true | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting | -| datafusion.execution.collect_statistics | false | Should DataFusion collect statistics after listing files | -| datafusion.execution.target_partitions | 0 | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of cpu cores on the system | -| datafusion.execution.time_zone | +00:00 | The default time zone. Some functions, e.g. EXTRACT(HOUR from SOME_TIME), shift the underlying datetime according to this time zone, and then extract the hour | -| datafusion.execution.parquet.enable_page_index | false | If true, uses parquet data page level metadata (Page Index) statistics to reduce the number of rows decoded.
| -| datafusion.execution.parquet.pruning | true | If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file | -| datafusion.execution.parquet.skip_metadata | true | If true, the parquet reader skips the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata | -| datafusion.execution.parquet.metadata_size_hint | NULL | If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer | -| datafusion.execution.parquet.pushdown_filters | false | If true, filter expressions are applied during the parquet decoding operation to reduce the number of rows decoded | -| datafusion.execution.parquet.reorder_filters | false | If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query | -| datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartition to increase parallelism to leverage more CPU cores | -| datafusion.optimizer.filter_null_join_keys | false | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down.
| -| datafusion.optimizer.repartition_aggregations | true | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level | -| datafusion.optimizer.repartition_joins | true | Should DataFusion repartition data using the join keys to execute joins in parallel using the provided `target_partitions` level | -| datafusion.optimizer.repartition_windows | true | Should DataFusion repartition data using the partition keys to execute window functions in parallel using the provided `target_partitions` level | -| datafusion.optimizer.skip_failed_rules | true | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail | -| datafusion.optimizer.max_passes | 3 | Number of times that the optimizer will attempt to optimize the plan | -| datafusion.optimizer.top_down_join_key_reordering | true | When set to true, the physical plan optimizer will run a top down process to reorder the join keys | -| datafusion.optimizer.prefer_hash_join | true | When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin.
HashJoin can work more efficiently than SortMergeJoin but consumes more memory | -| datafusion.optimizer.hash_join_single_partition_threshold | 1048576 | The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition | -| datafusion.explain.logical_plan_only | false | When set to true, the explain statement will only print logical plans | -| datafusion.explain.physical_plan_only | false | When set to true, the explain statement will only print physical plans | diff --git a/datafusion/_sources/user-guide/dataframe.md.txt b/datafusion/_sources/user-guide/dataframe.md.txt deleted file mode 100644 index 23766cd07bdb..000000000000 --- a/datafusion/_sources/user-guide/dataframe.md.txt +++ /dev/null @@ -1,105 +0,0 @@ - - -# DataFrame API - -A DataFrame represents a logical set of rows with the same named columns, similar to a [Pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html) or -[Spark DataFrame](https://spark.apache.org/docs/latest/sql-programming-guide.html). - -DataFrames are typically created by calling a method on -`SessionContext`, such as `read_csv`, and can then be modified -by calling the transformation methods, such as `filter`, `select`, `aggregate`, and `limit` -to build up a query definition. - -The query can be executed by calling the `collect` method. - -The DataFrame struct is part of DataFusion's prelude and can be imported with the following statement. - -```rust -use datafusion::prelude::*; -``` - -Here is a minimal example showing the execution of a query using the DataFrame API. - -```rust -let ctx = SessionContext::new(); -let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; -let df = df.filter(col("a").lt_eq(col("b")))? - .aggregate(vec![col("a")], vec![min(col("b"))])? 
- .limit(0, Some(100))?; -// Print results -df.show(); -``` - -The DataFrame API is well documented in the [API reference on docs.rs](https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html). - -Refer to the [Expressions Reference](expressions) for available functions for building logical expressions for use with the -DataFrame API. - -## DataFrame Transformations - -These methods create a new DataFrame after applying a transformation to the logical plan that the DataFrame represents. - -DataFusion DataFrames use lazy evaluation, meaning that each transformation is just creating a new query plan and -not actually performing any transformations. This approach allows for the overall plan to be optimized before -execution. The plan is evaluated (executed) when an action method is invoked, such as `collect`. - -| Function | Notes | -| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | -| aggregate | Perform an aggregate query with optional grouping expressions. | -| distinct | Filter out duplicate rows. | -| except | Calculate the exception of two DataFrames. The two DataFrames must have exactly the same schema | -| filter | Filter a DataFrame to only include rows that match the specified filter expression. | -| intersect | Calculate the intersection of two DataFrames. The two DataFrames must have exactly the same schema | -| join | Join this DataFrame with another DataFrame using the specified columns as join keys. | -| limit | Limit the number of rows returned from this DataFrame. | -| repartition | Repartition a DataFrame based on a logical partitioning scheme. | -| sort | Sort the DataFrame by the specified sorting expressions. Any expression can be turned into a sort expression by calling its `sort` method. | -| select | Create a projection based on arbitrary expressions. 
Example: `df.select(vec![col("c1"), abs(col("c2"))])?` | -| select_columns | Create a projection based on column names. Example: `df.select_columns(&["id", "name"])?`. | -| union | Calculate the union of two DataFrames, preserving duplicate rows. The two DataFrames must have exactly the same schema. | -| union_distinct | Calculate the distinct union of two DataFrames. The two DataFrames must have exactly the same schema. | -| with_column | Add an additional column to the DataFrame. | -| with_column_renamed | Rename one column by applying a new projection. | - -## DataFrame Actions - -These methods execute the logical plan represented by the DataFrame and either collect the results into memory, print them to stdout, or write them to disk. - -| Function | Notes | -| -------------------------- | --------------------------------------------------------------------------------------------------------------------------- | -| collect | Executes this DataFrame and collects all results into a vector of RecordBatch. | -| collect_partitioned | Executes this DataFrame and collects all results into a vector of vector of RecordBatch maintaining the input partitioning. | -| execute_stream | Executes this DataFrame and returns a stream over a single partition. | -| execute_stream_partitioned | Executes this DataFrame and returns one stream per partition. | -| show | Execute this DataFrame and print the results to stdout. | -| show_limit | Execute this DataFrame and print a subset of results to stdout. | -| write_csv | Execute this DataFrame and write the results to disk in CSV format. | -| write_json | Execute this DataFrame and write the results to disk in JSON format. | -| write_parquet | Execute this DataFrame and write the results to disk in Parquet format.
| - -## Other DataFrame Methods - -| Function | Notes | -| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| explain | Return a DataFrame with the explanation of its plan so far. | -| registry | Return a `FunctionRegistry` used to plan udf's calls. | -| schema | Returns the schema describing the output of this DataFrame in terms of columns returned, where each column has a name, data type, and nullability attribute. | -| to_logical_plan | Return the optimized logical plan represented by this DataFrame. | -| to_unoptimized_plan | Return the unoptimized logical plan represented by this DataFrame. | diff --git a/datafusion/_sources/user-guide/example-usage.md.txt b/datafusion/_sources/user-guide/example-usage.md.txt deleted file mode 100644 index 03283a408c03..000000000000 --- a/datafusion/_sources/user-guide/example-usage.md.txt +++ /dev/null @@ -1,140 +0,0 @@ - - -# Example Usage - -In this example some simple processing is performed on the [`example.csv`](../../../datafusion/core/tests/data/example.csv) file. 
- -## Update `Cargo.toml` - -Add the following to your `Cargo.toml` file: - -```toml -datafusion = "11.0" -tokio = "1.0" -``` - -## Run a SQL query against data stored in a CSV: - -```rust -use datafusion::prelude::*; - -#[tokio::main] -async fn main() -> datafusion::error::Result<()> { - // register the table - let ctx = SessionContext::new(); - ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()).await?; - - // create a plan to run a SQL query - let df = ctx.sql("SELECT a, MIN(b) FROM example GROUP BY a LIMIT 100").await?; - - // execute and print results - df.show().await?; - Ok(()) -} -``` - -## Use the DataFrame API to process data stored in a CSV: - -```rust -use datafusion::prelude::*; - -#[tokio::main] -async fn main() -> datafusion::error::Result<()> { - // create the dataframe - let ctx = SessionContext::new(); - let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - - let df = df.filter(col("a").lt_eq(col("b")))? - .aggregate(vec![col("a")], vec![min(col("b"))])? - .limit(0, Some(100))?; - - // execute and print results - df.show().await?; - Ok(()) -} -``` - -## Output from both examples - -```text -+---+--------+ -| a | MIN(b) | -+---+--------+ -| 1 | 2 | -+---+--------+ -``` - -# Identifiers and Capitalization - -Please be aware that all identifiers are effectively made lower-case in SQL, so if your csv file has capital letters (ex: `Name`) you must put your column name in double quotes or the examples won't work. 
- -To illustrate this behavior, consider the [`capitalized_example.csv`](../../../datafusion/core/tests/data/capitalized_example.csv) file: - -## Run a SQL query against data stored in a CSV: - -```rust -use datafusion::prelude::*; - -#[tokio::main] -async fn main() -> datafusion::error::Result<()> { - // register the table - let ctx = SessionContext::new(); - ctx.register_csv("example", "tests/data/capitalized_example.csv", CsvReadOptions::new()).await?; - - // create a plan to run a SQL query - let df = ctx.sql("SELECT \"A\", MIN(b) FROM example GROUP BY \"A\" LIMIT 100").await?; - - // execute and print results - df.show().await?; - Ok(()) -} -``` - -## Use the DataFrame API to process data stored in a CSV: - -```rust -use datafusion::prelude::*; - -#[tokio::main] -async fn main() -> datafusion::error::Result<()> { - // create the dataframe - let ctx = SessionContext::new(); - let df = ctx.read_csv("tests/data/capitalized_example.csv", CsvReadOptions::new()).await?; - - let df = df.filter(col("A").lt_eq(col("c")))? - .aggregate(vec![col("A")], vec![min(col("b"))])? - .limit(0, Some(100))?; - - // execute and print results - df.show().await?; - Ok(()) -} -``` - -## Output from both examples - -```text -+---+--------+ -| A | MIN(b) | -+---+--------+ -| 2 | 1 | -| 1 | 2 | -+---+--------+ -``` diff --git a/datafusion/_sources/user-guide/expressions.md.txt b/datafusion/_sources/user-guide/expressions.md.txt deleted file mode 100644 index 5dc3520d10ce..000000000000 --- a/datafusion/_sources/user-guide/expressions.md.txt +++ /dev/null @@ -1,211 +0,0 @@ - - -# Expressions - -DataFrame methods such as `select` and `filter` accept one or more logical expressions and there are many functions -available for creating logical expressions. These are documented below. 
- -Expressions can be chained together using a fluent-style API: - -```rust -// create the expression `(a > 5) AND (b < 7)` -col("a").gt(lit(5)).and(col("b").lt(lit(7))) -``` - -## Identifiers - -| Function | Notes | -| -------- | -------------------------------------------- | -| col | Reference a column in a dataframe `col("a")` | - -## Literal Values - -| Function | Notes | -| -------- | -------------------------------------------------- | -| lit | Literal value such as `lit(123)` or `lit("hello")` | - -## Boolean Expressions - -| Function | Notes | -| -------- | ----------------------------------------- | -| and | `and(expr1, expr2)` or `expr1.and(expr2)` | -| or | `or(expr1, expr2)` or `expr1.or(expr2)` | -| not | `not(expr)` or `expr.not()` | - -## Comparison Expressions - -| Function | Notes | -| -------- | --------------------- | -| eq | `expr1.eq(expr2)` | -| gt | `expr1.gt(expr2)` | -| gt_eq | `expr1.gt_eq(expr2)` | -| lt | `expr1.lt(expr2)` | -| lt_eq | `expr1.lt_eq(expr2)` | -| not_eq | `expr1.not_eq(expr2)` | - -## Math Functions - -In addition to the math functions listed here, some Rust operators are implemented for expressions, allowing -expressions such as `col("a") + col("b")` to be used. 
- -| Function | Notes | -| --------------------- | ------------------------------------------------- | -| abs(x) | absolute value | -| acos(x) | inverse cosine | -| asin(x) | inverse sine | -| atan(x) | inverse tangent | -| atan2(y, x) | inverse tangent of y / x | -| ceil(x) | nearest integer greater than or equal to argument | -| cos(x) | cosine | -| exp(x) | exponential | -| floor(x) | nearest integer less than or equal to argument | -| ln(x) | natural logarithm | -| log10(x) | base 10 logarithm | -| log2(x) | base 2 logarithm | -| power(base, exponent) | base raised to the power of exponent | -| round(x) | round to nearest integer | -| signum(x) | sign of the argument (-1, 0, +1) | -| sin(x) | sine | -| sqrt(x) | square root | -| tan(x) | tangent | -| trunc(x) | truncate toward zero | - -## Bitwise Operators - -| Operator | Notes | -| -------- | ----------------------------------------------- | -| & | Bitwise AND => `(expr1 & expr2)` | -| | | Bitwise OR => (expr1 | expr2) | -| # | Bitwise XOR => `(expr1 # expr2)` | -| << | Bitwise left shift => `(expr1 << expr2)` | -| >> | Bitwise right shift => `(expr1 >> expr2)` | - -## Conditional Expressions - -| Function | Notes | -| -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| coalesce | Returns the first of its arguments that is not null. Null is returned only if all arguments are null. It is often used to substitute a default value for null values when data is retrieved for display. | -| case | CASE expression. Example: `case(expr).when(expr, expr).when(expr, expr).otherwise(expr).end()`. | -| nullif | Returns a null value if `value1` equals `value2`; otherwise it returns `value1`. This can be used to perform the inverse operation of the `coalesce` expression.
| - -## String Expressions - -| Function | Notes | -| ---------------- | ----- | -| ascii | | -| bit_length | | -| btrim | | -| char_length | | -| character_length | | -| concat | | -| concat_ws | | -| chr | | -| initcap | | -| left | | -| length | | -| lower | | -| lpad | | -| ltrim | | -| md5 | | -| octet_length | | -| repeat | | -| replace | | -| reverse | | -| right | | -| rpad | | -| rtrim | | -| digest | | -| split_part | | -| starts_with | | -| strpos | | -| substr | | -| translate | | -| trim | | -| upper | | - -## Regular Expressions - -| Function | Notes | -| -------------- | ----- | -| regexp_match | | -| regexp_replace | | - -## Temporal Expressions - -| Function | Notes | -| -------------------- | ------------ | -| date_part | | -| date_trunc | | -| from_unixtime | | -| to_timestamp | | -| to_timestamp_millis | | -| to_timestamp_micros | | -| to_timestamp_seconds | | -| now() | current time | - -## Other Expressions - -| Function | Notes | -| -------- | ----- | -| array | | -| in_list | | -| random | | -| sha224 | | -| sha256 | | -| sha384 | | -| sha512 | | -| struct | | -| to_hex | | - -## Aggregate Functions - -| Function | Notes | -| ---------------------------------- | ----- | -| avg | | -| approx_distinct | | -| approx_median | | -| approx_percentile_cont | | -| approx_percentile_cont_with_weight | | -| count | | -| count_distinct | | -| cube | | -| grouping_set | | -| max | | -| median | | -| min | | -| rollup | | -| sum | | - -## Subquery Expressions - -| Function | Notes | -| --------------- | --------------------------------------------------------------------------------------------- | -| exists | | -| in_subquery | `df1.filter(in_subquery(col("foo"), df2))?` is the equivalent of the SQL `WHERE foo IN ` | -| not_exists | | -| not_in_subquery | | -| scalar_subquery | | - -## User-Defined Function Expressions - -| Function | Notes | -| ----------- | ----- | -| create_udf | | -| create_udaf | | diff --git 
a/datafusion/_sources/user-guide/faq.md.txt b/datafusion/_sources/user-guide/faq.md.txt deleted file mode 100644 index 16a8873fff38..000000000000 --- a/datafusion/_sources/user-guide/faq.md.txt +++ /dev/null @@ -1,31 +0,0 @@ - - -# Frequently Asked Questions - -## What is the relationship between Apache Arrow, DataFusion, and Ballista? - -Apache Arrow is a library which provides a standardized memory representation for columnar data. It also provides -"kernels" for performing common operations on this data. - -DataFusion is a library for executing queries in-process using the Apache Arrow memory -model and computational kernels. It is designed to run within a single process, using threads -for parallel query execution. - -[Ballista](https://github.com/apache/arrow-ballista) is a distributed compute platform built on DataFusion. diff --git a/datafusion/_sources/user-guide/introduction.md.txt b/datafusion/_sources/user-guide/introduction.md.txt deleted file mode 100644 index e16504091571..000000000000 --- a/datafusion/_sources/user-guide/introduction.md.txt +++ /dev/null @@ -1,43 +0,0 @@ - - -# Introduction - -DataFusion is an extensible query execution framework, written in -Rust, that uses [Apache Arrow](https://arrow.apache.org) as its -in-memory format. - -DataFusion supports both an SQL and a DataFrame API for building -logical query plans as well as a query optimizer and execution engine -capable of parallel execution against partitioned data sources (CSV -and Parquet) using threads. - -## Use Cases - -DataFusion is used to create modern, fast and efficient data -pipelines, ETL processes, and database systems, which need the -performance of Rust and Apache Arrow and want to provide their users -the convenience of an SQL interface or a DataFrame API. - -## Why DataFusion? 
- -- _High Performance_: Leveraging Rust and Arrow's memory model, DataFusion achieves very high performance -- _Easy to Connect_: Being part of the Apache Arrow ecosystem (Arrow, Parquet and Flight), DataFusion works well with the rest of the big data ecosystem -- _Easy to Embed_: Allowing extension at almost any point in its design, DataFusion can be tailored for your specific usecase -- _High Quality_: Extensively tested, both by itself and with the rest of the Arrow ecosystem, DataFusion can be used as the foundation for production systems. diff --git a/datafusion/_sources/user-guide/library.md.txt b/datafusion/_sources/user-guide/library.md.txt deleted file mode 100644 index c7cc1ec425ef..000000000000 --- a/datafusion/_sources/user-guide/library.md.txt +++ /dev/null @@ -1,127 +0,0 @@ - - -# Using DataFusion as a library - -## Create a new project - -```shell -cargo new hello_datafusion -``` - -```shell -$ cd hello_datafusion -$ tree . -. -├── Cargo.toml -└── src - └── main.rs - -1 directory, 2 files -``` - -## Default Configuration - -DataFusion is [published on crates.io](https://crates.io/crates/datafusion), and is [well documented on docs.rs](https://docs.rs/datafusion/). - -To get started, add the following to your `Cargo.toml` file: - -```toml -[dependencies] -datafusion = "11.0" -``` - -## Create a main function - -Update the main.rs file with your first datafusion application based on [Example usage](https://arrow.apache.org/datafusion/user-guide/example-usage.html) - -```rust -use datafusion::prelude::*; - -#[tokio::main] -async fn main() -> datafusion::error::Result<()> { - // register the table - let ctx = SessionContext::new(); - ctx.register_csv("test", "", CsvReadOptions::new()).await?; - - // create a plan to run a SQL query - let df = ctx.sql("SELECT * FROM test").await?; - - // execute and print results - df.show().await?; - Ok(()) -} -``` - -## Extensibility - -DataFusion is designed to be extensible at all points. 
To that end, you can provide your own custom: - -- [x] User Defined Functions (UDFs) -- [x] User Defined Aggregate Functions (UDAFs) -- [x] User Defined Table Source (`TableProvider`) for tables -- [x] User Defined `Optimizer` passes (plan rewrites) -- [x] User Defined `LogicalPlan` nodes -- [x] User Defined `ExecutionPlan` nodes - -## Rust Version Compatibility - -This crate is tested with the latest stable version of Rust. We do not currently test against other, older versions of the Rust compiler. - -## Optimized Configuration - -For an optimized build several steps are required. First, use the below in your `Cargo.toml`. It is -worth noting that using the settings in the `[profile.release]` section will significantly increase the build time. - -```toml -[dependencies] -datafusion = { version = "11.0" , features = ["simd"]} -tokio = { version = "^1.0", features = ["rt-multi-thread"] } -snmalloc-rs = "0.2" - -[profile.release] -lto = true -codegen-units = 1 -``` - -Then, in `main.rs`, update the memory allocator with the below after your imports: - -```rust -use datafusion::prelude::*; - -#[global_allocator] -static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; - -async fn main() -> datafusion::error::Result<()> { - Ok(()) -} -``` - -Finally, in order to build with the `simd` optimization, the nightly Rust toolchain is required. - -```shell -rustup toolchain install nightly -``` - -Based on the instruction set architecture you are building on you will want to configure the `target-cpu` as well, ideally -with `native` or at least `avx2`. 
- -``` -RUSTFLAGS='-C target-cpu=native' cargo +nightly run --release -``` diff --git a/datafusion/_sources/user-guide/sql/aggregate_functions.md.txt b/datafusion/_sources/user-guide/sql/aggregate_functions.md.txt deleted file mode 100644 index e8299b6193c2..000000000000 --- a/datafusion/_sources/user-guide/sql/aggregate_functions.md.txt +++ /dev/null @@ -1,68 +0,0 @@ - - -# Aggregate Functions - -Aggregate functions operate on a set of values to compute a single result. Please refer to [PostgreSQL](https://www.postgresql.org/docs/current/functions-aggregate.html) for usage of standard SQL functions. - -## General - -- min -- max -- count -- avg -- sum -- array_agg - -## Statistical - -- var / var_samp / var_pop -- stddev / stddev_samp / stddev_pop -- covar / covar_samp / covar_pop -- corr - -## Approximate - -### approx_distinct - -`approx_distinct(x) -> uint64` returns the approximate number (HyperLogLog) of distinct input values - -### approx_median - -`approx_median(x) -> x` returns the approximate median of input values. It is an alias of `approx_percentile_cont(x, 0.5)`. - -### approx_percentile_cont - -`approx_percentile_cont(x, p) -> x` returns the approximate percentile (TDigest) of input values, where `p` is a float64 between 0 and 1 (inclusive). - -It supports raw data as input and builds TDigest sketches during query time, and is approximately equal to `approx_percentile_cont_with_weight(x, 1, p)`. - -`approx_percentile_cont(x, p, n) -> x` returns the approximate percentile (TDigest) of input values, where `p` is a float64 between 0 and 1 (inclusive), - -and `n` (default 100) is the number of centroids in the TDigest, which means that if there are `n` or fewer unique values in `x`, you can expect an exact result. - -A higher value of `n` results in a more accurate approximation at the cost of higher memory usage. 
- -### approx_percentile_cont_with_weight - -`approx_percentile_cont_with_weight(x, w, p) -> x` returns the approximate percentile (TDigest) of input values with weight, where `w` is weight column expression and `p` is a float64 between 0 and 1 (inclusive). - -It supports raw data as input or pre-aggregated TDigest sketches, then builds or merges TDigest sketches during query time. TDigest sketches are a list of centroids `(x, w)`, where `x` stands for mean and `w` stands for weight. - -It is suitable for low-latency OLAP systems where a streaming compute engine (e.g. Spark Streaming/Flink) pre-aggregates data to a data store, which is then queried using DataFusion. diff --git a/datafusion/_sources/user-guide/sql/data_types.md.txt b/datafusion/_sources/user-guide/sql/data_types.md.txt deleted file mode 100644 index 1d5c0f9fc078..000000000000 --- a/datafusion/_sources/user-guide/sql/data_types.md.txt +++ /dev/null @@ -1,90 +0,0 @@ - - -# Data Types - -DataFusion uses Arrow, and thus the Arrow type system, for query -execution. The SQL types from -[sqlparser-rs](https://github.com/sqlparser-rs/sqlparser-rs/blob/main/src/ast/data_type.rs#L27) -are mapped to [Arrow data types](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) according to the following table. -This mapping occurs when defining the schema in a `CREATE EXTERNAL TABLE` command or when performing a SQL `CAST` operation. 
- -## Character Types - -| SQL DataType | Arrow DataType | -| ------------ | -------------- | -| `CHAR` | `Utf8` | -| `VARCHAR` | `Utf8` | -| `TEXT` | `Utf8` | - -## Numeric Types - -| SQL DataType | Arrow DataType | Notes | -| ------------------------------------ | :---------------------------- | ----------------------------------------------------------------------------------------------------------- | -| `TINYINT` | `Int8` | | -| `SMALLINT` | `Int16` | | -| `INT` or `INTEGER` | `Int32` | | -| `BIGINT` | `Int64` | | -| `TINYINT UNSIGNED` | `UInt8` | | -| `SMALLINT UNSIGNED` | `UInt16` | | -| `INT UNSIGNED` or `INTEGER UNSIGNED` | `UInt32` | | -| `BIGINT UNSIGNED` | `UInt64` | | -| `FLOAT` | `Float32` | | -| `REAL` | `Float32` | | -| `DOUBLE` | `Float64` | | -| `DECIMAL(precision,scale)` | `Decimal128(precision,scale)` | Decimal support is currently experimental ([#3523](https://github.com/apache/arrow-datafusion/issues/3523)) | - -## Date/Time Types - -| SQL DataType | Arrow DataType | -| ------------ | :-------------------------------------- | -| `DATE` | `Date32` | -| `TIME` | `Time64(TimeUnit::Nanosecond)` | -| `TIMESTAMP` | `Timestamp(TimeUnit::Nanosecond, None)` | - -## Boolean Types - -| SQL DataType | Arrow DataType | -| ------------ | :------------- | -| `BOOLEAN` | `Boolean` | - -## Binary Types - -| SQL DataType | Arrow DataType | -| ------------ | :------------- | -| `BYTEA` | `Binary` | - -## Unsupported Types - -| SQL Data Type | Arrow DataType | -| ------------- | :------------------ | -| `UUID` | _Not yet supported_ | -| `BLOB` | _Not yet supported_ | -| `CLOB` | _Not yet supported_ | -| `BINARY` | _Not yet supported_ | -| `VARBINARY` | _Not yet supported_ | -| `REGCLASS` | _Not yet supported_ | -| `NVARCHAR` | _Not yet supported_ | -| `STRING` | _Not yet supported_ | -| `CUSTOM` | _Not yet supported_ | -| `ARRAY` | _Not yet supported_ | -| `ENUM` | _Not yet supported_ | -| `SET` | _Not yet supported_ | -| `INTERVAL` | _Not yet supported_ | -| 
`DATETIME` | _Not yet supported_ | diff --git a/datafusion/_sources/user-guide/sql/ddl.md.txt b/datafusion/_sources/user-guide/sql/ddl.md.txt deleted file mode 100644 index c531312b1e58..000000000000 --- a/datafusion/_sources/user-guide/sql/ddl.md.txt +++ /dev/null @@ -1,154 +0,0 @@ - - -# DDL - -## CREATE EXTERNAL TABLE - -Parquet data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement. It is not necessary -to provide schema information for Parquet files. - -```sql -CREATE EXTERNAL TABLE taxi -STORED AS PARQUET -LOCATION '/mnt/nyctaxi/tripdata.parquet'; -``` - -CSV data sources can also be registered by executing a `CREATE EXTERNAL TABLE` SQL statement. The schema will be -inferred based on scanning a subset of the file. - -```sql -CREATE EXTERNAL TABLE test -STORED AS CSV -WITH HEADER ROW -LOCATION '/path/to/aggregate_simple.csv'; -``` - -It is also possible to specify the schema manually. - -```sql -CREATE EXTERNAL TABLE test ( - c1 VARCHAR NOT NULL, - c2 INT NOT NULL, - c3 SMALLINT NOT NULL, - c4 SMALLINT NOT NULL, - c5 INT NOT NULL, - c6 BIGINT NOT NULL, - c7 SMALLINT NOT NULL, - c8 INT NOT NULL, - c9 BIGINT NOT NULL, - c10 VARCHAR NOT NULL, - c11 FLOAT NOT NULL, - c12 DOUBLE NOT NULL, - c13 VARCHAR NOT NULL -) -STORED AS CSV -WITH HEADER ROW -LOCATION '/path/to/aggregate_test_100.csv'; -``` - -If data sources are already partitioned in Hive style, `PARTITIONED BY` can be used for partition pruning. - -``` -/mnt/nyctaxi/year=2022/month=01/tripdata.parquet -/mnt/nyctaxi/year=2021/month=12/tripdata.parquet -/mnt/nyctaxi/year=2021/month=11/tripdata.parquet -``` - -```sql -CREATE EXTERNAL TABLE taxi -STORED AS PARQUET -PARTITIONED BY (year, month) -LOCATION '/mnt/nyctaxi'; -``` - -## CREATE TABLE - -An in-memory table can be created with a query or values list. - -
    -CREATE [OR REPLACE] TABLE [IF NOT EXISTS] table_name AS [SELECT | VALUES LIST];
    -
    - -```sql -CREATE TABLE IF NOT EXISTS valuetable AS VALUES(1,'HELLO'),(12,'DATAFUSION'); - -CREATE TABLE memtable as select * from valuetable; -``` - -## DROP TABLE - -Removes the table from DataFusion's catalog. - -
    -DROP TABLE [ IF EXISTS ] table_name;
    -
    - -```sql -CREATE TABLE users AS VALUES(1,2),(2,3); -DROP TABLE users; --- or use 'if exists' to silently ignore if the table doesn't exist -DROP TABLE IF EXISTS nonexistent_table; -``` - -## CREATE VIEW - -A view is a virtual table based on the result of a SQL query. It can be created from an existing table or a values list. - -
    -CREATE VIEW view_name AS statement;
    -
    - -```sql -CREATE TABLE users AS VALUES(1,2),(2,3),(3,4),(4,5); -CREATE VIEW test AS SELECT column1 FROM users; -SELECT * FROM test; -+---------+ -| column1 | -+---------+ -| 1 | -| 2 | -| 3 | -| 4 | -+---------+ -``` - -```sql -CREATE VIEW test AS VALUES(1,2),(5,6); -SELECT * FROM test; -+---------+---------+ -| column1 | column2 | -+---------+---------+ -| 1 | 2 | -| 5 | 6 | -+---------+---------+ -``` - -## DROP VIEW - -Removes the view from DataFusion's catalog. - -
    -DROP VIEW [ IF EXISTS ] view_name;
    -
    - -```sql --- drop users_v view from the customer_a schema -DROP VIEW IF EXISTS customer_a.users_v; -``` diff --git a/datafusion/_sources/user-guide/sql/explain.md.txt b/datafusion/_sources/user-guide/sql/explain.md.txt deleted file mode 100644 index ae0795f9ab4b..000000000000 --- a/datafusion/_sources/user-guide/sql/explain.md.txt +++ /dev/null @@ -1,71 +0,0 @@ - - -# EXPLAIN - -The `EXPLAIN` command shows the logical and physical execution plan for the specified SQL statement. - -
    -EXPLAIN [ANALYZE] [VERBOSE] statement
    -
    - -## EXPLAIN - -Shows the execution plan of a statement. -If you need more details output, try to use `EXPLAIN VERBOSE`. - -```sql -EXPLAIN SELECT SUM(x) FROM table GROUP BY b; -+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #SUM(table.x) | -| | Aggregate: groupBy=[[#table.b]], aggr=[[SUM(#table.x)]] | -| | TableScan: table projection=[x, b] | -| physical_plan | ProjectionExec: expr=[SUM(table.x)@1 as SUM(table.x)] | -| | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[SUM(table.x)] | -| | CoalesceBatchesExec: target_batch_size=4096 | -| | RepartitionExec: partitioning=Hash([Column { name: "b", index: 0 }], 16) | -| | AggregateExec: mode=Partial, gby=[b@1 as b], aggr=[SUM(table.x)] | -| | RepartitionExec: partitioning=RoundRobinBatch(16) | -| | CsvExec: source=Path(/tmp/table.csv: [/tmp/table.csv]), has_header=false, limit=None, projection=[x, b] | -| | | -+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+ -``` - -## EXPLAIN ANALYZE - -Shows the execution plan and metrics of a statement. -If you need more information output, try to use `EXPLAIN ANALYZE VERBOSE`. 
- -```sql -EXPLAIN ANALYZE SELECT SUM(x) FROM table GROUP BY b; -+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------+ -| Plan with Metrics | CoalescePartitionsExec, metrics=[] | -| | ProjectionExec: expr=[SUM(table.x)@1 as SUM(x)], metrics=[] | -| | HashAggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[SUM(x)], metrics=[outputRows=2] | -| | CoalesceBatchesExec: target_batch_size=4096, metrics=[] | -| | RepartitionExec: partitioning=Hash([Column { name: "b", index: 0 }], 16), metrics=[sendTime=839560, fetchTime=122528525, repartitionTime=5327877] | -| | HashAggregateExec: mode=Partial, gby=[b@1 as b], aggr=[SUM(x)], metrics=[outputRows=2] | -| | RepartitionExec: partitioning=RoundRobinBatch(16), metrics=[fetchTime=5660489, repartitionTime=0, sendTime=8012] | -| | CsvExec: source=Path(/tmp/table.csv: [/tmp/table.csv]), has_header=false, metrics=[] | -+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------+ -``` diff --git a/datafusion/_sources/user-guide/sql/index.rst.txt b/datafusion/_sources/user-guide/sql/index.rst.txt deleted file mode 100644 index 373d60eb1e10..000000000000 --- a/datafusion/_sources/user-guide/sql/index.rst.txt +++ /dev/null @@ -1,32 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. 
"License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -SQL Reference -============= - -.. toctree:: - :maxdepth: 2 - - data_types - select - subqueries - ddl - explain - information_schema - aggregate_functions - scalar_functions - sql_status diff --git a/datafusion/_sources/user-guide/sql/information_schema.md.txt b/datafusion/_sources/user-guide/sql/information_schema.md.txt deleted file mode 100644 index b3fcc843bd9f..000000000000 --- a/datafusion/_sources/user-guide/sql/information_schema.md.txt +++ /dev/null @@ -1,72 +0,0 @@ - - -# Information Schema - -DataFusion supports showing metadata about the tables and views available. This information can be accessed using the -views of the ISO SQL `information_schema` schema or the DataFusion specific `SHOW TABLES` and `SHOW COLUMNS` commands. 
- -To show tables in the DataFusion catalog, use the `SHOW TABLES` command or the `information_schema.tables` view: - -```sql -> show tables; -or -> select * from information_schema.tables; -+---------------+--------------------+------------+------------+ -| table_catalog | table_schema | table_name | table_type | -+---------------+--------------------+------------+------------+ -| datafusion | public | t | BASE TABLE | -| datafusion | information_schema | tables | VIEW | -| datafusion | information_schema | views | VIEW | -| datafusion | information_schema | columns | VIEW | -+---------------+--------------------+------------+------------+ - -``` - -To show the schema of a table in DataFusion, use the `SHOW COLUMNS` command or the `information_schema.columns` view: - -```sql -> show columns from t; -or -> select table_catalog, table_schema, table_name, column_name, data_type, is_nullable from information_schema.columns; -+---------------+--------------+------------+-------------+-----------+-------------+ -| table_catalog | table_schema | table_name | column_name | data_type | is_nullable | -+---------------+--------------+------------+-------------+-----------+-------------+ -| datafusion | public | t | Int64(1) | Int64 | NO | -+---------------+--------------+------------+-------------+-----------+-------------+ -``` - -To show the current session configuration options, use the `SHOW ALL` command or the `information_schema.df_settings` view: - -```sql -❯ select * from information_schema.df_settings; - -+-------------------------------------------------+---------+ -| name | setting | -+-------------------------------------------------+---------+ -| datafusion.execution.batch_size | 8192 | -| datafusion.execution.coalesce_batches | true | -| datafusion.execution.coalesce_target_batch_size | 4096 | -| datafusion.execution.time_zone | UTC | -| datafusion.explain.logical_plan_only | false | -| datafusion.explain.physical_plan_only | false | -| 
datafusion.optimizer.filter_null_join_keys | false | -| datafusion.optimizer.skip_failed_rules | true | -+-------------------------------------------------+---------+ -``` diff --git a/datafusion/_sources/user-guide/sql/scalar_functions.md.txt b/datafusion/_sources/user-guide/sql/scalar_functions.md.txt deleted file mode 100644 index 11725f90d93d..000000000000 --- a/datafusion/_sources/user-guide/sql/scalar_functions.md.txt +++ /dev/null @@ -1,297 +0,0 @@ - - -# Scalar Functions - -## Math Functions - -### `abs(x)` - -absolute value - -### `acos(x)` - -inverse cosine - -### `asin(x)` - -inverse sine - -### `atan(x)` - -inverse tangent - -### `atan2(y, x)` - -inverse tangent of y / x - -### `ceil(x)` - -nearest integer greater than or equal to argument - -### `cos(x)` - -cosine - -### `exp(x)` - -exponential - -### `floor(x)` - -nearest integer less than or equal to argument - -### `ln(x)` - -natural logarithm - -### `log10(x)` - -base 10 logarithm - -### `log2(x)` - -base 2 logarithm - -### `power(base, exponent)` - -base raised to the power of exponent - -### `round(x)` - -round to nearest integer - -### `signum(x)` - -sign of the argument (-1, 0, +1) - -### `sin(x)` - -sine - -### `sqrt(x)` - -square root - -### `tan(x)` - -tangent - -### `trunc(x)` - -truncate toward zero - -## Conditional Functions - -### `coalesce` - -Returns the first of its arguments that is not null. Null is returned only if all arguments are null. It is often used to substitute a default value for null values when data is retrieved for display. - -### `nullif` - -Returns a null value if value1 equals value2; otherwise it returns value1. This can be used to perform the inverse operation of the `coalesce` expression. 
- -## String Functions - -### `ascii` - -### `bit_length` - -### `btrim` - -### `char_length` - -### `character_length` - -### `concat` - -### `concat_ws` - -### `chr` - -### `initcap` - -### `left` - -### `length` - -### `lower` - -### `lpad` - -### `ltrim` - -### `md5` - -### `octet_length` - -### `repeat` - -### `replace` - -### `reverse` - -### `right` - -### `rpad` - -### `rtrim` - -### `digest` - -### `split_part` - -### `starts_with` - -### `strpos` - -### `substr` - -### `translate` - -### `trim` - -### `upper` - -## Regular Expression Functions - -### regexp_match - -### regexp_replace - -## Temporal Functions - -### `to_timestamp` - -`to_timestamp()` is similar to the standard SQL function. It performs conversions to type `Timestamp(Nanoseconds, None)`, from: - -- Timestamp strings - - `1997-01-31T09:26:56.123Z` # RFC3339 - - `1997-01-31T09:26:56.123-05:00` # RFC3339 - - `1997-01-31 09:26:56.123-05:00` # close to RFC3339 but with a space rather than T - - `1997-01-31T09:26:56.123` # close to RFC3339 but no timezone offset specified - - `1997-01-31 09:26:56.123` # close to RFC3339 but uses a space and no timezone offset - - `1997-01-31 09:26:56` # close to RFC3339, no fractional seconds -- An Int64 array/column, values are nanoseconds since Epoch UTC -- Other Timestamp() columns or values - -Note that conversions from other Timestamp and Int64 types can also be performed using `CAST(.. AS Timestamp)`. However, the conversion functionality here is present for consistency with the other `to_timestamp_xx()` functions. 
- -### `to_timestamp_millis` - -`to_timestamp_millis()` does conversions to type `Timestamp(Milliseconds, None)`, from: - -- Timestamp strings, the same as supported by the regular `to_timestamp()` function (except the output is a timestamp of millisecond resolution) - - `1997-01-31T09:26:56.123Z` # RFC3339 - - `1997-01-31T09:26:56.123-05:00` # RFC3339 - - `1997-01-31 09:26:56.123-05:00` # close to RFC3339 but with a space rather than T - - `1997-01-31T09:26:56.123` # close to RFC3339 but no timezone offset specified - - `1997-01-31 09:26:56.123` # close to RFC3339 but uses a space and no timezone offset - - `1997-01-31 09:26:56` # close to RFC3339, no fractional seconds -- An Int64 array/column, values are milliseconds since Epoch UTC -- Other Timestamp() columns or values - -Note that `CAST(.. AS Timestamp)` converts to Timestamps with Nanosecond resolution; this function is the only way to convert/cast to millisecond resolution. - -### `to_timestamp_micros` - -`to_timestamp_micros()` does conversions to type `Timestamp(Microseconds, None)`, from: - -- Timestamp strings, the same as supported by the regular `to_timestamp()` function (except the output is a timestamp of microsecond resolution) - - `1997-01-31T09:26:56.123Z` # RFC3339 - - `1997-01-31T09:26:56.123-05:00` # RFC3339 - - `1997-01-31 09:26:56.123-05:00` # close to RFC3339 but with a space rather than T - - `1997-01-31T09:26:56.123` # close to RFC3339 but no timezone offset specified - - `1997-01-31 09:26:56.123` # close to RFC3339 but uses a space and no timezone offset - - `1997-01-31 09:26:56` # close to RFC3339, no fractional seconds -- An Int64 array/column, values are microseconds since Epoch UTC -- Other Timestamp() columns or values - -Note that `CAST(.. AS Timestamp)` converts to Timestamps with Nanosecond resolution; this function is the only way to convert/cast to microsecond resolution. 
- -### `to_timestamp_seconds` - -`to_timestamp_seconds()` does conversions to type `Timestamp(Seconds, None)`, from: - -- Timestamp strings, the same as supported by the regular `to_timestamp()` function (except the output is a timestamp of seconds resolution) - - `1997-01-31T09:26:56.123Z` # RFC3339 - - `1997-01-31T09:26:56.123-05:00` # RFC3339 - - `1997-01-31 09:26:56.123-05:00` # close to RFC3339 but with a space rather than T - - `1997-01-31T09:26:56.123` # close to RFC3339 but no timezone offset specified - - `1997-01-31 09:26:56.123` # close to RFC3339 but uses a space and no timezone offset - - `1997-01-31 09:26:56` # close to RFC3339, no fractional seconds -- An Int64 array/column, values are seconds since Epoch UTC -- Other Timestamp() columns or values - -Note that `CAST(.. AS Timestamp)` converts to Timestamps with Nanosecond resolution; this function is the only way to convert/cast to seconds resolution. - -### `extract` - -`extract(field FROM source)` - -- The `extract` function retrieves subfields such as year or hour from date/time values. - `source` must be a value expression of type timestamp, Date32, or Date64. `field` is an identifier that selects what field to extract from the source value. - The `extract` function returns values of type u32. 
- - `year` :`extract(year FROM to_timestamp('2020-09-08T12:00:00+00:00')) -> 2020` - - `month`:`extract(month FROM to_timestamp('2020-09-08T12:00:00+00:00')) -> 9` - - `week` :`extract(week FROM to_timestamp('2020-09-08T12:00:00+00:00')) -> 37` - - `day`: `extract(day FROM to_timestamp('2020-09-08T12:00:00+00:00')) -> 8` - - `hour`: `extract(hour FROM to_timestamp('2020-09-08T12:00:00+00:00')) -> 12` - - `minute`: `extract(minute FROM to_timestamp('2020-09-08T12:01:00+00:00')) -> 1` - - `second`: `extract(second FROM to_timestamp('2020-09-08T12:00:03+00:00')) -> 3` - -### `date_part` - -`date_part('field', source)` - -- The `date_part` function is modeled on the postgres equivalent to the SQL-standard function `extract`. - Note that here the field parameter needs to be a string value, not a name. - The valid field names for `date_part` are the same as for `extract`. - - `date_part('second', to_timestamp('2020-09-08T12:00:12+00:00')) -> 12` - -### `date_trunc` - -### `date_bin` - -### `from_unixtime` - -### `now` - -Returns current time as `Timestamp(Nanoseconds, UTC)`. Returns same value for the function -wherever it appears in the statement, using a value chosen at planning time. - -## Other Functions - -### `array` - -### `in_list` - -### `random` - -### `sha224` - -### `sha256` - -### `sha384` - -### `sha512` - -### `struct` - -### `to_hex` diff --git a/datafusion/_sources/user-guide/sql/select.md.txt b/datafusion/_sources/user-guide/sql/select.md.txt deleted file mode 100644 index 3eea252d7080..000000000000 --- a/datafusion/_sources/user-guide/sql/select.md.txt +++ /dev/null @@ -1,226 +0,0 @@ - - -# SELECT syntax - -The queries in DataFusion scan data from tables and return 0 or more rows. -Please be aware that column names in queries are made lower-case, but not on the inferred schema. Accordingly, if you -want to query against a capitalized field, make sure to use double quotes. 
Please see this -[example](https://arrow.apache.org/datafusion/user-guide/example-usage.html) for clarification. -In this documentation we describe the SQL syntax in DataFusion. - -DataFusion supports the following syntax for queries: - - -[ [WITH](#with-clause) with_query [, ...] ]
    -[SELECT](#select-clause) [ ALL | DISTINCT ] select_expr [, ...]
    -[ [FROM](#from-clause) from_item [, ...] ]
    -[ [JOIN](#join-clause) join_item [, ...] ]
    -[ [WHERE](#where-clause) condition ]
    -[ [GROUP BY](#group-by-clause) grouping_element [, ...] ]
    -[ [HAVING](#having-clause) condition]
    -[ [UNION](#union-clause) [ ALL ] select ]
    -[ [ORDER BY](#order-by-clause) expression [ ASC | DESC ][, ...] ]
    -[ [LIMIT](#limit-clause) count ]
    - -
    - -## WITH clause - -A `WITH` clause allows you to give names to queries and reference them by name. - -```sql -WITH x AS (SELECT a, MAX(b) AS b FROM t GROUP BY a) -SELECT a, b FROM x; -``` - -## SELECT clause - -Example: - -```sql -SELECT a, b, a + b FROM table -``` - -The `DISTINCT` quantifier can be added to make the query return all distinct rows. -By default `ALL` will be used, which returns all the rows. - -```sql -SELECT DISTINCT person, age FROM employees -``` - -## FROM clause - -Example: - -```sql -SELECT t.a FROM table AS t -``` - -## WHERE clause - -Example: - -```sql -SELECT a FROM table WHERE a > 10 -``` - -## JOIN clause - -DataFusion supports `INNER JOIN`, `LEFT OUTER JOIN`, `RIGHT OUTER JOIN`, `FULL OUTER JOIN`, and `CROSS JOIN`. - -The following examples are based on this table: - -```sql -select * from x; -+----------+----------+ -| column_1 | column_2 | -+----------+----------+ -| 1 | 2 | -+----------+----------+ -``` - -### INNER JOIN - -The keywords `JOIN` or `INNER JOIN` define a join that only shows rows where there is a match in both tables. - -```sql -❯ select * from x inner join x y ON x.column_1 = y.column_1; -+----------+----------+----------+----------+ -| column_1 | column_2 | column_1 | column_2 | -+----------+----------+----------+----------+ -| 1 | 2 | 1 | 2 | -+----------+----------+----------+----------+ -``` - -### LEFT OUTER JOIN - -The keywords `LEFT JOIN` or `LEFT OUTER JOIN` define a join that includes all rows from the left table even if there -is not a match in the right table. When there is no match, null values are produced for the right side of the join. 
- -```sql -❯ select * from x left join x y ON x.column_1 = y.column_2; -+----------+----------+----------+----------+ -| column_1 | column_2 | column_1 | column_2 | -+----------+----------+----------+----------+ -| 1 | 2 | | | -+----------+----------+----------+----------+ -``` - -### RIGHT OUTER JOIN - -The keywords `RIGHT JOIN` or `RIGHT OUTER JOIN` define a join that includes all rows from the right table even if there -is not a match in the left table. When there is no match, null values are produced for the left side of the join. - -```sql -❯ select * from x right join x y ON x.column_1 = y.column_2; -+----------+----------+----------+----------+ -| column_1 | column_2 | column_1 | column_2 | -+----------+----------+----------+----------+ -| | | 1 | 2 | -+----------+----------+----------+----------+ -``` - -### FULL OUTER JOIN - -The keywords `FULL JOIN` or `FULL OUTER JOIN` define a join that is effectively a union of a `LEFT OUTER JOIN` and -`RIGHT OUTER JOIN`. It will show all rows from the left and right side of the join and will produce null values on -either side of the join where there is not a match. - -```sql -❯ select * from x full outer join x y ON x.column_1 = y.column_2; -+----------+----------+----------+----------+ -| column_1 | column_2 | column_1 | column_2 | -+----------+----------+----------+----------+ -| 1 | 2 | | | -| | | 1 | 2 | -+----------+----------+----------+----------+ -``` - -### CROSS JOIN - -A cross join produces a cartesian product that matches every row in the left side of the join with every row in the -right side of the join. 
- -```sql -❯ select * from x cross join x y; -+----------+----------+----------+----------+ -| column_1 | column_2 | column_1 | column_2 | -+----------+----------+----------+----------+ -| 1 | 2 | 1 | 2 | -+----------+----------+----------+----------+ -``` - -## GROUP BY clause - -Example: - -```sql -SELECT a, b, MAX(c) FROM table GROUP BY a, b -``` - -## HAVING clause - -Example: - -```sql -SELECT a, b, MAX(c) FROM table GROUP BY a, b HAVING MAX(c) > 10 -``` - -## UNION clause - -Example: - -```sql -SELECT - a, - b, - c -FROM table1 -UNION ALL -SELECT - a, - b, - c -FROM table2 -``` - -## ORDER BY clause - -Orders the results by the referenced expression. By default it uses ascending order (`ASC`). -This order can be changed to descending by adding `DESC` after the order-by expressions. - -Examples: - -```sql -SELECT age, person FROM table ORDER BY age; -SELECT age, person FROM table ORDER BY age DESC; -SELECT age, person FROM table ORDER BY age, person DESC; -``` - -## LIMIT clause - -Limits the number of rows to be a maximum of `count` rows. `count` should be a non-negative integer. 
- -Example: - -```sql -SELECT age, person FROM table -LIMIT 10 -``` diff --git a/datafusion/_sources/user-guide/sql/sql_status.md.txt b/datafusion/_sources/user-guide/sql/sql_status.md.txt deleted file mode 100644 index b260ecb4bae9..000000000000 --- a/datafusion/_sources/user-guide/sql/sql_status.md.txt +++ /dev/null @@ -1,135 +0,0 @@ - - -# Status - -## General - -- [x] SQL Parser -- [x] SQL Query Planner -- [x] Query Optimizer -- [x] Constant folding -- [x] Join Reordering -- [x] Limit Pushdown -- [x] Projection push down -- [x] Predicate push down -- [x] Type coercion -- [x] Parallel query execution - -## SQL Support - -- [x] Projection -- [x] Filter (WHERE) -- [x] Filter post-aggregate (HAVING) -- [x] Limit -- [x] Aggregate -- [x] Common math functions -- [x] cast -- [x] try_cast -- [x] [`VALUES` lists](https://www.postgresql.org/docs/current/queries-values.html) -- Postgres compatible String functions - - [x] ascii - - [x] bit_length - - [x] btrim - - [x] char_length - - [x] character_length - - [x] chr - - [x] concat - - [x] concat_ws - - [x] initcap - - [x] left - - [x] length - - [x] lpad - - [x] ltrim - - [x] octet_length - - [x] regexp_replace - - [x] repeat - - [x] replace - - [x] reverse - - [x] right - - [x] rpad - - [x] rtrim - - [x] split_part - - [x] starts_with - - [x] strpos - - [x] substr - - [x] to_hex - - [x] translate - - [x] trim -- Conditional functions - - [x] nullif - - [x] case - - [x] coalesce -- Approximation functions - - [x] approx_distinct - - [x] approx_median - - [x] approx_percentile_cont - - [x] approx_percentile_cont_with_weight -- Common date/time functions - - [ ] Basic date functions - - [ ] Basic time functions - - [x] Basic timestamp functions - - [x] [to_timestamp](./scalar_functions.md#to_timestamp) - - [x] [to_timestamp_millis](./scalar_functions.md#to_timestamp_millis) - - [x] [to_timestamp_micros](./scalar_functions.md#to_timestamp_micros) - - [x] [to_timestamp_seconds](./scalar_functions.md#to_timestamp_seconds) - - 
[x] [extract](./scalar_functions.md#extract) - - [x] [date_part](./scalar_functions.md#date_part) -- nested functions - - [x] Array of columns -- [x] Schema Queries - - [x] SHOW TABLES - - [x] SHOW COLUMNS FROM - - [x] SHOW CREATE TABLE - - [x] information_schema.{tables, columns, views} - - [ ] information_schema other views -- [x] Sorting -- [ ] Nested types -- [ ] Lists -- [x] Subqueries -- [x] Common table expressions -- [x] Set Operations - - [x] UNION ALL - - [x] UNION - - [x] INTERSECT - - [x] INTERSECT ALL - - [x] EXCEPT - - [x] EXCEPT ALL -- [x] Joins - - [x] INNER JOIN - - [x] LEFT JOIN - - [x] RIGHT JOIN - - [x] FULL JOIN - - [x] CROSS JOIN -- [ ] Window - - [x] Empty window - - [x] Common window functions - - [x] Window with PARTITION BY clause - - [x] Window with ORDER BY clause - - [ ] Window with FILTER clause - - [ ] [Window with custom WINDOW FRAME](https://github.com/apache/arrow-datafusion/issues/361) - - [ ] UDF and UDAF for window functions - -## Data Sources - -- [x] CSV -- [x] Parquet primitive types -- [ ] Parquet nested types -- [x] JSON -- [x] Avro diff --git a/datafusion/_sources/user-guide/sql/subqueries.md.txt b/datafusion/_sources/user-guide/sql/subqueries.md.txt deleted file mode 100644 index 478fab7e7c2d..000000000000 --- a/datafusion/_sources/user-guide/sql/subqueries.md.txt +++ /dev/null @@ -1,98 +0,0 @@ - - -# Subqueries - -DataFusion supports `EXISTS`, `NOT EXISTS`, `IN`, `NOT IN` and Scalar Subqueries. - -The examples below are based on the following table. - -```sql -❯ select * from x; -+----------+----------+ -| column_1 | column_2 | -+----------+----------+ -| 1 | 2 | -+----------+----------+ -``` - -## EXISTS - -The `EXISTS` syntax can be used to find all rows in a relation where a correlated subquery produces one or more matches -for that row. Only correlated subqueries are supported. 
- -```sql -❯ select * from x y where exists (select * from x where x.column_1 = y.column_1); -+----------+----------+ -| column_1 | column_2 | -+----------+----------+ -| 1 | 2 | -+----------+----------+ -1 row in set. -``` - -## NOT EXISTS - -The `NOT EXISTS` syntax can be used to find all rows in a relation where a correlated subquery produces zero matches -for that row. Only correlated subqueries are supported. - -```sql -❯ select * from x y where not exists (select * from x where x.column_1 = y.column_1); -0 rows in set. -``` - -## IN - -The `IN` syntax can be used to find all rows in a relation where a given expression's value can be found in the -results of a subquery. - -```sql -❯ select * from x where column_1 in (select column_1 from x); -+----------+----------+ -| column_1 | column_2 | -+----------+----------+ -| 1 | 2 | -+----------+----------+ -1 row in set. -``` - -## NOT IN - -The `NOT IN` syntax can be used to find all rows in a relation where a given expression's value cannot be found in the -results of a subquery. - -```sql -❯ select * from x where column_1 not in (select column_1 from x); -0 rows in set. -``` - -## Scalar Subquery - -A scalar subquery can be used to produce a single value that can be used in many different contexts in a query. Here -is an example of a filter using a scalar subquery. Only correlated subqueries are supported. - -```sql -❯ select * from x y where column_1 < (select sum(column_2) from x where x.column_1 = y.column_1); -+----------+----------+ -| column_1 | column_2 | -+----------+----------+ -| 1 | 2 | -+----------+----------+ -1 row in set. -```