From 9c0275755f1830a8d5b97bb8b9bbfca3ecb6d7f0 Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Fri, 28 Jun 2024 20:21:44 -0700 Subject: [PATCH 01/10] feat: add example for copy to --- .../examples/custom_file_format.rs | 203 ++++++++++++++++++ 1 file changed, 203 insertions(+) create mode 100644 datafusion-examples/examples/custom_file_format.rs diff --git a/datafusion-examples/examples/custom_file_format.rs b/datafusion-examples/examples/custom_file_format.rs new file mode 100644 index 0000000000000..b5efb93aea6a3 --- /dev/null +++ b/datafusion-examples/examples/custom_file_format.rs @@ -0,0 +1,203 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{any::Any, sync::Arc}; + +use arrow::array::{RecordBatch, StringArray, UInt8Array}; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use datafusion::{ + datasource::{ + file_format::{ + csv::CsvFormatFactory, file_compression_type::FileCompressionType, + FileFormat, FileFormatFactory, + }, + physical_plan::{FileScanConfig, FileSinkConfig}, + MemTable, + }, + error::Result, + execution::{context::SessionState, runtime_env::RuntimeEnv}, + physical_plan::ExecutionPlan, + prelude::{SessionConfig, SessionContext}, +}; +use datafusion_common::{GetExt, Statistics}; +use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement}; +use object_store::{ObjectMeta, ObjectStore}; + +#[derive(Debug)] +struct TSVFileFormat { + csv_file_format: Arc, +} + +impl TSVFileFormat { + pub fn new(csv_file_format: Arc) -> Self { + Self { csv_file_format } + } +} + +#[async_trait::async_trait] +impl FileFormat for TSVFileFormat { + fn as_any(&self) -> &dyn Any { + self + } + + fn get_ext(&self) -> String { + "tsv".to_string() + } + + fn get_ext_with_compression( + &self, + c: &FileCompressionType, + ) -> datafusion::error::Result { + if c == &FileCompressionType::UNCOMPRESSED { + Ok("tsv".to_string()) + } else { + todo!("Compression not supported") + } + } + + async fn infer_schema( + &self, + state: &SessionState, + store: &Arc, + objects: &[ObjectMeta], + ) -> Result { + self.csv_file_format + .infer_schema(state, store, objects) + .await + } + + async fn infer_stats( + &self, + state: &SessionState, + store: &Arc, + table_schema: SchemaRef, + object: &ObjectMeta, + ) -> Result { + self.csv_file_format + .infer_stats(state, store, table_schema, object) + .await + } + + async fn create_physical_plan( + &self, + state: &SessionState, + conf: FileScanConfig, + filters: Option<&Arc>, + ) -> Result> { + self.csv_file_format + .create_physical_plan(state, conf, filters) + .await + } + + async fn create_writer_physical_plan( + &self, + input: Arc, + state: &SessionState, + conf: FileSinkConfig, + order_requirements: Option>, + ) -> Result> { + self.csv_file_format + .create_writer_physical_plan(input, state, conf, order_requirements) + .await + } +} + +#[derive(Default)] +pub struct TSVFileFactory { + csv_file_factory: CsvFormatFactory, +} + +impl TSVFileFactory { + pub fn new() -> Self { + Self { + csv_file_factory: CsvFormatFactory::new(), + } + } +} + +impl FileFormatFactory for TSVFileFactory { + fn create( + &self, + state: &SessionState, + format_options: &std::collections::HashMap, + ) -> Result> { + let mut new_options = format_options.clone(); + new_options.insert("format.delimiter".to_string(), "\t".to_string()); + + let csv_file_format = self.csv_file_factory.create(state, &new_options)?; + let tsv_file_format = Arc::new(TSVFileFormat::new(csv_file_format)); + + Ok(tsv_file_format) + } + + fn default(&self) -> std::sync::Arc { + todo!() + } +} + +impl GetExt for TSVFileFactory { + fn get_ext(&self) -> String { + "tsv".to_string() + } +} + +#[tokio::main] +async fn main() -> Result<()> { + // Create a new context with the default configuration + let config = SessionConfig::new(); + let runtime = RuntimeEnv::default(); + let mut state = SessionState::new_with_config_rt(config, Arc::new(runtime)); + + // Register the custom file format + let file_format = Arc::new(TSVFileFactory::new()); + state.register_file_format(file_format, true).unwrap(); + + // Create a new context with the custom file format + let ctx = SessionContext::new_with_state(state); + + let mem_table = create_mem_table(); + ctx.register_table("mem_table", mem_table).unwrap(); + + let d = ctx + .sql("COPY mem_table TO 'mem_table.tsv' STORED AS TSV;") + .await?; + + let results = d.collect().await?; + println!("Number of inserted rows: {:?}", results[0]); + + Ok(()) +} + +// create a simple mem table +fn create_mem_table() -> Arc { + let fields = vec![ + Field::new("id", DataType::UInt8, false), + Field::new("data", DataType::Utf8, false), + ]; + let schema = Arc::new(Schema::new(fields)); + + let partitions = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(UInt8Array::from(vec![1, 2])), + Arc::new(StringArray::from(vec!["foo", "bar"])), + ], + ) + .unwrap(); + + Arc::new(MemTable::try_new(schema, vec![vec![partitions]]).unwrap()) +} From 19ab39a87e4e4183601d8042c94cc9595f9f4995 Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Fri, 28 Jun 2024 21:41:14 -0700 Subject: [PATCH 02/10] better docs plus tempdir --- .../examples/custom_file_format.rs | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/datafusion-examples/examples/custom_file_format.rs b/datafusion-examples/examples/custom_file_format.rs index b5efb93aea6a3..03a57ef8bb813 100644 --- a/datafusion-examples/examples/custom_file_format.rs +++ b/datafusion-examples/examples/custom_file_format.rs @@ -36,8 +36,19 @@ use datafusion::{ use datafusion_common::{GetExt, Statistics}; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement}; use object_store::{ObjectMeta, ObjectStore}; +use tempfile::tempdir; + +/// Example of a custom file format that reads and writes TSV files. +/// +/// TSVFileFormatFactory is responsible for creating instances of TSVFileFormat. +/// The former, once registered with the SessionState, will then be used +/// to facilitate SQL operations on TSV files, such as `COPY TO` shown here. #[derive(Debug)] +/// Custom file format that reads and writes TSV files +/// +/// This file format is a wrapper around the CSV file format +/// for demonstration purposes. struct TSVFileFormat { csv_file_format: Arc, } @@ -117,6 +128,10 @@ impl FileFormat for TSVFileFormat { } #[derive(Default)] +/// Factory for creating TSV file formats +/// +/// This factory is a wrapper around the CSV file format factory +/// for demonstration purposes. pub struct TSVFileFactory { csv_file_factory: CsvFormatFactory, } @@ -172,8 +187,14 @@ async fn main() -> Result<()> { let mem_table = create_mem_table(); ctx.register_table("mem_table", mem_table).unwrap(); + let temp_dir = tempdir().unwrap(); + let table_save_path = temp_dir.path().join("mem_table.tsv"); + let d = ctx - .sql("COPY mem_table TO 'mem_table.tsv' STORED AS TSV;") + .sql(&format!( + "COPY mem_table TO '{}' STORED AS TSV;", + table_save_path.display(), + )) .await?; let results = d.collect().await?; From 2d410b3193e02a729423ff094d0c28903e4a63e5 Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Sat, 29 Jun 2024 07:47:48 -0700 Subject: [PATCH 03/10] build: clean examples if over 10GB --- ci/scripts/rust_example.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ci/scripts/rust_example.sh b/ci/scripts/rust_example.sh index 675dc4e527d00..e40ba45dc0b72 100755 --- a/ci/scripts/rust_example.sh +++ b/ci/scripts/rust_example.sh @@ -18,10 +18,15 @@ # under the License. set -ex + +repo_dir=$PWD + cd datafusion-examples/examples/ cargo fmt --all -- --check cargo check --examples +size_threshold=$((10 * 1024 * 1024 * 1024)) # 10GB + files=$(ls .) for filename in $files do @@ -29,5 +34,14 @@ do # Skip tests that rely on external storage and flight if [ ! -d $filename ]; then cargo run --example $example_name + + # If the examples are getting to big, run cargo clean + current_size=$(du -s $repo_dir/target/debug | awk '{print $1}') + + if [ $current_size -gt $size_threshold ]; then + echo "Cleaning cargo due to directory size exceeding 10 GB..." + cargo clean + fi + fi done From 96a4b3cd32fc5d6f5724e8b0b8b931364e08b584 Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Sat, 29 Jun 2024 18:20:40 -0700 Subject: [PATCH 04/10] only 1GB --- ci/scripts/rust_example.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/scripts/rust_example.sh b/ci/scripts/rust_example.sh index e40ba45dc0b72..e25f5ac85008f 100755 --- a/ci/scripts/rust_example.sh +++ b/ci/scripts/rust_example.sh @@ -25,7 +25,7 @@ cd datafusion-examples/examples/ cargo fmt --all -- --check cargo check --examples -size_threshold=$((10 * 1024 * 1024 * 1024)) # 10GB +size_threshold=$((1 * 1024 * 1024 * 1024)) # 1GB files=$(ls .) for filename in $files @@ -35,7 +35,7 @@ do if [ ! -d $filename ]; then cargo run --example $example_name - # If the examples are getting to big, run cargo clean + # If the examples are getting too big, run cargo clean current_size=$(du -s $repo_dir/target/debug | awk '{print $1}') if [ $current_size -gt $size_threshold ]; then From 9c3138598e27b3611f03313f4947cb3795546305 Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Sun, 30 Jun 2024 08:22:01 -0700 Subject: [PATCH 05/10] build: try clearing some disk space before running --- .github/workflows/rust.yml | 69 ++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index ce4b4b06cf44e..12ba0a47c8734 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -24,16 +24,16 @@ concurrency: on: push: paths-ignore: - - "docs/**" - - "**.md" - - ".github/ISSUE_TEMPLATE/**" - - ".github/pull_request_template.md" + - 'docs/**' + - '**.md' + - '.github/ISSUE_TEMPLATE/**' + - '.github/pull_request_template.md' pull_request: paths-ignore: - - "docs/**" - - "**.md" - - ".github/ISSUE_TEMPLATE/**" - - ".github/pull_request_template.md" + - 'docs/**' + - '**.md' + - '.github/ISSUE_TEMPLATE/**' + - '.github/pull_request_template.md' # manual trigger # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow workflow_dispatch: @@ -134,7 +134,7 @@ jobs: # Run tests linux-test: name: cargo test (amd64) - needs: [ linux-build-lib ] + needs: [linux-build-lib] runs-on: ubuntu-latest container: image: amd64/rust @@ -145,7 +145,7 @@ jobs: - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: - rust-version: stable + rust-version: stable - name: Run tests (excluding doctests) run: cargo test --lib --tests --bins --features avro,json,backtrace - name: Verify Working Directory Clean @@ -153,7 +153,7 @@ jobs: linux-test-datafusion-cli: name: cargo test datafusion-cli (amd64) - needs: [ linux-build-lib ] + needs: [linux-build-lib] runs-on: ubuntu-latest container: image: amd64/rust @@ -174,11 +174,18 @@ jobs: linux-test-example: name: cargo examples (amd64) - needs: [ linux-build-lib ] + needs: [linux-build-lib] runs-on: ubuntu-latest container: image: amd64/rust steps: + - run: | + # clear space for the examples + # https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf "/usr/local/share/boost" - uses: actions/checkout@v4 with: submodules: true @@ -195,12 +202,10 @@ jobs: - name: Verify Working Directory Clean run: git diff --exit-code - - # Run `cargo test doc` (test documentation examples) linux-test-doc: name: cargo test doc (amd64) - needs: [ linux-build-lib ] + needs: [linux-build-lib] runs-on: ubuntu-latest container: image: amd64/rust @@ -223,7 +228,7 @@ jobs: # Run `cargo doc` to ensure the rustdoc is clean linux-rustdoc: name: cargo doc - needs: [ linux-build-lib ] + needs: [linux-build-lib] runs-on: ubuntu-latest container: image: amd64/rust @@ -260,7 +265,7 @@ jobs: # verify that the benchmark queries return the correct results verify-benchmark-results: name: verify benchmark results (amd64) - needs: [ linux-build-lib ] + needs: [linux-build-lib] runs-on: ubuntu-latest container: image: amd64/rust @@ -290,8 +295,8 @@ jobs: run: git diff --exit-code sqllogictest-postgres: - name: "Run sqllogictest with Postgres runner" - needs: [ linux-build-lib ] + name: 'Run sqllogictest with Postgres runner' + needs: [linux-build-lib] runs-on: ubuntu-latest services: postgres: @@ -343,15 +348,15 @@ jobs: steps: - uses: actions/checkout@v4 with: - submodules: true + submodules: true - name: Setup Rust toolchain - uses: ./.github/actions/setup-macos-builder + uses: ./.github/actions/setup-macos-builder - name: Run tests (excluding doctests) shell: bash run: | cargo test --lib --tests --bins --features avro,json,backtrace cd datafusion-cli - cargo test --lib --tests --bins --all-features + cargo test --lib --tests --bins --all-features macos-aarch64: name: cargo test (macos-aarch64) @@ -371,7 +376,7 @@ jobs: test-datafusion-pyarrow: name: cargo test pyarrow (amd64) - needs: [ linux-build-lib ] + needs: [linux-build-lib] runs-on: ubuntu-20.04 container: image: amd64/rust:bullseye # Workaround https://github.com/actions/setup-python/issues/721 @@ -381,7 +386,7 @@ jobs: submodules: true - uses: actions/setup-python@v5 with: - python-version: "3.8" + python-version: '3.8' - name: Install PyArrow run: | echo "LIBRARY_PATH=$LD_LIBRARY_PATH" >> $GITHUB_ENV @@ -468,7 +473,7 @@ jobs: clippy: name: clippy - needs: [ linux-build-lib ] + needs: [linux-build-lib] runs-on: ubuntu-latest container: image: amd64/rust @@ -488,7 +493,7 @@ jobs: # Check answers are correct when hash values collide hash-collisions: name: cargo test hash collisions (amd64) - needs: [ linux-build-lib ] + needs: [linux-build-lib] runs-on: ubuntu-latest container: image: amd64/rust @@ -507,7 +512,7 @@ jobs: cargo-toml-formatting-checks: name: check Cargo.toml formatting - needs: [ linux-build-lib ] + needs: [linux-build-lib] runs-on: ubuntu-latest container: image: amd64/rust @@ -527,7 +532,7 @@ jobs: config-docs-check: name: check configs.md is up-to-date - needs: [ linux-build-lib ] + needs: [linux-build-lib] runs-on: ubuntu-latest container: image: amd64/rust @@ -541,7 +546,7 @@ jobs: rust-version: stable - uses: actions/setup-node@v4 with: - node-version: "20" + node-version: '20' - name: Check if configs.md has been modified run: | # If you encounter an error, run './dev/update_config_docs.sh' and commit @@ -568,9 +573,9 @@ jobs: working-directory: datafusion/core run: | # If you encounter an error with any of the commands below - # it means some crate in your dependency tree has a higher - # MSRV (Min Supported Rust Version) than the one specified - # in the `rust-version` key of `Cargo.toml`. Check your + # it means some crate in your dependency tree has a higher + # MSRV (Min Supported Rust Version) than the one specified + # in the `rust-version` key of `Cargo.toml`. Check your # dependencies or update the version in `Cargo.toml` cargo msrv verify - name: Check datafusion-substrait From 01147d2b35895aaf7c602e09f1d3d96182b9bdb8 Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Sun, 30 Jun 2024 08:29:55 -0700 Subject: [PATCH 06/10] build: remove sudo --- .github/workflows/rust.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 12ba0a47c8734..fe799d91642e9 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -182,10 +182,10 @@ jobs: - run: | # clear space for the examples # https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 - sudo rm -rf /usr/share/dotnet - sudo rm -rf /opt/ghc - sudo rm -rf "$AGENT_TOOLSDIRECTORY" - sudo rm -rf "/usr/local/share/boost" + rm -rf /usr/share/dotnet + rm -rf /opt/ghc + rm -rf "$AGENT_TOOLSDIRECTORY" + rm -rf "/usr/local/share/boost" - uses: actions/checkout@v4 with: submodules: true From c1af9553e7b25b9ab32aa8849ff09571e58301d4 Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Sun, 30 Jun 2024 08:51:27 -0700 Subject: [PATCH 07/10] build: try clean --- .github/workflows/rust.yml | 69 ++++++++++++++++++-------------------- ci/scripts/rust_example.sh | 14 -------- 2 files changed, 32 insertions(+), 51 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index fe799d91642e9..ce4b4b06cf44e 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -24,16 +24,16 @@ concurrency: on: push: paths-ignore: - - 'docs/**' - - '**.md' - - '.github/ISSUE_TEMPLATE/**' - - '.github/pull_request_template.md' + - "docs/**" + - "**.md" + - ".github/ISSUE_TEMPLATE/**" + - ".github/pull_request_template.md" pull_request: paths-ignore: - - 'docs/**' - - '**.md' - - '.github/ISSUE_TEMPLATE/**' - - '.github/pull_request_template.md' + - "docs/**" + - "**.md" + - ".github/ISSUE_TEMPLATE/**" + - ".github/pull_request_template.md" # manual trigger # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow workflow_dispatch: @@ -134,7 +134,7 @@ jobs: # Run tests linux-test: name: cargo test (amd64) - needs: [linux-build-lib] + needs: [ linux-build-lib ] runs-on: ubuntu-latest container: image: amd64/rust @@ -145,7 +145,7 @@ jobs: - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: - rust-version: stable + rust-version: stable - name: Run tests (excluding doctests) run: cargo test --lib --tests --bins --features avro,json,backtrace - name: Verify Working Directory Clean @@ -153,7 +153,7 @@ jobs: linux-test-datafusion-cli: name: cargo test datafusion-cli (amd64) - needs: [linux-build-lib] + needs: [ linux-build-lib ] runs-on: ubuntu-latest container: image: amd64/rust @@ -174,18 +174,11 @@ jobs: linux-test-example: name: cargo examples (amd64) - needs: [linux-build-lib] + needs: [ linux-build-lib ] runs-on: ubuntu-latest container: image: amd64/rust steps: - - run: | - # clear space for the examples - # https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 - rm -rf /usr/share/dotnet - rm -rf /opt/ghc - rm -rf "$AGENT_TOOLSDIRECTORY" - rm -rf "/usr/local/share/boost" - uses: actions/checkout@v4 with: submodules: true @@ -202,10 +195,12 @@ jobs: - name: Verify Working Directory Clean run: git diff --exit-code + + # Run `cargo test doc` (test documentation examples) linux-test-doc: name: cargo test doc (amd64) - needs: [linux-build-lib] + needs: [ linux-build-lib ] runs-on: ubuntu-latest container: image: amd64/rust @@ -228,7 +223,7 @@ jobs: # Run `cargo doc` to ensure the rustdoc is clean linux-rustdoc: name: cargo doc - needs: [linux-build-lib] + needs: [ linux-build-lib ] runs-on: ubuntu-latest container: image: amd64/rust @@ -265,7 +260,7 @@ jobs: # verify that the benchmark queries return the correct results verify-benchmark-results: name: verify benchmark results (amd64) - needs: [linux-build-lib] + needs: [ linux-build-lib ] runs-on: ubuntu-latest container: image: amd64/rust @@ -295,8 +290,8 @@ jobs: run: git diff --exit-code sqllogictest-postgres: - name: 'Run sqllogictest with Postgres runner' - needs: [linux-build-lib] + name: "Run sqllogictest with Postgres runner" + needs: [ linux-build-lib ] runs-on: ubuntu-latest services: postgres: @@ -348,15 +343,15 @@ jobs: steps: - uses: actions/checkout@v4 with: - submodules: true + submodules: true - name: Setup Rust toolchain - uses: ./.github/actions/setup-macos-builder + uses: ./.github/actions/setup-macos-builder - name: Run tests (excluding doctests) shell: bash run: | cargo test --lib --tests --bins --features avro,json,backtrace cd datafusion-cli - cargo test --lib --tests --bins --all-features + cargo test --lib --tests --bins --all-features macos-aarch64: name: cargo test (macos-aarch64) @@ -376,7 +371,7 @@ jobs: test-datafusion-pyarrow: name: cargo test pyarrow (amd64) - needs: [linux-build-lib] + needs: [ linux-build-lib ] runs-on: ubuntu-20.04 container: image: amd64/rust:bullseye # Workaround https://github.com/actions/setup-python/issues/721 @@ -386,7 +381,7 @@ jobs: submodules: true - uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: "3.8" - name: Install PyArrow run: | echo "LIBRARY_PATH=$LD_LIBRARY_PATH" >> $GITHUB_ENV @@ -473,7 +468,7 @@ jobs: clippy: name: clippy - needs: [linux-build-lib] + needs: [ linux-build-lib ] runs-on: ubuntu-latest container: image: amd64/rust @@ -493,7 +488,7 @@ jobs: # Check answers are correct when hash values collide hash-collisions: name: cargo test hash collisions (amd64) - needs: [linux-build-lib] + needs: [ linux-build-lib ] runs-on: ubuntu-latest container: image: amd64/rust @@ -512,7 +507,7 @@ jobs: cargo-toml-formatting-checks: name: check Cargo.toml formatting - needs: [linux-build-lib] + needs: [ linux-build-lib ] runs-on: ubuntu-latest container: image: amd64/rust @@ -532,7 +527,7 @@ jobs: config-docs-check: name: check configs.md is up-to-date - needs: [linux-build-lib] + needs: [ linux-build-lib ] runs-on: ubuntu-latest container: image: amd64/rust @@ -546,7 +541,7 @@ jobs: rust-version: stable - uses: actions/setup-node@v4 with: - node-version: '20' + node-version: "20" - name: Check if configs.md has been modified run: | # If you encounter an error, run './dev/update_config_docs.sh' and commit @@ -573,9 +568,9 @@ jobs: working-directory: datafusion/core run: | # If you encounter an error with any of the commands below - # it means some crate in your dependency tree has a higher - # MSRV (Min Supported Rust Version) than the one specified - # in the `rust-version` key of `Cargo.toml`. Check your + # it means some crate in your dependency tree has a higher + # MSRV (Min Supported Rust Version) than the one specified + # in the `rust-version` key of `Cargo.toml`. Check your # dependencies or update the version in `Cargo.toml` cargo msrv verify - name: Check datafusion-substrait diff --git a/ci/scripts/rust_example.sh b/ci/scripts/rust_example.sh index e25f5ac85008f..675dc4e527d00 100755 --- a/ci/scripts/rust_example.sh +++ b/ci/scripts/rust_example.sh @@ -18,15 +18,10 @@ # under the License. set -ex - -repo_dir=$PWD - cd datafusion-examples/examples/ cargo fmt --all -- --check cargo check --examples -size_threshold=$((1 * 1024 * 1024 * 1024)) # 1GB - files=$(ls .) for filename in $files do @@ -34,14 +29,5 @@ do # Skip tests that rely on external storage and flight if [ ! -d $filename ]; then cargo run --example $example_name - - # If the examples are getting too big, run cargo clean - current_size=$(du -s $repo_dir/target/debug | awk '{print $1}') - - if [ $current_size -gt $size_threshold ]; then - echo "Cleaning cargo due to directory size exceeding 10 GB..." - cargo clean - fi - fi done From 506212fe90972ac2f436c07ed44eb3e016cbdd6a Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Sun, 30 Jun 2024 12:03:54 -0700 Subject: [PATCH 08/10] build: run clean --- ci/scripts/rust_example.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/scripts/rust_example.sh b/ci/scripts/rust_example.sh index 675dc4e527d00..63e6fc76c9b79 100755 --- a/ci/scripts/rust_example.sh +++ b/ci/scripts/rust_example.sh @@ -29,5 +29,6 @@ do # Skip tests that rely on external storage and flight if [ ! -d $filename ]; then cargo run --example $example_name + cargo clean fi done From fdbd2c8b7e929db527bbeb0f1d9a89f7869b64a2 Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Sun, 30 Jun 2024 12:39:05 -0700 Subject: [PATCH 09/10] build: only clean examples --- ci/scripts/rust_example.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/rust_example.sh b/ci/scripts/rust_example.sh index 63e6fc76c9b79..0415090665d22 100755 --- a/ci/scripts/rust_example.sh +++ b/ci/scripts/rust_example.sh @@ -29,6 +29,6 @@ do # Skip tests that rely on external storage and flight if [ ! -d $filename ]; then cargo run --example $example_name - cargo clean + cargo clean -p datafusion-examples fi done From 77ff942ac635499d42cb5ac3c21dca214bb7e6db Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Sun, 30 Jun 2024 20:30:48 -0700 Subject: [PATCH 10/10] docs: better output for example --- datafusion-examples/examples/custom_file_format.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/datafusion-examples/examples/custom_file_format.rs b/datafusion-examples/examples/custom_file_format.rs index 03a57ef8bb813..fe936418bce4a 100644 --- a/datafusion-examples/examples/custom_file_format.rs +++ b/datafusion-examples/examples/custom_file_format.rs @@ -17,7 +17,10 @@ use std::{any::Any, sync::Arc}; -use arrow::array::{RecordBatch, StringArray, UInt8Array}; +use arrow::{ + array::{AsArray, RecordBatch, StringArray, UInt8Array}, + datatypes::UInt64Type, +}; use arrow_schema::{DataType, Field, Schema, SchemaRef}; use datafusion::{ datasource::{ @@ -198,7 +201,14 @@ async fn main() -> Result<()> { .await?; let results = d.collect().await?; - println!("Number of inserted rows: {:?}", results[0]); + println!( + "Number of inserted rows: {:?}", + (results[0] + .column_by_name("count") + .unwrap() + .as_primitive::() + .value(0)) + ); Ok(()) }