From 3c6ac9672408648ff9e258ef4fc895c00943743a Mon Sep 17 00:00:00 2001 From: Mikhail Kot Date: Fri, 12 Jun 2026 11:50:18 +0100 Subject: [PATCH] initial Signed-off-by: Mikhail Kot --- .github/workflows/ci.yml | 23 ++ .github/workflows/duckdb-r2.yml | 197 ++++++++++++++++++ .github/workflows/rust-instrumented.yml | 20 ++ vortex-duckdb/build.rs | 72 +++++-- .../src/e2e_test/vortex_scan_test.rs | 7 + 5 files changed, 300 insertions(+), 19 deletions(-) create mode 100644 .github/workflows/duckdb-r2.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bf9c0a785b6..07e6bb708a7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,6 +25,25 @@ env: NIGHTLY_TOOLCHAIN: nightly-2026-02-05 jobs: + duckdb-mirror: + name: "Mirror DuckDB to R2" + if: github.event_name == 'pull_request' + uses: ./.github/workflows/duckdb-r2.yml + secrets: inherit + + duckdb-ready: + name: "DuckDB libraries available in R2" + needs: duckdb-mirror + if: ${{ !cancelled() }} + runs-on: ubuntu-latest + timeout-minutes: 5 + steps: + - name: Verify DuckDB mirror + if: ${{ needs.duckdb-mirror.result == 'failure' }} + run: | + echo "DuckDB mirror failed; downstream builds would 404" + exit 1 + lint-toml: runs-on: ubuntu-latest timeout-minutes: 10 @@ -115,6 +134,7 @@ jobs: rust-docs: name: "Rust (docs)" + needs: duckdb-ready timeout-minutes: 30 runs-on: >- ${{ github.repository == 'vortex-data/vortex' @@ -204,6 +224,7 @@ jobs: rust-lint: name: "Rust (lint)" + needs: duckdb-ready timeout-minutes: 30 runs-on: >- ${{ github.repository == 'vortex-data/vortex' @@ -301,6 +322,7 @@ jobs: rust-test-other: name: "Rust tests (${{ matrix.os }})" + needs: duckdb-ready timeout-minutes: 30 strategy: fail-fast: false @@ -422,6 +444,7 @@ jobs: sqllogic-test: name: "SQL logic tests" + needs: duckdb-ready runs-on: >- ${{ github.repository == 'vortex-data/vortex' && format('runs-on={0}/runner=amd64-medium/image=ubuntu24-full-x64-pre-v2/tag=sql-logic-test', github.run_id) diff --git 
a/.github/workflows/duckdb-r2.yml b/.github/workflows/duckdb-r2.yml new file mode 100644 index 00000000000..ffc2410d0ef --- /dev/null +++ b/.github/workflows/duckdb-r2.yml @@ -0,0 +1,197 @@ +name: DuckDB R2 mirror + +# Mirror DuckDB libraries referenced by vortex-duckdb/build.rs to R2 when they +# are not present yet. Download tagged archives or build commits from source. +on: + workflow_call: { } + +concurrency: + group: duckdb-r2-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: false + +permissions: + contents: read + +env: + PUBLIC_BASE_URL: "https://ci-builds.vortex.dev" + R2_BUCKET: "duckdb-builds" + R2_ENDPOINT_URL: "https://52bdeab5651e1584747feefd051fd566.r2.cloudflarestorage.com" + +jobs: + check: + name: "Resolve DuckDB version and check R2" + runs-on: ubuntu-latest + timeout-minutes: 10 + outputs: + version: ${{ steps.resolve.outputs.version }} + ref_dir: ${{ steps.resolve.outputs.ref_dir }} + release: ${{ steps.resolve.outputs.release }} + matrix: ${{ steps.resolve.outputs.matrix }} + any_missing: ${{ steps.resolve.outputs.any_missing }} + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 + - name: Resolve version and check R2 + id: resolve + run: | + set -Eeuo pipefail + + version=$(grep -oP 'DEFAULT_DUCKDB_VERSION:\s*&str\s*=\s*"\K[^"]+' \ + vortex-duckdb/build.rs) + + # Same as in vortex-duckdb/build.rs: >=2 dot-separated numeric + # components is a tagged release (ref dir "vX.Y.Z"), anything + # else is a commit. 
+ ref="${version#v}" + if [[ "$ref" =~ ^[0-9]+(\.[0-9]+)+$ ]]; then + release=true + ref_dir="v$ref" + else + release=false + ref_dir="$ref" + fi + + echo "DuckDB $version release=$release" + + entries=() + for archive in \ + libduckdb-linux-amd64.zip \ + libduckdb-linux-arm64.zip \ + libduckdb-osx-universal.zip; do + + url="${PUBLIC_BASE_URL}/${ref_dir}/${archive}" + code=$(curl -o /dev/null -s -w '%{http_code}' --head "$url" || echo 000) + if [ "$code" = "200" ]; then + echo "present in R2: $archive" + continue + fi + + echo "missing in R2 (HTTP $code): $archive" + case "$archive" in + *linux-amd64*) runner="ubuntu-latest"; os="linux"; arch="amd64" ;; + *linux-arm64*) runner="ubuntu-24.04-arm"; os="linux"; arch="arm64" ;; + *osx-universal*) runner="macos-14"; os="osx"; arch="universal" ;; + esac + entries+=("$(jq -nc \ + --arg archive "$archive" \ + --arg runner "$runner" \ + --arg os "$os" \ + --arg arch "$arch" \ + '{archive: $archive, runner: $runner, os: $os, arch: $arch}')") + done + + if [ "${#entries[@]}" -eq 0 ]; then + matrix='{"include":[]}' + any_missing=false + else + include=$(printf '%s\n' "${entries[@]}" | jq -sc '.') + matrix=$(jq -nc --argjson include "$include" '{include: $include}') + any_missing=true + fi + + echo "any_missing=$any_missing" + + { + echo "version=$version" + echo "ref_dir=$ref_dir" + echo "release=$release" + echo "matrix=$matrix" + echo "any_missing=$any_missing" + } >> "$GITHUB_OUTPUT" + + mirror: + name: "Mirror DuckDB ${{ matrix.archive }} to R2" + needs: check + if: >- + needs.check.outputs.any_missing == 'true' && + github.repository == 'vortex-data/vortex' && + github.event.pull_request.head.repo.full_name == github.repository + environment: duckdb-build + timeout-minutes: 120 + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.check.outputs.matrix) }} + runs-on: ${{ matrix.runner }} + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 + + - name: Install build dependencies (Linux) + 
if: needs.check.outputs.release != 'true' && runner.os == 'Linux' + run: | + sudo apt-get update + sudo apt-get install -y ninja-build libcurl4-openssl-dev zip unzip + + # MacOS already has ninja and p7zip + + - name: Prepare ${{ matrix.archive }} + env: + ARCHIVE: ${{ matrix.archive }} + REF_DIR: ${{ needs.check.outputs.ref_dir }} + RELEASE: ${{ needs.check.outputs.release }} + PLATFORM_OS: ${{ matrix.os }} + run: | + set -Eeuo pipefail + + if [ "$RELEASE" = "true" ]; then + echo "Mirroring DuckDB release ${REF_DIR}/${ARCHIVE}" + curl -fSL --retry 3 -o "$ARCHIVE" \ + "https://github.com/duckdb/duckdb/releases/download/${REF_DIR}/${ARCHIVE}" + else + echo "Building DuckDB commit ${REF_DIR} from source" + + curl -fSL --retry 3 -o duckdb-src.zip \ + "https://github.com/duckdb/duckdb/archive/${REF_DIR}.zip" + + # macos zip extract error: cannot create + # <...>/issue2628_������.csv Illegal byte sequence + + if [ "$PLATFORM_OS" = "osx" ]; then + 7z x duckdb-src.zip + else + unzip -q duckdb-src.zip + fi + + src_dir="duckdb-${REF_DIR}" + extra="" + if [ "$PLATFORM_OS" = "osx" ]; then + extra="OSX_BUILD_UNIVERSAL=1" + fi + + make -C "$src_dir" \ + GEN=ninja \ + DISABLE_SANITIZER=1 \ + THREADSAN=0 \ + BUILD_SHELL=false \ + BUILD_UNITTESTS=false \ + ENABLE_UNITTEST_CPP_TESTS=false \ + BUILD_EXTENSIONS="parquet;tpch;tpcds" \ + $extra + + lib_dir="${src_dir}/build/release/src" + stage="stage" + rm -rf "$stage" + mkdir -p "$stage" + + cp -a "${lib_dir}/libduckdb.so" "$stage/" 2>/dev/null || true + cp -a "${lib_dir}/libduckdb.dylib" "$stage/" 2>/dev/null || true + cp -a "${lib_dir}/libduckdb_static.a" "$stage/" + cp -a "${src_dir}/src/include/duckdb.h" "$stage/" 2>/dev/null || true + cp -a "${src_dir}/src/include/duckdb.hpp" "$stage/" 2>/dev/null || true + + ( cd "$stage" && zip -r "../${ARCHIVE}" . 
) + fi + + ls -la "$ARCHIVE" + + - name: Upload to R2 + env: + AWS_ACCESS_KEY_ID: ${{ secrets.DUCKDB_R2_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DUCKDB_R2_SECRET_ACCESS_KEY }} + AWS_REGION: "us-east-1" + AWS_ENDPOINT_URL: ${{ env.R2_ENDPOINT_URL }} + run: | + set -Eeuo pipefail + python3 scripts/s3-upload.py \ + --bucket "$R2_BUCKET" \ + --key "${{ needs.check.outputs.ref_dir }}/${{ matrix.archive }}" \ + --body "${{ matrix.archive }}" \ + --checksum-algorithm CRC32 diff --git a/.github/workflows/rust-instrumented.yml b/.github/workflows/rust-instrumented.yml index e4dc48ee8b7..b72b943895c 100644 --- a/.github/workflows/rust-instrumented.yml +++ b/.github/workflows/rust-instrumented.yml @@ -22,8 +22,28 @@ env: NIGHTLY_TOOLCHAIN: nightly-2026-02-05 jobs: + duckdb-mirror: + name: "Mirror DuckDB to R2" + if: github.event_name == 'pull_request' + uses: ./.github/workflows/duckdb-r2.yml + secrets: inherit + + duckdb-ready: + name: "DuckDB libraries available in R2" + needs: duckdb-mirror + if: ${{ !cancelled() }} + runs-on: ubuntu-latest + timeout-minutes: 5 + steps: + - name: Verify DuckDB mirror + if: ${{ needs.duckdb-mirror.result == 'failure' }} + run: | + echo "DuckDB mirror failed" + exit 1 + rust-coverage: name: "Rust tests (coverage) (${{ matrix.suite }})" + needs: duckdb-ready timeout-minutes: 30 permissions: id-token: write diff --git a/vortex-duckdb/build.rs b/vortex-duckdb/build.rs index 54ff2f23dc4..b51d529ebe0 100644 --- a/vortex-duckdb/build.rs +++ b/vortex-duckdb/build.rs @@ -17,7 +17,10 @@ use std::process::exit; use bindgen::Abi; use bindgen::callbacks::ParseCallbacks; -const DUCKDB_RELEASES_URL: &str = "https://github.com/duckdb/duckdb/releases/download"; +// You can replace this URL with https://github.com/duckdb/duckdb/releases/download +// We use our own infrastructure for testing pre-release builds +const DUCKDB_RELEASES_URL: &str = "https://ci-builds.vortex.dev"; + const DUCKDB_SOURCE_RELEASE_URL: &str = 
"https://github.com/duckdb/duckdb/archive/refs/tags"; const DUCKDB_SOURCE_COMMIT_URL: &str = "https://github.com/duckdb/duckdb/archive"; const DEFAULT_DUCKDB_VERSION: &str = "1.5.3"; @@ -209,8 +212,8 @@ impl ParseCallbacks for BindgenCargoCallbacks { #[derive(Debug, Clone)] enum DuckDBVersion { - Release(String), // i.e. X.Y.Z. Download pre-compiled libraries from GitHub releases. - Commit(String), // Download source and build DuckDB from scratch. + Release(String), // i.e. X.Y.Z. Download precompiled libraries from R2. + Commit(String), // Download precompiled libraries from R2, build from source on a miss. } impl std::fmt::Display for DuckDBVersion { @@ -246,9 +249,10 @@ impl From<&String> for DuckDBVersion { } } -fn download_url(url: &str, path: &Path) { +/// Returns false on a non-retryable client error (4xx) or after failing retries +fn try_download_url(url: &str, path: &Path) -> bool { if path.exists() { - return; + return true; } println!("cargo:info=Downloading DuckDB from {url}"); @@ -268,7 +272,7 @@ fn download_url(url: &str, path: &Path) { let bytes = response.bytes().unwrap().to_vec(); fs::write(path, bytes).unwrap(); println!("cargo:info=Downloaded to {url}"); - return; + return true; } Ok(response) if response.status().is_server_error() => { let status = response.status(); @@ -286,8 +290,8 @@ fn download_url(url: &str, path: &Path) { // Client errors (4xx) are not retryable Ok(response) => { let status = response.status(); - println!("cargo:error=Failed to download {url}: HTTP {status}"); - exit(1); + println!("cargo:warning=Failed to download {url}: HTTP {status}"); + return false; } } @@ -298,8 +302,15 @@ fn download_url(url: &str, path: &Path) { } } - println!("cargo:error=Failed to download {url} after {DOWNLOAD_MAX_RETRIES} attempts"); - exit(1); + println!("cargo:warning=Failed to download {url} after {DOWNLOAD_MAX_RETRIES} attempts"); + false +} + +fn download_url(url: &str, path: &Path) { + if !try_download_url(url, path) { + 
println!("cargo:error=Failed to download {url}"); + exit(1); + } } fn extract(archive: &Path, dest: &Path) { @@ -312,7 +323,9 @@ fn extract(archive: &Path, dest: &Path) { zip::ZipArchive::new(file).unwrap().extract(dest).unwrap(); } -fn download(version: &DuckDBVersion, library_dir: &Path) { +/// Download DuckDB library archive from R2 and extract it. +/// Return false if archive is not available or download failed +fn download(version: &DuckDBVersion, library_dir: &Path) -> bool { let target = env::var("TARGET").unwrap(); let (platform, arch) = match target.as_str() { "aarch64-apple-darwin" | "x86_64-apple-darwin" => ("osx", "universal"), @@ -329,15 +342,18 @@ fn download(version: &DuckDBVersion, library_dir: &Path) { let archive_path = library_dir.join(&archive_name); fs::create_dir_all(library_dir).unwrap(); - download_url(&url, &archive_path); + if !try_download_url(&url, &archive_path) { + return false; + } let duckdb_lib_dir = archive_path.parent().unwrap().to_path_buf(); for artifact in BUILD_ARTIFACTS { if duckdb_lib_dir.join(artifact).exists() { - return; + return true; } } extract(&archive_path, &duckdb_lib_dir); + true } fn build_duckdb(version: &DuckDBVersion, duckdb_repo_dir: &Path) { @@ -358,12 +374,12 @@ fn build_duckdb(version: &DuckDBVersion, duckdb_repo_dir: &Path) { ("1", "0") }; - // If we're building from a commit we need to build benchmark + // If we're building from a commit we need to build benchmark // extensions statically, otherwise DuckDB tries to load them from an http // endpoint with version 0.0.1 (all non-tagged builds) which doesn't exist. 
let static_extensions = match version { - DuckDBVersion::Release(_) => "parquet;jemalloc", - DuckDBVersion::Commit(_) => "parquet;jemalloc;tpch;tpcds", + DuckDBVersion::Release(_) => "parquet", + DuckDBVersion::Commit(_) => "parquet;tpch;tpcds", }; let envs = [ @@ -548,6 +564,8 @@ fn main() { println!("cargo:rerun-if-env-changed=HTTP_TIMEOUT"); println!("cargo:rerun-if-env-changed=TARGET"); + println!("cargo:rustc-check-cfg=cfg(duckdb_release)"); + // These two variables are set in duckdb-vortex's CI. Don't download // duckdb if they are present println!("cargo:rerun-if-env-changed=DUCKDB_SOURCE_DIR"); @@ -580,7 +598,10 @@ fn main() { .unwrap_or_else(|_| DEFAULT_DUCKDB_VERSION.to_owned()); let version = DuckDBVersion::from(&version); match &version { - DuckDBVersion::Release(v) => println!("cargo:info=Using DuckDB release version: {v}"), + DuckDBVersion::Release(v) => { + println!("cargo:info=Using DuckDB release version: {v}"); + println!("cargo:rustc-cfg=duckdb_release"); + } DuckDBVersion::Commit(c) => println!("cargo:info=Using DuckDB commit: {c}"), } @@ -645,10 +666,23 @@ fn main() { }; println!("cargo:info=building DuckDB in {build_type} mode"); - if has_debug_env || matches!(version, DuckDBVersion::Commit(_)) { + if has_debug_env { try_build_duckdb(&source_dir, &library_dir, &version, build_type); } else { - download(&version, &library_dir); + match &version { + DuckDBVersion::Release(_) => { + if !download(&version, &library_dir) { + println!("cargo:error=DuckDB release {version} not available in R2"); + exit(1); + } + } + DuckDBVersion::Commit(_) => { + if !download(&version, &library_dir) { + println!("cargo:info=DuckDB commit {version} not in R2, building from source"); + try_build_duckdb(&source_dir, &library_dir, &version, build_type); + } + } + } }; let duckdb_include_dir = inner_dir.join("src").join("include"); diff --git a/vortex-duckdb/src/e2e_test/vortex_scan_test.rs b/vortex-duckdb/src/e2e_test/vortex_scan_test.rs index 99bab3f71ce..9bbc63a6944 
100644 --- a/vortex-duckdb/src/e2e_test/vortex_scan_test.rs +++ b/vortex-duckdb/src/e2e_test/vortex_scan_test.rs @@ -949,6 +949,13 @@ fn test_vortex_encodings_roundtrip() { assert_eq!(fixed_child_values, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); } +// Spatial extension is not bundled with duckdb. If we're building from a +// commit, don't run this test, since bundling spatial requires openssl-dev +// which is an issue on macos runners. +#[cfg_attr( + not(duckdb_release), + ignore = "spatial extension requires a release DuckDB build" +)] #[test] fn test_geometry() { let file = RUNTIME.block_on(async {