From 94ab4bdbdc0d3687f65147ce28ed03941b4c3b34 Mon Sep 17 00:00:00 2001 From: Qi Zhu <821684824@qq.com> Date: Sun, 24 May 2026 19:00:26 +0800 Subject: [PATCH 1/7] fix(sort-pushdown): restore SortExec elimination after stats-based file reorder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #21956 added a `column_in_file_schema` signal that routes plain-column sort requests with stripped output_ordering into the Inexact branch of `FileScanConfig::try_pushdown_sort`. The Inexact branch unconditionally strips `output_ordering` even when the post-sort file groups have become non-overlapping and the ordering would re-validate — so `SortExec` stayed above the source where it had previously been eliminated. Symptom: with `WITH ORDER (key ASC)` (or inferred parquet `sorting_columns` metadata) plus files listed out of order on disk, PushdownSort reordered the files but kept the SortExec, regressing the behaviour PR #21182 introduced. Fix: in `rebuild_with_source`, when `is_exact=false` but `all_non_overlapping=true` after the stats-based sort, re-validate the declared ordering against the new file groups. If `ordering_satisfy` passes, keep `output_ordering` so the outer wrapper can upgrade Inexact back to Exact. The outer `FileScanConfig::try_pushdown_sort` now also inspects `output_ordering` on the Inexact branch and returns Exact when it survived. Safety: - When `self.output_ordering` is empty (no `WITH ORDER`, no `sorting_columns` metadata) the re-validate falls through, ordering is stripped, and SortExec is kept. - When `all_non_overlapping=false`, ordering is stripped as before. SLT updates: `sort_pushdown.slt` tests 4.1, 6.1, 8.1, G.1, G.2 now expect the SortExec to be eliminated (the behaviour `#21182` originally delivered and SLT comments still describe). New `Test 5b` covers the safety case: files written without `ORDER BY` (no sorting_columns) and no `WITH ORDER` must keep SortExec even when stats happen to be non-overlapping. --- .../datasource/src/file_scan_config/mod.rs | 21 +++- .../src/file_scan_config/sort_pushdown.rs | 68 ++++++++----- .../sqllogictest/test_files/sort_pushdown.slt | 95 +++++++++++++++---- 3 files changed, 138 insertions(+), 46 deletions(-) diff --git a/datafusion/datasource/src/file_scan_config/mod.rs b/datafusion/datasource/src/file_scan_config/mod.rs index e1fd10324373d..a39495994f368 100644 --- a/datafusion/datasource/src/file_scan_config/mod.rs +++ b/datafusion/datasource/src/file_scan_config/mod.rs @@ -973,9 +973,24 @@ impl DataSource for FileScanConfig { } } SortOrderPushdownResult::Inexact { inner } => { - Ok(SortOrderPushdownResult::Inexact { - inner: Arc::new(self.rebuild_with_source(inner, false, order)?), - }) + let config = self.rebuild_with_source(inner, false, order)?; + // `rebuild_with_source` reorders files by stats; if the + // post-sort files are non-overlapping AND the request now + // validates against the new file groups, `output_ordering` + // is preserved and we can upgrade back to Exact. This + // restores the sort-elimination behaviour that lived in + // the `Unsupported` → `try_sort_file_groups_by_statistics` + // path before #21956 routed `column_in_file_schema` cases + // here. + if config.output_ordering.is_empty() { + Ok(SortOrderPushdownResult::Inexact { + inner: Arc::new(config), + }) + } else { + Ok(SortOrderPushdownResult::Exact { + inner: Arc::new(config), + }) + } } SortOrderPushdownResult::Unsupported => { self.try_sort_file_groups_by_statistics(order) diff --git a/datafusion/datasource/src/file_scan_config/sort_pushdown.rs b/datafusion/datasource/src/file_scan_config/sort_pushdown.rs index ece84015a7bbc..5fc0bdabf0965 100644 --- a/datafusion/datasource/src/file_scan_config/sort_pushdown.rs +++ b/datafusion/datasource/src/file_scan_config/sort_pushdown.rs @@ -138,31 +138,51 @@ impl FileScanConfig { false }; - if is_exact && all_non_overlapping { - // Truly exact: within-file ordering guaranteed and files are non-overlapping. - // Keep output_ordering so SortExec can be eliminated for each partition. - // - // We intentionally do NOT redistribute files across groups here. - // The planning-phase bin-packing may interleave file ranges across groups: - // - // Group 0: [f1(1-10), f3(21-30)] ← interleaved with group 1 - // Group 1: [f2(11-20), f4(31-40)] - // - // This interleaving is actually beneficial because SPM pulls from both - // partitions concurrently, keeping parallel I/O active: - // - // SPM: pull P0 [1-10] → pull P1 [11-20] → pull P0 [21-30] → pull P1 [31-40] - // ^^^^^^^^^^^^ ^^^^^^^^^^^^ - // both partitions scanning files simultaneously - // - // If we were to redistribute files consecutively: - // Group 0: [f1(1-10), f2(11-20)] ← all values < group 1 - // Group 1: [f3(21-30), f4(31-40)] - // - // SPM would read ALL of group 0 first (values always smaller), then group 1. - // This degrades to single-threaded sequential I/O — the other partition - // sits idle the entire time, losing the parallelism benefit. + // Decide whether to keep `output_ordering` (i.e. let the outer + // pushdown report `Exact` and drop `SortExec`). + // + // Two paths can produce a keep: + // + // 1. `is_exact && all_non_overlapping`: the source already had + // validated ordering and the post-sort files still don't + // overlap — Exact carries through unchanged. + // + // 2. `!is_exact && all_non_overlapping`: source returned + // `Inexact` because pre-sort `validated_output_ordering()` + // stripped the declaration (files were listed out of order + // on disk). After our stats-based sort the files are now + // non-overlapping — re-validate against the new file + // groups and, if it passes, upgrade back to Exact so the + // outer wrapper drops the `SortExec`. Without this, the + // `Inexact` branch stayed Inexact even when reorder + // restored a perfectly valid ordering, leaving an + // unnecessary `SortExec` above the source (regression + // after #21956's `column_in_file_schema` signal pushed + // this scenario into the Inexact branch instead of the + // `try_sort_file_groups_by_statistics` fallback). + // + // We intentionally do NOT redistribute files across groups here. + // The planning-phase bin-packing may interleave file ranges across groups: + // + // Group 0: [f1(1-10), f3(21-30)] ← interleaved with group 1 + // Group 1: [f2(11-20), f4(31-40)] + // + // This interleaving is actually beneficial because SPM pulls from both + // partitions concurrently, keeping parallel I/O active. + let keep_ordering = if all_non_overlapping { + if is_exact { + true + } else { + // Re-validate: now that files are sorted, can the + // request be satisfied? + let new_eq_props = new_config.eq_properties(); + new_eq_props.ordering_satisfy(order.iter().cloned())? + } } else { + false + }; + + if !keep_ordering { new_config.output_ordering = vec![]; } diff --git a/datafusion/sqllogictest/test_files/sort_pushdown.slt b/datafusion/sqllogictest/test_files/sort_pushdown.slt index 540562eb3bc8d..ac2f483c07f51 100644 --- a/datafusion/sqllogictest/test_files/sort_pushdown.slt +++ b/datafusion/sqllogictest/test_files/sort_pushdown.slt @@ -1100,8 +1100,9 @@ CREATE EXTERNAL TABLE reversed_parquet(id INT, value INT) STORED AS PARQUET LOCATION 'test_files/scratch/sort_pushdown/reversed/'; -# Test 4.1: PushdownSort reorders files by min/max statistics so they are -# already in correct sort order → non-overlapping → no SortExec needed. +# Test 4.1: PushdownSort reorders files by min/max statistics; the +# post-sort file groups are non-overlapping, the inferred ordering +# re-validates, and the SortExec above can be eliminated. # (files reordered from [a_high, b_mid, c_low] to [c_low, b_mid, a_high]) query TT EXPLAIN SELECT * FROM reversed_parquet ORDER BY id ASC; @@ -1109,9 +1110,7 @@ EXPLAIN SELECT * FROM reversed_parquet ORDER BY id ASC; logical_plan 01)Sort: reversed_parquet.id ASC NULLS LAST 02)--TableScan: reversed_parquet projection=[id, value] -physical_plan -01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/c_low.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/a_high.parquet]]}, projection=[id, value], file_type=parquet, sort_order_for_reorder=[id@0 ASC NULLS LAST] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/c_low.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/a_high.parquet]]}, projection=[id, value], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet, sort_order_for_reorder=[id@0 ASC NULLS LAST] # Test 4.2: Results must be correct query II @@ -1175,10 +1174,72 @@ SELECT * FROM overlap_parquet ORDER BY id ASC; 5 500 6 600 +# Test 5b: Safety case — no WITH ORDER, files written without ORDER BY (no +# sorting_columns metadata). Source has no way to declare per-file ordering, +# so even though min/max stats happen to be non-overlapping, the optimizer +# must NOT eliminate SortExec. +statement ok +CREATE TABLE no_decl_low(id INT, value INT) AS VALUES (1, 100), (3, 300), (2, 200); + +statement ok +CREATE TABLE no_decl_mid(id INT, value INT) AS VALUES (6, 600), (4, 400), (5, 500); + +statement ok +CREATE TABLE no_decl_high(id INT, value INT) AS VALUES (9, 900), (8, 800), (7, 700); + +# Write WITHOUT ORDER BY so each file lacks sorting_columns metadata. +query I +COPY no_decl_low TO 'test_files/scratch/sort_pushdown/no_decl/a_low.parquet'; +---- +3 + +query I +COPY no_decl_mid TO 'test_files/scratch/sort_pushdown/no_decl/b_mid.parquet'; +---- +3 + +query I +COPY no_decl_high TO 'test_files/scratch/sort_pushdown/no_decl/c_high.parquet'; +---- +3 + +statement ok +CREATE EXTERNAL TABLE no_decl_parquet(id INT, value INT) +STORED AS PARQUET +LOCATION 'test_files/scratch/sort_pushdown/no_decl/'; + +# Min/max stats per file happen to be non-overlapping (1-3, 4-6, 7-9) but the +# rows inside each file are NOT sorted by id. Without an ordering declaration +# (WITH ORDER or parquet sorting_columns), the optimizer cannot prove the +# output would be sorted — SortExec must stay. +query TT +EXPLAIN SELECT * FROM no_decl_parquet ORDER BY id ASC; +---- +logical_plan +01)Sort: no_decl_parquet.id ASC NULLS LAST +02)--TableScan: no_decl_parquet projection=[id, value] +physical_plan +01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/no_decl/a_low.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/no_decl/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/no_decl/c_high.parquet]]}, projection=[id, value], file_type=parquet, sort_order_for_reorder=[id@0 ASC NULLS LAST] + +# Results must still be correct (SortExec does the final sort) +query II +SELECT * FROM no_decl_parquet ORDER BY id ASC; +---- +1 100 +2 200 +3 300 +4 400 +5 500 +6 600 +7 700 +8 800 +9 900 + # Test 6: WITH ORDER + reversed filesystem order # Same file setup as Test 4 but explicitly declaring ordering via WITH ORDER. -# Even with WITH ORDER, the optimizer should detect that inter-file order is wrong -# and keep SortExec. +# PushdownSort reorders files by min/max stats; after reorder the inter-file +# ordering re-validates and the SortExec above is eliminated. statement ok CREATE EXTERNAL TABLE reversed_with_order_parquet(id INT, value INT) @@ -1194,9 +1255,7 @@ EXPLAIN SELECT * FROM reversed_with_order_parquet ORDER BY id ASC; logical_plan 01)Sort: reversed_with_order_parquet.id ASC NULLS LAST 02)--TableScan: reversed_with_order_parquet projection=[id, value] -physical_plan -01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/c_low.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/a_high.parquet]]}, projection=[id, value], file_type=parquet, sort_order_for_reorder=[id@0 ASC NULLS LAST] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/c_low.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/a_high.parquet]]}, projection=[id, value], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet, sort_order_for_reorder=[id@0 ASC NULLS LAST] # Test 6.2: Results must be correct query II @@ -1333,9 +1392,7 @@ EXPLAIN SELECT * FROM desc_reversed_parquet ORDER BY id DESC; logical_plan 01)Sort: desc_reversed_parquet.id DESC NULLS FIRST 02)--TableScan: desc_reversed_parquet projection=[id, value] -physical_plan -01)SortExec: expr=[id@0 DESC], preserve_partitioning=[false] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/desc_reversed/b_high.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/desc_reversed/a_low.parquet]]}, projection=[id, value], file_type=parquet, sort_order_for_reorder=[id@0 DESC], reverse_row_groups=true +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/desc_reversed/b_high.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/desc_reversed/a_low.parquet]]}, projection=[id, value], output_ordering=[id@0 DESC], file_type=parquet, sort_order_for_reorder=[id@0 DESC], reverse_row_groups=true # Test 8.2: Results must be correct query II @@ -2218,7 +2275,7 @@ STORED AS PARQUET LOCATION 'test_files/scratch/sort_pushdown/tg_buffer/' WITH ORDER (id ASC); -# Test G.1: BufferExec appears between SPM and DataSourceExec +# Test G.1: SortExec eliminated; BufferExec replaces it between SPM and DataSourceExec query TT EXPLAIN SELECT * FROM tg_buffer ORDER BY id ASC; ---- @@ -2227,8 +2284,8 @@ logical_plan 02)--TableScan: tg_buffer projection=[id, value] physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST] -02)--SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/a_high.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/c_low.parquet]]}, projection=[id, value], file_type=parquet, sort_order_for_reorder=[id@0 ASC NULLS LAST] +02)--BufferExec: capacity=1073741824 +03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/a_high.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/c_low.parquet]]}, projection=[id, value], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet, sort_order_for_reorder=[id@0 ASC NULLS LAST] # Verify correctness query II @@ -2245,7 +2302,7 @@ SELECT * FROM tg_buffer ORDER BY id ASC; 9 900 10 1000 -# Test G.2: LIMIT query with BufferExec +# Test G.2: LIMIT query — SortExec eliminated, limit pushed to source; BufferExec stays query TT EXPLAIN SELECT * FROM tg_buffer ORDER BY id ASC LIMIT 3; ---- @@ -2254,8 +2311,8 @@ logical_plan 02)--TableScan: tg_buffer projection=[id, value] physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST], fetch=3 -02)--SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/a_high.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/c_low.parquet]]}, projection=[id, value], file_type=parquet, predicate=DynamicFilter [ empty ], sort_order_for_reorder=[id@0 ASC NULLS LAST] +02)--BufferExec: capacity=1073741824 +03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/a_high.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/c_low.parquet]]}, projection=[id, value], limit=3, output_ordering=[id@0 ASC NULLS LAST], file_type=parquet, sort_order_for_reorder=[id@0 ASC NULLS LAST] query II SELECT * FROM tg_buffer ORDER BY id ASC LIMIT 3; From 9ea1bb83cbea98b299751859bb961d2183acb4e5 Mon Sep 17 00:00:00 2001 From: Qi Zhu <821684824@qq.com> Date: Sun, 24 May 2026 19:13:29 +0800 Subject: [PATCH 2/7] address Copilot review: add NULL guard + test cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. NULL safety guard In `rebuild_with_source`'s Inexact-branch re-validate path, mirror the existing `try_sort_file_groups_by_statistics` fallback by refusing to upgrade to Exact when any file in the (post-sort) file groups has NULLs in the sort columns. Otherwise NULLs sitting at the end of a non-last file (NULLS LAST) would land mid-stream after concatenation, breaking the ordering. 2. SLT test cleanup - Test 5b: add DROP TABLE statements for no_decl_low / no_decl_mid / no_decl_high / no_decl_parquet so the test is self-contained. - Test 5c (new): NULL-safety regression test — files with reversed filesystem naming so the Inexact branch fires, and the lower-id file contains NULLs in the sort column. Asserts SortExec stays and query results are correct. --- .../src/file_scan_config/sort_pushdown.rs | 27 ++++++- .../sqllogictest/test_files/sort_pushdown.slt | 81 +++++++++++++++++++ 2 files changed, 106 insertions(+), 2 deletions(-) diff --git a/datafusion/datasource/src/file_scan_config/sort_pushdown.rs b/datafusion/datasource/src/file_scan_config/sort_pushdown.rs index 5fc0bdabf0965..4d71d4ced4921 100644 --- a/datafusion/datasource/src/file_scan_config/sort_pushdown.rs +++ b/datafusion/datasource/src/file_scan_config/sort_pushdown.rs @@ -175,8 +175,31 @@ impl FileScanConfig { } else { // Re-validate: now that files are sorted, can the // request be satisfied? - let new_eq_props = new_config.eq_properties(); - new_eq_props.ordering_satisfy(order.iter().cloned())? + // + // Same NULL guard as `try_sort_file_groups_by_statistics`: + // we cannot claim Exact if any non-last file contains + // NULLs in the sort columns. With NULLS LAST those + // NULLs sit after all non-null rows in the file, so + // when the next file's non-nulls are smaller than the + // previous file's max, they'd appear *after* the NULLs + // in the concatenated stream — breaking the ordering. + let projected_schema = new_config.projected_schema()?; + let projection_indices = new_config + .file_source + .projection() + .as_ref() + .and_then(|p| ordered_column_indices_from_projection(p)); + if any_file_has_nulls_in_sort_columns( + &new_config.file_groups, + order, + &projected_schema, + projection_indices.as_deref(), + ) { + false + } else { + let new_eq_props = new_config.eq_properties(); + new_eq_props.ordering_satisfy(order.iter().cloned())? + } } } else { false diff --git a/datafusion/sqllogictest/test_files/sort_pushdown.slt b/datafusion/sqllogictest/test_files/sort_pushdown.slt index ac2f483c07f51..60960919599c7 100644 --- a/datafusion/sqllogictest/test_files/sort_pushdown.slt +++ b/datafusion/sqllogictest/test_files/sort_pushdown.slt @@ -1236,6 +1236,87 @@ SELECT * FROM no_decl_parquet ORDER BY id ASC; 8 800 9 900 +# Cleanup Test 5b +statement ok +DROP TABLE no_decl_low; + +statement ok +DROP TABLE no_decl_mid; + +statement ok +DROP TABLE no_decl_high; + +statement ok +DROP TABLE no_decl_parquet; + +# Test 5c: NULL safety — files in **wrong** filesystem order so the +# Inexact branch fires; the previously-non-last file contains NULLs in +# the sort column. With NULLS LAST, NULLs inside a file sit after all +# non-null rows. If the next file's non-null values are smaller than +# the previous file's max, those values would land AFTER the NULLs in +# the concatenated stream — breaking the ordering. The fix must NOT +# upgrade to Exact here even though stats are non-overlapping. + +statement ok +CREATE TABLE null_safety_high(id INT, value INT) AS VALUES (4, 400), (5, 500), (6, 600); + +statement ok +CREATE TABLE null_safety_low_with_nulls(id INT, value INT) AS VALUES (1, 100), (2, 200), (3, 300), (NULL, 999); + +# Name files so alphabetical order is REVERSED relative to id order +# (a_high before b_low) — triggers the Inexact / re-validate path. +query I +COPY (SELECT * FROM null_safety_high ORDER BY id ASC NULLS LAST) +TO 'test_files/scratch/sort_pushdown/null_safety/a_high.parquet'; +---- +3 + +query I +COPY (SELECT * FROM null_safety_low_with_nulls ORDER BY id ASC NULLS LAST) +TO 'test_files/scratch/sort_pushdown/null_safety/b_low_nulls.parquet'; +---- +4 + +statement ok +CREATE EXTERNAL TABLE null_safety_parquet(id INT, value INT) +STORED AS PARQUET +LOCATION 'test_files/scratch/sort_pushdown/null_safety/' +WITH ORDER (id ASC NULLS LAST); + +# After Phase 2 reorder file_groups would be [b_low_nulls, a_high] and +# min/max would be non-overlapping — but b_low_nulls has NULLs in the +# sort column, so we must NOT upgrade to Exact. SortExec stays. +query TT +EXPLAIN SELECT * FROM null_safety_parquet ORDER BY id ASC NULLS LAST; +---- +logical_plan +01)Sort: null_safety_parquet.id ASC NULLS LAST +02)--TableScan: null_safety_parquet projection=[id, value] +physical_plan +01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/null_safety/b_low_nulls.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/null_safety/a_high.parquet]]}, projection=[id, value], file_type=parquet, sort_order_for_reorder=[id@0 ASC NULLS LAST] + +# Results must still be correct (SortExec does the final sort) +query II +SELECT * FROM null_safety_parquet ORDER BY id ASC NULLS LAST; +---- +1 100 +2 200 +3 300 +4 400 +5 500 +6 600 +NULL 999 + +statement ok +DROP TABLE null_safety_high; + +statement ok +DROP TABLE null_safety_low_with_nulls; + +statement ok +DROP TABLE null_safety_parquet; + # Test 6: WITH ORDER + reversed filesystem order # Same file setup as Test 4 but explicitly declaring ordering via WITH ORDER. # PushdownSort reorders files by min/max stats; after reorder the inter-file From 42149c209b547b95d48420876ebb602fd8855c5a Mon Sep 17 00:00:00 2001 From: Qi Zhu <821684824@qq.com> Date: Sun, 24 May 2026 19:19:19 +0800 Subject: [PATCH 3/7] =?UTF-8?q?docs:=20update=20try=5Fpushdown=5Fsort=20di?= =?UTF-8?q?agram=20to=20reflect=20Inexact=E2=86=92Exact=20upgrade?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The doc-comment for FileScanConfig::try_pushdown_sort still described the pre-fix Inexact branch ("SortExec kept, scan optimized"). Update the diagram to reflect the new behaviour: rebuild_with_source can upgrade back to Exact when post-sort file groups are non-overlapping, the request re-validates, and no NULLs sit in the sort columns of non-last files. Also note that the Unsupported fallback now mirrors the same NULL check. --- datafusion/datasource/src/file_scan_config/mod.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/datafusion/datasource/src/file_scan_config/mod.rs b/datafusion/datasource/src/file_scan_config/mod.rs index a39495994f368..e06e3519ee838 100644 --- a/datafusion/datasource/src/file_scan_config/mod.rs +++ b/datafusion/datasource/src/file_scan_config/mod.rs @@ -937,14 +937,19 @@ impl DataSource for FileScanConfig { /// │ → SortExec removed, fetch (LIMIT) pushed to DataSourceExec /// │ /// ├─► FileSource returns Inexact - /// │ (reverse_row_groups=true) - /// │ → SortExec kept, scan optimized + /// │ (e.g. column_in_file_schema: opener will reorder RGs at runtime) + /// │ → rebuild_with_source: sort files by stats; if the post-sort + /// │ file groups are non-overlapping AND the request now validates + /// │ AND no NULLs sit in the sort columns of non-last files, + /// │ upgrade back to Exact (SortExec removed). Otherwise stays + /// │ Inexact and SortExec is kept while the scan is still + /// │ optimised via `sort_order_for_reorder` / `reverse_row_groups`. /// │ /// └─► FileSource returns Unsupported - /// (ordering stripped because files in wrong order) + /// (e.g. expression sort key or partition column) /// → try_sort_file_groups_by_statistics(): /// 1. Sort files within each group by min/max statistics - /// 2. Re-check: non-overlapping + ordering valid? + /// 2. Re-check: non-overlapping + ordering valid + no NULLs? /// YES → Exact → SortExec removed /// NO → Inexact (files reordered, Sort stays) /// ``` From a22da9fcfaadffb91f456e18ac019ebcaf875a1a Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 24 May 2026 09:26:46 -0500 Subject: [PATCH 4/7] refactor: express keep_ordering decision as an explicit match MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the nested `if all_non_overlapping { if is_exact {..} else {..} } else { false }` in `rebuild_with_source` with a `match (all_non_overlapping, is_exact)`. No behaviour change — the three arms map one-to-one to the prior branches and make the "overlap → drop", "exact → keep", "inexact → re-validate" cases explicit. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/file_scan_config/sort_pushdown.rs | 34 ++++++++++--------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/datafusion/datasource/src/file_scan_config/sort_pushdown.rs b/datafusion/datasource/src/file_scan_config/sort_pushdown.rs index 4d71d4ced4921..3f5beed20fa8d 100644 --- a/datafusion/datasource/src/file_scan_config/sort_pushdown.rs +++ b/datafusion/datasource/src/file_scan_config/sort_pushdown.rs @@ -169,20 +169,24 @@ impl FileScanConfig { // // This interleaving is actually beneficial because SPM pulls from both // partitions concurrently, keeping parallel I/O active. - let keep_ordering = if all_non_overlapping { - if is_exact { - true - } else { - // Re-validate: now that files are sorted, can the - // request be satisfied? - // - // Same NULL guard as `try_sort_file_groups_by_statistics`: - // we cannot claim Exact if any non-last file contains - // NULLs in the sort columns. With NULLS LAST those - // NULLs sit after all non-null rows in the file, so - // when the next file's non-nulls are smaller than the - // previous file's max, they'd appear *after* the NULLs - // in the concatenated stream — breaking the ordering. + let keep_ordering = match (all_non_overlapping, is_exact) { + // Files still overlap after the stats sort — the combined + // stream isn't ordered, so `output_ordering` must be dropped. + (false, _) => false, + // Source already had validated ordering and the post-sort + // files still don't overlap — Exact carries through. + (true, true) => true, + // Source returned `Inexact`; re-validate against the + // reordered file groups to decide whether to upgrade. + // + // Same NULL guard as `try_sort_file_groups_by_statistics`: + // we cannot claim Exact if any non-last file contains + // NULLs in the sort columns. With NULLS LAST those + // NULLs sit after all non-null rows in the file, so + // when the next file's non-nulls are smaller than the + // previous file's max, they'd appear *after* the NULLs + // in the concatenated stream — breaking the ordering. + (true, false) => { let projected_schema = new_config.projected_schema()?; let projection_indices = new_config .file_source @@ -201,8 +205,6 @@ impl FileScanConfig { new_eq_props.ordering_satisfy(order.iter().cloned())? } } - } else { - false }; if !keep_ordering { From 4eeced58870eecc2641ce89d758756566201bbb2 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 24 May 2026 09:28:59 -0500 Subject: [PATCH 5/7] =?UTF-8?q?fix(sort-pushdown):=20drop=20runtime=20row-?= =?UTF-8?q?group=20reorder=20hints=20on=20Inexact=E2=86=92Exact=20upgrade?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When `try_pushdown_sort` upgrades the `Inexact` result back to `Exact` (post-sort file groups non-overlapping + declared ordering re-validates), the `SortExec` is removed. But the source rebuilt from the `Inexact` result still carried `sort_order_for_reorder` / `reverse_row_groups`, so the parquet opener kept reordering row groups at runtime. With the `SortExec` gone, that runtime reorder is no longer a harmless optimisation — it is load-bearing for correctness, and it is wrong for DESC requests. The opener sorts row groups ASC-by-min and then `reverse()`s them; two row groups within a single file that share the same `min` but differ in `max` get mis-ordered by the reverse. Example: a file declared `WITH ORDER (id DESC)` containing `[10,8,8,8]` whose row groups are `[10,8]` and `[8,8]` (both min 8) streams as `8,8,10,8`. Previously the `SortExec` re-sorted that; once eliminated, the wrong order is the final result. Because the upgrade only fires when each file's declared ordering re-validates against the non-overlapping post-sort file groups, reading the files in their natural (declared-sorted) order already yields the requested ordering — no runtime reorder is needed. Restore the original, hint-free source on upgrade, matching the `Unsupported → Exact` path which also reads files naturally. Existing tests 4.1 / 6.1 / 8.1 / G.1 / G.2 lose the now-absent `sort_order_for_reorder` / `reverse_row_groups` from their `DataSourceExec` plans; data results are unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../datasource/src/file_scan_config/mod.rs | 22 ++++++++++++++++++- .../sqllogictest/test_files/sort_pushdown.slt | 10 ++++----- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/datafusion/datasource/src/file_scan_config/mod.rs b/datafusion/datasource/src/file_scan_config/mod.rs index e06e3519ee838..5ea1fbb0c863e 100644 --- a/datafusion/datasource/src/file_scan_config/mod.rs +++ b/datafusion/datasource/src/file_scan_config/mod.rs @@ -978,7 +978,7 @@ impl DataSource for FileScanConfig { } } SortOrderPushdownResult::Inexact { inner } => { - let config = self.rebuild_with_source(inner, false, order)?; + let mut config = self.rebuild_with_source(inner, false, order)?; // `rebuild_with_source` reorders files by stats; if the // post-sort files are non-overlapping AND the request now // validates against the new file groups, `output_ordering` @@ -992,6 +992,26 @@ impl DataSource for FileScanConfig { inner: Arc::new(config), }) } else { + // Upgrading to Exact: the post-sort file groups are + // non-overlapping and each file's declared ordering + // re-validates, so reading the files in their natural + // (declared-sorted) order already yields the requested + // ordering — exactly like the `Unsupported` → Exact path, + // which reads files in natural order too. + // + // Drop the runtime row-group reorder hints the Inexact + // source carried (`sort_order_for_reorder` / + // `reverse_row_groups`) by restoring the original, + // hint-free source. With the `SortExec` removed those + // hints are not just redundant but unsafe: for a DESC + // request the opener sorts row groups ASC-by-min and then + // reverses them, which mis-orders two row groups within a + // single file that share the same `min` (e.g. a file + // `[10,8,8,8]` whose row groups are `[10,8]` and `[8,8]` + // would stream as `8,8,10,8`). The `SortExec` used to + // mask this; once it is gone the reordered stream is the + // final, wrong answer. + config.file_source = Arc::clone(&self.file_source); Ok(SortOrderPushdownResult::Exact { inner: Arc::new(config), }) diff --git a/datafusion/sqllogictest/test_files/sort_pushdown.slt b/datafusion/sqllogictest/test_files/sort_pushdown.slt index 60960919599c7..7396de92c89c8 100644 --- a/datafusion/sqllogictest/test_files/sort_pushdown.slt +++ b/datafusion/sqllogictest/test_files/sort_pushdown.slt @@ -1110,7 +1110,7 @@ EXPLAIN SELECT * FROM reversed_parquet ORDER BY id ASC; logical_plan 01)Sort: reversed_parquet.id ASC NULLS LAST 02)--TableScan: reversed_parquet projection=[id, value] -physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/c_low.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/a_high.parquet]]}, projection=[id, value], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet, sort_order_for_reorder=[id@0 ASC NULLS LAST] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/c_low.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/a_high.parquet]]}, projection=[id, value], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet # Test 4.2: Results must be correct query II @@ -1336,7 +1336,7 @@ EXPLAIN SELECT * FROM reversed_with_order_parquet ORDER BY id ASC; logical_plan 01)Sort: reversed_with_order_parquet.id ASC NULLS LAST 02)--TableScan: reversed_with_order_parquet projection=[id, value] -physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/c_low.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/a_high.parquet]]}, projection=[id, value], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet, sort_order_for_reorder=[id@0 ASC NULLS LAST] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/c_low.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/a_high.parquet]]}, projection=[id, value], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet # Test 6.2: Results must be correct query II @@ -1473,7 +1473,7 @@ EXPLAIN SELECT * FROM desc_reversed_parquet ORDER BY id DESC; logical_plan 01)Sort: desc_reversed_parquet.id DESC NULLS FIRST 02)--TableScan: desc_reversed_parquet projection=[id, value] -physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/desc_reversed/b_high.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/desc_reversed/a_low.parquet]]}, projection=[id, value], output_ordering=[id@0 DESC], file_type=parquet, sort_order_for_reorder=[id@0 DESC], reverse_row_groups=true +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/desc_reversed/b_high.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/desc_reversed/a_low.parquet]]}, projection=[id, value], output_ordering=[id@0 DESC], file_type=parquet # Test 8.2: Results must be correct query II @@ -2366,7 +2366,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST] 02)--BufferExec: capacity=1073741824 -03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/a_high.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/c_low.parquet]]}, projection=[id, value], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet, sort_order_for_reorder=[id@0 ASC NULLS LAST] +03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/a_high.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/c_low.parquet]]}, projection=[id, value], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet # Verify correctness query II @@ -2393,7 +2393,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST], fetch=3 02)--BufferExec: capacity=1073741824 -03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/a_high.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/c_low.parquet]]}, projection=[id, value], limit=3, output_ordering=[id@0 ASC NULLS LAST], file_type=parquet, sort_order_for_reorder=[id@0 ASC NULLS LAST] +03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/a_high.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/c_low.parquet]]}, projection=[id, value], limit=3, output_ordering=[id@0 ASC NULLS LAST], file_type=parquet query II SELECT * FROM tg_buffer ORDER BY id ASC LIMIT 3; From ae3e8c794a7074c24b29a4c48fc45bd21f29ba4f Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 24 May 2026 09:31:18 -0500 Subject: [PATCH 6/7] test(sort-pushdown): cover DESC multi-row-group files with a shared min MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Test 8b: a DESC scan over a file whose row groups share the same min value (`[10,8,8,8]` → row groups `[10,8]` and `[8,8]`), listed out of order on disk so the Inexact→Exact upgrade fires and SortExec is eliminated. Without the fix this returns `8,8,10,8,...` (id=10 in third position): the opener reorders row groups ASC-by-min then reverses, mis-ordering the two min=8 row groups, and the eliminated SortExec no longer masks it. The test asserts both the clean plan (no runtime reorder hints) and the correct DESC result. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../sqllogictest/test_files/sort_pushdown.slt | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/datafusion/sqllogictest/test_files/sort_pushdown.slt b/datafusion/sqllogictest/test_files/sort_pushdown.slt index 7396de92c89c8..36fb38f5b4026 100644 --- a/datafusion/sqllogictest/test_files/sort_pushdown.slt +++ b/datafusion/sqllogictest/test_files/sort_pushdown.slt @@ -1486,6 +1486,78 @@ SELECT * FROM desc_reversed_parquet ORDER BY id DESC; 2 200 1 100 +# Test 8b: DESC with multiple row groups per file sharing a min value. +# Regression test for the Inexact→Exact upgrade: when SortExec is eliminated +# the files must be read in natural order. The opener's runtime row-group +# reorder (sort ASC-by-min then reverse) mis-orders two row groups in one file +# that share the same min — so the upgrade must NOT leave those hints active. +# +# File b_high is DESC-sorted [10,8,8,8] written with 2 rows per row group: +# RG0 = [10, 8] (min 8, max 10) +# RG1 = [ 8, 8] (min 8, max 8) +# Both row groups have min=8. Naively reordering RGs ASC-by-min then reversing +# yields [RG1, RG0] → 8,8,10,8 (wrong). Natural order [RG0, RG1] is correct. + +statement ok +CREATE TABLE rg_desc_high(id INT, value INT) AS VALUES (10, 100), (8, 801), (8, 802), (8, 803); + +statement ok +CREATE TABLE rg_desc_low(id INT, value INT) AS VALUES (3, 300), (2, 200), (1, 100); + +query I +COPY (SELECT * FROM rg_desc_high ORDER BY id DESC) +TO 'test_files/scratch/sort_pushdown/rg_desc/b_high.parquet' +OPTIONS ('format.max_row_group_size' '2'); +---- +4 + +query I +COPY (SELECT * FROM rg_desc_low ORDER BY id DESC) +TO 'test_files/scratch/sort_pushdown/rg_desc/a_low.parquet' +OPTIONS ('format.max_row_group_size' '2'); +---- +3 + +# Files named so filesystem order [a_low, b_high] is wrong for DESC → the +# Inexact path fires, stats reorder makes file groups [b_high, a_low] +# non-overlapping, and the upgrade eliminates SortExec. +statement ok +CREATE EXTERNAL TABLE rg_desc_parquet(id INT, value INT) +STORED AS PARQUET +LOCATION 'test_files/scratch/sort_pushdown/rg_desc/' +WITH ORDER (id DESC); + +# SortExec eliminated, files reordered, NO sort_order_for_reorder / +# reverse_row_groups (natural read is correct after the upgrade). +query TT +EXPLAIN SELECT id FROM rg_desc_parquet ORDER BY id DESC; +---- +logical_plan +01)Sort: rg_desc_parquet.id DESC NULLS FIRST +02)--TableScan: rg_desc_parquet projection=[id] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/rg_desc/b_high.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/rg_desc/a_low.parquet]]}, projection=[id], output_ordering=[id@0 DESC], file_type=parquet + +# Results must be in DESC order — id=10 first. +query I +SELECT id FROM rg_desc_parquet ORDER BY id DESC; +---- +10 +8 +8 +8 +3 +2 +1 + +statement ok +DROP TABLE rg_desc_parquet; + +statement ok +DROP TABLE rg_desc_high; + +statement ok +DROP TABLE rg_desc_low; + # Test 9: Multi-column sort key validation # Files have (category, id) ordering. Files share a boundary value on category='B' # so column-level min/max statistics overlap on the primary key column. From 8994832a13d96e92f468630b7d6f0ce72a528436 Mon Sep 17 00:00:00 2001 From: Qi Zhu Date: Sun, 24 May 2026 22:47:25 +0800 Subject: [PATCH 7/7] address review: early-return + fix typos lint - adriangb r3294820727: flatten the if/else in try_pushdown_sort Inexact arm into an early-return guard clause so the Exact-upgrade block sits at one less level of indentation - typos CI: 'mis-orders' tripped the spell checker on leading 'mis'; reword the comment as 'reorders ... incorrectly' (no behaviour change) --- .../datasource/src/file_scan_config/mod.rs | 51 +++++++++---------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/datafusion/datasource/src/file_scan_config/mod.rs b/datafusion/datasource/src/file_scan_config/mod.rs index 5ea1fbb0c863e..4bf86e17d387d 100644 --- a/datafusion/datasource/src/file_scan_config/mod.rs +++ b/datafusion/datasource/src/file_scan_config/mod.rs @@ -988,34 +988,33 @@ impl DataSource for FileScanConfig { // path before #21956 routed `column_in_file_schema` cases // here. if config.output_ordering.is_empty() { - Ok(SortOrderPushdownResult::Inexact { + return Ok(SortOrderPushdownResult::Inexact { inner: Arc::new(config), - }) - } else { - // Upgrading to Exact: the post-sort file groups are - // non-overlapping and each file's declared ordering - // re-validates, so reading the files in their natural - // (declared-sorted) order already yields the requested - // ordering — exactly like the `Unsupported` → Exact path, - // which reads files in natural order too. - // - // Drop the runtime row-group reorder hints the Inexact - // source carried (`sort_order_for_reorder` / - // `reverse_row_groups`) by restoring the original, - // hint-free source. With the `SortExec` removed those - // hints are not just redundant but unsafe: for a DESC - // request the opener sorts row groups ASC-by-min and then - // reverses them, which mis-orders two row groups within a - // single file that share the same `min` (e.g. a file - // `[10,8,8,8]` whose row groups are `[10,8]` and `[8,8]` - // would stream as `8,8,10,8`). The `SortExec` used to - // mask this; once it is gone the reordered stream is the - // final, wrong answer. - config.file_source = Arc::clone(&self.file_source); - Ok(SortOrderPushdownResult::Exact { - inner: Arc::new(config), - }) + }); } + // Upgrading to Exact: the post-sort file groups are + // non-overlapping and each file's declared ordering + // re-validates, so reading the files in their natural + // (declared-sorted) order already yields the requested + // ordering — exactly like the `Unsupported` → Exact path, + // which reads files in natural order too. + // + // Drop the runtime row-group reorder hints the Inexact + // source carried (`sort_order_for_reorder` / + // `reverse_row_groups`) by restoring the original, + // hint-free source. With the `SortExec` removed those + // hints are not just redundant but unsafe: for a DESC + // request the opener sorts row groups ASC-by-min and then + // reverses them, which reorders two row groups within a + // single file that share the same `min` incorrectly + // (e.g. a file `[10,8,8,8]` whose row groups are + // `[10,8]` and `[8,8]` would stream as `8,8,10,8`). + // The `SortExec` used to mask this; once it is gone the + // reordered stream is the final, wrong answer. + config.file_source = Arc::clone(&self.file_source); + Ok(SortOrderPushdownResult::Exact { + inner: Arc::new(config), + }) } SortOrderPushdownResult::Unsupported => { self.try_sort_file_groups_by_statistics(order)