Skip to content

Commit ca8da0f

Browse files
committed
Add support for Bloom filters on binary columns
1 parent 9b098ee commit ca8da0f

File tree

2 files changed

+103
-0
lines changed

2 files changed

+103
-0
lines changed

datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,7 @@ impl PruningStatistics for BloomFilterStatistics {
225225
.map(|value| {
226226
match value {
227227
ScalarValue::Utf8(Some(v)) => sbbf.check(&v.as_str()),
228+
ScalarValue::Binary(Some(v)) => sbbf.check(v),
228229
ScalarValue::Boolean(Some(v)) => sbbf.check(v),
229230
ScalarValue::Float64(Some(v)) => sbbf.check(v),
230231
ScalarValue::Float32(Some(v)) => sbbf.check(v),

datafusion/core/tests/parquet/row_group_pruning.rs

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -846,6 +846,108 @@ async fn prune_string_lt() {
846846
.await;
847847
}
848848

849+
#[tokio::test]
850+
async fn prune_binary_eq_match() {
851+
RowGroupPruningTest::new()
852+
.with_scenario(Scenario::ByteArray)
853+
.with_query(
854+
"SELECT name, service_binary FROM t WHERE service_binary = CAST('backend one' AS bytea)",
855+
)
856+
.with_expected_errors(Some(0))
857+
// false positive on 'all backends' batch: 'backend five' < 'backend one' < 'backend three'
858+
.with_matched_by_stats(Some(2))
859+
.with_pruned_by_stats(Some(1))
860+
.with_matched_by_bloom_filter(Some(1))
861+
.with_pruned_by_bloom_filter(Some(1))
862+
.with_expected_rows(1)
863+
.test_row_group_prune()
864+
.await;
865+
}
866+
867+
#[tokio::test]
868+
async fn prune_binary_eq_no_match() {
869+
RowGroupPruningTest::new()
870+
.with_scenario(Scenario::ByteArray)
871+
.with_query(
872+
"SELECT name, service_binary FROM t WHERE service_binary = CAST('backend nine' AS bytea)",
873+
)
874+
.with_expected_errors(Some(0))
875+
// false positive on 'all backends' batch: 'backend five' < 'backend nine' < 'backend three'
876+
.with_matched_by_stats(Some(1))
877+
.with_pruned_by_stats(Some(2))
878+
.with_matched_by_bloom_filter(Some(0))
879+
.with_pruned_by_bloom_filter(Some(1))
880+
.with_expected_rows(0)
881+
.test_row_group_prune()
882+
.await;
883+
884+
RowGroupPruningTest::new()
885+
.with_scenario(Scenario::ByteArray)
886+
.with_query(
887+
"SELECT name, service_binary FROM t WHERE service_binary = CAST('frontend nine' AS bytea)",
888+
)
889+
.with_expected_errors(Some(0))
890+
// false positive on 'all frontends' batch: 'frontend five' < 'frontend nine' < 'frontend two'
891+
// false positive on 'mixed' batch: 'backend one' < 'frontend nine' < 'frontend six'
892+
.with_matched_by_stats(Some(2))
893+
.with_pruned_by_stats(Some(1))
894+
.with_matched_by_bloom_filter(Some(0))
895+
.with_pruned_by_bloom_filter(Some(2))
896+
.with_expected_rows(0)
897+
.test_row_group_prune()
898+
.await;
899+
}
900+
901+
#[tokio::test]
902+
async fn prune_binary_neq() {
903+
RowGroupPruningTest::new()
904+
.with_scenario(Scenario::ByteArray)
905+
.with_query(
906+
"SELECT name, service_binary FROM t WHERE service_binary != CAST('backend one' AS bytea)",
907+
)
908+
.with_expected_errors(Some(0))
909+
.with_matched_by_stats(Some(3))
910+
.with_pruned_by_stats(Some(0))
911+
.with_matched_by_bloom_filter(Some(3))
912+
.with_pruned_by_bloom_filter(Some(0))
913+
.with_expected_rows(14)
914+
.test_row_group_prune()
915+
.await;
916+
}
917+
918+
#[tokio::test]
919+
async fn prune_binary_lt() {
920+
RowGroupPruningTest::new()
921+
.with_scenario(Scenario::ByteArray)
922+
.with_query(
923+
"SELECT name, service_binary FROM t WHERE service_binary < CAST('backend one' AS bytea)",
924+
)
925+
.with_expected_errors(Some(0))
926+
// matches 'all backends' only
927+
.with_matched_by_stats(Some(1))
928+
.with_pruned_by_stats(Some(2))
929+
.with_matched_by_bloom_filter(Some(0))
930+
.with_pruned_by_bloom_filter(Some(0))
931+
.with_expected_rows(3)
932+
.test_row_group_prune()
933+
.await;
934+
935+
RowGroupPruningTest::new()
936+
.with_scenario(Scenario::ByteArray)
937+
.with_query(
938+
"SELECT name, service_binary FROM t WHERE service_binary < CAST('backend zero' AS bytea)",
939+
)
940+
.with_expected_errors(Some(0))
941+
.with_matched_by_stats(Some(2))
942+
.with_pruned_by_stats(Some(1))
943+
.with_matched_by_bloom_filter(Some(0))
944+
.with_pruned_by_bloom_filter(Some(0))
945+
// all backends from 'mixed' and 'all backends'
946+
.with_expected_rows(8)
947+
.test_row_group_prune()
948+
.await;
949+
}
950+
849951
#[tokio::test]
850952
async fn prune_periods_in_column_names() {
851953
// There are three row groups for "service.name", each with 5 rows = 15 rows total

0 commit comments

Comments
 (0)