From f051a282af54647a5c52c221233a28102af0cb83 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 17 Jan 2025 17:11:39 -0500 Subject: [PATCH 01/16] feat: teach RunEndArray NullCount and TrueCount --- encodings/runend/src/array.rs | 201 +++++++++++++++++++++++++++++++--- 1 file changed, 184 insertions(+), 17 deletions(-) diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs index 8a6c8a25080..caa06b8946a 100644 --- a/encodings/runend/src/array.rs +++ b/encodings/runend/src/array.rs @@ -16,7 +16,7 @@ use vortex_array::{ IntoCanonical, }; use vortex_buffer::Buffer; -use vortex_dtype::{DType, PType}; +use vortex_dtype::{match_each_unsigned_integer_ptype, DType, PType}; use vortex_error::{vortex_bail, VortexExpect as _, VortexResult}; use vortex_scalar::Scalar; @@ -227,31 +227,113 @@ impl VisitorVTable for RunEndEncoding { impl StatisticsVTable for RunEndEncoding { fn compute_statistics(&self, array: &RunEndArray, stat: Stat) -> VortexResult { - let maybe_stat = match stat { - Stat::Min | Stat::Max => array.values().statistics().compute(stat), - Stat::IsSorted => Some(Scalar::from( - array - .values() - .statistics() - .compute_is_sorted() - .unwrap_or(false) - && array.logical_validity().all_valid(), - )), - _ => None, + let mut stats = StatsSet::default(); + + match stat { + Stat::Min | Stat::Max => { + if let Some(extrema) = array.values().statistics().compute(stat) { + stats.set(stat, extrema); + } + } + Stat::IsSorted => { + let is_sorted = Scalar::from( + array + .values() + .statistics() + .compute_is_sorted() + .unwrap_or(false) + && array.logical_validity().all_valid(), + ); + stats.set(stat, is_sorted); + } + Stat::TrueCount => match array.dtype() { + DType::Bool(_) => { + let ends = array.ends().into_primitive()?; + let bools = array.values().into_bool()?.boolean_buffer(); + let mut true_count: u64 = 0; + let mut null_count: u64 = 0; + + match array.values().logical_validity() { + LogicalValidity::AllValid(_) => { + null_count = 0; + true_count = match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { + let mut begin = array.offset() as $P; + ends + .as_slice::<$P>() + .iter() + .enumerate() + .map(|(index, end)| { + let len = *end - begin; + begin = *end; + (len as u64) * (bools.value(index as usize) as u64) + }) + .sum() + }); + } + LogicalValidity::AllInvalid(_) => { + null_count = array.len() as u64; + true_count = 0; + } + LogicalValidity::Array(is_valid) => { + let is_valid = is_valid.into_bool()?.boolean_buffer(); + + match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { + let mut begin = array.offset() as $P; + for (index, end) in ends.as_slice::<$P>().iter().enumerate() { + let len = *end - begin; + begin = *end; + true_count += (len as u64) * (bools.value(index as usize) as u64) * (is_valid.value(index as usize) as u64); + null_count += (len as u64) * (is_valid.value(index as usize) as u64); + } + }); + } + }; + + stats.set(Stat::TrueCount, true_count); + stats.set(Stat::NullCount, null_count); + } + DType::Primitive(..) => {} + dtype => vortex_bail!("invalid dtype: {}", dtype), + }, + Stat::NullCount => { + let ends = array.ends().into_primitive()?; + let null_count: u64 = match array.values().logical_validity() { + LogicalValidity::AllValid(_) => 0_u64, + LogicalValidity::AllInvalid(_) => array.len() as u64, + LogicalValidity::Array(is_valid) => { + let is_valid = is_valid.into_bool()?.boolean_buffer(); + match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { + let mut begin = array.offset() as $P; + ends + .as_slice::<$P>() + .iter() + .enumerate() + .map(|(index, end)| { + let len = *end - begin; + begin = *end; + (len as u64) * ((!is_valid.value(index as usize)) as u64) + }) + .sum() + }) + } + }; + stats.set(stat, null_count); + } + _ => {} }; - let mut stats = StatsSet::default(); - if let Some(stat_value) = maybe_stat { - stats.set(stat, stat_value); - } Ok(stats) } } #[cfg(test)] mod tests { - use vortex_array::compute::scalar_at; + use arrow_buffer::BooleanBuffer; + use vortex_array::array::BoolArray; + use vortex_array::compute::{scalar_at, slice}; + use vortex_array::stats::{ArrayStatistics as _, Stat}; use vortex_array::test_harness::check_metadata; + use vortex_array::validity::Validity; use vortex_array::{ArrayDType, ArrayLen, IntoArrayData}; use vortex_buffer::buffer; use vortex_dtype::{DType, Nullability, PType}; @@ -292,4 +374,89 @@ mod tests { assert_eq!(scalar_at(arr.as_ref(), 5).unwrap(), 3.into()); assert_eq!(scalar_at(arr.as_ref(), 9).unwrap(), 3.into()); } + + #[test] + fn test_runend_int_stats() { + let arr = RunEndArray::try_new( + buffer![2u32, 5, 10].into_array(), + buffer![1i32, 2, 3].into_array(), + ) + .unwrap(); + + assert_eq!(arr.statistics().compute_as::(Stat::Min).unwrap(), 1); + assert_eq!(arr.statistics().compute_as::(Stat::Max).unwrap(), 3); + assert_eq!( + arr.statistics().compute_as::(Stat::NullCount).unwrap(), + 0 + ); + assert!(arr.statistics().compute_as::(Stat::IsSorted).unwrap()); + } + + #[test] + fn test_runend_bool_stats() { + let arr = RunEndArray::try_new( + buffer![2u32, 5, 10].into_array(), + BoolArray::try_new( + BooleanBuffer::from_iter([true, true, false]), + Validity::Array(BoolArray::from_iter([true, false, true]).into_array()), + ) + .unwrap() + .into_array(), + ) + .unwrap(); + + assert!(!arr.statistics().compute_as::(Stat::Min).unwrap()); + assert!(arr.statistics().compute_as::(Stat::Max).unwrap()); + assert_eq!( + arr.statistics().compute_as::(Stat::NullCount).unwrap(), + 3 + ); + assert!(!arr.statistics().compute_as::(Stat::IsSorted).unwrap()); + assert_eq!( + arr.statistics().compute_as::(Stat::TrueCount).unwrap(), + 2 + ); + + let sliced = slice(arr, 4, 7).unwrap(); + + assert!(!sliced.statistics().compute_as::(Stat::Min).unwrap()); + assert!(!sliced.statistics().compute_as::(Stat::Max).unwrap()); + assert_eq!( + sliced + .statistics() + .compute_as::(Stat::NullCount) + .unwrap(), + 1 + ); + // Not sorted because null must come last + assert!(!sliced + .statistics() + .compute_as::(Stat::IsSorted) + .unwrap()); + assert_eq!( + sliced + .statistics() + .compute_as::(Stat::TrueCount) + .unwrap(), + 0 + ); + } + + #[test] + fn test_all_invalid_true_count() { + let arr = RunEndArray::try_new( + buffer![2u32, 5, 10].into_array(), + BoolArray::from_iter([None, None, None]).into_array(), + ) + .unwrap() + .into_array(); + assert_eq!( + arr.statistics().compute_as::(Stat::TrueCount).unwrap(), + 0 + ); + assert_eq!( + arr.statistics().compute_as::(Stat::NullCount).unwrap(), + 10 + ); + } } From af059cdee8537c921f858c07a185b2a8b03884ad Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 20 Jan 2025 17:05:17 +0000 Subject: [PATCH 02/16] benchmark --- Cargo.lock | 1 + encodings/runend/Cargo.toml | 5 ++ .../runend/benches/run_end_null_count.rs | 53 +++++++++++++++++++ 3 files changed, 59 insertions(+) create mode 100644 encodings/runend/benches/run_end_null_count.rs diff --git a/Cargo.lock b/Cargo.lock index 5e30f79e4e9..6caf82aba72 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5263,6 +5263,7 @@ dependencies = [ "criterion", "itertools 0.14.0", "num-traits", + "rand", "serde", "vortex-array", "vortex-buffer", diff --git a/encodings/runend/Cargo.toml b/encodings/runend/Cargo.toml index 3fcbae85b3e..e08d80cfba4 100644 --- a/encodings/runend/Cargo.toml +++ b/encodings/runend/Cargo.toml @@ -31,7 +31,12 @@ workspace = true [dev-dependencies] vortex-array = { workspace = true, features = ["test-harness"] } criterion = { workspace = true } +rand = { workspace = true } [[bench]] name = "run_end_filter" harness = false + +[[bench]] +name = "run_end_null_count" +harness = false diff --git a/encodings/runend/benches/run_end_null_count.rs b/encodings/runend/benches/run_end_null_count.rs new file mode 100644 index 00000000000..145fff976d2 --- /dev/null +++ b/encodings/runend/benches/run_end_null_count.rs @@ -0,0 +1,53 @@ +#![allow(clippy::unwrap_used)] + +use std::iter::Iterator; + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng as _}; +use vortex_array::array::PrimitiveArray; +use vortex_array::stats::ArrayStatistics; +use vortex_array::IntoArrayData; +use vortex_buffer::Buffer; +use vortex_runend::RunEndArray; + +const LENS: [usize; 2] = [1000, 100_000]; + +/// Create RunEnd arrays where the runs are equal size, and the null_count mask is evenly spaced. +fn run_end_null_count(c: &mut Criterion) { + let mut rng = StdRng::seed_from_u64(0); + let mut group = c.benchmark_group("run_end_null_count"); + + for &n in LENS.iter().rev() { + for run_step in [1usize << 2, 1 << 4, 1 << 8, 1 << 16] { + let ends = (0..=n) + .step_by(run_step) + .map(|x| x as u64) + .collect::>() + .into_array(); + let run_count = ends.len() - 1; + for valid_density in [0.001, 0.01, 0.1, 0.25, 0.5] { + let values = PrimitiveArray::from_option_iter( + (0..ends.len()).map(|x| rng.gen_bool(valid_density).then_some(x as u64)), + ) + .into_array(); + let array = RunEndArray::try_new(ends.clone(), values) + .unwrap() + .into_array(); + + group.bench_function( + format!( + "null_count_run_end n: {}, run_count: {}, valid_density: {}", + n, run_count, valid_density + ), + |b| { + b.iter(|| black_box(array.statistics().compute_null_count().unwrap())); + }, + ); + } + } + } +} + +criterion_group!(benches, run_end_null_count); +criterion_main!(benches); From ec185684ee57868718e096ad533cfebebcb5747d Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 20 Jan 2025 17:28:46 +0000 Subject: [PATCH 03/16] fixes and fewer desnities --- encodings/runend/benches/run_end_null_count.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/encodings/runend/benches/run_end_null_count.rs b/encodings/runend/benches/run_end_null_count.rs index 145fff976d2..dd7bba5f9c7 100644 --- a/encodings/runend/benches/run_end_null_count.rs +++ b/encodings/runend/benches/run_end_null_count.rs @@ -6,7 +6,7 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion}; use rand::rngs::StdRng; use rand::{Rng, SeedableRng as _}; use vortex_array::array::PrimitiveArray; -use vortex_array::stats::ArrayStatistics; +use vortex_array::stats::Stat; use vortex_array::IntoArrayData; use vortex_buffer::Buffer; use vortex_runend::RunEndArray; @@ -26,7 +26,7 @@ fn run_end_null_count(c: &mut Criterion) { .collect::>() .into_array(); let run_count = ends.len() - 1; - for valid_density in [0.001, 0.01, 0.1, 0.25, 0.5] { + for valid_density in [0.01, 0.1, 0.5] { let values = PrimitiveArray::from_option_iter( (0..ends.len()).map(|x| rng.gen_bool(valid_density).then_some(x as u64)), ) @@ -41,7 +41,14 @@ fn run_end_null_count(c: &mut Criterion) { n, run_count, valid_density ), |b| { - b.iter(|| black_box(array.statistics().compute_null_count().unwrap())); + b.iter(|| { + black_box( + array + .encoding() + .compute_statistics(&array, Stat::NullCount) + .unwrap(), + ) + }); }, ); } From 4ed455e2087577151a47d7503383864e12afc33e Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 20 Jan 2025 17:05:12 +0000 Subject: [PATCH 04/16] iterate the indices --- encodings/runend/src/array.rs | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs index caa06b8946a..ea535b31f19 100644 --- a/encodings/runend/src/array.rs +++ b/encodings/runend/src/array.rs @@ -1,3 +1,4 @@ +use std::cmp; use std::fmt::{Debug, Display}; use serde::{Deserialize, Serialize}; @@ -302,19 +303,29 @@ impl StatisticsVTable for RunEndEncoding { LogicalValidity::AllInvalid(_) => array.len() as u64, LogicalValidity::Array(is_valid) => { let is_valid = is_valid.into_bool()?.boolean_buffer(); - match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { - let mut begin = array.offset() as $P; - ends - .as_slice::<$P>() - .iter() - .enumerate() - .map(|(index, end)| { - let len = *end - begin; - begin = *end; - (len as u64) * ((!is_valid.value(index as usize)) as u64) + let mut is_valid = is_valid.set_indices(); + match is_valid.next() { + None => array.len() as u64, + Some(valid_index) => { + let offsetted_len = (array.len() + array.offset()) as u64; + let mut null_count: u64 = array.len() as u64; + match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { + let ends = ends.as_slice::<$P>(); + let begin = if valid_index == 0 { + 0 + } else { + ends[valid_index - 1] + }; + null_count -= cmp::min(ends[valid_index] as u64, offsetted_len) - begin as u64; + + for valid_index in is_valid { + null_count -= cmp::min(ends[valid_index] as u64, offsetted_len) - ends[valid_index - 1] as u64; + } + + null_count }) - .sum() - }) + } + } } }; stats.set(stat, null_count); From edc2bc23696ec8bb0c534d82b2921f48cd59a3ee Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 20 Jan 2025 17:47:46 +0000 Subject: [PATCH 05/16] always iterate the validity --- encodings/runend/src/array.rs | 88 ++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 42 deletions(-) diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs index ea535b31f19..053652f79e5 100644 --- a/encodings/runend/src/array.rs +++ b/encodings/runend/src/array.rs @@ -230,34 +230,24 @@ impl StatisticsVTable for RunEndEncoding { fn compute_statistics(&self, array: &RunEndArray, stat: Stat) -> VortexResult { let mut stats = StatsSet::default(); - match stat { - Stat::Min | Stat::Max => { - if let Some(extrema) = array.values().statistics().compute(stat) { - stats.set(stat, extrema); - } - } - Stat::IsSorted => { - let is_sorted = Scalar::from( - array - .values() - .statistics() - .compute_is_sorted() - .unwrap_or(false) - && array.logical_validity().all_valid(), - ); - stats.set(stat, is_sorted); - } + let value = match stat { + Stat::Min | Stat::Max => array.values().statistics().compute(stat), + Stat::IsSorted => Some(Scalar::from( + array + .values() + .statistics() + .compute_is_sorted() + .unwrap_or(false) + && array.logical_validity().all_valid(), + )), Stat::TrueCount => match array.dtype() { DType::Bool(_) => { let ends = array.ends().into_primitive()?; let bools = array.values().into_bool()?.boolean_buffer(); - let mut true_count: u64 = 0; - let mut null_count: u64 = 0; - match array.values().logical_validity() { + let true_count = match array.values().logical_validity() { LogicalValidity::AllValid(_) => { - null_count = 0; - true_count = match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { + match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { let mut begin = array.offset() as $P; ends .as_slice::<$P>() @@ -269,36 +259,46 @@ impl StatisticsVTable for RunEndEncoding { (len as u64) * (bools.value(index as usize) as u64) }) .sum() - }); - } - LogicalValidity::AllInvalid(_) => { - null_count = array.len() as u64; - true_count = 0; + }) } + LogicalValidity::AllInvalid(_) => 0, LogicalValidity::Array(is_valid) => { let is_valid = is_valid.into_bool()?.boolean_buffer(); - - match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { - let mut begin = array.offset() as $P; - for (index, end) in ends.as_slice::<$P>().iter().enumerate() { - let len = *end - begin; - begin = *end; - true_count += (len as u64) * (bools.value(index as usize) as u64) * (is_valid.value(index as usize) as u64); - null_count += (len as u64) * (is_valid.value(index as usize) as u64); + let mut is_valid = is_valid.set_indices(); + match is_valid.next() { + None => array.len() as u64, + Some(valid_index) => { + let offsetted_len = (array.len() + array.offset()) as u64; + let mut true_count: u64 = array.len() as u64; + match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { + let ends = ends.as_slice::<$P>(); + let begin = if valid_index == 0 { + 0 + } else { + ends[valid_index - 1] + }; + + true_count += bools.value(valid_index as usize) as u64 * (cmp::min(ends[valid_index] as u64, offsetted_len) - begin as u64); + + for valid_index in is_valid { + true_count += bools.value(valid_index as usize) as u64 * (cmp::min(ends[valid_index] as u64, offsetted_len) - ends[valid_index - 1] as u64); + } + + true_count + }) } - }); + } } }; - stats.set(Stat::TrueCount, true_count); - stats.set(Stat::NullCount, null_count); + Some(Scalar::from(true_count)) } - DType::Primitive(..) => {} + DType::Primitive(..) => None, dtype => vortex_bail!("invalid dtype: {}", dtype), }, Stat::NullCount => { let ends = array.ends().into_primitive()?; - let null_count: u64 = match array.values().logical_validity() { + let null_count = match array.values().logical_validity() { LogicalValidity::AllValid(_) => 0_u64, LogicalValidity::AllInvalid(_) => array.len() as u64, LogicalValidity::Array(is_valid) => { @@ -328,9 +328,13 @@ impl StatisticsVTable for RunEndEncoding { } } }; - stats.set(stat, null_count); + Some(Scalar::from(null_count)) } - _ => {} + _ => None, + }; + + if let Some(value) = value { + stats.set(stat, value) }; Ok(stats) From e1bb6800321bc283aa11e98b96c13d843c5b1daa Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 21 Jan 2025 10:35:39 +0000 Subject: [PATCH 06/16] extract runend statistics into functions --- encodings/runend/src/array.rs | 185 ++++++++++++++++++---------------- 1 file changed, 97 insertions(+), 88 deletions(-) diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs index 053652f79e5..428b647a728 100644 --- a/encodings/runend/src/array.rs +++ b/encodings/runend/src/array.rs @@ -1,6 +1,7 @@ use std::cmp; use std::fmt::{Debug, Display}; +use itertools::Itertools; use serde::{Deserialize, Serialize}; use vortex_array::array::PrimitiveArray; use vortex_array::compute::{ @@ -230,7 +231,7 @@ impl StatisticsVTable for RunEndEncoding { fn compute_statistics(&self, array: &RunEndArray, stat: Stat) -> VortexResult { let mut stats = StatsSet::default(); - let value = match stat { + let maybe_stat = match stat { Stat::Min | Stat::Max => array.values().statistics().compute(stat), Stat::IsSorted => Some(Scalar::from( array @@ -241,99 +242,15 @@ impl StatisticsVTable for RunEndEncoding { && array.logical_validity().all_valid(), )), Stat::TrueCount => match array.dtype() { - DType::Bool(_) => { - let ends = array.ends().into_primitive()?; - let bools = array.values().into_bool()?.boolean_buffer(); - - let true_count = match array.values().logical_validity() { - LogicalValidity::AllValid(_) => { - match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { - let mut begin = array.offset() as $P; - ends - .as_slice::<$P>() - .iter() - .enumerate() - .map(|(index, end)| { - let len = *end - begin; - begin = *end; - (len as u64) * (bools.value(index as usize) as u64) - }) - .sum() - }) - } - LogicalValidity::AllInvalid(_) => 0, - LogicalValidity::Array(is_valid) => { - let is_valid = is_valid.into_bool()?.boolean_buffer(); - let mut is_valid = is_valid.set_indices(); - match is_valid.next() { - None => array.len() as u64, - Some(valid_index) => { - let offsetted_len = (array.len() + array.offset()) as u64; - let mut true_count: u64 = array.len() as u64; - match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { - let ends = ends.as_slice::<$P>(); - let begin = if valid_index == 0 { - 0 - } else { - ends[valid_index - 1] - }; - - true_count += bools.value(valid_index as usize) as u64 * (cmp::min(ends[valid_index] as u64, offsetted_len) - begin as u64); - - for valid_index in is_valid { - true_count += bools.value(valid_index as usize) as u64 * (cmp::min(ends[valid_index] as u64, offsetted_len) - ends[valid_index - 1] as u64); - } - - true_count - }) - } - } - } - }; - - Some(Scalar::from(true_count)) - } + DType::Bool(_) => Some(Scalar::from(array.true_count()?)), DType::Primitive(..) => None, dtype => vortex_bail!("invalid dtype: {}", dtype), }, - Stat::NullCount => { - let ends = array.ends().into_primitive()?; - let null_count = match array.values().logical_validity() { - LogicalValidity::AllValid(_) => 0_u64, - LogicalValidity::AllInvalid(_) => array.len() as u64, - LogicalValidity::Array(is_valid) => { - let is_valid = is_valid.into_bool()?.boolean_buffer(); - let mut is_valid = is_valid.set_indices(); - match is_valid.next() { - None => array.len() as u64, - Some(valid_index) => { - let offsetted_len = (array.len() + array.offset()) as u64; - let mut null_count: u64 = array.len() as u64; - match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { - let ends = ends.as_slice::<$P>(); - let begin = if valid_index == 0 { - 0 - } else { - ends[valid_index - 1] - }; - null_count -= cmp::min(ends[valid_index] as u64, offsetted_len) - begin as u64; - - for valid_index in is_valid { - null_count -= cmp::min(ends[valid_index] as u64, offsetted_len) - ends[valid_index - 1] as u64; - } - - null_count - }) - } - } - } - }; - Some(Scalar::from(null_count)) - } + Stat::NullCount => Some(Scalar::from(array.null_count()?)), _ => None, }; - if let Some(value) = value { + if let Some(value) = maybe_stat { stats.set(stat, value) }; @@ -341,6 +258,98 @@ impl StatisticsVTable for RunEndEncoding { } } +impl RunEndArray { + fn true_count(&self) -> VortexResult { + let ends = self.ends().into_primitive()?; + let bools = self.values().into_bool()?.boolean_buffer(); + + Ok(match self.values().logical_validity() { + LogicalValidity::AllValid(_) => { + match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { + let mut begin = self.offset() as $P; + ends + .as_slice::<$P>() + .iter() + .zip_eq(bools.into_iter()) + .map(|(end, bool_value)| { + let len = *end - begin; + begin = *end; + (len as u64) * (bool_value as u64) + }) + .sum() + }) + } + LogicalValidity::AllInvalid(_) => 0, + LogicalValidity::Array(is_valid) => { + let is_valid = is_valid.into_bool()?.boolean_buffer(); + let mut is_valid = is_valid.set_indices(); + match is_valid.next() { + None => self.len() as u64, + Some(valid_index) => { + let offsetted_len = (self.len() + self.offset()) as u64; + let mut true_count: u64 = 0; + match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { + let ends = ends.as_slice::<$P>(); + println!("{} {} {} {:?}", offsetted_len, true_count, valid_index, ends); + let begin = if valid_index == 0 { + 0 + } else { + ends[valid_index - 1] + }; + + let end = cmp::min(ends[valid_index] as u64, offsetted_len); + true_count += bools.value(valid_index as usize) as u64 * (end - begin as u64); + + for valid_index in is_valid { + println!("{} {} {}", offsetted_len, true_count, valid_index); + let end = cmp::min(ends[valid_index] as u64, offsetted_len); + true_count += bools.value(valid_index as usize) as u64 * (end - ends[valid_index - 1] as u64); + } + + true_count + }) + } + } + } + }) + } + + fn null_count(&self) -> VortexResult { + let ends = self.ends().into_primitive()?; + let null_count = match self.values().logical_validity() { + LogicalValidity::AllValid(_) => 0_u64, + LogicalValidity::AllInvalid(_) => self.len() as u64, + LogicalValidity::Array(is_valid) => { + let is_valid = is_valid.into_bool()?.boolean_buffer(); + let mut is_valid = is_valid.set_indices(); + match is_valid.next() { + None => self.len() as u64, + Some(valid_index) => { + let offsetted_len = (self.len() + self.offset()) as u64; + let mut null_count: u64 = self.len() as u64; + match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { + let ends = ends.as_slice::<$P>(); + let begin = if valid_index == 0 { + 0 + } else { + ends[valid_index - 1] + }; + null_count -= cmp::min(ends[valid_index] as u64, offsetted_len) - begin as u64; + + for valid_index in is_valid { + null_count -= cmp::min(ends[valid_index] as u64, offsetted_len) - ends[valid_index - 1] as u64; + } + + null_count + }) + } + } + } + }; + Ok(null_count) + } +} + #[cfg(test)] mod tests { use arrow_buffer::BooleanBuffer; From 687b3b1edf41e13fe907d344414bac114234f8c8 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 21 Jan 2025 10:37:45 +0000 Subject: [PATCH 07/16] cleanup --- encodings/runend/src/array.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs index 428b647a728..cbc556d5493 100644 --- a/encodings/runend/src/array.rs +++ b/encodings/runend/src/array.rs @@ -243,8 +243,7 @@ impl StatisticsVTable for RunEndEncoding { )), Stat::TrueCount => match array.dtype() { DType::Bool(_) => Some(Scalar::from(array.true_count()?)), - DType::Primitive(..) => None, - dtype => vortex_bail!("invalid dtype: {}", dtype), + _ => None, }, Stat::NullCount => Some(Scalar::from(array.null_count()?)), _ => None, @@ -290,7 +289,6 @@ impl RunEndArray { let mut true_count: u64 = 0; match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { let ends = ends.as_slice::<$P>(); - println!("{} {} {} {:?}", offsetted_len, true_count, valid_index, ends); let begin = if valid_index == 0 { 0 } else { @@ -301,7 +299,6 @@ impl RunEndArray { true_count += bools.value(valid_index as usize) as u64 * (end - begin as u64); for valid_index in is_valid { - println!("{} {} {}", offsetted_len, true_count, valid_index); let end = cmp::min(ends[valid_index] as u64, offsetted_len); true_count += bools.value(valid_index as usize) as u64 * (end - ends[valid_index - 1] as u64); } From 7501df652ad21acc4b142c655bcbbcfb3595a211 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 21 Jan 2025 10:38:47 +0000 Subject: [PATCH 08/16] smaller diff --- encodings/runend/src/array.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs index cbc556d5493..3d57273a92f 100644 --- a/encodings/runend/src/array.rs +++ b/encodings/runend/src/array.rs @@ -229,8 +229,6 @@ impl VisitorVTable for RunEndEncoding { impl StatisticsVTable for RunEndEncoding { fn compute_statistics(&self, array: &RunEndArray, stat: Stat) -> VortexResult { - let mut stats = StatsSet::default(); - let maybe_stat = match stat { Stat::Min | Stat::Max => array.values().statistics().compute(stat), Stat::IsSorted => Some(Scalar::from( @@ -249,6 +247,7 @@ impl StatisticsVTable for RunEndEncoding { _ => None, }; + let mut stats = StatsSet::default(); if let Some(value) = maybe_stat { stats.set(stat, value) }; From e59c5b6311f1f06351d6d5c1d94baf227c119bcb Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 21 Jan 2025 10:39:08 +0000 Subject: [PATCH 09/16] smaller diff --- encodings/runend/src/array.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs index 3d57273a92f..57bc5da8805 100644 --- a/encodings/runend/src/array.rs +++ b/encodings/runend/src/array.rs @@ -250,8 +250,7 @@ impl StatisticsVTable for RunEndEncoding { let mut stats = StatsSet::default(); if let Some(value) = maybe_stat { stats.set(stat, value) - }; - + } Ok(stats) } } From 87fd08fbed28019fd7178fa9347de8189bd67fc0 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 21 Jan 2025 10:46:00 +0000 Subject: [PATCH 10/16] fixed up --- encodings/runend/src/array.rs | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs index 57bc5da8805..07bf0f3cb2f 100644 --- a/encodings/runend/src/array.rs +++ b/encodings/runend/src/array.rs @@ -248,8 +248,8 @@ impl StatisticsVTable for RunEndEncoding { }; let mut stats = StatsSet::default(); - if let Some(value) = maybe_stat { - stats.set(stat, value) + if let Some(stat_value) = maybe_stat { + stats.set(stat, stat_value); } Ok(stats) } @@ -283,22 +283,23 @@ impl RunEndArray { match is_valid.next() { None => self.len() as u64, Some(valid_index) => { - let offsetted_len = (self.len() + self.offset()) as u64; let mut true_count: u64 = 0; match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { + let offsetted_begin = <$P>::try_from(self.offset())?; + let offsetted_len = <$P>::try_from(self.len() + self.offset())?; let ends = ends.as_slice::<$P>(); let begin = if valid_index == 0 { - 0 + offsetted_begin } else { ends[valid_index - 1] }; - let end = cmp::min(ends[valid_index] as u64, offsetted_len); - true_count += bools.value(valid_index as usize) as u64 * (end - begin as u64); + let end = cmp::min(ends[valid_index], offsetted_len); + true_count += bools.value(valid_index as usize) as u64 * (end - begin) as u64; for valid_index in is_valid { - let end = cmp::min(ends[valid_index] as u64, offsetted_len); - true_count += bools.value(valid_index as usize) as u64 * (end - ends[valid_index - 1] as u64); + let end = cmp::min(ends[valid_index], offsetted_len); + true_count += bools.value(valid_index as usize) as u64 * (end - ends[valid_index - 1]) as u64; } true_count From e76d1a541eaca44b2a55bcc21819052054d48c43 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 21 Jan 2025 11:31:49 +0000 Subject: [PATCH 11/16] move statistics out --- encodings/runend/src/array.rs | 217 +-------------------------- encodings/runend/src/lib.rs | 1 + encodings/runend/src/statistics.rs | 229 +++++++++++++++++++++++++++++ 3 files changed, 233 insertions(+), 214 deletions(-) create mode 100644 encodings/runend/src/statistics.rs diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs index 07bf0f3cb2f..cd3ee055be9 100644 --- a/encodings/runend/src/array.rs +++ b/encodings/runend/src/array.rs @@ -1,14 +1,12 @@ -use std::cmp; use std::fmt::{Debug, Display}; -use itertools::Itertools; use serde::{Deserialize, Serialize}; use vortex_array::array::PrimitiveArray; use vortex_array::compute::{ scalar_at, search_sorted_usize, search_sorted_usize_many, SearchSortedSide, }; use vortex_array::encoding::ids; -use vortex_array::stats::{ArrayStatistics, Stat, StatisticsVTable, StatsSet}; +use vortex_array::stats::{ArrayStatistics, StatsSet}; use vortex_array::validate::ValidateVTable; use vortex_array::validity::{ArrayValidity, LogicalValidity, ValidityVTable}; use vortex_array::variants::{BoolArrayTrait, PrimitiveArrayTrait, VariantsVTable}; @@ -18,9 +16,8 @@ use vortex_array::{ IntoCanonical, }; use vortex_buffer::Buffer; -use vortex_dtype::{match_each_unsigned_integer_ptype, DType, PType}; +use vortex_dtype::{DType, PType}; use vortex_error::{vortex_bail, VortexExpect as _, VortexResult}; -use vortex_scalar::Scalar; use crate::compress::{runend_decode_bools, runend_decode_primitive, runend_encode}; @@ -227,133 +224,10 @@ impl VisitorVTable for RunEndEncoding { } } -impl StatisticsVTable for RunEndEncoding { - fn compute_statistics(&self, array: &RunEndArray, stat: Stat) -> VortexResult { - let maybe_stat = match stat { - Stat::Min | Stat::Max => array.values().statistics().compute(stat), - Stat::IsSorted => Some(Scalar::from( - array - .values() - .statistics() - .compute_is_sorted() - .unwrap_or(false) - && array.logical_validity().all_valid(), - )), - Stat::TrueCount => match array.dtype() { - DType::Bool(_) => Some(Scalar::from(array.true_count()?)), - _ => None, - }, - Stat::NullCount => Some(Scalar::from(array.null_count()?)), - _ => None, - }; - - let mut stats = StatsSet::default(); - if let Some(stat_value) = maybe_stat { - stats.set(stat, stat_value); - } - Ok(stats) - } -} - -impl RunEndArray { - fn true_count(&self) -> VortexResult { - let ends = self.ends().into_primitive()?; - let bools = self.values().into_bool()?.boolean_buffer(); - - Ok(match self.values().logical_validity() { - LogicalValidity::AllValid(_) => { - match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { - let mut begin = self.offset() as $P; - ends - .as_slice::<$P>() - .iter() - .zip_eq(bools.into_iter()) - .map(|(end, bool_value)| { - let len = *end - begin; - begin = *end; - (len as u64) * (bool_value as u64) - }) - .sum() - }) - } - LogicalValidity::AllInvalid(_) => 0, - LogicalValidity::Array(is_valid) => { - let is_valid = is_valid.into_bool()?.boolean_buffer(); - let mut is_valid = is_valid.set_indices(); - match is_valid.next() { - None => self.len() as u64, - Some(valid_index) => { - let mut true_count: u64 = 0; - match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { - let offsetted_begin = <$P>::try_from(self.offset())?; - let offsetted_len = <$P>::try_from(self.len() + self.offset())?; - let ends = ends.as_slice::<$P>(); - let begin = if valid_index == 0 { - offsetted_begin - } else { - ends[valid_index - 1] - }; - - let end = cmp::min(ends[valid_index], offsetted_len); - true_count += bools.value(valid_index as usize) as u64 * (end - begin) as u64; - - for valid_index in is_valid { - let end = cmp::min(ends[valid_index], offsetted_len); - true_count += bools.value(valid_index as usize) as u64 * (end - ends[valid_index - 1]) as u64; - } - - true_count - }) - } - } - } - }) - } - - fn null_count(&self) -> VortexResult { - let ends = self.ends().into_primitive()?; - let null_count = match self.values().logical_validity() { - LogicalValidity::AllValid(_) => 0_u64, - LogicalValidity::AllInvalid(_) => self.len() as u64, - LogicalValidity::Array(is_valid) => { - let is_valid = is_valid.into_bool()?.boolean_buffer(); - let mut is_valid = is_valid.set_indices(); - match is_valid.next() { - None => self.len() as u64, - Some(valid_index) => { - let offsetted_len = (self.len() + self.offset()) as u64; - let mut null_count: u64 = self.len() as u64; - match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { - let ends = ends.as_slice::<$P>(); - let begin = if valid_index == 0 { - 0 - } else { - ends[valid_index - 1] - }; - null_count -= cmp::min(ends[valid_index] as u64, offsetted_len) - begin as u64; - - for valid_index in is_valid { - null_count -= cmp::min(ends[valid_index] as u64, offsetted_len) - ends[valid_index - 1] as u64; - } - - null_count - }) - } - } - } - }; - Ok(null_count) - } -} - #[cfg(test)] mod tests { - use arrow_buffer::BooleanBuffer; - use vortex_array::array::BoolArray; - use vortex_array::compute::{scalar_at, slice}; - use vortex_array::stats::{ArrayStatistics as _, Stat}; + use vortex_array::compute::scalar_at; use vortex_array::test_harness::check_metadata; - use vortex_array::validity::Validity; use vortex_array::{ArrayDType, ArrayLen, IntoArrayData}; use vortex_buffer::buffer; use vortex_dtype::{DType, Nullability, PType}; @@ -394,89 +268,4 @@ mod tests { assert_eq!(scalar_at(arr.as_ref(), 5).unwrap(), 3.into()); assert_eq!(scalar_at(arr.as_ref(), 9).unwrap(), 3.into()); } - - #[test] - fn test_runend_int_stats() { - let arr = RunEndArray::try_new( - buffer![2u32, 5, 10].into_array(), - buffer![1i32, 2, 3].into_array(), - ) - .unwrap(); - - assert_eq!(arr.statistics().compute_as::(Stat::Min).unwrap(), 1); - assert_eq!(arr.statistics().compute_as::(Stat::Max).unwrap(), 3); - assert_eq!( - arr.statistics().compute_as::(Stat::NullCount).unwrap(), - 0 - ); - assert!(arr.statistics().compute_as::(Stat::IsSorted).unwrap()); - } - - #[test] - fn test_runend_bool_stats() { - let arr = RunEndArray::try_new( - buffer![2u32, 5, 10].into_array(), - BoolArray::try_new( - BooleanBuffer::from_iter([true, true, false]), - Validity::Array(BoolArray::from_iter([true, false, true]).into_array()), - ) - .unwrap() - .into_array(), - ) - .unwrap(); - - assert!(!arr.statistics().compute_as::(Stat::Min).unwrap()); - assert!(arr.statistics().compute_as::(Stat::Max).unwrap()); - assert_eq!( - arr.statistics().compute_as::(Stat::NullCount).unwrap(), - 3 - ); - assert!(!arr.statistics().compute_as::(Stat::IsSorted).unwrap()); - assert_eq!( - arr.statistics().compute_as::(Stat::TrueCount).unwrap(), - 2 - ); - - let sliced = slice(arr, 4, 7).unwrap(); - - assert!(!sliced.statistics().compute_as::(Stat::Min).unwrap()); - assert!(!sliced.statistics().compute_as::(Stat::Max).unwrap()); - assert_eq!( - sliced - .statistics() - .compute_as::(Stat::NullCount) - .unwrap(), - 1 - ); - // Not sorted because null must come last - assert!(!sliced - .statistics() - .compute_as::(Stat::IsSorted) - .unwrap()); - assert_eq!( - sliced - .statistics() - .compute_as::(Stat::TrueCount) - .unwrap(), - 0 - ); - } - - #[test] - fn test_all_invalid_true_count() { - let arr = RunEndArray::try_new( - buffer![2u32, 5, 10].into_array(), - BoolArray::from_iter([None, None, None]).into_array(), - ) - .unwrap() - .into_array(); - assert_eq!( - arr.statistics().compute_as::(Stat::TrueCount).unwrap(), - 0 - ); - assert_eq!( - arr.statistics().compute_as::(Stat::NullCount).unwrap(), - 10 - ); - } } diff --git a/encodings/runend/src/lib.rs b/encodings/runend/src/lib.rs index e601f67973e..d4a77c993ce 100644 --- a/encodings/runend/src/lib.rs +++ b/encodings/runend/src/lib.rs @@ -4,6 +4,7 @@ mod array; pub mod compress; mod compute; mod iter; +mod statistics; #[doc(hidden)] pub mod _benchmarking { diff --git a/encodings/runend/src/statistics.rs b/encodings/runend/src/statistics.rs new file mode 100644 index 00000000000..35b665478ed --- /dev/null +++ b/encodings/runend/src/statistics.rs @@ -0,0 +1,229 @@ +use std::cmp; + +use itertools::Itertools; +use vortex_array::stats::{ArrayStatistics as _, Stat, StatisticsVTable, StatsSet}; +use vortex_array::validity::{ArrayValidity as _, LogicalValidity}; +use vortex_array::variants::PrimitiveArrayTrait; +use vortex_array::{ArrayDType as _, ArrayLen as _, IntoArrayVariant as _}; +use vortex_dtype::{match_each_unsigned_integer_ptype, DType}; +use vortex_error::VortexResult; +use vortex_scalar::Scalar; + +use crate::{RunEndArray, RunEndEncoding}; + +impl StatisticsVTable for RunEndEncoding { + fn compute_statistics(&self, array: &RunEndArray, stat: Stat) -> VortexResult { + let maybe_stat = match stat { + Stat::Min | Stat::Max => array.values().statistics().compute(stat), + Stat::IsSorted => Some(Scalar::from( + array + .values() + .statistics() + .compute_is_sorted() + .unwrap_or(false) + && array.logical_validity().all_valid(), + )), + Stat::TrueCount => match array.dtype() { + DType::Bool(_) => Some(Scalar::from(array.true_count()?)), + _ => None, + }, + Stat::NullCount => Some(Scalar::from(array.null_count()?)), + _ => None, + }; + + let mut stats = StatsSet::default(); + if let Some(stat_value) = maybe_stat { + stats.set(stat, stat_value); + } + Ok(stats) + } +} + +impl RunEndArray { + fn true_count(&self) -> VortexResult { + let ends = self.ends().into_primitive()?; + let bools = self.values().into_bool()?.boolean_buffer(); + + Ok(match self.values().logical_validity() { + LogicalValidity::AllValid(_) => { + match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { + let mut begin = self.offset() as $P; + ends + .as_slice::<$P>() + .iter() + .zip_eq(bools.into_iter()) + .map(|(end, bool_value)| { + let len = *end - begin; + begin = *end; + (len as u64) * (bool_value as u64) + }) + .sum() + }) + } + LogicalValidity::AllInvalid(_) => 0, + LogicalValidity::Array(is_valid) => { + let is_valid = is_valid.into_bool()?.boolean_buffer(); + let mut is_valid = is_valid.set_indices(); + match is_valid.next() { + None => self.len() as u64, + Some(valid_index) => { + let mut true_count: u64 = 0; + match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { + let offsetted_begin = <$P>::try_from(self.offset())?; + let offsetted_len = <$P>::try_from(self.len() + self.offset())?; + let ends = ends.as_slice::<$P>(); + let begin = if valid_index == 0 { + offsetted_begin + } else { + ends[valid_index - 1] + }; + + let end = cmp::min(ends[valid_index], offsetted_len); + true_count += bools.value(valid_index as usize) as u64 * (end - begin) as u64; + + for valid_index in is_valid { + let end = cmp::min(ends[valid_index], offsetted_len); + true_count += bools.value(valid_index as usize) as u64 * (end - ends[valid_index - 1]) as u64; + } + + true_count + }) + } + } + } + }) + } + + fn null_count(&self) -> VortexResult { + let ends = self.ends().into_primitive()?; + let null_count = match self.values().logical_validity() { + LogicalValidity::AllValid(_) => 0_u64, + LogicalValidity::AllInvalid(_) => self.len() as u64, + LogicalValidity::Array(is_valid) => { + let is_valid = is_valid.into_bool()?.boolean_buffer(); + let mut is_valid = is_valid.set_indices(); + match is_valid.next() { + None => self.len() as u64, + Some(valid_index) => { + let offsetted_len = (self.len() + self.offset()) as u64; + let mut null_count: u64 = self.len() as u64; + match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { + let ends = ends.as_slice::<$P>(); + let begin = if valid_index == 0 { + 0 + } else { + ends[valid_index - 1] + }; + null_count -= cmp::min(ends[valid_index] as u64, offsetted_len) - begin as u64; + + for valid_index in is_valid { + null_count -= cmp::min(ends[valid_index] as u64, offsetted_len) - ends[valid_index - 1] as u64; + } + + null_count + }) + } + } + } + }; + Ok(null_count) + } +} + +#[cfg(test)] +mod tests { + use arrow_buffer::BooleanBuffer; + use vortex_array::array::BoolArray; + use vortex_array::compute::slice; + use vortex_array::stats::{ArrayStatistics as _, Stat}; + use vortex_array::validity::Validity; + use vortex_array::IntoArrayData; + use vortex_buffer::buffer; + + use crate::RunEndArray; + + #[test] + fn test_runend_int_stats() { + let arr = RunEndArray::try_new( + buffer![2u32, 5, 10].into_array(), + buffer![1i32, 2, 3].into_array(), + ) + .unwrap(); + + assert_eq!(arr.statistics().compute_as::(Stat::Min).unwrap(), 1); + assert_eq!(arr.statistics().compute_as::(Stat::Max).unwrap(), 3); + assert_eq!( + arr.statistics().compute_as::(Stat::NullCount).unwrap(), + 0 + ); + assert!(arr.statistics().compute_as::(Stat::IsSorted).unwrap()); + } + + #[test] + fn test_runend_bool_stats() { + let arr = RunEndArray::try_new( + buffer![2u32, 5, 10].into_array(), + BoolArray::try_new( + BooleanBuffer::from_iter([true, true, false]), + Validity::Array(BoolArray::from_iter([true, false, true]).into_array()), + ) + .unwrap() + .into_array(), + ) + .unwrap(); + + assert!(!arr.statistics().compute_as::(Stat::Min).unwrap()); + assert!(arr.statistics().compute_as::(Stat::Max).unwrap()); + assert_eq!( + arr.statistics().compute_as::(Stat::NullCount).unwrap(), + 3 + ); + assert!(!arr.statistics().compute_as::(Stat::IsSorted).unwrap()); + assert_eq!( + arr.statistics().compute_as::(Stat::TrueCount).unwrap(), + 2 + ); + + let sliced = slice(arr, 4, 7).unwrap(); + + assert!(!sliced.statistics().compute_as::(Stat::Min).unwrap()); + assert!(!sliced.statistics().compute_as::(Stat::Max).unwrap()); + assert_eq!( + sliced + .statistics() + .compute_as::(Stat::NullCount) + .unwrap(), + 1 + ); + // Not sorted because null must come last + assert!(!sliced + .statistics() + .compute_as::(Stat::IsSorted) + .unwrap()); + assert_eq!( + sliced + .statistics() + .compute_as::(Stat::TrueCount) + .unwrap(), + 0 + ); + } + + #[test] + fn test_all_invalid_true_count() { + let arr = RunEndArray::try_new( + buffer![2u32, 5, 10].into_array(), + BoolArray::from_iter([None, None, None]).into_array(), + ) + .unwrap() + .into_array(); + assert_eq!( + arr.statistics().compute_as::(Stat::TrueCount).unwrap(), + 0 + ); + assert_eq!( + arr.statistics().compute_as::(Stat::NullCount).unwrap(), + 10 + ); + } +} From 7bba20795a54e57917b94ad0c94acb397e236c7c Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 21 Jan 2025 11:50:20 +0000 Subject: [PATCH 12/16] bind the end --- encodings/runend/src/statistics.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/encodings/runend/src/statistics.rs b/encodings/runend/src/statistics.rs index 35b665478ed..b4bc1a587dd 100644 --- a/encodings/runend/src/statistics.rs +++ b/encodings/runend/src/statistics.rs @@ -114,10 +114,13 @@ impl RunEndArray { } else { ends[valid_index - 1] }; - null_count -= cmp::min(ends[valid_index] as u64, offsetted_len) - begin as u64; + + let end = cmp::min(ends[valid_index] as u64, offsetted_len); + null_count -= end - begin as u64; for valid_index in is_valid { - null_count -= cmp::min(ends[valid_index] as u64, offsetted_len) - ends[valid_index - 1] as u64; + let end = cmp::min(ends[valid_index] as u64, offsetted_len); + null_count -= end - ends[valid_index - 1] as u64; } null_count From b24b7b7f6d7756cd0cf283ee8da7b1b7624186c6 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 21 Jan 2025 16:27:03 +0000 Subject: [PATCH 13/16] typed functions instead of macro code --- encodings/runend/src/statistics.rs | 155 +++++++++++++++++------------ 1 file changed, 93 insertions(+), 62 deletions(-) diff --git a/encodings/runend/src/statistics.rs b/encodings/runend/src/statistics.rs index b4bc1a587dd..77536aa982c 100644 --- a/encodings/runend/src/statistics.rs +++ b/encodings/runend/src/statistics.rs @@ -1,12 +1,14 @@ use std::cmp; +use arrow_buffer::BooleanBuffer; use itertools::Itertools; +use vortex_array::array::PrimitiveArray; use vortex_array::stats::{ArrayStatistics as _, Stat, StatisticsVTable, StatsSet}; use vortex_array::validity::{ArrayValidity as _, LogicalValidity}; use vortex_array::variants::PrimitiveArrayTrait; use vortex_array::{ArrayDType as _, ArrayLen as _, IntoArrayVariant as _}; -use vortex_dtype::{match_each_unsigned_integer_ptype, DType}; -use vortex_error::VortexResult; +use vortex_dtype::{match_each_unsigned_integer_ptype, DType, NativePType}; +use vortex_error::{VortexExpect as _, VortexResult}; use vortex_scalar::Scalar; use crate::{RunEndArray, RunEndEncoding}; @@ -42,23 +44,34 @@ impl StatisticsVTable for RunEndEncoding { impl RunEndArray { fn true_count(&self) -> VortexResult { let ends = self.ends().into_primitive()?; - let bools = self.values().into_bool()?.boolean_buffer(); + let values = self.values().into_bool()?.boolean_buffer(); + match_each_unsigned_integer_ptype!(ends.ptype(), |$P| self.typed_true_count::<$P>(ends, values)) + } + + fn typed_true_count( + &self, + decompressed_ends: PrimitiveArray, + decompressed_values: BooleanBuffer, + ) -> VortexResult + where + u64: From

, + u64: From, + { Ok(match self.values().logical_validity() { LogicalValidity::AllValid(_) => { - match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { - let mut begin = self.offset() as $P; - ends - .as_slice::<$P>() - .iter() - .zip_eq(bools.into_iter()) - .map(|(end, bool_value)| { - let len = *end - begin; - begin = *end; - (len as u64) * (bool_value as u64) - }) - .sum() - }) + let mut begin = u64::try_from(self.offset()).vortex_expect("usize fits in u64"); + decompressed_ends + .as_slice::

() + .iter() + .zip_eq(&decompressed_values) + .map(|(end, bool_value)| { + let end = u64::from(*end); + let len = end - begin; + begin = end; + len * u64::from(bool_value) + }) + .sum() } LogicalValidity::AllInvalid(_) => 0, LogicalValidity::Array(is_valid) => { @@ -68,26 +81,32 @@ impl RunEndArray { None => self.len() as u64, Some(valid_index) => { let mut true_count: u64 = 0; - match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { - let offsetted_begin = <$P>::try_from(self.offset())?; - let offsetted_len = <$P>::try_from(self.len() + self.offset())?; - let ends = ends.as_slice::<$P>(); - let begin = if valid_index == 0 { - offsetted_begin - } else { - ends[valid_index - 1] - }; - - let end = cmp::min(ends[valid_index], offsetted_len); - true_count += bools.value(valid_index as usize) as u64 * (end - begin) as u64; - - for valid_index in is_valid { - let end = cmp::min(ends[valid_index], offsetted_len); - true_count += bools.value(valid_index as usize) as u64 * (end - ends[valid_index - 1]) as u64; - } - - true_count - }) + let offsetted_begin = u64::try_from(self.offset()) + .ok() + .vortex_expect("usize fits in u64"); + let offsetted_len = u64::try_from(self.len() + self.offset()) + .ok() + .vortex_expect("usize fits in u64"); + let decompressed_ends = decompressed_ends.as_slice::

(); + let begin = if valid_index == 0 { + offsetted_begin + } else { + u64::from(decompressed_ends[valid_index - 1]) + }; + + let end = + cmp::min(u64::from(decompressed_ends[valid_index]), offsetted_len); + true_count += + u64::from(decompressed_values.value(valid_index)) * (end - begin); + + for valid_index in is_valid { + let end = + cmp::min(u64::from(decompressed_ends[valid_index]), offsetted_len); + true_count += u64::from(decompressed_values.value(valid_index)) + * (end - u64::from(decompressed_ends[valid_index - 1])); + } + + true_count } } } @@ -101,36 +120,48 @@ impl RunEndArray { LogicalValidity::AllInvalid(_) => self.len() as u64, LogicalValidity::Array(is_valid) => { let is_valid = is_valid.into_bool()?.boolean_buffer(); - let mut is_valid = is_valid.set_indices(); - match is_valid.next() { - None => self.len() as u64, - Some(valid_index) => { - let offsetted_len = (self.len() + self.offset()) as u64; - let mut null_count: u64 = self.len() as u64; - match_each_unsigned_integer_ptype!(ends.ptype(), |$P| { - let ends = ends.as_slice::<$P>(); - let begin = if valid_index == 0 { - 0 - } else { - ends[valid_index - 1] - }; - - let end = cmp::min(ends[valid_index] as u64, offsetted_len); - null_count -= end - begin as u64; - - for valid_index in is_valid { - let end = cmp::min(ends[valid_index] as u64, offsetted_len); - null_count -= end - ends[valid_index - 1] as u64; - } - - null_count - }) - } - } + match_each_unsigned_integer_ptype!(ends.ptype(), |$P| self.null_count_with_array_validity::<$P>(ends, is_valid)) } }; Ok(null_count) } + + fn null_count_with_array_validity( + &self, + decompressed_ends: PrimitiveArray, + is_valid: BooleanBuffer, + ) -> u64 + where + u64: From

, + u64: TryFrom, + { + let mut is_valid = is_valid.set_indices(); + match is_valid.next() { + None => u64::try_from(self.len()) + .ok() + .vortex_expect("usize fits in u64"), + Some(valid_index) => { + let offsetted_len = (self.len() + self.offset()) as u64; + let mut null_count: u64 = self.len() as u64; + let decompressed_ends = decompressed_ends.as_slice::

(); + let begin = if valid_index == 0 { + 0 + } else { + u64::from(decompressed_ends[valid_index - 1]) + }; + + let end = cmp::min(u64::from(decompressed_ends[valid_index]), offsetted_len); + null_count -= end - begin; + + for valid_index in is_valid { + let end = cmp::min(u64::from(decompressed_ends[valid_index]), offsetted_len); + null_count -= end - u64::from(decompressed_ends[valid_index - 1]); + } + + null_count + } + } + } } #[cfg(test)] From 10d5e304c6e1ace02e1970f14ee2ed5f9ad66f34 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Wed, 22 Jan 2025 15:09:39 +0000 Subject: [PATCH 14/16] Convert from to into (#2052) --- encodings/runend/src/statistics.rs | 70 +++++++++++------------------- 1 file changed, 26 insertions(+), 44 deletions(-) diff --git a/encodings/runend/src/statistics.rs b/encodings/runend/src/statistics.rs index 77536aa982c..3a158657b33 100644 --- a/encodings/runend/src/statistics.rs +++ b/encodings/runend/src/statistics.rs @@ -2,7 +2,6 @@ use std::cmp; use arrow_buffer::BooleanBuffer; use itertools::Itertools; -use vortex_array::array::PrimitiveArray; use vortex_array::stats::{ArrayStatistics as _, Stat, StatisticsVTable, StatsSet}; use vortex_array::validity::{ArrayValidity as _, LogicalValidity}; use vortex_array::variants::PrimitiveArrayTrait; @@ -46,27 +45,23 @@ impl RunEndArray { let ends = self.ends().into_primitive()?; let values = self.values().into_bool()?.boolean_buffer(); - match_each_unsigned_integer_ptype!(ends.ptype(), |$P| self.typed_true_count::<$P>(ends, values)) + match_each_unsigned_integer_ptype!(ends.ptype(), |$P| self.typed_true_count(ends.as_slice::<$P>(), values)) } - fn typed_true_count( + fn typed_true_count>( &self, - decompressed_ends: PrimitiveArray, + decompressed_ends: &[P], decompressed_values: BooleanBuffer, - ) -> VortexResult - where - u64: From

, - u64: From, - { + ) -> VortexResult { Ok(match self.values().logical_validity() { LogicalValidity::AllValid(_) => { let mut begin = u64::try_from(self.offset()).vortex_expect("usize fits in u64"); decompressed_ends - .as_slice::

() .iter() + .copied() .zip_eq(&decompressed_values) .map(|(end, bool_value)| { - let end = u64::from(*end); + let end: u64 = end.into(); let len = end - begin; begin = end; len * u64::from(bool_value) @@ -81,29 +76,23 @@ impl RunEndArray { None => self.len() as u64, Some(valid_index) => { let mut true_count: u64 = 0; - let offsetted_begin = u64::try_from(self.offset()) - .ok() - .vortex_expect("usize fits in u64"); - let offsetted_len = u64::try_from(self.len() + self.offset()) - .ok() - .vortex_expect("usize fits in u64"); - let decompressed_ends = decompressed_ends.as_slice::

(); + let offsetted_begin = self.offset() as u64; + let offsetted_len = (self.len() + self.offset()) as u64; + let valid_end: u64 = decompressed_ends[valid_index].into(); let begin = if valid_index == 0 { offsetted_begin } else { - u64::from(decompressed_ends[valid_index - 1]) + valid_end }; - let end = - cmp::min(u64::from(decompressed_ends[valid_index]), offsetted_len); - true_count += - u64::from(decompressed_values.value(valid_index)) * (end - begin); + let end = cmp::min(valid_end, offsetted_len); + true_count += decompressed_values.value(valid_index) as u64 * (end - begin); for valid_index in is_valid { - let end = - cmp::min(u64::from(decompressed_ends[valid_index]), offsetted_len); - true_count += u64::from(decompressed_values.value(valid_index)) - * (end - u64::from(decompressed_ends[valid_index - 1])); + let valid_end: u64 = decompressed_ends[valid_index].into(); + let end = cmp::min(valid_end, offsetted_len); + true_count += + decompressed_values.value(valid_index) as u64 * (end - valid_end); } true_count @@ -116,46 +105,39 @@ impl RunEndArray { fn null_count(&self) -> VortexResult { let ends = self.ends().into_primitive()?; let null_count = match self.values().logical_validity() { - LogicalValidity::AllValid(_) => 0_u64, + LogicalValidity::AllValid(_) => 0u64, LogicalValidity::AllInvalid(_) => self.len() as u64, LogicalValidity::Array(is_valid) => { let is_valid = is_valid.into_bool()?.boolean_buffer(); - match_each_unsigned_integer_ptype!(ends.ptype(), |$P| self.null_count_with_array_validity::<$P>(ends, is_valid)) + match_each_unsigned_integer_ptype!(ends.ptype(), |$P| self.null_count_with_array_validity(ends.as_slice::<$P>(), is_valid)) } }; Ok(null_count) } - fn null_count_with_array_validity( + fn null_count_with_array_validity>( &self, - decompressed_ends: PrimitiveArray, + decompressed_ends: &[P], is_valid: BooleanBuffer, - ) -> u64 - where - u64: From

, - u64: TryFrom, - { + ) -> u64 { let mut is_valid = is_valid.set_indices(); match is_valid.next() { - None => u64::try_from(self.len()) - .ok() - .vortex_expect("usize fits in u64"), + None => self.len() as u64, Some(valid_index) => { let offsetted_len = (self.len() + self.offset()) as u64; let mut null_count: u64 = self.len() as u64; - let decompressed_ends = decompressed_ends.as_slice::

(); let begin = if valid_index == 0 { 0 } else { - u64::from(decompressed_ends[valid_index - 1]) + decompressed_ends[valid_index - 1].into() }; - let end = cmp::min(u64::from(decompressed_ends[valid_index]), offsetted_len); + let end = cmp::min(decompressed_ends[valid_index].into(), offsetted_len); null_count -= end - begin; for valid_index in is_valid { - let end = cmp::min(u64::from(decompressed_ends[valid_index]), offsetted_len); - null_count -= end - u64::from(decompressed_ends[valid_index - 1]); + let end = cmp::min(decompressed_ends[valid_index].into(), offsetted_len); + null_count -= end - decompressed_ends[valid_index - 1].into(); } null_count From a2471497217c81c960da2848648c772bc765e531 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Wed, 22 Jan 2025 15:12:48 +0000 Subject: [PATCH 15/16] more --- encodings/runend/src/statistics.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/encodings/runend/src/statistics.rs b/encodings/runend/src/statistics.rs index 3a158657b33..8605d17bc44 100644 --- a/encodings/runend/src/statistics.rs +++ b/encodings/runend/src/statistics.rs @@ -7,7 +7,7 @@ use vortex_array::validity::{ArrayValidity as _, LogicalValidity}; use vortex_array::variants::PrimitiveArrayTrait; use vortex_array::{ArrayDType as _, ArrayLen as _, IntoArrayVariant as _}; use vortex_dtype::{match_each_unsigned_integer_ptype, DType, NativePType}; -use vortex_error::{VortexExpect as _, VortexResult}; +use vortex_error::VortexResult; use vortex_scalar::Scalar; use crate::{RunEndArray, RunEndEncoding}; @@ -55,7 +55,7 @@ impl RunEndArray { ) -> VortexResult { Ok(match self.values().logical_validity() { LogicalValidity::AllValid(_) => { - let mut begin = u64::try_from(self.offset()).vortex_expect("usize fits in u64"); + let mut begin = self.offset() as u64; decompressed_ends .iter() .copied() From a3ccf4bb60e49041f65d92f55ae2ab57af0e84e8 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Wed, 22 Jan 2025 15:24:54 +0000 Subject: [PATCH 16/16] fix --- encodings/runend/src/statistics.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/encodings/runend/src/statistics.rs b/encodings/runend/src/statistics.rs index 8605d17bc44..ba5bc1599e5 100644 --- a/encodings/runend/src/statistics.rs +++ b/encodings/runend/src/statistics.rs @@ -78,14 +78,13 @@ impl RunEndArray { let mut true_count: u64 = 0; let offsetted_begin = self.offset() as u64; let offsetted_len = (self.len() + self.offset()) as u64; - let valid_end: u64 = decompressed_ends[valid_index].into(); let begin = if valid_index == 0 { offsetted_begin } else { - valid_end + decompressed_ends[valid_index - 1].into() }; - let end = cmp::min(valid_end, offsetted_len); + let end = cmp::min(decompressed_ends[valid_index].into(), offsetted_len); true_count += decompressed_values.value(valid_index) as u64 * (end - begin); for valid_index in is_valid {