Skip to content

Commit 28ef629

Browse files
committed
Fix panic in multiple distinct aggregates by fixing ScalarValue::new_list
1 parent bb1d7f9 commit 28ef629

File tree

3 files changed

+48
-202
lines changed

3 files changed

+48
-202
lines changed

Cargo.toml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,3 +75,14 @@ opt-level = 3
7575
overflow-checks = false
7676
panic = 'unwind'
7777
rpath = false
78+
79+
[patch.crates-io]
80+
arrow = { path= "/Users/alamb/Software/arrow-rs2/arrow" }
81+
arrow-array = { path= "/Users/alamb/Software/arrow-rs2/arrow-array" }
82+
arrow-buffer = { path= "/Users/alamb/Software/arrow-rs2/arrow-buffer" }
83+
arrow-schema = { path= "/Users/alamb/Software/arrow-rs2/arrow-schema" }
84+
arrow-select = { path= "/Users/alamb/Software/arrow-rs2/arrow-select" }
85+
arrow-string = { path= "/Users/alamb/Software/arrow-rs2/arrow-string" }
86+
arrow-ord = { path= "/Users/alamb/Software/arrow-rs2/arrow-ord" }
87+
arrow-flight = { path= "/Users/alamb/Software/arrow-rs2/arrow-flight" }
88+
parquet = { path= "/Users/alamb/Software/arrow-rs2/parquet" }

datafusion/common/src/scalar.rs

Lines changed: 10 additions & 194 deletions
Original file line numberDiff line numberDiff line change
@@ -600,117 +600,6 @@ macro_rules! typed_cast {
600600
}};
601601
}
602602

603-
macro_rules! build_timestamp_list {
604-
($TIME_UNIT:expr, $TIME_ZONE:expr, $VALUES:expr, $SIZE:expr) => {{
605-
match $VALUES {
606-
// the return on the macro is necessary, to short-circuit and return ArrayRef
607-
None => {
608-
return new_null_array(
609-
&DataType::List(Arc::new(Field::new(
610-
"item",
611-
DataType::Timestamp($TIME_UNIT, $TIME_ZONE),
612-
true,
613-
))),
614-
$SIZE,
615-
)
616-
}
617-
Some(values) => match $TIME_UNIT {
618-
TimeUnit::Second => {
619-
build_values_list_tz!(
620-
TimestampSecondBuilder,
621-
TimestampSecond,
622-
values,
623-
$SIZE,
624-
$TIME_ZONE
625-
)
626-
}
627-
TimeUnit::Millisecond => build_values_list_tz!(
628-
TimestampMillisecondBuilder,
629-
TimestampMillisecond,
630-
values,
631-
$SIZE,
632-
$TIME_ZONE
633-
),
634-
TimeUnit::Microsecond => build_values_list_tz!(
635-
TimestampMicrosecondBuilder,
636-
TimestampMicrosecond,
637-
values,
638-
$SIZE,
639-
$TIME_ZONE
640-
),
641-
TimeUnit::Nanosecond => build_values_list_tz!(
642-
TimestampNanosecondBuilder,
643-
TimestampNanosecond,
644-
values,
645-
$SIZE,
646-
$TIME_ZONE
647-
),
648-
},
649-
}
650-
}};
651-
}
652-
653-
macro_rules! new_builder {
654-
(StringBuilder, $len:expr) => {
655-
StringBuilder::new()
656-
};
657-
(LargeStringBuilder, $len:expr) => {
658-
LargeStringBuilder::new()
659-
};
660-
($el:ident, $len:expr) => {{
661-
<$el>::with_capacity($len)
662-
}};
663-
}
664-
665-
macro_rules! build_values_list {
666-
($VALUE_BUILDER_TY:ident, $SCALAR_TY:ident, $VALUES:expr, $SIZE:expr) => {{
667-
let builder = new_builder!($VALUE_BUILDER_TY, $VALUES.len());
668-
let mut builder = ListBuilder::new(builder);
669-
670-
for _ in 0..$SIZE {
671-
for scalar_value in $VALUES {
672-
match scalar_value {
673-
ScalarValue::$SCALAR_TY(Some(v)) => {
674-
builder.values().append_value(v.clone());
675-
}
676-
ScalarValue::$SCALAR_TY(None) => {
677-
builder.values().append_null();
678-
}
679-
_ => panic!("Incompatible ScalarValue for list"),
680-
};
681-
}
682-
builder.append(true);
683-
}
684-
685-
builder.finish()
686-
}};
687-
}
688-
689-
macro_rules! build_values_list_tz {
690-
($VALUE_BUILDER_TY:ident, $SCALAR_TY:ident, $VALUES:expr, $SIZE:expr, $TIME_ZONE:expr) => {{
691-
let mut builder = ListBuilder::new(
692-
$VALUE_BUILDER_TY::with_capacity($VALUES.len()).with_timezone_opt($TIME_ZONE),
693-
);
694-
695-
for _ in 0..$SIZE {
696-
for scalar_value in $VALUES {
697-
match scalar_value {
698-
ScalarValue::$SCALAR_TY(Some(v), _) => {
699-
builder.values().append_value(v.clone());
700-
}
701-
ScalarValue::$SCALAR_TY(None, _) => {
702-
builder.values().append_null();
703-
}
704-
_ => panic!("Incompatible ScalarValue for list"),
705-
};
706-
}
707-
builder.append(true);
708-
}
709-
710-
builder.finish()
711-
}};
712-
}
713-
714603
macro_rules! build_array_from_option {
715604
($DATA_TYPE:ident, $ARRAY_TYPE:ident, $EXPR:expr, $SIZE:expr) => {{
716605
match $EXPR {
@@ -1198,7 +1087,8 @@ impl ScalarValue {
11981087
}
11991088

12001089
/// Converts an iterator of references [`ScalarValue`] into an [`ArrayRef`]
1201-
/// corresponding to those values. For example,
1090+
/// corresponding to those values. For example, an iterator of
1091+
/// [`ScalarValue::Int32`] would be converted to an [`Int32Array`].
12021092
///
12031093
/// Returns an error if the iterator is empty or if the
12041094
/// [`ScalarValue`]s are not all the same type
@@ -1654,41 +1544,6 @@ impl ScalarValue {
16541544
Ok(array)
16551545
}
16561546

1657-
/// This function does not contains nulls but empty array instead.
1658-
fn iter_to_array_list_without_nulls(
1659-
values: &[ScalarValue],
1660-
data_type: &DataType,
1661-
) -> Result<GenericListArray<i32>> {
1662-
let mut elements: Vec<ArrayRef> = vec![];
1663-
let mut offsets = vec![];
1664-
1665-
if values.is_empty() {
1666-
offsets.push(0);
1667-
} else {
1668-
let arr = ScalarValue::iter_to_array(values.to_vec())?;
1669-
offsets.push(arr.len());
1670-
elements.push(arr);
1671-
}
1672-
1673-
// Concatenate element arrays to create single flat array
1674-
let flat_array = if elements.is_empty() {
1675-
new_empty_array(data_type)
1676-
} else {
1677-
let element_arrays: Vec<&dyn Array> =
1678-
elements.iter().map(|a| a.as_ref()).collect();
1679-
arrow::compute::concat(&element_arrays)?
1680-
};
1681-
1682-
let list_array = ListArray::new(
1683-
Arc::new(Field::new("item", flat_array.data_type().to_owned(), true)),
1684-
OffsetBuffer::<i32>::from_lengths(offsets),
1685-
flat_array,
1686-
None,
1687-
);
1688-
1689-
Ok(list_array)
1690-
}
1691-
16921547
/// This function build with nulls with nulls buffer.
16931548
fn iter_to_array_list(
16941549
scalars: impl IntoIterator<Item = ScalarValue>,
@@ -1776,7 +1631,8 @@ impl ScalarValue {
17761631
.unwrap()
17771632
}
17781633

1779-
/// Converts `Vec<ScalaValue>` to ListArray, simplified version of ScalarValue::to_array
1634+
/// Converts `Vec<ScalaValue>` where each element has type corresponding to
1635+
/// `data_type`, to a [`ListArray`].
17801636
///
17811637
/// Example
17821638
/// ```
@@ -1802,52 +1658,12 @@ impl ScalarValue {
18021658
/// assert_eq!(result, &expected);
18031659
/// ```
18041660
pub fn new_list(values: &[ScalarValue], data_type: &DataType) -> ArrayRef {
1805-
Arc::new(match data_type {
1806-
DataType::Boolean => build_values_list!(BooleanBuilder, Boolean, values, 1),
1807-
DataType::Int8 => build_values_list!(Int8Builder, Int8, values, 1),
1808-
DataType::Int16 => build_values_list!(Int16Builder, Int16, values, 1),
1809-
DataType::Int32 => build_values_list!(Int32Builder, Int32, values, 1),
1810-
DataType::Int64 => build_values_list!(Int64Builder, Int64, values, 1),
1811-
DataType::UInt8 => build_values_list!(UInt8Builder, UInt8, values, 1),
1812-
DataType::UInt16 => build_values_list!(UInt16Builder, UInt16, values, 1),
1813-
DataType::UInt32 => build_values_list!(UInt32Builder, UInt32, values, 1),
1814-
DataType::UInt64 => build_values_list!(UInt64Builder, UInt64, values, 1),
1815-
DataType::Utf8 => build_values_list!(StringBuilder, Utf8, values, 1),
1816-
DataType::LargeUtf8 => {
1817-
build_values_list!(LargeStringBuilder, LargeUtf8, values, 1)
1818-
}
1819-
DataType::Float32 => build_values_list!(Float32Builder, Float32, values, 1),
1820-
DataType::Float64 => build_values_list!(Float64Builder, Float64, values, 1),
1821-
DataType::Timestamp(unit, tz) => {
1822-
let values = Some(values);
1823-
build_timestamp_list!(unit.clone(), tz.clone(), values, 1)
1824-
}
1825-
DataType::List(_) | DataType::Struct(_) => {
1826-
ScalarValue::iter_to_array_list_without_nulls(values, data_type).unwrap()
1827-
}
1828-
DataType::Decimal128(precision, scale) => {
1829-
let mut vals = vec![];
1830-
for value in values.iter() {
1831-
if let ScalarValue::Decimal128(v, _, _) = value {
1832-
vals.push(v.to_owned())
1833-
}
1834-
}
1835-
1836-
let arr = Decimal128Array::from(vals)
1837-
.with_precision_and_scale(*precision, *scale)
1838-
.unwrap();
1839-
wrap_into_list_array(Arc::new(arr))
1840-
}
1841-
1842-
DataType::Null => {
1843-
let arr = new_null_array(&DataType::Null, values.len());
1844-
wrap_into_list_array(arr)
1845-
}
1846-
_ => panic!(
1847-
"Unsupported data type {:?} for ScalarValue::list_to_array",
1848-
data_type
1849-
),
1850-
})
1661+
let values = if values.is_empty() {
1662+
new_empty_array(data_type)
1663+
} else {
1664+
Self::iter_to_array(values.iter().cloned()).unwrap()
1665+
};
1666+
Arc::new(wrap_into_list_array(values))
18511667
}
18521668

18531669
/// Converts a scalar value into an array of `size` rows.

datafusion/sqllogictest/test_files/aggregate.slt

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2020,14 +2020,6 @@ statement ok
20202020
drop table t;
20212021

20222022

2023-
2024-
2025-
statement error DataFusion error: Execution error: Table 't_source' doesn't exist\.
2026-
drop table t_source;
2027-
2028-
statement error DataFusion error: Execution error: Table 't' doesn't exist\.
2029-
drop table t;
2030-
20312023
query I
20322024
select median(a) from (select 1 as a where 1=0);
20332025
----
@@ -2199,6 +2191,26 @@ NULL 1 10.1 10.1 10.1 10.1 0 NULL
21992191
statement ok
22002192
set datafusion.sql_parser.dialect = 'Generic';
22012193

2194+
## Multiple distinct aggregates and dictionaries
2195+
statement ok
2196+
create table dict_test as values (1, arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (2, arrow_cast('bar', 'Dictionary(Int32, Utf8)'));
2197+
2198+
query I?
2199+
select * from dict_test;
2200+
----
2201+
1 foo
2202+
2 bar
2203+
2204+
query II
2205+
select count(distinct column1), count(distinct column2) from dict_test group by column1;
2206+
----
2207+
1 1
2208+
1 1
2209+
2210+
statement error DataFusion error: SQL error: ParserError\("Expected end of statement, found: test"\)
2211+
drop table dict_ test;
2212+
2213+
22022214
# Prepare the table with dictionary values for testing
22032215
statement ok
22042216
CREATE TABLE value(x bigint) AS VALUES (1), (2), (3), (1), (3), (4), (5), (2);
@@ -2282,6 +2294,13 @@ select max(x_dict) from value_dict group by x_dict % 2 order by max(x_dict);
22822294
4
22832295
5
22842296

2297+
statement ok
2298+
drop table value
2299+
2300+
statement ok
2301+
drop table value_dict
2302+
2303+
22852304
# bool aggregation
22862305
statement ok
22872306
CREATE TABLE value_bool(x boolean, g int) AS VALUES (NULL, 0), (false, 0), (true, 0), (false, 1), (true, 2), (NULL, 3);

0 commit comments

Comments
 (0)