From 68c624108ca08f6595861d929174d66d42837378 Mon Sep 17 00:00:00 2001 From: Vipul Vaibhaw Date: Thu, 4 Jul 2024 12:23:35 +0530 Subject: [PATCH 01/14] dedup code --- .../expressions/scalar_funcs/hex.rs | 66 ++++++------------- 1 file changed, 19 insertions(+), 47 deletions(-) diff --git a/core/src/execution/datafusion/expressions/scalar_funcs/hex.rs b/core/src/execution/datafusion/expressions/scalar_funcs/hex.rs index 5191e53fa2..2cdf8e7f7e 100644 --- a/core/src/execution/datafusion/expressions/scalar_funcs/hex.rs +++ b/core/src/execution/datafusion/expressions/scalar_funcs/hex.rs @@ -118,55 +118,27 @@ pub(super) fn spark_hex(args: &[ColumnarValue]) -> Result { + DataType::Dictionary(_, value_type) => { let dict = as_dictionary_array::(&array); - let hexed_values = as_int64_array(dict.values())?; - let values = hexed_values - .iter() - .map(|v| v.map(hex_int64)) - .collect::>(); - - let keys = dict.keys().clone(); - let mut new_keys = Vec::with_capacity(values.len()); - - for key in keys.iter() { - let key = key.map(|k| values[k as usize].clone()).unwrap_or(None); - new_keys.push(key); - } - - let string_array_values = StringArray::from(new_keys); - Ok(ColumnarValue::Array(Arc::new(string_array_values))) - } - DataType::Dictionary(_, value_type) if matches!(**value_type, DataType::Utf8) => { - let dict = as_dictionary_array::(&array); - - let hexed_values = as_string_array(dict.values()); - let values: Vec> = hexed_values - .iter() - .map(|v| v.map(hex_bytes).transpose()) - .collect::>()?; - - let keys = dict.keys().clone(); - - let mut new_keys = Vec::with_capacity(values.len()); - - for key in keys.iter() { - let key = key.map(|k| values[k as usize].clone()).unwrap_or(None); - new_keys.push(key); - } - - let string_array_values = StringArray::from(new_keys); - Ok(ColumnarValue::Array(Arc::new(string_array_values))) - } - DataType::Dictionary(_, value_type) if matches!(**value_type, DataType::Binary) => { - let dict = as_dictionary_array::(&array); - - let hexed_values = as_binary_array(dict.values())?; - let values: Vec> = hexed_values - .iter() - .map(|v| v.map(hex_bytes).transpose()) - .collect::>()?; + let values = match **value_type { + DataType::Int64 => as_int64_array(dict.values())? + .iter() + .map(|v| v.map(hex_int64)) + .collect::>(), + DataType::Utf8 => as_string_array(dict.values()) + .iter() + .map(|v| v.map(hex_bytes).transpose()) + .collect::>()?, + DataType::Binary => as_binary_array(dict.values())? + .iter() + .map(|v| v.map(hex_bytes).transpose()) + .collect::>()?, + _ => exec_err!( + "hex got an unexpected argument type: {:?}", + array.data_type() + )?, + }; let keys = dict.keys().clone(); let mut new_keys = Vec::with_capacity(values.len()); From ba53b3a31d0313057b8f2c3077a374f71a4c7587 Mon Sep 17 00:00:00 2001 From: Vipul Vaibhaw Date: Thu, 4 Jul 2024 12:59:31 +0530 Subject: [PATCH 02/14] transforming the dict directly --- .../datafusion/expressions/scalar_funcs/hex.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/core/src/execution/datafusion/expressions/scalar_funcs/hex.rs b/core/src/execution/datafusion/expressions/scalar_funcs/hex.rs index 2cdf8e7f7e..e6059818b9 100644 --- a/core/src/execution/datafusion/expressions/scalar_funcs/hex.rs +++ b/core/src/execution/datafusion/expressions/scalar_funcs/hex.rs @@ -140,15 +140,14 @@ pub(super) fn spark_hex(args: &[ColumnarValue]) -> Result> = dict + .keys() + .iter() + .map(|key| key.map(|k| values[k as usize].clone()).unwrap_or(None)) + .collect(); - for key in keys.iter() { - let key = key.map(|k| values[k as usize].clone()).unwrap_or(None); - new_keys.push(key); - } + let string_array_values = StringArray::from(new_values); - let string_array_values = StringArray::from(new_keys); Ok(ColumnarValue::Array(Arc::new(string_array_values))) } _ => exec_err!( From 304abf8349c6d1381c5416886f77875847cae30b Mon Sep 17 00:00:00 2001 From: Vipul Vaibhaw Date: Thu, 4 Jul 2024 13:17:11 +0530 Subject: [PATCH 03/14] code optimization for cast string to timestamp --- .../execution/datafusion/expressions/cast.rs | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/core/src/execution/datafusion/expressions/cast.rs b/core/src/execution/datafusion/expressions/cast.rs index 9e3205cefb..80ac26a05d 100644 --- a/core/src/execution/datafusion/expressions/cast.rs +++ b/core/src/execution/datafusion/expressions/cast.rs @@ -91,22 +91,23 @@ macro_rules! cast_utf8_to_int { result }}; } - macro_rules! cast_utf8_to_timestamp { ($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident) => {{ - let len = $array.len(); - let mut cast_array = PrimitiveArray::<$array_type>::builder(len).with_timezone("UTC"); - for i in 0..len { + let mut cast_array = PrimitiveArray::<$array_type>::builder($array.len()).with_timezone("UTC"); + + for i in 0..$array.len() { if $array.is_null(i) { - cast_array.append_null() - } else if let Ok(Some(cast_value)) = $cast_method($array.value(i).trim(), $eval_mode) { - cast_array.append_value(cast_value); + cast_array.append_null(); } else { - cast_array.append_null() + let value = $array.value(i).trim(); + match $cast_method(value, $eval_mode) { + Ok(Some(cast_value)) => cast_array.append_value(cast_value), + _ => cast_array.append_null(), + } } } - let result: ArrayRef = Arc::new(cast_array.finish()) as ArrayRef; - result + + Arc::new(cast_array.finish()) as ArrayRef }}; } From 913c82d1d38047a363b902dab875938b41361746 Mon Sep 17 00:00:00 2001 From: Vipul Vaibhaw Date: Thu, 4 Jul 2024 16:12:00 +0530 Subject: [PATCH 04/14] minor optimizations --- .../execution/datafusion/expressions/cast.rs | 45 ++++++++++--------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/core/src/execution/datafusion/expressions/cast.rs b/core/src/execution/datafusion/expressions/cast.rs index 80ac26a05d..4f3c4f4cad 100644 --- a/core/src/execution/datafusion/expressions/cast.rs +++ b/core/src/execution/datafusion/expressions/cast.rs @@ -717,28 +717,33 @@ impl Cast { .as_any() .downcast_ref::>() .expect("Expected a string array"); - - let cast_array: ArrayRef = match to_type { - DataType::Date32 => { - let len = string_array.len(); - let mut cast_array = PrimitiveArray::::builder(len); - for i in 0..len { - if !string_array.is_null(i) { - match date_parser(string_array.value(i), eval_mode) { - Ok(Some(cast_value)) => cast_array.append_value(cast_value), - Ok(None) => cast_array.append_null(), - Err(e) => return Err(e), - } - } else { - cast_array.append_null() - } + + if to_type != &DataType::Date32 { + unreachable!("Invalid data type {:?} in cast from string", to_type); + } + + let len = string_array.len(); + let mut cast_array = PrimitiveArray::::builder(len); + + for i in 0..len { + let value = if string_array.is_null(i) { + None + } else { + match date_parser(string_array.value(i), eval_mode) { + Ok(Some(cast_value)) => Some(cast_value), + Ok(None) => None, + Err(e) => return Err(e), } - Arc::new(cast_array.finish()) as ArrayRef + }; + + match value { + Some(cast_value) => cast_array.append_value(cast_value), + None => cast_array.append_null(), } - _ => unreachable!("Invalid data type {:?} in cast from string", to_type), - }; - Ok(cast_array) - } + } + + Ok(Arc::new(cast_array.finish()) as ArrayRef) + } fn cast_string_to_timestamp( array: &ArrayRef, From aee95cf9da3bb3295d35e767b6b9a256406d50fd Mon Sep 17 00:00:00 2001 From: Vipul Vaibhaw Date: Thu, 4 Jul 2024 21:30:16 +0530 Subject: [PATCH 05/14] fmt fixes and casting to dict array without unpacking to array first --- .../execution/datafusion/expressions/cast.rs | 37 +++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/core/src/execution/datafusion/expressions/cast.rs b/core/src/execution/datafusion/expressions/cast.rs index 4f3c4f4cad..789bc4718c 100644 --- a/core/src/execution/datafusion/expressions/cast.rs +++ b/core/src/execution/datafusion/expressions/cast.rs @@ -34,8 +34,9 @@ use arrow::{ }; use arrow_array::{ types::{Date32Type, Int16Type, Int32Type, Int64Type, Int8Type}, - Array, ArrayRef, BooleanArray, Decimal128Array, Float32Array, Float64Array, GenericStringArray, - Int16Array, Int32Array, Int64Array, Int8Array, OffsetSizeTrait, PrimitiveArray, + Array, ArrayRef, BooleanArray, Decimal128Array, DictionaryArray, Float32Array, Float64Array, + GenericStringArray, Int16Array, Int32Array, Int64Array, Int8Array, OffsetSizeTrait, + PrimitiveArray, }; use arrow_schema::{DataType, Schema}; use chrono::{NaiveDate, NaiveDateTime, TimeZone, Timelike}; @@ -93,7 +94,8 @@ macro_rules! cast_utf8_to_int { } macro_rules! cast_utf8_to_timestamp { ($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident) => {{ - let mut cast_array = PrimitiveArray::<$array_type>::builder($array.len()).with_timezone("UTC"); + let mut cast_array = + PrimitiveArray::<$array_type>::builder($array.len()).with_timezone("UTC"); for i in 0..$array.len() { if $array.is_null(i) { @@ -502,18 +504,23 @@ impl Cast { let array = array_with_timezone(array, self.timezone.clone(), Some(to_type))?; let from_type = array.data_type().clone(); - // unpack dictionary string arrays first - // TODO: we are unpacking a dictionary-encoded array and then performing - // the cast. We could potentially improve performance here by casting the - // dictionary values directly without unpacking the array first, although this - // would add more complexity to the code let array = match &from_type { DataType::Dictionary(key_type, value_type) if key_type.as_ref() == &DataType::Int32 && (value_type.as_ref() == &DataType::Utf8 || value_type.as_ref() == &DataType::LargeUtf8) => { - cast_with_options(&array, value_type.as_ref(), &CAST_OPTIONS)? + let dict_array = array + .as_any() + .downcast_ref::>() + .expect("Expected a dictionary array"); + let values = dict_array.values(); + let cast_values = self.cast_array(values.clone())?; + let cast_array = Arc::new(DictionaryArray::::new( + dict_array.keys().clone(), + cast_values, + )) as ArrayRef; + return Ok(cast_array); } _ => array, }; @@ -717,14 +724,14 @@ impl Cast { .as_any() .downcast_ref::>() .expect("Expected a string array"); - + if to_type != &DataType::Date32 { unreachable!("Invalid data type {:?} in cast from string", to_type); } - + let len = string_array.len(); let mut cast_array = PrimitiveArray::::builder(len); - + for i in 0..len { let value = if string_array.is_null(i) { None @@ -735,15 +742,15 @@ impl Cast { Err(e) => return Err(e), } }; - + match value { Some(cast_value) => cast_array.append_value(cast_value), None => cast_array.append_null(), } } - + Ok(Arc::new(cast_array.finish()) as ArrayRef) - } + } fn cast_string_to_timestamp( array: &ArrayRef, From fc3a7feeb2e699e8882563d7270f5896de6f2793 Mon Sep 17 00:00:00 2001 From: Vipul Vaibhaw Date: Sat, 6 Jul 2024 06:03:12 +0530 Subject: [PATCH 06/14] bug fixes --- core/src/execution/datafusion/expressions/cast.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/execution/datafusion/expressions/cast.rs b/core/src/execution/datafusion/expressions/cast.rs index 789bc4718c..0b335fba71 100644 --- a/core/src/execution/datafusion/expressions/cast.rs +++ b/core/src/execution/datafusion/expressions/cast.rs @@ -520,7 +520,8 @@ impl Cast { dict_array.keys().clone(), cast_values, )) as ArrayRef; - return Ok(cast_array); + + cast_with_options(&cast_array, value_type, &CAST_OPTIONS)? } _ => array, }; From e64e2f7a3bcca00bb229d933e3b25e4666bafb81 Mon Sep 17 00:00:00 2001 From: Vipul Vaibhaw Date: Tue, 9 Jul 2024 19:30:42 +0530 Subject: [PATCH 07/14] revert unrelated change --- .../execution/datafusion/expressions/cast.rs | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/native/core/src/execution/datafusion/expressions/cast.rs b/native/core/src/execution/datafusion/expressions/cast.rs index 0b335fba71..f87566dee9 100644 --- a/native/core/src/execution/datafusion/expressions/cast.rs +++ b/native/core/src/execution/datafusion/expressions/cast.rs @@ -94,22 +94,19 @@ macro_rules! cast_utf8_to_int { } macro_rules! cast_utf8_to_timestamp { ($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident) => {{ - let mut cast_array = - PrimitiveArray::<$array_type>::builder($array.len()).with_timezone("UTC"); - - for i in 0..$array.len() { + let len = $array.len(); + let mut cast_array = PrimitiveArray::<$array_type>::builder(len).with_timezone("UTC"); + for i in 0..len { if $array.is_null(i) { - cast_array.append_null(); + cast_array.append_null() + } else if let Ok(Some(cast_value)) = $cast_method($array.value(i).trim(), $eval_mode) { + cast_array.append_value(cast_value); } else { - let value = $array.value(i).trim(); - match $cast_method(value, $eval_mode) { - Ok(Some(cast_value)) => cast_array.append_value(cast_value), - _ => cast_array.append_null(), - } + cast_array.append_null() } } - - Arc::new(cast_array.finish()) as ArrayRef + let result: ArrayRef = Arc::new(cast_array.finish()) as ArrayRef; + result }}; } From d557c39c2739ded8746677d5352a4c45db97f96a Mon Sep 17 00:00:00 2001 From: Vipul Vaibhaw Date: Fri, 12 Jul 2024 11:11:57 +0530 Subject: [PATCH 08/14] Added test case and code refactor --- .../execution/datafusion/expressions/cast.rs | 59 +++++++++++++++---- 1 file changed, 48 insertions(+), 11 deletions(-) diff --git a/native/core/src/execution/datafusion/expressions/cast.rs b/native/core/src/execution/datafusion/expressions/cast.rs index f87566dee9..e1c8952963 100644 --- a/native/core/src/execution/datafusion/expressions/cast.rs +++ b/native/core/src/execution/datafusion/expressions/cast.rs @@ -33,10 +33,7 @@ use arrow::{ util::display::FormatOptions, }; use arrow_array::{ - types::{Date32Type, Int16Type, Int32Type, Int64Type, Int8Type}, - Array, ArrayRef, BooleanArray, Decimal128Array, DictionaryArray, Float32Array, Float64Array, - GenericStringArray, Int16Array, Int32Array, Int64Array, Int8Array, OffsetSizeTrait, - PrimitiveArray, + types::{Date32Type, Int16Type, Int32Type, Int64Type, Int8Type}, Array, ArrayRef, BooleanArray, Decimal128Array, DictionaryArray, Float32Array, Float64Array, GenericStringArray, Int16Array, Int32Array, Int64Array, Int8Array, OffsetSizeTrait, PrimitiveArray }; use arrow_schema::{DataType, Schema}; use chrono::{NaiveDate, NaiveDateTime, TimeZone, Timelike}; @@ -500,7 +497,6 @@ impl Cast { let to_type = &self.data_type; let array = array_with_timezone(array, self.timezone.clone(), Some(to_type))?; let from_type = array.data_type().clone(); - let array = match &from_type { DataType::Dictionary(key_type, value_type) if key_type.as_ref() == &DataType::Int32 @@ -511,14 +507,21 @@ impl Cast { .as_any() .downcast_ref::>() .expect("Expected a dictionary array"); - let values = dict_array.values(); - let cast_values = self.cast_array(values.clone())?; - let cast_array = Arc::new(DictionaryArray::::new( + + // cast dictionary values directly + let casted_dictionary = DictionaryArray::::new( dict_array.keys().clone(), - cast_values, - )) as ArrayRef; + self.cast_array(dict_array.values().clone())?, + ); - cast_with_options(&cast_array, value_type, &CAST_OPTIONS)? + // casted dictionary to return type + let casted_result = cast_with_options( + &casted_dictionary, + to_type, + &CAST_OPTIONS, + )?; + + return Ok(spark_cast(casted_result, &from_type, to_type)); } _ => array, }; @@ -1720,6 +1723,40 @@ mod tests { assert_eq!(result.len(), 2); } + #[test] + fn test_cast_dict_string_to_dict_timestamp() -> DataFusionResult<()> { + // prepare input data + let keys = Int32Array::from(vec![0, 1]); + let values: ArrayRef = Arc::new(StringArray::from(vec![ + Some("2020-01-01T12:34:56.123456"), + Some("T2"), + ])); + let dict_array = Arc::new(DictionaryArray::new(keys, values)); + + // prepare cast expression + let timezone = "UTC".to_string(); + let expr = Arc::new(Column::new("a", 0)); // this is not used by the test + let cast = Cast::new( + expr, + DataType::Timestamp(TimeUnit::Microsecond, Some(timezone.clone().into())), + EvalMode::Legacy, + timezone.clone(), + ); + + // test casting string dictionary array to timestamp array + let result = cast.cast_array(dict_array)?; + assert_eq!( + *result.data_type(), + DataType::Timestamp( + TimeUnit::Microsecond, + Some(timezone.into()) + ) + ); + assert_eq!(result.len(), 2); + + Ok(()) + } + #[test] fn date_parser_test() { for date in &[ From df92f0f7e55cfb7aa267da92d627561489a250bf Mon Sep 17 00:00:00 2001 From: Vipul Vaibhaw Date: Fri, 12 Jul 2024 11:31:11 +0530 Subject: [PATCH 09/14] minor optimization --- .../src/execution/datafusion/expressions/cast.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/native/core/src/execution/datafusion/expressions/cast.rs b/native/core/src/execution/datafusion/expressions/cast.rs index e1c8952963..5ee5f809da 100644 --- a/native/core/src/execution/datafusion/expressions/cast.rs +++ b/native/core/src/execution/datafusion/expressions/cast.rs @@ -514,12 +514,14 @@ impl Cast { self.cast_array(dict_array.values().clone())?, ); - // casted dictionary to return type - let casted_result = cast_with_options( - &casted_dictionary, - to_type, - &CAST_OPTIONS, - )?; + let casted_result = match to_type { + DataType::Dictionary(_, _) => Arc::new(casted_dictionary), + _ => cast_with_options( + &casted_dictionary, + to_type, + &CAST_OPTIONS, + )? + }; return Ok(spark_cast(casted_result, &from_type, to_type)); } From ec0687bda7960a3a86b375c13c6d5b5bff93e322 Mon Sep 17 00:00:00 2001 From: Vipul Vaibhaw Date: Fri, 12 Jul 2024 11:32:13 +0530 Subject: [PATCH 10/14] minor optimization again --- native/core/src/execution/datafusion/expressions/cast.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/native/core/src/execution/datafusion/expressions/cast.rs b/native/core/src/execution/datafusion/expressions/cast.rs index 5ee5f809da..23be23146c 100644 --- a/native/core/src/execution/datafusion/expressions/cast.rs +++ b/native/core/src/execution/datafusion/expressions/cast.rs @@ -518,7 +518,7 @@ impl Cast { DataType::Dictionary(_, _) => Arc::new(casted_dictionary), _ => cast_with_options( &casted_dictionary, - to_type, + &casted_dictionary.value_type(), &CAST_OPTIONS, )? }; From 9270aedeafa12dacabc664ca9df7c85236e05d85 Mon Sep 17 00:00:00 2001 From: Vipul Vaibhaw Date: Fri, 12 Jul 2024 12:00:32 +0530 Subject: [PATCH 11/14] convert the cast to array --- .../execution/datafusion/expressions/cast.rs | 29 +++++++------------ 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/native/core/src/execution/datafusion/expressions/cast.rs b/native/core/src/execution/datafusion/expressions/cast.rs index 23be23146c..966c1ae7f2 100644 --- a/native/core/src/execution/datafusion/expressions/cast.rs +++ b/native/core/src/execution/datafusion/expressions/cast.rs @@ -33,7 +33,11 @@ use arrow::{ util::display::FormatOptions, }; use arrow_array::{ - types::{Date32Type, Int16Type, Int32Type, Int64Type, Int8Type}, Array, ArrayRef, BooleanArray, Decimal128Array, DictionaryArray, Float32Array, Float64Array, GenericStringArray, Int16Array, Int32Array, Int64Array, Int8Array, OffsetSizeTrait, PrimitiveArray + make_array, + types::{Date32Type, Int16Type, Int32Type, Int64Type, Int8Type}, + Array, ArrayRef, BooleanArray, Decimal128Array, DictionaryArray, Float32Array, Float64Array, + GenericStringArray, Int16Array, Int32Array, Int64Array, Int8Array, OffsetSizeTrait, + PrimitiveArray, }; use arrow_schema::{DataType, Schema}; use chrono::{NaiveDate, NaiveDateTime, TimeZone, Timelike}; @@ -508,21 +512,11 @@ impl Cast { .downcast_ref::>() .expect("Expected a dictionary array"); - // cast dictionary values directly - let casted_dictionary = DictionaryArray::::new( - dict_array.keys().clone(), - self.cast_array(dict_array.values().clone())?, + let casted_result = make_array( + self.cast_array(dict_array.values().clone())? + .to_data() + .clone(), ); - - let casted_result = match to_type { - DataType::Dictionary(_, _) => Arc::new(casted_dictionary), - _ => cast_with_options( - &casted_dictionary, - &casted_dictionary.value_type(), - &CAST_OPTIONS, - )? - }; - return Ok(spark_cast(casted_result, &from_type, to_type)); } _ => array, @@ -1749,10 +1743,7 @@ mod tests { let result = cast.cast_array(dict_array)?; assert_eq!( *result.data_type(), - DataType::Timestamp( - TimeUnit::Microsecond, - Some(timezone.into()) - ) + DataType::Timestamp(TimeUnit::Microsecond, Some(timezone.into())) ); assert_eq!(result.len(), 2); From 37f6e8ff98772e3bcf9e5b91b73b7f0b2a3906ad Mon Sep 17 00:00:00 2001 From: Vipul Vaibhaw Date: Fri, 12 Jul 2024 12:03:02 +0530 Subject: [PATCH 12/14] Revert "convert the cast to array" This reverts commit 9270aedeafa12dacabc664ca9df7c85236e05d85. --- .../execution/datafusion/expressions/cast.rs | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/native/core/src/execution/datafusion/expressions/cast.rs b/native/core/src/execution/datafusion/expressions/cast.rs index 966c1ae7f2..23be23146c 100644 --- a/native/core/src/execution/datafusion/expressions/cast.rs +++ b/native/core/src/execution/datafusion/expressions/cast.rs @@ -33,11 +33,7 @@ use arrow::{ util::display::FormatOptions, }; use arrow_array::{ - make_array, - types::{Date32Type, Int16Type, Int32Type, Int64Type, Int8Type}, - Array, ArrayRef, BooleanArray, Decimal128Array, DictionaryArray, Float32Array, Float64Array, - GenericStringArray, Int16Array, Int32Array, Int64Array, Int8Array, OffsetSizeTrait, - PrimitiveArray, + types::{Date32Type, Int16Type, Int32Type, Int64Type, Int8Type}, Array, ArrayRef, BooleanArray, Decimal128Array, DictionaryArray, Float32Array, Float64Array, GenericStringArray, Int16Array, Int32Array, Int64Array, Int8Array, OffsetSizeTrait, PrimitiveArray }; use arrow_schema::{DataType, Schema}; use chrono::{NaiveDate, NaiveDateTime, TimeZone, Timelike}; @@ -512,11 +508,21 @@ impl Cast { .downcast_ref::>() .expect("Expected a dictionary array"); - let casted_result = make_array( - self.cast_array(dict_array.values().clone())? - .to_data() - .clone(), + // cast dictionary values directly + let casted_dictionary = DictionaryArray::::new( + dict_array.keys().clone(), + self.cast_array(dict_array.values().clone())?, ); + + let casted_result = match to_type { + DataType::Dictionary(_, _) => Arc::new(casted_dictionary), + _ => cast_with_options( + &casted_dictionary, + &casted_dictionary.value_type(), + &CAST_OPTIONS, + )? + }; + return Ok(spark_cast(casted_result, &from_type, to_type)); } _ => array, @@ -1743,7 +1749,10 @@ mod tests { let result = cast.cast_array(dict_array)?; assert_eq!( *result.data_type(), - DataType::Timestamp(TimeUnit::Microsecond, Some(timezone.into())) + DataType::Timestamp( + TimeUnit::Microsecond, + Some(timezone.into()) + ) ); assert_eq!(result.len(), 2); From f2dbeaa2a4229180c41bf45d15c0bc4f30138d2c Mon Sep 17 00:00:00 2001 From: Vipul Vaibhaw Date: Fri, 12 Jul 2024 15:30:53 +0530 Subject: [PATCH 13/14] bug fixes --- .../execution/datafusion/expressions/cast.rs | 22 +++++++------------ 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/native/core/src/execution/datafusion/expressions/cast.rs b/native/core/src/execution/datafusion/expressions/cast.rs index 23be23146c..8c394730af 100644 --- a/native/core/src/execution/datafusion/expressions/cast.rs +++ b/native/core/src/execution/datafusion/expressions/cast.rs @@ -24,7 +24,7 @@ use std::{ }; use arrow::{ - compute::{cast_with_options, CastOptions}, + compute::{cast_with_options, take, CastOptions}, datatypes::{ ArrowPrimitiveType, Decimal128Type, DecimalType, Float32Type, Float64Type, TimestampMicrosecondType, @@ -33,7 +33,10 @@ use arrow::{ util::display::FormatOptions, }; use arrow_array::{ - types::{Date32Type, Int16Type, Int32Type, Int64Type, Int8Type}, Array, ArrayRef, BooleanArray, Decimal128Array, DictionaryArray, Float32Array, Float64Array, GenericStringArray, Int16Array, Int32Array, Int64Array, Int8Array, OffsetSizeTrait, PrimitiveArray + types::{Date32Type, Int16Type, Int32Type, Int64Type, Int8Type}, + Array, ArrayRef, BooleanArray, Decimal128Array, DictionaryArray, Float32Array, Float64Array, + GenericStringArray, Int16Array, Int32Array, Int64Array, Int8Array, OffsetSizeTrait, + PrimitiveArray, }; use arrow_schema::{DataType, Schema}; use chrono::{NaiveDate, NaiveDateTime, TimeZone, Timelike}; @@ -508,21 +511,15 @@ impl Cast { .downcast_ref::>() .expect("Expected a dictionary array"); - // cast dictionary values directly let casted_dictionary = DictionaryArray::::new( dict_array.keys().clone(), self.cast_array(dict_array.values().clone())?, ); let casted_result = match to_type { - DataType::Dictionary(_, _) => Arc::new(casted_dictionary), - _ => cast_with_options( - &casted_dictionary, - &casted_dictionary.value_type(), - &CAST_OPTIONS, - )? + DataType::Dictionary(_, _) => Arc::new(casted_dictionary.clone()), + _ => take(casted_dictionary.values().as_ref(), dict_array.keys(), None)?, }; - return Ok(spark_cast(casted_result, &from_type, to_type)); } _ => array, @@ -1749,10 +1746,7 @@ mod tests { let result = cast.cast_array(dict_array)?; assert_eq!( *result.data_type(), - DataType::Timestamp( - TimeUnit::Microsecond, - Some(timezone.into()) - ) + DataType::Timestamp(TimeUnit::Microsecond, Some(timezone.into())) ); assert_eq!(result.len(), 2); From 6b60290d4a8ff96c429da97c43581a3dd2c1364c Mon Sep 17 00:00:00 2001 From: Vipul Vaibhaw Date: Fri, 12 Jul 2024 20:22:46 +0530 Subject: [PATCH 14/14] rename the test to cast_dict_to_timestamp arr --- native/core/src/execution/datafusion/expressions/cast.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/native/core/src/execution/datafusion/expressions/cast.rs b/native/core/src/execution/datafusion/expressions/cast.rs index 8c394730af..cdd34c19c8 100644 --- a/native/core/src/execution/datafusion/expressions/cast.rs +++ b/native/core/src/execution/datafusion/expressions/cast.rs @@ -1723,7 +1723,7 @@ mod tests { } #[test] - fn test_cast_dict_string_to_dict_timestamp() -> DataFusionResult<()> { + fn test_cast_dict_string_to_timestamp() -> DataFusionResult<()> { // prepare input data let keys = Int32Array::from(vec![0, 1]); let values: ArrayRef = Arc::new(StringArray::from(vec![