From d5fe314faa059714bb427ebdec29bffea413263d Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 25 Feb 2021 09:07:00 +0100 Subject: [PATCH 1/5] cast large-utf8 to numeric and temporal types --- rust/arrow/src/compute/kernels/cast.rs | 225 ++++++++++++++++--------- 1 file changed, 150 insertions(+), 75 deletions(-) diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs index 9a547bdefafa..b51115f81e11 100644 --- a/rust/arrow/src/compute/kernels/cast.rs +++ b/rust/arrow/src/compute/kernels/cast.rs @@ -88,6 +88,10 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Utf8, Date64) => true, (Utf8, Timestamp(TimeUnit::Nanosecond, None)) => true, (Utf8, _) => DataType::is_numeric(to_type), + (LargeUtf8, Date32) => true, + (LargeUtf8, Date64) => true, + (LargeUtf8, Timestamp(TimeUnit::Nanosecond, None)) => true, + (LargeUtf8, _) => DataType::is_numeric(to_type), (_, Utf8) | (_, LargeUtf8) => { DataType::is_numeric(from_type) || from_type == &Binary } @@ -364,68 +368,42 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result { from_type, to_type, ))), }, - (Utf8, _) => match to_type { - LargeUtf8 => cast_str_container::(&**array), - UInt8 => cast_string_to_numeric::(array), - UInt16 => cast_string_to_numeric::(array), - UInt32 => cast_string_to_numeric::(array), - UInt64 => cast_string_to_numeric::(array), - Int8 => cast_string_to_numeric::(array), - Int16 => cast_string_to_numeric::(array), - Int32 => cast_string_to_numeric::(array), - Int64 => cast_string_to_numeric::(array), - Float32 => cast_string_to_numeric::(array), - Float64 => cast_string_to_numeric::(array), - Date32 => { - use chrono::Datelike; - let string_array = array.as_any().downcast_ref::().unwrap(); - let mut builder = PrimitiveBuilder::::new(string_array.len()); - for i in 0..string_array.len() { - if string_array.is_null(i) { - builder.append_null()?; - } else { - match string_array.value(i).parse::() { - Ok(date) => builder.append_value( - date.num_days_from_ce() - EPOCH_DAYS_FROM_CE, - )?, - Err(_) => builder.append_null()?, // not a valid date - }; - } - } - Ok(Arc::new(builder.finish()) as ArrayRef) - } - Date64 => { - let string_array = array.as_any().downcast_ref::().unwrap(); - let mut builder = PrimitiveBuilder::::new(string_array.len()); - for i in 0..string_array.len() { - if string_array.is_null(i) { - builder.append_null()?; - } else { - match string_array.value(i).parse::() { - Ok(date_time) => { - builder.append_value(date_time.timestamp_millis())? - } - Err(_) => builder.append_null()?, // not a valid date - }; - } - } - Ok(Arc::new(builder.finish()) as ArrayRef) + (LargeUtf8, _) => match to_type { + UInt8 => cast_string_to_numeric::(array), + UInt16 => cast_string_to_numeric::(array), + UInt32 => cast_string_to_numeric::(array), + UInt64 => cast_string_to_numeric::(array), + Int8 => cast_string_to_numeric::(array), + Int16 => cast_string_to_numeric::(array), + Int32 => cast_string_to_numeric::(array), + Int64 => cast_string_to_numeric::(array), + Float32 => cast_string_to_numeric::(array), + Float64 => cast_string_to_numeric::(array), + Date32 => cast_string_to_date32::(&**array), + Date64 => cast_string_to_date64::(&**array), + Timestamp(TimeUnit::Nanosecond, None) => { + cast_string_to_timestamp_ns::(&**array) } + _ => Err(ArrowError::ComputeError(format!( + "Casting from {:?} to {:?} not supported", + from_type, to_type, + ))), + }, + (Utf8, _) => match to_type { + UInt8 => cast_string_to_numeric::(array), + UInt16 => cast_string_to_numeric::(array), + UInt32 => cast_string_to_numeric::(array), + UInt64 => cast_string_to_numeric::(array), + Int8 => cast_string_to_numeric::(array), + Int16 => cast_string_to_numeric::(array), + Int32 => cast_string_to_numeric::(array), + Int64 => cast_string_to_numeric::(array), + Float32 => cast_string_to_numeric::(array), + Float64 => cast_string_to_numeric::(array), + Date32 => cast_string_to_date32::(&**array), + Date64 => cast_string_to_date64::(&**array), Timestamp(TimeUnit::Nanosecond, None) => { - let string_array = array.as_any().downcast_ref::().unwrap(); - let mut builder = - PrimitiveBuilder::::new(string_array.len()); - for i in 0..string_array.len() { - if string_array.is_null(i) { - builder.append_null()?; - } else { - match string_to_timestamp_nanos(string_array.value(i)) { - Ok(nanos) => builder.append_value(nanos)?, - Err(_) => builder.append_null()?, // not a valid date - }; - } - } - Ok(Arc::new(builder.finish()) as ArrayRef) + cast_string_to_timestamp_ns::(&**array) } _ => Err(ArrowError::ComputeError(format!( "Casting from {:?} to {:?} not supported", @@ -949,17 +927,23 @@ where /// Cast numeric types to Utf8 #[allow(clippy::unnecessary_wraps)] -fn cast_string_to_numeric(from: &ArrayRef) -> Result +fn cast_string_to_numeric( + from: &ArrayRef, +) -> Result where T: ArrowNumericType, ::Native: lexical_core::FromLexical, { - Ok(Arc::new(string_to_numeric_cast::( - from.as_any().downcast_ref::().unwrap(), + Ok(Arc::new(string_to_numeric_cast::( + from.as_any() + .downcast_ref::>() + .unwrap(), ))) } -fn string_to_numeric_cast(from: &StringArray) -> PrimitiveArray +fn string_to_numeric_cast( + from: &GenericStringArray, +) -> PrimitiveArray where T: ArrowNumericType, ::Native: lexical_core::FromLexical, @@ -978,6 +962,90 @@ where unsafe { PrimitiveArray::::from_trusted_len_iter(iter) } } +/// Casts generic string arrays to Date32Array +fn cast_string_to_date32( + array: &dyn Array, +) -> Result { + use chrono::Datelike; + let string_array = array + .as_any() + .downcast_ref::>() + .unwrap(); + + let iter = (0..string_array.len()).map(|i| { + if string_array.is_null(i) { + None + } else { + string_array + .value(i) + .parse::() + .map(|date| date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) + .ok() + } + }); + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + let array = unsafe { Date32Array::from_trusted_len_iter(iter) }; + Ok(Arc::new(array) as ArrayRef) +} + +/// Casts generic string arrays to Date64Array +fn cast_string_to_date64( + array: &dyn Array, +) -> Result { + let string_array = array + .as_any() + .downcast_ref::>() + .unwrap(); + + let iter = (0..string_array.len()).map(|i| { + if string_array.is_null(i) { + None + } else { + string_array + .value(i) + .parse::() + .map(|datetime| datetime.timestamp_millis()) + .ok() + } + }); + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + let array = unsafe { Date64Array::from_trusted_len_iter(iter) }; + Ok(Arc::new(array) as ArrayRef) +} + +/// Casts generic string arrays to TimeStampNanosecondArray +fn cast_string_to_timestamp_ns( + array: &dyn Array, +) -> Result { + let string_array = array + .as_any() + .downcast_ref::>() + .unwrap(); + + let iter = (0..string_array.len()).map(|i| { + if string_array.is_null(i) { + None + } else { + string_to_timestamp_nanos(string_array.value(i)).ok() + } + }); + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + let array = unsafe { TimestampNanosecondArray::from_trusted_len_iter(iter) }; + Ok(Arc::new(array) as ArrayRef) +} + /// Cast numeric types to Boolean /// /// Any zero value returns `false` while non-zero returns `true` @@ -1719,20 +1787,27 @@ mod tests { #[test] fn test_cast_string_to_timestamp() { - let a = StringArray::from(vec![ + let a1 = Arc::new(StringArray::from(vec![ Some("2020-09-08T12:00:00+00:00"), Some("Not a valid date"), None, - ]); - let array = Arc::new(a) as ArrayRef; - let b = cast(&array, &DataType::Timestamp(TimeUnit::Nanosecond, None)).unwrap(); - let c = b - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(1599566400000000000, c.value(0)); - assert!(c.is_null(1)); - assert!(c.is_null(2)); + ])) as ArrayRef; + let a2 = Arc::new(StringArray::from(vec![ + Some("2020-09-08T12:00:00+00:00"), + Some("Not a valid date"), + None, + ])) as ArrayRef; + for array in &[a1, a2] { + let b = + cast(array, &DataType::Timestamp(TimeUnit::Nanosecond, None)).unwrap(); + let c = b + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(1599566400000000000, c.value(0)); + assert!(c.is_null(1)); + assert!(c.is_null(2)); + } } #[test] From b86b7af8d090a836c642688dabbc1d508541dee8 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 25 Feb 2021 15:19:27 +0100 Subject: [PATCH 2/5] fix can_cast evaluation --- rust/arrow/src/compute/kernels/cast.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs index b51115f81e11..63e047eb05f3 100644 --- a/rust/arrow/src/compute/kernels/cast.rs +++ b/rust/arrow/src/compute/kernels/cast.rs @@ -73,6 +73,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (List(_), _) => false, (_, List(list_to)) => can_cast_types(from_type, list_to.data_type()), (_, LargeList(list_to)) => can_cast_types(from_type, list_to.data_type()), + (Dictionary(_, _), LargeUtf8) => false, (Dictionary(_, from_value_type), Dictionary(_, to_value_type)) => { can_cast_types(from_value_type, to_value_type) } From 6e8db6a5bafa9813fdb08efc509d8e33c05b71d5 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 25 Feb 2021 16:41:28 +0100 Subject: [PATCH 3/5] fix tests --- rust/arrow/src/compute/kernels/cast.rs | 44 +++++++++++++------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs index 63e047eb05f3..c4d4c284794a 100644 --- a/rust/arrow/src/compute/kernels/cast.rs +++ b/rust/arrow/src/compute/kernels/cast.rs @@ -73,7 +73,6 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (List(_), _) => false, (_, List(list_to)) => can_cast_types(from_type, list_to.data_type()), (_, LargeList(list_to)) => can_cast_types(from_type, list_to.data_type()), - (Dictionary(_, _), LargeUtf8) => false, (Dictionary(_, from_value_type), Dictionary(_, to_value_type)) => { can_cast_types(from_value_type, to_value_type) } @@ -369,28 +368,8 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result { from_type, to_type, ))), }, - (LargeUtf8, _) => match to_type { - UInt8 => cast_string_to_numeric::(array), - UInt16 => cast_string_to_numeric::(array), - UInt32 => cast_string_to_numeric::(array), - UInt64 => cast_string_to_numeric::(array), - Int8 => cast_string_to_numeric::(array), - Int16 => cast_string_to_numeric::(array), - Int32 => cast_string_to_numeric::(array), - Int64 => cast_string_to_numeric::(array), - Float32 => cast_string_to_numeric::(array), - Float64 => cast_string_to_numeric::(array), - Date32 => cast_string_to_date32::(&**array), - Date64 => cast_string_to_date64::(&**array), - Timestamp(TimeUnit::Nanosecond, None) => { - cast_string_to_timestamp_ns::(&**array) - } - _ => Err(ArrowError::ComputeError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type, - ))), - }, (Utf8, _) => match to_type { + LargeUtf8 => cast_str_container::(&**array), UInt8 => cast_string_to_numeric::(array), UInt16 => cast_string_to_numeric::(array), UInt32 => cast_string_to_numeric::(array), @@ -466,6 +445,27 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result { from_type, to_type, ))), }, + (LargeUtf8, _) => match to_type { + UInt8 => cast_string_to_numeric::(array), + UInt16 => cast_string_to_numeric::(array), + UInt32 => cast_string_to_numeric::(array), + UInt64 => cast_string_to_numeric::(array), + Int8 => cast_string_to_numeric::(array), + Int16 => cast_string_to_numeric::(array), + Int32 => cast_string_to_numeric::(array), + Int64 => cast_string_to_numeric::(array), + Float32 => cast_string_to_numeric::(array), + Float64 => cast_string_to_numeric::(array), + Date32 => cast_string_to_date32::(&**array), + Date64 => cast_string_to_date64::(&**array), + Timestamp(TimeUnit::Nanosecond, None) => { + cast_string_to_timestamp_ns::(&**array) + } + _ => Err(ArrowError::ComputeError(format!( + "Casting from {:?} to {:?} not supported", + from_type, to_type, + ))), + }, // start numeric casts (UInt8, UInt16) => cast_numeric_arrays::(array), From 312c0827bcd7c136b578cb5dc6040454ade0c94e Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Fri, 26 Feb 2021 22:03:58 +0200 Subject: [PATCH 4/5] clippy: allow unnecessary wraps --- rust/arrow/src/compute/kernels/cast.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs index c4d4c284794a..02bf52fe0c3b 100644 --- a/rust/arrow/src/compute/kernels/cast.rs +++ b/rust/arrow/src/compute/kernels/cast.rs @@ -964,6 +964,7 @@ where } /// Casts generic string arrays to Date32Array +#[allow(clippy::unnecessary_wraps)] fn cast_string_to_date32( array: &dyn Array, ) -> Result { @@ -994,6 +995,7 @@ fn cast_string_to_date32( } /// Casts generic string arrays to Date64Array +#[allow(clippy::unnecessary_wraps)] fn cast_string_to_date64( array: &dyn Array, ) -> Result { @@ -1023,6 +1025,7 @@ fn cast_string_to_date64( } /// Casts generic string arrays to TimeStampNanosecondArray +#[allow(clippy::unnecessary_wraps)] fn cast_string_to_timestamp_ns( array: &dyn Array, ) -> Result { From f7276501a427fe8f3c1138014cebbaf42d187d6f Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 27 Feb 2021 09:28:25 +0100 Subject: [PATCH 5/5] actually test largestring to timestamp --- rust/arrow/src/compute/kernels/cast.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs index 02bf52fe0c3b..0d8dc822fd96 100644 --- a/rust/arrow/src/compute/kernels/cast.rs +++ b/rust/arrow/src/compute/kernels/cast.rs @@ -1796,7 +1796,7 @@ mod tests { Some("Not a valid date"), None, ])) as ArrayRef; - let a2 = Arc::new(StringArray::from(vec![ + let a2 = Arc::new(LargeStringArray::from(vec![ Some("2020-09-08T12:00:00+00:00"), Some("Not a valid date"), None,