diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs index 9a547bdefafa..0d8dc822fd96 100644 --- a/rust/arrow/src/compute/kernels/cast.rs +++ b/rust/arrow/src/compute/kernels/cast.rs @@ -88,6 +88,10 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Utf8, Date64) => true, (Utf8, Timestamp(TimeUnit::Nanosecond, None)) => true, (Utf8, _) => DataType::is_numeric(to_type), + (LargeUtf8, Date32) => true, + (LargeUtf8, Date64) => true, + (LargeUtf8, Timestamp(TimeUnit::Nanosecond, None)) => true, + (LargeUtf8, _) => DataType::is_numeric(to_type), (_, Utf8) | (_, LargeUtf8) => { DataType::is_numeric(from_type) || from_type == &Binary } @@ -366,66 +370,20 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result { }, (Utf8, _) => match to_type { LargeUtf8 => cast_str_container::(&**array), - UInt8 => cast_string_to_numeric::(array), - UInt16 => cast_string_to_numeric::(array), - UInt32 => cast_string_to_numeric::(array), - UInt64 => cast_string_to_numeric::(array), - Int8 => cast_string_to_numeric::(array), - Int16 => cast_string_to_numeric::(array), - Int32 => cast_string_to_numeric::(array), - Int64 => cast_string_to_numeric::(array), - Float32 => cast_string_to_numeric::(array), - Float64 => cast_string_to_numeric::(array), - Date32 => { - use chrono::Datelike; - let string_array = array.as_any().downcast_ref::().unwrap(); - let mut builder = PrimitiveBuilder::::new(string_array.len()); - for i in 0..string_array.len() { - if string_array.is_null(i) { - builder.append_null()?; - } else { - match string_array.value(i).parse::() { - Ok(date) => builder.append_value( - date.num_days_from_ce() - EPOCH_DAYS_FROM_CE, - )?, - Err(_) => builder.append_null()?, // not a valid date - }; - } - } - Ok(Arc::new(builder.finish()) as ArrayRef) - } - Date64 => { - let string_array = array.as_any().downcast_ref::().unwrap(); - let mut builder = PrimitiveBuilder::::new(string_array.len()); - for i in 
0..string_array.len() { - if string_array.is_null(i) { - builder.append_null()?; - } else { - match string_array.value(i).parse::() { - Ok(date_time) => { - builder.append_value(date_time.timestamp_millis())? - } - Err(_) => builder.append_null()?, // not a valid date - }; - } - } - Ok(Arc::new(builder.finish()) as ArrayRef) - } + UInt8 => cast_string_to_numeric::(array), + UInt16 => cast_string_to_numeric::(array), + UInt32 => cast_string_to_numeric::(array), + UInt64 => cast_string_to_numeric::(array), + Int8 => cast_string_to_numeric::(array), + Int16 => cast_string_to_numeric::(array), + Int32 => cast_string_to_numeric::(array), + Int64 => cast_string_to_numeric::(array), + Float32 => cast_string_to_numeric::(array), + Float64 => cast_string_to_numeric::(array), + Date32 => cast_string_to_date32::(&**array), + Date64 => cast_string_to_date64::(&**array), Timestamp(TimeUnit::Nanosecond, None) => { - let string_array = array.as_any().downcast_ref::().unwrap(); - let mut builder = - PrimitiveBuilder::::new(string_array.len()); - for i in 0..string_array.len() { - if string_array.is_null(i) { - builder.append_null()?; - } else { - match string_to_timestamp_nanos(string_array.value(i)) { - Ok(nanos) => builder.append_value(nanos)?, - Err(_) => builder.append_null()?, // not a valid date - }; - } - } - Ok(Arc::new(builder.finish()) as ArrayRef) + cast_string_to_timestamp_ns::(&**array) } _ => Err(ArrowError::ComputeError(format!( "Casting from {:?} to {:?} not supported", @@ -487,6 +445,27 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result { from_type, to_type, ))), }, + (LargeUtf8, _) => match to_type { + UInt8 => cast_string_to_numeric::(array), + UInt16 => cast_string_to_numeric::(array), + UInt32 => cast_string_to_numeric::(array), + UInt64 => cast_string_to_numeric::(array), + Int8 => cast_string_to_numeric::(array), + Int16 => cast_string_to_numeric::(array), + Int32 => cast_string_to_numeric::(array), + Int64 => cast_string_to_numeric::(array), + 
Float32 => cast_string_to_numeric::(array), + Float64 => cast_string_to_numeric::(array), + Date32 => cast_string_to_date32::(&**array), + Date64 => cast_string_to_date64::(&**array), + Timestamp(TimeUnit::Nanosecond, None) => { + cast_string_to_timestamp_ns::(&**array) + } + _ => Err(ArrowError::ComputeError(format!( + "Casting from {:?} to {:?} not supported", + from_type, to_type, + ))), + }, // start numeric casts (UInt8, UInt16) => cast_numeric_arrays::(array), @@ -949,17 +928,23 @@ where /// Cast numeric types to Utf8 #[allow(clippy::unnecessary_wraps)] -fn cast_string_to_numeric(from: &ArrayRef) -> Result +fn cast_string_to_numeric( + from: &ArrayRef, +) -> Result where T: ArrowNumericType, ::Native: lexical_core::FromLexical, { - Ok(Arc::new(string_to_numeric_cast::( - from.as_any().downcast_ref::().unwrap(), + Ok(Arc::new(string_to_numeric_cast::( + from.as_any() + .downcast_ref::>() + .unwrap(), ))) } -fn string_to_numeric_cast(from: &StringArray) -> PrimitiveArray +fn string_to_numeric_cast( + from: &GenericStringArray, +) -> PrimitiveArray where T: ArrowNumericType, ::Native: lexical_core::FromLexical, @@ -978,6 +963,93 @@ where unsafe { PrimitiveArray::::from_trusted_len_iter(iter) } } +/// Casts generic string arrays to Date32Array +#[allow(clippy::unnecessary_wraps)] +fn cast_string_to_date32( + array: &dyn Array, +) -> Result { + use chrono::Datelike; + let string_array = array + .as_any() + .downcast_ref::>() + .unwrap(); + + let iter = (0..string_array.len()).map(|i| { + if string_array.is_null(i) { + None + } else { + string_array + .value(i) + .parse::() + .map(|date| date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) + .ok() + } + }); + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is `TrustedLen` because it maps over the range `0..string_array.len()`. 
+ let array = unsafe { Date32Array::from_trusted_len_iter(iter) }; + Ok(Arc::new(array) as ArrayRef) +} + +/// Casts generic string arrays to Date64Array +#[allow(clippy::unnecessary_wraps)] +fn cast_string_to_date64( + array: &dyn Array, +) -> Result { + let string_array = array + .as_any() + .downcast_ref::>() + .unwrap(); + + let iter = (0..string_array.len()).map(|i| { + if string_array.is_null(i) { + None + } else { + string_array + .value(i) + .parse::() + .map(|datetime| datetime.timestamp_millis()) + .ok() + } + }); + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is `TrustedLen` because it maps over the range `0..string_array.len()`. + let array = unsafe { Date64Array::from_trusted_len_iter(iter) }; + Ok(Arc::new(array) as ArrayRef) +} + +/// Casts generic string arrays to TimestampNanosecondArray +#[allow(clippy::unnecessary_wraps)] +fn cast_string_to_timestamp_ns( + array: &dyn Array, +) -> Result { + let string_array = array + .as_any() + .downcast_ref::>() + .unwrap(); + + let iter = (0..string_array.len()).map(|i| { + if string_array.is_null(i) { + None + } else { + string_to_timestamp_nanos(string_array.value(i)).ok() + } + }); + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is `TrustedLen` because it maps over the range `0..string_array.len()`. 
+ let array = unsafe { TimestampNanosecondArray::from_trusted_len_iter(iter) }; + Ok(Arc::new(array) as ArrayRef) +} + /// Cast numeric types to Boolean /// /// Any zero value returns `false` while non-zero returns `true` @@ -1719,20 +1791,27 @@ mod tests { #[test] fn test_cast_string_to_timestamp() { - let a = StringArray::from(vec![ + let a1 = Arc::new(StringArray::from(vec![ Some("2020-09-08T12:00:00+00:00"), Some("Not a valid date"), None, - ]); - let array = Arc::new(a) as ArrayRef; - let b = cast(&array, &DataType::Timestamp(TimeUnit::Nanosecond, None)).unwrap(); - let c = b - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(1599566400000000000, c.value(0)); - assert!(c.is_null(1)); - assert!(c.is_null(2)); + ])) as ArrayRef; + let a2 = Arc::new(LargeStringArray::from(vec![ + Some("2020-09-08T12:00:00+00:00"), + Some("Not a valid date"), + None, + ])) as ArrayRef; + for array in &[a1, a2] { + let b = + cast(array, &DataType::Timestamp(TimeUnit::Nanosecond, None)).unwrap(); + let c = b + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(1599566400000000000, c.value(0)); + assert!(c.is_null(1)); + assert!(c.is_null(2)); + } } #[test]