Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions common/utils/src/main/resources/error/error-conditions.json
Original file line number Diff line number Diff line change
Expand Up @@ -3729,6 +3729,12 @@
],
"sqlState" : "42K0N"
},
"INVALID_EXTERNAL_VALUE" : {
"message" : [
"The value (<other>) of the type (<otherClass>) cannot be converted to the <dataType> type."
],
"sqlState" : "42K0N"
},
"INVALID_EXTRACT_BASE_FIELD_TYPE" : {
"message" : [
"Can't extract a value from <base>. Need a complex type [STRUCT, ARRAY, MAP] but got <other>."
Expand Down Expand Up @@ -11487,11 +11493,6 @@
"Must be 2 children: <others>"
]
},
"_LEGACY_ERROR_TEMP_3219" : {
"message" : [
"The value (<other>) of the type (<otherClass>) cannot be converted to the <dataType> type."
]
},
"_LEGACY_ERROR_TEMP_3220" : {
"message" : [
"The value (<other>) of the type (<otherClass>) cannot be converted to an array of <elementType>"
Expand Down
2 changes: 1 addition & 1 deletion docs/sql-ref-datatypes.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ Spark SQL and DataFrames support the following data types:
- `TimestampNTZType`: Timestamp without time zone (TIMESTAMP_NTZ). It represents values comprising the fields year, month, day,
hour, minute, and second. All operations are performed without taking any time zone into account.
- Note: TIMESTAMP in Spark is a user-specified alias associated with one of the TIMESTAMP_LTZ and TIMESTAMP_NTZ variations. Users can set the default timestamp type as `TIMESTAMP_LTZ`(default value) or `TIMESTAMP_NTZ` via the configuration `spark.sql.timestampType`.
- `TimestampNTZNanosType(precision)` / `TimestampLTZNanosType(precision)`: Preview nanosecond-capable variants of `TIMESTAMP_NTZ` and `TIMESTAMP_LTZ` with fractional seconds precision `precision` in `[7, 9]`. Unparameterized `TIMESTAMP`, `TIMESTAMP_NTZ`, and `TIMESTAMP_LTZ` remain microsecond types. Enable the preview feature with `SET spark.sql.timestampNanosTypes.enabled=true;` before using these types in schemas or SQL.
- `TimestampNTZNanosType(precision)` / `TimestampLTZNanosType(precision)`: Preview nanosecond-capable variants of `TIMESTAMP_NTZ` and `TIMESTAMP_LTZ` with fractional seconds precision `precision` in `[7, 9]`. Unparameterized `TIMESTAMP`, `TIMESTAMP_NTZ`, and `TIMESTAMP_LTZ` remain microsecond types. In schema-driven Dataset/DataFrame conversion, Spark maps `TimestampNTZNanosType` to `java.time.LocalDateTime` and `TimestampLTZNanosType` to `java.time.Instant`; values with more sub-micro digits than declared by `precision` are floor-truncated to that precision. Enable the preview feature with `SET spark.sql.timestampNanosTypes.enabled=true;` before using these types in schemas or SQL.

* Interval types
- `YearMonthIntervalType(startField, endField)`: Represents a year-month interval which is made up of a contiguous subset of the following fields:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,14 @@ object AgnosticEncoders {
case class InstantEncoder(override val lenientSerialization: Boolean)
extends LeafEncoder[Instant](TimestampType)
case object LocalDateTimeEncoder extends LeafEncoder[LocalDateTime](TimestampNTZType)
// Nanosecond-precision counterparts of `LocalDateTimeEncoder` / `InstantEncoder(false)`.
// They are used by `RowEncoder` when the schema declares a `TimestampNTZNanosType(p)` or
// `TimestampLTZNanosType(p)` column, so Dataset create/collect roundtrips preserve full
// nanosecond precision. See SPARK-57033.
//
// Both encoders are parameterized by `precision`, which is forwarded unchanged to the
// corresponding nanos data type; the encoders themselves do not validate it.

// Leaf encoder pairing the external type `java.time.LocalDateTime` with
// `TimestampNTZNanosType(precision)`.
case class LocalDateTimeNanosEncoder(precision: Int)
extends LeafEncoder[LocalDateTime](TimestampNTZNanosType(precision))
// Leaf encoder pairing the external type `java.time.Instant` with
// `TimestampLTZNanosType(precision)`.
case class InstantNanosEncoder(precision: Int)
extends LeafEncoder[Instant](TimestampLTZNanosType(precision))
case object LocalTimeEncoder extends LeafEncoder[LocalTime](TimeType())

case class SparkDecimalEncoder(dt: DecimalType) extends LeafEncoder[Decimal](dt)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ import scala.collection.mutable
import scala.reflect.classTag

import org.apache.spark.sql.{AnalysisException, Row}
import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{BinaryEncoder, BoxedBooleanEncoder, BoxedByteEncoder, BoxedDoubleEncoder, BoxedFloatEncoder, BoxedIntEncoder, BoxedLongEncoder, BoxedShortEncoder, CalendarIntervalEncoder, CharEncoder, DateEncoder, DayTimeIntervalEncoder, EncoderField, GeographyEncoder, GeometryEncoder, InstantEncoder, IterableEncoder, JavaDecimalEncoder, LocalDateEncoder, LocalDateTimeEncoder, LocalTimeEncoder, MapEncoder, NullEncoder, RowEncoder => AgnosticRowEncoder, StringEncoder, TimestampEncoder, UDTEncoder, VarcharEncoder, VariantEncoder, YearMonthIntervalEncoder}
import org.apache.spark.sql.errors.DataTypeErrorsBase
import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{BinaryEncoder, BoxedBooleanEncoder, BoxedByteEncoder, BoxedDoubleEncoder, BoxedFloatEncoder, BoxedIntEncoder, BoxedLongEncoder, BoxedShortEncoder, CalendarIntervalEncoder, CharEncoder, DateEncoder, DayTimeIntervalEncoder, EncoderField, GeographyEncoder, GeometryEncoder, InstantEncoder, InstantNanosEncoder, IterableEncoder, JavaDecimalEncoder, LocalDateEncoder, LocalDateTimeEncoder, LocalDateTimeNanosEncoder, LocalTimeEncoder, MapEncoder, NullEncoder, RowEncoder => AgnosticRowEncoder, StringEncoder, TimestampEncoder, UDTEncoder, VarcharEncoder, VariantEncoder, YearMonthIntervalEncoder}
import org.apache.spark.sql.errors.{DataTypeErrors, DataTypeErrorsBase}
import org.apache.spark.sql.internal.SqlApiConf
import org.apache.spark.sql.types._
import org.apache.spark.sql.types.ops.TypeApiOps
Expand Down Expand Up @@ -50,6 +50,8 @@ import org.apache.spark.util.ArrayImplicits._
* TimestampType -> java.time.Instant if spark.sql.datetime.java8API.enabled is true
*
* TimestampNTZType -> java.time.LocalDateTime
* TimestampNTZNanosType -> java.time.LocalDateTime
* TimestampLTZNanosType -> java.time.Instant
* TimeType -> java.time.LocalTime
*
* DayTimeIntervalType -> java.time.Duration
Expand Down Expand Up @@ -97,6 +99,14 @@ object RowEncoder extends DataTypeErrorsBase {
case TimestampType if SqlApiConf.get.datetimeJava8ApiEnabled => InstantEncoder(lenient)
case TimestampType => TimestampEncoder(lenient)
case TimestampNTZType => LocalDateTimeEncoder
// Nano timestamp types intentionally do not honor `lenient`: legacy `java.sql.Timestamp` /
// `java.sql.Date` external types are out of scope for nanosecond precision (SPARK-57033).
case t: TimestampNTZNanosType =>

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PR's DatasetSuite / JavaDatasetSuite tests confirm createDataFrame(rows, schemaWithNanos).collect() works under classic mode. The same call under Spark Connect throws before it ever reaches collect():

  • sql/api/src/main/scala/org/apache/spark/sql/util/ArrowUtils.scala:51-77 (toArrowTypeDefault) has no TimestampNTZNanosType / TimestampLTZNanosType case — falls to case _ => throw ExecutionErrors.unsupportedDataTypeError(dt) (line 75-76). Server-side schema → Arrow type construction fails.
  • sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowSerializer.scala:524-525 and .../ArrowDeserializer.scala:437-439 have no cases for the new LocalDateTimeNanosEncoder / InstantNanosEncoder — fall to case _ => throw new RuntimeException(s"Unsupported Encoder($encoder)/Vector($v) combination."). Even if the Arrow schema mapping were added, the encoder ↔ vector dispatch would still fail.

Is the Connect / Arrow integration tracked in a follow-up ticket?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes - this is intentional. The scope of SPARK-57033 is the classic-mode
java.time <-> internal-row conversion (encoders / CatalystTypeConverters /
Dataset roundtrip), so the Connect + Arrow path is deliberately left out here and
is tracked by follow-up sub-tasks under the umbrella SPARK-56822:

  • Arrow type mapping (ArrowUtils.toArrowType / fromArrowType - the
    unsupportedDataTypeError you hit): SPARK-57159 "Add Arrow type mapping for
    nanosecond-capable timestamp types".
  • The encoder <-> vector dispatch in ArrowSerializer / ArrowDeserializer
    (the Unsupported Encoder/Vector RuntimeException) plus the end-to-end Connect
    flow: SPARK-57160 "Add Spark Connect protocol support for nanosecond-capable
    timestamp types and literals" and SPARK-57161 "Convert nanosecond-capable
    timestamp types and literals between proto and Catalyst in Spark Connect".

DataTypeErrors.checkTimestampNanosTypesEnabled()
LocalDateTimeNanosEncoder(t.precision)
case t: TimestampLTZNanosType =>
DataTypeErrors.checkTimestampNanosTypesEnabled()
InstantNanosEncoder(t.precision)
case DateType if SqlApiConf.get.datetimeJava8ApiEnabled => LocalDateEncoder(lenient)
case DateType => DateEncoder(lenient)
case _: TimeType => LocalTimeEncoder
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ import org.apache.spark.sql.catalyst.util.DateTimeConstants._
import org.apache.spark.sql.catalyst.util.RebaseDateTime.{rebaseGregorianToJulianDays, rebaseGregorianToJulianMicros, rebaseJulianToGregorianDays, rebaseJulianToGregorianMicros}
import org.apache.spark.sql.errors.ExecutionErrors
import org.apache.spark.sql.types.{DateType, TimestampType, TimeType}
import org.apache.spark.unsafe.types.UTF8String
import org.apache.spark.unsafe.types.{TimestampNanosVal, UTF8String}
import org.apache.spark.util.SparkClassUtils

trait SparkDateTimeUtils {
Expand Down Expand Up @@ -208,6 +208,82 @@ trait SparkDateTimeUtils {
instantToMicros(localDateTime.toInstant(ZoneOffset.UTC))
}

/**
 * Zeroes the low-order digits of a sub-microsecond nanosecond component so that it fits the
 * timestamp precision `precision`: 9 keeps all three digits, 8 clears the last digit and 7
 * clears the last two.
 *
 * `nanosWithinMicro` is the already-extracted component (`0..999`), so the result does not
 * depend on the epoch sign of the timestamp it was taken from.
 *
 * Any precision other than 7 or 8 returns the input unchanged; the surrounding timestamp nanos
 * types validate the `[7, 9]` bound.
 */
private def truncateNanosWithinMicroToPrecision(nanosWithinMicro: Int, precision: Int): Int = {
  // Size of one precision step in nanoseconds, i.e. 10^(9 - precision).
  val step = precision match {
    case 7 => 100
    case 8 => 10
    case _ => 1
  }
  // Dropping the remainder is the same as integer-dividing and multiplying back by `step`.
  nanosWithinMicro - nanosWithinMicro % step
}

/**
 * Builds the composite `(epochMicros, nanosWithinMicro)` value used by
 * `TimestampNTZNanosType(precision)` from a `java.time.LocalDateTime` (interpreted at UTC).
 *
 *  - `epochMicros` is produced by [[localDateTimeToMicros]], which floors toward `-inf` for the
 *    integral microsecond part.
 *  - `nanosWithinMicro` is the last three decimal digits of `localDateTime.getNano`
 *    (`[0, 999]`) with the `(9 - precision)` lowest digits dropped.
 *
 * Taken together, the result is the original nanosecond value floored toward `-inf` to the
 * precision step (10^(9 - precision) ns). With `precision = 9` the conversion is lossless within
 * the valid range; with 7 / 8 the lowest 2 / 1 sub-micro digits are discarded. The same flooring
 * will be the basis of the future `CAST(... AS TIMESTAMP_NTZ(precision))` rule.
 */
def localDateTimeToTimestampNanos(
    localDateTime: LocalDateTime,
    precision: Int): TimestampNanosVal = {
  val micros = localDateTimeToMicros(localDateTime)
  // Keep only the digits below one microsecond, then cut them to the declared precision.
  val subMicro = truncateNanosWithinMicroToPrecision(
    localDateTime.getNano % NANOS_PER_MICROS.toInt,
    precision)
  TimestampNanosVal.fromParts(micros, subMicro.toShort)
}

/**
 * Inverse of [[localDateTimeToTimestampNanos]]: turns a `TimestampNanosVal` back into a
 * `java.time.LocalDateTime` (at UTC). Because `nanosWithinMicro` lies in `[0, 999]`, adding it
 * via `plusNanos` never crosses the second boundary.
 */
def timestampNanosToLocalDateTime(v: TimestampNanosVal): LocalDateTime = {
  val microsPart = microsToLocalDateTime(v.epochMicros)
  microsPart.plusNanos(v.nanosWithinMicro.toLong)
}

/**
 * Builds the composite `(epochMicros, nanosWithinMicro)` value used by
 * `TimestampLTZNanosType(precision)` from a `java.time.Instant`.
 *
 *  - `epochMicros` is produced by [[instantToMicros]], which floors toward `-inf` for the
 *    integral microsecond part.
 *  - `nanosWithinMicro` is the last three decimal digits of `instant.getNano` (`[0, 999]`) with
 *    the `(9 - precision)` lowest digits dropped.
 *
 * Taken together, the result is the original nanosecond value floored toward `-inf` to the
 * precision step (10^(9 - precision) ns). With `precision = 9` the conversion is lossless within
 * the valid range; with 7 / 8 the lowest 2 / 1 sub-micro digits are discarded. The same flooring
 * will be the basis of the future `CAST(... AS TIMESTAMP_LTZ(precision))` rule.
 */
def instantToTimestampNanos(instant: Instant, precision: Int): TimestampNanosVal = {
  val micros = instantToMicros(instant)
  // Keep only the digits below one microsecond, then cut them to the declared precision.
  val subMicro = truncateNanosWithinMicroToPrecision(
    instant.getNano % NANOS_PER_MICROS.toInt,
    precision)
  TimestampNanosVal.fromParts(micros, subMicro.toShort)
}

/**
 * Inverse of [[instantToTimestampNanos]]: turns a `TimestampNanosVal` back into a
 * `java.time.Instant`. Because `nanosWithinMicro` lies in `[0, 999]`, adding it via `plusNanos`
 * never crosses the second boundary.
 */
def timestampNanosToInstant(v: TimestampNanosVal): Instant = {
  val microsPart = microsToInstant(v.epochMicros)
  microsPart.plusNanos(v.nanosWithinMicro.toLong)
}

/**
* Converts the local date to the number of days since 1970-01-01.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
import org.apache.spark.sql.types.DayTimeIntervalType._
import org.apache.spark.sql.types.YearMonthIntervalType._
import org.apache.spark.unsafe.types.{GeographyVal, GeometryVal, UTF8String}
import org.apache.spark.unsafe.types.{GeographyVal, GeometryVal, TimestampNanosVal, UTF8String}
import org.apache.spark.util.ArrayImplicits._
import org.apache.spark.util.collection.Utils

Expand Down Expand Up @@ -88,6 +88,8 @@ object CatalystTypeConverters {
case TimestampType if SQLConf.get.datetimeJava8ApiEnabled => InstantConverter
case TimestampType => TimestampConverter
case TimestampNTZType => TimestampNTZConverter
case t: TimestampNTZNanosType => new TimestampNTZNanosConverter(t)
case t: TimestampLTZNanosType => new TimestampLTZNanosConverter(t)
case dt: DecimalType => new DecimalConverter(dt)
case BooleanType => BooleanConverter
case ByteType => ByteConverter
Expand Down Expand Up @@ -298,7 +300,7 @@ object CatalystTypeConverters {
}
new GenericInternalRow(ar)
case other => throw new SparkIllegalArgumentException(
errorClass = "_LEGACY_ERROR_TEMP_3219",
errorClass = "INVALID_EXTERNAL_VALUE",

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why rename the legacy error in this PR?

@MaxGekk MaxGekk May 28, 2026

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do believe it is related to the changes. We postpone this issue every time we add new types. I think it is worth assigning a proper name now.

messageParameters = scala.collection.immutable.Map(
"other" -> other.toString,
"otherClass" -> other.getClass.getCanonicalName,
Expand Down Expand Up @@ -357,7 +359,7 @@ object CatalystTypeConverters {
case chr: Char => UTF8String.fromString(chr.toString)
case ac: Array[Char] => UTF8String.fromString(String.valueOf(ac))
case other => throw new SparkIllegalArgumentException(
errorClass = "_LEGACY_ERROR_TEMP_3219",
errorClass = "INVALID_EXTERNAL_VALUE",
messageParameters = scala.collection.immutable.Map(
"other" -> other.toString,
"otherClass" -> other.getClass.getCanonicalName,
Expand All @@ -383,7 +385,7 @@ object CatalystTypeConverters {
case g: org.apache.spark.sql.types.Geometry if SQLConf.get.geospatialEnabled =>
STUtils.serializeGeomFromWKB(g, dataType)
case other => throw new SparkIllegalArgumentException(
errorClass = "_LEGACY_ERROR_TEMP_3219",
errorClass = "INVALID_EXTERNAL_VALUE",
messageParameters = scala.collection.immutable.Map(
"other" -> other.toString,
"otherClass" -> other.getClass.getCanonicalName,
Expand All @@ -408,7 +410,7 @@ object CatalystTypeConverters {
case g: org.apache.spark.sql.types.Geography if SQLConf.get.geospatialEnabled =>
STUtils.serializeGeogFromWKB(g, dataType)
case other => throw new SparkIllegalArgumentException(
errorClass = "_LEGACY_ERROR_TEMP_3219",
errorClass = "INVALID_EXTERNAL_VALUE",
messageParameters = scala.collection.immutable.Map(
"other" -> other.toString,
"otherClass" -> other.getClass.getCanonicalName,
Expand All @@ -432,7 +434,7 @@ object CatalystTypeConverters {
case d: Date => DateTimeUtils.fromJavaDate(d)
case l: LocalDate => DateTimeUtils.localDateToDays(l)
case other => throw new SparkIllegalArgumentException(
errorClass = "_LEGACY_ERROR_TEMP_3219",
errorClass = "INVALID_EXTERNAL_VALUE",
messageParameters = scala.collection.immutable.Map(
"other" -> other.toString,
"otherClass" -> other.getClass.getCanonicalName,
Expand Down Expand Up @@ -472,7 +474,7 @@ object CatalystTypeConverters {
case t: Timestamp => DateTimeUtils.fromJavaTimestamp(t)
case i: Instant => DateTimeUtils.instantToMicros(i)
case other => throw new SparkIllegalArgumentException(
errorClass = "_LEGACY_ERROR_TEMP_3219",
errorClass = "INVALID_EXTERNAL_VALUE",
messageParameters = scala.collection.immutable.Map(
"other" -> other.toString,
"otherClass" -> other.getClass.getCanonicalName,
Expand Down Expand Up @@ -500,7 +502,7 @@ object CatalystTypeConverters {
override def toCatalystImpl(scalaValue: Any): Any = scalaValue match {
case l: LocalDateTime => DateTimeUtils.localDateTimeToMicros(l)
case other => throw new SparkIllegalArgumentException(
errorClass = "_LEGACY_ERROR_TEMP_3219",
errorClass = "INVALID_EXTERNAL_VALUE",
messageParameters = scala.collection.immutable.Map(
"other" -> other.toString,
"otherClass" -> other.getClass.getCanonicalName,
Expand All @@ -515,6 +517,50 @@ object CatalystTypeConverters {
DateTimeUtils.microsToLocalDateTime(row.getLong(column))
}

// Converts between `java.time.LocalDateTime` and the internal `TimestampNanosVal` for a
// `TimestampNTZNanosType` column, truncating to the column's declared precision on the way in.
private class TimestampNTZNanosConverter(dataType: TimestampNTZNanosType)
    extends CatalystTypeConverter[Any, LocalDateTime, TimestampNanosVal] {

  // Only `LocalDateTime` is accepted; any other external value is rejected with
  // INVALID_EXTERNAL_VALUE.
  override def toCatalystImpl(scalaValue: Any): TimestampNanosVal = scalaValue match {
    case ldt: LocalDateTime =>
      DateTimeUtils.localDateTimeToTimestampNanos(ldt, dataType.precision)
    case unsupported =>
      val params = scala.collection.immutable.Map(
        "other" -> unsupported.toString,
        "otherClass" -> unsupported.getClass.getCanonicalName,
        "dataType" -> dataType.sql)
      throw new SparkIllegalArgumentException(
        errorClass = "INVALID_EXTERNAL_VALUE",
        messageParameters = params)
  }

  // A null Catalyst value maps to a null external value.
  override def toScala(catalystValue: TimestampNanosVal): LocalDateTime = {
    if (catalystValue != null) {
      DateTimeUtils.timestampNanosToLocalDateTime(catalystValue)
    } else {
      null
    }
  }

  override def toScalaImpl(row: InternalRow, column: Int): LocalDateTime = {
    val nanosVal = row.getTimestampNTZNanos(column)
    DateTimeUtils.timestampNanosToLocalDateTime(nanosVal)
  }
}

// Converts between `java.time.Instant` and the internal `TimestampNanosVal` for a
// `TimestampLTZNanosType` column. The external type is always `java.time.Instant`: unlike the
// micro `TimestampType`, this mapping does not consult `spark.sql.datetime.java8API.enabled`,
// because the nanos LTZ type is post-Java-8 and the legacy `java.sql.Timestamp` external type
// is intentionally out of scope here. See SPARK-57033.
private class TimestampLTZNanosConverter(dataType: TimestampLTZNanosType)
    extends CatalystTypeConverter[Any, Instant, TimestampNanosVal] {

  // Only `Instant` is accepted; any other external value is rejected with
  // INVALID_EXTERNAL_VALUE.
  override def toCatalystImpl(scalaValue: Any): TimestampNanosVal = scalaValue match {
    case instant: Instant =>
      DateTimeUtils.instantToTimestampNanos(instant, dataType.precision)
    case unsupported =>
      val params = scala.collection.immutable.Map(
        "other" -> unsupported.toString,
        "otherClass" -> unsupported.getClass.getCanonicalName,
        "dataType" -> dataType.sql)
      throw new SparkIllegalArgumentException(
        errorClass = "INVALID_EXTERNAL_VALUE",
        messageParameters = params)
  }

  // A null Catalyst value maps to a null external value.
  override def toScala(catalystValue: TimestampNanosVal): Instant = {
    if (catalystValue != null) {
      DateTimeUtils.timestampNanosToInstant(catalystValue)
    } else {
      null
    }
  }

  override def toScalaImpl(row: InternalRow, column: Int): Instant = {
    val nanosVal = row.getTimestampLTZNanos(column)
    DateTimeUtils.timestampNanosToInstant(nanosVal)
  }
}

private class DecimalConverter(dataType: DecimalType)
extends CatalystTypeConverter[Any, JavaBigDecimal, Decimal] {

Expand All @@ -527,7 +573,7 @@ object CatalystTypeConverters {
case d: JavaBigInteger => Decimal(d)
case d: Decimal => d
case other => throw new SparkIllegalArgumentException(
errorClass = "_LEGACY_ERROR_TEMP_3219",
errorClass = "INVALID_EXTERNAL_VALUE",
messageParameters = scala.collection.immutable.Map(
"other" -> other.toString,
"otherClass" -> other.getClass.getCanonicalName,
Expand Down Expand Up @@ -655,6 +701,9 @@ object CatalystTypeConverters {
case ld: LocalDate => LocalDateConverter.toCatalyst(ld)
case t: LocalTime => TimeConverter.toCatalyst(t)
case t: Timestamp => TimestampConverter.toCatalyst(t)
// SPARK-57033: schema-less convertToCatalyst keeps bare `Instant` / `LocalDateTime` on the
// microsecond converters. The nanosecond path is schema-driven only - users opt in via an
// explicit `TimestampLTZNanosType` / `TimestampNTZNanosType` column in the schema.
case i: Instant => InstantConverter.toCatalyst(i)
case l: LocalDateTime => TimestampNTZConverter.toCatalyst(l)
case d: BigDecimal =>
Expand Down
Loading